diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100755--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,350433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8918060856847287, + "eval_steps": 1000, + "global_step": 50000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.7836121713694573e-05, + "grad_norm": 323.2822570800781, + "learning_rate": 8.919015340706387e-09, + "loss": 10.7735, + "step": 1 + }, + { + "epoch": 3.567224342738915e-05, + "grad_norm": 210.89393615722656, + "learning_rate": 1.7838030681412774e-08, + "loss": 7.1321, + "step": 2 + }, + { + "epoch": 5.350836514108372e-05, + "grad_norm": 200.89666748046875, + "learning_rate": 2.6757046022119158e-08, + "loss": 9.5803, + "step": 3 + }, + { + "epoch": 7.13444868547783e-05, + "grad_norm": 125.82511138916016, + "learning_rate": 3.567606136282555e-08, + "loss": 7.126, + "step": 4 + }, + { + "epoch": 8.918060856847287e-05, + "grad_norm": 337.4522705078125, + "learning_rate": 4.459507670353193e-08, + "loss": 8.6375, + "step": 5 + }, + { + "epoch": 0.00010701673028216744, + "grad_norm": 210.69891357421875, + "learning_rate": 5.3514092044238315e-08, + "loss": 7.1199, + "step": 6 + }, + { + "epoch": 0.00012485285199586202, + "grad_norm": 94.18712615966797, + "learning_rate": 6.243310738494472e-08, + "loss": 6.4352, + "step": 7 + }, + { + "epoch": 0.0001426889737095566, + "grad_norm": 122.2567367553711, + "learning_rate": 7.13521227256511e-08, + "loss": 6.3767, + "step": 8 + }, + { + "epoch": 0.00016052509542325118, + "grad_norm": 133.38861083984375, + "learning_rate": 8.027113806635748e-08, + "loss": 6.6332, + "step": 9 + }, + { + "epoch": 0.00017836121713694575, + "grad_norm": 99.66352844238281, + "learning_rate": 8.919015340706386e-08, + "loss": 6.2766, + "step": 10 + }, + { + "epoch": 0.00019619733885064031, + "grad_norm": 104.68462371826172, + "learning_rate": 9.810916874777025e-08, + "loss": 7.0613, + "step": 11 + }, + { + "epoch": 0.00021403346056433488, + "grad_norm": 234.64990234375, + "learning_rate": 1.0702818408847663e-07, + "loss": 7.3977, + "step": 12 + }, + { + "epoch": 0.00023186958227802948, + "grad_norm": 235.16844177246094, + "learning_rate": 1.1594719942918301e-07, + "loss": 9.2356, + "step": 13 + }, + { + "epoch": 0.00024970570399172404, + "grad_norm": 119.6419448852539, + "learning_rate": 1.2486621476988943e-07, + "loss": 6.9626, + "step": 14 + }, + { + "epoch": 0.00026754182570541864, + "grad_norm": 182.8326416015625, + "learning_rate": 1.337852301105958e-07, + "loss": 7.3532, + "step": 15 + }, + { + "epoch": 0.0002853779474191132, + "grad_norm": 187.87606811523438, + "learning_rate": 1.427042454513022e-07, + "loss": 6.831, + "step": 16 + }, + { + "epoch": 0.00030321406913280777, + "grad_norm": 163.6082000732422, + "learning_rate": 1.5162326079200857e-07, + "loss": 7.0306, + "step": 17 + }, + { + "epoch": 0.00032105019084650236, + "grad_norm": 155.1985321044922, + "learning_rate": 1.6054227613271495e-07, + "loss": 7.0035, + "step": 18 + }, + { + "epoch": 0.0003388863125601969, + "grad_norm": 214.25897216796875, + "learning_rate": 1.6946129147342133e-07, + "loss": 7.3007, + "step": 19 + }, + { + "epoch": 0.0003567224342738915, + "grad_norm": 285.1042785644531, + "learning_rate": 1.7838030681412771e-07, + "loss": 6.9673, + "step": 20 + }, + { + "epoch": 0.00037455855598758604, + "grad_norm": 67.51636505126953, + "learning_rate": 1.872993221548341e-07, + "loss": 6.5743, + "step": 21 + }, + { + "epoch": 0.00039239467770128063, + "grad_norm": 198.23965454101562, + "learning_rate": 1.962183374955405e-07, + "loss": 8.0284, + "step": 22 + }, + { + "epoch": 0.0004102307994149752, + "grad_norm": 210.1754150390625, + "learning_rate": 2.0513735283624688e-07, + "loss": 6.7753, + "step": 23 + }, + { + "epoch": 0.00042806692112866976, + "grad_norm": 200.4541778564453, + "learning_rate": 2.1405636817695326e-07, + "loss": 8.327, + "step": 24 + }, + { + "epoch": 0.00044590304284236436, + "grad_norm": 268.79669189453125, + "learning_rate": 2.2297538351765964e-07, + "loss": 6.9922, + "step": 25 + }, + { + "epoch": 0.00046373916455605895, + "grad_norm": 78.06359100341797, + "learning_rate": 2.3189439885836602e-07, + "loss": 6.7067, + "step": 26 + }, + { + "epoch": 0.0004815752862697535, + "grad_norm": 310.5227355957031, + "learning_rate": 2.4081341419907243e-07, + "loss": 7.325, + "step": 27 + }, + { + "epoch": 0.0004994114079834481, + "grad_norm": 240.86062622070312, + "learning_rate": 2.4973242953977886e-07, + "loss": 6.1034, + "step": 28 + }, + { + "epoch": 0.0005172475296971427, + "grad_norm": 219.43072509765625, + "learning_rate": 2.5865144488048524e-07, + "loss": 7.0325, + "step": 29 + }, + { + "epoch": 0.0005350836514108373, + "grad_norm": 442.5327453613281, + "learning_rate": 2.675704602211916e-07, + "loss": 6.8672, + "step": 30 + }, + { + "epoch": 0.0005529197731245318, + "grad_norm": 151.15321350097656, + "learning_rate": 2.76489475561898e-07, + "loss": 7.2757, + "step": 31 + }, + { + "epoch": 0.0005707558948382264, + "grad_norm": 91.259765625, + "learning_rate": 2.854084909026044e-07, + "loss": 6.224, + "step": 32 + }, + { + "epoch": 0.0005885920165519209, + "grad_norm": 104.54844665527344, + "learning_rate": 2.9432750624331076e-07, + "loss": 6.3518, + "step": 33 + }, + { + "epoch": 0.0006064281382656155, + "grad_norm": 99.20610809326172, + "learning_rate": 3.0324652158401715e-07, + "loss": 5.9676, + "step": 34 + }, + { + "epoch": 0.0006242642599793101, + "grad_norm": 91.99747467041016, + "learning_rate": 3.121655369247235e-07, + "loss": 5.7934, + "step": 35 + }, + { + "epoch": 0.0006421003816930047, + "grad_norm": 108.061767578125, + "learning_rate": 3.210845522654299e-07, + "loss": 5.8156, + "step": 36 + }, + { + "epoch": 0.0006599365034066992, + "grad_norm": 252.54554748535156, + "learning_rate": 3.300035676061363e-07, + "loss": 6.7174, + "step": 37 + }, + { + "epoch": 0.0006777726251203938, + "grad_norm": 89.84114837646484, + "learning_rate": 3.3892258294684267e-07, + "loss": 5.8271, + "step": 38 + }, + { + "epoch": 0.0006956087468340884, + "grad_norm": 54.38611602783203, + "learning_rate": 3.4784159828754905e-07, + "loss": 5.9315, + "step": 39 + }, + { + "epoch": 0.000713444868547783, + "grad_norm": 258.0915832519531, + "learning_rate": 3.5676061362825543e-07, + "loss": 5.937, + "step": 40 + }, + { + "epoch": 0.0007312809902614776, + "grad_norm": 64.23783111572266, + "learning_rate": 3.656796289689618e-07, + "loss": 5.6457, + "step": 41 + }, + { + "epoch": 0.0007491171119751721, + "grad_norm": 81.00452423095703, + "learning_rate": 3.745986443096682e-07, + "loss": 5.6441, + "step": 42 + }, + { + "epoch": 0.0007669532336888667, + "grad_norm": 73.77316284179688, + "learning_rate": 3.835176596503746e-07, + "loss": 5.4766, + "step": 43 + }, + { + "epoch": 0.0007847893554025613, + "grad_norm": 101.67465209960938, + "learning_rate": 3.92436674991081e-07, + "loss": 5.1605, + "step": 44 + }, + { + "epoch": 0.0008026254771162559, + "grad_norm": 64.17410278320312, + "learning_rate": 4.0135569033178733e-07, + "loss": 4.8799, + "step": 45 + }, + { + "epoch": 0.0008204615988299504, + "grad_norm": 77.47195434570312, + "learning_rate": 4.1027470567249376e-07, + "loss": 4.9629, + "step": 46 + }, + { + "epoch": 0.000838297720543645, + "grad_norm": 90.45448303222656, + "learning_rate": 4.191937210132002e-07, + "loss": 4.929, + "step": 47 + }, + { + "epoch": 0.0008561338422573395, + "grad_norm": 58.99647521972656, + "learning_rate": 4.281127363539065e-07, + "loss": 4.5549, + "step": 48 + }, + { + "epoch": 0.0008739699639710341, + "grad_norm": 233.77806091308594, + "learning_rate": 4.3703175169461296e-07, + "loss": 5.069, + "step": 49 + }, + { + "epoch": 0.0008918060856847287, + "grad_norm": 92.89978790283203, + "learning_rate": 4.459507670353193e-07, + "loss": 4.781, + "step": 50 + }, + { + "epoch": 0.0009096422073984233, + "grad_norm": 192.8018798828125, + "learning_rate": 4.548697823760257e-07, + "loss": 4.6452, + "step": 51 + }, + { + "epoch": 0.0009274783291121179, + "grad_norm": 77.0093765258789, + "learning_rate": 4.6378879771673205e-07, + "loss": 4.4629, + "step": 52 + }, + { + "epoch": 0.0009453144508258124, + "grad_norm": 53.103431701660156, + "learning_rate": 4.727078130574385e-07, + "loss": 4.396, + "step": 53 + }, + { + "epoch": 0.000963150572539507, + "grad_norm": 73.26648712158203, + "learning_rate": 4.816268283981449e-07, + "loss": 4.5091, + "step": 54 + }, + { + "epoch": 0.0009809866942532017, + "grad_norm": 65.50182342529297, + "learning_rate": 4.905458437388512e-07, + "loss": 4.0329, + "step": 55 + }, + { + "epoch": 0.0009988228159668962, + "grad_norm": 122.55022430419922, + "learning_rate": 4.994648590795577e-07, + "loss": 4.1873, + "step": 56 + }, + { + "epoch": 0.0010166589376805907, + "grad_norm": 107.62737274169922, + "learning_rate": 5.08383874420264e-07, + "loss": 4.4333, + "step": 57 + }, + { + "epoch": 0.0010344950593942854, + "grad_norm": 51.7789306640625, + "learning_rate": 5.173028897609705e-07, + "loss": 3.8496, + "step": 58 + }, + { + "epoch": 0.0010523311811079798, + "grad_norm": 75.96542358398438, + "learning_rate": 5.262219051016768e-07, + "loss": 3.9539, + "step": 59 + }, + { + "epoch": 0.0010701673028216745, + "grad_norm": 56.87948989868164, + "learning_rate": 5.351409204423832e-07, + "loss": 3.6712, + "step": 60 + }, + { + "epoch": 0.001088003424535369, + "grad_norm": 135.2625274658203, + "learning_rate": 5.440599357830895e-07, + "loss": 3.334, + "step": 61 + }, + { + "epoch": 0.0011058395462490635, + "grad_norm": 53.386959075927734, + "learning_rate": 5.52978951123796e-07, + "loss": 3.4197, + "step": 62 + }, + { + "epoch": 0.0011236756679627582, + "grad_norm": 71.22798156738281, + "learning_rate": 5.618979664645024e-07, + "loss": 3.606, + "step": 63 + }, + { + "epoch": 0.0011415117896764527, + "grad_norm": 54.5213737487793, + "learning_rate": 5.708169818052088e-07, + "loss": 3.4468, + "step": 64 + }, + { + "epoch": 0.0011593479113901474, + "grad_norm": 186.6767120361328, + "learning_rate": 5.797359971459151e-07, + "loss": 4.5372, + "step": 65 + }, + { + "epoch": 0.0011771840331038419, + "grad_norm": 99.92565155029297, + "learning_rate": 5.886550124866215e-07, + "loss": 3.4468, + "step": 66 + }, + { + "epoch": 0.0011950201548175364, + "grad_norm": 36.229618072509766, + "learning_rate": 5.975740278273279e-07, + "loss": 2.7097, + "step": 67 + }, + { + "epoch": 0.001212856276531231, + "grad_norm": 44.98109817504883, + "learning_rate": 6.064930431680343e-07, + "loss": 3.1559, + "step": 68 + }, + { + "epoch": 0.0012306923982449256, + "grad_norm": 48.70180892944336, + "learning_rate": 6.154120585087407e-07, + "loss": 3.0656, + "step": 69 + }, + { + "epoch": 0.0012485285199586203, + "grad_norm": 61.8568229675293, + "learning_rate": 6.24331073849447e-07, + "loss": 3.0177, + "step": 70 + }, + { + "epoch": 0.0012663646416723147, + "grad_norm": 43.668888092041016, + "learning_rate": 6.332500891901534e-07, + "loss": 3.2232, + "step": 71 + }, + { + "epoch": 0.0012842007633860095, + "grad_norm": 80.41217803955078, + "learning_rate": 6.421691045308598e-07, + "loss": 3.0677, + "step": 72 + }, + { + "epoch": 0.001302036885099704, + "grad_norm": 44.075828552246094, + "learning_rate": 6.510881198715662e-07, + "loss": 2.6257, + "step": 73 + }, + { + "epoch": 0.0013198730068133984, + "grad_norm": 47.72416687011719, + "learning_rate": 6.600071352122726e-07, + "loss": 2.7277, + "step": 74 + }, + { + "epoch": 0.0013377091285270931, + "grad_norm": 44.380043029785156, + "learning_rate": 6.68926150552979e-07, + "loss": 2.7056, + "step": 75 + }, + { + "epoch": 0.0013555452502407876, + "grad_norm": 36.327720642089844, + "learning_rate": 6.778451658936853e-07, + "loss": 2.6033, + "step": 76 + }, + { + "epoch": 0.0013733813719544823, + "grad_norm": 29.031856536865234, + "learning_rate": 6.867641812343917e-07, + "loss": 2.5391, + "step": 77 + }, + { + "epoch": 0.0013912174936681768, + "grad_norm": 20.884868621826172, + "learning_rate": 6.956831965750981e-07, + "loss": 2.3389, + "step": 78 + }, + { + "epoch": 0.0014090536153818713, + "grad_norm": 65.40373992919922, + "learning_rate": 7.046022119158046e-07, + "loss": 2.7997, + "step": 79 + }, + { + "epoch": 0.001426889737095566, + "grad_norm": 66.30245208740234, + "learning_rate": 7.135212272565109e-07, + "loss": 2.8164, + "step": 80 + }, + { + "epoch": 0.0014447258588092605, + "grad_norm": 32.38147735595703, + "learning_rate": 7.224402425972173e-07, + "loss": 2.2865, + "step": 81 + }, + { + "epoch": 0.0014625619805229552, + "grad_norm": 18.060773849487305, + "learning_rate": 7.313592579379236e-07, + "loss": 2.156, + "step": 82 + }, + { + "epoch": 0.0014803981022366497, + "grad_norm": 34.35832977294922, + "learning_rate": 7.402782732786301e-07, + "loss": 2.5437, + "step": 83 + }, + { + "epoch": 0.0014982342239503441, + "grad_norm": 23.287845611572266, + "learning_rate": 7.491972886193364e-07, + "loss": 2.2245, + "step": 84 + }, + { + "epoch": 0.0015160703456640388, + "grad_norm": 54.39329147338867, + "learning_rate": 7.581163039600429e-07, + "loss": 2.161, + "step": 85 + }, + { + "epoch": 0.0015339064673777333, + "grad_norm": 18.95912742614746, + "learning_rate": 7.670353193007492e-07, + "loss": 2.156, + "step": 86 + }, + { + "epoch": 0.001551742589091428, + "grad_norm": 16.43122673034668, + "learning_rate": 7.759543346414556e-07, + "loss": 1.9135, + "step": 87 + }, + { + "epoch": 0.0015695787108051225, + "grad_norm": 29.445796966552734, + "learning_rate": 7.84873349982162e-07, + "loss": 1.9606, + "step": 88 + }, + { + "epoch": 0.001587414832518817, + "grad_norm": 15.478200912475586, + "learning_rate": 7.937923653228685e-07, + "loss": 1.9479, + "step": 89 + }, + { + "epoch": 0.0016052509542325117, + "grad_norm": 19.078706741333008, + "learning_rate": 8.027113806635747e-07, + "loss": 2.0803, + "step": 90 + }, + { + "epoch": 0.0016230870759462062, + "grad_norm": 74.88615417480469, + "learning_rate": 8.116303960042811e-07, + "loss": 1.9216, + "step": 91 + }, + { + "epoch": 0.001640923197659901, + "grad_norm": 10.065621376037598, + "learning_rate": 8.205494113449875e-07, + "loss": 1.785, + "step": 92 + }, + { + "epoch": 0.0016587593193735954, + "grad_norm": 16.853424072265625, + "learning_rate": 8.29468426685694e-07, + "loss": 1.7712, + "step": 93 + }, + { + "epoch": 0.00167659544108729, + "grad_norm": 14.692481994628906, + "learning_rate": 8.383874420264004e-07, + "loss": 1.7445, + "step": 94 + }, + { + "epoch": 0.0016944315628009846, + "grad_norm": 10.976871490478516, + "learning_rate": 8.473064573671067e-07, + "loss": 1.6523, + "step": 95 + }, + { + "epoch": 0.001712267684514679, + "grad_norm": 10.271615982055664, + "learning_rate": 8.56225472707813e-07, + "loss": 1.6485, + "step": 96 + }, + { + "epoch": 0.0017301038062283738, + "grad_norm": 10.25522232055664, + "learning_rate": 8.651444880485195e-07, + "loss": 1.7156, + "step": 97 + }, + { + "epoch": 0.0017479399279420682, + "grad_norm": 14.337623596191406, + "learning_rate": 8.740635033892259e-07, + "loss": 1.6553, + "step": 98 + }, + { + "epoch": 0.001765776049655763, + "grad_norm": 12.784684181213379, + "learning_rate": 8.829825187299322e-07, + "loss": 1.6241, + "step": 99 + }, + { + "epoch": 0.0017836121713694574, + "grad_norm": 12.90092945098877, + "learning_rate": 8.919015340706386e-07, + "loss": 1.6651, + "step": 100 + }, + { + "epoch": 0.001801448293083152, + "grad_norm": 37.45865249633789, + "learning_rate": 9.008205494113451e-07, + "loss": 1.6166, + "step": 101 + }, + { + "epoch": 0.0018192844147968466, + "grad_norm": 10.109075546264648, + "learning_rate": 9.097395647520514e-07, + "loss": 1.6241, + "step": 102 + }, + { + "epoch": 0.001837120536510541, + "grad_norm": 10.034185409545898, + "learning_rate": 9.186585800927579e-07, + "loss": 1.5487, + "step": 103 + }, + { + "epoch": 0.0018549566582242358, + "grad_norm": 12.023565292358398, + "learning_rate": 9.275775954334641e-07, + "loss": 1.5102, + "step": 104 + }, + { + "epoch": 0.0018727927799379303, + "grad_norm": 46.07319641113281, + "learning_rate": 9.364966107741706e-07, + "loss": 1.5761, + "step": 105 + }, + { + "epoch": 0.0018906289016516248, + "grad_norm": 7.093059539794922, + "learning_rate": 9.45415626114877e-07, + "loss": 1.4614, + "step": 106 + }, + { + "epoch": 0.0019084650233653195, + "grad_norm": 11.888785362243652, + "learning_rate": 9.543346414555833e-07, + "loss": 1.4124, + "step": 107 + }, + { + "epoch": 0.001926301145079014, + "grad_norm": 6.246786594390869, + "learning_rate": 9.632536567962897e-07, + "loss": 1.5451, + "step": 108 + }, + { + "epoch": 0.0019441372667927087, + "grad_norm": 107.81812286376953, + "learning_rate": 9.72172672136996e-07, + "loss": 1.461, + "step": 109 + }, + { + "epoch": 0.0019619733885064034, + "grad_norm": 6.224181175231934, + "learning_rate": 9.810916874777025e-07, + "loss": 1.2912, + "step": 110 + }, + { + "epoch": 0.001979809510220098, + "grad_norm": 4.665727615356445, + "learning_rate": 9.900107028184089e-07, + "loss": 1.3029, + "step": 111 + }, + { + "epoch": 0.0019976456319337923, + "grad_norm": 9.664981842041016, + "learning_rate": 9.989297181591155e-07, + "loss": 1.343, + "step": 112 + }, + { + "epoch": 0.002015481753647487, + "grad_norm": 9.630330085754395, + "learning_rate": 1.0078487334998216e-06, + "loss": 1.299, + "step": 113 + }, + { + "epoch": 0.0020333178753611813, + "grad_norm": 10.348453521728516, + "learning_rate": 1.016767748840528e-06, + "loss": 1.4833, + "step": 114 + }, + { + "epoch": 0.0020511539970748762, + "grad_norm": 4.746984958648682, + "learning_rate": 1.0256867641812344e-06, + "loss": 1.3208, + "step": 115 + }, + { + "epoch": 0.0020689901187885707, + "grad_norm": 34.37080383300781, + "learning_rate": 1.034605779521941e-06, + "loss": 1.4292, + "step": 116 + }, + { + "epoch": 0.002086826240502265, + "grad_norm": 21.08136558532715, + "learning_rate": 1.0435247948626474e-06, + "loss": 1.4235, + "step": 117 + }, + { + "epoch": 0.0021046623622159597, + "grad_norm": 8.585554122924805, + "learning_rate": 1.0524438102033535e-06, + "loss": 1.2834, + "step": 118 + }, + { + "epoch": 0.002122498483929654, + "grad_norm": 24.306896209716797, + "learning_rate": 1.06136282554406e-06, + "loss": 1.2809, + "step": 119 + }, + { + "epoch": 0.002140334605643349, + "grad_norm": 7.346068382263184, + "learning_rate": 1.0702818408847665e-06, + "loss": 1.5063, + "step": 120 + }, + { + "epoch": 0.0021581707273570436, + "grad_norm": 5.411838054656982, + "learning_rate": 1.0792008562254729e-06, + "loss": 1.2126, + "step": 121 + }, + { + "epoch": 0.002176006849070738, + "grad_norm": 4.67111873626709, + "learning_rate": 1.088119871566179e-06, + "loss": 1.3172, + "step": 122 + }, + { + "epoch": 0.0021938429707844325, + "grad_norm": 6.796841621398926, + "learning_rate": 1.0970388869068854e-06, + "loss": 1.2651, + "step": 123 + }, + { + "epoch": 0.002211679092498127, + "grad_norm": 5.288766384124756, + "learning_rate": 1.105957902247592e-06, + "loss": 1.3635, + "step": 124 + }, + { + "epoch": 0.002229515214211822, + "grad_norm": 4.216982841491699, + "learning_rate": 1.1148769175882984e-06, + "loss": 1.1853, + "step": 125 + }, + { + "epoch": 0.0022473513359255164, + "grad_norm": 10.145190238952637, + "learning_rate": 1.1237959329290048e-06, + "loss": 1.2562, + "step": 126 + }, + { + "epoch": 0.002265187457639211, + "grad_norm": 5.38865327835083, + "learning_rate": 1.132714948269711e-06, + "loss": 1.2521, + "step": 127 + }, + { + "epoch": 0.0022830235793529054, + "grad_norm": 4.854418754577637, + "learning_rate": 1.1416339636104175e-06, + "loss": 1.2694, + "step": 128 + }, + { + "epoch": 0.0023008597010666, + "grad_norm": 3.615403175354004, + "learning_rate": 1.150552978951124e-06, + "loss": 1.2194, + "step": 129 + }, + { + "epoch": 0.002318695822780295, + "grad_norm": 4.837342739105225, + "learning_rate": 1.1594719942918303e-06, + "loss": 1.2039, + "step": 130 + }, + { + "epoch": 0.0023365319444939893, + "grad_norm": 2.977159023284912, + "learning_rate": 1.1683910096325365e-06, + "loss": 1.113, + "step": 131 + }, + { + "epoch": 0.0023543680662076838, + "grad_norm": 9.740659713745117, + "learning_rate": 1.177310024973243e-06, + "loss": 1.1635, + "step": 132 + }, + { + "epoch": 0.0023722041879213783, + "grad_norm": 5.769473552703857, + "learning_rate": 1.1862290403139494e-06, + "loss": 1.1302, + "step": 133 + }, + { + "epoch": 0.0023900403096350727, + "grad_norm": 4.656841278076172, + "learning_rate": 1.1951480556546558e-06, + "loss": 1.1217, + "step": 134 + }, + { + "epoch": 0.0024078764313487677, + "grad_norm": 3.355837821960449, + "learning_rate": 1.2040670709953622e-06, + "loss": 1.0861, + "step": 135 + }, + { + "epoch": 0.002425712553062462, + "grad_norm": 5.188744068145752, + "learning_rate": 1.2129860863360686e-06, + "loss": 1.1827, + "step": 136 + }, + { + "epoch": 0.0024435486747761566, + "grad_norm": 4.206075668334961, + "learning_rate": 1.221905101676775e-06, + "loss": 1.0765, + "step": 137 + }, + { + "epoch": 0.002461384796489851, + "grad_norm": 4.9574079513549805, + "learning_rate": 1.2308241170174813e-06, + "loss": 1.2729, + "step": 138 + }, + { + "epoch": 0.0024792209182035456, + "grad_norm": 28.436365127563477, + "learning_rate": 1.2397431323581877e-06, + "loss": 1.1768, + "step": 139 + }, + { + "epoch": 0.0024970570399172405, + "grad_norm": 11.312703132629395, + "learning_rate": 1.248662147698894e-06, + "loss": 1.2622, + "step": 140 + }, + { + "epoch": 0.002514893161630935, + "grad_norm": 5.597165107727051, + "learning_rate": 1.2575811630396005e-06, + "loss": 1.0988, + "step": 141 + }, + { + "epoch": 0.0025327292833446295, + "grad_norm": 4.262165546417236, + "learning_rate": 1.2665001783803069e-06, + "loss": 0.9773, + "step": 142 + }, + { + "epoch": 0.002550565405058324, + "grad_norm": 3.6943838596343994, + "learning_rate": 1.2754191937210132e-06, + "loss": 1.1566, + "step": 143 + }, + { + "epoch": 0.002568401526772019, + "grad_norm": 3.584569215774536, + "learning_rate": 1.2843382090617196e-06, + "loss": 1.1361, + "step": 144 + }, + { + "epoch": 0.0025862376484857134, + "grad_norm": 3.577634334564209, + "learning_rate": 1.293257224402426e-06, + "loss": 1.0374, + "step": 145 + }, + { + "epoch": 0.002604073770199408, + "grad_norm": 4.231927394866943, + "learning_rate": 1.3021762397431324e-06, + "loss": 1.0556, + "step": 146 + }, + { + "epoch": 0.0026219098919131024, + "grad_norm": 4.495650291442871, + "learning_rate": 1.3110952550838388e-06, + "loss": 1.1813, + "step": 147 + }, + { + "epoch": 0.002639746013626797, + "grad_norm": 2.9533169269561768, + "learning_rate": 1.3200142704245451e-06, + "loss": 1.0133, + "step": 148 + }, + { + "epoch": 0.0026575821353404918, + "grad_norm": 5.6773858070373535, + "learning_rate": 1.3289332857652517e-06, + "loss": 1.134, + "step": 149 + }, + { + "epoch": 0.0026754182570541863, + "grad_norm": 4.191747665405273, + "learning_rate": 1.337852301105958e-06, + "loss": 1.0288, + "step": 150 + }, + { + "epoch": 0.0026932543787678807, + "grad_norm": 5.146604061126709, + "learning_rate": 1.3467713164466643e-06, + "loss": 0.9912, + "step": 151 + }, + { + "epoch": 0.0027110905004815752, + "grad_norm": 4.744831562042236, + "learning_rate": 1.3556903317873707e-06, + "loss": 0.9719, + "step": 152 + }, + { + "epoch": 0.0027289266221952697, + "grad_norm": 3.7088944911956787, + "learning_rate": 1.3646093471280773e-06, + "loss": 1.0839, + "step": 153 + }, + { + "epoch": 0.0027467627439089646, + "grad_norm": 2.484596014022827, + "learning_rate": 1.3735283624687834e-06, + "loss": 0.9486, + "step": 154 + }, + { + "epoch": 0.002764598865622659, + "grad_norm": 3.3776695728302, + "learning_rate": 1.3824473778094898e-06, + "loss": 1.0543, + "step": 155 + }, + { + "epoch": 0.0027824349873363536, + "grad_norm": 3.0981509685516357, + "learning_rate": 1.3913663931501962e-06, + "loss": 1.0516, + "step": 156 + }, + { + "epoch": 0.002800271109050048, + "grad_norm": 2.3308029174804688, + "learning_rate": 1.4002854084909028e-06, + "loss": 1.1079, + "step": 157 + }, + { + "epoch": 0.0028181072307637426, + "grad_norm": 6.98223352432251, + "learning_rate": 1.4092044238316092e-06, + "loss": 0.9549, + "step": 158 + }, + { + "epoch": 0.0028359433524774375, + "grad_norm": 2.6161375045776367, + "learning_rate": 1.4181234391723153e-06, + "loss": 0.9373, + "step": 159 + }, + { + "epoch": 0.002853779474191132, + "grad_norm": 6.54719352722168, + "learning_rate": 1.4270424545130217e-06, + "loss": 1.1123, + "step": 160 + }, + { + "epoch": 0.0028716155959048265, + "grad_norm": 5.496429443359375, + "learning_rate": 1.4359614698537283e-06, + "loss": 1.1083, + "step": 161 + }, + { + "epoch": 0.002889451717618521, + "grad_norm": 2.4545984268188477, + "learning_rate": 1.4448804851944347e-06, + "loss": 0.9655, + "step": 162 + }, + { + "epoch": 0.0029072878393322154, + "grad_norm": 3.0574238300323486, + "learning_rate": 1.453799500535141e-06, + "loss": 1.0865, + "step": 163 + }, + { + "epoch": 0.0029251239610459103, + "grad_norm": 2.148712158203125, + "learning_rate": 1.4627185158758472e-06, + "loss": 1.0007, + "step": 164 + }, + { + "epoch": 0.002942960082759605, + "grad_norm": 3.361168384552002, + "learning_rate": 1.4716375312165538e-06, + "loss": 0.973, + "step": 165 + }, + { + "epoch": 0.0029607962044732993, + "grad_norm": 2.2146620750427246, + "learning_rate": 1.4805565465572602e-06, + "loss": 0.9946, + "step": 166 + }, + { + "epoch": 0.002978632326186994, + "grad_norm": 2.1582112312316895, + "learning_rate": 1.4894755618979666e-06, + "loss": 0.9073, + "step": 167 + }, + { + "epoch": 0.0029964684479006883, + "grad_norm": 2.935107469558716, + "learning_rate": 1.4983945772386728e-06, + "loss": 0.9513, + "step": 168 + }, + { + "epoch": 0.003014304569614383, + "grad_norm": 4.1764960289001465, + "learning_rate": 1.5073135925793793e-06, + "loss": 0.9448, + "step": 169 + }, + { + "epoch": 0.0030321406913280777, + "grad_norm": 2.118518352508545, + "learning_rate": 1.5162326079200857e-06, + "loss": 0.9539, + "step": 170 + }, + { + "epoch": 0.003049976813041772, + "grad_norm": 1.9466509819030762, + "learning_rate": 1.5251516232607921e-06, + "loss": 0.9445, + "step": 171 + }, + { + "epoch": 0.0030678129347554667, + "grad_norm": 1.8523297309875488, + "learning_rate": 1.5340706386014985e-06, + "loss": 0.9683, + "step": 172 + }, + { + "epoch": 0.003085649056469161, + "grad_norm": 2.822516441345215, + "learning_rate": 1.5429896539422049e-06, + "loss": 0.9493, + "step": 173 + }, + { + "epoch": 0.003103485178182856, + "grad_norm": 2.1008737087249756, + "learning_rate": 1.5519086692829112e-06, + "loss": 1.0008, + "step": 174 + }, + { + "epoch": 0.0031213212998965506, + "grad_norm": 2.113912582397461, + "learning_rate": 1.5608276846236176e-06, + "loss": 0.8648, + "step": 175 + }, + { + "epoch": 0.003139157421610245, + "grad_norm": 5.489522457122803, + "learning_rate": 1.569746699964324e-06, + "loss": 0.8341, + "step": 176 + }, + { + "epoch": 0.0031569935433239395, + "grad_norm": 4.361125946044922, + "learning_rate": 1.5786657153050304e-06, + "loss": 0.8544, + "step": 177 + }, + { + "epoch": 0.003174829665037634, + "grad_norm": 2.187650442123413, + "learning_rate": 1.587584730645737e-06, + "loss": 0.9864, + "step": 178 + }, + { + "epoch": 0.003192665786751329, + "grad_norm": 2.0201544761657715, + "learning_rate": 1.5965037459864434e-06, + "loss": 0.8719, + "step": 179 + }, + { + "epoch": 0.0032105019084650234, + "grad_norm": 2.9684455394744873, + "learning_rate": 1.6054227613271493e-06, + "loss": 0.8732, + "step": 180 + }, + { + "epoch": 0.003228338030178718, + "grad_norm": 3.0871806144714355, + "learning_rate": 1.614341776667856e-06, + "loss": 0.9683, + "step": 181 + }, + { + "epoch": 0.0032461741518924124, + "grad_norm": 2.1151089668273926, + "learning_rate": 1.6232607920085623e-06, + "loss": 0.8258, + "step": 182 + }, + { + "epoch": 0.0032640102736061073, + "grad_norm": 2.6509292125701904, + "learning_rate": 1.6321798073492687e-06, + "loss": 0.7964, + "step": 183 + }, + { + "epoch": 0.003281846395319802, + "grad_norm": 2.232733726501465, + "learning_rate": 1.641098822689975e-06, + "loss": 0.9056, + "step": 184 + }, + { + "epoch": 0.0032996825170334963, + "grad_norm": 2.4667141437530518, + "learning_rate": 1.6500178380306814e-06, + "loss": 0.9627, + "step": 185 + }, + { + "epoch": 0.0033175186387471908, + "grad_norm": 2.943213939666748, + "learning_rate": 1.658936853371388e-06, + "loss": 0.9321, + "step": 186 + }, + { + "epoch": 0.0033353547604608852, + "grad_norm": 2.2956197261810303, + "learning_rate": 1.6678558687120944e-06, + "loss": 0.8541, + "step": 187 + }, + { + "epoch": 0.00335319088217458, + "grad_norm": 2.2546980381011963, + "learning_rate": 1.6767748840528008e-06, + "loss": 0.8759, + "step": 188 + }, + { + "epoch": 0.0033710270038882746, + "grad_norm": 2.286296844482422, + "learning_rate": 1.685693899393507e-06, + "loss": 0.8312, + "step": 189 + }, + { + "epoch": 0.003388863125601969, + "grad_norm": 1.9116014242172241, + "learning_rate": 1.6946129147342133e-06, + "loss": 0.7941, + "step": 190 + }, + { + "epoch": 0.0034066992473156636, + "grad_norm": 2.01637864112854, + "learning_rate": 1.7035319300749197e-06, + "loss": 0.8663, + "step": 191 + }, + { + "epoch": 0.003424535369029358, + "grad_norm": 1.940755844116211, + "learning_rate": 1.712450945415626e-06, + "loss": 0.7687, + "step": 192 + }, + { + "epoch": 0.003442371490743053, + "grad_norm": 1.4954153299331665, + "learning_rate": 1.7213699607563327e-06, + "loss": 0.7115, + "step": 193 + }, + { + "epoch": 0.0034602076124567475, + "grad_norm": 2.2623729705810547, + "learning_rate": 1.730288976097039e-06, + "loss": 0.8145, + "step": 194 + }, + { + "epoch": 0.003478043734170442, + "grad_norm": 3.0854616165161133, + "learning_rate": 1.7392079914377454e-06, + "loss": 0.9151, + "step": 195 + }, + { + "epoch": 0.0034958798558841365, + "grad_norm": 2.6582725048065186, + "learning_rate": 1.7481270067784518e-06, + "loss": 0.8509, + "step": 196 + }, + { + "epoch": 0.003513715977597831, + "grad_norm": 2.0415894985198975, + "learning_rate": 1.7570460221191582e-06, + "loss": 0.9229, + "step": 197 + }, + { + "epoch": 0.003531552099311526, + "grad_norm": 2.0389983654022217, + "learning_rate": 1.7659650374598644e-06, + "loss": 0.786, + "step": 198 + }, + { + "epoch": 0.0035493882210252204, + "grad_norm": 2.3008618354797363, + "learning_rate": 1.7748840528005708e-06, + "loss": 0.7101, + "step": 199 + }, + { + "epoch": 0.003567224342738915, + "grad_norm": 2.06827712059021, + "learning_rate": 1.7838030681412771e-06, + "loss": 0.8509, + "step": 200 + }, + { + "epoch": 0.0035850604644526093, + "grad_norm": 2.1113038063049316, + "learning_rate": 1.7927220834819837e-06, + "loss": 0.8245, + "step": 201 + }, + { + "epoch": 0.003602896586166304, + "grad_norm": 1.9879345893859863, + "learning_rate": 1.8016410988226901e-06, + "loss": 0.7701, + "step": 202 + }, + { + "epoch": 0.0036207327078799987, + "grad_norm": 2.397202968597412, + "learning_rate": 1.8105601141633965e-06, + "loss": 0.8071, + "step": 203 + }, + { + "epoch": 0.0036385688295936932, + "grad_norm": 2.3976638317108154, + "learning_rate": 1.8194791295041029e-06, + "loss": 0.7202, + "step": 204 + }, + { + "epoch": 0.0036564049513073877, + "grad_norm": 2.4036102294921875, + "learning_rate": 1.8283981448448093e-06, + "loss": 0.8314, + "step": 205 + }, + { + "epoch": 0.003674241073021082, + "grad_norm": 2.960175037384033, + "learning_rate": 1.8373171601855158e-06, + "loss": 0.7731, + "step": 206 + }, + { + "epoch": 0.0036920771947347767, + "grad_norm": 1.6719032526016235, + "learning_rate": 1.8462361755262218e-06, + "loss": 0.8398, + "step": 207 + }, + { + "epoch": 0.0037099133164484716, + "grad_norm": 1.6310625076293945, + "learning_rate": 1.8551551908669282e-06, + "loss": 0.7694, + "step": 208 + }, + { + "epoch": 0.003727749438162166, + "grad_norm": 2.1567938327789307, + "learning_rate": 1.8640742062076348e-06, + "loss": 0.9077, + "step": 209 + }, + { + "epoch": 0.0037455855598758606, + "grad_norm": 2.805018186569214, + "learning_rate": 1.8729932215483412e-06, + "loss": 0.8192, + "step": 210 + }, + { + "epoch": 0.003763421681589555, + "grad_norm": 4.236129283905029, + "learning_rate": 1.8819122368890475e-06, + "loss": 0.8155, + "step": 211 + }, + { + "epoch": 0.0037812578033032495, + "grad_norm": 2.8584091663360596, + "learning_rate": 1.890831252229754e-06, + "loss": 0.7163, + "step": 212 + }, + { + "epoch": 0.0037990939250169445, + "grad_norm": 3.4499547481536865, + "learning_rate": 1.8997502675704603e-06, + "loss": 0.7294, + "step": 213 + }, + { + "epoch": 0.003816930046730639, + "grad_norm": 1.8830771446228027, + "learning_rate": 1.9086692829111667e-06, + "loss": 0.7697, + "step": 214 + }, + { + "epoch": 0.0038347661684443334, + "grad_norm": 2.0773870944976807, + "learning_rate": 1.917588298251873e-06, + "loss": 0.7256, + "step": 215 + }, + { + "epoch": 0.003852602290158028, + "grad_norm": 3.0219199657440186, + "learning_rate": 1.9265073135925794e-06, + "loss": 0.6671, + "step": 216 + }, + { + "epoch": 0.0038704384118717224, + "grad_norm": 1.672107458114624, + "learning_rate": 1.935426328933286e-06, + "loss": 0.7603, + "step": 217 + }, + { + "epoch": 0.0038882745335854173, + "grad_norm": 2.161341428756714, + "learning_rate": 1.944345344273992e-06, + "loss": 0.8354, + "step": 218 + }, + { + "epoch": 0.003906110655299112, + "grad_norm": 2.58734393119812, + "learning_rate": 1.9532643596146986e-06, + "loss": 0.7265, + "step": 219 + }, + { + "epoch": 0.003923946777012807, + "grad_norm": 2.354545831680298, + "learning_rate": 1.962183374955405e-06, + "loss": 0.8154, + "step": 220 + }, + { + "epoch": 0.003941782898726501, + "grad_norm": 1.9903712272644043, + "learning_rate": 1.9711023902961113e-06, + "loss": 0.8039, + "step": 221 + }, + { + "epoch": 0.003959619020440196, + "grad_norm": 1.8442986011505127, + "learning_rate": 1.9800214056368177e-06, + "loss": 0.6834, + "step": 222 + }, + { + "epoch": 0.00397745514215389, + "grad_norm": 2.1357171535491943, + "learning_rate": 1.988940420977524e-06, + "loss": 0.718, + "step": 223 + }, + { + "epoch": 0.003995291263867585, + "grad_norm": 2.547299861907959, + "learning_rate": 1.997859436318231e-06, + "loss": 0.7586, + "step": 224 + }, + { + "epoch": 0.00401312738558128, + "grad_norm": 1.9040706157684326, + "learning_rate": 2.0067784516589373e-06, + "loss": 0.8541, + "step": 225 + }, + { + "epoch": 0.004030963507294974, + "grad_norm": 1.9173662662506104, + "learning_rate": 2.0156974669996432e-06, + "loss": 0.6373, + "step": 226 + }, + { + "epoch": 0.0040487996290086686, + "grad_norm": 2.092672109603882, + "learning_rate": 2.0246164823403496e-06, + "loss": 0.7019, + "step": 227 + }, + { + "epoch": 0.004066635750722363, + "grad_norm": 1.8871886730194092, + "learning_rate": 2.033535497681056e-06, + "loss": 0.7077, + "step": 228 + }, + { + "epoch": 0.0040844718724360575, + "grad_norm": 1.6429773569107056, + "learning_rate": 2.0424545130217624e-06, + "loss": 0.6934, + "step": 229 + }, + { + "epoch": 0.0041023079941497525, + "grad_norm": 2.3653886318206787, + "learning_rate": 2.0513735283624688e-06, + "loss": 0.7352, + "step": 230 + }, + { + "epoch": 0.0041201441158634465, + "grad_norm": 1.4735819101333618, + "learning_rate": 2.060292543703175e-06, + "loss": 0.583, + "step": 231 + }, + { + "epoch": 0.004137980237577141, + "grad_norm": 1.8437955379486084, + "learning_rate": 2.069211559043882e-06, + "loss": 0.6417, + "step": 232 + }, + { + "epoch": 0.0041558163592908355, + "grad_norm": 1.959718108177185, + "learning_rate": 2.0781305743845883e-06, + "loss": 0.7783, + "step": 233 + }, + { + "epoch": 0.00417365248100453, + "grad_norm": 2.854862928390503, + "learning_rate": 2.0870495897252947e-06, + "loss": 0.8445, + "step": 234 + }, + { + "epoch": 0.004191488602718225, + "grad_norm": 1.6057507991790771, + "learning_rate": 2.0959686050660007e-06, + "loss": 0.743, + "step": 235 + }, + { + "epoch": 0.004209324724431919, + "grad_norm": 1.6600632667541504, + "learning_rate": 2.104887620406707e-06, + "loss": 0.7014, + "step": 236 + }, + { + "epoch": 0.004227160846145614, + "grad_norm": 1.6484214067459106, + "learning_rate": 2.1138066357474134e-06, + "loss": 0.7308, + "step": 237 + }, + { + "epoch": 0.004244996967859308, + "grad_norm": 1.508714199066162, + "learning_rate": 2.12272565108812e-06, + "loss": 0.6619, + "step": 238 + }, + { + "epoch": 0.004262833089573003, + "grad_norm": 1.8737293481826782, + "learning_rate": 2.131644666428826e-06, + "loss": 0.7105, + "step": 239 + }, + { + "epoch": 0.004280669211286698, + "grad_norm": 2.0244300365448, + "learning_rate": 2.140563681769533e-06, + "loss": 0.6374, + "step": 240 + }, + { + "epoch": 0.004298505333000392, + "grad_norm": 1.944330096244812, + "learning_rate": 2.1494826971102394e-06, + "loss": 0.6326, + "step": 241 + }, + { + "epoch": 0.004316341454714087, + "grad_norm": 2.075991153717041, + "learning_rate": 2.1584017124509458e-06, + "loss": 0.7177, + "step": 242 + }, + { + "epoch": 0.004334177576427781, + "grad_norm": 1.7687220573425293, + "learning_rate": 2.167320727791652e-06, + "loss": 0.5802, + "step": 243 + }, + { + "epoch": 0.004352013698141476, + "grad_norm": 1.4437633752822876, + "learning_rate": 2.176239743132358e-06, + "loss": 0.5867, + "step": 244 + }, + { + "epoch": 0.004369849819855171, + "grad_norm": 1.7106932401657104, + "learning_rate": 2.1851587584730645e-06, + "loss": 0.6817, + "step": 245 + }, + { + "epoch": 0.004387685941568865, + "grad_norm": 1.6352781057357788, + "learning_rate": 2.194077773813771e-06, + "loss": 0.6019, + "step": 246 + }, + { + "epoch": 0.00440552206328256, + "grad_norm": 1.7784632444381714, + "learning_rate": 2.2029967891544772e-06, + "loss": 0.6686, + "step": 247 + }, + { + "epoch": 0.004423358184996254, + "grad_norm": 1.570668339729309, + "learning_rate": 2.211915804495184e-06, + "loss": 0.5983, + "step": 248 + }, + { + "epoch": 0.004441194306709949, + "grad_norm": 2.818324089050293, + "learning_rate": 2.2208348198358904e-06, + "loss": 0.6419, + "step": 249 + }, + { + "epoch": 0.004459030428423644, + "grad_norm": 1.8933470249176025, + "learning_rate": 2.229753835176597e-06, + "loss": 0.7321, + "step": 250 + }, + { + "epoch": 0.004476866550137338, + "grad_norm": 2.2469024658203125, + "learning_rate": 2.238672850517303e-06, + "loss": 0.6625, + "step": 251 + }, + { + "epoch": 0.004494702671851033, + "grad_norm": 2.026979684829712, + "learning_rate": 2.2475918658580096e-06, + "loss": 0.7948, + "step": 252 + }, + { + "epoch": 0.004512538793564727, + "grad_norm": 1.6033849716186523, + "learning_rate": 2.2565108811987155e-06, + "loss": 0.5719, + "step": 253 + }, + { + "epoch": 0.004530374915278422, + "grad_norm": 1.6025912761688232, + "learning_rate": 2.265429896539422e-06, + "loss": 0.5667, + "step": 254 + }, + { + "epoch": 0.004548211036992117, + "grad_norm": 1.8745354413986206, + "learning_rate": 2.2743489118801283e-06, + "loss": 0.578, + "step": 255 + }, + { + "epoch": 0.004566047158705811, + "grad_norm": 1.65565824508667, + "learning_rate": 2.283267927220835e-06, + "loss": 0.6516, + "step": 256 + }, + { + "epoch": 0.004583883280419506, + "grad_norm": 2.23405385017395, + "learning_rate": 2.2921869425615415e-06, + "loss": 0.6216, + "step": 257 + }, + { + "epoch": 0.0046017194021332, + "grad_norm": 1.5478591918945312, + "learning_rate": 2.301105957902248e-06, + "loss": 0.5952, + "step": 258 + }, + { + "epoch": 0.004619555523846895, + "grad_norm": 1.6051733493804932, + "learning_rate": 2.3100249732429542e-06, + "loss": 0.5946, + "step": 259 + }, + { + "epoch": 0.00463739164556059, + "grad_norm": 2.2132322788238525, + "learning_rate": 2.3189439885836606e-06, + "loss": 0.6655, + "step": 260 + }, + { + "epoch": 0.004655227767274284, + "grad_norm": 1.4680333137512207, + "learning_rate": 2.327863003924367e-06, + "loss": 0.5921, + "step": 261 + }, + { + "epoch": 0.004673063888987979, + "grad_norm": 1.6973994970321655, + "learning_rate": 2.336782019265073e-06, + "loss": 0.6678, + "step": 262 + }, + { + "epoch": 0.004690900010701673, + "grad_norm": 1.3919556140899658, + "learning_rate": 2.3457010346057793e-06, + "loss": 0.6527, + "step": 263 + }, + { + "epoch": 0.0047087361324153676, + "grad_norm": 1.5178241729736328, + "learning_rate": 2.354620049946486e-06, + "loss": 0.6835, + "step": 264 + }, + { + "epoch": 0.0047265722541290625, + "grad_norm": 2.493825912475586, + "learning_rate": 2.3635390652871925e-06, + "loss": 0.6242, + "step": 265 + }, + { + "epoch": 0.0047444083758427565, + "grad_norm": 1.703263759613037, + "learning_rate": 2.372458080627899e-06, + "loss": 0.5809, + "step": 266 + }, + { + "epoch": 0.0047622444975564514, + "grad_norm": 1.5449261665344238, + "learning_rate": 2.3813770959686053e-06, + "loss": 0.5676, + "step": 267 + }, + { + "epoch": 0.0047800806192701455, + "grad_norm": 1.3614991903305054, + "learning_rate": 2.3902961113093116e-06, + "loss": 0.5449, + "step": 268 + }, + { + "epoch": 0.00479791674098384, + "grad_norm": 1.0881032943725586, + "learning_rate": 2.399215126650018e-06, + "loss": 0.5325, + "step": 269 + }, + { + "epoch": 0.004815752862697535, + "grad_norm": 2.1187965869903564, + "learning_rate": 2.4081341419907244e-06, + "loss": 0.5731, + "step": 270 + }, + { + "epoch": 0.004833588984411229, + "grad_norm": 1.9353604316711426, + "learning_rate": 2.4170531573314308e-06, + "loss": 0.5831, + "step": 271 + }, + { + "epoch": 0.004851425106124924, + "grad_norm": 1.7603288888931274, + "learning_rate": 2.425972172672137e-06, + "loss": 0.6485, + "step": 272 + }, + { + "epoch": 0.004869261227838618, + "grad_norm": 1.321537733078003, + "learning_rate": 2.4348911880128435e-06, + "loss": 0.6277, + "step": 273 + }, + { + "epoch": 0.004887097349552313, + "grad_norm": 2.3718101978302, + "learning_rate": 2.44381020335355e-06, + "loss": 0.6817, + "step": 274 + }, + { + "epoch": 0.004904933471266008, + "grad_norm": 1.0946693420410156, + "learning_rate": 2.4527292186942563e-06, + "loss": 0.5611, + "step": 275 + }, + { + "epoch": 0.004922769592979702, + "grad_norm": 1.7237051725387573, + "learning_rate": 2.4616482340349627e-06, + "loss": 0.6627, + "step": 276 + }, + { + "epoch": 0.004940605714693397, + "grad_norm": 2.576864242553711, + "learning_rate": 2.470567249375669e-06, + "loss": 0.695, + "step": 277 + }, + { + "epoch": 0.004958441836407091, + "grad_norm": 1.3089861869812012, + "learning_rate": 2.4794862647163754e-06, + "loss": 0.5667, + "step": 278 + }, + { + "epoch": 0.004976277958120786, + "grad_norm": 1.6324995756149292, + "learning_rate": 2.488405280057082e-06, + "loss": 0.6064, + "step": 279 + }, + { + "epoch": 0.004994114079834481, + "grad_norm": 1.286194920539856, + "learning_rate": 2.497324295397788e-06, + "loss": 0.6176, + "step": 280 + }, + { + "epoch": 0.005011950201548175, + "grad_norm": 1.4862664937973022, + "learning_rate": 2.5062433107384946e-06, + "loss": 0.5522, + "step": 281 + }, + { + "epoch": 0.00502978632326187, + "grad_norm": 1.4922723770141602, + "learning_rate": 2.515162326079201e-06, + "loss": 0.5496, + "step": 282 + }, + { + "epoch": 0.005047622444975564, + "grad_norm": 1.2583637237548828, + "learning_rate": 2.5240813414199073e-06, + "loss": 0.5268, + "step": 283 + }, + { + "epoch": 0.005065458566689259, + "grad_norm": 1.5133681297302246, + "learning_rate": 2.5330003567606137e-06, + "loss": 0.5266, + "step": 284 + }, + { + "epoch": 0.005083294688402954, + "grad_norm": 1.3877657651901245, + "learning_rate": 2.54191937210132e-06, + "loss": 0.6455, + "step": 285 + }, + { + "epoch": 0.005101130810116648, + "grad_norm": 2.116668462753296, + "learning_rate": 2.5508383874420265e-06, + "loss": 0.7159, + "step": 286 + }, + { + "epoch": 0.005118966931830343, + "grad_norm": 1.7665098905563354, + "learning_rate": 2.559757402782733e-06, + "loss": 0.6775, + "step": 287 + }, + { + "epoch": 0.005136803053544038, + "grad_norm": 1.3720237016677856, + "learning_rate": 2.5686764181234392e-06, + "loss": 0.6835, + "step": 288 + }, + { + "epoch": 0.005154639175257732, + "grad_norm": 1.302917242050171, + "learning_rate": 2.5775954334641456e-06, + "loss": 0.5711, + "step": 289 + }, + { + "epoch": 0.005172475296971427, + "grad_norm": 1.238458275794983, + "learning_rate": 2.586514448804852e-06, + "loss": 0.5214, + "step": 290 + }, + { + "epoch": 0.005190311418685121, + "grad_norm": 1.5740962028503418, + "learning_rate": 2.5954334641455584e-06, + "loss": 0.5021, + "step": 291 + }, + { + "epoch": 0.005208147540398816, + "grad_norm": 2.9247536659240723, + "learning_rate": 2.6043524794862648e-06, + "loss": 0.6231, + "step": 292 + }, + { + "epoch": 0.005225983662112511, + "grad_norm": 1.5866360664367676, + "learning_rate": 2.613271494826971e-06, + "loss": 0.5794, + "step": 293 + }, + { + "epoch": 0.005243819783826205, + "grad_norm": 2.219635248184204, + "learning_rate": 2.6221905101676775e-06, + "loss": 0.6279, + "step": 294 + }, + { + "epoch": 0.0052616559055399, + "grad_norm": 1.5627270936965942, + "learning_rate": 2.631109525508384e-06, + "loss": 0.6574, + "step": 295 + }, + { + "epoch": 0.005279492027253594, + "grad_norm": 1.6502068042755127, + "learning_rate": 2.6400285408490903e-06, + "loss": 0.5, + "step": 296 + }, + { + "epoch": 0.005297328148967289, + "grad_norm": 1.1381497383117676, + "learning_rate": 2.6489475561897967e-06, + "loss": 0.5804, + "step": 297 + }, + { + "epoch": 0.0053151642706809835, + "grad_norm": 2.142641067504883, + "learning_rate": 2.6578665715305035e-06, + "loss": 0.5662, + "step": 298 + }, + { + "epoch": 0.005333000392394678, + "grad_norm": 1.1981918811798096, + "learning_rate": 2.6667855868712094e-06, + "loss": 0.5862, + "step": 299 + }, + { + "epoch": 0.0053508365141083725, + "grad_norm": 1.5230048894882202, + "learning_rate": 2.675704602211916e-06, + "loss": 0.5038, + "step": 300 + }, + { + "epoch": 0.0053686726358220666, + "grad_norm": 1.82878577709198, + "learning_rate": 2.684623617552622e-06, + "loss": 0.4595, + "step": 301 + }, + { + "epoch": 0.0053865087575357615, + "grad_norm": 1.7125508785247803, + "learning_rate": 2.6935426328933286e-06, + "loss": 0.5876, + "step": 302 + }, + { + "epoch": 0.005404344879249456, + "grad_norm": 1.678423285484314, + "learning_rate": 2.702461648234035e-06, + "loss": 0.627, + "step": 303 + }, + { + "epoch": 0.0054221810009631504, + "grad_norm": 2.2996435165405273, + "learning_rate": 2.7113806635747413e-06, + "loss": 0.6109, + "step": 304 + }, + { + "epoch": 0.005440017122676845, + "grad_norm": 3.5018179416656494, + "learning_rate": 2.720299678915448e-06, + "loss": 0.6813, + "step": 305 + }, + { + "epoch": 0.005457853244390539, + "grad_norm": 1.7643263339996338, + "learning_rate": 2.7292186942561545e-06, + "loss": 0.4911, + "step": 306 + }, + { + "epoch": 0.005475689366104234, + "grad_norm": 1.4608845710754395, + "learning_rate": 2.738137709596861e-06, + "loss": 0.5692, + "step": 307 + }, + { + "epoch": 0.005493525487817929, + "grad_norm": 1.527064561843872, + "learning_rate": 2.747056724937567e-06, + "loss": 0.5854, + "step": 308 + }, + { + "epoch": 0.005511361609531623, + "grad_norm": 1.1883187294006348, + "learning_rate": 2.7559757402782732e-06, + "loss": 0.4484, + "step": 309 + }, + { + "epoch": 0.005529197731245318, + "grad_norm": 1.9517115354537964, + "learning_rate": 2.7648947556189796e-06, + "loss": 0.5218, + "step": 310 + }, + { + "epoch": 0.005547033852959012, + "grad_norm": 1.6211800575256348, + "learning_rate": 2.773813770959686e-06, + "loss": 0.6561, + "step": 311 + }, + { + "epoch": 0.005564869974672707, + "grad_norm": 1.3129562139511108, + "learning_rate": 2.7827327863003924e-06, + "loss": 0.5958, + "step": 312 + }, + { + "epoch": 0.005582706096386402, + "grad_norm": 1.5863853693008423, + "learning_rate": 2.791651801641099e-06, + "loss": 0.5121, + "step": 313 + }, + { + "epoch": 0.005600542218100096, + "grad_norm": 1.2407137155532837, + "learning_rate": 2.8005708169818056e-06, + "loss": 0.5192, + "step": 314 + }, + { + "epoch": 0.005618378339813791, + "grad_norm": 1.194892406463623, + "learning_rate": 2.809489832322512e-06, + "loss": 0.5311, + "step": 315 + }, + { + "epoch": 0.005636214461527485, + "grad_norm": 1.25631844997406, + "learning_rate": 2.8184088476632183e-06, + "loss": 0.591, + "step": 316 + }, + { + "epoch": 0.00565405058324118, + "grad_norm": 1.6166402101516724, + "learning_rate": 2.8273278630039247e-06, + "loss": 0.5415, + "step": 317 + }, + { + "epoch": 0.005671886704954875, + "grad_norm": 1.407192587852478, + "learning_rate": 2.8362468783446307e-06, + "loss": 0.5413, + "step": 318 + }, + { + "epoch": 0.005689722826668569, + "grad_norm": 1.281314492225647, + "learning_rate": 2.845165893685337e-06, + "loss": 0.4938, + "step": 319 + }, + { + "epoch": 0.005707558948382264, + "grad_norm": 1.1050326824188232, + "learning_rate": 2.8540849090260434e-06, + "loss": 0.5446, + "step": 320 + }, + { + "epoch": 0.005725395070095958, + "grad_norm": 1.2858392000198364, + "learning_rate": 2.8630039243667502e-06, + "loss": 0.5782, + "step": 321 + }, + { + "epoch": 0.005743231191809653, + "grad_norm": 1.362825870513916, + "learning_rate": 2.8719229397074566e-06, + "loss": 0.5673, + "step": 322 + }, + { + "epoch": 0.005761067313523348, + "grad_norm": 2.028330087661743, + "learning_rate": 2.880841955048163e-06, + "loss": 0.6749, + "step": 323 + }, + { + "epoch": 0.005778903435237042, + "grad_norm": 1.6358400583267212, + "learning_rate": 2.8897609703888694e-06, + "loss": 0.586, + "step": 324 + }, + { + "epoch": 0.005796739556950737, + "grad_norm": 1.5424126386642456, + "learning_rate": 2.8986799857295757e-06, + "loss": 0.4897, + "step": 325 + }, + { + "epoch": 0.005814575678664431, + "grad_norm": 1.7203178405761719, + "learning_rate": 2.907599001070282e-06, + "loss": 0.5751, + "step": 326 + }, + { + "epoch": 0.005832411800378126, + "grad_norm": 1.8745055198669434, + "learning_rate": 2.916518016410988e-06, + "loss": 0.512, + "step": 327 + }, + { + "epoch": 0.005850247922091821, + "grad_norm": 1.4206955432891846, + "learning_rate": 2.9254370317516945e-06, + "loss": 0.519, + "step": 328 + }, + { + "epoch": 0.005868084043805515, + "grad_norm": 1.4659600257873535, + "learning_rate": 2.9343560470924013e-06, + "loss": 0.5376, + "step": 329 + }, + { + "epoch": 0.00588592016551921, + "grad_norm": 1.2667752504348755, + "learning_rate": 2.9432750624331076e-06, + "loss": 0.4819, + "step": 330 + }, + { + "epoch": 0.005903756287232904, + "grad_norm": 1.199530839920044, + "learning_rate": 2.952194077773814e-06, + "loss": 0.5012, + "step": 331 + }, + { + "epoch": 0.005921592408946599, + "grad_norm": 1.3267625570297241, + "learning_rate": 2.9611130931145204e-06, + "loss": 0.4084, + "step": 332 + }, + { + "epoch": 0.0059394285306602936, + "grad_norm": 1.6121457815170288, + "learning_rate": 2.970032108455227e-06, + "loss": 0.4773, + "step": 333 + }, + { + "epoch": 0.005957264652373988, + "grad_norm": 1.4322900772094727, + "learning_rate": 2.978951123795933e-06, + "loss": 0.4992, + "step": 334 + }, + { + "epoch": 0.0059751007740876825, + "grad_norm": 1.4893734455108643, + "learning_rate": 2.9878701391366396e-06, + "loss": 0.4496, + "step": 335 + }, + { + "epoch": 0.005992936895801377, + "grad_norm": 1.2784552574157715, + "learning_rate": 2.9967891544773455e-06, + "loss": 0.5306, + "step": 336 + }, + { + "epoch": 0.0060107730175150715, + "grad_norm": 2.11749529838562, + "learning_rate": 3.0057081698180523e-06, + "loss": 0.6142, + "step": 337 + }, + { + "epoch": 0.006028609139228766, + "grad_norm": 3.343623399734497, + "learning_rate": 3.0146271851587587e-06, + "loss": 0.4995, + "step": 338 + }, + { + "epoch": 0.0060464452609424605, + "grad_norm": 1.6374093294143677, + "learning_rate": 3.023546200499465e-06, + "loss": 0.5425, + "step": 339 + }, + { + "epoch": 0.006064281382656155, + "grad_norm": 1.5746098756790161, + "learning_rate": 3.0324652158401715e-06, + "loss": 0.5573, + "step": 340 + }, + { + "epoch": 0.0060821175043698494, + "grad_norm": 1.7472163438796997, + "learning_rate": 3.041384231180878e-06, + "loss": 0.5513, + "step": 341 + }, + { + "epoch": 0.006099953626083544, + "grad_norm": 1.0055726766586304, + "learning_rate": 3.0503032465215842e-06, + "loss": 0.4851, + "step": 342 + }, + { + "epoch": 0.006117789747797239, + "grad_norm": 1.4767462015151978, + "learning_rate": 3.0592222618622906e-06, + "loss": 0.4739, + "step": 343 + }, + { + "epoch": 0.006135625869510933, + "grad_norm": 1.3134757280349731, + "learning_rate": 3.068141277202997e-06, + "loss": 0.5837, + "step": 344 + }, + { + "epoch": 0.006153461991224628, + "grad_norm": 1.0430761575698853, + "learning_rate": 3.0770602925437034e-06, + "loss": 0.5398, + "step": 345 + }, + { + "epoch": 0.006171298112938322, + "grad_norm": 1.345146894454956, + "learning_rate": 3.0859793078844097e-06, + "loss": 0.4762, + "step": 346 + }, + { + "epoch": 0.006189134234652017, + "grad_norm": 1.6122509241104126, + "learning_rate": 3.094898323225116e-06, + "loss": 0.479, + "step": 347 + }, + { + "epoch": 0.006206970356365712, + "grad_norm": 1.1161093711853027, + "learning_rate": 3.1038173385658225e-06, + "loss": 0.5598, + "step": 348 + }, + { + "epoch": 0.006224806478079406, + "grad_norm": 1.5138565301895142, + "learning_rate": 3.112736353906529e-06, + "loss": 0.4709, + "step": 349 + }, + { + "epoch": 0.006242642599793101, + "grad_norm": 1.3539141416549683, + "learning_rate": 3.1216553692472353e-06, + "loss": 0.4499, + "step": 350 + }, + { + "epoch": 0.006260478721506795, + "grad_norm": 1.2136832475662231, + "learning_rate": 3.1305743845879412e-06, + "loss": 0.4589, + "step": 351 + }, + { + "epoch": 0.00627831484322049, + "grad_norm": 1.1060885190963745, + "learning_rate": 3.139493399928648e-06, + "loss": 0.5172, + "step": 352 + }, + { + "epoch": 0.006296150964934185, + "grad_norm": 1.2628618478775024, + "learning_rate": 3.1484124152693544e-06, + "loss": 0.5603, + "step": 353 + }, + { + "epoch": 0.006313987086647879, + "grad_norm": 1.2681447267532349, + "learning_rate": 3.1573314306100608e-06, + "loss": 0.5185, + "step": 354 + }, + { + "epoch": 0.006331823208361574, + "grad_norm": 1.3920072317123413, + "learning_rate": 3.166250445950767e-06, + "loss": 0.4208, + "step": 355 + }, + { + "epoch": 0.006349659330075268, + "grad_norm": 1.335545301437378, + "learning_rate": 3.175169461291474e-06, + "loss": 0.4835, + "step": 356 + }, + { + "epoch": 0.006367495451788963, + "grad_norm": 1.1903812885284424, + "learning_rate": 3.18408847663218e-06, + "loss": 0.4654, + "step": 357 + }, + { + "epoch": 0.006385331573502658, + "grad_norm": 1.6971962451934814, + "learning_rate": 3.1930074919728867e-06, + "loss": 0.6008, + "step": 358 + }, + { + "epoch": 0.006403167695216352, + "grad_norm": 1.1480566263198853, + "learning_rate": 3.2019265073135927e-06, + "loss": 0.4019, + "step": 359 + }, + { + "epoch": 0.006421003816930047, + "grad_norm": 2.0387771129608154, + "learning_rate": 3.2108455226542986e-06, + "loss": 0.5006, + "step": 360 + }, + { + "epoch": 0.006438839938643741, + "grad_norm": 1.2148463726043701, + "learning_rate": 3.2197645379950054e-06, + "loss": 0.4913, + "step": 361 + }, + { + "epoch": 0.006456676060357436, + "grad_norm": 2.1562607288360596, + "learning_rate": 3.228683553335712e-06, + "loss": 0.514, + "step": 362 + }, + { + "epoch": 0.006474512182071131, + "grad_norm": 1.2904720306396484, + "learning_rate": 3.2376025686764186e-06, + "loss": 0.4266, + "step": 363 + }, + { + "epoch": 0.006492348303784825, + "grad_norm": 1.3285740613937378, + "learning_rate": 3.2465215840171246e-06, + "loss": 0.4309, + "step": 364 + }, + { + "epoch": 0.00651018442549852, + "grad_norm": 1.137514591217041, + "learning_rate": 3.2554405993578314e-06, + "loss": 0.5002, + "step": 365 + }, + { + "epoch": 0.006528020547212215, + "grad_norm": 1.4586100578308105, + "learning_rate": 3.2643596146985373e-06, + "loss": 0.5156, + "step": 366 + }, + { + "epoch": 0.006545856668925909, + "grad_norm": 1.1244300603866577, + "learning_rate": 3.273278630039244e-06, + "loss": 0.4338, + "step": 367 + }, + { + "epoch": 0.006563692790639604, + "grad_norm": 1.8239957094192505, + "learning_rate": 3.28219764537995e-06, + "loss": 0.4752, + "step": 368 + }, + { + "epoch": 0.006581528912353298, + "grad_norm": 1.3585281372070312, + "learning_rate": 3.2911166607206565e-06, + "loss": 0.5186, + "step": 369 + }, + { + "epoch": 0.0065993650340669925, + "grad_norm": 1.4697260856628418, + "learning_rate": 3.300035676061363e-06, + "loss": 0.53, + "step": 370 + }, + { + "epoch": 0.0066172011557806875, + "grad_norm": 1.2938551902770996, + "learning_rate": 3.3089546914020692e-06, + "loss": 0.4337, + "step": 371 + }, + { + "epoch": 0.0066350372774943815, + "grad_norm": 1.4707707166671753, + "learning_rate": 3.317873706742776e-06, + "loss": 0.482, + "step": 372 + }, + { + "epoch": 0.0066528733992080764, + "grad_norm": 2.7919442653656006, + "learning_rate": 3.326792722083482e-06, + "loss": 0.4809, + "step": 373 + }, + { + "epoch": 0.0066707095209217705, + "grad_norm": 1.4272215366363525, + "learning_rate": 3.335711737424189e-06, + "loss": 0.5062, + "step": 374 + }, + { + "epoch": 0.006688545642635465, + "grad_norm": 1.4810606241226196, + "learning_rate": 3.3446307527648948e-06, + "loss": 0.5378, + "step": 375 + }, + { + "epoch": 0.00670638176434916, + "grad_norm": 1.3353627920150757, + "learning_rate": 3.3535497681056016e-06, + "loss": 0.5338, + "step": 376 + }, + { + "epoch": 0.006724217886062854, + "grad_norm": 1.8348220586776733, + "learning_rate": 3.3624687834463075e-06, + "loss": 0.4637, + "step": 377 + }, + { + "epoch": 0.006742054007776549, + "grad_norm": 1.79783034324646, + "learning_rate": 3.371387798787014e-06, + "loss": 0.5172, + "step": 378 + }, + { + "epoch": 0.006759890129490243, + "grad_norm": 1.573477864265442, + "learning_rate": 3.3803068141277207e-06, + "loss": 0.4414, + "step": 379 + }, + { + "epoch": 0.006777726251203938, + "grad_norm": 1.387686848640442, + "learning_rate": 3.3892258294684267e-06, + "loss": 0.4532, + "step": 380 + }, + { + "epoch": 0.006795562372917633, + "grad_norm": 1.5073461532592773, + "learning_rate": 3.3981448448091335e-06, + "loss": 0.5559, + "step": 381 + }, + { + "epoch": 0.006813398494631327, + "grad_norm": 1.1114751100540161, + "learning_rate": 3.4070638601498394e-06, + "loss": 0.4802, + "step": 382 + }, + { + "epoch": 0.006831234616345022, + "grad_norm": 1.208899974822998, + "learning_rate": 3.4159828754905462e-06, + "loss": 0.4944, + "step": 383 + }, + { + "epoch": 0.006849070738058716, + "grad_norm": 1.5076521635055542, + "learning_rate": 3.424901890831252e-06, + "loss": 0.4995, + "step": 384 + }, + { + "epoch": 0.006866906859772411, + "grad_norm": 1.2716889381408691, + "learning_rate": 3.433820906171959e-06, + "loss": 0.4668, + "step": 385 + }, + { + "epoch": 0.006884742981486106, + "grad_norm": 1.4431092739105225, + "learning_rate": 3.4427399215126654e-06, + "loss": 0.4506, + "step": 386 + }, + { + "epoch": 0.0069025791031998, + "grad_norm": 1.7871001958847046, + "learning_rate": 3.4516589368533713e-06, + "loss": 0.4809, + "step": 387 + }, + { + "epoch": 0.006920415224913495, + "grad_norm": 1.1170417070388794, + "learning_rate": 3.460577952194078e-06, + "loss": 0.4382, + "step": 388 + }, + { + "epoch": 0.006938251346627189, + "grad_norm": 2.742556571960449, + "learning_rate": 3.469496967534784e-06, + "loss": 0.4807, + "step": 389 + }, + { + "epoch": 0.006956087468340884, + "grad_norm": 1.1922411918640137, + "learning_rate": 3.478415982875491e-06, + "loss": 0.4868, + "step": 390 + }, + { + "epoch": 0.006973923590054579, + "grad_norm": 1.179193139076233, + "learning_rate": 3.487334998216197e-06, + "loss": 0.4649, + "step": 391 + }, + { + "epoch": 0.006991759711768273, + "grad_norm": 1.147597074508667, + "learning_rate": 3.4962540135569037e-06, + "loss": 0.4213, + "step": 392 + }, + { + "epoch": 0.007009595833481968, + "grad_norm": 1.1630451679229736, + "learning_rate": 3.5051730288976096e-06, + "loss": 0.4022, + "step": 393 + }, + { + "epoch": 0.007027431955195662, + "grad_norm": 1.588491439819336, + "learning_rate": 3.5140920442383164e-06, + "loss": 0.5299, + "step": 394 + }, + { + "epoch": 0.007045268076909357, + "grad_norm": 2.2154128551483154, + "learning_rate": 3.523011059579023e-06, + "loss": 0.5224, + "step": 395 + }, + { + "epoch": 0.007063104198623052, + "grad_norm": 2.1236326694488525, + "learning_rate": 3.5319300749197288e-06, + "loss": 0.5057, + "step": 396 + }, + { + "epoch": 0.007080940320336746, + "grad_norm": 1.2421132326126099, + "learning_rate": 3.5408490902604356e-06, + "loss": 0.5127, + "step": 397 + }, + { + "epoch": 0.007098776442050441, + "grad_norm": 1.6268608570098877, + "learning_rate": 3.5497681056011415e-06, + "loss": 0.5657, + "step": 398 + }, + { + "epoch": 0.007116612563764135, + "grad_norm": 1.184870719909668, + "learning_rate": 3.5586871209418483e-06, + "loss": 0.4631, + "step": 399 + }, + { + "epoch": 0.00713444868547783, + "grad_norm": 1.7901984453201294, + "learning_rate": 3.5676061362825543e-06, + "loss": 0.4324, + "step": 400 + }, + { + "epoch": 0.007152284807191525, + "grad_norm": 1.1787022352218628, + "learning_rate": 3.576525151623261e-06, + "loss": 0.4364, + "step": 401 + }, + { + "epoch": 0.007170120928905219, + "grad_norm": 2.1964609622955322, + "learning_rate": 3.5854441669639675e-06, + "loss": 0.6286, + "step": 402 + }, + { + "epoch": 0.007187957050618914, + "grad_norm": 5.040952205657959, + "learning_rate": 3.594363182304674e-06, + "loss": 0.4476, + "step": 403 + }, + { + "epoch": 0.007205793172332608, + "grad_norm": 1.3974846601486206, + "learning_rate": 3.6032821976453802e-06, + "loss": 0.5002, + "step": 404 + }, + { + "epoch": 0.007223629294046303, + "grad_norm": 1.2034550905227661, + "learning_rate": 3.612201212986086e-06, + "loss": 0.4535, + "step": 405 + }, + { + "epoch": 0.0072414654157599975, + "grad_norm": 1.8403637409210205, + "learning_rate": 3.621120228326793e-06, + "loss": 0.5211, + "step": 406 + }, + { + "epoch": 0.0072593015374736915, + "grad_norm": 2.155287504196167, + "learning_rate": 3.630039243667499e-06, + "loss": 0.5379, + "step": 407 + }, + { + "epoch": 0.0072771376591873865, + "grad_norm": 1.310027003288269, + "learning_rate": 3.6389582590082057e-06, + "loss": 0.5387, + "step": 408 + }, + { + "epoch": 0.0072949737809010805, + "grad_norm": 1.2275962829589844, + "learning_rate": 3.6478772743489117e-06, + "loss": 0.4531, + "step": 409 + }, + { + "epoch": 0.007312809902614775, + "grad_norm": 1.025327444076538, + "learning_rate": 3.6567962896896185e-06, + "loss": 0.4605, + "step": 410 + }, + { + "epoch": 0.00733064602432847, + "grad_norm": 1.4332187175750732, + "learning_rate": 3.665715305030325e-06, + "loss": 0.6221, + "step": 411 + }, + { + "epoch": 0.007348482146042164, + "grad_norm": 1.6840649843215942, + "learning_rate": 3.6746343203710317e-06, + "loss": 0.4743, + "step": 412 + }, + { + "epoch": 0.007366318267755859, + "grad_norm": 1.555158019065857, + "learning_rate": 3.6835533357117376e-06, + "loss": 0.4867, + "step": 413 + }, + { + "epoch": 0.007384154389469553, + "grad_norm": 1.8823628425598145, + "learning_rate": 3.6924723510524436e-06, + "loss": 0.3919, + "step": 414 + }, + { + "epoch": 0.007401990511183248, + "grad_norm": 1.327907681465149, + "learning_rate": 3.7013913663931504e-06, + "loss": 0.4655, + "step": 415 + }, + { + "epoch": 0.007419826632896943, + "grad_norm": 1.3640567064285278, + "learning_rate": 3.7103103817338564e-06, + "loss": 0.4552, + "step": 416 + }, + { + "epoch": 0.007437662754610637, + "grad_norm": 1.4085242748260498, + "learning_rate": 3.719229397074563e-06, + "loss": 0.5038, + "step": 417 + }, + { + "epoch": 0.007455498876324332, + "grad_norm": 1.4482516050338745, + "learning_rate": 3.7281484124152695e-06, + "loss": 0.5309, + "step": 418 + }, + { + "epoch": 0.007473334998038026, + "grad_norm": 1.718651533126831, + "learning_rate": 3.737067427755976e-06, + "loss": 0.4717, + "step": 419 + }, + { + "epoch": 0.007491171119751721, + "grad_norm": 1.246842622756958, + "learning_rate": 3.7459864430966823e-06, + "loss": 0.5153, + "step": 420 + }, + { + "epoch": 0.007509007241465416, + "grad_norm": 1.363585114479065, + "learning_rate": 3.754905458437389e-06, + "loss": 0.4304, + "step": 421 + }, + { + "epoch": 0.00752684336317911, + "grad_norm": 1.3058159351348877, + "learning_rate": 3.763824473778095e-06, + "loss": 0.5269, + "step": 422 + }, + { + "epoch": 0.007544679484892805, + "grad_norm": 1.8864489793777466, + "learning_rate": 3.772743489118801e-06, + "loss": 0.4435, + "step": 423 + }, + { + "epoch": 0.007562515606606499, + "grad_norm": 3.019227981567383, + "learning_rate": 3.781662504459508e-06, + "loss": 0.4389, + "step": 424 + }, + { + "epoch": 0.007580351728320194, + "grad_norm": 1.8623355627059937, + "learning_rate": 3.7905815198002138e-06, + "loss": 0.5844, + "step": 425 + }, + { + "epoch": 0.007598187850033889, + "grad_norm": 3.843503713607788, + "learning_rate": 3.7995005351409206e-06, + "loss": 0.3918, + "step": 426 + }, + { + "epoch": 0.007616023971747583, + "grad_norm": 1.4885210990905762, + "learning_rate": 3.808419550481627e-06, + "loss": 0.5026, + "step": 427 + }, + { + "epoch": 0.007633860093461278, + "grad_norm": 1.3066977262496948, + "learning_rate": 3.817338565822333e-06, + "loss": 0.4086, + "step": 428 + }, + { + "epoch": 0.007651696215174972, + "grad_norm": 1.1006132364273071, + "learning_rate": 3.82625758116304e-06, + "loss": 0.4779, + "step": 429 + }, + { + "epoch": 0.007669532336888667, + "grad_norm": 1.1822632551193237, + "learning_rate": 3.835176596503746e-06, + "loss": 0.4337, + "step": 430 + }, + { + "epoch": 0.007687368458602362, + "grad_norm": 1.4812755584716797, + "learning_rate": 3.8440956118444525e-06, + "loss": 0.5076, + "step": 431 + }, + { + "epoch": 0.007705204580316056, + "grad_norm": 1.482459545135498, + "learning_rate": 3.853014627185159e-06, + "loss": 0.3985, + "step": 432 + }, + { + "epoch": 0.007723040702029751, + "grad_norm": 1.344508171081543, + "learning_rate": 3.861933642525865e-06, + "loss": 0.4249, + "step": 433 + }, + { + "epoch": 0.007740876823743445, + "grad_norm": 2.2285561561584473, + "learning_rate": 3.870852657866572e-06, + "loss": 0.5626, + "step": 434 + }, + { + "epoch": 0.00775871294545714, + "grad_norm": 1.119579553604126, + "learning_rate": 3.879771673207278e-06, + "loss": 0.4266, + "step": 435 + }, + { + "epoch": 0.007776549067170835, + "grad_norm": 3.160632610321045, + "learning_rate": 3.888690688547984e-06, + "loss": 0.5444, + "step": 436 + }, + { + "epoch": 0.007794385188884529, + "grad_norm": 1.4120960235595703, + "learning_rate": 3.897609703888691e-06, + "loss": 0.4857, + "step": 437 + }, + { + "epoch": 0.007812221310598224, + "grad_norm": 1.0629377365112305, + "learning_rate": 3.906528719229397e-06, + "loss": 0.5042, + "step": 438 + }, + { + "epoch": 0.007830057432311919, + "grad_norm": 1.1580361127853394, + "learning_rate": 3.9154477345701035e-06, + "loss": 0.4843, + "step": 439 + }, + { + "epoch": 0.007847893554025613, + "grad_norm": 1.8523987531661987, + "learning_rate": 3.92436674991081e-06, + "loss": 0.4615, + "step": 440 + }, + { + "epoch": 0.007865729675739307, + "grad_norm": 1.5833494663238525, + "learning_rate": 3.933285765251517e-06, + "loss": 0.5434, + "step": 441 + }, + { + "epoch": 0.007883565797453002, + "grad_norm": 2.201798677444458, + "learning_rate": 3.942204780592223e-06, + "loss": 0.478, + "step": 442 + }, + { + "epoch": 0.007901401919166696, + "grad_norm": 3.5636396408081055, + "learning_rate": 3.951123795932929e-06, + "loss": 0.5597, + "step": 443 + }, + { + "epoch": 0.007919238040880391, + "grad_norm": 1.3248926401138306, + "learning_rate": 3.9600428112736354e-06, + "loss": 0.5368, + "step": 444 + }, + { + "epoch": 0.007937074162594086, + "grad_norm": 1.5661994218826294, + "learning_rate": 3.968961826614342e-06, + "loss": 0.514, + "step": 445 + }, + { + "epoch": 0.00795491028430778, + "grad_norm": 1.6248027086257935, + "learning_rate": 3.977880841955048e-06, + "loss": 0.4774, + "step": 446 + }, + { + "epoch": 0.007972746406021474, + "grad_norm": 1.3313586711883545, + "learning_rate": 3.986799857295755e-06, + "loss": 0.5059, + "step": 447 + }, + { + "epoch": 0.00799058252773517, + "grad_norm": 1.2870584726333618, + "learning_rate": 3.995718872636462e-06, + "loss": 0.5068, + "step": 448 + }, + { + "epoch": 0.008008418649448864, + "grad_norm": 1.2870802879333496, + "learning_rate": 4.004637887977167e-06, + "loss": 0.4465, + "step": 449 + }, + { + "epoch": 0.00802625477116256, + "grad_norm": 1.390815019607544, + "learning_rate": 4.0135569033178746e-06, + "loss": 0.5422, + "step": 450 + }, + { + "epoch": 0.008044090892876252, + "grad_norm": 1.075946569442749, + "learning_rate": 4.02247591865858e-06, + "loss": 0.44, + "step": 451 + }, + { + "epoch": 0.008061927014589947, + "grad_norm": 0.9873358011245728, + "learning_rate": 4.0313949339992865e-06, + "loss": 0.4567, + "step": 452 + }, + { + "epoch": 0.008079763136303642, + "grad_norm": 1.2685288190841675, + "learning_rate": 4.040313949339993e-06, + "loss": 0.5163, + "step": 453 + }, + { + "epoch": 0.008097599258017337, + "grad_norm": 1.3385316133499146, + "learning_rate": 4.049232964680699e-06, + "loss": 0.4355, + "step": 454 + }, + { + "epoch": 0.008115435379731032, + "grad_norm": 1.371286392211914, + "learning_rate": 4.058151980021406e-06, + "loss": 0.4362, + "step": 455 + }, + { + "epoch": 0.008133271501444725, + "grad_norm": 0.9873465299606323, + "learning_rate": 4.067070995362112e-06, + "loss": 0.4447, + "step": 456 + }, + { + "epoch": 0.00815110762315842, + "grad_norm": 6.998069763183594, + "learning_rate": 4.075990010702819e-06, + "loss": 0.5213, + "step": 457 + }, + { + "epoch": 0.008168943744872115, + "grad_norm": 1.1350767612457275, + "learning_rate": 4.084909026043525e-06, + "loss": 0.4208, + "step": 458 + }, + { + "epoch": 0.00818677986658581, + "grad_norm": 3.4948246479034424, + "learning_rate": 4.093828041384232e-06, + "loss": 0.5256, + "step": 459 + }, + { + "epoch": 0.008204615988299505, + "grad_norm": 0.9854333400726318, + "learning_rate": 4.1027470567249375e-06, + "loss": 0.4685, + "step": 460 + }, + { + "epoch": 0.008222452110013198, + "grad_norm": 1.5371170043945312, + "learning_rate": 4.111666072065644e-06, + "loss": 0.4657, + "step": 461 + }, + { + "epoch": 0.008240288231726893, + "grad_norm": 1.978710412979126, + "learning_rate": 4.12058508740635e-06, + "loss": 0.4877, + "step": 462 + }, + { + "epoch": 0.008258124353440588, + "grad_norm": 1.7735456228256226, + "learning_rate": 4.129504102747057e-06, + "loss": 0.5351, + "step": 463 + }, + { + "epoch": 0.008275960475154283, + "grad_norm": 1.8886734247207642, + "learning_rate": 4.138423118087764e-06, + "loss": 0.4245, + "step": 464 + }, + { + "epoch": 0.008293796596867978, + "grad_norm": 1.1141098737716675, + "learning_rate": 4.1473421334284694e-06, + "loss": 0.49, + "step": 465 + }, + { + "epoch": 0.008311632718581671, + "grad_norm": 1.384822130203247, + "learning_rate": 4.156261148769177e-06, + "loss": 0.472, + "step": 466 + }, + { + "epoch": 0.008329468840295366, + "grad_norm": 1.2739297151565552, + "learning_rate": 4.165180164109882e-06, + "loss": 0.4554, + "step": 467 + }, + { + "epoch": 0.00834730496200906, + "grad_norm": 2.041304111480713, + "learning_rate": 4.174099179450589e-06, + "loss": 0.4724, + "step": 468 + }, + { + "epoch": 0.008365141083722756, + "grad_norm": 1.703652262687683, + "learning_rate": 4.183018194791295e-06, + "loss": 0.4973, + "step": 469 + }, + { + "epoch": 0.00838297720543645, + "grad_norm": 1.402593970298767, + "learning_rate": 4.191937210132001e-06, + "loss": 0.4945, + "step": 470 + }, + { + "epoch": 0.008400813327150144, + "grad_norm": 2.099918842315674, + "learning_rate": 4.200856225472708e-06, + "loss": 0.4303, + "step": 471 + }, + { + "epoch": 0.008418649448863839, + "grad_norm": 0.8471234440803528, + "learning_rate": 4.209775240813414e-06, + "loss": 0.3937, + "step": 472 + }, + { + "epoch": 0.008436485570577534, + "grad_norm": 1.1729215383529663, + "learning_rate": 4.218694256154121e-06, + "loss": 0.4453, + "step": 473 + }, + { + "epoch": 0.008454321692291229, + "grad_norm": 2.4997663497924805, + "learning_rate": 4.227613271494827e-06, + "loss": 0.4948, + "step": 474 + }, + { + "epoch": 0.008472157814004923, + "grad_norm": 1.68740975856781, + "learning_rate": 4.236532286835534e-06, + "loss": 0.5015, + "step": 475 + }, + { + "epoch": 0.008489993935718617, + "grad_norm": 1.1653437614440918, + "learning_rate": 4.24545130217624e-06, + "loss": 0.4326, + "step": 476 + }, + { + "epoch": 0.008507830057432312, + "grad_norm": 1.3887028694152832, + "learning_rate": 4.254370317516947e-06, + "loss": 0.4663, + "step": 477 + }, + { + "epoch": 0.008525666179146007, + "grad_norm": 1.160707950592041, + "learning_rate": 4.263289332857652e-06, + "loss": 0.4161, + "step": 478 + }, + { + "epoch": 0.008543502300859701, + "grad_norm": 1.1170053482055664, + "learning_rate": 4.272208348198359e-06, + "loss": 0.451, + "step": 479 + }, + { + "epoch": 0.008561338422573396, + "grad_norm": 1.5061770677566528, + "learning_rate": 4.281127363539066e-06, + "loss": 0.5079, + "step": 480 + }, + { + "epoch": 0.00857917454428709, + "grad_norm": 3.0044429302215576, + "learning_rate": 4.2900463788797715e-06, + "loss": 0.4474, + "step": 481 + }, + { + "epoch": 0.008597010666000784, + "grad_norm": 1.0271059274673462, + "learning_rate": 4.298965394220479e-06, + "loss": 0.3791, + "step": 482 + }, + { + "epoch": 0.00861484678771448, + "grad_norm": 1.7308597564697266, + "learning_rate": 4.307884409561184e-06, + "loss": 0.5119, + "step": 483 + }, + { + "epoch": 0.008632682909428174, + "grad_norm": 1.008509874343872, + "learning_rate": 4.3168034249018915e-06, + "loss": 0.4223, + "step": 484 + }, + { + "epoch": 0.00865051903114187, + "grad_norm": 1.2962771654129028, + "learning_rate": 4.325722440242597e-06, + "loss": 0.5098, + "step": 485 + }, + { + "epoch": 0.008668355152855562, + "grad_norm": 1.059078335762024, + "learning_rate": 4.334641455583304e-06, + "loss": 0.3876, + "step": 486 + }, + { + "epoch": 0.008686191274569257, + "grad_norm": 1.275657296180725, + "learning_rate": 4.34356047092401e-06, + "loss": 0.4581, + "step": 487 + }, + { + "epoch": 0.008704027396282952, + "grad_norm": 1.208746075630188, + "learning_rate": 4.352479486264716e-06, + "loss": 0.4419, + "step": 488 + }, + { + "epoch": 0.008721863517996647, + "grad_norm": 1.7713290452957153, + "learning_rate": 4.361398501605423e-06, + "loss": 0.5964, + "step": 489 + }, + { + "epoch": 0.008739699639710342, + "grad_norm": 1.4434576034545898, + "learning_rate": 4.370317516946129e-06, + "loss": 0.4817, + "step": 490 + }, + { + "epoch": 0.008757535761424035, + "grad_norm": 1.8831814527511597, + "learning_rate": 4.379236532286836e-06, + "loss": 0.5045, + "step": 491 + }, + { + "epoch": 0.00877537188313773, + "grad_norm": 2.0850300788879395, + "learning_rate": 4.388155547627542e-06, + "loss": 0.5111, + "step": 492 + }, + { + "epoch": 0.008793208004851425, + "grad_norm": 1.17160964012146, + "learning_rate": 4.397074562968249e-06, + "loss": 0.4362, + "step": 493 + }, + { + "epoch": 0.00881104412656512, + "grad_norm": 2.0882840156555176, + "learning_rate": 4.4059935783089545e-06, + "loss": 0.3787, + "step": 494 + }, + { + "epoch": 0.008828880248278815, + "grad_norm": 4.528671741485596, + "learning_rate": 4.414912593649662e-06, + "loss": 0.4315, + "step": 495 + }, + { + "epoch": 0.008846716369992508, + "grad_norm": 1.4445877075195312, + "learning_rate": 4.423831608990368e-06, + "loss": 0.3801, + "step": 496 + }, + { + "epoch": 0.008864552491706203, + "grad_norm": 1.9293122291564941, + "learning_rate": 4.432750624331074e-06, + "loss": 0.4549, + "step": 497 + }, + { + "epoch": 0.008882388613419898, + "grad_norm": 1.4013196229934692, + "learning_rate": 4.441669639671781e-06, + "loss": 0.5211, + "step": 498 + }, + { + "epoch": 0.008900224735133593, + "grad_norm": 1.7515922784805298, + "learning_rate": 4.450588655012486e-06, + "loss": 0.4308, + "step": 499 + }, + { + "epoch": 0.008918060856847288, + "grad_norm": 3.94881272315979, + "learning_rate": 4.459507670353194e-06, + "loss": 0.5709, + "step": 500 + }, + { + "epoch": 0.008935896978560981, + "grad_norm": 3.567277431488037, + "learning_rate": 4.468426685693899e-06, + "loss": 0.4617, + "step": 501 + }, + { + "epoch": 0.008953733100274676, + "grad_norm": 1.5492817163467407, + "learning_rate": 4.477345701034606e-06, + "loss": 0.4814, + "step": 502 + }, + { + "epoch": 0.00897156922198837, + "grad_norm": 0.9941692352294922, + "learning_rate": 4.486264716375312e-06, + "loss": 0.4129, + "step": 503 + }, + { + "epoch": 0.008989405343702066, + "grad_norm": 1.9810105562210083, + "learning_rate": 4.495183731716019e-06, + "loss": 0.4271, + "step": 504 + }, + { + "epoch": 0.00900724146541576, + "grad_norm": 4.075789451599121, + "learning_rate": 4.5041027470567255e-06, + "loss": 0.5615, + "step": 505 + }, + { + "epoch": 0.009025077587129454, + "grad_norm": 0.9861582517623901, + "learning_rate": 4.513021762397431e-06, + "loss": 0.4656, + "step": 506 + }, + { + "epoch": 0.009042913708843149, + "grad_norm": 1.3125861883163452, + "learning_rate": 4.521940777738138e-06, + "loss": 0.477, + "step": 507 + }, + { + "epoch": 0.009060749830556844, + "grad_norm": 1.2995643615722656, + "learning_rate": 4.530859793078844e-06, + "loss": 0.4401, + "step": 508 + }, + { + "epoch": 0.009078585952270539, + "grad_norm": 1.8894366025924683, + "learning_rate": 4.539778808419551e-06, + "loss": 0.4924, + "step": 509 + }, + { + "epoch": 0.009096422073984234, + "grad_norm": 1.2428226470947266, + "learning_rate": 4.5486978237602565e-06, + "loss": 0.4792, + "step": 510 + }, + { + "epoch": 0.009114258195697927, + "grad_norm": 1.9830949306488037, + "learning_rate": 4.557616839100964e-06, + "loss": 0.4324, + "step": 511 + }, + { + "epoch": 0.009132094317411622, + "grad_norm": 1.3322826623916626, + "learning_rate": 4.56653585444167e-06, + "loss": 0.5436, + "step": 512 + }, + { + "epoch": 0.009149930439125317, + "grad_norm": 1.3594346046447754, + "learning_rate": 4.5754548697823765e-06, + "loss": 0.4621, + "step": 513 + }, + { + "epoch": 0.009167766560839011, + "grad_norm": 1.4227405786514282, + "learning_rate": 4.584373885123083e-06, + "loss": 0.4373, + "step": 514 + }, + { + "epoch": 0.009185602682552706, + "grad_norm": 1.657763123512268, + "learning_rate": 4.5932929004637884e-06, + "loss": 0.4845, + "step": 515 + }, + { + "epoch": 0.0092034388042664, + "grad_norm": 1.3375740051269531, + "learning_rate": 4.602211915804496e-06, + "loss": 0.4677, + "step": 516 + }, + { + "epoch": 0.009221274925980094, + "grad_norm": 1.4107847213745117, + "learning_rate": 4.611130931145201e-06, + "loss": 0.448, + "step": 517 + }, + { + "epoch": 0.00923911104769379, + "grad_norm": 1.0197649002075195, + "learning_rate": 4.6200499464859084e-06, + "loss": 0.4242, + "step": 518 + }, + { + "epoch": 0.009256947169407484, + "grad_norm": 1.2332050800323486, + "learning_rate": 4.628968961826614e-06, + "loss": 0.4631, + "step": 519 + }, + { + "epoch": 0.00927478329112118, + "grad_norm": 1.6185851097106934, + "learning_rate": 4.637887977167321e-06, + "loss": 0.471, + "step": 520 + }, + { + "epoch": 0.009292619412834872, + "grad_norm": 1.300220012664795, + "learning_rate": 4.6468069925080276e-06, + "loss": 0.4296, + "step": 521 + }, + { + "epoch": 0.009310455534548567, + "grad_norm": 1.1333004236221313, + "learning_rate": 4.655726007848734e-06, + "loss": 0.4188, + "step": 522 + }, + { + "epoch": 0.009328291656262262, + "grad_norm": 1.6971864700317383, + "learning_rate": 4.66464502318944e-06, + "loss": 0.5606, + "step": 523 + }, + { + "epoch": 0.009346127777975957, + "grad_norm": 1.00901198387146, + "learning_rate": 4.673564038530146e-06, + "loss": 0.4002, + "step": 524 + }, + { + "epoch": 0.009363963899689652, + "grad_norm": 1.1150277853012085, + "learning_rate": 4.682483053870853e-06, + "loss": 0.4832, + "step": 525 + }, + { + "epoch": 0.009381800021403345, + "grad_norm": 1.2056350708007812, + "learning_rate": 4.691402069211559e-06, + "loss": 0.4359, + "step": 526 + }, + { + "epoch": 0.00939963614311704, + "grad_norm": 1.6105479001998901, + "learning_rate": 4.700321084552266e-06, + "loss": 0.4294, + "step": 527 + }, + { + "epoch": 0.009417472264830735, + "grad_norm": 1.207774043083191, + "learning_rate": 4.709240099892972e-06, + "loss": 0.4171, + "step": 528 + }, + { + "epoch": 0.00943530838654443, + "grad_norm": 1.2855324745178223, + "learning_rate": 4.718159115233679e-06, + "loss": 0.4268, + "step": 529 + }, + { + "epoch": 0.009453144508258125, + "grad_norm": 1.6374174356460571, + "learning_rate": 4.727078130574385e-06, + "loss": 0.445, + "step": 530 + }, + { + "epoch": 0.009470980629971818, + "grad_norm": 1.2425190210342407, + "learning_rate": 4.735997145915091e-06, + "loss": 0.4469, + "step": 531 + }, + { + "epoch": 0.009488816751685513, + "grad_norm": 1.3253897428512573, + "learning_rate": 4.744916161255798e-06, + "loss": 0.4303, + "step": 532 + }, + { + "epoch": 0.009506652873399208, + "grad_norm": 1.1075959205627441, + "learning_rate": 4.753835176596504e-06, + "loss": 0.486, + "step": 533 + }, + { + "epoch": 0.009524488995112903, + "grad_norm": 1.1825791597366333, + "learning_rate": 4.7627541919372105e-06, + "loss": 0.3865, + "step": 534 + }, + { + "epoch": 0.009542325116826598, + "grad_norm": 1.493388295173645, + "learning_rate": 4.771673207277916e-06, + "loss": 0.5005, + "step": 535 + }, + { + "epoch": 0.009560161238540291, + "grad_norm": 1.1791242361068726, + "learning_rate": 4.780592222618623e-06, + "loss": 0.4891, + "step": 536 + }, + { + "epoch": 0.009577997360253986, + "grad_norm": 1.343740701675415, + "learning_rate": 4.78951123795933e-06, + "loss": 0.4821, + "step": 537 + }, + { + "epoch": 0.00959583348196768, + "grad_norm": 1.3518675565719604, + "learning_rate": 4.798430253300036e-06, + "loss": 0.4596, + "step": 538 + }, + { + "epoch": 0.009613669603681376, + "grad_norm": 1.4564203023910522, + "learning_rate": 4.807349268640742e-06, + "loss": 0.4375, + "step": 539 + }, + { + "epoch": 0.00963150572539507, + "grad_norm": 3.196500062942505, + "learning_rate": 4.816268283981449e-06, + "loss": 0.4146, + "step": 540 + }, + { + "epoch": 0.009649341847108764, + "grad_norm": 1.102216124534607, + "learning_rate": 4.825187299322155e-06, + "loss": 0.4836, + "step": 541 + }, + { + "epoch": 0.009667177968822459, + "grad_norm": 1.8648408651351929, + "learning_rate": 4.8341063146628616e-06, + "loss": 0.511, + "step": 542 + }, + { + "epoch": 0.009685014090536154, + "grad_norm": 0.9851669669151306, + "learning_rate": 4.843025330003568e-06, + "loss": 0.3768, + "step": 543 + }, + { + "epoch": 0.009702850212249849, + "grad_norm": 1.006560206413269, + "learning_rate": 4.851944345344274e-06, + "loss": 0.4066, + "step": 544 + }, + { + "epoch": 0.009720686333963544, + "grad_norm": 1.0856674909591675, + "learning_rate": 4.860863360684981e-06, + "loss": 0.3364, + "step": 545 + }, + { + "epoch": 0.009738522455677237, + "grad_norm": 1.352399230003357, + "learning_rate": 4.869782376025687e-06, + "loss": 0.4577, + "step": 546 + }, + { + "epoch": 0.009756358577390932, + "grad_norm": 1.6652281284332275, + "learning_rate": 4.8787013913663935e-06, + "loss": 0.413, + "step": 547 + }, + { + "epoch": 0.009774194699104627, + "grad_norm": 1.9173638820648193, + "learning_rate": 4.8876204067071e-06, + "loss": 0.4688, + "step": 548 + }, + { + "epoch": 0.009792030820818321, + "grad_norm": 0.9917348623275757, + "learning_rate": 4.896539422047806e-06, + "loss": 0.425, + "step": 549 + }, + { + "epoch": 0.009809866942532016, + "grad_norm": 1.6308515071868896, + "learning_rate": 4.905458437388513e-06, + "loss": 0.4273, + "step": 550 + }, + { + "epoch": 0.00982770306424571, + "grad_norm": 1.0930702686309814, + "learning_rate": 4.914377452729219e-06, + "loss": 0.4444, + "step": 551 + }, + { + "epoch": 0.009845539185959404, + "grad_norm": 1.4137258529663086, + "learning_rate": 4.923296468069925e-06, + "loss": 0.442, + "step": 552 + }, + { + "epoch": 0.0098633753076731, + "grad_norm": 1.506640076637268, + "learning_rate": 4.932215483410632e-06, + "loss": 0.4724, + "step": 553 + }, + { + "epoch": 0.009881211429386794, + "grad_norm": 1.3620201349258423, + "learning_rate": 4.941134498751338e-06, + "loss": 0.4142, + "step": 554 + }, + { + "epoch": 0.00989904755110049, + "grad_norm": 1.3157777786254883, + "learning_rate": 4.9500535140920445e-06, + "loss": 0.4635, + "step": 555 + }, + { + "epoch": 0.009916883672814182, + "grad_norm": 1.209380030632019, + "learning_rate": 4.958972529432751e-06, + "loss": 0.4792, + "step": 556 + }, + { + "epoch": 0.009934719794527877, + "grad_norm": 1.1213328838348389, + "learning_rate": 4.967891544773457e-06, + "loss": 0.4473, + "step": 557 + }, + { + "epoch": 0.009952555916241572, + "grad_norm": 1.7144397497177124, + "learning_rate": 4.976810560114164e-06, + "loss": 0.496, + "step": 558 + }, + { + "epoch": 0.009970392037955267, + "grad_norm": 1.9864416122436523, + "learning_rate": 4.98572957545487e-06, + "loss": 0.5314, + "step": 559 + }, + { + "epoch": 0.009988228159668962, + "grad_norm": 1.0490071773529053, + "learning_rate": 4.994648590795576e-06, + "loss": 0.4173, + "step": 560 + }, + { + "epoch": 0.010006064281382655, + "grad_norm": 1.5442557334899902, + "learning_rate": 5.003567606136283e-06, + "loss": 0.4887, + "step": 561 + }, + { + "epoch": 0.01002390040309635, + "grad_norm": 0.9276285171508789, + "learning_rate": 5.012486621476989e-06, + "loss": 0.4398, + "step": 562 + }, + { + "epoch": 0.010041736524810045, + "grad_norm": 1.0979938507080078, + "learning_rate": 5.0214056368176956e-06, + "loss": 0.4143, + "step": 563 + }, + { + "epoch": 0.01005957264652374, + "grad_norm": 2.0221452713012695, + "learning_rate": 5.030324652158402e-06, + "loss": 0.4373, + "step": 564 + }, + { + "epoch": 0.010077408768237435, + "grad_norm": 2.4793107509613037, + "learning_rate": 5.039243667499108e-06, + "loss": 0.4251, + "step": 565 + }, + { + "epoch": 0.010095244889951128, + "grad_norm": 1.1153239011764526, + "learning_rate": 5.048162682839815e-06, + "loss": 0.4382, + "step": 566 + }, + { + "epoch": 0.010113081011664823, + "grad_norm": 1.1595510244369507, + "learning_rate": 5.057081698180521e-06, + "loss": 0.4315, + "step": 567 + }, + { + "epoch": 0.010130917133378518, + "grad_norm": 1.217896580696106, + "learning_rate": 5.0660007135212275e-06, + "loss": 0.4296, + "step": 568 + }, + { + "epoch": 0.010148753255092213, + "grad_norm": 1.097383975982666, + "learning_rate": 5.074919728861934e-06, + "loss": 0.3926, + "step": 569 + }, + { + "epoch": 0.010166589376805908, + "grad_norm": 1.653509497642517, + "learning_rate": 5.08383874420264e-06, + "loss": 0.5525, + "step": 570 + }, + { + "epoch": 0.010184425498519603, + "grad_norm": 1.1412622928619385, + "learning_rate": 5.092757759543347e-06, + "loss": 0.467, + "step": 571 + }, + { + "epoch": 0.010202261620233296, + "grad_norm": 1.1681066751480103, + "learning_rate": 5.101676774884053e-06, + "loss": 0.419, + "step": 572 + }, + { + "epoch": 0.01022009774194699, + "grad_norm": 1.49830162525177, + "learning_rate": 5.110595790224759e-06, + "loss": 0.503, + "step": 573 + }, + { + "epoch": 0.010237933863660686, + "grad_norm": 1.6811169385910034, + "learning_rate": 5.119514805565466e-06, + "loss": 0.4756, + "step": 574 + }, + { + "epoch": 0.01025576998537438, + "grad_norm": 1.2853678464889526, + "learning_rate": 5.128433820906172e-06, + "loss": 0.3656, + "step": 575 + }, + { + "epoch": 0.010273606107088076, + "grad_norm": 1.0954298973083496, + "learning_rate": 5.1373528362468785e-06, + "loss": 0.4478, + "step": 576 + }, + { + "epoch": 0.010291442228801769, + "grad_norm": 0.8055521249771118, + "learning_rate": 5.146271851587585e-06, + "loss": 0.3895, + "step": 577 + }, + { + "epoch": 0.010309278350515464, + "grad_norm": 1.025450587272644, + "learning_rate": 5.155190866928291e-06, + "loss": 0.3921, + "step": 578 + }, + { + "epoch": 0.010327114472229159, + "grad_norm": 1.1558235883712769, + "learning_rate": 5.164109882268998e-06, + "loss": 0.4946, + "step": 579 + }, + { + "epoch": 0.010344950593942854, + "grad_norm": 1.2848806381225586, + "learning_rate": 5.173028897609704e-06, + "loss": 0.4041, + "step": 580 + }, + { + "epoch": 0.010362786715656548, + "grad_norm": 1.412864327430725, + "learning_rate": 5.18194791295041e-06, + "loss": 0.4781, + "step": 581 + }, + { + "epoch": 0.010380622837370242, + "grad_norm": 1.0885170698165894, + "learning_rate": 5.190866928291117e-06, + "loss": 0.4267, + "step": 582 + }, + { + "epoch": 0.010398458959083937, + "grad_norm": 1.2009660005569458, + "learning_rate": 5.199785943631823e-06, + "loss": 0.4196, + "step": 583 + }, + { + "epoch": 0.010416295080797632, + "grad_norm": 1.0177676677703857, + "learning_rate": 5.2087049589725295e-06, + "loss": 0.4083, + "step": 584 + }, + { + "epoch": 0.010434131202511326, + "grad_norm": 1.601419448852539, + "learning_rate": 5.217623974313236e-06, + "loss": 0.3974, + "step": 585 + }, + { + "epoch": 0.010451967324225021, + "grad_norm": 2.39469838142395, + "learning_rate": 5.226542989653942e-06, + "loss": 0.4309, + "step": 586 + }, + { + "epoch": 0.010469803445938715, + "grad_norm": 0.9701213240623474, + "learning_rate": 5.2354620049946495e-06, + "loss": 0.4188, + "step": 587 + }, + { + "epoch": 0.01048763956765241, + "grad_norm": 1.7994223833084106, + "learning_rate": 5.244381020335355e-06, + "loss": 0.497, + "step": 588 + }, + { + "epoch": 0.010505475689366104, + "grad_norm": 0.9525409936904907, + "learning_rate": 5.2533000356760614e-06, + "loss": 0.4708, + "step": 589 + }, + { + "epoch": 0.0105233118110798, + "grad_norm": 1.0439306497573853, + "learning_rate": 5.262219051016768e-06, + "loss": 0.3759, + "step": 590 + }, + { + "epoch": 0.010541147932793494, + "grad_norm": 1.2639439105987549, + "learning_rate": 5.271138066357474e-06, + "loss": 0.4916, + "step": 591 + }, + { + "epoch": 0.010558984054507187, + "grad_norm": 1.2964301109313965, + "learning_rate": 5.280057081698181e-06, + "loss": 0.5315, + "step": 592 + }, + { + "epoch": 0.010576820176220882, + "grad_norm": 1.2846462726593018, + "learning_rate": 5.288976097038887e-06, + "loss": 0.4064, + "step": 593 + }, + { + "epoch": 0.010594656297934577, + "grad_norm": 1.2763394117355347, + "learning_rate": 5.297895112379593e-06, + "loss": 0.3243, + "step": 594 + }, + { + "epoch": 0.010612492419648272, + "grad_norm": 1.6506128311157227, + "learning_rate": 5.3068141277203e-06, + "loss": 0.4227, + "step": 595 + }, + { + "epoch": 0.010630328541361967, + "grad_norm": 1.3190261125564575, + "learning_rate": 5.315733143061007e-06, + "loss": 0.4495, + "step": 596 + }, + { + "epoch": 0.01064816466307566, + "grad_norm": 1.0525825023651123, + "learning_rate": 5.3246521584017125e-06, + "loss": 0.44, + "step": 597 + }, + { + "epoch": 0.010666000784789355, + "grad_norm": 1.310211420059204, + "learning_rate": 5.333571173742419e-06, + "loss": 0.4271, + "step": 598 + }, + { + "epoch": 0.01068383690650305, + "grad_norm": 1.5680582523345947, + "learning_rate": 5.342490189083125e-06, + "loss": 0.5139, + "step": 599 + }, + { + "epoch": 0.010701673028216745, + "grad_norm": 1.169052243232727, + "learning_rate": 5.351409204423832e-06, + "loss": 0.3914, + "step": 600 + }, + { + "epoch": 0.01071950914993044, + "grad_norm": 1.0858758687973022, + "learning_rate": 5.360328219764538e-06, + "loss": 0.5053, + "step": 601 + }, + { + "epoch": 0.010737345271644133, + "grad_norm": 1.5651134252548218, + "learning_rate": 5.369247235105244e-06, + "loss": 0.387, + "step": 602 + }, + { + "epoch": 0.010755181393357828, + "grad_norm": 1.2751938104629517, + "learning_rate": 5.378166250445952e-06, + "loss": 0.4827, + "step": 603 + }, + { + "epoch": 0.010773017515071523, + "grad_norm": 1.2434167861938477, + "learning_rate": 5.387085265786657e-06, + "loss": 0.4936, + "step": 604 + }, + { + "epoch": 0.010790853636785218, + "grad_norm": 1.2758976221084595, + "learning_rate": 5.396004281127364e-06, + "loss": 0.3476, + "step": 605 + }, + { + "epoch": 0.010808689758498913, + "grad_norm": 1.2970452308654785, + "learning_rate": 5.40492329646807e-06, + "loss": 0.4917, + "step": 606 + }, + { + "epoch": 0.010826525880212606, + "grad_norm": 0.9976676106452942, + "learning_rate": 5.413842311808776e-06, + "loss": 0.3342, + "step": 607 + }, + { + "epoch": 0.010844362001926301, + "grad_norm": 1.245278239250183, + "learning_rate": 5.422761327149483e-06, + "loss": 0.4906, + "step": 608 + }, + { + "epoch": 0.010862198123639996, + "grad_norm": 0.8573669791221619, + "learning_rate": 5.431680342490189e-06, + "loss": 0.3658, + "step": 609 + }, + { + "epoch": 0.01088003424535369, + "grad_norm": 1.2254329919815063, + "learning_rate": 5.440599357830896e-06, + "loss": 0.4932, + "step": 610 + }, + { + "epoch": 0.010897870367067386, + "grad_norm": 1.2445495128631592, + "learning_rate": 5.449518373171602e-06, + "loss": 0.4537, + "step": 611 + }, + { + "epoch": 0.010915706488781079, + "grad_norm": 1.8366972208023071, + "learning_rate": 5.458437388512309e-06, + "loss": 0.471, + "step": 612 + }, + { + "epoch": 0.010933542610494774, + "grad_norm": 1.1743152141571045, + "learning_rate": 5.4673564038530146e-06, + "loss": 0.4653, + "step": 613 + }, + { + "epoch": 0.010951378732208469, + "grad_norm": 0.6553313136100769, + "learning_rate": 5.476275419193722e-06, + "loss": 0.3757, + "step": 614 + }, + { + "epoch": 0.010969214853922164, + "grad_norm": 1.457043170928955, + "learning_rate": 5.485194434534427e-06, + "loss": 0.4795, + "step": 615 + }, + { + "epoch": 0.010987050975635859, + "grad_norm": 1.235823631286621, + "learning_rate": 5.494113449875134e-06, + "loss": 0.3929, + "step": 616 + }, + { + "epoch": 0.011004887097349552, + "grad_norm": 1.167606234550476, + "learning_rate": 5.50303246521584e-06, + "loss": 0.4161, + "step": 617 + }, + { + "epoch": 0.011022723219063247, + "grad_norm": 1.3439496755599976, + "learning_rate": 5.5119514805565465e-06, + "loss": 0.4563, + "step": 618 + }, + { + "epoch": 0.011040559340776942, + "grad_norm": 0.9970110058784485, + "learning_rate": 5.520870495897254e-06, + "loss": 0.3599, + "step": 619 + }, + { + "epoch": 0.011058395462490636, + "grad_norm": 1.0019282102584839, + "learning_rate": 5.529789511237959e-06, + "loss": 0.3712, + "step": 620 + }, + { + "epoch": 0.011076231584204331, + "grad_norm": 1.249360203742981, + "learning_rate": 5.5387085265786665e-06, + "loss": 0.4243, + "step": 621 + }, + { + "epoch": 0.011094067705918025, + "grad_norm": 1.39357590675354, + "learning_rate": 5.547627541919372e-06, + "loss": 0.4015, + "step": 622 + }, + { + "epoch": 0.01111190382763172, + "grad_norm": 1.085498332977295, + "learning_rate": 5.556546557260079e-06, + "loss": 0.4042, + "step": 623 + }, + { + "epoch": 0.011129739949345414, + "grad_norm": 1.1499019861221313, + "learning_rate": 5.565465572600785e-06, + "loss": 0.4452, + "step": 624 + }, + { + "epoch": 0.01114757607105911, + "grad_norm": 1.0987162590026855, + "learning_rate": 5.574384587941491e-06, + "loss": 0.4568, + "step": 625 + }, + { + "epoch": 0.011165412192772804, + "grad_norm": 1.8718235492706299, + "learning_rate": 5.583303603282198e-06, + "loss": 0.3886, + "step": 626 + }, + { + "epoch": 0.011183248314486497, + "grad_norm": 0.8406670689582825, + "learning_rate": 5.592222618622904e-06, + "loss": 0.4412, + "step": 627 + }, + { + "epoch": 0.011201084436200192, + "grad_norm": 1.0919603109359741, + "learning_rate": 5.601141633963611e-06, + "loss": 0.4251, + "step": 628 + }, + { + "epoch": 0.011218920557913887, + "grad_norm": 1.5491275787353516, + "learning_rate": 5.610060649304317e-06, + "loss": 0.4637, + "step": 629 + }, + { + "epoch": 0.011236756679627582, + "grad_norm": 1.3336869478225708, + "learning_rate": 5.618979664645024e-06, + "loss": 0.433, + "step": 630 + }, + { + "epoch": 0.011254592801341277, + "grad_norm": 1.1420201063156128, + "learning_rate": 5.627898679985729e-06, + "loss": 0.4171, + "step": 631 + }, + { + "epoch": 0.01127242892305497, + "grad_norm": 1.058229923248291, + "learning_rate": 5.636817695326437e-06, + "loss": 0.4333, + "step": 632 + }, + { + "epoch": 0.011290265044768665, + "grad_norm": 1.1878844499588013, + "learning_rate": 5.645736710667142e-06, + "loss": 0.3893, + "step": 633 + }, + { + "epoch": 0.01130810116648236, + "grad_norm": 0.9159463047981262, + "learning_rate": 5.654655726007849e-06, + "loss": 0.3675, + "step": 634 + }, + { + "epoch": 0.011325937288196055, + "grad_norm": 1.9920088052749634, + "learning_rate": 5.663574741348556e-06, + "loss": 0.4744, + "step": 635 + }, + { + "epoch": 0.01134377340990975, + "grad_norm": 1.0459561347961426, + "learning_rate": 5.672493756689261e-06, + "loss": 0.4143, + "step": 636 + }, + { + "epoch": 0.011361609531623443, + "grad_norm": 0.9108535647392273, + "learning_rate": 5.6814127720299685e-06, + "loss": 0.3295, + "step": 637 + }, + { + "epoch": 0.011379445653337138, + "grad_norm": 1.207665205001831, + "learning_rate": 5.690331787370674e-06, + "loss": 0.3778, + "step": 638 + }, + { + "epoch": 0.011397281775050833, + "grad_norm": 1.1077829599380493, + "learning_rate": 5.699250802711381e-06, + "loss": 0.4113, + "step": 639 + }, + { + "epoch": 0.011415117896764528, + "grad_norm": 1.5911637544631958, + "learning_rate": 5.708169818052087e-06, + "loss": 0.426, + "step": 640 + }, + { + "epoch": 0.011432954018478223, + "grad_norm": 1.0696449279785156, + "learning_rate": 5.717088833392794e-06, + "loss": 0.4117, + "step": 641 + }, + { + "epoch": 0.011450790140191916, + "grad_norm": 0.9678241014480591, + "learning_rate": 5.7260078487335005e-06, + "loss": 0.4008, + "step": 642 + }, + { + "epoch": 0.011468626261905611, + "grad_norm": 1.048011064529419, + "learning_rate": 5.734926864074207e-06, + "loss": 0.3812, + "step": 643 + }, + { + "epoch": 0.011486462383619306, + "grad_norm": 1.1569571495056152, + "learning_rate": 5.743845879414913e-06, + "loss": 0.3869, + "step": 644 + }, + { + "epoch": 0.011504298505333, + "grad_norm": 1.0135393142700195, + "learning_rate": 5.752764894755619e-06, + "loss": 0.3841, + "step": 645 + }, + { + "epoch": 0.011522134627046696, + "grad_norm": 0.8639402389526367, + "learning_rate": 5.761683910096326e-06, + "loss": 0.3285, + "step": 646 + }, + { + "epoch": 0.011539970748760389, + "grad_norm": 1.4575084447860718, + "learning_rate": 5.7706029254370315e-06, + "loss": 0.4309, + "step": 647 + }, + { + "epoch": 0.011557806870474084, + "grad_norm": 1.0409088134765625, + "learning_rate": 5.779521940777739e-06, + "loss": 0.4194, + "step": 648 + }, + { + "epoch": 0.011575642992187779, + "grad_norm": 0.8197188377380371, + "learning_rate": 5.788440956118444e-06, + "loss": 0.4557, + "step": 649 + }, + { + "epoch": 0.011593479113901474, + "grad_norm": 0.9931203126907349, + "learning_rate": 5.7973599714591515e-06, + "loss": 0.3672, + "step": 650 + }, + { + "epoch": 0.011611315235615169, + "grad_norm": 2.18489933013916, + "learning_rate": 5.806278986799858e-06, + "loss": 0.4526, + "step": 651 + }, + { + "epoch": 0.011629151357328862, + "grad_norm": 0.8428823351860046, + "learning_rate": 5.815198002140564e-06, + "loss": 0.4116, + "step": 652 + }, + { + "epoch": 0.011646987479042557, + "grad_norm": 0.8353315591812134, + "learning_rate": 5.824117017481271e-06, + "loss": 0.421, + "step": 653 + }, + { + "epoch": 0.011664823600756252, + "grad_norm": 1.0589710474014282, + "learning_rate": 5.833036032821976e-06, + "loss": 0.4442, + "step": 654 + }, + { + "epoch": 0.011682659722469946, + "grad_norm": 1.1102938652038574, + "learning_rate": 5.841955048162683e-06, + "loss": 0.3566, + "step": 655 + }, + { + "epoch": 0.011700495844183641, + "grad_norm": 1.614175796508789, + "learning_rate": 5.850874063503389e-06, + "loss": 0.4367, + "step": 656 + }, + { + "epoch": 0.011718331965897335, + "grad_norm": 1.0842281579971313, + "learning_rate": 5.859793078844096e-06, + "loss": 0.4961, + "step": 657 + }, + { + "epoch": 0.01173616808761103, + "grad_norm": 0.9871965050697327, + "learning_rate": 5.8687120941848025e-06, + "loss": 0.4513, + "step": 658 + }, + { + "epoch": 0.011754004209324724, + "grad_norm": 0.9325709342956543, + "learning_rate": 5.877631109525509e-06, + "loss": 0.4133, + "step": 659 + }, + { + "epoch": 0.01177184033103842, + "grad_norm": 0.9642876386642456, + "learning_rate": 5.886550124866215e-06, + "loss": 0.4521, + "step": 660 + }, + { + "epoch": 0.011789676452752114, + "grad_norm": 1.7349592447280884, + "learning_rate": 5.895469140206922e-06, + "loss": 0.4424, + "step": 661 + }, + { + "epoch": 0.011807512574465807, + "grad_norm": 1.0229498147964478, + "learning_rate": 5.904388155547628e-06, + "loss": 0.5209, + "step": 662 + }, + { + "epoch": 0.011825348696179502, + "grad_norm": 0.950793981552124, + "learning_rate": 5.913307170888334e-06, + "loss": 0.4087, + "step": 663 + }, + { + "epoch": 0.011843184817893197, + "grad_norm": 1.3746693134307861, + "learning_rate": 5.922226186229041e-06, + "loss": 0.4052, + "step": 664 + }, + { + "epoch": 0.011861020939606892, + "grad_norm": 1.0733963251113892, + "learning_rate": 5.931145201569746e-06, + "loss": 0.4143, + "step": 665 + }, + { + "epoch": 0.011878857061320587, + "grad_norm": 1.106656551361084, + "learning_rate": 5.940064216910454e-06, + "loss": 0.4357, + "step": 666 + }, + { + "epoch": 0.01189669318303428, + "grad_norm": 1.124107003211975, + "learning_rate": 5.94898323225116e-06, + "loss": 0.4674, + "step": 667 + }, + { + "epoch": 0.011914529304747975, + "grad_norm": 1.2719773054122925, + "learning_rate": 5.957902247591866e-06, + "loss": 0.4667, + "step": 668 + }, + { + "epoch": 0.01193236542646167, + "grad_norm": 1.3913277387619019, + "learning_rate": 5.966821262932573e-06, + "loss": 0.4545, + "step": 669 + }, + { + "epoch": 0.011950201548175365, + "grad_norm": 1.3351109027862549, + "learning_rate": 5.975740278273279e-06, + "loss": 0.4962, + "step": 670 + }, + { + "epoch": 0.01196803766988906, + "grad_norm": 1.03583824634552, + "learning_rate": 5.9846592936139855e-06, + "loss": 0.3828, + "step": 671 + }, + { + "epoch": 0.011985873791602753, + "grad_norm": 1.5020333528518677, + "learning_rate": 5.993578308954691e-06, + "loss": 0.4187, + "step": 672 + }, + { + "epoch": 0.012003709913316448, + "grad_norm": 1.5509532690048218, + "learning_rate": 6.002497324295398e-06, + "loss": 0.3522, + "step": 673 + }, + { + "epoch": 0.012021546035030143, + "grad_norm": 1.3156205415725708, + "learning_rate": 6.011416339636105e-06, + "loss": 0.4496, + "step": 674 + }, + { + "epoch": 0.012039382156743838, + "grad_norm": 1.3814486265182495, + "learning_rate": 6.020335354976811e-06, + "loss": 0.4037, + "step": 675 + }, + { + "epoch": 0.012057218278457533, + "grad_norm": 1.0259685516357422, + "learning_rate": 6.029254370317517e-06, + "loss": 0.3465, + "step": 676 + }, + { + "epoch": 0.012075054400171226, + "grad_norm": 1.0664241313934326, + "learning_rate": 6.038173385658224e-06, + "loss": 0.364, + "step": 677 + }, + { + "epoch": 0.012092890521884921, + "grad_norm": 1.151801347732544, + "learning_rate": 6.04709240099893e-06, + "loss": 0.3997, + "step": 678 + }, + { + "epoch": 0.012110726643598616, + "grad_norm": 1.0412648916244507, + "learning_rate": 6.0560114163396365e-06, + "loss": 0.4186, + "step": 679 + }, + { + "epoch": 0.01212856276531231, + "grad_norm": 1.2787470817565918, + "learning_rate": 6.064930431680343e-06, + "loss": 0.4059, + "step": 680 + }, + { + "epoch": 0.012146398887026006, + "grad_norm": 1.2930010557174683, + "learning_rate": 6.0738494470210484e-06, + "loss": 0.4146, + "step": 681 + }, + { + "epoch": 0.012164235008739699, + "grad_norm": 1.3312135934829712, + "learning_rate": 6.082768462361756e-06, + "loss": 0.4424, + "step": 682 + }, + { + "epoch": 0.012182071130453394, + "grad_norm": 1.0913116931915283, + "learning_rate": 6.091687477702462e-06, + "loss": 0.4532, + "step": 683 + }, + { + "epoch": 0.012199907252167089, + "grad_norm": 2.2016854286193848, + "learning_rate": 6.1006064930431684e-06, + "loss": 0.419, + "step": 684 + }, + { + "epoch": 0.012217743373880784, + "grad_norm": 1.046238660812378, + "learning_rate": 6.109525508383875e-06, + "loss": 0.4202, + "step": 685 + }, + { + "epoch": 0.012235579495594479, + "grad_norm": 1.246010184288025, + "learning_rate": 6.118444523724581e-06, + "loss": 0.425, + "step": 686 + }, + { + "epoch": 0.012253415617308172, + "grad_norm": 1.2753421068191528, + "learning_rate": 6.1273635390652876e-06, + "loss": 0.3787, + "step": 687 + }, + { + "epoch": 0.012271251739021867, + "grad_norm": 0.980410099029541, + "learning_rate": 6.136282554405994e-06, + "loss": 0.418, + "step": 688 + }, + { + "epoch": 0.012289087860735562, + "grad_norm": 1.0079503059387207, + "learning_rate": 6.1452015697467e-06, + "loss": 0.4031, + "step": 689 + }, + { + "epoch": 0.012306923982449256, + "grad_norm": 1.3984042406082153, + "learning_rate": 6.154120585087407e-06, + "loss": 0.3477, + "step": 690 + }, + { + "epoch": 0.012324760104162951, + "grad_norm": 1.1816221475601196, + "learning_rate": 6.163039600428113e-06, + "loss": 0.3699, + "step": 691 + }, + { + "epoch": 0.012342596225876645, + "grad_norm": 0.9247754812240601, + "learning_rate": 6.1719586157688195e-06, + "loss": 0.3727, + "step": 692 + }, + { + "epoch": 0.01236043234759034, + "grad_norm": 0.8835532665252686, + "learning_rate": 6.180877631109526e-06, + "loss": 0.4442, + "step": 693 + }, + { + "epoch": 0.012378268469304034, + "grad_norm": 0.9995298981666565, + "learning_rate": 6.189796646450232e-06, + "loss": 0.4309, + "step": 694 + }, + { + "epoch": 0.01239610459101773, + "grad_norm": 1.4439276456832886, + "learning_rate": 6.198715661790939e-06, + "loss": 0.377, + "step": 695 + }, + { + "epoch": 0.012413940712731424, + "grad_norm": 1.3238965272903442, + "learning_rate": 6.207634677131645e-06, + "loss": 0.4259, + "step": 696 + }, + { + "epoch": 0.012431776834445117, + "grad_norm": 1.1488457918167114, + "learning_rate": 6.216553692472351e-06, + "loss": 0.4529, + "step": 697 + }, + { + "epoch": 0.012449612956158812, + "grad_norm": 1.1248880624771118, + "learning_rate": 6.225472707813058e-06, + "loss": 0.3931, + "step": 698 + }, + { + "epoch": 0.012467449077872507, + "grad_norm": 2.2086665630340576, + "learning_rate": 6.234391723153764e-06, + "loss": 0.4266, + "step": 699 + }, + { + "epoch": 0.012485285199586202, + "grad_norm": 1.4757678508758545, + "learning_rate": 6.2433107384944705e-06, + "loss": 0.422, + "step": 700 + }, + { + "epoch": 0.012503121321299897, + "grad_norm": 1.495514988899231, + "learning_rate": 6.252229753835178e-06, + "loss": 0.4223, + "step": 701 + }, + { + "epoch": 0.01252095744301359, + "grad_norm": 1.1658685207366943, + "learning_rate": 6.2611487691758824e-06, + "loss": 0.364, + "step": 702 + }, + { + "epoch": 0.012538793564727285, + "grad_norm": 1.3786637783050537, + "learning_rate": 6.27006778451659e-06, + "loss": 0.3867, + "step": 703 + }, + { + "epoch": 0.01255662968644098, + "grad_norm": 1.184095025062561, + "learning_rate": 6.278986799857296e-06, + "loss": 0.4947, + "step": 704 + }, + { + "epoch": 0.012574465808154675, + "grad_norm": 1.1303995847702026, + "learning_rate": 6.287905815198003e-06, + "loss": 0.3957, + "step": 705 + }, + { + "epoch": 0.01259230192986837, + "grad_norm": 1.4637501239776611, + "learning_rate": 6.296824830538709e-06, + "loss": 0.4926, + "step": 706 + }, + { + "epoch": 0.012610138051582063, + "grad_norm": 1.5414247512817383, + "learning_rate": 6.305743845879415e-06, + "loss": 0.4001, + "step": 707 + }, + { + "epoch": 0.012627974173295758, + "grad_norm": 1.0155268907546997, + "learning_rate": 6.3146628612201216e-06, + "loss": 0.4371, + "step": 708 + }, + { + "epoch": 0.012645810295009453, + "grad_norm": 1.166196584701538, + "learning_rate": 6.323581876560827e-06, + "loss": 0.4294, + "step": 709 + }, + { + "epoch": 0.012663646416723148, + "grad_norm": 1.2898826599121094, + "learning_rate": 6.332500891901534e-06, + "loss": 0.438, + "step": 710 + }, + { + "epoch": 0.012681482538436843, + "grad_norm": 1.0751768350601196, + "learning_rate": 6.341419907242241e-06, + "loss": 0.4241, + "step": 711 + }, + { + "epoch": 0.012699318660150536, + "grad_norm": 0.8998647928237915, + "learning_rate": 6.350338922582948e-06, + "loss": 0.3414, + "step": 712 + }, + { + "epoch": 0.012717154781864231, + "grad_norm": 1.0985339879989624, + "learning_rate": 6.3592579379236535e-06, + "loss": 0.4041, + "step": 713 + }, + { + "epoch": 0.012734990903577926, + "grad_norm": 1.154080867767334, + "learning_rate": 6.36817695326436e-06, + "loss": 0.3809, + "step": 714 + }, + { + "epoch": 0.01275282702529162, + "grad_norm": 1.9730243682861328, + "learning_rate": 6.377095968605066e-06, + "loss": 0.3941, + "step": 715 + }, + { + "epoch": 0.012770663147005316, + "grad_norm": 1.3270517587661743, + "learning_rate": 6.3860149839457734e-06, + "loss": 0.4365, + "step": 716 + }, + { + "epoch": 0.012788499268719009, + "grad_norm": 1.0822250843048096, + "learning_rate": 6.394933999286479e-06, + "loss": 0.4683, + "step": 717 + }, + { + "epoch": 0.012806335390432704, + "grad_norm": 1.1357783079147339, + "learning_rate": 6.403853014627185e-06, + "loss": 0.4114, + "step": 718 + }, + { + "epoch": 0.012824171512146399, + "grad_norm": 0.8394474387168884, + "learning_rate": 6.412772029967893e-06, + "loss": 0.3545, + "step": 719 + }, + { + "epoch": 0.012842007633860094, + "grad_norm": 1.2799073457717896, + "learning_rate": 6.421691045308597e-06, + "loss": 0.4275, + "step": 720 + }, + { + "epoch": 0.012859843755573789, + "grad_norm": 1.0163384675979614, + "learning_rate": 6.4306100606493045e-06, + "loss": 0.3397, + "step": 721 + }, + { + "epoch": 0.012877679877287482, + "grad_norm": 1.1710898876190186, + "learning_rate": 6.439529075990011e-06, + "loss": 0.4035, + "step": 722 + }, + { + "epoch": 0.012895515999001177, + "grad_norm": 0.9285147786140442, + "learning_rate": 6.448448091330718e-06, + "loss": 0.4014, + "step": 723 + }, + { + "epoch": 0.012913352120714872, + "grad_norm": 1.1520582437515259, + "learning_rate": 6.457367106671424e-06, + "loss": 0.4091, + "step": 724 + }, + { + "epoch": 0.012931188242428567, + "grad_norm": 2.1584787368774414, + "learning_rate": 6.46628612201213e-06, + "loss": 0.5232, + "step": 725 + }, + { + "epoch": 0.012949024364142261, + "grad_norm": 1.363916277885437, + "learning_rate": 6.475205137352837e-06, + "loss": 0.3715, + "step": 726 + }, + { + "epoch": 0.012966860485855955, + "grad_norm": 0.9038228988647461, + "learning_rate": 6.484124152693542e-06, + "loss": 0.4156, + "step": 727 + }, + { + "epoch": 0.01298469660756965, + "grad_norm": 0.9826972484588623, + "learning_rate": 6.493043168034249e-06, + "loss": 0.4079, + "step": 728 + }, + { + "epoch": 0.013002532729283344, + "grad_norm": 0.8310179114341736, + "learning_rate": 6.5019621833749555e-06, + "loss": 0.3577, + "step": 729 + }, + { + "epoch": 0.01302036885099704, + "grad_norm": 1.237696647644043, + "learning_rate": 6.510881198715663e-06, + "loss": 0.4967, + "step": 730 + }, + { + "epoch": 0.013038204972710734, + "grad_norm": 1.3650190830230713, + "learning_rate": 6.519800214056368e-06, + "loss": 0.4011, + "step": 731 + }, + { + "epoch": 0.01305604109442443, + "grad_norm": 1.4350529909133911, + "learning_rate": 6.528719229397075e-06, + "loss": 0.3749, + "step": 732 + }, + { + "epoch": 0.013073877216138122, + "grad_norm": 0.9284170269966125, + "learning_rate": 6.537638244737782e-06, + "loss": 0.416, + "step": 733 + }, + { + "epoch": 0.013091713337851817, + "grad_norm": 1.1350346803665161, + "learning_rate": 6.546557260078488e-06, + "loss": 0.4542, + "step": 734 + }, + { + "epoch": 0.013109549459565512, + "grad_norm": 2.328464984893799, + "learning_rate": 6.555476275419194e-06, + "loss": 0.3681, + "step": 735 + }, + { + "epoch": 0.013127385581279207, + "grad_norm": 0.8249056339263916, + "learning_rate": 6.5643952907599e-06, + "loss": 0.397, + "step": 736 + }, + { + "epoch": 0.013145221702992902, + "grad_norm": 1.0406147241592407, + "learning_rate": 6.5733143061006074e-06, + "loss": 0.4341, + "step": 737 + }, + { + "epoch": 0.013163057824706595, + "grad_norm": 0.7927587628364563, + "learning_rate": 6.582233321441313e-06, + "loss": 0.3395, + "step": 738 + }, + { + "epoch": 0.01318089394642029, + "grad_norm": 0.9958735704421997, + "learning_rate": 6.591152336782019e-06, + "loss": 0.3833, + "step": 739 + }, + { + "epoch": 0.013198730068133985, + "grad_norm": 0.8341773748397827, + "learning_rate": 6.600071352122726e-06, + "loss": 0.344, + "step": 740 + }, + { + "epoch": 0.01321656618984768, + "grad_norm": 1.1658989191055298, + "learning_rate": 6.608990367463433e-06, + "loss": 0.4142, + "step": 741 + }, + { + "epoch": 0.013234402311561375, + "grad_norm": 1.5032583475112915, + "learning_rate": 6.6179093828041385e-06, + "loss": 0.4346, + "step": 742 + }, + { + "epoch": 0.013252238433275068, + "grad_norm": 1.3449758291244507, + "learning_rate": 6.626828398144845e-06, + "loss": 0.4578, + "step": 743 + }, + { + "epoch": 0.013270074554988763, + "grad_norm": 0.9802781343460083, + "learning_rate": 6.635747413485552e-06, + "loss": 0.3982, + "step": 744 + }, + { + "epoch": 0.013287910676702458, + "grad_norm": 1.0957345962524414, + "learning_rate": 6.644666428826258e-06, + "loss": 0.3745, + "step": 745 + }, + { + "epoch": 0.013305746798416153, + "grad_norm": 1.1497217416763306, + "learning_rate": 6.653585444166964e-06, + "loss": 0.4154, + "step": 746 + }, + { + "epoch": 0.013323582920129848, + "grad_norm": 1.134560465812683, + "learning_rate": 6.66250445950767e-06, + "loss": 0.4289, + "step": 747 + }, + { + "epoch": 0.013341419041843541, + "grad_norm": 1.1097346544265747, + "learning_rate": 6.671423474848378e-06, + "loss": 0.3922, + "step": 748 + }, + { + "epoch": 0.013359255163557236, + "grad_norm": 1.1429896354675293, + "learning_rate": 6.680342490189083e-06, + "loss": 0.3488, + "step": 749 + }, + { + "epoch": 0.01337709128527093, + "grad_norm": 3.192944288253784, + "learning_rate": 6.6892615055297895e-06, + "loss": 0.4736, + "step": 750 + }, + { + "epoch": 0.013394927406984626, + "grad_norm": 0.9527766108512878, + "learning_rate": 6.698180520870497e-06, + "loss": 0.4484, + "step": 751 + }, + { + "epoch": 0.01341276352869832, + "grad_norm": 0.9491976499557495, + "learning_rate": 6.707099536211203e-06, + "loss": 0.4781, + "step": 752 + }, + { + "epoch": 0.013430599650412014, + "grad_norm": 1.2711595296859741, + "learning_rate": 6.716018551551909e-06, + "loss": 0.4414, + "step": 753 + }, + { + "epoch": 0.013448435772125709, + "grad_norm": 0.9956739544868469, + "learning_rate": 6.724937566892615e-06, + "loss": 0.3846, + "step": 754 + }, + { + "epoch": 0.013466271893839404, + "grad_norm": 1.4637160301208496, + "learning_rate": 6.733856582233322e-06, + "loss": 0.4572, + "step": 755 + }, + { + "epoch": 0.013484108015553099, + "grad_norm": 1.0149110555648804, + "learning_rate": 6.742775597574028e-06, + "loss": 0.365, + "step": 756 + }, + { + "epoch": 0.013501944137266794, + "grad_norm": 0.9533802270889282, + "learning_rate": 6.751694612914734e-06, + "loss": 0.4999, + "step": 757 + }, + { + "epoch": 0.013519780258980487, + "grad_norm": 0.9060002565383911, + "learning_rate": 6.7606136282554414e-06, + "loss": 0.4258, + "step": 758 + }, + { + "epoch": 0.013537616380694182, + "grad_norm": 1.1750162839889526, + "learning_rate": 6.769532643596148e-06, + "loss": 0.3791, + "step": 759 + }, + { + "epoch": 0.013555452502407877, + "grad_norm": 0.9837682247161865, + "learning_rate": 6.778451658936853e-06, + "loss": 0.392, + "step": 760 + }, + { + "epoch": 0.013573288624121571, + "grad_norm": 1.0854389667510986, + "learning_rate": 6.78737067427756e-06, + "loss": 0.3655, + "step": 761 + }, + { + "epoch": 0.013591124745835266, + "grad_norm": 1.6050208806991577, + "learning_rate": 6.796289689618267e-06, + "loss": 0.4137, + "step": 762 + }, + { + "epoch": 0.01360896086754896, + "grad_norm": 0.9369100332260132, + "learning_rate": 6.8052087049589725e-06, + "loss": 0.4084, + "step": 763 + }, + { + "epoch": 0.013626796989262654, + "grad_norm": 0.9988794326782227, + "learning_rate": 6.814127720299679e-06, + "loss": 0.3253, + "step": 764 + }, + { + "epoch": 0.01364463311097635, + "grad_norm": 1.476580262184143, + "learning_rate": 6.823046735640386e-06, + "loss": 0.5194, + "step": 765 + }, + { + "epoch": 0.013662469232690044, + "grad_norm": 1.4281305074691772, + "learning_rate": 6.8319657509810925e-06, + "loss": 0.5293, + "step": 766 + }, + { + "epoch": 0.01368030535440374, + "grad_norm": 1.12155282497406, + "learning_rate": 6.840884766321798e-06, + "loss": 0.4609, + "step": 767 + }, + { + "epoch": 0.013698141476117432, + "grad_norm": 1.1472570896148682, + "learning_rate": 6.849803781662504e-06, + "loss": 0.4228, + "step": 768 + }, + { + "epoch": 0.013715977597831127, + "grad_norm": 1.2671374082565308, + "learning_rate": 6.858722797003212e-06, + "loss": 0.4665, + "step": 769 + }, + { + "epoch": 0.013733813719544822, + "grad_norm": 0.9118152856826782, + "learning_rate": 6.867641812343918e-06, + "loss": 0.4147, + "step": 770 + }, + { + "epoch": 0.013751649841258517, + "grad_norm": 1.4787702560424805, + "learning_rate": 6.8765608276846235e-06, + "loss": 0.4077, + "step": 771 + }, + { + "epoch": 0.013769485962972212, + "grad_norm": 0.890640914440155, + "learning_rate": 6.885479843025331e-06, + "loss": 0.3717, + "step": 772 + }, + { + "epoch": 0.013787322084685905, + "grad_norm": 1.2971957921981812, + "learning_rate": 6.894398858366037e-06, + "loss": 0.4762, + "step": 773 + }, + { + "epoch": 0.0138051582063996, + "grad_norm": 1.8639717102050781, + "learning_rate": 6.903317873706743e-06, + "loss": 0.4415, + "step": 774 + }, + { + "epoch": 0.013822994328113295, + "grad_norm": 0.9721428751945496, + "learning_rate": 6.912236889047449e-06, + "loss": 0.3898, + "step": 775 + }, + { + "epoch": 0.01384083044982699, + "grad_norm": 1.3769142627716064, + "learning_rate": 6.921155904388156e-06, + "loss": 0.4044, + "step": 776 + }, + { + "epoch": 0.013858666571540685, + "grad_norm": 4.9716973304748535, + "learning_rate": 6.930074919728863e-06, + "loss": 0.4096, + "step": 777 + }, + { + "epoch": 0.013876502693254378, + "grad_norm": 0.7622309923171997, + "learning_rate": 6.938993935069568e-06, + "loss": 0.3708, + "step": 778 + }, + { + "epoch": 0.013894338814968073, + "grad_norm": 1.4392539262771606, + "learning_rate": 6.9479129504102746e-06, + "loss": 0.39, + "step": 779 + }, + { + "epoch": 0.013912174936681768, + "grad_norm": 0.7795321941375732, + "learning_rate": 6.956831965750982e-06, + "loss": 0.3472, + "step": 780 + }, + { + "epoch": 0.013930011058395463, + "grad_norm": 1.8850880861282349, + "learning_rate": 6.965750981091688e-06, + "loss": 0.4258, + "step": 781 + }, + { + "epoch": 0.013947847180109158, + "grad_norm": 1.548249363899231, + "learning_rate": 6.974669996432394e-06, + "loss": 0.5222, + "step": 782 + }, + { + "epoch": 0.013965683301822851, + "grad_norm": 1.331453800201416, + "learning_rate": 6.983589011773101e-06, + "loss": 0.3634, + "step": 783 + }, + { + "epoch": 0.013983519423536546, + "grad_norm": 1.0235885381698608, + "learning_rate": 6.992508027113807e-06, + "loss": 0.4164, + "step": 784 + }, + { + "epoch": 0.01400135554525024, + "grad_norm": 1.1273366212844849, + "learning_rate": 7.001427042454513e-06, + "loss": 0.3802, + "step": 785 + }, + { + "epoch": 0.014019191666963936, + "grad_norm": 1.1602329015731812, + "learning_rate": 7.010346057795219e-06, + "loss": 0.4367, + "step": 786 + }, + { + "epoch": 0.01403702778867763, + "grad_norm": 0.9095342755317688, + "learning_rate": 7.0192650731359265e-06, + "loss": 0.4024, + "step": 787 + }, + { + "epoch": 0.014054863910391324, + "grad_norm": 1.5537116527557373, + "learning_rate": 7.028184088476633e-06, + "loss": 0.3809, + "step": 788 + }, + { + "epoch": 0.014072700032105019, + "grad_norm": 1.4078360795974731, + "learning_rate": 7.037103103817338e-06, + "loss": 0.4279, + "step": 789 + }, + { + "epoch": 0.014090536153818714, + "grad_norm": 1.0890816450119019, + "learning_rate": 7.046022119158046e-06, + "loss": 0.3316, + "step": 790 + }, + { + "epoch": 0.014108372275532409, + "grad_norm": 1.2133907079696655, + "learning_rate": 7.054941134498752e-06, + "loss": 0.3913, + "step": 791 + }, + { + "epoch": 0.014126208397246104, + "grad_norm": 0.9176425337791443, + "learning_rate": 7.0638601498394575e-06, + "loss": 0.3924, + "step": 792 + }, + { + "epoch": 0.014144044518959797, + "grad_norm": 0.8917650580406189, + "learning_rate": 7.072779165180164e-06, + "loss": 0.3654, + "step": 793 + }, + { + "epoch": 0.014161880640673492, + "grad_norm": 1.9989323616027832, + "learning_rate": 7.081698180520871e-06, + "loss": 0.5011, + "step": 794 + }, + { + "epoch": 0.014179716762387187, + "grad_norm": 2.3614730834960938, + "learning_rate": 7.0906171958615775e-06, + "loss": 0.4123, + "step": 795 + }, + { + "epoch": 0.014197552884100881, + "grad_norm": 1.1702730655670166, + "learning_rate": 7.099536211202283e-06, + "loss": 0.3909, + "step": 796 + }, + { + "epoch": 0.014215389005814576, + "grad_norm": 0.99686199426651, + "learning_rate": 7.10845522654299e-06, + "loss": 0.3776, + "step": 797 + }, + { + "epoch": 0.01423322512752827, + "grad_norm": 1.4835487604141235, + "learning_rate": 7.117374241883697e-06, + "loss": 0.3679, + "step": 798 + }, + { + "epoch": 0.014251061249241965, + "grad_norm": 0.9821210503578186, + "learning_rate": 7.126293257224403e-06, + "loss": 0.3408, + "step": 799 + }, + { + "epoch": 0.01426889737095566, + "grad_norm": 1.773571252822876, + "learning_rate": 7.1352122725651086e-06, + "loss": 0.3431, + "step": 800 + }, + { + "epoch": 0.014286733492669354, + "grad_norm": 1.0360254049301147, + "learning_rate": 7.144131287905816e-06, + "loss": 0.3545, + "step": 801 + }, + { + "epoch": 0.01430456961438305, + "grad_norm": 2.319758892059326, + "learning_rate": 7.153050303246522e-06, + "loss": 0.3851, + "step": 802 + }, + { + "epoch": 0.014322405736096742, + "grad_norm": 0.7496076226234436, + "learning_rate": 7.161969318587228e-06, + "loss": 0.344, + "step": 803 + }, + { + "epoch": 0.014340241857810437, + "grad_norm": 0.831169068813324, + "learning_rate": 7.170888333927935e-06, + "loss": 0.3482, + "step": 804 + }, + { + "epoch": 0.014358077979524132, + "grad_norm": 0.9808482527732849, + "learning_rate": 7.179807349268641e-06, + "loss": 0.4026, + "step": 805 + }, + { + "epoch": 0.014375914101237827, + "grad_norm": 0.9589486122131348, + "learning_rate": 7.188726364609348e-06, + "loss": 0.4512, + "step": 806 + }, + { + "epoch": 0.014393750222951522, + "grad_norm": 1.0854859352111816, + "learning_rate": 7.197645379950053e-06, + "loss": 0.3563, + "step": 807 + }, + { + "epoch": 0.014411586344665215, + "grad_norm": 0.8584607839584351, + "learning_rate": 7.2065643952907604e-06, + "loss": 0.3587, + "step": 808 + }, + { + "epoch": 0.01442942246637891, + "grad_norm": 1.3820878267288208, + "learning_rate": 7.215483410631467e-06, + "loss": 0.3984, + "step": 809 + }, + { + "epoch": 0.014447258588092605, + "grad_norm": 0.9056740403175354, + "learning_rate": 7.224402425972172e-06, + "loss": 0.3696, + "step": 810 + }, + { + "epoch": 0.0144650947098063, + "grad_norm": 0.830923855304718, + "learning_rate": 7.233321441312879e-06, + "loss": 0.4189, + "step": 811 + }, + { + "epoch": 0.014482930831519995, + "grad_norm": 0.7032369375228882, + "learning_rate": 7.242240456653586e-06, + "loss": 0.358, + "step": 812 + }, + { + "epoch": 0.014500766953233688, + "grad_norm": 1.0487927198410034, + "learning_rate": 7.251159471994292e-06, + "loss": 0.3414, + "step": 813 + }, + { + "epoch": 0.014518603074947383, + "grad_norm": 1.0935640335083008, + "learning_rate": 7.260078487334998e-06, + "loss": 0.3727, + "step": 814 + }, + { + "epoch": 0.014536439196661078, + "grad_norm": 1.4685693979263306, + "learning_rate": 7.268997502675705e-06, + "loss": 0.4056, + "step": 815 + }, + { + "epoch": 0.014554275318374773, + "grad_norm": 1.1004257202148438, + "learning_rate": 7.2779165180164115e-06, + "loss": 0.4911, + "step": 816 + }, + { + "epoch": 0.014572111440088468, + "grad_norm": 0.8749274015426636, + "learning_rate": 7.286835533357119e-06, + "loss": 0.3947, + "step": 817 + }, + { + "epoch": 0.014589947561802161, + "grad_norm": 1.633952021598816, + "learning_rate": 7.295754548697823e-06, + "loss": 0.4308, + "step": 818 + }, + { + "epoch": 0.014607783683515856, + "grad_norm": 0.7927972674369812, + "learning_rate": 7.304673564038531e-06, + "loss": 0.3455, + "step": 819 + }, + { + "epoch": 0.01462561980522955, + "grad_norm": 2.7083334922790527, + "learning_rate": 7.313592579379237e-06, + "loss": 0.372, + "step": 820 + }, + { + "epoch": 0.014643455926943246, + "grad_norm": 1.5031018257141113, + "learning_rate": 7.3225115947199425e-06, + "loss": 0.4165, + "step": 821 + }, + { + "epoch": 0.01466129204865694, + "grad_norm": 1.1417412757873535, + "learning_rate": 7.33143061006065e-06, + "loss": 0.3365, + "step": 822 + }, + { + "epoch": 0.014679128170370634, + "grad_norm": 1.0912084579467773, + "learning_rate": 7.340349625401356e-06, + "loss": 0.4463, + "step": 823 + }, + { + "epoch": 0.014696964292084329, + "grad_norm": 1.1397372484207153, + "learning_rate": 7.349268640742063e-06, + "loss": 0.3502, + "step": 824 + }, + { + "epoch": 0.014714800413798024, + "grad_norm": 1.1142958402633667, + "learning_rate": 7.358187656082768e-06, + "loss": 0.4658, + "step": 825 + }, + { + "epoch": 0.014732636535511719, + "grad_norm": 1.18959379196167, + "learning_rate": 7.367106671423475e-06, + "loss": 0.3681, + "step": 826 + }, + { + "epoch": 0.014750472657225414, + "grad_norm": 1.7012782096862793, + "learning_rate": 7.376025686764182e-06, + "loss": 0.4448, + "step": 827 + }, + { + "epoch": 0.014768308778939107, + "grad_norm": 0.7677625417709351, + "learning_rate": 7.384944702104887e-06, + "loss": 0.3694, + "step": 828 + }, + { + "epoch": 0.014786144900652802, + "grad_norm": 0.9499193429946899, + "learning_rate": 7.3938637174455944e-06, + "loss": 0.3595, + "step": 829 + }, + { + "epoch": 0.014803981022366497, + "grad_norm": 1.0089343786239624, + "learning_rate": 7.402782732786301e-06, + "loss": 0.5221, + "step": 830 + }, + { + "epoch": 0.014821817144080192, + "grad_norm": 1.5784305334091187, + "learning_rate": 7.411701748127008e-06, + "loss": 0.3789, + "step": 831 + }, + { + "epoch": 0.014839653265793886, + "grad_norm": 1.0147290229797363, + "learning_rate": 7.420620763467713e-06, + "loss": 0.4003, + "step": 832 + }, + { + "epoch": 0.01485748938750758, + "grad_norm": 0.8705704212188721, + "learning_rate": 7.42953977880842e-06, + "loss": 0.3909, + "step": 833 + }, + { + "epoch": 0.014875325509221275, + "grad_norm": 1.50275719165802, + "learning_rate": 7.438458794149126e-06, + "loss": 0.4745, + "step": 834 + }, + { + "epoch": 0.01489316163093497, + "grad_norm": 1.1221299171447754, + "learning_rate": 7.4473778094898336e-06, + "loss": 0.419, + "step": 835 + }, + { + "epoch": 0.014910997752648664, + "grad_norm": 1.7666137218475342, + "learning_rate": 7.456296824830539e-06, + "loss": 0.4896, + "step": 836 + }, + { + "epoch": 0.01492883387436236, + "grad_norm": 1.1369482278823853, + "learning_rate": 7.4652158401712455e-06, + "loss": 0.4144, + "step": 837 + }, + { + "epoch": 0.014946669996076052, + "grad_norm": 0.9525813460350037, + "learning_rate": 7.474134855511952e-06, + "loss": 0.4688, + "step": 838 + }, + { + "epoch": 0.014964506117789747, + "grad_norm": 0.9223721623420715, + "learning_rate": 7.483053870852657e-06, + "loss": 0.4354, + "step": 839 + }, + { + "epoch": 0.014982342239503442, + "grad_norm": 0.9912000298500061, + "learning_rate": 7.491972886193365e-06, + "loss": 0.3508, + "step": 840 + }, + { + "epoch": 0.015000178361217137, + "grad_norm": 1.470048427581787, + "learning_rate": 7.500891901534071e-06, + "loss": 0.3996, + "step": 841 + }, + { + "epoch": 0.015018014482930832, + "grad_norm": 0.8923424482345581, + "learning_rate": 7.509810916874778e-06, + "loss": 0.3814, + "step": 842 + }, + { + "epoch": 0.015035850604644525, + "grad_norm": 1.0893371105194092, + "learning_rate": 7.518729932215483e-06, + "loss": 0.3633, + "step": 843 + }, + { + "epoch": 0.01505368672635822, + "grad_norm": 1.2783355712890625, + "learning_rate": 7.52764894755619e-06, + "loss": 0.3868, + "step": 844 + }, + { + "epoch": 0.015071522848071915, + "grad_norm": 1.3496581315994263, + "learning_rate": 7.5365679628968965e-06, + "loss": 0.4137, + "step": 845 + }, + { + "epoch": 0.01508935896978561, + "grad_norm": 0.757161557674408, + "learning_rate": 7.545486978237602e-06, + "loss": 0.3529, + "step": 846 + }, + { + "epoch": 0.015107195091499305, + "grad_norm": 1.3636268377304077, + "learning_rate": 7.554405993578309e-06, + "loss": 0.3748, + "step": 847 + }, + { + "epoch": 0.015125031213212998, + "grad_norm": 0.9378393888473511, + "learning_rate": 7.563325008919016e-06, + "loss": 0.3729, + "step": 848 + }, + { + "epoch": 0.015142867334926693, + "grad_norm": 0.8216644525527954, + "learning_rate": 7.572244024259723e-06, + "loss": 0.3436, + "step": 849 + }, + { + "epoch": 0.015160703456640388, + "grad_norm": 0.9007389545440674, + "learning_rate": 7.5811630396004276e-06, + "loss": 0.37, + "step": 850 + }, + { + "epoch": 0.015178539578354083, + "grad_norm": 1.465600848197937, + "learning_rate": 7.590082054941135e-06, + "loss": 0.3756, + "step": 851 + }, + { + "epoch": 0.015196375700067778, + "grad_norm": 1.0671945810317993, + "learning_rate": 7.599001070281841e-06, + "loss": 0.3238, + "step": 852 + }, + { + "epoch": 0.015214211821781471, + "grad_norm": 0.9975453019142151, + "learning_rate": 7.607920085622548e-06, + "loss": 0.3849, + "step": 853 + }, + { + "epoch": 0.015232047943495166, + "grad_norm": 1.298552393913269, + "learning_rate": 7.616839100963254e-06, + "loss": 0.4651, + "step": 854 + }, + { + "epoch": 0.015249884065208861, + "grad_norm": 1.0898548364639282, + "learning_rate": 7.62575811630396e-06, + "loss": 0.4237, + "step": 855 + }, + { + "epoch": 0.015267720186922556, + "grad_norm": 1.3157875537872314, + "learning_rate": 7.634677131644667e-06, + "loss": 0.3605, + "step": 856 + }, + { + "epoch": 0.01528555630863625, + "grad_norm": 1.0036447048187256, + "learning_rate": 7.643596146985373e-06, + "loss": 0.3343, + "step": 857 + }, + { + "epoch": 0.015303392430349944, + "grad_norm": 1.3900219202041626, + "learning_rate": 7.65251516232608e-06, + "loss": 0.3951, + "step": 858 + }, + { + "epoch": 0.015321228552063639, + "grad_norm": 0.8167012333869934, + "learning_rate": 7.661434177666786e-06, + "loss": 0.3923, + "step": 859 + }, + { + "epoch": 0.015339064673777334, + "grad_norm": 1.1097334623336792, + "learning_rate": 7.670353193007492e-06, + "loss": 0.3583, + "step": 860 + }, + { + "epoch": 0.015356900795491029, + "grad_norm": 0.9676361083984375, + "learning_rate": 7.679272208348199e-06, + "loss": 0.339, + "step": 861 + }, + { + "epoch": 0.015374736917204724, + "grad_norm": 0.8648974895477295, + "learning_rate": 7.688191223688905e-06, + "loss": 0.3853, + "step": 862 + }, + { + "epoch": 0.015392573038918417, + "grad_norm": 1.5109702348709106, + "learning_rate": 7.697110239029611e-06, + "loss": 0.3522, + "step": 863 + }, + { + "epoch": 0.015410409160632112, + "grad_norm": 1.46962308883667, + "learning_rate": 7.706029254370318e-06, + "loss": 0.4402, + "step": 864 + }, + { + "epoch": 0.015428245282345807, + "grad_norm": 0.7502992153167725, + "learning_rate": 7.714948269711024e-06, + "loss": 0.3603, + "step": 865 + }, + { + "epoch": 0.015446081404059502, + "grad_norm": 0.9077794551849365, + "learning_rate": 7.72386728505173e-06, + "loss": 0.3762, + "step": 866 + }, + { + "epoch": 0.015463917525773196, + "grad_norm": 1.4740957021713257, + "learning_rate": 7.732786300392437e-06, + "loss": 0.3627, + "step": 867 + }, + { + "epoch": 0.01548175364748689, + "grad_norm": 0.938251256942749, + "learning_rate": 7.741705315733143e-06, + "loss": 0.4625, + "step": 868 + }, + { + "epoch": 0.015499589769200585, + "grad_norm": 0.9048126339912415, + "learning_rate": 7.75062433107385e-06, + "loss": 0.3593, + "step": 869 + }, + { + "epoch": 0.01551742589091428, + "grad_norm": 0.9404629468917847, + "learning_rate": 7.759543346414556e-06, + "loss": 0.4342, + "step": 870 + }, + { + "epoch": 0.015535262012627974, + "grad_norm": 1.225550651550293, + "learning_rate": 7.768462361755262e-06, + "loss": 0.4465, + "step": 871 + }, + { + "epoch": 0.01555309813434167, + "grad_norm": 2.438214063644409, + "learning_rate": 7.777381377095969e-06, + "loss": 0.3859, + "step": 872 + }, + { + "epoch": 0.015570934256055362, + "grad_norm": 0.8626007437705994, + "learning_rate": 7.786300392436675e-06, + "loss": 0.5049, + "step": 873 + }, + { + "epoch": 0.015588770377769057, + "grad_norm": 0.9151098728179932, + "learning_rate": 7.795219407777382e-06, + "loss": 0.4445, + "step": 874 + }, + { + "epoch": 0.015606606499482752, + "grad_norm": 1.3421666622161865, + "learning_rate": 7.804138423118088e-06, + "loss": 0.4097, + "step": 875 + }, + { + "epoch": 0.015624442621196447, + "grad_norm": 1.99418044090271, + "learning_rate": 7.813057438458794e-06, + "loss": 0.4548, + "step": 876 + }, + { + "epoch": 0.015642278742910142, + "grad_norm": 1.5321028232574463, + "learning_rate": 7.8219764537995e-06, + "loss": 0.3853, + "step": 877 + }, + { + "epoch": 0.015660114864623837, + "grad_norm": 0.7856518030166626, + "learning_rate": 7.830895469140207e-06, + "loss": 0.3634, + "step": 878 + }, + { + "epoch": 0.015677950986337532, + "grad_norm": 1.006925106048584, + "learning_rate": 7.839814484480913e-06, + "loss": 0.4125, + "step": 879 + }, + { + "epoch": 0.015695787108051227, + "grad_norm": 2.1221554279327393, + "learning_rate": 7.84873349982162e-06, + "loss": 0.415, + "step": 880 + }, + { + "epoch": 0.01571362322976492, + "grad_norm": 0.8153918981552124, + "learning_rate": 7.857652515162326e-06, + "loss": 0.4215, + "step": 881 + }, + { + "epoch": 0.015731459351478613, + "grad_norm": 1.1103639602661133, + "learning_rate": 7.866571530503034e-06, + "loss": 0.4071, + "step": 882 + }, + { + "epoch": 0.015749295473192308, + "grad_norm": 1.2810118198394775, + "learning_rate": 7.875490545843739e-06, + "loss": 0.3632, + "step": 883 + }, + { + "epoch": 0.015767131594906003, + "grad_norm": 0.8250142931938171, + "learning_rate": 7.884409561184445e-06, + "loss": 0.3739, + "step": 884 + }, + { + "epoch": 0.015784967716619698, + "grad_norm": 1.2615044116973877, + "learning_rate": 7.893328576525152e-06, + "loss": 0.4946, + "step": 885 + }, + { + "epoch": 0.015802803838333393, + "grad_norm": 0.9218889474868774, + "learning_rate": 7.902247591865858e-06, + "loss": 0.3791, + "step": 886 + }, + { + "epoch": 0.015820639960047088, + "grad_norm": 1.1563516855239868, + "learning_rate": 7.911166607206564e-06, + "loss": 0.3406, + "step": 887 + }, + { + "epoch": 0.015838476081760783, + "grad_norm": 1.1569664478302002, + "learning_rate": 7.920085622547271e-06, + "loss": 0.3709, + "step": 888 + }, + { + "epoch": 0.015856312203474478, + "grad_norm": 0.928895115852356, + "learning_rate": 7.929004637887979e-06, + "loss": 0.4082, + "step": 889 + }, + { + "epoch": 0.015874148325188173, + "grad_norm": 1.0746430158615112, + "learning_rate": 7.937923653228684e-06, + "loss": 0.3343, + "step": 890 + }, + { + "epoch": 0.015891984446901864, + "grad_norm": 0.9965145587921143, + "learning_rate": 7.94684266856939e-06, + "loss": 0.3528, + "step": 891 + }, + { + "epoch": 0.01590982056861556, + "grad_norm": 1.1421681642532349, + "learning_rate": 7.955761683910096e-06, + "loss": 0.4042, + "step": 892 + }, + { + "epoch": 0.015927656690329254, + "grad_norm": 1.7538610696792603, + "learning_rate": 7.964680699250803e-06, + "loss": 0.4018, + "step": 893 + }, + { + "epoch": 0.01594549281204295, + "grad_norm": 1.7577452659606934, + "learning_rate": 7.97359971459151e-06, + "loss": 0.3575, + "step": 894 + }, + { + "epoch": 0.015963328933756644, + "grad_norm": 0.785850465297699, + "learning_rate": 7.982518729932216e-06, + "loss": 0.4608, + "step": 895 + }, + { + "epoch": 0.01598116505547034, + "grad_norm": 0.99677973985672, + "learning_rate": 7.991437745272924e-06, + "loss": 0.357, + "step": 896 + }, + { + "epoch": 0.015999001177184034, + "grad_norm": 1.9706685543060303, + "learning_rate": 8.000356760613628e-06, + "loss": 0.3966, + "step": 897 + }, + { + "epoch": 0.01601683729889773, + "grad_norm": 0.6862720251083374, + "learning_rate": 8.009275775954335e-06, + "loss": 0.3992, + "step": 898 + }, + { + "epoch": 0.016034673420611423, + "grad_norm": 1.010366678237915, + "learning_rate": 8.018194791295041e-06, + "loss": 0.3913, + "step": 899 + }, + { + "epoch": 0.01605250954232512, + "grad_norm": 1.3852301836013794, + "learning_rate": 8.027113806635749e-06, + "loss": 0.4702, + "step": 900 + }, + { + "epoch": 0.01607034566403881, + "grad_norm": 1.0316989421844482, + "learning_rate": 8.036032821976454e-06, + "loss": 0.4703, + "step": 901 + }, + { + "epoch": 0.016088181785752505, + "grad_norm": 0.9395390748977661, + "learning_rate": 8.04495183731716e-06, + "loss": 0.3565, + "step": 902 + }, + { + "epoch": 0.0161060179074662, + "grad_norm": 0.8489232659339905, + "learning_rate": 8.053870852657867e-06, + "loss": 0.4159, + "step": 903 + }, + { + "epoch": 0.016123854029179895, + "grad_norm": 0.9959050416946411, + "learning_rate": 8.062789867998573e-06, + "loss": 0.4334, + "step": 904 + }, + { + "epoch": 0.01614169015089359, + "grad_norm": 1.297946810722351, + "learning_rate": 8.07170888333928e-06, + "loss": 0.4919, + "step": 905 + }, + { + "epoch": 0.016159526272607284, + "grad_norm": 1.3732280731201172, + "learning_rate": 8.080627898679986e-06, + "loss": 0.3933, + "step": 906 + }, + { + "epoch": 0.01617736239432098, + "grad_norm": 0.8733242750167847, + "learning_rate": 8.089546914020694e-06, + "loss": 0.3343, + "step": 907 + }, + { + "epoch": 0.016195198516034674, + "grad_norm": 1.3466911315917969, + "learning_rate": 8.098465929361398e-06, + "loss": 0.3641, + "step": 908 + }, + { + "epoch": 0.01621303463774837, + "grad_norm": 1.0537189245224, + "learning_rate": 8.107384944702105e-06, + "loss": 0.3967, + "step": 909 + }, + { + "epoch": 0.016230870759462064, + "grad_norm": 1.1082985401153564, + "learning_rate": 8.116303960042811e-06, + "loss": 0.4435, + "step": 910 + }, + { + "epoch": 0.016248706881175756, + "grad_norm": 0.7933439612388611, + "learning_rate": 8.125222975383518e-06, + "loss": 0.3795, + "step": 911 + }, + { + "epoch": 0.01626654300288945, + "grad_norm": 1.3988990783691406, + "learning_rate": 8.134141990724224e-06, + "loss": 0.4147, + "step": 912 + }, + { + "epoch": 0.016284379124603145, + "grad_norm": 0.6847274303436279, + "learning_rate": 8.14306100606493e-06, + "loss": 0.3548, + "step": 913 + }, + { + "epoch": 0.01630221524631684, + "grad_norm": 1.28653883934021, + "learning_rate": 8.151980021405638e-06, + "loss": 0.4874, + "step": 914 + }, + { + "epoch": 0.016320051368030535, + "grad_norm": 0.8804856538772583, + "learning_rate": 8.160899036746343e-06, + "loss": 0.4444, + "step": 915 + }, + { + "epoch": 0.01633788748974423, + "grad_norm": 0.8684842586517334, + "learning_rate": 8.16981805208705e-06, + "loss": 0.3481, + "step": 916 + }, + { + "epoch": 0.016355723611457925, + "grad_norm": 0.8577004671096802, + "learning_rate": 8.178737067427756e-06, + "loss": 0.3945, + "step": 917 + }, + { + "epoch": 0.01637355973317162, + "grad_norm": 1.2012475728988647, + "learning_rate": 8.187656082768464e-06, + "loss": 0.4704, + "step": 918 + }, + { + "epoch": 0.016391395854885315, + "grad_norm": 0.8696227669715881, + "learning_rate": 8.196575098109169e-06, + "loss": 0.343, + "step": 919 + }, + { + "epoch": 0.01640923197659901, + "grad_norm": 1.07411527633667, + "learning_rate": 8.205494113449875e-06, + "loss": 0.411, + "step": 920 + }, + { + "epoch": 0.0164270680983127, + "grad_norm": 0.7849026322364807, + "learning_rate": 8.214413128790583e-06, + "loss": 0.3985, + "step": 921 + }, + { + "epoch": 0.016444904220026396, + "grad_norm": 1.356942057609558, + "learning_rate": 8.223332144131288e-06, + "loss": 0.3756, + "step": 922 + }, + { + "epoch": 0.01646274034174009, + "grad_norm": 0.95868319272995, + "learning_rate": 8.232251159471994e-06, + "loss": 0.4238, + "step": 923 + }, + { + "epoch": 0.016480576463453786, + "grad_norm": 0.9421398043632507, + "learning_rate": 8.2411701748127e-06, + "loss": 0.3874, + "step": 924 + }, + { + "epoch": 0.01649841258516748, + "grad_norm": 1.05433189868927, + "learning_rate": 8.250089190153409e-06, + "loss": 0.4118, + "step": 925 + }, + { + "epoch": 0.016516248706881176, + "grad_norm": 2.117459774017334, + "learning_rate": 8.259008205494113e-06, + "loss": 0.3496, + "step": 926 + }, + { + "epoch": 0.01653408482859487, + "grad_norm": 1.3546380996704102, + "learning_rate": 8.26792722083482e-06, + "loss": 0.4632, + "step": 927 + }, + { + "epoch": 0.016551920950308566, + "grad_norm": 0.9243427515029907, + "learning_rate": 8.276846236175528e-06, + "loss": 0.3983, + "step": 928 + }, + { + "epoch": 0.01656975707202226, + "grad_norm": 0.9842472076416016, + "learning_rate": 8.285765251516232e-06, + "loss": 0.4043, + "step": 929 + }, + { + "epoch": 0.016587593193735956, + "grad_norm": 0.8520750999450684, + "learning_rate": 8.294684266856939e-06, + "loss": 0.386, + "step": 930 + }, + { + "epoch": 0.016605429315449647, + "grad_norm": 1.1591039896011353, + "learning_rate": 8.303603282197645e-06, + "loss": 0.4513, + "step": 931 + }, + { + "epoch": 0.016623265437163342, + "grad_norm": 1.1286969184875488, + "learning_rate": 8.312522297538353e-06, + "loss": 0.358, + "step": 932 + }, + { + "epoch": 0.016641101558877037, + "grad_norm": 0.8448970913887024, + "learning_rate": 8.321441312879058e-06, + "loss": 0.3736, + "step": 933 + }, + { + "epoch": 0.016658937680590732, + "grad_norm": 1.1036772727966309, + "learning_rate": 8.330360328219764e-06, + "loss": 0.378, + "step": 934 + }, + { + "epoch": 0.016676773802304427, + "grad_norm": 1.2469936609268188, + "learning_rate": 8.33927934356047e-06, + "loss": 0.3758, + "step": 935 + }, + { + "epoch": 0.01669460992401812, + "grad_norm": 1.0058586597442627, + "learning_rate": 8.348198358901179e-06, + "loss": 0.3772, + "step": 936 + }, + { + "epoch": 0.016712446045731816, + "grad_norm": 1.0551007986068726, + "learning_rate": 8.357117374241884e-06, + "loss": 0.3603, + "step": 937 + }, + { + "epoch": 0.01673028216744551, + "grad_norm": 1.0873944759368896, + "learning_rate": 8.36603638958259e-06, + "loss": 0.3745, + "step": 938 + }, + { + "epoch": 0.016748118289159206, + "grad_norm": 1.276474952697754, + "learning_rate": 8.374955404923298e-06, + "loss": 0.3629, + "step": 939 + }, + { + "epoch": 0.0167659544108729, + "grad_norm": 0.892585277557373, + "learning_rate": 8.383874420264003e-06, + "loss": 0.409, + "step": 940 + }, + { + "epoch": 0.016783790532586593, + "grad_norm": 1.0321881771087646, + "learning_rate": 8.392793435604709e-06, + "loss": 0.332, + "step": 941 + }, + { + "epoch": 0.016801626654300288, + "grad_norm": 1.2248462438583374, + "learning_rate": 8.401712450945415e-06, + "loss": 0.3936, + "step": 942 + }, + { + "epoch": 0.016819462776013983, + "grad_norm": 1.0450223684310913, + "learning_rate": 8.410631466286123e-06, + "loss": 0.3439, + "step": 943 + }, + { + "epoch": 0.016837298897727677, + "grad_norm": 2.632742166519165, + "learning_rate": 8.419550481626828e-06, + "loss": 0.3911, + "step": 944 + }, + { + "epoch": 0.016855135019441372, + "grad_norm": 0.907788872718811, + "learning_rate": 8.428469496967535e-06, + "loss": 0.3474, + "step": 945 + }, + { + "epoch": 0.016872971141155067, + "grad_norm": 0.9532281756401062, + "learning_rate": 8.437388512308243e-06, + "loss": 0.4082, + "step": 946 + }, + { + "epoch": 0.016890807262868762, + "grad_norm": 0.7200486063957214, + "learning_rate": 8.446307527648947e-06, + "loss": 0.3763, + "step": 947 + }, + { + "epoch": 0.016908643384582457, + "grad_norm": 0.8066374659538269, + "learning_rate": 8.455226542989654e-06, + "loss": 0.4267, + "step": 948 + }, + { + "epoch": 0.016926479506296152, + "grad_norm": 0.9796708822250366, + "learning_rate": 8.46414555833036e-06, + "loss": 0.4264, + "step": 949 + }, + { + "epoch": 0.016944315628009847, + "grad_norm": 1.1514612436294556, + "learning_rate": 8.473064573671068e-06, + "loss": 0.3832, + "step": 950 + }, + { + "epoch": 0.01696215174972354, + "grad_norm": 1.1260454654693604, + "learning_rate": 8.481983589011773e-06, + "loss": 0.3337, + "step": 951 + }, + { + "epoch": 0.016979987871437233, + "grad_norm": 1.0035359859466553, + "learning_rate": 8.49090260435248e-06, + "loss": 0.4341, + "step": 952 + }, + { + "epoch": 0.016997823993150928, + "grad_norm": 0.8457959890365601, + "learning_rate": 8.499821619693187e-06, + "loss": 0.4132, + "step": 953 + }, + { + "epoch": 0.017015660114864623, + "grad_norm": 1.1532012224197388, + "learning_rate": 8.508740635033894e-06, + "loss": 0.3535, + "step": 954 + }, + { + "epoch": 0.017033496236578318, + "grad_norm": 0.841249942779541, + "learning_rate": 8.517659650374598e-06, + "loss": 0.3795, + "step": 955 + }, + { + "epoch": 0.017051332358292013, + "grad_norm": 0.74698805809021, + "learning_rate": 8.526578665715305e-06, + "loss": 0.4153, + "step": 956 + }, + { + "epoch": 0.017069168480005708, + "grad_norm": 1.704779028892517, + "learning_rate": 8.535497681056013e-06, + "loss": 0.4107, + "step": 957 + }, + { + "epoch": 0.017087004601719403, + "grad_norm": 1.5854723453521729, + "learning_rate": 8.544416696396718e-06, + "loss": 0.3867, + "step": 958 + }, + { + "epoch": 0.017104840723433098, + "grad_norm": 2.241612672805786, + "learning_rate": 8.553335711737424e-06, + "loss": 0.3918, + "step": 959 + }, + { + "epoch": 0.017122676845146793, + "grad_norm": 0.8506503701210022, + "learning_rate": 8.562254727078132e-06, + "loss": 0.3158, + "step": 960 + }, + { + "epoch": 0.017140512966860484, + "grad_norm": 0.7391118407249451, + "learning_rate": 8.571173742418838e-06, + "loss": 0.352, + "step": 961 + }, + { + "epoch": 0.01715834908857418, + "grad_norm": 0.9272975325584412, + "learning_rate": 8.580092757759543e-06, + "loss": 0.4031, + "step": 962 + }, + { + "epoch": 0.017176185210287874, + "grad_norm": 2.544142007827759, + "learning_rate": 8.58901177310025e-06, + "loss": 0.4406, + "step": 963 + }, + { + "epoch": 0.01719402133200157, + "grad_norm": 1.7289552688598633, + "learning_rate": 8.597930788440957e-06, + "loss": 0.3602, + "step": 964 + }, + { + "epoch": 0.017211857453715264, + "grad_norm": 1.2334113121032715, + "learning_rate": 8.606849803781662e-06, + "loss": 0.3795, + "step": 965 + }, + { + "epoch": 0.01722969357542896, + "grad_norm": 0.8848539590835571, + "learning_rate": 8.615768819122369e-06, + "loss": 0.2925, + "step": 966 + }, + { + "epoch": 0.017247529697142654, + "grad_norm": 0.8424369096755981, + "learning_rate": 8.624687834463075e-06, + "loss": 0.3758, + "step": 967 + }, + { + "epoch": 0.01726536581885635, + "grad_norm": 0.876915693283081, + "learning_rate": 8.633606849803783e-06, + "loss": 0.4027, + "step": 968 + }, + { + "epoch": 0.017283201940570043, + "grad_norm": 1.4270676374435425, + "learning_rate": 8.642525865144488e-06, + "loss": 0.3421, + "step": 969 + }, + { + "epoch": 0.01730103806228374, + "grad_norm": 0.8328204154968262, + "learning_rate": 8.651444880485194e-06, + "loss": 0.3745, + "step": 970 + }, + { + "epoch": 0.01731887418399743, + "grad_norm": 1.0704902410507202, + "learning_rate": 8.660363895825902e-06, + "loss": 0.3493, + "step": 971 + }, + { + "epoch": 0.017336710305711125, + "grad_norm": 0.818196177482605, + "learning_rate": 8.669282911166609e-06, + "loss": 0.3426, + "step": 972 + }, + { + "epoch": 0.01735454642742482, + "grad_norm": 0.8499037027359009, + "learning_rate": 8.678201926507313e-06, + "loss": 0.3264, + "step": 973 + }, + { + "epoch": 0.017372382549138515, + "grad_norm": 1.086616039276123, + "learning_rate": 8.68712094184802e-06, + "loss": 0.3822, + "step": 974 + }, + { + "epoch": 0.01739021867085221, + "grad_norm": 0.8740732073783875, + "learning_rate": 8.696039957188728e-06, + "loss": 0.3772, + "step": 975 + }, + { + "epoch": 0.017408054792565904, + "grad_norm": 1.0218117237091064, + "learning_rate": 8.704958972529432e-06, + "loss": 0.3795, + "step": 976 + }, + { + "epoch": 0.0174258909142796, + "grad_norm": 1.5836724042892456, + "learning_rate": 8.713877987870139e-06, + "loss": 0.4078, + "step": 977 + }, + { + "epoch": 0.017443727035993294, + "grad_norm": 0.7581110000610352, + "learning_rate": 8.722797003210847e-06, + "loss": 0.3992, + "step": 978 + }, + { + "epoch": 0.01746156315770699, + "grad_norm": 1.0162327289581299, + "learning_rate": 8.731716018551553e-06, + "loss": 0.3754, + "step": 979 + }, + { + "epoch": 0.017479399279420684, + "grad_norm": 1.0005271434783936, + "learning_rate": 8.740635033892258e-06, + "loss": 0.3391, + "step": 980 + }, + { + "epoch": 0.01749723540113438, + "grad_norm": 0.902283251285553, + "learning_rate": 8.749554049232964e-06, + "loss": 0.3565, + "step": 981 + }, + { + "epoch": 0.01751507152284807, + "grad_norm": 0.7696515321731567, + "learning_rate": 8.758473064573672e-06, + "loss": 0.4062, + "step": 982 + }, + { + "epoch": 0.017532907644561765, + "grad_norm": 0.7952608466148376, + "learning_rate": 8.767392079914379e-06, + "loss": 0.3125, + "step": 983 + }, + { + "epoch": 0.01755074376627546, + "grad_norm": 0.9154542684555054, + "learning_rate": 8.776311095255083e-06, + "loss": 0.3663, + "step": 984 + }, + { + "epoch": 0.017568579887989155, + "grad_norm": 0.9744145274162292, + "learning_rate": 8.785230110595791e-06, + "loss": 0.4255, + "step": 985 + }, + { + "epoch": 0.01758641600970285, + "grad_norm": 1.046738862991333, + "learning_rate": 8.794149125936498e-06, + "loss": 0.3386, + "step": 986 + }, + { + "epoch": 0.017604252131416545, + "grad_norm": 1.0191091299057007, + "learning_rate": 8.803068141277203e-06, + "loss": 0.398, + "step": 987 + }, + { + "epoch": 0.01762208825313024, + "grad_norm": 1.1875132322311401, + "learning_rate": 8.811987156617909e-06, + "loss": 0.4394, + "step": 988 + }, + { + "epoch": 0.017639924374843935, + "grad_norm": 0.9273723363876343, + "learning_rate": 8.820906171958617e-06, + "loss": 0.3593, + "step": 989 + }, + { + "epoch": 0.01765776049655763, + "grad_norm": 2.2347023487091064, + "learning_rate": 8.829825187299323e-06, + "loss": 0.3727, + "step": 990 + }, + { + "epoch": 0.017675596618271325, + "grad_norm": 0.8893629908561707, + "learning_rate": 8.838744202640028e-06, + "loss": 0.4047, + "step": 991 + }, + { + "epoch": 0.017693432739985016, + "grad_norm": 1.4326751232147217, + "learning_rate": 8.847663217980736e-06, + "loss": 0.4774, + "step": 992 + }, + { + "epoch": 0.01771126886169871, + "grad_norm": 1.0148738622665405, + "learning_rate": 8.856582233321443e-06, + "loss": 0.4132, + "step": 993 + }, + { + "epoch": 0.017729104983412406, + "grad_norm": 1.1211464405059814, + "learning_rate": 8.865501248662147e-06, + "loss": 0.3738, + "step": 994 + }, + { + "epoch": 0.0177469411051261, + "grad_norm": 0.6987782120704651, + "learning_rate": 8.874420264002854e-06, + "loss": 0.3284, + "step": 995 + }, + { + "epoch": 0.017764777226839796, + "grad_norm": 0.861219048500061, + "learning_rate": 8.883339279343562e-06, + "loss": 0.3263, + "step": 996 + }, + { + "epoch": 0.01778261334855349, + "grad_norm": 1.0110158920288086, + "learning_rate": 8.892258294684268e-06, + "loss": 0.4485, + "step": 997 + }, + { + "epoch": 0.017800449470267186, + "grad_norm": 1.0399854183197021, + "learning_rate": 8.901177310024973e-06, + "loss": 0.4472, + "step": 998 + }, + { + "epoch": 0.01781828559198088, + "grad_norm": 0.9171674251556396, + "learning_rate": 8.91009632536568e-06, + "loss": 0.3886, + "step": 999 + }, + { + "epoch": 0.017836121713694576, + "grad_norm": 1.298419713973999, + "learning_rate": 8.919015340706387e-06, + "loss": 0.3966, + "step": 1000 + }, + { + "epoch": 0.017836121713694576, + "eval_loss": 0.33756929636001587, + "eval_runtime": 275.5663, + "eval_samples_per_second": 3.716, + "eval_steps_per_second": 0.621, + "step": 1000 + }, + { + "epoch": 0.01785395783540827, + "grad_norm": 0.8770831227302551, + "learning_rate": 8.927934356047094e-06, + "loss": 0.3391, + "step": 1001 + }, + { + "epoch": 0.017871793957121962, + "grad_norm": 3.754714250564575, + "learning_rate": 8.936853371387798e-06, + "loss": 0.3939, + "step": 1002 + }, + { + "epoch": 0.017889630078835657, + "grad_norm": 0.8046690821647644, + "learning_rate": 8.945772386728506e-06, + "loss": 0.2857, + "step": 1003 + }, + { + "epoch": 0.017907466200549352, + "grad_norm": 1.0197075605392456, + "learning_rate": 8.954691402069213e-06, + "loss": 0.3775, + "step": 1004 + }, + { + "epoch": 0.017925302322263047, + "grad_norm": 1.0234031677246094, + "learning_rate": 8.963610417409917e-06, + "loss": 0.3761, + "step": 1005 + }, + { + "epoch": 0.01794313844397674, + "grad_norm": 2.2465391159057617, + "learning_rate": 8.972529432750624e-06, + "loss": 0.3777, + "step": 1006 + }, + { + "epoch": 0.017960974565690437, + "grad_norm": 0.9311102628707886, + "learning_rate": 8.981448448091332e-06, + "loss": 0.3959, + "step": 1007 + }, + { + "epoch": 0.01797881068740413, + "grad_norm": 0.8143541216850281, + "learning_rate": 8.990367463432038e-06, + "loss": 0.4143, + "step": 1008 + }, + { + "epoch": 0.017996646809117826, + "grad_norm": 0.8146178722381592, + "learning_rate": 8.999286478772743e-06, + "loss": 0.4048, + "step": 1009 + }, + { + "epoch": 0.01801448293083152, + "grad_norm": 0.7817711234092712, + "learning_rate": 9.008205494113451e-06, + "loss": 0.2997, + "step": 1010 + }, + { + "epoch": 0.018032319052545216, + "grad_norm": 1.109965205192566, + "learning_rate": 9.017124509454157e-06, + "loss": 0.3481, + "step": 1011 + }, + { + "epoch": 0.018050155174258908, + "grad_norm": 0.6897335648536682, + "learning_rate": 9.026043524794862e-06, + "loss": 0.332, + "step": 1012 + }, + { + "epoch": 0.018067991295972603, + "grad_norm": 1.485617756843567, + "learning_rate": 9.034962540135568e-06, + "loss": 0.3758, + "step": 1013 + }, + { + "epoch": 0.018085827417686298, + "grad_norm": 0.9818662405014038, + "learning_rate": 9.043881555476277e-06, + "loss": 0.3444, + "step": 1014 + }, + { + "epoch": 0.018103663539399992, + "grad_norm": 0.9142575263977051, + "learning_rate": 9.052800570816983e-06, + "loss": 0.4152, + "step": 1015 + }, + { + "epoch": 0.018121499661113687, + "grad_norm": 1.0542312860488892, + "learning_rate": 9.061719586157688e-06, + "loss": 0.4386, + "step": 1016 + }, + { + "epoch": 0.018139335782827382, + "grad_norm": 1.3937829732894897, + "learning_rate": 9.070638601498396e-06, + "loss": 0.4342, + "step": 1017 + }, + { + "epoch": 0.018157171904541077, + "grad_norm": 0.78803950548172, + "learning_rate": 9.079557616839102e-06, + "loss": 0.402, + "step": 1018 + }, + { + "epoch": 0.018175008026254772, + "grad_norm": 1.7907694578170776, + "learning_rate": 9.088476632179808e-06, + "loss": 0.4358, + "step": 1019 + }, + { + "epoch": 0.018192844147968467, + "grad_norm": 0.5500625371932983, + "learning_rate": 9.097395647520513e-06, + "loss": 0.3381, + "step": 1020 + }, + { + "epoch": 0.018210680269682162, + "grad_norm": 0.7231621146202087, + "learning_rate": 9.106314662861221e-06, + "loss": 0.3358, + "step": 1021 + }, + { + "epoch": 0.018228516391395853, + "grad_norm": 0.8248460292816162, + "learning_rate": 9.115233678201928e-06, + "loss": 0.4092, + "step": 1022 + }, + { + "epoch": 0.01824635251310955, + "grad_norm": 1.1761342287063599, + "learning_rate": 9.124152693542632e-06, + "loss": 0.3016, + "step": 1023 + }, + { + "epoch": 0.018264188634823243, + "grad_norm": 0.8809008598327637, + "learning_rate": 9.13307170888334e-06, + "loss": 0.3401, + "step": 1024 + }, + { + "epoch": 0.018282024756536938, + "grad_norm": 0.9658706784248352, + "learning_rate": 9.141990724224047e-06, + "loss": 0.3635, + "step": 1025 + }, + { + "epoch": 0.018299860878250633, + "grad_norm": 1.1166636943817139, + "learning_rate": 9.150909739564753e-06, + "loss": 0.4423, + "step": 1026 + }, + { + "epoch": 0.018317696999964328, + "grad_norm": 0.8470121026039124, + "learning_rate": 9.159828754905458e-06, + "loss": 0.4163, + "step": 1027 + }, + { + "epoch": 0.018335533121678023, + "grad_norm": 0.7772458791732788, + "learning_rate": 9.168747770246166e-06, + "loss": 0.4036, + "step": 1028 + }, + { + "epoch": 0.018353369243391718, + "grad_norm": 1.0990639925003052, + "learning_rate": 9.177666785586872e-06, + "loss": 0.3164, + "step": 1029 + }, + { + "epoch": 0.018371205365105413, + "grad_norm": 0.9506021738052368, + "learning_rate": 9.186585800927577e-06, + "loss": 0.3709, + "step": 1030 + }, + { + "epoch": 0.018389041486819108, + "grad_norm": 0.909835934638977, + "learning_rate": 9.195504816268285e-06, + "loss": 0.3815, + "step": 1031 + }, + { + "epoch": 0.0184068776085328, + "grad_norm": 0.7399672865867615, + "learning_rate": 9.204423831608991e-06, + "loss": 0.3234, + "step": 1032 + }, + { + "epoch": 0.018424713730246494, + "grad_norm": 1.0614478588104248, + "learning_rate": 9.213342846949698e-06, + "loss": 0.3931, + "step": 1033 + }, + { + "epoch": 0.01844254985196019, + "grad_norm": 0.6906810402870178, + "learning_rate": 9.222261862290402e-06, + "loss": 0.3074, + "step": 1034 + }, + { + "epoch": 0.018460385973673884, + "grad_norm": 0.7197563648223877, + "learning_rate": 9.23118087763111e-06, + "loss": 0.3373, + "step": 1035 + }, + { + "epoch": 0.01847822209538758, + "grad_norm": 0.9273034334182739, + "learning_rate": 9.240099892971817e-06, + "loss": 0.4292, + "step": 1036 + }, + { + "epoch": 0.018496058217101274, + "grad_norm": 1.1073459386825562, + "learning_rate": 9.249018908312523e-06, + "loss": 0.2683, + "step": 1037 + }, + { + "epoch": 0.01851389433881497, + "grad_norm": 1.104052186012268, + "learning_rate": 9.257937923653228e-06, + "loss": 0.4062, + "step": 1038 + }, + { + "epoch": 0.018531730460528664, + "grad_norm": 2.026197910308838, + "learning_rate": 9.266856938993936e-06, + "loss": 0.3907, + "step": 1039 + }, + { + "epoch": 0.01854956658224236, + "grad_norm": 1.305974006652832, + "learning_rate": 9.275775954334642e-06, + "loss": 0.3665, + "step": 1040 + }, + { + "epoch": 0.018567402703956053, + "grad_norm": 0.6929371356964111, + "learning_rate": 9.284694969675347e-06, + "loss": 0.3842, + "step": 1041 + }, + { + "epoch": 0.018585238825669745, + "grad_norm": 0.6076000332832336, + "learning_rate": 9.293613985016055e-06, + "loss": 0.3296, + "step": 1042 + }, + { + "epoch": 0.01860307494738344, + "grad_norm": 1.3677473068237305, + "learning_rate": 9.302533000356762e-06, + "loss": 0.3563, + "step": 1043 + }, + { + "epoch": 0.018620911069097135, + "grad_norm": 1.170035719871521, + "learning_rate": 9.311452015697468e-06, + "loss": 0.401, + "step": 1044 + }, + { + "epoch": 0.01863874719081083, + "grad_norm": 1.1717208623886108, + "learning_rate": 9.320371031038173e-06, + "loss": 0.4063, + "step": 1045 + }, + { + "epoch": 0.018656583312524525, + "grad_norm": 0.9528186917304993, + "learning_rate": 9.32929004637888e-06, + "loss": 0.3779, + "step": 1046 + }, + { + "epoch": 0.01867441943423822, + "grad_norm": 0.8370277285575867, + "learning_rate": 9.338209061719587e-06, + "loss": 0.4015, + "step": 1047 + }, + { + "epoch": 0.018692255555951914, + "grad_norm": 0.793533980846405, + "learning_rate": 9.347128077060292e-06, + "loss": 0.3512, + "step": 1048 + }, + { + "epoch": 0.01871009167766561, + "grad_norm": 0.9082489609718323, + "learning_rate": 9.356047092401e-06, + "loss": 0.3966, + "step": 1049 + }, + { + "epoch": 0.018727927799379304, + "grad_norm": 0.9899812340736389, + "learning_rate": 9.364966107741706e-06, + "loss": 0.3731, + "step": 1050 + }, + { + "epoch": 0.018745763921093, + "grad_norm": 4.113532066345215, + "learning_rate": 9.373885123082413e-06, + "loss": 0.4431, + "step": 1051 + }, + { + "epoch": 0.01876360004280669, + "grad_norm": 0.5420061945915222, + "learning_rate": 9.382804138423117e-06, + "loss": 0.303, + "step": 1052 + }, + { + "epoch": 0.018781436164520385, + "grad_norm": 0.7249712347984314, + "learning_rate": 9.391723153763825e-06, + "loss": 0.3765, + "step": 1053 + }, + { + "epoch": 0.01879927228623408, + "grad_norm": 0.8847532272338867, + "learning_rate": 9.400642169104532e-06, + "loss": 0.3782, + "step": 1054 + }, + { + "epoch": 0.018817108407947775, + "grad_norm": 0.8722662925720215, + "learning_rate": 9.409561184445238e-06, + "loss": 0.334, + "step": 1055 + }, + { + "epoch": 0.01883494452966147, + "grad_norm": 0.6470535397529602, + "learning_rate": 9.418480199785944e-06, + "loss": 0.4192, + "step": 1056 + }, + { + "epoch": 0.018852780651375165, + "grad_norm": 0.7459009289741516, + "learning_rate": 9.427399215126651e-06, + "loss": 0.4015, + "step": 1057 + }, + { + "epoch": 0.01887061677308886, + "grad_norm": 1.0923402309417725, + "learning_rate": 9.436318230467357e-06, + "loss": 0.3711, + "step": 1058 + }, + { + "epoch": 0.018888452894802555, + "grad_norm": 0.802423357963562, + "learning_rate": 9.445237245808062e-06, + "loss": 0.3794, + "step": 1059 + }, + { + "epoch": 0.01890628901651625, + "grad_norm": 0.9659205079078674, + "learning_rate": 9.45415626114877e-06, + "loss": 0.4019, + "step": 1060 + }, + { + "epoch": 0.018924125138229945, + "grad_norm": 0.9830519556999207, + "learning_rate": 9.463075276489476e-06, + "loss": 0.3762, + "step": 1061 + }, + { + "epoch": 0.018941961259943636, + "grad_norm": 0.6956446766853333, + "learning_rate": 9.471994291830183e-06, + "loss": 0.3534, + "step": 1062 + }, + { + "epoch": 0.01895979738165733, + "grad_norm": 1.0788321495056152, + "learning_rate": 9.480913307170889e-06, + "loss": 0.3809, + "step": 1063 + }, + { + "epoch": 0.018977633503371026, + "grad_norm": 0.707240641117096, + "learning_rate": 9.489832322511596e-06, + "loss": 0.3505, + "step": 1064 + }, + { + "epoch": 0.01899546962508472, + "grad_norm": 0.8199694156646729, + "learning_rate": 9.498751337852302e-06, + "loss": 0.4184, + "step": 1065 + }, + { + "epoch": 0.019013305746798416, + "grad_norm": 0.7357836961746216, + "learning_rate": 9.507670353193008e-06, + "loss": 0.3826, + "step": 1066 + }, + { + "epoch": 0.01903114186851211, + "grad_norm": 0.7746564149856567, + "learning_rate": 9.516589368533715e-06, + "loss": 0.3603, + "step": 1067 + }, + { + "epoch": 0.019048977990225806, + "grad_norm": 0.7702838778495789, + "learning_rate": 9.525508383874421e-06, + "loss": 0.4102, + "step": 1068 + }, + { + "epoch": 0.0190668141119395, + "grad_norm": 0.7079132795333862, + "learning_rate": 9.534427399215127e-06, + "loss": 0.305, + "step": 1069 + }, + { + "epoch": 0.019084650233653196, + "grad_norm": 0.9278112649917603, + "learning_rate": 9.543346414555832e-06, + "loss": 0.449, + "step": 1070 + }, + { + "epoch": 0.01910248635536689, + "grad_norm": 0.8345721364021301, + "learning_rate": 9.55226542989654e-06, + "loss": 0.3828, + "step": 1071 + }, + { + "epoch": 0.019120322477080582, + "grad_norm": 1.123867154121399, + "learning_rate": 9.561184445237247e-06, + "loss": 0.4336, + "step": 1072 + }, + { + "epoch": 0.019138158598794277, + "grad_norm": 1.0548595190048218, + "learning_rate": 9.570103460577953e-06, + "loss": 0.3897, + "step": 1073 + }, + { + "epoch": 0.019155994720507972, + "grad_norm": 0.8144445419311523, + "learning_rate": 9.57902247591866e-06, + "loss": 0.4039, + "step": 1074 + }, + { + "epoch": 0.019173830842221667, + "grad_norm": 0.8764538168907166, + "learning_rate": 9.587941491259366e-06, + "loss": 0.4016, + "step": 1075 + }, + { + "epoch": 0.01919166696393536, + "grad_norm": 0.8950740694999695, + "learning_rate": 9.596860506600072e-06, + "loss": 0.3898, + "step": 1076 + }, + { + "epoch": 0.019209503085649057, + "grad_norm": 0.746254026889801, + "learning_rate": 9.605779521940777e-06, + "loss": 0.3414, + "step": 1077 + }, + { + "epoch": 0.01922733920736275, + "grad_norm": 0.614499032497406, + "learning_rate": 9.614698537281485e-06, + "loss": 0.3527, + "step": 1078 + }, + { + "epoch": 0.019245175329076446, + "grad_norm": 0.626251220703125, + "learning_rate": 9.623617552622191e-06, + "loss": 0.3094, + "step": 1079 + }, + { + "epoch": 0.01926301145079014, + "grad_norm": 0.8994700312614441, + "learning_rate": 9.632536567962898e-06, + "loss": 0.3647, + "step": 1080 + }, + { + "epoch": 0.019280847572503836, + "grad_norm": 0.8249965906143188, + "learning_rate": 9.641455583303604e-06, + "loss": 0.3451, + "step": 1081 + }, + { + "epoch": 0.019298683694217528, + "grad_norm": 0.6884198784828186, + "learning_rate": 9.65037459864431e-06, + "loss": 0.3033, + "step": 1082 + }, + { + "epoch": 0.019316519815931223, + "grad_norm": 0.80061936378479, + "learning_rate": 9.659293613985017e-06, + "loss": 0.3434, + "step": 1083 + }, + { + "epoch": 0.019334355937644918, + "grad_norm": 0.8543218970298767, + "learning_rate": 9.668212629325723e-06, + "loss": 0.3605, + "step": 1084 + }, + { + "epoch": 0.019352192059358612, + "grad_norm": 0.8076745867729187, + "learning_rate": 9.67713164466643e-06, + "loss": 0.3879, + "step": 1085 + }, + { + "epoch": 0.019370028181072307, + "grad_norm": 1.2448673248291016, + "learning_rate": 9.686050660007136e-06, + "loss": 0.4107, + "step": 1086 + }, + { + "epoch": 0.019387864302786002, + "grad_norm": 0.6801688075065613, + "learning_rate": 9.694969675347842e-06, + "loss": 0.3263, + "step": 1087 + }, + { + "epoch": 0.019405700424499697, + "grad_norm": 0.7092694044113159, + "learning_rate": 9.703888690688549e-06, + "loss": 0.3444, + "step": 1088 + }, + { + "epoch": 0.019423536546213392, + "grad_norm": 0.6756287813186646, + "learning_rate": 9.712807706029255e-06, + "loss": 0.3616, + "step": 1089 + }, + { + "epoch": 0.019441372667927087, + "grad_norm": 0.589930534362793, + "learning_rate": 9.721726721369961e-06, + "loss": 0.3845, + "step": 1090 + }, + { + "epoch": 0.019459208789640782, + "grad_norm": 0.7394228577613831, + "learning_rate": 9.730645736710668e-06, + "loss": 0.3178, + "step": 1091 + }, + { + "epoch": 0.019477044911354473, + "grad_norm": 0.8474282622337341, + "learning_rate": 9.739564752051374e-06, + "loss": 0.4054, + "step": 1092 + }, + { + "epoch": 0.01949488103306817, + "grad_norm": 0.8308938145637512, + "learning_rate": 9.74848376739208e-06, + "loss": 0.4189, + "step": 1093 + }, + { + "epoch": 0.019512717154781863, + "grad_norm": 0.7115844488143921, + "learning_rate": 9.757402782732787e-06, + "loss": 0.3978, + "step": 1094 + }, + { + "epoch": 0.019530553276495558, + "grad_norm": 0.948319673538208, + "learning_rate": 9.766321798073493e-06, + "loss": 0.3886, + "step": 1095 + }, + { + "epoch": 0.019548389398209253, + "grad_norm": 1.2271658182144165, + "learning_rate": 9.7752408134142e-06, + "loss": 0.4676, + "step": 1096 + }, + { + "epoch": 0.019566225519922948, + "grad_norm": 1.0044245719909668, + "learning_rate": 9.784159828754906e-06, + "loss": 0.3595, + "step": 1097 + }, + { + "epoch": 0.019584061641636643, + "grad_norm": 0.5866712927818298, + "learning_rate": 9.793078844095612e-06, + "loss": 0.3656, + "step": 1098 + }, + { + "epoch": 0.019601897763350338, + "grad_norm": 0.7712075710296631, + "learning_rate": 9.801997859436319e-06, + "loss": 0.3324, + "step": 1099 + }, + { + "epoch": 0.019619733885064033, + "grad_norm": 0.8975673913955688, + "learning_rate": 9.810916874777025e-06, + "loss": 0.4464, + "step": 1100 + }, + { + "epoch": 0.019637570006777728, + "grad_norm": 0.6946120262145996, + "learning_rate": 9.819835890117732e-06, + "loss": 0.3799, + "step": 1101 + }, + { + "epoch": 0.01965540612849142, + "grad_norm": 1.1008415222167969, + "learning_rate": 9.828754905458438e-06, + "loss": 0.3513, + "step": 1102 + }, + { + "epoch": 0.019673242250205114, + "grad_norm": 0.7117588520050049, + "learning_rate": 9.837673920799144e-06, + "loss": 0.3698, + "step": 1103 + }, + { + "epoch": 0.01969107837191881, + "grad_norm": 0.8079916834831238, + "learning_rate": 9.84659293613985e-06, + "loss": 0.3122, + "step": 1104 + }, + { + "epoch": 0.019708914493632504, + "grad_norm": 0.7888576984405518, + "learning_rate": 9.855511951480557e-06, + "loss": 0.3777, + "step": 1105 + }, + { + "epoch": 0.0197267506153462, + "grad_norm": 0.5892645120620728, + "learning_rate": 9.864430966821263e-06, + "loss": 0.2792, + "step": 1106 + }, + { + "epoch": 0.019744586737059894, + "grad_norm": 0.869814932346344, + "learning_rate": 9.87334998216197e-06, + "loss": 0.3784, + "step": 1107 + }, + { + "epoch": 0.01976242285877359, + "grad_norm": 0.7885927557945251, + "learning_rate": 9.882268997502676e-06, + "loss": 0.388, + "step": 1108 + }, + { + "epoch": 0.019780258980487284, + "grad_norm": 0.6155068278312683, + "learning_rate": 9.891188012843383e-06, + "loss": 0.3696, + "step": 1109 + }, + { + "epoch": 0.01979809510220098, + "grad_norm": 0.8185555934906006, + "learning_rate": 9.900107028184089e-06, + "loss": 0.3531, + "step": 1110 + }, + { + "epoch": 0.019815931223914673, + "grad_norm": 0.7030999660491943, + "learning_rate": 9.909026043524795e-06, + "loss": 0.4374, + "step": 1111 + }, + { + "epoch": 0.019833767345628365, + "grad_norm": 0.6789141893386841, + "learning_rate": 9.917945058865502e-06, + "loss": 0.3667, + "step": 1112 + }, + { + "epoch": 0.01985160346734206, + "grad_norm": 1.3756961822509766, + "learning_rate": 9.926864074206208e-06, + "loss": 0.4171, + "step": 1113 + }, + { + "epoch": 0.019869439589055755, + "grad_norm": 0.7162296772003174, + "learning_rate": 9.935783089546915e-06, + "loss": 0.3242, + "step": 1114 + }, + { + "epoch": 0.01988727571076945, + "grad_norm": 0.8953533172607422, + "learning_rate": 9.944702104887621e-06, + "loss": 0.4475, + "step": 1115 + }, + { + "epoch": 0.019905111832483145, + "grad_norm": 0.8703616261482239, + "learning_rate": 9.953621120228327e-06, + "loss": 0.3667, + "step": 1116 + }, + { + "epoch": 0.01992294795419684, + "grad_norm": 1.4214484691619873, + "learning_rate": 9.962540135569034e-06, + "loss": 0.3954, + "step": 1117 + }, + { + "epoch": 0.019940784075910534, + "grad_norm": 0.8139647841453552, + "learning_rate": 9.97145915090974e-06, + "loss": 0.416, + "step": 1118 + }, + { + "epoch": 0.01995862019762423, + "grad_norm": 1.535498023033142, + "learning_rate": 9.980378166250446e-06, + "loss": 0.4378, + "step": 1119 + }, + { + "epoch": 0.019976456319337924, + "grad_norm": 0.7993893623352051, + "learning_rate": 9.989297181591153e-06, + "loss": 0.3323, + "step": 1120 + }, + { + "epoch": 0.01999429244105162, + "grad_norm": 0.7614473700523376, + "learning_rate": 9.99821619693186e-06, + "loss": 0.3553, + "step": 1121 + }, + { + "epoch": 0.02001212856276531, + "grad_norm": 1.301061987876892, + "learning_rate": 1.0007135212272566e-05, + "loss": 0.3721, + "step": 1122 + }, + { + "epoch": 0.020029964684479006, + "grad_norm": 0.9857878088951111, + "learning_rate": 1.0016054227613272e-05, + "loss": 0.3418, + "step": 1123 + }, + { + "epoch": 0.0200478008061927, + "grad_norm": 0.6484463810920715, + "learning_rate": 1.0024973242953978e-05, + "loss": 0.3097, + "step": 1124 + }, + { + "epoch": 0.020065636927906395, + "grad_norm": 0.82742840051651, + "learning_rate": 1.0033892258294685e-05, + "loss": 0.3952, + "step": 1125 + }, + { + "epoch": 0.02008347304962009, + "grad_norm": 0.798607587814331, + "learning_rate": 1.0042811273635391e-05, + "loss": 0.3306, + "step": 1126 + }, + { + "epoch": 0.020101309171333785, + "grad_norm": 1.0455095767974854, + "learning_rate": 1.0051730288976097e-05, + "loss": 0.4239, + "step": 1127 + }, + { + "epoch": 0.02011914529304748, + "grad_norm": 1.093558669090271, + "learning_rate": 1.0060649304316804e-05, + "loss": 0.3184, + "step": 1128 + }, + { + "epoch": 0.020136981414761175, + "grad_norm": 0.8237136602401733, + "learning_rate": 1.006956831965751e-05, + "loss": 0.3713, + "step": 1129 + }, + { + "epoch": 0.02015481753647487, + "grad_norm": 0.7110701203346252, + "learning_rate": 1.0078487334998217e-05, + "loss": 0.367, + "step": 1130 + }, + { + "epoch": 0.020172653658188565, + "grad_norm": 0.6957160830497742, + "learning_rate": 1.0087406350338923e-05, + "loss": 0.3648, + "step": 1131 + }, + { + "epoch": 0.020190489779902256, + "grad_norm": 1.1493804454803467, + "learning_rate": 1.009632536567963e-05, + "loss": 0.4201, + "step": 1132 + }, + { + "epoch": 0.02020832590161595, + "grad_norm": 0.9538556933403015, + "learning_rate": 1.0105244381020336e-05, + "loss": 0.3024, + "step": 1133 + }, + { + "epoch": 0.020226162023329646, + "grad_norm": 0.7720417976379395, + "learning_rate": 1.0114163396361042e-05, + "loss": 0.3309, + "step": 1134 + }, + { + "epoch": 0.02024399814504334, + "grad_norm": 0.7839877605438232, + "learning_rate": 1.0123082411701749e-05, + "loss": 0.3195, + "step": 1135 + }, + { + "epoch": 0.020261834266757036, + "grad_norm": 0.7205426096916199, + "learning_rate": 1.0132001427042455e-05, + "loss": 0.3747, + "step": 1136 + }, + { + "epoch": 0.02027967038847073, + "grad_norm": 0.7438391447067261, + "learning_rate": 1.0140920442383161e-05, + "loss": 0.3975, + "step": 1137 + }, + { + "epoch": 0.020297506510184426, + "grad_norm": 0.8830706477165222, + "learning_rate": 1.0149839457723868e-05, + "loss": 0.3599, + "step": 1138 + }, + { + "epoch": 0.02031534263189812, + "grad_norm": 0.7458361387252808, + "learning_rate": 1.0158758473064574e-05, + "loss": 0.3519, + "step": 1139 + }, + { + "epoch": 0.020333178753611816, + "grad_norm": 0.6681880950927734, + "learning_rate": 1.016767748840528e-05, + "loss": 0.3044, + "step": 1140 + }, + { + "epoch": 0.02035101487532551, + "grad_norm": 1.059791922569275, + "learning_rate": 1.0176596503745987e-05, + "loss": 0.3972, + "step": 1141 + }, + { + "epoch": 0.020368850997039206, + "grad_norm": 0.7251273393630981, + "learning_rate": 1.0185515519086693e-05, + "loss": 0.3404, + "step": 1142 + }, + { + "epoch": 0.020386687118752897, + "grad_norm": 1.1100664138793945, + "learning_rate": 1.01944345344274e-05, + "loss": 0.3718, + "step": 1143 + }, + { + "epoch": 0.020404523240466592, + "grad_norm": 0.662216305732727, + "learning_rate": 1.0203353549768106e-05, + "loss": 0.3716, + "step": 1144 + }, + { + "epoch": 0.020422359362180287, + "grad_norm": 0.6917687654495239, + "learning_rate": 1.0212272565108812e-05, + "loss": 0.4127, + "step": 1145 + }, + { + "epoch": 0.02044019548389398, + "grad_norm": 0.7356336712837219, + "learning_rate": 1.0221191580449519e-05, + "loss": 0.3553, + "step": 1146 + }, + { + "epoch": 0.020458031605607677, + "grad_norm": 0.8886292576789856, + "learning_rate": 1.0230110595790225e-05, + "loss": 0.401, + "step": 1147 + }, + { + "epoch": 0.02047586772732137, + "grad_norm": 0.93147873878479, + "learning_rate": 1.0239029611130931e-05, + "loss": 0.3351, + "step": 1148 + }, + { + "epoch": 0.020493703849035066, + "grad_norm": 1.390363097190857, + "learning_rate": 1.0247948626471638e-05, + "loss": 0.3611, + "step": 1149 + }, + { + "epoch": 0.02051153997074876, + "grad_norm": 1.0718594789505005, + "learning_rate": 1.0256867641812344e-05, + "loss": 0.304, + "step": 1150 + }, + { + "epoch": 0.020529376092462456, + "grad_norm": 0.7545117735862732, + "learning_rate": 1.026578665715305e-05, + "loss": 0.3446, + "step": 1151 + }, + { + "epoch": 0.02054721221417615, + "grad_norm": 0.7707802057266235, + "learning_rate": 1.0274705672493757e-05, + "loss": 0.3562, + "step": 1152 + }, + { + "epoch": 0.020565048335889843, + "grad_norm": 0.658502995967865, + "learning_rate": 1.0283624687834463e-05, + "loss": 0.331, + "step": 1153 + }, + { + "epoch": 0.020582884457603538, + "grad_norm": 0.7341593503952026, + "learning_rate": 1.029254370317517e-05, + "loss": 0.2746, + "step": 1154 + }, + { + "epoch": 0.020600720579317233, + "grad_norm": 0.8296139240264893, + "learning_rate": 1.0301462718515876e-05, + "loss": 0.3797, + "step": 1155 + }, + { + "epoch": 0.020618556701030927, + "grad_norm": 1.3040196895599365, + "learning_rate": 1.0310381733856583e-05, + "loss": 0.3781, + "step": 1156 + }, + { + "epoch": 0.020636392822744622, + "grad_norm": 0.8125171661376953, + "learning_rate": 1.0319300749197289e-05, + "loss": 0.3832, + "step": 1157 + }, + { + "epoch": 0.020654228944458317, + "grad_norm": 0.8667579293251038, + "learning_rate": 1.0328219764537995e-05, + "loss": 0.4097, + "step": 1158 + }, + { + "epoch": 0.020672065066172012, + "grad_norm": 0.6190834641456604, + "learning_rate": 1.0337138779878702e-05, + "loss": 0.358, + "step": 1159 + }, + { + "epoch": 0.020689901187885707, + "grad_norm": 1.077289342880249, + "learning_rate": 1.0346057795219408e-05, + "loss": 0.3257, + "step": 1160 + }, + { + "epoch": 0.020707737309599402, + "grad_norm": 0.8260953426361084, + "learning_rate": 1.0354976810560114e-05, + "loss": 0.3942, + "step": 1161 + }, + { + "epoch": 0.020725573431313097, + "grad_norm": 0.8963320851325989, + "learning_rate": 1.036389582590082e-05, + "loss": 0.3753, + "step": 1162 + }, + { + "epoch": 0.02074340955302679, + "grad_norm": 0.7348595261573792, + "learning_rate": 1.0372814841241527e-05, + "loss": 0.3758, + "step": 1163 + }, + { + "epoch": 0.020761245674740483, + "grad_norm": 1.2362557649612427, + "learning_rate": 1.0381733856582234e-05, + "loss": 0.3453, + "step": 1164 + }, + { + "epoch": 0.020779081796454178, + "grad_norm": 0.6578987836837769, + "learning_rate": 1.039065287192294e-05, + "loss": 0.3942, + "step": 1165 + }, + { + "epoch": 0.020796917918167873, + "grad_norm": 0.7647325396537781, + "learning_rate": 1.0399571887263646e-05, + "loss": 0.3809, + "step": 1166 + }, + { + "epoch": 0.020814754039881568, + "grad_norm": 0.7755059003829956, + "learning_rate": 1.0408490902604354e-05, + "loss": 0.3624, + "step": 1167 + }, + { + "epoch": 0.020832590161595263, + "grad_norm": 0.8467023968696594, + "learning_rate": 1.0417409917945059e-05, + "loss": 0.3655, + "step": 1168 + }, + { + "epoch": 0.020850426283308958, + "grad_norm": 1.1363590955734253, + "learning_rate": 1.0426328933285765e-05, + "loss": 0.3233, + "step": 1169 + }, + { + "epoch": 0.020868262405022653, + "grad_norm": 0.7776278853416443, + "learning_rate": 1.0435247948626472e-05, + "loss": 0.3514, + "step": 1170 + }, + { + "epoch": 0.020886098526736348, + "grad_norm": 0.6754946708679199, + "learning_rate": 1.0444166963967178e-05, + "loss": 0.3799, + "step": 1171 + }, + { + "epoch": 0.020903934648450043, + "grad_norm": 0.6129249930381775, + "learning_rate": 1.0453085979307885e-05, + "loss": 0.3272, + "step": 1172 + }, + { + "epoch": 0.020921770770163734, + "grad_norm": 0.5411374568939209, + "learning_rate": 1.0462004994648591e-05, + "loss": 0.3353, + "step": 1173 + }, + { + "epoch": 0.02093960689187743, + "grad_norm": 1.0081175565719604, + "learning_rate": 1.0470924009989299e-05, + "loss": 0.312, + "step": 1174 + }, + { + "epoch": 0.020957443013591124, + "grad_norm": 0.7088974118232727, + "learning_rate": 1.0479843025330004e-05, + "loss": 0.3653, + "step": 1175 + }, + { + "epoch": 0.02097527913530482, + "grad_norm": 0.9669176936149597, + "learning_rate": 1.048876204067071e-05, + "loss": 0.3799, + "step": 1176 + }, + { + "epoch": 0.020993115257018514, + "grad_norm": 0.717627763748169, + "learning_rate": 1.0497681056011417e-05, + "loss": 0.2725, + "step": 1177 + }, + { + "epoch": 0.02101095137873221, + "grad_norm": 0.5851650834083557, + "learning_rate": 1.0506600071352123e-05, + "loss": 0.3497, + "step": 1178 + }, + { + "epoch": 0.021028787500445904, + "grad_norm": 1.4409462213516235, + "learning_rate": 1.051551908669283e-05, + "loss": 0.3455, + "step": 1179 + }, + { + "epoch": 0.0210466236221596, + "grad_norm": 0.6856871843338013, + "learning_rate": 1.0524438102033536e-05, + "loss": 0.355, + "step": 1180 + }, + { + "epoch": 0.021064459743873293, + "grad_norm": 0.7965103983879089, + "learning_rate": 1.0533357117374244e-05, + "loss": 0.403, + "step": 1181 + }, + { + "epoch": 0.02108229586558699, + "grad_norm": 0.845029890537262, + "learning_rate": 1.0542276132714948e-05, + "loss": 0.3826, + "step": 1182 + }, + { + "epoch": 0.02110013198730068, + "grad_norm": 0.6509142518043518, + "learning_rate": 1.0551195148055655e-05, + "loss": 0.3505, + "step": 1183 + }, + { + "epoch": 0.021117968109014375, + "grad_norm": 0.6117712259292603, + "learning_rate": 1.0560114163396361e-05, + "loss": 0.3301, + "step": 1184 + }, + { + "epoch": 0.02113580423072807, + "grad_norm": 0.79875648021698, + "learning_rate": 1.056903317873707e-05, + "loss": 0.3352, + "step": 1185 + }, + { + "epoch": 0.021153640352441765, + "grad_norm": 0.5840588808059692, + "learning_rate": 1.0577952194077774e-05, + "loss": 0.3113, + "step": 1186 + }, + { + "epoch": 0.02117147647415546, + "grad_norm": 0.6373933553695679, + "learning_rate": 1.058687120941848e-05, + "loss": 0.3975, + "step": 1187 + }, + { + "epoch": 0.021189312595869154, + "grad_norm": 0.6922791004180908, + "learning_rate": 1.0595790224759187e-05, + "loss": 0.3619, + "step": 1188 + }, + { + "epoch": 0.02120714871758285, + "grad_norm": 0.9521928429603577, + "learning_rate": 1.0604709240099893e-05, + "loss": 0.3215, + "step": 1189 + }, + { + "epoch": 0.021224984839296544, + "grad_norm": 0.5857126712799072, + "learning_rate": 1.06136282554406e-05, + "loss": 0.3182, + "step": 1190 + }, + { + "epoch": 0.02124282096101024, + "grad_norm": 0.5465163588523865, + "learning_rate": 1.0622547270781306e-05, + "loss": 0.3526, + "step": 1191 + }, + { + "epoch": 0.021260657082723934, + "grad_norm": 0.5520526170730591, + "learning_rate": 1.0631466286122014e-05, + "loss": 0.3466, + "step": 1192 + }, + { + "epoch": 0.021278493204437626, + "grad_norm": 0.7705773711204529, + "learning_rate": 1.0640385301462719e-05, + "loss": 0.3893, + "step": 1193 + }, + { + "epoch": 0.02129632932615132, + "grad_norm": 0.6714303493499756, + "learning_rate": 1.0649304316803425e-05, + "loss": 0.3373, + "step": 1194 + }, + { + "epoch": 0.021314165447865015, + "grad_norm": 0.8076788783073425, + "learning_rate": 1.0658223332144131e-05, + "loss": 0.388, + "step": 1195 + }, + { + "epoch": 0.02133200156957871, + "grad_norm": 0.7793228626251221, + "learning_rate": 1.0667142347484838e-05, + "loss": 0.4123, + "step": 1196 + }, + { + "epoch": 0.021349837691292405, + "grad_norm": 0.9169655442237854, + "learning_rate": 1.0676061362825544e-05, + "loss": 0.3764, + "step": 1197 + }, + { + "epoch": 0.0213676738130061, + "grad_norm": 0.8548396825790405, + "learning_rate": 1.068498037816625e-05, + "loss": 0.3923, + "step": 1198 + }, + { + "epoch": 0.021385509934719795, + "grad_norm": 1.0099925994873047, + "learning_rate": 1.0693899393506959e-05, + "loss": 0.3566, + "step": 1199 + }, + { + "epoch": 0.02140334605643349, + "grad_norm": 0.9970804452896118, + "learning_rate": 1.0702818408847663e-05, + "loss": 0.4287, + "step": 1200 + }, + { + "epoch": 0.021421182178147185, + "grad_norm": 0.8104790449142456, + "learning_rate": 1.071173742418837e-05, + "loss": 0.4214, + "step": 1201 + }, + { + "epoch": 0.02143901829986088, + "grad_norm": 0.6155812740325928, + "learning_rate": 1.0720656439529076e-05, + "loss": 0.3532, + "step": 1202 + }, + { + "epoch": 0.02145685442157457, + "grad_norm": 0.6520779728889465, + "learning_rate": 1.0729575454869784e-05, + "loss": 0.3725, + "step": 1203 + }, + { + "epoch": 0.021474690543288266, + "grad_norm": 0.9459627866744995, + "learning_rate": 1.0738494470210489e-05, + "loss": 0.4364, + "step": 1204 + }, + { + "epoch": 0.02149252666500196, + "grad_norm": 0.9855123162269592, + "learning_rate": 1.0747413485551195e-05, + "loss": 0.3301, + "step": 1205 + }, + { + "epoch": 0.021510362786715656, + "grad_norm": 0.6528222560882568, + "learning_rate": 1.0756332500891903e-05, + "loss": 0.3703, + "step": 1206 + }, + { + "epoch": 0.02152819890842935, + "grad_norm": 0.9400623440742493, + "learning_rate": 1.0765251516232608e-05, + "loss": 0.3596, + "step": 1207 + }, + { + "epoch": 0.021546035030143046, + "grad_norm": 0.5530434846878052, + "learning_rate": 1.0774170531573314e-05, + "loss": 0.3084, + "step": 1208 + }, + { + "epoch": 0.02156387115185674, + "grad_norm": 0.6158286929130554, + "learning_rate": 1.078308954691402e-05, + "loss": 0.4104, + "step": 1209 + }, + { + "epoch": 0.021581707273570436, + "grad_norm": 0.5364154577255249, + "learning_rate": 1.0792008562254729e-05, + "loss": 0.3691, + "step": 1210 + }, + { + "epoch": 0.02159954339528413, + "grad_norm": 0.8397731184959412, + "learning_rate": 1.0800927577595433e-05, + "loss": 0.402, + "step": 1211 + }, + { + "epoch": 0.021617379516997826, + "grad_norm": 1.206375241279602, + "learning_rate": 1.080984659293614e-05, + "loss": 0.3939, + "step": 1212 + }, + { + "epoch": 0.021635215638711517, + "grad_norm": 0.8143543004989624, + "learning_rate": 1.0818765608276848e-05, + "loss": 0.3266, + "step": 1213 + }, + { + "epoch": 0.021653051760425212, + "grad_norm": 0.5477320551872253, + "learning_rate": 1.0827684623617553e-05, + "loss": 0.3103, + "step": 1214 + }, + { + "epoch": 0.021670887882138907, + "grad_norm": 0.7129520177841187, + "learning_rate": 1.0836603638958259e-05, + "loss": 0.4018, + "step": 1215 + }, + { + "epoch": 0.021688724003852602, + "grad_norm": 0.6713840365409851, + "learning_rate": 1.0845522654298965e-05, + "loss": 0.3195, + "step": 1216 + }, + { + "epoch": 0.021706560125566297, + "grad_norm": 0.5806293487548828, + "learning_rate": 1.0854441669639673e-05, + "loss": 0.2921, + "step": 1217 + }, + { + "epoch": 0.02172439624727999, + "grad_norm": 0.6675949692726135, + "learning_rate": 1.0863360684980378e-05, + "loss": 0.3185, + "step": 1218 + }, + { + "epoch": 0.021742232368993687, + "grad_norm": 0.6240116953849792, + "learning_rate": 1.0872279700321084e-05, + "loss": 0.3002, + "step": 1219 + }, + { + "epoch": 0.02176006849070738, + "grad_norm": 0.5665594339370728, + "learning_rate": 1.0881198715661793e-05, + "loss": 0.3386, + "step": 1220 + }, + { + "epoch": 0.021777904612421076, + "grad_norm": 0.6369854807853699, + "learning_rate": 1.0890117731002499e-05, + "loss": 0.2952, + "step": 1221 + }, + { + "epoch": 0.02179574073413477, + "grad_norm": 0.9375613331794739, + "learning_rate": 1.0899036746343204e-05, + "loss": 0.3836, + "step": 1222 + }, + { + "epoch": 0.021813576855848463, + "grad_norm": 0.7689806818962097, + "learning_rate": 1.090795576168391e-05, + "loss": 0.321, + "step": 1223 + }, + { + "epoch": 0.021831412977562158, + "grad_norm": 0.6562024354934692, + "learning_rate": 1.0916874777024618e-05, + "loss": 0.3737, + "step": 1224 + }, + { + "epoch": 0.021849249099275853, + "grad_norm": 0.6183013916015625, + "learning_rate": 1.0925793792365323e-05, + "loss": 0.4069, + "step": 1225 + }, + { + "epoch": 0.021867085220989547, + "grad_norm": 0.5989112257957458, + "learning_rate": 1.0934712807706029e-05, + "loss": 0.3535, + "step": 1226 + }, + { + "epoch": 0.021884921342703242, + "grad_norm": 0.8406489491462708, + "learning_rate": 1.0943631823046736e-05, + "loss": 0.506, + "step": 1227 + }, + { + "epoch": 0.021902757464416937, + "grad_norm": 0.7310986518859863, + "learning_rate": 1.0952550838387444e-05, + "loss": 0.3709, + "step": 1228 + }, + { + "epoch": 0.021920593586130632, + "grad_norm": 0.6726301312446594, + "learning_rate": 1.0961469853728148e-05, + "loss": 0.399, + "step": 1229 + }, + { + "epoch": 0.021938429707844327, + "grad_norm": 1.2989193201065063, + "learning_rate": 1.0970388869068855e-05, + "loss": 0.3506, + "step": 1230 + }, + { + "epoch": 0.021956265829558022, + "grad_norm": 1.3558590412139893, + "learning_rate": 1.0979307884409563e-05, + "loss": 0.3131, + "step": 1231 + }, + { + "epoch": 0.021974101951271717, + "grad_norm": 0.4981108009815216, + "learning_rate": 1.0988226899750267e-05, + "loss": 0.3116, + "step": 1232 + }, + { + "epoch": 0.02199193807298541, + "grad_norm": 0.7580838799476624, + "learning_rate": 1.0997145915090974e-05, + "loss": 0.3459, + "step": 1233 + }, + { + "epoch": 0.022009774194699103, + "grad_norm": 0.8278317451477051, + "learning_rate": 1.100606493043168e-05, + "loss": 0.2989, + "step": 1234 + }, + { + "epoch": 0.0220276103164128, + "grad_norm": 0.7717764973640442, + "learning_rate": 1.1014983945772388e-05, + "loss": 0.3764, + "step": 1235 + }, + { + "epoch": 0.022045446438126493, + "grad_norm": 0.9159849286079407, + "learning_rate": 1.1023902961113093e-05, + "loss": 0.3438, + "step": 1236 + }, + { + "epoch": 0.022063282559840188, + "grad_norm": 1.1090803146362305, + "learning_rate": 1.10328219764538e-05, + "loss": 0.3421, + "step": 1237 + }, + { + "epoch": 0.022081118681553883, + "grad_norm": 0.7285106182098389, + "learning_rate": 1.1041740991794507e-05, + "loss": 0.3698, + "step": 1238 + }, + { + "epoch": 0.022098954803267578, + "grad_norm": 0.6520172953605652, + "learning_rate": 1.1050660007135214e-05, + "loss": 0.3547, + "step": 1239 + }, + { + "epoch": 0.022116790924981273, + "grad_norm": 0.9744372367858887, + "learning_rate": 1.1059579022475918e-05, + "loss": 0.3417, + "step": 1240 + }, + { + "epoch": 0.022134627046694968, + "grad_norm": 0.75611811876297, + "learning_rate": 1.1068498037816625e-05, + "loss": 0.3388, + "step": 1241 + }, + { + "epoch": 0.022152463168408663, + "grad_norm": 1.007924199104309, + "learning_rate": 1.1077417053157333e-05, + "loss": 0.3832, + "step": 1242 + }, + { + "epoch": 0.022170299290122354, + "grad_norm": 0.6002600789070129, + "learning_rate": 1.1086336068498038e-05, + "loss": 0.2941, + "step": 1243 + }, + { + "epoch": 0.02218813541183605, + "grad_norm": 0.7874862551689148, + "learning_rate": 1.1095255083838744e-05, + "loss": 0.389, + "step": 1244 + }, + { + "epoch": 0.022205971533549744, + "grad_norm": 0.8228870630264282, + "learning_rate": 1.1104174099179452e-05, + "loss": 0.3955, + "step": 1245 + }, + { + "epoch": 0.02222380765526344, + "grad_norm": 0.6489484906196594, + "learning_rate": 1.1113093114520158e-05, + "loss": 0.3003, + "step": 1246 + }, + { + "epoch": 0.022241643776977134, + "grad_norm": 0.778408944606781, + "learning_rate": 1.1122012129860863e-05, + "loss": 0.3468, + "step": 1247 + }, + { + "epoch": 0.02225947989869083, + "grad_norm": 0.8412859439849854, + "learning_rate": 1.113093114520157e-05, + "loss": 0.4323, + "step": 1248 + }, + { + "epoch": 0.022277316020404524, + "grad_norm": 0.8153448104858398, + "learning_rate": 1.1139850160542278e-05, + "loss": 0.3486, + "step": 1249 + }, + { + "epoch": 0.02229515214211822, + "grad_norm": 0.7701181769371033, + "learning_rate": 1.1148769175882982e-05, + "loss": 0.3746, + "step": 1250 + }, + { + "epoch": 0.022312988263831914, + "grad_norm": 0.9697129726409912, + "learning_rate": 1.1157688191223689e-05, + "loss": 0.4327, + "step": 1251 + }, + { + "epoch": 0.02233082438554561, + "grad_norm": 0.7470831871032715, + "learning_rate": 1.1166607206564397e-05, + "loss": 0.4245, + "step": 1252 + }, + { + "epoch": 0.0223486605072593, + "grad_norm": 0.9449819326400757, + "learning_rate": 1.1175526221905103e-05, + "loss": 0.3416, + "step": 1253 + }, + { + "epoch": 0.022366496628972995, + "grad_norm": 0.5881229639053345, + "learning_rate": 1.1184445237245808e-05, + "loss": 0.3463, + "step": 1254 + }, + { + "epoch": 0.02238433275068669, + "grad_norm": 0.6636262536048889, + "learning_rate": 1.1193364252586514e-05, + "loss": 0.3682, + "step": 1255 + }, + { + "epoch": 0.022402168872400385, + "grad_norm": 0.7649720311164856, + "learning_rate": 1.1202283267927222e-05, + "loss": 0.3458, + "step": 1256 + }, + { + "epoch": 0.02242000499411408, + "grad_norm": 0.7601820230484009, + "learning_rate": 1.1211202283267929e-05, + "loss": 0.3582, + "step": 1257 + }, + { + "epoch": 0.022437841115827774, + "grad_norm": 0.5499415993690491, + "learning_rate": 1.1220121298608633e-05, + "loss": 0.3632, + "step": 1258 + }, + { + "epoch": 0.02245567723754147, + "grad_norm": 0.6030718088150024, + "learning_rate": 1.122904031394934e-05, + "loss": 0.3435, + "step": 1259 + }, + { + "epoch": 0.022473513359255164, + "grad_norm": 1.0028022527694702, + "learning_rate": 1.1237959329290048e-05, + "loss": 0.362, + "step": 1260 + }, + { + "epoch": 0.02249134948096886, + "grad_norm": 0.9477298259735107, + "learning_rate": 1.1246878344630752e-05, + "loss": 0.3707, + "step": 1261 + }, + { + "epoch": 0.022509185602682554, + "grad_norm": 0.6312440633773804, + "learning_rate": 1.1255797359971459e-05, + "loss": 0.3758, + "step": 1262 + }, + { + "epoch": 0.022527021724396246, + "grad_norm": 0.7580999135971069, + "learning_rate": 1.1264716375312167e-05, + "loss": 0.3863, + "step": 1263 + }, + { + "epoch": 0.02254485784610994, + "grad_norm": 0.6269981265068054, + "learning_rate": 1.1273635390652873e-05, + "loss": 0.3736, + "step": 1264 + }, + { + "epoch": 0.022562693967823635, + "grad_norm": 0.6420585513114929, + "learning_rate": 1.1282554405993578e-05, + "loss": 0.3995, + "step": 1265 + }, + { + "epoch": 0.02258053008953733, + "grad_norm": 0.6503485441207886, + "learning_rate": 1.1291473421334284e-05, + "loss": 0.4484, + "step": 1266 + }, + { + "epoch": 0.022598366211251025, + "grad_norm": 0.9527643918991089, + "learning_rate": 1.1300392436674992e-05, + "loss": 0.3842, + "step": 1267 + }, + { + "epoch": 0.02261620233296472, + "grad_norm": 0.8242455720901489, + "learning_rate": 1.1309311452015699e-05, + "loss": 0.2785, + "step": 1268 + }, + { + "epoch": 0.022634038454678415, + "grad_norm": 0.7459538578987122, + "learning_rate": 1.1318230467356404e-05, + "loss": 0.3611, + "step": 1269 + }, + { + "epoch": 0.02265187457639211, + "grad_norm": 0.7388677597045898, + "learning_rate": 1.1327149482697112e-05, + "loss": 0.3404, + "step": 1270 + }, + { + "epoch": 0.022669710698105805, + "grad_norm": 0.5609098672866821, + "learning_rate": 1.1336068498037818e-05, + "loss": 0.3264, + "step": 1271 + }, + { + "epoch": 0.0226875468198195, + "grad_norm": 0.902957022190094, + "learning_rate": 1.1344987513378523e-05, + "loss": 0.3632, + "step": 1272 + }, + { + "epoch": 0.02270538294153319, + "grad_norm": 0.9123650193214417, + "learning_rate": 1.1353906528719229e-05, + "loss": 0.4122, + "step": 1273 + }, + { + "epoch": 0.022723219063246886, + "grad_norm": 0.6583011150360107, + "learning_rate": 1.1362825544059937e-05, + "loss": 0.3491, + "step": 1274 + }, + { + "epoch": 0.02274105518496058, + "grad_norm": 0.7611053586006165, + "learning_rate": 1.1371744559400643e-05, + "loss": 0.362, + "step": 1275 + }, + { + "epoch": 0.022758891306674276, + "grad_norm": 1.9730926752090454, + "learning_rate": 1.1380663574741348e-05, + "loss": 0.4072, + "step": 1276 + }, + { + "epoch": 0.02277672742838797, + "grad_norm": 0.9904792904853821, + "learning_rate": 1.1389582590082056e-05, + "loss": 0.3232, + "step": 1277 + }, + { + "epoch": 0.022794563550101666, + "grad_norm": 0.9127361178398132, + "learning_rate": 1.1398501605422763e-05, + "loss": 0.4263, + "step": 1278 + }, + { + "epoch": 0.02281239967181536, + "grad_norm": 0.6934426426887512, + "learning_rate": 1.1407420620763467e-05, + "loss": 0.3319, + "step": 1279 + }, + { + "epoch": 0.022830235793529056, + "grad_norm": 0.7358129620552063, + "learning_rate": 1.1416339636104174e-05, + "loss": 0.3492, + "step": 1280 + }, + { + "epoch": 0.02284807191524275, + "grad_norm": 0.7250738739967346, + "learning_rate": 1.1425258651444882e-05, + "loss": 0.3692, + "step": 1281 + }, + { + "epoch": 0.022865908036956446, + "grad_norm": 0.6895067691802979, + "learning_rate": 1.1434177666785588e-05, + "loss": 0.3441, + "step": 1282 + }, + { + "epoch": 0.022883744158670137, + "grad_norm": 0.6137292385101318, + "learning_rate": 1.1443096682126293e-05, + "loss": 0.3553, + "step": 1283 + }, + { + "epoch": 0.022901580280383832, + "grad_norm": 0.629615843296051, + "learning_rate": 1.1452015697467001e-05, + "loss": 0.3379, + "step": 1284 + }, + { + "epoch": 0.022919416402097527, + "grad_norm": 0.8250669836997986, + "learning_rate": 1.1460934712807707e-05, + "loss": 0.3899, + "step": 1285 + }, + { + "epoch": 0.022937252523811222, + "grad_norm": 3.8415939807891846, + "learning_rate": 1.1469853728148414e-05, + "loss": 0.4383, + "step": 1286 + }, + { + "epoch": 0.022955088645524917, + "grad_norm": 0.5856311321258545, + "learning_rate": 1.1478772743489118e-05, + "loss": 0.3239, + "step": 1287 + }, + { + "epoch": 0.02297292476723861, + "grad_norm": 1.1937249898910522, + "learning_rate": 1.1487691758829826e-05, + "loss": 0.3129, + "step": 1288 + }, + { + "epoch": 0.022990760888952307, + "grad_norm": 0.7404820919036865, + "learning_rate": 1.1496610774170533e-05, + "loss": 0.4015, + "step": 1289 + }, + { + "epoch": 0.023008597010666, + "grad_norm": 0.6083435416221619, + "learning_rate": 1.1505529789511237e-05, + "loss": 0.332, + "step": 1290 + }, + { + "epoch": 0.023026433132379696, + "grad_norm": 0.5573935508728027, + "learning_rate": 1.1514448804851944e-05, + "loss": 0.3302, + "step": 1291 + }, + { + "epoch": 0.02304426925409339, + "grad_norm": 0.5558791160583496, + "learning_rate": 1.1523367820192652e-05, + "loss": 0.3388, + "step": 1292 + }, + { + "epoch": 0.023062105375807083, + "grad_norm": 0.6461998224258423, + "learning_rate": 1.1532286835533358e-05, + "loss": 0.3233, + "step": 1293 + }, + { + "epoch": 0.023079941497520778, + "grad_norm": 0.609324038028717, + "learning_rate": 1.1541205850874063e-05, + "loss": 0.3886, + "step": 1294 + }, + { + "epoch": 0.023097777619234473, + "grad_norm": 0.7898502945899963, + "learning_rate": 1.1550124866214771e-05, + "loss": 0.3979, + "step": 1295 + }, + { + "epoch": 0.023115613740948168, + "grad_norm": 0.7442877292633057, + "learning_rate": 1.1559043881555477e-05, + "loss": 0.3901, + "step": 1296 + }, + { + "epoch": 0.023133449862661862, + "grad_norm": 0.5739049315452576, + "learning_rate": 1.1567962896896182e-05, + "loss": 0.3253, + "step": 1297 + }, + { + "epoch": 0.023151285984375557, + "grad_norm": 0.7304110527038574, + "learning_rate": 1.1576881912236889e-05, + "loss": 0.3857, + "step": 1298 + }, + { + "epoch": 0.023169122106089252, + "grad_norm": 0.7546870708465576, + "learning_rate": 1.1585800927577597e-05, + "loss": 0.3216, + "step": 1299 + }, + { + "epoch": 0.023186958227802947, + "grad_norm": 0.6870408654212952, + "learning_rate": 1.1594719942918303e-05, + "loss": 0.4039, + "step": 1300 + }, + { + "epoch": 0.023204794349516642, + "grad_norm": 0.563120424747467, + "learning_rate": 1.1603638958259008e-05, + "loss": 0.3156, + "step": 1301 + }, + { + "epoch": 0.023222630471230337, + "grad_norm": 0.6151665449142456, + "learning_rate": 1.1612557973599716e-05, + "loss": 0.424, + "step": 1302 + }, + { + "epoch": 0.023240466592944032, + "grad_norm": 0.5403041839599609, + "learning_rate": 1.1621476988940422e-05, + "loss": 0.3983, + "step": 1303 + }, + { + "epoch": 0.023258302714657723, + "grad_norm": 1.3480035066604614, + "learning_rate": 1.1630396004281129e-05, + "loss": 0.3477, + "step": 1304 + }, + { + "epoch": 0.02327613883637142, + "grad_norm": 0.7916358113288879, + "learning_rate": 1.1639315019621833e-05, + "loss": 0.3757, + "step": 1305 + }, + { + "epoch": 0.023293974958085113, + "grad_norm": 0.597547709941864, + "learning_rate": 1.1648234034962541e-05, + "loss": 0.3411, + "step": 1306 + }, + { + "epoch": 0.023311811079798808, + "grad_norm": 0.9466458559036255, + "learning_rate": 1.1657153050303248e-05, + "loss": 0.4627, + "step": 1307 + }, + { + "epoch": 0.023329647201512503, + "grad_norm": 0.6916009783744812, + "learning_rate": 1.1666072065643952e-05, + "loss": 0.3854, + "step": 1308 + }, + { + "epoch": 0.023347483323226198, + "grad_norm": 0.743757963180542, + "learning_rate": 1.167499108098466e-05, + "loss": 0.3372, + "step": 1309 + }, + { + "epoch": 0.023365319444939893, + "grad_norm": 0.6970309615135193, + "learning_rate": 1.1683910096325367e-05, + "loss": 0.3078, + "step": 1310 + }, + { + "epoch": 0.023383155566653588, + "grad_norm": 1.208333134651184, + "learning_rate": 1.1692829111666073e-05, + "loss": 0.3811, + "step": 1311 + }, + { + "epoch": 0.023400991688367283, + "grad_norm": 0.7694846987724304, + "learning_rate": 1.1701748127006778e-05, + "loss": 0.3797, + "step": 1312 + }, + { + "epoch": 0.023418827810080978, + "grad_norm": 0.9177335500717163, + "learning_rate": 1.1710667142347486e-05, + "loss": 0.3947, + "step": 1313 + }, + { + "epoch": 0.02343666393179467, + "grad_norm": 0.8459893465042114, + "learning_rate": 1.1719586157688192e-05, + "loss": 0.3446, + "step": 1314 + }, + { + "epoch": 0.023454500053508364, + "grad_norm": 1.0277235507965088, + "learning_rate": 1.1728505173028897e-05, + "loss": 0.4476, + "step": 1315 + }, + { + "epoch": 0.02347233617522206, + "grad_norm": 1.5987002849578857, + "learning_rate": 1.1737424188369605e-05, + "loss": 0.3207, + "step": 1316 + }, + { + "epoch": 0.023490172296935754, + "grad_norm": 0.7976105809211731, + "learning_rate": 1.1746343203710311e-05, + "loss": 0.292, + "step": 1317 + }, + { + "epoch": 0.02350800841864945, + "grad_norm": 0.7589207887649536, + "learning_rate": 1.1755262219051018e-05, + "loss": 0.4273, + "step": 1318 + }, + { + "epoch": 0.023525844540363144, + "grad_norm": 0.9924066066741943, + "learning_rate": 1.1764181234391723e-05, + "loss": 0.3277, + "step": 1319 + }, + { + "epoch": 0.02354368066207684, + "grad_norm": 0.5412936210632324, + "learning_rate": 1.177310024973243e-05, + "loss": 0.3073, + "step": 1320 + }, + { + "epoch": 0.023561516783790534, + "grad_norm": 0.7021755576133728, + "learning_rate": 1.1782019265073137e-05, + "loss": 0.3623, + "step": 1321 + }, + { + "epoch": 0.02357935290550423, + "grad_norm": 0.7117661833763123, + "learning_rate": 1.1790938280413843e-05, + "loss": 0.3996, + "step": 1322 + }, + { + "epoch": 0.023597189027217923, + "grad_norm": 0.9206212162971497, + "learning_rate": 1.179985729575455e-05, + "loss": 0.4311, + "step": 1323 + }, + { + "epoch": 0.023615025148931615, + "grad_norm": 0.5773261785507202, + "learning_rate": 1.1808776311095256e-05, + "loss": 0.3323, + "step": 1324 + }, + { + "epoch": 0.02363286127064531, + "grad_norm": 0.5009505748748779, + "learning_rate": 1.1817695326435962e-05, + "loss": 0.3071, + "step": 1325 + }, + { + "epoch": 0.023650697392359005, + "grad_norm": 1.3784102201461792, + "learning_rate": 1.1826614341776667e-05, + "loss": 0.2955, + "step": 1326 + }, + { + "epoch": 0.0236685335140727, + "grad_norm": 0.7203867435455322, + "learning_rate": 1.1835533357117375e-05, + "loss": 0.3577, + "step": 1327 + }, + { + "epoch": 0.023686369635786395, + "grad_norm": 0.6099973917007446, + "learning_rate": 1.1844452372458082e-05, + "loss": 0.337, + "step": 1328 + }, + { + "epoch": 0.02370420575750009, + "grad_norm": 1.6613069772720337, + "learning_rate": 1.1853371387798788e-05, + "loss": 0.3803, + "step": 1329 + }, + { + "epoch": 0.023722041879213784, + "grad_norm": 0.5442463755607605, + "learning_rate": 1.1862290403139493e-05, + "loss": 0.3242, + "step": 1330 + }, + { + "epoch": 0.02373987800092748, + "grad_norm": 0.6935420632362366, + "learning_rate": 1.18712094184802e-05, + "loss": 0.3526, + "step": 1331 + }, + { + "epoch": 0.023757714122641174, + "grad_norm": 0.8325861692428589, + "learning_rate": 1.1880128433820907e-05, + "loss": 0.2866, + "step": 1332 + }, + { + "epoch": 0.02377555024435487, + "grad_norm": 0.7822994589805603, + "learning_rate": 1.1889047449161612e-05, + "loss": 0.3613, + "step": 1333 + }, + { + "epoch": 0.02379338636606856, + "grad_norm": 0.7527873516082764, + "learning_rate": 1.189796646450232e-05, + "loss": 0.3506, + "step": 1334 + }, + { + "epoch": 0.023811222487782256, + "grad_norm": 0.9543324708938599, + "learning_rate": 1.1906885479843026e-05, + "loss": 0.3824, + "step": 1335 + }, + { + "epoch": 0.02382905860949595, + "grad_norm": 0.5387325286865234, + "learning_rate": 1.1915804495183733e-05, + "loss": 0.343, + "step": 1336 + }, + { + "epoch": 0.023846894731209645, + "grad_norm": 1.0645473003387451, + "learning_rate": 1.1924723510524437e-05, + "loss": 0.3869, + "step": 1337 + }, + { + "epoch": 0.02386473085292334, + "grad_norm": 0.7024256587028503, + "learning_rate": 1.1933642525865145e-05, + "loss": 0.3263, + "step": 1338 + }, + { + "epoch": 0.023882566974637035, + "grad_norm": 0.7108498215675354, + "learning_rate": 1.1942561541205852e-05, + "loss": 0.2696, + "step": 1339 + }, + { + "epoch": 0.02390040309635073, + "grad_norm": 0.8734039664268494, + "learning_rate": 1.1951480556546558e-05, + "loss": 0.4181, + "step": 1340 + }, + { + "epoch": 0.023918239218064425, + "grad_norm": 0.723659873008728, + "learning_rate": 1.1960399571887265e-05, + "loss": 0.3302, + "step": 1341 + }, + { + "epoch": 0.02393607533977812, + "grad_norm": 0.6870841383934021, + "learning_rate": 1.1969318587227971e-05, + "loss": 0.3548, + "step": 1342 + }, + { + "epoch": 0.023953911461491815, + "grad_norm": 1.9990111589431763, + "learning_rate": 1.1978237602568677e-05, + "loss": 0.3612, + "step": 1343 + }, + { + "epoch": 0.023971747583205506, + "grad_norm": 0.785835862159729, + "learning_rate": 1.1987156617909382e-05, + "loss": 0.3895, + "step": 1344 + }, + { + "epoch": 0.0239895837049192, + "grad_norm": 0.902116596698761, + "learning_rate": 1.199607563325009e-05, + "loss": 0.4551, + "step": 1345 + }, + { + "epoch": 0.024007419826632896, + "grad_norm": 0.6511051058769226, + "learning_rate": 1.2004994648590796e-05, + "loss": 0.3417, + "step": 1346 + }, + { + "epoch": 0.02402525594834659, + "grad_norm": 0.6050383448600769, + "learning_rate": 1.2013913663931503e-05, + "loss": 0.2843, + "step": 1347 + }, + { + "epoch": 0.024043092070060286, + "grad_norm": 0.745337188243866, + "learning_rate": 1.202283267927221e-05, + "loss": 0.3152, + "step": 1348 + }, + { + "epoch": 0.02406092819177398, + "grad_norm": 0.7404078841209412, + "learning_rate": 1.2031751694612916e-05, + "loss": 0.3834, + "step": 1349 + }, + { + "epoch": 0.024078764313487676, + "grad_norm": 0.5614080429077148, + "learning_rate": 1.2040670709953622e-05, + "loss": 0.3731, + "step": 1350 + }, + { + "epoch": 0.02409660043520137, + "grad_norm": 0.7362163066864014, + "learning_rate": 1.2049589725294327e-05, + "loss": 0.3349, + "step": 1351 + }, + { + "epoch": 0.024114436556915066, + "grad_norm": 0.8937839865684509, + "learning_rate": 1.2058508740635035e-05, + "loss": 0.3636, + "step": 1352 + }, + { + "epoch": 0.02413227267862876, + "grad_norm": 1.1626262664794922, + "learning_rate": 1.2067427755975741e-05, + "loss": 0.4247, + "step": 1353 + }, + { + "epoch": 0.024150108800342452, + "grad_norm": 0.6161826848983765, + "learning_rate": 1.2076346771316448e-05, + "loss": 0.3348, + "step": 1354 + }, + { + "epoch": 0.024167944922056147, + "grad_norm": 0.6281194090843201, + "learning_rate": 1.2085265786657154e-05, + "loss": 0.2897, + "step": 1355 + }, + { + "epoch": 0.024185781043769842, + "grad_norm": 0.8214020729064941, + "learning_rate": 1.209418480199786e-05, + "loss": 0.3801, + "step": 1356 + }, + { + "epoch": 0.024203617165483537, + "grad_norm": 0.854334831237793, + "learning_rate": 1.2103103817338567e-05, + "loss": 0.5048, + "step": 1357 + }, + { + "epoch": 0.02422145328719723, + "grad_norm": 0.675391435623169, + "learning_rate": 1.2112022832679273e-05, + "loss": 0.2958, + "step": 1358 + }, + { + "epoch": 0.024239289408910927, + "grad_norm": 0.5455700159072876, + "learning_rate": 1.212094184801998e-05, + "loss": 0.299, + "step": 1359 + }, + { + "epoch": 0.02425712553062462, + "grad_norm": 0.55274498462677, + "learning_rate": 1.2129860863360686e-05, + "loss": 0.2746, + "step": 1360 + }, + { + "epoch": 0.024274961652338316, + "grad_norm": 0.6536729335784912, + "learning_rate": 1.2138779878701392e-05, + "loss": 0.345, + "step": 1361 + }, + { + "epoch": 0.02429279777405201, + "grad_norm": 0.647176206111908, + "learning_rate": 1.2147698894042097e-05, + "loss": 0.3799, + "step": 1362 + }, + { + "epoch": 0.024310633895765706, + "grad_norm": 0.6187819242477417, + "learning_rate": 1.2156617909382805e-05, + "loss": 0.3152, + "step": 1363 + }, + { + "epoch": 0.024328470017479398, + "grad_norm": 0.835293710231781, + "learning_rate": 1.2165536924723511e-05, + "loss": 0.3179, + "step": 1364 + }, + { + "epoch": 0.024346306139193093, + "grad_norm": 0.6720101833343506, + "learning_rate": 1.2174455940064218e-05, + "loss": 0.3186, + "step": 1365 + }, + { + "epoch": 0.024364142260906788, + "grad_norm": 0.5965019464492798, + "learning_rate": 1.2183374955404924e-05, + "loss": 0.3023, + "step": 1366 + }, + { + "epoch": 0.024381978382620483, + "grad_norm": 0.6113612055778503, + "learning_rate": 1.219229397074563e-05, + "loss": 0.2907, + "step": 1367 + }, + { + "epoch": 0.024399814504334177, + "grad_norm": 0.5294867753982544, + "learning_rate": 1.2201212986086337e-05, + "loss": 0.2751, + "step": 1368 + }, + { + "epoch": 0.024417650626047872, + "grad_norm": 0.9566969275474548, + "learning_rate": 1.2210132001427043e-05, + "loss": 0.3855, + "step": 1369 + }, + { + "epoch": 0.024435486747761567, + "grad_norm": 0.9819380044937134, + "learning_rate": 1.221905101676775e-05, + "loss": 0.3529, + "step": 1370 + }, + { + "epoch": 0.024453322869475262, + "grad_norm": 0.591349720954895, + "learning_rate": 1.2227970032108456e-05, + "loss": 0.3364, + "step": 1371 + }, + { + "epoch": 0.024471158991188957, + "grad_norm": 1.2572211027145386, + "learning_rate": 1.2236889047449162e-05, + "loss": 0.4125, + "step": 1372 + }, + { + "epoch": 0.024488995112902652, + "grad_norm": 1.1575344800949097, + "learning_rate": 1.2245808062789869e-05, + "loss": 0.4375, + "step": 1373 + }, + { + "epoch": 0.024506831234616343, + "grad_norm": 0.6487801671028137, + "learning_rate": 1.2254727078130575e-05, + "loss": 0.3314, + "step": 1374 + }, + { + "epoch": 0.02452466735633004, + "grad_norm": 0.6622575521469116, + "learning_rate": 1.2263646093471282e-05, + "loss": 0.2818, + "step": 1375 + }, + { + "epoch": 0.024542503478043733, + "grad_norm": 0.6123865842819214, + "learning_rate": 1.2272565108811988e-05, + "loss": 0.3141, + "step": 1376 + }, + { + "epoch": 0.024560339599757428, + "grad_norm": 0.641710102558136, + "learning_rate": 1.2281484124152694e-05, + "loss": 0.3636, + "step": 1377 + }, + { + "epoch": 0.024578175721471123, + "grad_norm": 0.8112438917160034, + "learning_rate": 1.22904031394934e-05, + "loss": 0.3388, + "step": 1378 + }, + { + "epoch": 0.024596011843184818, + "grad_norm": 0.7893108129501343, + "learning_rate": 1.2299322154834107e-05, + "loss": 0.4094, + "step": 1379 + }, + { + "epoch": 0.024613847964898513, + "grad_norm": 1.0157471895217896, + "learning_rate": 1.2308241170174813e-05, + "loss": 0.328, + "step": 1380 + }, + { + "epoch": 0.024631684086612208, + "grad_norm": 0.933978259563446, + "learning_rate": 1.231716018551552e-05, + "loss": 0.3566, + "step": 1381 + }, + { + "epoch": 0.024649520208325903, + "grad_norm": 0.6377274394035339, + "learning_rate": 1.2326079200856226e-05, + "loss": 0.3758, + "step": 1382 + }, + { + "epoch": 0.024667356330039598, + "grad_norm": 0.6422837376594543, + "learning_rate": 1.2334998216196933e-05, + "loss": 0.3149, + "step": 1383 + }, + { + "epoch": 0.02468519245175329, + "grad_norm": 1.0213490724563599, + "learning_rate": 1.2343917231537639e-05, + "loss": 0.3905, + "step": 1384 + }, + { + "epoch": 0.024703028573466984, + "grad_norm": 0.532490074634552, + "learning_rate": 1.2352836246878345e-05, + "loss": 0.2899, + "step": 1385 + }, + { + "epoch": 0.02472086469518068, + "grad_norm": 0.4421103596687317, + "learning_rate": 1.2361755262219052e-05, + "loss": 0.2889, + "step": 1386 + }, + { + "epoch": 0.024738700816894374, + "grad_norm": 0.5894536375999451, + "learning_rate": 1.2370674277559758e-05, + "loss": 0.2592, + "step": 1387 + }, + { + "epoch": 0.02475653693860807, + "grad_norm": 1.573372721672058, + "learning_rate": 1.2379593292900464e-05, + "loss": 0.3207, + "step": 1388 + }, + { + "epoch": 0.024774373060321764, + "grad_norm": 0.7059410214424133, + "learning_rate": 1.238851230824117e-05, + "loss": 0.3469, + "step": 1389 + }, + { + "epoch": 0.02479220918203546, + "grad_norm": 0.6540749669075012, + "learning_rate": 1.2397431323581877e-05, + "loss": 0.3465, + "step": 1390 + }, + { + "epoch": 0.024810045303749154, + "grad_norm": 0.8303396105766296, + "learning_rate": 1.2406350338922584e-05, + "loss": 0.4224, + "step": 1391 + }, + { + "epoch": 0.02482788142546285, + "grad_norm": 0.4805128276348114, + "learning_rate": 1.241526935426329e-05, + "loss": 0.2973, + "step": 1392 + }, + { + "epoch": 0.024845717547176543, + "grad_norm": 0.583146870136261, + "learning_rate": 1.2424188369603996e-05, + "loss": 0.3722, + "step": 1393 + }, + { + "epoch": 0.024863553668890235, + "grad_norm": 0.7658040523529053, + "learning_rate": 1.2433107384944703e-05, + "loss": 0.3301, + "step": 1394 + }, + { + "epoch": 0.02488138979060393, + "grad_norm": 0.8137679696083069, + "learning_rate": 1.2442026400285409e-05, + "loss": 0.3548, + "step": 1395 + }, + { + "epoch": 0.024899225912317625, + "grad_norm": 0.6280669569969177, + "learning_rate": 1.2450945415626116e-05, + "loss": 0.3361, + "step": 1396 + }, + { + "epoch": 0.02491706203403132, + "grad_norm": 0.7460063695907593, + "learning_rate": 1.2459864430966822e-05, + "loss": 0.344, + "step": 1397 + }, + { + "epoch": 0.024934898155745015, + "grad_norm": 0.5598887801170349, + "learning_rate": 1.2468783446307528e-05, + "loss": 0.2878, + "step": 1398 + }, + { + "epoch": 0.02495273427745871, + "grad_norm": 0.6991429328918457, + "learning_rate": 1.2477702461648235e-05, + "loss": 0.3496, + "step": 1399 + }, + { + "epoch": 0.024970570399172404, + "grad_norm": 0.7219420075416565, + "learning_rate": 1.2486621476988941e-05, + "loss": 0.3826, + "step": 1400 + }, + { + "epoch": 0.0249884065208861, + "grad_norm": 0.698403537273407, + "learning_rate": 1.2495540492329647e-05, + "loss": 0.349, + "step": 1401 + }, + { + "epoch": 0.025006242642599794, + "grad_norm": 0.622410774230957, + "learning_rate": 1.2504459507670355e-05, + "loss": 0.3996, + "step": 1402 + }, + { + "epoch": 0.02502407876431349, + "grad_norm": 0.5343538522720337, + "learning_rate": 1.251337852301106e-05, + "loss": 0.3327, + "step": 1403 + }, + { + "epoch": 0.02504191488602718, + "grad_norm": 0.6778825521469116, + "learning_rate": 1.2522297538351765e-05, + "loss": 0.3212, + "step": 1404 + }, + { + "epoch": 0.025059751007740876, + "grad_norm": 0.8760435581207275, + "learning_rate": 1.2531216553692473e-05, + "loss": 0.4228, + "step": 1405 + }, + { + "epoch": 0.02507758712945457, + "grad_norm": 0.5947842001914978, + "learning_rate": 1.254013556903318e-05, + "loss": 0.3459, + "step": 1406 + }, + { + "epoch": 0.025095423251168265, + "grad_norm": 0.7000359892845154, + "learning_rate": 1.2549054584373884e-05, + "loss": 0.3194, + "step": 1407 + }, + { + "epoch": 0.02511325937288196, + "grad_norm": 1.1343384981155396, + "learning_rate": 1.2557973599714592e-05, + "loss": 0.3618, + "step": 1408 + }, + { + "epoch": 0.025131095494595655, + "grad_norm": 0.6769985556602478, + "learning_rate": 1.2566892615055298e-05, + "loss": 0.3123, + "step": 1409 + }, + { + "epoch": 0.02514893161630935, + "grad_norm": 0.614509105682373, + "learning_rate": 1.2575811630396007e-05, + "loss": 0.3353, + "step": 1410 + }, + { + "epoch": 0.025166767738023045, + "grad_norm": 0.468760222196579, + "learning_rate": 1.2584730645736711e-05, + "loss": 0.3096, + "step": 1411 + }, + { + "epoch": 0.02518460385973674, + "grad_norm": 0.7535741329193115, + "learning_rate": 1.2593649661077418e-05, + "loss": 0.3808, + "step": 1412 + }, + { + "epoch": 0.025202439981450435, + "grad_norm": 0.9200375080108643, + "learning_rate": 1.2602568676418126e-05, + "loss": 0.3471, + "step": 1413 + }, + { + "epoch": 0.025220276103164126, + "grad_norm": 0.714401125907898, + "learning_rate": 1.261148769175883e-05, + "loss": 0.3231, + "step": 1414 + }, + { + "epoch": 0.02523811222487782, + "grad_norm": 0.5711655616760254, + "learning_rate": 1.2620406707099535e-05, + "loss": 0.3341, + "step": 1415 + }, + { + "epoch": 0.025255948346591516, + "grad_norm": 0.5743415355682373, + "learning_rate": 1.2629325722440243e-05, + "loss": 0.2701, + "step": 1416 + }, + { + "epoch": 0.02527378446830521, + "grad_norm": 0.787744402885437, + "learning_rate": 1.263824473778095e-05, + "loss": 0.323, + "step": 1417 + }, + { + "epoch": 0.025291620590018906, + "grad_norm": 0.6809655427932739, + "learning_rate": 1.2647163753121654e-05, + "loss": 0.417, + "step": 1418 + }, + { + "epoch": 0.0253094567117326, + "grad_norm": 0.5578548908233643, + "learning_rate": 1.2656082768462362e-05, + "loss": 0.3488, + "step": 1419 + }, + { + "epoch": 0.025327292833446296, + "grad_norm": 0.617712676525116, + "learning_rate": 1.2665001783803069e-05, + "loss": 0.3481, + "step": 1420 + }, + { + "epoch": 0.02534512895515999, + "grad_norm": 0.6966971158981323, + "learning_rate": 1.2673920799143777e-05, + "loss": 0.3653, + "step": 1421 + }, + { + "epoch": 0.025362965076873686, + "grad_norm": 0.5483299493789673, + "learning_rate": 1.2682839814484481e-05, + "loss": 0.3565, + "step": 1422 + }, + { + "epoch": 0.02538080119858738, + "grad_norm": 0.5309726595878601, + "learning_rate": 1.2691758829825188e-05, + "loss": 0.3281, + "step": 1423 + }, + { + "epoch": 0.025398637320301072, + "grad_norm": 0.727033257484436, + "learning_rate": 1.2700677845165896e-05, + "loss": 0.2909, + "step": 1424 + }, + { + "epoch": 0.025416473442014767, + "grad_norm": 0.5525379180908203, + "learning_rate": 1.27095968605066e-05, + "loss": 0.3079, + "step": 1425 + }, + { + "epoch": 0.025434309563728462, + "grad_norm": 0.6853298544883728, + "learning_rate": 1.2718515875847307e-05, + "loss": 0.4044, + "step": 1426 + }, + { + "epoch": 0.025452145685442157, + "grad_norm": 0.7836421728134155, + "learning_rate": 1.2727434891188015e-05, + "loss": 0.3994, + "step": 1427 + }, + { + "epoch": 0.025469981807155852, + "grad_norm": 0.736638069152832, + "learning_rate": 1.273635390652872e-05, + "loss": 0.3416, + "step": 1428 + }, + { + "epoch": 0.025487817928869547, + "grad_norm": 0.4796784818172455, + "learning_rate": 1.2745272921869424e-05, + "loss": 0.3432, + "step": 1429 + }, + { + "epoch": 0.02550565405058324, + "grad_norm": 0.4791879951953888, + "learning_rate": 1.2754191937210132e-05, + "loss": 0.3685, + "step": 1430 + }, + { + "epoch": 0.025523490172296937, + "grad_norm": 0.5028378367424011, + "learning_rate": 1.2763110952550839e-05, + "loss": 0.3364, + "step": 1431 + }, + { + "epoch": 0.02554132629401063, + "grad_norm": 0.6632663607597351, + "learning_rate": 1.2772029967891547e-05, + "loss": 0.3623, + "step": 1432 + }, + { + "epoch": 0.025559162415724326, + "grad_norm": 0.6015079021453857, + "learning_rate": 1.2780948983232252e-05, + "loss": 0.3378, + "step": 1433 + }, + { + "epoch": 0.025576998537438018, + "grad_norm": 0.7614780068397522, + "learning_rate": 1.2789867998572958e-05, + "loss": 0.3269, + "step": 1434 + }, + { + "epoch": 0.025594834659151713, + "grad_norm": 0.5741811990737915, + "learning_rate": 1.2798787013913666e-05, + "loss": 0.3403, + "step": 1435 + }, + { + "epoch": 0.025612670780865408, + "grad_norm": 0.6418488621711731, + "learning_rate": 1.280770602925437e-05, + "loss": 0.3904, + "step": 1436 + }, + { + "epoch": 0.025630506902579103, + "grad_norm": 0.6873551607131958, + "learning_rate": 1.2816625044595077e-05, + "loss": 0.2914, + "step": 1437 + }, + { + "epoch": 0.025648343024292797, + "grad_norm": 0.9841389060020447, + "learning_rate": 1.2825544059935785e-05, + "loss": 0.3743, + "step": 1438 + }, + { + "epoch": 0.025666179146006492, + "grad_norm": 0.5809444189071655, + "learning_rate": 1.283446307527649e-05, + "loss": 0.3552, + "step": 1439 + }, + { + "epoch": 0.025684015267720187, + "grad_norm": 1.1492770910263062, + "learning_rate": 1.2843382090617195e-05, + "loss": 0.3157, + "step": 1440 + }, + { + "epoch": 0.025701851389433882, + "grad_norm": 0.6113795042037964, + "learning_rate": 1.2852301105957904e-05, + "loss": 0.3476, + "step": 1441 + }, + { + "epoch": 0.025719687511147577, + "grad_norm": 0.6590428948402405, + "learning_rate": 1.2861220121298609e-05, + "loss": 0.4073, + "step": 1442 + }, + { + "epoch": 0.025737523632861272, + "grad_norm": 0.4011369049549103, + "learning_rate": 1.2870139136639314e-05, + "loss": 0.3252, + "step": 1443 + }, + { + "epoch": 0.025755359754574964, + "grad_norm": 0.49081823229789734, + "learning_rate": 1.2879058151980022e-05, + "loss": 0.3137, + "step": 1444 + }, + { + "epoch": 0.02577319587628866, + "grad_norm": 0.4497537910938263, + "learning_rate": 1.2887977167320728e-05, + "loss": 0.3052, + "step": 1445 + }, + { + "epoch": 0.025791031998002353, + "grad_norm": 0.6175629496574402, + "learning_rate": 1.2896896182661436e-05, + "loss": 0.343, + "step": 1446 + }, + { + "epoch": 0.02580886811971605, + "grad_norm": 3.055866241455078, + "learning_rate": 1.2905815198002141e-05, + "loss": 0.3096, + "step": 1447 + }, + { + "epoch": 0.025826704241429743, + "grad_norm": 0.5497235655784607, + "learning_rate": 1.2914734213342847e-05, + "loss": 0.3371, + "step": 1448 + }, + { + "epoch": 0.025844540363143438, + "grad_norm": 0.469852477312088, + "learning_rate": 1.2923653228683555e-05, + "loss": 0.3138, + "step": 1449 + }, + { + "epoch": 0.025862376484857133, + "grad_norm": 0.9721972942352295, + "learning_rate": 1.293257224402426e-05, + "loss": 0.3592, + "step": 1450 + }, + { + "epoch": 0.025880212606570828, + "grad_norm": 0.46501076221466064, + "learning_rate": 1.2941491259364966e-05, + "loss": 0.2978, + "step": 1451 + }, + { + "epoch": 0.025898048728284523, + "grad_norm": 0.5042346119880676, + "learning_rate": 1.2950410274705675e-05, + "loss": 0.3551, + "step": 1452 + }, + { + "epoch": 0.025915884849998218, + "grad_norm": 0.5198994874954224, + "learning_rate": 1.295932929004638e-05, + "loss": 0.3364, + "step": 1453 + }, + { + "epoch": 0.02593372097171191, + "grad_norm": 0.624998927116394, + "learning_rate": 1.2968248305387084e-05, + "loss": 0.356, + "step": 1454 + }, + { + "epoch": 0.025951557093425604, + "grad_norm": 0.6525312662124634, + "learning_rate": 1.2977167320727792e-05, + "loss": 0.3148, + "step": 1455 + }, + { + "epoch": 0.0259693932151393, + "grad_norm": 0.5993296504020691, + "learning_rate": 1.2986086336068498e-05, + "loss": 0.3396, + "step": 1456 + }, + { + "epoch": 0.025987229336852994, + "grad_norm": 0.47724413871765137, + "learning_rate": 1.2995005351409206e-05, + "loss": 0.3155, + "step": 1457 + }, + { + "epoch": 0.02600506545856669, + "grad_norm": 0.5700188279151917, + "learning_rate": 1.3003924366749911e-05, + "loss": 0.353, + "step": 1458 + }, + { + "epoch": 0.026022901580280384, + "grad_norm": 1.5542047023773193, + "learning_rate": 1.3012843382090617e-05, + "loss": 0.3417, + "step": 1459 + }, + { + "epoch": 0.02604073770199408, + "grad_norm": 0.5037944912910461, + "learning_rate": 1.3021762397431326e-05, + "loss": 0.321, + "step": 1460 + }, + { + "epoch": 0.026058573823707774, + "grad_norm": 0.5542275309562683, + "learning_rate": 1.303068141277203e-05, + "loss": 0.3451, + "step": 1461 + }, + { + "epoch": 0.02607640994542147, + "grad_norm": 0.5835364460945129, + "learning_rate": 1.3039600428112737e-05, + "loss": 0.3028, + "step": 1462 + }, + { + "epoch": 0.026094246067135164, + "grad_norm": 0.6984533667564392, + "learning_rate": 1.3048519443453445e-05, + "loss": 0.3474, + "step": 1463 + }, + { + "epoch": 0.02611208218884886, + "grad_norm": 0.6229071021080017, + "learning_rate": 1.305743845879415e-05, + "loss": 0.3347, + "step": 1464 + }, + { + "epoch": 0.02612991831056255, + "grad_norm": 0.9868772625923157, + "learning_rate": 1.3066357474134854e-05, + "loss": 0.3256, + "step": 1465 + }, + { + "epoch": 0.026147754432276245, + "grad_norm": 0.6247256398200989, + "learning_rate": 1.3075276489475564e-05, + "loss": 0.3471, + "step": 1466 + }, + { + "epoch": 0.02616559055398994, + "grad_norm": 0.63973468542099, + "learning_rate": 1.3084195504816269e-05, + "loss": 0.3835, + "step": 1467 + }, + { + "epoch": 0.026183426675703635, + "grad_norm": 0.6585602760314941, + "learning_rate": 1.3093114520156977e-05, + "loss": 0.3757, + "step": 1468 + }, + { + "epoch": 0.02620126279741733, + "grad_norm": 0.6673021912574768, + "learning_rate": 1.3102033535497681e-05, + "loss": 0.3514, + "step": 1469 + }, + { + "epoch": 0.026219098919131024, + "grad_norm": 0.6367130875587463, + "learning_rate": 1.3110952550838388e-05, + "loss": 0.3627, + "step": 1470 + }, + { + "epoch": 0.02623693504084472, + "grad_norm": 7.9414753913879395, + "learning_rate": 1.3119871566179096e-05, + "loss": 0.3573, + "step": 1471 + }, + { + "epoch": 0.026254771162558414, + "grad_norm": 0.48212894797325134, + "learning_rate": 1.31287905815198e-05, + "loss": 0.311, + "step": 1472 + }, + { + "epoch": 0.02627260728427211, + "grad_norm": 0.7470137476921082, + "learning_rate": 1.3137709596860507e-05, + "loss": 0.3415, + "step": 1473 + }, + { + "epoch": 0.026290443405985804, + "grad_norm": 0.6094146966934204, + "learning_rate": 1.3146628612201215e-05, + "loss": 0.3442, + "step": 1474 + }, + { + "epoch": 0.026308279527699496, + "grad_norm": 0.5558764338493347, + "learning_rate": 1.315554762754192e-05, + "loss": 0.3039, + "step": 1475 + }, + { + "epoch": 0.02632611564941319, + "grad_norm": 0.6077886819839478, + "learning_rate": 1.3164466642882626e-05, + "loss": 0.3539, + "step": 1476 + }, + { + "epoch": 0.026343951771126885, + "grad_norm": 0.5881067514419556, + "learning_rate": 1.3173385658223334e-05, + "loss": 0.3076, + "step": 1477 + }, + { + "epoch": 0.02636178789284058, + "grad_norm": 0.7823331356048584, + "learning_rate": 1.3182304673564039e-05, + "loss": 0.4019, + "step": 1478 + }, + { + "epoch": 0.026379624014554275, + "grad_norm": 0.6178668737411499, + "learning_rate": 1.3191223688904747e-05, + "loss": 0.2783, + "step": 1479 + }, + { + "epoch": 0.02639746013626797, + "grad_norm": 0.9432263374328613, + "learning_rate": 1.3200142704245451e-05, + "loss": 0.3398, + "step": 1480 + }, + { + "epoch": 0.026415296257981665, + "grad_norm": 0.49985745549201965, + "learning_rate": 1.3209061719586158e-05, + "loss": 0.3239, + "step": 1481 + }, + { + "epoch": 0.02643313237969536, + "grad_norm": 0.560070276260376, + "learning_rate": 1.3217980734926866e-05, + "loss": 0.3822, + "step": 1482 + }, + { + "epoch": 0.026450968501409055, + "grad_norm": 0.635017454624176, + "learning_rate": 1.322689975026757e-05, + "loss": 0.3437, + "step": 1483 + }, + { + "epoch": 0.02646880462312275, + "grad_norm": 0.5243181586265564, + "learning_rate": 1.3235818765608277e-05, + "loss": 0.3217, + "step": 1484 + }, + { + "epoch": 0.02648664074483644, + "grad_norm": 0.5804629325866699, + "learning_rate": 1.3244737780948985e-05, + "loss": 0.348, + "step": 1485 + }, + { + "epoch": 0.026504476866550136, + "grad_norm": 0.6111013889312744, + "learning_rate": 1.325365679628969e-05, + "loss": 0.359, + "step": 1486 + }, + { + "epoch": 0.02652231298826383, + "grad_norm": 0.8193077445030212, + "learning_rate": 1.3262575811630396e-05, + "loss": 0.3612, + "step": 1487 + }, + { + "epoch": 0.026540149109977526, + "grad_norm": 0.6436665654182434, + "learning_rate": 1.3271494826971104e-05, + "loss": 0.3515, + "step": 1488 + }, + { + "epoch": 0.02655798523169122, + "grad_norm": 0.48903459310531616, + "learning_rate": 1.3280413842311809e-05, + "loss": 0.3013, + "step": 1489 + }, + { + "epoch": 0.026575821353404916, + "grad_norm": 1.1819206476211548, + "learning_rate": 1.3289332857652515e-05, + "loss": 0.3706, + "step": 1490 + }, + { + "epoch": 0.02659365747511861, + "grad_norm": 0.5459044575691223, + "learning_rate": 1.3298251872993223e-05, + "loss": 0.3337, + "step": 1491 + }, + { + "epoch": 0.026611493596832306, + "grad_norm": 0.6691137552261353, + "learning_rate": 1.3307170888333928e-05, + "loss": 0.3001, + "step": 1492 + }, + { + "epoch": 0.026629329718546, + "grad_norm": 0.7296347618103027, + "learning_rate": 1.3316089903674636e-05, + "loss": 0.3423, + "step": 1493 + }, + { + "epoch": 0.026647165840259696, + "grad_norm": 0.6832718849182129, + "learning_rate": 1.332500891901534e-05, + "loss": 0.3492, + "step": 1494 + }, + { + "epoch": 0.026665001961973387, + "grad_norm": 0.5545360445976257, + "learning_rate": 1.3333927934356047e-05, + "loss": 0.2849, + "step": 1495 + }, + { + "epoch": 0.026682838083687082, + "grad_norm": 0.588440477848053, + "learning_rate": 1.3342846949696755e-05, + "loss": 0.3028, + "step": 1496 + }, + { + "epoch": 0.026700674205400777, + "grad_norm": 0.7887428998947144, + "learning_rate": 1.335176596503746e-05, + "loss": 0.343, + "step": 1497 + }, + { + "epoch": 0.026718510327114472, + "grad_norm": 0.49277839064598083, + "learning_rate": 1.3360684980378166e-05, + "loss": 0.3225, + "step": 1498 + }, + { + "epoch": 0.026736346448828167, + "grad_norm": 0.6437626481056213, + "learning_rate": 1.3369603995718874e-05, + "loss": 0.2881, + "step": 1499 + }, + { + "epoch": 0.02675418257054186, + "grad_norm": 0.5196444392204285, + "learning_rate": 1.3378523011059579e-05, + "loss": 0.3201, + "step": 1500 + }, + { + "epoch": 0.026772018692255557, + "grad_norm": 0.5267537236213684, + "learning_rate": 1.3387442026400285e-05, + "loss": 0.3445, + "step": 1501 + }, + { + "epoch": 0.02678985481396925, + "grad_norm": 0.621799647808075, + "learning_rate": 1.3396361041740994e-05, + "loss": 0.3505, + "step": 1502 + }, + { + "epoch": 0.026807690935682946, + "grad_norm": 0.7187339067459106, + "learning_rate": 1.3405280057081698e-05, + "loss": 0.3729, + "step": 1503 + }, + { + "epoch": 0.02682552705739664, + "grad_norm": 1.050341248512268, + "learning_rate": 1.3414199072422406e-05, + "loss": 0.3127, + "step": 1504 + }, + { + "epoch": 0.026843363179110333, + "grad_norm": 1.0283368825912476, + "learning_rate": 1.3423118087763113e-05, + "loss": 0.2707, + "step": 1505 + }, + { + "epoch": 0.026861199300824028, + "grad_norm": 0.6192741394042969, + "learning_rate": 1.3432037103103817e-05, + "loss": 0.3436, + "step": 1506 + }, + { + "epoch": 0.026879035422537723, + "grad_norm": 0.6149123311042786, + "learning_rate": 1.3440956118444525e-05, + "loss": 0.3437, + "step": 1507 + }, + { + "epoch": 0.026896871544251418, + "grad_norm": 0.5970794558525085, + "learning_rate": 1.344987513378523e-05, + "loss": 0.3129, + "step": 1508 + }, + { + "epoch": 0.026914707665965112, + "grad_norm": 0.8853777050971985, + "learning_rate": 1.3458794149125936e-05, + "loss": 0.3137, + "step": 1509 + }, + { + "epoch": 0.026932543787678807, + "grad_norm": 0.4688767194747925, + "learning_rate": 1.3467713164466645e-05, + "loss": 0.3101, + "step": 1510 + }, + { + "epoch": 0.026950379909392502, + "grad_norm": 0.7186697125434875, + "learning_rate": 1.347663217980735e-05, + "loss": 0.3505, + "step": 1511 + }, + { + "epoch": 0.026968216031106197, + "grad_norm": 0.6535070538520813, + "learning_rate": 1.3485551195148056e-05, + "loss": 0.3267, + "step": 1512 + }, + { + "epoch": 0.026986052152819892, + "grad_norm": 0.5069955587387085, + "learning_rate": 1.3494470210488764e-05, + "loss": 0.3396, + "step": 1513 + }, + { + "epoch": 0.027003888274533587, + "grad_norm": 0.5632001161575317, + "learning_rate": 1.3503389225829468e-05, + "loss": 0.3781, + "step": 1514 + }, + { + "epoch": 0.02702172439624728, + "grad_norm": 0.5032448172569275, + "learning_rate": 1.3512308241170176e-05, + "loss": 0.3771, + "step": 1515 + }, + { + "epoch": 0.027039560517960973, + "grad_norm": 0.622677206993103, + "learning_rate": 1.3521227256510883e-05, + "loss": 0.335, + "step": 1516 + }, + { + "epoch": 0.02705739663967467, + "grad_norm": 0.5769949555397034, + "learning_rate": 1.3530146271851588e-05, + "loss": 0.2998, + "step": 1517 + }, + { + "epoch": 0.027075232761388363, + "grad_norm": 0.6577006578445435, + "learning_rate": 1.3539065287192296e-05, + "loss": 0.3837, + "step": 1518 + }, + { + "epoch": 0.027093068883102058, + "grad_norm": 0.6344306468963623, + "learning_rate": 1.3547984302533e-05, + "loss": 0.3558, + "step": 1519 + }, + { + "epoch": 0.027110905004815753, + "grad_norm": 0.5616790056228638, + "learning_rate": 1.3556903317873707e-05, + "loss": 0.3214, + "step": 1520 + }, + { + "epoch": 0.027128741126529448, + "grad_norm": 0.5310586094856262, + "learning_rate": 1.3565822333214415e-05, + "loss": 0.3476, + "step": 1521 + }, + { + "epoch": 0.027146577248243143, + "grad_norm": 0.6532830595970154, + "learning_rate": 1.357474134855512e-05, + "loss": 0.4263, + "step": 1522 + }, + { + "epoch": 0.027164413369956838, + "grad_norm": 0.51121586561203, + "learning_rate": 1.3583660363895826e-05, + "loss": 0.3389, + "step": 1523 + }, + { + "epoch": 0.027182249491670533, + "grad_norm": 0.7261708378791809, + "learning_rate": 1.3592579379236534e-05, + "loss": 0.3153, + "step": 1524 + }, + { + "epoch": 0.027200085613384224, + "grad_norm": 0.6957829594612122, + "learning_rate": 1.3601498394577239e-05, + "loss": 0.3352, + "step": 1525 + }, + { + "epoch": 0.02721792173509792, + "grad_norm": 0.8378706574440002, + "learning_rate": 1.3610417409917945e-05, + "loss": 0.3713, + "step": 1526 + }, + { + "epoch": 0.027235757856811614, + "grad_norm": 0.5874273777008057, + "learning_rate": 1.3619336425258653e-05, + "loss": 0.3264, + "step": 1527 + }, + { + "epoch": 0.02725359397852531, + "grad_norm": 0.6011341214179993, + "learning_rate": 1.3628255440599358e-05, + "loss": 0.3308, + "step": 1528 + }, + { + "epoch": 0.027271430100239004, + "grad_norm": 0.5864375829696655, + "learning_rate": 1.3637174455940066e-05, + "loss": 0.3237, + "step": 1529 + }, + { + "epoch": 0.0272892662219527, + "grad_norm": 0.8733957409858704, + "learning_rate": 1.3646093471280772e-05, + "loss": 0.3715, + "step": 1530 + }, + { + "epoch": 0.027307102343666394, + "grad_norm": 0.4712708294391632, + "learning_rate": 1.3655012486621477e-05, + "loss": 0.3008, + "step": 1531 + }, + { + "epoch": 0.02732493846538009, + "grad_norm": 0.5903174877166748, + "learning_rate": 1.3663931501962185e-05, + "loss": 0.351, + "step": 1532 + }, + { + "epoch": 0.027342774587093784, + "grad_norm": 0.5234763026237488, + "learning_rate": 1.367285051730289e-05, + "loss": 0.2853, + "step": 1533 + }, + { + "epoch": 0.02736061070880748, + "grad_norm": 0.6879488229751587, + "learning_rate": 1.3681769532643596e-05, + "loss": 0.3956, + "step": 1534 + }, + { + "epoch": 0.02737844683052117, + "grad_norm": 0.5998932123184204, + "learning_rate": 1.3690688547984304e-05, + "loss": 0.276, + "step": 1535 + }, + { + "epoch": 0.027396282952234865, + "grad_norm": 0.6094096899032593, + "learning_rate": 1.3699607563325009e-05, + "loss": 0.3341, + "step": 1536 + }, + { + "epoch": 0.02741411907394856, + "grad_norm": 0.6380568146705627, + "learning_rate": 1.3708526578665715e-05, + "loss": 0.2824, + "step": 1537 + }, + { + "epoch": 0.027431955195662255, + "grad_norm": 0.6752215623855591, + "learning_rate": 1.3717445594006423e-05, + "loss": 0.3041, + "step": 1538 + }, + { + "epoch": 0.02744979131737595, + "grad_norm": 0.7440658211708069, + "learning_rate": 1.3726364609347128e-05, + "loss": 0.4142, + "step": 1539 + }, + { + "epoch": 0.027467627439089645, + "grad_norm": 0.6443586349487305, + "learning_rate": 1.3735283624687836e-05, + "loss": 0.4107, + "step": 1540 + }, + { + "epoch": 0.02748546356080334, + "grad_norm": 0.648849368095398, + "learning_rate": 1.3744202640028542e-05, + "loss": 0.3403, + "step": 1541 + }, + { + "epoch": 0.027503299682517034, + "grad_norm": 0.6645941138267517, + "learning_rate": 1.3753121655369247e-05, + "loss": 0.3903, + "step": 1542 + }, + { + "epoch": 0.02752113580423073, + "grad_norm": 0.5554529428482056, + "learning_rate": 1.3762040670709955e-05, + "loss": 0.3771, + "step": 1543 + }, + { + "epoch": 0.027538971925944424, + "grad_norm": 0.5533734560012817, + "learning_rate": 1.3770959686050661e-05, + "loss": 0.3372, + "step": 1544 + }, + { + "epoch": 0.027556808047658116, + "grad_norm": 0.46411558985710144, + "learning_rate": 1.3779878701391366e-05, + "loss": 0.2343, + "step": 1545 + }, + { + "epoch": 0.02757464416937181, + "grad_norm": 0.5473881363868713, + "learning_rate": 1.3788797716732074e-05, + "loss": 0.3417, + "step": 1546 + }, + { + "epoch": 0.027592480291085505, + "grad_norm": 1.1106927394866943, + "learning_rate": 1.3797716732072779e-05, + "loss": 0.4138, + "step": 1547 + }, + { + "epoch": 0.0276103164127992, + "grad_norm": 0.5700728297233582, + "learning_rate": 1.3806635747413485e-05, + "loss": 0.3319, + "step": 1548 + }, + { + "epoch": 0.027628152534512895, + "grad_norm": 0.7012478113174438, + "learning_rate": 1.3815554762754193e-05, + "loss": 0.3755, + "step": 1549 + }, + { + "epoch": 0.02764598865622659, + "grad_norm": 0.5423607230186462, + "learning_rate": 1.3824473778094898e-05, + "loss": 0.3136, + "step": 1550 + }, + { + "epoch": 0.027663824777940285, + "grad_norm": 0.5336237549781799, + "learning_rate": 1.3833392793435606e-05, + "loss": 0.3408, + "step": 1551 + }, + { + "epoch": 0.02768166089965398, + "grad_norm": 0.4705270528793335, + "learning_rate": 1.3842311808776313e-05, + "loss": 0.3227, + "step": 1552 + }, + { + "epoch": 0.027699497021367675, + "grad_norm": 0.5503244996070862, + "learning_rate": 1.3851230824117017e-05, + "loss": 0.3757, + "step": 1553 + }, + { + "epoch": 0.02771733314308137, + "grad_norm": 0.5783112049102783, + "learning_rate": 1.3860149839457725e-05, + "loss": 0.286, + "step": 1554 + }, + { + "epoch": 0.02773516926479506, + "grad_norm": 0.6264418363571167, + "learning_rate": 1.3869068854798432e-05, + "loss": 0.3062, + "step": 1555 + }, + { + "epoch": 0.027753005386508756, + "grad_norm": 0.7298933863639832, + "learning_rate": 1.3877987870139136e-05, + "loss": 0.3553, + "step": 1556 + }, + { + "epoch": 0.02777084150822245, + "grad_norm": 0.4315170645713806, + "learning_rate": 1.3886906885479844e-05, + "loss": 0.2857, + "step": 1557 + }, + { + "epoch": 0.027788677629936146, + "grad_norm": 0.43308988213539124, + "learning_rate": 1.3895825900820549e-05, + "loss": 0.3102, + "step": 1558 + }, + { + "epoch": 0.02780651375164984, + "grad_norm": 0.5154407024383545, + "learning_rate": 1.3904744916161256e-05, + "loss": 0.2972, + "step": 1559 + }, + { + "epoch": 0.027824349873363536, + "grad_norm": 0.6666773557662964, + "learning_rate": 1.3913663931501964e-05, + "loss": 0.4002, + "step": 1560 + }, + { + "epoch": 0.02784218599507723, + "grad_norm": 0.4917806386947632, + "learning_rate": 1.3922582946842668e-05, + "loss": 0.3263, + "step": 1561 + }, + { + "epoch": 0.027860022116790926, + "grad_norm": 0.6318265199661255, + "learning_rate": 1.3931501962183376e-05, + "loss": 0.4311, + "step": 1562 + }, + { + "epoch": 0.02787785823850462, + "grad_norm": 0.5178397297859192, + "learning_rate": 1.3940420977524083e-05, + "loss": 0.3612, + "step": 1563 + }, + { + "epoch": 0.027895694360218316, + "grad_norm": 1.172103762626648, + "learning_rate": 1.3949339992864787e-05, + "loss": 0.3614, + "step": 1564 + }, + { + "epoch": 0.027913530481932007, + "grad_norm": 0.5760762691497803, + "learning_rate": 1.3958259008205495e-05, + "loss": 0.3181, + "step": 1565 + }, + { + "epoch": 0.027931366603645702, + "grad_norm": 0.5701383352279663, + "learning_rate": 1.3967178023546202e-05, + "loss": 0.3881, + "step": 1566 + }, + { + "epoch": 0.027949202725359397, + "grad_norm": 0.4625813066959381, + "learning_rate": 1.3976097038886907e-05, + "loss": 0.2817, + "step": 1567 + }, + { + "epoch": 0.027967038847073092, + "grad_norm": 0.5565279126167297, + "learning_rate": 1.3985016054227615e-05, + "loss": 0.4024, + "step": 1568 + }, + { + "epoch": 0.027984874968786787, + "grad_norm": 0.5355582237243652, + "learning_rate": 1.3993935069568321e-05, + "loss": 0.2787, + "step": 1569 + }, + { + "epoch": 0.02800271109050048, + "grad_norm": 0.5604333281517029, + "learning_rate": 1.4002854084909026e-05, + "loss": 0.2843, + "step": 1570 + }, + { + "epoch": 0.028020547212214177, + "grad_norm": 0.6077826023101807, + "learning_rate": 1.4011773100249734e-05, + "loss": 0.3973, + "step": 1571 + }, + { + "epoch": 0.02803838333392787, + "grad_norm": 0.6005956530570984, + "learning_rate": 1.4020692115590438e-05, + "loss": 0.3106, + "step": 1572 + }, + { + "epoch": 0.028056219455641566, + "grad_norm": 0.6257055997848511, + "learning_rate": 1.4029611130931145e-05, + "loss": 0.3178, + "step": 1573 + }, + { + "epoch": 0.02807405557735526, + "grad_norm": 0.7041438817977905, + "learning_rate": 1.4038530146271853e-05, + "loss": 0.3679, + "step": 1574 + }, + { + "epoch": 0.028091891699068953, + "grad_norm": 0.6367039084434509, + "learning_rate": 1.4047449161612558e-05, + "loss": 0.3408, + "step": 1575 + }, + { + "epoch": 0.028109727820782648, + "grad_norm": 0.6223627924919128, + "learning_rate": 1.4056368176953266e-05, + "loss": 0.339, + "step": 1576 + }, + { + "epoch": 0.028127563942496343, + "grad_norm": 0.7636797428131104, + "learning_rate": 1.4065287192293972e-05, + "loss": 0.3413, + "step": 1577 + }, + { + "epoch": 0.028145400064210038, + "grad_norm": 0.5388514995574951, + "learning_rate": 1.4074206207634677e-05, + "loss": 0.2687, + "step": 1578 + }, + { + "epoch": 0.028163236185923732, + "grad_norm": 0.4802737534046173, + "learning_rate": 1.4083125222975385e-05, + "loss": 0.3133, + "step": 1579 + }, + { + "epoch": 0.028181072307637427, + "grad_norm": 0.5418912768363953, + "learning_rate": 1.4092044238316091e-05, + "loss": 0.345, + "step": 1580 + }, + { + "epoch": 0.028198908429351122, + "grad_norm": 0.9693113565444946, + "learning_rate": 1.4100963253656796e-05, + "loss": 0.4423, + "step": 1581 + }, + { + "epoch": 0.028216744551064817, + "grad_norm": 0.7385473847389221, + "learning_rate": 1.4109882268997504e-05, + "loss": 0.3985, + "step": 1582 + }, + { + "epoch": 0.028234580672778512, + "grad_norm": 1.2881739139556885, + "learning_rate": 1.4118801284338209e-05, + "loss": 0.3586, + "step": 1583 + }, + { + "epoch": 0.028252416794492207, + "grad_norm": 0.6537035703659058, + "learning_rate": 1.4127720299678915e-05, + "loss": 0.3643, + "step": 1584 + }, + { + "epoch": 0.0282702529162059, + "grad_norm": 0.4728448688983917, + "learning_rate": 1.4136639315019623e-05, + "loss": 0.3705, + "step": 1585 + }, + { + "epoch": 0.028288089037919593, + "grad_norm": 0.6739163398742676, + "learning_rate": 1.4145558330360328e-05, + "loss": 0.415, + "step": 1586 + }, + { + "epoch": 0.02830592515963329, + "grad_norm": 0.6766735911369324, + "learning_rate": 1.4154477345701036e-05, + "loss": 0.3254, + "step": 1587 + }, + { + "epoch": 0.028323761281346983, + "grad_norm": 0.527630090713501, + "learning_rate": 1.4163396361041742e-05, + "loss": 0.3266, + "step": 1588 + }, + { + "epoch": 0.028341597403060678, + "grad_norm": 0.5813126564025879, + "learning_rate": 1.4172315376382447e-05, + "loss": 0.335, + "step": 1589 + }, + { + "epoch": 0.028359433524774373, + "grad_norm": 0.5767165422439575, + "learning_rate": 1.4181234391723155e-05, + "loss": 0.3075, + "step": 1590 + }, + { + "epoch": 0.028377269646488068, + "grad_norm": 0.5451858639717102, + "learning_rate": 1.4190153407063861e-05, + "loss": 0.3574, + "step": 1591 + }, + { + "epoch": 0.028395105768201763, + "grad_norm": 0.5933049917221069, + "learning_rate": 1.4199072422404566e-05, + "loss": 0.35, + "step": 1592 + }, + { + "epoch": 0.028412941889915458, + "grad_norm": 0.47686028480529785, + "learning_rate": 1.4207991437745274e-05, + "loss": 0.3517, + "step": 1593 + }, + { + "epoch": 0.028430778011629153, + "grad_norm": 1.1062206029891968, + "learning_rate": 1.421691045308598e-05, + "loss": 0.372, + "step": 1594 + }, + { + "epoch": 0.028448614133342844, + "grad_norm": 0.5275440812110901, + "learning_rate": 1.4225829468426685e-05, + "loss": 0.3307, + "step": 1595 + }, + { + "epoch": 0.02846645025505654, + "grad_norm": 0.5416312217712402, + "learning_rate": 1.4234748483767393e-05, + "loss": 0.3859, + "step": 1596 + }, + { + "epoch": 0.028484286376770234, + "grad_norm": 0.5466942191123962, + "learning_rate": 1.4243667499108098e-05, + "loss": 0.3436, + "step": 1597 + }, + { + "epoch": 0.02850212249848393, + "grad_norm": 0.6469510793685913, + "learning_rate": 1.4252586514448806e-05, + "loss": 0.3629, + "step": 1598 + }, + { + "epoch": 0.028519958620197624, + "grad_norm": 0.683988630771637, + "learning_rate": 1.4261505529789512e-05, + "loss": 0.372, + "step": 1599 + }, + { + "epoch": 0.02853779474191132, + "grad_norm": 0.533053457736969, + "learning_rate": 1.4270424545130217e-05, + "loss": 0.2931, + "step": 1600 + }, + { + "epoch": 0.028555630863625014, + "grad_norm": 0.5505555868148804, + "learning_rate": 1.4279343560470925e-05, + "loss": 0.357, + "step": 1601 + }, + { + "epoch": 0.02857346698533871, + "grad_norm": 0.5483716726303101, + "learning_rate": 1.4288262575811632e-05, + "loss": 0.364, + "step": 1602 + }, + { + "epoch": 0.028591303107052404, + "grad_norm": 0.6058002710342407, + "learning_rate": 1.4297181591152336e-05, + "loss": 0.3096, + "step": 1603 + }, + { + "epoch": 0.0286091392287661, + "grad_norm": 0.9553171992301941, + "learning_rate": 1.4306100606493044e-05, + "loss": 0.3439, + "step": 1604 + }, + { + "epoch": 0.02862697535047979, + "grad_norm": 0.4612777829170227, + "learning_rate": 1.431501962183375e-05, + "loss": 0.3047, + "step": 1605 + }, + { + "epoch": 0.028644811472193485, + "grad_norm": 0.7179811596870422, + "learning_rate": 1.4323938637174455e-05, + "loss": 0.3473, + "step": 1606 + }, + { + "epoch": 0.02866264759390718, + "grad_norm": 0.7585439682006836, + "learning_rate": 1.4332857652515163e-05, + "loss": 0.3683, + "step": 1607 + }, + { + "epoch": 0.028680483715620875, + "grad_norm": 0.5608382821083069, + "learning_rate": 1.434177666785587e-05, + "loss": 0.2939, + "step": 1608 + }, + { + "epoch": 0.02869831983733457, + "grad_norm": 0.8679600358009338, + "learning_rate": 1.4350695683196575e-05, + "loss": 0.3439, + "step": 1609 + }, + { + "epoch": 0.028716155959048265, + "grad_norm": 0.582264244556427, + "learning_rate": 1.4359614698537283e-05, + "loss": 0.3402, + "step": 1610 + }, + { + "epoch": 0.02873399208076196, + "grad_norm": 0.5689700245857239, + "learning_rate": 1.4368533713877987e-05, + "loss": 0.3582, + "step": 1611 + }, + { + "epoch": 0.028751828202475654, + "grad_norm": 0.5647953152656555, + "learning_rate": 1.4377452729218695e-05, + "loss": 0.3006, + "step": 1612 + }, + { + "epoch": 0.02876966432418935, + "grad_norm": 0.6635328531265259, + "learning_rate": 1.4386371744559402e-05, + "loss": 0.3907, + "step": 1613 + }, + { + "epoch": 0.028787500445903044, + "grad_norm": 0.45135289430618286, + "learning_rate": 1.4395290759900106e-05, + "loss": 0.3128, + "step": 1614 + }, + { + "epoch": 0.028805336567616736, + "grad_norm": 0.5480947494506836, + "learning_rate": 1.4404209775240815e-05, + "loss": 0.3452, + "step": 1615 + }, + { + "epoch": 0.02882317268933043, + "grad_norm": 0.5812448859214783, + "learning_rate": 1.4413128790581521e-05, + "loss": 0.3827, + "step": 1616 + }, + { + "epoch": 0.028841008811044126, + "grad_norm": 0.4756130874156952, + "learning_rate": 1.4422047805922226e-05, + "loss": 0.2957, + "step": 1617 + }, + { + "epoch": 0.02885884493275782, + "grad_norm": 0.614660382270813, + "learning_rate": 1.4430966821262934e-05, + "loss": 0.3268, + "step": 1618 + }, + { + "epoch": 0.028876681054471515, + "grad_norm": 0.5634617805480957, + "learning_rate": 1.443988583660364e-05, + "loss": 0.3221, + "step": 1619 + }, + { + "epoch": 0.02889451717618521, + "grad_norm": 0.5246202349662781, + "learning_rate": 1.4448804851944345e-05, + "loss": 0.3003, + "step": 1620 + }, + { + "epoch": 0.028912353297898905, + "grad_norm": 0.5220772624015808, + "learning_rate": 1.4457723867285053e-05, + "loss": 0.3124, + "step": 1621 + }, + { + "epoch": 0.0289301894196126, + "grad_norm": 0.5555007457733154, + "learning_rate": 1.4466642882625757e-05, + "loss": 0.321, + "step": 1622 + }, + { + "epoch": 0.028948025541326295, + "grad_norm": 0.45659974217414856, + "learning_rate": 1.4475561897966467e-05, + "loss": 0.3601, + "step": 1623 + }, + { + "epoch": 0.02896586166303999, + "grad_norm": 0.658943772315979, + "learning_rate": 1.4484480913307172e-05, + "loss": 0.3118, + "step": 1624 + }, + { + "epoch": 0.028983697784753685, + "grad_norm": 0.59748375415802, + "learning_rate": 1.4493399928647877e-05, + "loss": 0.2728, + "step": 1625 + }, + { + "epoch": 0.029001533906467376, + "grad_norm": 0.7809659242630005, + "learning_rate": 1.4502318943988585e-05, + "loss": 0.3662, + "step": 1626 + }, + { + "epoch": 0.02901937002818107, + "grad_norm": 0.8069256544113159, + "learning_rate": 1.4511237959329291e-05, + "loss": 0.2995, + "step": 1627 + }, + { + "epoch": 0.029037206149894766, + "grad_norm": 0.4573381543159485, + "learning_rate": 1.4520156974669996e-05, + "loss": 0.3255, + "step": 1628 + }, + { + "epoch": 0.02905504227160846, + "grad_norm": 0.7915101647377014, + "learning_rate": 1.4529075990010704e-05, + "loss": 0.3378, + "step": 1629 + }, + { + "epoch": 0.029072878393322156, + "grad_norm": 0.7040252685546875, + "learning_rate": 1.453799500535141e-05, + "loss": 0.3905, + "step": 1630 + }, + { + "epoch": 0.02909071451503585, + "grad_norm": 0.5259699821472168, + "learning_rate": 1.4546914020692115e-05, + "loss": 0.3348, + "step": 1631 + }, + { + "epoch": 0.029108550636749546, + "grad_norm": 0.5336809158325195, + "learning_rate": 1.4555833036032823e-05, + "loss": 0.2882, + "step": 1632 + }, + { + "epoch": 0.02912638675846324, + "grad_norm": 0.6732202172279358, + "learning_rate": 1.456475205137353e-05, + "loss": 0.3696, + "step": 1633 + }, + { + "epoch": 0.029144222880176936, + "grad_norm": 0.6707174777984619, + "learning_rate": 1.4573671066714237e-05, + "loss": 0.3388, + "step": 1634 + }, + { + "epoch": 0.02916205900189063, + "grad_norm": 0.6094953417778015, + "learning_rate": 1.4582590082054942e-05, + "loss": 0.3194, + "step": 1635 + }, + { + "epoch": 0.029179895123604322, + "grad_norm": 0.7700886726379395, + "learning_rate": 1.4591509097395647e-05, + "loss": 0.4018, + "step": 1636 + }, + { + "epoch": 0.029197731245318017, + "grad_norm": 0.5840463042259216, + "learning_rate": 1.4600428112736355e-05, + "loss": 0.3459, + "step": 1637 + }, + { + "epoch": 0.029215567367031712, + "grad_norm": 0.7443630695343018, + "learning_rate": 1.4609347128077061e-05, + "loss": 0.3808, + "step": 1638 + }, + { + "epoch": 0.029233403488745407, + "grad_norm": 0.5667780637741089, + "learning_rate": 1.4618266143417766e-05, + "loss": 0.2996, + "step": 1639 + }, + { + "epoch": 0.0292512396104591, + "grad_norm": 0.48666754364967346, + "learning_rate": 1.4627185158758474e-05, + "loss": 0.3225, + "step": 1640 + }, + { + "epoch": 0.029269075732172797, + "grad_norm": 0.5524598360061646, + "learning_rate": 1.463610417409918e-05, + "loss": 0.3021, + "step": 1641 + }, + { + "epoch": 0.02928691185388649, + "grad_norm": 0.6522731781005859, + "learning_rate": 1.4645023189439885e-05, + "loss": 0.4239, + "step": 1642 + }, + { + "epoch": 0.029304747975600186, + "grad_norm": 0.5739700198173523, + "learning_rate": 1.4653942204780593e-05, + "loss": 0.3228, + "step": 1643 + }, + { + "epoch": 0.02932258409731388, + "grad_norm": 0.47902143001556396, + "learning_rate": 1.46628612201213e-05, + "loss": 0.3283, + "step": 1644 + }, + { + "epoch": 0.029340420219027576, + "grad_norm": 0.6247063279151917, + "learning_rate": 1.4671780235462008e-05, + "loss": 0.3888, + "step": 1645 + }, + { + "epoch": 0.029358256340741268, + "grad_norm": 0.6175928115844727, + "learning_rate": 1.4680699250802712e-05, + "loss": 0.4071, + "step": 1646 + }, + { + "epoch": 0.029376092462454963, + "grad_norm": 0.6941092610359192, + "learning_rate": 1.4689618266143419e-05, + "loss": 0.317, + "step": 1647 + }, + { + "epoch": 0.029393928584168658, + "grad_norm": 0.6310111880302429, + "learning_rate": 1.4698537281484127e-05, + "loss": 0.375, + "step": 1648 + }, + { + "epoch": 0.029411764705882353, + "grad_norm": 0.7415862083435059, + "learning_rate": 1.4707456296824831e-05, + "loss": 0.3676, + "step": 1649 + }, + { + "epoch": 0.029429600827596047, + "grad_norm": 0.7599250078201294, + "learning_rate": 1.4716375312165536e-05, + "loss": 0.3719, + "step": 1650 + }, + { + "epoch": 0.029447436949309742, + "grad_norm": 0.5519864559173584, + "learning_rate": 1.4725294327506244e-05, + "loss": 0.3253, + "step": 1651 + }, + { + "epoch": 0.029465273071023437, + "grad_norm": 0.6969557404518127, + "learning_rate": 1.473421334284695e-05, + "loss": 0.3283, + "step": 1652 + }, + { + "epoch": 0.029483109192737132, + "grad_norm": 0.7012293934822083, + "learning_rate": 1.4743132358187655e-05, + "loss": 0.3987, + "step": 1653 + }, + { + "epoch": 0.029500945314450827, + "grad_norm": 0.5322942137718201, + "learning_rate": 1.4752051373528363e-05, + "loss": 0.3612, + "step": 1654 + }, + { + "epoch": 0.029518781436164522, + "grad_norm": 0.5493043065071106, + "learning_rate": 1.476097038886907e-05, + "loss": 0.3537, + "step": 1655 + }, + { + "epoch": 0.029536617557878213, + "grad_norm": 0.5930426120758057, + "learning_rate": 1.4769889404209774e-05, + "loss": 0.3685, + "step": 1656 + }, + { + "epoch": 0.02955445367959191, + "grad_norm": 0.5160814523696899, + "learning_rate": 1.4778808419550482e-05, + "loss": 0.353, + "step": 1657 + }, + { + "epoch": 0.029572289801305603, + "grad_norm": 0.9225820899009705, + "learning_rate": 1.4787727434891189e-05, + "loss": 0.3089, + "step": 1658 + }, + { + "epoch": 0.029590125923019298, + "grad_norm": 0.6422647833824158, + "learning_rate": 1.4796646450231897e-05, + "loss": 0.3474, + "step": 1659 + }, + { + "epoch": 0.029607962044732993, + "grad_norm": 0.5172878503799438, + "learning_rate": 1.4805565465572602e-05, + "loss": 0.33, + "step": 1660 + }, + { + "epoch": 0.029625798166446688, + "grad_norm": 0.7302867770195007, + "learning_rate": 1.4814484480913306e-05, + "loss": 0.3845, + "step": 1661 + }, + { + "epoch": 0.029643634288160383, + "grad_norm": 0.5681297183036804, + "learning_rate": 1.4823403496254016e-05, + "loss": 0.3181, + "step": 1662 + }, + { + "epoch": 0.029661470409874078, + "grad_norm": 0.5470948219299316, + "learning_rate": 1.483232251159472e-05, + "loss": 0.2965, + "step": 1663 + }, + { + "epoch": 0.029679306531587773, + "grad_norm": 0.9585771560668945, + "learning_rate": 1.4841241526935425e-05, + "loss": 0.3601, + "step": 1664 + }, + { + "epoch": 0.029697142653301468, + "grad_norm": 0.46626949310302734, + "learning_rate": 1.4850160542276134e-05, + "loss": 0.2903, + "step": 1665 + }, + { + "epoch": 0.02971497877501516, + "grad_norm": 0.5196533203125, + "learning_rate": 1.485907955761684e-05, + "loss": 0.3026, + "step": 1666 + }, + { + "epoch": 0.029732814896728854, + "grad_norm": 0.6445388793945312, + "learning_rate": 1.4867998572957545e-05, + "loss": 0.3207, + "step": 1667 + }, + { + "epoch": 0.02975065101844255, + "grad_norm": 0.49068230390548706, + "learning_rate": 1.4876917588298253e-05, + "loss": 0.3813, + "step": 1668 + }, + { + "epoch": 0.029768487140156244, + "grad_norm": 0.5983552932739258, + "learning_rate": 1.4885836603638959e-05, + "loss": 0.3986, + "step": 1669 + }, + { + "epoch": 0.02978632326186994, + "grad_norm": 0.6433863043785095, + "learning_rate": 1.4894755618979667e-05, + "loss": 0.3469, + "step": 1670 + }, + { + "epoch": 0.029804159383583634, + "grad_norm": 0.7386375069618225, + "learning_rate": 1.4903674634320372e-05, + "loss": 0.3405, + "step": 1671 + }, + { + "epoch": 0.02982199550529733, + "grad_norm": 0.7433804869651794, + "learning_rate": 1.4912593649661078e-05, + "loss": 0.3014, + "step": 1672 + }, + { + "epoch": 0.029839831627011024, + "grad_norm": 0.6054247617721558, + "learning_rate": 1.4921512665001786e-05, + "loss": 0.3161, + "step": 1673 + }, + { + "epoch": 0.02985766774872472, + "grad_norm": 0.7582512497901917, + "learning_rate": 1.4930431680342491e-05, + "loss": 0.3116, + "step": 1674 + }, + { + "epoch": 0.029875503870438413, + "grad_norm": 0.6766425967216492, + "learning_rate": 1.4939350695683196e-05, + "loss": 0.3331, + "step": 1675 + }, + { + "epoch": 0.029893339992152105, + "grad_norm": 0.5102553963661194, + "learning_rate": 1.4948269711023904e-05, + "loss": 0.2678, + "step": 1676 + }, + { + "epoch": 0.0299111761138658, + "grad_norm": 0.4020669758319855, + "learning_rate": 1.495718872636461e-05, + "loss": 0.2832, + "step": 1677 + }, + { + "epoch": 0.029929012235579495, + "grad_norm": 0.566681444644928, + "learning_rate": 1.4966107741705315e-05, + "loss": 0.3955, + "step": 1678 + }, + { + "epoch": 0.02994684835729319, + "grad_norm": 0.6386412382125854, + "learning_rate": 1.4975026757046023e-05, + "loss": 0.3676, + "step": 1679 + }, + { + "epoch": 0.029964684479006885, + "grad_norm": 0.6427171230316162, + "learning_rate": 1.498394577238673e-05, + "loss": 0.3432, + "step": 1680 + }, + { + "epoch": 0.02998252060072058, + "grad_norm": 0.6253827810287476, + "learning_rate": 1.4992864787727437e-05, + "loss": 0.3553, + "step": 1681 + }, + { + "epoch": 0.030000356722434274, + "grad_norm": 0.44797033071517944, + "learning_rate": 1.5001783803068142e-05, + "loss": 0.3329, + "step": 1682 + }, + { + "epoch": 0.03001819284414797, + "grad_norm": 0.5939450263977051, + "learning_rate": 1.5010702818408848e-05, + "loss": 0.3701, + "step": 1683 + }, + { + "epoch": 0.030036028965861664, + "grad_norm": 0.6008427739143372, + "learning_rate": 1.5019621833749556e-05, + "loss": 0.3999, + "step": 1684 + }, + { + "epoch": 0.03005386508757536, + "grad_norm": 0.5785388350486755, + "learning_rate": 1.5028540849090261e-05, + "loss": 0.344, + "step": 1685 + }, + { + "epoch": 0.03007170120928905, + "grad_norm": 0.4829639196395874, + "learning_rate": 1.5037459864430966e-05, + "loss": 0.3043, + "step": 1686 + }, + { + "epoch": 0.030089537331002746, + "grad_norm": 0.46917492151260376, + "learning_rate": 1.5046378879771676e-05, + "loss": 0.28, + "step": 1687 + }, + { + "epoch": 0.03010737345271644, + "grad_norm": 0.6790163516998291, + "learning_rate": 1.505529789511238e-05, + "loss": 0.3928, + "step": 1688 + }, + { + "epoch": 0.030125209574430135, + "grad_norm": 1.2584736347198486, + "learning_rate": 1.5064216910453085e-05, + "loss": 0.3235, + "step": 1689 + }, + { + "epoch": 0.03014304569614383, + "grad_norm": 0.6248006820678711, + "learning_rate": 1.5073135925793793e-05, + "loss": 0.4477, + "step": 1690 + }, + { + "epoch": 0.030160881817857525, + "grad_norm": 0.9073899388313293, + "learning_rate": 1.50820549411345e-05, + "loss": 0.3809, + "step": 1691 + }, + { + "epoch": 0.03017871793957122, + "grad_norm": 0.4276426136493683, + "learning_rate": 1.5090973956475204e-05, + "loss": 0.268, + "step": 1692 + }, + { + "epoch": 0.030196554061284915, + "grad_norm": 0.5526872873306274, + "learning_rate": 1.5099892971815912e-05, + "loss": 0.347, + "step": 1693 + }, + { + "epoch": 0.03021439018299861, + "grad_norm": 0.6889521479606628, + "learning_rate": 1.5108811987156619e-05, + "loss": 0.3236, + "step": 1694 + }, + { + "epoch": 0.030232226304712305, + "grad_norm": 0.5612066984176636, + "learning_rate": 1.5117731002497327e-05, + "loss": 0.3442, + "step": 1695 + }, + { + "epoch": 0.030250062426425996, + "grad_norm": 0.6143417358398438, + "learning_rate": 1.5126650017838031e-05, + "loss": 0.3385, + "step": 1696 + }, + { + "epoch": 0.03026789854813969, + "grad_norm": 0.5099233984947205, + "learning_rate": 1.5135569033178738e-05, + "loss": 0.2929, + "step": 1697 + }, + { + "epoch": 0.030285734669853386, + "grad_norm": 0.5673273801803589, + "learning_rate": 1.5144488048519446e-05, + "loss": 0.3271, + "step": 1698 + }, + { + "epoch": 0.03030357079156708, + "grad_norm": 0.6243776679039001, + "learning_rate": 1.515340706386015e-05, + "loss": 0.3393, + "step": 1699 + }, + { + "epoch": 0.030321406913280776, + "grad_norm": 0.44045859575271606, + "learning_rate": 1.5162326079200855e-05, + "loss": 0.3336, + "step": 1700 + }, + { + "epoch": 0.03033924303499447, + "grad_norm": 0.7044060230255127, + "learning_rate": 1.5171245094541563e-05, + "loss": 0.311, + "step": 1701 + }, + { + "epoch": 0.030357079156708166, + "grad_norm": 0.7421000003814697, + "learning_rate": 1.518016410988227e-05, + "loss": 0.3022, + "step": 1702 + }, + { + "epoch": 0.03037491527842186, + "grad_norm": 0.7464106678962708, + "learning_rate": 1.5189083125222974e-05, + "loss": 0.3607, + "step": 1703 + }, + { + "epoch": 0.030392751400135556, + "grad_norm": 0.547999918460846, + "learning_rate": 1.5198002140563682e-05, + "loss": 0.331, + "step": 1704 + }, + { + "epoch": 0.03041058752184925, + "grad_norm": 0.747925341129303, + "learning_rate": 1.5206921155904389e-05, + "loss": 0.3451, + "step": 1705 + }, + { + "epoch": 0.030428423643562942, + "grad_norm": 0.5418623089790344, + "learning_rate": 1.5215840171245097e-05, + "loss": 0.3059, + "step": 1706 + }, + { + "epoch": 0.030446259765276637, + "grad_norm": 0.7048314213752747, + "learning_rate": 1.5224759186585802e-05, + "loss": 0.4262, + "step": 1707 + }, + { + "epoch": 0.030464095886990332, + "grad_norm": 0.7050665616989136, + "learning_rate": 1.5233678201926508e-05, + "loss": 0.3639, + "step": 1708 + }, + { + "epoch": 0.030481932008704027, + "grad_norm": 0.6242355108261108, + "learning_rate": 1.5242597217267216e-05, + "loss": 0.3864, + "step": 1709 + }, + { + "epoch": 0.030499768130417722, + "grad_norm": 0.5299994945526123, + "learning_rate": 1.525151623260792e-05, + "loss": 0.3267, + "step": 1710 + }, + { + "epoch": 0.030517604252131417, + "grad_norm": 0.4984886646270752, + "learning_rate": 1.5260435247948627e-05, + "loss": 0.3338, + "step": 1711 + }, + { + "epoch": 0.03053544037384511, + "grad_norm": 0.6587681174278259, + "learning_rate": 1.5269354263289333e-05, + "loss": 0.3578, + "step": 1712 + }, + { + "epoch": 0.030553276495558807, + "grad_norm": 0.5379131436347961, + "learning_rate": 1.527827327863004e-05, + "loss": 0.3552, + "step": 1713 + }, + { + "epoch": 0.0305711126172725, + "grad_norm": 0.5867990851402283, + "learning_rate": 1.5287192293970746e-05, + "loss": 0.3984, + "step": 1714 + }, + { + "epoch": 0.030588948738986196, + "grad_norm": 0.5907850861549377, + "learning_rate": 1.5296111309311453e-05, + "loss": 0.3518, + "step": 1715 + }, + { + "epoch": 0.030606784860699888, + "grad_norm": 0.467940092086792, + "learning_rate": 1.530503032465216e-05, + "loss": 0.2849, + "step": 1716 + }, + { + "epoch": 0.030624620982413583, + "grad_norm": 0.4978936016559601, + "learning_rate": 1.5313949339992865e-05, + "loss": 0.3223, + "step": 1717 + }, + { + "epoch": 0.030642457104127278, + "grad_norm": 0.40754082798957825, + "learning_rate": 1.532286835533357e-05, + "loss": 0.293, + "step": 1718 + }, + { + "epoch": 0.030660293225840973, + "grad_norm": 0.41278594732284546, + "learning_rate": 1.5331787370674278e-05, + "loss": 0.3247, + "step": 1719 + }, + { + "epoch": 0.030678129347554667, + "grad_norm": 0.5860134363174438, + "learning_rate": 1.5340706386014984e-05, + "loss": 0.3044, + "step": 1720 + }, + { + "epoch": 0.030695965469268362, + "grad_norm": 0.4603513479232788, + "learning_rate": 1.534962540135569e-05, + "loss": 0.2784, + "step": 1721 + }, + { + "epoch": 0.030713801590982057, + "grad_norm": 0.5335546135902405, + "learning_rate": 1.5358544416696397e-05, + "loss": 0.2867, + "step": 1722 + }, + { + "epoch": 0.030731637712695752, + "grad_norm": 0.8522798418998718, + "learning_rate": 1.5367463432037104e-05, + "loss": 0.353, + "step": 1723 + }, + { + "epoch": 0.030749473834409447, + "grad_norm": 0.6604307889938354, + "learning_rate": 1.537638244737781e-05, + "loss": 0.3571, + "step": 1724 + }, + { + "epoch": 0.030767309956123142, + "grad_norm": 0.5790998935699463, + "learning_rate": 1.5385301462718516e-05, + "loss": 0.3584, + "step": 1725 + }, + { + "epoch": 0.030785146077836834, + "grad_norm": 0.728276252746582, + "learning_rate": 1.5394220478059223e-05, + "loss": 0.371, + "step": 1726 + }, + { + "epoch": 0.03080298219955053, + "grad_norm": 0.4201717674732208, + "learning_rate": 1.540313949339993e-05, + "loss": 0.2946, + "step": 1727 + }, + { + "epoch": 0.030820818321264223, + "grad_norm": 0.5037418603897095, + "learning_rate": 1.5412058508740635e-05, + "loss": 0.3255, + "step": 1728 + }, + { + "epoch": 0.03083865444297792, + "grad_norm": 0.5051530599594116, + "learning_rate": 1.5420977524081342e-05, + "loss": 0.3207, + "step": 1729 + }, + { + "epoch": 0.030856490564691613, + "grad_norm": 0.5607863664627075, + "learning_rate": 1.5429896539422048e-05, + "loss": 0.3102, + "step": 1730 + }, + { + "epoch": 0.030874326686405308, + "grad_norm": 0.5118757486343384, + "learning_rate": 1.5438815554762755e-05, + "loss": 0.3617, + "step": 1731 + }, + { + "epoch": 0.030892162808119003, + "grad_norm": 0.8359280228614807, + "learning_rate": 1.544773457010346e-05, + "loss": 0.31, + "step": 1732 + }, + { + "epoch": 0.030909998929832698, + "grad_norm": 0.7327261567115784, + "learning_rate": 1.5456653585444167e-05, + "loss": 0.3672, + "step": 1733 + }, + { + "epoch": 0.030927835051546393, + "grad_norm": 1.0370457172393799, + "learning_rate": 1.5465572600784874e-05, + "loss": 0.3358, + "step": 1734 + }, + { + "epoch": 0.030945671173260088, + "grad_norm": 0.5791711211204529, + "learning_rate": 1.547449161612558e-05, + "loss": 0.3458, + "step": 1735 + }, + { + "epoch": 0.03096350729497378, + "grad_norm": 1.1872961521148682, + "learning_rate": 1.5483410631466287e-05, + "loss": 0.3646, + "step": 1736 + }, + { + "epoch": 0.030981343416687474, + "grad_norm": 0.9485112428665161, + "learning_rate": 1.5492329646806993e-05, + "loss": 0.3421, + "step": 1737 + }, + { + "epoch": 0.03099917953840117, + "grad_norm": 0.8620749711990356, + "learning_rate": 1.55012486621477e-05, + "loss": 0.3419, + "step": 1738 + }, + { + "epoch": 0.031017015660114864, + "grad_norm": 0.7116780281066895, + "learning_rate": 1.5510167677488406e-05, + "loss": 0.4405, + "step": 1739 + }, + { + "epoch": 0.03103485178182856, + "grad_norm": 0.720676600933075, + "learning_rate": 1.5519086692829112e-05, + "loss": 0.3119, + "step": 1740 + }, + { + "epoch": 0.031052687903542254, + "grad_norm": 0.4957166612148285, + "learning_rate": 1.552800570816982e-05, + "loss": 0.3314, + "step": 1741 + }, + { + "epoch": 0.03107052402525595, + "grad_norm": 0.5247991681098938, + "learning_rate": 1.5536924723510525e-05, + "loss": 0.324, + "step": 1742 + }, + { + "epoch": 0.031088360146969644, + "grad_norm": 0.6245825886726379, + "learning_rate": 1.554584373885123e-05, + "loss": 0.3569, + "step": 1743 + }, + { + "epoch": 0.03110619626868334, + "grad_norm": 0.47443920373916626, + "learning_rate": 1.5554762754191938e-05, + "loss": 0.3254, + "step": 1744 + }, + { + "epoch": 0.031124032390397034, + "grad_norm": 0.532443642616272, + "learning_rate": 1.5563681769532644e-05, + "loss": 0.2956, + "step": 1745 + }, + { + "epoch": 0.031141868512110725, + "grad_norm": 0.6366216540336609, + "learning_rate": 1.557260078487335e-05, + "loss": 0.3432, + "step": 1746 + }, + { + "epoch": 0.03115970463382442, + "grad_norm": 0.4841454029083252, + "learning_rate": 1.5581519800214057e-05, + "loss": 0.3041, + "step": 1747 + }, + { + "epoch": 0.031177540755538115, + "grad_norm": 0.5549114942550659, + "learning_rate": 1.5590438815554763e-05, + "loss": 0.3335, + "step": 1748 + }, + { + "epoch": 0.03119537687725181, + "grad_norm": 0.5786772966384888, + "learning_rate": 1.559935783089547e-05, + "loss": 0.2767, + "step": 1749 + }, + { + "epoch": 0.031213212998965505, + "grad_norm": 0.4859549403190613, + "learning_rate": 1.5608276846236176e-05, + "loss": 0.3186, + "step": 1750 + }, + { + "epoch": 0.0312310491206792, + "grad_norm": 0.6097210645675659, + "learning_rate": 1.5617195861576882e-05, + "loss": 0.3325, + "step": 1751 + }, + { + "epoch": 0.031248885242392895, + "grad_norm": 0.5827273726463318, + "learning_rate": 1.562611487691759e-05, + "loss": 0.2936, + "step": 1752 + }, + { + "epoch": 0.03126672136410659, + "grad_norm": 0.9926132559776306, + "learning_rate": 1.56350338922583e-05, + "loss": 0.2715, + "step": 1753 + }, + { + "epoch": 0.031284557485820284, + "grad_norm": 0.6341984868049622, + "learning_rate": 1.5643952907599e-05, + "loss": 0.3361, + "step": 1754 + }, + { + "epoch": 0.03130239360753398, + "grad_norm": 0.5048485994338989, + "learning_rate": 1.5652871922939708e-05, + "loss": 0.2997, + "step": 1755 + }, + { + "epoch": 0.031320229729247674, + "grad_norm": 0.9564552307128906, + "learning_rate": 1.5661790938280414e-05, + "loss": 0.3162, + "step": 1756 + }, + { + "epoch": 0.03133806585096137, + "grad_norm": 0.5259284973144531, + "learning_rate": 1.567070995362112e-05, + "loss": 0.3426, + "step": 1757 + }, + { + "epoch": 0.031355901972675064, + "grad_norm": 0.4777657389640808, + "learning_rate": 1.5679628968961827e-05, + "loss": 0.3213, + "step": 1758 + }, + { + "epoch": 0.03137373809438876, + "grad_norm": 0.4671401381492615, + "learning_rate": 1.5688547984302533e-05, + "loss": 0.3262, + "step": 1759 + }, + { + "epoch": 0.031391574216102454, + "grad_norm": 0.6501715183258057, + "learning_rate": 1.569746699964324e-05, + "loss": 0.3214, + "step": 1760 + }, + { + "epoch": 0.03140941033781614, + "grad_norm": 0.6746953129768372, + "learning_rate": 1.5706386014983946e-05, + "loss": 0.3335, + "step": 1761 + }, + { + "epoch": 0.03142724645952984, + "grad_norm": 0.47077369689941406, + "learning_rate": 1.5715305030324652e-05, + "loss": 0.3455, + "step": 1762 + }, + { + "epoch": 0.03144508258124353, + "grad_norm": 0.5709850192070007, + "learning_rate": 1.572422404566536e-05, + "loss": 0.2756, + "step": 1763 + }, + { + "epoch": 0.03146291870295723, + "grad_norm": 0.6588783264160156, + "learning_rate": 1.573314306100607e-05, + "loss": 0.3302, + "step": 1764 + }, + { + "epoch": 0.03148075482467092, + "grad_norm": 0.519670307636261, + "learning_rate": 1.574206207634677e-05, + "loss": 0.3091, + "step": 1765 + }, + { + "epoch": 0.031498590946384616, + "grad_norm": 0.7215790748596191, + "learning_rate": 1.5750981091687478e-05, + "loss": 0.3603, + "step": 1766 + }, + { + "epoch": 0.03151642706809831, + "grad_norm": 0.565071702003479, + "learning_rate": 1.5759900107028184e-05, + "loss": 0.3369, + "step": 1767 + }, + { + "epoch": 0.031534263189812006, + "grad_norm": 0.5075246095657349, + "learning_rate": 1.576881912236889e-05, + "loss": 0.3287, + "step": 1768 + }, + { + "epoch": 0.0315520993115257, + "grad_norm": 0.6114002466201782, + "learning_rate": 1.5777738137709597e-05, + "loss": 0.3298, + "step": 1769 + }, + { + "epoch": 0.031569935433239396, + "grad_norm": 0.6642633676528931, + "learning_rate": 1.5786657153050303e-05, + "loss": 0.3377, + "step": 1770 + }, + { + "epoch": 0.03158777155495309, + "grad_norm": 0.5811492800712585, + "learning_rate": 1.579557616839101e-05, + "loss": 0.3749, + "step": 1771 + }, + { + "epoch": 0.031605607676666786, + "grad_norm": 0.708658754825592, + "learning_rate": 1.5804495183731716e-05, + "loss": 0.324, + "step": 1772 + }, + { + "epoch": 0.03162344379838048, + "grad_norm": 0.4351314306259155, + "learning_rate": 1.5813414199072423e-05, + "loss": 0.302, + "step": 1773 + }, + { + "epoch": 0.031641279920094176, + "grad_norm": 0.6305572986602783, + "learning_rate": 1.582233321441313e-05, + "loss": 0.3818, + "step": 1774 + }, + { + "epoch": 0.03165911604180787, + "grad_norm": 0.5669882297515869, + "learning_rate": 1.5831252229753835e-05, + "loss": 0.263, + "step": 1775 + }, + { + "epoch": 0.031676952163521566, + "grad_norm": 0.5446153283119202, + "learning_rate": 1.5840171245094542e-05, + "loss": 0.3097, + "step": 1776 + }, + { + "epoch": 0.03169478828523526, + "grad_norm": 0.7341017723083496, + "learning_rate": 1.5849090260435248e-05, + "loss": 0.4043, + "step": 1777 + }, + { + "epoch": 0.031712624406948955, + "grad_norm": 0.7255344986915588, + "learning_rate": 1.5858009275775958e-05, + "loss": 0.3509, + "step": 1778 + }, + { + "epoch": 0.03173046052866265, + "grad_norm": 0.6021090745925903, + "learning_rate": 1.586692829111666e-05, + "loss": 0.3776, + "step": 1779 + }, + { + "epoch": 0.031748296650376345, + "grad_norm": 0.5457401871681213, + "learning_rate": 1.5875847306457367e-05, + "loss": 0.3262, + "step": 1780 + }, + { + "epoch": 0.03176613277209003, + "grad_norm": 0.6532450914382935, + "learning_rate": 1.5884766321798074e-05, + "loss": 0.3474, + "step": 1781 + }, + { + "epoch": 0.03178396889380373, + "grad_norm": 0.5987659692764282, + "learning_rate": 1.589368533713878e-05, + "loss": 0.3386, + "step": 1782 + }, + { + "epoch": 0.03180180501551742, + "grad_norm": 0.5551632642745972, + "learning_rate": 1.5902604352479486e-05, + "loss": 0.3638, + "step": 1783 + }, + { + "epoch": 0.03181964113723112, + "grad_norm": 0.42713162302970886, + "learning_rate": 1.5911523367820193e-05, + "loss": 0.2944, + "step": 1784 + }, + { + "epoch": 0.03183747725894481, + "grad_norm": 0.6613355875015259, + "learning_rate": 1.59204423831609e-05, + "loss": 0.3616, + "step": 1785 + }, + { + "epoch": 0.03185531338065851, + "grad_norm": 0.6508033275604248, + "learning_rate": 1.5929361398501606e-05, + "loss": 0.351, + "step": 1786 + }, + { + "epoch": 0.0318731495023722, + "grad_norm": 0.4862409830093384, + "learning_rate": 1.5938280413842312e-05, + "loss": 0.2744, + "step": 1787 + }, + { + "epoch": 0.0318909856240859, + "grad_norm": 0.5413964986801147, + "learning_rate": 1.594719942918302e-05, + "loss": 0.3702, + "step": 1788 + }, + { + "epoch": 0.03190882174579959, + "grad_norm": 0.5709779858589172, + "learning_rate": 1.5956118444523728e-05, + "loss": 0.3765, + "step": 1789 + }, + { + "epoch": 0.03192665786751329, + "grad_norm": 0.5577303171157837, + "learning_rate": 1.596503745986443e-05, + "loss": 0.3755, + "step": 1790 + }, + { + "epoch": 0.03194449398922698, + "grad_norm": 0.653723955154419, + "learning_rate": 1.5973956475205137e-05, + "loss": 0.3781, + "step": 1791 + }, + { + "epoch": 0.03196233011094068, + "grad_norm": 0.6088376045227051, + "learning_rate": 1.5982875490545847e-05, + "loss": 0.3763, + "step": 1792 + }, + { + "epoch": 0.03198016623265437, + "grad_norm": 0.4915998578071594, + "learning_rate": 1.599179450588655e-05, + "loss": 0.306, + "step": 1793 + }, + { + "epoch": 0.03199800235436807, + "grad_norm": 0.6609599590301514, + "learning_rate": 1.6000713521227257e-05, + "loss": 0.2767, + "step": 1794 + }, + { + "epoch": 0.03201583847608176, + "grad_norm": 0.7062175273895264, + "learning_rate": 1.6009632536567963e-05, + "loss": 0.3303, + "step": 1795 + }, + { + "epoch": 0.03203367459779546, + "grad_norm": 0.6269365549087524, + "learning_rate": 1.601855155190867e-05, + "loss": 0.4083, + "step": 1796 + }, + { + "epoch": 0.03205151071950915, + "grad_norm": 0.4888173043727875, + "learning_rate": 1.6027470567249376e-05, + "loss": 0.3019, + "step": 1797 + }, + { + "epoch": 0.03206934684122285, + "grad_norm": 0.5467481017112732, + "learning_rate": 1.6036389582590082e-05, + "loss": 0.2895, + "step": 1798 + }, + { + "epoch": 0.03208718296293654, + "grad_norm": 0.6655282378196716, + "learning_rate": 1.604530859793079e-05, + "loss": 0.3434, + "step": 1799 + }, + { + "epoch": 0.03210501908465024, + "grad_norm": 0.5917126536369324, + "learning_rate": 1.6054227613271498e-05, + "loss": 0.3853, + "step": 1800 + }, + { + "epoch": 0.03212285520636393, + "grad_norm": 0.524863600730896, + "learning_rate": 1.60631466286122e-05, + "loss": 0.3772, + "step": 1801 + }, + { + "epoch": 0.03214069132807762, + "grad_norm": 0.8061359524726868, + "learning_rate": 1.6072065643952908e-05, + "loss": 0.4004, + "step": 1802 + }, + { + "epoch": 0.032158527449791315, + "grad_norm": 0.4886523187160492, + "learning_rate": 1.6080984659293617e-05, + "loss": 0.3083, + "step": 1803 + }, + { + "epoch": 0.03217636357150501, + "grad_norm": 0.6840428113937378, + "learning_rate": 1.608990367463432e-05, + "loss": 0.3674, + "step": 1804 + }, + { + "epoch": 0.032194199693218704, + "grad_norm": 0.6019099950790405, + "learning_rate": 1.6098822689975027e-05, + "loss": 0.342, + "step": 1805 + }, + { + "epoch": 0.0322120358149324, + "grad_norm": 0.5354987382888794, + "learning_rate": 1.6107741705315733e-05, + "loss": 0.3151, + "step": 1806 + }, + { + "epoch": 0.032229871936646094, + "grad_norm": 0.4807489812374115, + "learning_rate": 1.611666072065644e-05, + "loss": 0.3347, + "step": 1807 + }, + { + "epoch": 0.03224770805835979, + "grad_norm": 0.7448999881744385, + "learning_rate": 1.6125579735997146e-05, + "loss": 0.3125, + "step": 1808 + }, + { + "epoch": 0.032265544180073484, + "grad_norm": 0.6970858573913574, + "learning_rate": 1.6134498751337852e-05, + "loss": 0.3795, + "step": 1809 + }, + { + "epoch": 0.03228338030178718, + "grad_norm": 0.6009823679924011, + "learning_rate": 1.614341776667856e-05, + "loss": 0.3251, + "step": 1810 + }, + { + "epoch": 0.032301216423500874, + "grad_norm": 0.7352908849716187, + "learning_rate": 1.6152336782019265e-05, + "loss": 0.3759, + "step": 1811 + }, + { + "epoch": 0.03231905254521457, + "grad_norm": 0.5852219462394714, + "learning_rate": 1.616125579735997e-05, + "loss": 0.3477, + "step": 1812 + }, + { + "epoch": 0.032336888666928264, + "grad_norm": 0.6261390447616577, + "learning_rate": 1.6170174812700678e-05, + "loss": 0.2943, + "step": 1813 + }, + { + "epoch": 0.03235472478864196, + "grad_norm": 0.6401010155677795, + "learning_rate": 1.6179093828041388e-05, + "loss": 0.3198, + "step": 1814 + }, + { + "epoch": 0.032372560910355654, + "grad_norm": 0.6377858519554138, + "learning_rate": 1.618801284338209e-05, + "loss": 0.3611, + "step": 1815 + }, + { + "epoch": 0.03239039703206935, + "grad_norm": 0.507521390914917, + "learning_rate": 1.6196931858722797e-05, + "loss": 0.305, + "step": 1816 + }, + { + "epoch": 0.03240823315378304, + "grad_norm": 0.675403892993927, + "learning_rate": 1.6205850874063507e-05, + "loss": 0.3398, + "step": 1817 + }, + { + "epoch": 0.03242606927549674, + "grad_norm": 0.591806173324585, + "learning_rate": 1.621476988940421e-05, + "loss": 0.3112, + "step": 1818 + }, + { + "epoch": 0.03244390539721043, + "grad_norm": 0.5430218577384949, + "learning_rate": 1.6223688904744916e-05, + "loss": 0.3599, + "step": 1819 + }, + { + "epoch": 0.03246174151892413, + "grad_norm": 0.5644108653068542, + "learning_rate": 1.6232607920085622e-05, + "loss": 0.38, + "step": 1820 + }, + { + "epoch": 0.03247957764063782, + "grad_norm": 0.6039701104164124, + "learning_rate": 1.624152693542633e-05, + "loss": 0.3463, + "step": 1821 + }, + { + "epoch": 0.03249741376235151, + "grad_norm": 0.5967687368392944, + "learning_rate": 1.6250445950767035e-05, + "loss": 0.3288, + "step": 1822 + }, + { + "epoch": 0.032515249884065206, + "grad_norm": 0.6029810309410095, + "learning_rate": 1.625936496610774e-05, + "loss": 0.2951, + "step": 1823 + }, + { + "epoch": 0.0325330860057789, + "grad_norm": 0.9297475218772888, + "learning_rate": 1.6268283981448448e-05, + "loss": 0.3111, + "step": 1824 + }, + { + "epoch": 0.032550922127492596, + "grad_norm": 0.5336518883705139, + "learning_rate": 1.6277202996789158e-05, + "loss": 0.2874, + "step": 1825 + }, + { + "epoch": 0.03256875824920629, + "grad_norm": 0.6874714493751526, + "learning_rate": 1.628612201212986e-05, + "loss": 0.3765, + "step": 1826 + }, + { + "epoch": 0.032586594370919986, + "grad_norm": 0.9035260677337646, + "learning_rate": 1.6295041027470567e-05, + "loss": 0.2961, + "step": 1827 + }, + { + "epoch": 0.03260443049263368, + "grad_norm": 1.1584899425506592, + "learning_rate": 1.6303960042811277e-05, + "loss": 0.2838, + "step": 1828 + }, + { + "epoch": 0.032622266614347376, + "grad_norm": 0.6553873419761658, + "learning_rate": 1.631287905815198e-05, + "loss": 0.2886, + "step": 1829 + }, + { + "epoch": 0.03264010273606107, + "grad_norm": 0.8739922046661377, + "learning_rate": 1.6321798073492686e-05, + "loss": 0.3112, + "step": 1830 + }, + { + "epoch": 0.032657938857774765, + "grad_norm": 0.7506694793701172, + "learning_rate": 1.6330717088833393e-05, + "loss": 0.3454, + "step": 1831 + }, + { + "epoch": 0.03267577497948846, + "grad_norm": 0.5153954029083252, + "learning_rate": 1.63396361041741e-05, + "loss": 0.2918, + "step": 1832 + }, + { + "epoch": 0.032693611101202155, + "grad_norm": 0.7300745844841003, + "learning_rate": 1.6348555119514805e-05, + "loss": 0.2963, + "step": 1833 + }, + { + "epoch": 0.03271144722291585, + "grad_norm": 0.4347021281719208, + "learning_rate": 1.6357474134855512e-05, + "loss": 0.2743, + "step": 1834 + }, + { + "epoch": 0.032729283344629545, + "grad_norm": 0.4705832898616791, + "learning_rate": 1.6366393150196218e-05, + "loss": 0.3328, + "step": 1835 + }, + { + "epoch": 0.03274711946634324, + "grad_norm": 0.5846000909805298, + "learning_rate": 1.6375312165536928e-05, + "loss": 0.3243, + "step": 1836 + }, + { + "epoch": 0.032764955588056935, + "grad_norm": 0.4604929983615875, + "learning_rate": 1.638423118087763e-05, + "loss": 0.3167, + "step": 1837 + }, + { + "epoch": 0.03278279170977063, + "grad_norm": 0.5665956735610962, + "learning_rate": 1.6393150196218337e-05, + "loss": 0.3499, + "step": 1838 + }, + { + "epoch": 0.032800627831484325, + "grad_norm": 0.410715252161026, + "learning_rate": 1.6402069211559047e-05, + "loss": 0.287, + "step": 1839 + }, + { + "epoch": 0.03281846395319802, + "grad_norm": 0.6050565242767334, + "learning_rate": 1.641098822689975e-05, + "loss": 0.3052, + "step": 1840 + }, + { + "epoch": 0.032836300074911715, + "grad_norm": 0.6299166679382324, + "learning_rate": 1.6419907242240456e-05, + "loss": 0.3288, + "step": 1841 + }, + { + "epoch": 0.0328541361966254, + "grad_norm": 0.5011922121047974, + "learning_rate": 1.6428826257581166e-05, + "loss": 0.3266, + "step": 1842 + }, + { + "epoch": 0.0328719723183391, + "grad_norm": 0.5295569896697998, + "learning_rate": 1.643774527292187e-05, + "loss": 0.3224, + "step": 1843 + }, + { + "epoch": 0.03288980844005279, + "grad_norm": 0.5636371374130249, + "learning_rate": 1.6446664288262576e-05, + "loss": 0.337, + "step": 1844 + }, + { + "epoch": 0.03290764456176649, + "grad_norm": 0.8197712302207947, + "learning_rate": 1.6455583303603282e-05, + "loss": 0.3129, + "step": 1845 + }, + { + "epoch": 0.03292548068348018, + "grad_norm": 0.588846743106842, + "learning_rate": 1.646450231894399e-05, + "loss": 0.3359, + "step": 1846 + }, + { + "epoch": 0.03294331680519388, + "grad_norm": 0.6380104422569275, + "learning_rate": 1.6473421334284698e-05, + "loss": 0.3615, + "step": 1847 + }, + { + "epoch": 0.03296115292690757, + "grad_norm": 0.4611605107784271, + "learning_rate": 1.64823403496254e-05, + "loss": 0.3519, + "step": 1848 + }, + { + "epoch": 0.03297898904862127, + "grad_norm": 0.42634543776512146, + "learning_rate": 1.6491259364966108e-05, + "loss": 0.2981, + "step": 1849 + }, + { + "epoch": 0.03299682517033496, + "grad_norm": 0.5107312202453613, + "learning_rate": 1.6500178380306817e-05, + "loss": 0.3887, + "step": 1850 + }, + { + "epoch": 0.03301466129204866, + "grad_norm": 0.629116952419281, + "learning_rate": 1.650909739564752e-05, + "loss": 0.3734, + "step": 1851 + }, + { + "epoch": 0.03303249741376235, + "grad_norm": 0.5779350996017456, + "learning_rate": 1.6518016410988227e-05, + "loss": 0.2981, + "step": 1852 + }, + { + "epoch": 0.03305033353547605, + "grad_norm": 0.45092159509658813, + "learning_rate": 1.6526935426328936e-05, + "loss": 0.301, + "step": 1853 + }, + { + "epoch": 0.03306816965718974, + "grad_norm": 0.6753671169281006, + "learning_rate": 1.653585444166964e-05, + "loss": 0.3351, + "step": 1854 + }, + { + "epoch": 0.033086005778903436, + "grad_norm": 0.49014580249786377, + "learning_rate": 1.6544773457010346e-05, + "loss": 0.307, + "step": 1855 + }, + { + "epoch": 0.03310384190061713, + "grad_norm": 0.7182915806770325, + "learning_rate": 1.6553692472351056e-05, + "loss": 0.3411, + "step": 1856 + }, + { + "epoch": 0.033121678022330826, + "grad_norm": 0.5920625329017639, + "learning_rate": 1.656261148769176e-05, + "loss": 0.3439, + "step": 1857 + }, + { + "epoch": 0.03313951414404452, + "grad_norm": 0.7158372402191162, + "learning_rate": 1.6571530503032465e-05, + "loss": 0.3642, + "step": 1858 + }, + { + "epoch": 0.033157350265758216, + "grad_norm": 0.6029052734375, + "learning_rate": 1.658044951837317e-05, + "loss": 0.2697, + "step": 1859 + }, + { + "epoch": 0.03317518638747191, + "grad_norm": 0.43378040194511414, + "learning_rate": 1.6589368533713878e-05, + "loss": 0.2955, + "step": 1860 + }, + { + "epoch": 0.033193022509185606, + "grad_norm": 0.47693780064582825, + "learning_rate": 1.6598287549054587e-05, + "loss": 0.2576, + "step": 1861 + }, + { + "epoch": 0.033210858630899294, + "grad_norm": 0.4743706285953522, + "learning_rate": 1.660720656439529e-05, + "loss": 0.3305, + "step": 1862 + }, + { + "epoch": 0.03322869475261299, + "grad_norm": 0.5159932374954224, + "learning_rate": 1.6616125579735997e-05, + "loss": 0.356, + "step": 1863 + }, + { + "epoch": 0.033246530874326684, + "grad_norm": 0.5189605355262756, + "learning_rate": 1.6625044595076707e-05, + "loss": 0.341, + "step": 1864 + }, + { + "epoch": 0.03326436699604038, + "grad_norm": 0.8925591707229614, + "learning_rate": 1.663396361041741e-05, + "loss": 0.2823, + "step": 1865 + }, + { + "epoch": 0.033282203117754074, + "grad_norm": 0.648122251033783, + "learning_rate": 1.6642882625758116e-05, + "loss": 0.3705, + "step": 1866 + }, + { + "epoch": 0.03330003923946777, + "grad_norm": 0.5023366212844849, + "learning_rate": 1.6651801641098826e-05, + "loss": 0.2878, + "step": 1867 + }, + { + "epoch": 0.033317875361181463, + "grad_norm": 0.5993767380714417, + "learning_rate": 1.666072065643953e-05, + "loss": 0.3251, + "step": 1868 + }, + { + "epoch": 0.03333571148289516, + "grad_norm": 0.6962936520576477, + "learning_rate": 1.6669639671780235e-05, + "loss": 0.3821, + "step": 1869 + }, + { + "epoch": 0.03335354760460885, + "grad_norm": 0.5646497011184692, + "learning_rate": 1.667855868712094e-05, + "loss": 0.3277, + "step": 1870 + }, + { + "epoch": 0.03337138372632255, + "grad_norm": 0.47788524627685547, + "learning_rate": 1.6687477702461648e-05, + "loss": 0.299, + "step": 1871 + }, + { + "epoch": 0.03338921984803624, + "grad_norm": 0.4944382309913635, + "learning_rate": 1.6696396717802358e-05, + "loss": 0.3252, + "step": 1872 + }, + { + "epoch": 0.03340705596974994, + "grad_norm": 0.5403092503547668, + "learning_rate": 1.670531573314306e-05, + "loss": 0.2805, + "step": 1873 + }, + { + "epoch": 0.03342489209146363, + "grad_norm": 0.5597968697547913, + "learning_rate": 1.6714234748483767e-05, + "loss": 0.2987, + "step": 1874 + }, + { + "epoch": 0.03344272821317733, + "grad_norm": 0.7297112345695496, + "learning_rate": 1.6723153763824477e-05, + "loss": 0.3411, + "step": 1875 + }, + { + "epoch": 0.03346056433489102, + "grad_norm": 0.6010320782661438, + "learning_rate": 1.673207277916518e-05, + "loss": 0.3082, + "step": 1876 + }, + { + "epoch": 0.03347840045660472, + "grad_norm": 0.5059168338775635, + "learning_rate": 1.6740991794505886e-05, + "loss": 0.3048, + "step": 1877 + }, + { + "epoch": 0.03349623657831841, + "grad_norm": 0.6566490530967712, + "learning_rate": 1.6749910809846596e-05, + "loss": 0.3637, + "step": 1878 + }, + { + "epoch": 0.03351407270003211, + "grad_norm": 0.6993565559387207, + "learning_rate": 1.67588298251873e-05, + "loss": 0.3342, + "step": 1879 + }, + { + "epoch": 0.0335319088217458, + "grad_norm": 0.6532073616981506, + "learning_rate": 1.6767748840528005e-05, + "loss": 0.3718, + "step": 1880 + }, + { + "epoch": 0.0335497449434595, + "grad_norm": 0.5813199877738953, + "learning_rate": 1.6776667855868715e-05, + "loss": 0.3486, + "step": 1881 + }, + { + "epoch": 0.033567581065173185, + "grad_norm": 0.7437688708305359, + "learning_rate": 1.6785586871209418e-05, + "loss": 0.3141, + "step": 1882 + }, + { + "epoch": 0.03358541718688688, + "grad_norm": 0.6422274112701416, + "learning_rate": 1.6794505886550128e-05, + "loss": 0.304, + "step": 1883 + }, + { + "epoch": 0.033603253308600575, + "grad_norm": 0.7330365777015686, + "learning_rate": 1.680342490189083e-05, + "loss": 0.3759, + "step": 1884 + }, + { + "epoch": 0.03362108943031427, + "grad_norm": 0.6227088570594788, + "learning_rate": 1.6812343917231537e-05, + "loss": 0.295, + "step": 1885 + }, + { + "epoch": 0.033638925552027965, + "grad_norm": 1.264343500137329, + "learning_rate": 1.6821262932572247e-05, + "loss": 0.2965, + "step": 1886 + }, + { + "epoch": 0.03365676167374166, + "grad_norm": 0.7335077524185181, + "learning_rate": 1.683018194791295e-05, + "loss": 0.3577, + "step": 1887 + }, + { + "epoch": 0.033674597795455355, + "grad_norm": 0.49058955907821655, + "learning_rate": 1.6839100963253656e-05, + "loss": 0.2781, + "step": 1888 + }, + { + "epoch": 0.03369243391716905, + "grad_norm": 0.4763009548187256, + "learning_rate": 1.6848019978594366e-05, + "loss": 0.3053, + "step": 1889 + }, + { + "epoch": 0.033710270038882745, + "grad_norm": 0.9236243367195129, + "learning_rate": 1.685693899393507e-05, + "loss": 0.3252, + "step": 1890 + }, + { + "epoch": 0.03372810616059644, + "grad_norm": 0.4823125898838043, + "learning_rate": 1.6865858009275775e-05, + "loss": 0.3179, + "step": 1891 + }, + { + "epoch": 0.033745942282310135, + "grad_norm": 0.6278830170631409, + "learning_rate": 1.6874777024616485e-05, + "loss": 0.3573, + "step": 1892 + }, + { + "epoch": 0.03376377840402383, + "grad_norm": 0.7841582298278809, + "learning_rate": 1.6883696039957188e-05, + "loss": 0.3618, + "step": 1893 + }, + { + "epoch": 0.033781614525737524, + "grad_norm": 0.559029221534729, + "learning_rate": 1.6892615055297895e-05, + "loss": 0.3047, + "step": 1894 + }, + { + "epoch": 0.03379945064745122, + "grad_norm": 0.5621006488800049, + "learning_rate": 1.6901534070638604e-05, + "loss": 0.2932, + "step": 1895 + }, + { + "epoch": 0.033817286769164914, + "grad_norm": 0.6017270088195801, + "learning_rate": 1.6910453085979307e-05, + "loss": 0.3518, + "step": 1896 + }, + { + "epoch": 0.03383512289087861, + "grad_norm": 0.47827112674713135, + "learning_rate": 1.6919372101320017e-05, + "loss": 0.2866, + "step": 1897 + }, + { + "epoch": 0.033852959012592304, + "grad_norm": 0.572151780128479, + "learning_rate": 1.692829111666072e-05, + "loss": 0.3321, + "step": 1898 + }, + { + "epoch": 0.033870795134306, + "grad_norm": 0.47333481907844543, + "learning_rate": 1.6937210132001427e-05, + "loss": 0.313, + "step": 1899 + }, + { + "epoch": 0.033888631256019694, + "grad_norm": 0.5933525562286377, + "learning_rate": 1.6946129147342136e-05, + "loss": 0.2863, + "step": 1900 + }, + { + "epoch": 0.03390646737773339, + "grad_norm": 0.5429176688194275, + "learning_rate": 1.695504816268284e-05, + "loss": 0.3129, + "step": 1901 + }, + { + "epoch": 0.03392430349944708, + "grad_norm": 0.7529359459877014, + "learning_rate": 1.6963967178023546e-05, + "loss": 0.3121, + "step": 1902 + }, + { + "epoch": 0.03394213962116077, + "grad_norm": 0.5944604873657227, + "learning_rate": 1.6972886193364255e-05, + "loss": 0.3949, + "step": 1903 + }, + { + "epoch": 0.03395997574287447, + "grad_norm": 0.6306872367858887, + "learning_rate": 1.698180520870496e-05, + "loss": 0.4058, + "step": 1904 + }, + { + "epoch": 0.03397781186458816, + "grad_norm": 0.6834776401519775, + "learning_rate": 1.6990724224045665e-05, + "loss": 0.3669, + "step": 1905 + }, + { + "epoch": 0.033995647986301857, + "grad_norm": 0.5179616212844849, + "learning_rate": 1.6999643239386375e-05, + "loss": 0.371, + "step": 1906 + }, + { + "epoch": 0.03401348410801555, + "grad_norm": 0.7298871874809265, + "learning_rate": 1.7008562254727078e-05, + "loss": 0.3361, + "step": 1907 + }, + { + "epoch": 0.034031320229729246, + "grad_norm": 0.5836283564567566, + "learning_rate": 1.7017481270067787e-05, + "loss": 0.3226, + "step": 1908 + }, + { + "epoch": 0.03404915635144294, + "grad_norm": 0.5526504516601562, + "learning_rate": 1.702640028540849e-05, + "loss": 0.3, + "step": 1909 + }, + { + "epoch": 0.034066992473156636, + "grad_norm": 0.6150591969490051, + "learning_rate": 1.7035319300749197e-05, + "loss": 0.3208, + "step": 1910 + }, + { + "epoch": 0.03408482859487033, + "grad_norm": 0.4714590609073639, + "learning_rate": 1.7044238316089906e-05, + "loss": 0.2804, + "step": 1911 + }, + { + "epoch": 0.034102664716584026, + "grad_norm": 0.5982564091682434, + "learning_rate": 1.705315733143061e-05, + "loss": 0.2963, + "step": 1912 + }, + { + "epoch": 0.03412050083829772, + "grad_norm": 0.6177613735198975, + "learning_rate": 1.7062076346771316e-05, + "loss": 0.3265, + "step": 1913 + }, + { + "epoch": 0.034138336960011416, + "grad_norm": 0.9625240564346313, + "learning_rate": 1.7070995362112026e-05, + "loss": 0.4255, + "step": 1914 + }, + { + "epoch": 0.03415617308172511, + "grad_norm": 0.5063337683677673, + "learning_rate": 1.707991437745273e-05, + "loss": 0.3545, + "step": 1915 + }, + { + "epoch": 0.034174009203438806, + "grad_norm": 0.7352203726768494, + "learning_rate": 1.7088833392793435e-05, + "loss": 0.3495, + "step": 1916 + }, + { + "epoch": 0.0341918453251525, + "grad_norm": 0.5571827292442322, + "learning_rate": 1.7097752408134145e-05, + "loss": 0.314, + "step": 1917 + }, + { + "epoch": 0.034209681446866196, + "grad_norm": 0.5359615683555603, + "learning_rate": 1.7106671423474848e-05, + "loss": 0.3052, + "step": 1918 + }, + { + "epoch": 0.03422751756857989, + "grad_norm": 0.9982262849807739, + "learning_rate": 1.7115590438815558e-05, + "loss": 0.3557, + "step": 1919 + }, + { + "epoch": 0.034245353690293585, + "grad_norm": 0.5018155574798584, + "learning_rate": 1.7124509454156264e-05, + "loss": 0.3421, + "step": 1920 + }, + { + "epoch": 0.03426318981200728, + "grad_norm": 0.5375298261642456, + "learning_rate": 1.7133428469496967e-05, + "loss": 0.3424, + "step": 1921 + }, + { + "epoch": 0.03428102593372097, + "grad_norm": 0.6136654615402222, + "learning_rate": 1.7142347484837677e-05, + "loss": 0.3418, + "step": 1922 + }, + { + "epoch": 0.03429886205543466, + "grad_norm": 0.5145890116691589, + "learning_rate": 1.715126650017838e-05, + "loss": 0.3421, + "step": 1923 + }, + { + "epoch": 0.03431669817714836, + "grad_norm": 0.7545877695083618, + "learning_rate": 1.7160185515519086e-05, + "loss": 0.3328, + "step": 1924 + }, + { + "epoch": 0.03433453429886205, + "grad_norm": 0.5231462121009827, + "learning_rate": 1.7169104530859796e-05, + "loss": 0.3492, + "step": 1925 + }, + { + "epoch": 0.03435237042057575, + "grad_norm": 0.6672552227973938, + "learning_rate": 1.71780235462005e-05, + "loss": 0.2874, + "step": 1926 + }, + { + "epoch": 0.03437020654228944, + "grad_norm": 0.5202288627624512, + "learning_rate": 1.7186942561541205e-05, + "loss": 0.3062, + "step": 1927 + }, + { + "epoch": 0.03438804266400314, + "grad_norm": 0.4937322437763214, + "learning_rate": 1.7195861576881915e-05, + "loss": 0.377, + "step": 1928 + }, + { + "epoch": 0.03440587878571683, + "grad_norm": 0.5765814185142517, + "learning_rate": 1.7204780592222618e-05, + "loss": 0.3257, + "step": 1929 + }, + { + "epoch": 0.03442371490743053, + "grad_norm": 0.479180246591568, + "learning_rate": 1.7213699607563324e-05, + "loss": 0.2886, + "step": 1930 + }, + { + "epoch": 0.03444155102914422, + "grad_norm": 0.7214354872703552, + "learning_rate": 1.7222618622904034e-05, + "loss": 0.3852, + "step": 1931 + }, + { + "epoch": 0.03445938715085792, + "grad_norm": 0.4235411584377289, + "learning_rate": 1.7231537638244737e-05, + "loss": 0.2914, + "step": 1932 + }, + { + "epoch": 0.03447722327257161, + "grad_norm": 0.5858898162841797, + "learning_rate": 1.7240456653585447e-05, + "loss": 0.3587, + "step": 1933 + }, + { + "epoch": 0.03449505939428531, + "grad_norm": 0.517701268196106, + "learning_rate": 1.724937566892615e-05, + "loss": 0.3122, + "step": 1934 + }, + { + "epoch": 0.034512895515999, + "grad_norm": 0.521261990070343, + "learning_rate": 1.7258294684266856e-05, + "loss": 0.3977, + "step": 1935 + }, + { + "epoch": 0.0345307316377127, + "grad_norm": 0.4932325482368469, + "learning_rate": 1.7267213699607566e-05, + "loss": 0.3296, + "step": 1936 + }, + { + "epoch": 0.03454856775942639, + "grad_norm": 0.6196635365486145, + "learning_rate": 1.727613271494827e-05, + "loss": 0.3566, + "step": 1937 + }, + { + "epoch": 0.03456640388114009, + "grad_norm": 0.8973121643066406, + "learning_rate": 1.7285051730288975e-05, + "loss": 0.3155, + "step": 1938 + }, + { + "epoch": 0.03458424000285378, + "grad_norm": 0.6645421385765076, + "learning_rate": 1.7293970745629685e-05, + "loss": 0.3316, + "step": 1939 + }, + { + "epoch": 0.03460207612456748, + "grad_norm": 0.6201606392860413, + "learning_rate": 1.7302889760970388e-05, + "loss": 0.3755, + "step": 1940 + }, + { + "epoch": 0.03461991224628117, + "grad_norm": 0.4170590043067932, + "learning_rate": 1.7311808776311095e-05, + "loss": 0.2868, + "step": 1941 + }, + { + "epoch": 0.03463774836799486, + "grad_norm": 0.42760705947875977, + "learning_rate": 1.7320727791651804e-05, + "loss": 0.2734, + "step": 1942 + }, + { + "epoch": 0.034655584489708555, + "grad_norm": 0.6392554044723511, + "learning_rate": 1.7329646806992507e-05, + "loss": 0.306, + "step": 1943 + }, + { + "epoch": 0.03467342061142225, + "grad_norm": 0.37376856803894043, + "learning_rate": 1.7338565822333217e-05, + "loss": 0.2673, + "step": 1944 + }, + { + "epoch": 0.034691256733135944, + "grad_norm": 0.534887969493866, + "learning_rate": 1.7347484837673923e-05, + "loss": 0.3133, + "step": 1945 + }, + { + "epoch": 0.03470909285484964, + "grad_norm": 0.7607902884483337, + "learning_rate": 1.7356403853014626e-05, + "loss": 0.3811, + "step": 1946 + }, + { + "epoch": 0.034726928976563334, + "grad_norm": 0.5955637097358704, + "learning_rate": 1.7365322868355336e-05, + "loss": 0.413, + "step": 1947 + }, + { + "epoch": 0.03474476509827703, + "grad_norm": 0.5730603337287903, + "learning_rate": 1.737424188369604e-05, + "loss": 0.3591, + "step": 1948 + }, + { + "epoch": 0.034762601219990724, + "grad_norm": 0.7538769841194153, + "learning_rate": 1.7383160899036746e-05, + "loss": 0.3859, + "step": 1949 + }, + { + "epoch": 0.03478043734170442, + "grad_norm": 0.4605296552181244, + "learning_rate": 1.7392079914377455e-05, + "loss": 0.3387, + "step": 1950 + }, + { + "epoch": 0.034798273463418114, + "grad_norm": 0.5367212891578674, + "learning_rate": 1.740099892971816e-05, + "loss": 0.3332, + "step": 1951 + }, + { + "epoch": 0.03481610958513181, + "grad_norm": 0.5772411823272705, + "learning_rate": 1.7409917945058865e-05, + "loss": 0.3773, + "step": 1952 + }, + { + "epoch": 0.034833945706845504, + "grad_norm": 0.5616193413734436, + "learning_rate": 1.7418836960399574e-05, + "loss": 0.3162, + "step": 1953 + }, + { + "epoch": 0.0348517818285592, + "grad_norm": 0.5653926134109497, + "learning_rate": 1.7427755975740277e-05, + "loss": 0.3691, + "step": 1954 + }, + { + "epoch": 0.034869617950272894, + "grad_norm": 0.4547867774963379, + "learning_rate": 1.7436674991080987e-05, + "loss": 0.2656, + "step": 1955 + }, + { + "epoch": 0.03488745407198659, + "grad_norm": 0.6967268586158752, + "learning_rate": 1.7445594006421694e-05, + "loss": 0.4115, + "step": 1956 + }, + { + "epoch": 0.034905290193700284, + "grad_norm": 0.6118252873420715, + "learning_rate": 1.7454513021762397e-05, + "loss": 0.3779, + "step": 1957 + }, + { + "epoch": 0.03492312631541398, + "grad_norm": 0.618803083896637, + "learning_rate": 1.7463432037103106e-05, + "loss": 0.3375, + "step": 1958 + }, + { + "epoch": 0.03494096243712767, + "grad_norm": 0.5251666307449341, + "learning_rate": 1.7472351052443813e-05, + "loss": 0.28, + "step": 1959 + }, + { + "epoch": 0.03495879855884137, + "grad_norm": 0.5035682916641235, + "learning_rate": 1.7481270067784516e-05, + "loss": 0.3915, + "step": 1960 + }, + { + "epoch": 0.03497663468055506, + "grad_norm": 0.5120066404342651, + "learning_rate": 1.7490189083125226e-05, + "loss": 0.2507, + "step": 1961 + }, + { + "epoch": 0.03499447080226876, + "grad_norm": 0.9391610622406006, + "learning_rate": 1.749910809846593e-05, + "loss": 0.3243, + "step": 1962 + }, + { + "epoch": 0.035012306923982446, + "grad_norm": 0.6144405007362366, + "learning_rate": 1.7508027113806635e-05, + "loss": 0.3327, + "step": 1963 + }, + { + "epoch": 0.03503014304569614, + "grad_norm": 0.46523281931877136, + "learning_rate": 1.7516946129147345e-05, + "loss": 0.3191, + "step": 1964 + }, + { + "epoch": 0.035047979167409836, + "grad_norm": 0.5145363211631775, + "learning_rate": 1.7525865144488048e-05, + "loss": 0.2986, + "step": 1965 + }, + { + "epoch": 0.03506581528912353, + "grad_norm": 0.6042940616607666, + "learning_rate": 1.7534784159828757e-05, + "loss": 0.3554, + "step": 1966 + }, + { + "epoch": 0.035083651410837226, + "grad_norm": 0.6096752285957336, + "learning_rate": 1.7543703175169464e-05, + "loss": 0.3637, + "step": 1967 + }, + { + "epoch": 0.03510148753255092, + "grad_norm": 0.7093321681022644, + "learning_rate": 1.7552622190510167e-05, + "loss": 0.3532, + "step": 1968 + }, + { + "epoch": 0.035119323654264616, + "grad_norm": 0.7536479234695435, + "learning_rate": 1.7561541205850877e-05, + "loss": 0.427, + "step": 1969 + }, + { + "epoch": 0.03513715977597831, + "grad_norm": 0.5811682343482971, + "learning_rate": 1.7570460221191583e-05, + "loss": 0.3018, + "step": 1970 + }, + { + "epoch": 0.035154995897692005, + "grad_norm": 0.5609343647956848, + "learning_rate": 1.7579379236532286e-05, + "loss": 0.3644, + "step": 1971 + }, + { + "epoch": 0.0351728320194057, + "grad_norm": 0.6832839250564575, + "learning_rate": 1.7588298251872996e-05, + "loss": 0.3655, + "step": 1972 + }, + { + "epoch": 0.035190668141119395, + "grad_norm": 0.6124131679534912, + "learning_rate": 1.75972172672137e-05, + "loss": 0.3418, + "step": 1973 + }, + { + "epoch": 0.03520850426283309, + "grad_norm": 0.49659475684165955, + "learning_rate": 1.7606136282554405e-05, + "loss": 0.3601, + "step": 1974 + }, + { + "epoch": 0.035226340384546785, + "grad_norm": 0.5575053691864014, + "learning_rate": 1.7615055297895115e-05, + "loss": 0.357, + "step": 1975 + }, + { + "epoch": 0.03524417650626048, + "grad_norm": 0.7826827168464661, + "learning_rate": 1.7623974313235818e-05, + "loss": 0.347, + "step": 1976 + }, + { + "epoch": 0.035262012627974175, + "grad_norm": 0.49193763732910156, + "learning_rate": 1.7632893328576524e-05, + "loss": 0.3214, + "step": 1977 + }, + { + "epoch": 0.03527984874968787, + "grad_norm": 0.4988035261631012, + "learning_rate": 1.7641812343917234e-05, + "loss": 0.2832, + "step": 1978 + }, + { + "epoch": 0.035297684871401565, + "grad_norm": 0.6013734936714172, + "learning_rate": 1.7650731359257937e-05, + "loss": 0.4104, + "step": 1979 + }, + { + "epoch": 0.03531552099311526, + "grad_norm": 0.4478839337825775, + "learning_rate": 1.7659650374598647e-05, + "loss": 0.2856, + "step": 1980 + }, + { + "epoch": 0.035333357114828955, + "grad_norm": 0.48602673411369324, + "learning_rate": 1.7668569389939353e-05, + "loss": 0.3413, + "step": 1981 + }, + { + "epoch": 0.03535119323654265, + "grad_norm": 0.5484108328819275, + "learning_rate": 1.7677488405280056e-05, + "loss": 0.2965, + "step": 1982 + }, + { + "epoch": 0.03536902935825634, + "grad_norm": 0.7806851267814636, + "learning_rate": 1.7686407420620766e-05, + "loss": 0.3772, + "step": 1983 + }, + { + "epoch": 0.03538686547997003, + "grad_norm": 0.6586794853210449, + "learning_rate": 1.7695326435961472e-05, + "loss": 0.2941, + "step": 1984 + }, + { + "epoch": 0.03540470160168373, + "grad_norm": 0.5919625759124756, + "learning_rate": 1.7704245451302175e-05, + "loss": 0.3908, + "step": 1985 + }, + { + "epoch": 0.03542253772339742, + "grad_norm": 0.6526595950126648, + "learning_rate": 1.7713164466642885e-05, + "loss": 0.3964, + "step": 1986 + }, + { + "epoch": 0.03544037384511112, + "grad_norm": 0.6588309407234192, + "learning_rate": 1.7722083481983588e-05, + "loss": 0.3099, + "step": 1987 + }, + { + "epoch": 0.03545820996682481, + "grad_norm": 0.6340320706367493, + "learning_rate": 1.7731002497324294e-05, + "loss": 0.3396, + "step": 1988 + }, + { + "epoch": 0.03547604608853851, + "grad_norm": 0.5385860204696655, + "learning_rate": 1.7739921512665004e-05, + "loss": 0.3035, + "step": 1989 + }, + { + "epoch": 0.0354938822102522, + "grad_norm": 0.5645444393157959, + "learning_rate": 1.7748840528005707e-05, + "loss": 0.2993, + "step": 1990 + }, + { + "epoch": 0.0355117183319659, + "grad_norm": 0.5452271699905396, + "learning_rate": 1.7757759543346417e-05, + "loss": 0.3267, + "step": 1991 + }, + { + "epoch": 0.03552955445367959, + "grad_norm": 0.6098746061325073, + "learning_rate": 1.7766678558687123e-05, + "loss": 0.3503, + "step": 1992 + }, + { + "epoch": 0.03554739057539329, + "grad_norm": 0.5513267517089844, + "learning_rate": 1.7775597574027826e-05, + "loss": 0.3347, + "step": 1993 + }, + { + "epoch": 0.03556522669710698, + "grad_norm": 0.5899681448936462, + "learning_rate": 1.7784516589368536e-05, + "loss": 0.3196, + "step": 1994 + }, + { + "epoch": 0.03558306281882068, + "grad_norm": 0.4549456238746643, + "learning_rate": 1.7793435604709242e-05, + "loss": 0.2949, + "step": 1995 + }, + { + "epoch": 0.03560089894053437, + "grad_norm": 0.40827202796936035, + "learning_rate": 1.7802354620049945e-05, + "loss": 0.3124, + "step": 1996 + }, + { + "epoch": 0.035618735062248066, + "grad_norm": 0.6398820877075195, + "learning_rate": 1.7811273635390655e-05, + "loss": 0.4124, + "step": 1997 + }, + { + "epoch": 0.03563657118396176, + "grad_norm": 0.5127965807914734, + "learning_rate": 1.782019265073136e-05, + "loss": 0.3529, + "step": 1998 + }, + { + "epoch": 0.035654407305675456, + "grad_norm": 0.5904882550239563, + "learning_rate": 1.7829111666072065e-05, + "loss": 0.3783, + "step": 1999 + }, + { + "epoch": 0.03567224342738915, + "grad_norm": 0.4309324026107788, + "learning_rate": 1.7838030681412774e-05, + "loss": 0.3074, + "step": 2000 + }, + { + "epoch": 0.03567224342738915, + "eval_loss": 0.2980582118034363, + "eval_runtime": 1601.0462, + "eval_samples_per_second": 0.64, + "eval_steps_per_second": 0.107, + "step": 2000 + }, + { + "epoch": 0.035690079549102846, + "grad_norm": 0.4615575671195984, + "learning_rate": 1.7846949696753477e-05, + "loss": 0.2872, + "step": 2001 + }, + { + "epoch": 0.03570791567081654, + "grad_norm": 0.631496250629425, + "learning_rate": 1.7855868712094187e-05, + "loss": 0.3246, + "step": 2002 + }, + { + "epoch": 0.03572575179253023, + "grad_norm": 0.6639876365661621, + "learning_rate": 1.7864787727434893e-05, + "loss": 0.3376, + "step": 2003 + }, + { + "epoch": 0.035743587914243924, + "grad_norm": 0.7973620891571045, + "learning_rate": 1.7873706742775596e-05, + "loss": 0.348, + "step": 2004 + }, + { + "epoch": 0.03576142403595762, + "grad_norm": 0.5009528398513794, + "learning_rate": 1.7882625758116306e-05, + "loss": 0.2803, + "step": 2005 + }, + { + "epoch": 0.035779260157671314, + "grad_norm": 0.47912833094596863, + "learning_rate": 1.7891544773457013e-05, + "loss": 0.293, + "step": 2006 + }, + { + "epoch": 0.03579709627938501, + "grad_norm": 0.45988672971725464, + "learning_rate": 1.7900463788797716e-05, + "loss": 0.3069, + "step": 2007 + }, + { + "epoch": 0.035814932401098704, + "grad_norm": 0.5593786239624023, + "learning_rate": 1.7909382804138425e-05, + "loss": 0.3296, + "step": 2008 + }, + { + "epoch": 0.0358327685228124, + "grad_norm": 0.4574684500694275, + "learning_rate": 1.7918301819479132e-05, + "loss": 0.2787, + "step": 2009 + }, + { + "epoch": 0.03585060464452609, + "grad_norm": 0.6230701208114624, + "learning_rate": 1.7927220834819835e-05, + "loss": 0.3394, + "step": 2010 + }, + { + "epoch": 0.03586844076623979, + "grad_norm": 0.580540657043457, + "learning_rate": 1.7936139850160545e-05, + "loss": 0.275, + "step": 2011 + }, + { + "epoch": 0.03588627688795348, + "grad_norm": 0.8210400342941284, + "learning_rate": 1.7945058865501248e-05, + "loss": 0.3043, + "step": 2012 + }, + { + "epoch": 0.03590411300966718, + "grad_norm": 0.5110849142074585, + "learning_rate": 1.7953977880841954e-05, + "loss": 0.2394, + "step": 2013 + }, + { + "epoch": 0.03592194913138087, + "grad_norm": 0.5304075479507446, + "learning_rate": 1.7962896896182664e-05, + "loss": 0.3163, + "step": 2014 + }, + { + "epoch": 0.03593978525309457, + "grad_norm": 0.5845052599906921, + "learning_rate": 1.7971815911523367e-05, + "loss": 0.3757, + "step": 2015 + }, + { + "epoch": 0.03595762137480826, + "grad_norm": 0.4767301380634308, + "learning_rate": 1.7980734926864076e-05, + "loss": 0.3626, + "step": 2016 + }, + { + "epoch": 0.03597545749652196, + "grad_norm": 0.5138675570487976, + "learning_rate": 1.7989653942204783e-05, + "loss": 0.28, + "step": 2017 + }, + { + "epoch": 0.03599329361823565, + "grad_norm": 0.5862236618995667, + "learning_rate": 1.7998572957545486e-05, + "loss": 0.3175, + "step": 2018 + }, + { + "epoch": 0.03601112973994935, + "grad_norm": 0.5228481292724609, + "learning_rate": 1.8007491972886196e-05, + "loss": 0.3104, + "step": 2019 + }, + { + "epoch": 0.03602896586166304, + "grad_norm": 0.5921441912651062, + "learning_rate": 1.8016410988226902e-05, + "loss": 0.3723, + "step": 2020 + }, + { + "epoch": 0.03604680198337674, + "grad_norm": 0.38433346152305603, + "learning_rate": 1.8025330003567605e-05, + "loss": 0.2831, + "step": 2021 + }, + { + "epoch": 0.03606463810509043, + "grad_norm": 0.6258277893066406, + "learning_rate": 1.8034249018908315e-05, + "loss": 0.4046, + "step": 2022 + }, + { + "epoch": 0.03608247422680412, + "grad_norm": 0.4964272379875183, + "learning_rate": 1.804316803424902e-05, + "loss": 0.2942, + "step": 2023 + }, + { + "epoch": 0.036100310348517815, + "grad_norm": 0.5187031030654907, + "learning_rate": 1.8052087049589724e-05, + "loss": 0.3077, + "step": 2024 + }, + { + "epoch": 0.03611814647023151, + "grad_norm": 0.6166718602180481, + "learning_rate": 1.8061006064930434e-05, + "loss": 0.3355, + "step": 2025 + }, + { + "epoch": 0.036135982591945205, + "grad_norm": 0.44471755623817444, + "learning_rate": 1.8069925080271137e-05, + "loss": 0.2677, + "step": 2026 + }, + { + "epoch": 0.0361538187136589, + "grad_norm": 0.8543984293937683, + "learning_rate": 1.8078844095611847e-05, + "loss": 0.3373, + "step": 2027 + }, + { + "epoch": 0.036171654835372595, + "grad_norm": 0.5724572539329529, + "learning_rate": 1.8087763110952553e-05, + "loss": 0.3119, + "step": 2028 + }, + { + "epoch": 0.03618949095708629, + "grad_norm": 0.5950539708137512, + "learning_rate": 1.8096682126293256e-05, + "loss": 0.3323, + "step": 2029 + }, + { + "epoch": 0.036207327078799985, + "grad_norm": 0.4790019094944, + "learning_rate": 1.8105601141633966e-05, + "loss": 0.2828, + "step": 2030 + }, + { + "epoch": 0.03622516320051368, + "grad_norm": 0.9051364660263062, + "learning_rate": 1.8114520156974672e-05, + "loss": 0.3469, + "step": 2031 + }, + { + "epoch": 0.036242999322227375, + "grad_norm": 0.4470467269420624, + "learning_rate": 1.8123439172315375e-05, + "loss": 0.3137, + "step": 2032 + }, + { + "epoch": 0.03626083544394107, + "grad_norm": 0.39944523572921753, + "learning_rate": 1.8132358187656085e-05, + "loss": 0.3224, + "step": 2033 + }, + { + "epoch": 0.036278671565654765, + "grad_norm": 0.558316171169281, + "learning_rate": 1.814127720299679e-05, + "loss": 0.282, + "step": 2034 + }, + { + "epoch": 0.03629650768736846, + "grad_norm": 0.45616936683654785, + "learning_rate": 1.8150196218337494e-05, + "loss": 0.2741, + "step": 2035 + }, + { + "epoch": 0.036314343809082154, + "grad_norm": 0.5811489820480347, + "learning_rate": 1.8159115233678204e-05, + "loss": 0.3417, + "step": 2036 + }, + { + "epoch": 0.03633217993079585, + "grad_norm": 0.5237880349159241, + "learning_rate": 1.8168034249018907e-05, + "loss": 0.3467, + "step": 2037 + }, + { + "epoch": 0.036350016052509544, + "grad_norm": 0.535847008228302, + "learning_rate": 1.8176953264359617e-05, + "loss": 0.3689, + "step": 2038 + }, + { + "epoch": 0.03636785217422324, + "grad_norm": 0.6724188327789307, + "learning_rate": 1.8185872279700323e-05, + "loss": 0.3404, + "step": 2039 + }, + { + "epoch": 0.036385688295936934, + "grad_norm": 0.5290500521659851, + "learning_rate": 1.8194791295041026e-05, + "loss": 0.3456, + "step": 2040 + }, + { + "epoch": 0.03640352441765063, + "grad_norm": 0.5779337286949158, + "learning_rate": 1.8203710310381736e-05, + "loss": 0.2705, + "step": 2041 + }, + { + "epoch": 0.036421360539364324, + "grad_norm": 0.5553800463676453, + "learning_rate": 1.8212629325722442e-05, + "loss": 0.3096, + "step": 2042 + }, + { + "epoch": 0.03643919666107801, + "grad_norm": 0.43628913164138794, + "learning_rate": 1.8221548341063145e-05, + "loss": 0.3255, + "step": 2043 + }, + { + "epoch": 0.03645703278279171, + "grad_norm": 0.4505508542060852, + "learning_rate": 1.8230467356403855e-05, + "loss": 0.3219, + "step": 2044 + }, + { + "epoch": 0.0364748689045054, + "grad_norm": 0.5080452561378479, + "learning_rate": 1.823938637174456e-05, + "loss": 0.3251, + "step": 2045 + }, + { + "epoch": 0.0364927050262191, + "grad_norm": 0.6649768352508545, + "learning_rate": 1.8248305387085264e-05, + "loss": 0.4309, + "step": 2046 + }, + { + "epoch": 0.03651054114793279, + "grad_norm": 0.4447711408138275, + "learning_rate": 1.8257224402425974e-05, + "loss": 0.3008, + "step": 2047 + }, + { + "epoch": 0.036528377269646486, + "grad_norm": 0.44222530722618103, + "learning_rate": 1.826614341776668e-05, + "loss": 0.2426, + "step": 2048 + }, + { + "epoch": 0.03654621339136018, + "grad_norm": 0.47739681601524353, + "learning_rate": 1.8275062433107387e-05, + "loss": 0.3007, + "step": 2049 + }, + { + "epoch": 0.036564049513073876, + "grad_norm": 0.9375199675559998, + "learning_rate": 1.8283981448448093e-05, + "loss": 0.3428, + "step": 2050 + }, + { + "epoch": 0.03658188563478757, + "grad_norm": 0.48347771167755127, + "learning_rate": 1.8292900463788796e-05, + "loss": 0.3713, + "step": 2051 + }, + { + "epoch": 0.036599721756501266, + "grad_norm": 0.5927991271018982, + "learning_rate": 1.8301819479129506e-05, + "loss": 0.2797, + "step": 2052 + }, + { + "epoch": 0.03661755787821496, + "grad_norm": 0.6231663227081299, + "learning_rate": 1.8310738494470213e-05, + "loss": 0.2601, + "step": 2053 + }, + { + "epoch": 0.036635393999928656, + "grad_norm": 0.5645677447319031, + "learning_rate": 1.8319657509810916e-05, + "loss": 0.3493, + "step": 2054 + }, + { + "epoch": 0.03665323012164235, + "grad_norm": 0.5088529586791992, + "learning_rate": 1.8328576525151625e-05, + "loss": 0.2935, + "step": 2055 + }, + { + "epoch": 0.036671066243356046, + "grad_norm": 0.4417657256126404, + "learning_rate": 1.833749554049233e-05, + "loss": 0.2936, + "step": 2056 + }, + { + "epoch": 0.03668890236506974, + "grad_norm": 0.47601479291915894, + "learning_rate": 1.8346414555833035e-05, + "loss": 0.3226, + "step": 2057 + }, + { + "epoch": 0.036706738486783436, + "grad_norm": 0.49755915999412537, + "learning_rate": 1.8355333571173744e-05, + "loss": 0.255, + "step": 2058 + }, + { + "epoch": 0.03672457460849713, + "grad_norm": 0.6121598482131958, + "learning_rate": 1.836425258651445e-05, + "loss": 0.291, + "step": 2059 + }, + { + "epoch": 0.036742410730210825, + "grad_norm": 0.5487620830535889, + "learning_rate": 1.8373171601855154e-05, + "loss": 0.3913, + "step": 2060 + }, + { + "epoch": 0.03676024685192452, + "grad_norm": 0.422702431678772, + "learning_rate": 1.8382090617195864e-05, + "loss": 0.2662, + "step": 2061 + }, + { + "epoch": 0.036778082973638215, + "grad_norm": 0.5154445171356201, + "learning_rate": 1.839100963253657e-05, + "loss": 0.318, + "step": 2062 + }, + { + "epoch": 0.0367959190953519, + "grad_norm": 0.8006268739700317, + "learning_rate": 1.8399928647877276e-05, + "loss": 0.3853, + "step": 2063 + }, + { + "epoch": 0.0368137552170656, + "grad_norm": 0.45608943700790405, + "learning_rate": 1.8408847663217983e-05, + "loss": 0.3074, + "step": 2064 + }, + { + "epoch": 0.03683159133877929, + "grad_norm": 0.7384217977523804, + "learning_rate": 1.8417766678558686e-05, + "loss": 0.3378, + "step": 2065 + }, + { + "epoch": 0.03684942746049299, + "grad_norm": 0.5264842510223389, + "learning_rate": 1.8426685693899395e-05, + "loss": 0.3162, + "step": 2066 + }, + { + "epoch": 0.03686726358220668, + "grad_norm": 0.5197806358337402, + "learning_rate": 1.8435604709240102e-05, + "loss": 0.3134, + "step": 2067 + }, + { + "epoch": 0.03688509970392038, + "grad_norm": 0.5372104048728943, + "learning_rate": 1.8444523724580805e-05, + "loss": 0.2726, + "step": 2068 + }, + { + "epoch": 0.03690293582563407, + "grad_norm": 0.5655398368835449, + "learning_rate": 1.8453442739921515e-05, + "loss": 0.2893, + "step": 2069 + }, + { + "epoch": 0.03692077194734777, + "grad_norm": 1.0206730365753174, + "learning_rate": 1.846236175526222e-05, + "loss": 0.3049, + "step": 2070 + }, + { + "epoch": 0.03693860806906146, + "grad_norm": 0.7678471207618713, + "learning_rate": 1.8471280770602924e-05, + "loss": 0.3114, + "step": 2071 + }, + { + "epoch": 0.03695644419077516, + "grad_norm": 0.5387485027313232, + "learning_rate": 1.8480199785943634e-05, + "loss": 0.3264, + "step": 2072 + }, + { + "epoch": 0.03697428031248885, + "grad_norm": 0.5938040018081665, + "learning_rate": 1.848911880128434e-05, + "loss": 0.3455, + "step": 2073 + }, + { + "epoch": 0.03699211643420255, + "grad_norm": 0.5923523306846619, + "learning_rate": 1.8498037816625046e-05, + "loss": 0.3772, + "step": 2074 + }, + { + "epoch": 0.03700995255591624, + "grad_norm": 0.8520667552947998, + "learning_rate": 1.8506956831965753e-05, + "loss": 0.313, + "step": 2075 + }, + { + "epoch": 0.03702778867762994, + "grad_norm": 0.897028923034668, + "learning_rate": 1.8515875847306456e-05, + "loss": 0.2922, + "step": 2076 + }, + { + "epoch": 0.03704562479934363, + "grad_norm": 0.6438193321228027, + "learning_rate": 1.8524794862647166e-05, + "loss": 0.3407, + "step": 2077 + }, + { + "epoch": 0.03706346092105733, + "grad_norm": 0.4676547050476074, + "learning_rate": 1.8533713877987872e-05, + "loss": 0.3665, + "step": 2078 + }, + { + "epoch": 0.03708129704277102, + "grad_norm": 0.6205225586891174, + "learning_rate": 1.8542632893328575e-05, + "loss": 0.3666, + "step": 2079 + }, + { + "epoch": 0.03709913316448472, + "grad_norm": 0.4931584298610687, + "learning_rate": 1.8551551908669285e-05, + "loss": 0.2992, + "step": 2080 + }, + { + "epoch": 0.03711696928619841, + "grad_norm": 1.0366450548171997, + "learning_rate": 1.856047092400999e-05, + "loss": 0.3023, + "step": 2081 + }, + { + "epoch": 0.03713480540791211, + "grad_norm": 0.530633807182312, + "learning_rate": 1.8569389939350694e-05, + "loss": 0.2977, + "step": 2082 + }, + { + "epoch": 0.037152641529625795, + "grad_norm": 0.44546690583229065, + "learning_rate": 1.8578308954691404e-05, + "loss": 0.3121, + "step": 2083 + }, + { + "epoch": 0.03717047765133949, + "grad_norm": 0.535554051399231, + "learning_rate": 1.858722797003211e-05, + "loss": 0.3141, + "step": 2084 + }, + { + "epoch": 0.037188313773053185, + "grad_norm": 0.5873515009880066, + "learning_rate": 1.8596146985372817e-05, + "loss": 0.3546, + "step": 2085 + }, + { + "epoch": 0.03720614989476688, + "grad_norm": 0.5182445049285889, + "learning_rate": 1.8605066000713523e-05, + "loss": 0.3037, + "step": 2086 + }, + { + "epoch": 0.037223986016480574, + "grad_norm": 0.38863617181777954, + "learning_rate": 1.861398501605423e-05, + "loss": 0.2312, + "step": 2087 + }, + { + "epoch": 0.03724182213819427, + "grad_norm": 0.4798535406589508, + "learning_rate": 1.8622904031394936e-05, + "loss": 0.305, + "step": 2088 + }, + { + "epoch": 0.037259658259907964, + "grad_norm": 0.5328872799873352, + "learning_rate": 1.8631823046735642e-05, + "loss": 0.3493, + "step": 2089 + }, + { + "epoch": 0.03727749438162166, + "grad_norm": 0.6318294405937195, + "learning_rate": 1.8640742062076345e-05, + "loss": 0.3118, + "step": 2090 + }, + { + "epoch": 0.037295330503335354, + "grad_norm": 0.37853115797042847, + "learning_rate": 1.8649661077417055e-05, + "loss": 0.2864, + "step": 2091 + }, + { + "epoch": 0.03731316662504905, + "grad_norm": 0.41678526997566223, + "learning_rate": 1.865858009275776e-05, + "loss": 0.2949, + "step": 2092 + }, + { + "epoch": 0.037331002746762744, + "grad_norm": 0.4275762140750885, + "learning_rate": 1.8667499108098464e-05, + "loss": 0.2948, + "step": 2093 + }, + { + "epoch": 0.03734883886847644, + "grad_norm": 0.4504309892654419, + "learning_rate": 1.8676418123439174e-05, + "loss": 0.284, + "step": 2094 + }, + { + "epoch": 0.037366674990190134, + "grad_norm": 0.445270299911499, + "learning_rate": 1.868533713877988e-05, + "loss": 0.2869, + "step": 2095 + }, + { + "epoch": 0.03738451111190383, + "grad_norm": 0.9869561791419983, + "learning_rate": 1.8694256154120583e-05, + "loss": 0.351, + "step": 2096 + }, + { + "epoch": 0.037402347233617524, + "grad_norm": 0.616989254951477, + "learning_rate": 1.8703175169461293e-05, + "loss": 0.3719, + "step": 2097 + }, + { + "epoch": 0.03742018335533122, + "grad_norm": 0.9821555018424988, + "learning_rate": 1.8712094184802e-05, + "loss": 0.3739, + "step": 2098 + }, + { + "epoch": 0.03743801947704491, + "grad_norm": 0.6926895976066589, + "learning_rate": 1.8721013200142706e-05, + "loss": 0.3178, + "step": 2099 + }, + { + "epoch": 0.03745585559875861, + "grad_norm": 0.46907535195350647, + "learning_rate": 1.8729932215483412e-05, + "loss": 0.2822, + "step": 2100 + }, + { + "epoch": 0.0374736917204723, + "grad_norm": 0.46654942631721497, + "learning_rate": 1.873885123082412e-05, + "loss": 0.3239, + "step": 2101 + }, + { + "epoch": 0.037491527842186, + "grad_norm": 0.576643705368042, + "learning_rate": 1.8747770246164825e-05, + "loss": 0.3603, + "step": 2102 + }, + { + "epoch": 0.037509363963899686, + "grad_norm": 0.5382814407348633, + "learning_rate": 1.875668926150553e-05, + "loss": 0.3458, + "step": 2103 + }, + { + "epoch": 0.03752720008561338, + "grad_norm": 0.45998647809028625, + "learning_rate": 1.8765608276846235e-05, + "loss": 0.2816, + "step": 2104 + }, + { + "epoch": 0.037545036207327076, + "grad_norm": 0.5114442706108093, + "learning_rate": 1.8774527292186944e-05, + "loss": 0.3352, + "step": 2105 + }, + { + "epoch": 0.03756287232904077, + "grad_norm": 0.4888891577720642, + "learning_rate": 1.878344630752765e-05, + "loss": 0.3226, + "step": 2106 + }, + { + "epoch": 0.037580708450754466, + "grad_norm": 0.6054527759552002, + "learning_rate": 1.8792365322868354e-05, + "loss": 0.3493, + "step": 2107 + }, + { + "epoch": 0.03759854457246816, + "grad_norm": 0.5286481380462646, + "learning_rate": 1.8801284338209063e-05, + "loss": 0.3151, + "step": 2108 + }, + { + "epoch": 0.037616380694181856, + "grad_norm": 0.551657497882843, + "learning_rate": 1.881020335354977e-05, + "loss": 0.2909, + "step": 2109 + }, + { + "epoch": 0.03763421681589555, + "grad_norm": 0.524308443069458, + "learning_rate": 1.8819122368890476e-05, + "loss": 0.314, + "step": 2110 + }, + { + "epoch": 0.037652052937609246, + "grad_norm": 0.8535147309303284, + "learning_rate": 1.8828041384231183e-05, + "loss": 0.336, + "step": 2111 + }, + { + "epoch": 0.03766988905932294, + "grad_norm": 0.5297061800956726, + "learning_rate": 1.883696039957189e-05, + "loss": 0.425, + "step": 2112 + }, + { + "epoch": 0.037687725181036635, + "grad_norm": 0.46865183115005493, + "learning_rate": 1.8845879414912595e-05, + "loss": 0.252, + "step": 2113 + }, + { + "epoch": 0.03770556130275033, + "grad_norm": 0.5673788785934448, + "learning_rate": 1.8854798430253302e-05, + "loss": 0.366, + "step": 2114 + }, + { + "epoch": 0.037723397424464025, + "grad_norm": 0.4931465685367584, + "learning_rate": 1.8863717445594005e-05, + "loss": 0.3285, + "step": 2115 + }, + { + "epoch": 0.03774123354617772, + "grad_norm": 0.4411163926124573, + "learning_rate": 1.8872636460934714e-05, + "loss": 0.3042, + "step": 2116 + }, + { + "epoch": 0.037759069667891415, + "grad_norm": 0.7002178430557251, + "learning_rate": 1.888155547627542e-05, + "loss": 0.3235, + "step": 2117 + }, + { + "epoch": 0.03777690578960511, + "grad_norm": 0.5767307281494141, + "learning_rate": 1.8890474491616124e-05, + "loss": 0.3519, + "step": 2118 + }, + { + "epoch": 0.037794741911318805, + "grad_norm": 0.6850040555000305, + "learning_rate": 1.8899393506956834e-05, + "loss": 0.3319, + "step": 2119 + }, + { + "epoch": 0.0378125780330325, + "grad_norm": 0.4674622714519501, + "learning_rate": 1.890831252229754e-05, + "loss": 0.2911, + "step": 2120 + }, + { + "epoch": 0.037830414154746195, + "grad_norm": 0.4767864942550659, + "learning_rate": 1.8917231537638246e-05, + "loss": 0.345, + "step": 2121 + }, + { + "epoch": 0.03784825027645989, + "grad_norm": 0.3986469507217407, + "learning_rate": 1.8926150552978953e-05, + "loss": 0.3092, + "step": 2122 + }, + { + "epoch": 0.037866086398173585, + "grad_norm": 0.4365829527378082, + "learning_rate": 1.893506956831966e-05, + "loss": 0.2793, + "step": 2123 + }, + { + "epoch": 0.03788392251988727, + "grad_norm": 0.6047873497009277, + "learning_rate": 1.8943988583660366e-05, + "loss": 0.3328, + "step": 2124 + }, + { + "epoch": 0.03790175864160097, + "grad_norm": 0.42645469307899475, + "learning_rate": 1.8952907599001072e-05, + "loss": 0.3107, + "step": 2125 + }, + { + "epoch": 0.03791959476331466, + "grad_norm": 0.5384413003921509, + "learning_rate": 1.8961826614341778e-05, + "loss": 0.374, + "step": 2126 + }, + { + "epoch": 0.03793743088502836, + "grad_norm": 0.4081381559371948, + "learning_rate": 1.8970745629682485e-05, + "loss": 0.2542, + "step": 2127 + }, + { + "epoch": 0.03795526700674205, + "grad_norm": 0.4883652925491333, + "learning_rate": 1.897966464502319e-05, + "loss": 0.3227, + "step": 2128 + }, + { + "epoch": 0.03797310312845575, + "grad_norm": 0.45527878403663635, + "learning_rate": 1.8988583660363894e-05, + "loss": 0.2766, + "step": 2129 + }, + { + "epoch": 0.03799093925016944, + "grad_norm": 0.560852587223053, + "learning_rate": 1.8997502675704604e-05, + "loss": 0.3393, + "step": 2130 + }, + { + "epoch": 0.03800877537188314, + "grad_norm": 0.5652244687080383, + "learning_rate": 1.900642169104531e-05, + "loss": 0.3451, + "step": 2131 + }, + { + "epoch": 0.03802661149359683, + "grad_norm": 1.061940312385559, + "learning_rate": 1.9015340706386017e-05, + "loss": 0.2981, + "step": 2132 + }, + { + "epoch": 0.03804444761531053, + "grad_norm": 0.5608810186386108, + "learning_rate": 1.9024259721726723e-05, + "loss": 0.2944, + "step": 2133 + }, + { + "epoch": 0.03806228373702422, + "grad_norm": 0.6922007203102112, + "learning_rate": 1.903317873706743e-05, + "loss": 0.3341, + "step": 2134 + }, + { + "epoch": 0.03808011985873792, + "grad_norm": 0.5143476724624634, + "learning_rate": 1.9042097752408136e-05, + "loss": 0.2751, + "step": 2135 + }, + { + "epoch": 0.03809795598045161, + "grad_norm": 0.4784122109413147, + "learning_rate": 1.9051016767748842e-05, + "loss": 0.3042, + "step": 2136 + }, + { + "epoch": 0.038115792102165306, + "grad_norm": 0.41936033964157104, + "learning_rate": 1.905993578308955e-05, + "loss": 0.3194, + "step": 2137 + }, + { + "epoch": 0.038133628223879, + "grad_norm": 0.6206886172294617, + "learning_rate": 1.9068854798430255e-05, + "loss": 0.3369, + "step": 2138 + }, + { + "epoch": 0.038151464345592696, + "grad_norm": 0.4272455871105194, + "learning_rate": 1.907777381377096e-05, + "loss": 0.2785, + "step": 2139 + }, + { + "epoch": 0.03816930046730639, + "grad_norm": 0.39360836148262024, + "learning_rate": 1.9086692829111664e-05, + "loss": 0.322, + "step": 2140 + }, + { + "epoch": 0.038187136589020086, + "grad_norm": 0.4603310823440552, + "learning_rate": 1.9095611844452374e-05, + "loss": 0.3164, + "step": 2141 + }, + { + "epoch": 0.03820497271073378, + "grad_norm": 0.6076071858406067, + "learning_rate": 1.910453085979308e-05, + "loss": 0.3911, + "step": 2142 + }, + { + "epoch": 0.038222808832447476, + "grad_norm": 0.658783495426178, + "learning_rate": 1.9113449875133783e-05, + "loss": 0.2904, + "step": 2143 + }, + { + "epoch": 0.038240644954161164, + "grad_norm": 0.5335296988487244, + "learning_rate": 1.9122368890474493e-05, + "loss": 0.3337, + "step": 2144 + }, + { + "epoch": 0.03825848107587486, + "grad_norm": 0.5562210083007812, + "learning_rate": 1.91312879058152e-05, + "loss": 0.2884, + "step": 2145 + }, + { + "epoch": 0.038276317197588554, + "grad_norm": 0.5467308759689331, + "learning_rate": 1.9140206921155906e-05, + "loss": 0.2867, + "step": 2146 + }, + { + "epoch": 0.03829415331930225, + "grad_norm": 0.8593414425849915, + "learning_rate": 1.9149125936496612e-05, + "loss": 0.3441, + "step": 2147 + }, + { + "epoch": 0.038311989441015944, + "grad_norm": 0.5408951640129089, + "learning_rate": 1.915804495183732e-05, + "loss": 0.3497, + "step": 2148 + }, + { + "epoch": 0.03832982556272964, + "grad_norm": 0.5210184454917908, + "learning_rate": 1.9166963967178025e-05, + "loss": 0.3445, + "step": 2149 + }, + { + "epoch": 0.038347661684443334, + "grad_norm": 0.5488189458847046, + "learning_rate": 1.917588298251873e-05, + "loss": 0.3829, + "step": 2150 + }, + { + "epoch": 0.03836549780615703, + "grad_norm": 0.597741425037384, + "learning_rate": 1.9184801997859438e-05, + "loss": 0.3547, + "step": 2151 + }, + { + "epoch": 0.03838333392787072, + "grad_norm": 0.48983892798423767, + "learning_rate": 1.9193721013200144e-05, + "loss": 0.3371, + "step": 2152 + }, + { + "epoch": 0.03840117004958442, + "grad_norm": 0.4413568675518036, + "learning_rate": 1.920264002854085e-05, + "loss": 0.3583, + "step": 2153 + }, + { + "epoch": 0.03841900617129811, + "grad_norm": 0.9008107781410217, + "learning_rate": 1.9211559043881554e-05, + "loss": 0.3591, + "step": 2154 + }, + { + "epoch": 0.03843684229301181, + "grad_norm": 0.7133767604827881, + "learning_rate": 1.9220478059222263e-05, + "loss": 0.3278, + "step": 2155 + }, + { + "epoch": 0.0384546784147255, + "grad_norm": 0.5932398438453674, + "learning_rate": 1.922939707456297e-05, + "loss": 0.3929, + "step": 2156 + }, + { + "epoch": 0.0384725145364392, + "grad_norm": 0.5169602036476135, + "learning_rate": 1.9238316089903676e-05, + "loss": 0.3668, + "step": 2157 + }, + { + "epoch": 0.03849035065815289, + "grad_norm": 0.4599943161010742, + "learning_rate": 1.9247235105244382e-05, + "loss": 0.3313, + "step": 2158 + }, + { + "epoch": 0.03850818677986659, + "grad_norm": 0.5276328921318054, + "learning_rate": 1.925615412058509e-05, + "loss": 0.3127, + "step": 2159 + }, + { + "epoch": 0.03852602290158028, + "grad_norm": 0.4524444341659546, + "learning_rate": 1.9265073135925795e-05, + "loss": 0.2773, + "step": 2160 + }, + { + "epoch": 0.03854385902329398, + "grad_norm": 0.5512828826904297, + "learning_rate": 1.92739921512665e-05, + "loss": 0.3223, + "step": 2161 + }, + { + "epoch": 0.03856169514500767, + "grad_norm": 0.4261593222618103, + "learning_rate": 1.9282911166607208e-05, + "loss": 0.2805, + "step": 2162 + }, + { + "epoch": 0.03857953126672137, + "grad_norm": 1.0806833505630493, + "learning_rate": 1.9291830181947914e-05, + "loss": 0.3005, + "step": 2163 + }, + { + "epoch": 0.038597367388435055, + "grad_norm": 0.5254951119422913, + "learning_rate": 1.930074919728862e-05, + "loss": 0.3243, + "step": 2164 + }, + { + "epoch": 0.03861520351014875, + "grad_norm": 0.9907423257827759, + "learning_rate": 1.9309668212629327e-05, + "loss": 0.2794, + "step": 2165 + }, + { + "epoch": 0.038633039631862445, + "grad_norm": 0.5167833566665649, + "learning_rate": 1.9318587227970033e-05, + "loss": 0.297, + "step": 2166 + }, + { + "epoch": 0.03865087575357614, + "grad_norm": 0.7189813852310181, + "learning_rate": 1.932750624331074e-05, + "loss": 0.3538, + "step": 2167 + }, + { + "epoch": 0.038668711875289835, + "grad_norm": 0.4462776482105255, + "learning_rate": 1.9336425258651446e-05, + "loss": 0.3379, + "step": 2168 + }, + { + "epoch": 0.03868654799700353, + "grad_norm": 0.5882665514945984, + "learning_rate": 1.9345344273992153e-05, + "loss": 0.2956, + "step": 2169 + }, + { + "epoch": 0.038704384118717225, + "grad_norm": 0.41937246918678284, + "learning_rate": 1.935426328933286e-05, + "loss": 0.3274, + "step": 2170 + }, + { + "epoch": 0.03872222024043092, + "grad_norm": 0.418430358171463, + "learning_rate": 1.9363182304673565e-05, + "loss": 0.2888, + "step": 2171 + }, + { + "epoch": 0.038740056362144615, + "grad_norm": 0.4083843231201172, + "learning_rate": 1.9372101320014272e-05, + "loss": 0.3005, + "step": 2172 + }, + { + "epoch": 0.03875789248385831, + "grad_norm": 0.4693141579627991, + "learning_rate": 1.9381020335354978e-05, + "loss": 0.3078, + "step": 2173 + }, + { + "epoch": 0.038775728605572005, + "grad_norm": 0.43590137362480164, + "learning_rate": 1.9389939350695685e-05, + "loss": 0.302, + "step": 2174 + }, + { + "epoch": 0.0387935647272857, + "grad_norm": 0.5631271004676819, + "learning_rate": 1.939885836603639e-05, + "loss": 0.2835, + "step": 2175 + }, + { + "epoch": 0.038811400848999394, + "grad_norm": 0.6374658346176147, + "learning_rate": 1.9407777381377097e-05, + "loss": 0.2953, + "step": 2176 + }, + { + "epoch": 0.03882923697071309, + "grad_norm": 0.5642484426498413, + "learning_rate": 1.9416696396717804e-05, + "loss": 0.2956, + "step": 2177 + }, + { + "epoch": 0.038847073092426784, + "grad_norm": 0.42883986234664917, + "learning_rate": 1.942561541205851e-05, + "loss": 0.2713, + "step": 2178 + }, + { + "epoch": 0.03886490921414048, + "grad_norm": 0.6185327768325806, + "learning_rate": 1.9434534427399213e-05, + "loss": 0.3097, + "step": 2179 + }, + { + "epoch": 0.038882745335854174, + "grad_norm": 0.47632378339767456, + "learning_rate": 1.9443453442739923e-05, + "loss": 0.3023, + "step": 2180 + }, + { + "epoch": 0.03890058145756787, + "grad_norm": 0.6266023516654968, + "learning_rate": 1.945237245808063e-05, + "loss": 0.3306, + "step": 2181 + }, + { + "epoch": 0.038918417579281564, + "grad_norm": 0.7256380915641785, + "learning_rate": 1.9461291473421336e-05, + "loss": 0.2863, + "step": 2182 + }, + { + "epoch": 0.03893625370099526, + "grad_norm": 0.5386956334114075, + "learning_rate": 1.9470210488762042e-05, + "loss": 0.331, + "step": 2183 + }, + { + "epoch": 0.03895408982270895, + "grad_norm": 0.5050217509269714, + "learning_rate": 1.947912950410275e-05, + "loss": 0.2826, + "step": 2184 + }, + { + "epoch": 0.03897192594442264, + "grad_norm": 0.7591911554336548, + "learning_rate": 1.9488048519443455e-05, + "loss": 0.3181, + "step": 2185 + }, + { + "epoch": 0.03898976206613634, + "grad_norm": 0.5025246739387512, + "learning_rate": 1.949696753478416e-05, + "loss": 0.2636, + "step": 2186 + }, + { + "epoch": 0.03900759818785003, + "grad_norm": 0.6930098533630371, + "learning_rate": 1.9505886550124867e-05, + "loss": 0.3118, + "step": 2187 + }, + { + "epoch": 0.03902543430956373, + "grad_norm": 0.6897455453872681, + "learning_rate": 1.9514805565465574e-05, + "loss": 0.3103, + "step": 2188 + }, + { + "epoch": 0.03904327043127742, + "grad_norm": 0.4128413200378418, + "learning_rate": 1.952372458080628e-05, + "loss": 0.3076, + "step": 2189 + }, + { + "epoch": 0.039061106552991116, + "grad_norm": 0.5418302416801453, + "learning_rate": 1.9532643596146987e-05, + "loss": 0.2823, + "step": 2190 + }, + { + "epoch": 0.03907894267470481, + "grad_norm": 0.8713166117668152, + "learning_rate": 1.9541562611487693e-05, + "loss": 0.2933, + "step": 2191 + }, + { + "epoch": 0.039096778796418506, + "grad_norm": 0.4038330018520355, + "learning_rate": 1.95504816268284e-05, + "loss": 0.3019, + "step": 2192 + }, + { + "epoch": 0.0391146149181322, + "grad_norm": 0.4462745189666748, + "learning_rate": 1.9559400642169106e-05, + "loss": 0.3249, + "step": 2193 + }, + { + "epoch": 0.039132451039845896, + "grad_norm": 0.5353034138679504, + "learning_rate": 1.9568319657509812e-05, + "loss": 0.2453, + "step": 2194 + }, + { + "epoch": 0.03915028716155959, + "grad_norm": 0.45148083567619324, + "learning_rate": 1.957723867285052e-05, + "loss": 0.3134, + "step": 2195 + }, + { + "epoch": 0.039168123283273286, + "grad_norm": 0.48606717586517334, + "learning_rate": 1.9586157688191225e-05, + "loss": 0.3017, + "step": 2196 + }, + { + "epoch": 0.03918595940498698, + "grad_norm": 0.6264825463294983, + "learning_rate": 1.959507670353193e-05, + "loss": 0.2939, + "step": 2197 + }, + { + "epoch": 0.039203795526700676, + "grad_norm": 0.481516569852829, + "learning_rate": 1.9603995718872638e-05, + "loss": 0.2816, + "step": 2198 + }, + { + "epoch": 0.03922163164841437, + "grad_norm": 0.5619494318962097, + "learning_rate": 1.9612914734213344e-05, + "loss": 0.3279, + "step": 2199 + }, + { + "epoch": 0.039239467770128066, + "grad_norm": 0.4352967441082001, + "learning_rate": 1.962183374955405e-05, + "loss": 0.2899, + "step": 2200 + }, + { + "epoch": 0.03925730389184176, + "grad_norm": 0.4811791479587555, + "learning_rate": 1.9630752764894757e-05, + "loss": 0.3475, + "step": 2201 + }, + { + "epoch": 0.039275140013555455, + "grad_norm": 0.5548359751701355, + "learning_rate": 1.9639671780235463e-05, + "loss": 0.2496, + "step": 2202 + }, + { + "epoch": 0.03929297613526915, + "grad_norm": 0.4365730285644531, + "learning_rate": 1.964859079557617e-05, + "loss": 0.2935, + "step": 2203 + }, + { + "epoch": 0.03931081225698284, + "grad_norm": 0.3903549313545227, + "learning_rate": 1.9657509810916876e-05, + "loss": 0.3007, + "step": 2204 + }, + { + "epoch": 0.03932864837869653, + "grad_norm": 0.5752881765365601, + "learning_rate": 1.9666428826257582e-05, + "loss": 0.3403, + "step": 2205 + }, + { + "epoch": 0.03934648450041023, + "grad_norm": 0.4807453155517578, + "learning_rate": 1.967534784159829e-05, + "loss": 0.2952, + "step": 2206 + }, + { + "epoch": 0.03936432062212392, + "grad_norm": 0.5764487981796265, + "learning_rate": 1.9684266856938995e-05, + "loss": 0.363, + "step": 2207 + }, + { + "epoch": 0.03938215674383762, + "grad_norm": 0.5190560221672058, + "learning_rate": 1.96931858722797e-05, + "loss": 0.2629, + "step": 2208 + }, + { + "epoch": 0.03939999286555131, + "grad_norm": 0.7670960426330566, + "learning_rate": 1.9702104887620408e-05, + "loss": 0.3365, + "step": 2209 + }, + { + "epoch": 0.03941782898726501, + "grad_norm": 0.45047062635421753, + "learning_rate": 1.9711023902961114e-05, + "loss": 0.2565, + "step": 2210 + }, + { + "epoch": 0.0394356651089787, + "grad_norm": 0.4333064556121826, + "learning_rate": 1.971994291830182e-05, + "loss": 0.2913, + "step": 2211 + }, + { + "epoch": 0.0394535012306924, + "grad_norm": 0.4812524914741516, + "learning_rate": 1.9728861933642527e-05, + "loss": 0.2618, + "step": 2212 + }, + { + "epoch": 0.03947133735240609, + "grad_norm": 0.7026241421699524, + "learning_rate": 1.9737780948983233e-05, + "loss": 0.315, + "step": 2213 + }, + { + "epoch": 0.03948917347411979, + "grad_norm": 0.6039960384368896, + "learning_rate": 1.974669996432394e-05, + "loss": 0.2947, + "step": 2214 + }, + { + "epoch": 0.03950700959583348, + "grad_norm": 0.5692183971405029, + "learning_rate": 1.9755618979664646e-05, + "loss": 0.3261, + "step": 2215 + }, + { + "epoch": 0.03952484571754718, + "grad_norm": 0.9089668393135071, + "learning_rate": 1.9764537995005353e-05, + "loss": 0.3524, + "step": 2216 + }, + { + "epoch": 0.03954268183926087, + "grad_norm": 0.6296941041946411, + "learning_rate": 1.977345701034606e-05, + "loss": 0.3383, + "step": 2217 + }, + { + "epoch": 0.03956051796097457, + "grad_norm": 0.4746566414833069, + "learning_rate": 1.9782376025686765e-05, + "loss": 0.2781, + "step": 2218 + }, + { + "epoch": 0.03957835408268826, + "grad_norm": 0.43100976943969727, + "learning_rate": 1.979129504102747e-05, + "loss": 0.2636, + "step": 2219 + }, + { + "epoch": 0.03959619020440196, + "grad_norm": 0.44783324003219604, + "learning_rate": 1.9800214056368178e-05, + "loss": 0.2593, + "step": 2220 + }, + { + "epoch": 0.03961402632611565, + "grad_norm": 0.5706475973129272, + "learning_rate": 1.9809133071708884e-05, + "loss": 0.3122, + "step": 2221 + }, + { + "epoch": 0.03963186244782935, + "grad_norm": 0.4727674722671509, + "learning_rate": 1.981805208704959e-05, + "loss": 0.264, + "step": 2222 + }, + { + "epoch": 0.03964969856954304, + "grad_norm": 0.41251450777053833, + "learning_rate": 1.9826971102390297e-05, + "loss": 0.3104, + "step": 2223 + }, + { + "epoch": 0.03966753469125673, + "grad_norm": 0.533075749874115, + "learning_rate": 1.9835890117731004e-05, + "loss": 0.3244, + "step": 2224 + }, + { + "epoch": 0.039685370812970425, + "grad_norm": 0.4140982925891876, + "learning_rate": 1.984480913307171e-05, + "loss": 0.3028, + "step": 2225 + }, + { + "epoch": 0.03970320693468412, + "grad_norm": 0.6157547831535339, + "learning_rate": 1.9853728148412416e-05, + "loss": 0.3311, + "step": 2226 + }, + { + "epoch": 0.039721043056397815, + "grad_norm": 0.5803517699241638, + "learning_rate": 1.9862647163753123e-05, + "loss": 0.3808, + "step": 2227 + }, + { + "epoch": 0.03973887917811151, + "grad_norm": 0.5734399557113647, + "learning_rate": 1.987156617909383e-05, + "loss": 0.3543, + "step": 2228 + }, + { + "epoch": 0.039756715299825204, + "grad_norm": 0.618794858455658, + "learning_rate": 1.9880485194434535e-05, + "loss": 0.3758, + "step": 2229 + }, + { + "epoch": 0.0397745514215389, + "grad_norm": 0.5295289158821106, + "learning_rate": 1.9889404209775242e-05, + "loss": 0.3212, + "step": 2230 + }, + { + "epoch": 0.039792387543252594, + "grad_norm": 0.6505454182624817, + "learning_rate": 1.9898323225115948e-05, + "loss": 0.3918, + "step": 2231 + }, + { + "epoch": 0.03981022366496629, + "grad_norm": 0.4140424132347107, + "learning_rate": 1.9907242240456655e-05, + "loss": 0.2818, + "step": 2232 + }, + { + "epoch": 0.039828059786679984, + "grad_norm": 0.7063069939613342, + "learning_rate": 1.991616125579736e-05, + "loss": 0.3091, + "step": 2233 + }, + { + "epoch": 0.03984589590839368, + "grad_norm": 0.6158521175384521, + "learning_rate": 1.9925080271138067e-05, + "loss": 0.4188, + "step": 2234 + }, + { + "epoch": 0.039863732030107374, + "grad_norm": 0.44827064871788025, + "learning_rate": 1.9933999286478774e-05, + "loss": 0.2364, + "step": 2235 + }, + { + "epoch": 0.03988156815182107, + "grad_norm": 0.44625604152679443, + "learning_rate": 1.994291830181948e-05, + "loss": 0.3028, + "step": 2236 + }, + { + "epoch": 0.039899404273534764, + "grad_norm": 0.5541078448295593, + "learning_rate": 1.9951837317160187e-05, + "loss": 0.3322, + "step": 2237 + }, + { + "epoch": 0.03991724039524846, + "grad_norm": 0.4406847655773163, + "learning_rate": 1.9960756332500893e-05, + "loss": 0.314, + "step": 2238 + }, + { + "epoch": 0.039935076516962154, + "grad_norm": 0.47166356444358826, + "learning_rate": 1.99696753478416e-05, + "loss": 0.3667, + "step": 2239 + }, + { + "epoch": 0.03995291263867585, + "grad_norm": 0.36501219868659973, + "learning_rate": 1.9978594363182306e-05, + "loss": 0.2824, + "step": 2240 + }, + { + "epoch": 0.03997074876038954, + "grad_norm": 0.6005622148513794, + "learning_rate": 1.9987513378523012e-05, + "loss": 0.3024, + "step": 2241 + }, + { + "epoch": 0.03998858488210324, + "grad_norm": 0.43297284841537476, + "learning_rate": 1.999643239386372e-05, + "loss": 0.2632, + "step": 2242 + }, + { + "epoch": 0.04000642100381693, + "grad_norm": 0.7355133891105652, + "learning_rate": 2.0005351409204425e-05, + "loss": 0.3686, + "step": 2243 + }, + { + "epoch": 0.04002425712553062, + "grad_norm": 0.48259422183036804, + "learning_rate": 2.001427042454513e-05, + "loss": 0.3532, + "step": 2244 + }, + { + "epoch": 0.040042093247244316, + "grad_norm": 0.39164793491363525, + "learning_rate": 2.0023189439885838e-05, + "loss": 0.3117, + "step": 2245 + }, + { + "epoch": 0.04005992936895801, + "grad_norm": 0.5173423290252686, + "learning_rate": 2.0032108455226544e-05, + "loss": 0.3257, + "step": 2246 + }, + { + "epoch": 0.040077765490671706, + "grad_norm": 0.6469289660453796, + "learning_rate": 2.004102747056725e-05, + "loss": 0.355, + "step": 2247 + }, + { + "epoch": 0.0400956016123854, + "grad_norm": 0.40094152092933655, + "learning_rate": 2.0049946485907957e-05, + "loss": 0.2898, + "step": 2248 + }, + { + "epoch": 0.040113437734099096, + "grad_norm": 0.6378834247589111, + "learning_rate": 2.0058865501248663e-05, + "loss": 0.3511, + "step": 2249 + }, + { + "epoch": 0.04013127385581279, + "grad_norm": 0.5820125937461853, + "learning_rate": 2.006778451658937e-05, + "loss": 0.3662, + "step": 2250 + }, + { + "epoch": 0.040149109977526486, + "grad_norm": 0.5466359853744507, + "learning_rate": 2.0076703531930076e-05, + "loss": 0.2816, + "step": 2251 + }, + { + "epoch": 0.04016694609924018, + "grad_norm": 0.4169778525829315, + "learning_rate": 2.0085622547270782e-05, + "loss": 0.2557, + "step": 2252 + }, + { + "epoch": 0.040184782220953875, + "grad_norm": 0.44945377111434937, + "learning_rate": 2.009454156261149e-05, + "loss": 0.3279, + "step": 2253 + }, + { + "epoch": 0.04020261834266757, + "grad_norm": 0.38507401943206787, + "learning_rate": 2.0103460577952195e-05, + "loss": 0.2579, + "step": 2254 + }, + { + "epoch": 0.040220454464381265, + "grad_norm": 0.6948372721672058, + "learning_rate": 2.01123795932929e-05, + "loss": 0.4155, + "step": 2255 + }, + { + "epoch": 0.04023829058609496, + "grad_norm": 0.4727243185043335, + "learning_rate": 2.0121298608633608e-05, + "loss": 0.4056, + "step": 2256 + }, + { + "epoch": 0.040256126707808655, + "grad_norm": 0.5559164881706238, + "learning_rate": 2.0130217623974314e-05, + "loss": 0.3192, + "step": 2257 + }, + { + "epoch": 0.04027396282952235, + "grad_norm": 0.4496062695980072, + "learning_rate": 2.013913663931502e-05, + "loss": 0.346, + "step": 2258 + }, + { + "epoch": 0.040291798951236045, + "grad_norm": 0.5188528895378113, + "learning_rate": 2.0148055654655727e-05, + "loss": 0.3029, + "step": 2259 + }, + { + "epoch": 0.04030963507294974, + "grad_norm": 0.3797195553779602, + "learning_rate": 2.0156974669996433e-05, + "loss": 0.2717, + "step": 2260 + }, + { + "epoch": 0.040327471194663435, + "grad_norm": 0.5428307056427002, + "learning_rate": 2.016589368533714e-05, + "loss": 0.3135, + "step": 2261 + }, + { + "epoch": 0.04034530731637713, + "grad_norm": 0.8119169473648071, + "learning_rate": 2.0174812700677846e-05, + "loss": 0.339, + "step": 2262 + }, + { + "epoch": 0.040363143438090825, + "grad_norm": 0.6079069375991821, + "learning_rate": 2.0183731716018552e-05, + "loss": 0.3559, + "step": 2263 + }, + { + "epoch": 0.04038097955980451, + "grad_norm": 0.603508710861206, + "learning_rate": 2.019265073135926e-05, + "loss": 0.396, + "step": 2264 + }, + { + "epoch": 0.04039881568151821, + "grad_norm": 0.5170609951019287, + "learning_rate": 2.0201569746699965e-05, + "loss": 0.3224, + "step": 2265 + }, + { + "epoch": 0.0404166518032319, + "grad_norm": 0.4569767117500305, + "learning_rate": 2.021048876204067e-05, + "loss": 0.2836, + "step": 2266 + }, + { + "epoch": 0.0404344879249456, + "grad_norm": 0.5537528395652771, + "learning_rate": 2.0219407777381378e-05, + "loss": 0.367, + "step": 2267 + }, + { + "epoch": 0.04045232404665929, + "grad_norm": 0.4843725264072418, + "learning_rate": 2.0228326792722084e-05, + "loss": 0.3144, + "step": 2268 + }, + { + "epoch": 0.04047016016837299, + "grad_norm": 0.5590280294418335, + "learning_rate": 2.023724580806279e-05, + "loss": 0.3184, + "step": 2269 + }, + { + "epoch": 0.04048799629008668, + "grad_norm": 0.44634491205215454, + "learning_rate": 2.0246164823403497e-05, + "loss": 0.301, + "step": 2270 + }, + { + "epoch": 0.04050583241180038, + "grad_norm": 0.6276856660842896, + "learning_rate": 2.0255083838744203e-05, + "loss": 0.3689, + "step": 2271 + }, + { + "epoch": 0.04052366853351407, + "grad_norm": 0.38367608189582825, + "learning_rate": 2.026400285408491e-05, + "loss": 0.3083, + "step": 2272 + }, + { + "epoch": 0.04054150465522777, + "grad_norm": 0.518040657043457, + "learning_rate": 2.0272921869425616e-05, + "loss": 0.2921, + "step": 2273 + }, + { + "epoch": 0.04055934077694146, + "grad_norm": 0.5452049970626831, + "learning_rate": 2.0281840884766323e-05, + "loss": 0.314, + "step": 2274 + }, + { + "epoch": 0.04057717689865516, + "grad_norm": 0.5913676619529724, + "learning_rate": 2.029075990010703e-05, + "loss": 0.4119, + "step": 2275 + }, + { + "epoch": 0.04059501302036885, + "grad_norm": 0.47705668210983276, + "learning_rate": 2.0299678915447735e-05, + "loss": 0.2675, + "step": 2276 + }, + { + "epoch": 0.04061284914208255, + "grad_norm": 0.5474919080734253, + "learning_rate": 2.0308597930788442e-05, + "loss": 0.3613, + "step": 2277 + }, + { + "epoch": 0.04063068526379624, + "grad_norm": 0.43692636489868164, + "learning_rate": 2.0317516946129148e-05, + "loss": 0.2709, + "step": 2278 + }, + { + "epoch": 0.040648521385509936, + "grad_norm": 0.6070582270622253, + "learning_rate": 2.0326435961469854e-05, + "loss": 0.2958, + "step": 2279 + }, + { + "epoch": 0.04066635750722363, + "grad_norm": 0.7254700064659119, + "learning_rate": 2.033535497681056e-05, + "loss": 0.4509, + "step": 2280 + }, + { + "epoch": 0.040684193628937326, + "grad_norm": 1.0255929231643677, + "learning_rate": 2.0344273992151267e-05, + "loss": 0.3368, + "step": 2281 + }, + { + "epoch": 0.04070202975065102, + "grad_norm": 0.5275586247444153, + "learning_rate": 2.0353193007491974e-05, + "loss": 0.3552, + "step": 2282 + }, + { + "epoch": 0.040719865872364716, + "grad_norm": 0.361937940120697, + "learning_rate": 2.036211202283268e-05, + "loss": 0.2558, + "step": 2283 + }, + { + "epoch": 0.04073770199407841, + "grad_norm": 0.4156278371810913, + "learning_rate": 2.0371031038173386e-05, + "loss": 0.264, + "step": 2284 + }, + { + "epoch": 0.0407555381157921, + "grad_norm": 0.48568591475486755, + "learning_rate": 2.0379950053514093e-05, + "loss": 0.296, + "step": 2285 + }, + { + "epoch": 0.040773374237505794, + "grad_norm": 0.5209950804710388, + "learning_rate": 2.03888690688548e-05, + "loss": 0.2349, + "step": 2286 + }, + { + "epoch": 0.04079121035921949, + "grad_norm": 0.5858215689659119, + "learning_rate": 2.0397788084195506e-05, + "loss": 0.3768, + "step": 2287 + }, + { + "epoch": 0.040809046480933184, + "grad_norm": 0.4604300856590271, + "learning_rate": 2.0406707099536212e-05, + "loss": 0.2949, + "step": 2288 + }, + { + "epoch": 0.04082688260264688, + "grad_norm": 0.7059128880500793, + "learning_rate": 2.0415626114876918e-05, + "loss": 0.2974, + "step": 2289 + }, + { + "epoch": 0.040844718724360574, + "grad_norm": 0.47811001539230347, + "learning_rate": 2.0424545130217625e-05, + "loss": 0.3108, + "step": 2290 + }, + { + "epoch": 0.04086255484607427, + "grad_norm": 0.4712660610675812, + "learning_rate": 2.043346414555833e-05, + "loss": 0.3033, + "step": 2291 + }, + { + "epoch": 0.04088039096778796, + "grad_norm": 0.586381196975708, + "learning_rate": 2.0442383160899037e-05, + "loss": 0.3293, + "step": 2292 + }, + { + "epoch": 0.04089822708950166, + "grad_norm": 0.6724193096160889, + "learning_rate": 2.0451302176239744e-05, + "loss": 0.3365, + "step": 2293 + }, + { + "epoch": 0.04091606321121535, + "grad_norm": 0.5653788447380066, + "learning_rate": 2.046022119158045e-05, + "loss": 0.3819, + "step": 2294 + }, + { + "epoch": 0.04093389933292905, + "grad_norm": 0.4913775622844696, + "learning_rate": 2.0469140206921157e-05, + "loss": 0.2767, + "step": 2295 + }, + { + "epoch": 0.04095173545464274, + "grad_norm": 0.6029754877090454, + "learning_rate": 2.0478059222261863e-05, + "loss": 0.3332, + "step": 2296 + }, + { + "epoch": 0.04096957157635644, + "grad_norm": 0.5877436399459839, + "learning_rate": 2.048697823760257e-05, + "loss": 0.3302, + "step": 2297 + }, + { + "epoch": 0.04098740769807013, + "grad_norm": 0.3311327397823334, + "learning_rate": 2.0495897252943276e-05, + "loss": 0.2755, + "step": 2298 + }, + { + "epoch": 0.04100524381978383, + "grad_norm": 0.5030799508094788, + "learning_rate": 2.0504816268283982e-05, + "loss": 0.3081, + "step": 2299 + }, + { + "epoch": 0.04102307994149752, + "grad_norm": 0.4549279808998108, + "learning_rate": 2.051373528362469e-05, + "loss": 0.3393, + "step": 2300 + }, + { + "epoch": 0.04104091606321122, + "grad_norm": 0.42377546429634094, + "learning_rate": 2.0522654298965395e-05, + "loss": 0.2816, + "step": 2301 + }, + { + "epoch": 0.04105875218492491, + "grad_norm": 0.48868995904922485, + "learning_rate": 2.05315733143061e-05, + "loss": 0.3191, + "step": 2302 + }, + { + "epoch": 0.04107658830663861, + "grad_norm": 0.5731281638145447, + "learning_rate": 2.0540492329646808e-05, + "loss": 0.326, + "step": 2303 + }, + { + "epoch": 0.0410944244283523, + "grad_norm": 0.48770371079444885, + "learning_rate": 2.0549411344987514e-05, + "loss": 0.3263, + "step": 2304 + }, + { + "epoch": 0.04111226055006599, + "grad_norm": 0.5813767910003662, + "learning_rate": 2.055833036032822e-05, + "loss": 0.2899, + "step": 2305 + }, + { + "epoch": 0.041130096671779685, + "grad_norm": 0.5096173882484436, + "learning_rate": 2.0567249375668927e-05, + "loss": 0.3702, + "step": 2306 + }, + { + "epoch": 0.04114793279349338, + "grad_norm": 0.470236212015152, + "learning_rate": 2.0576168391009633e-05, + "loss": 0.2751, + "step": 2307 + }, + { + "epoch": 0.041165768915207075, + "grad_norm": 0.7232435345649719, + "learning_rate": 2.058508740635034e-05, + "loss": 0.3457, + "step": 2308 + }, + { + "epoch": 0.04118360503692077, + "grad_norm": 0.6438577771186829, + "learning_rate": 2.0594006421691046e-05, + "loss": 0.3594, + "step": 2309 + }, + { + "epoch": 0.041201441158634465, + "grad_norm": 0.5336143970489502, + "learning_rate": 2.0602925437031752e-05, + "loss": 0.3039, + "step": 2310 + }, + { + "epoch": 0.04121927728034816, + "grad_norm": 0.3439522087574005, + "learning_rate": 2.061184445237246e-05, + "loss": 0.2676, + "step": 2311 + }, + { + "epoch": 0.041237113402061855, + "grad_norm": 0.6166762113571167, + "learning_rate": 2.0620763467713165e-05, + "loss": 0.4102, + "step": 2312 + }, + { + "epoch": 0.04125494952377555, + "grad_norm": 0.4449693560600281, + "learning_rate": 2.062968248305387e-05, + "loss": 0.3031, + "step": 2313 + }, + { + "epoch": 0.041272785645489245, + "grad_norm": 0.6253373622894287, + "learning_rate": 2.0638601498394578e-05, + "loss": 0.3601, + "step": 2314 + }, + { + "epoch": 0.04129062176720294, + "grad_norm": 0.38799795508384705, + "learning_rate": 2.0647520513735284e-05, + "loss": 0.2674, + "step": 2315 + }, + { + "epoch": 0.041308457888916635, + "grad_norm": 0.4372713565826416, + "learning_rate": 2.065643952907599e-05, + "loss": 0.288, + "step": 2316 + }, + { + "epoch": 0.04132629401063033, + "grad_norm": 0.42214682698249817, + "learning_rate": 2.0665358544416697e-05, + "loss": 0.3154, + "step": 2317 + }, + { + "epoch": 0.041344130132344024, + "grad_norm": 0.5073306560516357, + "learning_rate": 2.0674277559757403e-05, + "loss": 0.3148, + "step": 2318 + }, + { + "epoch": 0.04136196625405772, + "grad_norm": 0.41248980164527893, + "learning_rate": 2.068319657509811e-05, + "loss": 0.2518, + "step": 2319 + }, + { + "epoch": 0.041379802375771414, + "grad_norm": 0.4477764964103699, + "learning_rate": 2.0692115590438816e-05, + "loss": 0.3297, + "step": 2320 + }, + { + "epoch": 0.04139763849748511, + "grad_norm": 0.5122391581535339, + "learning_rate": 2.0701034605779522e-05, + "loss": 0.3157, + "step": 2321 + }, + { + "epoch": 0.041415474619198804, + "grad_norm": 0.6319571137428284, + "learning_rate": 2.070995362112023e-05, + "loss": 0.2594, + "step": 2322 + }, + { + "epoch": 0.0414333107409125, + "grad_norm": 0.5080597400665283, + "learning_rate": 2.071887263646094e-05, + "loss": 0.2905, + "step": 2323 + }, + { + "epoch": 0.041451146862626194, + "grad_norm": 0.5869331359863281, + "learning_rate": 2.072779165180164e-05, + "loss": 0.3042, + "step": 2324 + }, + { + "epoch": 0.04146898298433988, + "grad_norm": 0.535035252571106, + "learning_rate": 2.0736710667142348e-05, + "loss": 0.246, + "step": 2325 + }, + { + "epoch": 0.04148681910605358, + "grad_norm": 0.52500981092453, + "learning_rate": 2.0745629682483054e-05, + "loss": 0.2982, + "step": 2326 + }, + { + "epoch": 0.04150465522776727, + "grad_norm": 0.5187718868255615, + "learning_rate": 2.075454869782376e-05, + "loss": 0.3345, + "step": 2327 + }, + { + "epoch": 0.04152249134948097, + "grad_norm": 0.5416833758354187, + "learning_rate": 2.0763467713164467e-05, + "loss": 0.3724, + "step": 2328 + }, + { + "epoch": 0.04154032747119466, + "grad_norm": 0.5092459917068481, + "learning_rate": 2.0772386728505174e-05, + "loss": 0.2979, + "step": 2329 + }, + { + "epoch": 0.041558163592908356, + "grad_norm": 0.6160767078399658, + "learning_rate": 2.078130574384588e-05, + "loss": 0.396, + "step": 2330 + }, + { + "epoch": 0.04157599971462205, + "grad_norm": 0.9167537093162537, + "learning_rate": 2.0790224759186586e-05, + "loss": 0.3068, + "step": 2331 + }, + { + "epoch": 0.041593835836335746, + "grad_norm": 0.6085203289985657, + "learning_rate": 2.0799143774527293e-05, + "loss": 0.3381, + "step": 2332 + }, + { + "epoch": 0.04161167195804944, + "grad_norm": 0.42790383100509644, + "learning_rate": 2.0808062789868e-05, + "loss": 0.2761, + "step": 2333 + }, + { + "epoch": 0.041629508079763136, + "grad_norm": 0.40416112542152405, + "learning_rate": 2.081698180520871e-05, + "loss": 0.2872, + "step": 2334 + }, + { + "epoch": 0.04164734420147683, + "grad_norm": 0.5766844749450684, + "learning_rate": 2.0825900820549412e-05, + "loss": 0.2891, + "step": 2335 + }, + { + "epoch": 0.041665180323190526, + "grad_norm": 0.535537600517273, + "learning_rate": 2.0834819835890118e-05, + "loss": 0.3242, + "step": 2336 + }, + { + "epoch": 0.04168301644490422, + "grad_norm": 0.49317222833633423, + "learning_rate": 2.0843738851230828e-05, + "loss": 0.2905, + "step": 2337 + }, + { + "epoch": 0.041700852566617916, + "grad_norm": 0.6061891317367554, + "learning_rate": 2.085265786657153e-05, + "loss": 0.3397, + "step": 2338 + }, + { + "epoch": 0.04171868868833161, + "grad_norm": 0.47608044743537903, + "learning_rate": 2.0861576881912237e-05, + "loss": 0.3388, + "step": 2339 + }, + { + "epoch": 0.041736524810045306, + "grad_norm": 0.6357356905937195, + "learning_rate": 2.0870495897252944e-05, + "loss": 0.3539, + "step": 2340 + }, + { + "epoch": 0.041754360931759, + "grad_norm": 0.5893821120262146, + "learning_rate": 2.087941491259365e-05, + "loss": 0.358, + "step": 2341 + }, + { + "epoch": 0.041772197053472696, + "grad_norm": 0.5480794310569763, + "learning_rate": 2.0888333927934356e-05, + "loss": 0.3279, + "step": 2342 + }, + { + "epoch": 0.04179003317518639, + "grad_norm": 0.6927926540374756, + "learning_rate": 2.0897252943275063e-05, + "loss": 0.3196, + "step": 2343 + }, + { + "epoch": 0.041807869296900085, + "grad_norm": 0.5544004440307617, + "learning_rate": 2.090617195861577e-05, + "loss": 0.3307, + "step": 2344 + }, + { + "epoch": 0.04182570541861377, + "grad_norm": 0.5229715704917908, + "learning_rate": 2.0915090973956476e-05, + "loss": 0.2475, + "step": 2345 + }, + { + "epoch": 0.04184354154032747, + "grad_norm": 0.5360671281814575, + "learning_rate": 2.0924009989297182e-05, + "loss": 0.2907, + "step": 2346 + }, + { + "epoch": 0.04186137766204116, + "grad_norm": 0.469929575920105, + "learning_rate": 2.093292900463789e-05, + "loss": 0.307, + "step": 2347 + }, + { + "epoch": 0.04187921378375486, + "grad_norm": 0.5420454740524292, + "learning_rate": 2.0941848019978598e-05, + "loss": 0.3359, + "step": 2348 + }, + { + "epoch": 0.04189704990546855, + "grad_norm": 0.5146106481552124, + "learning_rate": 2.09507670353193e-05, + "loss": 0.3435, + "step": 2349 + }, + { + "epoch": 0.04191488602718225, + "grad_norm": 0.7414354085922241, + "learning_rate": 2.0959686050660007e-05, + "loss": 0.3271, + "step": 2350 + }, + { + "epoch": 0.04193272214889594, + "grad_norm": 0.5352063775062561, + "learning_rate": 2.0968605066000714e-05, + "loss": 0.2821, + "step": 2351 + }, + { + "epoch": 0.04195055827060964, + "grad_norm": 0.4640241265296936, + "learning_rate": 2.097752408134142e-05, + "loss": 0.232, + "step": 2352 + }, + { + "epoch": 0.04196839439232333, + "grad_norm": 0.5043139457702637, + "learning_rate": 2.0986443096682127e-05, + "loss": 0.3616, + "step": 2353 + }, + { + "epoch": 0.04198623051403703, + "grad_norm": 0.6212011575698853, + "learning_rate": 2.0995362112022833e-05, + "loss": 0.3228, + "step": 2354 + }, + { + "epoch": 0.04200406663575072, + "grad_norm": 0.5618504881858826, + "learning_rate": 2.100428112736354e-05, + "loss": 0.3604, + "step": 2355 + }, + { + "epoch": 0.04202190275746442, + "grad_norm": 0.42103826999664307, + "learning_rate": 2.1013200142704246e-05, + "loss": 0.3094, + "step": 2356 + }, + { + "epoch": 0.04203973887917811, + "grad_norm": 0.5442635416984558, + "learning_rate": 2.1022119158044952e-05, + "loss": 0.3092, + "step": 2357 + }, + { + "epoch": 0.04205757500089181, + "grad_norm": 0.5144985318183899, + "learning_rate": 2.103103817338566e-05, + "loss": 0.3384, + "step": 2358 + }, + { + "epoch": 0.0420754111226055, + "grad_norm": 0.8168731331825256, + "learning_rate": 2.1039957188726368e-05, + "loss": 0.3549, + "step": 2359 + }, + { + "epoch": 0.0420932472443192, + "grad_norm": 0.8109040856361389, + "learning_rate": 2.104887620406707e-05, + "loss": 0.3192, + "step": 2360 + }, + { + "epoch": 0.04211108336603289, + "grad_norm": 0.5028811693191528, + "learning_rate": 2.1057795219407778e-05, + "loss": 0.3227, + "step": 2361 + }, + { + "epoch": 0.04212891948774659, + "grad_norm": 0.5158123970031738, + "learning_rate": 2.1066714234748487e-05, + "loss": 0.3088, + "step": 2362 + }, + { + "epoch": 0.04214675560946028, + "grad_norm": 0.4773781895637512, + "learning_rate": 2.107563325008919e-05, + "loss": 0.3319, + "step": 2363 + }, + { + "epoch": 0.04216459173117398, + "grad_norm": 0.45322614908218384, + "learning_rate": 2.1084552265429897e-05, + "loss": 0.323, + "step": 2364 + }, + { + "epoch": 0.042182427852887665, + "grad_norm": 0.4446718990802765, + "learning_rate": 2.1093471280770603e-05, + "loss": 0.2892, + "step": 2365 + }, + { + "epoch": 0.04220026397460136, + "grad_norm": 0.8757718801498413, + "learning_rate": 2.110239029611131e-05, + "loss": 0.3844, + "step": 2366 + }, + { + "epoch": 0.042218100096315055, + "grad_norm": 0.5051206946372986, + "learning_rate": 2.1111309311452016e-05, + "loss": 0.2808, + "step": 2367 + }, + { + "epoch": 0.04223593621802875, + "grad_norm": 0.6871507167816162, + "learning_rate": 2.1120228326792722e-05, + "loss": 0.3357, + "step": 2368 + }, + { + "epoch": 0.042253772339742444, + "grad_norm": 0.5976685285568237, + "learning_rate": 2.112914734213343e-05, + "loss": 0.3442, + "step": 2369 + }, + { + "epoch": 0.04227160846145614, + "grad_norm": 0.4438973367214203, + "learning_rate": 2.113806635747414e-05, + "loss": 0.3416, + "step": 2370 + }, + { + "epoch": 0.042289444583169834, + "grad_norm": 0.40539249777793884, + "learning_rate": 2.114698537281484e-05, + "loss": 0.3113, + "step": 2371 + }, + { + "epoch": 0.04230728070488353, + "grad_norm": 0.5118851661682129, + "learning_rate": 2.1155904388155548e-05, + "loss": 0.2747, + "step": 2372 + }, + { + "epoch": 0.042325116826597224, + "grad_norm": 0.4912601411342621, + "learning_rate": 2.1164823403496258e-05, + "loss": 0.3323, + "step": 2373 + }, + { + "epoch": 0.04234295294831092, + "grad_norm": 0.5088908672332764, + "learning_rate": 2.117374241883696e-05, + "loss": 0.3285, + "step": 2374 + }, + { + "epoch": 0.042360789070024614, + "grad_norm": 0.49872055649757385, + "learning_rate": 2.1182661434177667e-05, + "loss": 0.2725, + "step": 2375 + }, + { + "epoch": 0.04237862519173831, + "grad_norm": 0.5044299364089966, + "learning_rate": 2.1191580449518373e-05, + "loss": 0.3251, + "step": 2376 + }, + { + "epoch": 0.042396461313452004, + "grad_norm": 0.3838597238063812, + "learning_rate": 2.120049946485908e-05, + "loss": 0.2625, + "step": 2377 + }, + { + "epoch": 0.0424142974351657, + "grad_norm": 0.522831380367279, + "learning_rate": 2.1209418480199786e-05, + "loss": 0.2986, + "step": 2378 + }, + { + "epoch": 0.042432133556879394, + "grad_norm": 0.6911067366600037, + "learning_rate": 2.1218337495540493e-05, + "loss": 0.3627, + "step": 2379 + }, + { + "epoch": 0.04244996967859309, + "grad_norm": 0.37572261691093445, + "learning_rate": 2.12272565108812e-05, + "loss": 0.2998, + "step": 2380 + }, + { + "epoch": 0.042467805800306783, + "grad_norm": 0.4918407201766968, + "learning_rate": 2.1236175526221905e-05, + "loss": 0.3174, + "step": 2381 + }, + { + "epoch": 0.04248564192202048, + "grad_norm": 0.43629008531570435, + "learning_rate": 2.124509454156261e-05, + "loss": 0.3344, + "step": 2382 + }, + { + "epoch": 0.04250347804373417, + "grad_norm": 0.4399808645248413, + "learning_rate": 2.1254013556903318e-05, + "loss": 0.339, + "step": 2383 + }, + { + "epoch": 0.04252131416544787, + "grad_norm": 0.5683116912841797, + "learning_rate": 2.1262932572244028e-05, + "loss": 0.3148, + "step": 2384 + }, + { + "epoch": 0.042539150287161556, + "grad_norm": 0.4692027270793915, + "learning_rate": 2.127185158758473e-05, + "loss": 0.3272, + "step": 2385 + }, + { + "epoch": 0.04255698640887525, + "grad_norm": 0.5380603671073914, + "learning_rate": 2.1280770602925437e-05, + "loss": 0.3644, + "step": 2386 + }, + { + "epoch": 0.042574822530588946, + "grad_norm": 0.420159250497818, + "learning_rate": 2.1289689618266147e-05, + "loss": 0.3036, + "step": 2387 + }, + { + "epoch": 0.04259265865230264, + "grad_norm": 0.5040472745895386, + "learning_rate": 2.129860863360685e-05, + "loss": 0.3368, + "step": 2388 + }, + { + "epoch": 0.042610494774016336, + "grad_norm": 0.5153719782829285, + "learning_rate": 2.1307527648947556e-05, + "loss": 0.3385, + "step": 2389 + }, + { + "epoch": 0.04262833089573003, + "grad_norm": 0.5071914196014404, + "learning_rate": 2.1316446664288263e-05, + "loss": 0.3093, + "step": 2390 + }, + { + "epoch": 0.042646167017443726, + "grad_norm": 0.4509255588054657, + "learning_rate": 2.132536567962897e-05, + "loss": 0.3208, + "step": 2391 + }, + { + "epoch": 0.04266400313915742, + "grad_norm": 0.48281392455101013, + "learning_rate": 2.1334284694969675e-05, + "loss": 0.329, + "step": 2392 + }, + { + "epoch": 0.042681839260871116, + "grad_norm": 0.5003436207771301, + "learning_rate": 2.1343203710310382e-05, + "loss": 0.3021, + "step": 2393 + }, + { + "epoch": 0.04269967538258481, + "grad_norm": 0.4456157982349396, + "learning_rate": 2.1352122725651088e-05, + "loss": 0.3087, + "step": 2394 + }, + { + "epoch": 0.042717511504298505, + "grad_norm": 0.6597629189491272, + "learning_rate": 2.1361041740991798e-05, + "loss": 0.3174, + "step": 2395 + }, + { + "epoch": 0.0427353476260122, + "grad_norm": 0.4982938766479492, + "learning_rate": 2.13699607563325e-05, + "loss": 0.3214, + "step": 2396 + }, + { + "epoch": 0.042753183747725895, + "grad_norm": 0.31174424290657043, + "learning_rate": 2.1378879771673207e-05, + "loss": 0.3062, + "step": 2397 + }, + { + "epoch": 0.04277101986943959, + "grad_norm": 0.3858984112739563, + "learning_rate": 2.1387798787013917e-05, + "loss": 0.2666, + "step": 2398 + }, + { + "epoch": 0.042788855991153285, + "grad_norm": 0.446459025144577, + "learning_rate": 2.139671780235462e-05, + "loss": 0.2738, + "step": 2399 + }, + { + "epoch": 0.04280669211286698, + "grad_norm": 0.5048308968544006, + "learning_rate": 2.1405636817695327e-05, + "loss": 0.3442, + "step": 2400 + }, + { + "epoch": 0.042824528234580675, + "grad_norm": 0.4895900785923004, + "learning_rate": 2.1414555833036036e-05, + "loss": 0.2963, + "step": 2401 + }, + { + "epoch": 0.04284236435629437, + "grad_norm": 0.7040649652481079, + "learning_rate": 2.142347484837674e-05, + "loss": 0.3823, + "step": 2402 + }, + { + "epoch": 0.042860200478008065, + "grad_norm": 0.4924567937850952, + "learning_rate": 2.1432393863717446e-05, + "loss": 0.2977, + "step": 2403 + }, + { + "epoch": 0.04287803659972176, + "grad_norm": 0.46823903918266296, + "learning_rate": 2.1441312879058152e-05, + "loss": 0.321, + "step": 2404 + }, + { + "epoch": 0.04289587272143545, + "grad_norm": 0.4611259698867798, + "learning_rate": 2.145023189439886e-05, + "loss": 0.2637, + "step": 2405 + }, + { + "epoch": 0.04291370884314914, + "grad_norm": 0.6974970102310181, + "learning_rate": 2.1459150909739568e-05, + "loss": 0.2996, + "step": 2406 + }, + { + "epoch": 0.04293154496486284, + "grad_norm": 0.5615206360816956, + "learning_rate": 2.146806992508027e-05, + "loss": 0.318, + "step": 2407 + }, + { + "epoch": 0.04294938108657653, + "grad_norm": 0.41417187452316284, + "learning_rate": 2.1476988940420978e-05, + "loss": 0.3238, + "step": 2408 + }, + { + "epoch": 0.04296721720829023, + "grad_norm": 0.5840863585472107, + "learning_rate": 2.1485907955761687e-05, + "loss": 0.3218, + "step": 2409 + }, + { + "epoch": 0.04298505333000392, + "grad_norm": 0.48224252462387085, + "learning_rate": 2.149482697110239e-05, + "loss": 0.2747, + "step": 2410 + }, + { + "epoch": 0.04300288945171762, + "grad_norm": 0.4228859841823578, + "learning_rate": 2.1503745986443097e-05, + "loss": 0.3057, + "step": 2411 + }, + { + "epoch": 0.04302072557343131, + "grad_norm": 0.47475579380989075, + "learning_rate": 2.1512665001783806e-05, + "loss": 0.3379, + "step": 2412 + }, + { + "epoch": 0.04303856169514501, + "grad_norm": 0.8998664617538452, + "learning_rate": 2.152158401712451e-05, + "loss": 0.3426, + "step": 2413 + }, + { + "epoch": 0.0430563978168587, + "grad_norm": 0.44181686639785767, + "learning_rate": 2.1530503032465216e-05, + "loss": 0.2845, + "step": 2414 + }, + { + "epoch": 0.0430742339385724, + "grad_norm": 0.6558424234390259, + "learning_rate": 2.1539422047805922e-05, + "loss": 0.3703, + "step": 2415 + }, + { + "epoch": 0.04309207006028609, + "grad_norm": 0.4764523208141327, + "learning_rate": 2.154834106314663e-05, + "loss": 0.2719, + "step": 2416 + }, + { + "epoch": 0.04310990618199979, + "grad_norm": 0.5102486610412598, + "learning_rate": 2.155726007848734e-05, + "loss": 0.3327, + "step": 2417 + }, + { + "epoch": 0.04312774230371348, + "grad_norm": 0.45058155059814453, + "learning_rate": 2.156617909382804e-05, + "loss": 0.3298, + "step": 2418 + }, + { + "epoch": 0.043145578425427177, + "grad_norm": 0.43510904908180237, + "learning_rate": 2.1575098109168748e-05, + "loss": 0.3043, + "step": 2419 + }, + { + "epoch": 0.04316341454714087, + "grad_norm": 0.5134738087654114, + "learning_rate": 2.1584017124509458e-05, + "loss": 0.2935, + "step": 2420 + }, + { + "epoch": 0.043181250668854566, + "grad_norm": 0.46419647336006165, + "learning_rate": 2.159293613985016e-05, + "loss": 0.2851, + "step": 2421 + }, + { + "epoch": 0.04319908679056826, + "grad_norm": 0.3891203999519348, + "learning_rate": 2.1601855155190867e-05, + "loss": 0.3058, + "step": 2422 + }, + { + "epoch": 0.043216922912281956, + "grad_norm": 0.48797038197517395, + "learning_rate": 2.1610774170531577e-05, + "loss": 0.3117, + "step": 2423 + }, + { + "epoch": 0.04323475903399565, + "grad_norm": 0.5592419505119324, + "learning_rate": 2.161969318587228e-05, + "loss": 0.3143, + "step": 2424 + }, + { + "epoch": 0.04325259515570934, + "grad_norm": 0.38748639822006226, + "learning_rate": 2.1628612201212986e-05, + "loss": 0.2404, + "step": 2425 + }, + { + "epoch": 0.043270431277423034, + "grad_norm": 0.4412252604961395, + "learning_rate": 2.1637531216553696e-05, + "loss": 0.267, + "step": 2426 + }, + { + "epoch": 0.04328826739913673, + "grad_norm": 0.5482329726219177, + "learning_rate": 2.16464502318944e-05, + "loss": 0.3145, + "step": 2427 + }, + { + "epoch": 0.043306103520850424, + "grad_norm": 0.6546977758407593, + "learning_rate": 2.1655369247235105e-05, + "loss": 0.3593, + "step": 2428 + }, + { + "epoch": 0.04332393964256412, + "grad_norm": 0.31310614943504333, + "learning_rate": 2.166428826257581e-05, + "loss": 0.2483, + "step": 2429 + }, + { + "epoch": 0.043341775764277814, + "grad_norm": 0.39456674456596375, + "learning_rate": 2.1673207277916518e-05, + "loss": 0.2918, + "step": 2430 + }, + { + "epoch": 0.04335961188599151, + "grad_norm": 0.6820403337478638, + "learning_rate": 2.1682126293257228e-05, + "loss": 0.2927, + "step": 2431 + }, + { + "epoch": 0.043377448007705204, + "grad_norm": 0.5346879363059998, + "learning_rate": 2.169104530859793e-05, + "loss": 0.3189, + "step": 2432 + }, + { + "epoch": 0.0433952841294189, + "grad_norm": 0.4923313558101654, + "learning_rate": 2.1699964323938637e-05, + "loss": 0.2876, + "step": 2433 + }, + { + "epoch": 0.04341312025113259, + "grad_norm": 0.7940296530723572, + "learning_rate": 2.1708883339279347e-05, + "loss": 0.3012, + "step": 2434 + }, + { + "epoch": 0.04343095637284629, + "grad_norm": 0.4262382686138153, + "learning_rate": 2.171780235462005e-05, + "loss": 0.3329, + "step": 2435 + }, + { + "epoch": 0.04344879249455998, + "grad_norm": 0.48211798071861267, + "learning_rate": 2.1726721369960756e-05, + "loss": 0.3255, + "step": 2436 + }, + { + "epoch": 0.04346662861627368, + "grad_norm": 0.42853403091430664, + "learning_rate": 2.1735640385301466e-05, + "loss": 0.2451, + "step": 2437 + }, + { + "epoch": 0.04348446473798737, + "grad_norm": 0.47822248935699463, + "learning_rate": 2.174455940064217e-05, + "loss": 0.3623, + "step": 2438 + }, + { + "epoch": 0.04350230085970107, + "grad_norm": 0.5484446883201599, + "learning_rate": 2.1753478415982875e-05, + "loss": 0.3191, + "step": 2439 + }, + { + "epoch": 0.04352013698141476, + "grad_norm": 0.5001198649406433, + "learning_rate": 2.1762397431323585e-05, + "loss": 0.327, + "step": 2440 + }, + { + "epoch": 0.04353797310312846, + "grad_norm": 0.4653749465942383, + "learning_rate": 2.1771316446664288e-05, + "loss": 0.3595, + "step": 2441 + }, + { + "epoch": 0.04355580922484215, + "grad_norm": 0.4383077621459961, + "learning_rate": 2.1780235462004998e-05, + "loss": 0.3661, + "step": 2442 + }, + { + "epoch": 0.04357364534655585, + "grad_norm": 0.4560093879699707, + "learning_rate": 2.17891544773457e-05, + "loss": 0.2966, + "step": 2443 + }, + { + "epoch": 0.04359148146826954, + "grad_norm": 0.570239245891571, + "learning_rate": 2.1798073492686407e-05, + "loss": 0.3332, + "step": 2444 + }, + { + "epoch": 0.04360931758998324, + "grad_norm": 0.47931545972824097, + "learning_rate": 2.1806992508027117e-05, + "loss": 0.308, + "step": 2445 + }, + { + "epoch": 0.043627153711696925, + "grad_norm": 0.5334298014640808, + "learning_rate": 2.181591152336782e-05, + "loss": 0.3508, + "step": 2446 + }, + { + "epoch": 0.04364498983341062, + "grad_norm": 0.4662988483905792, + "learning_rate": 2.1824830538708526e-05, + "loss": 0.2944, + "step": 2447 + }, + { + "epoch": 0.043662825955124315, + "grad_norm": 0.5688419342041016, + "learning_rate": 2.1833749554049236e-05, + "loss": 0.3501, + "step": 2448 + }, + { + "epoch": 0.04368066207683801, + "grad_norm": 0.44827187061309814, + "learning_rate": 2.184266856938994e-05, + "loss": 0.2859, + "step": 2449 + }, + { + "epoch": 0.043698498198551705, + "grad_norm": 0.5130137801170349, + "learning_rate": 2.1851587584730646e-05, + "loss": 0.3706, + "step": 2450 + }, + { + "epoch": 0.0437163343202654, + "grad_norm": 0.4982846975326538, + "learning_rate": 2.1860506600071355e-05, + "loss": 0.2911, + "step": 2451 + }, + { + "epoch": 0.043734170441979095, + "grad_norm": 0.5739523768424988, + "learning_rate": 2.1869425615412058e-05, + "loss": 0.2886, + "step": 2452 + }, + { + "epoch": 0.04375200656369279, + "grad_norm": 0.5005897283554077, + "learning_rate": 2.1878344630752768e-05, + "loss": 0.3455, + "step": 2453 + }, + { + "epoch": 0.043769842685406485, + "grad_norm": 0.5920192003250122, + "learning_rate": 2.188726364609347e-05, + "loss": 0.3557, + "step": 2454 + }, + { + "epoch": 0.04378767880712018, + "grad_norm": 0.34021905064582825, + "learning_rate": 2.1896182661434177e-05, + "loss": 0.2805, + "step": 2455 + }, + { + "epoch": 0.043805514928833875, + "grad_norm": 0.4086892604827881, + "learning_rate": 2.1905101676774887e-05, + "loss": 0.2633, + "step": 2456 + }, + { + "epoch": 0.04382335105054757, + "grad_norm": 0.5249380469322205, + "learning_rate": 2.191402069211559e-05, + "loss": 0.2992, + "step": 2457 + }, + { + "epoch": 0.043841187172261264, + "grad_norm": 0.3988586664199829, + "learning_rate": 2.1922939707456297e-05, + "loss": 0.2593, + "step": 2458 + }, + { + "epoch": 0.04385902329397496, + "grad_norm": 0.6643022298812866, + "learning_rate": 2.1931858722797006e-05, + "loss": 0.3897, + "step": 2459 + }, + { + "epoch": 0.043876859415688654, + "grad_norm": 0.4478844702243805, + "learning_rate": 2.194077773813771e-05, + "loss": 0.3488, + "step": 2460 + }, + { + "epoch": 0.04389469553740235, + "grad_norm": 0.42367005348205566, + "learning_rate": 2.1949696753478416e-05, + "loss": 0.33, + "step": 2461 + }, + { + "epoch": 0.043912531659116044, + "grad_norm": 0.48345765471458435, + "learning_rate": 2.1958615768819125e-05, + "loss": 0.3366, + "step": 2462 + }, + { + "epoch": 0.04393036778082974, + "grad_norm": 0.4464993178844452, + "learning_rate": 2.196753478415983e-05, + "loss": 0.3225, + "step": 2463 + }, + { + "epoch": 0.043948203902543434, + "grad_norm": 0.43755653500556946, + "learning_rate": 2.1976453799500535e-05, + "loss": 0.2638, + "step": 2464 + }, + { + "epoch": 0.04396604002425713, + "grad_norm": 0.5456868410110474, + "learning_rate": 2.1985372814841245e-05, + "loss": 0.3164, + "step": 2465 + }, + { + "epoch": 0.04398387614597082, + "grad_norm": 0.5072988271713257, + "learning_rate": 2.1994291830181948e-05, + "loss": 0.3305, + "step": 2466 + }, + { + "epoch": 0.04400171226768451, + "grad_norm": 0.6825007796287537, + "learning_rate": 2.2003210845522657e-05, + "loss": 0.3381, + "step": 2467 + }, + { + "epoch": 0.04401954838939821, + "grad_norm": 0.38148221373558044, + "learning_rate": 2.201212986086336e-05, + "loss": 0.2493, + "step": 2468 + }, + { + "epoch": 0.0440373845111119, + "grad_norm": 0.48421597480773926, + "learning_rate": 2.2021048876204067e-05, + "loss": 0.288, + "step": 2469 + }, + { + "epoch": 0.0440552206328256, + "grad_norm": 0.3696889579296112, + "learning_rate": 2.2029967891544777e-05, + "loss": 0.2892, + "step": 2470 + }, + { + "epoch": 0.04407305675453929, + "grad_norm": 0.45962435007095337, + "learning_rate": 2.203888690688548e-05, + "loss": 0.3065, + "step": 2471 + }, + { + "epoch": 0.044090892876252986, + "grad_norm": 0.4431440234184265, + "learning_rate": 2.2047805922226186e-05, + "loss": 0.262, + "step": 2472 + }, + { + "epoch": 0.04410872899796668, + "grad_norm": 0.5578441023826599, + "learning_rate": 2.2056724937566896e-05, + "loss": 0.327, + "step": 2473 + }, + { + "epoch": 0.044126565119680376, + "grad_norm": 0.7166373133659363, + "learning_rate": 2.20656439529076e-05, + "loss": 0.3849, + "step": 2474 + }, + { + "epoch": 0.04414440124139407, + "grad_norm": 0.3914194107055664, + "learning_rate": 2.2074562968248305e-05, + "loss": 0.2384, + "step": 2475 + }, + { + "epoch": 0.044162237363107766, + "grad_norm": 0.696711540222168, + "learning_rate": 2.2083481983589015e-05, + "loss": 0.3363, + "step": 2476 + }, + { + "epoch": 0.04418007348482146, + "grad_norm": 0.5842909216880798, + "learning_rate": 2.2092400998929718e-05, + "loss": 0.3365, + "step": 2477 + }, + { + "epoch": 0.044197909606535156, + "grad_norm": 0.41654083132743835, + "learning_rate": 2.2101320014270428e-05, + "loss": 0.2739, + "step": 2478 + }, + { + "epoch": 0.04421574572824885, + "grad_norm": 0.5053631067276001, + "learning_rate": 2.211023902961113e-05, + "loss": 0.2666, + "step": 2479 + }, + { + "epoch": 0.044233581849962546, + "grad_norm": 0.4576139748096466, + "learning_rate": 2.2119158044951837e-05, + "loss": 0.266, + "step": 2480 + }, + { + "epoch": 0.04425141797167624, + "grad_norm": 0.5171185731887817, + "learning_rate": 2.2128077060292547e-05, + "loss": 0.3709, + "step": 2481 + }, + { + "epoch": 0.044269254093389936, + "grad_norm": 0.5942704677581787, + "learning_rate": 2.213699607563325e-05, + "loss": 0.2764, + "step": 2482 + }, + { + "epoch": 0.04428709021510363, + "grad_norm": 0.5231592655181885, + "learning_rate": 2.2145915090973956e-05, + "loss": 0.3039, + "step": 2483 + }, + { + "epoch": 0.044304926336817325, + "grad_norm": 0.7226346731185913, + "learning_rate": 2.2154834106314666e-05, + "loss": 0.2838, + "step": 2484 + }, + { + "epoch": 0.04432276245853102, + "grad_norm": 0.44686105847358704, + "learning_rate": 2.216375312165537e-05, + "loss": 0.3098, + "step": 2485 + }, + { + "epoch": 0.04434059858024471, + "grad_norm": 0.4780190587043762, + "learning_rate": 2.2172672136996075e-05, + "loss": 0.285, + "step": 2486 + }, + { + "epoch": 0.0443584347019584, + "grad_norm": 0.5844359397888184, + "learning_rate": 2.2181591152336785e-05, + "loss": 0.2865, + "step": 2487 + }, + { + "epoch": 0.0443762708236721, + "grad_norm": 0.36143380403518677, + "learning_rate": 2.2190510167677488e-05, + "loss": 0.2614, + "step": 2488 + }, + { + "epoch": 0.04439410694538579, + "grad_norm": 0.5340003967285156, + "learning_rate": 2.2199429183018198e-05, + "loss": 0.3734, + "step": 2489 + }, + { + "epoch": 0.04441194306709949, + "grad_norm": 0.3827823996543884, + "learning_rate": 2.2208348198358904e-05, + "loss": 0.2946, + "step": 2490 + }, + { + "epoch": 0.04442977918881318, + "grad_norm": 0.5796611905097961, + "learning_rate": 2.2217267213699607e-05, + "loss": 0.3181, + "step": 2491 + }, + { + "epoch": 0.04444761531052688, + "grad_norm": 0.7845086455345154, + "learning_rate": 2.2226186229040317e-05, + "loss": 0.4209, + "step": 2492 + }, + { + "epoch": 0.04446545143224057, + "grad_norm": 0.8272339701652527, + "learning_rate": 2.223510524438102e-05, + "loss": 0.2958, + "step": 2493 + }, + { + "epoch": 0.04448328755395427, + "grad_norm": 0.5014579892158508, + "learning_rate": 2.2244024259721726e-05, + "loss": 0.2514, + "step": 2494 + }, + { + "epoch": 0.04450112367566796, + "grad_norm": 0.5427517294883728, + "learning_rate": 2.2252943275062436e-05, + "loss": 0.3478, + "step": 2495 + }, + { + "epoch": 0.04451895979738166, + "grad_norm": 0.592760443687439, + "learning_rate": 2.226186229040314e-05, + "loss": 0.3795, + "step": 2496 + }, + { + "epoch": 0.04453679591909535, + "grad_norm": 0.40188559889793396, + "learning_rate": 2.2270781305743845e-05, + "loss": 0.3014, + "step": 2497 + }, + { + "epoch": 0.04455463204080905, + "grad_norm": 0.5569307804107666, + "learning_rate": 2.2279700321084555e-05, + "loss": 0.3045, + "step": 2498 + }, + { + "epoch": 0.04457246816252274, + "grad_norm": 0.43672263622283936, + "learning_rate": 2.2288619336425258e-05, + "loss": 0.2574, + "step": 2499 + }, + { + "epoch": 0.04459030428423644, + "grad_norm": 0.42467162013053894, + "learning_rate": 2.2297538351765965e-05, + "loss": 0.3068, + "step": 2500 + }, + { + "epoch": 0.04460814040595013, + "grad_norm": 0.4768904745578766, + "learning_rate": 2.2306457367106674e-05, + "loss": 0.2945, + "step": 2501 + }, + { + "epoch": 0.04462597652766383, + "grad_norm": 0.44962868094444275, + "learning_rate": 2.2315376382447377e-05, + "loss": 0.3175, + "step": 2502 + }, + { + "epoch": 0.04464381264937752, + "grad_norm": 0.4747069478034973, + "learning_rate": 2.2324295397788087e-05, + "loss": 0.3069, + "step": 2503 + }, + { + "epoch": 0.04466164877109122, + "grad_norm": 0.5932212471961975, + "learning_rate": 2.2333214413128793e-05, + "loss": 0.3127, + "step": 2504 + }, + { + "epoch": 0.04467948489280491, + "grad_norm": 0.5715842247009277, + "learning_rate": 2.2342133428469496e-05, + "loss": 0.2807, + "step": 2505 + }, + { + "epoch": 0.0446973210145186, + "grad_norm": 0.45579037070274353, + "learning_rate": 2.2351052443810206e-05, + "loss": 0.3088, + "step": 2506 + }, + { + "epoch": 0.044715157136232295, + "grad_norm": 0.5731293559074402, + "learning_rate": 2.235997145915091e-05, + "loss": 0.3475, + "step": 2507 + }, + { + "epoch": 0.04473299325794599, + "grad_norm": 0.9777519702911377, + "learning_rate": 2.2368890474491616e-05, + "loss": 0.2962, + "step": 2508 + }, + { + "epoch": 0.044750829379659685, + "grad_norm": 0.5079028010368347, + "learning_rate": 2.2377809489832325e-05, + "loss": 0.3421, + "step": 2509 + }, + { + "epoch": 0.04476866550137338, + "grad_norm": 0.40570247173309326, + "learning_rate": 2.238672850517303e-05, + "loss": 0.2726, + "step": 2510 + }, + { + "epoch": 0.044786501623087074, + "grad_norm": 0.39341166615486145, + "learning_rate": 2.2395647520513735e-05, + "loss": 0.2931, + "step": 2511 + }, + { + "epoch": 0.04480433774480077, + "grad_norm": 0.34767574071884155, + "learning_rate": 2.2404566535854445e-05, + "loss": 0.2642, + "step": 2512 + }, + { + "epoch": 0.044822173866514464, + "grad_norm": 0.4499184191226959, + "learning_rate": 2.2413485551195147e-05, + "loss": 0.3253, + "step": 2513 + }, + { + "epoch": 0.04484000998822816, + "grad_norm": 0.6738024950027466, + "learning_rate": 2.2422404566535857e-05, + "loss": 0.3187, + "step": 2514 + }, + { + "epoch": 0.044857846109941854, + "grad_norm": 0.4801851212978363, + "learning_rate": 2.2431323581876564e-05, + "loss": 0.312, + "step": 2515 + }, + { + "epoch": 0.04487568223165555, + "grad_norm": 0.3929755985736847, + "learning_rate": 2.2440242597217267e-05, + "loss": 0.2512, + "step": 2516 + }, + { + "epoch": 0.044893518353369244, + "grad_norm": 0.5497623085975647, + "learning_rate": 2.2449161612557976e-05, + "loss": 0.2886, + "step": 2517 + }, + { + "epoch": 0.04491135447508294, + "grad_norm": 0.49939242005348206, + "learning_rate": 2.245808062789868e-05, + "loss": 0.3265, + "step": 2518 + }, + { + "epoch": 0.044929190596796634, + "grad_norm": 0.45610734820365906, + "learning_rate": 2.2466999643239386e-05, + "loss": 0.2786, + "step": 2519 + }, + { + "epoch": 0.04494702671851033, + "grad_norm": 0.542926549911499, + "learning_rate": 2.2475918658580096e-05, + "loss": 0.3651, + "step": 2520 + }, + { + "epoch": 0.044964862840224024, + "grad_norm": 0.3948073089122772, + "learning_rate": 2.24848376739208e-05, + "loss": 0.269, + "step": 2521 + }, + { + "epoch": 0.04498269896193772, + "grad_norm": 0.5280768275260925, + "learning_rate": 2.2493756689261505e-05, + "loss": 0.3078, + "step": 2522 + }, + { + "epoch": 0.04500053508365141, + "grad_norm": 0.6438367366790771, + "learning_rate": 2.2502675704602215e-05, + "loss": 0.3172, + "step": 2523 + }, + { + "epoch": 0.04501837120536511, + "grad_norm": 0.4665434956550598, + "learning_rate": 2.2511594719942918e-05, + "loss": 0.2809, + "step": 2524 + }, + { + "epoch": 0.0450362073270788, + "grad_norm": 0.35218581557273865, + "learning_rate": 2.2520513735283627e-05, + "loss": 0.2688, + "step": 2525 + }, + { + "epoch": 0.04505404344879249, + "grad_norm": 0.5222311019897461, + "learning_rate": 2.2529432750624334e-05, + "loss": 0.3775, + "step": 2526 + }, + { + "epoch": 0.045071879570506186, + "grad_norm": 0.479929655790329, + "learning_rate": 2.2538351765965037e-05, + "loss": 0.3371, + "step": 2527 + }, + { + "epoch": 0.04508971569221988, + "grad_norm": 0.5023528337478638, + "learning_rate": 2.2547270781305747e-05, + "loss": 0.2818, + "step": 2528 + }, + { + "epoch": 0.045107551813933576, + "grad_norm": 1.1559685468673706, + "learning_rate": 2.2556189796646453e-05, + "loss": 0.3098, + "step": 2529 + }, + { + "epoch": 0.04512538793564727, + "grad_norm": 0.7924062013626099, + "learning_rate": 2.2565108811987156e-05, + "loss": 0.2848, + "step": 2530 + }, + { + "epoch": 0.045143224057360966, + "grad_norm": 0.4267258942127228, + "learning_rate": 2.2574027827327866e-05, + "loss": 0.31, + "step": 2531 + }, + { + "epoch": 0.04516106017907466, + "grad_norm": 0.3967335820198059, + "learning_rate": 2.258294684266857e-05, + "loss": 0.2621, + "step": 2532 + }, + { + "epoch": 0.045178896300788356, + "grad_norm": 0.46475982666015625, + "learning_rate": 2.2591865858009275e-05, + "loss": 0.2736, + "step": 2533 + }, + { + "epoch": 0.04519673242250205, + "grad_norm": 0.43687716126441956, + "learning_rate": 2.2600784873349985e-05, + "loss": 0.2698, + "step": 2534 + }, + { + "epoch": 0.045214568544215746, + "grad_norm": 0.486655056476593, + "learning_rate": 2.2609703888690688e-05, + "loss": 0.3086, + "step": 2535 + }, + { + "epoch": 0.04523240466592944, + "grad_norm": 0.4176377058029175, + "learning_rate": 2.2618622904031398e-05, + "loss": 0.3316, + "step": 2536 + }, + { + "epoch": 0.045250240787643135, + "grad_norm": 0.6278610825538635, + "learning_rate": 2.2627541919372104e-05, + "loss": 0.3458, + "step": 2537 + }, + { + "epoch": 0.04526807690935683, + "grad_norm": 0.5996041893959045, + "learning_rate": 2.2636460934712807e-05, + "loss": 0.3162, + "step": 2538 + }, + { + "epoch": 0.045285913031070525, + "grad_norm": 0.5668736100196838, + "learning_rate": 2.2645379950053517e-05, + "loss": 0.3195, + "step": 2539 + }, + { + "epoch": 0.04530374915278422, + "grad_norm": 0.5813754796981812, + "learning_rate": 2.2654298965394223e-05, + "loss": 0.3059, + "step": 2540 + }, + { + "epoch": 0.045321585274497915, + "grad_norm": 0.672214150428772, + "learning_rate": 2.2663217980734926e-05, + "loss": 0.2986, + "step": 2541 + }, + { + "epoch": 0.04533942139621161, + "grad_norm": 0.8300511837005615, + "learning_rate": 2.2672136996075636e-05, + "loss": 0.2726, + "step": 2542 + }, + { + "epoch": 0.045357257517925305, + "grad_norm": 0.3515639305114746, + "learning_rate": 2.2681056011416342e-05, + "loss": 0.2656, + "step": 2543 + }, + { + "epoch": 0.045375093639639, + "grad_norm": 0.38729098439216614, + "learning_rate": 2.2689975026757045e-05, + "loss": 0.3299, + "step": 2544 + }, + { + "epoch": 0.045392929761352695, + "grad_norm": 0.49949008226394653, + "learning_rate": 2.2698894042097755e-05, + "loss": 0.3128, + "step": 2545 + }, + { + "epoch": 0.04541076588306638, + "grad_norm": 0.6647465229034424, + "learning_rate": 2.2707813057438458e-05, + "loss": 0.3633, + "step": 2546 + }, + { + "epoch": 0.04542860200478008, + "grad_norm": 0.40272918343544006, + "learning_rate": 2.2716732072779164e-05, + "loss": 0.3302, + "step": 2547 + }, + { + "epoch": 0.04544643812649377, + "grad_norm": 0.4645010232925415, + "learning_rate": 2.2725651088119874e-05, + "loss": 0.297, + "step": 2548 + }, + { + "epoch": 0.04546427424820747, + "grad_norm": 0.8034027218818665, + "learning_rate": 2.2734570103460577e-05, + "loss": 0.265, + "step": 2549 + }, + { + "epoch": 0.04548211036992116, + "grad_norm": 0.5579841136932373, + "learning_rate": 2.2743489118801287e-05, + "loss": 0.2942, + "step": 2550 + }, + { + "epoch": 0.04549994649163486, + "grad_norm": 0.7305700778961182, + "learning_rate": 2.2752408134141993e-05, + "loss": 0.3412, + "step": 2551 + }, + { + "epoch": 0.04551778261334855, + "grad_norm": 0.4577408730983734, + "learning_rate": 2.2761327149482696e-05, + "loss": 0.339, + "step": 2552 + }, + { + "epoch": 0.04553561873506225, + "grad_norm": 0.6619099974632263, + "learning_rate": 2.2770246164823406e-05, + "loss": 0.2905, + "step": 2553 + }, + { + "epoch": 0.04555345485677594, + "grad_norm": 0.3685494065284729, + "learning_rate": 2.2779165180164112e-05, + "loss": 0.3248, + "step": 2554 + }, + { + "epoch": 0.04557129097848964, + "grad_norm": 0.5546826124191284, + "learning_rate": 2.2788084195504815e-05, + "loss": 0.2518, + "step": 2555 + }, + { + "epoch": 0.04558912710020333, + "grad_norm": 0.4956215023994446, + "learning_rate": 2.2797003210845525e-05, + "loss": 0.3182, + "step": 2556 + }, + { + "epoch": 0.04560696322191703, + "grad_norm": 0.5660912394523621, + "learning_rate": 2.2805922226186228e-05, + "loss": 0.31, + "step": 2557 + }, + { + "epoch": 0.04562479934363072, + "grad_norm": 0.6326999664306641, + "learning_rate": 2.2814841241526935e-05, + "loss": 0.2687, + "step": 2558 + }, + { + "epoch": 0.04564263546534442, + "grad_norm": 0.5014093518257141, + "learning_rate": 2.2823760256867644e-05, + "loss": 0.2829, + "step": 2559 + }, + { + "epoch": 0.04566047158705811, + "grad_norm": 0.3998367488384247, + "learning_rate": 2.2832679272208347e-05, + "loss": 0.2754, + "step": 2560 + }, + { + "epoch": 0.045678307708771806, + "grad_norm": 0.5275185704231262, + "learning_rate": 2.2841598287549057e-05, + "loss": 0.2952, + "step": 2561 + }, + { + "epoch": 0.0456961438304855, + "grad_norm": 0.8106608986854553, + "learning_rate": 2.2850517302889764e-05, + "loss": 0.3418, + "step": 2562 + }, + { + "epoch": 0.045713979952199196, + "grad_norm": 0.5508556962013245, + "learning_rate": 2.2859436318230467e-05, + "loss": 0.3197, + "step": 2563 + }, + { + "epoch": 0.04573181607391289, + "grad_norm": 0.7296937108039856, + "learning_rate": 2.2868355333571176e-05, + "loss": 0.2893, + "step": 2564 + }, + { + "epoch": 0.045749652195626586, + "grad_norm": 0.40568533539772034, + "learning_rate": 2.2877274348911883e-05, + "loss": 0.3217, + "step": 2565 + }, + { + "epoch": 0.045767488317340274, + "grad_norm": 0.5739626884460449, + "learning_rate": 2.2886193364252586e-05, + "loss": 0.271, + "step": 2566 + }, + { + "epoch": 0.04578532443905397, + "grad_norm": 0.6474645137786865, + "learning_rate": 2.2895112379593295e-05, + "loss": 0.4017, + "step": 2567 + }, + { + "epoch": 0.045803160560767664, + "grad_norm": 0.3779413104057312, + "learning_rate": 2.2904031394934002e-05, + "loss": 0.235, + "step": 2568 + }, + { + "epoch": 0.04582099668248136, + "grad_norm": 0.4976176619529724, + "learning_rate": 2.2912950410274705e-05, + "loss": 0.3104, + "step": 2569 + }, + { + "epoch": 0.045838832804195054, + "grad_norm": 0.5540872812271118, + "learning_rate": 2.2921869425615415e-05, + "loss": 0.3095, + "step": 2570 + }, + { + "epoch": 0.04585666892590875, + "grad_norm": 0.6369584202766418, + "learning_rate": 2.2930788440956118e-05, + "loss": 0.2425, + "step": 2571 + }, + { + "epoch": 0.045874505047622444, + "grad_norm": 0.4630275368690491, + "learning_rate": 2.2939707456296827e-05, + "loss": 0.2756, + "step": 2572 + }, + { + "epoch": 0.04589234116933614, + "grad_norm": 0.6564838886260986, + "learning_rate": 2.2948626471637534e-05, + "loss": 0.335, + "step": 2573 + }, + { + "epoch": 0.045910177291049833, + "grad_norm": 0.4954453110694885, + "learning_rate": 2.2957545486978237e-05, + "loss": 0.2609, + "step": 2574 + }, + { + "epoch": 0.04592801341276353, + "grad_norm": 0.6018038988113403, + "learning_rate": 2.2966464502318946e-05, + "loss": 0.3276, + "step": 2575 + }, + { + "epoch": 0.04594584953447722, + "grad_norm": 0.6929775476455688, + "learning_rate": 2.2975383517659653e-05, + "loss": 0.327, + "step": 2576 + }, + { + "epoch": 0.04596368565619092, + "grad_norm": 0.7452569007873535, + "learning_rate": 2.2984302533000356e-05, + "loss": 0.2921, + "step": 2577 + }, + { + "epoch": 0.04598152177790461, + "grad_norm": 0.7317106127738953, + "learning_rate": 2.2993221548341066e-05, + "loss": 0.2935, + "step": 2578 + }, + { + "epoch": 0.04599935789961831, + "grad_norm": 0.47860607504844666, + "learning_rate": 2.3002140563681772e-05, + "loss": 0.3166, + "step": 2579 + }, + { + "epoch": 0.046017194021332, + "grad_norm": 1.1246650218963623, + "learning_rate": 2.3011059579022475e-05, + "loss": 0.3304, + "step": 2580 + }, + { + "epoch": 0.0460350301430457, + "grad_norm": 0.49457529187202454, + "learning_rate": 2.3019978594363185e-05, + "loss": 0.3005, + "step": 2581 + }, + { + "epoch": 0.04605286626475939, + "grad_norm": 0.49166783690452576, + "learning_rate": 2.3028897609703888e-05, + "loss": 0.3016, + "step": 2582 + }, + { + "epoch": 0.04607070238647309, + "grad_norm": 0.7067668437957764, + "learning_rate": 2.3037816625044594e-05, + "loss": 0.3757, + "step": 2583 + }, + { + "epoch": 0.04608853850818678, + "grad_norm": 0.6487246155738831, + "learning_rate": 2.3046735640385304e-05, + "loss": 0.2881, + "step": 2584 + }, + { + "epoch": 0.04610637462990048, + "grad_norm": 0.873086154460907, + "learning_rate": 2.3055654655726007e-05, + "loss": 0.2515, + "step": 2585 + }, + { + "epoch": 0.046124210751614166, + "grad_norm": 0.4572746157646179, + "learning_rate": 2.3064573671066717e-05, + "loss": 0.2373, + "step": 2586 + }, + { + "epoch": 0.04614204687332786, + "grad_norm": 0.5847325325012207, + "learning_rate": 2.3073492686407423e-05, + "loss": 0.282, + "step": 2587 + }, + { + "epoch": 0.046159882995041555, + "grad_norm": 0.35473737120628357, + "learning_rate": 2.3082411701748126e-05, + "loss": 0.278, + "step": 2588 + }, + { + "epoch": 0.04617771911675525, + "grad_norm": 0.5584276914596558, + "learning_rate": 2.3091330717088836e-05, + "loss": 0.2903, + "step": 2589 + }, + { + "epoch": 0.046195555238468945, + "grad_norm": 0.458571195602417, + "learning_rate": 2.3100249732429542e-05, + "loss": 0.3213, + "step": 2590 + }, + { + "epoch": 0.04621339136018264, + "grad_norm": 0.44304999709129333, + "learning_rate": 2.3109168747770245e-05, + "loss": 0.2991, + "step": 2591 + }, + { + "epoch": 0.046231227481896335, + "grad_norm": 0.5245705246925354, + "learning_rate": 2.3118087763110955e-05, + "loss": 0.381, + "step": 2592 + }, + { + "epoch": 0.04624906360361003, + "grad_norm": 0.48145824670791626, + "learning_rate": 2.312700677845166e-05, + "loss": 0.3074, + "step": 2593 + }, + { + "epoch": 0.046266899725323725, + "grad_norm": 0.6381738781929016, + "learning_rate": 2.3135925793792364e-05, + "loss": 0.2799, + "step": 2594 + }, + { + "epoch": 0.04628473584703742, + "grad_norm": 0.5631144642829895, + "learning_rate": 2.3144844809133074e-05, + "loss": 0.3401, + "step": 2595 + }, + { + "epoch": 0.046302571968751115, + "grad_norm": 0.3579598367214203, + "learning_rate": 2.3153763824473777e-05, + "loss": 0.2796, + "step": 2596 + }, + { + "epoch": 0.04632040809046481, + "grad_norm": 0.8319517970085144, + "learning_rate": 2.3162682839814487e-05, + "loss": 0.2859, + "step": 2597 + }, + { + "epoch": 0.046338244212178505, + "grad_norm": 0.6031146049499512, + "learning_rate": 2.3171601855155193e-05, + "loss": 0.2885, + "step": 2598 + }, + { + "epoch": 0.0463560803338922, + "grad_norm": 0.44405749440193176, + "learning_rate": 2.3180520870495896e-05, + "loss": 0.3487, + "step": 2599 + }, + { + "epoch": 0.046373916455605894, + "grad_norm": 0.5020195245742798, + "learning_rate": 2.3189439885836606e-05, + "loss": 0.3071, + "step": 2600 + }, + { + "epoch": 0.04639175257731959, + "grad_norm": 0.39273878931999207, + "learning_rate": 2.3198358901177312e-05, + "loss": 0.2778, + "step": 2601 + }, + { + "epoch": 0.046409588699033284, + "grad_norm": 0.38813886046409607, + "learning_rate": 2.3207277916518015e-05, + "loss": 0.2887, + "step": 2602 + }, + { + "epoch": 0.04642742482074698, + "grad_norm": 0.4507793188095093, + "learning_rate": 2.3216196931858725e-05, + "loss": 0.2909, + "step": 2603 + }, + { + "epoch": 0.046445260942460674, + "grad_norm": 0.32840240001678467, + "learning_rate": 2.322511594719943e-05, + "loss": 0.2849, + "step": 2604 + }, + { + "epoch": 0.04646309706417437, + "grad_norm": 0.5821751356124878, + "learning_rate": 2.3234034962540134e-05, + "loss": 0.3296, + "step": 2605 + }, + { + "epoch": 0.046480933185888064, + "grad_norm": 0.5966184735298157, + "learning_rate": 2.3242953977880844e-05, + "loss": 0.3656, + "step": 2606 + }, + { + "epoch": 0.04649876930760175, + "grad_norm": 0.42971763014793396, + "learning_rate": 2.325187299322155e-05, + "loss": 0.3098, + "step": 2607 + }, + { + "epoch": 0.04651660542931545, + "grad_norm": 0.43057823181152344, + "learning_rate": 2.3260792008562257e-05, + "loss": 0.3478, + "step": 2608 + }, + { + "epoch": 0.04653444155102914, + "grad_norm": 0.38382527232170105, + "learning_rate": 2.3269711023902963e-05, + "loss": 0.2529, + "step": 2609 + }, + { + "epoch": 0.04655227767274284, + "grad_norm": 0.3996851146221161, + "learning_rate": 2.3278630039243666e-05, + "loss": 0.3395, + "step": 2610 + }, + { + "epoch": 0.04657011379445653, + "grad_norm": 0.5662938952445984, + "learning_rate": 2.3287549054584376e-05, + "loss": 0.3764, + "step": 2611 + }, + { + "epoch": 0.046587949916170227, + "grad_norm": 0.4093371331691742, + "learning_rate": 2.3296468069925083e-05, + "loss": 0.2487, + "step": 2612 + }, + { + "epoch": 0.04660578603788392, + "grad_norm": 0.5064240097999573, + "learning_rate": 2.3305387085265786e-05, + "loss": 0.3307, + "step": 2613 + }, + { + "epoch": 0.046623622159597616, + "grad_norm": 0.4669743776321411, + "learning_rate": 2.3314306100606495e-05, + "loss": 0.284, + "step": 2614 + }, + { + "epoch": 0.04664145828131131, + "grad_norm": 0.30909350514411926, + "learning_rate": 2.33232251159472e-05, + "loss": 0.2529, + "step": 2615 + }, + { + "epoch": 0.046659294403025006, + "grad_norm": 0.43209806084632874, + "learning_rate": 2.3332144131287905e-05, + "loss": 0.3041, + "step": 2616 + }, + { + "epoch": 0.0466771305247387, + "grad_norm": 0.35900524258613586, + "learning_rate": 2.3341063146628614e-05, + "loss": 0.3106, + "step": 2617 + }, + { + "epoch": 0.046694966646452396, + "grad_norm": 0.423440158367157, + "learning_rate": 2.334998216196932e-05, + "loss": 0.3344, + "step": 2618 + }, + { + "epoch": 0.04671280276816609, + "grad_norm": 0.3105817139148712, + "learning_rate": 2.3358901177310027e-05, + "loss": 0.2517, + "step": 2619 + }, + { + "epoch": 0.046730638889879786, + "grad_norm": 0.43081656098365784, + "learning_rate": 2.3367820192650734e-05, + "loss": 0.3203, + "step": 2620 + }, + { + "epoch": 0.04674847501159348, + "grad_norm": 0.35156628489494324, + "learning_rate": 2.3376739207991437e-05, + "loss": 0.2499, + "step": 2621 + }, + { + "epoch": 0.046766311133307176, + "grad_norm": 0.4836219847202301, + "learning_rate": 2.3385658223332146e-05, + "loss": 0.3276, + "step": 2622 + }, + { + "epoch": 0.04678414725502087, + "grad_norm": 0.700020432472229, + "learning_rate": 2.3394577238672853e-05, + "loss": 0.2757, + "step": 2623 + }, + { + "epoch": 0.046801983376734566, + "grad_norm": 0.45737606287002563, + "learning_rate": 2.3403496254013556e-05, + "loss": 0.277, + "step": 2624 + }, + { + "epoch": 0.04681981949844826, + "grad_norm": 0.47991931438446045, + "learning_rate": 2.3412415269354265e-05, + "loss": 0.3075, + "step": 2625 + }, + { + "epoch": 0.046837655620161955, + "grad_norm": 0.7000904083251953, + "learning_rate": 2.3421334284694972e-05, + "loss": 0.3013, + "step": 2626 + }, + { + "epoch": 0.04685549174187564, + "grad_norm": 0.44958168268203735, + "learning_rate": 2.3430253300035675e-05, + "loss": 0.3125, + "step": 2627 + }, + { + "epoch": 0.04687332786358934, + "grad_norm": 0.5032778382301331, + "learning_rate": 2.3439172315376385e-05, + "loss": 0.3368, + "step": 2628 + }, + { + "epoch": 0.04689116398530303, + "grad_norm": 0.5663533806800842, + "learning_rate": 2.344809133071709e-05, + "loss": 0.362, + "step": 2629 + }, + { + "epoch": 0.04690900010701673, + "grad_norm": 0.4374352693557739, + "learning_rate": 2.3457010346057794e-05, + "loss": 0.2393, + "step": 2630 + }, + { + "epoch": 0.04692683622873042, + "grad_norm": 0.4122285842895508, + "learning_rate": 2.3465929361398504e-05, + "loss": 0.2952, + "step": 2631 + }, + { + "epoch": 0.04694467235044412, + "grad_norm": 0.500024139881134, + "learning_rate": 2.347484837673921e-05, + "loss": 0.3238, + "step": 2632 + }, + { + "epoch": 0.04696250847215781, + "grad_norm": 0.31356459856033325, + "learning_rate": 2.3483767392079917e-05, + "loss": 0.2604, + "step": 2633 + }, + { + "epoch": 0.04698034459387151, + "grad_norm": 0.4864186644554138, + "learning_rate": 2.3492686407420623e-05, + "loss": 0.2548, + "step": 2634 + }, + { + "epoch": 0.0469981807155852, + "grad_norm": 0.531172513961792, + "learning_rate": 2.3501605422761326e-05, + "loss": 0.3647, + "step": 2635 + }, + { + "epoch": 0.0470160168372989, + "grad_norm": 0.7010324597358704, + "learning_rate": 2.3510524438102036e-05, + "loss": 0.3049, + "step": 2636 + }, + { + "epoch": 0.04703385295901259, + "grad_norm": 0.5139377117156982, + "learning_rate": 2.3519443453442742e-05, + "loss": 0.3048, + "step": 2637 + }, + { + "epoch": 0.04705168908072629, + "grad_norm": 0.4551222324371338, + "learning_rate": 2.3528362468783445e-05, + "loss": 0.3419, + "step": 2638 + }, + { + "epoch": 0.04706952520243998, + "grad_norm": 0.4624652564525604, + "learning_rate": 2.3537281484124155e-05, + "loss": 0.2818, + "step": 2639 + }, + { + "epoch": 0.04708736132415368, + "grad_norm": 0.3740825951099396, + "learning_rate": 2.354620049946486e-05, + "loss": 0.2796, + "step": 2640 + }, + { + "epoch": 0.04710519744586737, + "grad_norm": 0.4301629066467285, + "learning_rate": 2.3555119514805564e-05, + "loss": 0.2806, + "step": 2641 + }, + { + "epoch": 0.04712303356758107, + "grad_norm": 0.3898015320301056, + "learning_rate": 2.3564038530146274e-05, + "loss": 0.2972, + "step": 2642 + }, + { + "epoch": 0.04714086968929476, + "grad_norm": 0.3900796175003052, + "learning_rate": 2.357295754548698e-05, + "loss": 0.3241, + "step": 2643 + }, + { + "epoch": 0.04715870581100846, + "grad_norm": 0.50234055519104, + "learning_rate": 2.3581876560827687e-05, + "loss": 0.3363, + "step": 2644 + }, + { + "epoch": 0.04717654193272215, + "grad_norm": 0.4925941228866577, + "learning_rate": 2.3590795576168393e-05, + "loss": 0.3285, + "step": 2645 + }, + { + "epoch": 0.04719437805443585, + "grad_norm": 0.41006824374198914, + "learning_rate": 2.35997145915091e-05, + "loss": 0.2546, + "step": 2646 + }, + { + "epoch": 0.047212214176149535, + "grad_norm": 0.7196215391159058, + "learning_rate": 2.3608633606849806e-05, + "loss": 0.2943, + "step": 2647 + }, + { + "epoch": 0.04723005029786323, + "grad_norm": 0.3472844362258911, + "learning_rate": 2.3617552622190512e-05, + "loss": 0.2876, + "step": 2648 + }, + { + "epoch": 0.047247886419576925, + "grad_norm": 0.5129361748695374, + "learning_rate": 2.3626471637531215e-05, + "loss": 0.3762, + "step": 2649 + }, + { + "epoch": 0.04726572254129062, + "grad_norm": 0.452271968126297, + "learning_rate": 2.3635390652871925e-05, + "loss": 0.2862, + "step": 2650 + }, + { + "epoch": 0.047283558663004314, + "grad_norm": 0.4501533508300781, + "learning_rate": 2.364430966821263e-05, + "loss": 0.3249, + "step": 2651 + }, + { + "epoch": 0.04730139478471801, + "grad_norm": 0.49350038170814514, + "learning_rate": 2.3653228683553334e-05, + "loss": 0.2701, + "step": 2652 + }, + { + "epoch": 0.047319230906431704, + "grad_norm": 0.5184900760650635, + "learning_rate": 2.3662147698894044e-05, + "loss": 0.2975, + "step": 2653 + }, + { + "epoch": 0.0473370670281454, + "grad_norm": 0.5035743713378906, + "learning_rate": 2.367106671423475e-05, + "loss": 0.335, + "step": 2654 + }, + { + "epoch": 0.047354903149859094, + "grad_norm": 0.4158850312232971, + "learning_rate": 2.3679985729575457e-05, + "loss": 0.2374, + "step": 2655 + }, + { + "epoch": 0.04737273927157279, + "grad_norm": 0.35874706506729126, + "learning_rate": 2.3688904744916163e-05, + "loss": 0.289, + "step": 2656 + }, + { + "epoch": 0.047390575393286484, + "grad_norm": 0.370504766702652, + "learning_rate": 2.369782376025687e-05, + "loss": 0.3031, + "step": 2657 + }, + { + "epoch": 0.04740841151500018, + "grad_norm": 0.39797043800354004, + "learning_rate": 2.3706742775597576e-05, + "loss": 0.3068, + "step": 2658 + }, + { + "epoch": 0.047426247636713874, + "grad_norm": 0.4915393590927124, + "learning_rate": 2.3715661790938282e-05, + "loss": 0.3244, + "step": 2659 + }, + { + "epoch": 0.04744408375842757, + "grad_norm": 0.515765368938446, + "learning_rate": 2.3724580806278985e-05, + "loss": 0.3086, + "step": 2660 + }, + { + "epoch": 0.047461919880141264, + "grad_norm": 0.4049784243106842, + "learning_rate": 2.3733499821619695e-05, + "loss": 0.3482, + "step": 2661 + }, + { + "epoch": 0.04747975600185496, + "grad_norm": 0.475574254989624, + "learning_rate": 2.37424188369604e-05, + "loss": 0.3228, + "step": 2662 + }, + { + "epoch": 0.047497592123568654, + "grad_norm": 0.6775217056274414, + "learning_rate": 2.3751337852301105e-05, + "loss": 0.2793, + "step": 2663 + }, + { + "epoch": 0.04751542824528235, + "grad_norm": 0.4063296318054199, + "learning_rate": 2.3760256867641814e-05, + "loss": 0.3473, + "step": 2664 + }, + { + "epoch": 0.04753326436699604, + "grad_norm": 0.4551413059234619, + "learning_rate": 2.376917588298252e-05, + "loss": 0.2967, + "step": 2665 + }, + { + "epoch": 0.04755110048870974, + "grad_norm": 0.4418201744556427, + "learning_rate": 2.3778094898323224e-05, + "loss": 0.3215, + "step": 2666 + }, + { + "epoch": 0.047568936610423426, + "grad_norm": 0.4525734782218933, + "learning_rate": 2.3787013913663933e-05, + "loss": 0.3215, + "step": 2667 + }, + { + "epoch": 0.04758677273213712, + "grad_norm": 0.36298611760139465, + "learning_rate": 2.379593292900464e-05, + "loss": 0.2704, + "step": 2668 + }, + { + "epoch": 0.047604608853850816, + "grad_norm": 0.42858415842056274, + "learning_rate": 2.3804851944345346e-05, + "loss": 0.3266, + "step": 2669 + }, + { + "epoch": 0.04762244497556451, + "grad_norm": 0.4325609505176544, + "learning_rate": 2.3813770959686053e-05, + "loss": 0.3107, + "step": 2670 + }, + { + "epoch": 0.047640281097278206, + "grad_norm": 0.5073983669281006, + "learning_rate": 2.382268997502676e-05, + "loss": 0.3308, + "step": 2671 + }, + { + "epoch": 0.0476581172189919, + "grad_norm": 0.3791866600513458, + "learning_rate": 2.3831608990367465e-05, + "loss": 0.3009, + "step": 2672 + }, + { + "epoch": 0.047675953340705596, + "grad_norm": 0.37248167395591736, + "learning_rate": 2.3840528005708172e-05, + "loss": 0.2915, + "step": 2673 + }, + { + "epoch": 0.04769378946241929, + "grad_norm": 0.49261602759361267, + "learning_rate": 2.3849447021048875e-05, + "loss": 0.2993, + "step": 2674 + }, + { + "epoch": 0.047711625584132986, + "grad_norm": 0.36422669887542725, + "learning_rate": 2.3858366036389585e-05, + "loss": 0.2618, + "step": 2675 + }, + { + "epoch": 0.04772946170584668, + "grad_norm": 0.31536272168159485, + "learning_rate": 2.386728505173029e-05, + "loss": 0.2807, + "step": 2676 + }, + { + "epoch": 0.047747297827560375, + "grad_norm": 0.3920343518257141, + "learning_rate": 2.3876204067070994e-05, + "loss": 0.2772, + "step": 2677 + }, + { + "epoch": 0.04776513394927407, + "grad_norm": 0.3932455778121948, + "learning_rate": 2.3885123082411704e-05, + "loss": 0.3162, + "step": 2678 + }, + { + "epoch": 0.047782970070987765, + "grad_norm": 0.455054372549057, + "learning_rate": 2.389404209775241e-05, + "loss": 0.314, + "step": 2679 + }, + { + "epoch": 0.04780080619270146, + "grad_norm": 0.4854518175125122, + "learning_rate": 2.3902961113093116e-05, + "loss": 0.2581, + "step": 2680 + }, + { + "epoch": 0.047818642314415155, + "grad_norm": 0.4947347640991211, + "learning_rate": 2.3911880128433823e-05, + "loss": 0.3596, + "step": 2681 + }, + { + "epoch": 0.04783647843612885, + "grad_norm": 0.48636892437934875, + "learning_rate": 2.392079914377453e-05, + "loss": 0.3124, + "step": 2682 + }, + { + "epoch": 0.047854314557842545, + "grad_norm": 0.4568527042865753, + "learning_rate": 2.3929718159115236e-05, + "loss": 0.3023, + "step": 2683 + }, + { + "epoch": 0.04787215067955624, + "grad_norm": 0.4741967022418976, + "learning_rate": 2.3938637174455942e-05, + "loss": 0.3257, + "step": 2684 + }, + { + "epoch": 0.047889986801269935, + "grad_norm": 0.44635698199272156, + "learning_rate": 2.3947556189796645e-05, + "loss": 0.294, + "step": 2685 + }, + { + "epoch": 0.04790782292298363, + "grad_norm": 0.4073943793773651, + "learning_rate": 2.3956475205137355e-05, + "loss": 0.2582, + "step": 2686 + }, + { + "epoch": 0.04792565904469732, + "grad_norm": 0.5220855474472046, + "learning_rate": 2.396539422047806e-05, + "loss": 0.3106, + "step": 2687 + }, + { + "epoch": 0.04794349516641101, + "grad_norm": 0.46042484045028687, + "learning_rate": 2.3974313235818764e-05, + "loss": 0.2694, + "step": 2688 + }, + { + "epoch": 0.04796133128812471, + "grad_norm": 0.4090389013290405, + "learning_rate": 2.3983232251159474e-05, + "loss": 0.3377, + "step": 2689 + }, + { + "epoch": 0.0479791674098384, + "grad_norm": 0.6514447331428528, + "learning_rate": 2.399215126650018e-05, + "loss": 0.3257, + "step": 2690 + }, + { + "epoch": 0.0479970035315521, + "grad_norm": 0.5830377340316772, + "learning_rate": 2.4001070281840887e-05, + "loss": 0.3815, + "step": 2691 + }, + { + "epoch": 0.04801483965326579, + "grad_norm": 0.5281463861465454, + "learning_rate": 2.4009989297181593e-05, + "loss": 0.3444, + "step": 2692 + }, + { + "epoch": 0.04803267577497949, + "grad_norm": 0.4410761594772339, + "learning_rate": 2.40189083125223e-05, + "loss": 0.3353, + "step": 2693 + }, + { + "epoch": 0.04805051189669318, + "grad_norm": 0.5555058717727661, + "learning_rate": 2.4027827327863006e-05, + "loss": 0.3452, + "step": 2694 + }, + { + "epoch": 0.04806834801840688, + "grad_norm": 0.5104464888572693, + "learning_rate": 2.4036746343203712e-05, + "loss": 0.3165, + "step": 2695 + }, + { + "epoch": 0.04808618414012057, + "grad_norm": 0.5032020211219788, + "learning_rate": 2.404566535854442e-05, + "loss": 0.3224, + "step": 2696 + }, + { + "epoch": 0.04810402026183427, + "grad_norm": 0.41885411739349365, + "learning_rate": 2.4054584373885125e-05, + "loss": 0.2923, + "step": 2697 + }, + { + "epoch": 0.04812185638354796, + "grad_norm": 0.5228544473648071, + "learning_rate": 2.406350338922583e-05, + "loss": 0.4019, + "step": 2698 + }, + { + "epoch": 0.04813969250526166, + "grad_norm": 0.40701359510421753, + "learning_rate": 2.4072422404566534e-05, + "loss": 0.2907, + "step": 2699 + }, + { + "epoch": 0.04815752862697535, + "grad_norm": 0.3381403982639313, + "learning_rate": 2.4081341419907244e-05, + "loss": 0.2482, + "step": 2700 + }, + { + "epoch": 0.04817536474868905, + "grad_norm": 0.9533022046089172, + "learning_rate": 2.409026043524795e-05, + "loss": 0.291, + "step": 2701 + }, + { + "epoch": 0.04819320087040274, + "grad_norm": 0.7095092535018921, + "learning_rate": 2.4099179450588653e-05, + "loss": 0.2945, + "step": 2702 + }, + { + "epoch": 0.048211036992116436, + "grad_norm": 0.580734372138977, + "learning_rate": 2.4108098465929363e-05, + "loss": 0.3661, + "step": 2703 + }, + { + "epoch": 0.04822887311383013, + "grad_norm": 0.6417325139045715, + "learning_rate": 2.411701748127007e-05, + "loss": 0.337, + "step": 2704 + }, + { + "epoch": 0.048246709235543826, + "grad_norm": 0.5337891578674316, + "learning_rate": 2.4125936496610776e-05, + "loss": 0.3528, + "step": 2705 + }, + { + "epoch": 0.04826454535725752, + "grad_norm": 0.5764961838722229, + "learning_rate": 2.4134855511951482e-05, + "loss": 0.341, + "step": 2706 + }, + { + "epoch": 0.04828238147897121, + "grad_norm": 0.37059906125068665, + "learning_rate": 2.414377452729219e-05, + "loss": 0.2727, + "step": 2707 + }, + { + "epoch": 0.048300217600684904, + "grad_norm": 0.5342280268669128, + "learning_rate": 2.4152693542632895e-05, + "loss": 0.3702, + "step": 2708 + }, + { + "epoch": 0.0483180537223986, + "grad_norm": 0.4810682237148285, + "learning_rate": 2.41616125579736e-05, + "loss": 0.2814, + "step": 2709 + }, + { + "epoch": 0.048335889844112294, + "grad_norm": 0.5229276418685913, + "learning_rate": 2.4170531573314308e-05, + "loss": 0.3175, + "step": 2710 + }, + { + "epoch": 0.04835372596582599, + "grad_norm": 0.4699035882949829, + "learning_rate": 2.4179450588655014e-05, + "loss": 0.3248, + "step": 2711 + }, + { + "epoch": 0.048371562087539684, + "grad_norm": 0.48911675810813904, + "learning_rate": 2.418836960399572e-05, + "loss": 0.3354, + "step": 2712 + }, + { + "epoch": 0.04838939820925338, + "grad_norm": 0.7131666541099548, + "learning_rate": 2.4197288619336424e-05, + "loss": 0.3201, + "step": 2713 + }, + { + "epoch": 0.048407234330967074, + "grad_norm": 0.45483145117759705, + "learning_rate": 2.4206207634677133e-05, + "loss": 0.3282, + "step": 2714 + }, + { + "epoch": 0.04842507045268077, + "grad_norm": 0.36584872007369995, + "learning_rate": 2.421512665001784e-05, + "loss": 0.2612, + "step": 2715 + }, + { + "epoch": 0.04844290657439446, + "grad_norm": 0.47576916217803955, + "learning_rate": 2.4224045665358546e-05, + "loss": 0.2965, + "step": 2716 + }, + { + "epoch": 0.04846074269610816, + "grad_norm": 0.7287359237670898, + "learning_rate": 2.4232964680699252e-05, + "loss": 0.275, + "step": 2717 + }, + { + "epoch": 0.04847857881782185, + "grad_norm": 0.5523488521575928, + "learning_rate": 2.424188369603996e-05, + "loss": 0.322, + "step": 2718 + }, + { + "epoch": 0.04849641493953555, + "grad_norm": 0.5767621397972107, + "learning_rate": 2.4250802711380665e-05, + "loss": 0.317, + "step": 2719 + }, + { + "epoch": 0.04851425106124924, + "grad_norm": 1.306100606918335, + "learning_rate": 2.425972172672137e-05, + "loss": 0.2773, + "step": 2720 + }, + { + "epoch": 0.04853208718296294, + "grad_norm": 0.38748273253440857, + "learning_rate": 2.4268640742062078e-05, + "loss": 0.2858, + "step": 2721 + }, + { + "epoch": 0.04854992330467663, + "grad_norm": 0.49541452527046204, + "learning_rate": 2.4277559757402784e-05, + "loss": 0.2528, + "step": 2722 + }, + { + "epoch": 0.04856775942639033, + "grad_norm": 0.3869876265525818, + "learning_rate": 2.428647877274349e-05, + "loss": 0.3253, + "step": 2723 + }, + { + "epoch": 0.04858559554810402, + "grad_norm": 0.6745806932449341, + "learning_rate": 2.4295397788084194e-05, + "loss": 0.3011, + "step": 2724 + }, + { + "epoch": 0.04860343166981772, + "grad_norm": 0.4834120273590088, + "learning_rate": 2.4304316803424904e-05, + "loss": 0.3062, + "step": 2725 + }, + { + "epoch": 0.04862126779153141, + "grad_norm": 0.3577546179294586, + "learning_rate": 2.431323581876561e-05, + "loss": 0.2635, + "step": 2726 + }, + { + "epoch": 0.0486391039132451, + "grad_norm": 0.6025672554969788, + "learning_rate": 2.4322154834106316e-05, + "loss": 0.3077, + "step": 2727 + }, + { + "epoch": 0.048656940034958795, + "grad_norm": 0.5578452944755554, + "learning_rate": 2.4331073849447023e-05, + "loss": 0.2975, + "step": 2728 + }, + { + "epoch": 0.04867477615667249, + "grad_norm": 0.40623703598976135, + "learning_rate": 2.433999286478773e-05, + "loss": 0.2308, + "step": 2729 + }, + { + "epoch": 0.048692612278386185, + "grad_norm": 0.40554261207580566, + "learning_rate": 2.4348911880128435e-05, + "loss": 0.267, + "step": 2730 + }, + { + "epoch": 0.04871044840009988, + "grad_norm": 0.5625680685043335, + "learning_rate": 2.4357830895469142e-05, + "loss": 0.3159, + "step": 2731 + }, + { + "epoch": 0.048728284521813575, + "grad_norm": 0.5170318484306335, + "learning_rate": 2.4366749910809848e-05, + "loss": 0.3053, + "step": 2732 + }, + { + "epoch": 0.04874612064352727, + "grad_norm": 0.41871926188468933, + "learning_rate": 2.4375668926150555e-05, + "loss": 0.2673, + "step": 2733 + }, + { + "epoch": 0.048763956765240965, + "grad_norm": 0.4423244595527649, + "learning_rate": 2.438458794149126e-05, + "loss": 0.2791, + "step": 2734 + }, + { + "epoch": 0.04878179288695466, + "grad_norm": 0.47761958837509155, + "learning_rate": 2.4393506956831967e-05, + "loss": 0.2933, + "step": 2735 + }, + { + "epoch": 0.048799629008668355, + "grad_norm": 0.5461574792861938, + "learning_rate": 2.4402425972172674e-05, + "loss": 0.3336, + "step": 2736 + }, + { + "epoch": 0.04881746513038205, + "grad_norm": 0.5050264000892639, + "learning_rate": 2.441134498751338e-05, + "loss": 0.276, + "step": 2737 + }, + { + "epoch": 0.048835301252095745, + "grad_norm": 1.313736915588379, + "learning_rate": 2.4420264002854086e-05, + "loss": 0.3089, + "step": 2738 + }, + { + "epoch": 0.04885313737380944, + "grad_norm": 0.5269498229026794, + "learning_rate": 2.4429183018194793e-05, + "loss": 0.3282, + "step": 2739 + }, + { + "epoch": 0.048870973495523135, + "grad_norm": 0.5273535847663879, + "learning_rate": 2.44381020335355e-05, + "loss": 0.3163, + "step": 2740 + }, + { + "epoch": 0.04888880961723683, + "grad_norm": 0.8047959804534912, + "learning_rate": 2.4447021048876206e-05, + "loss": 0.2968, + "step": 2741 + }, + { + "epoch": 0.048906645738950524, + "grad_norm": 0.9569438099861145, + "learning_rate": 2.4455940064216912e-05, + "loss": 0.3439, + "step": 2742 + }, + { + "epoch": 0.04892448186066422, + "grad_norm": 0.828893780708313, + "learning_rate": 2.446485907955762e-05, + "loss": 0.2937, + "step": 2743 + }, + { + "epoch": 0.048942317982377914, + "grad_norm": 0.3880298435688019, + "learning_rate": 2.4473778094898325e-05, + "loss": 0.248, + "step": 2744 + }, + { + "epoch": 0.04896015410409161, + "grad_norm": 0.5503308773040771, + "learning_rate": 2.448269711023903e-05, + "loss": 0.2936, + "step": 2745 + }, + { + "epoch": 0.048977990225805304, + "grad_norm": 0.4243747293949127, + "learning_rate": 2.4491616125579738e-05, + "loss": 0.2479, + "step": 2746 + }, + { + "epoch": 0.04899582634751899, + "grad_norm": 0.37995001673698425, + "learning_rate": 2.4500535140920444e-05, + "loss": 0.2733, + "step": 2747 + }, + { + "epoch": 0.04901366246923269, + "grad_norm": 0.4783131778240204, + "learning_rate": 2.450945415626115e-05, + "loss": 0.3016, + "step": 2748 + }, + { + "epoch": 0.04903149859094638, + "grad_norm": 0.41291674971580505, + "learning_rate": 2.4518373171601857e-05, + "loss": 0.2507, + "step": 2749 + }, + { + "epoch": 0.04904933471266008, + "grad_norm": 0.470490962266922, + "learning_rate": 2.4527292186942563e-05, + "loss": 0.3242, + "step": 2750 + }, + { + "epoch": 0.04906717083437377, + "grad_norm": 0.4082280099391937, + "learning_rate": 2.453621120228327e-05, + "loss": 0.2922, + "step": 2751 + }, + { + "epoch": 0.04908500695608747, + "grad_norm": 0.4830440580844879, + "learning_rate": 2.4545130217623976e-05, + "loss": 0.3209, + "step": 2752 + }, + { + "epoch": 0.04910284307780116, + "grad_norm": 0.4787217676639557, + "learning_rate": 2.4554049232964682e-05, + "loss": 0.2934, + "step": 2753 + }, + { + "epoch": 0.049120679199514856, + "grad_norm": 0.5571637749671936, + "learning_rate": 2.456296824830539e-05, + "loss": 0.3278, + "step": 2754 + }, + { + "epoch": 0.04913851532122855, + "grad_norm": 0.48631027340888977, + "learning_rate": 2.4571887263646095e-05, + "loss": 0.2961, + "step": 2755 + }, + { + "epoch": 0.049156351442942246, + "grad_norm": 0.3554821312427521, + "learning_rate": 2.45808062789868e-05, + "loss": 0.2561, + "step": 2756 + }, + { + "epoch": 0.04917418756465594, + "grad_norm": 0.365263432264328, + "learning_rate": 2.4589725294327508e-05, + "loss": 0.2289, + "step": 2757 + }, + { + "epoch": 0.049192023686369636, + "grad_norm": 0.5041653513908386, + "learning_rate": 2.4598644309668214e-05, + "loss": 0.3571, + "step": 2758 + }, + { + "epoch": 0.04920985980808333, + "grad_norm": 0.4902588427066803, + "learning_rate": 2.460756332500892e-05, + "loss": 0.3084, + "step": 2759 + }, + { + "epoch": 0.049227695929797026, + "grad_norm": 0.5168266296386719, + "learning_rate": 2.4616482340349627e-05, + "loss": 0.3666, + "step": 2760 + }, + { + "epoch": 0.04924553205151072, + "grad_norm": 0.44216209650039673, + "learning_rate": 2.4625401355690333e-05, + "loss": 0.2371, + "step": 2761 + }, + { + "epoch": 0.049263368173224416, + "grad_norm": 0.35730811953544617, + "learning_rate": 2.463432037103104e-05, + "loss": 0.2762, + "step": 2762 + }, + { + "epoch": 0.04928120429493811, + "grad_norm": 0.5990766882896423, + "learning_rate": 2.4643239386371746e-05, + "loss": 0.2745, + "step": 2763 + }, + { + "epoch": 0.049299040416651806, + "grad_norm": 0.43171870708465576, + "learning_rate": 2.4652158401712452e-05, + "loss": 0.2774, + "step": 2764 + }, + { + "epoch": 0.0493168765383655, + "grad_norm": 0.384461373090744, + "learning_rate": 2.466107741705316e-05, + "loss": 0.2606, + "step": 2765 + }, + { + "epoch": 0.049334712660079195, + "grad_norm": 0.6836086511611938, + "learning_rate": 2.4669996432393865e-05, + "loss": 0.2675, + "step": 2766 + }, + { + "epoch": 0.04935254878179289, + "grad_norm": 0.4748278558254242, + "learning_rate": 2.467891544773457e-05, + "loss": 0.3026, + "step": 2767 + }, + { + "epoch": 0.04937038490350658, + "grad_norm": 0.4196484386920929, + "learning_rate": 2.4687834463075278e-05, + "loss": 0.252, + "step": 2768 + }, + { + "epoch": 0.04938822102522027, + "grad_norm": 0.4558366537094116, + "learning_rate": 2.4696753478415984e-05, + "loss": 0.3075, + "step": 2769 + }, + { + "epoch": 0.04940605714693397, + "grad_norm": 0.7233859896659851, + "learning_rate": 2.470567249375669e-05, + "loss": 0.3477, + "step": 2770 + }, + { + "epoch": 0.04942389326864766, + "grad_norm": 0.6275060772895813, + "learning_rate": 2.4714591509097397e-05, + "loss": 0.3877, + "step": 2771 + }, + { + "epoch": 0.04944172939036136, + "grad_norm": 0.462270051240921, + "learning_rate": 2.4723510524438103e-05, + "loss": 0.2872, + "step": 2772 + }, + { + "epoch": 0.04945956551207505, + "grad_norm": 0.38063880801200867, + "learning_rate": 2.473242953977881e-05, + "loss": 0.286, + "step": 2773 + }, + { + "epoch": 0.04947740163378875, + "grad_norm": 0.6094436645507812, + "learning_rate": 2.4741348555119516e-05, + "loss": 0.282, + "step": 2774 + }, + { + "epoch": 0.04949523775550244, + "grad_norm": 0.43387001752853394, + "learning_rate": 2.4750267570460223e-05, + "loss": 0.2644, + "step": 2775 + }, + { + "epoch": 0.04951307387721614, + "grad_norm": 0.47578999400138855, + "learning_rate": 2.475918658580093e-05, + "loss": 0.2786, + "step": 2776 + }, + { + "epoch": 0.04953090999892983, + "grad_norm": 0.6152646541595459, + "learning_rate": 2.4768105601141635e-05, + "loss": 0.3084, + "step": 2777 + }, + { + "epoch": 0.04954874612064353, + "grad_norm": 0.7436725497245789, + "learning_rate": 2.477702461648234e-05, + "loss": 0.3556, + "step": 2778 + }, + { + "epoch": 0.04956658224235722, + "grad_norm": 0.376797616481781, + "learning_rate": 2.4785943631823048e-05, + "loss": 0.2824, + "step": 2779 + }, + { + "epoch": 0.04958441836407092, + "grad_norm": 0.5116544365882874, + "learning_rate": 2.4794862647163754e-05, + "loss": 0.3458, + "step": 2780 + }, + { + "epoch": 0.04960225448578461, + "grad_norm": 0.4267617464065552, + "learning_rate": 2.480378166250446e-05, + "loss": 0.2801, + "step": 2781 + }, + { + "epoch": 0.04962009060749831, + "grad_norm": 0.7026563286781311, + "learning_rate": 2.4812700677845167e-05, + "loss": 0.2947, + "step": 2782 + }, + { + "epoch": 0.049637926729212, + "grad_norm": 0.6572867035865784, + "learning_rate": 2.4821619693185874e-05, + "loss": 0.2629, + "step": 2783 + }, + { + "epoch": 0.0496557628509257, + "grad_norm": 0.4855692982673645, + "learning_rate": 2.483053870852658e-05, + "loss": 0.2915, + "step": 2784 + }, + { + "epoch": 0.04967359897263939, + "grad_norm": 0.34191015362739563, + "learning_rate": 2.4839457723867286e-05, + "loss": 0.2672, + "step": 2785 + }, + { + "epoch": 0.04969143509435309, + "grad_norm": 0.5398693680763245, + "learning_rate": 2.4848376739207993e-05, + "loss": 0.285, + "step": 2786 + }, + { + "epoch": 0.04970927121606678, + "grad_norm": 0.5253065824508667, + "learning_rate": 2.48572957545487e-05, + "loss": 0.3257, + "step": 2787 + }, + { + "epoch": 0.04972710733778047, + "grad_norm": 0.4885236322879791, + "learning_rate": 2.4866214769889405e-05, + "loss": 0.3412, + "step": 2788 + }, + { + "epoch": 0.049744943459494165, + "grad_norm": 0.5017727017402649, + "learning_rate": 2.4875133785230112e-05, + "loss": 0.3061, + "step": 2789 + }, + { + "epoch": 0.04976277958120786, + "grad_norm": 0.5432559251785278, + "learning_rate": 2.4884052800570818e-05, + "loss": 0.3005, + "step": 2790 + }, + { + "epoch": 0.049780615702921555, + "grad_norm": 0.39586201310157776, + "learning_rate": 2.4892971815911525e-05, + "loss": 0.3135, + "step": 2791 + }, + { + "epoch": 0.04979845182463525, + "grad_norm": 0.4748157262802124, + "learning_rate": 2.490189083125223e-05, + "loss": 0.2895, + "step": 2792 + }, + { + "epoch": 0.049816287946348944, + "grad_norm": 0.4283110201358795, + "learning_rate": 2.4910809846592937e-05, + "loss": 0.3083, + "step": 2793 + }, + { + "epoch": 0.04983412406806264, + "grad_norm": 0.427421897649765, + "learning_rate": 2.4919728861933644e-05, + "loss": 0.291, + "step": 2794 + }, + { + "epoch": 0.049851960189776334, + "grad_norm": 0.5595164895057678, + "learning_rate": 2.492864787727435e-05, + "loss": 0.3381, + "step": 2795 + }, + { + "epoch": 0.04986979631149003, + "grad_norm": 0.36938780546188354, + "learning_rate": 2.4937566892615057e-05, + "loss": 0.2386, + "step": 2796 + }, + { + "epoch": 0.049887632433203724, + "grad_norm": 0.495216965675354, + "learning_rate": 2.4946485907955763e-05, + "loss": 0.2872, + "step": 2797 + }, + { + "epoch": 0.04990546855491742, + "grad_norm": 0.5744511485099792, + "learning_rate": 2.495540492329647e-05, + "loss": 0.2605, + "step": 2798 + }, + { + "epoch": 0.049923304676631114, + "grad_norm": 0.4692898392677307, + "learning_rate": 2.4964323938637176e-05, + "loss": 0.2574, + "step": 2799 + }, + { + "epoch": 0.04994114079834481, + "grad_norm": 0.6649041771888733, + "learning_rate": 2.4973242953977882e-05, + "loss": 0.3636, + "step": 2800 + }, + { + "epoch": 0.049958976920058504, + "grad_norm": 0.4850791394710541, + "learning_rate": 2.498216196931859e-05, + "loss": 0.3021, + "step": 2801 + }, + { + "epoch": 0.0499768130417722, + "grad_norm": 0.3736015260219574, + "learning_rate": 2.4991080984659295e-05, + "loss": 0.2793, + "step": 2802 + }, + { + "epoch": 0.049994649163485894, + "grad_norm": 0.45169389247894287, + "learning_rate": 2.5e-05, + "loss": 0.2555, + "step": 2803 + }, + { + "epoch": 0.05001248528519959, + "grad_norm": 0.40781697630882263, + "learning_rate": 2.500891901534071e-05, + "loss": 0.3162, + "step": 2804 + }, + { + "epoch": 0.05003032140691328, + "grad_norm": 0.48869258165359497, + "learning_rate": 2.501783803068141e-05, + "loss": 0.341, + "step": 2805 + }, + { + "epoch": 0.05004815752862698, + "grad_norm": 0.4770379960536957, + "learning_rate": 2.502675704602212e-05, + "loss": 0.2912, + "step": 2806 + }, + { + "epoch": 0.05006599365034067, + "grad_norm": 0.5151196122169495, + "learning_rate": 2.5035676061362827e-05, + "loss": 0.2994, + "step": 2807 + }, + { + "epoch": 0.05008382977205436, + "grad_norm": 0.9012157320976257, + "learning_rate": 2.504459507670353e-05, + "loss": 0.2494, + "step": 2808 + }, + { + "epoch": 0.050101665893768056, + "grad_norm": 0.48816928267478943, + "learning_rate": 2.505351409204424e-05, + "loss": 0.335, + "step": 2809 + }, + { + "epoch": 0.05011950201548175, + "grad_norm": 0.478715181350708, + "learning_rate": 2.5062433107384946e-05, + "loss": 0.2973, + "step": 2810 + }, + { + "epoch": 0.050137338137195446, + "grad_norm": 0.5877270102500916, + "learning_rate": 2.507135212272565e-05, + "loss": 0.4271, + "step": 2811 + }, + { + "epoch": 0.05015517425890914, + "grad_norm": 0.39555367827415466, + "learning_rate": 2.508027113806636e-05, + "loss": 0.2391, + "step": 2812 + }, + { + "epoch": 0.050173010380622836, + "grad_norm": 0.3225173354148865, + "learning_rate": 2.5089190153407065e-05, + "loss": 0.2875, + "step": 2813 + }, + { + "epoch": 0.05019084650233653, + "grad_norm": 0.5344067811965942, + "learning_rate": 2.5098109168747768e-05, + "loss": 0.3146, + "step": 2814 + }, + { + "epoch": 0.050208682624050226, + "grad_norm": 0.4604073464870453, + "learning_rate": 2.5107028184088478e-05, + "loss": 0.2374, + "step": 2815 + }, + { + "epoch": 0.05022651874576392, + "grad_norm": 0.5769338011741638, + "learning_rate": 2.5115947199429184e-05, + "loss": 0.3054, + "step": 2816 + }, + { + "epoch": 0.050244354867477616, + "grad_norm": 0.519643247127533, + "learning_rate": 2.5124866214769894e-05, + "loss": 0.3037, + "step": 2817 + }, + { + "epoch": 0.05026219098919131, + "grad_norm": 0.3918305039405823, + "learning_rate": 2.5133785230110597e-05, + "loss": 0.287, + "step": 2818 + }, + { + "epoch": 0.050280027110905005, + "grad_norm": 0.8546401262283325, + "learning_rate": 2.5142704245451303e-05, + "loss": 0.3325, + "step": 2819 + }, + { + "epoch": 0.0502978632326187, + "grad_norm": 0.47385314106941223, + "learning_rate": 2.5151623260792013e-05, + "loss": 0.2745, + "step": 2820 + }, + { + "epoch": 0.050315699354332395, + "grad_norm": 0.426802396774292, + "learning_rate": 2.5160542276132716e-05, + "loss": 0.3601, + "step": 2821 + }, + { + "epoch": 0.05033353547604609, + "grad_norm": 0.5062058568000793, + "learning_rate": 2.5169461291473422e-05, + "loss": 0.2857, + "step": 2822 + }, + { + "epoch": 0.050351371597759785, + "grad_norm": 0.48094964027404785, + "learning_rate": 2.5178380306814132e-05, + "loss": 0.2974, + "step": 2823 + }, + { + "epoch": 0.05036920771947348, + "grad_norm": 0.4357699751853943, + "learning_rate": 2.5187299322154835e-05, + "loss": 0.298, + "step": 2824 + }, + { + "epoch": 0.050387043841187175, + "grad_norm": 0.5114694833755493, + "learning_rate": 2.519621833749554e-05, + "loss": 0.3536, + "step": 2825 + }, + { + "epoch": 0.05040487996290087, + "grad_norm": 0.4184374213218689, + "learning_rate": 2.520513735283625e-05, + "loss": 0.2515, + "step": 2826 + }, + { + "epoch": 0.050422716084614565, + "grad_norm": 0.492558091878891, + "learning_rate": 2.521405636817695e-05, + "loss": 0.3362, + "step": 2827 + }, + { + "epoch": 0.05044055220632825, + "grad_norm": 0.5285553932189941, + "learning_rate": 2.522297538351766e-05, + "loss": 0.2891, + "step": 2828 + }, + { + "epoch": 0.05045838832804195, + "grad_norm": 0.42458224296569824, + "learning_rate": 2.523189439885837e-05, + "loss": 0.2751, + "step": 2829 + }, + { + "epoch": 0.05047622444975564, + "grad_norm": 0.4313815236091614, + "learning_rate": 2.524081341419907e-05, + "loss": 0.3602, + "step": 2830 + }, + { + "epoch": 0.05049406057146934, + "grad_norm": 0.5407450199127197, + "learning_rate": 2.524973242953978e-05, + "loss": 0.3251, + "step": 2831 + }, + { + "epoch": 0.05051189669318303, + "grad_norm": 0.412727028131485, + "learning_rate": 2.5258651444880486e-05, + "loss": 0.2634, + "step": 2832 + }, + { + "epoch": 0.05052973281489673, + "grad_norm": 0.2968381345272064, + "learning_rate": 2.526757046022119e-05, + "loss": 0.2663, + "step": 2833 + }, + { + "epoch": 0.05054756893661042, + "grad_norm": 0.6257922053337097, + "learning_rate": 2.52764894755619e-05, + "loss": 0.3231, + "step": 2834 + }, + { + "epoch": 0.05056540505832412, + "grad_norm": 0.5430404543876648, + "learning_rate": 2.5285408490902605e-05, + "loss": 0.3239, + "step": 2835 + }, + { + "epoch": 0.05058324118003781, + "grad_norm": 0.4025420546531677, + "learning_rate": 2.529432750624331e-05, + "loss": 0.2496, + "step": 2836 + }, + { + "epoch": 0.05060107730175151, + "grad_norm": 0.49791595339775085, + "learning_rate": 2.5303246521584018e-05, + "loss": 0.2949, + "step": 2837 + }, + { + "epoch": 0.0506189134234652, + "grad_norm": 0.5615082383155823, + "learning_rate": 2.5312165536924725e-05, + "loss": 0.3253, + "step": 2838 + }, + { + "epoch": 0.0506367495451789, + "grad_norm": 0.4231736958026886, + "learning_rate": 2.5321084552265434e-05, + "loss": 0.2426, + "step": 2839 + }, + { + "epoch": 0.05065458566689259, + "grad_norm": 0.3824504017829895, + "learning_rate": 2.5330003567606137e-05, + "loss": 0.2586, + "step": 2840 + }, + { + "epoch": 0.05067242178860629, + "grad_norm": 0.4817415475845337, + "learning_rate": 2.5338922582946844e-05, + "loss": 0.2663, + "step": 2841 + }, + { + "epoch": 0.05069025791031998, + "grad_norm": 0.45310771465301514, + "learning_rate": 2.5347841598287553e-05, + "loss": 0.2725, + "step": 2842 + }, + { + "epoch": 0.050708094032033676, + "grad_norm": 0.6365208625793457, + "learning_rate": 2.5356760613628256e-05, + "loss": 0.297, + "step": 2843 + }, + { + "epoch": 0.05072593015374737, + "grad_norm": 0.4242004454135895, + "learning_rate": 2.5365679628968963e-05, + "loss": 0.2659, + "step": 2844 + }, + { + "epoch": 0.050743766275461066, + "grad_norm": 0.7390111684799194, + "learning_rate": 2.5374598644309673e-05, + "loss": 0.3009, + "step": 2845 + }, + { + "epoch": 0.05076160239717476, + "grad_norm": 0.38651764392852783, + "learning_rate": 2.5383517659650376e-05, + "loss": 0.2754, + "step": 2846 + }, + { + "epoch": 0.050779438518888456, + "grad_norm": 0.4260665774345398, + "learning_rate": 2.5392436674991082e-05, + "loss": 0.3592, + "step": 2847 + }, + { + "epoch": 0.050797274640602144, + "grad_norm": 0.45752131938934326, + "learning_rate": 2.5401355690331792e-05, + "loss": 0.2646, + "step": 2848 + }, + { + "epoch": 0.05081511076231584, + "grad_norm": 0.4391932189464569, + "learning_rate": 2.5410274705672495e-05, + "loss": 0.3309, + "step": 2849 + }, + { + "epoch": 0.050832946884029534, + "grad_norm": 0.4140661060810089, + "learning_rate": 2.54191937210132e-05, + "loss": 0.2656, + "step": 2850 + }, + { + "epoch": 0.05085078300574323, + "grad_norm": 0.4588901400566101, + "learning_rate": 2.542811273635391e-05, + "loss": 0.3797, + "step": 2851 + }, + { + "epoch": 0.050868619127456924, + "grad_norm": 0.5754001140594482, + "learning_rate": 2.5437031751694614e-05, + "loss": 0.2538, + "step": 2852 + }, + { + "epoch": 0.05088645524917062, + "grad_norm": 0.37646663188934326, + "learning_rate": 2.544595076703532e-05, + "loss": 0.2321, + "step": 2853 + }, + { + "epoch": 0.050904291370884314, + "grad_norm": 0.5341233015060425, + "learning_rate": 2.545486978237603e-05, + "loss": 0.2833, + "step": 2854 + }, + { + "epoch": 0.05092212749259801, + "grad_norm": 0.44105154275894165, + "learning_rate": 2.546378879771673e-05, + "loss": 0.2872, + "step": 2855 + }, + { + "epoch": 0.050939963614311703, + "grad_norm": 0.4411798119544983, + "learning_rate": 2.547270781305744e-05, + "loss": 0.3343, + "step": 2856 + }, + { + "epoch": 0.0509577997360254, + "grad_norm": 0.4009215533733368, + "learning_rate": 2.5481626828398146e-05, + "loss": 0.2879, + "step": 2857 + }, + { + "epoch": 0.05097563585773909, + "grad_norm": 0.43954575061798096, + "learning_rate": 2.549054584373885e-05, + "loss": 0.3138, + "step": 2858 + }, + { + "epoch": 0.05099347197945279, + "grad_norm": 0.425375759601593, + "learning_rate": 2.549946485907956e-05, + "loss": 0.31, + "step": 2859 + }, + { + "epoch": 0.05101130810116648, + "grad_norm": 0.3734534680843353, + "learning_rate": 2.5508383874420265e-05, + "loss": 0.2811, + "step": 2860 + }, + { + "epoch": 0.05102914422288018, + "grad_norm": 0.449495792388916, + "learning_rate": 2.5517302889760968e-05, + "loss": 0.2842, + "step": 2861 + }, + { + "epoch": 0.05104698034459387, + "grad_norm": 0.415831595659256, + "learning_rate": 2.5526221905101678e-05, + "loss": 0.3001, + "step": 2862 + }, + { + "epoch": 0.05106481646630757, + "grad_norm": 0.522121787071228, + "learning_rate": 2.5535140920442384e-05, + "loss": 0.2817, + "step": 2863 + }, + { + "epoch": 0.05108265258802126, + "grad_norm": 2.2089128494262695, + "learning_rate": 2.5544059935783094e-05, + "loss": 0.2395, + "step": 2864 + }, + { + "epoch": 0.05110048870973496, + "grad_norm": 0.41909059882164, + "learning_rate": 2.5552978951123797e-05, + "loss": 0.3211, + "step": 2865 + }, + { + "epoch": 0.05111832483144865, + "grad_norm": 0.377627968788147, + "learning_rate": 2.5561897966464503e-05, + "loss": 0.2508, + "step": 2866 + }, + { + "epoch": 0.05113616095316235, + "grad_norm": 0.4478866755962372, + "learning_rate": 2.5570816981805213e-05, + "loss": 0.3244, + "step": 2867 + }, + { + "epoch": 0.051153997074876036, + "grad_norm": 0.47881847620010376, + "learning_rate": 2.5579735997145916e-05, + "loss": 0.2786, + "step": 2868 + }, + { + "epoch": 0.05117183319658973, + "grad_norm": 0.5202912092208862, + "learning_rate": 2.5588655012486622e-05, + "loss": 0.3574, + "step": 2869 + }, + { + "epoch": 0.051189669318303425, + "grad_norm": 0.7261667847633362, + "learning_rate": 2.5597574027827332e-05, + "loss": 0.3479, + "step": 2870 + }, + { + "epoch": 0.05120750544001712, + "grad_norm": 0.5155419111251831, + "learning_rate": 2.5606493043168035e-05, + "loss": 0.2844, + "step": 2871 + }, + { + "epoch": 0.051225341561730815, + "grad_norm": 0.5000239610671997, + "learning_rate": 2.561541205850874e-05, + "loss": 0.2889, + "step": 2872 + }, + { + "epoch": 0.05124317768344451, + "grad_norm": 0.5503202080726624, + "learning_rate": 2.562433107384945e-05, + "loss": 0.2718, + "step": 2873 + }, + { + "epoch": 0.051261013805158205, + "grad_norm": 0.4166640043258667, + "learning_rate": 2.5633250089190154e-05, + "loss": 0.2782, + "step": 2874 + }, + { + "epoch": 0.0512788499268719, + "grad_norm": 0.4286460280418396, + "learning_rate": 2.564216910453086e-05, + "loss": 0.333, + "step": 2875 + }, + { + "epoch": 0.051296686048585595, + "grad_norm": 0.3176979124546051, + "learning_rate": 2.565108811987157e-05, + "loss": 0.2494, + "step": 2876 + }, + { + "epoch": 0.05131452217029929, + "grad_norm": 0.3552425503730774, + "learning_rate": 2.5660007135212273e-05, + "loss": 0.2382, + "step": 2877 + }, + { + "epoch": 0.051332358292012985, + "grad_norm": 0.5758682489395142, + "learning_rate": 2.566892615055298e-05, + "loss": 0.3464, + "step": 2878 + }, + { + "epoch": 0.05135019441372668, + "grad_norm": 0.3782997131347656, + "learning_rate": 2.567784516589369e-05, + "loss": 0.2879, + "step": 2879 + }, + { + "epoch": 0.051368030535440375, + "grad_norm": 0.5711967349052429, + "learning_rate": 2.568676418123439e-05, + "loss": 0.2249, + "step": 2880 + }, + { + "epoch": 0.05138586665715407, + "grad_norm": 0.48385098576545715, + "learning_rate": 2.56956831965751e-05, + "loss": 0.3048, + "step": 2881 + }, + { + "epoch": 0.051403702778867764, + "grad_norm": 0.4495086371898651, + "learning_rate": 2.570460221191581e-05, + "loss": 0.2982, + "step": 2882 + }, + { + "epoch": 0.05142153890058146, + "grad_norm": 0.49659374356269836, + "learning_rate": 2.5713521227256508e-05, + "loss": 0.3092, + "step": 2883 + }, + { + "epoch": 0.051439375022295154, + "grad_norm": 0.3788304626941681, + "learning_rate": 2.5722440242597218e-05, + "loss": 0.2807, + "step": 2884 + }, + { + "epoch": 0.05145721114400885, + "grad_norm": 0.4766710102558136, + "learning_rate": 2.5731359257937924e-05, + "loss": 0.3716, + "step": 2885 + }, + { + "epoch": 0.051475047265722544, + "grad_norm": 0.4333093464374542, + "learning_rate": 2.5740278273278627e-05, + "loss": 0.2942, + "step": 2886 + }, + { + "epoch": 0.05149288338743624, + "grad_norm": 0.5158129930496216, + "learning_rate": 2.5749197288619337e-05, + "loss": 0.286, + "step": 2887 + }, + { + "epoch": 0.05151071950914993, + "grad_norm": 0.9444575309753418, + "learning_rate": 2.5758116303960044e-05, + "loss": 0.3562, + "step": 2888 + }, + { + "epoch": 0.05152855563086362, + "grad_norm": 0.592723548412323, + "learning_rate": 2.5767035319300753e-05, + "loss": 0.3498, + "step": 2889 + }, + { + "epoch": 0.05154639175257732, + "grad_norm": 0.4227851629257202, + "learning_rate": 2.5775954334641456e-05, + "loss": 0.3013, + "step": 2890 + }, + { + "epoch": 0.05156422787429101, + "grad_norm": 0.4196017384529114, + "learning_rate": 2.5784873349982163e-05, + "loss": 0.2958, + "step": 2891 + }, + { + "epoch": 0.05158206399600471, + "grad_norm": 0.496559739112854, + "learning_rate": 2.5793792365322872e-05, + "loss": 0.3345, + "step": 2892 + }, + { + "epoch": 0.0515999001177184, + "grad_norm": 0.5769611597061157, + "learning_rate": 2.5802711380663575e-05, + "loss": 0.3352, + "step": 2893 + }, + { + "epoch": 0.0516177362394321, + "grad_norm": 0.49038267135620117, + "learning_rate": 2.5811630396004282e-05, + "loss": 0.2857, + "step": 2894 + }, + { + "epoch": 0.05163557236114579, + "grad_norm": 0.3836861252784729, + "learning_rate": 2.582054941134499e-05, + "loss": 0.2732, + "step": 2895 + }, + { + "epoch": 0.051653408482859486, + "grad_norm": 0.531437337398529, + "learning_rate": 2.5829468426685695e-05, + "loss": 0.3088, + "step": 2896 + }, + { + "epoch": 0.05167124460457318, + "grad_norm": 0.927949845790863, + "learning_rate": 2.58383874420264e-05, + "loss": 0.367, + "step": 2897 + }, + { + "epoch": 0.051689080726286876, + "grad_norm": 0.44664403796195984, + "learning_rate": 2.584730645736711e-05, + "loss": 0.3421, + "step": 2898 + }, + { + "epoch": 0.05170691684800057, + "grad_norm": 0.5556526780128479, + "learning_rate": 2.5856225472707814e-05, + "loss": 0.2948, + "step": 2899 + }, + { + "epoch": 0.051724752969714266, + "grad_norm": 0.49114635586738586, + "learning_rate": 2.586514448804852e-05, + "loss": 0.2775, + "step": 2900 + }, + { + "epoch": 0.05174258909142796, + "grad_norm": 0.4353705048561096, + "learning_rate": 2.587406350338923e-05, + "loss": 0.2956, + "step": 2901 + }, + { + "epoch": 0.051760425213141656, + "grad_norm": 0.6163909435272217, + "learning_rate": 2.5882982518729933e-05, + "loss": 0.4008, + "step": 2902 + }, + { + "epoch": 0.05177826133485535, + "grad_norm": 0.483247846364975, + "learning_rate": 2.589190153407064e-05, + "loss": 0.2734, + "step": 2903 + }, + { + "epoch": 0.051796097456569046, + "grad_norm": 0.49507302045822144, + "learning_rate": 2.590082054941135e-05, + "loss": 0.3181, + "step": 2904 + }, + { + "epoch": 0.05181393357828274, + "grad_norm": 0.4445972144603729, + "learning_rate": 2.590973956475205e-05, + "loss": 0.3219, + "step": 2905 + }, + { + "epoch": 0.051831769699996436, + "grad_norm": 0.4449050724506378, + "learning_rate": 2.591865858009276e-05, + "loss": 0.283, + "step": 2906 + }, + { + "epoch": 0.05184960582171013, + "grad_norm": 0.390539288520813, + "learning_rate": 2.5927577595433468e-05, + "loss": 0.2845, + "step": 2907 + }, + { + "epoch": 0.05186744194342382, + "grad_norm": 0.43584051728248596, + "learning_rate": 2.5936496610774168e-05, + "loss": 0.3225, + "step": 2908 + }, + { + "epoch": 0.05188527806513751, + "grad_norm": 0.35692664980888367, + "learning_rate": 2.5945415626114878e-05, + "loss": 0.3023, + "step": 2909 + }, + { + "epoch": 0.05190311418685121, + "grad_norm": 0.3913831114768982, + "learning_rate": 2.5954334641455584e-05, + "loss": 0.21, + "step": 2910 + }, + { + "epoch": 0.0519209503085649, + "grad_norm": 0.5844758152961731, + "learning_rate": 2.5963253656796294e-05, + "loss": 0.2907, + "step": 2911 + }, + { + "epoch": 0.0519387864302786, + "grad_norm": 0.46606016159057617, + "learning_rate": 2.5972172672136997e-05, + "loss": 0.2727, + "step": 2912 + }, + { + "epoch": 0.05195662255199229, + "grad_norm": 0.4373425543308258, + "learning_rate": 2.5981091687477703e-05, + "loss": 0.3108, + "step": 2913 + }, + { + "epoch": 0.05197445867370599, + "grad_norm": 0.3102944493293762, + "learning_rate": 2.5990010702818413e-05, + "loss": 0.2366, + "step": 2914 + }, + { + "epoch": 0.05199229479541968, + "grad_norm": 0.5374158024787903, + "learning_rate": 2.5998929718159116e-05, + "loss": 0.3073, + "step": 2915 + }, + { + "epoch": 0.05201013091713338, + "grad_norm": 0.4365256428718567, + "learning_rate": 2.6007848733499822e-05, + "loss": 0.2712, + "step": 2916 + }, + { + "epoch": 0.05202796703884707, + "grad_norm": 0.5548803806304932, + "learning_rate": 2.6016767748840532e-05, + "loss": 0.375, + "step": 2917 + }, + { + "epoch": 0.05204580316056077, + "grad_norm": 0.5051923990249634, + "learning_rate": 2.6025686764181235e-05, + "loss": 0.3929, + "step": 2918 + }, + { + "epoch": 0.05206363928227446, + "grad_norm": 0.45372942090034485, + "learning_rate": 2.603460577952194e-05, + "loss": 0.28, + "step": 2919 + }, + { + "epoch": 0.05208147540398816, + "grad_norm": 0.5156953930854797, + "learning_rate": 2.604352479486265e-05, + "loss": 0.2965, + "step": 2920 + }, + { + "epoch": 0.05209931152570185, + "grad_norm": 1.1953109502792358, + "learning_rate": 2.6052443810203354e-05, + "loss": 0.3607, + "step": 2921 + }, + { + "epoch": 0.05211714764741555, + "grad_norm": 0.3817157745361328, + "learning_rate": 2.606136282554406e-05, + "loss": 0.288, + "step": 2922 + }, + { + "epoch": 0.05213498376912924, + "grad_norm": 0.46914711594581604, + "learning_rate": 2.607028184088477e-05, + "loss": 0.2943, + "step": 2923 + }, + { + "epoch": 0.05215281989084294, + "grad_norm": 0.485501766204834, + "learning_rate": 2.6079200856225473e-05, + "loss": 0.3004, + "step": 2924 + }, + { + "epoch": 0.05217065601255663, + "grad_norm": 0.44770991802215576, + "learning_rate": 2.608811987156618e-05, + "loss": 0.2327, + "step": 2925 + }, + { + "epoch": 0.05218849213427033, + "grad_norm": 0.3211217522621155, + "learning_rate": 2.609703888690689e-05, + "loss": 0.2568, + "step": 2926 + }, + { + "epoch": 0.05220632825598402, + "grad_norm": 0.39864063262939453, + "learning_rate": 2.6105957902247592e-05, + "loss": 0.2804, + "step": 2927 + }, + { + "epoch": 0.05222416437769772, + "grad_norm": 0.49669742584228516, + "learning_rate": 2.61148769175883e-05, + "loss": 0.3116, + "step": 2928 + }, + { + "epoch": 0.052242000499411405, + "grad_norm": 0.5273682475090027, + "learning_rate": 2.612379593292901e-05, + "loss": 0.291, + "step": 2929 + }, + { + "epoch": 0.0522598366211251, + "grad_norm": 0.39144521951675415, + "learning_rate": 2.6132714948269708e-05, + "loss": 0.301, + "step": 2930 + }, + { + "epoch": 0.052277672742838795, + "grad_norm": 0.402148962020874, + "learning_rate": 2.6141633963610418e-05, + "loss": 0.2788, + "step": 2931 + }, + { + "epoch": 0.05229550886455249, + "grad_norm": 0.38230326771736145, + "learning_rate": 2.6150552978951128e-05, + "loss": 0.3084, + "step": 2932 + }, + { + "epoch": 0.052313344986266185, + "grad_norm": 0.4424063563346863, + "learning_rate": 2.6159471994291827e-05, + "loss": 0.2951, + "step": 2933 + }, + { + "epoch": 0.05233118110797988, + "grad_norm": 0.4585544168949127, + "learning_rate": 2.6168391009632537e-05, + "loss": 0.3357, + "step": 2934 + }, + { + "epoch": 0.052349017229693574, + "grad_norm": 0.4543989598751068, + "learning_rate": 2.6177310024973243e-05, + "loss": 0.2956, + "step": 2935 + }, + { + "epoch": 0.05236685335140727, + "grad_norm": 0.39039894938468933, + "learning_rate": 2.6186229040313953e-05, + "loss": 0.2764, + "step": 2936 + }, + { + "epoch": 0.052384689473120964, + "grad_norm": 0.3535137474536896, + "learning_rate": 2.6195148055654656e-05, + "loss": 0.2503, + "step": 2937 + }, + { + "epoch": 0.05240252559483466, + "grad_norm": 0.5466296076774597, + "learning_rate": 2.6204067070995363e-05, + "loss": 0.3026, + "step": 2938 + }, + { + "epoch": 0.052420361716548354, + "grad_norm": 0.47230756282806396, + "learning_rate": 2.6212986086336072e-05, + "loss": 0.3632, + "step": 2939 + }, + { + "epoch": 0.05243819783826205, + "grad_norm": 0.3603994846343994, + "learning_rate": 2.6221905101676775e-05, + "loss": 0.289, + "step": 2940 + }, + { + "epoch": 0.052456033959975744, + "grad_norm": 0.5505605340003967, + "learning_rate": 2.623082411701748e-05, + "loss": 0.3249, + "step": 2941 + }, + { + "epoch": 0.05247387008168944, + "grad_norm": 0.4415774345397949, + "learning_rate": 2.623974313235819e-05, + "loss": 0.3526, + "step": 2942 + }, + { + "epoch": 0.052491706203403134, + "grad_norm": 0.4135698676109314, + "learning_rate": 2.6248662147698894e-05, + "loss": 0.253, + "step": 2943 + }, + { + "epoch": 0.05250954232511683, + "grad_norm": 0.5751461982727051, + "learning_rate": 2.62575811630396e-05, + "loss": 0.3269, + "step": 2944 + }, + { + "epoch": 0.052527378446830524, + "grad_norm": 0.4263665974140167, + "learning_rate": 2.626650017838031e-05, + "loss": 0.3046, + "step": 2945 + }, + { + "epoch": 0.05254521456854422, + "grad_norm": 0.5448289513587952, + "learning_rate": 2.6275419193721014e-05, + "loss": 0.3038, + "step": 2946 + }, + { + "epoch": 0.05256305069025791, + "grad_norm": 0.40797874331474304, + "learning_rate": 2.628433820906172e-05, + "loss": 0.2939, + "step": 2947 + }, + { + "epoch": 0.05258088681197161, + "grad_norm": 0.5384146571159363, + "learning_rate": 2.629325722440243e-05, + "loss": 0.2963, + "step": 2948 + }, + { + "epoch": 0.052598722933685296, + "grad_norm": 0.37856200337409973, + "learning_rate": 2.6302176239743133e-05, + "loss": 0.254, + "step": 2949 + }, + { + "epoch": 0.05261655905539899, + "grad_norm": 0.43756726384162903, + "learning_rate": 2.631109525508384e-05, + "loss": 0.3324, + "step": 2950 + }, + { + "epoch": 0.052634395177112686, + "grad_norm": 1.7020657062530518, + "learning_rate": 2.632001427042455e-05, + "loss": 0.387, + "step": 2951 + }, + { + "epoch": 0.05265223129882638, + "grad_norm": 0.4374551475048065, + "learning_rate": 2.6328933285765252e-05, + "loss": 0.3146, + "step": 2952 + }, + { + "epoch": 0.052670067420540076, + "grad_norm": 0.3677287995815277, + "learning_rate": 2.6337852301105958e-05, + "loss": 0.2629, + "step": 2953 + }, + { + "epoch": 0.05268790354225377, + "grad_norm": 0.4279806315898895, + "learning_rate": 2.6346771316446668e-05, + "loss": 0.2785, + "step": 2954 + }, + { + "epoch": 0.052705739663967466, + "grad_norm": 0.4172787368297577, + "learning_rate": 2.635569033178737e-05, + "loss": 0.3172, + "step": 2955 + }, + { + "epoch": 0.05272357578568116, + "grad_norm": 0.7966938018798828, + "learning_rate": 2.6364609347128077e-05, + "loss": 0.2457, + "step": 2956 + }, + { + "epoch": 0.052741411907394856, + "grad_norm": 0.5118768215179443, + "learning_rate": 2.6373528362468787e-05, + "loss": 0.2471, + "step": 2957 + }, + { + "epoch": 0.05275924802910855, + "grad_norm": 0.41275715827941895, + "learning_rate": 2.6382447377809494e-05, + "loss": 0.2778, + "step": 2958 + }, + { + "epoch": 0.052777084150822245, + "grad_norm": 0.5184250473976135, + "learning_rate": 2.6391366393150197e-05, + "loss": 0.361, + "step": 2959 + }, + { + "epoch": 0.05279492027253594, + "grad_norm": 0.601280689239502, + "learning_rate": 2.6400285408490903e-05, + "loss": 0.2862, + "step": 2960 + }, + { + "epoch": 0.052812756394249635, + "grad_norm": 0.4165460467338562, + "learning_rate": 2.6409204423831613e-05, + "loss": 0.2429, + "step": 2961 + }, + { + "epoch": 0.05283059251596333, + "grad_norm": 0.48465052247047424, + "learning_rate": 2.6418123439172316e-05, + "loss": 0.2706, + "step": 2962 + }, + { + "epoch": 0.052848428637677025, + "grad_norm": 0.33272939920425415, + "learning_rate": 2.6427042454513022e-05, + "loss": 0.2427, + "step": 2963 + }, + { + "epoch": 0.05286626475939072, + "grad_norm": 0.4371368885040283, + "learning_rate": 2.6435961469853732e-05, + "loss": 0.2818, + "step": 2964 + }, + { + "epoch": 0.052884100881104415, + "grad_norm": 0.3477995693683624, + "learning_rate": 2.6444880485194435e-05, + "loss": 0.273, + "step": 2965 + }, + { + "epoch": 0.05290193700281811, + "grad_norm": 0.41649848222732544, + "learning_rate": 2.645379950053514e-05, + "loss": 0.2682, + "step": 2966 + }, + { + "epoch": 0.052919773124531805, + "grad_norm": 0.45629921555519104, + "learning_rate": 2.646271851587585e-05, + "loss": 0.2497, + "step": 2967 + }, + { + "epoch": 0.0529376092462455, + "grad_norm": 0.3903038501739502, + "learning_rate": 2.6471637531216554e-05, + "loss": 0.3093, + "step": 2968 + }, + { + "epoch": 0.05295544536795919, + "grad_norm": 0.3767849802970886, + "learning_rate": 2.648055654655726e-05, + "loss": 0.2813, + "step": 2969 + }, + { + "epoch": 0.05297328148967288, + "grad_norm": 0.6601611971855164, + "learning_rate": 2.648947556189797e-05, + "loss": 0.2368, + "step": 2970 + }, + { + "epoch": 0.05299111761138658, + "grad_norm": 0.399946391582489, + "learning_rate": 2.6498394577238673e-05, + "loss": 0.2905, + "step": 2971 + }, + { + "epoch": 0.05300895373310027, + "grad_norm": 0.37308692932128906, + "learning_rate": 2.650731359257938e-05, + "loss": 0.2735, + "step": 2972 + }, + { + "epoch": 0.05302678985481397, + "grad_norm": 0.4638592302799225, + "learning_rate": 2.651623260792009e-05, + "loss": 0.3048, + "step": 2973 + }, + { + "epoch": 0.05304462597652766, + "grad_norm": 0.4922824203968048, + "learning_rate": 2.6525151623260792e-05, + "loss": 0.3118, + "step": 2974 + }, + { + "epoch": 0.05306246209824136, + "grad_norm": 0.46285897493362427, + "learning_rate": 2.65340706386015e-05, + "loss": 0.3461, + "step": 2975 + }, + { + "epoch": 0.05308029821995505, + "grad_norm": 0.6495586037635803, + "learning_rate": 2.654298965394221e-05, + "loss": 0.2968, + "step": 2976 + }, + { + "epoch": 0.05309813434166875, + "grad_norm": 0.335318922996521, + "learning_rate": 2.655190866928291e-05, + "loss": 0.2448, + "step": 2977 + }, + { + "epoch": 0.05311597046338244, + "grad_norm": 0.45703160762786865, + "learning_rate": 2.6560827684623618e-05, + "loss": 0.2687, + "step": 2978 + }, + { + "epoch": 0.05313380658509614, + "grad_norm": 0.411228746175766, + "learning_rate": 2.6569746699964328e-05, + "loss": 0.2964, + "step": 2979 + }, + { + "epoch": 0.05315164270680983, + "grad_norm": 0.32256802916526794, + "learning_rate": 2.657866571530503e-05, + "loss": 0.2477, + "step": 2980 + }, + { + "epoch": 0.05316947882852353, + "grad_norm": 0.3313572108745575, + "learning_rate": 2.6587584730645737e-05, + "loss": 0.2502, + "step": 2981 + }, + { + "epoch": 0.05318731495023722, + "grad_norm": 0.3550911843776703, + "learning_rate": 2.6596503745986447e-05, + "loss": 0.2223, + "step": 2982 + }, + { + "epoch": 0.05320515107195092, + "grad_norm": 0.41639426350593567, + "learning_rate": 2.6605422761327153e-05, + "loss": 0.2759, + "step": 2983 + }, + { + "epoch": 0.05322298719366461, + "grad_norm": 0.4413129687309265, + "learning_rate": 2.6614341776667856e-05, + "loss": 0.3204, + "step": 2984 + }, + { + "epoch": 0.053240823315378306, + "grad_norm": 0.3897430896759033, + "learning_rate": 2.6623260792008566e-05, + "loss": 0.2577, + "step": 2985 + }, + { + "epoch": 0.053258659437092, + "grad_norm": 0.4310801923274994, + "learning_rate": 2.6632179807349272e-05, + "loss": 0.2786, + "step": 2986 + }, + { + "epoch": 0.053276495558805696, + "grad_norm": 0.41899827122688293, + "learning_rate": 2.6641098822689975e-05, + "loss": 0.2959, + "step": 2987 + }, + { + "epoch": 0.05329433168051939, + "grad_norm": 0.42233455181121826, + "learning_rate": 2.665001783803068e-05, + "loss": 0.2553, + "step": 2988 + }, + { + "epoch": 0.05331216780223308, + "grad_norm": 0.4711173474788666, + "learning_rate": 2.665893685337139e-05, + "loss": 0.2947, + "step": 2989 + }, + { + "epoch": 0.053330003923946774, + "grad_norm": 0.48387402296066284, + "learning_rate": 2.6667855868712094e-05, + "loss": 0.3444, + "step": 2990 + }, + { + "epoch": 0.05334784004566047, + "grad_norm": 0.35652148723602295, + "learning_rate": 2.66767748840528e-05, + "loss": 0.2864, + "step": 2991 + }, + { + "epoch": 0.053365676167374164, + "grad_norm": 0.3819200396537781, + "learning_rate": 2.668569389939351e-05, + "loss": 0.2841, + "step": 2992 + }, + { + "epoch": 0.05338351228908786, + "grad_norm": 0.32867348194122314, + "learning_rate": 2.6694612914734213e-05, + "loss": 0.2999, + "step": 2993 + }, + { + "epoch": 0.053401348410801554, + "grad_norm": 0.3875643014907837, + "learning_rate": 2.670353193007492e-05, + "loss": 0.2543, + "step": 2994 + }, + { + "epoch": 0.05341918453251525, + "grad_norm": 0.5814125537872314, + "learning_rate": 2.671245094541563e-05, + "loss": 0.2709, + "step": 2995 + }, + { + "epoch": 0.053437020654228944, + "grad_norm": 0.3573005199432373, + "learning_rate": 2.6721369960756333e-05, + "loss": 0.2316, + "step": 2996 + }, + { + "epoch": 0.05345485677594264, + "grad_norm": 0.5520204901695251, + "learning_rate": 2.673028897609704e-05, + "loss": 0.3047, + "step": 2997 + }, + { + "epoch": 0.05347269289765633, + "grad_norm": 0.41722971200942993, + "learning_rate": 2.673920799143775e-05, + "loss": 0.2709, + "step": 2998 + }, + { + "epoch": 0.05349052901937003, + "grad_norm": 0.5371643304824829, + "learning_rate": 2.6748127006778452e-05, + "loss": 0.255, + "step": 2999 + }, + { + "epoch": 0.05350836514108372, + "grad_norm": 0.4850537180900574, + "learning_rate": 2.6757046022119158e-05, + "loss": 0.3051, + "step": 3000 + }, + { + "epoch": 0.05350836514108372, + "eval_loss": 0.28895989060401917, + "eval_runtime": 1569.3044, + "eval_samples_per_second": 0.653, + "eval_steps_per_second": 0.109, + "step": 3000 + }, + { + "epoch": 0.05352620126279742, + "grad_norm": 0.5266303420066833, + "learning_rate": 2.6765965037459868e-05, + "loss": 0.2834, + "step": 3001 + }, + { + "epoch": 0.05354403738451111, + "grad_norm": 0.43296658992767334, + "learning_rate": 2.677488405280057e-05, + "loss": 0.2907, + "step": 3002 + }, + { + "epoch": 0.05356187350622481, + "grad_norm": 0.4872777760028839, + "learning_rate": 2.6783803068141277e-05, + "loss": 0.3463, + "step": 3003 + }, + { + "epoch": 0.0535797096279385, + "grad_norm": 0.6589179039001465, + "learning_rate": 2.6792722083481987e-05, + "loss": 0.2644, + "step": 3004 + }, + { + "epoch": 0.0535975457496522, + "grad_norm": 0.4832260012626648, + "learning_rate": 2.6801641098822693e-05, + "loss": 0.2683, + "step": 3005 + }, + { + "epoch": 0.05361538187136589, + "grad_norm": 0.33053654432296753, + "learning_rate": 2.6810560114163396e-05, + "loss": 0.2564, + "step": 3006 + }, + { + "epoch": 0.05363321799307959, + "grad_norm": 0.38049063086509705, + "learning_rate": 2.6819479129504106e-05, + "loss": 0.3414, + "step": 3007 + }, + { + "epoch": 0.05365105411479328, + "grad_norm": 0.41180679202079773, + "learning_rate": 2.6828398144844813e-05, + "loss": 0.2937, + "step": 3008 + }, + { + "epoch": 0.05366889023650697, + "grad_norm": 0.3803144693374634, + "learning_rate": 2.6837317160185516e-05, + "loss": 0.3214, + "step": 3009 + }, + { + "epoch": 0.053686726358220666, + "grad_norm": 0.36073756217956543, + "learning_rate": 2.6846236175526225e-05, + "loss": 0.2575, + "step": 3010 + }, + { + "epoch": 0.05370456247993436, + "grad_norm": 0.3315037190914154, + "learning_rate": 2.6855155190866932e-05, + "loss": 0.2746, + "step": 3011 + }, + { + "epoch": 0.053722398601648055, + "grad_norm": 0.39566829800605774, + "learning_rate": 2.6864074206207635e-05, + "loss": 0.2887, + "step": 3012 + }, + { + "epoch": 0.05374023472336175, + "grad_norm": 0.3574841320514679, + "learning_rate": 2.687299322154834e-05, + "loss": 0.301, + "step": 3013 + }, + { + "epoch": 0.053758070845075445, + "grad_norm": 0.3679479956626892, + "learning_rate": 2.688191223688905e-05, + "loss": 0.2782, + "step": 3014 + }, + { + "epoch": 0.05377590696678914, + "grad_norm": 0.516028106212616, + "learning_rate": 2.6890831252229754e-05, + "loss": 0.3331, + "step": 3015 + }, + { + "epoch": 0.053793743088502835, + "grad_norm": 0.4886375963687897, + "learning_rate": 2.689975026757046e-05, + "loss": 0.3141, + "step": 3016 + }, + { + "epoch": 0.05381157921021653, + "grad_norm": 0.4060991406440735, + "learning_rate": 2.690866928291117e-05, + "loss": 0.303, + "step": 3017 + }, + { + "epoch": 0.053829415331930225, + "grad_norm": 0.3348287343978882, + "learning_rate": 2.6917588298251873e-05, + "loss": 0.2586, + "step": 3018 + }, + { + "epoch": 0.05384725145364392, + "grad_norm": 0.44578617811203003, + "learning_rate": 2.692650731359258e-05, + "loss": 0.2988, + "step": 3019 + }, + { + "epoch": 0.053865087575357615, + "grad_norm": 0.33656173944473267, + "learning_rate": 2.693542632893329e-05, + "loss": 0.261, + "step": 3020 + }, + { + "epoch": 0.05388292369707131, + "grad_norm": 0.39461520314216614, + "learning_rate": 2.6944345344273992e-05, + "loss": 0.275, + "step": 3021 + }, + { + "epoch": 0.053900759818785005, + "grad_norm": 0.557515561580658, + "learning_rate": 2.69532643596147e-05, + "loss": 0.3078, + "step": 3022 + }, + { + "epoch": 0.0539185959404987, + "grad_norm": 0.407611608505249, + "learning_rate": 2.6962183374955408e-05, + "loss": 0.2137, + "step": 3023 + }, + { + "epoch": 0.053936432062212394, + "grad_norm": 0.4608760178089142, + "learning_rate": 2.697110239029611e-05, + "loss": 0.3493, + "step": 3024 + }, + { + "epoch": 0.05395426818392609, + "grad_norm": 0.3761187493801117, + "learning_rate": 2.6980021405636818e-05, + "loss": 0.24, + "step": 3025 + }, + { + "epoch": 0.053972104305639784, + "grad_norm": 0.6394914984703064, + "learning_rate": 2.6988940420977527e-05, + "loss": 0.3024, + "step": 3026 + }, + { + "epoch": 0.05398994042735348, + "grad_norm": 0.7831883430480957, + "learning_rate": 2.699785943631823e-05, + "loss": 0.3252, + "step": 3027 + }, + { + "epoch": 0.054007776549067174, + "grad_norm": 0.43982136249542236, + "learning_rate": 2.7006778451658937e-05, + "loss": 0.3415, + "step": 3028 + }, + { + "epoch": 0.05402561267078086, + "grad_norm": 0.5146108865737915, + "learning_rate": 2.7015697466999647e-05, + "loss": 0.3062, + "step": 3029 + }, + { + "epoch": 0.05404344879249456, + "grad_norm": 0.3952732980251312, + "learning_rate": 2.7024616482340353e-05, + "loss": 0.2898, + "step": 3030 + }, + { + "epoch": 0.05406128491420825, + "grad_norm": 0.3943594992160797, + "learning_rate": 2.7033535497681056e-05, + "loss": 0.2686, + "step": 3031 + }, + { + "epoch": 0.05407912103592195, + "grad_norm": 0.5055153965950012, + "learning_rate": 2.7042454513021766e-05, + "loss": 0.3217, + "step": 3032 + }, + { + "epoch": 0.05409695715763564, + "grad_norm": 0.48227664828300476, + "learning_rate": 2.7051373528362472e-05, + "loss": 0.2487, + "step": 3033 + }, + { + "epoch": 0.05411479327934934, + "grad_norm": 0.3622559607028961, + "learning_rate": 2.7060292543703175e-05, + "loss": 0.2929, + "step": 3034 + }, + { + "epoch": 0.05413262940106303, + "grad_norm": 0.41722092032432556, + "learning_rate": 2.7069211559043885e-05, + "loss": 0.2558, + "step": 3035 + }, + { + "epoch": 0.054150465522776726, + "grad_norm": 0.4224678874015808, + "learning_rate": 2.707813057438459e-05, + "loss": 0.3332, + "step": 3036 + }, + { + "epoch": 0.05416830164449042, + "grad_norm": 0.536918580532074, + "learning_rate": 2.7087049589725294e-05, + "loss": 0.2774, + "step": 3037 + }, + { + "epoch": 0.054186137766204116, + "grad_norm": 0.45742127299308777, + "learning_rate": 2.7095968605066e-05, + "loss": 0.286, + "step": 3038 + }, + { + "epoch": 0.05420397388791781, + "grad_norm": 0.43477529287338257, + "learning_rate": 2.710488762040671e-05, + "loss": 0.3558, + "step": 3039 + }, + { + "epoch": 0.054221810009631506, + "grad_norm": 0.4702891707420349, + "learning_rate": 2.7113806635747413e-05, + "loss": 0.2599, + "step": 3040 + }, + { + "epoch": 0.0542396461313452, + "grad_norm": 0.6312264800071716, + "learning_rate": 2.712272565108812e-05, + "loss": 0.3242, + "step": 3041 + }, + { + "epoch": 0.054257482253058896, + "grad_norm": 0.42315179109573364, + "learning_rate": 2.713164466642883e-05, + "loss": 0.3345, + "step": 3042 + }, + { + "epoch": 0.05427531837477259, + "grad_norm": 0.3658837676048279, + "learning_rate": 2.7140563681769532e-05, + "loss": 0.2793, + "step": 3043 + }, + { + "epoch": 0.054293154496486286, + "grad_norm": 0.4221414625644684, + "learning_rate": 2.714948269711024e-05, + "loss": 0.3228, + "step": 3044 + }, + { + "epoch": 0.05431099061819998, + "grad_norm": 0.44399699568748474, + "learning_rate": 2.715840171245095e-05, + "loss": 0.2625, + "step": 3045 + }, + { + "epoch": 0.054328826739913676, + "grad_norm": 0.6177511215209961, + "learning_rate": 2.716732072779165e-05, + "loss": 0.2173, + "step": 3046 + }, + { + "epoch": 0.05434666286162737, + "grad_norm": 0.5023239254951477, + "learning_rate": 2.7176239743132358e-05, + "loss": 0.2987, + "step": 3047 + }, + { + "epoch": 0.054364498983341066, + "grad_norm": 0.3383076786994934, + "learning_rate": 2.7185158758473068e-05, + "loss": 0.231, + "step": 3048 + }, + { + "epoch": 0.054382335105054753, + "grad_norm": 0.5170038938522339, + "learning_rate": 2.719407777381377e-05, + "loss": 0.2843, + "step": 3049 + }, + { + "epoch": 0.05440017122676845, + "grad_norm": 0.5562129020690918, + "learning_rate": 2.7202996789154477e-05, + "loss": 0.3023, + "step": 3050 + }, + { + "epoch": 0.05441800734848214, + "grad_norm": 0.38035914301872253, + "learning_rate": 2.7211915804495187e-05, + "loss": 0.2858, + "step": 3051 + }, + { + "epoch": 0.05443584347019584, + "grad_norm": 0.4255860149860382, + "learning_rate": 2.722083481983589e-05, + "loss": 0.2881, + "step": 3052 + }, + { + "epoch": 0.05445367959190953, + "grad_norm": 0.2930384576320648, + "learning_rate": 2.7229753835176596e-05, + "loss": 0.2275, + "step": 3053 + }, + { + "epoch": 0.05447151571362323, + "grad_norm": 0.42791998386383057, + "learning_rate": 2.7238672850517306e-05, + "loss": 0.3013, + "step": 3054 + }, + { + "epoch": 0.05448935183533692, + "grad_norm": 0.4384118616580963, + "learning_rate": 2.7247591865858012e-05, + "loss": 0.3061, + "step": 3055 + }, + { + "epoch": 0.05450718795705062, + "grad_norm": 0.584568202495575, + "learning_rate": 2.7256510881198715e-05, + "loss": 0.335, + "step": 3056 + }, + { + "epoch": 0.05452502407876431, + "grad_norm": 0.4241713881492615, + "learning_rate": 2.7265429896539425e-05, + "loss": 0.2707, + "step": 3057 + }, + { + "epoch": 0.05454286020047801, + "grad_norm": 0.5592104196548462, + "learning_rate": 2.727434891188013e-05, + "loss": 0.3268, + "step": 3058 + }, + { + "epoch": 0.0545606963221917, + "grad_norm": 0.38657495379447937, + "learning_rate": 2.7283267927220835e-05, + "loss": 0.2936, + "step": 3059 + }, + { + "epoch": 0.0545785324439054, + "grad_norm": 0.5117170214653015, + "learning_rate": 2.7292186942561544e-05, + "loss": 0.3072, + "step": 3060 + }, + { + "epoch": 0.05459636856561909, + "grad_norm": 0.616719126701355, + "learning_rate": 2.730110595790225e-05, + "loss": 0.3407, + "step": 3061 + }, + { + "epoch": 0.05461420468733279, + "grad_norm": 0.5758919715881348, + "learning_rate": 2.7310024973242954e-05, + "loss": 0.2504, + "step": 3062 + }, + { + "epoch": 0.05463204080904648, + "grad_norm": 0.45872604846954346, + "learning_rate": 2.731894398858366e-05, + "loss": 0.2728, + "step": 3063 + }, + { + "epoch": 0.05464987693076018, + "grad_norm": 0.5148894190788269, + "learning_rate": 2.732786300392437e-05, + "loss": 0.276, + "step": 3064 + }, + { + "epoch": 0.05466771305247387, + "grad_norm": 0.3636108934879303, + "learning_rate": 2.7336782019265073e-05, + "loss": 0.2539, + "step": 3065 + }, + { + "epoch": 0.05468554917418757, + "grad_norm": 0.4732643961906433, + "learning_rate": 2.734570103460578e-05, + "loss": 0.2315, + "step": 3066 + }, + { + "epoch": 0.05470338529590126, + "grad_norm": 0.4992886781692505, + "learning_rate": 2.735462004994649e-05, + "loss": 0.3466, + "step": 3067 + }, + { + "epoch": 0.05472122141761496, + "grad_norm": 0.5815374255180359, + "learning_rate": 2.7363539065287192e-05, + "loss": 0.2843, + "step": 3068 + }, + { + "epoch": 0.054739057539328645, + "grad_norm": 0.3680073320865631, + "learning_rate": 2.73724580806279e-05, + "loss": 0.2919, + "step": 3069 + }, + { + "epoch": 0.05475689366104234, + "grad_norm": 0.537388026714325, + "learning_rate": 2.7381377095968608e-05, + "loss": 0.3381, + "step": 3070 + }, + { + "epoch": 0.054774729782756035, + "grad_norm": 0.5731326341629028, + "learning_rate": 2.739029611130931e-05, + "loss": 0.3667, + "step": 3071 + }, + { + "epoch": 0.05479256590446973, + "grad_norm": 0.4754807949066162, + "learning_rate": 2.7399215126650018e-05, + "loss": 0.2875, + "step": 3072 + }, + { + "epoch": 0.054810402026183425, + "grad_norm": 0.4064819812774658, + "learning_rate": 2.7408134141990727e-05, + "loss": 0.3255, + "step": 3073 + }, + { + "epoch": 0.05482823814789712, + "grad_norm": 0.43807828426361084, + "learning_rate": 2.741705315733143e-05, + "loss": 0.2859, + "step": 3074 + }, + { + "epoch": 0.054846074269610814, + "grad_norm": 0.3859362006187439, + "learning_rate": 2.7425972172672137e-05, + "loss": 0.301, + "step": 3075 + }, + { + "epoch": 0.05486391039132451, + "grad_norm": 0.4559905529022217, + "learning_rate": 2.7434891188012846e-05, + "loss": 0.2836, + "step": 3076 + }, + { + "epoch": 0.054881746513038204, + "grad_norm": 0.36179181933403015, + "learning_rate": 2.7443810203353553e-05, + "loss": 0.2946, + "step": 3077 + }, + { + "epoch": 0.0548995826347519, + "grad_norm": 0.4795432686805725, + "learning_rate": 2.7452729218694256e-05, + "loss": 0.3012, + "step": 3078 + }, + { + "epoch": 0.054917418756465594, + "grad_norm": 0.36690446734428406, + "learning_rate": 2.7461648234034966e-05, + "loss": 0.2312, + "step": 3079 + }, + { + "epoch": 0.05493525487817929, + "grad_norm": 0.4657461941242218, + "learning_rate": 2.7470567249375672e-05, + "loss": 0.2878, + "step": 3080 + }, + { + "epoch": 0.054953090999892984, + "grad_norm": 0.3776685893535614, + "learning_rate": 2.7479486264716375e-05, + "loss": 0.317, + "step": 3081 + }, + { + "epoch": 0.05497092712160668, + "grad_norm": 0.5808919072151184, + "learning_rate": 2.7488405280057085e-05, + "loss": 0.3307, + "step": 3082 + }, + { + "epoch": 0.054988763243320374, + "grad_norm": 0.35996294021606445, + "learning_rate": 2.749732429539779e-05, + "loss": 0.2642, + "step": 3083 + }, + { + "epoch": 0.05500659936503407, + "grad_norm": 0.45299363136291504, + "learning_rate": 2.7506243310738494e-05, + "loss": 0.2558, + "step": 3084 + }, + { + "epoch": 0.055024435486747764, + "grad_norm": 0.35902100801467896, + "learning_rate": 2.7515162326079204e-05, + "loss": 0.2665, + "step": 3085 + }, + { + "epoch": 0.05504227160846146, + "grad_norm": 0.42359569668769836, + "learning_rate": 2.752408134141991e-05, + "loss": 0.2782, + "step": 3086 + }, + { + "epoch": 0.055060107730175153, + "grad_norm": 0.5339044332504272, + "learning_rate": 2.7533000356760613e-05, + "loss": 0.3598, + "step": 3087 + }, + { + "epoch": 0.05507794385188885, + "grad_norm": 0.3382940888404846, + "learning_rate": 2.7541919372101323e-05, + "loss": 0.2719, + "step": 3088 + }, + { + "epoch": 0.05509577997360254, + "grad_norm": 0.5310173630714417, + "learning_rate": 2.755083838744203e-05, + "loss": 0.3674, + "step": 3089 + }, + { + "epoch": 0.05511361609531623, + "grad_norm": 0.6789987683296204, + "learning_rate": 2.7559757402782732e-05, + "loss": 0.3267, + "step": 3090 + }, + { + "epoch": 0.055131452217029926, + "grad_norm": 0.42640647292137146, + "learning_rate": 2.756867641812344e-05, + "loss": 0.2734, + "step": 3091 + }, + { + "epoch": 0.05514928833874362, + "grad_norm": 0.47140589356422424, + "learning_rate": 2.757759543346415e-05, + "loss": 0.3008, + "step": 3092 + }, + { + "epoch": 0.055167124460457316, + "grad_norm": 0.42404425144195557, + "learning_rate": 2.758651444880485e-05, + "loss": 0.3032, + "step": 3093 + }, + { + "epoch": 0.05518496058217101, + "grad_norm": 0.7277231812477112, + "learning_rate": 2.7595433464145558e-05, + "loss": 0.3785, + "step": 3094 + }, + { + "epoch": 0.055202796703884706, + "grad_norm": 0.4522213339805603, + "learning_rate": 2.7604352479486268e-05, + "loss": 0.2837, + "step": 3095 + }, + { + "epoch": 0.0552206328255984, + "grad_norm": 0.367829829454422, + "learning_rate": 2.761327149482697e-05, + "loss": 0.2646, + "step": 3096 + }, + { + "epoch": 0.055238468947312096, + "grad_norm": 0.384769469499588, + "learning_rate": 2.7622190510167677e-05, + "loss": 0.268, + "step": 3097 + }, + { + "epoch": 0.05525630506902579, + "grad_norm": 0.4247475564479828, + "learning_rate": 2.7631109525508387e-05, + "loss": 0.3188, + "step": 3098 + }, + { + "epoch": 0.055274141190739486, + "grad_norm": 0.5494495630264282, + "learning_rate": 2.764002854084909e-05, + "loss": 0.3724, + "step": 3099 + }, + { + "epoch": 0.05529197731245318, + "grad_norm": 0.5495931506156921, + "learning_rate": 2.7648947556189796e-05, + "loss": 0.3231, + "step": 3100 + }, + { + "epoch": 0.055309813434166875, + "grad_norm": 0.5376266241073608, + "learning_rate": 2.7657866571530506e-05, + "loss": 0.2964, + "step": 3101 + }, + { + "epoch": 0.05532764955588057, + "grad_norm": 0.49344953894615173, + "learning_rate": 2.7666785586871212e-05, + "loss": 0.3465, + "step": 3102 + }, + { + "epoch": 0.055345485677594265, + "grad_norm": 0.3952866196632385, + "learning_rate": 2.7675704602211915e-05, + "loss": 0.3028, + "step": 3103 + }, + { + "epoch": 0.05536332179930796, + "grad_norm": 0.39372286200523376, + "learning_rate": 2.7684623617552625e-05, + "loss": 0.2968, + "step": 3104 + }, + { + "epoch": 0.055381157921021655, + "grad_norm": 0.53909832239151, + "learning_rate": 2.769354263289333e-05, + "loss": 0.288, + "step": 3105 + }, + { + "epoch": 0.05539899404273535, + "grad_norm": 0.6521942615509033, + "learning_rate": 2.7702461648234034e-05, + "loss": 0.3753, + "step": 3106 + }, + { + "epoch": 0.055416830164449045, + "grad_norm": 0.3129529654979706, + "learning_rate": 2.7711380663574744e-05, + "loss": 0.2636, + "step": 3107 + }, + { + "epoch": 0.05543466628616274, + "grad_norm": 0.3928622007369995, + "learning_rate": 2.772029967891545e-05, + "loss": 0.2633, + "step": 3108 + }, + { + "epoch": 0.055452502407876435, + "grad_norm": 0.4225257635116577, + "learning_rate": 2.7729218694256154e-05, + "loss": 0.2967, + "step": 3109 + }, + { + "epoch": 0.05547033852959012, + "grad_norm": 0.43595537543296814, + "learning_rate": 2.7738137709596863e-05, + "loss": 0.2984, + "step": 3110 + }, + { + "epoch": 0.05548817465130382, + "grad_norm": 0.43720415234565735, + "learning_rate": 2.774705672493757e-05, + "loss": 0.3059, + "step": 3111 + }, + { + "epoch": 0.05550601077301751, + "grad_norm": 0.6234900951385498, + "learning_rate": 2.7755975740278273e-05, + "loss": 0.2663, + "step": 3112 + }, + { + "epoch": 0.05552384689473121, + "grad_norm": 0.4558577239513397, + "learning_rate": 2.7764894755618983e-05, + "loss": 0.3029, + "step": 3113 + }, + { + "epoch": 0.0555416830164449, + "grad_norm": 0.4778018593788147, + "learning_rate": 2.777381377095969e-05, + "loss": 0.3317, + "step": 3114 + }, + { + "epoch": 0.0555595191381586, + "grad_norm": 0.43754979968070984, + "learning_rate": 2.7782732786300392e-05, + "loss": 0.2861, + "step": 3115 + }, + { + "epoch": 0.05557735525987229, + "grad_norm": 0.441175639629364, + "learning_rate": 2.7791651801641098e-05, + "loss": 0.2817, + "step": 3116 + }, + { + "epoch": 0.05559519138158599, + "grad_norm": 0.40181174874305725, + "learning_rate": 2.7800570816981808e-05, + "loss": 0.2885, + "step": 3117 + }, + { + "epoch": 0.05561302750329968, + "grad_norm": 0.38011544942855835, + "learning_rate": 2.780948983232251e-05, + "loss": 0.2688, + "step": 3118 + }, + { + "epoch": 0.05563086362501338, + "grad_norm": 0.42211681604385376, + "learning_rate": 2.7818408847663217e-05, + "loss": 0.258, + "step": 3119 + }, + { + "epoch": 0.05564869974672707, + "grad_norm": 0.35453712940216064, + "learning_rate": 2.7827327863003927e-05, + "loss": 0.252, + "step": 3120 + }, + { + "epoch": 0.05566653586844077, + "grad_norm": 0.4386732578277588, + "learning_rate": 2.783624687834463e-05, + "loss": 0.2356, + "step": 3121 + }, + { + "epoch": 0.05568437199015446, + "grad_norm": 0.38466301560401917, + "learning_rate": 2.7845165893685337e-05, + "loss": 0.3157, + "step": 3122 + }, + { + "epoch": 0.05570220811186816, + "grad_norm": 0.5103141069412231, + "learning_rate": 2.7854084909026046e-05, + "loss": 0.3021, + "step": 3123 + }, + { + "epoch": 0.05572004423358185, + "grad_norm": 0.5577016472816467, + "learning_rate": 2.7863003924366753e-05, + "loss": 0.3025, + "step": 3124 + }, + { + "epoch": 0.055737880355295547, + "grad_norm": 0.34246450662612915, + "learning_rate": 2.7871922939707456e-05, + "loss": 0.2536, + "step": 3125 + }, + { + "epoch": 0.05575571647700924, + "grad_norm": 0.4228273630142212, + "learning_rate": 2.7880841955048165e-05, + "loss": 0.2965, + "step": 3126 + }, + { + "epoch": 0.055773552598722936, + "grad_norm": 0.3759939670562744, + "learning_rate": 2.7889760970388872e-05, + "loss": 0.2798, + "step": 3127 + }, + { + "epoch": 0.05579138872043663, + "grad_norm": 0.49205154180526733, + "learning_rate": 2.7898679985729575e-05, + "loss": 0.3592, + "step": 3128 + }, + { + "epoch": 0.055809224842150326, + "grad_norm": 0.46467289328575134, + "learning_rate": 2.7907599001070285e-05, + "loss": 0.2554, + "step": 3129 + }, + { + "epoch": 0.055827060963864014, + "grad_norm": 0.3272590637207031, + "learning_rate": 2.791651801641099e-05, + "loss": 0.2732, + "step": 3130 + }, + { + "epoch": 0.05584489708557771, + "grad_norm": 0.45574623346328735, + "learning_rate": 2.7925437031751694e-05, + "loss": 0.2875, + "step": 3131 + }, + { + "epoch": 0.055862733207291404, + "grad_norm": 0.41233009099960327, + "learning_rate": 2.7934356047092404e-05, + "loss": 0.3017, + "step": 3132 + }, + { + "epoch": 0.0558805693290051, + "grad_norm": 0.5841701030731201, + "learning_rate": 2.794327506243311e-05, + "loss": 0.2297, + "step": 3133 + }, + { + "epoch": 0.055898405450718794, + "grad_norm": 0.41142526268959045, + "learning_rate": 2.7952194077773813e-05, + "loss": 0.2537, + "step": 3134 + }, + { + "epoch": 0.05591624157243249, + "grad_norm": 0.3923339247703552, + "learning_rate": 2.7961113093114523e-05, + "loss": 0.2709, + "step": 3135 + }, + { + "epoch": 0.055934077694146184, + "grad_norm": 0.34697508811950684, + "learning_rate": 2.797003210845523e-05, + "loss": 0.3215, + "step": 3136 + }, + { + "epoch": 0.05595191381585988, + "grad_norm": 0.631820797920227, + "learning_rate": 2.7978951123795932e-05, + "loss": 0.3416, + "step": 3137 + }, + { + "epoch": 0.055969749937573574, + "grad_norm": 0.4826674461364746, + "learning_rate": 2.7987870139136642e-05, + "loss": 0.311, + "step": 3138 + }, + { + "epoch": 0.05598758605928727, + "grad_norm": 0.4957122504711151, + "learning_rate": 2.799678915447735e-05, + "loss": 0.2977, + "step": 3139 + }, + { + "epoch": 0.05600542218100096, + "grad_norm": 0.495969295501709, + "learning_rate": 2.800570816981805e-05, + "loss": 0.2867, + "step": 3140 + }, + { + "epoch": 0.05602325830271466, + "grad_norm": 0.5296710729598999, + "learning_rate": 2.8014627185158758e-05, + "loss": 0.3355, + "step": 3141 + }, + { + "epoch": 0.05604109442442835, + "grad_norm": 0.42540377378463745, + "learning_rate": 2.8023546200499468e-05, + "loss": 0.2963, + "step": 3142 + }, + { + "epoch": 0.05605893054614205, + "grad_norm": 0.38158974051475525, + "learning_rate": 2.803246521584017e-05, + "loss": 0.2822, + "step": 3143 + }, + { + "epoch": 0.05607676666785574, + "grad_norm": 0.5512579083442688, + "learning_rate": 2.8041384231180877e-05, + "loss": 0.3114, + "step": 3144 + }, + { + "epoch": 0.05609460278956944, + "grad_norm": 0.5109096765518188, + "learning_rate": 2.8050303246521587e-05, + "loss": 0.32, + "step": 3145 + }, + { + "epoch": 0.05611243891128313, + "grad_norm": 0.3573777377605438, + "learning_rate": 2.805922226186229e-05, + "loss": 0.312, + "step": 3146 + }, + { + "epoch": 0.05613027503299683, + "grad_norm": 0.525793194770813, + "learning_rate": 2.8068141277202996e-05, + "loss": 0.3378, + "step": 3147 + }, + { + "epoch": 0.05614811115471052, + "grad_norm": 1.7347043752670288, + "learning_rate": 2.8077060292543706e-05, + "loss": 0.292, + "step": 3148 + }, + { + "epoch": 0.05616594727642422, + "grad_norm": 0.477344274520874, + "learning_rate": 2.8085979307884412e-05, + "loss": 0.3393, + "step": 3149 + }, + { + "epoch": 0.056183783398137906, + "grad_norm": 0.36261841654777527, + "learning_rate": 2.8094898323225115e-05, + "loss": 0.2598, + "step": 3150 + }, + { + "epoch": 0.0562016195198516, + "grad_norm": 0.3269127905368805, + "learning_rate": 2.8103817338565825e-05, + "loss": 0.2411, + "step": 3151 + }, + { + "epoch": 0.056219455641565295, + "grad_norm": 0.43891724944114685, + "learning_rate": 2.811273635390653e-05, + "loss": 0.3135, + "step": 3152 + }, + { + "epoch": 0.05623729176327899, + "grad_norm": 0.6503772735595703, + "learning_rate": 2.8121655369247234e-05, + "loss": 0.326, + "step": 3153 + }, + { + "epoch": 0.056255127884992685, + "grad_norm": 0.43998587131500244, + "learning_rate": 2.8130574384587944e-05, + "loss": 0.2725, + "step": 3154 + }, + { + "epoch": 0.05627296400670638, + "grad_norm": 0.4373207688331604, + "learning_rate": 2.813949339992865e-05, + "loss": 0.2543, + "step": 3155 + }, + { + "epoch": 0.056290800128420075, + "grad_norm": 0.5187900066375732, + "learning_rate": 2.8148412415269353e-05, + "loss": 0.3075, + "step": 3156 + }, + { + "epoch": 0.05630863625013377, + "grad_norm": 0.6164479851722717, + "learning_rate": 2.8157331430610063e-05, + "loss": 0.3565, + "step": 3157 + }, + { + "epoch": 0.056326472371847465, + "grad_norm": 0.4550468623638153, + "learning_rate": 2.816625044595077e-05, + "loss": 0.276, + "step": 3158 + }, + { + "epoch": 0.05634430849356116, + "grad_norm": 0.3523094654083252, + "learning_rate": 2.8175169461291473e-05, + "loss": 0.2965, + "step": 3159 + }, + { + "epoch": 0.056362144615274855, + "grad_norm": 0.3450637757778168, + "learning_rate": 2.8184088476632182e-05, + "loss": 0.2419, + "step": 3160 + }, + { + "epoch": 0.05637998073698855, + "grad_norm": 0.4436189532279968, + "learning_rate": 2.819300749197289e-05, + "loss": 0.2392, + "step": 3161 + }, + { + "epoch": 0.056397816858702245, + "grad_norm": 0.5303342342376709, + "learning_rate": 2.8201926507313592e-05, + "loss": 0.3043, + "step": 3162 + }, + { + "epoch": 0.05641565298041594, + "grad_norm": 0.46184423565864563, + "learning_rate": 2.82108455226543e-05, + "loss": 0.2825, + "step": 3163 + }, + { + "epoch": 0.056433489102129634, + "grad_norm": 0.5427693724632263, + "learning_rate": 2.8219764537995008e-05, + "loss": 0.2885, + "step": 3164 + }, + { + "epoch": 0.05645132522384333, + "grad_norm": 0.46107038855552673, + "learning_rate": 2.822868355333571e-05, + "loss": 0.3059, + "step": 3165 + }, + { + "epoch": 0.056469161345557024, + "grad_norm": 0.3580935001373291, + "learning_rate": 2.8237602568676417e-05, + "loss": 0.2672, + "step": 3166 + }, + { + "epoch": 0.05648699746727072, + "grad_norm": 0.3495483696460724, + "learning_rate": 2.8246521584017127e-05, + "loss": 0.2541, + "step": 3167 + }, + { + "epoch": 0.056504833588984414, + "grad_norm": 0.44915691018104553, + "learning_rate": 2.825544059935783e-05, + "loss": 0.2697, + "step": 3168 + }, + { + "epoch": 0.05652266971069811, + "grad_norm": 0.8128893971443176, + "learning_rate": 2.8264359614698536e-05, + "loss": 0.2975, + "step": 3169 + }, + { + "epoch": 0.0565405058324118, + "grad_norm": 0.7385756969451904, + "learning_rate": 2.8273278630039246e-05, + "loss": 0.3223, + "step": 3170 + }, + { + "epoch": 0.05655834195412549, + "grad_norm": 0.43033161759376526, + "learning_rate": 2.828219764537995e-05, + "loss": 0.2983, + "step": 3171 + }, + { + "epoch": 0.05657617807583919, + "grad_norm": 0.5504375696182251, + "learning_rate": 2.8291116660720656e-05, + "loss": 0.2947, + "step": 3172 + }, + { + "epoch": 0.05659401419755288, + "grad_norm": 0.4506465792655945, + "learning_rate": 2.8300035676061365e-05, + "loss": 0.2637, + "step": 3173 + }, + { + "epoch": 0.05661185031926658, + "grad_norm": 0.39166396856307983, + "learning_rate": 2.8308954691402072e-05, + "loss": 0.2906, + "step": 3174 + }, + { + "epoch": 0.05662968644098027, + "grad_norm": 0.45482513308525085, + "learning_rate": 2.8317873706742775e-05, + "loss": 0.3005, + "step": 3175 + }, + { + "epoch": 0.05664752256269397, + "grad_norm": 0.4915219843387604, + "learning_rate": 2.8326792722083484e-05, + "loss": 0.2792, + "step": 3176 + }, + { + "epoch": 0.05666535868440766, + "grad_norm": 0.3989510238170624, + "learning_rate": 2.833571173742419e-05, + "loss": 0.3002, + "step": 3177 + }, + { + "epoch": 0.056683194806121356, + "grad_norm": 0.351775199174881, + "learning_rate": 2.8344630752764894e-05, + "loss": 0.2325, + "step": 3178 + }, + { + "epoch": 0.05670103092783505, + "grad_norm": 0.47354358434677124, + "learning_rate": 2.8353549768105604e-05, + "loss": 0.3084, + "step": 3179 + }, + { + "epoch": 0.056718867049548746, + "grad_norm": 0.44085946679115295, + "learning_rate": 2.836246878344631e-05, + "loss": 0.3073, + "step": 3180 + }, + { + "epoch": 0.05673670317126244, + "grad_norm": 0.49750787019729614, + "learning_rate": 2.8371387798787013e-05, + "loss": 0.2789, + "step": 3181 + }, + { + "epoch": 0.056754539292976136, + "grad_norm": 0.5748381614685059, + "learning_rate": 2.8380306814127723e-05, + "loss": 0.31, + "step": 3182 + }, + { + "epoch": 0.05677237541468983, + "grad_norm": 0.45841026306152344, + "learning_rate": 2.838922582946843e-05, + "loss": 0.3374, + "step": 3183 + }, + { + "epoch": 0.056790211536403526, + "grad_norm": 0.5971012711524963, + "learning_rate": 2.8398144844809132e-05, + "loss": 0.3508, + "step": 3184 + }, + { + "epoch": 0.05680804765811722, + "grad_norm": 0.507256269454956, + "learning_rate": 2.8407063860149842e-05, + "loss": 0.2557, + "step": 3185 + }, + { + "epoch": 0.056825883779830916, + "grad_norm": 0.7194445133209229, + "learning_rate": 2.8415982875490548e-05, + "loss": 0.3054, + "step": 3186 + }, + { + "epoch": 0.05684371990154461, + "grad_norm": 0.4229884445667267, + "learning_rate": 2.842490189083125e-05, + "loss": 0.319, + "step": 3187 + }, + { + "epoch": 0.056861556023258306, + "grad_norm": 0.4635477364063263, + "learning_rate": 2.843382090617196e-05, + "loss": 0.316, + "step": 3188 + }, + { + "epoch": 0.056879392144972, + "grad_norm": 0.39817649126052856, + "learning_rate": 2.8442739921512667e-05, + "loss": 0.2788, + "step": 3189 + }, + { + "epoch": 0.05689722826668569, + "grad_norm": 0.42672228813171387, + "learning_rate": 2.845165893685337e-05, + "loss": 0.2903, + "step": 3190 + }, + { + "epoch": 0.05691506438839938, + "grad_norm": 0.5355145931243896, + "learning_rate": 2.846057795219408e-05, + "loss": 0.3236, + "step": 3191 + }, + { + "epoch": 0.05693290051011308, + "grad_norm": 0.4850665330886841, + "learning_rate": 2.8469496967534787e-05, + "loss": 0.2521, + "step": 3192 + }, + { + "epoch": 0.05695073663182677, + "grad_norm": 0.3593953549861908, + "learning_rate": 2.847841598287549e-05, + "loss": 0.2494, + "step": 3193 + }, + { + "epoch": 0.05696857275354047, + "grad_norm": 0.48539307713508606, + "learning_rate": 2.8487334998216196e-05, + "loss": 0.2909, + "step": 3194 + }, + { + "epoch": 0.05698640887525416, + "grad_norm": 0.4970082938671112, + "learning_rate": 2.8496254013556906e-05, + "loss": 0.2917, + "step": 3195 + }, + { + "epoch": 0.05700424499696786, + "grad_norm": 0.7430002093315125, + "learning_rate": 2.8505173028897612e-05, + "loss": 0.2189, + "step": 3196 + }, + { + "epoch": 0.05702208111868155, + "grad_norm": 0.30442625284194946, + "learning_rate": 2.8514092044238315e-05, + "loss": 0.217, + "step": 3197 + }, + { + "epoch": 0.05703991724039525, + "grad_norm": 0.3949531316757202, + "learning_rate": 2.8523011059579025e-05, + "loss": 0.306, + "step": 3198 + }, + { + "epoch": 0.05705775336210894, + "grad_norm": 0.42441514134407043, + "learning_rate": 2.853193007491973e-05, + "loss": 0.3281, + "step": 3199 + }, + { + "epoch": 0.05707558948382264, + "grad_norm": 0.3786030411720276, + "learning_rate": 2.8540849090260434e-05, + "loss": 0.3032, + "step": 3200 + }, + { + "epoch": 0.05709342560553633, + "grad_norm": 0.46103936433792114, + "learning_rate": 2.8549768105601144e-05, + "loss": 0.341, + "step": 3201 + }, + { + "epoch": 0.05711126172725003, + "grad_norm": 0.4143133759498596, + "learning_rate": 2.855868712094185e-05, + "loss": 0.2032, + "step": 3202 + }, + { + "epoch": 0.05712909784896372, + "grad_norm": 0.6137906908988953, + "learning_rate": 2.8567606136282553e-05, + "loss": 0.3155, + "step": 3203 + }, + { + "epoch": 0.05714693397067742, + "grad_norm": 0.607319712638855, + "learning_rate": 2.8576525151623263e-05, + "loss": 0.2459, + "step": 3204 + }, + { + "epoch": 0.05716477009239111, + "grad_norm": 0.503551721572876, + "learning_rate": 2.858544416696397e-05, + "loss": 0.3582, + "step": 3205 + }, + { + "epoch": 0.05718260621410481, + "grad_norm": 0.37274956703186035, + "learning_rate": 2.8594363182304672e-05, + "loss": 0.2643, + "step": 3206 + }, + { + "epoch": 0.0572004423358185, + "grad_norm": 0.4710714817047119, + "learning_rate": 2.8603282197645382e-05, + "loss": 0.2802, + "step": 3207 + }, + { + "epoch": 0.0572182784575322, + "grad_norm": 0.5470686554908752, + "learning_rate": 2.861220121298609e-05, + "loss": 0.3143, + "step": 3208 + }, + { + "epoch": 0.05723611457924589, + "grad_norm": 0.4025944471359253, + "learning_rate": 2.862112022832679e-05, + "loss": 0.2134, + "step": 3209 + }, + { + "epoch": 0.05725395070095958, + "grad_norm": 0.4664728045463562, + "learning_rate": 2.86300392436675e-05, + "loss": 0.3075, + "step": 3210 + }, + { + "epoch": 0.057271786822673275, + "grad_norm": 0.4440355896949768, + "learning_rate": 2.8638958259008208e-05, + "loss": 0.3069, + "step": 3211 + }, + { + "epoch": 0.05728962294438697, + "grad_norm": 0.36694321036338806, + "learning_rate": 2.864787727434891e-05, + "loss": 0.2778, + "step": 3212 + }, + { + "epoch": 0.057307459066100665, + "grad_norm": 0.3843688368797302, + "learning_rate": 2.865679628968962e-05, + "loss": 0.2474, + "step": 3213 + }, + { + "epoch": 0.05732529518781436, + "grad_norm": 0.41927361488342285, + "learning_rate": 2.8665715305030327e-05, + "loss": 0.3291, + "step": 3214 + }, + { + "epoch": 0.057343131309528055, + "grad_norm": 0.4616337716579437, + "learning_rate": 2.867463432037103e-05, + "loss": 0.3504, + "step": 3215 + }, + { + "epoch": 0.05736096743124175, + "grad_norm": 0.6417986154556274, + "learning_rate": 2.868355333571174e-05, + "loss": 0.3368, + "step": 3216 + }, + { + "epoch": 0.057378803552955444, + "grad_norm": 0.42413899302482605, + "learning_rate": 2.8692472351052446e-05, + "loss": 0.3405, + "step": 3217 + }, + { + "epoch": 0.05739663967466914, + "grad_norm": 0.44954735040664673, + "learning_rate": 2.870139136639315e-05, + "loss": 0.2605, + "step": 3218 + }, + { + "epoch": 0.057414475796382834, + "grad_norm": 0.3479384779930115, + "learning_rate": 2.8710310381733855e-05, + "loss": 0.2449, + "step": 3219 + }, + { + "epoch": 0.05743231191809653, + "grad_norm": 0.5870296955108643, + "learning_rate": 2.8719229397074565e-05, + "loss": 0.3004, + "step": 3220 + }, + { + "epoch": 0.057450148039810224, + "grad_norm": 0.34973809123039246, + "learning_rate": 2.8728148412415275e-05, + "loss": 0.2774, + "step": 3221 + }, + { + "epoch": 0.05746798416152392, + "grad_norm": 0.45131930708885193, + "learning_rate": 2.8737067427755975e-05, + "loss": 0.2686, + "step": 3222 + }, + { + "epoch": 0.057485820283237614, + "grad_norm": 0.3726236820220947, + "learning_rate": 2.8745986443096684e-05, + "loss": 0.2909, + "step": 3223 + }, + { + "epoch": 0.05750365640495131, + "grad_norm": 0.39420604705810547, + "learning_rate": 2.875490545843739e-05, + "loss": 0.2844, + "step": 3224 + }, + { + "epoch": 0.057521492526665004, + "grad_norm": 0.6217480301856995, + "learning_rate": 2.8763824473778094e-05, + "loss": 0.2524, + "step": 3225 + }, + { + "epoch": 0.0575393286483787, + "grad_norm": 0.5144774913787842, + "learning_rate": 2.8772743489118803e-05, + "loss": 0.2974, + "step": 3226 + }, + { + "epoch": 0.057557164770092394, + "grad_norm": 0.45274215936660767, + "learning_rate": 2.878166250445951e-05, + "loss": 0.2856, + "step": 3227 + }, + { + "epoch": 0.05757500089180609, + "grad_norm": 0.4292668402194977, + "learning_rate": 2.8790581519800213e-05, + "loss": 0.2682, + "step": 3228 + }, + { + "epoch": 0.05759283701351978, + "grad_norm": 0.4211280047893524, + "learning_rate": 2.8799500535140923e-05, + "loss": 0.2684, + "step": 3229 + }, + { + "epoch": 0.05761067313523347, + "grad_norm": 0.39303717017173767, + "learning_rate": 2.880841955048163e-05, + "loss": 0.2816, + "step": 3230 + }, + { + "epoch": 0.057628509256947166, + "grad_norm": 0.3582131266593933, + "learning_rate": 2.8817338565822332e-05, + "loss": 0.2378, + "step": 3231 + }, + { + "epoch": 0.05764634537866086, + "grad_norm": 0.5083575248718262, + "learning_rate": 2.8826257581163042e-05, + "loss": 0.3035, + "step": 3232 + }, + { + "epoch": 0.057664181500374556, + "grad_norm": 0.39351096749305725, + "learning_rate": 2.8835176596503748e-05, + "loss": 0.2771, + "step": 3233 + }, + { + "epoch": 0.05768201762208825, + "grad_norm": 0.5952514410018921, + "learning_rate": 2.884409561184445e-05, + "loss": 0.3101, + "step": 3234 + }, + { + "epoch": 0.057699853743801946, + "grad_norm": 0.41662126779556274, + "learning_rate": 2.885301462718516e-05, + "loss": 0.2836, + "step": 3235 + }, + { + "epoch": 0.05771768986551564, + "grad_norm": 0.6105010509490967, + "learning_rate": 2.8861933642525867e-05, + "loss": 0.2997, + "step": 3236 + }, + { + "epoch": 0.057735525987229336, + "grad_norm": 0.4033588767051697, + "learning_rate": 2.887085265786657e-05, + "loss": 0.2772, + "step": 3237 + }, + { + "epoch": 0.05775336210894303, + "grad_norm": 0.44319161772727966, + "learning_rate": 2.887977167320728e-05, + "loss": 0.2574, + "step": 3238 + }, + { + "epoch": 0.057771198230656726, + "grad_norm": 0.31570732593536377, + "learning_rate": 2.8888690688547986e-05, + "loss": 0.2735, + "step": 3239 + }, + { + "epoch": 0.05778903435237042, + "grad_norm": 0.48176056146621704, + "learning_rate": 2.889760970388869e-05, + "loss": 0.3235, + "step": 3240 + }, + { + "epoch": 0.057806870474084115, + "grad_norm": 0.554405927658081, + "learning_rate": 2.89065287192294e-05, + "loss": 0.2098, + "step": 3241 + }, + { + "epoch": 0.05782470659579781, + "grad_norm": 0.31652870774269104, + "learning_rate": 2.8915447734570106e-05, + "loss": 0.2697, + "step": 3242 + }, + { + "epoch": 0.057842542717511505, + "grad_norm": 0.434579074382782, + "learning_rate": 2.8924366749910815e-05, + "loss": 0.3108, + "step": 3243 + }, + { + "epoch": 0.0578603788392252, + "grad_norm": 0.4084996283054352, + "learning_rate": 2.8933285765251515e-05, + "loss": 0.2703, + "step": 3244 + }, + { + "epoch": 0.057878214960938895, + "grad_norm": 0.41963431239128113, + "learning_rate": 2.8942204780592225e-05, + "loss": 0.2826, + "step": 3245 + }, + { + "epoch": 0.05789605108265259, + "grad_norm": 0.36174237728118896, + "learning_rate": 2.8951123795932934e-05, + "loss": 0.3111, + "step": 3246 + }, + { + "epoch": 0.057913887204366285, + "grad_norm": 0.4229978919029236, + "learning_rate": 2.8960042811273634e-05, + "loss": 0.2628, + "step": 3247 + }, + { + "epoch": 0.05793172332607998, + "grad_norm": 0.4644685387611389, + "learning_rate": 2.8968961826614344e-05, + "loss": 0.2847, + "step": 3248 + }, + { + "epoch": 0.057949559447793675, + "grad_norm": 0.43232446908950806, + "learning_rate": 2.897788084195505e-05, + "loss": 0.2995, + "step": 3249 + }, + { + "epoch": 0.05796739556950737, + "grad_norm": 0.4653993844985962, + "learning_rate": 2.8986799857295753e-05, + "loss": 0.2814, + "step": 3250 + }, + { + "epoch": 0.05798523169122106, + "grad_norm": 0.314706414937973, + "learning_rate": 2.8995718872636463e-05, + "loss": 0.2123, + "step": 3251 + }, + { + "epoch": 0.05800306781293475, + "grad_norm": 0.5298987030982971, + "learning_rate": 2.900463788797717e-05, + "loss": 0.2624, + "step": 3252 + }, + { + "epoch": 0.05802090393464845, + "grad_norm": 0.40126827359199524, + "learning_rate": 2.9013556903317872e-05, + "loss": 0.3133, + "step": 3253 + }, + { + "epoch": 0.05803874005636214, + "grad_norm": 0.371821790933609, + "learning_rate": 2.9022475918658582e-05, + "loss": 0.2738, + "step": 3254 + }, + { + "epoch": 0.05805657617807584, + "grad_norm": 0.43823331594467163, + "learning_rate": 2.903139493399929e-05, + "loss": 0.2927, + "step": 3255 + }, + { + "epoch": 0.05807441229978953, + "grad_norm": 0.4936355650424957, + "learning_rate": 2.904031394933999e-05, + "loss": 0.3306, + "step": 3256 + }, + { + "epoch": 0.05809224842150323, + "grad_norm": 0.4949367344379425, + "learning_rate": 2.90492329646807e-05, + "loss": 0.31, + "step": 3257 + }, + { + "epoch": 0.05811008454321692, + "grad_norm": 0.5128595232963562, + "learning_rate": 2.9058151980021408e-05, + "loss": 0.2954, + "step": 3258 + }, + { + "epoch": 0.05812792066493062, + "grad_norm": 0.46815380454063416, + "learning_rate": 2.906707099536211e-05, + "loss": 0.332, + "step": 3259 + }, + { + "epoch": 0.05814575678664431, + "grad_norm": 0.4047621190547943, + "learning_rate": 2.907599001070282e-05, + "loss": 0.3294, + "step": 3260 + }, + { + "epoch": 0.05816359290835801, + "grad_norm": 0.5759481191635132, + "learning_rate": 2.9084909026043527e-05, + "loss": 0.3203, + "step": 3261 + }, + { + "epoch": 0.0581814290300717, + "grad_norm": 0.4076763093471527, + "learning_rate": 2.909382804138423e-05, + "loss": 0.3038, + "step": 3262 + }, + { + "epoch": 0.0581992651517854, + "grad_norm": 0.3462975323200226, + "learning_rate": 2.910274705672494e-05, + "loss": 0.2987, + "step": 3263 + }, + { + "epoch": 0.05821710127349909, + "grad_norm": 0.3002260625362396, + "learning_rate": 2.9111666072065646e-05, + "loss": 0.2713, + "step": 3264 + }, + { + "epoch": 0.05823493739521279, + "grad_norm": 0.3066962659358978, + "learning_rate": 2.912058508740635e-05, + "loss": 0.2486, + "step": 3265 + }, + { + "epoch": 0.05825277351692648, + "grad_norm": 0.42274385690689087, + "learning_rate": 2.912950410274706e-05, + "loss": 0.2916, + "step": 3266 + }, + { + "epoch": 0.058270609638640176, + "grad_norm": 0.39436569809913635, + "learning_rate": 2.9138423118087765e-05, + "loss": 0.2891, + "step": 3267 + }, + { + "epoch": 0.05828844576035387, + "grad_norm": 0.39736512303352356, + "learning_rate": 2.9147342133428475e-05, + "loss": 0.2744, + "step": 3268 + }, + { + "epoch": 0.058306281882067566, + "grad_norm": 0.44282907247543335, + "learning_rate": 2.9156261148769174e-05, + "loss": 0.2887, + "step": 3269 + }, + { + "epoch": 0.05832411800378126, + "grad_norm": 0.5134050846099854, + "learning_rate": 2.9165180164109884e-05, + "loss": 0.2974, + "step": 3270 + }, + { + "epoch": 0.05834195412549495, + "grad_norm": 0.5337728261947632, + "learning_rate": 2.9174099179450594e-05, + "loss": 0.2578, + "step": 3271 + }, + { + "epoch": 0.058359790247208644, + "grad_norm": 0.42524704337120056, + "learning_rate": 2.9183018194791294e-05, + "loss": 0.2468, + "step": 3272 + }, + { + "epoch": 0.05837762636892234, + "grad_norm": 0.4540986716747284, + "learning_rate": 2.9191937210132003e-05, + "loss": 0.2605, + "step": 3273 + }, + { + "epoch": 0.058395462490636034, + "grad_norm": 0.49019667506217957, + "learning_rate": 2.920085622547271e-05, + "loss": 0.2807, + "step": 3274 + }, + { + "epoch": 0.05841329861234973, + "grad_norm": 0.6835748553276062, + "learning_rate": 2.9209775240813413e-05, + "loss": 0.2746, + "step": 3275 + }, + { + "epoch": 0.058431134734063424, + "grad_norm": 0.5628316402435303, + "learning_rate": 2.9218694256154123e-05, + "loss": 0.3014, + "step": 3276 + }, + { + "epoch": 0.05844897085577712, + "grad_norm": 0.6078162789344788, + "learning_rate": 2.922761327149483e-05, + "loss": 0.3247, + "step": 3277 + }, + { + "epoch": 0.058466806977490814, + "grad_norm": 0.5041225552558899, + "learning_rate": 2.9236532286835532e-05, + "loss": 0.2905, + "step": 3278 + }, + { + "epoch": 0.05848464309920451, + "grad_norm": 0.4063977599143982, + "learning_rate": 2.924545130217624e-05, + "loss": 0.3328, + "step": 3279 + }, + { + "epoch": 0.0585024792209182, + "grad_norm": 0.34691980481147766, + "learning_rate": 2.9254370317516948e-05, + "loss": 0.2946, + "step": 3280 + }, + { + "epoch": 0.0585203153426319, + "grad_norm": 0.5628437399864197, + "learning_rate": 2.926328933285765e-05, + "loss": 0.2631, + "step": 3281 + }, + { + "epoch": 0.05853815146434559, + "grad_norm": 0.4472805857658386, + "learning_rate": 2.927220834819836e-05, + "loss": 0.2897, + "step": 3282 + }, + { + "epoch": 0.05855598758605929, + "grad_norm": 0.4052281379699707, + "learning_rate": 2.9281127363539067e-05, + "loss": 0.2916, + "step": 3283 + }, + { + "epoch": 0.05857382370777298, + "grad_norm": 0.36523932218551636, + "learning_rate": 2.929004637887977e-05, + "loss": 0.2733, + "step": 3284 + }, + { + "epoch": 0.05859165982948668, + "grad_norm": 0.359784334897995, + "learning_rate": 2.929896539422048e-05, + "loss": 0.2527, + "step": 3285 + }, + { + "epoch": 0.05860949595120037, + "grad_norm": 0.47422704100608826, + "learning_rate": 2.9307884409561186e-05, + "loss": 0.2655, + "step": 3286 + }, + { + "epoch": 0.05862733207291407, + "grad_norm": 0.3405548334121704, + "learning_rate": 2.931680342490189e-05, + "loss": 0.2814, + "step": 3287 + }, + { + "epoch": 0.05864516819462776, + "grad_norm": 0.820716381072998, + "learning_rate": 2.93257224402426e-05, + "loss": 0.3035, + "step": 3288 + }, + { + "epoch": 0.05866300431634146, + "grad_norm": 0.37102848291397095, + "learning_rate": 2.9334641455583305e-05, + "loss": 0.2539, + "step": 3289 + }, + { + "epoch": 0.05868084043805515, + "grad_norm": 0.393527090549469, + "learning_rate": 2.9343560470924015e-05, + "loss": 0.2849, + "step": 3290 + }, + { + "epoch": 0.05869867655976884, + "grad_norm": 0.363849937915802, + "learning_rate": 2.9352479486264718e-05, + "loss": 0.2399, + "step": 3291 + }, + { + "epoch": 0.058716512681482536, + "grad_norm": 0.3990871012210846, + "learning_rate": 2.9361398501605425e-05, + "loss": 0.2837, + "step": 3292 + }, + { + "epoch": 0.05873434880319623, + "grad_norm": 0.48869359493255615, + "learning_rate": 2.9370317516946134e-05, + "loss": 0.2937, + "step": 3293 + }, + { + "epoch": 0.058752184924909925, + "grad_norm": 0.53365558385849, + "learning_rate": 2.9379236532286837e-05, + "loss": 0.2619, + "step": 3294 + }, + { + "epoch": 0.05877002104662362, + "grad_norm": 0.3252737522125244, + "learning_rate": 2.9388155547627544e-05, + "loss": 0.2834, + "step": 3295 + }, + { + "epoch": 0.058787857168337315, + "grad_norm": 0.38567742705345154, + "learning_rate": 2.9397074562968254e-05, + "loss": 0.2501, + "step": 3296 + }, + { + "epoch": 0.05880569329005101, + "grad_norm": 0.5182756781578064, + "learning_rate": 2.9405993578308953e-05, + "loss": 0.2895, + "step": 3297 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 0.4358648359775543, + "learning_rate": 2.9414912593649663e-05, + "loss": 0.2808, + "step": 3298 + }, + { + "epoch": 0.0588413655334784, + "grad_norm": 0.4711143672466278, + "learning_rate": 2.942383160899037e-05, + "loss": 0.2937, + "step": 3299 + }, + { + "epoch": 0.058859201655192095, + "grad_norm": 0.34037455916404724, + "learning_rate": 2.9432750624331072e-05, + "loss": 0.2976, + "step": 3300 + }, + { + "epoch": 0.05887703777690579, + "grad_norm": 0.35021427273750305, + "learning_rate": 2.9441669639671782e-05, + "loss": 0.3063, + "step": 3301 + }, + { + "epoch": 0.058894873898619485, + "grad_norm": 0.2765119671821594, + "learning_rate": 2.945058865501249e-05, + "loss": 0.2473, + "step": 3302 + }, + { + "epoch": 0.05891271002033318, + "grad_norm": 0.35572266578674316, + "learning_rate": 2.945950767035319e-05, + "loss": 0.3039, + "step": 3303 + }, + { + "epoch": 0.058930546142046875, + "grad_norm": 0.5072250962257385, + "learning_rate": 2.94684266856939e-05, + "loss": 0.2853, + "step": 3304 + }, + { + "epoch": 0.05894838226376057, + "grad_norm": 0.45801055431365967, + "learning_rate": 2.9477345701034608e-05, + "loss": 0.299, + "step": 3305 + }, + { + "epoch": 0.058966218385474264, + "grad_norm": 0.44712984561920166, + "learning_rate": 2.948626471637531e-05, + "loss": 0.2835, + "step": 3306 + }, + { + "epoch": 0.05898405450718796, + "grad_norm": 0.5450993180274963, + "learning_rate": 2.949518373171602e-05, + "loss": 0.2719, + "step": 3307 + }, + { + "epoch": 0.059001890628901654, + "grad_norm": 0.48879894614219666, + "learning_rate": 2.9504102747056727e-05, + "loss": 0.2843, + "step": 3308 + }, + { + "epoch": 0.05901972675061535, + "grad_norm": 0.6736392974853516, + "learning_rate": 2.951302176239743e-05, + "loss": 0.3514, + "step": 3309 + }, + { + "epoch": 0.059037562872329044, + "grad_norm": 0.422929584980011, + "learning_rate": 2.952194077773814e-05, + "loss": 0.3429, + "step": 3310 + }, + { + "epoch": 0.05905539899404273, + "grad_norm": 0.3613201975822449, + "learning_rate": 2.9530859793078846e-05, + "loss": 0.2876, + "step": 3311 + }, + { + "epoch": 0.05907323511575643, + "grad_norm": 0.4560771584510803, + "learning_rate": 2.953977880841955e-05, + "loss": 0.3221, + "step": 3312 + }, + { + "epoch": 0.05909107123747012, + "grad_norm": 0.5454785227775574, + "learning_rate": 2.954869782376026e-05, + "loss": 0.3115, + "step": 3313 + }, + { + "epoch": 0.05910890735918382, + "grad_norm": 0.3850862681865692, + "learning_rate": 2.9557616839100965e-05, + "loss": 0.298, + "step": 3314 + }, + { + "epoch": 0.05912674348089751, + "grad_norm": 0.3268927335739136, + "learning_rate": 2.9566535854441675e-05, + "loss": 0.2836, + "step": 3315 + }, + { + "epoch": 0.05914457960261121, + "grad_norm": 0.4352155923843384, + "learning_rate": 2.9575454869782378e-05, + "loss": 0.2801, + "step": 3316 + }, + { + "epoch": 0.0591624157243249, + "grad_norm": 0.4850205183029175, + "learning_rate": 2.9584373885123084e-05, + "loss": 0.2127, + "step": 3317 + }, + { + "epoch": 0.059180251846038597, + "grad_norm": 0.3261580467224121, + "learning_rate": 2.9593292900463794e-05, + "loss": 0.276, + "step": 3318 + }, + { + "epoch": 0.05919808796775229, + "grad_norm": 0.285980761051178, + "learning_rate": 2.9602211915804497e-05, + "loss": 0.2693, + "step": 3319 + }, + { + "epoch": 0.059215924089465986, + "grad_norm": 0.3186729848384857, + "learning_rate": 2.9611130931145203e-05, + "loss": 0.2809, + "step": 3320 + }, + { + "epoch": 0.05923376021117968, + "grad_norm": 0.4164327383041382, + "learning_rate": 2.9620049946485913e-05, + "loss": 0.2969, + "step": 3321 + }, + { + "epoch": 0.059251596332893376, + "grad_norm": 0.32970285415649414, + "learning_rate": 2.9628968961826613e-05, + "loss": 0.2078, + "step": 3322 + }, + { + "epoch": 0.05926943245460707, + "grad_norm": 0.4292951822280884, + "learning_rate": 2.9637887977167322e-05, + "loss": 0.3259, + "step": 3323 + }, + { + "epoch": 0.059287268576320766, + "grad_norm": 0.4053258001804352, + "learning_rate": 2.9646806992508032e-05, + "loss": 0.2574, + "step": 3324 + }, + { + "epoch": 0.05930510469803446, + "grad_norm": 0.37580397725105286, + "learning_rate": 2.9655726007848732e-05, + "loss": 0.2447, + "step": 3325 + }, + { + "epoch": 0.059322940819748156, + "grad_norm": 0.35199934244155884, + "learning_rate": 2.966464502318944e-05, + "loss": 0.2659, + "step": 3326 + }, + { + "epoch": 0.05934077694146185, + "grad_norm": 0.2869519889354706, + "learning_rate": 2.9673564038530148e-05, + "loss": 0.2857, + "step": 3327 + }, + { + "epoch": 0.059358613063175546, + "grad_norm": 0.621752917766571, + "learning_rate": 2.968248305387085e-05, + "loss": 0.3008, + "step": 3328 + }, + { + "epoch": 0.05937644918488924, + "grad_norm": 0.3756980299949646, + "learning_rate": 2.969140206921156e-05, + "loss": 0.2663, + "step": 3329 + }, + { + "epoch": 0.059394285306602936, + "grad_norm": 0.3002112805843353, + "learning_rate": 2.9700321084552267e-05, + "loss": 0.2411, + "step": 3330 + }, + { + "epoch": 0.059412121428316624, + "grad_norm": 0.40645831823349, + "learning_rate": 2.970924009989297e-05, + "loss": 0.2975, + "step": 3331 + }, + { + "epoch": 0.05942995755003032, + "grad_norm": 0.42232415080070496, + "learning_rate": 2.971815911523368e-05, + "loss": 0.2972, + "step": 3332 + }, + { + "epoch": 0.05944779367174401, + "grad_norm": 0.3313126564025879, + "learning_rate": 2.9727078130574386e-05, + "loss": 0.2475, + "step": 3333 + }, + { + "epoch": 0.05946562979345771, + "grad_norm": 0.43232089281082153, + "learning_rate": 2.973599714591509e-05, + "loss": 0.2695, + "step": 3334 + }, + { + "epoch": 0.0594834659151714, + "grad_norm": 0.5244986414909363, + "learning_rate": 2.97449161612558e-05, + "loss": 0.3476, + "step": 3335 + }, + { + "epoch": 0.0595013020368851, + "grad_norm": 0.5530703067779541, + "learning_rate": 2.9753835176596505e-05, + "loss": 0.4008, + "step": 3336 + }, + { + "epoch": 0.05951913815859879, + "grad_norm": 0.5310500860214233, + "learning_rate": 2.976275419193721e-05, + "loss": 0.3504, + "step": 3337 + }, + { + "epoch": 0.05953697428031249, + "grad_norm": 0.49514028429985046, + "learning_rate": 2.9771673207277918e-05, + "loss": 0.2656, + "step": 3338 + }, + { + "epoch": 0.05955481040202618, + "grad_norm": 0.49703454971313477, + "learning_rate": 2.9780592222618624e-05, + "loss": 0.2958, + "step": 3339 + }, + { + "epoch": 0.05957264652373988, + "grad_norm": 0.4270615577697754, + "learning_rate": 2.9789511237959334e-05, + "loss": 0.2962, + "step": 3340 + }, + { + "epoch": 0.05959048264545357, + "grad_norm": 0.36220037937164307, + "learning_rate": 2.9798430253300037e-05, + "loss": 0.2802, + "step": 3341 + }, + { + "epoch": 0.05960831876716727, + "grad_norm": 0.39774414896965027, + "learning_rate": 2.9807349268640744e-05, + "loss": 0.2688, + "step": 3342 + }, + { + "epoch": 0.05962615488888096, + "grad_norm": 0.41961362957954407, + "learning_rate": 2.9816268283981453e-05, + "loss": 0.3242, + "step": 3343 + }, + { + "epoch": 0.05964399101059466, + "grad_norm": 0.3730867803096771, + "learning_rate": 2.9825187299322156e-05, + "loss": 0.3207, + "step": 3344 + }, + { + "epoch": 0.05966182713230835, + "grad_norm": 0.3605899512767792, + "learning_rate": 2.9834106314662863e-05, + "loss": 0.2675, + "step": 3345 + }, + { + "epoch": 0.05967966325402205, + "grad_norm": 0.44611263275146484, + "learning_rate": 2.9843025330003573e-05, + "loss": 0.28, + "step": 3346 + }, + { + "epoch": 0.05969749937573574, + "grad_norm": 0.3818523585796356, + "learning_rate": 2.9851944345344272e-05, + "loss": 0.303, + "step": 3347 + }, + { + "epoch": 0.05971533549744944, + "grad_norm": 0.46706482768058777, + "learning_rate": 2.9860863360684982e-05, + "loss": 0.3455, + "step": 3348 + }, + { + "epoch": 0.05973317161916313, + "grad_norm": 0.3815406858921051, + "learning_rate": 2.986978237602569e-05, + "loss": 0.2768, + "step": 3349 + }, + { + "epoch": 0.05975100774087683, + "grad_norm": 0.5149982571601868, + "learning_rate": 2.987870139136639e-05, + "loss": 0.3037, + "step": 3350 + }, + { + "epoch": 0.059768843862590515, + "grad_norm": 0.5229239463806152, + "learning_rate": 2.98876204067071e-05, + "loss": 0.2862, + "step": 3351 + }, + { + "epoch": 0.05978667998430421, + "grad_norm": 0.4386540651321411, + "learning_rate": 2.9896539422047807e-05, + "loss": 0.2427, + "step": 3352 + }, + { + "epoch": 0.059804516106017905, + "grad_norm": 0.7024639248847961, + "learning_rate": 2.990545843738851e-05, + "loss": 0.2877, + "step": 3353 + }, + { + "epoch": 0.0598223522277316, + "grad_norm": 0.47102904319763184, + "learning_rate": 2.991437745272922e-05, + "loss": 0.2814, + "step": 3354 + }, + { + "epoch": 0.059840188349445295, + "grad_norm": 0.39220118522644043, + "learning_rate": 2.9923296468069927e-05, + "loss": 0.3083, + "step": 3355 + }, + { + "epoch": 0.05985802447115899, + "grad_norm": 0.3915637135505676, + "learning_rate": 2.993221548341063e-05, + "loss": 0.2863, + "step": 3356 + }, + { + "epoch": 0.059875860592872684, + "grad_norm": 0.35335150361061096, + "learning_rate": 2.994113449875134e-05, + "loss": 0.2521, + "step": 3357 + }, + { + "epoch": 0.05989369671458638, + "grad_norm": 0.44048142433166504, + "learning_rate": 2.9950053514092046e-05, + "loss": 0.2733, + "step": 3358 + }, + { + "epoch": 0.059911532836300074, + "grad_norm": 0.3673648238182068, + "learning_rate": 2.995897252943275e-05, + "loss": 0.212, + "step": 3359 + }, + { + "epoch": 0.05992936895801377, + "grad_norm": 0.46625861525535583, + "learning_rate": 2.996789154477346e-05, + "loss": 0.2779, + "step": 3360 + }, + { + "epoch": 0.059947205079727464, + "grad_norm": 0.39080703258514404, + "learning_rate": 2.9976810560114165e-05, + "loss": 0.3021, + "step": 3361 + }, + { + "epoch": 0.05996504120144116, + "grad_norm": 0.5033069849014282, + "learning_rate": 2.9985729575454875e-05, + "loss": 0.2822, + "step": 3362 + }, + { + "epoch": 0.059982877323154854, + "grad_norm": 0.455512672662735, + "learning_rate": 2.9994648590795578e-05, + "loss": 0.3281, + "step": 3363 + }, + { + "epoch": 0.06000071344486855, + "grad_norm": 0.3449738323688507, + "learning_rate": 3.0003567606136284e-05, + "loss": 0.2781, + "step": 3364 + }, + { + "epoch": 0.060018549566582244, + "grad_norm": 0.37134668231010437, + "learning_rate": 3.0012486621476994e-05, + "loss": 0.2746, + "step": 3365 + }, + { + "epoch": 0.06003638568829594, + "grad_norm": 0.5062718391418457, + "learning_rate": 3.0021405636817697e-05, + "loss": 0.2992, + "step": 3366 + }, + { + "epoch": 0.060054221810009634, + "grad_norm": 0.4795890748500824, + "learning_rate": 3.0030324652158403e-05, + "loss": 0.2962, + "step": 3367 + }, + { + "epoch": 0.06007205793172333, + "grad_norm": 1.082323670387268, + "learning_rate": 3.0039243667499113e-05, + "loss": 0.3115, + "step": 3368 + }, + { + "epoch": 0.060089894053437024, + "grad_norm": 0.4421440362930298, + "learning_rate": 3.0048162682839816e-05, + "loss": 0.2604, + "step": 3369 + }, + { + "epoch": 0.06010773017515072, + "grad_norm": 0.3249231278896332, + "learning_rate": 3.0057081698180522e-05, + "loss": 0.2524, + "step": 3370 + }, + { + "epoch": 0.060125566296864406, + "grad_norm": 0.4153946340084076, + "learning_rate": 3.0066000713521232e-05, + "loss": 0.3268, + "step": 3371 + }, + { + "epoch": 0.0601434024185781, + "grad_norm": 0.35149988532066345, + "learning_rate": 3.007491972886193e-05, + "loss": 0.2984, + "step": 3372 + }, + { + "epoch": 0.060161238540291796, + "grad_norm": 0.4687574803829193, + "learning_rate": 3.008383874420264e-05, + "loss": 0.3355, + "step": 3373 + }, + { + "epoch": 0.06017907466200549, + "grad_norm": 0.4659954309463501, + "learning_rate": 3.009275775954335e-05, + "loss": 0.2917, + "step": 3374 + }, + { + "epoch": 0.060196910783719186, + "grad_norm": 0.461733341217041, + "learning_rate": 3.010167677488405e-05, + "loss": 0.2683, + "step": 3375 + }, + { + "epoch": 0.06021474690543288, + "grad_norm": 0.44231653213500977, + "learning_rate": 3.011059579022476e-05, + "loss": 0.2721, + "step": 3376 + }, + { + "epoch": 0.060232583027146576, + "grad_norm": 0.38076111674308777, + "learning_rate": 3.0119514805565467e-05, + "loss": 0.26, + "step": 3377 + }, + { + "epoch": 0.06025041914886027, + "grad_norm": 0.32276830077171326, + "learning_rate": 3.012843382090617e-05, + "loss": 0.2661, + "step": 3378 + }, + { + "epoch": 0.060268255270573966, + "grad_norm": 0.40255284309387207, + "learning_rate": 3.013735283624688e-05, + "loss": 0.3408, + "step": 3379 + }, + { + "epoch": 0.06028609139228766, + "grad_norm": 0.34506767988204956, + "learning_rate": 3.0146271851587586e-05, + "loss": 0.2859, + "step": 3380 + }, + { + "epoch": 0.060303927514001356, + "grad_norm": 0.4300769865512848, + "learning_rate": 3.015519086692829e-05, + "loss": 0.2825, + "step": 3381 + }, + { + "epoch": 0.06032176363571505, + "grad_norm": 0.41303813457489014, + "learning_rate": 3.0164109882269e-05, + "loss": 0.325, + "step": 3382 + }, + { + "epoch": 0.060339599757428745, + "grad_norm": 0.33736518025398254, + "learning_rate": 3.0173028897609705e-05, + "loss": 0.2657, + "step": 3383 + }, + { + "epoch": 0.06035743587914244, + "grad_norm": 0.46587398648262024, + "learning_rate": 3.0181947912950408e-05, + "loss": 0.3808, + "step": 3384 + }, + { + "epoch": 0.060375272000856135, + "grad_norm": 0.3951577842235565, + "learning_rate": 3.0190866928291118e-05, + "loss": 0.258, + "step": 3385 + }, + { + "epoch": 0.06039310812256983, + "grad_norm": 0.4404039680957794, + "learning_rate": 3.0199785943631824e-05, + "loss": 0.2642, + "step": 3386 + }, + { + "epoch": 0.060410944244283525, + "grad_norm": 0.42050695419311523, + "learning_rate": 3.0208704958972534e-05, + "loss": 0.2364, + "step": 3387 + }, + { + "epoch": 0.06042878036599722, + "grad_norm": 0.5026699304580688, + "learning_rate": 3.0217623974313237e-05, + "loss": 0.2618, + "step": 3388 + }, + { + "epoch": 0.060446616487710915, + "grad_norm": 0.3759438097476959, + "learning_rate": 3.0226542989653943e-05, + "loss": 0.3204, + "step": 3389 + }, + { + "epoch": 0.06046445260942461, + "grad_norm": 0.34291404485702515, + "learning_rate": 3.0235462004994653e-05, + "loss": 0.2917, + "step": 3390 + }, + { + "epoch": 0.0604822887311383, + "grad_norm": 0.4279941916465759, + "learning_rate": 3.0244381020335356e-05, + "loss": 0.3, + "step": 3391 + }, + { + "epoch": 0.06050012485285199, + "grad_norm": 0.5196157693862915, + "learning_rate": 3.0253300035676063e-05, + "loss": 0.3284, + "step": 3392 + }, + { + "epoch": 0.06051796097456569, + "grad_norm": 0.4175237715244293, + "learning_rate": 3.0262219051016772e-05, + "loss": 0.3099, + "step": 3393 + }, + { + "epoch": 0.06053579709627938, + "grad_norm": 0.5252898931503296, + "learning_rate": 3.0271138066357475e-05, + "loss": 0.3187, + "step": 3394 + }, + { + "epoch": 0.06055363321799308, + "grad_norm": 0.3560069799423218, + "learning_rate": 3.0280057081698182e-05, + "loss": 0.2334, + "step": 3395 + }, + { + "epoch": 0.06057146933970677, + "grad_norm": 0.3487046957015991, + "learning_rate": 3.028897609703889e-05, + "loss": 0.2925, + "step": 3396 + }, + { + "epoch": 0.06058930546142047, + "grad_norm": 0.35152021050453186, + "learning_rate": 3.0297895112379595e-05, + "loss": 0.3076, + "step": 3397 + }, + { + "epoch": 0.06060714158313416, + "grad_norm": 0.3704475462436676, + "learning_rate": 3.03068141277203e-05, + "loss": 0.281, + "step": 3398 + }, + { + "epoch": 0.06062497770484786, + "grad_norm": 0.3680011034011841, + "learning_rate": 3.031573314306101e-05, + "loss": 0.3198, + "step": 3399 + }, + { + "epoch": 0.06064281382656155, + "grad_norm": 0.30677318572998047, + "learning_rate": 3.032465215840171e-05, + "loss": 0.2769, + "step": 3400 + }, + { + "epoch": 0.06066064994827525, + "grad_norm": 0.3701297342777252, + "learning_rate": 3.033357117374242e-05, + "loss": 0.3025, + "step": 3401 + }, + { + "epoch": 0.06067848606998894, + "grad_norm": 0.44856762886047363, + "learning_rate": 3.0342490189083126e-05, + "loss": 0.2882, + "step": 3402 + }, + { + "epoch": 0.06069632219170264, + "grad_norm": 0.4001961350440979, + "learning_rate": 3.035140920442383e-05, + "loss": 0.3045, + "step": 3403 + }, + { + "epoch": 0.06071415831341633, + "grad_norm": 0.35330164432525635, + "learning_rate": 3.036032821976454e-05, + "loss": 0.278, + "step": 3404 + }, + { + "epoch": 0.06073199443513003, + "grad_norm": 0.47330203652381897, + "learning_rate": 3.0369247235105246e-05, + "loss": 0.2694, + "step": 3405 + }, + { + "epoch": 0.06074983055684372, + "grad_norm": 0.5003327131271362, + "learning_rate": 3.037816625044595e-05, + "loss": 0.3379, + "step": 3406 + }, + { + "epoch": 0.06076766667855742, + "grad_norm": 0.3417317867279053, + "learning_rate": 3.038708526578666e-05, + "loss": 0.274, + "step": 3407 + }, + { + "epoch": 0.06078550280027111, + "grad_norm": 0.42910662293434143, + "learning_rate": 3.0396004281127365e-05, + "loss": 0.2872, + "step": 3408 + }, + { + "epoch": 0.060803338921984806, + "grad_norm": 0.5343186855316162, + "learning_rate": 3.0404923296468074e-05, + "loss": 0.2583, + "step": 3409 + }, + { + "epoch": 0.0608211750436985, + "grad_norm": 0.30897626280784607, + "learning_rate": 3.0413842311808777e-05, + "loss": 0.2667, + "step": 3410 + }, + { + "epoch": 0.060839011165412196, + "grad_norm": 0.43187278509140015, + "learning_rate": 3.0422761327149484e-05, + "loss": 0.3362, + "step": 3411 + }, + { + "epoch": 0.060856847287125884, + "grad_norm": 0.48747536540031433, + "learning_rate": 3.0431680342490194e-05, + "loss": 0.3141, + "step": 3412 + }, + { + "epoch": 0.06087468340883958, + "grad_norm": 0.4179791808128357, + "learning_rate": 3.0440599357830897e-05, + "loss": 0.2906, + "step": 3413 + }, + { + "epoch": 0.060892519530553274, + "grad_norm": 0.4218780994415283, + "learning_rate": 3.0449518373171603e-05, + "loss": 0.285, + "step": 3414 + }, + { + "epoch": 0.06091035565226697, + "grad_norm": 0.4083127975463867, + "learning_rate": 3.0458437388512313e-05, + "loss": 0.2415, + "step": 3415 + }, + { + "epoch": 0.060928191773980664, + "grad_norm": 0.42035984992980957, + "learning_rate": 3.0467356403853016e-05, + "loss": 0.3514, + "step": 3416 + }, + { + "epoch": 0.06094602789569436, + "grad_norm": 0.520683765411377, + "learning_rate": 3.0476275419193722e-05, + "loss": 0.3256, + "step": 3417 + }, + { + "epoch": 0.060963864017408054, + "grad_norm": 0.310117244720459, + "learning_rate": 3.0485194434534432e-05, + "loss": 0.256, + "step": 3418 + }, + { + "epoch": 0.06098170013912175, + "grad_norm": 0.4333760738372803, + "learning_rate": 3.0494113449875135e-05, + "loss": 0.2229, + "step": 3419 + }, + { + "epoch": 0.060999536260835444, + "grad_norm": 0.32307708263397217, + "learning_rate": 3.050303246521584e-05, + "loss": 0.2846, + "step": 3420 + }, + { + "epoch": 0.06101737238254914, + "grad_norm": 0.7281751036643982, + "learning_rate": 3.051195148055655e-05, + "loss": 0.2683, + "step": 3421 + }, + { + "epoch": 0.06103520850426283, + "grad_norm": 0.2901424765586853, + "learning_rate": 3.0520870495897254e-05, + "loss": 0.2593, + "step": 3422 + }, + { + "epoch": 0.06105304462597653, + "grad_norm": 0.6676464080810547, + "learning_rate": 3.052978951123796e-05, + "loss": 0.2919, + "step": 3423 + }, + { + "epoch": 0.06107088074769022, + "grad_norm": 0.5041801333427429, + "learning_rate": 3.053870852657867e-05, + "loss": 0.2544, + "step": 3424 + }, + { + "epoch": 0.06108871686940392, + "grad_norm": 0.5568588376045227, + "learning_rate": 3.054762754191937e-05, + "loss": 0.2998, + "step": 3425 + }, + { + "epoch": 0.06110655299111761, + "grad_norm": 0.4442998170852661, + "learning_rate": 3.055654655726008e-05, + "loss": 0.2591, + "step": 3426 + }, + { + "epoch": 0.06112438911283131, + "grad_norm": 0.5679812431335449, + "learning_rate": 3.0565465572600786e-05, + "loss": 0.3145, + "step": 3427 + }, + { + "epoch": 0.061142225234545, + "grad_norm": 0.4436073303222656, + "learning_rate": 3.057438458794149e-05, + "loss": 0.2811, + "step": 3428 + }, + { + "epoch": 0.0611600613562587, + "grad_norm": 0.2981335520744324, + "learning_rate": 3.05833036032822e-05, + "loss": 0.2326, + "step": 3429 + }, + { + "epoch": 0.06117789747797239, + "grad_norm": 0.3030025064945221, + "learning_rate": 3.0592222618622905e-05, + "loss": 0.2513, + "step": 3430 + }, + { + "epoch": 0.06119573359968609, + "grad_norm": 0.36721524596214294, + "learning_rate": 3.060114163396361e-05, + "loss": 0.2662, + "step": 3431 + }, + { + "epoch": 0.061213569721399776, + "grad_norm": 0.43514400720596313, + "learning_rate": 3.061006064930432e-05, + "loss": 0.2615, + "step": 3432 + }, + { + "epoch": 0.06123140584311347, + "grad_norm": 0.4660835266113281, + "learning_rate": 3.0618979664645024e-05, + "loss": 0.2659, + "step": 3433 + }, + { + "epoch": 0.061249241964827165, + "grad_norm": 0.3673686683177948, + "learning_rate": 3.062789867998573e-05, + "loss": 0.259, + "step": 3434 + }, + { + "epoch": 0.06126707808654086, + "grad_norm": 0.37457528710365295, + "learning_rate": 3.063681769532644e-05, + "loss": 0.2998, + "step": 3435 + }, + { + "epoch": 0.061284914208254555, + "grad_norm": 0.37120434641838074, + "learning_rate": 3.064573671066714e-05, + "loss": 0.3192, + "step": 3436 + }, + { + "epoch": 0.06130275032996825, + "grad_norm": 0.3894599378108978, + "learning_rate": 3.065465572600785e-05, + "loss": 0.2815, + "step": 3437 + }, + { + "epoch": 0.061320586451681945, + "grad_norm": 0.4034176766872406, + "learning_rate": 3.0663574741348556e-05, + "loss": 0.3205, + "step": 3438 + }, + { + "epoch": 0.06133842257339564, + "grad_norm": 0.39361411333084106, + "learning_rate": 3.067249375668926e-05, + "loss": 0.2544, + "step": 3439 + }, + { + "epoch": 0.061356258695109335, + "grad_norm": 0.37965863943099976, + "learning_rate": 3.068141277202997e-05, + "loss": 0.2889, + "step": 3440 + }, + { + "epoch": 0.06137409481682303, + "grad_norm": 0.5079521536827087, + "learning_rate": 3.0690331787370675e-05, + "loss": 0.2821, + "step": 3441 + }, + { + "epoch": 0.061391930938536725, + "grad_norm": 0.3491654396057129, + "learning_rate": 3.069925080271138e-05, + "loss": 0.2874, + "step": 3442 + }, + { + "epoch": 0.06140976706025042, + "grad_norm": 0.5786492824554443, + "learning_rate": 3.070816981805209e-05, + "loss": 0.3637, + "step": 3443 + }, + { + "epoch": 0.061427603181964115, + "grad_norm": 0.3809356689453125, + "learning_rate": 3.0717088833392794e-05, + "loss": 0.2616, + "step": 3444 + }, + { + "epoch": 0.06144543930367781, + "grad_norm": 0.2865469753742218, + "learning_rate": 3.07260078487335e-05, + "loss": 0.2462, + "step": 3445 + }, + { + "epoch": 0.061463275425391505, + "grad_norm": 0.2671143710613251, + "learning_rate": 3.073492686407421e-05, + "loss": 0.2725, + "step": 3446 + }, + { + "epoch": 0.0614811115471052, + "grad_norm": 0.38948559761047363, + "learning_rate": 3.0743845879414914e-05, + "loss": 0.3297, + "step": 3447 + }, + { + "epoch": 0.061498947668818894, + "grad_norm": 0.4456574618816376, + "learning_rate": 3.075276489475562e-05, + "loss": 0.3657, + "step": 3448 + }, + { + "epoch": 0.06151678379053259, + "grad_norm": 0.29803067445755005, + "learning_rate": 3.0761683910096326e-05, + "loss": 0.2696, + "step": 3449 + }, + { + "epoch": 0.061534619912246284, + "grad_norm": 0.33508118987083435, + "learning_rate": 3.077060292543703e-05, + "loss": 0.291, + "step": 3450 + }, + { + "epoch": 0.06155245603395998, + "grad_norm": 0.5954883098602295, + "learning_rate": 3.077952194077774e-05, + "loss": 0.3106, + "step": 3451 + }, + { + "epoch": 0.06157029215567367, + "grad_norm": 0.4862188994884491, + "learning_rate": 3.0788440956118445e-05, + "loss": 0.2664, + "step": 3452 + }, + { + "epoch": 0.06158812827738736, + "grad_norm": 0.347736656665802, + "learning_rate": 3.079735997145915e-05, + "loss": 0.3012, + "step": 3453 + }, + { + "epoch": 0.06160596439910106, + "grad_norm": 0.4035290777683258, + "learning_rate": 3.080627898679986e-05, + "loss": 0.2769, + "step": 3454 + }, + { + "epoch": 0.06162380052081475, + "grad_norm": 0.4547941982746124, + "learning_rate": 3.0815198002140565e-05, + "loss": 0.327, + "step": 3455 + }, + { + "epoch": 0.06164163664252845, + "grad_norm": 0.34405431151390076, + "learning_rate": 3.082411701748127e-05, + "loss": 0.3032, + "step": 3456 + }, + { + "epoch": 0.06165947276424214, + "grad_norm": 0.36265790462493896, + "learning_rate": 3.083303603282198e-05, + "loss": 0.277, + "step": 3457 + }, + { + "epoch": 0.06167730888595584, + "grad_norm": 0.41423872113227844, + "learning_rate": 3.0841955048162684e-05, + "loss": 0.3509, + "step": 3458 + }, + { + "epoch": 0.06169514500766953, + "grad_norm": 0.3635760247707367, + "learning_rate": 3.085087406350339e-05, + "loss": 0.1995, + "step": 3459 + }, + { + "epoch": 0.061712981129383226, + "grad_norm": 0.5014716982841492, + "learning_rate": 3.0859793078844097e-05, + "loss": 0.2987, + "step": 3460 + }, + { + "epoch": 0.06173081725109692, + "grad_norm": 0.3897286653518677, + "learning_rate": 3.08687120941848e-05, + "loss": 0.307, + "step": 3461 + }, + { + "epoch": 0.061748653372810616, + "grad_norm": 0.3641478717327118, + "learning_rate": 3.087763110952551e-05, + "loss": 0.3255, + "step": 3462 + }, + { + "epoch": 0.06176648949452431, + "grad_norm": 0.32254454493522644, + "learning_rate": 3.0886550124866216e-05, + "loss": 0.2529, + "step": 3463 + }, + { + "epoch": 0.061784325616238006, + "grad_norm": 0.44392848014831543, + "learning_rate": 3.089546914020692e-05, + "loss": 0.3061, + "step": 3464 + }, + { + "epoch": 0.0618021617379517, + "grad_norm": 0.39981019496917725, + "learning_rate": 3.090438815554763e-05, + "loss": 0.2907, + "step": 3465 + }, + { + "epoch": 0.061819997859665396, + "grad_norm": 0.43697279691696167, + "learning_rate": 3.0913307170888335e-05, + "loss": 0.2752, + "step": 3466 + }, + { + "epoch": 0.06183783398137909, + "grad_norm": 0.33411675691604614, + "learning_rate": 3.092222618622904e-05, + "loss": 0.2644, + "step": 3467 + }, + { + "epoch": 0.061855670103092786, + "grad_norm": 0.3456276059150696, + "learning_rate": 3.093114520156975e-05, + "loss": 0.2696, + "step": 3468 + }, + { + "epoch": 0.06187350622480648, + "grad_norm": 0.38292551040649414, + "learning_rate": 3.0940064216910454e-05, + "loss": 0.2888, + "step": 3469 + }, + { + "epoch": 0.061891342346520176, + "grad_norm": 0.5598361492156982, + "learning_rate": 3.094898323225116e-05, + "loss": 0.33, + "step": 3470 + }, + { + "epoch": 0.06190917846823387, + "grad_norm": 0.3957209885120392, + "learning_rate": 3.095790224759187e-05, + "loss": 0.2901, + "step": 3471 + }, + { + "epoch": 0.06192701458994756, + "grad_norm": 0.2978615164756775, + "learning_rate": 3.096682126293257e-05, + "loss": 0.2637, + "step": 3472 + }, + { + "epoch": 0.06194485071166125, + "grad_norm": 0.29691120982170105, + "learning_rate": 3.097574027827328e-05, + "loss": 0.2315, + "step": 3473 + }, + { + "epoch": 0.06196268683337495, + "grad_norm": 0.32011178135871887, + "learning_rate": 3.0984659293613986e-05, + "loss": 0.2801, + "step": 3474 + }, + { + "epoch": 0.06198052295508864, + "grad_norm": 0.3960849344730377, + "learning_rate": 3.099357830895469e-05, + "loss": 0.2955, + "step": 3475 + }, + { + "epoch": 0.06199835907680234, + "grad_norm": 0.42267149686813354, + "learning_rate": 3.10024973242954e-05, + "loss": 0.2892, + "step": 3476 + }, + { + "epoch": 0.06201619519851603, + "grad_norm": 0.4643610715866089, + "learning_rate": 3.1011416339636105e-05, + "loss": 0.2749, + "step": 3477 + }, + { + "epoch": 0.06203403132022973, + "grad_norm": 0.3296380043029785, + "learning_rate": 3.102033535497681e-05, + "loss": 0.283, + "step": 3478 + }, + { + "epoch": 0.06205186744194342, + "grad_norm": 0.43161651492118835, + "learning_rate": 3.102925437031752e-05, + "loss": 0.3162, + "step": 3479 + }, + { + "epoch": 0.06206970356365712, + "grad_norm": 0.31484830379486084, + "learning_rate": 3.1038173385658224e-05, + "loss": 0.289, + "step": 3480 + }, + { + "epoch": 0.06208753968537081, + "grad_norm": 0.29794466495513916, + "learning_rate": 3.104709240099894e-05, + "loss": 0.2864, + "step": 3481 + }, + { + "epoch": 0.06210537580708451, + "grad_norm": 0.31414487957954407, + "learning_rate": 3.105601141633964e-05, + "loss": 0.2573, + "step": 3482 + }, + { + "epoch": 0.0621232119287982, + "grad_norm": 0.40964171290397644, + "learning_rate": 3.106493043168034e-05, + "loss": 0.3083, + "step": 3483 + }, + { + "epoch": 0.0621410480505119, + "grad_norm": 0.5551015138626099, + "learning_rate": 3.107384944702105e-05, + "loss": 0.3108, + "step": 3484 + }, + { + "epoch": 0.06215888417222559, + "grad_norm": 0.43493130803108215, + "learning_rate": 3.1082768462361756e-05, + "loss": 0.2971, + "step": 3485 + }, + { + "epoch": 0.06217672029393929, + "grad_norm": 0.5982611179351807, + "learning_rate": 3.109168747770246e-05, + "loss": 0.3517, + "step": 3486 + }, + { + "epoch": 0.06219455641565298, + "grad_norm": 0.45484572649002075, + "learning_rate": 3.110060649304317e-05, + "loss": 0.3272, + "step": 3487 + }, + { + "epoch": 0.06221239253736668, + "grad_norm": 0.3246980607509613, + "learning_rate": 3.1109525508383875e-05, + "loss": 0.2528, + "step": 3488 + }, + { + "epoch": 0.06223022865908037, + "grad_norm": 0.2784026563167572, + "learning_rate": 3.111844452372458e-05, + "loss": 0.2535, + "step": 3489 + }, + { + "epoch": 0.06224806478079407, + "grad_norm": 0.3554953336715698, + "learning_rate": 3.112736353906529e-05, + "loss": 0.2815, + "step": 3490 + }, + { + "epoch": 0.06226590090250776, + "grad_norm": 0.3764471113681793, + "learning_rate": 3.1136282554405994e-05, + "loss": 0.2925, + "step": 3491 + }, + { + "epoch": 0.06228373702422145, + "grad_norm": 0.3088894188404083, + "learning_rate": 3.11452015697467e-05, + "loss": 0.2624, + "step": 3492 + }, + { + "epoch": 0.062301573145935145, + "grad_norm": 0.347292959690094, + "learning_rate": 3.115412058508741e-05, + "loss": 0.251, + "step": 3493 + }, + { + "epoch": 0.06231940926764884, + "grad_norm": 0.26496273279190063, + "learning_rate": 3.1163039600428113e-05, + "loss": 0.2185, + "step": 3494 + }, + { + "epoch": 0.062337245389362535, + "grad_norm": 0.3232201933860779, + "learning_rate": 3.117195861576882e-05, + "loss": 0.2905, + "step": 3495 + }, + { + "epoch": 0.06235508151107623, + "grad_norm": 0.3585107922554016, + "learning_rate": 3.1180877631109526e-05, + "loss": 0.2946, + "step": 3496 + }, + { + "epoch": 0.062372917632789925, + "grad_norm": 0.31758782267570496, + "learning_rate": 3.118979664645023e-05, + "loss": 0.2713, + "step": 3497 + }, + { + "epoch": 0.06239075375450362, + "grad_norm": 0.33788156509399414, + "learning_rate": 3.119871566179094e-05, + "loss": 0.2883, + "step": 3498 + }, + { + "epoch": 0.062408589876217314, + "grad_norm": 0.699823260307312, + "learning_rate": 3.1207634677131645e-05, + "loss": 0.2454, + "step": 3499 + }, + { + "epoch": 0.06242642599793101, + "grad_norm": 0.37101513147354126, + "learning_rate": 3.121655369247235e-05, + "loss": 0.3186, + "step": 3500 + }, + { + "epoch": 0.062444262119644704, + "grad_norm": 0.3172360360622406, + "learning_rate": 3.122547270781306e-05, + "loss": 0.3276, + "step": 3501 + }, + { + "epoch": 0.0624620982413584, + "grad_norm": 0.3600565493106842, + "learning_rate": 3.1234391723153764e-05, + "loss": 0.2516, + "step": 3502 + }, + { + "epoch": 0.062479934363072094, + "grad_norm": 0.41846194863319397, + "learning_rate": 3.124331073849447e-05, + "loss": 0.3684, + "step": 3503 + }, + { + "epoch": 0.06249777048478579, + "grad_norm": 0.4761829376220703, + "learning_rate": 3.125222975383518e-05, + "loss": 0.3737, + "step": 3504 + }, + { + "epoch": 0.06251560660649948, + "grad_norm": 0.36271214485168457, + "learning_rate": 3.1261148769175884e-05, + "loss": 0.2952, + "step": 3505 + }, + { + "epoch": 0.06253344272821318, + "grad_norm": 0.37938109040260315, + "learning_rate": 3.12700677845166e-05, + "loss": 0.196, + "step": 3506 + }, + { + "epoch": 0.06255127884992687, + "grad_norm": 0.29861417412757874, + "learning_rate": 3.1278986799857296e-05, + "loss": 0.2606, + "step": 3507 + }, + { + "epoch": 0.06256911497164057, + "grad_norm": 0.27266961336135864, + "learning_rate": 3.1287905815198e-05, + "loss": 0.2467, + "step": 3508 + }, + { + "epoch": 0.06258695109335426, + "grad_norm": 0.450681209564209, + "learning_rate": 3.129682483053871e-05, + "loss": 0.3139, + "step": 3509 + }, + { + "epoch": 0.06260478721506796, + "grad_norm": 0.3376198709011078, + "learning_rate": 3.1305743845879416e-05, + "loss": 0.2381, + "step": 3510 + }, + { + "epoch": 0.06262262333678165, + "grad_norm": 0.37846675515174866, + "learning_rate": 3.131466286122012e-05, + "loss": 0.3134, + "step": 3511 + }, + { + "epoch": 0.06264045945849535, + "grad_norm": 0.4619830250740051, + "learning_rate": 3.132358187656083e-05, + "loss": 0.2902, + "step": 3512 + }, + { + "epoch": 0.06265829558020904, + "grad_norm": 0.42697346210479736, + "learning_rate": 3.1332500891901535e-05, + "loss": 0.3192, + "step": 3513 + }, + { + "epoch": 0.06267613170192274, + "grad_norm": 0.3949091136455536, + "learning_rate": 3.134141990724224e-05, + "loss": 0.304, + "step": 3514 + }, + { + "epoch": 0.06269396782363643, + "grad_norm": 0.37694236636161804, + "learning_rate": 3.135033892258295e-05, + "loss": 0.3137, + "step": 3515 + }, + { + "epoch": 0.06271180394535013, + "grad_norm": 0.2839398682117462, + "learning_rate": 3.1359257937923654e-05, + "loss": 0.2351, + "step": 3516 + }, + { + "epoch": 0.06272964006706382, + "grad_norm": 0.392132431268692, + "learning_rate": 3.136817695326436e-05, + "loss": 0.2615, + "step": 3517 + }, + { + "epoch": 0.06274747618877752, + "grad_norm": 0.38654953241348267, + "learning_rate": 3.1377095968605067e-05, + "loss": 0.2745, + "step": 3518 + }, + { + "epoch": 0.0627653123104912, + "grad_norm": 0.44307559728622437, + "learning_rate": 3.138601498394577e-05, + "loss": 0.3217, + "step": 3519 + }, + { + "epoch": 0.06278314843220491, + "grad_norm": 0.41050493717193604, + "learning_rate": 3.139493399928648e-05, + "loss": 0.2646, + "step": 3520 + }, + { + "epoch": 0.0628009845539186, + "grad_norm": 0.49982941150665283, + "learning_rate": 3.1403853014627186e-05, + "loss": 0.2802, + "step": 3521 + }, + { + "epoch": 0.06281882067563228, + "grad_norm": 0.3722202479839325, + "learning_rate": 3.141277202996789e-05, + "loss": 0.2991, + "step": 3522 + }, + { + "epoch": 0.06283665679734599, + "grad_norm": 0.31066420674324036, + "learning_rate": 3.14216910453086e-05, + "loss": 0.2825, + "step": 3523 + }, + { + "epoch": 0.06285449291905967, + "grad_norm": 0.5403444170951843, + "learning_rate": 3.1430610060649305e-05, + "loss": 0.3364, + "step": 3524 + }, + { + "epoch": 0.06287232904077338, + "grad_norm": 0.3716685473918915, + "learning_rate": 3.143952907599001e-05, + "loss": 0.3082, + "step": 3525 + }, + { + "epoch": 0.06289016516248706, + "grad_norm": 0.36720171570777893, + "learning_rate": 3.144844809133072e-05, + "loss": 0.2294, + "step": 3526 + }, + { + "epoch": 0.06290800128420077, + "grad_norm": 0.4109004735946655, + "learning_rate": 3.1457367106671424e-05, + "loss": 0.325, + "step": 3527 + }, + { + "epoch": 0.06292583740591445, + "grad_norm": 0.40544942021369934, + "learning_rate": 3.146628612201214e-05, + "loss": 0.2929, + "step": 3528 + }, + { + "epoch": 0.06294367352762816, + "grad_norm": 0.43174639344215393, + "learning_rate": 3.147520513735284e-05, + "loss": 0.2968, + "step": 3529 + }, + { + "epoch": 0.06296150964934184, + "grad_norm": 0.42321160435676575, + "learning_rate": 3.148412415269354e-05, + "loss": 0.3088, + "step": 3530 + }, + { + "epoch": 0.06297934577105554, + "grad_norm": 0.406230092048645, + "learning_rate": 3.1493043168034256e-05, + "loss": 0.2873, + "step": 3531 + }, + { + "epoch": 0.06299718189276923, + "grad_norm": 0.457004189491272, + "learning_rate": 3.1501962183374956e-05, + "loss": 0.2867, + "step": 3532 + }, + { + "epoch": 0.06301501801448293, + "grad_norm": 0.34689781069755554, + "learning_rate": 3.151088119871566e-05, + "loss": 0.3056, + "step": 3533 + }, + { + "epoch": 0.06303285413619662, + "grad_norm": 0.34096047282218933, + "learning_rate": 3.151980021405637e-05, + "loss": 0.2842, + "step": 3534 + }, + { + "epoch": 0.06305069025791032, + "grad_norm": 0.2742103636264801, + "learning_rate": 3.1528719229397075e-05, + "loss": 0.2458, + "step": 3535 + }, + { + "epoch": 0.06306852637962401, + "grad_norm": 0.5038346648216248, + "learning_rate": 3.153763824473778e-05, + "loss": 0.3018, + "step": 3536 + }, + { + "epoch": 0.06308636250133771, + "grad_norm": 0.5167878270149231, + "learning_rate": 3.154655726007849e-05, + "loss": 0.296, + "step": 3537 + }, + { + "epoch": 0.0631041986230514, + "grad_norm": 0.37561002373695374, + "learning_rate": 3.1555476275419194e-05, + "loss": 0.2719, + "step": 3538 + }, + { + "epoch": 0.0631220347447651, + "grad_norm": 0.390768438577652, + "learning_rate": 3.15643952907599e-05, + "loss": 0.2553, + "step": 3539 + }, + { + "epoch": 0.06313987086647879, + "grad_norm": 0.4308592975139618, + "learning_rate": 3.157331430610061e-05, + "loss": 0.2845, + "step": 3540 + }, + { + "epoch": 0.0631577069881925, + "grad_norm": 0.36119189858436584, + "learning_rate": 3.158223332144131e-05, + "loss": 0.29, + "step": 3541 + }, + { + "epoch": 0.06317554310990618, + "grad_norm": 0.46786993741989136, + "learning_rate": 3.159115233678202e-05, + "loss": 0.306, + "step": 3542 + }, + { + "epoch": 0.06319337923161987, + "grad_norm": 0.3725983202457428, + "learning_rate": 3.1600071352122726e-05, + "loss": 0.2224, + "step": 3543 + }, + { + "epoch": 0.06321121535333357, + "grad_norm": 0.30451688170433044, + "learning_rate": 3.160899036746343e-05, + "loss": 0.2545, + "step": 3544 + }, + { + "epoch": 0.06322905147504726, + "grad_norm": 0.3579949140548706, + "learning_rate": 3.161790938280414e-05, + "loss": 0.2581, + "step": 3545 + }, + { + "epoch": 0.06324688759676096, + "grad_norm": 0.4460763931274414, + "learning_rate": 3.1626828398144845e-05, + "loss": 0.3024, + "step": 3546 + }, + { + "epoch": 0.06326472371847465, + "grad_norm": 0.3446510136127472, + "learning_rate": 3.163574741348555e-05, + "loss": 0.2252, + "step": 3547 + }, + { + "epoch": 0.06328255984018835, + "grad_norm": 0.43941184878349304, + "learning_rate": 3.164466642882626e-05, + "loss": 0.2902, + "step": 3548 + }, + { + "epoch": 0.06330039596190204, + "grad_norm": 0.4447469413280487, + "learning_rate": 3.1653585444166964e-05, + "loss": 0.312, + "step": 3549 + }, + { + "epoch": 0.06331823208361574, + "grad_norm": 0.4542385935783386, + "learning_rate": 3.166250445950767e-05, + "loss": 0.3153, + "step": 3550 + }, + { + "epoch": 0.06333606820532943, + "grad_norm": 0.4392056167125702, + "learning_rate": 3.167142347484838e-05, + "loss": 0.3273, + "step": 3551 + }, + { + "epoch": 0.06335390432704313, + "grad_norm": 0.5697815418243408, + "learning_rate": 3.1680342490189084e-05, + "loss": 0.3172, + "step": 3552 + }, + { + "epoch": 0.06337174044875682, + "grad_norm": 0.4335101842880249, + "learning_rate": 3.16892615055298e-05, + "loss": 0.3071, + "step": 3553 + }, + { + "epoch": 0.06338957657047052, + "grad_norm": 0.36603108048439026, + "learning_rate": 3.1698180520870496e-05, + "loss": 0.2853, + "step": 3554 + }, + { + "epoch": 0.06340741269218421, + "grad_norm": 0.34847649931907654, + "learning_rate": 3.17070995362112e-05, + "loss": 0.3197, + "step": 3555 + }, + { + "epoch": 0.06342524881389791, + "grad_norm": 0.3712030351161957, + "learning_rate": 3.1716018551551916e-05, + "loss": 0.3124, + "step": 3556 + }, + { + "epoch": 0.0634430849356116, + "grad_norm": 0.3167654573917389, + "learning_rate": 3.1724937566892615e-05, + "loss": 0.2548, + "step": 3557 + }, + { + "epoch": 0.0634609210573253, + "grad_norm": 0.27881067991256714, + "learning_rate": 3.173385658223332e-05, + "loss": 0.2637, + "step": 3558 + }, + { + "epoch": 0.06347875717903899, + "grad_norm": 0.34619900584220886, + "learning_rate": 3.174277559757403e-05, + "loss": 0.2414, + "step": 3559 + }, + { + "epoch": 0.06349659330075269, + "grad_norm": 0.4399753510951996, + "learning_rate": 3.1751694612914735e-05, + "loss": 0.2797, + "step": 3560 + }, + { + "epoch": 0.06351442942246638, + "grad_norm": 0.44486740231513977, + "learning_rate": 3.176061362825544e-05, + "loss": 0.3748, + "step": 3561 + }, + { + "epoch": 0.06353226554418007, + "grad_norm": 0.364371657371521, + "learning_rate": 3.176953264359615e-05, + "loss": 0.3002, + "step": 3562 + }, + { + "epoch": 0.06355010166589377, + "grad_norm": 0.43306100368499756, + "learning_rate": 3.1778451658936854e-05, + "loss": 0.3043, + "step": 3563 + }, + { + "epoch": 0.06356793778760746, + "grad_norm": 0.38034042716026306, + "learning_rate": 3.178737067427756e-05, + "loss": 0.2758, + "step": 3564 + }, + { + "epoch": 0.06358577390932116, + "grad_norm": 0.37408167123794556, + "learning_rate": 3.1796289689618266e-05, + "loss": 0.2848, + "step": 3565 + }, + { + "epoch": 0.06360361003103485, + "grad_norm": 0.41384997963905334, + "learning_rate": 3.180520870495897e-05, + "loss": 0.3306, + "step": 3566 + }, + { + "epoch": 0.06362144615274855, + "grad_norm": 0.2774743437767029, + "learning_rate": 3.181412772029968e-05, + "loss": 0.2325, + "step": 3567 + }, + { + "epoch": 0.06363928227446224, + "grad_norm": 0.428325891494751, + "learning_rate": 3.1823046735640386e-05, + "loss": 0.3005, + "step": 3568 + }, + { + "epoch": 0.06365711839617594, + "grad_norm": 0.36428940296173096, + "learning_rate": 3.183196575098109e-05, + "loss": 0.2596, + "step": 3569 + }, + { + "epoch": 0.06367495451788963, + "grad_norm": 0.4848487079143524, + "learning_rate": 3.18408847663218e-05, + "loss": 0.2533, + "step": 3570 + }, + { + "epoch": 0.06369279063960333, + "grad_norm": 0.3982924818992615, + "learning_rate": 3.1849803781662505e-05, + "loss": 0.2648, + "step": 3571 + }, + { + "epoch": 0.06371062676131702, + "grad_norm": 0.46774011850357056, + "learning_rate": 3.185872279700321e-05, + "loss": 0.257, + "step": 3572 + }, + { + "epoch": 0.06372846288303072, + "grad_norm": 0.6224254369735718, + "learning_rate": 3.186764181234392e-05, + "loss": 0.3214, + "step": 3573 + }, + { + "epoch": 0.0637462990047444, + "grad_norm": 0.31088629364967346, + "learning_rate": 3.1876560827684624e-05, + "loss": 0.2476, + "step": 3574 + }, + { + "epoch": 0.06376413512645811, + "grad_norm": 0.543942391872406, + "learning_rate": 3.188547984302533e-05, + "loss": 0.2444, + "step": 3575 + }, + { + "epoch": 0.0637819712481718, + "grad_norm": 0.5251997709274292, + "learning_rate": 3.189439885836604e-05, + "loss": 0.274, + "step": 3576 + }, + { + "epoch": 0.0637998073698855, + "grad_norm": 0.32639220356941223, + "learning_rate": 3.190331787370674e-05, + "loss": 0.2272, + "step": 3577 + }, + { + "epoch": 0.06381764349159919, + "grad_norm": 0.41618624329566956, + "learning_rate": 3.1912236889047456e-05, + "loss": 0.2965, + "step": 3578 + }, + { + "epoch": 0.06383547961331289, + "grad_norm": 0.5315070748329163, + "learning_rate": 3.1921155904388156e-05, + "loss": 0.3144, + "step": 3579 + }, + { + "epoch": 0.06385331573502658, + "grad_norm": 0.463633269071579, + "learning_rate": 3.193007491972886e-05, + "loss": 0.265, + "step": 3580 + }, + { + "epoch": 0.06387115185674028, + "grad_norm": 0.4081169068813324, + "learning_rate": 3.1938993935069575e-05, + "loss": 0.2818, + "step": 3581 + }, + { + "epoch": 0.06388898797845396, + "grad_norm": 0.3589589297771454, + "learning_rate": 3.1947912950410275e-05, + "loss": 0.2707, + "step": 3582 + }, + { + "epoch": 0.06390682410016765, + "grad_norm": 0.3879191279411316, + "learning_rate": 3.195683196575098e-05, + "loss": 0.3095, + "step": 3583 + }, + { + "epoch": 0.06392466022188135, + "grad_norm": 0.38584789633750916, + "learning_rate": 3.1965750981091694e-05, + "loss": 0.2721, + "step": 3584 + }, + { + "epoch": 0.06394249634359504, + "grad_norm": 0.3974877893924713, + "learning_rate": 3.1974669996432394e-05, + "loss": 0.3207, + "step": 3585 + }, + { + "epoch": 0.06396033246530874, + "grad_norm": 0.4395434856414795, + "learning_rate": 3.19835890117731e-05, + "loss": 0.287, + "step": 3586 + }, + { + "epoch": 0.06397816858702243, + "grad_norm": 0.37194618582725525, + "learning_rate": 3.199250802711381e-05, + "loss": 0.3245, + "step": 3587 + }, + { + "epoch": 0.06399600470873613, + "grad_norm": 0.47053906321525574, + "learning_rate": 3.200142704245451e-05, + "loss": 0.287, + "step": 3588 + }, + { + "epoch": 0.06401384083044982, + "grad_norm": 0.4476097524166107, + "learning_rate": 3.201034605779522e-05, + "loss": 0.2847, + "step": 3589 + }, + { + "epoch": 0.06403167695216352, + "grad_norm": 0.35727089643478394, + "learning_rate": 3.2019265073135926e-05, + "loss": 0.2516, + "step": 3590 + }, + { + "epoch": 0.06404951307387721, + "grad_norm": 0.43988677859306335, + "learning_rate": 3.202818408847663e-05, + "loss": 0.277, + "step": 3591 + }, + { + "epoch": 0.06406734919559091, + "grad_norm": 0.341366171836853, + "learning_rate": 3.203710310381734e-05, + "loss": 0.2247, + "step": 3592 + }, + { + "epoch": 0.0640851853173046, + "grad_norm": 0.32592204213142395, + "learning_rate": 3.2046022119158045e-05, + "loss": 0.2191, + "step": 3593 + }, + { + "epoch": 0.0641030214390183, + "grad_norm": 0.41548824310302734, + "learning_rate": 3.205494113449875e-05, + "loss": 0.2722, + "step": 3594 + }, + { + "epoch": 0.06412085756073199, + "grad_norm": 0.7157866954803467, + "learning_rate": 3.206386014983946e-05, + "loss": 0.3582, + "step": 3595 + }, + { + "epoch": 0.0641386936824457, + "grad_norm": 0.39347320795059204, + "learning_rate": 3.2072779165180164e-05, + "loss": 0.308, + "step": 3596 + }, + { + "epoch": 0.06415652980415938, + "grad_norm": 0.4340120553970337, + "learning_rate": 3.208169818052087e-05, + "loss": 0.2817, + "step": 3597 + }, + { + "epoch": 0.06417436592587308, + "grad_norm": 0.36449071764945984, + "learning_rate": 3.209061719586158e-05, + "loss": 0.3306, + "step": 3598 + }, + { + "epoch": 0.06419220204758677, + "grad_norm": 0.3659517765045166, + "learning_rate": 3.209953621120228e-05, + "loss": 0.2781, + "step": 3599 + }, + { + "epoch": 0.06421003816930047, + "grad_norm": 0.3556494116783142, + "learning_rate": 3.2108455226542997e-05, + "loss": 0.2497, + "step": 3600 + }, + { + "epoch": 0.06422787429101416, + "grad_norm": 0.3153342008590698, + "learning_rate": 3.2117374241883696e-05, + "loss": 0.2479, + "step": 3601 + }, + { + "epoch": 0.06424571041272786, + "grad_norm": 0.3864276707172394, + "learning_rate": 3.21262932572244e-05, + "loss": 0.2888, + "step": 3602 + }, + { + "epoch": 0.06426354653444155, + "grad_norm": 0.34588074684143066, + "learning_rate": 3.2135212272565116e-05, + "loss": 0.2665, + "step": 3603 + }, + { + "epoch": 0.06428138265615524, + "grad_norm": 0.5801905393600464, + "learning_rate": 3.2144131287905815e-05, + "loss": 0.355, + "step": 3604 + }, + { + "epoch": 0.06429921877786894, + "grad_norm": 0.40911412239074707, + "learning_rate": 3.215305030324652e-05, + "loss": 0.3098, + "step": 3605 + }, + { + "epoch": 0.06431705489958263, + "grad_norm": 0.37570998072624207, + "learning_rate": 3.2161969318587235e-05, + "loss": 0.2766, + "step": 3606 + }, + { + "epoch": 0.06433489102129633, + "grad_norm": 0.30709129571914673, + "learning_rate": 3.2170888333927934e-05, + "loss": 0.295, + "step": 3607 + }, + { + "epoch": 0.06435272714301002, + "grad_norm": 0.39083558320999146, + "learning_rate": 3.217980734926864e-05, + "loss": 0.2984, + "step": 3608 + }, + { + "epoch": 0.06437056326472372, + "grad_norm": 0.30531859397888184, + "learning_rate": 3.2188726364609354e-05, + "loss": 0.2722, + "step": 3609 + }, + { + "epoch": 0.06438839938643741, + "grad_norm": 0.2474583238363266, + "learning_rate": 3.2197645379950054e-05, + "loss": 0.2155, + "step": 3610 + }, + { + "epoch": 0.06440623550815111, + "grad_norm": 0.46579623222351074, + "learning_rate": 3.220656439529076e-05, + "loss": 0.2127, + "step": 3611 + }, + { + "epoch": 0.0644240716298648, + "grad_norm": 0.4663061201572418, + "learning_rate": 3.2215483410631466e-05, + "loss": 0.3162, + "step": 3612 + }, + { + "epoch": 0.0644419077515785, + "grad_norm": 0.4508155584335327, + "learning_rate": 3.222440242597217e-05, + "loss": 0.3349, + "step": 3613 + }, + { + "epoch": 0.06445974387329219, + "grad_norm": 0.4759390354156494, + "learning_rate": 3.223332144131288e-05, + "loss": 0.3283, + "step": 3614 + }, + { + "epoch": 0.06447757999500589, + "grad_norm": 0.35923439264297485, + "learning_rate": 3.2242240456653585e-05, + "loss": 0.3003, + "step": 3615 + }, + { + "epoch": 0.06449541611671958, + "grad_norm": 0.37564563751220703, + "learning_rate": 3.225115947199429e-05, + "loss": 0.2498, + "step": 3616 + }, + { + "epoch": 0.06451325223843328, + "grad_norm": 0.5484115481376648, + "learning_rate": 3.2260078487335e-05, + "loss": 0.245, + "step": 3617 + }, + { + "epoch": 0.06453108836014697, + "grad_norm": 0.26685571670532227, + "learning_rate": 3.2268997502675705e-05, + "loss": 0.2475, + "step": 3618 + }, + { + "epoch": 0.06454892448186067, + "grad_norm": 0.288831502199173, + "learning_rate": 3.227791651801641e-05, + "loss": 0.2275, + "step": 3619 + }, + { + "epoch": 0.06456676060357436, + "grad_norm": 0.3965565860271454, + "learning_rate": 3.228683553335712e-05, + "loss": 0.2997, + "step": 3620 + }, + { + "epoch": 0.06458459672528806, + "grad_norm": 0.2891729176044464, + "learning_rate": 3.2295754548697824e-05, + "loss": 0.2471, + "step": 3621 + }, + { + "epoch": 0.06460243284700175, + "grad_norm": 0.295961856842041, + "learning_rate": 3.230467356403853e-05, + "loss": 0.2369, + "step": 3622 + }, + { + "epoch": 0.06462026896871544, + "grad_norm": 0.463190495967865, + "learning_rate": 3.2313592579379237e-05, + "loss": 0.2722, + "step": 3623 + }, + { + "epoch": 0.06463810509042914, + "grad_norm": 0.46243876218795776, + "learning_rate": 3.232251159471994e-05, + "loss": 0.2688, + "step": 3624 + }, + { + "epoch": 0.06465594121214283, + "grad_norm": 0.3638942837715149, + "learning_rate": 3.2331430610060656e-05, + "loss": 0.2565, + "step": 3625 + }, + { + "epoch": 0.06467377733385653, + "grad_norm": 0.3956579267978668, + "learning_rate": 3.2340349625401356e-05, + "loss": 0.2673, + "step": 3626 + }, + { + "epoch": 0.06469161345557022, + "grad_norm": 0.33142149448394775, + "learning_rate": 3.234926864074206e-05, + "loss": 0.227, + "step": 3627 + }, + { + "epoch": 0.06470944957728392, + "grad_norm": 0.36795392632484436, + "learning_rate": 3.2358187656082775e-05, + "loss": 0.2773, + "step": 3628 + }, + { + "epoch": 0.0647272856989976, + "grad_norm": 0.4311741590499878, + "learning_rate": 3.2367106671423475e-05, + "loss": 0.2779, + "step": 3629 + }, + { + "epoch": 0.06474512182071131, + "grad_norm": 0.3273148238658905, + "learning_rate": 3.237602568676418e-05, + "loss": 0.2647, + "step": 3630 + }, + { + "epoch": 0.064762957942425, + "grad_norm": 0.3899909555912018, + "learning_rate": 3.2384944702104894e-05, + "loss": 0.2757, + "step": 3631 + }, + { + "epoch": 0.0647807940641387, + "grad_norm": 0.5008993148803711, + "learning_rate": 3.2393863717445594e-05, + "loss": 0.2592, + "step": 3632 + }, + { + "epoch": 0.06479863018585238, + "grad_norm": 0.41284021735191345, + "learning_rate": 3.24027827327863e-05, + "loss": 0.3079, + "step": 3633 + }, + { + "epoch": 0.06481646630756609, + "grad_norm": 0.3901015520095825, + "learning_rate": 3.2411701748127013e-05, + "loss": 0.2754, + "step": 3634 + }, + { + "epoch": 0.06483430242927977, + "grad_norm": 0.39610543847084045, + "learning_rate": 3.242062076346771e-05, + "loss": 0.3192, + "step": 3635 + }, + { + "epoch": 0.06485213855099348, + "grad_norm": 0.42231959104537964, + "learning_rate": 3.242953977880842e-05, + "loss": 0.3102, + "step": 3636 + }, + { + "epoch": 0.06486997467270716, + "grad_norm": 0.39817750453948975, + "learning_rate": 3.2438458794149126e-05, + "loss": 0.2594, + "step": 3637 + }, + { + "epoch": 0.06488781079442087, + "grad_norm": 0.39597004652023315, + "learning_rate": 3.244737780948983e-05, + "loss": 0.2865, + "step": 3638 + }, + { + "epoch": 0.06490564691613455, + "grad_norm": 0.6299710273742676, + "learning_rate": 3.245629682483054e-05, + "loss": 0.3464, + "step": 3639 + }, + { + "epoch": 0.06492348303784826, + "grad_norm": 0.2853766977787018, + "learning_rate": 3.2465215840171245e-05, + "loss": 0.2727, + "step": 3640 + }, + { + "epoch": 0.06494131915956194, + "grad_norm": 0.33514028787612915, + "learning_rate": 3.247413485551195e-05, + "loss": 0.3077, + "step": 3641 + }, + { + "epoch": 0.06495915528127565, + "grad_norm": 0.4610214829444885, + "learning_rate": 3.248305387085266e-05, + "loss": 0.2716, + "step": 3642 + }, + { + "epoch": 0.06497699140298933, + "grad_norm": 0.38317734003067017, + "learning_rate": 3.2491972886193364e-05, + "loss": 0.2327, + "step": 3643 + }, + { + "epoch": 0.06499482752470302, + "grad_norm": 0.48809385299682617, + "learning_rate": 3.250089190153407e-05, + "loss": 0.29, + "step": 3644 + }, + { + "epoch": 0.06501266364641672, + "grad_norm": 0.335397332906723, + "learning_rate": 3.250981091687478e-05, + "loss": 0.2377, + "step": 3645 + }, + { + "epoch": 0.06503049976813041, + "grad_norm": 0.39842742681503296, + "learning_rate": 3.251872993221548e-05, + "loss": 0.2766, + "step": 3646 + }, + { + "epoch": 0.06504833588984411, + "grad_norm": 0.38238203525543213, + "learning_rate": 3.2527648947556196e-05, + "loss": 0.2747, + "step": 3647 + }, + { + "epoch": 0.0650661720115578, + "grad_norm": 0.5524346232414246, + "learning_rate": 3.2536567962896896e-05, + "loss": 0.3329, + "step": 3648 + }, + { + "epoch": 0.0650840081332715, + "grad_norm": 0.6394782662391663, + "learning_rate": 3.25454869782376e-05, + "loss": 0.3151, + "step": 3649 + }, + { + "epoch": 0.06510184425498519, + "grad_norm": 0.4849854111671448, + "learning_rate": 3.2554405993578316e-05, + "loss": 0.2973, + "step": 3650 + }, + { + "epoch": 0.0651196803766989, + "grad_norm": 0.39272361993789673, + "learning_rate": 3.2563325008919015e-05, + "loss": 0.3105, + "step": 3651 + }, + { + "epoch": 0.06513751649841258, + "grad_norm": 0.45926278829574585, + "learning_rate": 3.257224402425972e-05, + "loss": 0.2627, + "step": 3652 + }, + { + "epoch": 0.06515535262012628, + "grad_norm": 0.3858993351459503, + "learning_rate": 3.2581163039600435e-05, + "loss": 0.2869, + "step": 3653 + }, + { + "epoch": 0.06517318874183997, + "grad_norm": 0.5117769241333008, + "learning_rate": 3.2590082054941134e-05, + "loss": 0.2853, + "step": 3654 + }, + { + "epoch": 0.06519102486355367, + "grad_norm": 0.3571889400482178, + "learning_rate": 3.259900107028184e-05, + "loss": 0.2114, + "step": 3655 + }, + { + "epoch": 0.06520886098526736, + "grad_norm": 0.4695039391517639, + "learning_rate": 3.2607920085622554e-05, + "loss": 0.2397, + "step": 3656 + }, + { + "epoch": 0.06522669710698106, + "grad_norm": 0.35199299454689026, + "learning_rate": 3.2616839100963253e-05, + "loss": 0.2937, + "step": 3657 + }, + { + "epoch": 0.06524453322869475, + "grad_norm": 0.31346431374549866, + "learning_rate": 3.262575811630396e-05, + "loss": 0.2429, + "step": 3658 + }, + { + "epoch": 0.06526236935040845, + "grad_norm": 0.3331112265586853, + "learning_rate": 3.263467713164467e-05, + "loss": 0.2519, + "step": 3659 + }, + { + "epoch": 0.06528020547212214, + "grad_norm": 0.4858771562576294, + "learning_rate": 3.264359614698537e-05, + "loss": 0.2245, + "step": 3660 + }, + { + "epoch": 0.06529804159383584, + "grad_norm": 0.572431743144989, + "learning_rate": 3.265251516232608e-05, + "loss": 0.3461, + "step": 3661 + }, + { + "epoch": 0.06531587771554953, + "grad_norm": 0.39851173758506775, + "learning_rate": 3.2661434177666785e-05, + "loss": 0.2891, + "step": 3662 + }, + { + "epoch": 0.06533371383726322, + "grad_norm": 0.48051783442497253, + "learning_rate": 3.267035319300749e-05, + "loss": 0.3196, + "step": 3663 + }, + { + "epoch": 0.06535154995897692, + "grad_norm": 0.3494710922241211, + "learning_rate": 3.26792722083482e-05, + "loss": 0.2234, + "step": 3664 + }, + { + "epoch": 0.06536938608069061, + "grad_norm": 0.4011964499950409, + "learning_rate": 3.2688191223688904e-05, + "loss": 0.2579, + "step": 3665 + }, + { + "epoch": 0.06538722220240431, + "grad_norm": 1.0890759229660034, + "learning_rate": 3.269711023902961e-05, + "loss": 0.2564, + "step": 3666 + }, + { + "epoch": 0.065405058324118, + "grad_norm": 0.4458734691143036, + "learning_rate": 3.270602925437032e-05, + "loss": 0.2897, + "step": 3667 + }, + { + "epoch": 0.0654228944458317, + "grad_norm": 0.36088064312934875, + "learning_rate": 3.2714948269711024e-05, + "loss": 0.2671, + "step": 3668 + }, + { + "epoch": 0.06544073056754539, + "grad_norm": 0.33298173546791077, + "learning_rate": 3.272386728505173e-05, + "loss": 0.2884, + "step": 3669 + }, + { + "epoch": 0.06545856668925909, + "grad_norm": 0.3666765093803406, + "learning_rate": 3.2732786300392436e-05, + "loss": 0.234, + "step": 3670 + }, + { + "epoch": 0.06547640281097278, + "grad_norm": 0.41905704140663147, + "learning_rate": 3.274170531573314e-05, + "loss": 0.2734, + "step": 3671 + }, + { + "epoch": 0.06549423893268648, + "grad_norm": 0.3498532772064209, + "learning_rate": 3.2750624331073856e-05, + "loss": 0.2335, + "step": 3672 + }, + { + "epoch": 0.06551207505440017, + "grad_norm": 0.2984253764152527, + "learning_rate": 3.2759543346414556e-05, + "loss": 0.2417, + "step": 3673 + }, + { + "epoch": 0.06552991117611387, + "grad_norm": 0.3009212911128998, + "learning_rate": 3.276846236175526e-05, + "loss": 0.2879, + "step": 3674 + }, + { + "epoch": 0.06554774729782756, + "grad_norm": 0.4682937562465668, + "learning_rate": 3.2777381377095975e-05, + "loss": 0.3186, + "step": 3675 + }, + { + "epoch": 0.06556558341954126, + "grad_norm": 0.3837187886238098, + "learning_rate": 3.2786300392436675e-05, + "loss": 0.288, + "step": 3676 + }, + { + "epoch": 0.06558341954125495, + "grad_norm": 0.4935886561870575, + "learning_rate": 3.279521940777738e-05, + "loss": 0.271, + "step": 3677 + }, + { + "epoch": 0.06560125566296865, + "grad_norm": 0.5601122975349426, + "learning_rate": 3.2804138423118094e-05, + "loss": 0.3151, + "step": 3678 + }, + { + "epoch": 0.06561909178468234, + "grad_norm": 0.40890321135520935, + "learning_rate": 3.2813057438458794e-05, + "loss": 0.2328, + "step": 3679 + }, + { + "epoch": 0.06563692790639604, + "grad_norm": 0.35992440581321716, + "learning_rate": 3.28219764537995e-05, + "loss": 0.2751, + "step": 3680 + }, + { + "epoch": 0.06565476402810973, + "grad_norm": 0.4070022702217102, + "learning_rate": 3.283089546914021e-05, + "loss": 0.2957, + "step": 3681 + }, + { + "epoch": 0.06567260014982343, + "grad_norm": 0.3672012686729431, + "learning_rate": 3.283981448448091e-05, + "loss": 0.2714, + "step": 3682 + }, + { + "epoch": 0.06569043627153712, + "grad_norm": 0.37558531761169434, + "learning_rate": 3.284873349982162e-05, + "loss": 0.2856, + "step": 3683 + }, + { + "epoch": 0.0657082723932508, + "grad_norm": 0.3072632849216461, + "learning_rate": 3.285765251516233e-05, + "loss": 0.2335, + "step": 3684 + }, + { + "epoch": 0.0657261085149645, + "grad_norm": 0.37256982922554016, + "learning_rate": 3.286657153050303e-05, + "loss": 0.289, + "step": 3685 + }, + { + "epoch": 0.0657439446366782, + "grad_norm": 0.48035314679145813, + "learning_rate": 3.287549054584374e-05, + "loss": 0.2732, + "step": 3686 + }, + { + "epoch": 0.0657617807583919, + "grad_norm": 0.4267140328884125, + "learning_rate": 3.288440956118445e-05, + "loss": 0.2876, + "step": 3687 + }, + { + "epoch": 0.06577961688010558, + "grad_norm": 0.4718540906906128, + "learning_rate": 3.289332857652515e-05, + "loss": 0.2523, + "step": 3688 + }, + { + "epoch": 0.06579745300181929, + "grad_norm": 0.457587867975235, + "learning_rate": 3.290224759186586e-05, + "loss": 0.2334, + "step": 3689 + }, + { + "epoch": 0.06581528912353297, + "grad_norm": 0.4299467206001282, + "learning_rate": 3.2911166607206564e-05, + "loss": 0.3089, + "step": 3690 + }, + { + "epoch": 0.06583312524524668, + "grad_norm": 0.5274016857147217, + "learning_rate": 3.292008562254727e-05, + "loss": 0.3176, + "step": 3691 + }, + { + "epoch": 0.06585096136696036, + "grad_norm": 0.4209674298763275, + "learning_rate": 3.292900463788798e-05, + "loss": 0.2692, + "step": 3692 + }, + { + "epoch": 0.06586879748867407, + "grad_norm": 0.47414302825927734, + "learning_rate": 3.293792365322868e-05, + "loss": 0.2704, + "step": 3693 + }, + { + "epoch": 0.06588663361038775, + "grad_norm": 0.46226176619529724, + "learning_rate": 3.2946842668569396e-05, + "loss": 0.3259, + "step": 3694 + }, + { + "epoch": 0.06590446973210146, + "grad_norm": 0.30368199944496155, + "learning_rate": 3.2955761683910096e-05, + "loss": 0.2585, + "step": 3695 + }, + { + "epoch": 0.06592230585381514, + "grad_norm": 0.44019100069999695, + "learning_rate": 3.29646806992508e-05, + "loss": 0.306, + "step": 3696 + }, + { + "epoch": 0.06594014197552885, + "grad_norm": 0.2963986098766327, + "learning_rate": 3.2973599714591515e-05, + "loss": 0.2265, + "step": 3697 + }, + { + "epoch": 0.06595797809724253, + "grad_norm": 0.3986966609954834, + "learning_rate": 3.2982518729932215e-05, + "loss": 0.2693, + "step": 3698 + }, + { + "epoch": 0.06597581421895624, + "grad_norm": 0.4319208562374115, + "learning_rate": 3.299143774527292e-05, + "loss": 0.2329, + "step": 3699 + }, + { + "epoch": 0.06599365034066992, + "grad_norm": 0.395369291305542, + "learning_rate": 3.3000356760613635e-05, + "loss": 0.3071, + "step": 3700 + }, + { + "epoch": 0.06601148646238363, + "grad_norm": 0.333501398563385, + "learning_rate": 3.3009275775954334e-05, + "loss": 0.198, + "step": 3701 + }, + { + "epoch": 0.06602932258409731, + "grad_norm": 0.36454272270202637, + "learning_rate": 3.301819479129504e-05, + "loss": 0.2753, + "step": 3702 + }, + { + "epoch": 0.066047158705811, + "grad_norm": 0.3821306526660919, + "learning_rate": 3.3027113806635754e-05, + "loss": 0.3269, + "step": 3703 + }, + { + "epoch": 0.0660649948275247, + "grad_norm": 0.45392683148384094, + "learning_rate": 3.303603282197645e-05, + "loss": 0.2712, + "step": 3704 + }, + { + "epoch": 0.06608283094923839, + "grad_norm": 0.34545281529426575, + "learning_rate": 3.304495183731716e-05, + "loss": 0.2623, + "step": 3705 + }, + { + "epoch": 0.0661006670709521, + "grad_norm": 0.45389291644096375, + "learning_rate": 3.305387085265787e-05, + "loss": 0.3121, + "step": 3706 + }, + { + "epoch": 0.06611850319266578, + "grad_norm": 0.4296933710575104, + "learning_rate": 3.306278986799857e-05, + "loss": 0.2881, + "step": 3707 + }, + { + "epoch": 0.06613633931437948, + "grad_norm": 0.3538849651813507, + "learning_rate": 3.307170888333928e-05, + "loss": 0.2602, + "step": 3708 + }, + { + "epoch": 0.06615417543609317, + "grad_norm": 0.364667147397995, + "learning_rate": 3.308062789867999e-05, + "loss": 0.2652, + "step": 3709 + }, + { + "epoch": 0.06617201155780687, + "grad_norm": 0.37340229749679565, + "learning_rate": 3.308954691402069e-05, + "loss": 0.2335, + "step": 3710 + }, + { + "epoch": 0.06618984767952056, + "grad_norm": 0.4735252857208252, + "learning_rate": 3.30984659293614e-05, + "loss": 0.2283, + "step": 3711 + }, + { + "epoch": 0.06620768380123426, + "grad_norm": 0.47776302695274353, + "learning_rate": 3.310738494470211e-05, + "loss": 0.2659, + "step": 3712 + }, + { + "epoch": 0.06622551992294795, + "grad_norm": 0.32557588815689087, + "learning_rate": 3.311630396004281e-05, + "loss": 0.2596, + "step": 3713 + }, + { + "epoch": 0.06624335604466165, + "grad_norm": 0.3454476296901703, + "learning_rate": 3.312522297538352e-05, + "loss": 0.3095, + "step": 3714 + }, + { + "epoch": 0.06626119216637534, + "grad_norm": 0.30635809898376465, + "learning_rate": 3.3134141990724224e-05, + "loss": 0.2544, + "step": 3715 + }, + { + "epoch": 0.06627902828808904, + "grad_norm": 0.2915476858615875, + "learning_rate": 3.314306100606493e-05, + "loss": 0.2495, + "step": 3716 + }, + { + "epoch": 0.06629686440980273, + "grad_norm": 0.3952147662639618, + "learning_rate": 3.3151980021405636e-05, + "loss": 0.2576, + "step": 3717 + }, + { + "epoch": 0.06631470053151643, + "grad_norm": 0.3233875632286072, + "learning_rate": 3.316089903674634e-05, + "loss": 0.293, + "step": 3718 + }, + { + "epoch": 0.06633253665323012, + "grad_norm": 0.4111897945404053, + "learning_rate": 3.3169818052087056e-05, + "loss": 0.289, + "step": 3719 + }, + { + "epoch": 0.06635037277494382, + "grad_norm": 0.4717160761356354, + "learning_rate": 3.3178737067427755e-05, + "loss": 0.3012, + "step": 3720 + }, + { + "epoch": 0.06636820889665751, + "grad_norm": 0.4196506142616272, + "learning_rate": 3.318765608276846e-05, + "loss": 0.3043, + "step": 3721 + }, + { + "epoch": 0.06638604501837121, + "grad_norm": 0.3230936825275421, + "learning_rate": 3.3196575098109175e-05, + "loss": 0.2729, + "step": 3722 + }, + { + "epoch": 0.0664038811400849, + "grad_norm": 0.3927689790725708, + "learning_rate": 3.3205494113449875e-05, + "loss": 0.313, + "step": 3723 + }, + { + "epoch": 0.06642171726179859, + "grad_norm": 0.34997114539146423, + "learning_rate": 3.321441312879058e-05, + "loss": 0.2494, + "step": 3724 + }, + { + "epoch": 0.06643955338351229, + "grad_norm": 0.5233971476554871, + "learning_rate": 3.3223332144131294e-05, + "loss": 0.3109, + "step": 3725 + }, + { + "epoch": 0.06645738950522598, + "grad_norm": 0.40648800134658813, + "learning_rate": 3.3232251159471994e-05, + "loss": 0.261, + "step": 3726 + }, + { + "epoch": 0.06647522562693968, + "grad_norm": 0.3055565357208252, + "learning_rate": 3.32411701748127e-05, + "loss": 0.2286, + "step": 3727 + }, + { + "epoch": 0.06649306174865337, + "grad_norm": 0.3339119255542755, + "learning_rate": 3.325008919015341e-05, + "loss": 0.2776, + "step": 3728 + }, + { + "epoch": 0.06651089787036707, + "grad_norm": 0.3152942955493927, + "learning_rate": 3.325900820549411e-05, + "loss": 0.2692, + "step": 3729 + }, + { + "epoch": 0.06652873399208076, + "grad_norm": 0.3575889766216278, + "learning_rate": 3.326792722083482e-05, + "loss": 0.2838, + "step": 3730 + }, + { + "epoch": 0.06654657011379446, + "grad_norm": 0.3464291989803314, + "learning_rate": 3.327684623617553e-05, + "loss": 0.259, + "step": 3731 + }, + { + "epoch": 0.06656440623550815, + "grad_norm": 0.41277116537094116, + "learning_rate": 3.328576525151623e-05, + "loss": 0.3467, + "step": 3732 + }, + { + "epoch": 0.06658224235722185, + "grad_norm": 0.39770737290382385, + "learning_rate": 3.329468426685694e-05, + "loss": 0.2349, + "step": 3733 + }, + { + "epoch": 0.06660007847893554, + "grad_norm": 0.488250195980072, + "learning_rate": 3.330360328219765e-05, + "loss": 0.2937, + "step": 3734 + }, + { + "epoch": 0.06661791460064924, + "grad_norm": 0.3958609402179718, + "learning_rate": 3.331252229753835e-05, + "loss": 0.264, + "step": 3735 + }, + { + "epoch": 0.06663575072236293, + "grad_norm": 0.3692286014556885, + "learning_rate": 3.332144131287906e-05, + "loss": 0.2635, + "step": 3736 + }, + { + "epoch": 0.06665358684407663, + "grad_norm": 0.35026273131370544, + "learning_rate": 3.333036032821977e-05, + "loss": 0.2521, + "step": 3737 + }, + { + "epoch": 0.06667142296579032, + "grad_norm": 0.3904498517513275, + "learning_rate": 3.333927934356047e-05, + "loss": 0.2839, + "step": 3738 + }, + { + "epoch": 0.06668925908750402, + "grad_norm": 0.5343726873397827, + "learning_rate": 3.334819835890118e-05, + "loss": 0.3157, + "step": 3739 + }, + { + "epoch": 0.0667070952092177, + "grad_norm": 0.33324769139289856, + "learning_rate": 3.335711737424188e-05, + "loss": 0.312, + "step": 3740 + }, + { + "epoch": 0.06672493133093141, + "grad_norm": 0.28762882947921753, + "learning_rate": 3.336603638958259e-05, + "loss": 0.2498, + "step": 3741 + }, + { + "epoch": 0.0667427674526451, + "grad_norm": 0.3631058633327484, + "learning_rate": 3.3374955404923296e-05, + "loss": 0.2942, + "step": 3742 + }, + { + "epoch": 0.0667606035743588, + "grad_norm": 0.36567845940589905, + "learning_rate": 3.3383874420264e-05, + "loss": 0.2842, + "step": 3743 + }, + { + "epoch": 0.06677843969607249, + "grad_norm": 0.43224385380744934, + "learning_rate": 3.3392793435604715e-05, + "loss": 0.2397, + "step": 3744 + }, + { + "epoch": 0.06679627581778617, + "grad_norm": 0.551596999168396, + "learning_rate": 3.3401712450945415e-05, + "loss": 0.2957, + "step": 3745 + }, + { + "epoch": 0.06681411193949988, + "grad_norm": 0.40593668818473816, + "learning_rate": 3.341063146628612e-05, + "loss": 0.3221, + "step": 3746 + }, + { + "epoch": 0.06683194806121356, + "grad_norm": 0.3912992477416992, + "learning_rate": 3.3419550481626834e-05, + "loss": 0.2849, + "step": 3747 + }, + { + "epoch": 0.06684978418292727, + "grad_norm": 0.45108121633529663, + "learning_rate": 3.3428469496967534e-05, + "loss": 0.3243, + "step": 3748 + }, + { + "epoch": 0.06686762030464095, + "grad_norm": 0.355159729719162, + "learning_rate": 3.343738851230824e-05, + "loss": 0.319, + "step": 3749 + }, + { + "epoch": 0.06688545642635466, + "grad_norm": 0.3352654278278351, + "learning_rate": 3.3446307527648954e-05, + "loss": 0.2296, + "step": 3750 + }, + { + "epoch": 0.06690329254806834, + "grad_norm": 0.38211750984191895, + "learning_rate": 3.345522654298965e-05, + "loss": 0.275, + "step": 3751 + }, + { + "epoch": 0.06692112866978205, + "grad_norm": 0.35876527428627014, + "learning_rate": 3.346414555833036e-05, + "loss": 0.32, + "step": 3752 + }, + { + "epoch": 0.06693896479149573, + "grad_norm": 0.3367149829864502, + "learning_rate": 3.347306457367107e-05, + "loss": 0.2959, + "step": 3753 + }, + { + "epoch": 0.06695680091320944, + "grad_norm": 0.41035372018814087, + "learning_rate": 3.348198358901177e-05, + "loss": 0.3198, + "step": 3754 + }, + { + "epoch": 0.06697463703492312, + "grad_norm": 0.45888209342956543, + "learning_rate": 3.349090260435248e-05, + "loss": 0.366, + "step": 3755 + }, + { + "epoch": 0.06699247315663683, + "grad_norm": 0.32987546920776367, + "learning_rate": 3.349982161969319e-05, + "loss": 0.2587, + "step": 3756 + }, + { + "epoch": 0.06701030927835051, + "grad_norm": 0.4455758333206177, + "learning_rate": 3.350874063503389e-05, + "loss": 0.2873, + "step": 3757 + }, + { + "epoch": 0.06702814540006422, + "grad_norm": 0.3964990973472595, + "learning_rate": 3.35176596503746e-05, + "loss": 0.3251, + "step": 3758 + }, + { + "epoch": 0.0670459815217779, + "grad_norm": 0.3634512722492218, + "learning_rate": 3.352657866571531e-05, + "loss": 0.3136, + "step": 3759 + }, + { + "epoch": 0.0670638176434916, + "grad_norm": 0.469315767288208, + "learning_rate": 3.353549768105601e-05, + "loss": 0.2852, + "step": 3760 + }, + { + "epoch": 0.06708165376520529, + "grad_norm": 0.3811940550804138, + "learning_rate": 3.354441669639672e-05, + "loss": 0.2642, + "step": 3761 + }, + { + "epoch": 0.067099489886919, + "grad_norm": 0.38483431935310364, + "learning_rate": 3.355333571173743e-05, + "loss": 0.2834, + "step": 3762 + }, + { + "epoch": 0.06711732600863268, + "grad_norm": 0.46264129877090454, + "learning_rate": 3.356225472707813e-05, + "loss": 0.2831, + "step": 3763 + }, + { + "epoch": 0.06713516213034637, + "grad_norm": 0.6043981909751892, + "learning_rate": 3.3571173742418836e-05, + "loss": 0.2669, + "step": 3764 + }, + { + "epoch": 0.06715299825206007, + "grad_norm": 0.4582202732563019, + "learning_rate": 3.358009275775954e-05, + "loss": 0.3498, + "step": 3765 + }, + { + "epoch": 0.06717083437377376, + "grad_norm": 0.41268259286880493, + "learning_rate": 3.3589011773100256e-05, + "loss": 0.3132, + "step": 3766 + }, + { + "epoch": 0.06718867049548746, + "grad_norm": 0.43114957213401794, + "learning_rate": 3.3597930788440955e-05, + "loss": 0.2888, + "step": 3767 + }, + { + "epoch": 0.06720650661720115, + "grad_norm": 0.45636048913002014, + "learning_rate": 3.360684980378166e-05, + "loss": 0.3206, + "step": 3768 + }, + { + "epoch": 0.06722434273891485, + "grad_norm": 0.349748820066452, + "learning_rate": 3.3615768819122375e-05, + "loss": 0.3148, + "step": 3769 + }, + { + "epoch": 0.06724217886062854, + "grad_norm": 0.39590850472450256, + "learning_rate": 3.3624687834463074e-05, + "loss": 0.2405, + "step": 3770 + }, + { + "epoch": 0.06726001498234224, + "grad_norm": 0.36425086855888367, + "learning_rate": 3.363360684980378e-05, + "loss": 0.3189, + "step": 3771 + }, + { + "epoch": 0.06727785110405593, + "grad_norm": 0.39826929569244385, + "learning_rate": 3.3642525865144494e-05, + "loss": 0.2915, + "step": 3772 + }, + { + "epoch": 0.06729568722576963, + "grad_norm": 0.4167521297931671, + "learning_rate": 3.3651444880485194e-05, + "loss": 0.2234, + "step": 3773 + }, + { + "epoch": 0.06731352334748332, + "grad_norm": 0.3726811408996582, + "learning_rate": 3.36603638958259e-05, + "loss": 0.2925, + "step": 3774 + }, + { + "epoch": 0.06733135946919702, + "grad_norm": 0.40640175342559814, + "learning_rate": 3.366928291116661e-05, + "loss": 0.3425, + "step": 3775 + }, + { + "epoch": 0.06734919559091071, + "grad_norm": 0.39370375871658325, + "learning_rate": 3.367820192650731e-05, + "loss": 0.2884, + "step": 3776 + }, + { + "epoch": 0.06736703171262441, + "grad_norm": 0.4084145724773407, + "learning_rate": 3.368712094184802e-05, + "loss": 0.2457, + "step": 3777 + }, + { + "epoch": 0.0673848678343381, + "grad_norm": 0.3812226355075836, + "learning_rate": 3.369603995718873e-05, + "loss": 0.2311, + "step": 3778 + }, + { + "epoch": 0.0674027039560518, + "grad_norm": 0.3929128646850586, + "learning_rate": 3.370495897252943e-05, + "loss": 0.3168, + "step": 3779 + }, + { + "epoch": 0.06742054007776549, + "grad_norm": 0.47142481803894043, + "learning_rate": 3.371387798787014e-05, + "loss": 0.3074, + "step": 3780 + }, + { + "epoch": 0.06743837619947919, + "grad_norm": 0.45046961307525635, + "learning_rate": 3.372279700321085e-05, + "loss": 0.2902, + "step": 3781 + }, + { + "epoch": 0.06745621232119288, + "grad_norm": 0.2943269908428192, + "learning_rate": 3.373171601855155e-05, + "loss": 0.3122, + "step": 3782 + }, + { + "epoch": 0.06747404844290658, + "grad_norm": 0.33599725365638733, + "learning_rate": 3.374063503389226e-05, + "loss": 0.2657, + "step": 3783 + }, + { + "epoch": 0.06749188456462027, + "grad_norm": 0.3057636022567749, + "learning_rate": 3.374955404923297e-05, + "loss": 0.2462, + "step": 3784 + }, + { + "epoch": 0.06750972068633396, + "grad_norm": 0.46105214953422546, + "learning_rate": 3.375847306457367e-05, + "loss": 0.2781, + "step": 3785 + }, + { + "epoch": 0.06752755680804766, + "grad_norm": 0.31299689412117004, + "learning_rate": 3.3767392079914377e-05, + "loss": 0.2442, + "step": 3786 + }, + { + "epoch": 0.06754539292976135, + "grad_norm": 0.2745397090911865, + "learning_rate": 3.377631109525509e-05, + "loss": 0.2368, + "step": 3787 + }, + { + "epoch": 0.06756322905147505, + "grad_norm": 0.3316921889781952, + "learning_rate": 3.378523011059579e-05, + "loss": 0.2968, + "step": 3788 + }, + { + "epoch": 0.06758106517318874, + "grad_norm": 0.4789838492870331, + "learning_rate": 3.3794149125936496e-05, + "loss": 0.298, + "step": 3789 + }, + { + "epoch": 0.06759890129490244, + "grad_norm": 0.42464661598205566, + "learning_rate": 3.380306814127721e-05, + "loss": 0.2966, + "step": 3790 + }, + { + "epoch": 0.06761673741661613, + "grad_norm": 0.487000972032547, + "learning_rate": 3.3811987156617915e-05, + "loss": 0.3135, + "step": 3791 + }, + { + "epoch": 0.06763457353832983, + "grad_norm": 0.5048577785491943, + "learning_rate": 3.3820906171958615e-05, + "loss": 0.2712, + "step": 3792 + }, + { + "epoch": 0.06765240966004352, + "grad_norm": 0.4884682893753052, + "learning_rate": 3.382982518729932e-05, + "loss": 0.338, + "step": 3793 + }, + { + "epoch": 0.06767024578175722, + "grad_norm": 0.496457576751709, + "learning_rate": 3.3838744202640034e-05, + "loss": 0.3332, + "step": 3794 + }, + { + "epoch": 0.0676880819034709, + "grad_norm": 0.41949984431266785, + "learning_rate": 3.3847663217980734e-05, + "loss": 0.3072, + "step": 3795 + }, + { + "epoch": 0.06770591802518461, + "grad_norm": 0.36391133069992065, + "learning_rate": 3.385658223332144e-05, + "loss": 0.3448, + "step": 3796 + }, + { + "epoch": 0.0677237541468983, + "grad_norm": 0.39628341794013977, + "learning_rate": 3.3865501248662153e-05, + "loss": 0.2987, + "step": 3797 + }, + { + "epoch": 0.067741590268612, + "grad_norm": 0.38303142786026, + "learning_rate": 3.387442026400285e-05, + "loss": 0.3096, + "step": 3798 + }, + { + "epoch": 0.06775942639032569, + "grad_norm": 0.34878623485565186, + "learning_rate": 3.388333927934356e-05, + "loss": 0.2538, + "step": 3799 + }, + { + "epoch": 0.06777726251203939, + "grad_norm": 0.5400440096855164, + "learning_rate": 3.389225829468427e-05, + "loss": 0.279, + "step": 3800 + }, + { + "epoch": 0.06779509863375308, + "grad_norm": 0.356579452753067, + "learning_rate": 3.390117731002497e-05, + "loss": 0.2814, + "step": 3801 + }, + { + "epoch": 0.06781293475546678, + "grad_norm": 0.4017574191093445, + "learning_rate": 3.391009632536568e-05, + "loss": 0.2788, + "step": 3802 + }, + { + "epoch": 0.06783077087718047, + "grad_norm": 0.44138064980506897, + "learning_rate": 3.391901534070639e-05, + "loss": 0.2787, + "step": 3803 + }, + { + "epoch": 0.06784860699889415, + "grad_norm": 0.31653985381126404, + "learning_rate": 3.392793435604709e-05, + "loss": 0.2499, + "step": 3804 + }, + { + "epoch": 0.06786644312060786, + "grad_norm": 0.33393359184265137, + "learning_rate": 3.39368533713878e-05, + "loss": 0.2892, + "step": 3805 + }, + { + "epoch": 0.06788427924232154, + "grad_norm": 0.3687049150466919, + "learning_rate": 3.394577238672851e-05, + "loss": 0.3218, + "step": 3806 + }, + { + "epoch": 0.06790211536403525, + "grad_norm": 0.32453712821006775, + "learning_rate": 3.395469140206921e-05, + "loss": 0.3042, + "step": 3807 + }, + { + "epoch": 0.06791995148574893, + "grad_norm": 0.3493601083755493, + "learning_rate": 3.396361041740992e-05, + "loss": 0.3096, + "step": 3808 + }, + { + "epoch": 0.06793778760746264, + "grad_norm": 0.42554107308387756, + "learning_rate": 3.397252943275063e-05, + "loss": 0.344, + "step": 3809 + }, + { + "epoch": 0.06795562372917632, + "grad_norm": 0.4338083565235138, + "learning_rate": 3.398144844809133e-05, + "loss": 0.2956, + "step": 3810 + }, + { + "epoch": 0.06797345985089003, + "grad_norm": 0.3571920096874237, + "learning_rate": 3.3990367463432036e-05, + "loss": 0.3157, + "step": 3811 + }, + { + "epoch": 0.06799129597260371, + "grad_norm": 0.3709001839160919, + "learning_rate": 3.399928647877275e-05, + "loss": 0.2741, + "step": 3812 + }, + { + "epoch": 0.06800913209431741, + "grad_norm": 0.5087075233459473, + "learning_rate": 3.4008205494113456e-05, + "loss": 0.2707, + "step": 3813 + }, + { + "epoch": 0.0680269682160311, + "grad_norm": 0.5026617050170898, + "learning_rate": 3.4017124509454155e-05, + "loss": 0.3218, + "step": 3814 + }, + { + "epoch": 0.0680448043377448, + "grad_norm": 0.6777828931808472, + "learning_rate": 3.402604352479487e-05, + "loss": 0.2865, + "step": 3815 + }, + { + "epoch": 0.06806264045945849, + "grad_norm": 0.3735826313495636, + "learning_rate": 3.4034962540135575e-05, + "loss": 0.2753, + "step": 3816 + }, + { + "epoch": 0.0680804765811722, + "grad_norm": 0.4043390154838562, + "learning_rate": 3.4043881555476274e-05, + "loss": 0.2149, + "step": 3817 + }, + { + "epoch": 0.06809831270288588, + "grad_norm": 0.3440175950527191, + "learning_rate": 3.405280057081698e-05, + "loss": 0.2406, + "step": 3818 + }, + { + "epoch": 0.06811614882459958, + "grad_norm": 0.3285858631134033, + "learning_rate": 3.4061719586157694e-05, + "loss": 0.2485, + "step": 3819 + }, + { + "epoch": 0.06813398494631327, + "grad_norm": 1.160111427307129, + "learning_rate": 3.4070638601498393e-05, + "loss": 0.3213, + "step": 3820 + }, + { + "epoch": 0.06815182106802697, + "grad_norm": 0.3132086396217346, + "learning_rate": 3.40795576168391e-05, + "loss": 0.2925, + "step": 3821 + }, + { + "epoch": 0.06816965718974066, + "grad_norm": 0.28393521904945374, + "learning_rate": 3.408847663217981e-05, + "loss": 0.227, + "step": 3822 + }, + { + "epoch": 0.06818749331145436, + "grad_norm": 0.32588687539100647, + "learning_rate": 3.409739564752051e-05, + "loss": 0.2418, + "step": 3823 + }, + { + "epoch": 0.06820532943316805, + "grad_norm": 0.7148998379707336, + "learning_rate": 3.410631466286122e-05, + "loss": 0.2946, + "step": 3824 + }, + { + "epoch": 0.06822316555488174, + "grad_norm": 0.5567273497581482, + "learning_rate": 3.411523367820193e-05, + "loss": 0.294, + "step": 3825 + }, + { + "epoch": 0.06824100167659544, + "grad_norm": 0.6443267464637756, + "learning_rate": 3.412415269354263e-05, + "loss": 0.3973, + "step": 3826 + }, + { + "epoch": 0.06825883779830913, + "grad_norm": 0.3435632586479187, + "learning_rate": 3.413307170888334e-05, + "loss": 0.2265, + "step": 3827 + }, + { + "epoch": 0.06827667392002283, + "grad_norm": 0.5158803462982178, + "learning_rate": 3.414199072422405e-05, + "loss": 0.2797, + "step": 3828 + }, + { + "epoch": 0.06829451004173652, + "grad_norm": 0.3423105478286743, + "learning_rate": 3.415090973956475e-05, + "loss": 0.2347, + "step": 3829 + }, + { + "epoch": 0.06831234616345022, + "grad_norm": 0.3610781729221344, + "learning_rate": 3.415982875490546e-05, + "loss": 0.2695, + "step": 3830 + }, + { + "epoch": 0.06833018228516391, + "grad_norm": 0.44379502534866333, + "learning_rate": 3.416874777024617e-05, + "loss": 0.3214, + "step": 3831 + }, + { + "epoch": 0.06834801840687761, + "grad_norm": 0.4114704132080078, + "learning_rate": 3.417766678558687e-05, + "loss": 0.3092, + "step": 3832 + }, + { + "epoch": 0.0683658545285913, + "grad_norm": 0.4542233943939209, + "learning_rate": 3.4186585800927576e-05, + "loss": 0.3203, + "step": 3833 + }, + { + "epoch": 0.068383690650305, + "grad_norm": 0.36704519391059875, + "learning_rate": 3.419550481626829e-05, + "loss": 0.3023, + "step": 3834 + }, + { + "epoch": 0.06840152677201869, + "grad_norm": 0.46070805191993713, + "learning_rate": 3.420442383160899e-05, + "loss": 0.3288, + "step": 3835 + }, + { + "epoch": 0.06841936289373239, + "grad_norm": 0.41485700011253357, + "learning_rate": 3.4213342846949696e-05, + "loss": 0.2391, + "step": 3836 + }, + { + "epoch": 0.06843719901544608, + "grad_norm": 0.38974466919898987, + "learning_rate": 3.422226186229041e-05, + "loss": 0.2638, + "step": 3837 + }, + { + "epoch": 0.06845503513715978, + "grad_norm": 0.39768311381340027, + "learning_rate": 3.4231180877631115e-05, + "loss": 0.294, + "step": 3838 + }, + { + "epoch": 0.06847287125887347, + "grad_norm": 0.47503039240837097, + "learning_rate": 3.4240099892971815e-05, + "loss": 0.2459, + "step": 3839 + }, + { + "epoch": 0.06849070738058717, + "grad_norm": 0.3560754358768463, + "learning_rate": 3.424901890831253e-05, + "loss": 0.2705, + "step": 3840 + }, + { + "epoch": 0.06850854350230086, + "grad_norm": 0.47175276279449463, + "learning_rate": 3.4257937923653234e-05, + "loss": 0.3157, + "step": 3841 + }, + { + "epoch": 0.06852637962401456, + "grad_norm": 0.3012832999229431, + "learning_rate": 3.4266856938993934e-05, + "loss": 0.2555, + "step": 3842 + }, + { + "epoch": 0.06854421574572825, + "grad_norm": 0.33884280920028687, + "learning_rate": 3.427577595433464e-05, + "loss": 0.3307, + "step": 3843 + }, + { + "epoch": 0.06856205186744194, + "grad_norm": 0.3281342387199402, + "learning_rate": 3.428469496967535e-05, + "loss": 0.2783, + "step": 3844 + }, + { + "epoch": 0.06857988798915564, + "grad_norm": 0.3344869315624237, + "learning_rate": 3.429361398501605e-05, + "loss": 0.2581, + "step": 3845 + }, + { + "epoch": 0.06859772411086933, + "grad_norm": 0.44027483463287354, + "learning_rate": 3.430253300035676e-05, + "loss": 0.2441, + "step": 3846 + }, + { + "epoch": 0.06861556023258303, + "grad_norm": 0.450890451669693, + "learning_rate": 3.431145201569747e-05, + "loss": 0.2752, + "step": 3847 + }, + { + "epoch": 0.06863339635429672, + "grad_norm": 0.36177682876586914, + "learning_rate": 3.432037103103817e-05, + "loss": 0.3086, + "step": 3848 + }, + { + "epoch": 0.06865123247601042, + "grad_norm": 0.32734236121177673, + "learning_rate": 3.432929004637888e-05, + "loss": 0.2784, + "step": 3849 + }, + { + "epoch": 0.0686690685977241, + "grad_norm": 0.39323222637176514, + "learning_rate": 3.433820906171959e-05, + "loss": 0.2962, + "step": 3850 + }, + { + "epoch": 0.06868690471943781, + "grad_norm": 0.3322768807411194, + "learning_rate": 3.434712807706029e-05, + "loss": 0.2418, + "step": 3851 + }, + { + "epoch": 0.0687047408411515, + "grad_norm": 0.4296260178089142, + "learning_rate": 3.4356047092401e-05, + "loss": 0.3382, + "step": 3852 + }, + { + "epoch": 0.0687225769628652, + "grad_norm": 0.3480355739593506, + "learning_rate": 3.436496610774171e-05, + "loss": 0.3022, + "step": 3853 + }, + { + "epoch": 0.06874041308457889, + "grad_norm": 0.30536550283432007, + "learning_rate": 3.437388512308241e-05, + "loss": 0.2667, + "step": 3854 + }, + { + "epoch": 0.06875824920629259, + "grad_norm": 0.3209473788738251, + "learning_rate": 3.438280413842312e-05, + "loss": 0.2236, + "step": 3855 + }, + { + "epoch": 0.06877608532800628, + "grad_norm": 0.4037458002567291, + "learning_rate": 3.439172315376383e-05, + "loss": 0.2622, + "step": 3856 + }, + { + "epoch": 0.06879392144971998, + "grad_norm": 0.41891300678253174, + "learning_rate": 3.440064216910453e-05, + "loss": 0.3025, + "step": 3857 + }, + { + "epoch": 0.06881175757143367, + "grad_norm": 0.5626822113990784, + "learning_rate": 3.4409561184445236e-05, + "loss": 0.347, + "step": 3858 + }, + { + "epoch": 0.06882959369314737, + "grad_norm": 0.4131893813610077, + "learning_rate": 3.441848019978595e-05, + "loss": 0.3589, + "step": 3859 + }, + { + "epoch": 0.06884742981486106, + "grad_norm": 0.362048864364624, + "learning_rate": 3.442739921512665e-05, + "loss": 0.2738, + "step": 3860 + }, + { + "epoch": 0.06886526593657476, + "grad_norm": 0.47151756286621094, + "learning_rate": 3.4436318230467355e-05, + "loss": 0.2536, + "step": 3861 + }, + { + "epoch": 0.06888310205828845, + "grad_norm": 0.36796677112579346, + "learning_rate": 3.444523724580807e-05, + "loss": 0.265, + "step": 3862 + }, + { + "epoch": 0.06890093818000215, + "grad_norm": 0.4151969254016876, + "learning_rate": 3.4454156261148775e-05, + "loss": 0.3413, + "step": 3863 + }, + { + "epoch": 0.06891877430171583, + "grad_norm": 0.3809218406677246, + "learning_rate": 3.4463075276489474e-05, + "loss": 0.2893, + "step": 3864 + }, + { + "epoch": 0.06893661042342952, + "grad_norm": 0.32801833748817444, + "learning_rate": 3.447199429183019e-05, + "loss": 0.2596, + "step": 3865 + }, + { + "epoch": 0.06895444654514322, + "grad_norm": 0.4232890009880066, + "learning_rate": 3.4480913307170894e-05, + "loss": 0.3158, + "step": 3866 + }, + { + "epoch": 0.06897228266685691, + "grad_norm": 0.4460662603378296, + "learning_rate": 3.448983232251159e-05, + "loss": 0.3222, + "step": 3867 + }, + { + "epoch": 0.06899011878857061, + "grad_norm": 0.48599353432655334, + "learning_rate": 3.44987513378523e-05, + "loss": 0.3031, + "step": 3868 + }, + { + "epoch": 0.0690079549102843, + "grad_norm": 0.3497062623500824, + "learning_rate": 3.450767035319301e-05, + "loss": 0.321, + "step": 3869 + }, + { + "epoch": 0.069025791031998, + "grad_norm": 0.359834223985672, + "learning_rate": 3.451658936853371e-05, + "loss": 0.3348, + "step": 3870 + }, + { + "epoch": 0.06904362715371169, + "grad_norm": 0.3681846559047699, + "learning_rate": 3.452550838387442e-05, + "loss": 0.2541, + "step": 3871 + }, + { + "epoch": 0.0690614632754254, + "grad_norm": 0.49886760115623474, + "learning_rate": 3.453442739921513e-05, + "loss": 0.3258, + "step": 3872 + }, + { + "epoch": 0.06907929939713908, + "grad_norm": 0.4724777042865753, + "learning_rate": 3.454334641455583e-05, + "loss": 0.3097, + "step": 3873 + }, + { + "epoch": 0.06909713551885278, + "grad_norm": 0.31848880648612976, + "learning_rate": 3.455226542989654e-05, + "loss": 0.3064, + "step": 3874 + }, + { + "epoch": 0.06911497164056647, + "grad_norm": 0.3555191159248352, + "learning_rate": 3.456118444523725e-05, + "loss": 0.2794, + "step": 3875 + }, + { + "epoch": 0.06913280776228017, + "grad_norm": 0.31329837441444397, + "learning_rate": 3.457010346057795e-05, + "loss": 0.2147, + "step": 3876 + }, + { + "epoch": 0.06915064388399386, + "grad_norm": 0.44533130526542664, + "learning_rate": 3.457902247591866e-05, + "loss": 0.2606, + "step": 3877 + }, + { + "epoch": 0.06916848000570756, + "grad_norm": 0.48295438289642334, + "learning_rate": 3.458794149125937e-05, + "loss": 0.2537, + "step": 3878 + }, + { + "epoch": 0.06918631612742125, + "grad_norm": 0.27674633264541626, + "learning_rate": 3.459686050660007e-05, + "loss": 0.2392, + "step": 3879 + }, + { + "epoch": 0.06920415224913495, + "grad_norm": 0.5630616545677185, + "learning_rate": 3.4605779521940776e-05, + "loss": 0.2683, + "step": 3880 + }, + { + "epoch": 0.06922198837084864, + "grad_norm": 0.42769375443458557, + "learning_rate": 3.461469853728149e-05, + "loss": 0.2992, + "step": 3881 + }, + { + "epoch": 0.06923982449256234, + "grad_norm": 0.32193562388420105, + "learning_rate": 3.462361755262219e-05, + "loss": 0.2421, + "step": 3882 + }, + { + "epoch": 0.06925766061427603, + "grad_norm": 0.32061731815338135, + "learning_rate": 3.4632536567962895e-05, + "loss": 0.2413, + "step": 3883 + }, + { + "epoch": 0.06927549673598972, + "grad_norm": 0.5281691551208496, + "learning_rate": 3.464145558330361e-05, + "loss": 0.3424, + "step": 3884 + }, + { + "epoch": 0.06929333285770342, + "grad_norm": 0.39620906114578247, + "learning_rate": 3.4650374598644315e-05, + "loss": 0.2695, + "step": 3885 + }, + { + "epoch": 0.06931116897941711, + "grad_norm": 0.298551082611084, + "learning_rate": 3.4659293613985015e-05, + "loss": 0.2885, + "step": 3886 + }, + { + "epoch": 0.06932900510113081, + "grad_norm": 0.3618530035018921, + "learning_rate": 3.466821262932573e-05, + "loss": 0.2825, + "step": 3887 + }, + { + "epoch": 0.0693468412228445, + "grad_norm": 0.36034056544303894, + "learning_rate": 3.4677131644666434e-05, + "loss": 0.272, + "step": 3888 + }, + { + "epoch": 0.0693646773445582, + "grad_norm": 0.4203408658504486, + "learning_rate": 3.4686050660007134e-05, + "loss": 0.3086, + "step": 3889 + }, + { + "epoch": 0.06938251346627189, + "grad_norm": 0.5754683017730713, + "learning_rate": 3.469496967534785e-05, + "loss": 0.2561, + "step": 3890 + }, + { + "epoch": 0.06940034958798559, + "grad_norm": 0.2742818593978882, + "learning_rate": 3.470388869068855e-05, + "loss": 0.2605, + "step": 3891 + }, + { + "epoch": 0.06941818570969928, + "grad_norm": 0.39720579981803894, + "learning_rate": 3.471280770602925e-05, + "loss": 0.3189, + "step": 3892 + }, + { + "epoch": 0.06943602183141298, + "grad_norm": 0.4149906039237976, + "learning_rate": 3.4721726721369966e-05, + "loss": 0.2566, + "step": 3893 + }, + { + "epoch": 0.06945385795312667, + "grad_norm": 0.4420456290245056, + "learning_rate": 3.473064573671067e-05, + "loss": 0.2696, + "step": 3894 + }, + { + "epoch": 0.06947169407484037, + "grad_norm": 0.4822978377342224, + "learning_rate": 3.473956475205137e-05, + "loss": 0.3003, + "step": 3895 + }, + { + "epoch": 0.06948953019655406, + "grad_norm": 0.3878283202648163, + "learning_rate": 3.474848376739208e-05, + "loss": 0.3146, + "step": 3896 + }, + { + "epoch": 0.06950736631826776, + "grad_norm": 0.45220157504081726, + "learning_rate": 3.475740278273279e-05, + "loss": 0.2782, + "step": 3897 + }, + { + "epoch": 0.06952520243998145, + "grad_norm": 0.6714602708816528, + "learning_rate": 3.476632179807349e-05, + "loss": 0.2221, + "step": 3898 + }, + { + "epoch": 0.06954303856169515, + "grad_norm": 0.4458344280719757, + "learning_rate": 3.47752408134142e-05, + "loss": 0.2841, + "step": 3899 + }, + { + "epoch": 0.06956087468340884, + "grad_norm": 0.3231016993522644, + "learning_rate": 3.478415982875491e-05, + "loss": 0.2742, + "step": 3900 + }, + { + "epoch": 0.06957871080512254, + "grad_norm": 0.35883525013923645, + "learning_rate": 3.479307884409561e-05, + "loss": 0.2197, + "step": 3901 + }, + { + "epoch": 0.06959654692683623, + "grad_norm": 0.3651270866394043, + "learning_rate": 3.480199785943632e-05, + "loss": 0.2386, + "step": 3902 + }, + { + "epoch": 0.06961438304854993, + "grad_norm": 0.5279820561408997, + "learning_rate": 3.481091687477703e-05, + "loss": 0.3437, + "step": 3903 + }, + { + "epoch": 0.06963221917026362, + "grad_norm": 0.3838633596897125, + "learning_rate": 3.481983589011773e-05, + "loss": 0.2449, + "step": 3904 + }, + { + "epoch": 0.0696500552919773, + "grad_norm": 0.457460880279541, + "learning_rate": 3.4828754905458436e-05, + "loss": 0.2595, + "step": 3905 + }, + { + "epoch": 0.06966789141369101, + "grad_norm": 0.32760775089263916, + "learning_rate": 3.483767392079915e-05, + "loss": 0.2645, + "step": 3906 + }, + { + "epoch": 0.0696857275354047, + "grad_norm": 0.3443721532821655, + "learning_rate": 3.484659293613985e-05, + "loss": 0.2681, + "step": 3907 + }, + { + "epoch": 0.0697035636571184, + "grad_norm": 0.44602057337760925, + "learning_rate": 3.4855511951480555e-05, + "loss": 0.2649, + "step": 3908 + }, + { + "epoch": 0.06972139977883209, + "grad_norm": 0.3007403016090393, + "learning_rate": 3.486443096682127e-05, + "loss": 0.238, + "step": 3909 + }, + { + "epoch": 0.06973923590054579, + "grad_norm": 0.4458249807357788, + "learning_rate": 3.4873349982161974e-05, + "loss": 0.3543, + "step": 3910 + }, + { + "epoch": 0.06975707202225948, + "grad_norm": 0.3666561543941498, + "learning_rate": 3.4882268997502674e-05, + "loss": 0.2512, + "step": 3911 + }, + { + "epoch": 0.06977490814397318, + "grad_norm": 0.42663225531578064, + "learning_rate": 3.489118801284339e-05, + "loss": 0.2493, + "step": 3912 + }, + { + "epoch": 0.06979274426568687, + "grad_norm": 0.3961727023124695, + "learning_rate": 3.4900107028184094e-05, + "loss": 0.2509, + "step": 3913 + }, + { + "epoch": 0.06981058038740057, + "grad_norm": 0.33371153473854065, + "learning_rate": 3.490902604352479e-05, + "loss": 0.263, + "step": 3914 + }, + { + "epoch": 0.06982841650911426, + "grad_norm": 0.26920273900032043, + "learning_rate": 3.4917945058865506e-05, + "loss": 0.236, + "step": 3915 + }, + { + "epoch": 0.06984625263082796, + "grad_norm": 0.40448737144470215, + "learning_rate": 3.492686407420621e-05, + "loss": 0.2938, + "step": 3916 + }, + { + "epoch": 0.06986408875254164, + "grad_norm": 0.30076855421066284, + "learning_rate": 3.493578308954691e-05, + "loss": 0.235, + "step": 3917 + }, + { + "epoch": 0.06988192487425535, + "grad_norm": 0.359047532081604, + "learning_rate": 3.4944702104887626e-05, + "loss": 0.243, + "step": 3918 + }, + { + "epoch": 0.06989976099596903, + "grad_norm": 0.4224449396133423, + "learning_rate": 3.495362112022833e-05, + "loss": 0.3106, + "step": 3919 + }, + { + "epoch": 0.06991759711768274, + "grad_norm": 0.33418577909469604, + "learning_rate": 3.496254013556903e-05, + "loss": 0.2603, + "step": 3920 + }, + { + "epoch": 0.06993543323939642, + "grad_norm": 0.3688289225101471, + "learning_rate": 3.497145915090974e-05, + "loss": 0.2496, + "step": 3921 + }, + { + "epoch": 0.06995326936111013, + "grad_norm": 0.36470526456832886, + "learning_rate": 3.498037816625045e-05, + "loss": 0.2371, + "step": 3922 + }, + { + "epoch": 0.06997110548282381, + "grad_norm": 0.4117518663406372, + "learning_rate": 3.498929718159115e-05, + "loss": 0.2466, + "step": 3923 + }, + { + "epoch": 0.06998894160453752, + "grad_norm": 0.2733934223651886, + "learning_rate": 3.499821619693186e-05, + "loss": 0.2382, + "step": 3924 + }, + { + "epoch": 0.0700067777262512, + "grad_norm": 0.2900852859020233, + "learning_rate": 3.500713521227257e-05, + "loss": 0.2283, + "step": 3925 + }, + { + "epoch": 0.07002461384796489, + "grad_norm": 0.4061088562011719, + "learning_rate": 3.501605422761327e-05, + "loss": 0.2618, + "step": 3926 + }, + { + "epoch": 0.0700424499696786, + "grad_norm": 0.41803494095802307, + "learning_rate": 3.5024973242953976e-05, + "loss": 0.2771, + "step": 3927 + }, + { + "epoch": 0.07006028609139228, + "grad_norm": 0.40184682607650757, + "learning_rate": 3.503389225829469e-05, + "loss": 0.2863, + "step": 3928 + }, + { + "epoch": 0.07007812221310598, + "grad_norm": 0.3220658302307129, + "learning_rate": 3.504281127363539e-05, + "loss": 0.2603, + "step": 3929 + }, + { + "epoch": 0.07009595833481967, + "grad_norm": 0.32238003611564636, + "learning_rate": 3.5051730288976095e-05, + "loss": 0.2585, + "step": 3930 + }, + { + "epoch": 0.07011379445653337, + "grad_norm": 0.3569925129413605, + "learning_rate": 3.506064930431681e-05, + "loss": 0.311, + "step": 3931 + }, + { + "epoch": 0.07013163057824706, + "grad_norm": 0.34401437640190125, + "learning_rate": 3.5069568319657515e-05, + "loss": 0.2875, + "step": 3932 + }, + { + "epoch": 0.07014946669996076, + "grad_norm": 0.4462296962738037, + "learning_rate": 3.5078487334998214e-05, + "loss": 0.295, + "step": 3933 + }, + { + "epoch": 0.07016730282167445, + "grad_norm": 0.3038732409477234, + "learning_rate": 3.508740635033893e-05, + "loss": 0.2505, + "step": 3934 + }, + { + "epoch": 0.07018513894338815, + "grad_norm": 0.3701113164424896, + "learning_rate": 3.5096325365679634e-05, + "loss": 0.328, + "step": 3935 + }, + { + "epoch": 0.07020297506510184, + "grad_norm": 0.37391605973243713, + "learning_rate": 3.5105244381020334e-05, + "loss": 0.3039, + "step": 3936 + }, + { + "epoch": 0.07022081118681554, + "grad_norm": 0.3115938603878021, + "learning_rate": 3.511416339636105e-05, + "loss": 0.2341, + "step": 3937 + }, + { + "epoch": 0.07023864730852923, + "grad_norm": 0.39460307359695435, + "learning_rate": 3.512308241170175e-05, + "loss": 0.3388, + "step": 3938 + }, + { + "epoch": 0.07025648343024293, + "grad_norm": 0.40903112292289734, + "learning_rate": 3.513200142704245e-05, + "loss": 0.2475, + "step": 3939 + }, + { + "epoch": 0.07027431955195662, + "grad_norm": 0.3764410614967346, + "learning_rate": 3.5140920442383166e-05, + "loss": 0.2743, + "step": 3940 + }, + { + "epoch": 0.07029215567367032, + "grad_norm": 0.4501705467700958, + "learning_rate": 3.514983945772387e-05, + "loss": 0.3301, + "step": 3941 + }, + { + "epoch": 0.07030999179538401, + "grad_norm": 0.39972686767578125, + "learning_rate": 3.515875847306457e-05, + "loss": 0.3006, + "step": 3942 + }, + { + "epoch": 0.07032782791709771, + "grad_norm": 0.4252086579799652, + "learning_rate": 3.5167677488405285e-05, + "loss": 0.2854, + "step": 3943 + }, + { + "epoch": 0.0703456640388114, + "grad_norm": 0.23194964230060577, + "learning_rate": 3.517659650374599e-05, + "loss": 0.2212, + "step": 3944 + }, + { + "epoch": 0.07036350016052509, + "grad_norm": 0.3495014011859894, + "learning_rate": 3.518551551908669e-05, + "loss": 0.2693, + "step": 3945 + }, + { + "epoch": 0.07038133628223879, + "grad_norm": 0.409226655960083, + "learning_rate": 3.51944345344274e-05, + "loss": 0.2739, + "step": 3946 + }, + { + "epoch": 0.07039917240395248, + "grad_norm": 0.3513595759868622, + "learning_rate": 3.520335354976811e-05, + "loss": 0.2998, + "step": 3947 + }, + { + "epoch": 0.07041700852566618, + "grad_norm": 0.48407235741615295, + "learning_rate": 3.521227256510881e-05, + "loss": 0.2471, + "step": 3948 + }, + { + "epoch": 0.07043484464737987, + "grad_norm": 0.29064351320266724, + "learning_rate": 3.5221191580449517e-05, + "loss": 0.2737, + "step": 3949 + }, + { + "epoch": 0.07045268076909357, + "grad_norm": 0.3176039457321167, + "learning_rate": 3.523011059579023e-05, + "loss": 0.2277, + "step": 3950 + }, + { + "epoch": 0.07047051689080726, + "grad_norm": 0.40843313932418823, + "learning_rate": 3.523902961113093e-05, + "loss": 0.2813, + "step": 3951 + }, + { + "epoch": 0.07048835301252096, + "grad_norm": 0.4520396590232849, + "learning_rate": 3.5247948626471636e-05, + "loss": 0.2917, + "step": 3952 + }, + { + "epoch": 0.07050618913423465, + "grad_norm": 0.3149385452270508, + "learning_rate": 3.525686764181235e-05, + "loss": 0.2436, + "step": 3953 + }, + { + "epoch": 0.07052402525594835, + "grad_norm": 0.29427260160446167, + "learning_rate": 3.526578665715305e-05, + "loss": 0.2319, + "step": 3954 + }, + { + "epoch": 0.07054186137766204, + "grad_norm": 0.3799297511577606, + "learning_rate": 3.5274705672493755e-05, + "loss": 0.2549, + "step": 3955 + }, + { + "epoch": 0.07055969749937574, + "grad_norm": 0.38813266158103943, + "learning_rate": 3.528362468783447e-05, + "loss": 0.3213, + "step": 3956 + }, + { + "epoch": 0.07057753362108943, + "grad_norm": 0.3146475553512573, + "learning_rate": 3.5292543703175174e-05, + "loss": 0.2468, + "step": 3957 + }, + { + "epoch": 0.07059536974280313, + "grad_norm": 0.3026726543903351, + "learning_rate": 3.5301462718515874e-05, + "loss": 0.2763, + "step": 3958 + }, + { + "epoch": 0.07061320586451682, + "grad_norm": 0.46944424510002136, + "learning_rate": 3.531038173385659e-05, + "loss": 0.2786, + "step": 3959 + }, + { + "epoch": 0.07063104198623052, + "grad_norm": 0.4122660458087921, + "learning_rate": 3.5319300749197293e-05, + "loss": 0.2496, + "step": 3960 + }, + { + "epoch": 0.07064887810794421, + "grad_norm": 0.39696258306503296, + "learning_rate": 3.532821976453799e-05, + "loss": 0.3159, + "step": 3961 + }, + { + "epoch": 0.07066671422965791, + "grad_norm": 0.4710720181465149, + "learning_rate": 3.5337138779878706e-05, + "loss": 0.3014, + "step": 3962 + }, + { + "epoch": 0.0706845503513716, + "grad_norm": 0.43712565302848816, + "learning_rate": 3.534605779521941e-05, + "loss": 0.2285, + "step": 3963 + }, + { + "epoch": 0.0707023864730853, + "grad_norm": 0.4206271171569824, + "learning_rate": 3.535497681056011e-05, + "loss": 0.2532, + "step": 3964 + }, + { + "epoch": 0.07072022259479899, + "grad_norm": 0.38638371229171753, + "learning_rate": 3.5363895825900825e-05, + "loss": 0.2827, + "step": 3965 + }, + { + "epoch": 0.07073805871651268, + "grad_norm": 0.397139310836792, + "learning_rate": 3.537281484124153e-05, + "loss": 0.3291, + "step": 3966 + }, + { + "epoch": 0.07075589483822638, + "grad_norm": 0.3956685960292816, + "learning_rate": 3.538173385658223e-05, + "loss": 0.2965, + "step": 3967 + }, + { + "epoch": 0.07077373095994006, + "grad_norm": 0.45953574776649475, + "learning_rate": 3.5390652871922945e-05, + "loss": 0.3032, + "step": 3968 + }, + { + "epoch": 0.07079156708165377, + "grad_norm": 0.39530250430107117, + "learning_rate": 3.539957188726365e-05, + "loss": 0.2633, + "step": 3969 + }, + { + "epoch": 0.07080940320336745, + "grad_norm": 0.3210938572883606, + "learning_rate": 3.540849090260435e-05, + "loss": 0.2402, + "step": 3970 + }, + { + "epoch": 0.07082723932508116, + "grad_norm": 0.4133577346801758, + "learning_rate": 3.541740991794506e-05, + "loss": 0.2767, + "step": 3971 + }, + { + "epoch": 0.07084507544679484, + "grad_norm": 0.3882431983947754, + "learning_rate": 3.542632893328577e-05, + "loss": 0.2685, + "step": 3972 + }, + { + "epoch": 0.07086291156850855, + "grad_norm": 0.3944385051727295, + "learning_rate": 3.543524794862647e-05, + "loss": 0.2436, + "step": 3973 + }, + { + "epoch": 0.07088074769022223, + "grad_norm": 0.3825366795063019, + "learning_rate": 3.5444166963967176e-05, + "loss": 0.2329, + "step": 3974 + }, + { + "epoch": 0.07089858381193594, + "grad_norm": 0.39589712023735046, + "learning_rate": 3.545308597930789e-05, + "loss": 0.2512, + "step": 3975 + }, + { + "epoch": 0.07091641993364962, + "grad_norm": 0.6119086742401123, + "learning_rate": 3.546200499464859e-05, + "loss": 0.2515, + "step": 3976 + }, + { + "epoch": 0.07093425605536333, + "grad_norm": 0.3706018328666687, + "learning_rate": 3.5470924009989295e-05, + "loss": 0.2414, + "step": 3977 + }, + { + "epoch": 0.07095209217707701, + "grad_norm": 0.47150352597236633, + "learning_rate": 3.547984302533001e-05, + "loss": 0.3101, + "step": 3978 + }, + { + "epoch": 0.07096992829879072, + "grad_norm": 0.36461305618286133, + "learning_rate": 3.5488762040670715e-05, + "loss": 0.3009, + "step": 3979 + }, + { + "epoch": 0.0709877644205044, + "grad_norm": 0.28881222009658813, + "learning_rate": 3.5497681056011414e-05, + "loss": 0.2497, + "step": 3980 + }, + { + "epoch": 0.0710056005422181, + "grad_norm": 0.29770681262016296, + "learning_rate": 3.550660007135213e-05, + "loss": 0.297, + "step": 3981 + }, + { + "epoch": 0.0710234366639318, + "grad_norm": 0.44741079211235046, + "learning_rate": 3.5515519086692834e-05, + "loss": 0.2879, + "step": 3982 + }, + { + "epoch": 0.0710412727856455, + "grad_norm": 0.4213384687900543, + "learning_rate": 3.5524438102033533e-05, + "loss": 0.2741, + "step": 3983 + }, + { + "epoch": 0.07105910890735918, + "grad_norm": 0.4397996962070465, + "learning_rate": 3.553335711737425e-05, + "loss": 0.2937, + "step": 3984 + }, + { + "epoch": 0.07107694502907287, + "grad_norm": 0.3693235218524933, + "learning_rate": 3.554227613271495e-05, + "loss": 0.239, + "step": 3985 + }, + { + "epoch": 0.07109478115078657, + "grad_norm": 0.34017184376716614, + "learning_rate": 3.555119514805565e-05, + "loss": 0.2538, + "step": 3986 + }, + { + "epoch": 0.07111261727250026, + "grad_norm": 0.5534206628799438, + "learning_rate": 3.5560114163396366e-05, + "loss": 0.2594, + "step": 3987 + }, + { + "epoch": 0.07113045339421396, + "grad_norm": 0.3913263976573944, + "learning_rate": 3.556903317873707e-05, + "loss": 0.2784, + "step": 3988 + }, + { + "epoch": 0.07114828951592765, + "grad_norm": 0.5018420219421387, + "learning_rate": 3.557795219407777e-05, + "loss": 0.2451, + "step": 3989 + }, + { + "epoch": 0.07116612563764135, + "grad_norm": 0.3958134055137634, + "learning_rate": 3.5586871209418485e-05, + "loss": 0.2874, + "step": 3990 + }, + { + "epoch": 0.07118396175935504, + "grad_norm": 0.33712950348854065, + "learning_rate": 3.559579022475919e-05, + "loss": 0.2508, + "step": 3991 + }, + { + "epoch": 0.07120179788106874, + "grad_norm": 0.429338663816452, + "learning_rate": 3.560470924009989e-05, + "loss": 0.2761, + "step": 3992 + }, + { + "epoch": 0.07121963400278243, + "grad_norm": 0.4817236065864563, + "learning_rate": 3.5613628255440604e-05, + "loss": 0.3382, + "step": 3993 + }, + { + "epoch": 0.07123747012449613, + "grad_norm": 0.38059717416763306, + "learning_rate": 3.562254727078131e-05, + "loss": 0.2443, + "step": 3994 + }, + { + "epoch": 0.07125530624620982, + "grad_norm": 0.38838183879852295, + "learning_rate": 3.563146628612201e-05, + "loss": 0.2502, + "step": 3995 + }, + { + "epoch": 0.07127314236792352, + "grad_norm": 0.45766136050224304, + "learning_rate": 3.564038530146272e-05, + "loss": 0.295, + "step": 3996 + }, + { + "epoch": 0.07129097848963721, + "grad_norm": 0.406605064868927, + "learning_rate": 3.564930431680343e-05, + "loss": 0.2862, + "step": 3997 + }, + { + "epoch": 0.07130881461135091, + "grad_norm": 0.32387053966522217, + "learning_rate": 3.565822333214413e-05, + "loss": 0.2469, + "step": 3998 + }, + { + "epoch": 0.0713266507330646, + "grad_norm": 0.4547450542449951, + "learning_rate": 3.5667142347484836e-05, + "loss": 0.3002, + "step": 3999 + }, + { + "epoch": 0.0713444868547783, + "grad_norm": 0.33579307794570923, + "learning_rate": 3.567606136282555e-05, + "loss": 0.2714, + "step": 4000 + }, + { + "epoch": 0.0713444868547783, + "eval_loss": 0.2601618766784668, + "eval_runtime": 1615.4968, + "eval_samples_per_second": 0.634, + "eval_steps_per_second": 0.106, + "step": 4000 + }, + { + "epoch": 0.07136232297649199, + "grad_norm": 0.37511202692985535, + "learning_rate": 3.568498037816625e-05, + "loss": 0.2985, + "step": 4001 + }, + { + "epoch": 0.07138015909820569, + "grad_norm": 0.4718485176563263, + "learning_rate": 3.5693899393506955e-05, + "loss": 0.2835, + "step": 4002 + }, + { + "epoch": 0.07139799521991938, + "grad_norm": 0.3611677885055542, + "learning_rate": 3.570281840884767e-05, + "loss": 0.2438, + "step": 4003 + }, + { + "epoch": 0.07141583134163308, + "grad_norm": 0.47874966263771057, + "learning_rate": 3.5711737424188374e-05, + "loss": 0.2678, + "step": 4004 + }, + { + "epoch": 0.07143366746334677, + "grad_norm": 0.4065699875354767, + "learning_rate": 3.5720656439529074e-05, + "loss": 0.2874, + "step": 4005 + }, + { + "epoch": 0.07145150358506046, + "grad_norm": 0.35797980427742004, + "learning_rate": 3.572957545486979e-05, + "loss": 0.2596, + "step": 4006 + }, + { + "epoch": 0.07146933970677416, + "grad_norm": 0.3942461311817169, + "learning_rate": 3.573849447021049e-05, + "loss": 0.2218, + "step": 4007 + }, + { + "epoch": 0.07148717582848785, + "grad_norm": 0.48561549186706543, + "learning_rate": 3.574741348555119e-05, + "loss": 0.4059, + "step": 4008 + }, + { + "epoch": 0.07150501195020155, + "grad_norm": 0.3484053611755371, + "learning_rate": 3.5756332500891906e-05, + "loss": 0.2734, + "step": 4009 + }, + { + "epoch": 0.07152284807191524, + "grad_norm": 0.46138960123062134, + "learning_rate": 3.576525151623261e-05, + "loss": 0.2802, + "step": 4010 + }, + { + "epoch": 0.07154068419362894, + "grad_norm": 0.3479671776294708, + "learning_rate": 3.577417053157331e-05, + "loss": 0.2435, + "step": 4011 + }, + { + "epoch": 0.07155852031534263, + "grad_norm": 0.35693997144699097, + "learning_rate": 3.5783089546914025e-05, + "loss": 0.1869, + "step": 4012 + }, + { + "epoch": 0.07157635643705633, + "grad_norm": 0.39361152052879333, + "learning_rate": 3.579200856225473e-05, + "loss": 0.3071, + "step": 4013 + }, + { + "epoch": 0.07159419255877002, + "grad_norm": 0.4611194133758545, + "learning_rate": 3.580092757759543e-05, + "loss": 0.3552, + "step": 4014 + }, + { + "epoch": 0.07161202868048372, + "grad_norm": 0.3547293543815613, + "learning_rate": 3.5809846592936144e-05, + "loss": 0.2846, + "step": 4015 + }, + { + "epoch": 0.07162986480219741, + "grad_norm": 0.4279080331325531, + "learning_rate": 3.581876560827685e-05, + "loss": 0.3042, + "step": 4016 + }, + { + "epoch": 0.07164770092391111, + "grad_norm": 0.3108879327774048, + "learning_rate": 3.582768462361755e-05, + "loss": 0.2675, + "step": 4017 + }, + { + "epoch": 0.0716655370456248, + "grad_norm": 0.39170220494270325, + "learning_rate": 3.5836603638958264e-05, + "loss": 0.3341, + "step": 4018 + }, + { + "epoch": 0.0716833731673385, + "grad_norm": 0.343044638633728, + "learning_rate": 3.584552265429897e-05, + "loss": 0.2676, + "step": 4019 + }, + { + "epoch": 0.07170120928905219, + "grad_norm": 0.40363213419914246, + "learning_rate": 3.585444166963967e-05, + "loss": 0.3478, + "step": 4020 + }, + { + "epoch": 0.07171904541076589, + "grad_norm": 0.33018651604652405, + "learning_rate": 3.586336068498038e-05, + "loss": 0.2652, + "step": 4021 + }, + { + "epoch": 0.07173688153247958, + "grad_norm": 0.26141229271888733, + "learning_rate": 3.587227970032109e-05, + "loss": 0.2274, + "step": 4022 + }, + { + "epoch": 0.07175471765419328, + "grad_norm": 0.37298089265823364, + "learning_rate": 3.588119871566179e-05, + "loss": 0.2807, + "step": 4023 + }, + { + "epoch": 0.07177255377590697, + "grad_norm": 0.34697332978248596, + "learning_rate": 3.5890117731002495e-05, + "loss": 0.2605, + "step": 4024 + }, + { + "epoch": 0.07179038989762065, + "grad_norm": 0.4437396824359894, + "learning_rate": 3.589903674634321e-05, + "loss": 0.3161, + "step": 4025 + }, + { + "epoch": 0.07180822601933436, + "grad_norm": 0.36411213874816895, + "learning_rate": 3.590795576168391e-05, + "loss": 0.2671, + "step": 4026 + }, + { + "epoch": 0.07182606214104804, + "grad_norm": 0.3403491973876953, + "learning_rate": 3.5916874777024614e-05, + "loss": 0.2379, + "step": 4027 + }, + { + "epoch": 0.07184389826276175, + "grad_norm": 0.34452518820762634, + "learning_rate": 3.592579379236533e-05, + "loss": 0.2731, + "step": 4028 + }, + { + "epoch": 0.07186173438447543, + "grad_norm": 0.3484019935131073, + "learning_rate": 3.5934712807706034e-05, + "loss": 0.3297, + "step": 4029 + }, + { + "epoch": 0.07187957050618914, + "grad_norm": 0.28667059540748596, + "learning_rate": 3.594363182304673e-05, + "loss": 0.2342, + "step": 4030 + }, + { + "epoch": 0.07189740662790282, + "grad_norm": 0.46884414553642273, + "learning_rate": 3.5952550838387446e-05, + "loss": 0.2701, + "step": 4031 + }, + { + "epoch": 0.07191524274961653, + "grad_norm": 0.4335917532444, + "learning_rate": 3.596146985372815e-05, + "loss": 0.3208, + "step": 4032 + }, + { + "epoch": 0.07193307887133021, + "grad_norm": 0.3155769109725952, + "learning_rate": 3.597038886906885e-05, + "loss": 0.2903, + "step": 4033 + }, + { + "epoch": 0.07195091499304392, + "grad_norm": 0.34757399559020996, + "learning_rate": 3.5979307884409566e-05, + "loss": 0.2854, + "step": 4034 + }, + { + "epoch": 0.0719687511147576, + "grad_norm": 0.42476123571395874, + "learning_rate": 3.598822689975027e-05, + "loss": 0.2812, + "step": 4035 + }, + { + "epoch": 0.0719865872364713, + "grad_norm": 0.3788479268550873, + "learning_rate": 3.599714591509097e-05, + "loss": 0.2432, + "step": 4036 + }, + { + "epoch": 0.072004423358185, + "grad_norm": 0.3588060736656189, + "learning_rate": 3.6006064930431685e-05, + "loss": 0.2805, + "step": 4037 + }, + { + "epoch": 0.0720222594798987, + "grad_norm": 0.35240060091018677, + "learning_rate": 3.601498394577239e-05, + "loss": 0.2647, + "step": 4038 + }, + { + "epoch": 0.07204009560161238, + "grad_norm": 0.3642655313014984, + "learning_rate": 3.602390296111309e-05, + "loss": 0.2678, + "step": 4039 + }, + { + "epoch": 0.07205793172332609, + "grad_norm": 0.41785523295402527, + "learning_rate": 3.6032821976453804e-05, + "loss": 0.269, + "step": 4040 + }, + { + "epoch": 0.07207576784503977, + "grad_norm": 0.48622530698776245, + "learning_rate": 3.604174099179451e-05, + "loss": 0.2998, + "step": 4041 + }, + { + "epoch": 0.07209360396675348, + "grad_norm": 0.5432617664337158, + "learning_rate": 3.605066000713521e-05, + "loss": 0.2878, + "step": 4042 + }, + { + "epoch": 0.07211144008846716, + "grad_norm": 0.40627986192703247, + "learning_rate": 3.605957902247592e-05, + "loss": 0.3137, + "step": 4043 + }, + { + "epoch": 0.07212927621018086, + "grad_norm": 0.3522103428840637, + "learning_rate": 3.606849803781663e-05, + "loss": 0.3098, + "step": 4044 + }, + { + "epoch": 0.07214711233189455, + "grad_norm": 0.2785491347312927, + "learning_rate": 3.607741705315733e-05, + "loss": 0.2428, + "step": 4045 + }, + { + "epoch": 0.07216494845360824, + "grad_norm": 0.3922576606273651, + "learning_rate": 3.608633606849804e-05, + "loss": 0.2949, + "step": 4046 + }, + { + "epoch": 0.07218278457532194, + "grad_norm": 0.41649967432022095, + "learning_rate": 3.609525508383875e-05, + "loss": 0.2817, + "step": 4047 + }, + { + "epoch": 0.07220062069703563, + "grad_norm": 0.284446656703949, + "learning_rate": 3.610417409917945e-05, + "loss": 0.2594, + "step": 4048 + }, + { + "epoch": 0.07221845681874933, + "grad_norm": 0.37044814229011536, + "learning_rate": 3.6113093114520155e-05, + "loss": 0.2537, + "step": 4049 + }, + { + "epoch": 0.07223629294046302, + "grad_norm": 0.3564845621585846, + "learning_rate": 3.612201212986087e-05, + "loss": 0.2759, + "step": 4050 + }, + { + "epoch": 0.07225412906217672, + "grad_norm": 0.44440025091171265, + "learning_rate": 3.6130931145201574e-05, + "loss": 0.3013, + "step": 4051 + }, + { + "epoch": 0.07227196518389041, + "grad_norm": 0.39229390025138855, + "learning_rate": 3.6139850160542274e-05, + "loss": 0.3123, + "step": 4052 + }, + { + "epoch": 0.07228980130560411, + "grad_norm": 0.3334742486476898, + "learning_rate": 3.614876917588299e-05, + "loss": 0.2487, + "step": 4053 + }, + { + "epoch": 0.0723076374273178, + "grad_norm": 0.41078320145606995, + "learning_rate": 3.615768819122369e-05, + "loss": 0.2915, + "step": 4054 + }, + { + "epoch": 0.0723254735490315, + "grad_norm": 0.402371883392334, + "learning_rate": 3.616660720656439e-05, + "loss": 0.2813, + "step": 4055 + }, + { + "epoch": 0.07234330967074519, + "grad_norm": 0.3465537428855896, + "learning_rate": 3.6175526221905106e-05, + "loss": 0.2211, + "step": 4056 + }, + { + "epoch": 0.07236114579245889, + "grad_norm": 0.4069521129131317, + "learning_rate": 3.618444523724581e-05, + "loss": 0.308, + "step": 4057 + }, + { + "epoch": 0.07237898191417258, + "grad_norm": 0.3115183115005493, + "learning_rate": 3.619336425258651e-05, + "loss": 0.2539, + "step": 4058 + }, + { + "epoch": 0.07239681803588628, + "grad_norm": 0.3484099805355072, + "learning_rate": 3.6202283267927225e-05, + "loss": 0.2711, + "step": 4059 + }, + { + "epoch": 0.07241465415759997, + "grad_norm": 0.3692646026611328, + "learning_rate": 3.621120228326793e-05, + "loss": 0.2349, + "step": 4060 + }, + { + "epoch": 0.07243249027931367, + "grad_norm": 0.29867902398109436, + "learning_rate": 3.622012129860863e-05, + "loss": 0.2434, + "step": 4061 + }, + { + "epoch": 0.07245032640102736, + "grad_norm": 0.3988645672798157, + "learning_rate": 3.6229040313949344e-05, + "loss": 0.3059, + "step": 4062 + }, + { + "epoch": 0.07246816252274106, + "grad_norm": 0.3653406500816345, + "learning_rate": 3.623795932929005e-05, + "loss": 0.2944, + "step": 4063 + }, + { + "epoch": 0.07248599864445475, + "grad_norm": 0.4839371144771576, + "learning_rate": 3.624687834463075e-05, + "loss": 0.2584, + "step": 4064 + }, + { + "epoch": 0.07250383476616845, + "grad_norm": 0.30030038952827454, + "learning_rate": 3.6255797359971463e-05, + "loss": 0.2203, + "step": 4065 + }, + { + "epoch": 0.07252167088788214, + "grad_norm": 0.34645602107048035, + "learning_rate": 3.626471637531217e-05, + "loss": 0.2825, + "step": 4066 + }, + { + "epoch": 0.07253950700959583, + "grad_norm": 0.37782979011535645, + "learning_rate": 3.627363539065287e-05, + "loss": 0.2746, + "step": 4067 + }, + { + "epoch": 0.07255734313130953, + "grad_norm": 0.34229543805122375, + "learning_rate": 3.628255440599358e-05, + "loss": 0.2778, + "step": 4068 + }, + { + "epoch": 0.07257517925302322, + "grad_norm": 0.3464083969593048, + "learning_rate": 3.629147342133429e-05, + "loss": 0.2561, + "step": 4069 + }, + { + "epoch": 0.07259301537473692, + "grad_norm": 0.31461194157600403, + "learning_rate": 3.630039243667499e-05, + "loss": 0.2399, + "step": 4070 + }, + { + "epoch": 0.0726108514964506, + "grad_norm": 0.25036537647247314, + "learning_rate": 3.63093114520157e-05, + "loss": 0.207, + "step": 4071 + }, + { + "epoch": 0.07262868761816431, + "grad_norm": 0.314247727394104, + "learning_rate": 3.631823046735641e-05, + "loss": 0.2621, + "step": 4072 + }, + { + "epoch": 0.072646523739878, + "grad_norm": 0.3913421332836151, + "learning_rate": 3.632714948269711e-05, + "loss": 0.2461, + "step": 4073 + }, + { + "epoch": 0.0726643598615917, + "grad_norm": 0.3956703841686249, + "learning_rate": 3.6336068498037814e-05, + "loss": 0.3336, + "step": 4074 + }, + { + "epoch": 0.07268219598330539, + "grad_norm": 0.3772980570793152, + "learning_rate": 3.634498751337853e-05, + "loss": 0.2774, + "step": 4075 + }, + { + "epoch": 0.07270003210501909, + "grad_norm": 0.37001925706863403, + "learning_rate": 3.6353906528719234e-05, + "loss": 0.2694, + "step": 4076 + }, + { + "epoch": 0.07271786822673278, + "grad_norm": 0.46089768409729004, + "learning_rate": 3.636282554405993e-05, + "loss": 0.3106, + "step": 4077 + }, + { + "epoch": 0.07273570434844648, + "grad_norm": 0.3164888620376587, + "learning_rate": 3.6371744559400646e-05, + "loss": 0.2444, + "step": 4078 + }, + { + "epoch": 0.07275354047016017, + "grad_norm": 0.3862965703010559, + "learning_rate": 3.638066357474135e-05, + "loss": 0.3327, + "step": 4079 + }, + { + "epoch": 0.07277137659187387, + "grad_norm": 0.43099600076675415, + "learning_rate": 3.638958259008205e-05, + "loss": 0.2883, + "step": 4080 + }, + { + "epoch": 0.07278921271358756, + "grad_norm": 0.3839138448238373, + "learning_rate": 3.6398501605422766e-05, + "loss": 0.3069, + "step": 4081 + }, + { + "epoch": 0.07280704883530126, + "grad_norm": 0.4318084716796875, + "learning_rate": 3.640742062076347e-05, + "loss": 0.3343, + "step": 4082 + }, + { + "epoch": 0.07282488495701495, + "grad_norm": 0.4460492432117462, + "learning_rate": 3.641633963610417e-05, + "loss": 0.2989, + "step": 4083 + }, + { + "epoch": 0.07284272107872865, + "grad_norm": 0.3706546723842621, + "learning_rate": 3.6425258651444885e-05, + "loss": 0.28, + "step": 4084 + }, + { + "epoch": 0.07286055720044234, + "grad_norm": 0.3881326913833618, + "learning_rate": 3.643417766678559e-05, + "loss": 0.2534, + "step": 4085 + }, + { + "epoch": 0.07287839332215602, + "grad_norm": 0.34988898038864136, + "learning_rate": 3.644309668212629e-05, + "loss": 0.2833, + "step": 4086 + }, + { + "epoch": 0.07289622944386973, + "grad_norm": 0.35669222474098206, + "learning_rate": 3.6452015697467004e-05, + "loss": 0.2807, + "step": 4087 + }, + { + "epoch": 0.07291406556558341, + "grad_norm": 0.470527708530426, + "learning_rate": 3.646093471280771e-05, + "loss": 0.2369, + "step": 4088 + }, + { + "epoch": 0.07293190168729712, + "grad_norm": 0.3309403359889984, + "learning_rate": 3.646985372814841e-05, + "loss": 0.2655, + "step": 4089 + }, + { + "epoch": 0.0729497378090108, + "grad_norm": 0.29176491498947144, + "learning_rate": 3.647877274348912e-05, + "loss": 0.2524, + "step": 4090 + }, + { + "epoch": 0.0729675739307245, + "grad_norm": 0.30771803855895996, + "learning_rate": 3.648769175882983e-05, + "loss": 0.2541, + "step": 4091 + }, + { + "epoch": 0.0729854100524382, + "grad_norm": 0.37845999002456665, + "learning_rate": 3.649661077417053e-05, + "loss": 0.2424, + "step": 4092 + }, + { + "epoch": 0.0730032461741519, + "grad_norm": 0.49875691533088684, + "learning_rate": 3.650552978951124e-05, + "loss": 0.2667, + "step": 4093 + }, + { + "epoch": 0.07302108229586558, + "grad_norm": 0.6584007143974304, + "learning_rate": 3.651444880485195e-05, + "loss": 0.3108, + "step": 4094 + }, + { + "epoch": 0.07303891841757928, + "grad_norm": 0.43037962913513184, + "learning_rate": 3.652336782019265e-05, + "loss": 0.2849, + "step": 4095 + }, + { + "epoch": 0.07305675453929297, + "grad_norm": 0.554920494556427, + "learning_rate": 3.653228683553336e-05, + "loss": 0.2853, + "step": 4096 + }, + { + "epoch": 0.07307459066100667, + "grad_norm": 0.3984546959400177, + "learning_rate": 3.654120585087407e-05, + "loss": 0.2665, + "step": 4097 + }, + { + "epoch": 0.07309242678272036, + "grad_norm": 0.36884525418281555, + "learning_rate": 3.6550124866214774e-05, + "loss": 0.29, + "step": 4098 + }, + { + "epoch": 0.07311026290443406, + "grad_norm": 0.4039285480976105, + "learning_rate": 3.655904388155548e-05, + "loss": 0.2768, + "step": 4099 + }, + { + "epoch": 0.07312809902614775, + "grad_norm": 0.4425659775733948, + "learning_rate": 3.656796289689619e-05, + "loss": 0.3542, + "step": 4100 + }, + { + "epoch": 0.07314593514786145, + "grad_norm": 0.2978968024253845, + "learning_rate": 3.657688191223689e-05, + "loss": 0.2677, + "step": 4101 + }, + { + "epoch": 0.07316377126957514, + "grad_norm": 0.46927177906036377, + "learning_rate": 3.658580092757759e-05, + "loss": 0.3002, + "step": 4102 + }, + { + "epoch": 0.07318160739128884, + "grad_norm": 0.42345908284187317, + "learning_rate": 3.6594719942918306e-05, + "loss": 0.3118, + "step": 4103 + }, + { + "epoch": 0.07319944351300253, + "grad_norm": 0.37958866357803345, + "learning_rate": 3.660363895825901e-05, + "loss": 0.3022, + "step": 4104 + }, + { + "epoch": 0.07321727963471623, + "grad_norm": 0.3434056043624878, + "learning_rate": 3.661255797359971e-05, + "loss": 0.2343, + "step": 4105 + }, + { + "epoch": 0.07323511575642992, + "grad_norm": 0.3816780745983124, + "learning_rate": 3.6621476988940425e-05, + "loss": 0.3118, + "step": 4106 + }, + { + "epoch": 0.07325295187814361, + "grad_norm": 0.37504827976226807, + "learning_rate": 3.663039600428113e-05, + "loss": 0.2506, + "step": 4107 + }, + { + "epoch": 0.07327078799985731, + "grad_norm": 0.5456230044364929, + "learning_rate": 3.663931501962183e-05, + "loss": 0.3328, + "step": 4108 + }, + { + "epoch": 0.073288624121571, + "grad_norm": 0.37133750319480896, + "learning_rate": 3.6648234034962544e-05, + "loss": 0.3241, + "step": 4109 + }, + { + "epoch": 0.0733064602432847, + "grad_norm": 0.49705299735069275, + "learning_rate": 3.665715305030325e-05, + "loss": 0.3726, + "step": 4110 + }, + { + "epoch": 0.07332429636499839, + "grad_norm": 0.32278305292129517, + "learning_rate": 3.666607206564395e-05, + "loss": 0.2904, + "step": 4111 + }, + { + "epoch": 0.07334213248671209, + "grad_norm": 0.3209100663661957, + "learning_rate": 3.667499108098466e-05, + "loss": 0.2812, + "step": 4112 + }, + { + "epoch": 0.07335996860842578, + "grad_norm": 0.4132683277130127, + "learning_rate": 3.668391009632537e-05, + "loss": 0.3324, + "step": 4113 + }, + { + "epoch": 0.07337780473013948, + "grad_norm": 0.5577122569084167, + "learning_rate": 3.669282911166607e-05, + "loss": 0.3768, + "step": 4114 + }, + { + "epoch": 0.07339564085185317, + "grad_norm": 0.403633177280426, + "learning_rate": 3.670174812700678e-05, + "loss": 0.2625, + "step": 4115 + }, + { + "epoch": 0.07341347697356687, + "grad_norm": 0.3335971534252167, + "learning_rate": 3.671066714234749e-05, + "loss": 0.2696, + "step": 4116 + }, + { + "epoch": 0.07343131309528056, + "grad_norm": 0.34153881669044495, + "learning_rate": 3.671958615768819e-05, + "loss": 0.2817, + "step": 4117 + }, + { + "epoch": 0.07344914921699426, + "grad_norm": 0.49378255009651184, + "learning_rate": 3.67285051730289e-05, + "loss": 0.3012, + "step": 4118 + }, + { + "epoch": 0.07346698533870795, + "grad_norm": 0.38548022508621216, + "learning_rate": 3.673742418836961e-05, + "loss": 0.2728, + "step": 4119 + }, + { + "epoch": 0.07348482146042165, + "grad_norm": 0.26984280347824097, + "learning_rate": 3.674634320371031e-05, + "loss": 0.2286, + "step": 4120 + }, + { + "epoch": 0.07350265758213534, + "grad_norm": 0.41002365946769714, + "learning_rate": 3.675526221905102e-05, + "loss": 0.2589, + "step": 4121 + }, + { + "epoch": 0.07352049370384904, + "grad_norm": 0.45269328355789185, + "learning_rate": 3.676418123439173e-05, + "loss": 0.2685, + "step": 4122 + }, + { + "epoch": 0.07353832982556273, + "grad_norm": 0.4685843586921692, + "learning_rate": 3.6773100249732433e-05, + "loss": 0.353, + "step": 4123 + }, + { + "epoch": 0.07355616594727643, + "grad_norm": 0.32437804341316223, + "learning_rate": 3.678201926507314e-05, + "loss": 0.3025, + "step": 4124 + }, + { + "epoch": 0.07357400206899012, + "grad_norm": 0.29815438389778137, + "learning_rate": 3.6790938280413846e-05, + "loss": 0.3155, + "step": 4125 + }, + { + "epoch": 0.0735918381907038, + "grad_norm": 0.4532708525657654, + "learning_rate": 3.679985729575455e-05, + "loss": 0.3055, + "step": 4126 + }, + { + "epoch": 0.07360967431241751, + "grad_norm": 0.3672865331172943, + "learning_rate": 3.680877631109525e-05, + "loss": 0.2607, + "step": 4127 + }, + { + "epoch": 0.0736275104341312, + "grad_norm": 0.4694015085697174, + "learning_rate": 3.6817695326435965e-05, + "loss": 0.3161, + "step": 4128 + }, + { + "epoch": 0.0736453465558449, + "grad_norm": 0.43574950098991394, + "learning_rate": 3.682661434177667e-05, + "loss": 0.3091, + "step": 4129 + }, + { + "epoch": 0.07366318267755859, + "grad_norm": 0.3772227466106415, + "learning_rate": 3.683553335711737e-05, + "loss": 0.2929, + "step": 4130 + }, + { + "epoch": 0.07368101879927229, + "grad_norm": 0.3468133509159088, + "learning_rate": 3.6844452372458085e-05, + "loss": 0.2648, + "step": 4131 + }, + { + "epoch": 0.07369885492098598, + "grad_norm": 0.3430408835411072, + "learning_rate": 3.685337138779879e-05, + "loss": 0.2749, + "step": 4132 + }, + { + "epoch": 0.07371669104269968, + "grad_norm": 0.3087609112262726, + "learning_rate": 3.686229040313949e-05, + "loss": 0.2363, + "step": 4133 + }, + { + "epoch": 0.07373452716441337, + "grad_norm": 0.29770421981811523, + "learning_rate": 3.6871209418480204e-05, + "loss": 0.2397, + "step": 4134 + }, + { + "epoch": 0.07375236328612707, + "grad_norm": 0.4533739984035492, + "learning_rate": 3.688012843382091e-05, + "loss": 0.3015, + "step": 4135 + }, + { + "epoch": 0.07377019940784076, + "grad_norm": 0.34918293356895447, + "learning_rate": 3.688904744916161e-05, + "loss": 0.3049, + "step": 4136 + }, + { + "epoch": 0.07378803552955446, + "grad_norm": 0.3501115143299103, + "learning_rate": 3.689796646450232e-05, + "loss": 0.2774, + "step": 4137 + }, + { + "epoch": 0.07380587165126815, + "grad_norm": 0.31595152616500854, + "learning_rate": 3.690688547984303e-05, + "loss": 0.2736, + "step": 4138 + }, + { + "epoch": 0.07382370777298185, + "grad_norm": 0.3417123854160309, + "learning_rate": 3.691580449518373e-05, + "loss": 0.2105, + "step": 4139 + }, + { + "epoch": 0.07384154389469554, + "grad_norm": 0.348685622215271, + "learning_rate": 3.692472351052444e-05, + "loss": 0.2623, + "step": 4140 + }, + { + "epoch": 0.07385938001640924, + "grad_norm": 0.3683890700340271, + "learning_rate": 3.693364252586515e-05, + "loss": 0.3127, + "step": 4141 + }, + { + "epoch": 0.07387721613812293, + "grad_norm": 0.4189675450325012, + "learning_rate": 3.694256154120585e-05, + "loss": 0.3463, + "step": 4142 + }, + { + "epoch": 0.07389505225983663, + "grad_norm": 0.28707680106163025, + "learning_rate": 3.695148055654656e-05, + "loss": 0.2451, + "step": 4143 + }, + { + "epoch": 0.07391288838155032, + "grad_norm": 0.48650121688842773, + "learning_rate": 3.696039957188727e-05, + "loss": 0.2875, + "step": 4144 + }, + { + "epoch": 0.07393072450326402, + "grad_norm": 0.3325197100639343, + "learning_rate": 3.696931858722797e-05, + "loss": 0.2779, + "step": 4145 + }, + { + "epoch": 0.0739485606249777, + "grad_norm": 0.3326643109321594, + "learning_rate": 3.697823760256868e-05, + "loss": 0.2357, + "step": 4146 + }, + { + "epoch": 0.07396639674669139, + "grad_norm": 0.4176287353038788, + "learning_rate": 3.698715661790939e-05, + "loss": 0.3024, + "step": 4147 + }, + { + "epoch": 0.0739842328684051, + "grad_norm": 0.4428405165672302, + "learning_rate": 3.699607563325009e-05, + "loss": 0.2801, + "step": 4148 + }, + { + "epoch": 0.07400206899011878, + "grad_norm": 0.33560845255851746, + "learning_rate": 3.70049946485908e-05, + "loss": 0.2431, + "step": 4149 + }, + { + "epoch": 0.07401990511183248, + "grad_norm": 0.41161322593688965, + "learning_rate": 3.7013913663931506e-05, + "loss": 0.3095, + "step": 4150 + }, + { + "epoch": 0.07403774123354617, + "grad_norm": 0.34065937995910645, + "learning_rate": 3.702283267927221e-05, + "loss": 0.3246, + "step": 4151 + }, + { + "epoch": 0.07405557735525987, + "grad_norm": 0.35372689366340637, + "learning_rate": 3.703175169461291e-05, + "loss": 0.2366, + "step": 4152 + }, + { + "epoch": 0.07407341347697356, + "grad_norm": 0.41967201232910156, + "learning_rate": 3.7040670709953625e-05, + "loss": 0.2655, + "step": 4153 + }, + { + "epoch": 0.07409124959868726, + "grad_norm": 0.47905057668685913, + "learning_rate": 3.704958972529433e-05, + "loss": 0.2473, + "step": 4154 + }, + { + "epoch": 0.07410908572040095, + "grad_norm": 0.49359965324401855, + "learning_rate": 3.705850874063503e-05, + "loss": 0.2915, + "step": 4155 + }, + { + "epoch": 0.07412692184211465, + "grad_norm": 0.36214080452919006, + "learning_rate": 3.7067427755975744e-05, + "loss": 0.2135, + "step": 4156 + }, + { + "epoch": 0.07414475796382834, + "grad_norm": 0.4641368091106415, + "learning_rate": 3.707634677131645e-05, + "loss": 0.3361, + "step": 4157 + }, + { + "epoch": 0.07416259408554204, + "grad_norm": 0.4047207832336426, + "learning_rate": 3.708526578665715e-05, + "loss": 0.2564, + "step": 4158 + }, + { + "epoch": 0.07418043020725573, + "grad_norm": 0.45328521728515625, + "learning_rate": 3.709418480199786e-05, + "loss": 0.3188, + "step": 4159 + }, + { + "epoch": 0.07419826632896943, + "grad_norm": 0.43802186846733093, + "learning_rate": 3.710310381733857e-05, + "loss": 0.2796, + "step": 4160 + }, + { + "epoch": 0.07421610245068312, + "grad_norm": 0.2616294026374817, + "learning_rate": 3.711202283267927e-05, + "loss": 0.2456, + "step": 4161 + }, + { + "epoch": 0.07423393857239682, + "grad_norm": 0.39143669605255127, + "learning_rate": 3.712094184801998e-05, + "loss": 0.2534, + "step": 4162 + }, + { + "epoch": 0.07425177469411051, + "grad_norm": 0.5078184604644775, + "learning_rate": 3.712986086336069e-05, + "loss": 0.3468, + "step": 4163 + }, + { + "epoch": 0.07426961081582421, + "grad_norm": 0.5399411916732788, + "learning_rate": 3.713877987870139e-05, + "loss": 0.3326, + "step": 4164 + }, + { + "epoch": 0.0742874469375379, + "grad_norm": 0.32032978534698486, + "learning_rate": 3.71476988940421e-05, + "loss": 0.2905, + "step": 4165 + }, + { + "epoch": 0.07430528305925159, + "grad_norm": 0.3497765064239502, + "learning_rate": 3.715661790938281e-05, + "loss": 0.2818, + "step": 4166 + }, + { + "epoch": 0.07432311918096529, + "grad_norm": 0.44904711842536926, + "learning_rate": 3.716553692472351e-05, + "loss": 0.3238, + "step": 4167 + }, + { + "epoch": 0.07434095530267898, + "grad_norm": 0.36625099182128906, + "learning_rate": 3.717445594006422e-05, + "loss": 0.3127, + "step": 4168 + }, + { + "epoch": 0.07435879142439268, + "grad_norm": 0.365116685628891, + "learning_rate": 3.718337495540493e-05, + "loss": 0.2337, + "step": 4169 + }, + { + "epoch": 0.07437662754610637, + "grad_norm": 1.0939042568206787, + "learning_rate": 3.719229397074563e-05, + "loss": 0.2261, + "step": 4170 + }, + { + "epoch": 0.07439446366782007, + "grad_norm": 0.36195090413093567, + "learning_rate": 3.720121298608634e-05, + "loss": 0.2614, + "step": 4171 + }, + { + "epoch": 0.07441229978953376, + "grad_norm": 0.49965110421180725, + "learning_rate": 3.7210132001427046e-05, + "loss": 0.2952, + "step": 4172 + }, + { + "epoch": 0.07443013591124746, + "grad_norm": 0.6151539087295532, + "learning_rate": 3.721905101676775e-05, + "loss": 0.2978, + "step": 4173 + }, + { + "epoch": 0.07444797203296115, + "grad_norm": 0.33340463042259216, + "learning_rate": 3.722797003210846e-05, + "loss": 0.2262, + "step": 4174 + }, + { + "epoch": 0.07446580815467485, + "grad_norm": 0.4527425765991211, + "learning_rate": 3.7236889047449165e-05, + "loss": 0.2469, + "step": 4175 + }, + { + "epoch": 0.07448364427638854, + "grad_norm": 0.5253557562828064, + "learning_rate": 3.724580806278987e-05, + "loss": 0.3149, + "step": 4176 + }, + { + "epoch": 0.07450148039810224, + "grad_norm": 0.41695138812065125, + "learning_rate": 3.725472707813057e-05, + "loss": 0.2749, + "step": 4177 + }, + { + "epoch": 0.07451931651981593, + "grad_norm": 0.40068671107292175, + "learning_rate": 3.7263646093471284e-05, + "loss": 0.2867, + "step": 4178 + }, + { + "epoch": 0.07453715264152963, + "grad_norm": 0.31247401237487793, + "learning_rate": 3.727256510881199e-05, + "loss": 0.2669, + "step": 4179 + }, + { + "epoch": 0.07455498876324332, + "grad_norm": 0.28745976090431213, + "learning_rate": 3.728148412415269e-05, + "loss": 0.2492, + "step": 4180 + }, + { + "epoch": 0.07457282488495702, + "grad_norm": 0.3374185264110565, + "learning_rate": 3.7290403139493404e-05, + "loss": 0.2182, + "step": 4181 + }, + { + "epoch": 0.07459066100667071, + "grad_norm": 0.35271865129470825, + "learning_rate": 3.729932215483411e-05, + "loss": 0.258, + "step": 4182 + }, + { + "epoch": 0.07460849712838441, + "grad_norm": 0.3584800362586975, + "learning_rate": 3.730824117017481e-05, + "loss": 0.2396, + "step": 4183 + }, + { + "epoch": 0.0746263332500981, + "grad_norm": 0.40084826946258545, + "learning_rate": 3.731716018551552e-05, + "loss": 0.2503, + "step": 4184 + }, + { + "epoch": 0.0746441693718118, + "grad_norm": 0.42655277252197266, + "learning_rate": 3.732607920085623e-05, + "loss": 0.2585, + "step": 4185 + }, + { + "epoch": 0.07466200549352549, + "grad_norm": 0.5718599557876587, + "learning_rate": 3.733499821619693e-05, + "loss": 0.3211, + "step": 4186 + }, + { + "epoch": 0.07467984161523918, + "grad_norm": 0.3136085867881775, + "learning_rate": 3.734391723153764e-05, + "loss": 0.2252, + "step": 4187 + }, + { + "epoch": 0.07469767773695288, + "grad_norm": 0.4330437481403351, + "learning_rate": 3.735283624687835e-05, + "loss": 0.2141, + "step": 4188 + }, + { + "epoch": 0.07471551385866657, + "grad_norm": 0.3692575991153717, + "learning_rate": 3.736175526221905e-05, + "loss": 0.3017, + "step": 4189 + }, + { + "epoch": 0.07473334998038027, + "grad_norm": 0.39885830879211426, + "learning_rate": 3.737067427755976e-05, + "loss": 0.3543, + "step": 4190 + }, + { + "epoch": 0.07475118610209396, + "grad_norm": 0.5391544103622437, + "learning_rate": 3.737959329290047e-05, + "loss": 0.2122, + "step": 4191 + }, + { + "epoch": 0.07476902222380766, + "grad_norm": 0.33087414503097534, + "learning_rate": 3.738851230824117e-05, + "loss": 0.2777, + "step": 4192 + }, + { + "epoch": 0.07478685834552135, + "grad_norm": 0.32284751534461975, + "learning_rate": 3.739743132358188e-05, + "loss": 0.2488, + "step": 4193 + }, + { + "epoch": 0.07480469446723505, + "grad_norm": 0.38482213020324707, + "learning_rate": 3.7406350338922586e-05, + "loss": 0.3177, + "step": 4194 + }, + { + "epoch": 0.07482253058894874, + "grad_norm": 0.3215670883655548, + "learning_rate": 3.741526935426329e-05, + "loss": 0.2671, + "step": 4195 + }, + { + "epoch": 0.07484036671066244, + "grad_norm": 0.36106786131858826, + "learning_rate": 3.7424188369604e-05, + "loss": 0.2864, + "step": 4196 + }, + { + "epoch": 0.07485820283237613, + "grad_norm": 0.4248841106891632, + "learning_rate": 3.7433107384944706e-05, + "loss": 0.2748, + "step": 4197 + }, + { + "epoch": 0.07487603895408983, + "grad_norm": 0.3289128839969635, + "learning_rate": 3.744202640028541e-05, + "loss": 0.2389, + "step": 4198 + }, + { + "epoch": 0.07489387507580351, + "grad_norm": 0.4033012092113495, + "learning_rate": 3.745094541562612e-05, + "loss": 0.3701, + "step": 4199 + }, + { + "epoch": 0.07491171119751722, + "grad_norm": 0.3788880407810211, + "learning_rate": 3.7459864430966825e-05, + "loss": 0.3216, + "step": 4200 + }, + { + "epoch": 0.0749295473192309, + "grad_norm": 0.35157310962677, + "learning_rate": 3.746878344630753e-05, + "loss": 0.3199, + "step": 4201 + }, + { + "epoch": 0.0749473834409446, + "grad_norm": 0.330655574798584, + "learning_rate": 3.747770246164824e-05, + "loss": 0.2625, + "step": 4202 + }, + { + "epoch": 0.0749652195626583, + "grad_norm": 0.3260590732097626, + "learning_rate": 3.7486621476988944e-05, + "loss": 0.2614, + "step": 4203 + }, + { + "epoch": 0.074983055684372, + "grad_norm": 0.36597350239753723, + "learning_rate": 3.749554049232965e-05, + "loss": 0.2439, + "step": 4204 + }, + { + "epoch": 0.07500089180608568, + "grad_norm": 0.35310667753219604, + "learning_rate": 3.750445950767035e-05, + "loss": 0.2776, + "step": 4205 + }, + { + "epoch": 0.07501872792779937, + "grad_norm": 0.3559993803501129, + "learning_rate": 3.751337852301106e-05, + "loss": 0.2699, + "step": 4206 + }, + { + "epoch": 0.07503656404951307, + "grad_norm": 0.32624587416648865, + "learning_rate": 3.752229753835177e-05, + "loss": 0.2645, + "step": 4207 + }, + { + "epoch": 0.07505440017122676, + "grad_norm": 0.5166625380516052, + "learning_rate": 3.753121655369247e-05, + "loss": 0.3367, + "step": 4208 + }, + { + "epoch": 0.07507223629294046, + "grad_norm": 0.2428450733423233, + "learning_rate": 3.754013556903318e-05, + "loss": 0.2431, + "step": 4209 + }, + { + "epoch": 0.07509007241465415, + "grad_norm": 0.4116324484348297, + "learning_rate": 3.754905458437389e-05, + "loss": 0.2807, + "step": 4210 + }, + { + "epoch": 0.07510790853636785, + "grad_norm": 0.27581650018692017, + "learning_rate": 3.755797359971459e-05, + "loss": 0.2234, + "step": 4211 + }, + { + "epoch": 0.07512574465808154, + "grad_norm": 0.3597171902656555, + "learning_rate": 3.75668926150553e-05, + "loss": 0.2942, + "step": 4212 + }, + { + "epoch": 0.07514358077979524, + "grad_norm": 0.4038372039794922, + "learning_rate": 3.757581163039601e-05, + "loss": 0.2864, + "step": 4213 + }, + { + "epoch": 0.07516141690150893, + "grad_norm": 0.4361227750778198, + "learning_rate": 3.758473064573671e-05, + "loss": 0.2777, + "step": 4214 + }, + { + "epoch": 0.07517925302322263, + "grad_norm": 0.5329325795173645, + "learning_rate": 3.759364966107742e-05, + "loss": 0.2835, + "step": 4215 + }, + { + "epoch": 0.07519708914493632, + "grad_norm": 0.3828251361846924, + "learning_rate": 3.760256867641813e-05, + "loss": 0.2802, + "step": 4216 + }, + { + "epoch": 0.07521492526665002, + "grad_norm": 0.35325562953948975, + "learning_rate": 3.761148769175883e-05, + "loss": 0.2741, + "step": 4217 + }, + { + "epoch": 0.07523276138836371, + "grad_norm": 0.37237346172332764, + "learning_rate": 3.762040670709954e-05, + "loss": 0.2834, + "step": 4218 + }, + { + "epoch": 0.07525059751007741, + "grad_norm": 0.44692501425743103, + "learning_rate": 3.7629325722440246e-05, + "loss": 0.2587, + "step": 4219 + }, + { + "epoch": 0.0752684336317911, + "grad_norm": 0.3682231903076172, + "learning_rate": 3.763824473778095e-05, + "loss": 0.2642, + "step": 4220 + }, + { + "epoch": 0.0752862697535048, + "grad_norm": 0.4995605945587158, + "learning_rate": 3.764716375312166e-05, + "loss": 0.2394, + "step": 4221 + }, + { + "epoch": 0.07530410587521849, + "grad_norm": 0.33465439081192017, + "learning_rate": 3.7656082768462365e-05, + "loss": 0.2605, + "step": 4222 + }, + { + "epoch": 0.07532194199693219, + "grad_norm": 0.39678364992141724, + "learning_rate": 3.766500178380307e-05, + "loss": 0.355, + "step": 4223 + }, + { + "epoch": 0.07533977811864588, + "grad_norm": 0.3365713655948639, + "learning_rate": 3.767392079914378e-05, + "loss": 0.2951, + "step": 4224 + }, + { + "epoch": 0.07535761424035958, + "grad_norm": 0.35178980231285095, + "learning_rate": 3.7682839814484484e-05, + "loss": 0.2638, + "step": 4225 + }, + { + "epoch": 0.07537545036207327, + "grad_norm": 0.47750401496887207, + "learning_rate": 3.769175882982519e-05, + "loss": 0.3176, + "step": 4226 + }, + { + "epoch": 0.07539328648378696, + "grad_norm": 0.3698870539665222, + "learning_rate": 3.77006778451659e-05, + "loss": 0.2659, + "step": 4227 + }, + { + "epoch": 0.07541112260550066, + "grad_norm": 0.25361302495002747, + "learning_rate": 3.7709596860506603e-05, + "loss": 0.2323, + "step": 4228 + }, + { + "epoch": 0.07542895872721435, + "grad_norm": 0.36756184697151184, + "learning_rate": 3.771851587584731e-05, + "loss": 0.2129, + "step": 4229 + }, + { + "epoch": 0.07544679484892805, + "grad_norm": 0.516951322555542, + "learning_rate": 3.772743489118801e-05, + "loss": 0.2984, + "step": 4230 + }, + { + "epoch": 0.07546463097064174, + "grad_norm": 0.31373366713523865, + "learning_rate": 3.773635390652872e-05, + "loss": 0.2698, + "step": 4231 + }, + { + "epoch": 0.07548246709235544, + "grad_norm": 0.2851543724536896, + "learning_rate": 3.774527292186943e-05, + "loss": 0.2624, + "step": 4232 + }, + { + "epoch": 0.07550030321406913, + "grad_norm": 0.33759239315986633, + "learning_rate": 3.775419193721013e-05, + "loss": 0.3008, + "step": 4233 + }, + { + "epoch": 0.07551813933578283, + "grad_norm": 0.38497021794319153, + "learning_rate": 3.776311095255084e-05, + "loss": 0.2882, + "step": 4234 + }, + { + "epoch": 0.07553597545749652, + "grad_norm": 0.4482494294643402, + "learning_rate": 3.777202996789155e-05, + "loss": 0.2576, + "step": 4235 + }, + { + "epoch": 0.07555381157921022, + "grad_norm": 0.31993567943573, + "learning_rate": 3.778094898323225e-05, + "loss": 0.2589, + "step": 4236 + }, + { + "epoch": 0.07557164770092391, + "grad_norm": 0.3316757380962372, + "learning_rate": 3.778986799857296e-05, + "loss": 0.3218, + "step": 4237 + }, + { + "epoch": 0.07558948382263761, + "grad_norm": 0.3332213759422302, + "learning_rate": 3.779878701391367e-05, + "loss": 0.2829, + "step": 4238 + }, + { + "epoch": 0.0756073199443513, + "grad_norm": 0.27364617586135864, + "learning_rate": 3.780770602925437e-05, + "loss": 0.2453, + "step": 4239 + }, + { + "epoch": 0.075625156066065, + "grad_norm": 0.5406433939933777, + "learning_rate": 3.781662504459508e-05, + "loss": 0.2576, + "step": 4240 + }, + { + "epoch": 0.07564299218777869, + "grad_norm": 0.31070637702941895, + "learning_rate": 3.7825544059935786e-05, + "loss": 0.2643, + "step": 4241 + }, + { + "epoch": 0.07566082830949239, + "grad_norm": 0.43498286604881287, + "learning_rate": 3.783446307527649e-05, + "loss": 0.339, + "step": 4242 + }, + { + "epoch": 0.07567866443120608, + "grad_norm": 0.43137484788894653, + "learning_rate": 3.78433820906172e-05, + "loss": 0.2889, + "step": 4243 + }, + { + "epoch": 0.07569650055291978, + "grad_norm": 0.3390316367149353, + "learning_rate": 3.7852301105957906e-05, + "loss": 0.2694, + "step": 4244 + }, + { + "epoch": 0.07571433667463347, + "grad_norm": 0.4314836263656616, + "learning_rate": 3.786122012129861e-05, + "loss": 0.3117, + "step": 4245 + }, + { + "epoch": 0.07573217279634717, + "grad_norm": 0.5436944365501404, + "learning_rate": 3.787013913663932e-05, + "loss": 0.3048, + "step": 4246 + }, + { + "epoch": 0.07575000891806086, + "grad_norm": 0.4359453022480011, + "learning_rate": 3.7879058151980025e-05, + "loss": 0.2824, + "step": 4247 + }, + { + "epoch": 0.07576784503977455, + "grad_norm": 0.4316657781600952, + "learning_rate": 3.788797716732073e-05, + "loss": 0.3018, + "step": 4248 + }, + { + "epoch": 0.07578568116148825, + "grad_norm": 0.593874990940094, + "learning_rate": 3.789689618266144e-05, + "loss": 0.2782, + "step": 4249 + }, + { + "epoch": 0.07580351728320193, + "grad_norm": 0.36799126863479614, + "learning_rate": 3.7905815198002144e-05, + "loss": 0.2435, + "step": 4250 + }, + { + "epoch": 0.07582135340491564, + "grad_norm": 0.4341272711753845, + "learning_rate": 3.791473421334285e-05, + "loss": 0.2978, + "step": 4251 + }, + { + "epoch": 0.07583918952662932, + "grad_norm": 0.33046677708625793, + "learning_rate": 3.7923653228683557e-05, + "loss": 0.2872, + "step": 4252 + }, + { + "epoch": 0.07585702564834303, + "grad_norm": 0.3701927661895752, + "learning_rate": 3.793257224402426e-05, + "loss": 0.2577, + "step": 4253 + }, + { + "epoch": 0.07587486177005671, + "grad_norm": 0.38218578696250916, + "learning_rate": 3.794149125936497e-05, + "loss": 0.2744, + "step": 4254 + }, + { + "epoch": 0.07589269789177042, + "grad_norm": 0.513279378414154, + "learning_rate": 3.795041027470567e-05, + "loss": 0.2901, + "step": 4255 + }, + { + "epoch": 0.0759105340134841, + "grad_norm": 0.3848417103290558, + "learning_rate": 3.795932929004638e-05, + "loss": 0.3132, + "step": 4256 + }, + { + "epoch": 0.0759283701351978, + "grad_norm": 0.39559414982795715, + "learning_rate": 3.796824830538709e-05, + "loss": 0.3409, + "step": 4257 + }, + { + "epoch": 0.0759462062569115, + "grad_norm": 0.3061651587486267, + "learning_rate": 3.797716732072779e-05, + "loss": 0.2477, + "step": 4258 + }, + { + "epoch": 0.0759640423786252, + "grad_norm": 0.3205283284187317, + "learning_rate": 3.79860863360685e-05, + "loss": 0.2137, + "step": 4259 + }, + { + "epoch": 0.07598187850033888, + "grad_norm": 0.3794453740119934, + "learning_rate": 3.799500535140921e-05, + "loss": 0.2938, + "step": 4260 + }, + { + "epoch": 0.07599971462205259, + "grad_norm": 0.3018369972705841, + "learning_rate": 3.800392436674991e-05, + "loss": 0.2972, + "step": 4261 + }, + { + "epoch": 0.07601755074376627, + "grad_norm": 0.28759652376174927, + "learning_rate": 3.801284338209062e-05, + "loss": 0.2394, + "step": 4262 + }, + { + "epoch": 0.07603538686547998, + "grad_norm": 0.404039204120636, + "learning_rate": 3.802176239743133e-05, + "loss": 0.2396, + "step": 4263 + }, + { + "epoch": 0.07605322298719366, + "grad_norm": 0.3380386531352997, + "learning_rate": 3.803068141277203e-05, + "loss": 0.3133, + "step": 4264 + }, + { + "epoch": 0.07607105910890737, + "grad_norm": 0.3418702185153961, + "learning_rate": 3.803960042811274e-05, + "loss": 0.2456, + "step": 4265 + }, + { + "epoch": 0.07608889523062105, + "grad_norm": 0.3194468021392822, + "learning_rate": 3.8048519443453446e-05, + "loss": 0.245, + "step": 4266 + }, + { + "epoch": 0.07610673135233474, + "grad_norm": 0.32654035091400146, + "learning_rate": 3.805743845879415e-05, + "loss": 0.2416, + "step": 4267 + }, + { + "epoch": 0.07612456747404844, + "grad_norm": 0.3586922585964203, + "learning_rate": 3.806635747413486e-05, + "loss": 0.2773, + "step": 4268 + }, + { + "epoch": 0.07614240359576213, + "grad_norm": 0.4663313031196594, + "learning_rate": 3.8075276489475565e-05, + "loss": 0.3336, + "step": 4269 + }, + { + "epoch": 0.07616023971747583, + "grad_norm": 0.3342205286026001, + "learning_rate": 3.808419550481627e-05, + "loss": 0.2854, + "step": 4270 + }, + { + "epoch": 0.07617807583918952, + "grad_norm": 0.3806861937046051, + "learning_rate": 3.809311452015698e-05, + "loss": 0.3085, + "step": 4271 + }, + { + "epoch": 0.07619591196090322, + "grad_norm": 0.29361093044281006, + "learning_rate": 3.8102033535497684e-05, + "loss": 0.2345, + "step": 4272 + }, + { + "epoch": 0.07621374808261691, + "grad_norm": 0.34427595138549805, + "learning_rate": 3.811095255083839e-05, + "loss": 0.2064, + "step": 4273 + }, + { + "epoch": 0.07623158420433061, + "grad_norm": 0.3486906588077545, + "learning_rate": 3.81198715661791e-05, + "loss": 0.2433, + "step": 4274 + }, + { + "epoch": 0.0762494203260443, + "grad_norm": 0.390524685382843, + "learning_rate": 3.81287905815198e-05, + "loss": 0.239, + "step": 4275 + }, + { + "epoch": 0.076267256447758, + "grad_norm": 0.4219571352005005, + "learning_rate": 3.813770959686051e-05, + "loss": 0.2721, + "step": 4276 + }, + { + "epoch": 0.07628509256947169, + "grad_norm": 0.39834532141685486, + "learning_rate": 3.8146628612201216e-05, + "loss": 0.2484, + "step": 4277 + }, + { + "epoch": 0.07630292869118539, + "grad_norm": 0.4055795967578888, + "learning_rate": 3.815554762754192e-05, + "loss": 0.3014, + "step": 4278 + }, + { + "epoch": 0.07632076481289908, + "grad_norm": 0.4223778247833252, + "learning_rate": 3.816446664288263e-05, + "loss": 0.3145, + "step": 4279 + }, + { + "epoch": 0.07633860093461278, + "grad_norm": 0.3109219968318939, + "learning_rate": 3.817338565822333e-05, + "loss": 0.2424, + "step": 4280 + }, + { + "epoch": 0.07635643705632647, + "grad_norm": 0.31068381667137146, + "learning_rate": 3.818230467356404e-05, + "loss": 0.2978, + "step": 4281 + }, + { + "epoch": 0.07637427317804017, + "grad_norm": 0.3217722773551941, + "learning_rate": 3.819122368890475e-05, + "loss": 0.25, + "step": 4282 + }, + { + "epoch": 0.07639210929975386, + "grad_norm": 0.304267942905426, + "learning_rate": 3.820014270424545e-05, + "loss": 0.2374, + "step": 4283 + }, + { + "epoch": 0.07640994542146756, + "grad_norm": 0.2824154794216156, + "learning_rate": 3.820906171958616e-05, + "loss": 0.2216, + "step": 4284 + }, + { + "epoch": 0.07642778154318125, + "grad_norm": 0.5808196067810059, + "learning_rate": 3.821798073492687e-05, + "loss": 0.3523, + "step": 4285 + }, + { + "epoch": 0.07644561766489495, + "grad_norm": 0.46458321809768677, + "learning_rate": 3.822689975026757e-05, + "loss": 0.3002, + "step": 4286 + }, + { + "epoch": 0.07646345378660864, + "grad_norm": 0.35721060633659363, + "learning_rate": 3.823581876560828e-05, + "loss": 0.2472, + "step": 4287 + }, + { + "epoch": 0.07648128990832233, + "grad_norm": 0.30313968658447266, + "learning_rate": 3.8244737780948986e-05, + "loss": 0.2663, + "step": 4288 + }, + { + "epoch": 0.07649912603003603, + "grad_norm": 0.30494025349617004, + "learning_rate": 3.825365679628969e-05, + "loss": 0.2412, + "step": 4289 + }, + { + "epoch": 0.07651696215174972, + "grad_norm": 0.46083828806877136, + "learning_rate": 3.82625758116304e-05, + "loss": 0.2804, + "step": 4290 + }, + { + "epoch": 0.07653479827346342, + "grad_norm": 0.4277346134185791, + "learning_rate": 3.8271494826971105e-05, + "loss": 0.283, + "step": 4291 + }, + { + "epoch": 0.07655263439517711, + "grad_norm": 0.3467538058757782, + "learning_rate": 3.828041384231181e-05, + "loss": 0.3038, + "step": 4292 + }, + { + "epoch": 0.07657047051689081, + "grad_norm": 0.3192565143108368, + "learning_rate": 3.828933285765252e-05, + "loss": 0.2661, + "step": 4293 + }, + { + "epoch": 0.0765883066386045, + "grad_norm": 0.25358909368515015, + "learning_rate": 3.8298251872993225e-05, + "loss": 0.2309, + "step": 4294 + }, + { + "epoch": 0.0766061427603182, + "grad_norm": 0.8111319541931152, + "learning_rate": 3.830717088833393e-05, + "loss": 0.317, + "step": 4295 + }, + { + "epoch": 0.07662397888203189, + "grad_norm": 0.21290086209774017, + "learning_rate": 3.831608990367464e-05, + "loss": 0.2004, + "step": 4296 + }, + { + "epoch": 0.07664181500374559, + "grad_norm": 0.3975752592086792, + "learning_rate": 3.8325008919015344e-05, + "loss": 0.2432, + "step": 4297 + }, + { + "epoch": 0.07665965112545928, + "grad_norm": 0.3503962457180023, + "learning_rate": 3.833392793435605e-05, + "loss": 0.2263, + "step": 4298 + }, + { + "epoch": 0.07667748724717298, + "grad_norm": 0.25473836064338684, + "learning_rate": 3.8342846949696756e-05, + "loss": 0.232, + "step": 4299 + }, + { + "epoch": 0.07669532336888667, + "grad_norm": 0.423564076423645, + "learning_rate": 3.835176596503746e-05, + "loss": 0.2917, + "step": 4300 + }, + { + "epoch": 0.07671315949060037, + "grad_norm": 0.39943602681159973, + "learning_rate": 3.836068498037817e-05, + "loss": 0.317, + "step": 4301 + }, + { + "epoch": 0.07673099561231406, + "grad_norm": 0.3379661738872528, + "learning_rate": 3.8369603995718876e-05, + "loss": 0.2949, + "step": 4302 + }, + { + "epoch": 0.07674883173402776, + "grad_norm": 0.36557328701019287, + "learning_rate": 3.837852301105958e-05, + "loss": 0.2942, + "step": 4303 + }, + { + "epoch": 0.07676666785574145, + "grad_norm": 0.2946944832801819, + "learning_rate": 3.838744202640029e-05, + "loss": 0.2648, + "step": 4304 + }, + { + "epoch": 0.07678450397745515, + "grad_norm": 0.3372962474822998, + "learning_rate": 3.8396361041740995e-05, + "loss": 0.2751, + "step": 4305 + }, + { + "epoch": 0.07680234009916884, + "grad_norm": 0.31931331753730774, + "learning_rate": 3.84052800570817e-05, + "loss": 0.2634, + "step": 4306 + }, + { + "epoch": 0.07682017622088252, + "grad_norm": 0.28606119751930237, + "learning_rate": 3.841419907242241e-05, + "loss": 0.2607, + "step": 4307 + }, + { + "epoch": 0.07683801234259623, + "grad_norm": 0.2836773097515106, + "learning_rate": 3.842311808776311e-05, + "loss": 0.2223, + "step": 4308 + }, + { + "epoch": 0.07685584846430991, + "grad_norm": 0.35678985714912415, + "learning_rate": 3.843203710310382e-05, + "loss": 0.3045, + "step": 4309 + }, + { + "epoch": 0.07687368458602362, + "grad_norm": 0.3318808078765869, + "learning_rate": 3.844095611844453e-05, + "loss": 0.2658, + "step": 4310 + }, + { + "epoch": 0.0768915207077373, + "grad_norm": 0.30833348631858826, + "learning_rate": 3.8449875133785226e-05, + "loss": 0.2436, + "step": 4311 + }, + { + "epoch": 0.076909356829451, + "grad_norm": 0.21143841743469238, + "learning_rate": 3.845879414912594e-05, + "loss": 0.2027, + "step": 4312 + }, + { + "epoch": 0.0769271929511647, + "grad_norm": 0.3417409360408783, + "learning_rate": 3.8467713164466646e-05, + "loss": 0.2491, + "step": 4313 + }, + { + "epoch": 0.0769450290728784, + "grad_norm": 0.3178887665271759, + "learning_rate": 3.847663217980735e-05, + "loss": 0.2471, + "step": 4314 + }, + { + "epoch": 0.07696286519459208, + "grad_norm": 0.4646736681461334, + "learning_rate": 3.848555119514806e-05, + "loss": 0.3514, + "step": 4315 + }, + { + "epoch": 0.07698070131630579, + "grad_norm": 0.414309561252594, + "learning_rate": 3.8494470210488765e-05, + "loss": 0.3137, + "step": 4316 + }, + { + "epoch": 0.07699853743801947, + "grad_norm": 0.41656121611595154, + "learning_rate": 3.850338922582947e-05, + "loss": 0.3102, + "step": 4317 + }, + { + "epoch": 0.07701637355973318, + "grad_norm": 0.3836608827114105, + "learning_rate": 3.851230824117018e-05, + "loss": 0.2531, + "step": 4318 + }, + { + "epoch": 0.07703420968144686, + "grad_norm": 0.34572160243988037, + "learning_rate": 3.8521227256510884e-05, + "loss": 0.2616, + "step": 4319 + }, + { + "epoch": 0.07705204580316057, + "grad_norm": 0.2973364591598511, + "learning_rate": 3.853014627185159e-05, + "loss": 0.2206, + "step": 4320 + }, + { + "epoch": 0.07706988192487425, + "grad_norm": 0.2860994338989258, + "learning_rate": 3.85390652871923e-05, + "loss": 0.2764, + "step": 4321 + }, + { + "epoch": 0.07708771804658796, + "grad_norm": 0.3224015533924103, + "learning_rate": 3.8547984302533e-05, + "loss": 0.2014, + "step": 4322 + }, + { + "epoch": 0.07710555416830164, + "grad_norm": 0.35749709606170654, + "learning_rate": 3.855690331787371e-05, + "loss": 0.2518, + "step": 4323 + }, + { + "epoch": 0.07712339029001535, + "grad_norm": 0.36966726183891296, + "learning_rate": 3.8565822333214416e-05, + "loss": 0.25, + "step": 4324 + }, + { + "epoch": 0.07714122641172903, + "grad_norm": 0.39020729064941406, + "learning_rate": 3.857474134855512e-05, + "loss": 0.2743, + "step": 4325 + }, + { + "epoch": 0.07715906253344273, + "grad_norm": 0.41506093740463257, + "learning_rate": 3.858366036389583e-05, + "loss": 0.243, + "step": 4326 + }, + { + "epoch": 0.07717689865515642, + "grad_norm": 0.34525230526924133, + "learning_rate": 3.8592579379236535e-05, + "loss": 0.2623, + "step": 4327 + }, + { + "epoch": 0.07719473477687011, + "grad_norm": 0.39055317640304565, + "learning_rate": 3.860149839457724e-05, + "loss": 0.2801, + "step": 4328 + }, + { + "epoch": 0.07721257089858381, + "grad_norm": 0.2509728968143463, + "learning_rate": 3.861041740991795e-05, + "loss": 0.2534, + "step": 4329 + }, + { + "epoch": 0.0772304070202975, + "grad_norm": 0.337094783782959, + "learning_rate": 3.8619336425258654e-05, + "loss": 0.2573, + "step": 4330 + }, + { + "epoch": 0.0772482431420112, + "grad_norm": 0.3051152229309082, + "learning_rate": 3.862825544059936e-05, + "loss": 0.2413, + "step": 4331 + }, + { + "epoch": 0.07726607926372489, + "grad_norm": 0.38071292638778687, + "learning_rate": 3.863717445594007e-05, + "loss": 0.2962, + "step": 4332 + }, + { + "epoch": 0.07728391538543859, + "grad_norm": 0.3132109045982361, + "learning_rate": 3.8646093471280767e-05, + "loss": 0.2417, + "step": 4333 + }, + { + "epoch": 0.07730175150715228, + "grad_norm": 0.3177194595336914, + "learning_rate": 3.865501248662148e-05, + "loss": 0.2613, + "step": 4334 + }, + { + "epoch": 0.07731958762886598, + "grad_norm": 0.3333134353160858, + "learning_rate": 3.8663931501962186e-05, + "loss": 0.2534, + "step": 4335 + }, + { + "epoch": 0.07733742375057967, + "grad_norm": 0.4596266448497772, + "learning_rate": 3.867285051730289e-05, + "loss": 0.3223, + "step": 4336 + }, + { + "epoch": 0.07735525987229337, + "grad_norm": 0.3712513744831085, + "learning_rate": 3.86817695326436e-05, + "loss": 0.2586, + "step": 4337 + }, + { + "epoch": 0.07737309599400706, + "grad_norm": 0.42716965079307556, + "learning_rate": 3.8690688547984305e-05, + "loss": 0.2528, + "step": 4338 + }, + { + "epoch": 0.07739093211572076, + "grad_norm": 0.3493940234184265, + "learning_rate": 3.869960756332501e-05, + "loss": 0.2737, + "step": 4339 + }, + { + "epoch": 0.07740876823743445, + "grad_norm": 0.33019155263900757, + "learning_rate": 3.870852657866572e-05, + "loss": 0.2258, + "step": 4340 + }, + { + "epoch": 0.07742660435914815, + "grad_norm": 0.37247031927108765, + "learning_rate": 3.8717445594006424e-05, + "loss": 0.3025, + "step": 4341 + }, + { + "epoch": 0.07744444048086184, + "grad_norm": 0.4213121831417084, + "learning_rate": 3.872636460934713e-05, + "loss": 0.2612, + "step": 4342 + }, + { + "epoch": 0.07746227660257554, + "grad_norm": 0.3395713269710541, + "learning_rate": 3.873528362468784e-05, + "loss": 0.2536, + "step": 4343 + }, + { + "epoch": 0.07748011272428923, + "grad_norm": 0.3237455487251282, + "learning_rate": 3.8744202640028544e-05, + "loss": 0.2746, + "step": 4344 + }, + { + "epoch": 0.07749794884600293, + "grad_norm": 0.427746057510376, + "learning_rate": 3.875312165536925e-05, + "loss": 0.2843, + "step": 4345 + }, + { + "epoch": 0.07751578496771662, + "grad_norm": 0.2901923656463623, + "learning_rate": 3.8762040670709956e-05, + "loss": 0.2611, + "step": 4346 + }, + { + "epoch": 0.07753362108943031, + "grad_norm": 0.2814500331878662, + "learning_rate": 3.877095968605066e-05, + "loss": 0.218, + "step": 4347 + }, + { + "epoch": 0.07755145721114401, + "grad_norm": 0.35352224111557007, + "learning_rate": 3.877987870139137e-05, + "loss": 0.236, + "step": 4348 + }, + { + "epoch": 0.0775692933328577, + "grad_norm": 0.4134027063846588, + "learning_rate": 3.8788797716732075e-05, + "loss": 0.2982, + "step": 4349 + }, + { + "epoch": 0.0775871294545714, + "grad_norm": 0.21507737040519714, + "learning_rate": 3.879771673207278e-05, + "loss": 0.2349, + "step": 4350 + }, + { + "epoch": 0.07760496557628509, + "grad_norm": 0.37958672642707825, + "learning_rate": 3.880663574741349e-05, + "loss": 0.2793, + "step": 4351 + }, + { + "epoch": 0.07762280169799879, + "grad_norm": 0.39292657375335693, + "learning_rate": 3.8815554762754195e-05, + "loss": 0.3436, + "step": 4352 + }, + { + "epoch": 0.07764063781971248, + "grad_norm": 0.28580406308174133, + "learning_rate": 3.88244737780949e-05, + "loss": 0.259, + "step": 4353 + }, + { + "epoch": 0.07765847394142618, + "grad_norm": 0.3572327494621277, + "learning_rate": 3.883339279343561e-05, + "loss": 0.2466, + "step": 4354 + }, + { + "epoch": 0.07767631006313987, + "grad_norm": 0.2864575684070587, + "learning_rate": 3.8842311808776314e-05, + "loss": 0.2505, + "step": 4355 + }, + { + "epoch": 0.07769414618485357, + "grad_norm": 0.3204314410686493, + "learning_rate": 3.885123082411702e-05, + "loss": 0.2431, + "step": 4356 + }, + { + "epoch": 0.07771198230656726, + "grad_norm": 0.4472067952156067, + "learning_rate": 3.8860149839457727e-05, + "loss": 0.3314, + "step": 4357 + }, + { + "epoch": 0.07772981842828096, + "grad_norm": 0.5395408868789673, + "learning_rate": 3.8869068854798426e-05, + "loss": 0.3384, + "step": 4358 + }, + { + "epoch": 0.07774765454999465, + "grad_norm": 0.3061109185218811, + "learning_rate": 3.887798787013914e-05, + "loss": 0.3083, + "step": 4359 + }, + { + "epoch": 0.07776549067170835, + "grad_norm": 0.3717777132987976, + "learning_rate": 3.8886906885479846e-05, + "loss": 0.2306, + "step": 4360 + }, + { + "epoch": 0.07778332679342204, + "grad_norm": 0.3284724950790405, + "learning_rate": 3.889582590082055e-05, + "loss": 0.2799, + "step": 4361 + }, + { + "epoch": 0.07780116291513574, + "grad_norm": 0.299607515335083, + "learning_rate": 3.890474491616126e-05, + "loss": 0.2749, + "step": 4362 + }, + { + "epoch": 0.07781899903684943, + "grad_norm": 0.2945071756839752, + "learning_rate": 3.8913663931501965e-05, + "loss": 0.2773, + "step": 4363 + }, + { + "epoch": 0.07783683515856313, + "grad_norm": 0.32768872380256653, + "learning_rate": 3.892258294684267e-05, + "loss": 0.3034, + "step": 4364 + }, + { + "epoch": 0.07785467128027682, + "grad_norm": 0.3400993347167969, + "learning_rate": 3.893150196218338e-05, + "loss": 0.2791, + "step": 4365 + }, + { + "epoch": 0.07787250740199052, + "grad_norm": 0.302428662776947, + "learning_rate": 3.8940420977524084e-05, + "loss": 0.2617, + "step": 4366 + }, + { + "epoch": 0.0778903435237042, + "grad_norm": 0.3427959978580475, + "learning_rate": 3.894933999286479e-05, + "loss": 0.2976, + "step": 4367 + }, + { + "epoch": 0.0779081796454179, + "grad_norm": 0.36683788895606995, + "learning_rate": 3.89582590082055e-05, + "loss": 0.2897, + "step": 4368 + }, + { + "epoch": 0.0779260157671316, + "grad_norm": 0.44130566716194153, + "learning_rate": 3.89671780235462e-05, + "loss": 0.3146, + "step": 4369 + }, + { + "epoch": 0.07794385188884528, + "grad_norm": 0.3785611093044281, + "learning_rate": 3.897609703888691e-05, + "loss": 0.2641, + "step": 4370 + }, + { + "epoch": 0.07796168801055899, + "grad_norm": 0.47966429591178894, + "learning_rate": 3.8985016054227616e-05, + "loss": 0.2876, + "step": 4371 + }, + { + "epoch": 0.07797952413227267, + "grad_norm": 0.49851420521736145, + "learning_rate": 3.899393506956832e-05, + "loss": 0.3524, + "step": 4372 + }, + { + "epoch": 0.07799736025398638, + "grad_norm": 0.34156444668769836, + "learning_rate": 3.900285408490903e-05, + "loss": 0.2443, + "step": 4373 + }, + { + "epoch": 0.07801519637570006, + "grad_norm": 0.35304003953933716, + "learning_rate": 3.9011773100249735e-05, + "loss": 0.2609, + "step": 4374 + }, + { + "epoch": 0.07803303249741377, + "grad_norm": 0.5084397196769714, + "learning_rate": 3.902069211559044e-05, + "loss": 0.2587, + "step": 4375 + }, + { + "epoch": 0.07805086861912745, + "grad_norm": 0.4033340513706207, + "learning_rate": 3.902961113093115e-05, + "loss": 0.3194, + "step": 4376 + }, + { + "epoch": 0.07806870474084115, + "grad_norm": 0.31719550490379333, + "learning_rate": 3.9038530146271854e-05, + "loss": 0.2694, + "step": 4377 + }, + { + "epoch": 0.07808654086255484, + "grad_norm": 0.38203608989715576, + "learning_rate": 3.904744916161256e-05, + "loss": 0.2555, + "step": 4378 + }, + { + "epoch": 0.07810437698426854, + "grad_norm": 0.34577763080596924, + "learning_rate": 3.905636817695327e-05, + "loss": 0.2822, + "step": 4379 + }, + { + "epoch": 0.07812221310598223, + "grad_norm": 0.40282416343688965, + "learning_rate": 3.906528719229397e-05, + "loss": 0.2629, + "step": 4380 + }, + { + "epoch": 0.07814004922769593, + "grad_norm": 0.33227238059043884, + "learning_rate": 3.907420620763468e-05, + "loss": 0.2605, + "step": 4381 + }, + { + "epoch": 0.07815788534940962, + "grad_norm": 0.32662463188171387, + "learning_rate": 3.9083125222975386e-05, + "loss": 0.27, + "step": 4382 + }, + { + "epoch": 0.07817572147112332, + "grad_norm": 0.40394240617752075, + "learning_rate": 3.909204423831609e-05, + "loss": 0.2442, + "step": 4383 + }, + { + "epoch": 0.07819355759283701, + "grad_norm": 0.3501577377319336, + "learning_rate": 3.91009632536568e-05, + "loss": 0.2719, + "step": 4384 + }, + { + "epoch": 0.07821139371455071, + "grad_norm": 0.3697444796562195, + "learning_rate": 3.9109882268997505e-05, + "loss": 0.2741, + "step": 4385 + }, + { + "epoch": 0.0782292298362644, + "grad_norm": 0.47111180424690247, + "learning_rate": 3.911880128433821e-05, + "loss": 0.2686, + "step": 4386 + }, + { + "epoch": 0.0782470659579781, + "grad_norm": 0.4611196517944336, + "learning_rate": 3.912772029967892e-05, + "loss": 0.2824, + "step": 4387 + }, + { + "epoch": 0.07826490207969179, + "grad_norm": 0.39992693066596985, + "learning_rate": 3.9136639315019624e-05, + "loss": 0.233, + "step": 4388 + }, + { + "epoch": 0.07828273820140548, + "grad_norm": 0.32952433824539185, + "learning_rate": 3.914555833036033e-05, + "loss": 0.2121, + "step": 4389 + }, + { + "epoch": 0.07830057432311918, + "grad_norm": 0.4698813557624817, + "learning_rate": 3.915447734570104e-05, + "loss": 0.2841, + "step": 4390 + }, + { + "epoch": 0.07831841044483287, + "grad_norm": 0.38414064049720764, + "learning_rate": 3.9163396361041743e-05, + "loss": 0.2978, + "step": 4391 + }, + { + "epoch": 0.07833624656654657, + "grad_norm": 0.3118569850921631, + "learning_rate": 3.917231537638245e-05, + "loss": 0.2657, + "step": 4392 + }, + { + "epoch": 0.07835408268826026, + "grad_norm": 0.3157293200492859, + "learning_rate": 3.9181234391723156e-05, + "loss": 0.257, + "step": 4393 + }, + { + "epoch": 0.07837191880997396, + "grad_norm": 0.4550689160823822, + "learning_rate": 3.919015340706386e-05, + "loss": 0.2503, + "step": 4394 + }, + { + "epoch": 0.07838975493168765, + "grad_norm": 0.4010894000530243, + "learning_rate": 3.919907242240457e-05, + "loss": 0.2791, + "step": 4395 + }, + { + "epoch": 0.07840759105340135, + "grad_norm": 0.2876130938529968, + "learning_rate": 3.9207991437745275e-05, + "loss": 0.2589, + "step": 4396 + }, + { + "epoch": 0.07842542717511504, + "grad_norm": 0.4091978371143341, + "learning_rate": 3.921691045308598e-05, + "loss": 0.2293, + "step": 4397 + }, + { + "epoch": 0.07844326329682874, + "grad_norm": 0.3687492311000824, + "learning_rate": 3.922582946842669e-05, + "loss": 0.3011, + "step": 4398 + }, + { + "epoch": 0.07846109941854243, + "grad_norm": 0.524689257144928, + "learning_rate": 3.9234748483767394e-05, + "loss": 0.2327, + "step": 4399 + }, + { + "epoch": 0.07847893554025613, + "grad_norm": 0.3993457555770874, + "learning_rate": 3.92436674991081e-05, + "loss": 0.2281, + "step": 4400 + }, + { + "epoch": 0.07849677166196982, + "grad_norm": 0.40008744597435, + "learning_rate": 3.925258651444881e-05, + "loss": 0.2462, + "step": 4401 + }, + { + "epoch": 0.07851460778368352, + "grad_norm": 0.36328330636024475, + "learning_rate": 3.9261505529789514e-05, + "loss": 0.291, + "step": 4402 + }, + { + "epoch": 0.07853244390539721, + "grad_norm": 0.9162819981575012, + "learning_rate": 3.927042454513022e-05, + "loss": 0.2926, + "step": 4403 + }, + { + "epoch": 0.07855028002711091, + "grad_norm": 0.45906102657318115, + "learning_rate": 3.9279343560470926e-05, + "loss": 0.3202, + "step": 4404 + }, + { + "epoch": 0.0785681161488246, + "grad_norm": 0.30613192915916443, + "learning_rate": 3.928826257581163e-05, + "loss": 0.2819, + "step": 4405 + }, + { + "epoch": 0.0785859522705383, + "grad_norm": 0.40524908900260925, + "learning_rate": 3.929718159115234e-05, + "loss": 0.289, + "step": 4406 + }, + { + "epoch": 0.07860378839225199, + "grad_norm": 0.2773537337779999, + "learning_rate": 3.9306100606493046e-05, + "loss": 0.2311, + "step": 4407 + }, + { + "epoch": 0.07862162451396568, + "grad_norm": 0.3180605173110962, + "learning_rate": 3.931501962183375e-05, + "loss": 0.2668, + "step": 4408 + }, + { + "epoch": 0.07863946063567938, + "grad_norm": 0.3289400339126587, + "learning_rate": 3.932393863717446e-05, + "loss": 0.3044, + "step": 4409 + }, + { + "epoch": 0.07865729675739307, + "grad_norm": 0.33620643615722656, + "learning_rate": 3.9332857652515165e-05, + "loss": 0.2644, + "step": 4410 + }, + { + "epoch": 0.07867513287910677, + "grad_norm": 0.5794734954833984, + "learning_rate": 3.934177666785587e-05, + "loss": 0.2851, + "step": 4411 + }, + { + "epoch": 0.07869296900082046, + "grad_norm": 0.33227694034576416, + "learning_rate": 3.935069568319658e-05, + "loss": 0.2819, + "step": 4412 + }, + { + "epoch": 0.07871080512253416, + "grad_norm": 0.33045604825019836, + "learning_rate": 3.9359614698537284e-05, + "loss": 0.235, + "step": 4413 + }, + { + "epoch": 0.07872864124424785, + "grad_norm": 0.4035234749317169, + "learning_rate": 3.936853371387799e-05, + "loss": 0.2513, + "step": 4414 + }, + { + "epoch": 0.07874647736596155, + "grad_norm": 0.3834822177886963, + "learning_rate": 3.9377452729218697e-05, + "loss": 0.2543, + "step": 4415 + }, + { + "epoch": 0.07876431348767524, + "grad_norm": 0.3251235783100128, + "learning_rate": 3.93863717445594e-05, + "loss": 0.2536, + "step": 4416 + }, + { + "epoch": 0.07878214960938894, + "grad_norm": 0.2962441146373749, + "learning_rate": 3.939529075990011e-05, + "loss": 0.2541, + "step": 4417 + }, + { + "epoch": 0.07879998573110263, + "grad_norm": 0.29714933037757874, + "learning_rate": 3.9404209775240816e-05, + "loss": 0.2252, + "step": 4418 + }, + { + "epoch": 0.07881782185281633, + "grad_norm": 0.28641360998153687, + "learning_rate": 3.941312879058152e-05, + "loss": 0.2547, + "step": 4419 + }, + { + "epoch": 0.07883565797453002, + "grad_norm": 0.2969341576099396, + "learning_rate": 3.942204780592223e-05, + "loss": 0.2608, + "step": 4420 + }, + { + "epoch": 0.07885349409624372, + "grad_norm": 0.3222426772117615, + "learning_rate": 3.9430966821262935e-05, + "loss": 0.2769, + "step": 4421 + }, + { + "epoch": 0.0788713302179574, + "grad_norm": 0.3375169634819031, + "learning_rate": 3.943988583660364e-05, + "loss": 0.2995, + "step": 4422 + }, + { + "epoch": 0.07888916633967111, + "grad_norm": 0.2864764630794525, + "learning_rate": 3.944880485194435e-05, + "loss": 0.2167, + "step": 4423 + }, + { + "epoch": 0.0789070024613848, + "grad_norm": 0.3258938193321228, + "learning_rate": 3.9457723867285054e-05, + "loss": 0.2607, + "step": 4424 + }, + { + "epoch": 0.0789248385830985, + "grad_norm": 0.3061988353729248, + "learning_rate": 3.946664288262576e-05, + "loss": 0.2533, + "step": 4425 + }, + { + "epoch": 0.07894267470481219, + "grad_norm": 0.4185546636581421, + "learning_rate": 3.947556189796647e-05, + "loss": 0.262, + "step": 4426 + }, + { + "epoch": 0.07896051082652589, + "grad_norm": 0.38305729627609253, + "learning_rate": 3.948448091330717e-05, + "loss": 0.271, + "step": 4427 + }, + { + "epoch": 0.07897834694823958, + "grad_norm": 0.7182749509811401, + "learning_rate": 3.949339992864788e-05, + "loss": 0.3611, + "step": 4428 + }, + { + "epoch": 0.07899618306995326, + "grad_norm": 0.2859683036804199, + "learning_rate": 3.9502318943988586e-05, + "loss": 0.2356, + "step": 4429 + }, + { + "epoch": 0.07901401919166696, + "grad_norm": 0.33025452494621277, + "learning_rate": 3.951123795932929e-05, + "loss": 0.2419, + "step": 4430 + }, + { + "epoch": 0.07903185531338065, + "grad_norm": 0.5470520853996277, + "learning_rate": 3.952015697467e-05, + "loss": 0.3139, + "step": 4431 + }, + { + "epoch": 0.07904969143509435, + "grad_norm": 0.2969801127910614, + "learning_rate": 3.9529075990010705e-05, + "loss": 0.2381, + "step": 4432 + }, + { + "epoch": 0.07906752755680804, + "grad_norm": 0.327876478433609, + "learning_rate": 3.953799500535141e-05, + "loss": 0.2929, + "step": 4433 + }, + { + "epoch": 0.07908536367852174, + "grad_norm": 0.37913572788238525, + "learning_rate": 3.954691402069212e-05, + "loss": 0.3041, + "step": 4434 + }, + { + "epoch": 0.07910319980023543, + "grad_norm": 0.43590933084487915, + "learning_rate": 3.9555833036032824e-05, + "loss": 0.2954, + "step": 4435 + }, + { + "epoch": 0.07912103592194913, + "grad_norm": 0.3637465238571167, + "learning_rate": 3.956475205137353e-05, + "loss": 0.244, + "step": 4436 + }, + { + "epoch": 0.07913887204366282, + "grad_norm": 0.3469159007072449, + "learning_rate": 3.957367106671424e-05, + "loss": 0.2725, + "step": 4437 + }, + { + "epoch": 0.07915670816537652, + "grad_norm": 0.2922632694244385, + "learning_rate": 3.958259008205494e-05, + "loss": 0.2612, + "step": 4438 + }, + { + "epoch": 0.07917454428709021, + "grad_norm": 0.3450004756450653, + "learning_rate": 3.959150909739565e-05, + "loss": 0.2596, + "step": 4439 + }, + { + "epoch": 0.07919238040880391, + "grad_norm": 0.29621705412864685, + "learning_rate": 3.9600428112736356e-05, + "loss": 0.2281, + "step": 4440 + }, + { + "epoch": 0.0792102165305176, + "grad_norm": 0.2661037743091583, + "learning_rate": 3.960934712807706e-05, + "loss": 0.2408, + "step": 4441 + }, + { + "epoch": 0.0792280526522313, + "grad_norm": 0.2868179976940155, + "learning_rate": 3.961826614341777e-05, + "loss": 0.2661, + "step": 4442 + }, + { + "epoch": 0.07924588877394499, + "grad_norm": 0.34746330976486206, + "learning_rate": 3.9627185158758475e-05, + "loss": 0.2566, + "step": 4443 + }, + { + "epoch": 0.0792637248956587, + "grad_norm": 0.2857220470905304, + "learning_rate": 3.963610417409918e-05, + "loss": 0.2432, + "step": 4444 + }, + { + "epoch": 0.07928156101737238, + "grad_norm": 0.35333573818206787, + "learning_rate": 3.964502318943989e-05, + "loss": 0.2684, + "step": 4445 + }, + { + "epoch": 0.07929939713908608, + "grad_norm": 0.28018221259117126, + "learning_rate": 3.9653942204780594e-05, + "loss": 0.2479, + "step": 4446 + }, + { + "epoch": 0.07931723326079977, + "grad_norm": 0.30884823203086853, + "learning_rate": 3.96628612201213e-05, + "loss": 0.2589, + "step": 4447 + }, + { + "epoch": 0.07933506938251346, + "grad_norm": 0.41764792799949646, + "learning_rate": 3.967178023546201e-05, + "loss": 0.2632, + "step": 4448 + }, + { + "epoch": 0.07935290550422716, + "grad_norm": 0.30291947722435, + "learning_rate": 3.9680699250802713e-05, + "loss": 0.2336, + "step": 4449 + }, + { + "epoch": 0.07937074162594085, + "grad_norm": 0.45131614804267883, + "learning_rate": 3.968961826614342e-05, + "loss": 0.2791, + "step": 4450 + }, + { + "epoch": 0.07938857774765455, + "grad_norm": 0.951149582862854, + "learning_rate": 3.9698537281484126e-05, + "loss": 0.2663, + "step": 4451 + }, + { + "epoch": 0.07940641386936824, + "grad_norm": 0.324270099401474, + "learning_rate": 3.970745629682483e-05, + "loss": 0.222, + "step": 4452 + }, + { + "epoch": 0.07942424999108194, + "grad_norm": 0.3507010340690613, + "learning_rate": 3.971637531216554e-05, + "loss": 0.2753, + "step": 4453 + }, + { + "epoch": 0.07944208611279563, + "grad_norm": 0.45777058601379395, + "learning_rate": 3.9725294327506245e-05, + "loss": 0.3207, + "step": 4454 + }, + { + "epoch": 0.07945992223450933, + "grad_norm": 0.4051269590854645, + "learning_rate": 3.973421334284695e-05, + "loss": 0.274, + "step": 4455 + }, + { + "epoch": 0.07947775835622302, + "grad_norm": 0.28481754660606384, + "learning_rate": 3.974313235818766e-05, + "loss": 0.2515, + "step": 4456 + }, + { + "epoch": 0.07949559447793672, + "grad_norm": 0.5084226727485657, + "learning_rate": 3.9752051373528365e-05, + "loss": 0.3343, + "step": 4457 + }, + { + "epoch": 0.07951343059965041, + "grad_norm": 0.44690635800361633, + "learning_rate": 3.976097038886907e-05, + "loss": 0.3349, + "step": 4458 + }, + { + "epoch": 0.07953126672136411, + "grad_norm": 0.3401438593864441, + "learning_rate": 3.976988940420978e-05, + "loss": 0.2988, + "step": 4459 + }, + { + "epoch": 0.0795491028430778, + "grad_norm": 0.34159207344055176, + "learning_rate": 3.9778808419550484e-05, + "loss": 0.2574, + "step": 4460 + }, + { + "epoch": 0.0795669389647915, + "grad_norm": 0.5162934064865112, + "learning_rate": 3.978772743489119e-05, + "loss": 0.2968, + "step": 4461 + }, + { + "epoch": 0.07958477508650519, + "grad_norm": 0.3637939393520355, + "learning_rate": 3.9796646450231896e-05, + "loss": 0.3083, + "step": 4462 + }, + { + "epoch": 0.07960261120821889, + "grad_norm": 0.516302764415741, + "learning_rate": 3.98055654655726e-05, + "loss": 0.3011, + "step": 4463 + }, + { + "epoch": 0.07962044732993258, + "grad_norm": 0.6440324783325195, + "learning_rate": 3.981448448091331e-05, + "loss": 0.3123, + "step": 4464 + }, + { + "epoch": 0.07963828345164628, + "grad_norm": 0.8769925236701965, + "learning_rate": 3.9823403496254016e-05, + "loss": 0.2623, + "step": 4465 + }, + { + "epoch": 0.07965611957335997, + "grad_norm": 0.40881508588790894, + "learning_rate": 3.983232251159472e-05, + "loss": 0.2222, + "step": 4466 + }, + { + "epoch": 0.07967395569507367, + "grad_norm": 0.36568158864974976, + "learning_rate": 3.984124152693543e-05, + "loss": 0.2677, + "step": 4467 + }, + { + "epoch": 0.07969179181678736, + "grad_norm": 0.3032169044017792, + "learning_rate": 3.9850160542276135e-05, + "loss": 0.2382, + "step": 4468 + }, + { + "epoch": 0.07970962793850105, + "grad_norm": 0.3037692606449127, + "learning_rate": 3.985907955761684e-05, + "loss": 0.2552, + "step": 4469 + }, + { + "epoch": 0.07972746406021475, + "grad_norm": 0.3624633848667145, + "learning_rate": 3.986799857295755e-05, + "loss": 0.2787, + "step": 4470 + }, + { + "epoch": 0.07974530018192844, + "grad_norm": 0.31159213185310364, + "learning_rate": 3.9876917588298254e-05, + "loss": 0.263, + "step": 4471 + }, + { + "epoch": 0.07976313630364214, + "grad_norm": 0.28747597336769104, + "learning_rate": 3.988583660363896e-05, + "loss": 0.2823, + "step": 4472 + }, + { + "epoch": 0.07978097242535583, + "grad_norm": 0.30559128522872925, + "learning_rate": 3.989475561897967e-05, + "loss": 0.2686, + "step": 4473 + }, + { + "epoch": 0.07979880854706953, + "grad_norm": 0.35138267278671265, + "learning_rate": 3.990367463432037e-05, + "loss": 0.2751, + "step": 4474 + }, + { + "epoch": 0.07981664466878322, + "grad_norm": 0.30839818716049194, + "learning_rate": 3.991259364966108e-05, + "loss": 0.2345, + "step": 4475 + }, + { + "epoch": 0.07983448079049692, + "grad_norm": 0.3390248417854309, + "learning_rate": 3.9921512665001786e-05, + "loss": 0.2221, + "step": 4476 + }, + { + "epoch": 0.0798523169122106, + "grad_norm": 0.42686232924461365, + "learning_rate": 3.993043168034249e-05, + "loss": 0.2587, + "step": 4477 + }, + { + "epoch": 0.07987015303392431, + "grad_norm": 0.36679717898368835, + "learning_rate": 3.99393506956832e-05, + "loss": 0.2642, + "step": 4478 + }, + { + "epoch": 0.079887989155638, + "grad_norm": 0.31725236773490906, + "learning_rate": 3.9948269711023905e-05, + "loss": 0.2474, + "step": 4479 + }, + { + "epoch": 0.0799058252773517, + "grad_norm": 0.33426448702812195, + "learning_rate": 3.995718872636461e-05, + "loss": 0.2462, + "step": 4480 + }, + { + "epoch": 0.07992366139906538, + "grad_norm": 0.3012705445289612, + "learning_rate": 3.996610774170532e-05, + "loss": 0.2501, + "step": 4481 + }, + { + "epoch": 0.07994149752077909, + "grad_norm": 0.38509663939476013, + "learning_rate": 3.9975026757046024e-05, + "loss": 0.2362, + "step": 4482 + }, + { + "epoch": 0.07995933364249277, + "grad_norm": 0.29118025302886963, + "learning_rate": 3.998394577238673e-05, + "loss": 0.224, + "step": 4483 + }, + { + "epoch": 0.07997716976420648, + "grad_norm": 0.4841828942298889, + "learning_rate": 3.999286478772744e-05, + "loss": 0.2055, + "step": 4484 + }, + { + "epoch": 0.07999500588592016, + "grad_norm": 0.33768928050994873, + "learning_rate": 4.000178380306814e-05, + "loss": 0.2323, + "step": 4485 + }, + { + "epoch": 0.08001284200763387, + "grad_norm": 0.27681782841682434, + "learning_rate": 4.001070281840885e-05, + "loss": 0.2742, + "step": 4486 + }, + { + "epoch": 0.08003067812934755, + "grad_norm": 0.3340684771537781, + "learning_rate": 4.0019621833749556e-05, + "loss": 0.2292, + "step": 4487 + }, + { + "epoch": 0.08004851425106124, + "grad_norm": 0.889191210269928, + "learning_rate": 4.002854084909026e-05, + "loss": 0.2753, + "step": 4488 + }, + { + "epoch": 0.08006635037277494, + "grad_norm": 0.36859461665153503, + "learning_rate": 4.003745986443097e-05, + "loss": 0.301, + "step": 4489 + }, + { + "epoch": 0.08008418649448863, + "grad_norm": 0.30707311630249023, + "learning_rate": 4.0046378879771675e-05, + "loss": 0.2847, + "step": 4490 + }, + { + "epoch": 0.08010202261620233, + "grad_norm": 0.27876847982406616, + "learning_rate": 4.005529789511238e-05, + "loss": 0.2281, + "step": 4491 + }, + { + "epoch": 0.08011985873791602, + "grad_norm": 0.3858083188533783, + "learning_rate": 4.006421691045309e-05, + "loss": 0.2778, + "step": 4492 + }, + { + "epoch": 0.08013769485962972, + "grad_norm": 0.3480464518070221, + "learning_rate": 4.0073135925793794e-05, + "loss": 0.2583, + "step": 4493 + }, + { + "epoch": 0.08015553098134341, + "grad_norm": 0.4255104660987854, + "learning_rate": 4.00820549411345e-05, + "loss": 0.2828, + "step": 4494 + }, + { + "epoch": 0.08017336710305711, + "grad_norm": 0.27423787117004395, + "learning_rate": 4.009097395647521e-05, + "loss": 0.2374, + "step": 4495 + }, + { + "epoch": 0.0801912032247708, + "grad_norm": 0.3389829397201538, + "learning_rate": 4.009989297181591e-05, + "loss": 0.2782, + "step": 4496 + }, + { + "epoch": 0.0802090393464845, + "grad_norm": 0.319293349981308, + "learning_rate": 4.010881198715662e-05, + "loss": 0.2538, + "step": 4497 + }, + { + "epoch": 0.08022687546819819, + "grad_norm": 0.5192918181419373, + "learning_rate": 4.0117731002497326e-05, + "loss": 0.3155, + "step": 4498 + }, + { + "epoch": 0.0802447115899119, + "grad_norm": 0.3251531422138214, + "learning_rate": 4.012665001783803e-05, + "loss": 0.2976, + "step": 4499 + }, + { + "epoch": 0.08026254771162558, + "grad_norm": 0.2674753963947296, + "learning_rate": 4.013556903317874e-05, + "loss": 0.2079, + "step": 4500 + }, + { + "epoch": 0.08028038383333928, + "grad_norm": 0.4634806513786316, + "learning_rate": 4.0144488048519445e-05, + "loss": 0.2518, + "step": 4501 + }, + { + "epoch": 0.08029821995505297, + "grad_norm": 0.33979034423828125, + "learning_rate": 4.015340706386015e-05, + "loss": 0.3003, + "step": 4502 + }, + { + "epoch": 0.08031605607676667, + "grad_norm": 0.424564391374588, + "learning_rate": 4.016232607920086e-05, + "loss": 0.3032, + "step": 4503 + }, + { + "epoch": 0.08033389219848036, + "grad_norm": 0.3706408143043518, + "learning_rate": 4.0171245094541564e-05, + "loss": 0.2565, + "step": 4504 + }, + { + "epoch": 0.08035172832019406, + "grad_norm": 0.3482458293437958, + "learning_rate": 4.018016410988227e-05, + "loss": 0.2641, + "step": 4505 + }, + { + "epoch": 0.08036956444190775, + "grad_norm": 0.3800894618034363, + "learning_rate": 4.018908312522298e-05, + "loss": 0.275, + "step": 4506 + }, + { + "epoch": 0.08038740056362145, + "grad_norm": 0.36897557973861694, + "learning_rate": 4.0198002140563684e-05, + "loss": 0.2997, + "step": 4507 + }, + { + "epoch": 0.08040523668533514, + "grad_norm": 0.3793332576751709, + "learning_rate": 4.020692115590439e-05, + "loss": 0.2919, + "step": 4508 + }, + { + "epoch": 0.08042307280704883, + "grad_norm": 0.41461849212646484, + "learning_rate": 4.0215840171245096e-05, + "loss": 0.2816, + "step": 4509 + }, + { + "epoch": 0.08044090892876253, + "grad_norm": 0.3164636194705963, + "learning_rate": 4.02247591865858e-05, + "loss": 0.2255, + "step": 4510 + }, + { + "epoch": 0.08045874505047622, + "grad_norm": 0.32979562878608704, + "learning_rate": 4.023367820192651e-05, + "loss": 0.2201, + "step": 4511 + }, + { + "epoch": 0.08047658117218992, + "grad_norm": 0.3270874321460724, + "learning_rate": 4.0242597217267215e-05, + "loss": 0.2697, + "step": 4512 + }, + { + "epoch": 0.08049441729390361, + "grad_norm": 0.61639404296875, + "learning_rate": 4.025151623260792e-05, + "loss": 0.3907, + "step": 4513 + }, + { + "epoch": 0.08051225341561731, + "grad_norm": 0.2716813385486603, + "learning_rate": 4.026043524794863e-05, + "loss": 0.2619, + "step": 4514 + }, + { + "epoch": 0.080530089537331, + "grad_norm": 0.2883681058883667, + "learning_rate": 4.0269354263289335e-05, + "loss": 0.2264, + "step": 4515 + }, + { + "epoch": 0.0805479256590447, + "grad_norm": 0.2986631393432617, + "learning_rate": 4.027827327863004e-05, + "loss": 0.3007, + "step": 4516 + }, + { + "epoch": 0.08056576178075839, + "grad_norm": 0.36737969517707825, + "learning_rate": 4.028719229397075e-05, + "loss": 0.3252, + "step": 4517 + }, + { + "epoch": 0.08058359790247209, + "grad_norm": 0.33752569556236267, + "learning_rate": 4.0296111309311454e-05, + "loss": 0.2935, + "step": 4518 + }, + { + "epoch": 0.08060143402418578, + "grad_norm": 0.28609350323677063, + "learning_rate": 4.030503032465216e-05, + "loss": 0.209, + "step": 4519 + }, + { + "epoch": 0.08061927014589948, + "grad_norm": 0.3234710693359375, + "learning_rate": 4.0313949339992867e-05, + "loss": 0.2614, + "step": 4520 + }, + { + "epoch": 0.08063710626761317, + "grad_norm": 0.47285953164100647, + "learning_rate": 4.032286835533357e-05, + "loss": 0.2311, + "step": 4521 + }, + { + "epoch": 0.08065494238932687, + "grad_norm": 0.5395081639289856, + "learning_rate": 4.033178737067428e-05, + "loss": 0.304, + "step": 4522 + }, + { + "epoch": 0.08067277851104056, + "grad_norm": 0.2810700833797455, + "learning_rate": 4.0340706386014986e-05, + "loss": 0.2268, + "step": 4523 + }, + { + "epoch": 0.08069061463275426, + "grad_norm": 0.30713704228401184, + "learning_rate": 4.034962540135569e-05, + "loss": 0.2732, + "step": 4524 + }, + { + "epoch": 0.08070845075446795, + "grad_norm": 0.3583277463912964, + "learning_rate": 4.03585444166964e-05, + "loss": 0.3099, + "step": 4525 + }, + { + "epoch": 0.08072628687618165, + "grad_norm": 0.28816354274749756, + "learning_rate": 4.0367463432037105e-05, + "loss": 0.2827, + "step": 4526 + }, + { + "epoch": 0.08074412299789534, + "grad_norm": 0.29862967133522034, + "learning_rate": 4.037638244737781e-05, + "loss": 0.2634, + "step": 4527 + }, + { + "epoch": 0.08076195911960903, + "grad_norm": 0.28392142057418823, + "learning_rate": 4.038530146271852e-05, + "loss": 0.2388, + "step": 4528 + }, + { + "epoch": 0.08077979524132273, + "grad_norm": 0.25666290521621704, + "learning_rate": 4.0394220478059224e-05, + "loss": 0.2536, + "step": 4529 + }, + { + "epoch": 0.08079763136303642, + "grad_norm": 0.35175782442092896, + "learning_rate": 4.040313949339993e-05, + "loss": 0.2528, + "step": 4530 + }, + { + "epoch": 0.08081546748475012, + "grad_norm": 0.334356427192688, + "learning_rate": 4.041205850874064e-05, + "loss": 0.2205, + "step": 4531 + }, + { + "epoch": 0.0808333036064638, + "grad_norm": 0.43859541416168213, + "learning_rate": 4.042097752408134e-05, + "loss": 0.2763, + "step": 4532 + }, + { + "epoch": 0.0808511397281775, + "grad_norm": 0.35310041904449463, + "learning_rate": 4.042989653942205e-05, + "loss": 0.2671, + "step": 4533 + }, + { + "epoch": 0.0808689758498912, + "grad_norm": 0.3610978126525879, + "learning_rate": 4.0438815554762756e-05, + "loss": 0.2506, + "step": 4534 + }, + { + "epoch": 0.0808868119716049, + "grad_norm": 0.3457763195037842, + "learning_rate": 4.044773457010346e-05, + "loss": 0.2822, + "step": 4535 + }, + { + "epoch": 0.08090464809331858, + "grad_norm": 0.31126368045806885, + "learning_rate": 4.045665358544417e-05, + "loss": 0.2295, + "step": 4536 + }, + { + "epoch": 0.08092248421503229, + "grad_norm": 0.3494953513145447, + "learning_rate": 4.0465572600784875e-05, + "loss": 0.3176, + "step": 4537 + }, + { + "epoch": 0.08094032033674597, + "grad_norm": 0.486221045255661, + "learning_rate": 4.047449161612558e-05, + "loss": 0.3022, + "step": 4538 + }, + { + "epoch": 0.08095815645845968, + "grad_norm": 0.4887290894985199, + "learning_rate": 4.048341063146629e-05, + "loss": 0.2503, + "step": 4539 + }, + { + "epoch": 0.08097599258017336, + "grad_norm": 0.3604911267757416, + "learning_rate": 4.0492329646806994e-05, + "loss": 0.3364, + "step": 4540 + }, + { + "epoch": 0.08099382870188707, + "grad_norm": 0.29284030199050903, + "learning_rate": 4.05012486621477e-05, + "loss": 0.2838, + "step": 4541 + }, + { + "epoch": 0.08101166482360075, + "grad_norm": 0.3474516272544861, + "learning_rate": 4.051016767748841e-05, + "loss": 0.2505, + "step": 4542 + }, + { + "epoch": 0.08102950094531446, + "grad_norm": 0.3401896059513092, + "learning_rate": 4.051908669282911e-05, + "loss": 0.252, + "step": 4543 + }, + { + "epoch": 0.08104733706702814, + "grad_norm": 0.3116951882839203, + "learning_rate": 4.052800570816982e-05, + "loss": 0.2708, + "step": 4544 + }, + { + "epoch": 0.08106517318874185, + "grad_norm": 0.3368425965309143, + "learning_rate": 4.0536924723510526e-05, + "loss": 0.2603, + "step": 4545 + }, + { + "epoch": 0.08108300931045553, + "grad_norm": 0.3442945182323456, + "learning_rate": 4.054584373885123e-05, + "loss": 0.3056, + "step": 4546 + }, + { + "epoch": 0.08110084543216924, + "grad_norm": 0.32995864748954773, + "learning_rate": 4.055476275419194e-05, + "loss": 0.2738, + "step": 4547 + }, + { + "epoch": 0.08111868155388292, + "grad_norm": 0.3128892481327057, + "learning_rate": 4.0563681769532645e-05, + "loss": 0.2339, + "step": 4548 + }, + { + "epoch": 0.08113651767559661, + "grad_norm": 0.30660542845726013, + "learning_rate": 4.057260078487335e-05, + "loss": 0.2371, + "step": 4549 + }, + { + "epoch": 0.08115435379731031, + "grad_norm": 0.3472464680671692, + "learning_rate": 4.058151980021406e-05, + "loss": 0.2861, + "step": 4550 + }, + { + "epoch": 0.081172189919024, + "grad_norm": 0.2973649203777313, + "learning_rate": 4.0590438815554764e-05, + "loss": 0.2997, + "step": 4551 + }, + { + "epoch": 0.0811900260407377, + "grad_norm": 0.41225185990333557, + "learning_rate": 4.059935783089547e-05, + "loss": 0.2532, + "step": 4552 + }, + { + "epoch": 0.08120786216245139, + "grad_norm": 0.33151519298553467, + "learning_rate": 4.060827684623618e-05, + "loss": 0.3301, + "step": 4553 + }, + { + "epoch": 0.0812256982841651, + "grad_norm": 0.27119380235671997, + "learning_rate": 4.0617195861576883e-05, + "loss": 0.2326, + "step": 4554 + }, + { + "epoch": 0.08124353440587878, + "grad_norm": 0.2899776101112366, + "learning_rate": 4.062611487691759e-05, + "loss": 0.2739, + "step": 4555 + }, + { + "epoch": 0.08126137052759248, + "grad_norm": 0.4134208559989929, + "learning_rate": 4.0635033892258296e-05, + "loss": 0.2748, + "step": 4556 + }, + { + "epoch": 0.08127920664930617, + "grad_norm": 0.4175553321838379, + "learning_rate": 4.0643952907599e-05, + "loss": 0.288, + "step": 4557 + }, + { + "epoch": 0.08129704277101987, + "grad_norm": 0.3602147400379181, + "learning_rate": 4.065287192293971e-05, + "loss": 0.2814, + "step": 4558 + }, + { + "epoch": 0.08131487889273356, + "grad_norm": 0.42320945858955383, + "learning_rate": 4.0661790938280415e-05, + "loss": 0.2402, + "step": 4559 + }, + { + "epoch": 0.08133271501444726, + "grad_norm": 0.3441101014614105, + "learning_rate": 4.067070995362112e-05, + "loss": 0.2287, + "step": 4560 + }, + { + "epoch": 0.08135055113616095, + "grad_norm": 0.42264416813850403, + "learning_rate": 4.067962896896183e-05, + "loss": 0.2616, + "step": 4561 + }, + { + "epoch": 0.08136838725787465, + "grad_norm": 0.48212215304374695, + "learning_rate": 4.0688547984302534e-05, + "loss": 0.3084, + "step": 4562 + }, + { + "epoch": 0.08138622337958834, + "grad_norm": 0.2982146143913269, + "learning_rate": 4.069746699964324e-05, + "loss": 0.2819, + "step": 4563 + }, + { + "epoch": 0.08140405950130204, + "grad_norm": 0.28909289836883545, + "learning_rate": 4.070638601498395e-05, + "loss": 0.2011, + "step": 4564 + }, + { + "epoch": 0.08142189562301573, + "grad_norm": 0.3757196068763733, + "learning_rate": 4.0715305030324654e-05, + "loss": 0.2634, + "step": 4565 + }, + { + "epoch": 0.08143973174472943, + "grad_norm": 0.29326966404914856, + "learning_rate": 4.072422404566536e-05, + "loss": 0.2332, + "step": 4566 + }, + { + "epoch": 0.08145756786644312, + "grad_norm": 0.38509637117385864, + "learning_rate": 4.0733143061006066e-05, + "loss": 0.3351, + "step": 4567 + }, + { + "epoch": 0.08147540398815682, + "grad_norm": 0.4050680994987488, + "learning_rate": 4.074206207634677e-05, + "loss": 0.2804, + "step": 4568 + }, + { + "epoch": 0.08149324010987051, + "grad_norm": 0.2764883041381836, + "learning_rate": 4.075098109168748e-05, + "loss": 0.2557, + "step": 4569 + }, + { + "epoch": 0.0815110762315842, + "grad_norm": 0.35010194778442383, + "learning_rate": 4.0759900107028186e-05, + "loss": 0.2069, + "step": 4570 + }, + { + "epoch": 0.0815289123532979, + "grad_norm": 0.4712359607219696, + "learning_rate": 4.076881912236889e-05, + "loss": 0.2897, + "step": 4571 + }, + { + "epoch": 0.08154674847501159, + "grad_norm": 0.3436199724674225, + "learning_rate": 4.07777381377096e-05, + "loss": 0.2809, + "step": 4572 + }, + { + "epoch": 0.08156458459672529, + "grad_norm": 0.408003032207489, + "learning_rate": 4.0786657153050305e-05, + "loss": 0.2496, + "step": 4573 + }, + { + "epoch": 0.08158242071843898, + "grad_norm": 0.3031524121761322, + "learning_rate": 4.079557616839101e-05, + "loss": 0.2403, + "step": 4574 + }, + { + "epoch": 0.08160025684015268, + "grad_norm": 0.36796116828918457, + "learning_rate": 4.080449518373172e-05, + "loss": 0.2636, + "step": 4575 + }, + { + "epoch": 0.08161809296186637, + "grad_norm": 0.3667079210281372, + "learning_rate": 4.0813414199072424e-05, + "loss": 0.2927, + "step": 4576 + }, + { + "epoch": 0.08163592908358007, + "grad_norm": 0.3545800745487213, + "learning_rate": 4.082233321441313e-05, + "loss": 0.2731, + "step": 4577 + }, + { + "epoch": 0.08165376520529376, + "grad_norm": 0.44302111864089966, + "learning_rate": 4.0831252229753837e-05, + "loss": 0.3496, + "step": 4578 + }, + { + "epoch": 0.08167160132700746, + "grad_norm": 0.3333475887775421, + "learning_rate": 4.084017124509454e-05, + "loss": 0.2595, + "step": 4579 + }, + { + "epoch": 0.08168943744872115, + "grad_norm": 0.353995144367218, + "learning_rate": 4.084909026043525e-05, + "loss": 0.2505, + "step": 4580 + }, + { + "epoch": 0.08170727357043485, + "grad_norm": 0.3846065104007721, + "learning_rate": 4.0858009275775956e-05, + "loss": 0.2802, + "step": 4581 + }, + { + "epoch": 0.08172510969214854, + "grad_norm": 0.4532749056816101, + "learning_rate": 4.086692829111666e-05, + "loss": 0.2179, + "step": 4582 + }, + { + "epoch": 0.08174294581386224, + "grad_norm": 0.3650430738925934, + "learning_rate": 4.087584730645737e-05, + "loss": 0.2687, + "step": 4583 + }, + { + "epoch": 0.08176078193557593, + "grad_norm": 0.49413353204727173, + "learning_rate": 4.0884766321798075e-05, + "loss": 0.4242, + "step": 4584 + }, + { + "epoch": 0.08177861805728963, + "grad_norm": 0.34336015582084656, + "learning_rate": 4.089368533713878e-05, + "loss": 0.2786, + "step": 4585 + }, + { + "epoch": 0.08179645417900332, + "grad_norm": 0.355006605386734, + "learning_rate": 4.090260435247949e-05, + "loss": 0.3308, + "step": 4586 + }, + { + "epoch": 0.08181429030071702, + "grad_norm": 0.30480387806892395, + "learning_rate": 4.0911523367820194e-05, + "loss": 0.2222, + "step": 4587 + }, + { + "epoch": 0.0818321264224307, + "grad_norm": 0.2543972134590149, + "learning_rate": 4.09204423831609e-05, + "loss": 0.2506, + "step": 4588 + }, + { + "epoch": 0.0818499625441444, + "grad_norm": 0.350434273481369, + "learning_rate": 4.092936139850161e-05, + "loss": 0.2291, + "step": 4589 + }, + { + "epoch": 0.0818677986658581, + "grad_norm": 0.3976143002510071, + "learning_rate": 4.093828041384231e-05, + "loss": 0.2843, + "step": 4590 + }, + { + "epoch": 0.08188563478757178, + "grad_norm": 0.3548804819583893, + "learning_rate": 4.094719942918302e-05, + "loss": 0.2537, + "step": 4591 + }, + { + "epoch": 0.08190347090928549, + "grad_norm": 0.37451016902923584, + "learning_rate": 4.0956118444523726e-05, + "loss": 0.2519, + "step": 4592 + }, + { + "epoch": 0.08192130703099917, + "grad_norm": 0.3483447730541229, + "learning_rate": 4.096503745986443e-05, + "loss": 0.2511, + "step": 4593 + }, + { + "epoch": 0.08193914315271288, + "grad_norm": 0.33760204911231995, + "learning_rate": 4.097395647520514e-05, + "loss": 0.2498, + "step": 4594 + }, + { + "epoch": 0.08195697927442656, + "grad_norm": 0.3115421235561371, + "learning_rate": 4.0982875490545845e-05, + "loss": 0.2285, + "step": 4595 + }, + { + "epoch": 0.08197481539614027, + "grad_norm": 0.3708973228931427, + "learning_rate": 4.099179450588655e-05, + "loss": 0.2727, + "step": 4596 + }, + { + "epoch": 0.08199265151785395, + "grad_norm": 0.3650602102279663, + "learning_rate": 4.100071352122726e-05, + "loss": 0.2245, + "step": 4597 + }, + { + "epoch": 0.08201048763956766, + "grad_norm": 0.2705913782119751, + "learning_rate": 4.1009632536567964e-05, + "loss": 0.2337, + "step": 4598 + }, + { + "epoch": 0.08202832376128134, + "grad_norm": 0.3268061876296997, + "learning_rate": 4.101855155190867e-05, + "loss": 0.2284, + "step": 4599 + }, + { + "epoch": 0.08204615988299505, + "grad_norm": 0.4855281412601471, + "learning_rate": 4.102747056724938e-05, + "loss": 0.2852, + "step": 4600 + }, + { + "epoch": 0.08206399600470873, + "grad_norm": 0.33552852272987366, + "learning_rate": 4.103638958259008e-05, + "loss": 0.2666, + "step": 4601 + }, + { + "epoch": 0.08208183212642244, + "grad_norm": 0.29675179719924927, + "learning_rate": 4.104530859793079e-05, + "loss": 0.2499, + "step": 4602 + }, + { + "epoch": 0.08209966824813612, + "grad_norm": 0.38519686460494995, + "learning_rate": 4.1054227613271496e-05, + "loss": 0.2671, + "step": 4603 + }, + { + "epoch": 0.08211750436984983, + "grad_norm": 0.29932722449302673, + "learning_rate": 4.10631466286122e-05, + "loss": 0.2524, + "step": 4604 + }, + { + "epoch": 0.08213534049156351, + "grad_norm": 0.3295300602912903, + "learning_rate": 4.107206564395291e-05, + "loss": 0.2657, + "step": 4605 + }, + { + "epoch": 0.08215317661327722, + "grad_norm": 0.2959982752799988, + "learning_rate": 4.1080984659293615e-05, + "loss": 0.2508, + "step": 4606 + }, + { + "epoch": 0.0821710127349909, + "grad_norm": 0.4345143437385559, + "learning_rate": 4.108990367463432e-05, + "loss": 0.2031, + "step": 4607 + }, + { + "epoch": 0.0821888488567046, + "grad_norm": 0.3866029977798462, + "learning_rate": 4.109882268997503e-05, + "loss": 0.3025, + "step": 4608 + }, + { + "epoch": 0.08220668497841829, + "grad_norm": 0.2634138762950897, + "learning_rate": 4.1107741705315734e-05, + "loss": 0.214, + "step": 4609 + }, + { + "epoch": 0.08222452110013198, + "grad_norm": 0.3317263722419739, + "learning_rate": 4.111666072065644e-05, + "loss": 0.2711, + "step": 4610 + }, + { + "epoch": 0.08224235722184568, + "grad_norm": 0.3610650897026062, + "learning_rate": 4.112557973599715e-05, + "loss": 0.2235, + "step": 4611 + }, + { + "epoch": 0.08226019334355937, + "grad_norm": 0.42297473549842834, + "learning_rate": 4.1134498751337854e-05, + "loss": 0.2279, + "step": 4612 + }, + { + "epoch": 0.08227802946527307, + "grad_norm": 0.58136385679245, + "learning_rate": 4.114341776667856e-05, + "loss": 0.2404, + "step": 4613 + }, + { + "epoch": 0.08229586558698676, + "grad_norm": 0.3265692889690399, + "learning_rate": 4.1152336782019266e-05, + "loss": 0.2354, + "step": 4614 + }, + { + "epoch": 0.08231370170870046, + "grad_norm": 0.34121695160865784, + "learning_rate": 4.116125579735997e-05, + "loss": 0.2437, + "step": 4615 + }, + { + "epoch": 0.08233153783041415, + "grad_norm": 0.3945043087005615, + "learning_rate": 4.117017481270068e-05, + "loss": 0.2072, + "step": 4616 + }, + { + "epoch": 0.08234937395212785, + "grad_norm": 0.48390698432922363, + "learning_rate": 4.1179093828041385e-05, + "loss": 0.3129, + "step": 4617 + }, + { + "epoch": 0.08236721007384154, + "grad_norm": 0.46820446848869324, + "learning_rate": 4.118801284338209e-05, + "loss": 0.2492, + "step": 4618 + }, + { + "epoch": 0.08238504619555524, + "grad_norm": 0.2748047709465027, + "learning_rate": 4.11969318587228e-05, + "loss": 0.2898, + "step": 4619 + }, + { + "epoch": 0.08240288231726893, + "grad_norm": 0.28265729546546936, + "learning_rate": 4.1205850874063505e-05, + "loss": 0.2751, + "step": 4620 + }, + { + "epoch": 0.08242071843898263, + "grad_norm": 0.37369540333747864, + "learning_rate": 4.121476988940422e-05, + "loss": 0.2816, + "step": 4621 + }, + { + "epoch": 0.08243855456069632, + "grad_norm": 0.3699372112751007, + "learning_rate": 4.122368890474492e-05, + "loss": 0.2982, + "step": 4622 + }, + { + "epoch": 0.08245639068241002, + "grad_norm": 0.289980947971344, + "learning_rate": 4.1232607920085624e-05, + "loss": 0.2412, + "step": 4623 + }, + { + "epoch": 0.08247422680412371, + "grad_norm": 0.36862096190452576, + "learning_rate": 4.124152693542633e-05, + "loss": 0.303, + "step": 4624 + }, + { + "epoch": 0.08249206292583741, + "grad_norm": 0.4404594600200653, + "learning_rate": 4.1250445950767036e-05, + "loss": 0.2698, + "step": 4625 + }, + { + "epoch": 0.0825098990475511, + "grad_norm": 0.2618829309940338, + "learning_rate": 4.125936496610774e-05, + "loss": 0.2127, + "step": 4626 + }, + { + "epoch": 0.0825277351692648, + "grad_norm": 0.2572970986366272, + "learning_rate": 4.126828398144845e-05, + "loss": 0.2156, + "step": 4627 + }, + { + "epoch": 0.08254557129097849, + "grad_norm": 0.3470957279205322, + "learning_rate": 4.1277202996789156e-05, + "loss": 0.2556, + "step": 4628 + }, + { + "epoch": 0.08256340741269218, + "grad_norm": 0.42865556478500366, + "learning_rate": 4.128612201212986e-05, + "loss": 0.3219, + "step": 4629 + }, + { + "epoch": 0.08258124353440588, + "grad_norm": 0.43934494256973267, + "learning_rate": 4.129504102747057e-05, + "loss": 0.3883, + "step": 4630 + }, + { + "epoch": 0.08259907965611957, + "grad_norm": 0.3967353105545044, + "learning_rate": 4.1303960042811275e-05, + "loss": 0.3307, + "step": 4631 + }, + { + "epoch": 0.08261691577783327, + "grad_norm": 0.3168458342552185, + "learning_rate": 4.131287905815198e-05, + "loss": 0.2746, + "step": 4632 + }, + { + "epoch": 0.08263475189954696, + "grad_norm": 0.35633203387260437, + "learning_rate": 4.132179807349269e-05, + "loss": 0.2502, + "step": 4633 + }, + { + "epoch": 0.08265258802126066, + "grad_norm": 0.36781826615333557, + "learning_rate": 4.1330717088833394e-05, + "loss": 0.2349, + "step": 4634 + }, + { + "epoch": 0.08267042414297435, + "grad_norm": 0.4396607279777527, + "learning_rate": 4.13396361041741e-05, + "loss": 0.2412, + "step": 4635 + }, + { + "epoch": 0.08268826026468805, + "grad_norm": 0.35209110379219055, + "learning_rate": 4.134855511951481e-05, + "loss": 0.259, + "step": 4636 + }, + { + "epoch": 0.08270609638640174, + "grad_norm": 0.3407614529132843, + "learning_rate": 4.135747413485551e-05, + "loss": 0.271, + "step": 4637 + }, + { + "epoch": 0.08272393250811544, + "grad_norm": 0.3865599036216736, + "learning_rate": 4.136639315019622e-05, + "loss": 0.2731, + "step": 4638 + }, + { + "epoch": 0.08274176862982913, + "grad_norm": 0.3795560300350189, + "learning_rate": 4.1375312165536926e-05, + "loss": 0.2632, + "step": 4639 + }, + { + "epoch": 0.08275960475154283, + "grad_norm": 0.4535025358200073, + "learning_rate": 4.138423118087763e-05, + "loss": 0.3025, + "step": 4640 + }, + { + "epoch": 0.08277744087325652, + "grad_norm": 0.36958104372024536, + "learning_rate": 4.139315019621834e-05, + "loss": 0.2574, + "step": 4641 + }, + { + "epoch": 0.08279527699497022, + "grad_norm": 0.30590102076530457, + "learning_rate": 4.1402069211559045e-05, + "loss": 0.2429, + "step": 4642 + }, + { + "epoch": 0.0828131131166839, + "grad_norm": 0.41996458172798157, + "learning_rate": 4.141098822689975e-05, + "loss": 0.3062, + "step": 4643 + }, + { + "epoch": 0.08283094923839761, + "grad_norm": 0.28037649393081665, + "learning_rate": 4.141990724224046e-05, + "loss": 0.238, + "step": 4644 + }, + { + "epoch": 0.0828487853601113, + "grad_norm": 0.23906131088733673, + "learning_rate": 4.1428826257581164e-05, + "loss": 0.2348, + "step": 4645 + }, + { + "epoch": 0.082866621481825, + "grad_norm": 0.30123984813690186, + "learning_rate": 4.143774527292188e-05, + "loss": 0.272, + "step": 4646 + }, + { + "epoch": 0.08288445760353869, + "grad_norm": 0.45062610507011414, + "learning_rate": 4.144666428826258e-05, + "loss": 0.2212, + "step": 4647 + }, + { + "epoch": 0.08290229372525239, + "grad_norm": 0.3282548189163208, + "learning_rate": 4.145558330360328e-05, + "loss": 0.2685, + "step": 4648 + }, + { + "epoch": 0.08292012984696608, + "grad_norm": 0.37194085121154785, + "learning_rate": 4.146450231894399e-05, + "loss": 0.2971, + "step": 4649 + }, + { + "epoch": 0.08293796596867976, + "grad_norm": 0.35184234380722046, + "learning_rate": 4.1473421334284696e-05, + "loss": 0.3189, + "step": 4650 + }, + { + "epoch": 0.08295580209039347, + "grad_norm": 0.3653015196323395, + "learning_rate": 4.14823403496254e-05, + "loss": 0.2731, + "step": 4651 + }, + { + "epoch": 0.08297363821210715, + "grad_norm": 0.2842966616153717, + "learning_rate": 4.149125936496611e-05, + "loss": 0.225, + "step": 4652 + }, + { + "epoch": 0.08299147433382086, + "grad_norm": 0.29867467284202576, + "learning_rate": 4.1500178380306815e-05, + "loss": 0.216, + "step": 4653 + }, + { + "epoch": 0.08300931045553454, + "grad_norm": 0.49323219060897827, + "learning_rate": 4.150909739564752e-05, + "loss": 0.3274, + "step": 4654 + }, + { + "epoch": 0.08302714657724825, + "grad_norm": 0.3544803261756897, + "learning_rate": 4.151801641098823e-05, + "loss": 0.2479, + "step": 4655 + }, + { + "epoch": 0.08304498269896193, + "grad_norm": 0.4178151488304138, + "learning_rate": 4.1526935426328934e-05, + "loss": 0.2595, + "step": 4656 + }, + { + "epoch": 0.08306281882067564, + "grad_norm": 0.36962756514549255, + "learning_rate": 4.153585444166964e-05, + "loss": 0.3009, + "step": 4657 + }, + { + "epoch": 0.08308065494238932, + "grad_norm": 0.3434098958969116, + "learning_rate": 4.154477345701035e-05, + "loss": 0.3182, + "step": 4658 + }, + { + "epoch": 0.08309849106410302, + "grad_norm": 0.36081305146217346, + "learning_rate": 4.155369247235105e-05, + "loss": 0.262, + "step": 4659 + }, + { + "epoch": 0.08311632718581671, + "grad_norm": 0.3830731511116028, + "learning_rate": 4.156261148769176e-05, + "loss": 0.2614, + "step": 4660 + }, + { + "epoch": 0.08313416330753041, + "grad_norm": 0.353041410446167, + "learning_rate": 4.1571530503032466e-05, + "loss": 0.2485, + "step": 4661 + }, + { + "epoch": 0.0831519994292441, + "grad_norm": 0.34955528378486633, + "learning_rate": 4.158044951837317e-05, + "loss": 0.3005, + "step": 4662 + }, + { + "epoch": 0.0831698355509578, + "grad_norm": 1.1263805627822876, + "learning_rate": 4.158936853371388e-05, + "loss": 0.2772, + "step": 4663 + }, + { + "epoch": 0.08318767167267149, + "grad_norm": 0.29771795868873596, + "learning_rate": 4.1598287549054585e-05, + "loss": 0.2828, + "step": 4664 + }, + { + "epoch": 0.0832055077943852, + "grad_norm": 0.36346837878227234, + "learning_rate": 4.160720656439529e-05, + "loss": 0.2666, + "step": 4665 + }, + { + "epoch": 0.08322334391609888, + "grad_norm": 0.3174096345901489, + "learning_rate": 4.1616125579736e-05, + "loss": 0.2775, + "step": 4666 + }, + { + "epoch": 0.08324118003781258, + "grad_norm": 0.316211998462677, + "learning_rate": 4.1625044595076704e-05, + "loss": 0.246, + "step": 4667 + }, + { + "epoch": 0.08325901615952627, + "grad_norm": 0.5889096260070801, + "learning_rate": 4.163396361041742e-05, + "loss": 0.3302, + "step": 4668 + }, + { + "epoch": 0.08327685228123996, + "grad_norm": 0.3345775008201599, + "learning_rate": 4.164288262575812e-05, + "loss": 0.2898, + "step": 4669 + }, + { + "epoch": 0.08329468840295366, + "grad_norm": 0.3662806451320648, + "learning_rate": 4.1651801641098824e-05, + "loss": 0.2822, + "step": 4670 + }, + { + "epoch": 0.08331252452466735, + "grad_norm": 0.36785194277763367, + "learning_rate": 4.166072065643954e-05, + "loss": 0.2595, + "step": 4671 + }, + { + "epoch": 0.08333036064638105, + "grad_norm": 0.35829958319664, + "learning_rate": 4.1669639671780236e-05, + "loss": 0.2396, + "step": 4672 + }, + { + "epoch": 0.08334819676809474, + "grad_norm": 0.303079754114151, + "learning_rate": 4.167855868712094e-05, + "loss": 0.2397, + "step": 4673 + }, + { + "epoch": 0.08336603288980844, + "grad_norm": 0.2755659520626068, + "learning_rate": 4.1687477702461656e-05, + "loss": 0.2469, + "step": 4674 + }, + { + "epoch": 0.08338386901152213, + "grad_norm": 0.26110124588012695, + "learning_rate": 4.1696396717802355e-05, + "loss": 0.2236, + "step": 4675 + }, + { + "epoch": 0.08340170513323583, + "grad_norm": 0.34288933873176575, + "learning_rate": 4.170531573314306e-05, + "loss": 0.2766, + "step": 4676 + }, + { + "epoch": 0.08341954125494952, + "grad_norm": 0.31066226959228516, + "learning_rate": 4.171423474848377e-05, + "loss": 0.241, + "step": 4677 + }, + { + "epoch": 0.08343737737666322, + "grad_norm": 0.3882811367511749, + "learning_rate": 4.1723153763824475e-05, + "loss": 0.2887, + "step": 4678 + }, + { + "epoch": 0.08345521349837691, + "grad_norm": 0.3345036208629608, + "learning_rate": 4.173207277916518e-05, + "loss": 0.2761, + "step": 4679 + }, + { + "epoch": 0.08347304962009061, + "grad_norm": 0.37505725026130676, + "learning_rate": 4.174099179450589e-05, + "loss": 0.2276, + "step": 4680 + }, + { + "epoch": 0.0834908857418043, + "grad_norm": 0.32556086778640747, + "learning_rate": 4.1749910809846594e-05, + "loss": 0.2486, + "step": 4681 + }, + { + "epoch": 0.083508721863518, + "grad_norm": 0.4078800678253174, + "learning_rate": 4.17588298251873e-05, + "loss": 0.3024, + "step": 4682 + }, + { + "epoch": 0.08352655798523169, + "grad_norm": 0.24213099479675293, + "learning_rate": 4.1767748840528007e-05, + "loss": 0.2071, + "step": 4683 + }, + { + "epoch": 0.08354439410694539, + "grad_norm": 0.39423611760139465, + "learning_rate": 4.177666785586871e-05, + "loss": 0.2496, + "step": 4684 + }, + { + "epoch": 0.08356223022865908, + "grad_norm": 0.3784507215023041, + "learning_rate": 4.178558687120942e-05, + "loss": 0.3164, + "step": 4685 + }, + { + "epoch": 0.08358006635037278, + "grad_norm": 0.304168164730072, + "learning_rate": 4.1794505886550126e-05, + "loss": 0.2341, + "step": 4686 + }, + { + "epoch": 0.08359790247208647, + "grad_norm": 0.29561179876327515, + "learning_rate": 4.180342490189083e-05, + "loss": 0.264, + "step": 4687 + }, + { + "epoch": 0.08361573859380017, + "grad_norm": 0.2837843894958496, + "learning_rate": 4.181234391723154e-05, + "loss": 0.2345, + "step": 4688 + }, + { + "epoch": 0.08363357471551386, + "grad_norm": 0.26228925585746765, + "learning_rate": 4.1821262932572245e-05, + "loss": 0.2656, + "step": 4689 + }, + { + "epoch": 0.08365141083722755, + "grad_norm": 0.32619327306747437, + "learning_rate": 4.183018194791295e-05, + "loss": 0.2209, + "step": 4690 + }, + { + "epoch": 0.08366924695894125, + "grad_norm": 0.49905359745025635, + "learning_rate": 4.183910096325366e-05, + "loss": 0.2872, + "step": 4691 + }, + { + "epoch": 0.08368708308065494, + "grad_norm": 0.4743531346321106, + "learning_rate": 4.1848019978594364e-05, + "loss": 0.2501, + "step": 4692 + }, + { + "epoch": 0.08370491920236864, + "grad_norm": 0.46660152077674866, + "learning_rate": 4.185693899393508e-05, + "loss": 0.3212, + "step": 4693 + }, + { + "epoch": 0.08372275532408233, + "grad_norm": 0.5808331966400146, + "learning_rate": 4.186585800927578e-05, + "loss": 0.3478, + "step": 4694 + }, + { + "epoch": 0.08374059144579603, + "grad_norm": 0.3189285099506378, + "learning_rate": 4.187477702461648e-05, + "loss": 0.3159, + "step": 4695 + }, + { + "epoch": 0.08375842756750972, + "grad_norm": 0.374616414308548, + "learning_rate": 4.1883696039957196e-05, + "loss": 0.2653, + "step": 4696 + }, + { + "epoch": 0.08377626368922342, + "grad_norm": 0.2993185520172119, + "learning_rate": 4.1892615055297896e-05, + "loss": 0.2669, + "step": 4697 + }, + { + "epoch": 0.0837940998109371, + "grad_norm": 0.2776098847389221, + "learning_rate": 4.19015340706386e-05, + "loss": 0.3056, + "step": 4698 + }, + { + "epoch": 0.08381193593265081, + "grad_norm": 0.2787780165672302, + "learning_rate": 4.1910453085979315e-05, + "loss": 0.2396, + "step": 4699 + }, + { + "epoch": 0.0838297720543645, + "grad_norm": 0.36942046880722046, + "learning_rate": 4.1919372101320015e-05, + "loss": 0.2533, + "step": 4700 + }, + { + "epoch": 0.0838476081760782, + "grad_norm": 0.3226310610771179, + "learning_rate": 4.192829111666072e-05, + "loss": 0.2087, + "step": 4701 + }, + { + "epoch": 0.08386544429779189, + "grad_norm": 0.23606103658676147, + "learning_rate": 4.193721013200143e-05, + "loss": 0.229, + "step": 4702 + }, + { + "epoch": 0.08388328041950559, + "grad_norm": 0.32441607117652893, + "learning_rate": 4.1946129147342134e-05, + "loss": 0.1895, + "step": 4703 + }, + { + "epoch": 0.08390111654121928, + "grad_norm": 0.9351107478141785, + "learning_rate": 4.195504816268284e-05, + "loss": 0.306, + "step": 4704 + }, + { + "epoch": 0.08391895266293298, + "grad_norm": 0.2972072958946228, + "learning_rate": 4.196396717802355e-05, + "loss": 0.2418, + "step": 4705 + }, + { + "epoch": 0.08393678878464667, + "grad_norm": 0.2862665057182312, + "learning_rate": 4.197288619336425e-05, + "loss": 0.2529, + "step": 4706 + }, + { + "epoch": 0.08395462490636037, + "grad_norm": 0.3217792510986328, + "learning_rate": 4.198180520870496e-05, + "loss": 0.233, + "step": 4707 + }, + { + "epoch": 0.08397246102807406, + "grad_norm": 0.38995981216430664, + "learning_rate": 4.1990724224045666e-05, + "loss": 0.2702, + "step": 4708 + }, + { + "epoch": 0.08399029714978776, + "grad_norm": 0.46338629722595215, + "learning_rate": 4.199964323938637e-05, + "loss": 0.3342, + "step": 4709 + }, + { + "epoch": 0.08400813327150145, + "grad_norm": 0.44229286909103394, + "learning_rate": 4.200856225472708e-05, + "loss": 0.2368, + "step": 4710 + }, + { + "epoch": 0.08402596939321513, + "grad_norm": 0.6200202107429504, + "learning_rate": 4.2017481270067785e-05, + "loss": 0.3033, + "step": 4711 + }, + { + "epoch": 0.08404380551492883, + "grad_norm": 0.34727299213409424, + "learning_rate": 4.202640028540849e-05, + "loss": 0.2699, + "step": 4712 + }, + { + "epoch": 0.08406164163664252, + "grad_norm": 0.31790176033973694, + "learning_rate": 4.20353193007492e-05, + "loss": 0.2482, + "step": 4713 + }, + { + "epoch": 0.08407947775835622, + "grad_norm": 0.37162309885025024, + "learning_rate": 4.2044238316089904e-05, + "loss": 0.2802, + "step": 4714 + }, + { + "epoch": 0.08409731388006991, + "grad_norm": 0.23286005854606628, + "learning_rate": 4.205315733143061e-05, + "loss": 0.2324, + "step": 4715 + }, + { + "epoch": 0.08411515000178361, + "grad_norm": 0.3594549894332886, + "learning_rate": 4.206207634677132e-05, + "loss": 0.2363, + "step": 4716 + }, + { + "epoch": 0.0841329861234973, + "grad_norm": 0.2847643196582794, + "learning_rate": 4.2070995362112023e-05, + "loss": 0.2127, + "step": 4717 + }, + { + "epoch": 0.084150822245211, + "grad_norm": 0.3336940407752991, + "learning_rate": 4.2079914377452737e-05, + "loss": 0.3024, + "step": 4718 + }, + { + "epoch": 0.08416865836692469, + "grad_norm": 0.4410804212093353, + "learning_rate": 4.2088833392793436e-05, + "loss": 0.3492, + "step": 4719 + }, + { + "epoch": 0.0841864944886384, + "grad_norm": 0.2904675304889679, + "learning_rate": 4.209775240813414e-05, + "loss": 0.2854, + "step": 4720 + }, + { + "epoch": 0.08420433061035208, + "grad_norm": 0.2894403338432312, + "learning_rate": 4.2106671423474856e-05, + "loss": 0.2432, + "step": 4721 + }, + { + "epoch": 0.08422216673206578, + "grad_norm": 0.33036935329437256, + "learning_rate": 4.2115590438815555e-05, + "loss": 0.2492, + "step": 4722 + }, + { + "epoch": 0.08424000285377947, + "grad_norm": 0.3528401255607605, + "learning_rate": 4.212450945415626e-05, + "loss": 0.2365, + "step": 4723 + }, + { + "epoch": 0.08425783897549317, + "grad_norm": 0.46657130122184753, + "learning_rate": 4.2133428469496975e-05, + "loss": 0.2519, + "step": 4724 + }, + { + "epoch": 0.08427567509720686, + "grad_norm": 0.3133507966995239, + "learning_rate": 4.2142347484837674e-05, + "loss": 0.2592, + "step": 4725 + }, + { + "epoch": 0.08429351121892056, + "grad_norm": 0.3040401339530945, + "learning_rate": 4.215126650017838e-05, + "loss": 0.2392, + "step": 4726 + }, + { + "epoch": 0.08431134734063425, + "grad_norm": 0.32957813143730164, + "learning_rate": 4.216018551551909e-05, + "loss": 0.2375, + "step": 4727 + }, + { + "epoch": 0.08432918346234795, + "grad_norm": 0.3673066198825836, + "learning_rate": 4.2169104530859794e-05, + "loss": 0.2749, + "step": 4728 + }, + { + "epoch": 0.08434701958406164, + "grad_norm": 0.366629958152771, + "learning_rate": 4.21780235462005e-05, + "loss": 0.2603, + "step": 4729 + }, + { + "epoch": 0.08436485570577533, + "grad_norm": 0.33293500542640686, + "learning_rate": 4.2186942561541206e-05, + "loss": 0.2725, + "step": 4730 + }, + { + "epoch": 0.08438269182748903, + "grad_norm": 0.3752448558807373, + "learning_rate": 4.219586157688191e-05, + "loss": 0.3308, + "step": 4731 + }, + { + "epoch": 0.08440052794920272, + "grad_norm": 0.44500109553337097, + "learning_rate": 4.220478059222262e-05, + "loss": 0.2536, + "step": 4732 + }, + { + "epoch": 0.08441836407091642, + "grad_norm": 0.48183923959732056, + "learning_rate": 4.2213699607563326e-05, + "loss": 0.2799, + "step": 4733 + }, + { + "epoch": 0.08443620019263011, + "grad_norm": 0.3570837676525116, + "learning_rate": 4.222261862290403e-05, + "loss": 0.2604, + "step": 4734 + }, + { + "epoch": 0.08445403631434381, + "grad_norm": 0.35328057408332825, + "learning_rate": 4.223153763824474e-05, + "loss": 0.2773, + "step": 4735 + }, + { + "epoch": 0.0844718724360575, + "grad_norm": 0.40506699681282043, + "learning_rate": 4.2240456653585445e-05, + "loss": 0.3003, + "step": 4736 + }, + { + "epoch": 0.0844897085577712, + "grad_norm": 0.4074116051197052, + "learning_rate": 4.224937566892615e-05, + "loss": 0.2588, + "step": 4737 + }, + { + "epoch": 0.08450754467948489, + "grad_norm": 0.27763742208480835, + "learning_rate": 4.225829468426686e-05, + "loss": 0.2267, + "step": 4738 + }, + { + "epoch": 0.08452538080119859, + "grad_norm": 0.4040248394012451, + "learning_rate": 4.2267213699607564e-05, + "loss": 0.2307, + "step": 4739 + }, + { + "epoch": 0.08454321692291228, + "grad_norm": 0.24238283932209015, + "learning_rate": 4.227613271494828e-05, + "loss": 0.219, + "step": 4740 + }, + { + "epoch": 0.08456105304462598, + "grad_norm": 0.27702492475509644, + "learning_rate": 4.2285051730288977e-05, + "loss": 0.2147, + "step": 4741 + }, + { + "epoch": 0.08457888916633967, + "grad_norm": 0.3211560547351837, + "learning_rate": 4.229397074562968e-05, + "loss": 0.275, + "step": 4742 + }, + { + "epoch": 0.08459672528805337, + "grad_norm": 0.3873059153556824, + "learning_rate": 4.2302889760970396e-05, + "loss": 0.2974, + "step": 4743 + }, + { + "epoch": 0.08461456140976706, + "grad_norm": 0.3301689922809601, + "learning_rate": 4.2311808776311096e-05, + "loss": 0.2102, + "step": 4744 + }, + { + "epoch": 0.08463239753148076, + "grad_norm": 0.3535199761390686, + "learning_rate": 4.23207277916518e-05, + "loss": 0.2865, + "step": 4745 + }, + { + "epoch": 0.08465023365319445, + "grad_norm": 0.45673027634620667, + "learning_rate": 4.2329646806992515e-05, + "loss": 0.2752, + "step": 4746 + }, + { + "epoch": 0.08466806977490815, + "grad_norm": 0.6006236672401428, + "learning_rate": 4.2338565822333215e-05, + "loss": 0.31, + "step": 4747 + }, + { + "epoch": 0.08468590589662184, + "grad_norm": 1.914745569229126, + "learning_rate": 4.234748483767392e-05, + "loss": 0.5449, + "step": 4748 + }, + { + "epoch": 0.08470374201833554, + "grad_norm": 0.38808149099349976, + "learning_rate": 4.2356403853014634e-05, + "loss": 0.276, + "step": 4749 + }, + { + "epoch": 0.08472157814004923, + "grad_norm": 0.3978177309036255, + "learning_rate": 4.2365322868355334e-05, + "loss": 0.2015, + "step": 4750 + }, + { + "epoch": 0.08473941426176292, + "grad_norm": 0.37178748846054077, + "learning_rate": 4.237424188369604e-05, + "loss": 0.2532, + "step": 4751 + }, + { + "epoch": 0.08475725038347662, + "grad_norm": 0.40695932507514954, + "learning_rate": 4.238316089903675e-05, + "loss": 0.2796, + "step": 4752 + }, + { + "epoch": 0.0847750865051903, + "grad_norm": 0.31890979409217834, + "learning_rate": 4.239207991437745e-05, + "loss": 0.2404, + "step": 4753 + }, + { + "epoch": 0.08479292262690401, + "grad_norm": 0.30806058645248413, + "learning_rate": 4.240099892971816e-05, + "loss": 0.2567, + "step": 4754 + }, + { + "epoch": 0.0848107587486177, + "grad_norm": 0.5412458181381226, + "learning_rate": 4.2409917945058866e-05, + "loss": 0.2848, + "step": 4755 + }, + { + "epoch": 0.0848285948703314, + "grad_norm": 0.38702741265296936, + "learning_rate": 4.241883696039957e-05, + "loss": 0.2732, + "step": 4756 + }, + { + "epoch": 0.08484643099204509, + "grad_norm": 0.2890656590461731, + "learning_rate": 4.242775597574028e-05, + "loss": 0.2578, + "step": 4757 + }, + { + "epoch": 0.08486426711375879, + "grad_norm": 0.3491271436214447, + "learning_rate": 4.2436674991080985e-05, + "loss": 0.3044, + "step": 4758 + }, + { + "epoch": 0.08488210323547248, + "grad_norm": 0.2936517596244812, + "learning_rate": 4.244559400642169e-05, + "loss": 0.2387, + "step": 4759 + }, + { + "epoch": 0.08489993935718618, + "grad_norm": 0.3863624036312103, + "learning_rate": 4.24545130217624e-05, + "loss": 0.2293, + "step": 4760 + }, + { + "epoch": 0.08491777547889987, + "grad_norm": 0.4840512275695801, + "learning_rate": 4.2463432037103104e-05, + "loss": 0.2772, + "step": 4761 + }, + { + "epoch": 0.08493561160061357, + "grad_norm": 0.3251959979534149, + "learning_rate": 4.247235105244381e-05, + "loss": 0.259, + "step": 4762 + }, + { + "epoch": 0.08495344772232725, + "grad_norm": 0.3124004602432251, + "learning_rate": 4.248127006778452e-05, + "loss": 0.2654, + "step": 4763 + }, + { + "epoch": 0.08497128384404096, + "grad_norm": 0.35165154933929443, + "learning_rate": 4.249018908312522e-05, + "loss": 0.2775, + "step": 4764 + }, + { + "epoch": 0.08498911996575464, + "grad_norm": 0.4545726478099823, + "learning_rate": 4.2499108098465936e-05, + "loss": 0.269, + "step": 4765 + }, + { + "epoch": 0.08500695608746835, + "grad_norm": 0.31335318088531494, + "learning_rate": 4.2508027113806636e-05, + "loss": 0.2238, + "step": 4766 + }, + { + "epoch": 0.08502479220918203, + "grad_norm": 0.2870141863822937, + "learning_rate": 4.251694612914734e-05, + "loss": 0.2346, + "step": 4767 + }, + { + "epoch": 0.08504262833089574, + "grad_norm": 0.5971843004226685, + "learning_rate": 4.2525865144488056e-05, + "loss": 0.2718, + "step": 4768 + }, + { + "epoch": 0.08506046445260942, + "grad_norm": 0.3538011610507965, + "learning_rate": 4.2534784159828755e-05, + "loss": 0.3217, + "step": 4769 + }, + { + "epoch": 0.08507830057432311, + "grad_norm": 0.36990422010421753, + "learning_rate": 4.254370317516946e-05, + "loss": 0.2289, + "step": 4770 + }, + { + "epoch": 0.08509613669603681, + "grad_norm": 0.2870731055736542, + "learning_rate": 4.2552622190510175e-05, + "loss": 0.2477, + "step": 4771 + }, + { + "epoch": 0.0851139728177505, + "grad_norm": 0.3276619017124176, + "learning_rate": 4.2561541205850874e-05, + "loss": 0.2961, + "step": 4772 + }, + { + "epoch": 0.0851318089394642, + "grad_norm": 0.7462770342826843, + "learning_rate": 4.257046022119158e-05, + "loss": 0.2708, + "step": 4773 + }, + { + "epoch": 0.08514964506117789, + "grad_norm": 0.28238746523857117, + "learning_rate": 4.2579379236532294e-05, + "loss": 0.2303, + "step": 4774 + }, + { + "epoch": 0.0851674811828916, + "grad_norm": 0.3139684796333313, + "learning_rate": 4.2588298251872994e-05, + "loss": 0.2244, + "step": 4775 + }, + { + "epoch": 0.08518531730460528, + "grad_norm": 0.4672798216342926, + "learning_rate": 4.25972172672137e-05, + "loss": 0.3217, + "step": 4776 + }, + { + "epoch": 0.08520315342631898, + "grad_norm": 0.5569276809692383, + "learning_rate": 4.260613628255441e-05, + "loss": 0.3116, + "step": 4777 + }, + { + "epoch": 0.08522098954803267, + "grad_norm": 0.2758088707923889, + "learning_rate": 4.261505529789511e-05, + "loss": 0.2401, + "step": 4778 + }, + { + "epoch": 0.08523882566974637, + "grad_norm": 0.4149360954761505, + "learning_rate": 4.262397431323582e-05, + "loss": 0.2809, + "step": 4779 + }, + { + "epoch": 0.08525666179146006, + "grad_norm": 0.3154550790786743, + "learning_rate": 4.2632893328576525e-05, + "loss": 0.2593, + "step": 4780 + }, + { + "epoch": 0.08527449791317376, + "grad_norm": 0.27809467911720276, + "learning_rate": 4.264181234391723e-05, + "loss": 0.2198, + "step": 4781 + }, + { + "epoch": 0.08529233403488745, + "grad_norm": 0.29021480679512024, + "learning_rate": 4.265073135925794e-05, + "loss": 0.2276, + "step": 4782 + }, + { + "epoch": 0.08531017015660115, + "grad_norm": 0.31899502873420715, + "learning_rate": 4.2659650374598645e-05, + "loss": 0.2811, + "step": 4783 + }, + { + "epoch": 0.08532800627831484, + "grad_norm": 0.6962217092514038, + "learning_rate": 4.266856938993935e-05, + "loss": 0.2812, + "step": 4784 + }, + { + "epoch": 0.08534584240002854, + "grad_norm": 0.46125882863998413, + "learning_rate": 4.267748840528006e-05, + "loss": 0.3221, + "step": 4785 + }, + { + "epoch": 0.08536367852174223, + "grad_norm": 0.4934694468975067, + "learning_rate": 4.2686407420620764e-05, + "loss": 0.3668, + "step": 4786 + }, + { + "epoch": 0.08538151464345593, + "grad_norm": 0.45117488503456116, + "learning_rate": 4.269532643596148e-05, + "loss": 0.3162, + "step": 4787 + }, + { + "epoch": 0.08539935076516962, + "grad_norm": 0.4191190302371979, + "learning_rate": 4.2704245451302176e-05, + "loss": 0.2955, + "step": 4788 + }, + { + "epoch": 0.08541718688688332, + "grad_norm": 0.3699866533279419, + "learning_rate": 4.271316446664288e-05, + "loss": 0.2278, + "step": 4789 + }, + { + "epoch": 0.08543502300859701, + "grad_norm": 0.4417411983013153, + "learning_rate": 4.2722083481983596e-05, + "loss": 0.2011, + "step": 4790 + }, + { + "epoch": 0.0854528591303107, + "grad_norm": 0.3505558669567108, + "learning_rate": 4.2731002497324296e-05, + "loss": 0.2514, + "step": 4791 + }, + { + "epoch": 0.0854706952520244, + "grad_norm": 0.5170753002166748, + "learning_rate": 4.2739921512665e-05, + "loss": 0.2881, + "step": 4792 + }, + { + "epoch": 0.08548853137373809, + "grad_norm": 0.29534634947776794, + "learning_rate": 4.2748840528005715e-05, + "loss": 0.2593, + "step": 4793 + }, + { + "epoch": 0.08550636749545179, + "grad_norm": 0.523849606513977, + "learning_rate": 4.2757759543346415e-05, + "loss": 0.2785, + "step": 4794 + }, + { + "epoch": 0.08552420361716548, + "grad_norm": 0.7056588530540466, + "learning_rate": 4.276667855868712e-05, + "loss": 0.2263, + "step": 4795 + }, + { + "epoch": 0.08554203973887918, + "grad_norm": 0.3499701917171478, + "learning_rate": 4.2775597574027834e-05, + "loss": 0.2559, + "step": 4796 + }, + { + "epoch": 0.08555987586059287, + "grad_norm": 0.39317333698272705, + "learning_rate": 4.2784516589368534e-05, + "loss": 0.2301, + "step": 4797 + }, + { + "epoch": 0.08557771198230657, + "grad_norm": 0.4301225244998932, + "learning_rate": 4.279343560470924e-05, + "loss": 0.2462, + "step": 4798 + }, + { + "epoch": 0.08559554810402026, + "grad_norm": 0.3244914710521698, + "learning_rate": 4.2802354620049953e-05, + "loss": 0.2666, + "step": 4799 + }, + { + "epoch": 0.08561338422573396, + "grad_norm": 0.31595122814178467, + "learning_rate": 4.281127363539065e-05, + "loss": 0.2732, + "step": 4800 + }, + { + "epoch": 0.08563122034744765, + "grad_norm": 0.4005768597126007, + "learning_rate": 4.282019265073136e-05, + "loss": 0.274, + "step": 4801 + }, + { + "epoch": 0.08564905646916135, + "grad_norm": 0.524563729763031, + "learning_rate": 4.282911166607207e-05, + "loss": 0.3328, + "step": 4802 + }, + { + "epoch": 0.08566689259087504, + "grad_norm": 0.3591447174549103, + "learning_rate": 4.283803068141277e-05, + "loss": 0.2556, + "step": 4803 + }, + { + "epoch": 0.08568472871258874, + "grad_norm": 0.3772144913673401, + "learning_rate": 4.284694969675348e-05, + "loss": 0.2674, + "step": 4804 + }, + { + "epoch": 0.08570256483430243, + "grad_norm": 0.39869189262390137, + "learning_rate": 4.2855868712094185e-05, + "loss": 0.2846, + "step": 4805 + }, + { + "epoch": 0.08572040095601613, + "grad_norm": 0.27792346477508545, + "learning_rate": 4.286478772743489e-05, + "loss": 0.2132, + "step": 4806 + }, + { + "epoch": 0.08573823707772982, + "grad_norm": 0.39283329248428345, + "learning_rate": 4.28737067427756e-05, + "loss": 0.2601, + "step": 4807 + }, + { + "epoch": 0.08575607319944352, + "grad_norm": 0.3831244111061096, + "learning_rate": 4.2882625758116304e-05, + "loss": 0.2661, + "step": 4808 + }, + { + "epoch": 0.08577390932115721, + "grad_norm": 0.298026442527771, + "learning_rate": 4.289154477345701e-05, + "loss": 0.2356, + "step": 4809 + }, + { + "epoch": 0.0857917454428709, + "grad_norm": 0.40651407837867737, + "learning_rate": 4.290046378879772e-05, + "loss": 0.2839, + "step": 4810 + }, + { + "epoch": 0.0858095815645846, + "grad_norm": 0.398034930229187, + "learning_rate": 4.290938280413842e-05, + "loss": 0.2299, + "step": 4811 + }, + { + "epoch": 0.08582741768629829, + "grad_norm": 0.3721598982810974, + "learning_rate": 4.2918301819479136e-05, + "loss": 0.2356, + "step": 4812 + }, + { + "epoch": 0.08584525380801199, + "grad_norm": 0.5915565490722656, + "learning_rate": 4.2927220834819836e-05, + "loss": 0.2818, + "step": 4813 + }, + { + "epoch": 0.08586308992972567, + "grad_norm": 0.31399139761924744, + "learning_rate": 4.293613985016054e-05, + "loss": 0.2513, + "step": 4814 + }, + { + "epoch": 0.08588092605143938, + "grad_norm": 0.5328007340431213, + "learning_rate": 4.2945058865501256e-05, + "loss": 0.2917, + "step": 4815 + }, + { + "epoch": 0.08589876217315306, + "grad_norm": 0.39275050163269043, + "learning_rate": 4.2953977880841955e-05, + "loss": 0.2309, + "step": 4816 + }, + { + "epoch": 0.08591659829486677, + "grad_norm": 0.3472362160682678, + "learning_rate": 4.296289689618266e-05, + "loss": 0.2641, + "step": 4817 + }, + { + "epoch": 0.08593443441658045, + "grad_norm": 0.3209400177001953, + "learning_rate": 4.2971815911523375e-05, + "loss": 0.2538, + "step": 4818 + }, + { + "epoch": 0.08595227053829416, + "grad_norm": 0.9367354512214661, + "learning_rate": 4.2980734926864074e-05, + "loss": 0.2625, + "step": 4819 + }, + { + "epoch": 0.08597010666000784, + "grad_norm": 0.3279639780521393, + "learning_rate": 4.298965394220478e-05, + "loss": 0.1536, + "step": 4820 + }, + { + "epoch": 0.08598794278172155, + "grad_norm": 0.39545565843582153, + "learning_rate": 4.2998572957545494e-05, + "loss": 0.261, + "step": 4821 + }, + { + "epoch": 0.08600577890343523, + "grad_norm": 0.4068867266178131, + "learning_rate": 4.300749197288619e-05, + "loss": 0.2438, + "step": 4822 + }, + { + "epoch": 0.08602361502514894, + "grad_norm": 0.5767574906349182, + "learning_rate": 4.30164109882269e-05, + "loss": 0.2691, + "step": 4823 + }, + { + "epoch": 0.08604145114686262, + "grad_norm": 0.44339218735694885, + "learning_rate": 4.302533000356761e-05, + "loss": 0.3335, + "step": 4824 + }, + { + "epoch": 0.08605928726857633, + "grad_norm": 0.4091067314147949, + "learning_rate": 4.303424901890831e-05, + "loss": 0.3007, + "step": 4825 + }, + { + "epoch": 0.08607712339029001, + "grad_norm": 0.38071176409721375, + "learning_rate": 4.304316803424902e-05, + "loss": 0.313, + "step": 4826 + }, + { + "epoch": 0.08609495951200372, + "grad_norm": 0.4248177707195282, + "learning_rate": 4.305208704958973e-05, + "loss": 0.2525, + "step": 4827 + }, + { + "epoch": 0.0861127956337174, + "grad_norm": 0.41568171977996826, + "learning_rate": 4.306100606493043e-05, + "loss": 0.272, + "step": 4828 + }, + { + "epoch": 0.0861306317554311, + "grad_norm": 0.33429378271102905, + "learning_rate": 4.306992508027114e-05, + "loss": 0.2147, + "step": 4829 + }, + { + "epoch": 0.0861484678771448, + "grad_norm": 0.4362045228481293, + "learning_rate": 4.3078844095611844e-05, + "loss": 0.2641, + "step": 4830 + }, + { + "epoch": 0.08616630399885848, + "grad_norm": 0.2800341248512268, + "learning_rate": 4.308776311095255e-05, + "loss": 0.2061, + "step": 4831 + }, + { + "epoch": 0.08618414012057218, + "grad_norm": 0.49194765090942383, + "learning_rate": 4.309668212629326e-05, + "loss": 0.283, + "step": 4832 + }, + { + "epoch": 0.08620197624228587, + "grad_norm": 0.4083649814128876, + "learning_rate": 4.3105601141633964e-05, + "loss": 0.29, + "step": 4833 + }, + { + "epoch": 0.08621981236399957, + "grad_norm": 0.32335180044174194, + "learning_rate": 4.311452015697468e-05, + "loss": 0.2529, + "step": 4834 + }, + { + "epoch": 0.08623764848571326, + "grad_norm": 0.3874923586845398, + "learning_rate": 4.3123439172315376e-05, + "loss": 0.27, + "step": 4835 + }, + { + "epoch": 0.08625548460742696, + "grad_norm": 0.3521764874458313, + "learning_rate": 4.313235818765608e-05, + "loss": 0.22, + "step": 4836 + }, + { + "epoch": 0.08627332072914065, + "grad_norm": 0.3749926686286926, + "learning_rate": 4.3141277202996796e-05, + "loss": 0.3044, + "step": 4837 + }, + { + "epoch": 0.08629115685085435, + "grad_norm": 0.3045305609703064, + "learning_rate": 4.3150196218337495e-05, + "loss": 0.2834, + "step": 4838 + }, + { + "epoch": 0.08630899297256804, + "grad_norm": 0.30828580260276794, + "learning_rate": 4.31591152336782e-05, + "loss": 0.2686, + "step": 4839 + }, + { + "epoch": 0.08632682909428174, + "grad_norm": 0.30732011795043945, + "learning_rate": 4.3168034249018915e-05, + "loss": 0.2588, + "step": 4840 + }, + { + "epoch": 0.08634466521599543, + "grad_norm": 0.3094451427459717, + "learning_rate": 4.3176953264359615e-05, + "loss": 0.2927, + "step": 4841 + }, + { + "epoch": 0.08636250133770913, + "grad_norm": 0.32332518696784973, + "learning_rate": 4.318587227970032e-05, + "loss": 0.2538, + "step": 4842 + }, + { + "epoch": 0.08638033745942282, + "grad_norm": 0.33366659283638, + "learning_rate": 4.3194791295041034e-05, + "loss": 0.2611, + "step": 4843 + }, + { + "epoch": 0.08639817358113652, + "grad_norm": 0.36465194821357727, + "learning_rate": 4.3203710310381734e-05, + "loss": 0.2466, + "step": 4844 + }, + { + "epoch": 0.08641600970285021, + "grad_norm": 0.3875217139720917, + "learning_rate": 4.321262932572244e-05, + "loss": 0.2456, + "step": 4845 + }, + { + "epoch": 0.08643384582456391, + "grad_norm": 0.4086056649684906, + "learning_rate": 4.322154834106315e-05, + "loss": 0.289, + "step": 4846 + }, + { + "epoch": 0.0864516819462776, + "grad_norm": 0.3160537779331207, + "learning_rate": 4.323046735640385e-05, + "loss": 0.2946, + "step": 4847 + }, + { + "epoch": 0.0864695180679913, + "grad_norm": 0.29965537786483765, + "learning_rate": 4.323938637174456e-05, + "loss": 0.2784, + "step": 4848 + }, + { + "epoch": 0.08648735418970499, + "grad_norm": 0.2963491976261139, + "learning_rate": 4.324830538708527e-05, + "loss": 0.243, + "step": 4849 + }, + { + "epoch": 0.08650519031141868, + "grad_norm": 0.41574347019195557, + "learning_rate": 4.325722440242597e-05, + "loss": 0.2818, + "step": 4850 + }, + { + "epoch": 0.08652302643313238, + "grad_norm": 0.44083961844444275, + "learning_rate": 4.326614341776668e-05, + "loss": 0.2307, + "step": 4851 + }, + { + "epoch": 0.08654086255484607, + "grad_norm": 0.7231858372688293, + "learning_rate": 4.327506243310739e-05, + "loss": 0.3058, + "step": 4852 + }, + { + "epoch": 0.08655869867655977, + "grad_norm": 0.28039807081222534, + "learning_rate": 4.328398144844809e-05, + "loss": 0.2623, + "step": 4853 + }, + { + "epoch": 0.08657653479827346, + "grad_norm": 0.23842006921768188, + "learning_rate": 4.32929004637888e-05, + "loss": 0.2636, + "step": 4854 + }, + { + "epoch": 0.08659437091998716, + "grad_norm": 0.28298041224479675, + "learning_rate": 4.3301819479129504e-05, + "loss": 0.245, + "step": 4855 + }, + { + "epoch": 0.08661220704170085, + "grad_norm": 0.34194597601890564, + "learning_rate": 4.331073849447021e-05, + "loss": 0.3005, + "step": 4856 + }, + { + "epoch": 0.08663004316341455, + "grad_norm": 0.3175301253795624, + "learning_rate": 4.331965750981092e-05, + "loss": 0.2486, + "step": 4857 + }, + { + "epoch": 0.08664787928512824, + "grad_norm": 0.3859957754611969, + "learning_rate": 4.332857652515162e-05, + "loss": 0.2834, + "step": 4858 + }, + { + "epoch": 0.08666571540684194, + "grad_norm": 0.373903751373291, + "learning_rate": 4.3337495540492336e-05, + "loss": 0.2815, + "step": 4859 + }, + { + "epoch": 0.08668355152855563, + "grad_norm": 0.32694053649902344, + "learning_rate": 4.3346414555833036e-05, + "loss": 0.2797, + "step": 4860 + }, + { + "epoch": 0.08670138765026933, + "grad_norm": 0.35671985149383545, + "learning_rate": 4.335533357117374e-05, + "loss": 0.2894, + "step": 4861 + }, + { + "epoch": 0.08671922377198302, + "grad_norm": 0.35586464405059814, + "learning_rate": 4.3364252586514455e-05, + "loss": 0.223, + "step": 4862 + }, + { + "epoch": 0.08673705989369672, + "grad_norm": 0.36284685134887695, + "learning_rate": 4.3373171601855155e-05, + "loss": 0.267, + "step": 4863 + }, + { + "epoch": 0.08675489601541041, + "grad_norm": 0.3396737277507782, + "learning_rate": 4.338209061719586e-05, + "loss": 0.2622, + "step": 4864 + }, + { + "epoch": 0.08677273213712411, + "grad_norm": 0.2970930337905884, + "learning_rate": 4.3391009632536575e-05, + "loss": 0.2633, + "step": 4865 + }, + { + "epoch": 0.0867905682588378, + "grad_norm": 0.3354485332965851, + "learning_rate": 4.3399928647877274e-05, + "loss": 0.2849, + "step": 4866 + }, + { + "epoch": 0.0868084043805515, + "grad_norm": 0.38399219512939453, + "learning_rate": 4.340884766321798e-05, + "loss": 0.2683, + "step": 4867 + }, + { + "epoch": 0.08682624050226519, + "grad_norm": 0.2917868494987488, + "learning_rate": 4.3417766678558694e-05, + "loss": 0.2204, + "step": 4868 + }, + { + "epoch": 0.08684407662397889, + "grad_norm": 0.28292855620384216, + "learning_rate": 4.342668569389939e-05, + "loss": 0.2388, + "step": 4869 + }, + { + "epoch": 0.08686191274569258, + "grad_norm": 0.329440712928772, + "learning_rate": 4.34356047092401e-05, + "loss": 0.2281, + "step": 4870 + }, + { + "epoch": 0.08687974886740626, + "grad_norm": 0.39377719163894653, + "learning_rate": 4.344452372458081e-05, + "loss": 0.2473, + "step": 4871 + }, + { + "epoch": 0.08689758498911997, + "grad_norm": 0.3619420826435089, + "learning_rate": 4.345344273992151e-05, + "loss": 0.2349, + "step": 4872 + }, + { + "epoch": 0.08691542111083365, + "grad_norm": 0.4105657935142517, + "learning_rate": 4.346236175526222e-05, + "loss": 0.2981, + "step": 4873 + }, + { + "epoch": 0.08693325723254736, + "grad_norm": 0.3426814675331116, + "learning_rate": 4.347128077060293e-05, + "loss": 0.2835, + "step": 4874 + }, + { + "epoch": 0.08695109335426104, + "grad_norm": 0.46077296137809753, + "learning_rate": 4.348019978594363e-05, + "loss": 0.3519, + "step": 4875 + }, + { + "epoch": 0.08696892947597475, + "grad_norm": 0.3634946048259735, + "learning_rate": 4.348911880128434e-05, + "loss": 0.2829, + "step": 4876 + }, + { + "epoch": 0.08698676559768843, + "grad_norm": 0.3040212094783783, + "learning_rate": 4.349803781662505e-05, + "loss": 0.2569, + "step": 4877 + }, + { + "epoch": 0.08700460171940214, + "grad_norm": 0.4141254723072052, + "learning_rate": 4.350695683196575e-05, + "loss": 0.2879, + "step": 4878 + }, + { + "epoch": 0.08702243784111582, + "grad_norm": 0.35047221183776855, + "learning_rate": 4.351587584730646e-05, + "loss": 0.2323, + "step": 4879 + }, + { + "epoch": 0.08704027396282953, + "grad_norm": 0.3511478006839752, + "learning_rate": 4.352479486264717e-05, + "loss": 0.2889, + "step": 4880 + }, + { + "epoch": 0.08705811008454321, + "grad_norm": 0.2961639165878296, + "learning_rate": 4.353371387798787e-05, + "loss": 0.2844, + "step": 4881 + }, + { + "epoch": 0.08707594620625692, + "grad_norm": 0.32740113139152527, + "learning_rate": 4.3542632893328576e-05, + "loss": 0.2521, + "step": 4882 + }, + { + "epoch": 0.0870937823279706, + "grad_norm": 0.39200204610824585, + "learning_rate": 4.355155190866928e-05, + "loss": 0.2783, + "step": 4883 + }, + { + "epoch": 0.0871116184496843, + "grad_norm": 0.3524141311645508, + "learning_rate": 4.3560470924009996e-05, + "loss": 0.2351, + "step": 4884 + }, + { + "epoch": 0.087129454571398, + "grad_norm": 0.5068562030792236, + "learning_rate": 4.3569389939350695e-05, + "loss": 0.262, + "step": 4885 + }, + { + "epoch": 0.0871472906931117, + "grad_norm": 0.3330923616886139, + "learning_rate": 4.35783089546914e-05, + "loss": 0.2847, + "step": 4886 + }, + { + "epoch": 0.08716512681482538, + "grad_norm": 0.4281753897666931, + "learning_rate": 4.3587227970032115e-05, + "loss": 0.2862, + "step": 4887 + }, + { + "epoch": 0.08718296293653909, + "grad_norm": 0.39838263392448425, + "learning_rate": 4.3596146985372814e-05, + "loss": 0.2529, + "step": 4888 + }, + { + "epoch": 0.08720079905825277, + "grad_norm": 0.431342214345932, + "learning_rate": 4.360506600071352e-05, + "loss": 0.2347, + "step": 4889 + }, + { + "epoch": 0.08721863517996647, + "grad_norm": 0.34118667244911194, + "learning_rate": 4.3613985016054234e-05, + "loss": 0.1946, + "step": 4890 + }, + { + "epoch": 0.08723647130168016, + "grad_norm": 0.6141383051872253, + "learning_rate": 4.3622904031394934e-05, + "loss": 0.2618, + "step": 4891 + }, + { + "epoch": 0.08725430742339385, + "grad_norm": 0.3077762722969055, + "learning_rate": 4.363182304673564e-05, + "loss": 0.2068, + "step": 4892 + }, + { + "epoch": 0.08727214354510755, + "grad_norm": 0.4342086911201477, + "learning_rate": 4.364074206207635e-05, + "loss": 0.2514, + "step": 4893 + }, + { + "epoch": 0.08728997966682124, + "grad_norm": 0.48361364006996155, + "learning_rate": 4.364966107741705e-05, + "loss": 0.3261, + "step": 4894 + }, + { + "epoch": 0.08730781578853494, + "grad_norm": 0.4839523732662201, + "learning_rate": 4.365858009275776e-05, + "loss": 0.346, + "step": 4895 + }, + { + "epoch": 0.08732565191024863, + "grad_norm": 0.32546982169151306, + "learning_rate": 4.366749910809847e-05, + "loss": 0.2169, + "step": 4896 + }, + { + "epoch": 0.08734348803196233, + "grad_norm": 0.3815949857234955, + "learning_rate": 4.367641812343917e-05, + "loss": 0.2752, + "step": 4897 + }, + { + "epoch": 0.08736132415367602, + "grad_norm": 0.3673657476902008, + "learning_rate": 4.368533713877988e-05, + "loss": 0.3015, + "step": 4898 + }, + { + "epoch": 0.08737916027538972, + "grad_norm": 0.42818859219551086, + "learning_rate": 4.369425615412059e-05, + "loss": 0.2208, + "step": 4899 + }, + { + "epoch": 0.08739699639710341, + "grad_norm": 0.3679737150669098, + "learning_rate": 4.370317516946129e-05, + "loss": 0.2526, + "step": 4900 + }, + { + "epoch": 0.08741483251881711, + "grad_norm": 0.3902592360973358, + "learning_rate": 4.3712094184802e-05, + "loss": 0.3113, + "step": 4901 + }, + { + "epoch": 0.0874326686405308, + "grad_norm": 0.33876484632492065, + "learning_rate": 4.372101320014271e-05, + "loss": 0.2189, + "step": 4902 + }, + { + "epoch": 0.0874505047622445, + "grad_norm": 0.4807701110839844, + "learning_rate": 4.372993221548341e-05, + "loss": 0.2128, + "step": 4903 + }, + { + "epoch": 0.08746834088395819, + "grad_norm": 0.37112441658973694, + "learning_rate": 4.3738851230824117e-05, + "loss": 0.2826, + "step": 4904 + }, + { + "epoch": 0.08748617700567189, + "grad_norm": 0.41438496112823486, + "learning_rate": 4.374777024616483e-05, + "loss": 0.3159, + "step": 4905 + }, + { + "epoch": 0.08750401312738558, + "grad_norm": 0.3171751797199249, + "learning_rate": 4.3756689261505536e-05, + "loss": 0.2804, + "step": 4906 + }, + { + "epoch": 0.08752184924909928, + "grad_norm": 0.3540882170200348, + "learning_rate": 4.3765608276846236e-05, + "loss": 0.282, + "step": 4907 + }, + { + "epoch": 0.08753968537081297, + "grad_norm": 0.2823682725429535, + "learning_rate": 4.377452729218694e-05, + "loss": 0.2582, + "step": 4908 + }, + { + "epoch": 0.08755752149252667, + "grad_norm": 0.3573967218399048, + "learning_rate": 4.3783446307527655e-05, + "loss": 0.2801, + "step": 4909 + }, + { + "epoch": 0.08757535761424036, + "grad_norm": 0.3363186717033386, + "learning_rate": 4.3792365322868355e-05, + "loss": 0.2844, + "step": 4910 + }, + { + "epoch": 0.08759319373595405, + "grad_norm": 0.32518690824508667, + "learning_rate": 4.380128433820906e-05, + "loss": 0.2676, + "step": 4911 + }, + { + "epoch": 0.08761102985766775, + "grad_norm": 0.37290501594543457, + "learning_rate": 4.3810203353549774e-05, + "loss": 0.2933, + "step": 4912 + }, + { + "epoch": 0.08762886597938144, + "grad_norm": 0.29589971899986267, + "learning_rate": 4.3819122368890474e-05, + "loss": 0.2411, + "step": 4913 + }, + { + "epoch": 0.08764670210109514, + "grad_norm": 0.23060445487499237, + "learning_rate": 4.382804138423118e-05, + "loss": 0.2192, + "step": 4914 + }, + { + "epoch": 0.08766453822280883, + "grad_norm": 0.2838451564311981, + "learning_rate": 4.3836960399571894e-05, + "loss": 0.2231, + "step": 4915 + }, + { + "epoch": 0.08768237434452253, + "grad_norm": 0.243721142411232, + "learning_rate": 4.384587941491259e-05, + "loss": 0.2279, + "step": 4916 + }, + { + "epoch": 0.08770021046623622, + "grad_norm": 0.30721530318260193, + "learning_rate": 4.38547984302533e-05, + "loss": 0.2438, + "step": 4917 + }, + { + "epoch": 0.08771804658794992, + "grad_norm": 0.2916942536830902, + "learning_rate": 4.386371744559401e-05, + "loss": 0.2324, + "step": 4918 + }, + { + "epoch": 0.0877358827096636, + "grad_norm": 0.4337362051010132, + "learning_rate": 4.387263646093471e-05, + "loss": 0.2572, + "step": 4919 + }, + { + "epoch": 0.08775371883137731, + "grad_norm": 0.3278912901878357, + "learning_rate": 4.388155547627542e-05, + "loss": 0.2708, + "step": 4920 + }, + { + "epoch": 0.087771554953091, + "grad_norm": 0.30770426988601685, + "learning_rate": 4.389047449161613e-05, + "loss": 0.2531, + "step": 4921 + }, + { + "epoch": 0.0877893910748047, + "grad_norm": 0.36513054370880127, + "learning_rate": 4.389939350695683e-05, + "loss": 0.2824, + "step": 4922 + }, + { + "epoch": 0.08780722719651839, + "grad_norm": 0.3902776837348938, + "learning_rate": 4.390831252229754e-05, + "loss": 0.3049, + "step": 4923 + }, + { + "epoch": 0.08782506331823209, + "grad_norm": 0.46934065222740173, + "learning_rate": 4.391723153763825e-05, + "loss": 0.3479, + "step": 4924 + }, + { + "epoch": 0.08784289943994578, + "grad_norm": 0.3181341588497162, + "learning_rate": 4.392615055297895e-05, + "loss": 0.243, + "step": 4925 + }, + { + "epoch": 0.08786073556165948, + "grad_norm": 0.3682335317134857, + "learning_rate": 4.393506956831966e-05, + "loss": 0.288, + "step": 4926 + }, + { + "epoch": 0.08787857168337317, + "grad_norm": 0.37030425667762756, + "learning_rate": 4.394398858366037e-05, + "loss": 0.1971, + "step": 4927 + }, + { + "epoch": 0.08789640780508687, + "grad_norm": 0.3158690631389618, + "learning_rate": 4.395290759900107e-05, + "loss": 0.2084, + "step": 4928 + }, + { + "epoch": 0.08791424392680056, + "grad_norm": 0.4250951111316681, + "learning_rate": 4.3961826614341776e-05, + "loss": 0.2729, + "step": 4929 + }, + { + "epoch": 0.08793208004851426, + "grad_norm": 0.36358439922332764, + "learning_rate": 4.397074562968249e-05, + "loss": 0.3262, + "step": 4930 + }, + { + "epoch": 0.08794991617022795, + "grad_norm": 0.3131383955478668, + "learning_rate": 4.3979664645023196e-05, + "loss": 0.2715, + "step": 4931 + }, + { + "epoch": 0.08796775229194163, + "grad_norm": 0.33680614829063416, + "learning_rate": 4.3988583660363895e-05, + "loss": 0.1973, + "step": 4932 + }, + { + "epoch": 0.08798558841365534, + "grad_norm": 0.25862354040145874, + "learning_rate": 4.39975026757046e-05, + "loss": 0.2638, + "step": 4933 + }, + { + "epoch": 0.08800342453536902, + "grad_norm": 0.32706525921821594, + "learning_rate": 4.4006421691045315e-05, + "loss": 0.2727, + "step": 4934 + }, + { + "epoch": 0.08802126065708273, + "grad_norm": 0.4544707238674164, + "learning_rate": 4.4015340706386014e-05, + "loss": 0.2808, + "step": 4935 + }, + { + "epoch": 0.08803909677879641, + "grad_norm": 0.3016246557235718, + "learning_rate": 4.402425972172672e-05, + "loss": 0.2681, + "step": 4936 + }, + { + "epoch": 0.08805693290051012, + "grad_norm": 0.3369017243385315, + "learning_rate": 4.4033178737067434e-05, + "loss": 0.2466, + "step": 4937 + }, + { + "epoch": 0.0880747690222238, + "grad_norm": 0.2844545543193817, + "learning_rate": 4.4042097752408134e-05, + "loss": 0.2976, + "step": 4938 + }, + { + "epoch": 0.0880926051439375, + "grad_norm": 0.3822677433490753, + "learning_rate": 4.405101676774884e-05, + "loss": 0.2206, + "step": 4939 + }, + { + "epoch": 0.0881104412656512, + "grad_norm": 0.2786327600479126, + "learning_rate": 4.405993578308955e-05, + "loss": 0.2259, + "step": 4940 + }, + { + "epoch": 0.0881282773873649, + "grad_norm": 0.3102079927921295, + "learning_rate": 4.406885479843025e-05, + "loss": 0.2371, + "step": 4941 + }, + { + "epoch": 0.08814611350907858, + "grad_norm": 0.39839714765548706, + "learning_rate": 4.407777381377096e-05, + "loss": 0.2997, + "step": 4942 + }, + { + "epoch": 0.08816394963079228, + "grad_norm": 0.43621885776519775, + "learning_rate": 4.408669282911167e-05, + "loss": 0.262, + "step": 4943 + }, + { + "epoch": 0.08818178575250597, + "grad_norm": 0.2633258104324341, + "learning_rate": 4.409561184445237e-05, + "loss": 0.2241, + "step": 4944 + }, + { + "epoch": 0.08819962187421967, + "grad_norm": 0.3812224268913269, + "learning_rate": 4.410453085979308e-05, + "loss": 0.308, + "step": 4945 + }, + { + "epoch": 0.08821745799593336, + "grad_norm": 0.38104119896888733, + "learning_rate": 4.411344987513379e-05, + "loss": 0.2128, + "step": 4946 + }, + { + "epoch": 0.08823529411764706, + "grad_norm": 0.46683916449546814, + "learning_rate": 4.412236889047449e-05, + "loss": 0.2555, + "step": 4947 + }, + { + "epoch": 0.08825313023936075, + "grad_norm": 0.2919166088104248, + "learning_rate": 4.41312879058152e-05, + "loss": 0.2596, + "step": 4948 + }, + { + "epoch": 0.08827096636107445, + "grad_norm": 0.36694854497909546, + "learning_rate": 4.414020692115591e-05, + "loss": 0.2762, + "step": 4949 + }, + { + "epoch": 0.08828880248278814, + "grad_norm": 0.3404950797557831, + "learning_rate": 4.414912593649661e-05, + "loss": 0.2348, + "step": 4950 + }, + { + "epoch": 0.08830663860450183, + "grad_norm": 0.3095071017742157, + "learning_rate": 4.4158044951837316e-05, + "loss": 0.239, + "step": 4951 + }, + { + "epoch": 0.08832447472621553, + "grad_norm": 0.36194124817848206, + "learning_rate": 4.416696396717803e-05, + "loss": 0.2648, + "step": 4952 + }, + { + "epoch": 0.08834231084792922, + "grad_norm": 0.46115776896476746, + "learning_rate": 4.4175882982518736e-05, + "loss": 0.2588, + "step": 4953 + }, + { + "epoch": 0.08836014696964292, + "grad_norm": 0.289004921913147, + "learning_rate": 4.4184801997859436e-05, + "loss": 0.2245, + "step": 4954 + }, + { + "epoch": 0.08837798309135661, + "grad_norm": 0.5100419521331787, + "learning_rate": 4.419372101320015e-05, + "loss": 0.1669, + "step": 4955 + }, + { + "epoch": 0.08839581921307031, + "grad_norm": 0.35682612657546997, + "learning_rate": 4.4202640028540855e-05, + "loss": 0.306, + "step": 4956 + }, + { + "epoch": 0.088413655334784, + "grad_norm": 0.27299579977989197, + "learning_rate": 4.4211559043881555e-05, + "loss": 0.2441, + "step": 4957 + }, + { + "epoch": 0.0884314914564977, + "grad_norm": 0.292504221200943, + "learning_rate": 4.422047805922226e-05, + "loss": 0.2482, + "step": 4958 + }, + { + "epoch": 0.08844932757821139, + "grad_norm": 0.2863588035106659, + "learning_rate": 4.4229397074562974e-05, + "loss": 0.2306, + "step": 4959 + }, + { + "epoch": 0.08846716369992509, + "grad_norm": 0.3589595556259155, + "learning_rate": 4.4238316089903674e-05, + "loss": 0.2794, + "step": 4960 + }, + { + "epoch": 0.08848499982163878, + "grad_norm": 0.2773011028766632, + "learning_rate": 4.424723510524438e-05, + "loss": 0.2462, + "step": 4961 + }, + { + "epoch": 0.08850283594335248, + "grad_norm": 0.26829320192337036, + "learning_rate": 4.4256154120585093e-05, + "loss": 0.2119, + "step": 4962 + }, + { + "epoch": 0.08852067206506617, + "grad_norm": 0.4274921119213104, + "learning_rate": 4.426507313592579e-05, + "loss": 0.2513, + "step": 4963 + }, + { + "epoch": 0.08853850818677987, + "grad_norm": 1.098132848739624, + "learning_rate": 4.42739921512665e-05, + "loss": 0.2293, + "step": 4964 + }, + { + "epoch": 0.08855634430849356, + "grad_norm": 0.2858351171016693, + "learning_rate": 4.428291116660721e-05, + "loss": 0.2577, + "step": 4965 + }, + { + "epoch": 0.08857418043020726, + "grad_norm": 0.25789639353752136, + "learning_rate": 4.429183018194791e-05, + "loss": 0.23, + "step": 4966 + }, + { + "epoch": 0.08859201655192095, + "grad_norm": 0.3119671940803528, + "learning_rate": 4.430074919728862e-05, + "loss": 0.2268, + "step": 4967 + }, + { + "epoch": 0.08860985267363465, + "grad_norm": 0.3688700199127197, + "learning_rate": 4.430966821262933e-05, + "loss": 0.295, + "step": 4968 + }, + { + "epoch": 0.08862768879534834, + "grad_norm": 0.3509555160999298, + "learning_rate": 4.431858722797003e-05, + "loss": 0.3027, + "step": 4969 + }, + { + "epoch": 0.08864552491706204, + "grad_norm": 0.2632255256175995, + "learning_rate": 4.432750624331074e-05, + "loss": 0.1859, + "step": 4970 + }, + { + "epoch": 0.08866336103877573, + "grad_norm": 0.28133201599121094, + "learning_rate": 4.433642525865145e-05, + "loss": 0.2543, + "step": 4971 + }, + { + "epoch": 0.08868119716048942, + "grad_norm": 0.26210176944732666, + "learning_rate": 4.434534427399215e-05, + "loss": 0.2369, + "step": 4972 + }, + { + "epoch": 0.08869903328220312, + "grad_norm": 0.2326745241880417, + "learning_rate": 4.435426328933286e-05, + "loss": 0.2463, + "step": 4973 + }, + { + "epoch": 0.0887168694039168, + "grad_norm": 0.2702323794364929, + "learning_rate": 4.436318230467357e-05, + "loss": 0.2667, + "step": 4974 + }, + { + "epoch": 0.08873470552563051, + "grad_norm": 0.2708186209201813, + "learning_rate": 4.437210132001427e-05, + "loss": 0.2358, + "step": 4975 + }, + { + "epoch": 0.0887525416473442, + "grad_norm": 0.3113460838794708, + "learning_rate": 4.4381020335354976e-05, + "loss": 0.2446, + "step": 4976 + }, + { + "epoch": 0.0887703777690579, + "grad_norm": 0.42462819814682007, + "learning_rate": 4.438993935069569e-05, + "loss": 0.277, + "step": 4977 + }, + { + "epoch": 0.08878821389077159, + "grad_norm": 0.3471202254295349, + "learning_rate": 4.4398858366036396e-05, + "loss": 0.2758, + "step": 4978 + }, + { + "epoch": 0.08880605001248529, + "grad_norm": 0.4598161578178406, + "learning_rate": 4.4407777381377095e-05, + "loss": 0.3003, + "step": 4979 + }, + { + "epoch": 0.08882388613419898, + "grad_norm": 0.3271152079105377, + "learning_rate": 4.441669639671781e-05, + "loss": 0.3212, + "step": 4980 + }, + { + "epoch": 0.08884172225591268, + "grad_norm": 0.35090136528015137, + "learning_rate": 4.4425615412058515e-05, + "loss": 0.2549, + "step": 4981 + }, + { + "epoch": 0.08885955837762637, + "grad_norm": 0.3053995668888092, + "learning_rate": 4.4434534427399214e-05, + "loss": 0.2736, + "step": 4982 + }, + { + "epoch": 0.08887739449934007, + "grad_norm": 0.3149849772453308, + "learning_rate": 4.444345344273993e-05, + "loss": 0.2712, + "step": 4983 + }, + { + "epoch": 0.08889523062105376, + "grad_norm": 0.37885212898254395, + "learning_rate": 4.4452372458080634e-05, + "loss": 0.2931, + "step": 4984 + }, + { + "epoch": 0.08891306674276746, + "grad_norm": 0.30231600999832153, + "learning_rate": 4.4461291473421333e-05, + "loss": 0.2529, + "step": 4985 + }, + { + "epoch": 0.08893090286448115, + "grad_norm": 0.371078222990036, + "learning_rate": 4.447021048876204e-05, + "loss": 0.3152, + "step": 4986 + }, + { + "epoch": 0.08894873898619485, + "grad_norm": 0.29761597514152527, + "learning_rate": 4.447912950410275e-05, + "loss": 0.3016, + "step": 4987 + }, + { + "epoch": 0.08896657510790854, + "grad_norm": 0.32581159472465515, + "learning_rate": 4.448804851944345e-05, + "loss": 0.2872, + "step": 4988 + }, + { + "epoch": 0.08898441122962224, + "grad_norm": 0.32894980907440186, + "learning_rate": 4.449696753478416e-05, + "loss": 0.2952, + "step": 4989 + }, + { + "epoch": 0.08900224735133593, + "grad_norm": 0.36970022320747375, + "learning_rate": 4.450588655012487e-05, + "loss": 0.261, + "step": 4990 + }, + { + "epoch": 0.08902008347304961, + "grad_norm": 0.38638272881507874, + "learning_rate": 4.451480556546557e-05, + "loss": 0.3021, + "step": 4991 + }, + { + "epoch": 0.08903791959476332, + "grad_norm": 0.2911534607410431, + "learning_rate": 4.452372458080628e-05, + "loss": 0.2856, + "step": 4992 + }, + { + "epoch": 0.089055755716477, + "grad_norm": 0.33513301610946655, + "learning_rate": 4.453264359614699e-05, + "loss": 0.2912, + "step": 4993 + }, + { + "epoch": 0.0890735918381907, + "grad_norm": 0.2692374289035797, + "learning_rate": 4.454156261148769e-05, + "loss": 0.2442, + "step": 4994 + }, + { + "epoch": 0.08909142795990439, + "grad_norm": 0.38486340641975403, + "learning_rate": 4.45504816268284e-05, + "loss": 0.2701, + "step": 4995 + }, + { + "epoch": 0.0891092640816181, + "grad_norm": 0.34908363223075867, + "learning_rate": 4.455940064216911e-05, + "loss": 0.2173, + "step": 4996 + }, + { + "epoch": 0.08912710020333178, + "grad_norm": 0.30877748131752014, + "learning_rate": 4.456831965750981e-05, + "loss": 0.2494, + "step": 4997 + }, + { + "epoch": 0.08914493632504548, + "grad_norm": 0.3526933789253235, + "learning_rate": 4.4577238672850516e-05, + "loss": 0.2588, + "step": 4998 + }, + { + "epoch": 0.08916277244675917, + "grad_norm": 0.36577802896499634, + "learning_rate": 4.458615768819123e-05, + "loss": 0.2813, + "step": 4999 + }, + { + "epoch": 0.08918060856847287, + "grad_norm": 0.29689255356788635, + "learning_rate": 4.459507670353193e-05, + "loss": 0.2499, + "step": 5000 + }, + { + "epoch": 0.08918060856847287, + "eval_loss": 0.2507364749908447, + "eval_runtime": 2922.4672, + "eval_samples_per_second": 0.35, + "eval_steps_per_second": 0.059, + "step": 5000 + }, + { + "epoch": 0.08919844469018656, + "grad_norm": 0.3900417983531952, + "learning_rate": 4.4603995718872635e-05, + "loss": 0.2357, + "step": 5001 + }, + { + "epoch": 0.08921628081190026, + "grad_norm": 0.38965821266174316, + "learning_rate": 4.461291473421335e-05, + "loss": 0.2817, + "step": 5002 + }, + { + "epoch": 0.08923411693361395, + "grad_norm": 0.367204874753952, + "learning_rate": 4.4621833749554055e-05, + "loss": 0.2932, + "step": 5003 + }, + { + "epoch": 0.08925195305532765, + "grad_norm": 0.3464229702949524, + "learning_rate": 4.4630752764894755e-05, + "loss": 0.2551, + "step": 5004 + }, + { + "epoch": 0.08926978917704134, + "grad_norm": 0.4570404589176178, + "learning_rate": 4.463967178023547e-05, + "loss": 0.3015, + "step": 5005 + }, + { + "epoch": 0.08928762529875504, + "grad_norm": 0.35674813389778137, + "learning_rate": 4.4648590795576174e-05, + "loss": 0.2493, + "step": 5006 + }, + { + "epoch": 0.08930546142046873, + "grad_norm": 0.3751763701438904, + "learning_rate": 4.4657509810916874e-05, + "loss": 0.2948, + "step": 5007 + }, + { + "epoch": 0.08932329754218243, + "grad_norm": 0.3086523115634918, + "learning_rate": 4.466642882625759e-05, + "loss": 0.2599, + "step": 5008 + }, + { + "epoch": 0.08934113366389612, + "grad_norm": 0.2897014319896698, + "learning_rate": 4.467534784159829e-05, + "loss": 0.2575, + "step": 5009 + }, + { + "epoch": 0.08935896978560982, + "grad_norm": 0.29343584179878235, + "learning_rate": 4.468426685693899e-05, + "loss": 0.2027, + "step": 5010 + }, + { + "epoch": 0.08937680590732351, + "grad_norm": 0.30803829431533813, + "learning_rate": 4.46931858722797e-05, + "loss": 0.2133, + "step": 5011 + }, + { + "epoch": 0.0893946420290372, + "grad_norm": 0.3076460063457489, + "learning_rate": 4.470210488762041e-05, + "loss": 0.2652, + "step": 5012 + }, + { + "epoch": 0.0894124781507509, + "grad_norm": 0.3930947780609131, + "learning_rate": 4.471102390296111e-05, + "loss": 0.3208, + "step": 5013 + }, + { + "epoch": 0.08943031427246459, + "grad_norm": 0.2848929762840271, + "learning_rate": 4.471994291830182e-05, + "loss": 0.2346, + "step": 5014 + }, + { + "epoch": 0.08944815039417829, + "grad_norm": 0.3582049608230591, + "learning_rate": 4.472886193364253e-05, + "loss": 0.2863, + "step": 5015 + }, + { + "epoch": 0.08946598651589198, + "grad_norm": 0.28518861532211304, + "learning_rate": 4.473778094898323e-05, + "loss": 0.3019, + "step": 5016 + }, + { + "epoch": 0.08948382263760568, + "grad_norm": 0.44029179215431213, + "learning_rate": 4.474669996432394e-05, + "loss": 0.2976, + "step": 5017 + }, + { + "epoch": 0.08950165875931937, + "grad_norm": 0.26706603169441223, + "learning_rate": 4.475561897966465e-05, + "loss": 0.2644, + "step": 5018 + }, + { + "epoch": 0.08951949488103307, + "grad_norm": 0.4160476326942444, + "learning_rate": 4.476453799500535e-05, + "loss": 0.2355, + "step": 5019 + }, + { + "epoch": 0.08953733100274676, + "grad_norm": 0.35158583521842957, + "learning_rate": 4.477345701034606e-05, + "loss": 0.2537, + "step": 5020 + }, + { + "epoch": 0.08955516712446046, + "grad_norm": 0.3176898956298828, + "learning_rate": 4.478237602568677e-05, + "loss": 0.2393, + "step": 5021 + }, + { + "epoch": 0.08957300324617415, + "grad_norm": 0.3681906461715698, + "learning_rate": 4.479129504102747e-05, + "loss": 0.3118, + "step": 5022 + }, + { + "epoch": 0.08959083936788785, + "grad_norm": 0.4188217520713806, + "learning_rate": 4.4800214056368176e-05, + "loss": 0.2476, + "step": 5023 + }, + { + "epoch": 0.08960867548960154, + "grad_norm": 0.25581496953964233, + "learning_rate": 4.480913307170889e-05, + "loss": 0.255, + "step": 5024 + }, + { + "epoch": 0.08962651161131524, + "grad_norm": 0.29971587657928467, + "learning_rate": 4.4818052087049595e-05, + "loss": 0.2445, + "step": 5025 + }, + { + "epoch": 0.08964434773302893, + "grad_norm": 0.312463641166687, + "learning_rate": 4.4826971102390295e-05, + "loss": 0.2476, + "step": 5026 + }, + { + "epoch": 0.08966218385474263, + "grad_norm": 0.35160860419273376, + "learning_rate": 4.483589011773101e-05, + "loss": 0.2737, + "step": 5027 + }, + { + "epoch": 0.08968001997645632, + "grad_norm": 0.4038321375846863, + "learning_rate": 4.4844809133071715e-05, + "loss": 0.3297, + "step": 5028 + }, + { + "epoch": 0.08969785609817002, + "grad_norm": 0.43063589930534363, + "learning_rate": 4.4853728148412414e-05, + "loss": 0.2686, + "step": 5029 + }, + { + "epoch": 0.08971569221988371, + "grad_norm": 0.31962692737579346, + "learning_rate": 4.486264716375313e-05, + "loss": 0.2471, + "step": 5030 + }, + { + "epoch": 0.08973352834159741, + "grad_norm": 0.40811246633529663, + "learning_rate": 4.4871566179093834e-05, + "loss": 0.231, + "step": 5031 + }, + { + "epoch": 0.0897513644633111, + "grad_norm": 0.2768667936325073, + "learning_rate": 4.488048519443453e-05, + "loss": 0.2312, + "step": 5032 + }, + { + "epoch": 0.08976920058502479, + "grad_norm": 0.38604292273521423, + "learning_rate": 4.4889404209775246e-05, + "loss": 0.1854, + "step": 5033 + }, + { + "epoch": 0.08978703670673849, + "grad_norm": 0.27677640318870544, + "learning_rate": 4.489832322511595e-05, + "loss": 0.2339, + "step": 5034 + }, + { + "epoch": 0.08980487282845218, + "grad_norm": 0.32057884335517883, + "learning_rate": 4.490724224045665e-05, + "loss": 0.2316, + "step": 5035 + }, + { + "epoch": 0.08982270895016588, + "grad_norm": 0.3619246780872345, + "learning_rate": 4.491616125579736e-05, + "loss": 0.2728, + "step": 5036 + }, + { + "epoch": 0.08984054507187957, + "grad_norm": 0.4090946316719055, + "learning_rate": 4.492508027113807e-05, + "loss": 0.2635, + "step": 5037 + }, + { + "epoch": 0.08985838119359327, + "grad_norm": 0.4344578683376312, + "learning_rate": 4.493399928647877e-05, + "loss": 0.3319, + "step": 5038 + }, + { + "epoch": 0.08987621731530696, + "grad_norm": 0.3746897578239441, + "learning_rate": 4.494291830181948e-05, + "loss": 0.2776, + "step": 5039 + }, + { + "epoch": 0.08989405343702066, + "grad_norm": 0.45085152983665466, + "learning_rate": 4.495183731716019e-05, + "loss": 0.2663, + "step": 5040 + }, + { + "epoch": 0.08991188955873435, + "grad_norm": 0.3534441590309143, + "learning_rate": 4.496075633250089e-05, + "loss": 0.2743, + "step": 5041 + }, + { + "epoch": 0.08992972568044805, + "grad_norm": 0.319785475730896, + "learning_rate": 4.49696753478416e-05, + "loss": 0.2439, + "step": 5042 + }, + { + "epoch": 0.08994756180216174, + "grad_norm": 0.37395069003105164, + "learning_rate": 4.497859436318231e-05, + "loss": 0.2875, + "step": 5043 + }, + { + "epoch": 0.08996539792387544, + "grad_norm": 0.294408917427063, + "learning_rate": 4.498751337852301e-05, + "loss": 0.2552, + "step": 5044 + }, + { + "epoch": 0.08998323404558912, + "grad_norm": 0.291103720664978, + "learning_rate": 4.4996432393863716e-05, + "loss": 0.2545, + "step": 5045 + }, + { + "epoch": 0.09000107016730283, + "grad_norm": 0.24002714455127716, + "learning_rate": 4.500535140920443e-05, + "loss": 0.2506, + "step": 5046 + }, + { + "epoch": 0.09001890628901651, + "grad_norm": 0.2830826938152313, + "learning_rate": 4.501427042454513e-05, + "loss": 0.2416, + "step": 5047 + }, + { + "epoch": 0.09003674241073022, + "grad_norm": 0.4030207395553589, + "learning_rate": 4.5023189439885835e-05, + "loss": 0.3342, + "step": 5048 + }, + { + "epoch": 0.0900545785324439, + "grad_norm": 0.31538429856300354, + "learning_rate": 4.503210845522655e-05, + "loss": 0.2747, + "step": 5049 + }, + { + "epoch": 0.0900724146541576, + "grad_norm": 0.2968837320804596, + "learning_rate": 4.5041027470567255e-05, + "loss": 0.2613, + "step": 5050 + }, + { + "epoch": 0.0900902507758713, + "grad_norm": 0.2753019630908966, + "learning_rate": 4.5049946485907954e-05, + "loss": 0.2555, + "step": 5051 + }, + { + "epoch": 0.09010808689758498, + "grad_norm": 0.3237540125846863, + "learning_rate": 4.505886550124867e-05, + "loss": 0.2543, + "step": 5052 + }, + { + "epoch": 0.09012592301929868, + "grad_norm": 0.26061129570007324, + "learning_rate": 4.5067784516589374e-05, + "loss": 0.2532, + "step": 5053 + }, + { + "epoch": 0.09014375914101237, + "grad_norm": 0.23188789188861847, + "learning_rate": 4.5076703531930074e-05, + "loss": 0.2195, + "step": 5054 + }, + { + "epoch": 0.09016159526272607, + "grad_norm": 0.3326053321361542, + "learning_rate": 4.508562254727079e-05, + "loss": 0.2786, + "step": 5055 + }, + { + "epoch": 0.09017943138443976, + "grad_norm": 0.30801907181739807, + "learning_rate": 4.509454156261149e-05, + "loss": 0.2436, + "step": 5056 + }, + { + "epoch": 0.09019726750615346, + "grad_norm": 0.24577617645263672, + "learning_rate": 4.510346057795219e-05, + "loss": 0.2679, + "step": 5057 + }, + { + "epoch": 0.09021510362786715, + "grad_norm": 0.2857646048069, + "learning_rate": 4.5112379593292906e-05, + "loss": 0.2798, + "step": 5058 + }, + { + "epoch": 0.09023293974958085, + "grad_norm": 0.3690529465675354, + "learning_rate": 4.512129860863361e-05, + "loss": 0.3231, + "step": 5059 + }, + { + "epoch": 0.09025077587129454, + "grad_norm": 0.5927678346633911, + "learning_rate": 4.513021762397431e-05, + "loss": 0.3099, + "step": 5060 + }, + { + "epoch": 0.09026861199300824, + "grad_norm": 0.3103553056716919, + "learning_rate": 4.513913663931502e-05, + "loss": 0.3029, + "step": 5061 + }, + { + "epoch": 0.09028644811472193, + "grad_norm": 0.28680020570755005, + "learning_rate": 4.514805565465573e-05, + "loss": 0.2599, + "step": 5062 + }, + { + "epoch": 0.09030428423643563, + "grad_norm": 0.3033756911754608, + "learning_rate": 4.515697466999643e-05, + "loss": 0.2815, + "step": 5063 + }, + { + "epoch": 0.09032212035814932, + "grad_norm": 0.3972911238670349, + "learning_rate": 4.516589368533714e-05, + "loss": 0.2772, + "step": 5064 + }, + { + "epoch": 0.09033995647986302, + "grad_norm": 0.3096371293067932, + "learning_rate": 4.517481270067785e-05, + "loss": 0.2618, + "step": 5065 + }, + { + "epoch": 0.09035779260157671, + "grad_norm": 0.45703354477882385, + "learning_rate": 4.518373171601855e-05, + "loss": 0.2896, + "step": 5066 + }, + { + "epoch": 0.09037562872329041, + "grad_norm": 0.3486328125, + "learning_rate": 4.5192650731359257e-05, + "loss": 0.3234, + "step": 5067 + }, + { + "epoch": 0.0903934648450041, + "grad_norm": 0.29529869556427, + "learning_rate": 4.520156974669997e-05, + "loss": 0.2375, + "step": 5068 + }, + { + "epoch": 0.0904113009667178, + "grad_norm": 0.28080740571022034, + "learning_rate": 4.521048876204067e-05, + "loss": 0.2382, + "step": 5069 + }, + { + "epoch": 0.09042913708843149, + "grad_norm": 0.3457990288734436, + "learning_rate": 4.5219407777381376e-05, + "loss": 0.2526, + "step": 5070 + }, + { + "epoch": 0.09044697321014519, + "grad_norm": 0.36086606979370117, + "learning_rate": 4.522832679272209e-05, + "loss": 0.2684, + "step": 5071 + }, + { + "epoch": 0.09046480933185888, + "grad_norm": 0.3191145062446594, + "learning_rate": 4.5237245808062795e-05, + "loss": 0.2999, + "step": 5072 + }, + { + "epoch": 0.09048264545357257, + "grad_norm": 0.28212714195251465, + "learning_rate": 4.5246164823403495e-05, + "loss": 0.2525, + "step": 5073 + }, + { + "epoch": 0.09050048157528627, + "grad_norm": 0.31841787695884705, + "learning_rate": 4.525508383874421e-05, + "loss": 0.2332, + "step": 5074 + }, + { + "epoch": 0.09051831769699996, + "grad_norm": 0.26870378851890564, + "learning_rate": 4.5264002854084914e-05, + "loss": 0.2565, + "step": 5075 + }, + { + "epoch": 0.09053615381871366, + "grad_norm": 0.31851646304130554, + "learning_rate": 4.5272921869425614e-05, + "loss": 0.246, + "step": 5076 + }, + { + "epoch": 0.09055398994042735, + "grad_norm": 0.3371538817882538, + "learning_rate": 4.528184088476633e-05, + "loss": 0.2946, + "step": 5077 + }, + { + "epoch": 0.09057182606214105, + "grad_norm": 0.34399983286857605, + "learning_rate": 4.5290759900107034e-05, + "loss": 0.2527, + "step": 5078 + }, + { + "epoch": 0.09058966218385474, + "grad_norm": 0.34212473034858704, + "learning_rate": 4.529967891544773e-05, + "loss": 0.2578, + "step": 5079 + }, + { + "epoch": 0.09060749830556844, + "grad_norm": 0.31323686242103577, + "learning_rate": 4.5308597930788446e-05, + "loss": 0.265, + "step": 5080 + }, + { + "epoch": 0.09062533442728213, + "grad_norm": 0.31873616576194763, + "learning_rate": 4.531751694612915e-05, + "loss": 0.2875, + "step": 5081 + }, + { + "epoch": 0.09064317054899583, + "grad_norm": 0.28577306866645813, + "learning_rate": 4.532643596146985e-05, + "loss": 0.2017, + "step": 5082 + }, + { + "epoch": 0.09066100667070952, + "grad_norm": 0.2733782231807709, + "learning_rate": 4.5335354976810565e-05, + "loss": 0.2722, + "step": 5083 + }, + { + "epoch": 0.09067884279242322, + "grad_norm": 0.32702550292015076, + "learning_rate": 4.534427399215127e-05, + "loss": 0.3115, + "step": 5084 + }, + { + "epoch": 0.09069667891413691, + "grad_norm": 0.36680689454078674, + "learning_rate": 4.535319300749197e-05, + "loss": 0.2593, + "step": 5085 + }, + { + "epoch": 0.09071451503585061, + "grad_norm": 0.3376055955886841, + "learning_rate": 4.5362112022832685e-05, + "loss": 0.2365, + "step": 5086 + }, + { + "epoch": 0.0907323511575643, + "grad_norm": 0.3718792200088501, + "learning_rate": 4.537103103817339e-05, + "loss": 0.247, + "step": 5087 + }, + { + "epoch": 0.090750187279278, + "grad_norm": 0.28810974955558777, + "learning_rate": 4.537995005351409e-05, + "loss": 0.2292, + "step": 5088 + }, + { + "epoch": 0.09076802340099169, + "grad_norm": 0.3739653527736664, + "learning_rate": 4.53888690688548e-05, + "loss": 0.2592, + "step": 5089 + }, + { + "epoch": 0.09078585952270539, + "grad_norm": 0.6185100078582764, + "learning_rate": 4.539778808419551e-05, + "loss": 0.2439, + "step": 5090 + }, + { + "epoch": 0.09080369564441908, + "grad_norm": 0.33142533898353577, + "learning_rate": 4.540670709953621e-05, + "loss": 0.2394, + "step": 5091 + }, + { + "epoch": 0.09082153176613277, + "grad_norm": 0.38457319140434265, + "learning_rate": 4.5415626114876916e-05, + "loss": 0.3097, + "step": 5092 + }, + { + "epoch": 0.09083936788784647, + "grad_norm": 0.34872955083847046, + "learning_rate": 4.542454513021763e-05, + "loss": 0.273, + "step": 5093 + }, + { + "epoch": 0.09085720400956016, + "grad_norm": 0.3680689036846161, + "learning_rate": 4.543346414555833e-05, + "loss": 0.2887, + "step": 5094 + }, + { + "epoch": 0.09087504013127386, + "grad_norm": 0.35620009899139404, + "learning_rate": 4.5442383160899035e-05, + "loss": 0.2702, + "step": 5095 + }, + { + "epoch": 0.09089287625298755, + "grad_norm": 0.32059526443481445, + "learning_rate": 4.545130217623975e-05, + "loss": 0.2848, + "step": 5096 + }, + { + "epoch": 0.09091071237470125, + "grad_norm": 0.29133662581443787, + "learning_rate": 4.5460221191580455e-05, + "loss": 0.2701, + "step": 5097 + }, + { + "epoch": 0.09092854849641493, + "grad_norm": 0.31909459829330444, + "learning_rate": 4.5469140206921154e-05, + "loss": 0.2644, + "step": 5098 + }, + { + "epoch": 0.09094638461812864, + "grad_norm": 0.8448095917701721, + "learning_rate": 4.547805922226187e-05, + "loss": 0.3252, + "step": 5099 + }, + { + "epoch": 0.09096422073984232, + "grad_norm": 0.3424850404262543, + "learning_rate": 4.5486978237602574e-05, + "loss": 0.311, + "step": 5100 + }, + { + "epoch": 0.09098205686155603, + "grad_norm": 0.2732887864112854, + "learning_rate": 4.5495897252943274e-05, + "loss": 0.2886, + "step": 5101 + }, + { + "epoch": 0.09099989298326971, + "grad_norm": 0.265444815158844, + "learning_rate": 4.550481626828399e-05, + "loss": 0.2267, + "step": 5102 + }, + { + "epoch": 0.09101772910498342, + "grad_norm": 0.4205586314201355, + "learning_rate": 4.551373528362469e-05, + "loss": 0.322, + "step": 5103 + }, + { + "epoch": 0.0910355652266971, + "grad_norm": 0.3834914267063141, + "learning_rate": 4.552265429896539e-05, + "loss": 0.3337, + "step": 5104 + }, + { + "epoch": 0.0910534013484108, + "grad_norm": 0.2619171142578125, + "learning_rate": 4.5531573314306106e-05, + "loss": 0.2565, + "step": 5105 + }, + { + "epoch": 0.0910712374701245, + "grad_norm": 0.3529984951019287, + "learning_rate": 4.554049232964681e-05, + "loss": 0.2569, + "step": 5106 + }, + { + "epoch": 0.0910890735918382, + "grad_norm": 0.32578882575035095, + "learning_rate": 4.554941134498751e-05, + "loss": 0.2809, + "step": 5107 + }, + { + "epoch": 0.09110690971355188, + "grad_norm": 0.3088315427303314, + "learning_rate": 4.5558330360328225e-05, + "loss": 0.2674, + "step": 5108 + }, + { + "epoch": 0.09112474583526559, + "grad_norm": 0.3643244504928589, + "learning_rate": 4.556724937566893e-05, + "loss": 0.2374, + "step": 5109 + }, + { + "epoch": 0.09114258195697927, + "grad_norm": 0.3342103958129883, + "learning_rate": 4.557616839100963e-05, + "loss": 0.2853, + "step": 5110 + }, + { + "epoch": 0.09116041807869298, + "grad_norm": 0.2729085087776184, + "learning_rate": 4.5585087406350344e-05, + "loss": 0.2261, + "step": 5111 + }, + { + "epoch": 0.09117825420040666, + "grad_norm": 0.3470782935619354, + "learning_rate": 4.559400642169105e-05, + "loss": 0.2994, + "step": 5112 + }, + { + "epoch": 0.09119609032212035, + "grad_norm": 0.4202685058116913, + "learning_rate": 4.560292543703175e-05, + "loss": 0.2927, + "step": 5113 + }, + { + "epoch": 0.09121392644383405, + "grad_norm": 0.42108237743377686, + "learning_rate": 4.5611844452372456e-05, + "loss": 0.2534, + "step": 5114 + }, + { + "epoch": 0.09123176256554774, + "grad_norm": 0.29236888885498047, + "learning_rate": 4.562076346771317e-05, + "loss": 0.2436, + "step": 5115 + }, + { + "epoch": 0.09124959868726144, + "grad_norm": 0.40555712580680847, + "learning_rate": 4.562968248305387e-05, + "loss": 0.2821, + "step": 5116 + }, + { + "epoch": 0.09126743480897513, + "grad_norm": 0.3334695100784302, + "learning_rate": 4.5638601498394576e-05, + "loss": 0.253, + "step": 5117 + }, + { + "epoch": 0.09128527093068883, + "grad_norm": 0.2780584394931793, + "learning_rate": 4.564752051373529e-05, + "loss": 0.2183, + "step": 5118 + }, + { + "epoch": 0.09130310705240252, + "grad_norm": 0.4002425968647003, + "learning_rate": 4.565643952907599e-05, + "loss": 0.2399, + "step": 5119 + }, + { + "epoch": 0.09132094317411622, + "grad_norm": 0.3527221083641052, + "learning_rate": 4.5665358544416695e-05, + "loss": 0.2548, + "step": 5120 + }, + { + "epoch": 0.09133877929582991, + "grad_norm": 0.5263444185256958, + "learning_rate": 4.567427755975741e-05, + "loss": 0.2404, + "step": 5121 + }, + { + "epoch": 0.09135661541754361, + "grad_norm": 0.4406464695930481, + "learning_rate": 4.5683196575098114e-05, + "loss": 0.2126, + "step": 5122 + }, + { + "epoch": 0.0913744515392573, + "grad_norm": 0.29340827465057373, + "learning_rate": 4.5692115590438814e-05, + "loss": 0.2557, + "step": 5123 + }, + { + "epoch": 0.091392287660971, + "grad_norm": 0.4390156865119934, + "learning_rate": 4.570103460577953e-05, + "loss": 0.3126, + "step": 5124 + }, + { + "epoch": 0.09141012378268469, + "grad_norm": 0.26983797550201416, + "learning_rate": 4.5709953621120233e-05, + "loss": 0.26, + "step": 5125 + }, + { + "epoch": 0.09142795990439839, + "grad_norm": 0.34206798672676086, + "learning_rate": 4.571887263646093e-05, + "loss": 0.2877, + "step": 5126 + }, + { + "epoch": 0.09144579602611208, + "grad_norm": 0.3147221803665161, + "learning_rate": 4.5727791651801646e-05, + "loss": 0.3144, + "step": 5127 + }, + { + "epoch": 0.09146363214782578, + "grad_norm": 0.3480060398578644, + "learning_rate": 4.573671066714235e-05, + "loss": 0.2452, + "step": 5128 + }, + { + "epoch": 0.09148146826953947, + "grad_norm": 0.2871723175048828, + "learning_rate": 4.574562968248305e-05, + "loss": 0.2648, + "step": 5129 + }, + { + "epoch": 0.09149930439125317, + "grad_norm": 0.44824180006980896, + "learning_rate": 4.5754548697823765e-05, + "loss": 0.2682, + "step": 5130 + }, + { + "epoch": 0.09151714051296686, + "grad_norm": 0.2663950026035309, + "learning_rate": 4.576346771316447e-05, + "loss": 0.2481, + "step": 5131 + }, + { + "epoch": 0.09153497663468055, + "grad_norm": 0.33258599042892456, + "learning_rate": 4.577238672850517e-05, + "loss": 0.2881, + "step": 5132 + }, + { + "epoch": 0.09155281275639425, + "grad_norm": 0.33275696635246277, + "learning_rate": 4.5781305743845884e-05, + "loss": 0.1321, + "step": 5133 + }, + { + "epoch": 0.09157064887810794, + "grad_norm": 0.2972774803638458, + "learning_rate": 4.579022475918659e-05, + "loss": 0.2484, + "step": 5134 + }, + { + "epoch": 0.09158848499982164, + "grad_norm": 0.2894943654537201, + "learning_rate": 4.579914377452729e-05, + "loss": 0.2503, + "step": 5135 + }, + { + "epoch": 0.09160632112153533, + "grad_norm": 0.26859593391418457, + "learning_rate": 4.5808062789868004e-05, + "loss": 0.2456, + "step": 5136 + }, + { + "epoch": 0.09162415724324903, + "grad_norm": 0.31814223527908325, + "learning_rate": 4.581698180520871e-05, + "loss": 0.2894, + "step": 5137 + }, + { + "epoch": 0.09164199336496272, + "grad_norm": 0.31929415464401245, + "learning_rate": 4.582590082054941e-05, + "loss": 0.2556, + "step": 5138 + }, + { + "epoch": 0.09165982948667642, + "grad_norm": 0.28758710622787476, + "learning_rate": 4.5834819835890116e-05, + "loss": 0.2454, + "step": 5139 + }, + { + "epoch": 0.09167766560839011, + "grad_norm": 0.3237939178943634, + "learning_rate": 4.584373885123083e-05, + "loss": 0.2807, + "step": 5140 + }, + { + "epoch": 0.09169550173010381, + "grad_norm": 0.434439480304718, + "learning_rate": 4.585265786657153e-05, + "loss": 0.3086, + "step": 5141 + }, + { + "epoch": 0.0917133378518175, + "grad_norm": 0.3564210832118988, + "learning_rate": 4.5861576881912235e-05, + "loss": 0.2419, + "step": 5142 + }, + { + "epoch": 0.0917311739735312, + "grad_norm": 0.3027274012565613, + "learning_rate": 4.587049589725295e-05, + "loss": 0.2107, + "step": 5143 + }, + { + "epoch": 0.09174901009524489, + "grad_norm": 0.3049252927303314, + "learning_rate": 4.5879414912593655e-05, + "loss": 0.2887, + "step": 5144 + }, + { + "epoch": 0.09176684621695859, + "grad_norm": 0.31892675161361694, + "learning_rate": 4.5888333927934354e-05, + "loss": 0.251, + "step": 5145 + }, + { + "epoch": 0.09178468233867228, + "grad_norm": 0.34583544731140137, + "learning_rate": 4.589725294327507e-05, + "loss": 0.2338, + "step": 5146 + }, + { + "epoch": 0.09180251846038598, + "grad_norm": 0.2539772391319275, + "learning_rate": 4.5906171958615774e-05, + "loss": 0.2323, + "step": 5147 + }, + { + "epoch": 0.09182035458209967, + "grad_norm": 0.20981602370738983, + "learning_rate": 4.5915090973956473e-05, + "loss": 0.2041, + "step": 5148 + }, + { + "epoch": 0.09183819070381337, + "grad_norm": 0.32836177945137024, + "learning_rate": 4.5924009989297187e-05, + "loss": 0.262, + "step": 5149 + }, + { + "epoch": 0.09185602682552706, + "grad_norm": 0.3607218265533447, + "learning_rate": 4.593292900463789e-05, + "loss": 0.2704, + "step": 5150 + }, + { + "epoch": 0.09187386294724076, + "grad_norm": 0.33284297585487366, + "learning_rate": 4.594184801997859e-05, + "loss": 0.254, + "step": 5151 + }, + { + "epoch": 0.09189169906895445, + "grad_norm": 0.28724274039268494, + "learning_rate": 4.5950767035319306e-05, + "loss": 0.2936, + "step": 5152 + }, + { + "epoch": 0.09190953519066813, + "grad_norm": 0.3357253968715668, + "learning_rate": 4.595968605066001e-05, + "loss": 0.2872, + "step": 5153 + }, + { + "epoch": 0.09192737131238184, + "grad_norm": 0.28800761699676514, + "learning_rate": 4.596860506600071e-05, + "loss": 0.2446, + "step": 5154 + }, + { + "epoch": 0.09194520743409552, + "grad_norm": 0.41326186060905457, + "learning_rate": 4.5977524081341425e-05, + "loss": 0.2245, + "step": 5155 + }, + { + "epoch": 0.09196304355580923, + "grad_norm": 0.3237703740596771, + "learning_rate": 4.598644309668213e-05, + "loss": 0.292, + "step": 5156 + }, + { + "epoch": 0.09198087967752291, + "grad_norm": 0.33980509638786316, + "learning_rate": 4.599536211202283e-05, + "loss": 0.2641, + "step": 5157 + }, + { + "epoch": 0.09199871579923662, + "grad_norm": 0.4123384356498718, + "learning_rate": 4.6004281127363544e-05, + "loss": 0.2272, + "step": 5158 + }, + { + "epoch": 0.0920165519209503, + "grad_norm": 0.30670472979545593, + "learning_rate": 4.601320014270425e-05, + "loss": 0.2422, + "step": 5159 + }, + { + "epoch": 0.092034388042664, + "grad_norm": 0.31674468517303467, + "learning_rate": 4.602211915804495e-05, + "loss": 0.2867, + "step": 5160 + }, + { + "epoch": 0.0920522241643777, + "grad_norm": 0.3755401074886322, + "learning_rate": 4.603103817338566e-05, + "loss": 0.317, + "step": 5161 + }, + { + "epoch": 0.0920700602860914, + "grad_norm": 0.4044760763645172, + "learning_rate": 4.603995718872637e-05, + "loss": 0.3102, + "step": 5162 + }, + { + "epoch": 0.09208789640780508, + "grad_norm": 0.2901434600353241, + "learning_rate": 4.604887620406707e-05, + "loss": 0.2637, + "step": 5163 + }, + { + "epoch": 0.09210573252951879, + "grad_norm": 0.2719894349575043, + "learning_rate": 4.6057795219407775e-05, + "loss": 0.2421, + "step": 5164 + }, + { + "epoch": 0.09212356865123247, + "grad_norm": 0.27546483278274536, + "learning_rate": 4.606671423474849e-05, + "loss": 0.2401, + "step": 5165 + }, + { + "epoch": 0.09214140477294618, + "grad_norm": 0.3608858585357666, + "learning_rate": 4.607563325008919e-05, + "loss": 0.3028, + "step": 5166 + }, + { + "epoch": 0.09215924089465986, + "grad_norm": 0.2674354016780853, + "learning_rate": 4.6084552265429895e-05, + "loss": 0.2262, + "step": 5167 + }, + { + "epoch": 0.09217707701637357, + "grad_norm": 0.41327106952667236, + "learning_rate": 4.609347128077061e-05, + "loss": 0.2814, + "step": 5168 + }, + { + "epoch": 0.09219491313808725, + "grad_norm": 0.31517377495765686, + "learning_rate": 4.6102390296111314e-05, + "loss": 0.2511, + "step": 5169 + }, + { + "epoch": 0.09221274925980096, + "grad_norm": 0.2404153048992157, + "learning_rate": 4.6111309311452014e-05, + "loss": 0.2113, + "step": 5170 + }, + { + "epoch": 0.09223058538151464, + "grad_norm": 0.30768147110939026, + "learning_rate": 4.612022832679273e-05, + "loss": 0.2614, + "step": 5171 + }, + { + "epoch": 0.09224842150322833, + "grad_norm": 0.26597127318382263, + "learning_rate": 4.612914734213343e-05, + "loss": 0.2392, + "step": 5172 + }, + { + "epoch": 0.09226625762494203, + "grad_norm": 0.43181541562080383, + "learning_rate": 4.613806635747413e-05, + "loss": 0.2166, + "step": 5173 + }, + { + "epoch": 0.09228409374665572, + "grad_norm": 0.3380992114543915, + "learning_rate": 4.6146985372814846e-05, + "loss": 0.2685, + "step": 5174 + }, + { + "epoch": 0.09230192986836942, + "grad_norm": 0.32007479667663574, + "learning_rate": 4.615590438815555e-05, + "loss": 0.319, + "step": 5175 + }, + { + "epoch": 0.09231976599008311, + "grad_norm": 0.4780307710170746, + "learning_rate": 4.616482340349625e-05, + "loss": 0.2976, + "step": 5176 + }, + { + "epoch": 0.09233760211179681, + "grad_norm": 0.37829354405403137, + "learning_rate": 4.6173742418836965e-05, + "loss": 0.2676, + "step": 5177 + }, + { + "epoch": 0.0923554382335105, + "grad_norm": 0.34576961398124695, + "learning_rate": 4.618266143417767e-05, + "loss": 0.2991, + "step": 5178 + }, + { + "epoch": 0.0923732743552242, + "grad_norm": 0.3248400390148163, + "learning_rate": 4.619158044951837e-05, + "loss": 0.2702, + "step": 5179 + }, + { + "epoch": 0.09239111047693789, + "grad_norm": 0.3472265601158142, + "learning_rate": 4.6200499464859084e-05, + "loss": 0.2362, + "step": 5180 + }, + { + "epoch": 0.09240894659865159, + "grad_norm": 0.3690730035305023, + "learning_rate": 4.620941848019979e-05, + "loss": 0.2594, + "step": 5181 + }, + { + "epoch": 0.09242678272036528, + "grad_norm": 0.4123768210411072, + "learning_rate": 4.621833749554049e-05, + "loss": 0.3584, + "step": 5182 + }, + { + "epoch": 0.09244461884207898, + "grad_norm": 0.34526124596595764, + "learning_rate": 4.6227256510881203e-05, + "loss": 0.3334, + "step": 5183 + }, + { + "epoch": 0.09246245496379267, + "grad_norm": 0.4360329508781433, + "learning_rate": 4.623617552622191e-05, + "loss": 0.1976, + "step": 5184 + }, + { + "epoch": 0.09248029108550637, + "grad_norm": 0.27596762776374817, + "learning_rate": 4.624509454156261e-05, + "loss": 0.2612, + "step": 5185 + }, + { + "epoch": 0.09249812720722006, + "grad_norm": 0.3860202431678772, + "learning_rate": 4.625401355690332e-05, + "loss": 0.2488, + "step": 5186 + }, + { + "epoch": 0.09251596332893376, + "grad_norm": 0.3286002576351166, + "learning_rate": 4.626293257224403e-05, + "loss": 0.2386, + "step": 5187 + }, + { + "epoch": 0.09253379945064745, + "grad_norm": 0.6195679903030396, + "learning_rate": 4.627185158758473e-05, + "loss": 0.2499, + "step": 5188 + }, + { + "epoch": 0.09255163557236115, + "grad_norm": 0.30288946628570557, + "learning_rate": 4.628077060292544e-05, + "loss": 0.2776, + "step": 5189 + }, + { + "epoch": 0.09256947169407484, + "grad_norm": 0.43319621682167053, + "learning_rate": 4.628968961826615e-05, + "loss": 0.3176, + "step": 5190 + }, + { + "epoch": 0.09258730781578854, + "grad_norm": 0.2994520664215088, + "learning_rate": 4.6298608633606855e-05, + "loss": 0.276, + "step": 5191 + }, + { + "epoch": 0.09260514393750223, + "grad_norm": 0.3264714479446411, + "learning_rate": 4.6307527648947554e-05, + "loss": 0.327, + "step": 5192 + }, + { + "epoch": 0.09262298005921592, + "grad_norm": 0.4793960154056549, + "learning_rate": 4.631644666428827e-05, + "loss": 0.2698, + "step": 5193 + }, + { + "epoch": 0.09264081618092962, + "grad_norm": 0.3602599501609802, + "learning_rate": 4.6325365679628974e-05, + "loss": 0.2978, + "step": 5194 + }, + { + "epoch": 0.09265865230264331, + "grad_norm": 0.2885452210903168, + "learning_rate": 4.633428469496967e-05, + "loss": 0.2504, + "step": 5195 + }, + { + "epoch": 0.09267648842435701, + "grad_norm": 0.3294510245323181, + "learning_rate": 4.6343203710310386e-05, + "loss": 0.2831, + "step": 5196 + }, + { + "epoch": 0.0926943245460707, + "grad_norm": 0.25922679901123047, + "learning_rate": 4.635212272565109e-05, + "loss": 0.2595, + "step": 5197 + }, + { + "epoch": 0.0927121606677844, + "grad_norm": 0.25883087515830994, + "learning_rate": 4.636104174099179e-05, + "loss": 0.2436, + "step": 5198 + }, + { + "epoch": 0.09272999678949809, + "grad_norm": 0.35774874687194824, + "learning_rate": 4.6369960756332506e-05, + "loss": 0.2953, + "step": 5199 + }, + { + "epoch": 0.09274783291121179, + "grad_norm": 0.39790821075439453, + "learning_rate": 4.637887977167321e-05, + "loss": 0.288, + "step": 5200 + }, + { + "epoch": 0.09276566903292548, + "grad_norm": 0.3712117373943329, + "learning_rate": 4.638779878701391e-05, + "loss": 0.29, + "step": 5201 + }, + { + "epoch": 0.09278350515463918, + "grad_norm": 0.22073934972286224, + "learning_rate": 4.6396717802354625e-05, + "loss": 0.235, + "step": 5202 + }, + { + "epoch": 0.09280134127635287, + "grad_norm": 0.4138656258583069, + "learning_rate": 4.640563681769533e-05, + "loss": 0.2835, + "step": 5203 + }, + { + "epoch": 0.09281917739806657, + "grad_norm": 0.3466394543647766, + "learning_rate": 4.641455583303603e-05, + "loss": 0.2616, + "step": 5204 + }, + { + "epoch": 0.09283701351978026, + "grad_norm": 0.27189114689826965, + "learning_rate": 4.6423474848376744e-05, + "loss": 0.2538, + "step": 5205 + }, + { + "epoch": 0.09285484964149396, + "grad_norm": 0.2862494885921478, + "learning_rate": 4.643239386371745e-05, + "loss": 0.2021, + "step": 5206 + }, + { + "epoch": 0.09287268576320765, + "grad_norm": 0.3478764593601227, + "learning_rate": 4.644131287905815e-05, + "loss": 0.2739, + "step": 5207 + }, + { + "epoch": 0.09289052188492135, + "grad_norm": 0.4753137528896332, + "learning_rate": 4.645023189439886e-05, + "loss": 0.35, + "step": 5208 + }, + { + "epoch": 0.09290835800663504, + "grad_norm": 0.263412207365036, + "learning_rate": 4.645915090973957e-05, + "loss": 0.2261, + "step": 5209 + }, + { + "epoch": 0.09292619412834874, + "grad_norm": 0.33546873927116394, + "learning_rate": 4.646806992508027e-05, + "loss": 0.2356, + "step": 5210 + }, + { + "epoch": 0.09294403025006243, + "grad_norm": 0.4098803997039795, + "learning_rate": 4.647698894042098e-05, + "loss": 0.28, + "step": 5211 + }, + { + "epoch": 0.09296186637177613, + "grad_norm": 0.37068724632263184, + "learning_rate": 4.648590795576169e-05, + "loss": 0.312, + "step": 5212 + }, + { + "epoch": 0.09297970249348982, + "grad_norm": 0.2915990948677063, + "learning_rate": 4.649482697110239e-05, + "loss": 0.2282, + "step": 5213 + }, + { + "epoch": 0.0929975386152035, + "grad_norm": 0.425316721200943, + "learning_rate": 4.65037459864431e-05, + "loss": 0.2766, + "step": 5214 + }, + { + "epoch": 0.0930153747369172, + "grad_norm": 0.2935897707939148, + "learning_rate": 4.651266500178381e-05, + "loss": 0.2302, + "step": 5215 + }, + { + "epoch": 0.0930332108586309, + "grad_norm": 0.3157883584499359, + "learning_rate": 4.6521584017124514e-05, + "loss": 0.2216, + "step": 5216 + }, + { + "epoch": 0.0930510469803446, + "grad_norm": 0.2781791090965271, + "learning_rate": 4.6530503032465214e-05, + "loss": 0.243, + "step": 5217 + }, + { + "epoch": 0.09306888310205828, + "grad_norm": 0.25051501393318176, + "learning_rate": 4.653942204780593e-05, + "loss": 0.211, + "step": 5218 + }, + { + "epoch": 0.09308671922377199, + "grad_norm": 0.3523801565170288, + "learning_rate": 4.654834106314663e-05, + "loss": 0.2804, + "step": 5219 + }, + { + "epoch": 0.09310455534548567, + "grad_norm": 0.29995816946029663, + "learning_rate": 4.655726007848733e-05, + "loss": 0.2772, + "step": 5220 + }, + { + "epoch": 0.09312239146719938, + "grad_norm": 0.35171619057655334, + "learning_rate": 4.6566179093828046e-05, + "loss": 0.2686, + "step": 5221 + }, + { + "epoch": 0.09314022758891306, + "grad_norm": 0.37922653555870056, + "learning_rate": 4.657509810916875e-05, + "loss": 0.2992, + "step": 5222 + }, + { + "epoch": 0.09315806371062677, + "grad_norm": 0.3115438222885132, + "learning_rate": 4.658401712450945e-05, + "loss": 0.2606, + "step": 5223 + }, + { + "epoch": 0.09317589983234045, + "grad_norm": 0.3204931616783142, + "learning_rate": 4.6592936139850165e-05, + "loss": 0.2658, + "step": 5224 + }, + { + "epoch": 0.09319373595405415, + "grad_norm": 0.38351380825042725, + "learning_rate": 4.660185515519087e-05, + "loss": 0.239, + "step": 5225 + }, + { + "epoch": 0.09321157207576784, + "grad_norm": 0.3427961766719818, + "learning_rate": 4.661077417053157e-05, + "loss": 0.2878, + "step": 5226 + }, + { + "epoch": 0.09322940819748154, + "grad_norm": 0.3301418721675873, + "learning_rate": 4.6619693185872284e-05, + "loss": 0.2721, + "step": 5227 + }, + { + "epoch": 0.09324724431919523, + "grad_norm": 0.5384332537651062, + "learning_rate": 4.662861220121299e-05, + "loss": 0.3164, + "step": 5228 + }, + { + "epoch": 0.09326508044090893, + "grad_norm": 0.2636808753013611, + "learning_rate": 4.663753121655369e-05, + "loss": 0.2435, + "step": 5229 + }, + { + "epoch": 0.09328291656262262, + "grad_norm": 0.4945782423019409, + "learning_rate": 4.66464502318944e-05, + "loss": 0.2538, + "step": 5230 + }, + { + "epoch": 0.09330075268433632, + "grad_norm": 0.25015437602996826, + "learning_rate": 4.665536924723511e-05, + "loss": 0.2274, + "step": 5231 + }, + { + "epoch": 0.09331858880605001, + "grad_norm": 0.32471516728401184, + "learning_rate": 4.666428826257581e-05, + "loss": 0.2635, + "step": 5232 + }, + { + "epoch": 0.0933364249277637, + "grad_norm": 0.2583746314048767, + "learning_rate": 4.667320727791652e-05, + "loss": 0.2349, + "step": 5233 + }, + { + "epoch": 0.0933542610494774, + "grad_norm": 0.3352038264274597, + "learning_rate": 4.668212629325723e-05, + "loss": 0.2629, + "step": 5234 + }, + { + "epoch": 0.09337209717119109, + "grad_norm": 0.3794134259223938, + "learning_rate": 4.669104530859793e-05, + "loss": 0.3188, + "step": 5235 + }, + { + "epoch": 0.09338993329290479, + "grad_norm": 0.2985273003578186, + "learning_rate": 4.669996432393864e-05, + "loss": 0.2435, + "step": 5236 + }, + { + "epoch": 0.09340776941461848, + "grad_norm": 0.2983872592449188, + "learning_rate": 4.670888333927935e-05, + "loss": 0.2601, + "step": 5237 + }, + { + "epoch": 0.09342560553633218, + "grad_norm": 0.3920135498046875, + "learning_rate": 4.6717802354620054e-05, + "loss": 0.2733, + "step": 5238 + }, + { + "epoch": 0.09344344165804587, + "grad_norm": 0.3617052137851715, + "learning_rate": 4.672672136996076e-05, + "loss": 0.2423, + "step": 5239 + }, + { + "epoch": 0.09346127777975957, + "grad_norm": 0.3396837115287781, + "learning_rate": 4.673564038530147e-05, + "loss": 0.2946, + "step": 5240 + }, + { + "epoch": 0.09347911390147326, + "grad_norm": 0.2999764680862427, + "learning_rate": 4.6744559400642174e-05, + "loss": 0.2479, + "step": 5241 + }, + { + "epoch": 0.09349695002318696, + "grad_norm": 0.3175443112850189, + "learning_rate": 4.675347841598287e-05, + "loss": 0.2647, + "step": 5242 + }, + { + "epoch": 0.09351478614490065, + "grad_norm": 0.2902558445930481, + "learning_rate": 4.6762397431323586e-05, + "loss": 0.2912, + "step": 5243 + }, + { + "epoch": 0.09353262226661435, + "grad_norm": 0.2970888614654541, + "learning_rate": 4.677131644666429e-05, + "loss": 0.2887, + "step": 5244 + }, + { + "epoch": 0.09355045838832804, + "grad_norm": 0.26533448696136475, + "learning_rate": 4.678023546200499e-05, + "loss": 0.2495, + "step": 5245 + }, + { + "epoch": 0.09356829451004174, + "grad_norm": 0.42240414023399353, + "learning_rate": 4.6789154477345705e-05, + "loss": 0.2821, + "step": 5246 + }, + { + "epoch": 0.09358613063175543, + "grad_norm": 0.34711533784866333, + "learning_rate": 4.679807349268641e-05, + "loss": 0.3043, + "step": 5247 + }, + { + "epoch": 0.09360396675346913, + "grad_norm": 0.3298856317996979, + "learning_rate": 4.680699250802711e-05, + "loss": 0.3044, + "step": 5248 + }, + { + "epoch": 0.09362180287518282, + "grad_norm": 0.4107668101787567, + "learning_rate": 4.6815911523367825e-05, + "loss": 0.2271, + "step": 5249 + }, + { + "epoch": 0.09363963899689652, + "grad_norm": 0.3656250238418579, + "learning_rate": 4.682483053870853e-05, + "loss": 0.2262, + "step": 5250 + }, + { + "epoch": 0.09365747511861021, + "grad_norm": 0.2461140900850296, + "learning_rate": 4.683374955404923e-05, + "loss": 0.2149, + "step": 5251 + }, + { + "epoch": 0.09367531124032391, + "grad_norm": 0.3072248101234436, + "learning_rate": 4.6842668569389944e-05, + "loss": 0.255, + "step": 5252 + }, + { + "epoch": 0.0936931473620376, + "grad_norm": 0.27899712324142456, + "learning_rate": 4.685158758473065e-05, + "loss": 0.2694, + "step": 5253 + }, + { + "epoch": 0.09371098348375129, + "grad_norm": 0.23378194868564606, + "learning_rate": 4.686050660007135e-05, + "loss": 0.2397, + "step": 5254 + }, + { + "epoch": 0.09372881960546499, + "grad_norm": 0.3421534597873688, + "learning_rate": 4.686942561541206e-05, + "loss": 0.2887, + "step": 5255 + }, + { + "epoch": 0.09374665572717868, + "grad_norm": 0.2325400561094284, + "learning_rate": 4.687834463075277e-05, + "loss": 0.2041, + "step": 5256 + }, + { + "epoch": 0.09376449184889238, + "grad_norm": 0.3640061318874359, + "learning_rate": 4.688726364609347e-05, + "loss": 0.2629, + "step": 5257 + }, + { + "epoch": 0.09378232797060607, + "grad_norm": 0.41511374711990356, + "learning_rate": 4.689618266143418e-05, + "loss": 0.3519, + "step": 5258 + }, + { + "epoch": 0.09380016409231977, + "grad_norm": 0.30702728033065796, + "learning_rate": 4.690510167677489e-05, + "loss": 0.2396, + "step": 5259 + }, + { + "epoch": 0.09381800021403346, + "grad_norm": 0.3504076600074768, + "learning_rate": 4.691402069211559e-05, + "loss": 0.2682, + "step": 5260 + }, + { + "epoch": 0.09383583633574716, + "grad_norm": 0.4777551293373108, + "learning_rate": 4.69229397074563e-05, + "loss": 0.314, + "step": 5261 + }, + { + "epoch": 0.09385367245746085, + "grad_norm": 0.32501375675201416, + "learning_rate": 4.693185872279701e-05, + "loss": 0.2605, + "step": 5262 + }, + { + "epoch": 0.09387150857917455, + "grad_norm": 0.3325204849243164, + "learning_rate": 4.6940777738137714e-05, + "loss": 0.2222, + "step": 5263 + }, + { + "epoch": 0.09388934470088824, + "grad_norm": 0.2469899207353592, + "learning_rate": 4.694969675347842e-05, + "loss": 0.22, + "step": 5264 + }, + { + "epoch": 0.09390718082260194, + "grad_norm": 0.30303481221199036, + "learning_rate": 4.695861576881913e-05, + "loss": 0.253, + "step": 5265 + }, + { + "epoch": 0.09392501694431563, + "grad_norm": 0.25014814734458923, + "learning_rate": 4.696753478415983e-05, + "loss": 0.2438, + "step": 5266 + }, + { + "epoch": 0.09394285306602933, + "grad_norm": 0.3906334936618805, + "learning_rate": 4.697645379950053e-05, + "loss": 0.3427, + "step": 5267 + }, + { + "epoch": 0.09396068918774302, + "grad_norm": 0.32432055473327637, + "learning_rate": 4.6985372814841246e-05, + "loss": 0.2924, + "step": 5268 + }, + { + "epoch": 0.09397852530945672, + "grad_norm": 0.34173494577407837, + "learning_rate": 4.699429183018195e-05, + "loss": 0.2475, + "step": 5269 + }, + { + "epoch": 0.0939963614311704, + "grad_norm": 0.3409624397754669, + "learning_rate": 4.700321084552265e-05, + "loss": 0.2213, + "step": 5270 + }, + { + "epoch": 0.09401419755288411, + "grad_norm": 0.29635345935821533, + "learning_rate": 4.7012129860863365e-05, + "loss": 0.2292, + "step": 5271 + }, + { + "epoch": 0.0940320336745978, + "grad_norm": 0.2836691737174988, + "learning_rate": 4.702104887620407e-05, + "loss": 0.252, + "step": 5272 + }, + { + "epoch": 0.09404986979631148, + "grad_norm": 0.31412312388420105, + "learning_rate": 4.702996789154477e-05, + "loss": 0.2582, + "step": 5273 + }, + { + "epoch": 0.09406770591802519, + "grad_norm": 0.45611318945884705, + "learning_rate": 4.7038886906885484e-05, + "loss": 0.2486, + "step": 5274 + }, + { + "epoch": 0.09408554203973887, + "grad_norm": 0.30665716528892517, + "learning_rate": 4.704780592222619e-05, + "loss": 0.2198, + "step": 5275 + }, + { + "epoch": 0.09410337816145257, + "grad_norm": 0.3930690884590149, + "learning_rate": 4.705672493756689e-05, + "loss": 0.2439, + "step": 5276 + }, + { + "epoch": 0.09412121428316626, + "grad_norm": 0.2558427155017853, + "learning_rate": 4.70656439529076e-05, + "loss": 0.2428, + "step": 5277 + }, + { + "epoch": 0.09413905040487996, + "grad_norm": 0.3950040340423584, + "learning_rate": 4.707456296824831e-05, + "loss": 0.2951, + "step": 5278 + }, + { + "epoch": 0.09415688652659365, + "grad_norm": 0.41172513365745544, + "learning_rate": 4.708348198358901e-05, + "loss": 0.3457, + "step": 5279 + }, + { + "epoch": 0.09417472264830735, + "grad_norm": 0.31354084610939026, + "learning_rate": 4.709240099892972e-05, + "loss": 0.3089, + "step": 5280 + }, + { + "epoch": 0.09419255877002104, + "grad_norm": 0.26249364018440247, + "learning_rate": 4.710132001427043e-05, + "loss": 0.2708, + "step": 5281 + }, + { + "epoch": 0.09421039489173474, + "grad_norm": 0.3628444969654083, + "learning_rate": 4.711023902961113e-05, + "loss": 0.3027, + "step": 5282 + }, + { + "epoch": 0.09422823101344843, + "grad_norm": 0.40232929587364197, + "learning_rate": 4.711915804495184e-05, + "loss": 0.2462, + "step": 5283 + }, + { + "epoch": 0.09424606713516213, + "grad_norm": 0.4444749653339386, + "learning_rate": 4.712807706029255e-05, + "loss": 0.2286, + "step": 5284 + }, + { + "epoch": 0.09426390325687582, + "grad_norm": 0.31657105684280396, + "learning_rate": 4.713699607563325e-05, + "loss": 0.2834, + "step": 5285 + }, + { + "epoch": 0.09428173937858952, + "grad_norm": 0.36973780393600464, + "learning_rate": 4.714591509097396e-05, + "loss": 0.2961, + "step": 5286 + }, + { + "epoch": 0.09429957550030321, + "grad_norm": 0.28114810585975647, + "learning_rate": 4.715483410631467e-05, + "loss": 0.2506, + "step": 5287 + }, + { + "epoch": 0.09431741162201691, + "grad_norm": 0.3041308522224426, + "learning_rate": 4.7163753121655373e-05, + "loss": 0.2832, + "step": 5288 + }, + { + "epoch": 0.0943352477437306, + "grad_norm": 0.3077862560749054, + "learning_rate": 4.717267213699608e-05, + "loss": 0.2693, + "step": 5289 + }, + { + "epoch": 0.0943530838654443, + "grad_norm": 0.2772987484931946, + "learning_rate": 4.7181591152336786e-05, + "loss": 0.2663, + "step": 5290 + }, + { + "epoch": 0.09437091998715799, + "grad_norm": 0.3235217332839966, + "learning_rate": 4.719051016767749e-05, + "loss": 0.2553, + "step": 5291 + }, + { + "epoch": 0.0943887561088717, + "grad_norm": 0.3222964406013489, + "learning_rate": 4.71994291830182e-05, + "loss": 0.228, + "step": 5292 + }, + { + "epoch": 0.09440659223058538, + "grad_norm": 0.3652717173099518, + "learning_rate": 4.7208348198358905e-05, + "loss": 0.3347, + "step": 5293 + }, + { + "epoch": 0.09442442835229907, + "grad_norm": 0.3086948096752167, + "learning_rate": 4.721726721369961e-05, + "loss": 0.2518, + "step": 5294 + }, + { + "epoch": 0.09444226447401277, + "grad_norm": 0.3053515553474426, + "learning_rate": 4.722618622904031e-05, + "loss": 0.2566, + "step": 5295 + }, + { + "epoch": 0.09446010059572646, + "grad_norm": 0.3497569262981415, + "learning_rate": 4.7235105244381024e-05, + "loss": 0.2766, + "step": 5296 + }, + { + "epoch": 0.09447793671744016, + "grad_norm": 0.2803959846496582, + "learning_rate": 4.724402425972173e-05, + "loss": 0.2321, + "step": 5297 + }, + { + "epoch": 0.09449577283915385, + "grad_norm": 0.2764897346496582, + "learning_rate": 4.725294327506243e-05, + "loss": 0.2117, + "step": 5298 + }, + { + "epoch": 0.09451360896086755, + "grad_norm": 0.4972369074821472, + "learning_rate": 4.7261862290403144e-05, + "loss": 0.2178, + "step": 5299 + }, + { + "epoch": 0.09453144508258124, + "grad_norm": 0.31249862909317017, + "learning_rate": 4.727078130574385e-05, + "loss": 0.2337, + "step": 5300 + }, + { + "epoch": 0.09454928120429494, + "grad_norm": 0.39970827102661133, + "learning_rate": 4.727970032108455e-05, + "loss": 0.2638, + "step": 5301 + }, + { + "epoch": 0.09456711732600863, + "grad_norm": 0.28730496764183044, + "learning_rate": 4.728861933642526e-05, + "loss": 0.2407, + "step": 5302 + }, + { + "epoch": 0.09458495344772233, + "grad_norm": 0.3620060384273529, + "learning_rate": 4.729753835176597e-05, + "loss": 0.2429, + "step": 5303 + }, + { + "epoch": 0.09460278956943602, + "grad_norm": 0.281197726726532, + "learning_rate": 4.730645736710667e-05, + "loss": 0.1939, + "step": 5304 + }, + { + "epoch": 0.09462062569114972, + "grad_norm": 0.3197873830795288, + "learning_rate": 4.731537638244738e-05, + "loss": 0.2763, + "step": 5305 + }, + { + "epoch": 0.09463846181286341, + "grad_norm": 0.23553217947483063, + "learning_rate": 4.732429539778809e-05, + "loss": 0.2014, + "step": 5306 + }, + { + "epoch": 0.09465629793457711, + "grad_norm": 0.355283260345459, + "learning_rate": 4.733321441312879e-05, + "loss": 0.2888, + "step": 5307 + }, + { + "epoch": 0.0946741340562908, + "grad_norm": 0.34615418314933777, + "learning_rate": 4.73421334284695e-05, + "loss": 0.2915, + "step": 5308 + }, + { + "epoch": 0.0946919701780045, + "grad_norm": 0.3840189278125763, + "learning_rate": 4.735105244381021e-05, + "loss": 0.2847, + "step": 5309 + }, + { + "epoch": 0.09470980629971819, + "grad_norm": 0.37069171667099, + "learning_rate": 4.7359971459150914e-05, + "loss": 0.2917, + "step": 5310 + }, + { + "epoch": 0.09472764242143189, + "grad_norm": 0.27350127696990967, + "learning_rate": 4.736889047449162e-05, + "loss": 0.2408, + "step": 5311 + }, + { + "epoch": 0.09474547854314558, + "grad_norm": 0.31825244426727295, + "learning_rate": 4.7377809489832327e-05, + "loss": 0.2986, + "step": 5312 + }, + { + "epoch": 0.09476331466485927, + "grad_norm": 0.524906575679779, + "learning_rate": 4.738672850517303e-05, + "loss": 0.2627, + "step": 5313 + }, + { + "epoch": 0.09478115078657297, + "grad_norm": 0.3162332773208618, + "learning_rate": 4.739564752051374e-05, + "loss": 0.2529, + "step": 5314 + }, + { + "epoch": 0.09479898690828666, + "grad_norm": 0.3042309582233429, + "learning_rate": 4.7404566535854446e-05, + "loss": 0.2023, + "step": 5315 + }, + { + "epoch": 0.09481682303000036, + "grad_norm": 0.27491849660873413, + "learning_rate": 4.741348555119515e-05, + "loss": 0.2531, + "step": 5316 + }, + { + "epoch": 0.09483465915171405, + "grad_norm": 0.2788602411746979, + "learning_rate": 4.742240456653586e-05, + "loss": 0.2505, + "step": 5317 + }, + { + "epoch": 0.09485249527342775, + "grad_norm": 0.23961052298545837, + "learning_rate": 4.7431323581876565e-05, + "loss": 0.2002, + "step": 5318 + }, + { + "epoch": 0.09487033139514144, + "grad_norm": 0.27642861008644104, + "learning_rate": 4.744024259721727e-05, + "loss": 0.2329, + "step": 5319 + }, + { + "epoch": 0.09488816751685514, + "grad_norm": 0.21278324723243713, + "learning_rate": 4.744916161255797e-05, + "loss": 0.2096, + "step": 5320 + }, + { + "epoch": 0.09490600363856883, + "grad_norm": 0.24676239490509033, + "learning_rate": 4.7458080627898684e-05, + "loss": 0.2357, + "step": 5321 + }, + { + "epoch": 0.09492383976028253, + "grad_norm": 0.2675091326236725, + "learning_rate": 4.746699964323939e-05, + "loss": 0.2178, + "step": 5322 + }, + { + "epoch": 0.09494167588199622, + "grad_norm": 0.26896363496780396, + "learning_rate": 4.747591865858009e-05, + "loss": 0.228, + "step": 5323 + }, + { + "epoch": 0.09495951200370992, + "grad_norm": 0.3176270127296448, + "learning_rate": 4.74848376739208e-05, + "loss": 0.2915, + "step": 5324 + }, + { + "epoch": 0.0949773481254236, + "grad_norm": 0.2676611840724945, + "learning_rate": 4.749375668926151e-05, + "loss": 0.2518, + "step": 5325 + }, + { + "epoch": 0.09499518424713731, + "grad_norm": 0.4852003753185272, + "learning_rate": 4.750267570460221e-05, + "loss": 0.3007, + "step": 5326 + }, + { + "epoch": 0.095013020368851, + "grad_norm": 0.32852569222450256, + "learning_rate": 4.751159471994292e-05, + "loss": 0.2334, + "step": 5327 + }, + { + "epoch": 0.0950308564905647, + "grad_norm": 0.33095842599868774, + "learning_rate": 4.752051373528363e-05, + "loss": 0.2846, + "step": 5328 + }, + { + "epoch": 0.09504869261227838, + "grad_norm": 0.2583049535751343, + "learning_rate": 4.752943275062433e-05, + "loss": 0.2385, + "step": 5329 + }, + { + "epoch": 0.09506652873399209, + "grad_norm": 0.3359510004520416, + "learning_rate": 4.753835176596504e-05, + "loss": 0.2899, + "step": 5330 + }, + { + "epoch": 0.09508436485570577, + "grad_norm": 0.3022625744342804, + "learning_rate": 4.754727078130575e-05, + "loss": 0.2725, + "step": 5331 + }, + { + "epoch": 0.09510220097741948, + "grad_norm": 0.3416677713394165, + "learning_rate": 4.755618979664645e-05, + "loss": 0.2925, + "step": 5332 + }, + { + "epoch": 0.09512003709913316, + "grad_norm": 0.3762374520301819, + "learning_rate": 4.756510881198716e-05, + "loss": 0.2084, + "step": 5333 + }, + { + "epoch": 0.09513787322084685, + "grad_norm": 0.28740838170051575, + "learning_rate": 4.757402782732787e-05, + "loss": 0.2459, + "step": 5334 + }, + { + "epoch": 0.09515570934256055, + "grad_norm": 0.30601343512535095, + "learning_rate": 4.758294684266857e-05, + "loss": 0.2231, + "step": 5335 + }, + { + "epoch": 0.09517354546427424, + "grad_norm": 0.3672378659248352, + "learning_rate": 4.759186585800928e-05, + "loss": 0.2593, + "step": 5336 + }, + { + "epoch": 0.09519138158598794, + "grad_norm": 0.39896172285079956, + "learning_rate": 4.7600784873349986e-05, + "loss": 0.3436, + "step": 5337 + }, + { + "epoch": 0.09520921770770163, + "grad_norm": 0.22721467912197113, + "learning_rate": 4.760970388869069e-05, + "loss": 0.2066, + "step": 5338 + }, + { + "epoch": 0.09522705382941533, + "grad_norm": 0.22346056997776031, + "learning_rate": 4.76186229040314e-05, + "loss": 0.2144, + "step": 5339 + }, + { + "epoch": 0.09524488995112902, + "grad_norm": 0.29692816734313965, + "learning_rate": 4.7627541919372105e-05, + "loss": 0.2772, + "step": 5340 + }, + { + "epoch": 0.09526272607284272, + "grad_norm": 0.2819810211658478, + "learning_rate": 4.763646093471281e-05, + "loss": 0.2394, + "step": 5341 + }, + { + "epoch": 0.09528056219455641, + "grad_norm": 0.3281337022781372, + "learning_rate": 4.764537995005352e-05, + "loss": 0.2827, + "step": 5342 + }, + { + "epoch": 0.09529839831627011, + "grad_norm": 0.4458293318748474, + "learning_rate": 4.7654298965394224e-05, + "loss": 0.3562, + "step": 5343 + }, + { + "epoch": 0.0953162344379838, + "grad_norm": 0.33938318490982056, + "learning_rate": 4.766321798073493e-05, + "loss": 0.2642, + "step": 5344 + }, + { + "epoch": 0.0953340705596975, + "grad_norm": 0.4199501872062683, + "learning_rate": 4.767213699607563e-05, + "loss": 0.2311, + "step": 5345 + }, + { + "epoch": 0.09535190668141119, + "grad_norm": 0.25305017828941345, + "learning_rate": 4.7681056011416343e-05, + "loss": 0.2398, + "step": 5346 + }, + { + "epoch": 0.0953697428031249, + "grad_norm": 0.2190845012664795, + "learning_rate": 4.768997502675705e-05, + "loss": 0.241, + "step": 5347 + }, + { + "epoch": 0.09538757892483858, + "grad_norm": 0.2902757227420807, + "learning_rate": 4.769889404209775e-05, + "loss": 0.2832, + "step": 5348 + }, + { + "epoch": 0.09540541504655228, + "grad_norm": 0.302707314491272, + "learning_rate": 4.770781305743846e-05, + "loss": 0.2757, + "step": 5349 + }, + { + "epoch": 0.09542325116826597, + "grad_norm": 0.29162052273750305, + "learning_rate": 4.771673207277917e-05, + "loss": 0.2517, + "step": 5350 + }, + { + "epoch": 0.09544108728997967, + "grad_norm": 0.22246886789798737, + "learning_rate": 4.772565108811987e-05, + "loss": 0.2389, + "step": 5351 + }, + { + "epoch": 0.09545892341169336, + "grad_norm": 0.2861518859863281, + "learning_rate": 4.773457010346058e-05, + "loss": 0.1903, + "step": 5352 + }, + { + "epoch": 0.09547675953340706, + "grad_norm": 0.28319862484931946, + "learning_rate": 4.774348911880129e-05, + "loss": 0.1957, + "step": 5353 + }, + { + "epoch": 0.09549459565512075, + "grad_norm": 0.31579911708831787, + "learning_rate": 4.775240813414199e-05, + "loss": 0.2167, + "step": 5354 + }, + { + "epoch": 0.09551243177683444, + "grad_norm": 0.2679722309112549, + "learning_rate": 4.77613271494827e-05, + "loss": 0.2543, + "step": 5355 + }, + { + "epoch": 0.09553026789854814, + "grad_norm": 0.40429264307022095, + "learning_rate": 4.777024616482341e-05, + "loss": 0.2748, + "step": 5356 + }, + { + "epoch": 0.09554810402026183, + "grad_norm": 0.2967761158943176, + "learning_rate": 4.7779165180164114e-05, + "loss": 0.2459, + "step": 5357 + }, + { + "epoch": 0.09556594014197553, + "grad_norm": 0.35104766488075256, + "learning_rate": 4.778808419550482e-05, + "loss": 0.2629, + "step": 5358 + }, + { + "epoch": 0.09558377626368922, + "grad_norm": 0.31640005111694336, + "learning_rate": 4.7797003210845526e-05, + "loss": 0.2828, + "step": 5359 + }, + { + "epoch": 0.09560161238540292, + "grad_norm": 0.32061511278152466, + "learning_rate": 4.780592222618623e-05, + "loss": 0.3031, + "step": 5360 + }, + { + "epoch": 0.09561944850711661, + "grad_norm": 0.31277865171432495, + "learning_rate": 4.781484124152694e-05, + "loss": 0.2654, + "step": 5361 + }, + { + "epoch": 0.09563728462883031, + "grad_norm": 0.26078227162361145, + "learning_rate": 4.7823760256867646e-05, + "loss": 0.2486, + "step": 5362 + }, + { + "epoch": 0.095655120750544, + "grad_norm": 0.3343501389026642, + "learning_rate": 4.783267927220835e-05, + "loss": 0.2137, + "step": 5363 + }, + { + "epoch": 0.0956729568722577, + "grad_norm": 0.35817328095436096, + "learning_rate": 4.784159828754906e-05, + "loss": 0.2698, + "step": 5364 + }, + { + "epoch": 0.09569079299397139, + "grad_norm": 0.3671835958957672, + "learning_rate": 4.7850517302889765e-05, + "loss": 0.3125, + "step": 5365 + }, + { + "epoch": 0.09570862911568509, + "grad_norm": 0.27490052580833435, + "learning_rate": 4.785943631823047e-05, + "loss": 0.2397, + "step": 5366 + }, + { + "epoch": 0.09572646523739878, + "grad_norm": 0.3614984154701233, + "learning_rate": 4.786835533357118e-05, + "loss": 0.285, + "step": 5367 + }, + { + "epoch": 0.09574430135911248, + "grad_norm": 0.3216387927532196, + "learning_rate": 4.7877274348911884e-05, + "loss": 0.2746, + "step": 5368 + }, + { + "epoch": 0.09576213748082617, + "grad_norm": 0.43186280131340027, + "learning_rate": 4.788619336425259e-05, + "loss": 0.3013, + "step": 5369 + }, + { + "epoch": 0.09577997360253987, + "grad_norm": 0.29047054052352905, + "learning_rate": 4.789511237959329e-05, + "loss": 0.2795, + "step": 5370 + }, + { + "epoch": 0.09579780972425356, + "grad_norm": 0.30678704380989075, + "learning_rate": 4.7904031394934e-05, + "loss": 0.2537, + "step": 5371 + }, + { + "epoch": 0.09581564584596726, + "grad_norm": 0.2642018795013428, + "learning_rate": 4.791295041027471e-05, + "loss": 0.2307, + "step": 5372 + }, + { + "epoch": 0.09583348196768095, + "grad_norm": 0.30768072605133057, + "learning_rate": 4.792186942561541e-05, + "loss": 0.3143, + "step": 5373 + }, + { + "epoch": 0.09585131808939464, + "grad_norm": 0.3095901310443878, + "learning_rate": 4.793078844095612e-05, + "loss": 0.2578, + "step": 5374 + }, + { + "epoch": 0.09586915421110834, + "grad_norm": 0.34432873129844666, + "learning_rate": 4.793970745629683e-05, + "loss": 0.2356, + "step": 5375 + }, + { + "epoch": 0.09588699033282203, + "grad_norm": 0.24537573754787445, + "learning_rate": 4.794862647163753e-05, + "loss": 0.2332, + "step": 5376 + }, + { + "epoch": 0.09590482645453573, + "grad_norm": 0.2724122703075409, + "learning_rate": 4.795754548697824e-05, + "loss": 0.2105, + "step": 5377 + }, + { + "epoch": 0.09592266257624942, + "grad_norm": 0.42682817578315735, + "learning_rate": 4.796646450231895e-05, + "loss": 0.2668, + "step": 5378 + }, + { + "epoch": 0.09594049869796312, + "grad_norm": 0.29142510890960693, + "learning_rate": 4.797538351765965e-05, + "loss": 0.2147, + "step": 5379 + }, + { + "epoch": 0.0959583348196768, + "grad_norm": 0.357695996761322, + "learning_rate": 4.798430253300036e-05, + "loss": 0.2597, + "step": 5380 + }, + { + "epoch": 0.0959761709413905, + "grad_norm": 0.35795679688453674, + "learning_rate": 4.799322154834107e-05, + "loss": 0.208, + "step": 5381 + }, + { + "epoch": 0.0959940070631042, + "grad_norm": 0.299063503742218, + "learning_rate": 4.800214056368177e-05, + "loss": 0.2133, + "step": 5382 + }, + { + "epoch": 0.0960118431848179, + "grad_norm": 0.34796178340911865, + "learning_rate": 4.801105957902248e-05, + "loss": 0.2668, + "step": 5383 + }, + { + "epoch": 0.09602967930653158, + "grad_norm": 0.32690906524658203, + "learning_rate": 4.8019978594363186e-05, + "loss": 0.2751, + "step": 5384 + }, + { + "epoch": 0.09604751542824529, + "grad_norm": 0.271119624376297, + "learning_rate": 4.802889760970389e-05, + "loss": 0.2691, + "step": 5385 + }, + { + "epoch": 0.09606535154995897, + "grad_norm": 0.35632744431495667, + "learning_rate": 4.80378166250446e-05, + "loss": 0.3074, + "step": 5386 + }, + { + "epoch": 0.09608318767167268, + "grad_norm": 0.32233545184135437, + "learning_rate": 4.8046735640385305e-05, + "loss": 0.3139, + "step": 5387 + }, + { + "epoch": 0.09610102379338636, + "grad_norm": 0.3475266695022583, + "learning_rate": 4.805565465572601e-05, + "loss": 0.2816, + "step": 5388 + }, + { + "epoch": 0.09611885991510007, + "grad_norm": 0.3758280575275421, + "learning_rate": 4.806457367106672e-05, + "loss": 0.2633, + "step": 5389 + }, + { + "epoch": 0.09613669603681375, + "grad_norm": 0.2562137544155121, + "learning_rate": 4.8073492686407424e-05, + "loss": 0.2342, + "step": 5390 + }, + { + "epoch": 0.09615453215852746, + "grad_norm": 0.25612273812294006, + "learning_rate": 4.808241170174813e-05, + "loss": 0.2334, + "step": 5391 + }, + { + "epoch": 0.09617236828024114, + "grad_norm": 0.22181525826454163, + "learning_rate": 4.809133071708884e-05, + "loss": 0.2256, + "step": 5392 + }, + { + "epoch": 0.09619020440195485, + "grad_norm": 0.28419962525367737, + "learning_rate": 4.810024973242954e-05, + "loss": 0.2522, + "step": 5393 + }, + { + "epoch": 0.09620804052366853, + "grad_norm": 0.2620362639427185, + "learning_rate": 4.810916874777025e-05, + "loss": 0.2352, + "step": 5394 + }, + { + "epoch": 0.09622587664538222, + "grad_norm": 0.48510581254959106, + "learning_rate": 4.8118087763110956e-05, + "loss": 0.3419, + "step": 5395 + }, + { + "epoch": 0.09624371276709592, + "grad_norm": 0.20889174938201904, + "learning_rate": 4.812700677845166e-05, + "loss": 0.2265, + "step": 5396 + }, + { + "epoch": 0.09626154888880961, + "grad_norm": 0.20810353755950928, + "learning_rate": 4.813592579379237e-05, + "loss": 0.2407, + "step": 5397 + }, + { + "epoch": 0.09627938501052331, + "grad_norm": 0.25173771381378174, + "learning_rate": 4.814484480913307e-05, + "loss": 0.2459, + "step": 5398 + }, + { + "epoch": 0.096297221132237, + "grad_norm": 0.25757449865341187, + "learning_rate": 4.815376382447378e-05, + "loss": 0.2338, + "step": 5399 + }, + { + "epoch": 0.0963150572539507, + "grad_norm": 0.27500173449516296, + "learning_rate": 4.816268283981449e-05, + "loss": 0.2514, + "step": 5400 + }, + { + "epoch": 0.09633289337566439, + "grad_norm": 0.3292427659034729, + "learning_rate": 4.817160185515519e-05, + "loss": 0.2288, + "step": 5401 + }, + { + "epoch": 0.0963507294973781, + "grad_norm": 0.38144394755363464, + "learning_rate": 4.81805208704959e-05, + "loss": 0.3091, + "step": 5402 + }, + { + "epoch": 0.09636856561909178, + "grad_norm": 0.2699589729309082, + "learning_rate": 4.818943988583661e-05, + "loss": 0.2236, + "step": 5403 + }, + { + "epoch": 0.09638640174080548, + "grad_norm": 0.31738099455833435, + "learning_rate": 4.819835890117731e-05, + "loss": 0.2289, + "step": 5404 + }, + { + "epoch": 0.09640423786251917, + "grad_norm": 0.36914145946502686, + "learning_rate": 4.820727791651802e-05, + "loss": 0.2181, + "step": 5405 + }, + { + "epoch": 0.09642207398423287, + "grad_norm": 0.37151944637298584, + "learning_rate": 4.8216196931858726e-05, + "loss": 0.2308, + "step": 5406 + }, + { + "epoch": 0.09643991010594656, + "grad_norm": 0.3182366192340851, + "learning_rate": 4.822511594719943e-05, + "loss": 0.2641, + "step": 5407 + }, + { + "epoch": 0.09645774622766026, + "grad_norm": 0.3091346323490143, + "learning_rate": 4.823403496254014e-05, + "loss": 0.2352, + "step": 5408 + }, + { + "epoch": 0.09647558234937395, + "grad_norm": 0.3038909137248993, + "learning_rate": 4.8242953977880845e-05, + "loss": 0.247, + "step": 5409 + }, + { + "epoch": 0.09649341847108765, + "grad_norm": 0.41333627700805664, + "learning_rate": 4.825187299322155e-05, + "loss": 0.3271, + "step": 5410 + }, + { + "epoch": 0.09651125459280134, + "grad_norm": 0.46484991908073425, + "learning_rate": 4.826079200856226e-05, + "loss": 0.3217, + "step": 5411 + }, + { + "epoch": 0.09652909071451504, + "grad_norm": 0.3277193605899811, + "learning_rate": 4.8269711023902965e-05, + "loss": 0.2014, + "step": 5412 + }, + { + "epoch": 0.09654692683622873, + "grad_norm": 0.3197810649871826, + "learning_rate": 4.827863003924367e-05, + "loss": 0.2664, + "step": 5413 + }, + { + "epoch": 0.09656476295794242, + "grad_norm": 0.31339699029922485, + "learning_rate": 4.828754905458438e-05, + "loss": 0.2722, + "step": 5414 + }, + { + "epoch": 0.09658259907965612, + "grad_norm": 0.3026168942451477, + "learning_rate": 4.8296468069925084e-05, + "loss": 0.2607, + "step": 5415 + }, + { + "epoch": 0.09660043520136981, + "grad_norm": 0.44053876399993896, + "learning_rate": 4.830538708526579e-05, + "loss": 0.2941, + "step": 5416 + }, + { + "epoch": 0.09661827132308351, + "grad_norm": 0.33031144738197327, + "learning_rate": 4.8314306100606497e-05, + "loss": 0.2289, + "step": 5417 + }, + { + "epoch": 0.0966361074447972, + "grad_norm": 0.3416697680950165, + "learning_rate": 4.83232251159472e-05, + "loss": 0.2692, + "step": 5418 + }, + { + "epoch": 0.0966539435665109, + "grad_norm": 0.2503843903541565, + "learning_rate": 4.833214413128791e-05, + "loss": 0.2092, + "step": 5419 + }, + { + "epoch": 0.09667177968822459, + "grad_norm": 0.3404706120491028, + "learning_rate": 4.8341063146628616e-05, + "loss": 0.2676, + "step": 5420 + }, + { + "epoch": 0.09668961580993829, + "grad_norm": 0.33275705575942993, + "learning_rate": 4.834998216196932e-05, + "loss": 0.2713, + "step": 5421 + }, + { + "epoch": 0.09670745193165198, + "grad_norm": 0.32658156752586365, + "learning_rate": 4.835890117731003e-05, + "loss": 0.2691, + "step": 5422 + }, + { + "epoch": 0.09672528805336568, + "grad_norm": 0.270504355430603, + "learning_rate": 4.836782019265073e-05, + "loss": 0.2213, + "step": 5423 + }, + { + "epoch": 0.09674312417507937, + "grad_norm": 0.410617470741272, + "learning_rate": 4.837673920799144e-05, + "loss": 0.2993, + "step": 5424 + }, + { + "epoch": 0.09676096029679307, + "grad_norm": 0.2827683091163635, + "learning_rate": 4.838565822333215e-05, + "loss": 0.218, + "step": 5425 + }, + { + "epoch": 0.09677879641850676, + "grad_norm": 0.2774164080619812, + "learning_rate": 4.839457723867285e-05, + "loss": 0.2318, + "step": 5426 + }, + { + "epoch": 0.09679663254022046, + "grad_norm": 0.570665717124939, + "learning_rate": 4.840349625401356e-05, + "loss": 0.2653, + "step": 5427 + }, + { + "epoch": 0.09681446866193415, + "grad_norm": 0.3123404383659363, + "learning_rate": 4.841241526935427e-05, + "loss": 0.2858, + "step": 5428 + }, + { + "epoch": 0.09683230478364785, + "grad_norm": 0.3542862832546234, + "learning_rate": 4.842133428469497e-05, + "loss": 0.2319, + "step": 5429 + }, + { + "epoch": 0.09685014090536154, + "grad_norm": 0.26503410935401917, + "learning_rate": 4.843025330003568e-05, + "loss": 0.218, + "step": 5430 + }, + { + "epoch": 0.09686797702707524, + "grad_norm": 0.26394808292388916, + "learning_rate": 4.8439172315376386e-05, + "loss": 0.2321, + "step": 5431 + }, + { + "epoch": 0.09688581314878893, + "grad_norm": 0.37308600544929504, + "learning_rate": 4.844809133071709e-05, + "loss": 0.3088, + "step": 5432 + }, + { + "epoch": 0.09690364927050263, + "grad_norm": 0.2688678801059723, + "learning_rate": 4.84570103460578e-05, + "loss": 0.2611, + "step": 5433 + }, + { + "epoch": 0.09692148539221632, + "grad_norm": 0.2813870310783386, + "learning_rate": 4.8465929361398505e-05, + "loss": 0.2311, + "step": 5434 + }, + { + "epoch": 0.09693932151393, + "grad_norm": 0.27423232793807983, + "learning_rate": 4.847484837673921e-05, + "loss": 0.2561, + "step": 5435 + }, + { + "epoch": 0.0969571576356437, + "grad_norm": 0.2912781834602356, + "learning_rate": 4.848376739207992e-05, + "loss": 0.2867, + "step": 5436 + }, + { + "epoch": 0.0969749937573574, + "grad_norm": 0.38686278462409973, + "learning_rate": 4.8492686407420624e-05, + "loss": 0.217, + "step": 5437 + }, + { + "epoch": 0.0969928298790711, + "grad_norm": 0.48345714807510376, + "learning_rate": 4.850160542276133e-05, + "loss": 0.3164, + "step": 5438 + }, + { + "epoch": 0.09701066600078478, + "grad_norm": 0.3831360638141632, + "learning_rate": 4.851052443810204e-05, + "loss": 0.2246, + "step": 5439 + }, + { + "epoch": 0.09702850212249849, + "grad_norm": 0.32400625944137573, + "learning_rate": 4.851944345344274e-05, + "loss": 0.2892, + "step": 5440 + }, + { + "epoch": 0.09704633824421217, + "grad_norm": 0.24737146496772766, + "learning_rate": 4.852836246878345e-05, + "loss": 0.2222, + "step": 5441 + }, + { + "epoch": 0.09706417436592588, + "grad_norm": 0.31008994579315186, + "learning_rate": 4.8537281484124156e-05, + "loss": 0.2704, + "step": 5442 + }, + { + "epoch": 0.09708201048763956, + "grad_norm": 0.27175506949424744, + "learning_rate": 4.854620049946486e-05, + "loss": 0.1977, + "step": 5443 + }, + { + "epoch": 0.09709984660935327, + "grad_norm": 0.39254119992256165, + "learning_rate": 4.855511951480557e-05, + "loss": 0.2606, + "step": 5444 + }, + { + "epoch": 0.09711768273106695, + "grad_norm": 0.33907565474510193, + "learning_rate": 4.8564038530146275e-05, + "loss": 0.3119, + "step": 5445 + }, + { + "epoch": 0.09713551885278066, + "grad_norm": 0.3085770010948181, + "learning_rate": 4.857295754548698e-05, + "loss": 0.2404, + "step": 5446 + }, + { + "epoch": 0.09715335497449434, + "grad_norm": 0.3873482048511505, + "learning_rate": 4.858187656082769e-05, + "loss": 0.2773, + "step": 5447 + }, + { + "epoch": 0.09717119109620805, + "grad_norm": 0.3149682283401489, + "learning_rate": 4.859079557616839e-05, + "loss": 0.2989, + "step": 5448 + }, + { + "epoch": 0.09718902721792173, + "grad_norm": 0.35003674030303955, + "learning_rate": 4.85997145915091e-05, + "loss": 0.257, + "step": 5449 + }, + { + "epoch": 0.09720686333963544, + "grad_norm": 0.2753257155418396, + "learning_rate": 4.860863360684981e-05, + "loss": 0.2999, + "step": 5450 + }, + { + "epoch": 0.09722469946134912, + "grad_norm": 0.2559419870376587, + "learning_rate": 4.861755262219051e-05, + "loss": 0.2464, + "step": 5451 + }, + { + "epoch": 0.09724253558306283, + "grad_norm": 0.34245607256889343, + "learning_rate": 4.862647163753122e-05, + "loss": 0.3227, + "step": 5452 + }, + { + "epoch": 0.09726037170477651, + "grad_norm": 0.28112515807151794, + "learning_rate": 4.8635390652871926e-05, + "loss": 0.2514, + "step": 5453 + }, + { + "epoch": 0.0972782078264902, + "grad_norm": 0.32024791836738586, + "learning_rate": 4.864430966821263e-05, + "loss": 0.2171, + "step": 5454 + }, + { + "epoch": 0.0972960439482039, + "grad_norm": 0.2943370044231415, + "learning_rate": 4.865322868355334e-05, + "loss": 0.263, + "step": 5455 + }, + { + "epoch": 0.09731388006991759, + "grad_norm": 0.2543685734272003, + "learning_rate": 4.8662147698894045e-05, + "loss": 0.273, + "step": 5456 + }, + { + "epoch": 0.09733171619163129, + "grad_norm": 0.4515843689441681, + "learning_rate": 4.867106671423475e-05, + "loss": 0.2636, + "step": 5457 + }, + { + "epoch": 0.09734955231334498, + "grad_norm": 0.29468849301338196, + "learning_rate": 4.867998572957546e-05, + "loss": 0.2461, + "step": 5458 + }, + { + "epoch": 0.09736738843505868, + "grad_norm": 0.353140652179718, + "learning_rate": 4.8688904744916164e-05, + "loss": 0.2307, + "step": 5459 + }, + { + "epoch": 0.09738522455677237, + "grad_norm": 0.3664565086364746, + "learning_rate": 4.869782376025687e-05, + "loss": 0.3103, + "step": 5460 + }, + { + "epoch": 0.09740306067848607, + "grad_norm": 0.3541789948940277, + "learning_rate": 4.870674277559758e-05, + "loss": 0.27, + "step": 5461 + }, + { + "epoch": 0.09742089680019976, + "grad_norm": 0.4054414927959442, + "learning_rate": 4.8715661790938284e-05, + "loss": 0.1805, + "step": 5462 + }, + { + "epoch": 0.09743873292191346, + "grad_norm": 0.34643250703811646, + "learning_rate": 4.872458080627899e-05, + "loss": 0.2653, + "step": 5463 + }, + { + "epoch": 0.09745656904362715, + "grad_norm": 0.30009132623672485, + "learning_rate": 4.8733499821619696e-05, + "loss": 0.2255, + "step": 5464 + }, + { + "epoch": 0.09747440516534085, + "grad_norm": 0.29243677854537964, + "learning_rate": 4.87424188369604e-05, + "loss": 0.2862, + "step": 5465 + }, + { + "epoch": 0.09749224128705454, + "grad_norm": 0.3629752993583679, + "learning_rate": 4.875133785230111e-05, + "loss": 0.31, + "step": 5466 + }, + { + "epoch": 0.09751007740876824, + "grad_norm": 0.29389265179634094, + "learning_rate": 4.8760256867641816e-05, + "loss": 0.2725, + "step": 5467 + }, + { + "epoch": 0.09752791353048193, + "grad_norm": 0.35133081674575806, + "learning_rate": 4.876917588298252e-05, + "loss": 0.2451, + "step": 5468 + }, + { + "epoch": 0.09754574965219563, + "grad_norm": 0.3372002840042114, + "learning_rate": 4.877809489832323e-05, + "loss": 0.3232, + "step": 5469 + }, + { + "epoch": 0.09756358577390932, + "grad_norm": 0.2796429395675659, + "learning_rate": 4.8787013913663935e-05, + "loss": 0.2139, + "step": 5470 + }, + { + "epoch": 0.09758142189562302, + "grad_norm": 0.3312259018421173, + "learning_rate": 4.879593292900464e-05, + "loss": 0.3258, + "step": 5471 + }, + { + "epoch": 0.09759925801733671, + "grad_norm": 0.3399675190448761, + "learning_rate": 4.880485194434535e-05, + "loss": 0.2568, + "step": 5472 + }, + { + "epoch": 0.09761709413905041, + "grad_norm": 0.31488168239593506, + "learning_rate": 4.881377095968605e-05, + "loss": 0.2757, + "step": 5473 + }, + { + "epoch": 0.0976349302607641, + "grad_norm": 0.34534600377082825, + "learning_rate": 4.882268997502676e-05, + "loss": 0.3318, + "step": 5474 + }, + { + "epoch": 0.09765276638247779, + "grad_norm": 0.32041308283805847, + "learning_rate": 4.8831608990367467e-05, + "loss": 0.2694, + "step": 5475 + }, + { + "epoch": 0.09767060250419149, + "grad_norm": 0.23527833819389343, + "learning_rate": 4.884052800570817e-05, + "loss": 0.2659, + "step": 5476 + }, + { + "epoch": 0.09768843862590518, + "grad_norm": 0.43653279542922974, + "learning_rate": 4.884944702104888e-05, + "loss": 0.2946, + "step": 5477 + }, + { + "epoch": 0.09770627474761888, + "grad_norm": 0.2972584068775177, + "learning_rate": 4.8858366036389586e-05, + "loss": 0.2748, + "step": 5478 + }, + { + "epoch": 0.09772411086933257, + "grad_norm": 0.2785152494907379, + "learning_rate": 4.886728505173029e-05, + "loss": 0.2956, + "step": 5479 + }, + { + "epoch": 0.09774194699104627, + "grad_norm": 0.2990381121635437, + "learning_rate": 4.8876204067071e-05, + "loss": 0.2354, + "step": 5480 + }, + { + "epoch": 0.09775978311275996, + "grad_norm": 0.29772382974624634, + "learning_rate": 4.8885123082411705e-05, + "loss": 0.247, + "step": 5481 + }, + { + "epoch": 0.09777761923447366, + "grad_norm": 0.3232877850532532, + "learning_rate": 4.889404209775241e-05, + "loss": 0.2143, + "step": 5482 + }, + { + "epoch": 0.09779545535618735, + "grad_norm": 0.3917143940925598, + "learning_rate": 4.890296111309312e-05, + "loss": 0.2619, + "step": 5483 + }, + { + "epoch": 0.09781329147790105, + "grad_norm": 0.2928524911403656, + "learning_rate": 4.8911880128433824e-05, + "loss": 0.2634, + "step": 5484 + }, + { + "epoch": 0.09783112759961474, + "grad_norm": 0.4801047444343567, + "learning_rate": 4.892079914377453e-05, + "loss": 0.3373, + "step": 5485 + }, + { + "epoch": 0.09784896372132844, + "grad_norm": 0.2984585464000702, + "learning_rate": 4.892971815911524e-05, + "loss": 0.2119, + "step": 5486 + }, + { + "epoch": 0.09786679984304213, + "grad_norm": 0.2977357804775238, + "learning_rate": 4.893863717445594e-05, + "loss": 0.272, + "step": 5487 + }, + { + "epoch": 0.09788463596475583, + "grad_norm": 0.24359388649463654, + "learning_rate": 4.894755618979665e-05, + "loss": 0.2274, + "step": 5488 + }, + { + "epoch": 0.09790247208646952, + "grad_norm": 0.3061986565589905, + "learning_rate": 4.8956475205137356e-05, + "loss": 0.223, + "step": 5489 + }, + { + "epoch": 0.09792030820818322, + "grad_norm": 0.36622732877731323, + "learning_rate": 4.896539422047806e-05, + "loss": 0.2769, + "step": 5490 + }, + { + "epoch": 0.0979381443298969, + "grad_norm": 0.3394840657711029, + "learning_rate": 4.897431323581877e-05, + "loss": 0.189, + "step": 5491 + }, + { + "epoch": 0.09795598045161061, + "grad_norm": 0.3280045986175537, + "learning_rate": 4.8983232251159475e-05, + "loss": 0.2626, + "step": 5492 + }, + { + "epoch": 0.0979738165733243, + "grad_norm": 0.2870257496833801, + "learning_rate": 4.899215126650018e-05, + "loss": 0.2451, + "step": 5493 + }, + { + "epoch": 0.09799165269503798, + "grad_norm": 0.19842657446861267, + "learning_rate": 4.900107028184089e-05, + "loss": 0.2062, + "step": 5494 + }, + { + "epoch": 0.09800948881675169, + "grad_norm": 0.30510666966438293, + "learning_rate": 4.9009989297181594e-05, + "loss": 0.2533, + "step": 5495 + }, + { + "epoch": 0.09802732493846537, + "grad_norm": 0.28291577100753784, + "learning_rate": 4.90189083125223e-05, + "loss": 0.2441, + "step": 5496 + }, + { + "epoch": 0.09804516106017908, + "grad_norm": 0.3856520652770996, + "learning_rate": 4.902782732786301e-05, + "loss": 0.1969, + "step": 5497 + }, + { + "epoch": 0.09806299718189276, + "grad_norm": 0.2804949879646301, + "learning_rate": 4.903674634320371e-05, + "loss": 0.2579, + "step": 5498 + }, + { + "epoch": 0.09808083330360647, + "grad_norm": 0.3010903596878052, + "learning_rate": 4.904566535854442e-05, + "loss": 0.2279, + "step": 5499 + }, + { + "epoch": 0.09809866942532015, + "grad_norm": 0.3615068197250366, + "learning_rate": 4.9054584373885126e-05, + "loss": 0.2812, + "step": 5500 + }, + { + "epoch": 0.09811650554703386, + "grad_norm": 0.24918560683727264, + "learning_rate": 4.906350338922583e-05, + "loss": 0.2402, + "step": 5501 + }, + { + "epoch": 0.09813434166874754, + "grad_norm": 0.37588775157928467, + "learning_rate": 4.907242240456654e-05, + "loss": 0.2753, + "step": 5502 + }, + { + "epoch": 0.09815217779046125, + "grad_norm": 0.35019493103027344, + "learning_rate": 4.9081341419907245e-05, + "loss": 0.3374, + "step": 5503 + }, + { + "epoch": 0.09817001391217493, + "grad_norm": 0.28179264068603516, + "learning_rate": 4.909026043524795e-05, + "loss": 0.2798, + "step": 5504 + }, + { + "epoch": 0.09818785003388864, + "grad_norm": 0.2809401750564575, + "learning_rate": 4.909917945058866e-05, + "loss": 0.2487, + "step": 5505 + }, + { + "epoch": 0.09820568615560232, + "grad_norm": 0.6416525840759277, + "learning_rate": 4.9108098465929364e-05, + "loss": 0.2463, + "step": 5506 + }, + { + "epoch": 0.09822352227731602, + "grad_norm": 0.25745102763175964, + "learning_rate": 4.911701748127007e-05, + "loss": 0.2536, + "step": 5507 + }, + { + "epoch": 0.09824135839902971, + "grad_norm": 0.27652454376220703, + "learning_rate": 4.912593649661078e-05, + "loss": 0.2515, + "step": 5508 + }, + { + "epoch": 0.09825919452074341, + "grad_norm": 0.3008754253387451, + "learning_rate": 4.9134855511951483e-05, + "loss": 0.3092, + "step": 5509 + }, + { + "epoch": 0.0982770306424571, + "grad_norm": 0.320084810256958, + "learning_rate": 4.914377452729219e-05, + "loss": 0.264, + "step": 5510 + }, + { + "epoch": 0.0982948667641708, + "grad_norm": 0.23950159549713135, + "learning_rate": 4.9152693542632896e-05, + "loss": 0.2431, + "step": 5511 + }, + { + "epoch": 0.09831270288588449, + "grad_norm": 0.28555992245674133, + "learning_rate": 4.91616125579736e-05, + "loss": 0.2205, + "step": 5512 + }, + { + "epoch": 0.0983305390075982, + "grad_norm": 0.29226920008659363, + "learning_rate": 4.917053157331431e-05, + "loss": 0.2353, + "step": 5513 + }, + { + "epoch": 0.09834837512931188, + "grad_norm": 0.28247639536857605, + "learning_rate": 4.9179450588655015e-05, + "loss": 0.2411, + "step": 5514 + }, + { + "epoch": 0.09836621125102557, + "grad_norm": 0.3029201924800873, + "learning_rate": 4.918836960399572e-05, + "loss": 0.2995, + "step": 5515 + }, + { + "epoch": 0.09838404737273927, + "grad_norm": 0.4019959568977356, + "learning_rate": 4.919728861933643e-05, + "loss": 0.3222, + "step": 5516 + }, + { + "epoch": 0.09840188349445296, + "grad_norm": 0.2866835296154022, + "learning_rate": 4.9206207634677135e-05, + "loss": 0.2493, + "step": 5517 + }, + { + "epoch": 0.09841971961616666, + "grad_norm": 0.2806166708469391, + "learning_rate": 4.921512665001784e-05, + "loss": 0.2411, + "step": 5518 + }, + { + "epoch": 0.09843755573788035, + "grad_norm": 0.26713719964027405, + "learning_rate": 4.922404566535855e-05, + "loss": 0.2303, + "step": 5519 + }, + { + "epoch": 0.09845539185959405, + "grad_norm": 0.34005218744277954, + "learning_rate": 4.9232964680699254e-05, + "loss": 0.2905, + "step": 5520 + }, + { + "epoch": 0.09847322798130774, + "grad_norm": 0.2824673056602478, + "learning_rate": 4.924188369603996e-05, + "loss": 0.2812, + "step": 5521 + }, + { + "epoch": 0.09849106410302144, + "grad_norm": 0.27901577949523926, + "learning_rate": 4.9250802711380666e-05, + "loss": 0.2723, + "step": 5522 + }, + { + "epoch": 0.09850890022473513, + "grad_norm": 0.2748558223247528, + "learning_rate": 4.925972172672137e-05, + "loss": 0.2371, + "step": 5523 + }, + { + "epoch": 0.09852673634644883, + "grad_norm": 0.29768821597099304, + "learning_rate": 4.926864074206208e-05, + "loss": 0.2534, + "step": 5524 + }, + { + "epoch": 0.09854457246816252, + "grad_norm": 0.2774190902709961, + "learning_rate": 4.9277559757402786e-05, + "loss": 0.2607, + "step": 5525 + }, + { + "epoch": 0.09856240858987622, + "grad_norm": 0.3144364058971405, + "learning_rate": 4.928647877274349e-05, + "loss": 0.3096, + "step": 5526 + }, + { + "epoch": 0.09858024471158991, + "grad_norm": 0.28779545426368713, + "learning_rate": 4.92953977880842e-05, + "loss": 0.2668, + "step": 5527 + }, + { + "epoch": 0.09859808083330361, + "grad_norm": 0.34362363815307617, + "learning_rate": 4.9304316803424905e-05, + "loss": 0.2792, + "step": 5528 + }, + { + "epoch": 0.0986159169550173, + "grad_norm": 0.29747605323791504, + "learning_rate": 4.931323581876561e-05, + "loss": 0.2639, + "step": 5529 + }, + { + "epoch": 0.098633753076731, + "grad_norm": 0.4144671857357025, + "learning_rate": 4.932215483410632e-05, + "loss": 0.2851, + "step": 5530 + }, + { + "epoch": 0.09865158919844469, + "grad_norm": 0.28177231550216675, + "learning_rate": 4.9331073849447024e-05, + "loss": 0.2432, + "step": 5531 + }, + { + "epoch": 0.09866942532015839, + "grad_norm": 0.7640838027000427, + "learning_rate": 4.933999286478773e-05, + "loss": 0.3023, + "step": 5532 + }, + { + "epoch": 0.09868726144187208, + "grad_norm": 0.21771247684955597, + "learning_rate": 4.934891188012844e-05, + "loss": 0.1982, + "step": 5533 + }, + { + "epoch": 0.09870509756358578, + "grad_norm": 0.33608943223953247, + "learning_rate": 4.935783089546914e-05, + "loss": 0.2625, + "step": 5534 + }, + { + "epoch": 0.09872293368529947, + "grad_norm": 0.46725186705589294, + "learning_rate": 4.936674991080985e-05, + "loss": 0.3063, + "step": 5535 + }, + { + "epoch": 0.09874076980701316, + "grad_norm": 0.2878686487674713, + "learning_rate": 4.9375668926150556e-05, + "loss": 0.2658, + "step": 5536 + }, + { + "epoch": 0.09875860592872686, + "grad_norm": 0.40644723176956177, + "learning_rate": 4.938458794149126e-05, + "loss": 0.2406, + "step": 5537 + }, + { + "epoch": 0.09877644205044055, + "grad_norm": 0.25225239992141724, + "learning_rate": 4.939350695683197e-05, + "loss": 0.2202, + "step": 5538 + }, + { + "epoch": 0.09879427817215425, + "grad_norm": 0.2533118724822998, + "learning_rate": 4.9402425972172675e-05, + "loss": 0.2248, + "step": 5539 + }, + { + "epoch": 0.09881211429386794, + "grad_norm": 0.2974401116371155, + "learning_rate": 4.941134498751338e-05, + "loss": 0.2286, + "step": 5540 + }, + { + "epoch": 0.09882995041558164, + "grad_norm": 0.4625476896762848, + "learning_rate": 4.942026400285409e-05, + "loss": 0.2355, + "step": 5541 + }, + { + "epoch": 0.09884778653729533, + "grad_norm": 0.37035489082336426, + "learning_rate": 4.9429183018194794e-05, + "loss": 0.2426, + "step": 5542 + }, + { + "epoch": 0.09886562265900903, + "grad_norm": 0.34101033210754395, + "learning_rate": 4.94381020335355e-05, + "loss": 0.2118, + "step": 5543 + }, + { + "epoch": 0.09888345878072272, + "grad_norm": 0.24928370118141174, + "learning_rate": 4.944702104887621e-05, + "loss": 0.2442, + "step": 5544 + }, + { + "epoch": 0.09890129490243642, + "grad_norm": 0.36414381861686707, + "learning_rate": 4.945594006421691e-05, + "loss": 0.2824, + "step": 5545 + }, + { + "epoch": 0.0989191310241501, + "grad_norm": 0.3175963759422302, + "learning_rate": 4.946485907955762e-05, + "loss": 0.2028, + "step": 5546 + }, + { + "epoch": 0.09893696714586381, + "grad_norm": 0.3281267285346985, + "learning_rate": 4.9473778094898326e-05, + "loss": 0.2759, + "step": 5547 + }, + { + "epoch": 0.0989548032675775, + "grad_norm": 0.26249727606773376, + "learning_rate": 4.948269711023903e-05, + "loss": 0.2248, + "step": 5548 + }, + { + "epoch": 0.0989726393892912, + "grad_norm": 0.2374362200498581, + "learning_rate": 4.949161612557974e-05, + "loss": 0.2033, + "step": 5549 + }, + { + "epoch": 0.09899047551100489, + "grad_norm": 0.41762882471084595, + "learning_rate": 4.9500535140920445e-05, + "loss": 0.2507, + "step": 5550 + }, + { + "epoch": 0.09900831163271859, + "grad_norm": 0.29843926429748535, + "learning_rate": 4.950945415626115e-05, + "loss": 0.2185, + "step": 5551 + }, + { + "epoch": 0.09902614775443228, + "grad_norm": 0.3904193937778473, + "learning_rate": 4.951837317160186e-05, + "loss": 0.2024, + "step": 5552 + }, + { + "epoch": 0.09904398387614598, + "grad_norm": 0.3317916691303253, + "learning_rate": 4.9527292186942564e-05, + "loss": 0.2769, + "step": 5553 + }, + { + "epoch": 0.09906181999785967, + "grad_norm": 0.32630684971809387, + "learning_rate": 4.953621120228327e-05, + "loss": 0.2816, + "step": 5554 + }, + { + "epoch": 0.09907965611957335, + "grad_norm": 0.2944846749305725, + "learning_rate": 4.954513021762398e-05, + "loss": 0.2315, + "step": 5555 + }, + { + "epoch": 0.09909749224128706, + "grad_norm": 0.22315825521945953, + "learning_rate": 4.955404923296468e-05, + "loss": 0.1903, + "step": 5556 + }, + { + "epoch": 0.09911532836300074, + "grad_norm": 0.3081098794937134, + "learning_rate": 4.956296824830539e-05, + "loss": 0.277, + "step": 5557 + }, + { + "epoch": 0.09913316448471444, + "grad_norm": 0.216439887881279, + "learning_rate": 4.9571887263646096e-05, + "loss": 0.2403, + "step": 5558 + }, + { + "epoch": 0.09915100060642813, + "grad_norm": 0.3412705659866333, + "learning_rate": 4.95808062789868e-05, + "loss": 0.2618, + "step": 5559 + }, + { + "epoch": 0.09916883672814183, + "grad_norm": 0.24157923460006714, + "learning_rate": 4.958972529432751e-05, + "loss": 0.2594, + "step": 5560 + }, + { + "epoch": 0.09918667284985552, + "grad_norm": 0.3635483980178833, + "learning_rate": 4.9598644309668215e-05, + "loss": 0.2726, + "step": 5561 + }, + { + "epoch": 0.09920450897156922, + "grad_norm": 0.27654680609703064, + "learning_rate": 4.960756332500892e-05, + "loss": 0.2553, + "step": 5562 + }, + { + "epoch": 0.09922234509328291, + "grad_norm": 0.2773616909980774, + "learning_rate": 4.961648234034963e-05, + "loss": 0.2837, + "step": 5563 + }, + { + "epoch": 0.09924018121499661, + "grad_norm": 0.31285059452056885, + "learning_rate": 4.9625401355690334e-05, + "loss": 0.2946, + "step": 5564 + }, + { + "epoch": 0.0992580173367103, + "grad_norm": 0.3870413303375244, + "learning_rate": 4.963432037103104e-05, + "loss": 0.2738, + "step": 5565 + }, + { + "epoch": 0.099275853458424, + "grad_norm": 0.40917330980300903, + "learning_rate": 4.964323938637175e-05, + "loss": 0.2788, + "step": 5566 + }, + { + "epoch": 0.09929368958013769, + "grad_norm": 0.22341284155845642, + "learning_rate": 4.9652158401712454e-05, + "loss": 0.2471, + "step": 5567 + }, + { + "epoch": 0.0993115257018514, + "grad_norm": 0.22843582928180695, + "learning_rate": 4.966107741705316e-05, + "loss": 0.2513, + "step": 5568 + }, + { + "epoch": 0.09932936182356508, + "grad_norm": 0.24894820153713226, + "learning_rate": 4.9669996432393866e-05, + "loss": 0.2078, + "step": 5569 + }, + { + "epoch": 0.09934719794527878, + "grad_norm": 0.28142452239990234, + "learning_rate": 4.967891544773457e-05, + "loss": 0.2699, + "step": 5570 + }, + { + "epoch": 0.09936503406699247, + "grad_norm": 0.33661895990371704, + "learning_rate": 4.968783446307528e-05, + "loss": 0.2675, + "step": 5571 + }, + { + "epoch": 0.09938287018870617, + "grad_norm": 0.2572460472583771, + "learning_rate": 4.9696753478415985e-05, + "loss": 0.2508, + "step": 5572 + }, + { + "epoch": 0.09940070631041986, + "grad_norm": 0.3926204442977905, + "learning_rate": 4.970567249375669e-05, + "loss": 0.352, + "step": 5573 + }, + { + "epoch": 0.09941854243213356, + "grad_norm": 0.2722055912017822, + "learning_rate": 4.97145915090974e-05, + "loss": 0.2167, + "step": 5574 + }, + { + "epoch": 0.09943637855384725, + "grad_norm": 0.42704060673713684, + "learning_rate": 4.9723510524438105e-05, + "loss": 0.2592, + "step": 5575 + }, + { + "epoch": 0.09945421467556094, + "grad_norm": 0.25675666332244873, + "learning_rate": 4.973242953977881e-05, + "loss": 0.2744, + "step": 5576 + }, + { + "epoch": 0.09947205079727464, + "grad_norm": 0.24697034060955048, + "learning_rate": 4.974134855511952e-05, + "loss": 0.2202, + "step": 5577 + }, + { + "epoch": 0.09948988691898833, + "grad_norm": 0.31476113200187683, + "learning_rate": 4.9750267570460224e-05, + "loss": 0.2498, + "step": 5578 + }, + { + "epoch": 0.09950772304070203, + "grad_norm": 0.4493424594402313, + "learning_rate": 4.975918658580093e-05, + "loss": 0.2208, + "step": 5579 + }, + { + "epoch": 0.09952555916241572, + "grad_norm": 0.2961020767688751, + "learning_rate": 4.9768105601141637e-05, + "loss": 0.2459, + "step": 5580 + }, + { + "epoch": 0.09954339528412942, + "grad_norm": 0.3160838186740875, + "learning_rate": 4.977702461648234e-05, + "loss": 0.2754, + "step": 5581 + }, + { + "epoch": 0.09956123140584311, + "grad_norm": 0.364326536655426, + "learning_rate": 4.978594363182305e-05, + "loss": 0.2418, + "step": 5582 + }, + { + "epoch": 0.09957906752755681, + "grad_norm": 0.4317879378795624, + "learning_rate": 4.9794862647163756e-05, + "loss": 0.2736, + "step": 5583 + }, + { + "epoch": 0.0995969036492705, + "grad_norm": 0.30823612213134766, + "learning_rate": 4.980378166250446e-05, + "loss": 0.2642, + "step": 5584 + }, + { + "epoch": 0.0996147397709842, + "grad_norm": 0.3373742997646332, + "learning_rate": 4.981270067784517e-05, + "loss": 0.2666, + "step": 5585 + }, + { + "epoch": 0.09963257589269789, + "grad_norm": 0.34264281392097473, + "learning_rate": 4.9821619693185875e-05, + "loss": 0.2491, + "step": 5586 + }, + { + "epoch": 0.09965041201441159, + "grad_norm": 0.35036471486091614, + "learning_rate": 4.983053870852658e-05, + "loss": 0.2584, + "step": 5587 + }, + { + "epoch": 0.09966824813612528, + "grad_norm": 0.2319411039352417, + "learning_rate": 4.983945772386729e-05, + "loss": 0.2341, + "step": 5588 + }, + { + "epoch": 0.09968608425783898, + "grad_norm": 0.27314355969429016, + "learning_rate": 4.9848376739207994e-05, + "loss": 0.1929, + "step": 5589 + }, + { + "epoch": 0.09970392037955267, + "grad_norm": 0.35963067412376404, + "learning_rate": 4.98572957545487e-05, + "loss": 0.2817, + "step": 5590 + }, + { + "epoch": 0.09972175650126637, + "grad_norm": 0.2947644591331482, + "learning_rate": 4.986621476988941e-05, + "loss": 0.254, + "step": 5591 + }, + { + "epoch": 0.09973959262298006, + "grad_norm": 0.3208530843257904, + "learning_rate": 4.987513378523011e-05, + "loss": 0.2564, + "step": 5592 + }, + { + "epoch": 0.09975742874469376, + "grad_norm": 0.2744399607181549, + "learning_rate": 4.988405280057082e-05, + "loss": 0.2137, + "step": 5593 + }, + { + "epoch": 0.09977526486640745, + "grad_norm": 0.3182908892631531, + "learning_rate": 4.9892971815911526e-05, + "loss": 0.2708, + "step": 5594 + }, + { + "epoch": 0.09979310098812114, + "grad_norm": 0.272396057844162, + "learning_rate": 4.990189083125223e-05, + "loss": 0.1979, + "step": 5595 + }, + { + "epoch": 0.09981093710983484, + "grad_norm": 0.2817678153514862, + "learning_rate": 4.991080984659294e-05, + "loss": 0.229, + "step": 5596 + }, + { + "epoch": 0.09982877323154853, + "grad_norm": 0.29018929600715637, + "learning_rate": 4.9919728861933645e-05, + "loss": 0.2305, + "step": 5597 + }, + { + "epoch": 0.09984660935326223, + "grad_norm": 0.27567121386528015, + "learning_rate": 4.992864787727435e-05, + "loss": 0.2499, + "step": 5598 + }, + { + "epoch": 0.09986444547497592, + "grad_norm": 0.3599814474582672, + "learning_rate": 4.993756689261506e-05, + "loss": 0.2833, + "step": 5599 + }, + { + "epoch": 0.09988228159668962, + "grad_norm": 0.4604446291923523, + "learning_rate": 4.9946485907955764e-05, + "loss": 0.3642, + "step": 5600 + }, + { + "epoch": 0.0999001177184033, + "grad_norm": 0.27221474051475525, + "learning_rate": 4.995540492329647e-05, + "loss": 0.2104, + "step": 5601 + }, + { + "epoch": 0.09991795384011701, + "grad_norm": 0.22608445584774017, + "learning_rate": 4.996432393863718e-05, + "loss": 0.2142, + "step": 5602 + }, + { + "epoch": 0.0999357899618307, + "grad_norm": 0.2663249373435974, + "learning_rate": 4.997324295397788e-05, + "loss": 0.1488, + "step": 5603 + }, + { + "epoch": 0.0999536260835444, + "grad_norm": 0.33296507596969604, + "learning_rate": 4.998216196931859e-05, + "loss": 0.2993, + "step": 5604 + }, + { + "epoch": 0.09997146220525809, + "grad_norm": 0.36306607723236084, + "learning_rate": 4.9991080984659296e-05, + "loss": 0.2707, + "step": 5605 + }, + { + "epoch": 0.09998929832697179, + "grad_norm": 0.3702142536640167, + "learning_rate": 5e-05, + "loss": 0.2977, + "step": 5606 + }, + { + "epoch": 0.10000713444868548, + "grad_norm": 0.3072991669178009, + "learning_rate": 4.9999999951545686e-05, + "loss": 0.3107, + "step": 5607 + }, + { + "epoch": 0.10002497057039918, + "grad_norm": 0.2944137156009674, + "learning_rate": 4.999999980618273e-05, + "loss": 0.2808, + "step": 5608 + }, + { + "epoch": 0.10004280669211287, + "grad_norm": 0.24709758162498474, + "learning_rate": 4.999999956391115e-05, + "loss": 0.2487, + "step": 5609 + }, + { + "epoch": 0.10006064281382657, + "grad_norm": 0.25758326053619385, + "learning_rate": 4.999999922473093e-05, + "loss": 0.2091, + "step": 5610 + }, + { + "epoch": 0.10007847893554025, + "grad_norm": 0.35120391845703125, + "learning_rate": 4.999999878864208e-05, + "loss": 0.2288, + "step": 5611 + }, + { + "epoch": 0.10009631505725396, + "grad_norm": 0.2889556586742401, + "learning_rate": 4.9999998255644596e-05, + "loss": 0.2766, + "step": 5612 + }, + { + "epoch": 0.10011415117896764, + "grad_norm": 0.2990957796573639, + "learning_rate": 4.999999762573849e-05, + "loss": 0.2544, + "step": 5613 + }, + { + "epoch": 0.10013198730068135, + "grad_norm": 0.3255777359008789, + "learning_rate": 4.999999689892375e-05, + "loss": 0.2411, + "step": 5614 + }, + { + "epoch": 0.10014982342239503, + "grad_norm": 0.34438732266426086, + "learning_rate": 4.9999996075200396e-05, + "loss": 0.3331, + "step": 5615 + }, + { + "epoch": 0.10016765954410872, + "grad_norm": 0.3306005597114563, + "learning_rate": 4.9999995154568424e-05, + "loss": 0.2287, + "step": 5616 + }, + { + "epoch": 0.10018549566582242, + "grad_norm": 0.3421599864959717, + "learning_rate": 4.9999994137027826e-05, + "loss": 0.3158, + "step": 5617 + }, + { + "epoch": 0.10020333178753611, + "grad_norm": 0.3410884737968445, + "learning_rate": 4.999999302257863e-05, + "loss": 0.2612, + "step": 5618 + }, + { + "epoch": 0.10022116790924981, + "grad_norm": 0.2748681306838989, + "learning_rate": 4.999999181122081e-05, + "loss": 0.2826, + "step": 5619 + }, + { + "epoch": 0.1002390040309635, + "grad_norm": 0.24706009030342102, + "learning_rate": 4.9999990502954396e-05, + "loss": 0.2432, + "step": 5620 + }, + { + "epoch": 0.1002568401526772, + "grad_norm": 0.30699586868286133, + "learning_rate": 4.999998909777939e-05, + "loss": 0.2646, + "step": 5621 + }, + { + "epoch": 0.10027467627439089, + "grad_norm": 0.3587009906768799, + "learning_rate": 4.999998759569578e-05, + "loss": 0.235, + "step": 5622 + }, + { + "epoch": 0.1002925123961046, + "grad_norm": 0.3154565393924713, + "learning_rate": 4.999998599670359e-05, + "loss": 0.2378, + "step": 5623 + }, + { + "epoch": 0.10031034851781828, + "grad_norm": 0.35002949833869934, + "learning_rate": 4.999998430080282e-05, + "loss": 0.2322, + "step": 5624 + }, + { + "epoch": 0.10032818463953198, + "grad_norm": 0.2938433587551117, + "learning_rate": 4.999998250799347e-05, + "loss": 0.2588, + "step": 5625 + }, + { + "epoch": 0.10034602076124567, + "grad_norm": 0.23168541491031647, + "learning_rate": 4.999998061827555e-05, + "loss": 0.2228, + "step": 5626 + }, + { + "epoch": 0.10036385688295937, + "grad_norm": 0.23731808364391327, + "learning_rate": 4.999997863164908e-05, + "loss": 0.2205, + "step": 5627 + }, + { + "epoch": 0.10038169300467306, + "grad_norm": 0.2636774182319641, + "learning_rate": 4.999997654811406e-05, + "loss": 0.2284, + "step": 5628 + }, + { + "epoch": 0.10039952912638676, + "grad_norm": 0.28431543707847595, + "learning_rate": 4.9999974367670485e-05, + "loss": 0.209, + "step": 5629 + }, + { + "epoch": 0.10041736524810045, + "grad_norm": 0.25456735491752625, + "learning_rate": 4.9999972090318384e-05, + "loss": 0.2457, + "step": 5630 + }, + { + "epoch": 0.10043520136981415, + "grad_norm": 0.2534412443637848, + "learning_rate": 4.999996971605776e-05, + "loss": 0.2239, + "step": 5631 + }, + { + "epoch": 0.10045303749152784, + "grad_norm": 0.28452956676483154, + "learning_rate": 4.999996724488861e-05, + "loss": 0.2831, + "step": 5632 + }, + { + "epoch": 0.10047087361324154, + "grad_norm": 0.4677983820438385, + "learning_rate": 4.9999964676810954e-05, + "loss": 0.2832, + "step": 5633 + }, + { + "epoch": 0.10048870973495523, + "grad_norm": 0.23367977142333984, + "learning_rate": 4.9999962011824795e-05, + "loss": 0.2143, + "step": 5634 + }, + { + "epoch": 0.10050654585666892, + "grad_norm": 0.39357468485832214, + "learning_rate": 4.999995924993016e-05, + "loss": 0.2005, + "step": 5635 + }, + { + "epoch": 0.10052438197838262, + "grad_norm": 0.3034566044807434, + "learning_rate": 4.999995639112705e-05, + "loss": 0.2211, + "step": 5636 + }, + { + "epoch": 0.10054221810009631, + "grad_norm": 0.3892858028411865, + "learning_rate": 4.999995343541546e-05, + "loss": 0.2833, + "step": 5637 + }, + { + "epoch": 0.10056005422181001, + "grad_norm": 0.277678519487381, + "learning_rate": 4.9999950382795425e-05, + "loss": 0.2725, + "step": 5638 + }, + { + "epoch": 0.1005778903435237, + "grad_norm": 0.2668309807777405, + "learning_rate": 4.999994723326694e-05, + "loss": 0.2385, + "step": 5639 + }, + { + "epoch": 0.1005957264652374, + "grad_norm": 0.2892187833786011, + "learning_rate": 4.9999943986830036e-05, + "loss": 0.2852, + "step": 5640 + }, + { + "epoch": 0.10061356258695109, + "grad_norm": 0.3583581745624542, + "learning_rate": 4.999994064348471e-05, + "loss": 0.2539, + "step": 5641 + }, + { + "epoch": 0.10063139870866479, + "grad_norm": 0.42422887682914734, + "learning_rate": 4.999993720323097e-05, + "loss": 0.2262, + "step": 5642 + }, + { + "epoch": 0.10064923483037848, + "grad_norm": 0.33774709701538086, + "learning_rate": 4.999993366606885e-05, + "loss": 0.2758, + "step": 5643 + }, + { + "epoch": 0.10066707095209218, + "grad_norm": 0.3390183746814728, + "learning_rate": 4.999993003199834e-05, + "loss": 0.2244, + "step": 5644 + }, + { + "epoch": 0.10068490707380587, + "grad_norm": 0.27029553055763245, + "learning_rate": 4.9999926301019484e-05, + "loss": 0.2306, + "step": 5645 + }, + { + "epoch": 0.10070274319551957, + "grad_norm": 0.2525337338447571, + "learning_rate": 4.9999922473132264e-05, + "loss": 0.2746, + "step": 5646 + }, + { + "epoch": 0.10072057931723326, + "grad_norm": 0.2487715780735016, + "learning_rate": 4.9999918548336724e-05, + "loss": 0.2114, + "step": 5647 + }, + { + "epoch": 0.10073841543894696, + "grad_norm": 0.3058359920978546, + "learning_rate": 4.999991452663285e-05, + "loss": 0.2158, + "step": 5648 + }, + { + "epoch": 0.10075625156066065, + "grad_norm": 0.5530399084091187, + "learning_rate": 4.9999910408020686e-05, + "loss": 0.2488, + "step": 5649 + }, + { + "epoch": 0.10077408768237435, + "grad_norm": 0.3230868875980377, + "learning_rate": 4.999990619250022e-05, + "loss": 0.2466, + "step": 5650 + }, + { + "epoch": 0.10079192380408804, + "grad_norm": 0.2973959445953369, + "learning_rate": 4.99999018800715e-05, + "loss": 0.2662, + "step": 5651 + }, + { + "epoch": 0.10080975992580174, + "grad_norm": 0.3296448886394501, + "learning_rate": 4.9999897470734516e-05, + "loss": 0.2797, + "step": 5652 + }, + { + "epoch": 0.10082759604751543, + "grad_norm": 0.307454913854599, + "learning_rate": 4.999989296448929e-05, + "loss": 0.2391, + "step": 5653 + }, + { + "epoch": 0.10084543216922913, + "grad_norm": 0.3258278965950012, + "learning_rate": 4.9999888361335855e-05, + "loss": 0.2601, + "step": 5654 + }, + { + "epoch": 0.10086326829094282, + "grad_norm": 0.348908007144928, + "learning_rate": 4.999988366127421e-05, + "loss": 0.2148, + "step": 5655 + }, + { + "epoch": 0.1008811044126565, + "grad_norm": 0.29810166358947754, + "learning_rate": 4.9999878864304385e-05, + "loss": 0.2393, + "step": 5656 + }, + { + "epoch": 0.10089894053437021, + "grad_norm": 0.3841148614883423, + "learning_rate": 4.999987397042639e-05, + "loss": 0.2822, + "step": 5657 + }, + { + "epoch": 0.1009167766560839, + "grad_norm": 0.26545315980911255, + "learning_rate": 4.999986897964026e-05, + "loss": 0.2279, + "step": 5658 + }, + { + "epoch": 0.1009346127777976, + "grad_norm": 0.3630913197994232, + "learning_rate": 4.9999863891945996e-05, + "loss": 0.2391, + "step": 5659 + }, + { + "epoch": 0.10095244889951129, + "grad_norm": 0.23462393879890442, + "learning_rate": 4.999985870734362e-05, + "loss": 0.2047, + "step": 5660 + }, + { + "epoch": 0.10097028502122499, + "grad_norm": 0.32495900988578796, + "learning_rate": 4.999985342583316e-05, + "loss": 0.2041, + "step": 5661 + }, + { + "epoch": 0.10098812114293867, + "grad_norm": 0.27943190932273865, + "learning_rate": 4.999984804741464e-05, + "loss": 0.249, + "step": 5662 + }, + { + "epoch": 0.10100595726465238, + "grad_norm": 0.30690786242485046, + "learning_rate": 4.999984257208807e-05, + "loss": 0.298, + "step": 5663 + }, + { + "epoch": 0.10102379338636606, + "grad_norm": 0.21446751058101654, + "learning_rate": 4.999983699985348e-05, + "loss": 0.199, + "step": 5664 + }, + { + "epoch": 0.10104162950807977, + "grad_norm": 0.29535484313964844, + "learning_rate": 4.9999831330710875e-05, + "loss": 0.256, + "step": 5665 + }, + { + "epoch": 0.10105946562979345, + "grad_norm": 0.30024975538253784, + "learning_rate": 4.9999825564660295e-05, + "loss": 0.3043, + "step": 5666 + }, + { + "epoch": 0.10107730175150716, + "grad_norm": 0.5012380480766296, + "learning_rate": 4.999981970170176e-05, + "loss": 0.3331, + "step": 5667 + }, + { + "epoch": 0.10109513787322084, + "grad_norm": 0.3446192145347595, + "learning_rate": 4.999981374183529e-05, + "loss": 0.2525, + "step": 5668 + }, + { + "epoch": 0.10111297399493455, + "grad_norm": 0.2471040040254593, + "learning_rate": 4.99998076850609e-05, + "loss": 0.2047, + "step": 5669 + }, + { + "epoch": 0.10113081011664823, + "grad_norm": 0.2476903200149536, + "learning_rate": 4.999980153137862e-05, + "loss": 0.2291, + "step": 5670 + }, + { + "epoch": 0.10114864623836194, + "grad_norm": 0.35032692551612854, + "learning_rate": 4.999979528078849e-05, + "loss": 0.2677, + "step": 5671 + }, + { + "epoch": 0.10116648236007562, + "grad_norm": 0.3351408839225769, + "learning_rate": 4.99997889332905e-05, + "loss": 0.2191, + "step": 5672 + }, + { + "epoch": 0.10118431848178933, + "grad_norm": 0.29639095067977905, + "learning_rate": 4.99997824888847e-05, + "loss": 0.2514, + "step": 5673 + }, + { + "epoch": 0.10120215460350301, + "grad_norm": 0.31570735573768616, + "learning_rate": 4.9999775947571117e-05, + "loss": 0.2302, + "step": 5674 + }, + { + "epoch": 0.10121999072521672, + "grad_norm": 0.4335422217845917, + "learning_rate": 4.9999769309349765e-05, + "loss": 0.2015, + "step": 5675 + }, + { + "epoch": 0.1012378268469304, + "grad_norm": 0.24284081161022186, + "learning_rate": 4.999976257422067e-05, + "loss": 0.2256, + "step": 5676 + }, + { + "epoch": 0.10125566296864409, + "grad_norm": 0.2578946053981781, + "learning_rate": 4.9999755742183854e-05, + "loss": 0.2317, + "step": 5677 + }, + { + "epoch": 0.1012734990903578, + "grad_norm": 0.271124005317688, + "learning_rate": 4.9999748813239355e-05, + "loss": 0.274, + "step": 5678 + }, + { + "epoch": 0.10129133521207148, + "grad_norm": 0.3532411456108093, + "learning_rate": 4.999974178738719e-05, + "loss": 0.3066, + "step": 5679 + }, + { + "epoch": 0.10130917133378518, + "grad_norm": 0.283805251121521, + "learning_rate": 4.99997346646274e-05, + "loss": 0.2396, + "step": 5680 + }, + { + "epoch": 0.10132700745549887, + "grad_norm": 0.4033229947090149, + "learning_rate": 4.9999727444959996e-05, + "loss": 0.2961, + "step": 5681 + }, + { + "epoch": 0.10134484357721257, + "grad_norm": 0.3610017001628876, + "learning_rate": 4.999972012838502e-05, + "loss": 0.2043, + "step": 5682 + }, + { + "epoch": 0.10136267969892626, + "grad_norm": 0.29770293831825256, + "learning_rate": 4.999971271490249e-05, + "loss": 0.2342, + "step": 5683 + }, + { + "epoch": 0.10138051582063996, + "grad_norm": 0.3253975212574005, + "learning_rate": 4.999970520451245e-05, + "loss": 0.2812, + "step": 5684 + }, + { + "epoch": 0.10139835194235365, + "grad_norm": 0.2652103006839752, + "learning_rate": 4.9999697597214905e-05, + "loss": 0.1806, + "step": 5685 + }, + { + "epoch": 0.10141618806406735, + "grad_norm": 0.2191380113363266, + "learning_rate": 4.999968989300991e-05, + "loss": 0.2126, + "step": 5686 + }, + { + "epoch": 0.10143402418578104, + "grad_norm": 0.33241501450538635, + "learning_rate": 4.999968209189746e-05, + "loss": 0.2798, + "step": 5687 + }, + { + "epoch": 0.10145186030749474, + "grad_norm": 0.3966387212276459, + "learning_rate": 4.9999674193877626e-05, + "loss": 0.2326, + "step": 5688 + }, + { + "epoch": 0.10146969642920843, + "grad_norm": 0.34030452370643616, + "learning_rate": 4.9999666198950416e-05, + "loss": 0.2784, + "step": 5689 + }, + { + "epoch": 0.10148753255092213, + "grad_norm": 0.24570925533771515, + "learning_rate": 4.999965810711587e-05, + "loss": 0.259, + "step": 5690 + }, + { + "epoch": 0.10150536867263582, + "grad_norm": 0.3062693178653717, + "learning_rate": 4.9999649918374e-05, + "loss": 0.2246, + "step": 5691 + }, + { + "epoch": 0.10152320479434952, + "grad_norm": 0.3460516333580017, + "learning_rate": 4.999964163272487e-05, + "loss": 0.2719, + "step": 5692 + }, + { + "epoch": 0.10154104091606321, + "grad_norm": 0.35425227880477905, + "learning_rate": 4.999963325016849e-05, + "loss": 0.2319, + "step": 5693 + }, + { + "epoch": 0.10155887703777691, + "grad_norm": 0.3349691331386566, + "learning_rate": 4.999962477070489e-05, + "loss": 0.3157, + "step": 5694 + }, + { + "epoch": 0.1015767131594906, + "grad_norm": 0.36242732405662537, + "learning_rate": 4.999961619433411e-05, + "loss": 0.2333, + "step": 5695 + }, + { + "epoch": 0.10159454928120429, + "grad_norm": 0.2270660549402237, + "learning_rate": 4.999960752105619e-05, + "loss": 0.1958, + "step": 5696 + }, + { + "epoch": 0.10161238540291799, + "grad_norm": 0.3322550058364868, + "learning_rate": 4.999959875087115e-05, + "loss": 0.2724, + "step": 5697 + }, + { + "epoch": 0.10163022152463168, + "grad_norm": 0.42938780784606934, + "learning_rate": 4.999958988377903e-05, + "loss": 0.3261, + "step": 5698 + }, + { + "epoch": 0.10164805764634538, + "grad_norm": 0.3311031758785248, + "learning_rate": 4.999958091977987e-05, + "loss": 0.3073, + "step": 5699 + }, + { + "epoch": 0.10166589376805907, + "grad_norm": 0.38504278659820557, + "learning_rate": 4.99995718588737e-05, + "loss": 0.2384, + "step": 5700 + }, + { + "epoch": 0.10168372988977277, + "grad_norm": 0.274027556180954, + "learning_rate": 4.999956270106055e-05, + "loss": 0.27, + "step": 5701 + }, + { + "epoch": 0.10170156601148646, + "grad_norm": 0.3279690444469452, + "learning_rate": 4.999955344634046e-05, + "loss": 0.2501, + "step": 5702 + }, + { + "epoch": 0.10171940213320016, + "grad_norm": 0.27175334095954895, + "learning_rate": 4.999954409471347e-05, + "loss": 0.2419, + "step": 5703 + }, + { + "epoch": 0.10173723825491385, + "grad_norm": 0.3714657127857208, + "learning_rate": 4.999953464617961e-05, + "loss": 0.2913, + "step": 5704 + }, + { + "epoch": 0.10175507437662755, + "grad_norm": 0.22872139513492584, + "learning_rate": 4.999952510073893e-05, + "loss": 0.2326, + "step": 5705 + }, + { + "epoch": 0.10177291049834124, + "grad_norm": 0.2462315410375595, + "learning_rate": 4.9999515458391445e-05, + "loss": 0.2677, + "step": 5706 + }, + { + "epoch": 0.10179074662005494, + "grad_norm": 0.3426641523838043, + "learning_rate": 4.999950571913721e-05, + "loss": 0.2055, + "step": 5707 + }, + { + "epoch": 0.10180858274176863, + "grad_norm": 0.25017473101615906, + "learning_rate": 4.999949588297625e-05, + "loss": 0.2119, + "step": 5708 + }, + { + "epoch": 0.10182641886348233, + "grad_norm": 0.3377556800842285, + "learning_rate": 4.999948594990861e-05, + "loss": 0.252, + "step": 5709 + }, + { + "epoch": 0.10184425498519602, + "grad_norm": 0.2705136835575104, + "learning_rate": 4.9999475919934335e-05, + "loss": 0.2757, + "step": 5710 + }, + { + "epoch": 0.10186209110690972, + "grad_norm": 0.2471962869167328, + "learning_rate": 4.999946579305345e-05, + "loss": 0.2451, + "step": 5711 + }, + { + "epoch": 0.10187992722862341, + "grad_norm": 0.37865251302719116, + "learning_rate": 4.9999455569266e-05, + "loss": 0.2051, + "step": 5712 + }, + { + "epoch": 0.10189776335033711, + "grad_norm": 0.24027401208877563, + "learning_rate": 4.999944524857203e-05, + "loss": 0.2222, + "step": 5713 + }, + { + "epoch": 0.1019155994720508, + "grad_norm": 0.26842200756073, + "learning_rate": 4.999943483097157e-05, + "loss": 0.2364, + "step": 5714 + }, + { + "epoch": 0.1019334355937645, + "grad_norm": 0.30966103076934814, + "learning_rate": 4.999942431646467e-05, + "loss": 0.2507, + "step": 5715 + }, + { + "epoch": 0.10195127171547819, + "grad_norm": 0.3614092469215393, + "learning_rate": 4.999941370505137e-05, + "loss": 0.2796, + "step": 5716 + }, + { + "epoch": 0.10196910783719187, + "grad_norm": 0.30434978008270264, + "learning_rate": 4.99994029967317e-05, + "loss": 0.2268, + "step": 5717 + }, + { + "epoch": 0.10198694395890558, + "grad_norm": 0.35362622141838074, + "learning_rate": 4.999939219150572e-05, + "loss": 0.3003, + "step": 5718 + }, + { + "epoch": 0.10200478008061926, + "grad_norm": 0.28988972306251526, + "learning_rate": 4.9999381289373454e-05, + "loss": 0.2105, + "step": 5719 + }, + { + "epoch": 0.10202261620233297, + "grad_norm": 0.4682668149471283, + "learning_rate": 4.9999370290334955e-05, + "loss": 0.2407, + "step": 5720 + }, + { + "epoch": 0.10204045232404665, + "grad_norm": 0.3408457338809967, + "learning_rate": 4.999935919439026e-05, + "loss": 0.247, + "step": 5721 + }, + { + "epoch": 0.10205828844576036, + "grad_norm": 0.31839820742607117, + "learning_rate": 4.999934800153942e-05, + "loss": 0.2188, + "step": 5722 + }, + { + "epoch": 0.10207612456747404, + "grad_norm": 0.25762316584587097, + "learning_rate": 4.9999336711782466e-05, + "loss": 0.1989, + "step": 5723 + }, + { + "epoch": 0.10209396068918775, + "grad_norm": 0.39140570163726807, + "learning_rate": 4.9999325325119444e-05, + "loss": 0.2484, + "step": 5724 + }, + { + "epoch": 0.10211179681090143, + "grad_norm": 0.26773861050605774, + "learning_rate": 4.999931384155041e-05, + "loss": 0.2661, + "step": 5725 + }, + { + "epoch": 0.10212963293261514, + "grad_norm": 0.39659813046455383, + "learning_rate": 4.9999302261075395e-05, + "loss": 0.2233, + "step": 5726 + }, + { + "epoch": 0.10214746905432882, + "grad_norm": 0.3014323115348816, + "learning_rate": 4.9999290583694456e-05, + "loss": 0.2384, + "step": 5727 + }, + { + "epoch": 0.10216530517604253, + "grad_norm": 0.27936092019081116, + "learning_rate": 4.9999278809407636e-05, + "loss": 0.2289, + "step": 5728 + }, + { + "epoch": 0.10218314129775621, + "grad_norm": 0.26172640919685364, + "learning_rate": 4.999926693821497e-05, + "loss": 0.2483, + "step": 5729 + }, + { + "epoch": 0.10220097741946992, + "grad_norm": 0.4068194627761841, + "learning_rate": 4.999925497011651e-05, + "loss": 0.2644, + "step": 5730 + }, + { + "epoch": 0.1022188135411836, + "grad_norm": 0.2824203073978424, + "learning_rate": 4.999924290511231e-05, + "loss": 0.2548, + "step": 5731 + }, + { + "epoch": 0.1022366496628973, + "grad_norm": 0.28223201632499695, + "learning_rate": 4.9999230743202404e-05, + "loss": 0.2183, + "step": 5732 + }, + { + "epoch": 0.102254485784611, + "grad_norm": 0.34141239523887634, + "learning_rate": 4.9999218484386846e-05, + "loss": 0.2794, + "step": 5733 + }, + { + "epoch": 0.1022723219063247, + "grad_norm": 0.3454535901546478, + "learning_rate": 4.9999206128665684e-05, + "loss": 0.2327, + "step": 5734 + }, + { + "epoch": 0.10229015802803838, + "grad_norm": 0.2345094233751297, + "learning_rate": 4.999919367603896e-05, + "loss": 0.2354, + "step": 5735 + }, + { + "epoch": 0.10230799414975207, + "grad_norm": 0.2947339415550232, + "learning_rate": 4.999918112650673e-05, + "loss": 0.2253, + "step": 5736 + }, + { + "epoch": 0.10232583027146577, + "grad_norm": 0.4752649664878845, + "learning_rate": 4.999916848006904e-05, + "loss": 0.2339, + "step": 5737 + }, + { + "epoch": 0.10234366639317946, + "grad_norm": 0.331943541765213, + "learning_rate": 4.9999155736725945e-05, + "loss": 0.2462, + "step": 5738 + }, + { + "epoch": 0.10236150251489316, + "grad_norm": 0.31686171889305115, + "learning_rate": 4.999914289647748e-05, + "loss": 0.2493, + "step": 5739 + }, + { + "epoch": 0.10237933863660685, + "grad_norm": 0.3673936724662781, + "learning_rate": 4.9999129959323705e-05, + "loss": 0.2504, + "step": 5740 + }, + { + "epoch": 0.10239717475832055, + "grad_norm": 0.4037603735923767, + "learning_rate": 4.9999116925264664e-05, + "loss": 0.2993, + "step": 5741 + }, + { + "epoch": 0.10241501088003424, + "grad_norm": 0.3081677556037903, + "learning_rate": 4.999910379430042e-05, + "loss": 0.211, + "step": 5742 + }, + { + "epoch": 0.10243284700174794, + "grad_norm": 0.3046434819698334, + "learning_rate": 4.9999090566431e-05, + "loss": 0.2212, + "step": 5743 + }, + { + "epoch": 0.10245068312346163, + "grad_norm": 0.2917368710041046, + "learning_rate": 4.999907724165649e-05, + "loss": 0.2807, + "step": 5744 + }, + { + "epoch": 0.10246851924517533, + "grad_norm": 0.40417367219924927, + "learning_rate": 4.999906381997691e-05, + "loss": 0.2992, + "step": 5745 + }, + { + "epoch": 0.10248635536688902, + "grad_norm": 0.3042461574077606, + "learning_rate": 4.9999050301392324e-05, + "loss": 0.2463, + "step": 5746 + }, + { + "epoch": 0.10250419148860272, + "grad_norm": 0.22511489689350128, + "learning_rate": 4.999903668590279e-05, + "loss": 0.2004, + "step": 5747 + }, + { + "epoch": 0.10252202761031641, + "grad_norm": 0.241920605301857, + "learning_rate": 4.9999022973508357e-05, + "loss": 0.2162, + "step": 5748 + }, + { + "epoch": 0.10253986373203011, + "grad_norm": 0.49574771523475647, + "learning_rate": 4.999900916420907e-05, + "loss": 0.3038, + "step": 5749 + }, + { + "epoch": 0.1025576998537438, + "grad_norm": 0.28990188241004944, + "learning_rate": 4.9998995258004996e-05, + "loss": 0.2031, + "step": 5750 + }, + { + "epoch": 0.1025755359754575, + "grad_norm": 0.42998576164245605, + "learning_rate": 4.999898125489617e-05, + "loss": 0.2422, + "step": 5751 + }, + { + "epoch": 0.10259337209717119, + "grad_norm": 0.5102670788764954, + "learning_rate": 4.999896715488267e-05, + "loss": 0.212, + "step": 5752 + }, + { + "epoch": 0.10261120821888489, + "grad_norm": 0.36709269881248474, + "learning_rate": 4.999895295796453e-05, + "loss": 0.2441, + "step": 5753 + }, + { + "epoch": 0.10262904434059858, + "grad_norm": 0.3221031725406647, + "learning_rate": 4.999893866414183e-05, + "loss": 0.3232, + "step": 5754 + }, + { + "epoch": 0.10264688046231228, + "grad_norm": 0.3416358530521393, + "learning_rate": 4.999892427341459e-05, + "loss": 0.2875, + "step": 5755 + }, + { + "epoch": 0.10266471658402597, + "grad_norm": 0.3036465346813202, + "learning_rate": 4.99989097857829e-05, + "loss": 0.2678, + "step": 5756 + }, + { + "epoch": 0.10268255270573966, + "grad_norm": 0.2555624842643738, + "learning_rate": 4.9998895201246795e-05, + "loss": 0.2577, + "step": 5757 + }, + { + "epoch": 0.10270038882745336, + "grad_norm": 0.3073364794254303, + "learning_rate": 4.999888051980634e-05, + "loss": 0.2499, + "step": 5758 + }, + { + "epoch": 0.10271822494916705, + "grad_norm": 0.4039995074272156, + "learning_rate": 4.9998865741461584e-05, + "loss": 0.287, + "step": 5759 + }, + { + "epoch": 0.10273606107088075, + "grad_norm": 0.28116482496261597, + "learning_rate": 4.9998850866212595e-05, + "loss": 0.2452, + "step": 5760 + }, + { + "epoch": 0.10275389719259444, + "grad_norm": 0.29124900698661804, + "learning_rate": 4.999883589405942e-05, + "loss": 0.2399, + "step": 5761 + }, + { + "epoch": 0.10277173331430814, + "grad_norm": 0.34035050868988037, + "learning_rate": 4.999882082500213e-05, + "loss": 0.2678, + "step": 5762 + }, + { + "epoch": 0.10278956943602183, + "grad_norm": 0.3873896896839142, + "learning_rate": 4.999880565904077e-05, + "loss": 0.3366, + "step": 5763 + }, + { + "epoch": 0.10280740555773553, + "grad_norm": 0.23793981969356537, + "learning_rate": 4.999879039617541e-05, + "loss": 0.205, + "step": 5764 + }, + { + "epoch": 0.10282524167944922, + "grad_norm": 0.26890796422958374, + "learning_rate": 4.9998775036406104e-05, + "loss": 0.1626, + "step": 5765 + }, + { + "epoch": 0.10284307780116292, + "grad_norm": 0.2527380883693695, + "learning_rate": 4.999875957973291e-05, + "loss": 0.2383, + "step": 5766 + }, + { + "epoch": 0.1028609139228766, + "grad_norm": 0.27098050713539124, + "learning_rate": 4.999874402615588e-05, + "loss": 0.228, + "step": 5767 + }, + { + "epoch": 0.10287875004459031, + "grad_norm": 0.39944028854370117, + "learning_rate": 4.99987283756751e-05, + "loss": 0.2379, + "step": 5768 + }, + { + "epoch": 0.102896586166304, + "grad_norm": 0.294586181640625, + "learning_rate": 4.999871262829061e-05, + "loss": 0.262, + "step": 5769 + }, + { + "epoch": 0.1029144222880177, + "grad_norm": 0.3405201733112335, + "learning_rate": 4.9998696784002476e-05, + "loss": 0.2371, + "step": 5770 + }, + { + "epoch": 0.10293225840973139, + "grad_norm": 0.2673748731613159, + "learning_rate": 4.999868084281075e-05, + "loss": 0.199, + "step": 5771 + }, + { + "epoch": 0.10295009453144509, + "grad_norm": 0.25981229543685913, + "learning_rate": 4.9998664804715514e-05, + "loss": 0.2129, + "step": 5772 + }, + { + "epoch": 0.10296793065315878, + "grad_norm": 0.2996998131275177, + "learning_rate": 4.9998648669716816e-05, + "loss": 0.2112, + "step": 5773 + }, + { + "epoch": 0.10298576677487248, + "grad_norm": 0.29292044043540955, + "learning_rate": 4.9998632437814715e-05, + "loss": 0.2557, + "step": 5774 + }, + { + "epoch": 0.10300360289658617, + "grad_norm": 0.3624270260334015, + "learning_rate": 4.999861610900929e-05, + "loss": 0.2401, + "step": 5775 + }, + { + "epoch": 0.10302143901829985, + "grad_norm": 0.29216569662094116, + "learning_rate": 4.999859968330059e-05, + "loss": 0.2192, + "step": 5776 + }, + { + "epoch": 0.10303927514001356, + "grad_norm": 0.3137858510017395, + "learning_rate": 4.999858316068868e-05, + "loss": 0.2675, + "step": 5777 + }, + { + "epoch": 0.10305711126172724, + "grad_norm": 0.3349331021308899, + "learning_rate": 4.999856654117363e-05, + "loss": 0.259, + "step": 5778 + }, + { + "epoch": 0.10307494738344095, + "grad_norm": 0.3540928363800049, + "learning_rate": 4.9998549824755506e-05, + "loss": 0.2862, + "step": 5779 + }, + { + "epoch": 0.10309278350515463, + "grad_norm": 0.3264806568622589, + "learning_rate": 4.9998533011434365e-05, + "loss": 0.2485, + "step": 5780 + }, + { + "epoch": 0.10311061962686834, + "grad_norm": 0.3131437301635742, + "learning_rate": 4.9998516101210276e-05, + "loss": 0.285, + "step": 5781 + }, + { + "epoch": 0.10312845574858202, + "grad_norm": 0.3235114812850952, + "learning_rate": 4.999849909408331e-05, + "loss": 0.1961, + "step": 5782 + }, + { + "epoch": 0.10314629187029573, + "grad_norm": 0.2606179118156433, + "learning_rate": 4.999848199005351e-05, + "loss": 0.2159, + "step": 5783 + }, + { + "epoch": 0.10316412799200941, + "grad_norm": 0.3242022395133972, + "learning_rate": 4.999846478912098e-05, + "loss": 0.2388, + "step": 5784 + }, + { + "epoch": 0.10318196411372312, + "grad_norm": 0.3644527792930603, + "learning_rate": 4.999844749128576e-05, + "loss": 0.2711, + "step": 5785 + }, + { + "epoch": 0.1031998002354368, + "grad_norm": 0.2748754024505615, + "learning_rate": 4.999843009654791e-05, + "loss": 0.2912, + "step": 5786 + }, + { + "epoch": 0.1032176363571505, + "grad_norm": 0.25084471702575684, + "learning_rate": 4.999841260490753e-05, + "loss": 0.246, + "step": 5787 + }, + { + "epoch": 0.1032354724788642, + "grad_norm": 0.30010634660720825, + "learning_rate": 4.9998395016364655e-05, + "loss": 0.2393, + "step": 5788 + }, + { + "epoch": 0.1032533086005779, + "grad_norm": 0.24173887073993683, + "learning_rate": 4.999837733091938e-05, + "loss": 0.2316, + "step": 5789 + }, + { + "epoch": 0.10327114472229158, + "grad_norm": 0.3200397789478302, + "learning_rate": 4.999835954857175e-05, + "loss": 0.2771, + "step": 5790 + }, + { + "epoch": 0.10328898084400528, + "grad_norm": 0.38963747024536133, + "learning_rate": 4.999834166932185e-05, + "loss": 0.2775, + "step": 5791 + }, + { + "epoch": 0.10330681696571897, + "grad_norm": 0.38244643807411194, + "learning_rate": 4.999832369316973e-05, + "loss": 0.2183, + "step": 5792 + }, + { + "epoch": 0.10332465308743267, + "grad_norm": 0.35715675354003906, + "learning_rate": 4.999830562011549e-05, + "loss": 0.2464, + "step": 5793 + }, + { + "epoch": 0.10334248920914636, + "grad_norm": 0.3591226637363434, + "learning_rate": 4.999828745015917e-05, + "loss": 0.2362, + "step": 5794 + }, + { + "epoch": 0.10336032533086006, + "grad_norm": 0.33126264810562134, + "learning_rate": 4.999826918330086e-05, + "loss": 0.253, + "step": 5795 + }, + { + "epoch": 0.10337816145257375, + "grad_norm": 0.316311776638031, + "learning_rate": 4.9998250819540625e-05, + "loss": 0.2309, + "step": 5796 + }, + { + "epoch": 0.10339599757428744, + "grad_norm": 0.3163365423679352, + "learning_rate": 4.999823235887854e-05, + "loss": 0.235, + "step": 5797 + }, + { + "epoch": 0.10341383369600114, + "grad_norm": 0.30197620391845703, + "learning_rate": 4.999821380131466e-05, + "loss": 0.2085, + "step": 5798 + }, + { + "epoch": 0.10343166981771483, + "grad_norm": 0.30523353815078735, + "learning_rate": 4.9998195146849084e-05, + "loss": 0.2587, + "step": 5799 + }, + { + "epoch": 0.10344950593942853, + "grad_norm": 0.35222527384757996, + "learning_rate": 4.9998176395481865e-05, + "loss": 0.2618, + "step": 5800 + }, + { + "epoch": 0.10346734206114222, + "grad_norm": 0.2785562574863434, + "learning_rate": 4.999815754721307e-05, + "loss": 0.2285, + "step": 5801 + }, + { + "epoch": 0.10348517818285592, + "grad_norm": 0.2471916824579239, + "learning_rate": 4.99981386020428e-05, + "loss": 0.2459, + "step": 5802 + }, + { + "epoch": 0.10350301430456961, + "grad_norm": 0.2487059384584427, + "learning_rate": 4.999811955997109e-05, + "loss": 0.2219, + "step": 5803 + }, + { + "epoch": 0.10352085042628331, + "grad_norm": 0.3147665560245514, + "learning_rate": 4.999810042099805e-05, + "loss": 0.2411, + "step": 5804 + }, + { + "epoch": 0.103538686547997, + "grad_norm": 0.292324036359787, + "learning_rate": 4.999808118512373e-05, + "loss": 0.2462, + "step": 5805 + }, + { + "epoch": 0.1035565226697107, + "grad_norm": 0.33168265223503113, + "learning_rate": 4.999806185234822e-05, + "loss": 0.2359, + "step": 5806 + }, + { + "epoch": 0.10357435879142439, + "grad_norm": 0.29198211431503296, + "learning_rate": 4.999804242267159e-05, + "loss": 0.2226, + "step": 5807 + }, + { + "epoch": 0.10359219491313809, + "grad_norm": 0.2885533571243286, + "learning_rate": 4.999802289609391e-05, + "loss": 0.1946, + "step": 5808 + }, + { + "epoch": 0.10361003103485178, + "grad_norm": 0.32933491468429565, + "learning_rate": 4.9998003272615256e-05, + "loss": 0.31, + "step": 5809 + }, + { + "epoch": 0.10362786715656548, + "grad_norm": 0.3106703460216522, + "learning_rate": 4.999798355223571e-05, + "loss": 0.2462, + "step": 5810 + }, + { + "epoch": 0.10364570327827917, + "grad_norm": 0.351720929145813, + "learning_rate": 4.999796373495535e-05, + "loss": 0.2698, + "step": 5811 + }, + { + "epoch": 0.10366353939999287, + "grad_norm": 0.2700147032737732, + "learning_rate": 4.999794382077424e-05, + "loss": 0.2287, + "step": 5812 + }, + { + "epoch": 0.10368137552170656, + "grad_norm": 0.2648630440235138, + "learning_rate": 4.999792380969247e-05, + "loss": 0.233, + "step": 5813 + }, + { + "epoch": 0.10369921164342026, + "grad_norm": 0.39086654782295227, + "learning_rate": 4.999790370171011e-05, + "loss": 0.2633, + "step": 5814 + }, + { + "epoch": 0.10371704776513395, + "grad_norm": 0.43334561586380005, + "learning_rate": 4.999788349682725e-05, + "loss": 0.3197, + "step": 5815 + }, + { + "epoch": 0.10373488388684764, + "grad_norm": 0.27810490131378174, + "learning_rate": 4.999786319504395e-05, + "loss": 0.2838, + "step": 5816 + }, + { + "epoch": 0.10375272000856134, + "grad_norm": 0.2762657403945923, + "learning_rate": 4.99978427963603e-05, + "loss": 0.2933, + "step": 5817 + }, + { + "epoch": 0.10377055613027503, + "grad_norm": 0.26107022166252136, + "learning_rate": 4.999782230077638e-05, + "loss": 0.238, + "step": 5818 + }, + { + "epoch": 0.10378839225198873, + "grad_norm": 0.31598934531211853, + "learning_rate": 4.999780170829227e-05, + "loss": 0.2712, + "step": 5819 + }, + { + "epoch": 0.10380622837370242, + "grad_norm": 0.2611783444881439, + "learning_rate": 4.999778101890804e-05, + "loss": 0.2322, + "step": 5820 + }, + { + "epoch": 0.10382406449541612, + "grad_norm": 0.3476533889770508, + "learning_rate": 4.9997760232623784e-05, + "loss": 0.3011, + "step": 5821 + }, + { + "epoch": 0.1038419006171298, + "grad_norm": 0.3373652994632721, + "learning_rate": 4.9997739349439564e-05, + "loss": 0.2402, + "step": 5822 + }, + { + "epoch": 0.10385973673884351, + "grad_norm": 0.2581215500831604, + "learning_rate": 4.9997718369355486e-05, + "loss": 0.1991, + "step": 5823 + }, + { + "epoch": 0.1038775728605572, + "grad_norm": 0.2656016945838928, + "learning_rate": 4.9997697292371605e-05, + "loss": 0.2623, + "step": 5824 + }, + { + "epoch": 0.1038954089822709, + "grad_norm": 0.3778791129589081, + "learning_rate": 4.999767611848802e-05, + "loss": 0.2231, + "step": 5825 + }, + { + "epoch": 0.10391324510398459, + "grad_norm": 0.32868921756744385, + "learning_rate": 4.999765484770481e-05, + "loss": 0.217, + "step": 5826 + }, + { + "epoch": 0.10393108122569829, + "grad_norm": 0.3548049032688141, + "learning_rate": 4.9997633480022056e-05, + "loss": 0.2353, + "step": 5827 + }, + { + "epoch": 0.10394891734741198, + "grad_norm": 0.35475409030914307, + "learning_rate": 4.999761201543984e-05, + "loss": 0.2562, + "step": 5828 + }, + { + "epoch": 0.10396675346912568, + "grad_norm": 0.33357155323028564, + "learning_rate": 4.999759045395825e-05, + "loss": 0.2434, + "step": 5829 + }, + { + "epoch": 0.10398458959083937, + "grad_norm": 0.43696609139442444, + "learning_rate": 4.999756879557736e-05, + "loss": 0.3114, + "step": 5830 + }, + { + "epoch": 0.10400242571255307, + "grad_norm": 0.2838720977306366, + "learning_rate": 4.999754704029726e-05, + "loss": 0.2984, + "step": 5831 + }, + { + "epoch": 0.10402026183426676, + "grad_norm": 0.38694697618484497, + "learning_rate": 4.9997525188118034e-05, + "loss": 0.2429, + "step": 5832 + }, + { + "epoch": 0.10403809795598046, + "grad_norm": 0.37511295080184937, + "learning_rate": 4.9997503239039764e-05, + "loss": 0.2851, + "step": 5833 + }, + { + "epoch": 0.10405593407769415, + "grad_norm": 0.46333423256874084, + "learning_rate": 4.9997481193062544e-05, + "loss": 0.3164, + "step": 5834 + }, + { + "epoch": 0.10407377019940785, + "grad_norm": 0.29584962129592896, + "learning_rate": 4.999745905018645e-05, + "loss": 0.2635, + "step": 5835 + }, + { + "epoch": 0.10409160632112154, + "grad_norm": 0.35017719864845276, + "learning_rate": 4.9997436810411575e-05, + "loss": 0.2436, + "step": 5836 + }, + { + "epoch": 0.10410944244283522, + "grad_norm": 0.3249483108520508, + "learning_rate": 4.9997414473737994e-05, + "loss": 0.2944, + "step": 5837 + }, + { + "epoch": 0.10412727856454893, + "grad_norm": 0.3080081343650818, + "learning_rate": 4.99973920401658e-05, + "loss": 0.2757, + "step": 5838 + }, + { + "epoch": 0.10414511468626261, + "grad_norm": 0.4317755401134491, + "learning_rate": 4.999736950969509e-05, + "loss": 0.351, + "step": 5839 + }, + { + "epoch": 0.10416295080797632, + "grad_norm": 0.4012046456336975, + "learning_rate": 4.999734688232593e-05, + "loss": 0.319, + "step": 5840 + }, + { + "epoch": 0.10418078692969, + "grad_norm": 0.3114517033100128, + "learning_rate": 4.999732415805844e-05, + "loss": 0.2468, + "step": 5841 + }, + { + "epoch": 0.1041986230514037, + "grad_norm": 0.41210392117500305, + "learning_rate": 4.999730133689266e-05, + "loss": 0.3366, + "step": 5842 + }, + { + "epoch": 0.10421645917311739, + "grad_norm": 0.38541272282600403, + "learning_rate": 4.9997278418828725e-05, + "loss": 0.3144, + "step": 5843 + }, + { + "epoch": 0.1042342952948311, + "grad_norm": 0.34774795174598694, + "learning_rate": 4.9997255403866705e-05, + "loss": 0.2748, + "step": 5844 + }, + { + "epoch": 0.10425213141654478, + "grad_norm": 0.4897577464580536, + "learning_rate": 4.999723229200668e-05, + "loss": 0.2691, + "step": 5845 + }, + { + "epoch": 0.10426996753825848, + "grad_norm": 0.36025020480155945, + "learning_rate": 4.999720908324875e-05, + "loss": 0.2654, + "step": 5846 + }, + { + "epoch": 0.10428780365997217, + "grad_norm": 0.3165666460990906, + "learning_rate": 4.999718577759301e-05, + "loss": 0.2814, + "step": 5847 + }, + { + "epoch": 0.10430563978168587, + "grad_norm": 0.25389689207077026, + "learning_rate": 4.9997162375039544e-05, + "loss": 0.2511, + "step": 5848 + }, + { + "epoch": 0.10432347590339956, + "grad_norm": 0.3201467990875244, + "learning_rate": 4.999713887558844e-05, + "loss": 0.2896, + "step": 5849 + }, + { + "epoch": 0.10434131202511326, + "grad_norm": 0.34935447573661804, + "learning_rate": 4.999711527923979e-05, + "loss": 0.2486, + "step": 5850 + }, + { + "epoch": 0.10435914814682695, + "grad_norm": 0.3226553797721863, + "learning_rate": 4.9997091585993695e-05, + "loss": 0.2357, + "step": 5851 + }, + { + "epoch": 0.10437698426854065, + "grad_norm": 0.31256410479545593, + "learning_rate": 4.999706779585023e-05, + "loss": 0.2493, + "step": 5852 + }, + { + "epoch": 0.10439482039025434, + "grad_norm": 0.34214910864830017, + "learning_rate": 4.99970439088095e-05, + "loss": 0.2646, + "step": 5853 + }, + { + "epoch": 0.10441265651196804, + "grad_norm": 0.40745365619659424, + "learning_rate": 4.999701992487159e-05, + "loss": 0.2674, + "step": 5854 + }, + { + "epoch": 0.10443049263368173, + "grad_norm": 0.2790455222129822, + "learning_rate": 4.999699584403661e-05, + "loss": 0.2163, + "step": 5855 + }, + { + "epoch": 0.10444832875539543, + "grad_norm": 0.2634226083755493, + "learning_rate": 4.999697166630463e-05, + "loss": 0.2138, + "step": 5856 + }, + { + "epoch": 0.10446616487710912, + "grad_norm": 0.33517295122146606, + "learning_rate": 4.999694739167575e-05, + "loss": 0.2415, + "step": 5857 + }, + { + "epoch": 0.10448400099882281, + "grad_norm": 0.34214141964912415, + "learning_rate": 4.999692302015008e-05, + "loss": 0.2779, + "step": 5858 + }, + { + "epoch": 0.10450183712053651, + "grad_norm": 0.3027956187725067, + "learning_rate": 4.9996898551727694e-05, + "loss": 0.2355, + "step": 5859 + }, + { + "epoch": 0.1045196732422502, + "grad_norm": 0.2922017574310303, + "learning_rate": 4.99968739864087e-05, + "loss": 0.2166, + "step": 5860 + }, + { + "epoch": 0.1045375093639639, + "grad_norm": 0.3346848487854004, + "learning_rate": 4.999684932419318e-05, + "loss": 0.26, + "step": 5861 + }, + { + "epoch": 0.10455534548567759, + "grad_norm": 0.2706536650657654, + "learning_rate": 4.9996824565081254e-05, + "loss": 0.2521, + "step": 5862 + }, + { + "epoch": 0.10457318160739129, + "grad_norm": 0.3135501742362976, + "learning_rate": 4.9996799709073e-05, + "loss": 0.2548, + "step": 5863 + }, + { + "epoch": 0.10459101772910498, + "grad_norm": 0.32489100098609924, + "learning_rate": 4.999677475616851e-05, + "loss": 0.2502, + "step": 5864 + }, + { + "epoch": 0.10460885385081868, + "grad_norm": 0.3062833547592163, + "learning_rate": 4.999674970636788e-05, + "loss": 0.2481, + "step": 5865 + }, + { + "epoch": 0.10462668997253237, + "grad_norm": 0.29157453775405884, + "learning_rate": 4.999672455967123e-05, + "loss": 0.2616, + "step": 5866 + }, + { + "epoch": 0.10464452609424607, + "grad_norm": 0.48722150921821594, + "learning_rate": 4.999669931607863e-05, + "loss": 0.2408, + "step": 5867 + }, + { + "epoch": 0.10466236221595976, + "grad_norm": 0.3130400478839874, + "learning_rate": 4.999667397559019e-05, + "loss": 0.2491, + "step": 5868 + }, + { + "epoch": 0.10468019833767346, + "grad_norm": 0.32957836985588074, + "learning_rate": 4.9996648538206015e-05, + "loss": 0.2785, + "step": 5869 + }, + { + "epoch": 0.10469803445938715, + "grad_norm": 0.5479748845100403, + "learning_rate": 4.99966230039262e-05, + "loss": 0.3644, + "step": 5870 + }, + { + "epoch": 0.10471587058110085, + "grad_norm": 0.28692811727523804, + "learning_rate": 4.999659737275083e-05, + "loss": 0.2437, + "step": 5871 + }, + { + "epoch": 0.10473370670281454, + "grad_norm": 0.33135369420051575, + "learning_rate": 4.9996571644680024e-05, + "loss": 0.2929, + "step": 5872 + }, + { + "epoch": 0.10475154282452824, + "grad_norm": 0.2850666046142578, + "learning_rate": 4.999654581971387e-05, + "loss": 0.2455, + "step": 5873 + }, + { + "epoch": 0.10476937894624193, + "grad_norm": 0.3062702417373657, + "learning_rate": 4.9996519897852464e-05, + "loss": 0.2169, + "step": 5874 + }, + { + "epoch": 0.10478721506795563, + "grad_norm": 0.3672844469547272, + "learning_rate": 4.9996493879095925e-05, + "loss": 0.2922, + "step": 5875 + }, + { + "epoch": 0.10480505118966932, + "grad_norm": 0.33063754439353943, + "learning_rate": 4.999646776344433e-05, + "loss": 0.2719, + "step": 5876 + }, + { + "epoch": 0.104822887311383, + "grad_norm": 0.39049437642097473, + "learning_rate": 4.99964415508978e-05, + "loss": 0.2576, + "step": 5877 + }, + { + "epoch": 0.10484072343309671, + "grad_norm": 0.41826221346855164, + "learning_rate": 4.999641524145643e-05, + "loss": 0.2655, + "step": 5878 + }, + { + "epoch": 0.1048585595548104, + "grad_norm": 0.3125578463077545, + "learning_rate": 4.9996388835120325e-05, + "loss": 0.257, + "step": 5879 + }, + { + "epoch": 0.1048763956765241, + "grad_norm": 0.24703185260295868, + "learning_rate": 4.9996362331889576e-05, + "loss": 0.2294, + "step": 5880 + }, + { + "epoch": 0.10489423179823779, + "grad_norm": 0.3470175564289093, + "learning_rate": 4.9996335731764296e-05, + "loss": 0.2481, + "step": 5881 + }, + { + "epoch": 0.10491206791995149, + "grad_norm": 0.2972065806388855, + "learning_rate": 4.999630903474458e-05, + "loss": 0.2645, + "step": 5882 + }, + { + "epoch": 0.10492990404166518, + "grad_norm": 0.3808644711971283, + "learning_rate": 4.999628224083054e-05, + "loss": 0.2175, + "step": 5883 + }, + { + "epoch": 0.10494774016337888, + "grad_norm": 0.22417323291301727, + "learning_rate": 4.999625535002228e-05, + "loss": 0.2087, + "step": 5884 + }, + { + "epoch": 0.10496557628509257, + "grad_norm": 0.3009660840034485, + "learning_rate": 4.99962283623199e-05, + "loss": 0.2511, + "step": 5885 + }, + { + "epoch": 0.10498341240680627, + "grad_norm": 0.20104964077472687, + "learning_rate": 4.99962012777235e-05, + "loss": 0.2113, + "step": 5886 + }, + { + "epoch": 0.10500124852851996, + "grad_norm": 0.24783194065093994, + "learning_rate": 4.999617409623319e-05, + "loss": 0.236, + "step": 5887 + }, + { + "epoch": 0.10501908465023366, + "grad_norm": 0.3385120630264282, + "learning_rate": 4.9996146817849084e-05, + "loss": 0.2301, + "step": 5888 + }, + { + "epoch": 0.10503692077194735, + "grad_norm": 0.312938392162323, + "learning_rate": 4.999611944257128e-05, + "loss": 0.2524, + "step": 5889 + }, + { + "epoch": 0.10505475689366105, + "grad_norm": 0.24730493128299713, + "learning_rate": 4.999609197039987e-05, + "loss": 0.2224, + "step": 5890 + }, + { + "epoch": 0.10507259301537474, + "grad_norm": 0.3070394694805145, + "learning_rate": 4.999606440133499e-05, + "loss": 0.2688, + "step": 5891 + }, + { + "epoch": 0.10509042913708844, + "grad_norm": 0.3449821472167969, + "learning_rate": 4.999603673537672e-05, + "loss": 0.2964, + "step": 5892 + }, + { + "epoch": 0.10510826525880212, + "grad_norm": 0.4380045235157013, + "learning_rate": 4.9996008972525184e-05, + "loss": 0.2741, + "step": 5893 + }, + { + "epoch": 0.10512610138051583, + "grad_norm": 0.34592729806900024, + "learning_rate": 4.999598111278048e-05, + "loss": 0.2765, + "step": 5894 + }, + { + "epoch": 0.10514393750222951, + "grad_norm": 0.3012355864048004, + "learning_rate": 4.999595315614272e-05, + "loss": 0.2323, + "step": 5895 + }, + { + "epoch": 0.10516177362394322, + "grad_norm": 0.280671626329422, + "learning_rate": 4.999592510261202e-05, + "loss": 0.2605, + "step": 5896 + }, + { + "epoch": 0.1051796097456569, + "grad_norm": 0.3939959704875946, + "learning_rate": 4.999589695218847e-05, + "loss": 0.3428, + "step": 5897 + }, + { + "epoch": 0.10519744586737059, + "grad_norm": 0.34676697850227356, + "learning_rate": 4.9995868704872195e-05, + "loss": 0.1837, + "step": 5898 + }, + { + "epoch": 0.1052152819890843, + "grad_norm": 0.30784639716148376, + "learning_rate": 4.9995840360663305e-05, + "loss": 0.2864, + "step": 5899 + }, + { + "epoch": 0.10523311811079798, + "grad_norm": 0.4207679331302643, + "learning_rate": 4.9995811919561895e-05, + "loss": 0.3062, + "step": 5900 + }, + { + "epoch": 0.10525095423251168, + "grad_norm": 0.28703030943870544, + "learning_rate": 4.9995783381568095e-05, + "loss": 0.2601, + "step": 5901 + }, + { + "epoch": 0.10526879035422537, + "grad_norm": 0.34877583384513855, + "learning_rate": 4.9995754746682e-05, + "loss": 0.2362, + "step": 5902 + }, + { + "epoch": 0.10528662647593907, + "grad_norm": 0.24295946955680847, + "learning_rate": 4.999572601490372e-05, + "loss": 0.2107, + "step": 5903 + }, + { + "epoch": 0.10530446259765276, + "grad_norm": 0.31496644020080566, + "learning_rate": 4.999569718623338e-05, + "loss": 0.2445, + "step": 5904 + }, + { + "epoch": 0.10532229871936646, + "grad_norm": 0.34387901425361633, + "learning_rate": 4.999566826067108e-05, + "loss": 0.2827, + "step": 5905 + }, + { + "epoch": 0.10534013484108015, + "grad_norm": 0.3703104257583618, + "learning_rate": 4.9995639238216944e-05, + "loss": 0.2736, + "step": 5906 + }, + { + "epoch": 0.10535797096279385, + "grad_norm": 0.3689406216144562, + "learning_rate": 4.999561011887107e-05, + "loss": 0.2828, + "step": 5907 + }, + { + "epoch": 0.10537580708450754, + "grad_norm": 0.30629584193229675, + "learning_rate": 4.9995580902633584e-05, + "loss": 0.2434, + "step": 5908 + }, + { + "epoch": 0.10539364320622124, + "grad_norm": 0.4159982204437256, + "learning_rate": 4.9995551589504586e-05, + "loss": 0.2585, + "step": 5909 + }, + { + "epoch": 0.10541147932793493, + "grad_norm": 0.37532517313957214, + "learning_rate": 4.999552217948421e-05, + "loss": 0.2657, + "step": 5910 + }, + { + "epoch": 0.10542931544964863, + "grad_norm": 0.302166223526001, + "learning_rate": 4.999549267257254e-05, + "loss": 0.2633, + "step": 5911 + }, + { + "epoch": 0.10544715157136232, + "grad_norm": 0.2652914822101593, + "learning_rate": 4.9995463068769715e-05, + "loss": 0.2203, + "step": 5912 + }, + { + "epoch": 0.10546498769307602, + "grad_norm": 0.37150490283966064, + "learning_rate": 4.9995433368075846e-05, + "loss": 0.2608, + "step": 5913 + }, + { + "epoch": 0.10548282381478971, + "grad_norm": 0.37731674313545227, + "learning_rate": 4.999540357049104e-05, + "loss": 0.3231, + "step": 5914 + }, + { + "epoch": 0.10550065993650341, + "grad_norm": 0.28298869729042053, + "learning_rate": 4.999537367601541e-05, + "loss": 0.271, + "step": 5915 + }, + { + "epoch": 0.1055184960582171, + "grad_norm": 0.35270267724990845, + "learning_rate": 4.9995343684649084e-05, + "loss": 0.2672, + "step": 5916 + }, + { + "epoch": 0.10553633217993079, + "grad_norm": 0.27312782406806946, + "learning_rate": 4.999531359639218e-05, + "loss": 0.2213, + "step": 5917 + }, + { + "epoch": 0.10555416830164449, + "grad_norm": 0.24734452366828918, + "learning_rate": 4.9995283411244795e-05, + "loss": 0.1981, + "step": 5918 + }, + { + "epoch": 0.10557200442335818, + "grad_norm": 0.2973858416080475, + "learning_rate": 4.9995253129207074e-05, + "loss": 0.2462, + "step": 5919 + }, + { + "epoch": 0.10558984054507188, + "grad_norm": 0.3610922694206238, + "learning_rate": 4.99952227502791e-05, + "loss": 0.2494, + "step": 5920 + }, + { + "epoch": 0.10560767666678557, + "grad_norm": 0.27888768911361694, + "learning_rate": 4.999519227446102e-05, + "loss": 0.239, + "step": 5921 + }, + { + "epoch": 0.10562551278849927, + "grad_norm": 0.3393993377685547, + "learning_rate": 4.9995161701752945e-05, + "loss": 0.2399, + "step": 5922 + }, + { + "epoch": 0.10564334891021296, + "grad_norm": 0.29176065325737, + "learning_rate": 4.999513103215499e-05, + "loss": 0.2528, + "step": 5923 + }, + { + "epoch": 0.10566118503192666, + "grad_norm": 0.302426278591156, + "learning_rate": 4.999510026566727e-05, + "loss": 0.2504, + "step": 5924 + }, + { + "epoch": 0.10567902115364035, + "grad_norm": 0.3877299129962921, + "learning_rate": 4.999506940228991e-05, + "loss": 0.275, + "step": 5925 + }, + { + "epoch": 0.10569685727535405, + "grad_norm": 0.3485565185546875, + "learning_rate": 4.999503844202302e-05, + "loss": 0.2489, + "step": 5926 + }, + { + "epoch": 0.10571469339706774, + "grad_norm": 0.24428917467594147, + "learning_rate": 4.999500738486673e-05, + "loss": 0.2167, + "step": 5927 + }, + { + "epoch": 0.10573252951878144, + "grad_norm": 0.3241652846336365, + "learning_rate": 4.9994976230821167e-05, + "loss": 0.2855, + "step": 5928 + }, + { + "epoch": 0.10575036564049513, + "grad_norm": 0.3077361583709717, + "learning_rate": 4.999494497988644e-05, + "loss": 0.2577, + "step": 5929 + }, + { + "epoch": 0.10576820176220883, + "grad_norm": 0.4071911871433258, + "learning_rate": 4.9994913632062674e-05, + "loss": 0.2539, + "step": 5930 + }, + { + "epoch": 0.10578603788392252, + "grad_norm": 0.2919321060180664, + "learning_rate": 4.999488218734999e-05, + "loss": 0.201, + "step": 5931 + }, + { + "epoch": 0.10580387400563622, + "grad_norm": 0.4423716962337494, + "learning_rate": 4.9994850645748504e-05, + "loss": 0.3363, + "step": 5932 + }, + { + "epoch": 0.10582171012734991, + "grad_norm": 0.21169425547122955, + "learning_rate": 4.999481900725835e-05, + "loss": 0.2288, + "step": 5933 + }, + { + "epoch": 0.10583954624906361, + "grad_norm": 0.31259584426879883, + "learning_rate": 4.999478727187964e-05, + "loss": 0.2952, + "step": 5934 + }, + { + "epoch": 0.1058573823707773, + "grad_norm": 0.34835946559906006, + "learning_rate": 4.9994755439612507e-05, + "loss": 0.237, + "step": 5935 + }, + { + "epoch": 0.105875218492491, + "grad_norm": 0.23921677470207214, + "learning_rate": 4.999472351045707e-05, + "loss": 0.2478, + "step": 5936 + }, + { + "epoch": 0.10589305461420469, + "grad_norm": 0.30249902606010437, + "learning_rate": 4.999469148441344e-05, + "loss": 0.2483, + "step": 5937 + }, + { + "epoch": 0.10591089073591838, + "grad_norm": 0.30154699087142944, + "learning_rate": 4.999465936148176e-05, + "loss": 0.2412, + "step": 5938 + }, + { + "epoch": 0.10592872685763208, + "grad_norm": 0.37488991022109985, + "learning_rate": 4.9994627141662145e-05, + "loss": 0.2899, + "step": 5939 + }, + { + "epoch": 0.10594656297934577, + "grad_norm": 0.3827170729637146, + "learning_rate": 4.999459482495473e-05, + "loss": 0.3188, + "step": 5940 + }, + { + "epoch": 0.10596439910105947, + "grad_norm": 0.3265140950679779, + "learning_rate": 4.9994562411359626e-05, + "loss": 0.2274, + "step": 5941 + }, + { + "epoch": 0.10598223522277316, + "grad_norm": 0.23505325615406036, + "learning_rate": 4.999452990087697e-05, + "loss": 0.2478, + "step": 5942 + }, + { + "epoch": 0.10600007134448686, + "grad_norm": 0.3313054144382477, + "learning_rate": 4.999449729350688e-05, + "loss": 0.2755, + "step": 5943 + }, + { + "epoch": 0.10601790746620054, + "grad_norm": 0.29447782039642334, + "learning_rate": 4.999446458924949e-05, + "loss": 0.2366, + "step": 5944 + }, + { + "epoch": 0.10603574358791425, + "grad_norm": 0.35569271445274353, + "learning_rate": 4.9994431788104914e-05, + "loss": 0.3261, + "step": 5945 + }, + { + "epoch": 0.10605357970962793, + "grad_norm": 0.8849342465400696, + "learning_rate": 4.999439889007329e-05, + "loss": 0.2117, + "step": 5946 + }, + { + "epoch": 0.10607141583134164, + "grad_norm": 0.21632613241672516, + "learning_rate": 4.999436589515475e-05, + "loss": 0.1907, + "step": 5947 + }, + { + "epoch": 0.10608925195305532, + "grad_norm": 0.37242913246154785, + "learning_rate": 4.999433280334941e-05, + "loss": 0.3247, + "step": 5948 + }, + { + "epoch": 0.10610708807476903, + "grad_norm": 0.37078890204429626, + "learning_rate": 4.99942996146574e-05, + "loss": 0.2243, + "step": 5949 + }, + { + "epoch": 0.10612492419648271, + "grad_norm": 0.2567066550254822, + "learning_rate": 4.9994266329078856e-05, + "loss": 0.2658, + "step": 5950 + }, + { + "epoch": 0.10614276031819642, + "grad_norm": 0.2881944477558136, + "learning_rate": 4.9994232946613905e-05, + "loss": 0.2323, + "step": 5951 + }, + { + "epoch": 0.1061605964399101, + "grad_norm": 0.30119019746780396, + "learning_rate": 4.999419946726267e-05, + "loss": 0.2344, + "step": 5952 + }, + { + "epoch": 0.1061784325616238, + "grad_norm": 0.3263953626155853, + "learning_rate": 4.9994165891025285e-05, + "loss": 0.2812, + "step": 5953 + }, + { + "epoch": 0.1061962686833375, + "grad_norm": 0.47165006399154663, + "learning_rate": 4.999413221790188e-05, + "loss": 0.2212, + "step": 5954 + }, + { + "epoch": 0.1062141048050512, + "grad_norm": 0.36640140414237976, + "learning_rate": 4.999409844789259e-05, + "loss": 0.2945, + "step": 5955 + }, + { + "epoch": 0.10623194092676488, + "grad_norm": 0.3732265532016754, + "learning_rate": 4.999406458099754e-05, + "loss": 0.2663, + "step": 5956 + }, + { + "epoch": 0.10624977704847857, + "grad_norm": 0.3062219023704529, + "learning_rate": 4.999403061721686e-05, + "loss": 0.2458, + "step": 5957 + }, + { + "epoch": 0.10626761317019227, + "grad_norm": 0.28339648246765137, + "learning_rate": 4.9993996556550694e-05, + "loss": 0.1878, + "step": 5958 + }, + { + "epoch": 0.10628544929190596, + "grad_norm": 0.26308074593544006, + "learning_rate": 4.999396239899916e-05, + "loss": 0.2428, + "step": 5959 + }, + { + "epoch": 0.10630328541361966, + "grad_norm": 0.3475037217140198, + "learning_rate": 4.999392814456239e-05, + "loss": 0.2759, + "step": 5960 + }, + { + "epoch": 0.10632112153533335, + "grad_norm": 0.3121028244495392, + "learning_rate": 4.999389379324052e-05, + "loss": 0.2766, + "step": 5961 + }, + { + "epoch": 0.10633895765704705, + "grad_norm": 0.19273342192173004, + "learning_rate": 4.99938593450337e-05, + "loss": 0.1833, + "step": 5962 + }, + { + "epoch": 0.10635679377876074, + "grad_norm": 0.268231064081192, + "learning_rate": 4.999382479994204e-05, + "loss": 0.2217, + "step": 5963 + }, + { + "epoch": 0.10637462990047444, + "grad_norm": 0.29513055086135864, + "learning_rate": 4.999379015796567e-05, + "loss": 0.2268, + "step": 5964 + }, + { + "epoch": 0.10639246602218813, + "grad_norm": 0.3670935034751892, + "learning_rate": 4.9993755419104746e-05, + "loss": 0.2741, + "step": 5965 + }, + { + "epoch": 0.10641030214390183, + "grad_norm": 0.3474009037017822, + "learning_rate": 4.999372058335941e-05, + "loss": 0.2813, + "step": 5966 + }, + { + "epoch": 0.10642813826561552, + "grad_norm": 0.3425934910774231, + "learning_rate": 4.999368565072976e-05, + "loss": 0.2901, + "step": 5967 + }, + { + "epoch": 0.10644597438732922, + "grad_norm": 0.29651206731796265, + "learning_rate": 4.9993650621215954e-05, + "loss": 0.2657, + "step": 5968 + }, + { + "epoch": 0.10646381050904291, + "grad_norm": 0.3206726014614105, + "learning_rate": 4.999361549481813e-05, + "loss": 0.2541, + "step": 5969 + }, + { + "epoch": 0.10648164663075661, + "grad_norm": 0.2874189615249634, + "learning_rate": 4.999358027153642e-05, + "loss": 0.2617, + "step": 5970 + }, + { + "epoch": 0.1064994827524703, + "grad_norm": 0.36465299129486084, + "learning_rate": 4.999354495137096e-05, + "loss": 0.3122, + "step": 5971 + }, + { + "epoch": 0.106517318874184, + "grad_norm": 0.26799142360687256, + "learning_rate": 4.999350953432189e-05, + "loss": 0.217, + "step": 5972 + }, + { + "epoch": 0.10653515499589769, + "grad_norm": 0.2865329682826996, + "learning_rate": 4.999347402038934e-05, + "loss": 0.2723, + "step": 5973 + }, + { + "epoch": 0.10655299111761139, + "grad_norm": 0.34854480624198914, + "learning_rate": 4.999343840957345e-05, + "loss": 0.256, + "step": 5974 + }, + { + "epoch": 0.10657082723932508, + "grad_norm": 0.39281558990478516, + "learning_rate": 4.9993402701874363e-05, + "loss": 0.2716, + "step": 5975 + }, + { + "epoch": 0.10658866336103878, + "grad_norm": 0.3169569671154022, + "learning_rate": 4.999336689729222e-05, + "loss": 0.2712, + "step": 5976 + }, + { + "epoch": 0.10660649948275247, + "grad_norm": 0.4224412441253662, + "learning_rate": 4.999333099582715e-05, + "loss": 0.2605, + "step": 5977 + }, + { + "epoch": 0.10662433560446616, + "grad_norm": 0.33814242482185364, + "learning_rate": 4.99932949974793e-05, + "loss": 0.2375, + "step": 5978 + }, + { + "epoch": 0.10664217172617986, + "grad_norm": 0.27266937494277954, + "learning_rate": 4.9993258902248795e-05, + "loss": 0.2702, + "step": 5979 + }, + { + "epoch": 0.10666000784789355, + "grad_norm": 0.35742419958114624, + "learning_rate": 4.99932227101358e-05, + "loss": 0.2282, + "step": 5980 + }, + { + "epoch": 0.10667784396960725, + "grad_norm": 0.3397842049598694, + "learning_rate": 4.9993186421140434e-05, + "loss": 0.241, + "step": 5981 + }, + { + "epoch": 0.10669568009132094, + "grad_norm": 0.3398186266422272, + "learning_rate": 4.9993150035262846e-05, + "loss": 0.2372, + "step": 5982 + }, + { + "epoch": 0.10671351621303464, + "grad_norm": 0.395740270614624, + "learning_rate": 4.9993113552503176e-05, + "loss": 0.3187, + "step": 5983 + }, + { + "epoch": 0.10673135233474833, + "grad_norm": 0.36592522263526917, + "learning_rate": 4.9993076972861564e-05, + "loss": 0.352, + "step": 5984 + }, + { + "epoch": 0.10674918845646203, + "grad_norm": 0.28423792123794556, + "learning_rate": 4.999304029633815e-05, + "loss": 0.2569, + "step": 5985 + }, + { + "epoch": 0.10676702457817572, + "grad_norm": 0.3130965232849121, + "learning_rate": 4.999300352293309e-05, + "loss": 0.233, + "step": 5986 + }, + { + "epoch": 0.10678486069988942, + "grad_norm": 0.30348554253578186, + "learning_rate": 4.999296665264651e-05, + "loss": 0.2592, + "step": 5987 + }, + { + "epoch": 0.10680269682160311, + "grad_norm": 0.3310333788394928, + "learning_rate": 4.999292968547856e-05, + "loss": 0.2513, + "step": 5988 + }, + { + "epoch": 0.10682053294331681, + "grad_norm": 0.3381810784339905, + "learning_rate": 4.9992892621429386e-05, + "loss": 0.2422, + "step": 5989 + }, + { + "epoch": 0.1068383690650305, + "grad_norm": 0.33122992515563965, + "learning_rate": 4.999285546049912e-05, + "loss": 0.2937, + "step": 5990 + }, + { + "epoch": 0.1068562051867442, + "grad_norm": 0.25606706738471985, + "learning_rate": 4.999281820268792e-05, + "loss": 0.2294, + "step": 5991 + }, + { + "epoch": 0.10687404130845789, + "grad_norm": 0.3229588568210602, + "learning_rate": 4.999278084799592e-05, + "loss": 0.2417, + "step": 5992 + }, + { + "epoch": 0.10689187743017159, + "grad_norm": 0.3730687201023102, + "learning_rate": 4.9992743396423275e-05, + "loss": 0.3012, + "step": 5993 + }, + { + "epoch": 0.10690971355188528, + "grad_norm": 0.361680805683136, + "learning_rate": 4.9992705847970125e-05, + "loss": 0.2948, + "step": 5994 + }, + { + "epoch": 0.10692754967359898, + "grad_norm": 0.3156869411468506, + "learning_rate": 4.9992668202636606e-05, + "loss": 0.2573, + "step": 5995 + }, + { + "epoch": 0.10694538579531267, + "grad_norm": 0.3901461362838745, + "learning_rate": 4.999263046042288e-05, + "loss": 0.2043, + "step": 5996 + }, + { + "epoch": 0.10696322191702637, + "grad_norm": 0.26782023906707764, + "learning_rate": 4.999259262132908e-05, + "loss": 0.1929, + "step": 5997 + }, + { + "epoch": 0.10698105803874006, + "grad_norm": 0.32201719284057617, + "learning_rate": 4.9992554685355365e-05, + "loss": 0.2242, + "step": 5998 + }, + { + "epoch": 0.10699889416045374, + "grad_norm": 0.31323421001434326, + "learning_rate": 4.999251665250187e-05, + "loss": 0.2226, + "step": 5999 + }, + { + "epoch": 0.10701673028216745, + "grad_norm": 0.28290367126464844, + "learning_rate": 4.999247852276876e-05, + "loss": 0.2689, + "step": 6000 + }, + { + "epoch": 0.10701673028216745, + "eval_loss": 0.23532024025917053, + "eval_runtime": 794.127, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.215, + "step": 6000 + }, + { + "epoch": 0.10703456640388113, + "grad_norm": 0.2984603941440582, + "learning_rate": 4.999244029615616e-05, + "loss": 0.2576, + "step": 6001 + }, + { + "epoch": 0.10705240252559484, + "grad_norm": 0.22015346586704254, + "learning_rate": 4.999240197266424e-05, + "loss": 0.1942, + "step": 6002 + }, + { + "epoch": 0.10707023864730852, + "grad_norm": 0.29506194591522217, + "learning_rate": 4.999236355229313e-05, + "loss": 0.2088, + "step": 6003 + }, + { + "epoch": 0.10708807476902223, + "grad_norm": 0.31577104330062866, + "learning_rate": 4.999232503504298e-05, + "loss": 0.2538, + "step": 6004 + }, + { + "epoch": 0.10710591089073591, + "grad_norm": 0.3176822364330292, + "learning_rate": 4.9992286420913956e-05, + "loss": 0.2341, + "step": 6005 + }, + { + "epoch": 0.10712374701244962, + "grad_norm": 0.2654493749141693, + "learning_rate": 4.9992247709906195e-05, + "loss": 0.2331, + "step": 6006 + }, + { + "epoch": 0.1071415831341633, + "grad_norm": 0.3566538691520691, + "learning_rate": 4.999220890201985e-05, + "loss": 0.2523, + "step": 6007 + }, + { + "epoch": 0.107159419255877, + "grad_norm": 0.3755199611186981, + "learning_rate": 4.999216999725507e-05, + "loss": 0.261, + "step": 6008 + }, + { + "epoch": 0.1071772553775907, + "grad_norm": 0.3827899694442749, + "learning_rate": 4.999213099561201e-05, + "loss": 0.2879, + "step": 6009 + }, + { + "epoch": 0.1071950914993044, + "grad_norm": 0.30096253752708435, + "learning_rate": 4.999209189709081e-05, + "loss": 0.2389, + "step": 6010 + }, + { + "epoch": 0.10721292762101808, + "grad_norm": 0.30005961656570435, + "learning_rate": 4.9992052701691635e-05, + "loss": 0.2306, + "step": 6011 + }, + { + "epoch": 0.10723076374273179, + "grad_norm": 0.267581045627594, + "learning_rate": 4.999201340941464e-05, + "loss": 0.2216, + "step": 6012 + }, + { + "epoch": 0.10724859986444547, + "grad_norm": 0.298268586397171, + "learning_rate": 4.999197402025996e-05, + "loss": 0.2681, + "step": 6013 + }, + { + "epoch": 0.10726643598615918, + "grad_norm": 0.3477075397968292, + "learning_rate": 4.999193453422776e-05, + "loss": 0.2782, + "step": 6014 + }, + { + "epoch": 0.10728427210787286, + "grad_norm": 0.3017905354499817, + "learning_rate": 4.999189495131819e-05, + "loss": 0.2688, + "step": 6015 + }, + { + "epoch": 0.10730210822958657, + "grad_norm": 0.3463447093963623, + "learning_rate": 4.99918552715314e-05, + "loss": 0.292, + "step": 6016 + }, + { + "epoch": 0.10731994435130025, + "grad_norm": 0.339599609375, + "learning_rate": 4.9991815494867547e-05, + "loss": 0.2528, + "step": 6017 + }, + { + "epoch": 0.10733778047301394, + "grad_norm": 0.36239829659461975, + "learning_rate": 4.9991775621326785e-05, + "loss": 0.2529, + "step": 6018 + }, + { + "epoch": 0.10735561659472764, + "grad_norm": 0.3503287732601166, + "learning_rate": 4.999173565090928e-05, + "loss": 0.2379, + "step": 6019 + }, + { + "epoch": 0.10737345271644133, + "grad_norm": 0.3132491409778595, + "learning_rate": 4.9991695583615164e-05, + "loss": 0.2603, + "step": 6020 + }, + { + "epoch": 0.10739128883815503, + "grad_norm": 0.34829726815223694, + "learning_rate": 4.9991655419444605e-05, + "loss": 0.2233, + "step": 6021 + }, + { + "epoch": 0.10740912495986872, + "grad_norm": 0.22812891006469727, + "learning_rate": 4.999161515839776e-05, + "loss": 0.2249, + "step": 6022 + }, + { + "epoch": 0.10742696108158242, + "grad_norm": 0.3753577470779419, + "learning_rate": 4.9991574800474785e-05, + "loss": 0.217, + "step": 6023 + }, + { + "epoch": 0.10744479720329611, + "grad_norm": 0.263096421957016, + "learning_rate": 4.999153434567583e-05, + "loss": 0.2259, + "step": 6024 + }, + { + "epoch": 0.10746263332500981, + "grad_norm": 0.28593751788139343, + "learning_rate": 4.999149379400105e-05, + "loss": 0.2848, + "step": 6025 + }, + { + "epoch": 0.1074804694467235, + "grad_norm": 0.31411415338516235, + "learning_rate": 4.999145314545062e-05, + "loss": 0.2286, + "step": 6026 + }, + { + "epoch": 0.1074983055684372, + "grad_norm": 0.3467569649219513, + "learning_rate": 4.999141240002468e-05, + "loss": 0.2233, + "step": 6027 + }, + { + "epoch": 0.10751614169015089, + "grad_norm": 0.26406314969062805, + "learning_rate": 4.99913715577234e-05, + "loss": 0.2314, + "step": 6028 + }, + { + "epoch": 0.10753397781186459, + "grad_norm": 0.3606877326965332, + "learning_rate": 4.999133061854693e-05, + "loss": 0.2605, + "step": 6029 + }, + { + "epoch": 0.10755181393357828, + "grad_norm": 0.39641469717025757, + "learning_rate": 4.9991289582495424e-05, + "loss": 0.2972, + "step": 6030 + }, + { + "epoch": 0.10756965005529198, + "grad_norm": 0.2957558333873749, + "learning_rate": 4.9991248449569054e-05, + "loss": 0.2513, + "step": 6031 + }, + { + "epoch": 0.10758748617700567, + "grad_norm": 0.3177221417427063, + "learning_rate": 4.999120721976797e-05, + "loss": 0.2681, + "step": 6032 + }, + { + "epoch": 0.10760532229871937, + "grad_norm": 0.3491055369377136, + "learning_rate": 4.999116589309234e-05, + "loss": 0.2091, + "step": 6033 + }, + { + "epoch": 0.10762315842043306, + "grad_norm": 0.4598277807235718, + "learning_rate": 4.9991124469542315e-05, + "loss": 0.2542, + "step": 6034 + }, + { + "epoch": 0.10764099454214676, + "grad_norm": 0.5211182236671448, + "learning_rate": 4.999108294911806e-05, + "loss": 0.2541, + "step": 6035 + }, + { + "epoch": 0.10765883066386045, + "grad_norm": 0.27361178398132324, + "learning_rate": 4.999104133181974e-05, + "loss": 0.2298, + "step": 6036 + }, + { + "epoch": 0.10767666678557415, + "grad_norm": 0.333618700504303, + "learning_rate": 4.999099961764751e-05, + "loss": 0.2798, + "step": 6037 + }, + { + "epoch": 0.10769450290728784, + "grad_norm": 0.32966190576553345, + "learning_rate": 4.999095780660153e-05, + "loss": 0.2935, + "step": 6038 + }, + { + "epoch": 0.10771233902900153, + "grad_norm": 0.3095170259475708, + "learning_rate": 4.9990915898681964e-05, + "loss": 0.2609, + "step": 6039 + }, + { + "epoch": 0.10773017515071523, + "grad_norm": 0.4958934783935547, + "learning_rate": 4.999087389388899e-05, + "loss": 0.2342, + "step": 6040 + }, + { + "epoch": 0.10774801127242892, + "grad_norm": 0.2796764075756073, + "learning_rate": 4.999083179222275e-05, + "loss": 0.266, + "step": 6041 + }, + { + "epoch": 0.10776584739414262, + "grad_norm": 0.4637889564037323, + "learning_rate": 4.9990789593683406e-05, + "loss": 0.2424, + "step": 6042 + }, + { + "epoch": 0.10778368351585631, + "grad_norm": 0.3296448588371277, + "learning_rate": 4.999074729827114e-05, + "loss": 0.2743, + "step": 6043 + }, + { + "epoch": 0.10780151963757001, + "grad_norm": 0.4007031321525574, + "learning_rate": 4.99907049059861e-05, + "loss": 0.2988, + "step": 6044 + }, + { + "epoch": 0.1078193557592837, + "grad_norm": 0.2786741256713867, + "learning_rate": 4.999066241682846e-05, + "loss": 0.2043, + "step": 6045 + }, + { + "epoch": 0.1078371918809974, + "grad_norm": 0.3915177285671234, + "learning_rate": 4.999061983079838e-05, + "loss": 0.2911, + "step": 6046 + }, + { + "epoch": 0.10785502800271109, + "grad_norm": 0.3464531898498535, + "learning_rate": 4.999057714789603e-05, + "loss": 0.2519, + "step": 6047 + }, + { + "epoch": 0.10787286412442479, + "grad_norm": 0.29628807306289673, + "learning_rate": 4.9990534368121564e-05, + "loss": 0.2497, + "step": 6048 + }, + { + "epoch": 0.10789070024613848, + "grad_norm": 0.3597511649131775, + "learning_rate": 4.9990491491475164e-05, + "loss": 0.3015, + "step": 6049 + }, + { + "epoch": 0.10790853636785218, + "grad_norm": 0.3681495487689972, + "learning_rate": 4.999044851795698e-05, + "loss": 0.3212, + "step": 6050 + }, + { + "epoch": 0.10792637248956587, + "grad_norm": 0.3813852369785309, + "learning_rate": 4.999040544756719e-05, + "loss": 0.3013, + "step": 6051 + }, + { + "epoch": 0.10794420861127957, + "grad_norm": 0.2874419093132019, + "learning_rate": 4.9990362280305955e-05, + "loss": 0.2736, + "step": 6052 + }, + { + "epoch": 0.10796204473299326, + "grad_norm": 0.327522873878479, + "learning_rate": 4.9990319016173444e-05, + "loss": 0.2763, + "step": 6053 + }, + { + "epoch": 0.10797988085470696, + "grad_norm": 0.31113898754119873, + "learning_rate": 4.999027565516983e-05, + "loss": 0.2377, + "step": 6054 + }, + { + "epoch": 0.10799771697642065, + "grad_norm": 0.32192838191986084, + "learning_rate": 4.9990232197295267e-05, + "loss": 0.2245, + "step": 6055 + }, + { + "epoch": 0.10801555309813435, + "grad_norm": 0.3355289399623871, + "learning_rate": 4.999018864254994e-05, + "loss": 0.2257, + "step": 6056 + }, + { + "epoch": 0.10803338921984804, + "grad_norm": 0.30110621452331543, + "learning_rate": 4.999014499093401e-05, + "loss": 0.2119, + "step": 6057 + }, + { + "epoch": 0.10805122534156172, + "grad_norm": 0.3693029582500458, + "learning_rate": 4.999010124244764e-05, + "loss": 0.2438, + "step": 6058 + }, + { + "epoch": 0.10806906146327543, + "grad_norm": 0.27824169397354126, + "learning_rate": 4.9990057397091014e-05, + "loss": 0.219, + "step": 6059 + }, + { + "epoch": 0.10808689758498911, + "grad_norm": 0.25682544708251953, + "learning_rate": 4.999001345486429e-05, + "loss": 0.2107, + "step": 6060 + }, + { + "epoch": 0.10810473370670282, + "grad_norm": 0.35045406222343445, + "learning_rate": 4.998996941576764e-05, + "loss": 0.2276, + "step": 6061 + }, + { + "epoch": 0.1081225698284165, + "grad_norm": 0.2425319403409958, + "learning_rate": 4.998992527980125e-05, + "loss": 0.1976, + "step": 6062 + }, + { + "epoch": 0.1081404059501302, + "grad_norm": 0.3771149218082428, + "learning_rate": 4.998988104696527e-05, + "loss": 0.2801, + "step": 6063 + }, + { + "epoch": 0.1081582420718439, + "grad_norm": 0.31544533371925354, + "learning_rate": 4.9989836717259875e-05, + "loss": 0.2673, + "step": 6064 + }, + { + "epoch": 0.1081760781935576, + "grad_norm": 0.2958180904388428, + "learning_rate": 4.998979229068525e-05, + "loss": 0.2704, + "step": 6065 + }, + { + "epoch": 0.10819391431527128, + "grad_norm": 0.318117618560791, + "learning_rate": 4.998974776724156e-05, + "loss": 0.2595, + "step": 6066 + }, + { + "epoch": 0.10821175043698499, + "grad_norm": 0.28931739926338196, + "learning_rate": 4.9989703146928966e-05, + "loss": 0.2173, + "step": 6067 + }, + { + "epoch": 0.10822958655869867, + "grad_norm": 0.2558846175670624, + "learning_rate": 4.998965842974766e-05, + "loss": 0.1761, + "step": 6068 + }, + { + "epoch": 0.10824742268041238, + "grad_norm": 0.3012659549713135, + "learning_rate": 4.99896136156978e-05, + "loss": 0.2723, + "step": 6069 + }, + { + "epoch": 0.10826525880212606, + "grad_norm": 0.4154328405857086, + "learning_rate": 4.998956870477958e-05, + "loss": 0.2006, + "step": 6070 + }, + { + "epoch": 0.10828309492383976, + "grad_norm": 0.3246530294418335, + "learning_rate": 4.9989523696993145e-05, + "loss": 0.2338, + "step": 6071 + }, + { + "epoch": 0.10830093104555345, + "grad_norm": 0.3683794140815735, + "learning_rate": 4.998947859233869e-05, + "loss": 0.2945, + "step": 6072 + }, + { + "epoch": 0.10831876716726715, + "grad_norm": 0.35793155431747437, + "learning_rate": 4.998943339081639e-05, + "loss": 0.2587, + "step": 6073 + }, + { + "epoch": 0.10833660328898084, + "grad_norm": 0.3311821222305298, + "learning_rate": 4.998938809242641e-05, + "loss": 0.2491, + "step": 6074 + }, + { + "epoch": 0.10835443941069454, + "grad_norm": 0.30905959010124207, + "learning_rate": 4.998934269716893e-05, + "loss": 0.2262, + "step": 6075 + }, + { + "epoch": 0.10837227553240823, + "grad_norm": 0.2520501911640167, + "learning_rate": 4.998929720504413e-05, + "loss": 0.2377, + "step": 6076 + }, + { + "epoch": 0.10839011165412193, + "grad_norm": 0.34372684359550476, + "learning_rate": 4.998925161605218e-05, + "loss": 0.2369, + "step": 6077 + }, + { + "epoch": 0.10840794777583562, + "grad_norm": 0.2450200766324997, + "learning_rate": 4.998920593019326e-05, + "loss": 0.2144, + "step": 6078 + }, + { + "epoch": 0.10842578389754931, + "grad_norm": 0.31581997871398926, + "learning_rate": 4.998916014746755e-05, + "loss": 0.28, + "step": 6079 + }, + { + "epoch": 0.10844362001926301, + "grad_norm": 0.42305368185043335, + "learning_rate": 4.9989114267875224e-05, + "loss": 0.304, + "step": 6080 + }, + { + "epoch": 0.1084614561409767, + "grad_norm": 0.3959745168685913, + "learning_rate": 4.998906829141646e-05, + "loss": 0.3352, + "step": 6081 + }, + { + "epoch": 0.1084792922626904, + "grad_norm": 0.28059807419776917, + "learning_rate": 4.998902221809143e-05, + "loss": 0.2479, + "step": 6082 + }, + { + "epoch": 0.10849712838440409, + "grad_norm": 0.37664470076560974, + "learning_rate": 4.998897604790033e-05, + "loss": 0.2808, + "step": 6083 + }, + { + "epoch": 0.10851496450611779, + "grad_norm": 0.33188992738723755, + "learning_rate": 4.998892978084332e-05, + "loss": 0.265, + "step": 6084 + }, + { + "epoch": 0.10853280062783148, + "grad_norm": 0.24675920605659485, + "learning_rate": 4.9988883416920586e-05, + "loss": 0.2402, + "step": 6085 + }, + { + "epoch": 0.10855063674954518, + "grad_norm": 0.2949751317501068, + "learning_rate": 4.998883695613231e-05, + "loss": 0.2281, + "step": 6086 + }, + { + "epoch": 0.10856847287125887, + "grad_norm": 0.30466875433921814, + "learning_rate": 4.998879039847868e-05, + "loss": 0.2761, + "step": 6087 + }, + { + "epoch": 0.10858630899297257, + "grad_norm": 0.26031750440597534, + "learning_rate": 4.9988743743959855e-05, + "loss": 0.269, + "step": 6088 + }, + { + "epoch": 0.10860414511468626, + "grad_norm": 0.2275160551071167, + "learning_rate": 4.998869699257604e-05, + "loss": 0.2081, + "step": 6089 + }, + { + "epoch": 0.10862198123639996, + "grad_norm": 0.3278014361858368, + "learning_rate": 4.9988650144327395e-05, + "loss": 0.2316, + "step": 6090 + }, + { + "epoch": 0.10863981735811365, + "grad_norm": 0.27700796723365784, + "learning_rate": 4.998860319921411e-05, + "loss": 0.2219, + "step": 6091 + }, + { + "epoch": 0.10865765347982735, + "grad_norm": 0.26910465955734253, + "learning_rate": 4.998855615723638e-05, + "loss": 0.2289, + "step": 6092 + }, + { + "epoch": 0.10867548960154104, + "grad_norm": 0.31600767374038696, + "learning_rate": 4.9988509018394366e-05, + "loss": 0.2177, + "step": 6093 + }, + { + "epoch": 0.10869332572325474, + "grad_norm": 0.34261491894721985, + "learning_rate": 4.998846178268827e-05, + "loss": 0.3016, + "step": 6094 + }, + { + "epoch": 0.10871116184496843, + "grad_norm": 0.3501835763454437, + "learning_rate": 4.998841445011826e-05, + "loss": 0.23, + "step": 6095 + }, + { + "epoch": 0.10872899796668213, + "grad_norm": 0.3551133871078491, + "learning_rate": 4.998836702068451e-05, + "loss": 0.2488, + "step": 6096 + }, + { + "epoch": 0.10874683408839582, + "grad_norm": 0.3116627633571625, + "learning_rate": 4.9988319494387235e-05, + "loss": 0.2868, + "step": 6097 + }, + { + "epoch": 0.10876467021010951, + "grad_norm": 0.2815439701080322, + "learning_rate": 4.99882718712266e-05, + "loss": 0.2173, + "step": 6098 + }, + { + "epoch": 0.10878250633182321, + "grad_norm": 0.3250252306461334, + "learning_rate": 4.99882241512028e-05, + "loss": 0.2407, + "step": 6099 + }, + { + "epoch": 0.1088003424535369, + "grad_norm": 0.3013324737548828, + "learning_rate": 4.9988176334316e-05, + "loss": 0.246, + "step": 6100 + }, + { + "epoch": 0.1088181785752506, + "grad_norm": 0.27790069580078125, + "learning_rate": 4.998812842056641e-05, + "loss": 0.1996, + "step": 6101 + }, + { + "epoch": 0.10883601469696429, + "grad_norm": 0.4440501630306244, + "learning_rate": 4.998808040995419e-05, + "loss": 0.2417, + "step": 6102 + }, + { + "epoch": 0.10885385081867799, + "grad_norm": 0.2666701376438141, + "learning_rate": 4.998803230247955e-05, + "loss": 0.1837, + "step": 6103 + }, + { + "epoch": 0.10887168694039168, + "grad_norm": 0.3506120443344116, + "learning_rate": 4.998798409814266e-05, + "loss": 0.2753, + "step": 6104 + }, + { + "epoch": 0.10888952306210538, + "grad_norm": 0.28628668189048767, + "learning_rate": 4.998793579694372e-05, + "loss": 0.2452, + "step": 6105 + }, + { + "epoch": 0.10890735918381907, + "grad_norm": 0.24965091049671173, + "learning_rate": 4.9987887398882906e-05, + "loss": 0.2357, + "step": 6106 + }, + { + "epoch": 0.10892519530553277, + "grad_norm": 0.2993568778038025, + "learning_rate": 4.9987838903960405e-05, + "loss": 0.2715, + "step": 6107 + }, + { + "epoch": 0.10894303142724646, + "grad_norm": 0.2944696247577667, + "learning_rate": 4.9987790312176414e-05, + "loss": 0.2335, + "step": 6108 + }, + { + "epoch": 0.10896086754896016, + "grad_norm": 0.29676422476768494, + "learning_rate": 4.9987741623531115e-05, + "loss": 0.2433, + "step": 6109 + }, + { + "epoch": 0.10897870367067385, + "grad_norm": 0.311562716960907, + "learning_rate": 4.9987692838024705e-05, + "loss": 0.2176, + "step": 6110 + }, + { + "epoch": 0.10899653979238755, + "grad_norm": 0.23696637153625488, + "learning_rate": 4.998764395565737e-05, + "loss": 0.2096, + "step": 6111 + }, + { + "epoch": 0.10901437591410124, + "grad_norm": 0.29208680987358093, + "learning_rate": 4.9987594976429284e-05, + "loss": 0.2599, + "step": 6112 + }, + { + "epoch": 0.10903221203581494, + "grad_norm": 0.3510104715824127, + "learning_rate": 4.998754590034066e-05, + "loss": 0.2258, + "step": 6113 + }, + { + "epoch": 0.10905004815752863, + "grad_norm": 0.2819032669067383, + "learning_rate": 4.998749672739167e-05, + "loss": 0.2605, + "step": 6114 + }, + { + "epoch": 0.10906788427924233, + "grad_norm": 0.2493922859430313, + "learning_rate": 4.9987447457582516e-05, + "loss": 0.2415, + "step": 6115 + }, + { + "epoch": 0.10908572040095602, + "grad_norm": 0.29971078038215637, + "learning_rate": 4.9987398090913384e-05, + "loss": 0.2315, + "step": 6116 + }, + { + "epoch": 0.10910355652266972, + "grad_norm": 0.3297044634819031, + "learning_rate": 4.998734862738447e-05, + "loss": 0.2432, + "step": 6117 + }, + { + "epoch": 0.1091213926443834, + "grad_norm": 0.31636524200439453, + "learning_rate": 4.998729906699596e-05, + "loss": 0.2606, + "step": 6118 + }, + { + "epoch": 0.1091392287660971, + "grad_norm": 0.8150376081466675, + "learning_rate": 4.998724940974805e-05, + "loss": 0.2372, + "step": 6119 + }, + { + "epoch": 0.1091570648878108, + "grad_norm": 0.2824878692626953, + "learning_rate": 4.9987199655640925e-05, + "loss": 0.2242, + "step": 6120 + }, + { + "epoch": 0.10917490100952448, + "grad_norm": 0.3265892267227173, + "learning_rate": 4.9987149804674797e-05, + "loss": 0.2565, + "step": 6121 + }, + { + "epoch": 0.10919273713123819, + "grad_norm": 0.3480130732059479, + "learning_rate": 4.9987099856849834e-05, + "loss": 0.23, + "step": 6122 + }, + { + "epoch": 0.10921057325295187, + "grad_norm": 0.3382149040699005, + "learning_rate": 4.998704981216624e-05, + "loss": 0.2605, + "step": 6123 + }, + { + "epoch": 0.10922840937466557, + "grad_norm": 0.24673400819301605, + "learning_rate": 4.9986999670624226e-05, + "loss": 0.222, + "step": 6124 + }, + { + "epoch": 0.10924624549637926, + "grad_norm": 0.30811798572540283, + "learning_rate": 4.9986949432223953e-05, + "loss": 0.2254, + "step": 6125 + }, + { + "epoch": 0.10926408161809296, + "grad_norm": 0.3038312792778015, + "learning_rate": 4.9986899096965646e-05, + "loss": 0.2588, + "step": 6126 + }, + { + "epoch": 0.10928191773980665, + "grad_norm": 0.2873574197292328, + "learning_rate": 4.998684866484948e-05, + "loss": 0.2267, + "step": 6127 + }, + { + "epoch": 0.10929975386152035, + "grad_norm": 0.4430965781211853, + "learning_rate": 4.998679813587567e-05, + "loss": 0.2601, + "step": 6128 + }, + { + "epoch": 0.10931758998323404, + "grad_norm": 0.280263215303421, + "learning_rate": 4.998674751004439e-05, + "loss": 0.2569, + "step": 6129 + }, + { + "epoch": 0.10933542610494774, + "grad_norm": 0.31165051460266113, + "learning_rate": 4.998669678735585e-05, + "loss": 0.2697, + "step": 6130 + }, + { + "epoch": 0.10935326222666143, + "grad_norm": 0.2865162193775177, + "learning_rate": 4.998664596781023e-05, + "loss": 0.244, + "step": 6131 + }, + { + "epoch": 0.10937109834837513, + "grad_norm": 0.2541183829307556, + "learning_rate": 4.998659505140776e-05, + "loss": 0.2101, + "step": 6132 + }, + { + "epoch": 0.10938893447008882, + "grad_norm": 0.2579323947429657, + "learning_rate": 4.998654403814861e-05, + "loss": 0.2211, + "step": 6133 + }, + { + "epoch": 0.10940677059180252, + "grad_norm": 0.410474956035614, + "learning_rate": 4.998649292803298e-05, + "loss": 0.2, + "step": 6134 + }, + { + "epoch": 0.10942460671351621, + "grad_norm": 0.2245824933052063, + "learning_rate": 4.998644172106108e-05, + "loss": 0.2042, + "step": 6135 + }, + { + "epoch": 0.10944244283522991, + "grad_norm": 0.2705850899219513, + "learning_rate": 4.99863904172331e-05, + "loss": 0.2172, + "step": 6136 + }, + { + "epoch": 0.1094602789569436, + "grad_norm": 0.3629564940929413, + "learning_rate": 4.998633901654924e-05, + "loss": 0.2701, + "step": 6137 + }, + { + "epoch": 0.10947811507865729, + "grad_norm": 0.2784716784954071, + "learning_rate": 4.99862875190097e-05, + "loss": 0.23, + "step": 6138 + }, + { + "epoch": 0.10949595120037099, + "grad_norm": 0.28572648763656616, + "learning_rate": 4.998623592461468e-05, + "loss": 0.2295, + "step": 6139 + }, + { + "epoch": 0.10951378732208468, + "grad_norm": 0.31702741980552673, + "learning_rate": 4.998618423336439e-05, + "loss": 0.2704, + "step": 6140 + }, + { + "epoch": 0.10953162344379838, + "grad_norm": 0.27289608120918274, + "learning_rate": 4.9986132445259e-05, + "loss": 0.1875, + "step": 6141 + }, + { + "epoch": 0.10954945956551207, + "grad_norm": 0.25540637969970703, + "learning_rate": 4.998608056029874e-05, + "loss": 0.2339, + "step": 6142 + }, + { + "epoch": 0.10956729568722577, + "grad_norm": 0.31396380066871643, + "learning_rate": 4.998602857848381e-05, + "loss": 0.2224, + "step": 6143 + }, + { + "epoch": 0.10958513180893946, + "grad_norm": 0.28415483236312866, + "learning_rate": 4.998597649981439e-05, + "loss": 0.2423, + "step": 6144 + }, + { + "epoch": 0.10960296793065316, + "grad_norm": 0.41271770000457764, + "learning_rate": 4.99859243242907e-05, + "loss": 0.2975, + "step": 6145 + }, + { + "epoch": 0.10962080405236685, + "grad_norm": 0.2675800323486328, + "learning_rate": 4.9985872051912944e-05, + "loss": 0.2519, + "step": 6146 + }, + { + "epoch": 0.10963864017408055, + "grad_norm": 0.3569709062576294, + "learning_rate": 4.998581968268131e-05, + "loss": 0.2755, + "step": 6147 + }, + { + "epoch": 0.10965647629579424, + "grad_norm": 0.25617632269859314, + "learning_rate": 4.9985767216596016e-05, + "loss": 0.2191, + "step": 6148 + }, + { + "epoch": 0.10967431241750794, + "grad_norm": 0.2642269730567932, + "learning_rate": 4.998571465365725e-05, + "loss": 0.2284, + "step": 6149 + }, + { + "epoch": 0.10969214853922163, + "grad_norm": 0.2254253327846527, + "learning_rate": 4.998566199386523e-05, + "loss": 0.18, + "step": 6150 + }, + { + "epoch": 0.10970998466093533, + "grad_norm": 0.22765931487083435, + "learning_rate": 4.998560923722015e-05, + "loss": 0.2193, + "step": 6151 + }, + { + "epoch": 0.10972782078264902, + "grad_norm": 0.3216729164123535, + "learning_rate": 4.998555638372222e-05, + "loss": 0.2509, + "step": 6152 + }, + { + "epoch": 0.10974565690436272, + "grad_norm": 0.2652188241481781, + "learning_rate": 4.998550343337165e-05, + "loss": 0.2612, + "step": 6153 + }, + { + "epoch": 0.10976349302607641, + "grad_norm": 0.22096742689609528, + "learning_rate": 4.998545038616863e-05, + "loss": 0.2263, + "step": 6154 + }, + { + "epoch": 0.10978132914779011, + "grad_norm": 0.3716314435005188, + "learning_rate": 4.998539724211338e-05, + "loss": 0.3624, + "step": 6155 + }, + { + "epoch": 0.1097991652695038, + "grad_norm": 0.328799843788147, + "learning_rate": 4.9985344001206105e-05, + "loss": 0.2156, + "step": 6156 + }, + { + "epoch": 0.1098170013912175, + "grad_norm": 0.3530081510543823, + "learning_rate": 4.9985290663446996e-05, + "loss": 0.221, + "step": 6157 + }, + { + "epoch": 0.10983483751293119, + "grad_norm": 0.2691895365715027, + "learning_rate": 4.9985237228836276e-05, + "loss": 0.223, + "step": 6158 + }, + { + "epoch": 0.10985267363464488, + "grad_norm": 0.3150103688240051, + "learning_rate": 4.998518369737415e-05, + "loss": 0.2054, + "step": 6159 + }, + { + "epoch": 0.10987050975635858, + "grad_norm": 0.25021952390670776, + "learning_rate": 4.998513006906082e-05, + "loss": 0.2369, + "step": 6160 + }, + { + "epoch": 0.10988834587807227, + "grad_norm": 0.32279905676841736, + "learning_rate": 4.998507634389649e-05, + "loss": 0.2603, + "step": 6161 + }, + { + "epoch": 0.10990618199978597, + "grad_norm": 0.3455301821231842, + "learning_rate": 4.998502252188138e-05, + "loss": 0.2754, + "step": 6162 + }, + { + "epoch": 0.10992401812149966, + "grad_norm": 0.256946325302124, + "learning_rate": 4.9984968603015694e-05, + "loss": 0.2082, + "step": 6163 + }, + { + "epoch": 0.10994185424321336, + "grad_norm": 0.3117446005344391, + "learning_rate": 4.998491458729964e-05, + "loss": 0.2522, + "step": 6164 + }, + { + "epoch": 0.10995969036492705, + "grad_norm": 0.2743896245956421, + "learning_rate": 4.998486047473343e-05, + "loss": 0.2451, + "step": 6165 + }, + { + "epoch": 0.10997752648664075, + "grad_norm": 0.2710326910018921, + "learning_rate": 4.998480626531726e-05, + "loss": 0.2303, + "step": 6166 + }, + { + "epoch": 0.10999536260835444, + "grad_norm": 0.26594817638397217, + "learning_rate": 4.998475195905137e-05, + "loss": 0.2119, + "step": 6167 + }, + { + "epoch": 0.11001319873006814, + "grad_norm": 0.3471861779689789, + "learning_rate": 4.998469755593594e-05, + "loss": 0.2587, + "step": 6168 + }, + { + "epoch": 0.11003103485178183, + "grad_norm": 0.2982967495918274, + "learning_rate": 4.998464305597119e-05, + "loss": 0.1988, + "step": 6169 + }, + { + "epoch": 0.11004887097349553, + "grad_norm": 0.4203164577484131, + "learning_rate": 4.9984588459157346e-05, + "loss": 0.2208, + "step": 6170 + }, + { + "epoch": 0.11006670709520922, + "grad_norm": 0.4424651861190796, + "learning_rate": 4.9984533765494594e-05, + "loss": 0.2392, + "step": 6171 + }, + { + "epoch": 0.11008454321692292, + "grad_norm": 0.4270980954170227, + "learning_rate": 4.9984478974983165e-05, + "loss": 0.2957, + "step": 6172 + }, + { + "epoch": 0.1101023793386366, + "grad_norm": 0.24900686740875244, + "learning_rate": 4.998442408762327e-05, + "loss": 0.2188, + "step": 6173 + }, + { + "epoch": 0.11012021546035031, + "grad_norm": 0.36748048663139343, + "learning_rate": 4.998436910341512e-05, + "loss": 0.2705, + "step": 6174 + }, + { + "epoch": 0.110138051582064, + "grad_norm": 0.3921428322792053, + "learning_rate": 4.998431402235891e-05, + "loss": 0.2952, + "step": 6175 + }, + { + "epoch": 0.1101558877037777, + "grad_norm": 0.29848167300224304, + "learning_rate": 4.998425884445489e-05, + "loss": 0.248, + "step": 6176 + }, + { + "epoch": 0.11017372382549138, + "grad_norm": 0.3298819065093994, + "learning_rate": 4.9984203569703244e-05, + "loss": 0.2825, + "step": 6177 + }, + { + "epoch": 0.11019155994720509, + "grad_norm": 0.3297257721424103, + "learning_rate": 4.9984148198104194e-05, + "loss": 0.2818, + "step": 6178 + }, + { + "epoch": 0.11020939606891877, + "grad_norm": 0.29978567361831665, + "learning_rate": 4.998409272965796e-05, + "loss": 0.2187, + "step": 6179 + }, + { + "epoch": 0.11022723219063246, + "grad_norm": 0.2707396149635315, + "learning_rate": 4.998403716436475e-05, + "loss": 0.2495, + "step": 6180 + }, + { + "epoch": 0.11024506831234616, + "grad_norm": 0.4788021743297577, + "learning_rate": 4.9983981502224783e-05, + "loss": 0.267, + "step": 6181 + }, + { + "epoch": 0.11026290443405985, + "grad_norm": 0.2787097692489624, + "learning_rate": 4.998392574323827e-05, + "loss": 0.2364, + "step": 6182 + }, + { + "epoch": 0.11028074055577355, + "grad_norm": 0.3392464518547058, + "learning_rate": 4.998386988740544e-05, + "loss": 0.3185, + "step": 6183 + }, + { + "epoch": 0.11029857667748724, + "grad_norm": 0.2750253677368164, + "learning_rate": 4.9983813934726495e-05, + "loss": 0.2046, + "step": 6184 + }, + { + "epoch": 0.11031641279920094, + "grad_norm": 0.3816703259944916, + "learning_rate": 4.9983757885201664e-05, + "loss": 0.2903, + "step": 6185 + }, + { + "epoch": 0.11033424892091463, + "grad_norm": 0.598724365234375, + "learning_rate": 4.998370173883116e-05, + "loss": 0.3125, + "step": 6186 + }, + { + "epoch": 0.11035208504262833, + "grad_norm": 0.25722193717956543, + "learning_rate": 4.9983645495615197e-05, + "loss": 0.2287, + "step": 6187 + }, + { + "epoch": 0.11036992116434202, + "grad_norm": 0.3098689317703247, + "learning_rate": 4.998358915555399e-05, + "loss": 0.1934, + "step": 6188 + }, + { + "epoch": 0.11038775728605572, + "grad_norm": 0.29725563526153564, + "learning_rate": 4.9983532718647765e-05, + "loss": 0.2689, + "step": 6189 + }, + { + "epoch": 0.11040559340776941, + "grad_norm": 0.29313573241233826, + "learning_rate": 4.9983476184896736e-05, + "loss": 0.2638, + "step": 6190 + }, + { + "epoch": 0.11042342952948311, + "grad_norm": 0.26852619647979736, + "learning_rate": 4.998341955430113e-05, + "loss": 0.2294, + "step": 6191 + }, + { + "epoch": 0.1104412656511968, + "grad_norm": 0.23123401403427124, + "learning_rate": 4.998336282686116e-05, + "loss": 0.2208, + "step": 6192 + }, + { + "epoch": 0.1104591017729105, + "grad_norm": 0.2545979917049408, + "learning_rate": 4.998330600257704e-05, + "loss": 0.2482, + "step": 6193 + }, + { + "epoch": 0.11047693789462419, + "grad_norm": 0.31205466389656067, + "learning_rate": 4.9983249081449e-05, + "loss": 0.2341, + "step": 6194 + }, + { + "epoch": 0.1104947740163379, + "grad_norm": 0.3120054006576538, + "learning_rate": 4.998319206347726e-05, + "loss": 0.2513, + "step": 6195 + }, + { + "epoch": 0.11051261013805158, + "grad_norm": 0.32524827122688293, + "learning_rate": 4.998313494866204e-05, + "loss": 0.2498, + "step": 6196 + }, + { + "epoch": 0.11053044625976528, + "grad_norm": 0.29201456904411316, + "learning_rate": 4.998307773700356e-05, + "loss": 0.2354, + "step": 6197 + }, + { + "epoch": 0.11054828238147897, + "grad_norm": 0.2925111651420593, + "learning_rate": 4.9983020428502035e-05, + "loss": 0.2252, + "step": 6198 + }, + { + "epoch": 0.11056611850319266, + "grad_norm": 0.2674996852874756, + "learning_rate": 4.99829630231577e-05, + "loss": 0.2106, + "step": 6199 + }, + { + "epoch": 0.11058395462490636, + "grad_norm": 0.4428156018257141, + "learning_rate": 4.998290552097077e-05, + "loss": 0.3163, + "step": 6200 + }, + { + "epoch": 0.11060179074662005, + "grad_norm": 0.3531860411167145, + "learning_rate": 4.9982847921941465e-05, + "loss": 0.1911, + "step": 6201 + }, + { + "epoch": 0.11061962686833375, + "grad_norm": 0.4287932813167572, + "learning_rate": 4.998279022607002e-05, + "loss": 0.2908, + "step": 6202 + }, + { + "epoch": 0.11063746299004744, + "grad_norm": 0.25531309843063354, + "learning_rate": 4.998273243335665e-05, + "loss": 0.249, + "step": 6203 + }, + { + "epoch": 0.11065529911176114, + "grad_norm": 0.2637823224067688, + "learning_rate": 4.998267454380158e-05, + "loss": 0.228, + "step": 6204 + }, + { + "epoch": 0.11067313523347483, + "grad_norm": 0.23722325265407562, + "learning_rate": 4.998261655740503e-05, + "loss": 0.2065, + "step": 6205 + }, + { + "epoch": 0.11069097135518853, + "grad_norm": 0.27698805928230286, + "learning_rate": 4.998255847416724e-05, + "loss": 0.2218, + "step": 6206 + }, + { + "epoch": 0.11070880747690222, + "grad_norm": 0.22974862158298492, + "learning_rate": 4.998250029408841e-05, + "loss": 0.2292, + "step": 6207 + }, + { + "epoch": 0.11072664359861592, + "grad_norm": 0.2588344216346741, + "learning_rate": 4.998244201716879e-05, + "loss": 0.244, + "step": 6208 + }, + { + "epoch": 0.11074447972032961, + "grad_norm": 0.26613056659698486, + "learning_rate": 4.998238364340859e-05, + "loss": 0.2387, + "step": 6209 + }, + { + "epoch": 0.11076231584204331, + "grad_norm": 0.2978186011314392, + "learning_rate": 4.998232517280805e-05, + "loss": 0.2729, + "step": 6210 + }, + { + "epoch": 0.110780151963757, + "grad_norm": 0.3010079264640808, + "learning_rate": 4.9982266605367384e-05, + "loss": 0.2557, + "step": 6211 + }, + { + "epoch": 0.1107979880854707, + "grad_norm": 0.22793611884117126, + "learning_rate": 4.9982207941086825e-05, + "loss": 0.2086, + "step": 6212 + }, + { + "epoch": 0.11081582420718439, + "grad_norm": 0.3061627745628357, + "learning_rate": 4.9982149179966594e-05, + "loss": 0.2224, + "step": 6213 + }, + { + "epoch": 0.11083366032889809, + "grad_norm": 0.3212285041809082, + "learning_rate": 4.998209032200694e-05, + "loss": 0.2814, + "step": 6214 + }, + { + "epoch": 0.11085149645061178, + "grad_norm": 0.351762592792511, + "learning_rate": 4.998203136720806e-05, + "loss": 0.2228, + "step": 6215 + }, + { + "epoch": 0.11086933257232548, + "grad_norm": 0.227890282869339, + "learning_rate": 4.99819723155702e-05, + "loss": 0.2452, + "step": 6216 + }, + { + "epoch": 0.11088716869403917, + "grad_norm": 0.23476283252239227, + "learning_rate": 4.99819131670936e-05, + "loss": 0.2362, + "step": 6217 + }, + { + "epoch": 0.11090500481575287, + "grad_norm": 0.25866883993148804, + "learning_rate": 4.998185392177846e-05, + "loss": 0.2064, + "step": 6218 + }, + { + "epoch": 0.11092284093746656, + "grad_norm": 0.24904648959636688, + "learning_rate": 4.998179457962503e-05, + "loss": 0.2447, + "step": 6219 + }, + { + "epoch": 0.11094067705918025, + "grad_norm": 0.22712509334087372, + "learning_rate": 4.9981735140633536e-05, + "loss": 0.2186, + "step": 6220 + }, + { + "epoch": 0.11095851318089395, + "grad_norm": 0.2733319401741028, + "learning_rate": 4.998167560480421e-05, + "loss": 0.248, + "step": 6221 + }, + { + "epoch": 0.11097634930260764, + "grad_norm": 0.341614693403244, + "learning_rate": 4.9981615972137285e-05, + "loss": 0.2572, + "step": 6222 + }, + { + "epoch": 0.11099418542432134, + "grad_norm": 0.28420737385749817, + "learning_rate": 4.998155624263298e-05, + "loss": 0.2251, + "step": 6223 + }, + { + "epoch": 0.11101202154603503, + "grad_norm": 0.24382448196411133, + "learning_rate": 4.998149641629154e-05, + "loss": 0.2384, + "step": 6224 + }, + { + "epoch": 0.11102985766774873, + "grad_norm": 0.29594287276268005, + "learning_rate": 4.998143649311319e-05, + "loss": 0.1826, + "step": 6225 + }, + { + "epoch": 0.11104769378946241, + "grad_norm": 0.27644747495651245, + "learning_rate": 4.998137647309816e-05, + "loss": 0.253, + "step": 6226 + }, + { + "epoch": 0.11106552991117612, + "grad_norm": 0.36379367113113403, + "learning_rate": 4.9981316356246695e-05, + "loss": 0.2897, + "step": 6227 + }, + { + "epoch": 0.1110833660328898, + "grad_norm": 0.40374189615249634, + "learning_rate": 4.9981256142559015e-05, + "loss": 0.254, + "step": 6228 + }, + { + "epoch": 0.1111012021546035, + "grad_norm": 0.3151934742927551, + "learning_rate": 4.9981195832035356e-05, + "loss": 0.2596, + "step": 6229 + }, + { + "epoch": 0.1111190382763172, + "grad_norm": 0.29207053780555725, + "learning_rate": 4.998113542467596e-05, + "loss": 0.2347, + "step": 6230 + }, + { + "epoch": 0.1111368743980309, + "grad_norm": 0.2925536036491394, + "learning_rate": 4.998107492048105e-05, + "loss": 0.2415, + "step": 6231 + }, + { + "epoch": 0.11115471051974458, + "grad_norm": 0.39146339893341064, + "learning_rate": 4.998101431945086e-05, + "loss": 0.2845, + "step": 6232 + }, + { + "epoch": 0.11117254664145829, + "grad_norm": 0.2833290994167328, + "learning_rate": 4.9980953621585634e-05, + "loss": 0.1937, + "step": 6233 + }, + { + "epoch": 0.11119038276317197, + "grad_norm": 0.37179383635520935, + "learning_rate": 4.9980892826885604e-05, + "loss": 0.284, + "step": 6234 + }, + { + "epoch": 0.11120821888488568, + "grad_norm": 0.31555330753326416, + "learning_rate": 4.998083193535101e-05, + "loss": 0.2576, + "step": 6235 + }, + { + "epoch": 0.11122605500659936, + "grad_norm": 0.2974212169647217, + "learning_rate": 4.998077094698208e-05, + "loss": 0.2481, + "step": 6236 + }, + { + "epoch": 0.11124389112831307, + "grad_norm": 0.32543492317199707, + "learning_rate": 4.998070986177906e-05, + "loss": 0.2611, + "step": 6237 + }, + { + "epoch": 0.11126172725002675, + "grad_norm": 0.24908842146396637, + "learning_rate": 4.998064867974217e-05, + "loss": 0.2637, + "step": 6238 + }, + { + "epoch": 0.11127956337174044, + "grad_norm": 0.30848029255867004, + "learning_rate": 4.998058740087166e-05, + "loss": 0.275, + "step": 6239 + }, + { + "epoch": 0.11129739949345414, + "grad_norm": 0.24796777963638306, + "learning_rate": 4.998052602516777e-05, + "loss": 0.2479, + "step": 6240 + }, + { + "epoch": 0.11131523561516783, + "grad_norm": 0.24908721446990967, + "learning_rate": 4.998046455263074e-05, + "loss": 0.2378, + "step": 6241 + }, + { + "epoch": 0.11133307173688153, + "grad_norm": 0.25761616230010986, + "learning_rate": 4.998040298326079e-05, + "loss": 0.2202, + "step": 6242 + }, + { + "epoch": 0.11135090785859522, + "grad_norm": 0.3541248142719269, + "learning_rate": 4.9980341317058166e-05, + "loss": 0.2754, + "step": 6243 + }, + { + "epoch": 0.11136874398030892, + "grad_norm": 0.2676992118358612, + "learning_rate": 4.998027955402312e-05, + "loss": 0.2242, + "step": 6244 + }, + { + "epoch": 0.11138658010202261, + "grad_norm": 0.233119934797287, + "learning_rate": 4.998021769415587e-05, + "loss": 0.2297, + "step": 6245 + }, + { + "epoch": 0.11140441622373631, + "grad_norm": 0.26920872926712036, + "learning_rate": 4.998015573745668e-05, + "loss": 0.2243, + "step": 6246 + }, + { + "epoch": 0.11142225234545, + "grad_norm": 0.2761625647544861, + "learning_rate": 4.998009368392578e-05, + "loss": 0.2291, + "step": 6247 + }, + { + "epoch": 0.1114400884671637, + "grad_norm": 0.3195796310901642, + "learning_rate": 4.998003153356341e-05, + "loss": 0.2158, + "step": 6248 + }, + { + "epoch": 0.11145792458887739, + "grad_norm": 0.43256333470344543, + "learning_rate": 4.99799692863698e-05, + "loss": 0.2463, + "step": 6249 + }, + { + "epoch": 0.11147576071059109, + "grad_norm": 0.4290931820869446, + "learning_rate": 4.997990694234521e-05, + "loss": 0.186, + "step": 6250 + }, + { + "epoch": 0.11149359683230478, + "grad_norm": 0.3229944705963135, + "learning_rate": 4.997984450148987e-05, + "loss": 0.2246, + "step": 6251 + }, + { + "epoch": 0.11151143295401848, + "grad_norm": 0.2988409399986267, + "learning_rate": 4.997978196380402e-05, + "loss": 0.2592, + "step": 6252 + }, + { + "epoch": 0.11152926907573217, + "grad_norm": 0.3372195065021515, + "learning_rate": 4.997971932928792e-05, + "loss": 0.2119, + "step": 6253 + }, + { + "epoch": 0.11154710519744587, + "grad_norm": 0.24260346591472626, + "learning_rate": 4.9979656597941786e-05, + "loss": 0.2259, + "step": 6254 + }, + { + "epoch": 0.11156494131915956, + "grad_norm": 0.2870357036590576, + "learning_rate": 4.997959376976589e-05, + "loss": 0.2433, + "step": 6255 + }, + { + "epoch": 0.11158277744087326, + "grad_norm": 0.32909926772117615, + "learning_rate": 4.9979530844760446e-05, + "loss": 0.2515, + "step": 6256 + }, + { + "epoch": 0.11160061356258695, + "grad_norm": 0.35816195607185364, + "learning_rate": 4.997946782292572e-05, + "loss": 0.2196, + "step": 6257 + }, + { + "epoch": 0.11161844968430065, + "grad_norm": 0.4002450704574585, + "learning_rate": 4.997940470426195e-05, + "loss": 0.212, + "step": 6258 + }, + { + "epoch": 0.11163628580601434, + "grad_norm": 0.37571850419044495, + "learning_rate": 4.9979341488769374e-05, + "loss": 0.2869, + "step": 6259 + }, + { + "epoch": 0.11165412192772803, + "grad_norm": 0.33898311853408813, + "learning_rate": 4.997927817644825e-05, + "loss": 0.2697, + "step": 6260 + }, + { + "epoch": 0.11167195804944173, + "grad_norm": 0.29540374875068665, + "learning_rate": 4.997921476729881e-05, + "loss": 0.2543, + "step": 6261 + }, + { + "epoch": 0.11168979417115542, + "grad_norm": 0.2542315125465393, + "learning_rate": 4.997915126132131e-05, + "loss": 0.2412, + "step": 6262 + }, + { + "epoch": 0.11170763029286912, + "grad_norm": 0.29263821244239807, + "learning_rate": 4.997908765851599e-05, + "loss": 0.2449, + "step": 6263 + }, + { + "epoch": 0.11172546641458281, + "grad_norm": 0.3407067358493805, + "learning_rate": 4.99790239588831e-05, + "loss": 0.253, + "step": 6264 + }, + { + "epoch": 0.11174330253629651, + "grad_norm": 0.3167259693145752, + "learning_rate": 4.997896016242289e-05, + "loss": 0.2535, + "step": 6265 + }, + { + "epoch": 0.1117611386580102, + "grad_norm": 0.21316608786582947, + "learning_rate": 4.99788962691356e-05, + "loss": 0.2159, + "step": 6266 + }, + { + "epoch": 0.1117789747797239, + "grad_norm": 0.23873494565486908, + "learning_rate": 4.997883227902147e-05, + "loss": 0.2027, + "step": 6267 + }, + { + "epoch": 0.11179681090143759, + "grad_norm": 0.2827610373497009, + "learning_rate": 4.997876819208077e-05, + "loss": 0.2453, + "step": 6268 + }, + { + "epoch": 0.11181464702315129, + "grad_norm": 0.410043329000473, + "learning_rate": 4.997870400831374e-05, + "loss": 0.2306, + "step": 6269 + }, + { + "epoch": 0.11183248314486498, + "grad_norm": 0.29575398564338684, + "learning_rate": 4.997863972772062e-05, + "loss": 0.251, + "step": 6270 + }, + { + "epoch": 0.11185031926657868, + "grad_norm": 0.3259303569793701, + "learning_rate": 4.9978575350301664e-05, + "loss": 0.2596, + "step": 6271 + }, + { + "epoch": 0.11186815538829237, + "grad_norm": 0.3475322425365448, + "learning_rate": 4.9978510876057124e-05, + "loss": 0.2582, + "step": 6272 + }, + { + "epoch": 0.11188599151000607, + "grad_norm": 0.2831577956676483, + "learning_rate": 4.9978446304987245e-05, + "loss": 0.254, + "step": 6273 + }, + { + "epoch": 0.11190382763171976, + "grad_norm": 0.3299480080604553, + "learning_rate": 4.997838163709229e-05, + "loss": 0.2024, + "step": 6274 + }, + { + "epoch": 0.11192166375343346, + "grad_norm": 0.30874449014663696, + "learning_rate": 4.99783168723725e-05, + "loss": 0.226, + "step": 6275 + }, + { + "epoch": 0.11193949987514715, + "grad_norm": 0.339728444814682, + "learning_rate": 4.997825201082812e-05, + "loss": 0.2566, + "step": 6276 + }, + { + "epoch": 0.11195733599686085, + "grad_norm": 0.2576361894607544, + "learning_rate": 4.9978187052459406e-05, + "loss": 0.1971, + "step": 6277 + }, + { + "epoch": 0.11197517211857454, + "grad_norm": 0.26656538248062134, + "learning_rate": 4.997812199726662e-05, + "loss": 0.187, + "step": 6278 + }, + { + "epoch": 0.11199300824028822, + "grad_norm": 0.311737596988678, + "learning_rate": 4.997805684525e-05, + "loss": 0.2497, + "step": 6279 + }, + { + "epoch": 0.11201084436200193, + "grad_norm": 0.23330265283584595, + "learning_rate": 4.997799159640981e-05, + "loss": 0.2234, + "step": 6280 + }, + { + "epoch": 0.11202868048371561, + "grad_norm": 0.2843758463859558, + "learning_rate": 4.997792625074629e-05, + "loss": 0.2138, + "step": 6281 + }, + { + "epoch": 0.11204651660542932, + "grad_norm": 0.3988863527774811, + "learning_rate": 4.99778608082597e-05, + "loss": 0.291, + "step": 6282 + }, + { + "epoch": 0.112064352727143, + "grad_norm": 0.23501631617546082, + "learning_rate": 4.99777952689503e-05, + "loss": 0.2299, + "step": 6283 + }, + { + "epoch": 0.1120821888488567, + "grad_norm": 0.25568756461143494, + "learning_rate": 4.9977729632818335e-05, + "loss": 0.1858, + "step": 6284 + }, + { + "epoch": 0.1121000249705704, + "grad_norm": 0.27769866585731506, + "learning_rate": 4.9977663899864066e-05, + "loss": 0.2274, + "step": 6285 + }, + { + "epoch": 0.1121178610922841, + "grad_norm": 0.28924545645713806, + "learning_rate": 4.997759807008775e-05, + "loss": 0.2104, + "step": 6286 + }, + { + "epoch": 0.11213569721399778, + "grad_norm": 0.33430570363998413, + "learning_rate": 4.997753214348963e-05, + "loss": 0.2393, + "step": 6287 + }, + { + "epoch": 0.11215353333571149, + "grad_norm": 0.29556894302368164, + "learning_rate": 4.997746612006997e-05, + "loss": 0.2698, + "step": 6288 + }, + { + "epoch": 0.11217136945742517, + "grad_norm": 0.3289881646633148, + "learning_rate": 4.997739999982902e-05, + "loss": 0.2697, + "step": 6289 + }, + { + "epoch": 0.11218920557913888, + "grad_norm": 0.4218645691871643, + "learning_rate": 4.997733378276705e-05, + "loss": 0.2174, + "step": 6290 + }, + { + "epoch": 0.11220704170085256, + "grad_norm": 0.31306329369544983, + "learning_rate": 4.99772674688843e-05, + "loss": 0.2056, + "step": 6291 + }, + { + "epoch": 0.11222487782256627, + "grad_norm": 0.27947649359703064, + "learning_rate": 4.9977201058181036e-05, + "loss": 0.2352, + "step": 6292 + }, + { + "epoch": 0.11224271394427995, + "grad_norm": 0.30447328090667725, + "learning_rate": 4.9977134550657514e-05, + "loss": 0.2313, + "step": 6293 + }, + { + "epoch": 0.11226055006599366, + "grad_norm": 0.3301723301410675, + "learning_rate": 4.9977067946313993e-05, + "loss": 0.2706, + "step": 6294 + }, + { + "epoch": 0.11227838618770734, + "grad_norm": 0.3076240122318268, + "learning_rate": 4.997700124515073e-05, + "loss": 0.2819, + "step": 6295 + }, + { + "epoch": 0.11229622230942105, + "grad_norm": 0.3603857457637787, + "learning_rate": 4.997693444716798e-05, + "loss": 0.2595, + "step": 6296 + }, + { + "epoch": 0.11231405843113473, + "grad_norm": 0.27384883165359497, + "learning_rate": 4.997686755236601e-05, + "loss": 0.2458, + "step": 6297 + }, + { + "epoch": 0.11233189455284844, + "grad_norm": 0.2910906672477722, + "learning_rate": 4.997680056074507e-05, + "loss": 0.2109, + "step": 6298 + }, + { + "epoch": 0.11234973067456212, + "grad_norm": 0.27966779470443726, + "learning_rate": 4.9976733472305425e-05, + "loss": 0.2459, + "step": 6299 + }, + { + "epoch": 0.11236756679627581, + "grad_norm": 0.328244686126709, + "learning_rate": 4.9976666287047335e-05, + "loss": 0.2739, + "step": 6300 + }, + { + "epoch": 0.11238540291798951, + "grad_norm": 0.3239681124687195, + "learning_rate": 4.997659900497106e-05, + "loss": 0.2886, + "step": 6301 + }, + { + "epoch": 0.1124032390397032, + "grad_norm": 0.2994845509529114, + "learning_rate": 4.9976531626076864e-05, + "loss": 0.2705, + "step": 6302 + }, + { + "epoch": 0.1124210751614169, + "grad_norm": 0.25063931941986084, + "learning_rate": 4.9976464150365e-05, + "loss": 0.2299, + "step": 6303 + }, + { + "epoch": 0.11243891128313059, + "grad_norm": 0.22160574793815613, + "learning_rate": 4.997639657783574e-05, + "loss": 0.2041, + "step": 6304 + }, + { + "epoch": 0.11245674740484429, + "grad_norm": 0.2589114010334015, + "learning_rate": 4.9976328908489336e-05, + "loss": 0.2013, + "step": 6305 + }, + { + "epoch": 0.11247458352655798, + "grad_norm": 0.23705771565437317, + "learning_rate": 4.9976261142326056e-05, + "loss": 0.2158, + "step": 6306 + }, + { + "epoch": 0.11249241964827168, + "grad_norm": 0.2486175298690796, + "learning_rate": 4.997619327934616e-05, + "loss": 0.2093, + "step": 6307 + }, + { + "epoch": 0.11251025576998537, + "grad_norm": 0.32091838121414185, + "learning_rate": 4.997612531954991e-05, + "loss": 0.2503, + "step": 6308 + }, + { + "epoch": 0.11252809189169907, + "grad_norm": 0.2645072042942047, + "learning_rate": 4.997605726293757e-05, + "loss": 0.209, + "step": 6309 + }, + { + "epoch": 0.11254592801341276, + "grad_norm": 0.20899713039398193, + "learning_rate": 4.9975989109509414e-05, + "loss": 0.2095, + "step": 6310 + }, + { + "epoch": 0.11256376413512646, + "grad_norm": 0.315972238779068, + "learning_rate": 4.9975920859265694e-05, + "loss": 0.2751, + "step": 6311 + }, + { + "epoch": 0.11258160025684015, + "grad_norm": 0.3038237988948822, + "learning_rate": 4.997585251220668e-05, + "loss": 0.2485, + "step": 6312 + }, + { + "epoch": 0.11259943637855385, + "grad_norm": 0.27426764369010925, + "learning_rate": 4.9975784068332634e-05, + "loss": 0.2084, + "step": 6313 + }, + { + "epoch": 0.11261727250026754, + "grad_norm": 0.31844186782836914, + "learning_rate": 4.997571552764382e-05, + "loss": 0.2872, + "step": 6314 + }, + { + "epoch": 0.11263510862198124, + "grad_norm": 0.2916257381439209, + "learning_rate": 4.997564689014051e-05, + "loss": 0.2347, + "step": 6315 + }, + { + "epoch": 0.11265294474369493, + "grad_norm": 0.2968656122684479, + "learning_rate": 4.997557815582297e-05, + "loss": 0.2968, + "step": 6316 + }, + { + "epoch": 0.11267078086540863, + "grad_norm": 0.30372437834739685, + "learning_rate": 4.9975509324691446e-05, + "loss": 0.2675, + "step": 6317 + }, + { + "epoch": 0.11268861698712232, + "grad_norm": 0.2827107012271881, + "learning_rate": 4.997544039674624e-05, + "loss": 0.2411, + "step": 6318 + }, + { + "epoch": 0.11270645310883601, + "grad_norm": 0.26831698417663574, + "learning_rate": 4.9975371371987594e-05, + "loss": 0.2447, + "step": 6319 + }, + { + "epoch": 0.11272428923054971, + "grad_norm": 0.3394215404987335, + "learning_rate": 4.9975302250415785e-05, + "loss": 0.3011, + "step": 6320 + }, + { + "epoch": 0.1127421253522634, + "grad_norm": 0.23676671087741852, + "learning_rate": 4.997523303203108e-05, + "loss": 0.2029, + "step": 6321 + }, + { + "epoch": 0.1127599614739771, + "grad_norm": 0.2872179448604584, + "learning_rate": 4.997516371683373e-05, + "loss": 0.2259, + "step": 6322 + }, + { + "epoch": 0.11277779759569079, + "grad_norm": 0.28405171632766724, + "learning_rate": 4.9975094304824036e-05, + "loss": 0.2674, + "step": 6323 + }, + { + "epoch": 0.11279563371740449, + "grad_norm": 0.25588372349739075, + "learning_rate": 4.997502479600225e-05, + "loss": 0.2451, + "step": 6324 + }, + { + "epoch": 0.11281346983911818, + "grad_norm": 0.29073432087898254, + "learning_rate": 4.9974955190368634e-05, + "loss": 0.2812, + "step": 6325 + }, + { + "epoch": 0.11283130596083188, + "grad_norm": 0.29777273535728455, + "learning_rate": 4.9974885487923464e-05, + "loss": 0.2973, + "step": 6326 + }, + { + "epoch": 0.11284914208254557, + "grad_norm": 0.27667126059532166, + "learning_rate": 4.997481568866702e-05, + "loss": 0.2042, + "step": 6327 + }, + { + "epoch": 0.11286697820425927, + "grad_norm": 0.3610670268535614, + "learning_rate": 4.997474579259956e-05, + "loss": 0.2552, + "step": 6328 + }, + { + "epoch": 0.11288481432597296, + "grad_norm": 0.23689375817775726, + "learning_rate": 4.997467579972136e-05, + "loss": 0.2121, + "step": 6329 + }, + { + "epoch": 0.11290265044768666, + "grad_norm": 0.27611005306243896, + "learning_rate": 4.997460571003269e-05, + "loss": 0.2419, + "step": 6330 + }, + { + "epoch": 0.11292048656940035, + "grad_norm": 0.2606268525123596, + "learning_rate": 4.9974535523533815e-05, + "loss": 0.1892, + "step": 6331 + }, + { + "epoch": 0.11293832269111405, + "grad_norm": 0.27849745750427246, + "learning_rate": 4.997446524022503e-05, + "loss": 0.2306, + "step": 6332 + }, + { + "epoch": 0.11295615881282774, + "grad_norm": 0.33862075209617615, + "learning_rate": 4.997439486010658e-05, + "loss": 0.2262, + "step": 6333 + }, + { + "epoch": 0.11297399493454144, + "grad_norm": 0.3877054452896118, + "learning_rate": 4.997432438317876e-05, + "loss": 0.2952, + "step": 6334 + }, + { + "epoch": 0.11299183105625513, + "grad_norm": 0.30139851570129395, + "learning_rate": 4.997425380944182e-05, + "loss": 0.2475, + "step": 6335 + }, + { + "epoch": 0.11300966717796883, + "grad_norm": 0.28107771277427673, + "learning_rate": 4.9974183138896056e-05, + "loss": 0.2732, + "step": 6336 + }, + { + "epoch": 0.11302750329968252, + "grad_norm": 0.24334359169006348, + "learning_rate": 4.9974112371541725e-05, + "loss": 0.2249, + "step": 6337 + }, + { + "epoch": 0.11304533942139622, + "grad_norm": 0.2737603485584259, + "learning_rate": 4.997404150737912e-05, + "loss": 0.22, + "step": 6338 + }, + { + "epoch": 0.1130631755431099, + "grad_norm": 0.3706985116004944, + "learning_rate": 4.997397054640849e-05, + "loss": 0.3006, + "step": 6339 + }, + { + "epoch": 0.1130810116648236, + "grad_norm": 0.2653593122959137, + "learning_rate": 4.9973899488630136e-05, + "loss": 0.2339, + "step": 6340 + }, + { + "epoch": 0.1130988477865373, + "grad_norm": 0.33987531065940857, + "learning_rate": 4.997382833404432e-05, + "loss": 0.2428, + "step": 6341 + }, + { + "epoch": 0.11311668390825098, + "grad_norm": 0.38109177350997925, + "learning_rate": 4.9973757082651315e-05, + "loss": 0.2732, + "step": 6342 + }, + { + "epoch": 0.11313452002996469, + "grad_norm": 0.2975313663482666, + "learning_rate": 4.9973685734451404e-05, + "loss": 0.2383, + "step": 6343 + }, + { + "epoch": 0.11315235615167837, + "grad_norm": 0.23836776614189148, + "learning_rate": 4.997361428944486e-05, + "loss": 0.2286, + "step": 6344 + }, + { + "epoch": 0.11317019227339208, + "grad_norm": 0.36299118399620056, + "learning_rate": 4.9973542747631964e-05, + "loss": 0.2606, + "step": 6345 + }, + { + "epoch": 0.11318802839510576, + "grad_norm": 0.3544904291629791, + "learning_rate": 4.9973471109012984e-05, + "loss": 0.2463, + "step": 6346 + }, + { + "epoch": 0.11320586451681947, + "grad_norm": 0.2779907286167145, + "learning_rate": 4.9973399373588214e-05, + "loss": 0.1852, + "step": 6347 + }, + { + "epoch": 0.11322370063853315, + "grad_norm": 0.33224770426750183, + "learning_rate": 4.997332754135792e-05, + "loss": 0.2596, + "step": 6348 + }, + { + "epoch": 0.11324153676024686, + "grad_norm": 0.3348247706890106, + "learning_rate": 4.9973255612322376e-05, + "loss": 0.2257, + "step": 6349 + }, + { + "epoch": 0.11325937288196054, + "grad_norm": 0.29954883456230164, + "learning_rate": 4.997318358648188e-05, + "loss": 0.2286, + "step": 6350 + }, + { + "epoch": 0.11327720900367425, + "grad_norm": 0.30738696455955505, + "learning_rate": 4.9973111463836686e-05, + "loss": 0.2517, + "step": 6351 + }, + { + "epoch": 0.11329504512538793, + "grad_norm": 0.2955269515514374, + "learning_rate": 4.99730392443871e-05, + "loss": 0.2784, + "step": 6352 + }, + { + "epoch": 0.11331288124710164, + "grad_norm": 0.27649739384651184, + "learning_rate": 4.997296692813338e-05, + "loss": 0.2681, + "step": 6353 + }, + { + "epoch": 0.11333071736881532, + "grad_norm": 0.22541755437850952, + "learning_rate": 4.9972894515075816e-05, + "loss": 0.1894, + "step": 6354 + }, + { + "epoch": 0.11334855349052902, + "grad_norm": 0.3360641896724701, + "learning_rate": 4.9972822005214684e-05, + "loss": 0.2214, + "step": 6355 + }, + { + "epoch": 0.11336638961224271, + "grad_norm": 0.381287544965744, + "learning_rate": 4.997274939855027e-05, + "loss": 0.2642, + "step": 6356 + }, + { + "epoch": 0.11338422573395641, + "grad_norm": 0.45694929361343384, + "learning_rate": 4.997267669508286e-05, + "loss": 0.2628, + "step": 6357 + }, + { + "epoch": 0.1134020618556701, + "grad_norm": 0.3429132401943207, + "learning_rate": 4.9972603894812725e-05, + "loss": 0.2534, + "step": 6358 + }, + { + "epoch": 0.1134198979773838, + "grad_norm": 0.3763284385204315, + "learning_rate": 4.997253099774015e-05, + "loss": 0.2125, + "step": 6359 + }, + { + "epoch": 0.11343773409909749, + "grad_norm": 0.26830655336380005, + "learning_rate": 4.9972458003865426e-05, + "loss": 0.2206, + "step": 6360 + }, + { + "epoch": 0.11345557022081118, + "grad_norm": 0.30152490735054016, + "learning_rate": 4.997238491318882e-05, + "loss": 0.2296, + "step": 6361 + }, + { + "epoch": 0.11347340634252488, + "grad_norm": 0.3029189109802246, + "learning_rate": 4.9972311725710635e-05, + "loss": 0.2644, + "step": 6362 + }, + { + "epoch": 0.11349124246423857, + "grad_norm": 0.27133142948150635, + "learning_rate": 4.997223844143114e-05, + "loss": 0.2141, + "step": 6363 + }, + { + "epoch": 0.11350907858595227, + "grad_norm": 0.3439355492591858, + "learning_rate": 4.997216506035063e-05, + "loss": 0.2409, + "step": 6364 + }, + { + "epoch": 0.11352691470766596, + "grad_norm": 0.3204980492591858, + "learning_rate": 4.997209158246937e-05, + "loss": 0.2508, + "step": 6365 + }, + { + "epoch": 0.11354475082937966, + "grad_norm": 0.431309312582016, + "learning_rate": 4.9972018007787666e-05, + "loss": 0.2948, + "step": 6366 + }, + { + "epoch": 0.11356258695109335, + "grad_norm": 0.25124138593673706, + "learning_rate": 4.99719443363058e-05, + "loss": 0.2442, + "step": 6367 + }, + { + "epoch": 0.11358042307280705, + "grad_norm": 0.3142741918563843, + "learning_rate": 4.997187056802405e-05, + "loss": 0.2233, + "step": 6368 + }, + { + "epoch": 0.11359825919452074, + "grad_norm": 0.29613104462623596, + "learning_rate": 4.99717967029427e-05, + "loss": 0.2738, + "step": 6369 + }, + { + "epoch": 0.11361609531623444, + "grad_norm": 0.3058638870716095, + "learning_rate": 4.9971722741062046e-05, + "loss": 0.2274, + "step": 6370 + }, + { + "epoch": 0.11363393143794813, + "grad_norm": 0.6070004105567932, + "learning_rate": 4.997164868238236e-05, + "loss": 0.2278, + "step": 6371 + }, + { + "epoch": 0.11365176755966183, + "grad_norm": 0.2776007056236267, + "learning_rate": 4.997157452690395e-05, + "loss": 0.2321, + "step": 6372 + }, + { + "epoch": 0.11366960368137552, + "grad_norm": 0.3096480071544647, + "learning_rate": 4.997150027462708e-05, + "loss": 0.259, + "step": 6373 + }, + { + "epoch": 0.11368743980308922, + "grad_norm": 0.2727145850658417, + "learning_rate": 4.9971425925552064e-05, + "loss": 0.2359, + "step": 6374 + }, + { + "epoch": 0.11370527592480291, + "grad_norm": 0.3384600579738617, + "learning_rate": 4.997135147967917e-05, + "loss": 0.2703, + "step": 6375 + }, + { + "epoch": 0.11372311204651661, + "grad_norm": 0.403394490480423, + "learning_rate": 4.997127693700869e-05, + "loss": 0.3012, + "step": 6376 + }, + { + "epoch": 0.1137409481682303, + "grad_norm": 0.26332804560661316, + "learning_rate": 4.997120229754092e-05, + "loss": 0.2519, + "step": 6377 + }, + { + "epoch": 0.113758784289944, + "grad_norm": 0.28521740436553955, + "learning_rate": 4.9971127561276144e-05, + "loss": 0.2077, + "step": 6378 + }, + { + "epoch": 0.11377662041165769, + "grad_norm": 0.3364304006099701, + "learning_rate": 4.997105272821465e-05, + "loss": 0.2361, + "step": 6379 + }, + { + "epoch": 0.11379445653337138, + "grad_norm": 0.2692857086658478, + "learning_rate": 4.997097779835673e-05, + "loss": 0.2087, + "step": 6380 + }, + { + "epoch": 0.11381229265508508, + "grad_norm": 0.3222743570804596, + "learning_rate": 4.997090277170269e-05, + "loss": 0.2304, + "step": 6381 + }, + { + "epoch": 0.11383012877679877, + "grad_norm": 0.34658974409103394, + "learning_rate": 4.9970827648252787e-05, + "loss": 0.2453, + "step": 6382 + }, + { + "epoch": 0.11384796489851247, + "grad_norm": 0.3299717307090759, + "learning_rate": 4.997075242800734e-05, + "loss": 0.2882, + "step": 6383 + }, + { + "epoch": 0.11386580102022616, + "grad_norm": 0.25349822640419006, + "learning_rate": 4.997067711096663e-05, + "loss": 0.2119, + "step": 6384 + }, + { + "epoch": 0.11388363714193986, + "grad_norm": 0.31958621740341187, + "learning_rate": 4.997060169713096e-05, + "loss": 0.2819, + "step": 6385 + }, + { + "epoch": 0.11390147326365355, + "grad_norm": 0.22673916816711426, + "learning_rate": 4.99705261865006e-05, + "loss": 0.2459, + "step": 6386 + }, + { + "epoch": 0.11391930938536725, + "grad_norm": 0.3198469877243042, + "learning_rate": 4.997045057907586e-05, + "loss": 0.2673, + "step": 6387 + }, + { + "epoch": 0.11393714550708094, + "grad_norm": 0.25635942816734314, + "learning_rate": 4.997037487485703e-05, + "loss": 0.2291, + "step": 6388 + }, + { + "epoch": 0.11395498162879464, + "grad_norm": 0.2754196524620056, + "learning_rate": 4.997029907384441e-05, + "loss": 0.2331, + "step": 6389 + }, + { + "epoch": 0.11397281775050833, + "grad_norm": 0.31777021288871765, + "learning_rate": 4.9970223176038276e-05, + "loss": 0.2586, + "step": 6390 + }, + { + "epoch": 0.11399065387222203, + "grad_norm": 0.42584753036499023, + "learning_rate": 4.997014718143893e-05, + "loss": 0.1865, + "step": 6391 + }, + { + "epoch": 0.11400848999393572, + "grad_norm": 0.3233788311481476, + "learning_rate": 4.9970071090046675e-05, + "loss": 0.224, + "step": 6392 + }, + { + "epoch": 0.11402632611564942, + "grad_norm": 0.2840725779533386, + "learning_rate": 4.9969994901861805e-05, + "loss": 0.2431, + "step": 6393 + }, + { + "epoch": 0.1140441622373631, + "grad_norm": 0.2885996997356415, + "learning_rate": 4.99699186168846e-05, + "loss": 0.2349, + "step": 6394 + }, + { + "epoch": 0.11406199835907681, + "grad_norm": 0.3769633173942566, + "learning_rate": 4.996984223511538e-05, + "loss": 0.231, + "step": 6395 + }, + { + "epoch": 0.1140798344807905, + "grad_norm": 0.3439430296421051, + "learning_rate": 4.9969765756554414e-05, + "loss": 0.2088, + "step": 6396 + }, + { + "epoch": 0.1140976706025042, + "grad_norm": 0.3478592038154602, + "learning_rate": 4.996968918120202e-05, + "loss": 0.2713, + "step": 6397 + }, + { + "epoch": 0.11411550672421789, + "grad_norm": 0.27024587988853455, + "learning_rate": 4.996961250905848e-05, + "loss": 0.2235, + "step": 6398 + }, + { + "epoch": 0.11413334284593159, + "grad_norm": 0.35732901096343994, + "learning_rate": 4.99695357401241e-05, + "loss": 0.2179, + "step": 6399 + }, + { + "epoch": 0.11415117896764528, + "grad_norm": 0.2081969529390335, + "learning_rate": 4.996945887439918e-05, + "loss": 0.2018, + "step": 6400 + }, + { + "epoch": 0.11416901508935896, + "grad_norm": 0.29519927501678467, + "learning_rate": 4.9969381911884e-05, + "loss": 0.243, + "step": 6401 + }, + { + "epoch": 0.11418685121107267, + "grad_norm": 0.35555705428123474, + "learning_rate": 4.9969304852578886e-05, + "loss": 0.3235, + "step": 6402 + }, + { + "epoch": 0.11420468733278635, + "grad_norm": 0.3278159201145172, + "learning_rate": 4.996922769648412e-05, + "loss": 0.2316, + "step": 6403 + }, + { + "epoch": 0.11422252345450006, + "grad_norm": 0.2941592037677765, + "learning_rate": 4.99691504436e-05, + "loss": 0.2137, + "step": 6404 + }, + { + "epoch": 0.11424035957621374, + "grad_norm": 0.30727818608283997, + "learning_rate": 4.996907309392683e-05, + "loss": 0.2572, + "step": 6405 + }, + { + "epoch": 0.11425819569792744, + "grad_norm": 0.2645432651042938, + "learning_rate": 4.9968995647464906e-05, + "loss": 0.2195, + "step": 6406 + }, + { + "epoch": 0.11427603181964113, + "grad_norm": 0.35110896825790405, + "learning_rate": 4.9968918104214534e-05, + "loss": 0.2446, + "step": 6407 + }, + { + "epoch": 0.11429386794135483, + "grad_norm": 0.26152899861335754, + "learning_rate": 4.9968840464176004e-05, + "loss": 0.2329, + "step": 6408 + }, + { + "epoch": 0.11431170406306852, + "grad_norm": 0.3267683982849121, + "learning_rate": 4.9968762727349636e-05, + "loss": 0.2762, + "step": 6409 + }, + { + "epoch": 0.11432954018478222, + "grad_norm": 0.352154940366745, + "learning_rate": 4.996868489373571e-05, + "loss": 0.2472, + "step": 6410 + }, + { + "epoch": 0.11434737630649591, + "grad_norm": 0.29199671745300293, + "learning_rate": 4.996860696333454e-05, + "loss": 0.2135, + "step": 6411 + }, + { + "epoch": 0.11436521242820961, + "grad_norm": 0.25746950507164, + "learning_rate": 4.996852893614643e-05, + "loss": 0.2302, + "step": 6412 + }, + { + "epoch": 0.1143830485499233, + "grad_norm": 0.2342277467250824, + "learning_rate": 4.996845081217168e-05, + "loss": 0.2516, + "step": 6413 + }, + { + "epoch": 0.114400884671637, + "grad_norm": 0.35752689838409424, + "learning_rate": 4.9968372591410585e-05, + "loss": 0.2359, + "step": 6414 + }, + { + "epoch": 0.11441872079335069, + "grad_norm": 0.692800760269165, + "learning_rate": 4.996829427386345e-05, + "loss": 0.2611, + "step": 6415 + }, + { + "epoch": 0.1144365569150644, + "grad_norm": 0.272035151720047, + "learning_rate": 4.996821585953059e-05, + "loss": 0.2189, + "step": 6416 + }, + { + "epoch": 0.11445439303677808, + "grad_norm": 0.4375305771827698, + "learning_rate": 4.99681373484123e-05, + "loss": 0.2213, + "step": 6417 + }, + { + "epoch": 0.11447222915849178, + "grad_norm": 0.4251265227794647, + "learning_rate": 4.996805874050888e-05, + "loss": 0.2138, + "step": 6418 + }, + { + "epoch": 0.11449006528020547, + "grad_norm": 0.31208524107933044, + "learning_rate": 4.996798003582065e-05, + "loss": 0.2425, + "step": 6419 + }, + { + "epoch": 0.11450790140191916, + "grad_norm": 0.32257500290870667, + "learning_rate": 4.99679012343479e-05, + "loss": 0.2843, + "step": 6420 + }, + { + "epoch": 0.11452573752363286, + "grad_norm": 0.3965137004852295, + "learning_rate": 4.9967822336090943e-05, + "loss": 0.2367, + "step": 6421 + }, + { + "epoch": 0.11454357364534655, + "grad_norm": 0.4175276756286621, + "learning_rate": 4.996774334105008e-05, + "loss": 0.3549, + "step": 6422 + }, + { + "epoch": 0.11456140976706025, + "grad_norm": 0.3600330650806427, + "learning_rate": 4.996766424922563e-05, + "loss": 0.2308, + "step": 6423 + }, + { + "epoch": 0.11457924588877394, + "grad_norm": 0.35123971104621887, + "learning_rate": 4.996758506061788e-05, + "loss": 0.2521, + "step": 6424 + }, + { + "epoch": 0.11459708201048764, + "grad_norm": 0.20919214189052582, + "learning_rate": 4.9967505775227145e-05, + "loss": 0.2236, + "step": 6425 + }, + { + "epoch": 0.11461491813220133, + "grad_norm": 0.25192296504974365, + "learning_rate": 4.996742639305374e-05, + "loss": 0.253, + "step": 6426 + }, + { + "epoch": 0.11463275425391503, + "grad_norm": 0.3408343195915222, + "learning_rate": 4.996734691409797e-05, + "loss": 0.2927, + "step": 6427 + }, + { + "epoch": 0.11465059037562872, + "grad_norm": 0.25281551480293274, + "learning_rate": 4.996726733836013e-05, + "loss": 0.1761, + "step": 6428 + }, + { + "epoch": 0.11466842649734242, + "grad_norm": 0.48902568221092224, + "learning_rate": 4.996718766584054e-05, + "loss": 0.2263, + "step": 6429 + }, + { + "epoch": 0.11468626261905611, + "grad_norm": 0.23247681558132172, + "learning_rate": 4.996710789653952e-05, + "loss": 0.2398, + "step": 6430 + }, + { + "epoch": 0.11470409874076981, + "grad_norm": 0.24835175275802612, + "learning_rate": 4.9967028030457354e-05, + "loss": 0.2648, + "step": 6431 + }, + { + "epoch": 0.1147219348624835, + "grad_norm": 0.40737810730934143, + "learning_rate": 4.996694806759436e-05, + "loss": 0.2457, + "step": 6432 + }, + { + "epoch": 0.1147397709841972, + "grad_norm": 0.23159782588481903, + "learning_rate": 4.9966868007950865e-05, + "loss": 0.2263, + "step": 6433 + }, + { + "epoch": 0.11475760710591089, + "grad_norm": 0.34643515944480896, + "learning_rate": 4.9966787851527164e-05, + "loss": 0.2836, + "step": 6434 + }, + { + "epoch": 0.11477544322762459, + "grad_norm": 0.357815146446228, + "learning_rate": 4.9966707598323565e-05, + "loss": 0.2776, + "step": 6435 + }, + { + "epoch": 0.11479327934933828, + "grad_norm": 0.28077706694602966, + "learning_rate": 4.996662724834039e-05, + "loss": 0.254, + "step": 6436 + }, + { + "epoch": 0.11481111547105198, + "grad_norm": 0.29435211420059204, + "learning_rate": 4.996654680157794e-05, + "loss": 0.2312, + "step": 6437 + }, + { + "epoch": 0.11482895159276567, + "grad_norm": 0.3501879870891571, + "learning_rate": 4.996646625803653e-05, + "loss": 0.3071, + "step": 6438 + }, + { + "epoch": 0.11484678771447937, + "grad_norm": 0.29916924238204956, + "learning_rate": 4.996638561771647e-05, + "loss": 0.2341, + "step": 6439 + }, + { + "epoch": 0.11486462383619306, + "grad_norm": 0.3069373071193695, + "learning_rate": 4.996630488061809e-05, + "loss": 0.2665, + "step": 6440 + }, + { + "epoch": 0.11488245995790675, + "grad_norm": 0.32497838139533997, + "learning_rate": 4.996622404674168e-05, + "loss": 0.2189, + "step": 6441 + }, + { + "epoch": 0.11490029607962045, + "grad_norm": 0.31551235914230347, + "learning_rate": 4.996614311608756e-05, + "loss": 0.2158, + "step": 6442 + }, + { + "epoch": 0.11491813220133414, + "grad_norm": 0.32928216457366943, + "learning_rate": 4.996606208865605e-05, + "loss": 0.2308, + "step": 6443 + }, + { + "epoch": 0.11493596832304784, + "grad_norm": 0.25787994265556335, + "learning_rate": 4.9965980964447456e-05, + "loss": 0.2259, + "step": 6444 + }, + { + "epoch": 0.11495380444476153, + "grad_norm": 0.41256728768348694, + "learning_rate": 4.996589974346211e-05, + "loss": 0.3111, + "step": 6445 + }, + { + "epoch": 0.11497164056647523, + "grad_norm": 0.3078208267688751, + "learning_rate": 4.99658184257003e-05, + "loss": 0.253, + "step": 6446 + }, + { + "epoch": 0.11498947668818892, + "grad_norm": 0.30306053161621094, + "learning_rate": 4.996573701116236e-05, + "loss": 0.2293, + "step": 6447 + }, + { + "epoch": 0.11500731280990262, + "grad_norm": 0.26847824454307556, + "learning_rate": 4.9965655499848595e-05, + "loss": 0.2103, + "step": 6448 + }, + { + "epoch": 0.1150251489316163, + "grad_norm": 0.25561755895614624, + "learning_rate": 4.996557389175933e-05, + "loss": 0.2148, + "step": 6449 + }, + { + "epoch": 0.11504298505333001, + "grad_norm": 0.38018563389778137, + "learning_rate": 4.996549218689488e-05, + "loss": 0.2907, + "step": 6450 + }, + { + "epoch": 0.1150608211750437, + "grad_norm": 0.24978797137737274, + "learning_rate": 4.9965410385255563e-05, + "loss": 0.2327, + "step": 6451 + }, + { + "epoch": 0.1150786572967574, + "grad_norm": 0.25283369421958923, + "learning_rate": 4.996532848684169e-05, + "loss": 0.2186, + "step": 6452 + }, + { + "epoch": 0.11509649341847109, + "grad_norm": 0.36950114369392395, + "learning_rate": 4.996524649165358e-05, + "loss": 0.3156, + "step": 6453 + }, + { + "epoch": 0.11511432954018479, + "grad_norm": 0.37075960636138916, + "learning_rate": 4.9965164399691546e-05, + "loss": 0.2716, + "step": 6454 + }, + { + "epoch": 0.11513216566189848, + "grad_norm": 0.2191033512353897, + "learning_rate": 4.996508221095592e-05, + "loss": 0.1946, + "step": 6455 + }, + { + "epoch": 0.11515000178361218, + "grad_norm": 0.2517928183078766, + "learning_rate": 4.9964999925447006e-05, + "loss": 0.1969, + "step": 6456 + }, + { + "epoch": 0.11516783790532586, + "grad_norm": 0.34651434421539307, + "learning_rate": 4.9964917543165136e-05, + "loss": 0.2295, + "step": 6457 + }, + { + "epoch": 0.11518567402703957, + "grad_norm": 0.39816907048225403, + "learning_rate": 4.996483506411062e-05, + "loss": 0.323, + "step": 6458 + }, + { + "epoch": 0.11520351014875325, + "grad_norm": 0.2937268614768982, + "learning_rate": 4.9964752488283786e-05, + "loss": 0.2753, + "step": 6459 + }, + { + "epoch": 0.11522134627046694, + "grad_norm": 0.26291102170944214, + "learning_rate": 4.996466981568494e-05, + "loss": 0.2612, + "step": 6460 + }, + { + "epoch": 0.11523918239218064, + "grad_norm": 0.2093595564365387, + "learning_rate": 4.996458704631442e-05, + "loss": 0.1959, + "step": 6461 + }, + { + "epoch": 0.11525701851389433, + "grad_norm": 0.25515130162239075, + "learning_rate": 4.996450418017253e-05, + "loss": 0.269, + "step": 6462 + }, + { + "epoch": 0.11527485463560803, + "grad_norm": 0.3007342517375946, + "learning_rate": 4.9964421217259604e-05, + "loss": 0.2474, + "step": 6463 + }, + { + "epoch": 0.11529269075732172, + "grad_norm": 0.396407812833786, + "learning_rate": 4.9964338157575954e-05, + "loss": 0.2741, + "step": 6464 + }, + { + "epoch": 0.11531052687903542, + "grad_norm": 0.3987675905227661, + "learning_rate": 4.9964255001121914e-05, + "loss": 0.2597, + "step": 6465 + }, + { + "epoch": 0.11532836300074911, + "grad_norm": 0.3393239974975586, + "learning_rate": 4.996417174789779e-05, + "loss": 0.3094, + "step": 6466 + }, + { + "epoch": 0.11534619912246281, + "grad_norm": 0.28795576095581055, + "learning_rate": 4.996408839790392e-05, + "loss": 0.2929, + "step": 6467 + }, + { + "epoch": 0.1153640352441765, + "grad_norm": 0.24852922558784485, + "learning_rate": 4.996400495114062e-05, + "loss": 0.2523, + "step": 6468 + }, + { + "epoch": 0.1153818713658902, + "grad_norm": 0.32010847330093384, + "learning_rate": 4.9963921407608214e-05, + "loss": 0.2641, + "step": 6469 + }, + { + "epoch": 0.11539970748760389, + "grad_norm": 0.3046504259109497, + "learning_rate": 4.9963837767307034e-05, + "loss": 0.2756, + "step": 6470 + }, + { + "epoch": 0.1154175436093176, + "grad_norm": 0.3047195374965668, + "learning_rate": 4.996375403023739e-05, + "loss": 0.2424, + "step": 6471 + }, + { + "epoch": 0.11543537973103128, + "grad_norm": 0.2599313259124756, + "learning_rate": 4.9963670196399614e-05, + "loss": 0.257, + "step": 6472 + }, + { + "epoch": 0.11545321585274498, + "grad_norm": 0.32083696126937866, + "learning_rate": 4.9963586265794025e-05, + "loss": 0.2477, + "step": 6473 + }, + { + "epoch": 0.11547105197445867, + "grad_norm": 0.33297500014305115, + "learning_rate": 4.996350223842096e-05, + "loss": 0.1974, + "step": 6474 + }, + { + "epoch": 0.11548888809617237, + "grad_norm": 0.44310811161994934, + "learning_rate": 4.9963418114280736e-05, + "loss": 0.2823, + "step": 6475 + }, + { + "epoch": 0.11550672421788606, + "grad_norm": 0.31379446387290955, + "learning_rate": 4.996333389337368e-05, + "loss": 0.2603, + "step": 6476 + }, + { + "epoch": 0.11552456033959976, + "grad_norm": 0.3003830313682556, + "learning_rate": 4.996324957570012e-05, + "loss": 0.2681, + "step": 6477 + }, + { + "epoch": 0.11554239646131345, + "grad_norm": 0.2535483241081238, + "learning_rate": 4.996316516126038e-05, + "loss": 0.2071, + "step": 6478 + }, + { + "epoch": 0.11556023258302715, + "grad_norm": 0.41141194105148315, + "learning_rate": 4.9963080650054807e-05, + "loss": 0.1867, + "step": 6479 + }, + { + "epoch": 0.11557806870474084, + "grad_norm": 0.3306072950363159, + "learning_rate": 4.996299604208369e-05, + "loss": 0.2536, + "step": 6480 + }, + { + "epoch": 0.11559590482645453, + "grad_norm": 0.2728017270565033, + "learning_rate": 4.996291133734739e-05, + "loss": 0.244, + "step": 6481 + }, + { + "epoch": 0.11561374094816823, + "grad_norm": 0.3277038335800171, + "learning_rate": 4.996282653584622e-05, + "loss": 0.1984, + "step": 6482 + }, + { + "epoch": 0.11563157706988192, + "grad_norm": 0.26794102787971497, + "learning_rate": 4.996274163758051e-05, + "loss": 0.2643, + "step": 6483 + }, + { + "epoch": 0.11564941319159562, + "grad_norm": 0.2872062921524048, + "learning_rate": 4.9962656642550596e-05, + "loss": 0.2108, + "step": 6484 + }, + { + "epoch": 0.11566724931330931, + "grad_norm": 0.2878411114215851, + "learning_rate": 4.9962571550756796e-05, + "loss": 0.2314, + "step": 6485 + }, + { + "epoch": 0.11568508543502301, + "grad_norm": 0.22507423162460327, + "learning_rate": 4.996248636219946e-05, + "loss": 0.2133, + "step": 6486 + }, + { + "epoch": 0.1157029215567367, + "grad_norm": 0.29494550824165344, + "learning_rate": 4.9962401076878896e-05, + "loss": 0.2535, + "step": 6487 + }, + { + "epoch": 0.1157207576784504, + "grad_norm": 0.30076175928115845, + "learning_rate": 4.996231569479545e-05, + "loss": 0.256, + "step": 6488 + }, + { + "epoch": 0.11573859380016409, + "grad_norm": 0.29725104570388794, + "learning_rate": 4.996223021594943e-05, + "loss": 0.237, + "step": 6489 + }, + { + "epoch": 0.11575642992187779, + "grad_norm": 0.2872684895992279, + "learning_rate": 4.99621446403412e-05, + "loss": 0.2464, + "step": 6490 + }, + { + "epoch": 0.11577426604359148, + "grad_norm": 0.2946079671382904, + "learning_rate": 4.996205896797107e-05, + "loss": 0.2098, + "step": 6491 + }, + { + "epoch": 0.11579210216530518, + "grad_norm": 0.32457631826400757, + "learning_rate": 4.9961973198839385e-05, + "loss": 0.2796, + "step": 6492 + }, + { + "epoch": 0.11580993828701887, + "grad_norm": 0.2626686990261078, + "learning_rate": 4.9961887332946464e-05, + "loss": 0.2303, + "step": 6493 + }, + { + "epoch": 0.11582777440873257, + "grad_norm": 0.376022070646286, + "learning_rate": 4.9961801370292646e-05, + "loss": 0.2298, + "step": 6494 + }, + { + "epoch": 0.11584561053044626, + "grad_norm": 0.29716289043426514, + "learning_rate": 4.9961715310878263e-05, + "loss": 0.2096, + "step": 6495 + }, + { + "epoch": 0.11586344665215996, + "grad_norm": 0.3023572862148285, + "learning_rate": 4.9961629154703655e-05, + "loss": 0.2592, + "step": 6496 + }, + { + "epoch": 0.11588128277387365, + "grad_norm": 0.3096959590911865, + "learning_rate": 4.9961542901769146e-05, + "loss": 0.2651, + "step": 6497 + }, + { + "epoch": 0.11589911889558735, + "grad_norm": 0.32270315289497375, + "learning_rate": 4.996145655207508e-05, + "loss": 0.2717, + "step": 6498 + }, + { + "epoch": 0.11591695501730104, + "grad_norm": 0.2928166687488556, + "learning_rate": 4.996137010562179e-05, + "loss": 0.243, + "step": 6499 + }, + { + "epoch": 0.11593479113901474, + "grad_norm": 0.31219711899757385, + "learning_rate": 4.9961283562409595e-05, + "loss": 0.3059, + "step": 6500 + }, + { + "epoch": 0.11595262726072843, + "grad_norm": 0.36242711544036865, + "learning_rate": 4.9961196922438843e-05, + "loss": 0.3463, + "step": 6501 + }, + { + "epoch": 0.11597046338244212, + "grad_norm": 0.3275732100009918, + "learning_rate": 4.9961110185709886e-05, + "loss": 0.2772, + "step": 6502 + }, + { + "epoch": 0.11598829950415582, + "grad_norm": 0.3870683014392853, + "learning_rate": 4.996102335222303e-05, + "loss": 0.3145, + "step": 6503 + }, + { + "epoch": 0.1160061356258695, + "grad_norm": 0.3099972605705261, + "learning_rate": 4.996093642197864e-05, + "loss": 0.2505, + "step": 6504 + }, + { + "epoch": 0.11602397174758321, + "grad_norm": 0.273925244808197, + "learning_rate": 4.996084939497703e-05, + "loss": 0.2285, + "step": 6505 + }, + { + "epoch": 0.1160418078692969, + "grad_norm": 0.33993473649024963, + "learning_rate": 4.9960762271218554e-05, + "loss": 0.3025, + "step": 6506 + }, + { + "epoch": 0.1160596439910106, + "grad_norm": 0.29106321930885315, + "learning_rate": 4.996067505070353e-05, + "loss": 0.1543, + "step": 6507 + }, + { + "epoch": 0.11607748011272429, + "grad_norm": 0.3417963981628418, + "learning_rate": 4.996058773343232e-05, + "loss": 0.2547, + "step": 6508 + }, + { + "epoch": 0.11609531623443799, + "grad_norm": 0.2559162974357605, + "learning_rate": 4.9960500319405246e-05, + "loss": 0.2384, + "step": 6509 + }, + { + "epoch": 0.11611315235615167, + "grad_norm": 0.523456871509552, + "learning_rate": 4.996041280862265e-05, + "loss": 0.2227, + "step": 6510 + }, + { + "epoch": 0.11613098847786538, + "grad_norm": 0.2539406716823578, + "learning_rate": 4.996032520108488e-05, + "loss": 0.246, + "step": 6511 + }, + { + "epoch": 0.11614882459957906, + "grad_norm": 0.2890263497829437, + "learning_rate": 4.996023749679226e-05, + "loss": 0.2105, + "step": 6512 + }, + { + "epoch": 0.11616666072129277, + "grad_norm": 0.2841342091560364, + "learning_rate": 4.996014969574514e-05, + "loss": 0.2724, + "step": 6513 + }, + { + "epoch": 0.11618449684300645, + "grad_norm": 0.3194986581802368, + "learning_rate": 4.996006179794386e-05, + "loss": 0.1871, + "step": 6514 + }, + { + "epoch": 0.11620233296472016, + "grad_norm": 0.28453782200813293, + "learning_rate": 4.995997380338876e-05, + "loss": 0.2868, + "step": 6515 + }, + { + "epoch": 0.11622016908643384, + "grad_norm": 0.26048335433006287, + "learning_rate": 4.995988571208018e-05, + "loss": 0.2275, + "step": 6516 + }, + { + "epoch": 0.11623800520814755, + "grad_norm": 0.35794925689697266, + "learning_rate": 4.995979752401847e-05, + "loss": 0.2656, + "step": 6517 + }, + { + "epoch": 0.11625584132986123, + "grad_norm": 0.2667000889778137, + "learning_rate": 4.9959709239203954e-05, + "loss": 0.2628, + "step": 6518 + }, + { + "epoch": 0.11627367745157494, + "grad_norm": 0.3313361704349518, + "learning_rate": 4.995962085763699e-05, + "loss": 0.2469, + "step": 6519 + }, + { + "epoch": 0.11629151357328862, + "grad_norm": 0.3026551306247711, + "learning_rate": 4.995953237931791e-05, + "loss": 0.2531, + "step": 6520 + }, + { + "epoch": 0.11630934969500231, + "grad_norm": 0.2981475293636322, + "learning_rate": 4.995944380424706e-05, + "loss": 0.2563, + "step": 6521 + }, + { + "epoch": 0.11632718581671601, + "grad_norm": 0.44038650393486023, + "learning_rate": 4.995935513242479e-05, + "loss": 0.296, + "step": 6522 + }, + { + "epoch": 0.1163450219384297, + "grad_norm": 0.21879370510578156, + "learning_rate": 4.995926636385144e-05, + "loss": 0.198, + "step": 6523 + }, + { + "epoch": 0.1163628580601434, + "grad_norm": 0.4341939389705658, + "learning_rate": 4.995917749852735e-05, + "loss": 0.2931, + "step": 6524 + }, + { + "epoch": 0.11638069418185709, + "grad_norm": 0.21881984174251556, + "learning_rate": 4.995908853645287e-05, + "loss": 0.2055, + "step": 6525 + }, + { + "epoch": 0.1163985303035708, + "grad_norm": 0.25374263525009155, + "learning_rate": 4.995899947762834e-05, + "loss": 0.2823, + "step": 6526 + }, + { + "epoch": 0.11641636642528448, + "grad_norm": 0.4448148310184479, + "learning_rate": 4.995891032205411e-05, + "loss": 0.2673, + "step": 6527 + }, + { + "epoch": 0.11643420254699818, + "grad_norm": 0.2964814305305481, + "learning_rate": 4.9958821069730514e-05, + "loss": 0.2506, + "step": 6528 + }, + { + "epoch": 0.11645203866871187, + "grad_norm": 0.2353295385837555, + "learning_rate": 4.995873172065791e-05, + "loss": 0.1917, + "step": 6529 + }, + { + "epoch": 0.11646987479042557, + "grad_norm": 0.3661505877971649, + "learning_rate": 4.995864227483665e-05, + "loss": 0.2805, + "step": 6530 + }, + { + "epoch": 0.11648771091213926, + "grad_norm": 0.3254474103450775, + "learning_rate": 4.995855273226707e-05, + "loss": 0.2197, + "step": 6531 + }, + { + "epoch": 0.11650554703385296, + "grad_norm": 0.28213346004486084, + "learning_rate": 4.995846309294952e-05, + "loss": 0.2391, + "step": 6532 + }, + { + "epoch": 0.11652338315556665, + "grad_norm": 0.38951122760772705, + "learning_rate": 4.9958373356884334e-05, + "loss": 0.2189, + "step": 6533 + }, + { + "epoch": 0.11654121927728035, + "grad_norm": 0.2582286596298218, + "learning_rate": 4.995828352407189e-05, + "loss": 0.2568, + "step": 6534 + }, + { + "epoch": 0.11655905539899404, + "grad_norm": 0.3363356292247772, + "learning_rate": 4.99581935945125e-05, + "loss": 0.2397, + "step": 6535 + }, + { + "epoch": 0.11657689152070774, + "grad_norm": 0.24810463190078735, + "learning_rate": 4.995810356820655e-05, + "loss": 0.2711, + "step": 6536 + }, + { + "epoch": 0.11659472764242143, + "grad_norm": 0.26700448989868164, + "learning_rate": 4.9958013445154365e-05, + "loss": 0.2649, + "step": 6537 + }, + { + "epoch": 0.11661256376413513, + "grad_norm": 0.3393983244895935, + "learning_rate": 4.995792322535629e-05, + "loss": 0.2674, + "step": 6538 + }, + { + "epoch": 0.11663039988584882, + "grad_norm": 0.3047894835472107, + "learning_rate": 4.99578329088127e-05, + "loss": 0.2624, + "step": 6539 + }, + { + "epoch": 0.11664823600756252, + "grad_norm": 0.37343868613243103, + "learning_rate": 4.995774249552391e-05, + "loss": 0.3026, + "step": 6540 + }, + { + "epoch": 0.11666607212927621, + "grad_norm": 0.3277461528778076, + "learning_rate": 4.9957651985490305e-05, + "loss": 0.2905, + "step": 6541 + }, + { + "epoch": 0.1166839082509899, + "grad_norm": 0.3397107720375061, + "learning_rate": 4.9957561378712216e-05, + "loss": 0.2455, + "step": 6542 + }, + { + "epoch": 0.1167017443727036, + "grad_norm": 0.20923329889774323, + "learning_rate": 4.995747067519e-05, + "loss": 0.2026, + "step": 6543 + }, + { + "epoch": 0.11671958049441729, + "grad_norm": 0.31039029359817505, + "learning_rate": 4.995737987492401e-05, + "loss": 0.2255, + "step": 6544 + }, + { + "epoch": 0.11673741661613099, + "grad_norm": 0.23596099019050598, + "learning_rate": 4.9957288977914585e-05, + "loss": 0.2107, + "step": 6545 + }, + { + "epoch": 0.11675525273784468, + "grad_norm": 0.4043689966201782, + "learning_rate": 4.9957197984162094e-05, + "loss": 0.2299, + "step": 6546 + }, + { + "epoch": 0.11677308885955838, + "grad_norm": 0.2682308852672577, + "learning_rate": 4.995710689366689e-05, + "loss": 0.2398, + "step": 6547 + }, + { + "epoch": 0.11679092498127207, + "grad_norm": 0.26397573947906494, + "learning_rate": 4.995701570642931e-05, + "loss": 0.2, + "step": 6548 + }, + { + "epoch": 0.11680876110298577, + "grad_norm": 0.31530824303627014, + "learning_rate": 4.995692442244972e-05, + "loss": 0.3019, + "step": 6549 + }, + { + "epoch": 0.11682659722469946, + "grad_norm": 0.2686672508716583, + "learning_rate": 4.995683304172848e-05, + "loss": 0.2578, + "step": 6550 + }, + { + "epoch": 0.11684443334641316, + "grad_norm": 0.4198894202709198, + "learning_rate": 4.995674156426593e-05, + "loss": 0.1865, + "step": 6551 + }, + { + "epoch": 0.11686226946812685, + "grad_norm": 0.3906414210796356, + "learning_rate": 4.9956649990062425e-05, + "loss": 0.2459, + "step": 6552 + }, + { + "epoch": 0.11688010558984055, + "grad_norm": 0.303987056016922, + "learning_rate": 4.995655831911833e-05, + "loss": 0.2608, + "step": 6553 + }, + { + "epoch": 0.11689794171155424, + "grad_norm": 0.24724708497524261, + "learning_rate": 4.9956466551433996e-05, + "loss": 0.2321, + "step": 6554 + }, + { + "epoch": 0.11691577783326794, + "grad_norm": 0.3111753463745117, + "learning_rate": 4.995637468700978e-05, + "loss": 0.2368, + "step": 6555 + }, + { + "epoch": 0.11693361395498163, + "grad_norm": 0.2826472818851471, + "learning_rate": 4.9956282725846025e-05, + "loss": 0.235, + "step": 6556 + }, + { + "epoch": 0.11695145007669533, + "grad_norm": 0.21869373321533203, + "learning_rate": 4.9956190667943105e-05, + "loss": 0.2272, + "step": 6557 + }, + { + "epoch": 0.11696928619840902, + "grad_norm": 0.21034951508045197, + "learning_rate": 4.995609851330137e-05, + "loss": 0.2254, + "step": 6558 + }, + { + "epoch": 0.11698712232012272, + "grad_norm": 0.30858585238456726, + "learning_rate": 4.995600626192118e-05, + "loss": 0.2623, + "step": 6559 + }, + { + "epoch": 0.1170049584418364, + "grad_norm": 0.2884853780269623, + "learning_rate": 4.995591391380289e-05, + "loss": 0.2639, + "step": 6560 + }, + { + "epoch": 0.1170227945635501, + "grad_norm": 0.2359286993741989, + "learning_rate": 4.995582146894685e-05, + "loss": 0.2539, + "step": 6561 + }, + { + "epoch": 0.1170406306852638, + "grad_norm": 0.27310407161712646, + "learning_rate": 4.995572892735344e-05, + "loss": 0.2584, + "step": 6562 + }, + { + "epoch": 0.11705846680697748, + "grad_norm": 0.2923947274684906, + "learning_rate": 4.9955636289023e-05, + "loss": 0.2785, + "step": 6563 + }, + { + "epoch": 0.11707630292869119, + "grad_norm": 0.23849262297153473, + "learning_rate": 4.9955543553955886e-05, + "loss": 0.2302, + "step": 6564 + }, + { + "epoch": 0.11709413905040487, + "grad_norm": 0.33659827709198, + "learning_rate": 4.995545072215248e-05, + "loss": 0.2384, + "step": 6565 + }, + { + "epoch": 0.11711197517211858, + "grad_norm": 0.222193643450737, + "learning_rate": 4.9955357793613115e-05, + "loss": 0.1611, + "step": 6566 + }, + { + "epoch": 0.11712981129383226, + "grad_norm": 0.3508221209049225, + "learning_rate": 4.995526476833817e-05, + "loss": 0.2726, + "step": 6567 + }, + { + "epoch": 0.11714764741554597, + "grad_norm": 0.26974058151245117, + "learning_rate": 4.9955171646328e-05, + "loss": 0.2534, + "step": 6568 + }, + { + "epoch": 0.11716548353725965, + "grad_norm": 0.29008787870407104, + "learning_rate": 4.995507842758296e-05, + "loss": 0.2275, + "step": 6569 + }, + { + "epoch": 0.11718331965897336, + "grad_norm": 0.3137807250022888, + "learning_rate": 4.9954985112103426e-05, + "loss": 0.2848, + "step": 6570 + }, + { + "epoch": 0.11720115578068704, + "grad_norm": 0.3047144114971161, + "learning_rate": 4.9954891699889745e-05, + "loss": 0.269, + "step": 6571 + }, + { + "epoch": 0.11721899190240075, + "grad_norm": 0.36958450078964233, + "learning_rate": 4.9954798190942286e-05, + "loss": 0.2438, + "step": 6572 + }, + { + "epoch": 0.11723682802411443, + "grad_norm": 0.31683841347694397, + "learning_rate": 4.99547045852614e-05, + "loss": 0.177, + "step": 6573 + }, + { + "epoch": 0.11725466414582814, + "grad_norm": 0.30695050954818726, + "learning_rate": 4.995461088284748e-05, + "loss": 0.254, + "step": 6574 + }, + { + "epoch": 0.11727250026754182, + "grad_norm": 0.2589077651500702, + "learning_rate": 4.995451708370085e-05, + "loss": 0.242, + "step": 6575 + }, + { + "epoch": 0.11729033638925553, + "grad_norm": 0.4255754053592682, + "learning_rate": 4.9954423187821906e-05, + "loss": 0.28, + "step": 6576 + }, + { + "epoch": 0.11730817251096921, + "grad_norm": 0.32474592328071594, + "learning_rate": 4.995432919521099e-05, + "loss": 0.3075, + "step": 6577 + }, + { + "epoch": 0.11732600863268292, + "grad_norm": 0.2682439982891083, + "learning_rate": 4.9954235105868486e-05, + "loss": 0.2371, + "step": 6578 + }, + { + "epoch": 0.1173438447543966, + "grad_norm": 0.3613157868385315, + "learning_rate": 4.995414091979474e-05, + "loss": 0.2335, + "step": 6579 + }, + { + "epoch": 0.1173616808761103, + "grad_norm": 0.226851224899292, + "learning_rate": 4.9954046636990124e-05, + "loss": 0.2178, + "step": 6580 + }, + { + "epoch": 0.117379516997824, + "grad_norm": 0.3016261160373688, + "learning_rate": 4.9953952257455005e-05, + "loss": 0.2686, + "step": 6581 + }, + { + "epoch": 0.11739735311953768, + "grad_norm": 0.26512303948402405, + "learning_rate": 4.995385778118975e-05, + "loss": 0.2245, + "step": 6582 + }, + { + "epoch": 0.11741518924125138, + "grad_norm": 0.30120185017585754, + "learning_rate": 4.995376320819472e-05, + "loss": 0.2316, + "step": 6583 + }, + { + "epoch": 0.11743302536296507, + "grad_norm": 0.38008370995521545, + "learning_rate": 4.995366853847029e-05, + "loss": 0.2354, + "step": 6584 + }, + { + "epoch": 0.11745086148467877, + "grad_norm": 0.3327029049396515, + "learning_rate": 4.995357377201682e-05, + "loss": 0.2229, + "step": 6585 + }, + { + "epoch": 0.11746869760639246, + "grad_norm": 0.27921929955482483, + "learning_rate": 4.9953478908834675e-05, + "loss": 0.211, + "step": 6586 + }, + { + "epoch": 0.11748653372810616, + "grad_norm": 0.2631414234638214, + "learning_rate": 4.995338394892423e-05, + "loss": 0.2494, + "step": 6587 + }, + { + "epoch": 0.11750436984981985, + "grad_norm": 0.2520674765110016, + "learning_rate": 4.9953288892285854e-05, + "loss": 0.2591, + "step": 6588 + }, + { + "epoch": 0.11752220597153355, + "grad_norm": 0.22743797302246094, + "learning_rate": 4.9953193738919914e-05, + "loss": 0.1758, + "step": 6589 + }, + { + "epoch": 0.11754004209324724, + "grad_norm": 0.353944331407547, + "learning_rate": 4.995309848882677e-05, + "loss": 0.2928, + "step": 6590 + }, + { + "epoch": 0.11755787821496094, + "grad_norm": 0.26722222566604614, + "learning_rate": 4.995300314200679e-05, + "loss": 0.2199, + "step": 6591 + }, + { + "epoch": 0.11757571433667463, + "grad_norm": 0.4017643928527832, + "learning_rate": 4.9952907698460366e-05, + "loss": 0.2431, + "step": 6592 + }, + { + "epoch": 0.11759355045838833, + "grad_norm": 0.3151828944683075, + "learning_rate": 4.995281215818785e-05, + "loss": 0.2712, + "step": 6593 + }, + { + "epoch": 0.11761138658010202, + "grad_norm": 0.21078132092952728, + "learning_rate": 4.9952716521189616e-05, + "loss": 0.2107, + "step": 6594 + }, + { + "epoch": 0.11762922270181572, + "grad_norm": 0.2749013304710388, + "learning_rate": 4.9952620787466034e-05, + "loss": 0.2412, + "step": 6595 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 0.26387470960617065, + "learning_rate": 4.9952524957017464e-05, + "loss": 0.1953, + "step": 6596 + }, + { + "epoch": 0.11766489494524311, + "grad_norm": 0.3293147683143616, + "learning_rate": 4.9952429029844304e-05, + "loss": 0.2751, + "step": 6597 + }, + { + "epoch": 0.1176827310669568, + "grad_norm": 0.25535276532173157, + "learning_rate": 4.9952333005946906e-05, + "loss": 0.2381, + "step": 6598 + }, + { + "epoch": 0.1177005671886705, + "grad_norm": 0.30123427510261536, + "learning_rate": 4.9952236885325644e-05, + "loss": 0.2064, + "step": 6599 + }, + { + "epoch": 0.11771840331038419, + "grad_norm": 0.3245164453983307, + "learning_rate": 4.99521406679809e-05, + "loss": 0.2512, + "step": 6600 + }, + { + "epoch": 0.11773623943209788, + "grad_norm": 0.2973853051662445, + "learning_rate": 4.995204435391303e-05, + "loss": 0.2492, + "step": 6601 + }, + { + "epoch": 0.11775407555381158, + "grad_norm": 0.4251580238342285, + "learning_rate": 4.9951947943122425e-05, + "loss": 0.3153, + "step": 6602 + }, + { + "epoch": 0.11777191167552527, + "grad_norm": 0.23978105187416077, + "learning_rate": 4.995185143560945e-05, + "loss": 0.2151, + "step": 6603 + }, + { + "epoch": 0.11778974779723897, + "grad_norm": 0.25439050793647766, + "learning_rate": 4.9951754831374485e-05, + "loss": 0.2507, + "step": 6604 + }, + { + "epoch": 0.11780758391895266, + "grad_norm": 0.27356353402137756, + "learning_rate": 4.9951658130417897e-05, + "loss": 0.189, + "step": 6605 + }, + { + "epoch": 0.11782542004066636, + "grad_norm": 0.3462202250957489, + "learning_rate": 4.995156133274006e-05, + "loss": 0.2398, + "step": 6606 + }, + { + "epoch": 0.11784325616238005, + "grad_norm": 0.27835625410079956, + "learning_rate": 4.995146443834136e-05, + "loss": 0.2305, + "step": 6607 + }, + { + "epoch": 0.11786109228409375, + "grad_norm": 0.42552027106285095, + "learning_rate": 4.995136744722216e-05, + "loss": 0.2283, + "step": 6608 + }, + { + "epoch": 0.11787892840580744, + "grad_norm": 0.33396610617637634, + "learning_rate": 4.9951270359382854e-05, + "loss": 0.2483, + "step": 6609 + }, + { + "epoch": 0.11789676452752114, + "grad_norm": 0.36073634028434753, + "learning_rate": 4.995117317482379e-05, + "loss": 0.2802, + "step": 6610 + }, + { + "epoch": 0.11791460064923483, + "grad_norm": 0.3328351676464081, + "learning_rate": 4.995107589354537e-05, + "loss": 0.2249, + "step": 6611 + }, + { + "epoch": 0.11793243677094853, + "grad_norm": 0.27836957573890686, + "learning_rate": 4.9950978515547955e-05, + "loss": 0.2508, + "step": 6612 + }, + { + "epoch": 0.11795027289266222, + "grad_norm": 0.31090670824050903, + "learning_rate": 4.995088104083194e-05, + "loss": 0.247, + "step": 6613 + }, + { + "epoch": 0.11796810901437592, + "grad_norm": 0.32610180974006653, + "learning_rate": 4.995078346939769e-05, + "loss": 0.2439, + "step": 6614 + }, + { + "epoch": 0.1179859451360896, + "grad_norm": 0.2930186688899994, + "learning_rate": 4.995068580124558e-05, + "loss": 0.2517, + "step": 6615 + }, + { + "epoch": 0.11800378125780331, + "grad_norm": 0.3002059757709503, + "learning_rate": 4.9950588036375996e-05, + "loss": 0.2864, + "step": 6616 + }, + { + "epoch": 0.118021617379517, + "grad_norm": 0.25812119245529175, + "learning_rate": 4.995049017478931e-05, + "loss": 0.2364, + "step": 6617 + }, + { + "epoch": 0.1180394535012307, + "grad_norm": 0.23621611297130585, + "learning_rate": 4.995039221648592e-05, + "loss": 0.2157, + "step": 6618 + }, + { + "epoch": 0.11805728962294439, + "grad_norm": 0.3015047311782837, + "learning_rate": 4.995029416146618e-05, + "loss": 0.2681, + "step": 6619 + }, + { + "epoch": 0.11807512574465809, + "grad_norm": 0.38616377115249634, + "learning_rate": 4.995019600973049e-05, + "loss": 0.2638, + "step": 6620 + }, + { + "epoch": 0.11809296186637178, + "grad_norm": 0.2730218768119812, + "learning_rate": 4.9950097761279216e-05, + "loss": 0.2351, + "step": 6621 + }, + { + "epoch": 0.11811079798808546, + "grad_norm": 0.3057653307914734, + "learning_rate": 4.994999941611275e-05, + "loss": 0.2449, + "step": 6622 + }, + { + "epoch": 0.11812863410979917, + "grad_norm": 0.2960295081138611, + "learning_rate": 4.9949900974231466e-05, + "loss": 0.2147, + "step": 6623 + }, + { + "epoch": 0.11814647023151285, + "grad_norm": 0.3486759066581726, + "learning_rate": 4.994980243563575e-05, + "loss": 0.2918, + "step": 6624 + }, + { + "epoch": 0.11816430635322656, + "grad_norm": 0.335227906703949, + "learning_rate": 4.994970380032599e-05, + "loss": 0.2261, + "step": 6625 + }, + { + "epoch": 0.11818214247494024, + "grad_norm": 0.2913608253002167, + "learning_rate": 4.994960506830255e-05, + "loss": 0.2308, + "step": 6626 + }, + { + "epoch": 0.11819997859665395, + "grad_norm": 0.3032926023006439, + "learning_rate": 4.9949506239565823e-05, + "loss": 0.1946, + "step": 6627 + }, + { + "epoch": 0.11821781471836763, + "grad_norm": 0.29746878147125244, + "learning_rate": 4.99494073141162e-05, + "loss": 0.2485, + "step": 6628 + }, + { + "epoch": 0.11823565084008134, + "grad_norm": 0.24762023985385895, + "learning_rate": 4.994930829195405e-05, + "loss": 0.1897, + "step": 6629 + }, + { + "epoch": 0.11825348696179502, + "grad_norm": 0.2895682156085968, + "learning_rate": 4.994920917307977e-05, + "loss": 0.2041, + "step": 6630 + }, + { + "epoch": 0.11827132308350873, + "grad_norm": 0.36560821533203125, + "learning_rate": 4.994910995749373e-05, + "loss": 0.2905, + "step": 6631 + }, + { + "epoch": 0.11828915920522241, + "grad_norm": 0.247646301984787, + "learning_rate": 4.994901064519633e-05, + "loss": 0.2463, + "step": 6632 + }, + { + "epoch": 0.11830699532693612, + "grad_norm": 0.2746172547340393, + "learning_rate": 4.994891123618794e-05, + "loss": 0.2764, + "step": 6633 + }, + { + "epoch": 0.1183248314486498, + "grad_norm": 0.2499716579914093, + "learning_rate": 4.9948811730468964e-05, + "loss": 0.2391, + "step": 6634 + }, + { + "epoch": 0.1183426675703635, + "grad_norm": 0.2748628556728363, + "learning_rate": 4.994871212803977e-05, + "loss": 0.2461, + "step": 6635 + }, + { + "epoch": 0.11836050369207719, + "grad_norm": 0.32685232162475586, + "learning_rate": 4.9948612428900755e-05, + "loss": 0.2476, + "step": 6636 + }, + { + "epoch": 0.1183783398137909, + "grad_norm": 0.2630250155925751, + "learning_rate": 4.99485126330523e-05, + "loss": 0.245, + "step": 6637 + }, + { + "epoch": 0.11839617593550458, + "grad_norm": 0.214598149061203, + "learning_rate": 4.994841274049479e-05, + "loss": 0.1883, + "step": 6638 + }, + { + "epoch": 0.11841401205721828, + "grad_norm": 0.3095002770423889, + "learning_rate": 4.994831275122862e-05, + "loss": 0.2162, + "step": 6639 + }, + { + "epoch": 0.11843184817893197, + "grad_norm": 0.3015035390853882, + "learning_rate": 4.9948212665254164e-05, + "loss": 0.2522, + "step": 6640 + }, + { + "epoch": 0.11844968430064566, + "grad_norm": 0.22176417708396912, + "learning_rate": 4.9948112482571824e-05, + "loss": 0.2248, + "step": 6641 + }, + { + "epoch": 0.11846752042235936, + "grad_norm": 0.17620062828063965, + "learning_rate": 4.9948012203181984e-05, + "loss": 0.1728, + "step": 6642 + }, + { + "epoch": 0.11848535654407305, + "grad_norm": 0.266944944858551, + "learning_rate": 4.994791182708503e-05, + "loss": 0.2401, + "step": 6643 + }, + { + "epoch": 0.11850319266578675, + "grad_norm": 0.35771605372428894, + "learning_rate": 4.9947811354281356e-05, + "loss": 0.2796, + "step": 6644 + }, + { + "epoch": 0.11852102878750044, + "grad_norm": 0.27012234926223755, + "learning_rate": 4.994771078477135e-05, + "loss": 0.2315, + "step": 6645 + }, + { + "epoch": 0.11853886490921414, + "grad_norm": 0.3303559124469757, + "learning_rate": 4.99476101185554e-05, + "loss": 0.2775, + "step": 6646 + }, + { + "epoch": 0.11855670103092783, + "grad_norm": 0.29805228114128113, + "learning_rate": 4.9947509355633885e-05, + "loss": 0.2268, + "step": 6647 + }, + { + "epoch": 0.11857453715264153, + "grad_norm": 0.2512591481208801, + "learning_rate": 4.994740849600722e-05, + "loss": 0.2129, + "step": 6648 + }, + { + "epoch": 0.11859237327435522, + "grad_norm": 0.3391852378845215, + "learning_rate": 4.994730753967578e-05, + "loss": 0.2284, + "step": 6649 + }, + { + "epoch": 0.11861020939606892, + "grad_norm": 0.22234192490577698, + "learning_rate": 4.9947206486639956e-05, + "loss": 0.2152, + "step": 6650 + }, + { + "epoch": 0.11862804551778261, + "grad_norm": 0.2869161367416382, + "learning_rate": 4.994710533690015e-05, + "loss": 0.164, + "step": 6651 + }, + { + "epoch": 0.11864588163949631, + "grad_norm": 0.4121025800704956, + "learning_rate": 4.994700409045674e-05, + "loss": 0.2403, + "step": 6652 + }, + { + "epoch": 0.11866371776121, + "grad_norm": 0.2744717299938202, + "learning_rate": 4.994690274731013e-05, + "loss": 0.2896, + "step": 6653 + }, + { + "epoch": 0.1186815538829237, + "grad_norm": 0.24404260516166687, + "learning_rate": 4.9946801307460705e-05, + "loss": 0.2179, + "step": 6654 + }, + { + "epoch": 0.11869939000463739, + "grad_norm": 0.2969261705875397, + "learning_rate": 4.994669977090887e-05, + "loss": 0.2448, + "step": 6655 + }, + { + "epoch": 0.11871722612635109, + "grad_norm": 0.31903931498527527, + "learning_rate": 4.9946598137655e-05, + "loss": 0.2262, + "step": 6656 + }, + { + "epoch": 0.11873506224806478, + "grad_norm": 0.5626586675643921, + "learning_rate": 4.9946496407699505e-05, + "loss": 0.2645, + "step": 6657 + }, + { + "epoch": 0.11875289836977848, + "grad_norm": 0.26961636543273926, + "learning_rate": 4.9946394581042766e-05, + "loss": 0.2242, + "step": 6658 + }, + { + "epoch": 0.11877073449149217, + "grad_norm": 0.33832380175590515, + "learning_rate": 4.9946292657685194e-05, + "loss": 0.2846, + "step": 6659 + }, + { + "epoch": 0.11878857061320587, + "grad_norm": 0.20829878747463226, + "learning_rate": 4.994619063762718e-05, + "loss": 0.2231, + "step": 6660 + }, + { + "epoch": 0.11880640673491956, + "grad_norm": 0.27362382411956787, + "learning_rate": 4.994608852086911e-05, + "loss": 0.2139, + "step": 6661 + }, + { + "epoch": 0.11882424285663325, + "grad_norm": 0.2556726634502411, + "learning_rate": 4.994598630741137e-05, + "loss": 0.241, + "step": 6662 + }, + { + "epoch": 0.11884207897834695, + "grad_norm": 0.2956306040287018, + "learning_rate": 4.9945883997254395e-05, + "loss": 0.317, + "step": 6663 + }, + { + "epoch": 0.11885991510006064, + "grad_norm": 0.29402780532836914, + "learning_rate": 4.9945781590398546e-05, + "loss": 0.242, + "step": 6664 + }, + { + "epoch": 0.11887775122177434, + "grad_norm": 0.22185377776622772, + "learning_rate": 4.994567908684423e-05, + "loss": 0.2097, + "step": 6665 + }, + { + "epoch": 0.11889558734348803, + "grad_norm": 0.3091468811035156, + "learning_rate": 4.994557648659185e-05, + "loss": 0.2363, + "step": 6666 + }, + { + "epoch": 0.11891342346520173, + "grad_norm": 0.31103816628456116, + "learning_rate": 4.9945473789641794e-05, + "loss": 0.287, + "step": 6667 + }, + { + "epoch": 0.11893125958691542, + "grad_norm": 0.30507156252861023, + "learning_rate": 4.994537099599447e-05, + "loss": 0.3044, + "step": 6668 + }, + { + "epoch": 0.11894909570862912, + "grad_norm": 0.34385067224502563, + "learning_rate": 4.9945268105650274e-05, + "loss": 0.2604, + "step": 6669 + }, + { + "epoch": 0.1189669318303428, + "grad_norm": 0.2492048591375351, + "learning_rate": 4.99451651186096e-05, + "loss": 0.2526, + "step": 6670 + }, + { + "epoch": 0.11898476795205651, + "grad_norm": 0.32889285683631897, + "learning_rate": 4.994506203487285e-05, + "loss": 0.2379, + "step": 6671 + }, + { + "epoch": 0.1190026040737702, + "grad_norm": 0.26086732745170593, + "learning_rate": 4.994495885444043e-05, + "loss": 0.2191, + "step": 6672 + }, + { + "epoch": 0.1190204401954839, + "grad_norm": 0.3591679036617279, + "learning_rate": 4.994485557731272e-05, + "loss": 0.2758, + "step": 6673 + }, + { + "epoch": 0.11903827631719759, + "grad_norm": 0.29329365491867065, + "learning_rate": 4.9944752203490144e-05, + "loss": 0.2618, + "step": 6674 + }, + { + "epoch": 0.11905611243891129, + "grad_norm": 0.34661364555358887, + "learning_rate": 4.994464873297309e-05, + "loss": 0.2673, + "step": 6675 + }, + { + "epoch": 0.11907394856062498, + "grad_norm": 0.2907252311706543, + "learning_rate": 4.994454516576197e-05, + "loss": 0.253, + "step": 6676 + }, + { + "epoch": 0.11909178468233868, + "grad_norm": 0.3732890486717224, + "learning_rate": 4.994444150185716e-05, + "loss": 0.2695, + "step": 6677 + }, + { + "epoch": 0.11910962080405237, + "grad_norm": 0.24741658568382263, + "learning_rate": 4.9944337741259095e-05, + "loss": 0.2255, + "step": 6678 + }, + { + "epoch": 0.11912745692576607, + "grad_norm": 0.35463738441467285, + "learning_rate": 4.9944233883968163e-05, + "loss": 0.2489, + "step": 6679 + }, + { + "epoch": 0.11914529304747976, + "grad_norm": 0.28461650013923645, + "learning_rate": 4.994412992998475e-05, + "loss": 0.2562, + "step": 6680 + }, + { + "epoch": 0.11916312916919346, + "grad_norm": 0.4393540024757385, + "learning_rate": 4.994402587930928e-05, + "loss": 0.2461, + "step": 6681 + }, + { + "epoch": 0.11918096529090715, + "grad_norm": 0.24490773677825928, + "learning_rate": 4.9943921731942155e-05, + "loss": 0.2249, + "step": 6682 + }, + { + "epoch": 0.11919880141262083, + "grad_norm": 0.23941229283809662, + "learning_rate": 4.994381748788377e-05, + "loss": 0.1848, + "step": 6683 + }, + { + "epoch": 0.11921663753433454, + "grad_norm": 0.3012526035308838, + "learning_rate": 4.994371314713454e-05, + "loss": 0.2049, + "step": 6684 + }, + { + "epoch": 0.11923447365604822, + "grad_norm": 0.45927903056144714, + "learning_rate": 4.994360870969486e-05, + "loss": 0.2074, + "step": 6685 + }, + { + "epoch": 0.11925230977776193, + "grad_norm": 0.5445613861083984, + "learning_rate": 4.9943504175565134e-05, + "loss": 0.245, + "step": 6686 + }, + { + "epoch": 0.11927014589947561, + "grad_norm": 0.3541640043258667, + "learning_rate": 4.9943399544745765e-05, + "loss": 0.2455, + "step": 6687 + }, + { + "epoch": 0.11928798202118931, + "grad_norm": 0.2198963761329651, + "learning_rate": 4.994329481723717e-05, + "loss": 0.2239, + "step": 6688 + }, + { + "epoch": 0.119305818142903, + "grad_norm": 0.2399929016828537, + "learning_rate": 4.994318999303975e-05, + "loss": 0.1995, + "step": 6689 + }, + { + "epoch": 0.1193236542646167, + "grad_norm": 0.30972346663475037, + "learning_rate": 4.994308507215392e-05, + "loss": 0.1912, + "step": 6690 + }, + { + "epoch": 0.11934149038633039, + "grad_norm": 0.25458747148513794, + "learning_rate": 4.994298005458006e-05, + "loss": 0.2028, + "step": 6691 + }, + { + "epoch": 0.1193593265080441, + "grad_norm": 0.31524714827537537, + "learning_rate": 4.99428749403186e-05, + "loss": 0.2667, + "step": 6692 + }, + { + "epoch": 0.11937716262975778, + "grad_norm": 0.26774513721466064, + "learning_rate": 4.994276972936994e-05, + "loss": 0.259, + "step": 6693 + }, + { + "epoch": 0.11939499875147148, + "grad_norm": 0.38334810733795166, + "learning_rate": 4.99426644217345e-05, + "loss": 0.2902, + "step": 6694 + }, + { + "epoch": 0.11941283487318517, + "grad_norm": 0.2599673867225647, + "learning_rate": 4.994255901741267e-05, + "loss": 0.2221, + "step": 6695 + }, + { + "epoch": 0.11943067099489887, + "grad_norm": 0.3482665419578552, + "learning_rate": 4.994245351640486e-05, + "loss": 0.2359, + "step": 6696 + }, + { + "epoch": 0.11944850711661256, + "grad_norm": 0.34844645857810974, + "learning_rate": 4.99423479187115e-05, + "loss": 0.271, + "step": 6697 + }, + { + "epoch": 0.11946634323832626, + "grad_norm": 0.3628528118133545, + "learning_rate": 4.9942242224332975e-05, + "loss": 0.2892, + "step": 6698 + }, + { + "epoch": 0.11948417936003995, + "grad_norm": 0.2471059113740921, + "learning_rate": 4.99421364332697e-05, + "loss": 0.2482, + "step": 6699 + }, + { + "epoch": 0.11950201548175365, + "grad_norm": 0.3259129226207733, + "learning_rate": 4.99420305455221e-05, + "loss": 0.3085, + "step": 6700 + }, + { + "epoch": 0.11951985160346734, + "grad_norm": 0.3657534420490265, + "learning_rate": 4.994192456109057e-05, + "loss": 0.2476, + "step": 6701 + }, + { + "epoch": 0.11953768772518103, + "grad_norm": 0.29605263471603394, + "learning_rate": 4.9941818479975535e-05, + "loss": 0.2237, + "step": 6702 + }, + { + "epoch": 0.11955552384689473, + "grad_norm": 0.32068613171577454, + "learning_rate": 4.994171230217738e-05, + "loss": 0.2713, + "step": 6703 + }, + { + "epoch": 0.11957335996860842, + "grad_norm": 0.27349868416786194, + "learning_rate": 4.994160602769654e-05, + "loss": 0.2632, + "step": 6704 + }, + { + "epoch": 0.11959119609032212, + "grad_norm": 0.30878397822380066, + "learning_rate": 4.994149965653343e-05, + "loss": 0.2176, + "step": 6705 + }, + { + "epoch": 0.11960903221203581, + "grad_norm": 0.27538296580314636, + "learning_rate": 4.9941393188688444e-05, + "loss": 0.2307, + "step": 6706 + }, + { + "epoch": 0.11962686833374951, + "grad_norm": 0.2698029577732086, + "learning_rate": 4.9941286624162e-05, + "loss": 0.2221, + "step": 6707 + }, + { + "epoch": 0.1196447044554632, + "grad_norm": 0.29200440645217896, + "learning_rate": 4.994117996295452e-05, + "loss": 0.1846, + "step": 6708 + }, + { + "epoch": 0.1196625405771769, + "grad_norm": 0.21572646498680115, + "learning_rate": 4.9941073205066414e-05, + "loss": 0.2047, + "step": 6709 + }, + { + "epoch": 0.11968037669889059, + "grad_norm": 0.3035578727722168, + "learning_rate": 4.994096635049809e-05, + "loss": 0.2245, + "step": 6710 + }, + { + "epoch": 0.11969821282060429, + "grad_norm": 0.27503350377082825, + "learning_rate": 4.9940859399249965e-05, + "loss": 0.2405, + "step": 6711 + }, + { + "epoch": 0.11971604894231798, + "grad_norm": 0.268382728099823, + "learning_rate": 4.994075235132246e-05, + "loss": 0.2561, + "step": 6712 + }, + { + "epoch": 0.11973388506403168, + "grad_norm": 0.2590249478816986, + "learning_rate": 4.994064520671598e-05, + "loss": 0.2279, + "step": 6713 + }, + { + "epoch": 0.11975172118574537, + "grad_norm": 0.2540910840034485, + "learning_rate": 4.9940537965430943e-05, + "loss": 0.2034, + "step": 6714 + }, + { + "epoch": 0.11976955730745907, + "grad_norm": 0.24925297498703003, + "learning_rate": 4.994043062746778e-05, + "loss": 0.2024, + "step": 6715 + }, + { + "epoch": 0.11978739342917276, + "grad_norm": 0.26008927822113037, + "learning_rate": 4.994032319282688e-05, + "loss": 0.2176, + "step": 6716 + }, + { + "epoch": 0.11980522955088646, + "grad_norm": 0.35265469551086426, + "learning_rate": 4.994021566150868e-05, + "loss": 0.2135, + "step": 6717 + }, + { + "epoch": 0.11982306567260015, + "grad_norm": 0.36726319789886475, + "learning_rate": 4.9940108033513585e-05, + "loss": 0.1925, + "step": 6718 + }, + { + "epoch": 0.11984090179431385, + "grad_norm": 0.33205559849739075, + "learning_rate": 4.9940000308842015e-05, + "loss": 0.3123, + "step": 6719 + }, + { + "epoch": 0.11985873791602754, + "grad_norm": 0.29143860936164856, + "learning_rate": 4.99398924874944e-05, + "loss": 0.2415, + "step": 6720 + }, + { + "epoch": 0.11987657403774124, + "grad_norm": 0.2193698287010193, + "learning_rate": 4.9939784569471135e-05, + "loss": 0.2014, + "step": 6721 + }, + { + "epoch": 0.11989441015945493, + "grad_norm": 0.37905389070510864, + "learning_rate": 4.9939676554772665e-05, + "loss": 0.2994, + "step": 6722 + }, + { + "epoch": 0.11991224628116862, + "grad_norm": 0.3636080324649811, + "learning_rate": 4.9939568443399384e-05, + "loss": 0.2016, + "step": 6723 + }, + { + "epoch": 0.11993008240288232, + "grad_norm": 0.27542924880981445, + "learning_rate": 4.993946023535173e-05, + "loss": 0.2301, + "step": 6724 + }, + { + "epoch": 0.119947918524596, + "grad_norm": 0.30708178877830505, + "learning_rate": 4.993935193063011e-05, + "loss": 0.2058, + "step": 6725 + }, + { + "epoch": 0.11996575464630971, + "grad_norm": 0.3631151020526886, + "learning_rate": 4.993924352923495e-05, + "loss": 0.1853, + "step": 6726 + }, + { + "epoch": 0.1199835907680234, + "grad_norm": 0.31861039996147156, + "learning_rate": 4.993913503116666e-05, + "loss": 0.2565, + "step": 6727 + }, + { + "epoch": 0.1200014268897371, + "grad_norm": 0.3508286774158478, + "learning_rate": 4.993902643642568e-05, + "loss": 0.2511, + "step": 6728 + }, + { + "epoch": 0.12001926301145079, + "grad_norm": 0.26951679587364197, + "learning_rate": 4.993891774501241e-05, + "loss": 0.1625, + "step": 6729 + }, + { + "epoch": 0.12003709913316449, + "grad_norm": 0.49868273735046387, + "learning_rate": 4.993880895692729e-05, + "loss": 0.2422, + "step": 6730 + }, + { + "epoch": 0.12005493525487818, + "grad_norm": 0.27037322521209717, + "learning_rate": 4.993870007217073e-05, + "loss": 0.2366, + "step": 6731 + }, + { + "epoch": 0.12007277137659188, + "grad_norm": 0.29594314098358154, + "learning_rate": 4.993859109074315e-05, + "loss": 0.2601, + "step": 6732 + }, + { + "epoch": 0.12009060749830557, + "grad_norm": 0.32387587428092957, + "learning_rate": 4.993848201264498e-05, + "loss": 0.2454, + "step": 6733 + }, + { + "epoch": 0.12010844362001927, + "grad_norm": 0.29027819633483887, + "learning_rate": 4.993837283787664e-05, + "loss": 0.1717, + "step": 6734 + }, + { + "epoch": 0.12012627974173296, + "grad_norm": 0.24900208413600922, + "learning_rate": 4.993826356643856e-05, + "loss": 0.2277, + "step": 6735 + }, + { + "epoch": 0.12014411586344666, + "grad_norm": 0.29552027583122253, + "learning_rate": 4.9938154198331155e-05, + "loss": 0.216, + "step": 6736 + }, + { + "epoch": 0.12016195198516035, + "grad_norm": 0.39090508222579956, + "learning_rate": 4.993804473355485e-05, + "loss": 0.283, + "step": 6737 + }, + { + "epoch": 0.12017978810687405, + "grad_norm": 0.4318279027938843, + "learning_rate": 4.9937935172110065e-05, + "loss": 0.2041, + "step": 6738 + }, + { + "epoch": 0.12019762422858773, + "grad_norm": 0.36057740449905396, + "learning_rate": 4.993782551399724e-05, + "loss": 0.2527, + "step": 6739 + }, + { + "epoch": 0.12021546035030144, + "grad_norm": 0.7333322167396545, + "learning_rate": 4.993771575921678e-05, + "loss": 0.225, + "step": 6740 + }, + { + "epoch": 0.12023329647201512, + "grad_norm": 0.3208678364753723, + "learning_rate": 4.993760590776913e-05, + "loss": 0.2134, + "step": 6741 + }, + { + "epoch": 0.12025113259372881, + "grad_norm": 0.33157041668891907, + "learning_rate": 4.993749595965469e-05, + "loss": 0.2635, + "step": 6742 + }, + { + "epoch": 0.12026896871544251, + "grad_norm": 0.22630712389945984, + "learning_rate": 4.9937385914873916e-05, + "loss": 0.2186, + "step": 6743 + }, + { + "epoch": 0.1202868048371562, + "grad_norm": 0.37185901403427124, + "learning_rate": 4.993727577342722e-05, + "loss": 0.1725, + "step": 6744 + }, + { + "epoch": 0.1203046409588699, + "grad_norm": 0.2303929626941681, + "learning_rate": 4.993716553531503e-05, + "loss": 0.2277, + "step": 6745 + }, + { + "epoch": 0.12032247708058359, + "grad_norm": 0.3440285325050354, + "learning_rate": 4.993705520053777e-05, + "loss": 0.2769, + "step": 6746 + }, + { + "epoch": 0.1203403132022973, + "grad_norm": 0.39371126890182495, + "learning_rate": 4.9936944769095874e-05, + "loss": 0.2804, + "step": 6747 + }, + { + "epoch": 0.12035814932401098, + "grad_norm": 0.3641282618045807, + "learning_rate": 4.993683424098976e-05, + "loss": 0.3108, + "step": 6748 + }, + { + "epoch": 0.12037598544572468, + "grad_norm": 0.3169698715209961, + "learning_rate": 4.993672361621987e-05, + "loss": 0.3041, + "step": 6749 + }, + { + "epoch": 0.12039382156743837, + "grad_norm": 0.42722922563552856, + "learning_rate": 4.993661289478663e-05, + "loss": 0.2841, + "step": 6750 + }, + { + "epoch": 0.12041165768915207, + "grad_norm": 0.3565793037414551, + "learning_rate": 4.993650207669046e-05, + "loss": 0.2598, + "step": 6751 + }, + { + "epoch": 0.12042949381086576, + "grad_norm": 0.4076734185218811, + "learning_rate": 4.99363911619318e-05, + "loss": 0.2926, + "step": 6752 + }, + { + "epoch": 0.12044732993257946, + "grad_norm": 0.2621926963329315, + "learning_rate": 4.993628015051107e-05, + "loss": 0.2074, + "step": 6753 + }, + { + "epoch": 0.12046516605429315, + "grad_norm": 0.2789039611816406, + "learning_rate": 4.993616904242871e-05, + "loss": 0.2603, + "step": 6754 + }, + { + "epoch": 0.12048300217600685, + "grad_norm": 0.23423604667186737, + "learning_rate": 4.993605783768514e-05, + "loss": 0.221, + "step": 6755 + }, + { + "epoch": 0.12050083829772054, + "grad_norm": 0.37556326389312744, + "learning_rate": 4.99359465362808e-05, + "loss": 0.2258, + "step": 6756 + }, + { + "epoch": 0.12051867441943424, + "grad_norm": 0.29016271233558655, + "learning_rate": 4.993583513821612e-05, + "loss": 0.2374, + "step": 6757 + }, + { + "epoch": 0.12053651054114793, + "grad_norm": 0.3432520925998688, + "learning_rate": 4.9935723643491526e-05, + "loss": 0.2683, + "step": 6758 + }, + { + "epoch": 0.12055434666286163, + "grad_norm": 0.3250076174736023, + "learning_rate": 4.9935612052107464e-05, + "loss": 0.2431, + "step": 6759 + }, + { + "epoch": 0.12057218278457532, + "grad_norm": 0.3131701350212097, + "learning_rate": 4.9935500364064346e-05, + "loss": 0.3196, + "step": 6760 + }, + { + "epoch": 0.12059001890628902, + "grad_norm": 0.36178430914878845, + "learning_rate": 4.9935388579362625e-05, + "loss": 0.2442, + "step": 6761 + }, + { + "epoch": 0.12060785502800271, + "grad_norm": 0.4066222906112671, + "learning_rate": 4.993527669800272e-05, + "loss": 0.3069, + "step": 6762 + }, + { + "epoch": 0.1206256911497164, + "grad_norm": 0.21849587559700012, + "learning_rate": 4.993516471998507e-05, + "loss": 0.2223, + "step": 6763 + }, + { + "epoch": 0.1206435272714301, + "grad_norm": 0.3107141852378845, + "learning_rate": 4.993505264531012e-05, + "loss": 0.2641, + "step": 6764 + }, + { + "epoch": 0.12066136339314379, + "grad_norm": 0.37780794501304626, + "learning_rate": 4.993494047397828e-05, + "loss": 0.2627, + "step": 6765 + }, + { + "epoch": 0.12067919951485749, + "grad_norm": 0.49505695700645447, + "learning_rate": 4.993482820599e-05, + "loss": 0.2632, + "step": 6766 + }, + { + "epoch": 0.12069703563657118, + "grad_norm": 0.3290885090827942, + "learning_rate": 4.993471584134573e-05, + "loss": 0.2578, + "step": 6767 + }, + { + "epoch": 0.12071487175828488, + "grad_norm": 0.3457907736301422, + "learning_rate": 4.9934603380045865e-05, + "loss": 0.2594, + "step": 6768 + }, + { + "epoch": 0.12073270787999857, + "grad_norm": 0.3863312602043152, + "learning_rate": 4.993449082209088e-05, + "loss": 0.2666, + "step": 6769 + }, + { + "epoch": 0.12075054400171227, + "grad_norm": 0.30412641167640686, + "learning_rate": 4.993437816748119e-05, + "loss": 0.2323, + "step": 6770 + }, + { + "epoch": 0.12076838012342596, + "grad_norm": 0.6357429623603821, + "learning_rate": 4.993426541621724e-05, + "loss": 0.3115, + "step": 6771 + }, + { + "epoch": 0.12078621624513966, + "grad_norm": 0.2708114981651306, + "learning_rate": 4.993415256829947e-05, + "loss": 0.2863, + "step": 6772 + }, + { + "epoch": 0.12080405236685335, + "grad_norm": 0.20062050223350525, + "learning_rate": 4.993403962372831e-05, + "loss": 0.1997, + "step": 6773 + }, + { + "epoch": 0.12082188848856705, + "grad_norm": 0.32945525646209717, + "learning_rate": 4.9933926582504196e-05, + "loss": 0.2578, + "step": 6774 + }, + { + "epoch": 0.12083972461028074, + "grad_norm": 0.3725665807723999, + "learning_rate": 4.993381344462757e-05, + "loss": 0.2429, + "step": 6775 + }, + { + "epoch": 0.12085756073199444, + "grad_norm": 0.229294091463089, + "learning_rate": 4.9933700210098885e-05, + "loss": 0.18, + "step": 6776 + }, + { + "epoch": 0.12087539685370813, + "grad_norm": 0.36206942796707153, + "learning_rate": 4.9933586878918555e-05, + "loss": 0.3436, + "step": 6777 + }, + { + "epoch": 0.12089323297542183, + "grad_norm": 0.26434171199798584, + "learning_rate": 4.993347345108703e-05, + "loss": 0.2589, + "step": 6778 + }, + { + "epoch": 0.12091106909713552, + "grad_norm": 0.23119845986366272, + "learning_rate": 4.9933359926604754e-05, + "loss": 0.245, + "step": 6779 + }, + { + "epoch": 0.12092890521884922, + "grad_norm": 0.23035800457000732, + "learning_rate": 4.993324630547216e-05, + "loss": 0.2051, + "step": 6780 + }, + { + "epoch": 0.12094674134056291, + "grad_norm": 0.21824614703655243, + "learning_rate": 4.993313258768969e-05, + "loss": 0.1873, + "step": 6781 + }, + { + "epoch": 0.1209645774622766, + "grad_norm": 0.24966119229793549, + "learning_rate": 4.993301877325779e-05, + "loss": 0.2338, + "step": 6782 + }, + { + "epoch": 0.1209824135839903, + "grad_norm": 0.24788899719715118, + "learning_rate": 4.99329048621769e-05, + "loss": 0.2228, + "step": 6783 + }, + { + "epoch": 0.12100024970570399, + "grad_norm": 0.336873322725296, + "learning_rate": 4.993279085444745e-05, + "loss": 0.1714, + "step": 6784 + }, + { + "epoch": 0.12101808582741769, + "grad_norm": 0.26959362626075745, + "learning_rate": 4.9932676750069906e-05, + "loss": 0.2045, + "step": 6785 + }, + { + "epoch": 0.12103592194913138, + "grad_norm": 0.24538543820381165, + "learning_rate": 4.993256254904468e-05, + "loss": 0.1682, + "step": 6786 + }, + { + "epoch": 0.12105375807084508, + "grad_norm": 0.27018868923187256, + "learning_rate": 4.993244825137224e-05, + "loss": 0.2366, + "step": 6787 + }, + { + "epoch": 0.12107159419255877, + "grad_norm": 0.30733153223991394, + "learning_rate": 4.9932333857053015e-05, + "loss": 0.2237, + "step": 6788 + }, + { + "epoch": 0.12108943031427247, + "grad_norm": 0.25243696570396423, + "learning_rate": 4.993221936608746e-05, + "loss": 0.2187, + "step": 6789 + }, + { + "epoch": 0.12110726643598616, + "grad_norm": 0.4344727098941803, + "learning_rate": 4.9932104778476005e-05, + "loss": 0.2699, + "step": 6790 + }, + { + "epoch": 0.12112510255769986, + "grad_norm": 0.3126193583011627, + "learning_rate": 4.9931990094219095e-05, + "loss": 0.2161, + "step": 6791 + }, + { + "epoch": 0.12114293867941354, + "grad_norm": 0.34603577852249146, + "learning_rate": 4.993187531331719e-05, + "loss": 0.2325, + "step": 6792 + }, + { + "epoch": 0.12116077480112725, + "grad_norm": 0.46902620792388916, + "learning_rate": 4.993176043577072e-05, + "loss": 0.3418, + "step": 6793 + }, + { + "epoch": 0.12117861092284093, + "grad_norm": 0.25489768385887146, + "learning_rate": 4.993164546158013e-05, + "loss": 0.2325, + "step": 6794 + }, + { + "epoch": 0.12119644704455464, + "grad_norm": 0.27331244945526123, + "learning_rate": 4.9931530390745884e-05, + "loss": 0.2092, + "step": 6795 + }, + { + "epoch": 0.12121428316626832, + "grad_norm": 0.28268101811408997, + "learning_rate": 4.993141522326841e-05, + "loss": 0.1951, + "step": 6796 + }, + { + "epoch": 0.12123211928798203, + "grad_norm": 0.24887113273143768, + "learning_rate": 4.993129995914816e-05, + "loss": 0.1944, + "step": 6797 + }, + { + "epoch": 0.12124995540969571, + "grad_norm": 0.2970426380634308, + "learning_rate": 4.9931184598385575e-05, + "loss": 0.2731, + "step": 6798 + }, + { + "epoch": 0.12126779153140942, + "grad_norm": 0.2415713667869568, + "learning_rate": 4.9931069140981115e-05, + "loss": 0.2219, + "step": 6799 + }, + { + "epoch": 0.1212856276531231, + "grad_norm": 0.3172830641269684, + "learning_rate": 4.9930953586935216e-05, + "loss": 0.2126, + "step": 6800 + }, + { + "epoch": 0.1213034637748368, + "grad_norm": 0.3285074830055237, + "learning_rate": 4.993083793624833e-05, + "loss": 0.2399, + "step": 6801 + }, + { + "epoch": 0.1213212998965505, + "grad_norm": 0.2567670941352844, + "learning_rate": 4.99307221889209e-05, + "loss": 0.1948, + "step": 6802 + }, + { + "epoch": 0.12133913601826418, + "grad_norm": 0.23109138011932373, + "learning_rate": 4.993060634495339e-05, + "loss": 0.2142, + "step": 6803 + }, + { + "epoch": 0.12135697213997788, + "grad_norm": 0.21992535889148712, + "learning_rate": 4.993049040434623e-05, + "loss": 0.1925, + "step": 6804 + }, + { + "epoch": 0.12137480826169157, + "grad_norm": 0.276203453540802, + "learning_rate": 4.9930374367099886e-05, + "loss": 0.2182, + "step": 6805 + }, + { + "epoch": 0.12139264438340527, + "grad_norm": 0.24764001369476318, + "learning_rate": 4.99302582332148e-05, + "loss": 0.2153, + "step": 6806 + }, + { + "epoch": 0.12141048050511896, + "grad_norm": 0.35662516951560974, + "learning_rate": 4.9930142002691416e-05, + "loss": 0.2543, + "step": 6807 + }, + { + "epoch": 0.12142831662683266, + "grad_norm": 0.3767130672931671, + "learning_rate": 4.99300256755302e-05, + "loss": 0.2175, + "step": 6808 + }, + { + "epoch": 0.12144615274854635, + "grad_norm": 0.33303138613700867, + "learning_rate": 4.992990925173159e-05, + "loss": 0.2623, + "step": 6809 + }, + { + "epoch": 0.12146398887026005, + "grad_norm": 0.24306745827198029, + "learning_rate": 4.9929792731296035e-05, + "loss": 0.2644, + "step": 6810 + }, + { + "epoch": 0.12148182499197374, + "grad_norm": 0.45018666982650757, + "learning_rate": 4.9929676114224e-05, + "loss": 0.2324, + "step": 6811 + }, + { + "epoch": 0.12149966111368744, + "grad_norm": 0.2308064103126526, + "learning_rate": 4.992955940051593e-05, + "loss": 0.1957, + "step": 6812 + }, + { + "epoch": 0.12151749723540113, + "grad_norm": 0.24440710246562958, + "learning_rate": 4.992944259017227e-05, + "loss": 0.2413, + "step": 6813 + }, + { + "epoch": 0.12153533335711483, + "grad_norm": 0.2904827296733856, + "learning_rate": 4.992932568319349e-05, + "loss": 0.2265, + "step": 6814 + }, + { + "epoch": 0.12155316947882852, + "grad_norm": 0.29666444659233093, + "learning_rate": 4.9929208679580034e-05, + "loss": 0.2269, + "step": 6815 + }, + { + "epoch": 0.12157100560054222, + "grad_norm": 0.24752987921237946, + "learning_rate": 4.992909157933234e-05, + "loss": 0.2141, + "step": 6816 + }, + { + "epoch": 0.12158884172225591, + "grad_norm": 0.30989548563957214, + "learning_rate": 4.992897438245089e-05, + "loss": 0.2451, + "step": 6817 + }, + { + "epoch": 0.12160667784396961, + "grad_norm": 0.353629469871521, + "learning_rate": 4.992885708893612e-05, + "loss": 0.2336, + "step": 6818 + }, + { + "epoch": 0.1216245139656833, + "grad_norm": 0.2712937593460083, + "learning_rate": 4.9928739698788495e-05, + "loss": 0.2644, + "step": 6819 + }, + { + "epoch": 0.121642350087397, + "grad_norm": 0.33031079173088074, + "learning_rate": 4.992862221200846e-05, + "loss": 0.2104, + "step": 6820 + }, + { + "epoch": 0.12166018620911069, + "grad_norm": 0.3419361412525177, + "learning_rate": 4.992850462859647e-05, + "loss": 0.231, + "step": 6821 + }, + { + "epoch": 0.12167802233082439, + "grad_norm": 0.4428279995918274, + "learning_rate": 4.992838694855299e-05, + "loss": 0.2102, + "step": 6822 + }, + { + "epoch": 0.12169585845253808, + "grad_norm": 0.3489314615726471, + "learning_rate": 4.9928269171878485e-05, + "loss": 0.2683, + "step": 6823 + }, + { + "epoch": 0.12171369457425177, + "grad_norm": 0.2731943726539612, + "learning_rate": 4.992815129857339e-05, + "loss": 0.2125, + "step": 6824 + }, + { + "epoch": 0.12173153069596547, + "grad_norm": 0.31702354550361633, + "learning_rate": 4.992803332863817e-05, + "loss": 0.229, + "step": 6825 + }, + { + "epoch": 0.12174936681767916, + "grad_norm": 0.2625712454319, + "learning_rate": 4.9927915262073276e-05, + "loss": 0.2298, + "step": 6826 + }, + { + "epoch": 0.12176720293939286, + "grad_norm": 0.30082428455352783, + "learning_rate": 4.992779709887918e-05, + "loss": 0.222, + "step": 6827 + }, + { + "epoch": 0.12178503906110655, + "grad_norm": 0.28463712334632874, + "learning_rate": 4.9927678839056336e-05, + "loss": 0.2292, + "step": 6828 + }, + { + "epoch": 0.12180287518282025, + "grad_norm": 0.20966772735118866, + "learning_rate": 4.992756048260519e-05, + "loss": 0.2277, + "step": 6829 + }, + { + "epoch": 0.12182071130453394, + "grad_norm": 0.31907060742378235, + "learning_rate": 4.9927442029526214e-05, + "loss": 0.1992, + "step": 6830 + }, + { + "epoch": 0.12183854742624764, + "grad_norm": 0.33759158849716187, + "learning_rate": 4.992732347981987e-05, + "loss": 0.2195, + "step": 6831 + }, + { + "epoch": 0.12185638354796133, + "grad_norm": 0.21873730421066284, + "learning_rate": 4.9927204833486596e-05, + "loss": 0.2156, + "step": 6832 + }, + { + "epoch": 0.12187421966967503, + "grad_norm": 0.40703877806663513, + "learning_rate": 4.992708609052688e-05, + "loss": 0.2315, + "step": 6833 + }, + { + "epoch": 0.12189205579138872, + "grad_norm": 0.2686084806919098, + "learning_rate": 4.992696725094116e-05, + "loss": 0.2249, + "step": 6834 + }, + { + "epoch": 0.12190989191310242, + "grad_norm": 0.3546275496482849, + "learning_rate": 4.9926848314729914e-05, + "loss": 0.2807, + "step": 6835 + }, + { + "epoch": 0.12192772803481611, + "grad_norm": 0.28341612219810486, + "learning_rate": 4.992672928189358e-05, + "loss": 0.2633, + "step": 6836 + }, + { + "epoch": 0.12194556415652981, + "grad_norm": 0.2698984444141388, + "learning_rate": 4.9926610152432644e-05, + "loss": 0.2086, + "step": 6837 + }, + { + "epoch": 0.1219634002782435, + "grad_norm": 0.3039058744907379, + "learning_rate": 4.992649092634756e-05, + "loss": 0.2717, + "step": 6838 + }, + { + "epoch": 0.1219812363999572, + "grad_norm": 0.19431540369987488, + "learning_rate": 4.9926371603638786e-05, + "loss": 0.1906, + "step": 6839 + }, + { + "epoch": 0.12199907252167089, + "grad_norm": 0.3405723571777344, + "learning_rate": 4.9926252184306785e-05, + "loss": 0.2554, + "step": 6840 + }, + { + "epoch": 0.12201690864338459, + "grad_norm": 0.2744424343109131, + "learning_rate": 4.992613266835202e-05, + "loss": 0.1976, + "step": 6841 + }, + { + "epoch": 0.12203474476509828, + "grad_norm": 0.4421294033527374, + "learning_rate": 4.9926013055774956e-05, + "loss": 0.2431, + "step": 6842 + }, + { + "epoch": 0.12205258088681196, + "grad_norm": 0.2360885590314865, + "learning_rate": 4.9925893346576056e-05, + "loss": 0.2236, + "step": 6843 + }, + { + "epoch": 0.12207041700852567, + "grad_norm": 0.364916056394577, + "learning_rate": 4.992577354075578e-05, + "loss": 0.2304, + "step": 6844 + }, + { + "epoch": 0.12208825313023935, + "grad_norm": 0.33794131875038147, + "learning_rate": 4.9925653638314603e-05, + "loss": 0.2657, + "step": 6845 + }, + { + "epoch": 0.12210608925195306, + "grad_norm": 0.32369041442871094, + "learning_rate": 4.9925533639252986e-05, + "loss": 0.2263, + "step": 6846 + }, + { + "epoch": 0.12212392537366674, + "grad_norm": 0.351128488779068, + "learning_rate": 4.992541354357138e-05, + "loss": 0.2544, + "step": 6847 + }, + { + "epoch": 0.12214176149538045, + "grad_norm": 0.2790156602859497, + "learning_rate": 4.992529335127028e-05, + "loss": 0.2653, + "step": 6848 + }, + { + "epoch": 0.12215959761709413, + "grad_norm": 0.24617858231067657, + "learning_rate": 4.992517306235012e-05, + "loss": 0.2477, + "step": 6849 + }, + { + "epoch": 0.12217743373880784, + "grad_norm": 0.24819330871105194, + "learning_rate": 4.992505267681139e-05, + "loss": 0.2145, + "step": 6850 + }, + { + "epoch": 0.12219526986052152, + "grad_norm": 0.2965717613697052, + "learning_rate": 4.992493219465454e-05, + "loss": 0.1964, + "step": 6851 + }, + { + "epoch": 0.12221310598223523, + "grad_norm": 0.4467228651046753, + "learning_rate": 4.992481161588004e-05, + "loss": 0.2555, + "step": 6852 + }, + { + "epoch": 0.12223094210394891, + "grad_norm": 0.36978498101234436, + "learning_rate": 4.9924690940488375e-05, + "loss": 0.3208, + "step": 6853 + }, + { + "epoch": 0.12224877822566262, + "grad_norm": 0.40889060497283936, + "learning_rate": 4.992457016847999e-05, + "loss": 0.2466, + "step": 6854 + }, + { + "epoch": 0.1222666143473763, + "grad_norm": 0.27608317136764526, + "learning_rate": 4.9924449299855355e-05, + "loss": 0.2297, + "step": 6855 + }, + { + "epoch": 0.12228445046909, + "grad_norm": 0.4811840355396271, + "learning_rate": 4.9924328334614954e-05, + "loss": 0.2224, + "step": 6856 + }, + { + "epoch": 0.1223022865908037, + "grad_norm": 0.3006702661514282, + "learning_rate": 4.992420727275926e-05, + "loss": 0.2652, + "step": 6857 + }, + { + "epoch": 0.1223201227125174, + "grad_norm": 0.3643052279949188, + "learning_rate": 4.992408611428871e-05, + "loss": 0.2831, + "step": 6858 + }, + { + "epoch": 0.12233795883423108, + "grad_norm": 0.3541288375854492, + "learning_rate": 4.99239648592038e-05, + "loss": 0.31, + "step": 6859 + }, + { + "epoch": 0.12235579495594479, + "grad_norm": 0.2782037556171417, + "learning_rate": 4.9923843507505e-05, + "loss": 0.2041, + "step": 6860 + }, + { + "epoch": 0.12237363107765847, + "grad_norm": 0.3100832998752594, + "learning_rate": 4.992372205919277e-05, + "loss": 0.2858, + "step": 6861 + }, + { + "epoch": 0.12239146719937218, + "grad_norm": 0.3420276641845703, + "learning_rate": 4.992360051426759e-05, + "loss": 0.2443, + "step": 6862 + }, + { + "epoch": 0.12240930332108586, + "grad_norm": 0.3606860637664795, + "learning_rate": 4.992347887272991e-05, + "loss": 0.309, + "step": 6863 + }, + { + "epoch": 0.12242713944279955, + "grad_norm": 0.3299979865550995, + "learning_rate": 4.992335713458023e-05, + "loss": 0.277, + "step": 6864 + }, + { + "epoch": 0.12244497556451325, + "grad_norm": 0.3540804088115692, + "learning_rate": 4.992323529981901e-05, + "loss": 0.2813, + "step": 6865 + }, + { + "epoch": 0.12246281168622694, + "grad_norm": 0.315603643655777, + "learning_rate": 4.992311336844672e-05, + "loss": 0.2417, + "step": 6866 + }, + { + "epoch": 0.12248064780794064, + "grad_norm": 0.3033362627029419, + "learning_rate": 4.9922991340463834e-05, + "loss": 0.2342, + "step": 6867 + }, + { + "epoch": 0.12249848392965433, + "grad_norm": 0.2897791266441345, + "learning_rate": 4.9922869215870824e-05, + "loss": 0.2639, + "step": 6868 + }, + { + "epoch": 0.12251632005136803, + "grad_norm": 0.3022953271865845, + "learning_rate": 4.992274699466817e-05, + "loss": 0.1963, + "step": 6869 + }, + { + "epoch": 0.12253415617308172, + "grad_norm": 0.24022029340267181, + "learning_rate": 4.992262467685633e-05, + "loss": 0.2159, + "step": 6870 + }, + { + "epoch": 0.12255199229479542, + "grad_norm": 0.2967076897621155, + "learning_rate": 4.99225022624358e-05, + "loss": 0.2484, + "step": 6871 + }, + { + "epoch": 0.12256982841650911, + "grad_norm": 0.3520475924015045, + "learning_rate": 4.9922379751407045e-05, + "loss": 0.2256, + "step": 6872 + }, + { + "epoch": 0.12258766453822281, + "grad_norm": 0.30049577355384827, + "learning_rate": 4.992225714377053e-05, + "loss": 0.1818, + "step": 6873 + }, + { + "epoch": 0.1226055006599365, + "grad_norm": 0.2451341450214386, + "learning_rate": 4.992213443952674e-05, + "loss": 0.1951, + "step": 6874 + }, + { + "epoch": 0.1226233367816502, + "grad_norm": 0.3067533075809479, + "learning_rate": 4.992201163867615e-05, + "loss": 0.2347, + "step": 6875 + }, + { + "epoch": 0.12264117290336389, + "grad_norm": 0.2550610899925232, + "learning_rate": 4.9921888741219234e-05, + "loss": 0.2227, + "step": 6876 + }, + { + "epoch": 0.12265900902507759, + "grad_norm": 0.36997440457344055, + "learning_rate": 4.992176574715647e-05, + "loss": 0.2883, + "step": 6877 + }, + { + "epoch": 0.12267684514679128, + "grad_norm": 0.24751408398151398, + "learning_rate": 4.9921642656488334e-05, + "loss": 0.2605, + "step": 6878 + }, + { + "epoch": 0.12269468126850498, + "grad_norm": 0.3025315999984741, + "learning_rate": 4.99215194692153e-05, + "loss": 0.2219, + "step": 6879 + }, + { + "epoch": 0.12271251739021867, + "grad_norm": 0.29911789298057556, + "learning_rate": 4.992139618533785e-05, + "loss": 0.1984, + "step": 6880 + }, + { + "epoch": 0.12273035351193237, + "grad_norm": 0.37772804498672485, + "learning_rate": 4.992127280485647e-05, + "loss": 0.2645, + "step": 6881 + }, + { + "epoch": 0.12274818963364606, + "grad_norm": 0.2674390971660614, + "learning_rate": 4.992114932777162e-05, + "loss": 0.2172, + "step": 6882 + }, + { + "epoch": 0.12276602575535975, + "grad_norm": 0.3375721871852875, + "learning_rate": 4.9921025754083794e-05, + "loss": 0.3013, + "step": 6883 + }, + { + "epoch": 0.12278386187707345, + "grad_norm": 0.23008044064044952, + "learning_rate": 4.992090208379346e-05, + "loss": 0.2178, + "step": 6884 + }, + { + "epoch": 0.12280169799878714, + "grad_norm": 0.3182332217693329, + "learning_rate": 4.9920778316901105e-05, + "loss": 0.2497, + "step": 6885 + }, + { + "epoch": 0.12281953412050084, + "grad_norm": 0.293599009513855, + "learning_rate": 4.99206544534072e-05, + "loss": 0.2041, + "step": 6886 + }, + { + "epoch": 0.12283737024221453, + "grad_norm": 0.22377248108386993, + "learning_rate": 4.992053049331224e-05, + "loss": 0.2083, + "step": 6887 + }, + { + "epoch": 0.12285520636392823, + "grad_norm": 0.3267717957496643, + "learning_rate": 4.992040643661669e-05, + "loss": 0.2569, + "step": 6888 + }, + { + "epoch": 0.12287304248564192, + "grad_norm": 0.40441974997520447, + "learning_rate": 4.992028228332104e-05, + "loss": 0.2593, + "step": 6889 + }, + { + "epoch": 0.12289087860735562, + "grad_norm": 0.27668124437332153, + "learning_rate": 4.9920158033425765e-05, + "loss": 0.3579, + "step": 6890 + }, + { + "epoch": 0.12290871472906931, + "grad_norm": 0.2906394600868225, + "learning_rate": 4.9920033686931354e-05, + "loss": 0.2355, + "step": 6891 + }, + { + "epoch": 0.12292655085078301, + "grad_norm": 0.21560589969158173, + "learning_rate": 4.991990924383829e-05, + "loss": 0.1665, + "step": 6892 + }, + { + "epoch": 0.1229443869724967, + "grad_norm": 0.31353211402893066, + "learning_rate": 4.9919784704147046e-05, + "loss": 0.2615, + "step": 6893 + }, + { + "epoch": 0.1229622230942104, + "grad_norm": 0.2735927999019623, + "learning_rate": 4.9919660067858106e-05, + "loss": 0.2205, + "step": 6894 + }, + { + "epoch": 0.12298005921592409, + "grad_norm": 0.2787560522556305, + "learning_rate": 4.991953533497196e-05, + "loss": 0.2643, + "step": 6895 + }, + { + "epoch": 0.12299789533763779, + "grad_norm": 0.2606993019580841, + "learning_rate": 4.9919410505489086e-05, + "loss": 0.2131, + "step": 6896 + }, + { + "epoch": 0.12301573145935148, + "grad_norm": 0.2543543875217438, + "learning_rate": 4.991928557940997e-05, + "loss": 0.2143, + "step": 6897 + }, + { + "epoch": 0.12303356758106518, + "grad_norm": 0.2402089387178421, + "learning_rate": 4.9919160556735104e-05, + "loss": 0.2459, + "step": 6898 + }, + { + "epoch": 0.12305140370277887, + "grad_norm": 0.30288493633270264, + "learning_rate": 4.9919035437464955e-05, + "loss": 0.2601, + "step": 6899 + }, + { + "epoch": 0.12306923982449257, + "grad_norm": 0.2556219696998596, + "learning_rate": 4.9918910221600016e-05, + "loss": 0.2384, + "step": 6900 + }, + { + "epoch": 0.12308707594620626, + "grad_norm": 0.38939225673675537, + "learning_rate": 4.991878490914078e-05, + "loss": 0.2902, + "step": 6901 + }, + { + "epoch": 0.12310491206791996, + "grad_norm": 0.23063524067401886, + "learning_rate": 4.9918659500087725e-05, + "loss": 0.225, + "step": 6902 + }, + { + "epoch": 0.12312274818963365, + "grad_norm": 0.30427998304367065, + "learning_rate": 4.991853399444134e-05, + "loss": 0.1915, + "step": 6903 + }, + { + "epoch": 0.12314058431134733, + "grad_norm": 0.29430699348449707, + "learning_rate": 4.9918408392202114e-05, + "loss": 0.2301, + "step": 6904 + }, + { + "epoch": 0.12315842043306104, + "grad_norm": 0.3004782497882843, + "learning_rate": 4.9918282693370535e-05, + "loss": 0.1757, + "step": 6905 + }, + { + "epoch": 0.12317625655477472, + "grad_norm": 0.24297845363616943, + "learning_rate": 4.991815689794708e-05, + "loss": 0.2394, + "step": 6906 + }, + { + "epoch": 0.12319409267648843, + "grad_norm": 0.32428014278411865, + "learning_rate": 4.9918031005932236e-05, + "loss": 0.2479, + "step": 6907 + }, + { + "epoch": 0.12321192879820211, + "grad_norm": 0.1973564326763153, + "learning_rate": 4.9917905017326505e-05, + "loss": 0.2077, + "step": 6908 + }, + { + "epoch": 0.12322976491991582, + "grad_norm": 0.2838943302631378, + "learning_rate": 4.991777893213037e-05, + "loss": 0.2143, + "step": 6909 + }, + { + "epoch": 0.1232476010416295, + "grad_norm": 0.3316414952278137, + "learning_rate": 4.99176527503443e-05, + "loss": 0.224, + "step": 6910 + }, + { + "epoch": 0.1232654371633432, + "grad_norm": 0.2882927358150482, + "learning_rate": 4.991752647196882e-05, + "loss": 0.2262, + "step": 6911 + }, + { + "epoch": 0.1232832732850569, + "grad_norm": 0.4223480522632599, + "learning_rate": 4.99174000970044e-05, + "loss": 0.2436, + "step": 6912 + }, + { + "epoch": 0.1233011094067706, + "grad_norm": 0.31423330307006836, + "learning_rate": 4.9917273625451524e-05, + "loss": 0.2832, + "step": 6913 + }, + { + "epoch": 0.12331894552848428, + "grad_norm": 0.3034530580043793, + "learning_rate": 4.991714705731069e-05, + "loss": 0.26, + "step": 6914 + }, + { + "epoch": 0.12333678165019799, + "grad_norm": 0.24791258573532104, + "learning_rate": 4.9917020392582395e-05, + "loss": 0.1659, + "step": 6915 + }, + { + "epoch": 0.12335461777191167, + "grad_norm": 0.23711775243282318, + "learning_rate": 4.991689363126712e-05, + "loss": 0.1974, + "step": 6916 + }, + { + "epoch": 0.12337245389362538, + "grad_norm": 0.3169557452201843, + "learning_rate": 4.991676677336535e-05, + "loss": 0.2164, + "step": 6917 + }, + { + "epoch": 0.12339029001533906, + "grad_norm": 0.2168099284172058, + "learning_rate": 4.9916639818877595e-05, + "loss": 0.2195, + "step": 6918 + }, + { + "epoch": 0.12340812613705276, + "grad_norm": 0.27511683106422424, + "learning_rate": 4.991651276780433e-05, + "loss": 0.2062, + "step": 6919 + }, + { + "epoch": 0.12342596225876645, + "grad_norm": 0.3275053799152374, + "learning_rate": 4.9916385620146066e-05, + "loss": 0.2376, + "step": 6920 + }, + { + "epoch": 0.12344379838048015, + "grad_norm": 0.29005566239356995, + "learning_rate": 4.991625837590328e-05, + "loss": 0.2192, + "step": 6921 + }, + { + "epoch": 0.12346163450219384, + "grad_norm": 0.2789865732192993, + "learning_rate": 4.9916131035076474e-05, + "loss": 0.1732, + "step": 6922 + }, + { + "epoch": 0.12347947062390753, + "grad_norm": 0.35607048869132996, + "learning_rate": 4.991600359766614e-05, + "loss": 0.3044, + "step": 6923 + }, + { + "epoch": 0.12349730674562123, + "grad_norm": 0.3326689600944519, + "learning_rate": 4.991587606367276e-05, + "loss": 0.1585, + "step": 6924 + }, + { + "epoch": 0.12351514286733492, + "grad_norm": 0.2452046424150467, + "learning_rate": 4.991574843309685e-05, + "loss": 0.2547, + "step": 6925 + }, + { + "epoch": 0.12353297898904862, + "grad_norm": 1.0032877922058105, + "learning_rate": 4.991562070593889e-05, + "loss": 0.2402, + "step": 6926 + }, + { + "epoch": 0.12355081511076231, + "grad_norm": 0.29741159081459045, + "learning_rate": 4.9915492882199375e-05, + "loss": 0.2421, + "step": 6927 + }, + { + "epoch": 0.12356865123247601, + "grad_norm": 0.24511712789535522, + "learning_rate": 4.9915364961878805e-05, + "loss": 0.2252, + "step": 6928 + }, + { + "epoch": 0.1235864873541897, + "grad_norm": 0.29859545826911926, + "learning_rate": 4.9915236944977676e-05, + "loss": 0.2824, + "step": 6929 + }, + { + "epoch": 0.1236043234759034, + "grad_norm": 0.26726216077804565, + "learning_rate": 4.991510883149649e-05, + "loss": 0.229, + "step": 6930 + }, + { + "epoch": 0.12362215959761709, + "grad_norm": 0.2911757230758667, + "learning_rate": 4.991498062143573e-05, + "loss": 0.2572, + "step": 6931 + }, + { + "epoch": 0.12363999571933079, + "grad_norm": 0.4179574251174927, + "learning_rate": 4.99148523147959e-05, + "loss": 0.2272, + "step": 6932 + }, + { + "epoch": 0.12365783184104448, + "grad_norm": 0.3357955813407898, + "learning_rate": 4.9914723911577496e-05, + "loss": 0.1862, + "step": 6933 + }, + { + "epoch": 0.12367566796275818, + "grad_norm": 0.4733642339706421, + "learning_rate": 4.991459541178102e-05, + "loss": 0.2488, + "step": 6934 + }, + { + "epoch": 0.12369350408447187, + "grad_norm": 0.32170218229293823, + "learning_rate": 4.9914466815406965e-05, + "loss": 0.217, + "step": 6935 + }, + { + "epoch": 0.12371134020618557, + "grad_norm": 0.32162222266197205, + "learning_rate": 4.9914338122455826e-05, + "loss": 0.2519, + "step": 6936 + }, + { + "epoch": 0.12372917632789926, + "grad_norm": 0.257475882768631, + "learning_rate": 4.991420933292812e-05, + "loss": 0.25, + "step": 6937 + }, + { + "epoch": 0.12374701244961296, + "grad_norm": 0.2553541362285614, + "learning_rate": 4.9914080446824315e-05, + "loss": 0.2234, + "step": 6938 + }, + { + "epoch": 0.12376484857132665, + "grad_norm": 0.31391093134880066, + "learning_rate": 4.991395146414495e-05, + "loss": 0.2811, + "step": 6939 + }, + { + "epoch": 0.12378268469304035, + "grad_norm": 0.21910032629966736, + "learning_rate": 4.9913822384890494e-05, + "loss": 0.2291, + "step": 6940 + }, + { + "epoch": 0.12380052081475404, + "grad_norm": 0.27430328726768494, + "learning_rate": 4.991369320906146e-05, + "loss": 0.2178, + "step": 6941 + }, + { + "epoch": 0.12381835693646774, + "grad_norm": 0.2509065866470337, + "learning_rate": 4.9913563936658345e-05, + "loss": 0.2475, + "step": 6942 + }, + { + "epoch": 0.12383619305818143, + "grad_norm": 0.38076311349868774, + "learning_rate": 4.991343456768165e-05, + "loss": 0.2779, + "step": 6943 + }, + { + "epoch": 0.12385402917989512, + "grad_norm": 0.2753581702709198, + "learning_rate": 4.9913305102131874e-05, + "loss": 0.189, + "step": 6944 + }, + { + "epoch": 0.12387186530160882, + "grad_norm": 0.29255151748657227, + "learning_rate": 4.9913175540009525e-05, + "loss": 0.1742, + "step": 6945 + }, + { + "epoch": 0.1238897014233225, + "grad_norm": 0.22305609285831451, + "learning_rate": 4.9913045881315106e-05, + "loss": 0.2015, + "step": 6946 + }, + { + "epoch": 0.12390753754503621, + "grad_norm": 0.265197217464447, + "learning_rate": 4.991291612604911e-05, + "loss": 0.2021, + "step": 6947 + }, + { + "epoch": 0.1239253736667499, + "grad_norm": 0.3825306296348572, + "learning_rate": 4.991278627421206e-05, + "loss": 0.2325, + "step": 6948 + }, + { + "epoch": 0.1239432097884636, + "grad_norm": 0.2112373560667038, + "learning_rate": 4.991265632580444e-05, + "loss": 0.2315, + "step": 6949 + }, + { + "epoch": 0.12396104591017729, + "grad_norm": 0.28901270031929016, + "learning_rate": 4.991252628082675e-05, + "loss": 0.235, + "step": 6950 + }, + { + "epoch": 0.12397888203189099, + "grad_norm": 0.3946826159954071, + "learning_rate": 4.9912396139279514e-05, + "loss": 0.2183, + "step": 6951 + }, + { + "epoch": 0.12399671815360468, + "grad_norm": 0.23901917040348053, + "learning_rate": 4.991226590116322e-05, + "loss": 0.1935, + "step": 6952 + }, + { + "epoch": 0.12401455427531838, + "grad_norm": 0.24910520017147064, + "learning_rate": 4.9912135566478385e-05, + "loss": 0.2386, + "step": 6953 + }, + { + "epoch": 0.12403239039703207, + "grad_norm": 0.3099457919597626, + "learning_rate": 4.99120051352255e-05, + "loss": 0.2173, + "step": 6954 + }, + { + "epoch": 0.12405022651874577, + "grad_norm": 0.2548370361328125, + "learning_rate": 4.991187460740509e-05, + "loss": 0.2487, + "step": 6955 + }, + { + "epoch": 0.12406806264045946, + "grad_norm": 0.2176741361618042, + "learning_rate": 4.9911743983017646e-05, + "loss": 0.2101, + "step": 6956 + }, + { + "epoch": 0.12408589876217316, + "grad_norm": 0.30715304613113403, + "learning_rate": 4.991161326206367e-05, + "loss": 0.2348, + "step": 6957 + }, + { + "epoch": 0.12410373488388685, + "grad_norm": 0.29987630248069763, + "learning_rate": 4.991148244454369e-05, + "loss": 0.2031, + "step": 6958 + }, + { + "epoch": 0.12412157100560055, + "grad_norm": 0.3162432610988617, + "learning_rate": 4.991135153045819e-05, + "loss": 0.2179, + "step": 6959 + }, + { + "epoch": 0.12413940712731424, + "grad_norm": 0.24059271812438965, + "learning_rate": 4.9911220519807686e-05, + "loss": 0.2392, + "step": 6960 + }, + { + "epoch": 0.12415724324902794, + "grad_norm": 0.6025384664535522, + "learning_rate": 4.991108941259269e-05, + "loss": 0.2131, + "step": 6961 + }, + { + "epoch": 0.12417507937074163, + "grad_norm": 0.23318976163864136, + "learning_rate": 4.991095820881371e-05, + "loss": 0.2249, + "step": 6962 + }, + { + "epoch": 0.12419291549245531, + "grad_norm": 0.19816309213638306, + "learning_rate": 4.991082690847125e-05, + "loss": 0.2005, + "step": 6963 + }, + { + "epoch": 0.12421075161416902, + "grad_norm": 0.30013400316238403, + "learning_rate": 4.991069551156582e-05, + "loss": 0.2298, + "step": 6964 + }, + { + "epoch": 0.1242285877358827, + "grad_norm": 0.33225423097610474, + "learning_rate": 4.991056401809794e-05, + "loss": 0.2784, + "step": 6965 + }, + { + "epoch": 0.1242464238575964, + "grad_norm": 0.23370786011219025, + "learning_rate": 4.99104324280681e-05, + "loss": 0.1799, + "step": 6966 + }, + { + "epoch": 0.1242642599793101, + "grad_norm": 0.29927730560302734, + "learning_rate": 4.9910300741476815e-05, + "loss": 0.2371, + "step": 6967 + }, + { + "epoch": 0.1242820961010238, + "grad_norm": 0.2729722261428833, + "learning_rate": 4.991016895832461e-05, + "loss": 0.2465, + "step": 6968 + }, + { + "epoch": 0.12429993222273748, + "grad_norm": 0.24219760298728943, + "learning_rate": 4.991003707861198e-05, + "loss": 0.2032, + "step": 6969 + }, + { + "epoch": 0.12431776834445118, + "grad_norm": 0.3124775290489197, + "learning_rate": 4.990990510233945e-05, + "loss": 0.2507, + "step": 6970 + }, + { + "epoch": 0.12433560446616487, + "grad_norm": 0.29919275641441345, + "learning_rate": 4.9909773029507525e-05, + "loss": 0.2153, + "step": 6971 + }, + { + "epoch": 0.12435344058787857, + "grad_norm": 0.3777317702770233, + "learning_rate": 4.990964086011671e-05, + "loss": 0.2294, + "step": 6972 + }, + { + "epoch": 0.12437127670959226, + "grad_norm": 0.3543206751346588, + "learning_rate": 4.990950859416752e-05, + "loss": 0.2033, + "step": 6973 + }, + { + "epoch": 0.12438911283130596, + "grad_norm": 0.2996242940425873, + "learning_rate": 4.990937623166048e-05, + "loss": 0.2033, + "step": 6974 + }, + { + "epoch": 0.12440694895301965, + "grad_norm": 0.27193644642829895, + "learning_rate": 4.9909243772596096e-05, + "loss": 0.2304, + "step": 6975 + }, + { + "epoch": 0.12442478507473335, + "grad_norm": 0.2828409969806671, + "learning_rate": 4.990911121697487e-05, + "loss": 0.1804, + "step": 6976 + }, + { + "epoch": 0.12444262119644704, + "grad_norm": 0.29215842485427856, + "learning_rate": 4.990897856479733e-05, + "loss": 0.1992, + "step": 6977 + }, + { + "epoch": 0.12446045731816074, + "grad_norm": 0.2817901074886322, + "learning_rate": 4.990884581606399e-05, + "loss": 0.2286, + "step": 6978 + }, + { + "epoch": 0.12447829343987443, + "grad_norm": 0.35553768277168274, + "learning_rate": 4.990871297077535e-05, + "loss": 0.2617, + "step": 6979 + }, + { + "epoch": 0.12449612956158813, + "grad_norm": 0.3343108296394348, + "learning_rate": 4.9908580028931946e-05, + "loss": 0.2858, + "step": 6980 + }, + { + "epoch": 0.12451396568330182, + "grad_norm": 0.2859632968902588, + "learning_rate": 4.990844699053427e-05, + "loss": 0.2155, + "step": 6981 + }, + { + "epoch": 0.12453180180501552, + "grad_norm": 0.38963982462882996, + "learning_rate": 4.9908313855582864e-05, + "loss": 0.2182, + "step": 6982 + }, + { + "epoch": 0.12454963792672921, + "grad_norm": 0.30357739329338074, + "learning_rate": 4.9908180624078225e-05, + "loss": 0.2206, + "step": 6983 + }, + { + "epoch": 0.1245674740484429, + "grad_norm": 0.26114726066589355, + "learning_rate": 4.990804729602088e-05, + "loss": 0.234, + "step": 6984 + }, + { + "epoch": 0.1245853101701566, + "grad_norm": 0.2825394570827484, + "learning_rate": 4.9907913871411334e-05, + "loss": 0.1947, + "step": 6985 + }, + { + "epoch": 0.12460314629187029, + "grad_norm": 0.30148234963417053, + "learning_rate": 4.990778035025011e-05, + "loss": 0.25, + "step": 6986 + }, + { + "epoch": 0.12462098241358399, + "grad_norm": 0.3360712230205536, + "learning_rate": 4.990764673253773e-05, + "loss": 0.26, + "step": 6987 + }, + { + "epoch": 0.12463881853529768, + "grad_norm": 0.2792559266090393, + "learning_rate": 4.990751301827471e-05, + "loss": 0.2438, + "step": 6988 + }, + { + "epoch": 0.12465665465701138, + "grad_norm": 0.2513696551322937, + "learning_rate": 4.990737920746156e-05, + "loss": 0.1989, + "step": 6989 + }, + { + "epoch": 0.12467449077872507, + "grad_norm": 0.2617587447166443, + "learning_rate": 4.990724530009881e-05, + "loss": 0.188, + "step": 6990 + }, + { + "epoch": 0.12469232690043877, + "grad_norm": 0.32672059535980225, + "learning_rate": 4.990711129618698e-05, + "loss": 0.1809, + "step": 6991 + }, + { + "epoch": 0.12471016302215246, + "grad_norm": 0.2739046514034271, + "learning_rate": 4.990697719572658e-05, + "loss": 0.2026, + "step": 6992 + }, + { + "epoch": 0.12472799914386616, + "grad_norm": 0.3025754690170288, + "learning_rate": 4.9906842998718126e-05, + "loss": 0.236, + "step": 6993 + }, + { + "epoch": 0.12474583526557985, + "grad_norm": 0.3255358040332794, + "learning_rate": 4.9906708705162155e-05, + "loss": 0.2637, + "step": 6994 + }, + { + "epoch": 0.12476367138729355, + "grad_norm": 0.2542930245399475, + "learning_rate": 4.990657431505917e-05, + "loss": 0.1842, + "step": 6995 + }, + { + "epoch": 0.12478150750900724, + "grad_norm": 0.29506662487983704, + "learning_rate": 4.9906439828409715e-05, + "loss": 0.2241, + "step": 6996 + }, + { + "epoch": 0.12479934363072094, + "grad_norm": 0.22836719453334808, + "learning_rate": 4.9906305245214284e-05, + "loss": 0.2128, + "step": 6997 + }, + { + "epoch": 0.12481717975243463, + "grad_norm": 0.24722233414649963, + "learning_rate": 4.990617056547342e-05, + "loss": 0.1961, + "step": 6998 + }, + { + "epoch": 0.12483501587414833, + "grad_norm": 0.2995939552783966, + "learning_rate": 4.9906035789187626e-05, + "loss": 0.2237, + "step": 6999 + }, + { + "epoch": 0.12485285199586202, + "grad_norm": 0.2700544595718384, + "learning_rate": 4.9905900916357437e-05, + "loss": 0.2316, + "step": 7000 + }, + { + "epoch": 0.12485285199586202, + "eval_loss": 0.230218306183815, + "eval_runtime": 113.194, + "eval_samples_per_second": 9.046, + "eval_steps_per_second": 1.511, + "step": 7000 + }, + { + "epoch": 0.12487068811757572, + "grad_norm": 0.3256397843360901, + "learning_rate": 4.990576594698339e-05, + "loss": 0.2307, + "step": 7001 + }, + { + "epoch": 0.12488852423928941, + "grad_norm": 0.3033009469509125, + "learning_rate": 4.990563088106598e-05, + "loss": 0.2325, + "step": 7002 + }, + { + "epoch": 0.12490636036100311, + "grad_norm": 0.2994910478591919, + "learning_rate": 4.9905495718605735e-05, + "loss": 0.2616, + "step": 7003 + }, + { + "epoch": 0.1249241964827168, + "grad_norm": 0.23377302289009094, + "learning_rate": 4.9905360459603196e-05, + "loss": 0.1878, + "step": 7004 + }, + { + "epoch": 0.12494203260443049, + "grad_norm": 0.9831739664077759, + "learning_rate": 4.990522510405887e-05, + "loss": 0.3694, + "step": 7005 + }, + { + "epoch": 0.12495986872614419, + "grad_norm": 0.36338818073272705, + "learning_rate": 4.99050896519733e-05, + "loss": 0.2196, + "step": 7006 + }, + { + "epoch": 0.12497770484785788, + "grad_norm": 0.307064026594162, + "learning_rate": 4.9904954103347e-05, + "loss": 0.2506, + "step": 7007 + }, + { + "epoch": 0.12499554096957158, + "grad_norm": 0.3263118267059326, + "learning_rate": 4.990481845818049e-05, + "loss": 0.2595, + "step": 7008 + }, + { + "epoch": 0.12501337709128527, + "grad_norm": 0.35931575298309326, + "learning_rate": 4.9904682716474305e-05, + "loss": 0.2843, + "step": 7009 + }, + { + "epoch": 0.12503121321299895, + "grad_norm": 0.3789559602737427, + "learning_rate": 4.990454687822896e-05, + "loss": 0.3291, + "step": 7010 + }, + { + "epoch": 0.12504904933471267, + "grad_norm": 0.3388417065143585, + "learning_rate": 4.9904410943445e-05, + "loss": 0.2229, + "step": 7011 + }, + { + "epoch": 0.12506688545642636, + "grad_norm": 0.20887786149978638, + "learning_rate": 4.990427491212294e-05, + "loss": 0.2237, + "step": 7012 + }, + { + "epoch": 0.12508472157814005, + "grad_norm": 0.3833588659763336, + "learning_rate": 4.990413878426331e-05, + "loss": 0.2421, + "step": 7013 + }, + { + "epoch": 0.12510255769985373, + "grad_norm": 0.3767065405845642, + "learning_rate": 4.990400255986663e-05, + "loss": 0.2479, + "step": 7014 + }, + { + "epoch": 0.12512039382156745, + "grad_norm": 0.34950730204582214, + "learning_rate": 4.990386623893344e-05, + "loss": 0.2592, + "step": 7015 + }, + { + "epoch": 0.12513822994328114, + "grad_norm": 0.2206435352563858, + "learning_rate": 4.9903729821464255e-05, + "loss": 0.1807, + "step": 7016 + }, + { + "epoch": 0.12515606606499483, + "grad_norm": 0.4904520809650421, + "learning_rate": 4.990359330745962e-05, + "loss": 0.3227, + "step": 7017 + }, + { + "epoch": 0.1251739021867085, + "grad_norm": 0.22506429255008698, + "learning_rate": 4.9903456696920054e-05, + "loss": 0.2118, + "step": 7018 + }, + { + "epoch": 0.12519173830842223, + "grad_norm": 0.2756589949131012, + "learning_rate": 4.990331998984609e-05, + "loss": 0.2696, + "step": 7019 + }, + { + "epoch": 0.12520957443013592, + "grad_norm": 0.20561277866363525, + "learning_rate": 4.990318318623825e-05, + "loss": 0.1979, + "step": 7020 + }, + { + "epoch": 0.1252274105518496, + "grad_norm": 0.34968605637550354, + "learning_rate": 4.990304628609708e-05, + "loss": 0.2748, + "step": 7021 + }, + { + "epoch": 0.1252452466735633, + "grad_norm": 0.2604430019855499, + "learning_rate": 4.990290928942309e-05, + "loss": 0.232, + "step": 7022 + }, + { + "epoch": 0.125263082795277, + "grad_norm": 0.3240351974964142, + "learning_rate": 4.990277219621683e-05, + "loss": 0.2242, + "step": 7023 + }, + { + "epoch": 0.1252809189169907, + "grad_norm": 0.3275964558124542, + "learning_rate": 4.990263500647883e-05, + "loss": 0.2374, + "step": 7024 + }, + { + "epoch": 0.12529875503870438, + "grad_norm": 0.611990749835968, + "learning_rate": 4.99024977202096e-05, + "loss": 0.2739, + "step": 7025 + }, + { + "epoch": 0.12531659116041807, + "grad_norm": 0.28065142035484314, + "learning_rate": 4.9902360337409695e-05, + "loss": 0.2148, + "step": 7026 + }, + { + "epoch": 0.12533442728213176, + "grad_norm": 0.2256687432527542, + "learning_rate": 4.990222285807964e-05, + "loss": 0.22, + "step": 7027 + }, + { + "epoch": 0.12535226340384548, + "grad_norm": 0.3032439351081848, + "learning_rate": 4.990208528221997e-05, + "loss": 0.2012, + "step": 7028 + }, + { + "epoch": 0.12537009952555916, + "grad_norm": 0.37160810828208923, + "learning_rate": 4.9901947609831216e-05, + "loss": 0.2148, + "step": 7029 + }, + { + "epoch": 0.12538793564727285, + "grad_norm": 0.33610060811042786, + "learning_rate": 4.9901809840913915e-05, + "loss": 0.2868, + "step": 7030 + }, + { + "epoch": 0.12540577176898654, + "grad_norm": 0.3161230683326721, + "learning_rate": 4.990167197546859e-05, + "loss": 0.2118, + "step": 7031 + }, + { + "epoch": 0.12542360789070026, + "grad_norm": 0.3094927668571472, + "learning_rate": 4.990153401349579e-05, + "loss": 0.2372, + "step": 7032 + }, + { + "epoch": 0.12544144401241394, + "grad_norm": 0.31055590510368347, + "learning_rate": 4.990139595499605e-05, + "loss": 0.2715, + "step": 7033 + }, + { + "epoch": 0.12545928013412763, + "grad_norm": 0.2605281472206116, + "learning_rate": 4.9901257799969884e-05, + "loss": 0.2313, + "step": 7034 + }, + { + "epoch": 0.12547711625584132, + "grad_norm": 0.30338236689567566, + "learning_rate": 4.990111954841785e-05, + "loss": 0.2343, + "step": 7035 + }, + { + "epoch": 0.12549495237755504, + "grad_norm": 0.3016177713871002, + "learning_rate": 4.990098120034048e-05, + "loss": 0.2303, + "step": 7036 + }, + { + "epoch": 0.12551278849926872, + "grad_norm": 0.3031337559223175, + "learning_rate": 4.9900842755738296e-05, + "loss": 0.2328, + "step": 7037 + }, + { + "epoch": 0.1255306246209824, + "grad_norm": 0.4175161123275757, + "learning_rate": 4.9900704214611856e-05, + "loss": 0.2587, + "step": 7038 + }, + { + "epoch": 0.1255484607426961, + "grad_norm": 0.3712256848812103, + "learning_rate": 4.9900565576961675e-05, + "loss": 0.2534, + "step": 7039 + }, + { + "epoch": 0.12556629686440982, + "grad_norm": 0.28539612889289856, + "learning_rate": 4.990042684278831e-05, + "loss": 0.2157, + "step": 7040 + }, + { + "epoch": 0.1255841329861235, + "grad_norm": 0.3292580544948578, + "learning_rate": 4.990028801209229e-05, + "loss": 0.2726, + "step": 7041 + }, + { + "epoch": 0.1256019691078372, + "grad_norm": 0.3154231607913971, + "learning_rate": 4.9900149084874146e-05, + "loss": 0.2791, + "step": 7042 + }, + { + "epoch": 0.12561980522955088, + "grad_norm": 0.25210949778556824, + "learning_rate": 4.990001006113443e-05, + "loss": 0.2359, + "step": 7043 + }, + { + "epoch": 0.12563764135126457, + "grad_norm": 0.32752400636672974, + "learning_rate": 4.989987094087367e-05, + "loss": 0.2439, + "step": 7044 + }, + { + "epoch": 0.12565547747297828, + "grad_norm": 0.2529580891132355, + "learning_rate": 4.9899731724092414e-05, + "loss": 0.2527, + "step": 7045 + }, + { + "epoch": 0.12567331359469197, + "grad_norm": 0.26535871624946594, + "learning_rate": 4.9899592410791203e-05, + "loss": 0.2259, + "step": 7046 + }, + { + "epoch": 0.12569114971640566, + "grad_norm": 0.270537793636322, + "learning_rate": 4.989945300097056e-05, + "loss": 0.2232, + "step": 7047 + }, + { + "epoch": 0.12570898583811935, + "grad_norm": 0.46519288420677185, + "learning_rate": 4.989931349463105e-05, + "loss": 0.2533, + "step": 7048 + }, + { + "epoch": 0.12572682195983306, + "grad_norm": 0.2930956482887268, + "learning_rate": 4.9899173891773185e-05, + "loss": 0.2248, + "step": 7049 + }, + { + "epoch": 0.12574465808154675, + "grad_norm": 0.21747002005577087, + "learning_rate": 4.9899034192397534e-05, + "loss": 0.1907, + "step": 7050 + }, + { + "epoch": 0.12576249420326044, + "grad_norm": 0.2502320110797882, + "learning_rate": 4.989889439650463e-05, + "loss": 0.1992, + "step": 7051 + }, + { + "epoch": 0.12578033032497413, + "grad_norm": 0.2994558811187744, + "learning_rate": 4.9898754504095e-05, + "loss": 0.2814, + "step": 7052 + }, + { + "epoch": 0.12579816644668784, + "grad_norm": 0.2771264910697937, + "learning_rate": 4.989861451516921e-05, + "loss": 0.2194, + "step": 7053 + }, + { + "epoch": 0.12581600256840153, + "grad_norm": 0.2600449025630951, + "learning_rate": 4.989847442972778e-05, + "loss": 0.2088, + "step": 7054 + }, + { + "epoch": 0.12583383869011522, + "grad_norm": 0.26924750208854675, + "learning_rate": 4.989833424777127e-05, + "loss": 0.2414, + "step": 7055 + }, + { + "epoch": 0.1258516748118289, + "grad_norm": 0.2907157242298126, + "learning_rate": 4.989819396930021e-05, + "loss": 0.243, + "step": 7056 + }, + { + "epoch": 0.12586951093354262, + "grad_norm": 0.26334866881370544, + "learning_rate": 4.989805359431515e-05, + "loss": 0.288, + "step": 7057 + }, + { + "epoch": 0.1258873470552563, + "grad_norm": 0.26659277081489563, + "learning_rate": 4.9897913122816644e-05, + "loss": 0.2132, + "step": 7058 + }, + { + "epoch": 0.12590518317697, + "grad_norm": 0.2721278667449951, + "learning_rate": 4.9897772554805224e-05, + "loss": 0.2501, + "step": 7059 + }, + { + "epoch": 0.12592301929868369, + "grad_norm": 0.23906442523002625, + "learning_rate": 4.989763189028144e-05, + "loss": 0.1896, + "step": 7060 + }, + { + "epoch": 0.1259408554203974, + "grad_norm": 0.25772127509117126, + "learning_rate": 4.989749112924583e-05, + "loss": 0.2225, + "step": 7061 + }, + { + "epoch": 0.1259586915421111, + "grad_norm": 0.28775516152381897, + "learning_rate": 4.989735027169895e-05, + "loss": 0.2317, + "step": 7062 + }, + { + "epoch": 0.12597652766382478, + "grad_norm": 0.22797401249408722, + "learning_rate": 4.989720931764134e-05, + "loss": 0.2131, + "step": 7063 + }, + { + "epoch": 0.12599436378553847, + "grad_norm": 0.2809610366821289, + "learning_rate": 4.989706826707354e-05, + "loss": 0.2476, + "step": 7064 + }, + { + "epoch": 0.12601219990725215, + "grad_norm": 0.2797238826751709, + "learning_rate": 4.989692711999611e-05, + "loss": 0.2934, + "step": 7065 + }, + { + "epoch": 0.12603003602896587, + "grad_norm": 0.2841593325138092, + "learning_rate": 4.9896785876409594e-05, + "loss": 0.233, + "step": 7066 + }, + { + "epoch": 0.12604787215067956, + "grad_norm": 0.2755345404148102, + "learning_rate": 4.989664453631453e-05, + "loss": 0.2209, + "step": 7067 + }, + { + "epoch": 0.12606570827239325, + "grad_norm": 0.2395692616701126, + "learning_rate": 4.989650309971148e-05, + "loss": 0.2216, + "step": 7068 + }, + { + "epoch": 0.12608354439410693, + "grad_norm": 0.2653280198574066, + "learning_rate": 4.989636156660098e-05, + "loss": 0.2229, + "step": 7069 + }, + { + "epoch": 0.12610138051582065, + "grad_norm": 0.3702566623687744, + "learning_rate": 4.989621993698359e-05, + "loss": 0.2473, + "step": 7070 + }, + { + "epoch": 0.12611921663753434, + "grad_norm": 0.39347711205482483, + "learning_rate": 4.9896078210859845e-05, + "loss": 0.2803, + "step": 7071 + }, + { + "epoch": 0.12613705275924803, + "grad_norm": 0.21345220506191254, + "learning_rate": 4.9895936388230304e-05, + "loss": 0.1963, + "step": 7072 + }, + { + "epoch": 0.1261548888809617, + "grad_norm": 0.28686854243278503, + "learning_rate": 4.9895794469095514e-05, + "loss": 0.2214, + "step": 7073 + }, + { + "epoch": 0.12617272500267543, + "grad_norm": 0.2529907822608948, + "learning_rate": 4.989565245345603e-05, + "loss": 0.2153, + "step": 7074 + }, + { + "epoch": 0.12619056112438912, + "grad_norm": 0.28283044695854187, + "learning_rate": 4.98955103413124e-05, + "loss": 0.2356, + "step": 7075 + }, + { + "epoch": 0.1262083972461028, + "grad_norm": 0.35509192943573, + "learning_rate": 4.989536813266516e-05, + "loss": 0.2731, + "step": 7076 + }, + { + "epoch": 0.1262262333678165, + "grad_norm": 0.33380529284477234, + "learning_rate": 4.9895225827514894e-05, + "loss": 0.1978, + "step": 7077 + }, + { + "epoch": 0.1262440694895302, + "grad_norm": 0.3438994586467743, + "learning_rate": 4.989508342586212e-05, + "loss": 0.2279, + "step": 7078 + }, + { + "epoch": 0.1262619056112439, + "grad_norm": 0.31843382120132446, + "learning_rate": 4.989494092770741e-05, + "loss": 0.2683, + "step": 7079 + }, + { + "epoch": 0.12627974173295758, + "grad_norm": 0.28281909227371216, + "learning_rate": 4.9894798333051305e-05, + "loss": 0.2026, + "step": 7080 + }, + { + "epoch": 0.12629757785467127, + "grad_norm": 0.3684866726398468, + "learning_rate": 4.989465564189437e-05, + "loss": 0.2174, + "step": 7081 + }, + { + "epoch": 0.126315413976385, + "grad_norm": 0.27203404903411865, + "learning_rate": 4.989451285423715e-05, + "loss": 0.2944, + "step": 7082 + }, + { + "epoch": 0.12633325009809868, + "grad_norm": 0.19522850215435028, + "learning_rate": 4.98943699700802e-05, + "loss": 0.2127, + "step": 7083 + }, + { + "epoch": 0.12635108621981236, + "grad_norm": 0.2942343056201935, + "learning_rate": 4.989422698942407e-05, + "loss": 0.2893, + "step": 7084 + }, + { + "epoch": 0.12636892234152605, + "grad_norm": 0.1972874253988266, + "learning_rate": 4.989408391226932e-05, + "loss": 0.2022, + "step": 7085 + }, + { + "epoch": 0.12638675846323974, + "grad_norm": 0.24227045476436615, + "learning_rate": 4.989394073861651e-05, + "loss": 0.2418, + "step": 7086 + }, + { + "epoch": 0.12640459458495346, + "grad_norm": 0.2634616792201996, + "learning_rate": 4.989379746846617e-05, + "loss": 0.2009, + "step": 7087 + }, + { + "epoch": 0.12642243070666714, + "grad_norm": 0.327183336019516, + "learning_rate": 4.989365410181889e-05, + "loss": 0.297, + "step": 7088 + }, + { + "epoch": 0.12644026682838083, + "grad_norm": 0.2170710265636444, + "learning_rate": 4.98935106386752e-05, + "loss": 0.2102, + "step": 7089 + }, + { + "epoch": 0.12645810295009452, + "grad_norm": 0.2536414861679077, + "learning_rate": 4.989336707903567e-05, + "loss": 0.2414, + "step": 7090 + }, + { + "epoch": 0.12647593907180824, + "grad_norm": 0.36148250102996826, + "learning_rate": 4.9893223422900845e-05, + "loss": 0.2368, + "step": 7091 + }, + { + "epoch": 0.12649377519352192, + "grad_norm": 0.19189855456352234, + "learning_rate": 4.9893079670271294e-05, + "loss": 0.1983, + "step": 7092 + }, + { + "epoch": 0.1265116113152356, + "grad_norm": 0.25199437141418457, + "learning_rate": 4.989293582114756e-05, + "loss": 0.2555, + "step": 7093 + }, + { + "epoch": 0.1265294474369493, + "grad_norm": 0.30870717763900757, + "learning_rate": 4.989279187553022e-05, + "loss": 0.2599, + "step": 7094 + }, + { + "epoch": 0.12654728355866302, + "grad_norm": 0.22379128634929657, + "learning_rate": 4.989264783341981e-05, + "loss": 0.2159, + "step": 7095 + }, + { + "epoch": 0.1265651196803767, + "grad_norm": 0.2508992850780487, + "learning_rate": 4.989250369481691e-05, + "loss": 0.2514, + "step": 7096 + }, + { + "epoch": 0.1265829558020904, + "grad_norm": 0.2804580330848694, + "learning_rate": 4.989235945972206e-05, + "loss": 0.272, + "step": 7097 + }, + { + "epoch": 0.12660079192380408, + "grad_norm": 0.27964669466018677, + "learning_rate": 4.9892215128135835e-05, + "loss": 0.2291, + "step": 7098 + }, + { + "epoch": 0.1266186280455178, + "grad_norm": 0.26933884620666504, + "learning_rate": 4.989207070005878e-05, + "loss": 0.2207, + "step": 7099 + }, + { + "epoch": 0.12663646416723148, + "grad_norm": 0.21618296205997467, + "learning_rate": 4.9891926175491464e-05, + "loss": 0.1993, + "step": 7100 + }, + { + "epoch": 0.12665430028894517, + "grad_norm": 0.46692606806755066, + "learning_rate": 4.9891781554434445e-05, + "loss": 0.1942, + "step": 7101 + }, + { + "epoch": 0.12667213641065886, + "grad_norm": 0.49683696031570435, + "learning_rate": 4.9891636836888285e-05, + "loss": 0.2264, + "step": 7102 + }, + { + "epoch": 0.12668997253237257, + "grad_norm": 0.4323539435863495, + "learning_rate": 4.989149202285354e-05, + "loss": 0.2789, + "step": 7103 + }, + { + "epoch": 0.12670780865408626, + "grad_norm": 0.3104749023914337, + "learning_rate": 4.9891347112330775e-05, + "loss": 0.2795, + "step": 7104 + }, + { + "epoch": 0.12672564477579995, + "grad_norm": 0.4278308153152466, + "learning_rate": 4.989120210532056e-05, + "loss": 0.2591, + "step": 7105 + }, + { + "epoch": 0.12674348089751364, + "grad_norm": 0.27192848920822144, + "learning_rate": 4.9891057001823435e-05, + "loss": 0.197, + "step": 7106 + }, + { + "epoch": 0.12676131701922733, + "grad_norm": 0.3221433460712433, + "learning_rate": 4.9890911801839976e-05, + "loss": 0.2084, + "step": 7107 + }, + { + "epoch": 0.12677915314094104, + "grad_norm": 0.3191899061203003, + "learning_rate": 4.9890766505370757e-05, + "loss": 0.2936, + "step": 7108 + }, + { + "epoch": 0.12679698926265473, + "grad_norm": 0.23041144013404846, + "learning_rate": 4.9890621112416326e-05, + "loss": 0.2087, + "step": 7109 + }, + { + "epoch": 0.12681482538436842, + "grad_norm": 0.2721308171749115, + "learning_rate": 4.9890475622977253e-05, + "loss": 0.2475, + "step": 7110 + }, + { + "epoch": 0.1268326615060821, + "grad_norm": 0.3211425244808197, + "learning_rate": 4.989033003705409e-05, + "loss": 0.2238, + "step": 7111 + }, + { + "epoch": 0.12685049762779582, + "grad_norm": 0.43620380759239197, + "learning_rate": 4.989018435464742e-05, + "loss": 0.2644, + "step": 7112 + }, + { + "epoch": 0.1268683337495095, + "grad_norm": 0.24428436160087585, + "learning_rate": 4.9890038575757794e-05, + "loss": 0.2495, + "step": 7113 + }, + { + "epoch": 0.1268861698712232, + "grad_norm": 0.3536229133605957, + "learning_rate": 4.9889892700385784e-05, + "loss": 0.2727, + "step": 7114 + }, + { + "epoch": 0.12690400599293689, + "grad_norm": 0.33278393745422363, + "learning_rate": 4.988974672853195e-05, + "loss": 0.254, + "step": 7115 + }, + { + "epoch": 0.1269218421146506, + "grad_norm": 0.3001064956188202, + "learning_rate": 4.9889600660196863e-05, + "loss": 0.2742, + "step": 7116 + }, + { + "epoch": 0.1269396782363643, + "grad_norm": 0.32525044679641724, + "learning_rate": 4.988945449538109e-05, + "loss": 0.2575, + "step": 7117 + }, + { + "epoch": 0.12695751435807798, + "grad_norm": 0.323993444442749, + "learning_rate": 4.988930823408519e-05, + "loss": 0.214, + "step": 7118 + }, + { + "epoch": 0.12697535047979167, + "grad_norm": 0.23346802592277527, + "learning_rate": 4.9889161876309736e-05, + "loss": 0.2648, + "step": 7119 + }, + { + "epoch": 0.12699318660150538, + "grad_norm": 0.24181324243545532, + "learning_rate": 4.98890154220553e-05, + "loss": 0.183, + "step": 7120 + }, + { + "epoch": 0.12701102272321907, + "grad_norm": 0.29001834988594055, + "learning_rate": 4.988886887132244e-05, + "loss": 0.2229, + "step": 7121 + }, + { + "epoch": 0.12702885884493276, + "grad_norm": 0.2402995377779007, + "learning_rate": 4.988872222411172e-05, + "loss": 0.2124, + "step": 7122 + }, + { + "epoch": 0.12704669496664645, + "grad_norm": 0.3174564242362976, + "learning_rate": 4.9888575480423725e-05, + "loss": 0.2638, + "step": 7123 + }, + { + "epoch": 0.12706453108836013, + "grad_norm": 0.30612319707870483, + "learning_rate": 4.9888428640259016e-05, + "loss": 0.2281, + "step": 7124 + }, + { + "epoch": 0.12708236721007385, + "grad_norm": 0.2363874316215515, + "learning_rate": 4.9888281703618156e-05, + "loss": 0.1912, + "step": 7125 + }, + { + "epoch": 0.12710020333178754, + "grad_norm": 0.3057478666305542, + "learning_rate": 4.988813467050173e-05, + "loss": 0.2342, + "step": 7126 + }, + { + "epoch": 0.12711803945350122, + "grad_norm": 0.24093061685562134, + "learning_rate": 4.9887987540910285e-05, + "loss": 0.2417, + "step": 7127 + }, + { + "epoch": 0.1271358755752149, + "grad_norm": 0.3051645755767822, + "learning_rate": 4.988784031484441e-05, + "loss": 0.2063, + "step": 7128 + }, + { + "epoch": 0.12715371169692863, + "grad_norm": 0.3925570845603943, + "learning_rate": 4.9887692992304666e-05, + "loss": 0.3614, + "step": 7129 + }, + { + "epoch": 0.12717154781864232, + "grad_norm": 0.25032737851142883, + "learning_rate": 4.988754557329164e-05, + "loss": 0.2481, + "step": 7130 + }, + { + "epoch": 0.127189383940356, + "grad_norm": 0.3387826979160309, + "learning_rate": 4.988739805780588e-05, + "loss": 0.2183, + "step": 7131 + }, + { + "epoch": 0.1272072200620697, + "grad_norm": 0.1918492615222931, + "learning_rate": 4.988725044584798e-05, + "loss": 0.2358, + "step": 7132 + }, + { + "epoch": 0.1272250561837834, + "grad_norm": 0.2590310871601105, + "learning_rate": 4.988710273741849e-05, + "loss": 0.2327, + "step": 7133 + }, + { + "epoch": 0.1272428923054971, + "grad_norm": 0.273307740688324, + "learning_rate": 4.9886954932518e-05, + "loss": 0.212, + "step": 7134 + }, + { + "epoch": 0.12726072842721078, + "grad_norm": 0.4322851002216339, + "learning_rate": 4.988680703114708e-05, + "loss": 0.2269, + "step": 7135 + }, + { + "epoch": 0.12727856454892447, + "grad_norm": 0.28264355659484863, + "learning_rate": 4.988665903330629e-05, + "loss": 0.2379, + "step": 7136 + }, + { + "epoch": 0.1272964006706382, + "grad_norm": 0.40426623821258545, + "learning_rate": 4.988651093899622e-05, + "loss": 0.3116, + "step": 7137 + }, + { + "epoch": 0.12731423679235188, + "grad_norm": 0.203849658370018, + "learning_rate": 4.988636274821744e-05, + "loss": 0.206, + "step": 7138 + }, + { + "epoch": 0.12733207291406556, + "grad_norm": 0.371700257062912, + "learning_rate": 4.988621446097052e-05, + "loss": 0.2767, + "step": 7139 + }, + { + "epoch": 0.12734990903577925, + "grad_norm": 0.24924980103969574, + "learning_rate": 4.9886066077256036e-05, + "loss": 0.2138, + "step": 7140 + }, + { + "epoch": 0.12736774515749297, + "grad_norm": 0.31650349497795105, + "learning_rate": 4.9885917597074564e-05, + "loss": 0.2502, + "step": 7141 + }, + { + "epoch": 0.12738558127920666, + "grad_norm": 0.37491849064826965, + "learning_rate": 4.9885769020426685e-05, + "loss": 0.2849, + "step": 7142 + }, + { + "epoch": 0.12740341740092034, + "grad_norm": 0.23761944472789764, + "learning_rate": 4.988562034731297e-05, + "loss": 0.2161, + "step": 7143 + }, + { + "epoch": 0.12742125352263403, + "grad_norm": 0.24542593955993652, + "learning_rate": 4.988547157773399e-05, + "loss": 0.2216, + "step": 7144 + }, + { + "epoch": 0.12743908964434772, + "grad_norm": 0.3461378216743469, + "learning_rate": 4.988532271169033e-05, + "loss": 0.2744, + "step": 7145 + }, + { + "epoch": 0.12745692576606144, + "grad_norm": 0.24887950718402863, + "learning_rate": 4.988517374918257e-05, + "loss": 0.2093, + "step": 7146 + }, + { + "epoch": 0.12747476188777512, + "grad_norm": 0.4498194754123688, + "learning_rate": 4.988502469021127e-05, + "loss": 0.2834, + "step": 7147 + }, + { + "epoch": 0.1274925980094888, + "grad_norm": 0.24465256929397583, + "learning_rate": 4.988487553477702e-05, + "loss": 0.2407, + "step": 7148 + }, + { + "epoch": 0.1275104341312025, + "grad_norm": 0.2679983675479889, + "learning_rate": 4.9884726282880407e-05, + "loss": 0.2291, + "step": 7149 + }, + { + "epoch": 0.12752827025291621, + "grad_norm": 0.26163923740386963, + "learning_rate": 4.988457693452199e-05, + "loss": 0.2076, + "step": 7150 + }, + { + "epoch": 0.1275461063746299, + "grad_norm": 0.2710815370082855, + "learning_rate": 4.988442748970237e-05, + "loss": 0.2657, + "step": 7151 + }, + { + "epoch": 0.1275639424963436, + "grad_norm": 0.3420710265636444, + "learning_rate": 4.98842779484221e-05, + "loss": 0.2604, + "step": 7152 + }, + { + "epoch": 0.12758177861805728, + "grad_norm": 0.26653480529785156, + "learning_rate": 4.988412831068179e-05, + "loss": 0.2263, + "step": 7153 + }, + { + "epoch": 0.127599614739771, + "grad_norm": 0.42929571866989136, + "learning_rate": 4.9883978576481996e-05, + "loss": 0.2385, + "step": 7154 + }, + { + "epoch": 0.12761745086148468, + "grad_norm": 0.28003376722335815, + "learning_rate": 4.98838287458233e-05, + "loss": 0.2799, + "step": 7155 + }, + { + "epoch": 0.12763528698319837, + "grad_norm": 0.2639767825603485, + "learning_rate": 4.988367881870629e-05, + "loss": 0.2215, + "step": 7156 + }, + { + "epoch": 0.12765312310491206, + "grad_norm": 0.214997336268425, + "learning_rate": 4.988352879513155e-05, + "loss": 0.2058, + "step": 7157 + }, + { + "epoch": 0.12767095922662577, + "grad_norm": 0.40716752409935, + "learning_rate": 4.988337867509967e-05, + "loss": 0.2574, + "step": 7158 + }, + { + "epoch": 0.12768879534833946, + "grad_norm": 0.2493445724248886, + "learning_rate": 4.9883228458611204e-05, + "loss": 0.2082, + "step": 7159 + }, + { + "epoch": 0.12770663147005315, + "grad_norm": 0.3026892840862274, + "learning_rate": 4.988307814566675e-05, + "loss": 0.2066, + "step": 7160 + }, + { + "epoch": 0.12772446759176684, + "grad_norm": 0.22813519835472107, + "learning_rate": 4.98829277362669e-05, + "loss": 0.2255, + "step": 7161 + }, + { + "epoch": 0.12774230371348055, + "grad_norm": 0.31025686860084534, + "learning_rate": 4.9882777230412227e-05, + "loss": 0.2771, + "step": 7162 + }, + { + "epoch": 0.12776013983519424, + "grad_norm": 0.2800917327404022, + "learning_rate": 4.988262662810331e-05, + "loss": 0.2425, + "step": 7163 + }, + { + "epoch": 0.12777797595690793, + "grad_norm": 0.30579638481140137, + "learning_rate": 4.988247592934074e-05, + "loss": 0.2653, + "step": 7164 + }, + { + "epoch": 0.12779581207862162, + "grad_norm": 0.2706877589225769, + "learning_rate": 4.9882325134125096e-05, + "loss": 0.2159, + "step": 7165 + }, + { + "epoch": 0.1278136482003353, + "grad_norm": 0.3356277048587799, + "learning_rate": 4.988217424245697e-05, + "loss": 0.2746, + "step": 7166 + }, + { + "epoch": 0.12783148432204902, + "grad_norm": 0.2686861455440521, + "learning_rate": 4.988202325433694e-05, + "loss": 0.2254, + "step": 7167 + }, + { + "epoch": 0.1278493204437627, + "grad_norm": 0.24877183139324188, + "learning_rate": 4.98818721697656e-05, + "loss": 0.2136, + "step": 7168 + }, + { + "epoch": 0.1278671565654764, + "grad_norm": 0.2516845166683197, + "learning_rate": 4.988172098874352e-05, + "loss": 0.2355, + "step": 7169 + }, + { + "epoch": 0.12788499268719009, + "grad_norm": 0.27416130900382996, + "learning_rate": 4.98815697112713e-05, + "loss": 0.2415, + "step": 7170 + }, + { + "epoch": 0.1279028288089038, + "grad_norm": 0.3243381083011627, + "learning_rate": 4.988141833734952e-05, + "loss": 0.2601, + "step": 7171 + }, + { + "epoch": 0.1279206649306175, + "grad_norm": 0.2594567537307739, + "learning_rate": 4.988126686697877e-05, + "loss": 0.2234, + "step": 7172 + }, + { + "epoch": 0.12793850105233118, + "grad_norm": 0.23373720049858093, + "learning_rate": 4.9881115300159644e-05, + "loss": 0.2238, + "step": 7173 + }, + { + "epoch": 0.12795633717404487, + "grad_norm": 0.2778187692165375, + "learning_rate": 4.988096363689271e-05, + "loss": 0.2632, + "step": 7174 + }, + { + "epoch": 0.12797417329575858, + "grad_norm": 0.2085629254579544, + "learning_rate": 4.988081187717857e-05, + "loss": 0.2305, + "step": 7175 + }, + { + "epoch": 0.12799200941747227, + "grad_norm": 0.272589772939682, + "learning_rate": 4.988066002101781e-05, + "loss": 0.1807, + "step": 7176 + }, + { + "epoch": 0.12800984553918596, + "grad_norm": 0.2666608989238739, + "learning_rate": 4.988050806841102e-05, + "loss": 0.2161, + "step": 7177 + }, + { + "epoch": 0.12802768166089964, + "grad_norm": 0.24305440485477448, + "learning_rate": 4.9880356019358786e-05, + "loss": 0.2268, + "step": 7178 + }, + { + "epoch": 0.12804551778261336, + "grad_norm": 0.25793445110321045, + "learning_rate": 4.9880203873861705e-05, + "loss": 0.2286, + "step": 7179 + }, + { + "epoch": 0.12806335390432705, + "grad_norm": 0.33685505390167236, + "learning_rate": 4.9880051631920355e-05, + "loss": 0.2512, + "step": 7180 + }, + { + "epoch": 0.12808119002604074, + "grad_norm": 0.422828733921051, + "learning_rate": 4.9879899293535325e-05, + "loss": 0.2657, + "step": 7181 + }, + { + "epoch": 0.12809902614775442, + "grad_norm": 0.3008039593696594, + "learning_rate": 4.987974685870722e-05, + "loss": 0.2593, + "step": 7182 + }, + { + "epoch": 0.12811686226946814, + "grad_norm": 0.2825548052787781, + "learning_rate": 4.9879594327436625e-05, + "loss": 0.2266, + "step": 7183 + }, + { + "epoch": 0.12813469839118183, + "grad_norm": 0.331137090921402, + "learning_rate": 4.987944169972413e-05, + "loss": 0.2633, + "step": 7184 + }, + { + "epoch": 0.12815253451289552, + "grad_norm": 0.19249378144741058, + "learning_rate": 4.9879288975570315e-05, + "loss": 0.1942, + "step": 7185 + }, + { + "epoch": 0.1281703706346092, + "grad_norm": 0.3144652843475342, + "learning_rate": 4.987913615497579e-05, + "loss": 0.2052, + "step": 7186 + }, + { + "epoch": 0.1281882067563229, + "grad_norm": 0.3076115548610687, + "learning_rate": 4.987898323794114e-05, + "loss": 0.3222, + "step": 7187 + }, + { + "epoch": 0.1282060428780366, + "grad_norm": 0.3074822723865509, + "learning_rate": 4.987883022446696e-05, + "loss": 0.2512, + "step": 7188 + }, + { + "epoch": 0.1282238789997503, + "grad_norm": 0.32042768597602844, + "learning_rate": 4.987867711455384e-05, + "loss": 0.2065, + "step": 7189 + }, + { + "epoch": 0.12824171512146398, + "grad_norm": 0.27149438858032227, + "learning_rate": 4.987852390820237e-05, + "loss": 0.2664, + "step": 7190 + }, + { + "epoch": 0.12825955124317767, + "grad_norm": 0.47419872879981995, + "learning_rate": 4.987837060541316e-05, + "loss": 0.2312, + "step": 7191 + }, + { + "epoch": 0.1282773873648914, + "grad_norm": 0.3739066421985626, + "learning_rate": 4.987821720618678e-05, + "loss": 0.2824, + "step": 7192 + }, + { + "epoch": 0.12829522348660508, + "grad_norm": 0.25958433747291565, + "learning_rate": 4.987806371052384e-05, + "loss": 0.2229, + "step": 7193 + }, + { + "epoch": 0.12831305960831876, + "grad_norm": 0.38448628783226013, + "learning_rate": 4.9877910118424936e-05, + "loss": 0.2412, + "step": 7194 + }, + { + "epoch": 0.12833089573003245, + "grad_norm": 0.29795241355895996, + "learning_rate": 4.987775642989066e-05, + "loss": 0.1777, + "step": 7195 + }, + { + "epoch": 0.12834873185174617, + "grad_norm": 0.3055592477321625, + "learning_rate": 4.9877602644921606e-05, + "loss": 0.2968, + "step": 7196 + }, + { + "epoch": 0.12836656797345986, + "grad_norm": 0.29176947474479675, + "learning_rate": 4.9877448763518374e-05, + "loss": 0.2482, + "step": 7197 + }, + { + "epoch": 0.12838440409517354, + "grad_norm": 0.342585951089859, + "learning_rate": 4.987729478568155e-05, + "loss": 0.2275, + "step": 7198 + }, + { + "epoch": 0.12840224021688723, + "grad_norm": 0.38080471754074097, + "learning_rate": 4.987714071141175e-05, + "loss": 0.2813, + "step": 7199 + }, + { + "epoch": 0.12842007633860095, + "grad_norm": 0.3040217459201813, + "learning_rate": 4.987698654070956e-05, + "loss": 0.2595, + "step": 7200 + }, + { + "epoch": 0.12843791246031463, + "grad_norm": 0.25592151284217834, + "learning_rate": 4.9876832273575566e-05, + "loss": 0.2186, + "step": 7201 + }, + { + "epoch": 0.12845574858202832, + "grad_norm": 0.4482082426548004, + "learning_rate": 4.987667791001039e-05, + "loss": 0.2359, + "step": 7202 + }, + { + "epoch": 0.128473584703742, + "grad_norm": 0.3252483010292053, + "learning_rate": 4.987652345001461e-05, + "loss": 0.26, + "step": 7203 + }, + { + "epoch": 0.12849142082545573, + "grad_norm": 0.5447587966918945, + "learning_rate": 4.987636889358884e-05, + "loss": 0.2533, + "step": 7204 + }, + { + "epoch": 0.12850925694716941, + "grad_norm": 0.2592175006866455, + "learning_rate": 4.987621424073366e-05, + "loss": 0.2131, + "step": 7205 + }, + { + "epoch": 0.1285270930688831, + "grad_norm": 0.32026323676109314, + "learning_rate": 4.98760594914497e-05, + "loss": 0.2803, + "step": 7206 + }, + { + "epoch": 0.1285449291905968, + "grad_norm": 0.2672712206840515, + "learning_rate": 4.987590464573753e-05, + "loss": 0.1968, + "step": 7207 + }, + { + "epoch": 0.12856276531231048, + "grad_norm": 0.27520185708999634, + "learning_rate": 4.9875749703597765e-05, + "loss": 0.2167, + "step": 7208 + }, + { + "epoch": 0.1285806014340242, + "grad_norm": 0.297823041677475, + "learning_rate": 4.9875594665031e-05, + "loss": 0.2811, + "step": 7209 + }, + { + "epoch": 0.12859843755573788, + "grad_norm": 0.2841673493385315, + "learning_rate": 4.987543953003784e-05, + "loss": 0.2356, + "step": 7210 + }, + { + "epoch": 0.12861627367745157, + "grad_norm": 0.24720583856105804, + "learning_rate": 4.987528429861889e-05, + "loss": 0.2153, + "step": 7211 + }, + { + "epoch": 0.12863410979916526, + "grad_norm": 0.2523522675037384, + "learning_rate": 4.987512897077474e-05, + "loss": 0.2502, + "step": 7212 + }, + { + "epoch": 0.12865194592087897, + "grad_norm": 0.38946402072906494, + "learning_rate": 4.9874973546506e-05, + "loss": 0.335, + "step": 7213 + }, + { + "epoch": 0.12866978204259266, + "grad_norm": 0.2724445164203644, + "learning_rate": 4.987481802581326e-05, + "loss": 0.2455, + "step": 7214 + }, + { + "epoch": 0.12868761816430635, + "grad_norm": 0.2129668891429901, + "learning_rate": 4.9874662408697146e-05, + "loss": 0.2358, + "step": 7215 + }, + { + "epoch": 0.12870545428602004, + "grad_norm": 0.2719738483428955, + "learning_rate": 4.9874506695158254e-05, + "loss": 0.2488, + "step": 7216 + }, + { + "epoch": 0.12872329040773375, + "grad_norm": 0.22912661731243134, + "learning_rate": 4.987435088519718e-05, + "loss": 0.2124, + "step": 7217 + }, + { + "epoch": 0.12874112652944744, + "grad_norm": 0.35899364948272705, + "learning_rate": 4.987419497881452e-05, + "loss": 0.2554, + "step": 7218 + }, + { + "epoch": 0.12875896265116113, + "grad_norm": 0.23842033743858337, + "learning_rate": 4.987403897601089e-05, + "loss": 0.2106, + "step": 7219 + }, + { + "epoch": 0.12877679877287482, + "grad_norm": 0.27896079421043396, + "learning_rate": 4.9873882876786905e-05, + "loss": 0.2493, + "step": 7220 + }, + { + "epoch": 0.12879463489458853, + "grad_norm": 0.31804925203323364, + "learning_rate": 4.987372668114315e-05, + "loss": 0.2306, + "step": 7221 + }, + { + "epoch": 0.12881247101630222, + "grad_norm": 0.3121148943901062, + "learning_rate": 4.987357038908024e-05, + "loss": 0.2749, + "step": 7222 + }, + { + "epoch": 0.1288303071380159, + "grad_norm": 0.28705453872680664, + "learning_rate": 4.9873414000598785e-05, + "loss": 0.2367, + "step": 7223 + }, + { + "epoch": 0.1288481432597296, + "grad_norm": 0.24653789401054382, + "learning_rate": 4.9873257515699386e-05, + "loss": 0.1778, + "step": 7224 + }, + { + "epoch": 0.12886597938144329, + "grad_norm": 0.3480568528175354, + "learning_rate": 4.9873100934382646e-05, + "loss": 0.2243, + "step": 7225 + }, + { + "epoch": 0.128883815503157, + "grad_norm": 0.30845171213150024, + "learning_rate": 4.9872944256649176e-05, + "loss": 0.2111, + "step": 7226 + }, + { + "epoch": 0.1289016516248707, + "grad_norm": 0.30911576747894287, + "learning_rate": 4.9872787482499585e-05, + "loss": 0.2733, + "step": 7227 + }, + { + "epoch": 0.12891948774658438, + "grad_norm": 0.26741185784339905, + "learning_rate": 4.9872630611934477e-05, + "loss": 0.2411, + "step": 7228 + }, + { + "epoch": 0.12893732386829806, + "grad_norm": 0.2957748472690582, + "learning_rate": 4.987247364495447e-05, + "loss": 0.2229, + "step": 7229 + }, + { + "epoch": 0.12895515999001178, + "grad_norm": 0.2056843489408493, + "learning_rate": 4.9872316581560154e-05, + "loss": 0.2201, + "step": 7230 + }, + { + "epoch": 0.12897299611172547, + "grad_norm": 0.21183530986309052, + "learning_rate": 4.987215942175215e-05, + "loss": 0.2334, + "step": 7231 + }, + { + "epoch": 0.12899083223343916, + "grad_norm": 0.2882821261882782, + "learning_rate": 4.987200216553107e-05, + "loss": 0.2005, + "step": 7232 + }, + { + "epoch": 0.12900866835515284, + "grad_norm": 0.2475908249616623, + "learning_rate": 4.987184481289752e-05, + "loss": 0.2269, + "step": 7233 + }, + { + "epoch": 0.12902650447686656, + "grad_norm": 0.2058449685573578, + "learning_rate": 4.9871687363852106e-05, + "loss": 0.2194, + "step": 7234 + }, + { + "epoch": 0.12904434059858025, + "grad_norm": 0.2869996130466461, + "learning_rate": 4.987152981839544e-05, + "loss": 0.2365, + "step": 7235 + }, + { + "epoch": 0.12906217672029394, + "grad_norm": 0.5378527641296387, + "learning_rate": 4.987137217652814e-05, + "loss": 0.2745, + "step": 7236 + }, + { + "epoch": 0.12908001284200762, + "grad_norm": 0.28777915239334106, + "learning_rate": 4.9871214438250804e-05, + "loss": 0.1801, + "step": 7237 + }, + { + "epoch": 0.12909784896372134, + "grad_norm": 0.20621678233146667, + "learning_rate": 4.987105660356405e-05, + "loss": 0.1761, + "step": 7238 + }, + { + "epoch": 0.12911568508543503, + "grad_norm": 0.2804788053035736, + "learning_rate": 4.9870898672468495e-05, + "loss": 0.2099, + "step": 7239 + }, + { + "epoch": 0.12913352120714872, + "grad_norm": 0.39618542790412903, + "learning_rate": 4.987074064496474e-05, + "loss": 0.2601, + "step": 7240 + }, + { + "epoch": 0.1291513573288624, + "grad_norm": 0.27285629510879517, + "learning_rate": 4.9870582521053416e-05, + "loss": 0.2128, + "step": 7241 + }, + { + "epoch": 0.12916919345057612, + "grad_norm": 0.24478517472743988, + "learning_rate": 4.987042430073512e-05, + "loss": 0.2281, + "step": 7242 + }, + { + "epoch": 0.1291870295722898, + "grad_norm": 0.3979249596595764, + "learning_rate": 4.987026598401047e-05, + "loss": 0.3359, + "step": 7243 + }, + { + "epoch": 0.1292048656940035, + "grad_norm": 0.2648313045501709, + "learning_rate": 4.987010757088007e-05, + "loss": 0.2329, + "step": 7244 + }, + { + "epoch": 0.12922270181571718, + "grad_norm": 0.27400505542755127, + "learning_rate": 4.986994906134455e-05, + "loss": 0.2202, + "step": 7245 + }, + { + "epoch": 0.12924053793743087, + "grad_norm": 0.2718895375728607, + "learning_rate": 4.986979045540452e-05, + "loss": 0.1965, + "step": 7246 + }, + { + "epoch": 0.1292583740591446, + "grad_norm": 0.2907407283782959, + "learning_rate": 4.9869631753060584e-05, + "loss": 0.2692, + "step": 7247 + }, + { + "epoch": 0.12927621018085828, + "grad_norm": 0.284167617559433, + "learning_rate": 4.9869472954313374e-05, + "loss": 0.2336, + "step": 7248 + }, + { + "epoch": 0.12929404630257196, + "grad_norm": 0.2803809642791748, + "learning_rate": 4.9869314059163495e-05, + "loss": 0.2546, + "step": 7249 + }, + { + "epoch": 0.12931188242428565, + "grad_norm": 0.3222905695438385, + "learning_rate": 4.986915506761156e-05, + "loss": 0.2377, + "step": 7250 + }, + { + "epoch": 0.12932971854599937, + "grad_norm": 0.28519296646118164, + "learning_rate": 4.9868995979658194e-05, + "loss": 0.2338, + "step": 7251 + }, + { + "epoch": 0.12934755466771305, + "grad_norm": 0.26038432121276855, + "learning_rate": 4.986883679530401e-05, + "loss": 0.2282, + "step": 7252 + }, + { + "epoch": 0.12936539078942674, + "grad_norm": 0.30088332295417786, + "learning_rate": 4.9868677514549635e-05, + "loss": 0.2655, + "step": 7253 + }, + { + "epoch": 0.12938322691114043, + "grad_norm": 0.2853420078754425, + "learning_rate": 4.9868518137395664e-05, + "loss": 0.2086, + "step": 7254 + }, + { + "epoch": 0.12940106303285415, + "grad_norm": 0.28434690833091736, + "learning_rate": 4.986835866384273e-05, + "loss": 0.2432, + "step": 7255 + }, + { + "epoch": 0.12941889915456783, + "grad_norm": 0.2156800478696823, + "learning_rate": 4.986819909389145e-05, + "loss": 0.1931, + "step": 7256 + }, + { + "epoch": 0.12943673527628152, + "grad_norm": 0.3005669414997101, + "learning_rate": 4.986803942754244e-05, + "loss": 0.2059, + "step": 7257 + }, + { + "epoch": 0.1294545713979952, + "grad_norm": 0.27690404653549194, + "learning_rate": 4.986787966479632e-05, + "loss": 0.2131, + "step": 7258 + }, + { + "epoch": 0.12947240751970893, + "grad_norm": 0.26980459690093994, + "learning_rate": 4.986771980565371e-05, + "loss": 0.2851, + "step": 7259 + }, + { + "epoch": 0.12949024364142261, + "grad_norm": 0.23820079863071442, + "learning_rate": 4.986755985011523e-05, + "loss": 0.1944, + "step": 7260 + }, + { + "epoch": 0.1295080797631363, + "grad_norm": 0.25404220819473267, + "learning_rate": 4.98673997981815e-05, + "loss": 0.2101, + "step": 7261 + }, + { + "epoch": 0.12952591588485, + "grad_norm": 0.2757260799407959, + "learning_rate": 4.986723964985314e-05, + "loss": 0.2174, + "step": 7262 + }, + { + "epoch": 0.1295437520065637, + "grad_norm": 0.28552815318107605, + "learning_rate": 4.986707940513077e-05, + "loss": 0.2209, + "step": 7263 + }, + { + "epoch": 0.1295615881282774, + "grad_norm": 0.3079882562160492, + "learning_rate": 4.9866919064015003e-05, + "loss": 0.254, + "step": 7264 + }, + { + "epoch": 0.12957942424999108, + "grad_norm": 0.365155965089798, + "learning_rate": 4.9866758626506476e-05, + "loss": 0.1941, + "step": 7265 + }, + { + "epoch": 0.12959726037170477, + "grad_norm": 0.22253163158893585, + "learning_rate": 4.98665980926058e-05, + "loss": 0.2079, + "step": 7266 + }, + { + "epoch": 0.12961509649341846, + "grad_norm": 0.2523843050003052, + "learning_rate": 4.986643746231361e-05, + "loss": 0.2262, + "step": 7267 + }, + { + "epoch": 0.12963293261513217, + "grad_norm": 0.29638391733169556, + "learning_rate": 4.986627673563051e-05, + "loss": 0.1368, + "step": 7268 + }, + { + "epoch": 0.12965076873684586, + "grad_norm": 0.2818562984466553, + "learning_rate": 4.986611591255714e-05, + "loss": 0.2181, + "step": 7269 + }, + { + "epoch": 0.12966860485855955, + "grad_norm": 0.29367595911026, + "learning_rate": 4.986595499309411e-05, + "loss": 0.2396, + "step": 7270 + }, + { + "epoch": 0.12968644098027324, + "grad_norm": 0.339464008808136, + "learning_rate": 4.986579397724205e-05, + "loss": 0.2391, + "step": 7271 + }, + { + "epoch": 0.12970427710198695, + "grad_norm": 0.27732816338539124, + "learning_rate": 4.986563286500159e-05, + "loss": 0.2258, + "step": 7272 + }, + { + "epoch": 0.12972211322370064, + "grad_norm": 0.2927684783935547, + "learning_rate": 4.9865471656373345e-05, + "loss": 0.2061, + "step": 7273 + }, + { + "epoch": 0.12973994934541433, + "grad_norm": 0.2760705053806305, + "learning_rate": 4.9865310351357946e-05, + "loss": 0.2343, + "step": 7274 + }, + { + "epoch": 0.12975778546712802, + "grad_norm": 0.2814476788043976, + "learning_rate": 4.986514894995602e-05, + "loss": 0.2495, + "step": 7275 + }, + { + "epoch": 0.12977562158884173, + "grad_norm": 0.2642858922481537, + "learning_rate": 4.986498745216818e-05, + "loss": 0.2121, + "step": 7276 + }, + { + "epoch": 0.12979345771055542, + "grad_norm": 0.23780542612075806, + "learning_rate": 4.9864825857995064e-05, + "loss": 0.2167, + "step": 7277 + }, + { + "epoch": 0.1298112938322691, + "grad_norm": 0.301698237657547, + "learning_rate": 4.986466416743729e-05, + "loss": 0.2434, + "step": 7278 + }, + { + "epoch": 0.1298291299539828, + "grad_norm": 0.3107871413230896, + "learning_rate": 4.9864502380495496e-05, + "loss": 0.2982, + "step": 7279 + }, + { + "epoch": 0.1298469660756965, + "grad_norm": 0.22600796818733215, + "learning_rate": 4.9864340497170304e-05, + "loss": 0.2023, + "step": 7280 + }, + { + "epoch": 0.1298648021974102, + "grad_norm": 0.2335529625415802, + "learning_rate": 4.986417851746234e-05, + "loss": 0.224, + "step": 7281 + }, + { + "epoch": 0.1298826383191239, + "grad_norm": 0.24890993535518646, + "learning_rate": 4.986401644137223e-05, + "loss": 0.2266, + "step": 7282 + }, + { + "epoch": 0.12990047444083758, + "grad_norm": 0.2737363576889038, + "learning_rate": 4.98638542689006e-05, + "loss": 0.2667, + "step": 7283 + }, + { + "epoch": 0.1299183105625513, + "grad_norm": 0.20195892453193665, + "learning_rate": 4.986369200004809e-05, + "loss": 0.2062, + "step": 7284 + }, + { + "epoch": 0.12993614668426498, + "grad_norm": 0.2856735587120056, + "learning_rate": 4.986352963481532e-05, + "loss": 0.234, + "step": 7285 + }, + { + "epoch": 0.12995398280597867, + "grad_norm": 0.25961220264434814, + "learning_rate": 4.986336717320292e-05, + "loss": 0.2041, + "step": 7286 + }, + { + "epoch": 0.12997181892769236, + "grad_norm": 0.25032082200050354, + "learning_rate": 4.986320461521152e-05, + "loss": 0.2153, + "step": 7287 + }, + { + "epoch": 0.12998965504940604, + "grad_norm": 0.4232363700866699, + "learning_rate": 4.986304196084176e-05, + "loss": 0.2961, + "step": 7288 + }, + { + "epoch": 0.13000749117111976, + "grad_norm": 0.25651246309280396, + "learning_rate": 4.9862879210094247e-05, + "loss": 0.1977, + "step": 7289 + }, + { + "epoch": 0.13002532729283345, + "grad_norm": 0.33551570773124695, + "learning_rate": 4.986271636296964e-05, + "loss": 0.2081, + "step": 7290 + }, + { + "epoch": 0.13004316341454714, + "grad_norm": 0.30205658078193665, + "learning_rate": 4.986255341946855e-05, + "loss": 0.2803, + "step": 7291 + }, + { + "epoch": 0.13006099953626082, + "grad_norm": 0.3281456232070923, + "learning_rate": 4.986239037959162e-05, + "loss": 0.2086, + "step": 7292 + }, + { + "epoch": 0.13007883565797454, + "grad_norm": 0.34779486060142517, + "learning_rate": 4.986222724333948e-05, + "loss": 0.2819, + "step": 7293 + }, + { + "epoch": 0.13009667177968823, + "grad_norm": 0.3936409652233124, + "learning_rate": 4.986206401071275e-05, + "loss": 0.2155, + "step": 7294 + }, + { + "epoch": 0.13011450790140192, + "grad_norm": 0.2572975158691406, + "learning_rate": 4.986190068171208e-05, + "loss": 0.1798, + "step": 7295 + }, + { + "epoch": 0.1301323440231156, + "grad_norm": 0.29663142561912537, + "learning_rate": 4.9861737256338094e-05, + "loss": 0.2346, + "step": 7296 + }, + { + "epoch": 0.13015018014482932, + "grad_norm": 0.24904395639896393, + "learning_rate": 4.986157373459143e-05, + "loss": 0.2422, + "step": 7297 + }, + { + "epoch": 0.130168016266543, + "grad_norm": 0.28781792521476746, + "learning_rate": 4.986141011647272e-05, + "loss": 0.193, + "step": 7298 + }, + { + "epoch": 0.1301858523882567, + "grad_norm": 0.3377923369407654, + "learning_rate": 4.9861246401982594e-05, + "loss": 0.299, + "step": 7299 + }, + { + "epoch": 0.13020368850997038, + "grad_norm": 0.2569024860858917, + "learning_rate": 4.986108259112169e-05, + "loss": 0.2206, + "step": 7300 + }, + { + "epoch": 0.1302215246316841, + "grad_norm": 0.24787509441375732, + "learning_rate": 4.9860918683890646e-05, + "loss": 0.213, + "step": 7301 + }, + { + "epoch": 0.1302393607533978, + "grad_norm": 0.3531358242034912, + "learning_rate": 4.986075468029009e-05, + "loss": 0.2745, + "step": 7302 + }, + { + "epoch": 0.13025719687511148, + "grad_norm": 0.30264678597450256, + "learning_rate": 4.9860590580320664e-05, + "loss": 0.2203, + "step": 7303 + }, + { + "epoch": 0.13027503299682516, + "grad_norm": 0.33674877882003784, + "learning_rate": 4.9860426383983006e-05, + "loss": 0.2484, + "step": 7304 + }, + { + "epoch": 0.13029286911853888, + "grad_norm": 0.3663444519042969, + "learning_rate": 4.9860262091277745e-05, + "loss": 0.2497, + "step": 7305 + }, + { + "epoch": 0.13031070524025257, + "grad_norm": 0.3186067044734955, + "learning_rate": 4.986009770220552e-05, + "loss": 0.2653, + "step": 7306 + }, + { + "epoch": 0.13032854136196625, + "grad_norm": 0.243214949965477, + "learning_rate": 4.985993321676698e-05, + "loss": 0.1699, + "step": 7307 + }, + { + "epoch": 0.13034637748367994, + "grad_norm": 0.3463163673877716, + "learning_rate": 4.9859768634962744e-05, + "loss": 0.2308, + "step": 7308 + }, + { + "epoch": 0.13036421360539363, + "grad_norm": 0.3058823049068451, + "learning_rate": 4.9859603956793456e-05, + "loss": 0.2483, + "step": 7309 + }, + { + "epoch": 0.13038204972710735, + "grad_norm": 0.2753719985485077, + "learning_rate": 4.985943918225976e-05, + "loss": 0.2232, + "step": 7310 + }, + { + "epoch": 0.13039988584882103, + "grad_norm": 0.30290111899375916, + "learning_rate": 4.985927431136229e-05, + "loss": 0.2378, + "step": 7311 + }, + { + "epoch": 0.13041772197053472, + "grad_norm": 0.23741087317466736, + "learning_rate": 4.9859109344101697e-05, + "loss": 0.1981, + "step": 7312 + }, + { + "epoch": 0.1304355580922484, + "grad_norm": 0.30447036027908325, + "learning_rate": 4.98589442804786e-05, + "loss": 0.2011, + "step": 7313 + }, + { + "epoch": 0.13045339421396213, + "grad_norm": 0.3168881833553314, + "learning_rate": 4.985877912049365e-05, + "loss": 0.2594, + "step": 7314 + }, + { + "epoch": 0.13047123033567581, + "grad_norm": 0.21747729182243347, + "learning_rate": 4.9858613864147485e-05, + "loss": 0.1846, + "step": 7315 + }, + { + "epoch": 0.1304890664573895, + "grad_norm": 0.23899520933628082, + "learning_rate": 4.9858448511440745e-05, + "loss": 0.2008, + "step": 7316 + }, + { + "epoch": 0.1305069025791032, + "grad_norm": 0.3052922189235687, + "learning_rate": 4.985828306237409e-05, + "loss": 0.2549, + "step": 7317 + }, + { + "epoch": 0.1305247387008169, + "grad_norm": 0.2646942138671875, + "learning_rate": 4.985811751694812e-05, + "loss": 0.2114, + "step": 7318 + }, + { + "epoch": 0.1305425748225306, + "grad_norm": 0.24101893603801727, + "learning_rate": 4.985795187516351e-05, + "loss": 0.2271, + "step": 7319 + }, + { + "epoch": 0.13056041094424428, + "grad_norm": 0.2978793680667877, + "learning_rate": 4.98577861370209e-05, + "loss": 0.2594, + "step": 7320 + }, + { + "epoch": 0.13057824706595797, + "grad_norm": 0.3433240056037903, + "learning_rate": 4.985762030252092e-05, + "loss": 0.2729, + "step": 7321 + }, + { + "epoch": 0.13059608318767169, + "grad_norm": 0.27056026458740234, + "learning_rate": 4.9857454371664216e-05, + "loss": 0.2169, + "step": 7322 + }, + { + "epoch": 0.13061391930938537, + "grad_norm": 0.24161820113658905, + "learning_rate": 4.985728834445144e-05, + "loss": 0.2506, + "step": 7323 + }, + { + "epoch": 0.13063175543109906, + "grad_norm": 0.381404310464859, + "learning_rate": 4.985712222088322e-05, + "loss": 0.322, + "step": 7324 + }, + { + "epoch": 0.13064959155281275, + "grad_norm": 0.35411256551742554, + "learning_rate": 4.985695600096022e-05, + "loss": 0.2483, + "step": 7325 + }, + { + "epoch": 0.13066742767452644, + "grad_norm": 0.3423224091529846, + "learning_rate": 4.985678968468306e-05, + "loss": 0.2482, + "step": 7326 + }, + { + "epoch": 0.13068526379624015, + "grad_norm": 0.24416756629943848, + "learning_rate": 4.98566232720524e-05, + "loss": 0.2548, + "step": 7327 + }, + { + "epoch": 0.13070309991795384, + "grad_norm": 0.3216147720813751, + "learning_rate": 4.9856456763068895e-05, + "loss": 0.2331, + "step": 7328 + }, + { + "epoch": 0.13072093603966753, + "grad_norm": 0.32868513464927673, + "learning_rate": 4.9856290157733166e-05, + "loss": 0.2664, + "step": 7329 + }, + { + "epoch": 0.13073877216138122, + "grad_norm": 0.2403426617383957, + "learning_rate": 4.985612345604588e-05, + "loss": 0.2157, + "step": 7330 + }, + { + "epoch": 0.13075660828309493, + "grad_norm": 0.3066485822200775, + "learning_rate": 4.985595665800767e-05, + "loss": 0.2603, + "step": 7331 + }, + { + "epoch": 0.13077444440480862, + "grad_norm": 0.33103078603744507, + "learning_rate": 4.985578976361919e-05, + "loss": 0.2256, + "step": 7332 + }, + { + "epoch": 0.1307922805265223, + "grad_norm": 0.2382347583770752, + "learning_rate": 4.985562277288108e-05, + "loss": 0.2273, + "step": 7333 + }, + { + "epoch": 0.130810116648236, + "grad_norm": 0.2508715093135834, + "learning_rate": 4.985545568579399e-05, + "loss": 0.2027, + "step": 7334 + }, + { + "epoch": 0.1308279527699497, + "grad_norm": 0.2651994824409485, + "learning_rate": 4.985528850235857e-05, + "loss": 0.2797, + "step": 7335 + }, + { + "epoch": 0.1308457888916634, + "grad_norm": 0.442849338054657, + "learning_rate": 4.985512122257547e-05, + "loss": 0.2864, + "step": 7336 + }, + { + "epoch": 0.1308636250133771, + "grad_norm": 0.39108017086982727, + "learning_rate": 4.985495384644534e-05, + "loss": 0.1999, + "step": 7337 + }, + { + "epoch": 0.13088146113509078, + "grad_norm": 0.36775991320610046, + "learning_rate": 4.985478637396881e-05, + "loss": 0.2797, + "step": 7338 + }, + { + "epoch": 0.1308992972568045, + "grad_norm": 0.25715216994285583, + "learning_rate": 4.9854618805146556e-05, + "loss": 0.2472, + "step": 7339 + }, + { + "epoch": 0.13091713337851818, + "grad_norm": 0.2287001609802246, + "learning_rate": 4.985445113997921e-05, + "loss": 0.2388, + "step": 7340 + }, + { + "epoch": 0.13093496950023187, + "grad_norm": 0.21074619889259338, + "learning_rate": 4.985428337846743e-05, + "loss": 0.2302, + "step": 7341 + }, + { + "epoch": 0.13095280562194556, + "grad_norm": 0.29683929681777954, + "learning_rate": 4.9854115520611856e-05, + "loss": 0.1928, + "step": 7342 + }, + { + "epoch": 0.13097064174365927, + "grad_norm": 0.208047017455101, + "learning_rate": 4.985394756641315e-05, + "loss": 0.2079, + "step": 7343 + }, + { + "epoch": 0.13098847786537296, + "grad_norm": 0.19809278845787048, + "learning_rate": 4.985377951587196e-05, + "loss": 0.1892, + "step": 7344 + }, + { + "epoch": 0.13100631398708665, + "grad_norm": 0.3061544895172119, + "learning_rate": 4.985361136898894e-05, + "loss": 0.2173, + "step": 7345 + }, + { + "epoch": 0.13102415010880034, + "grad_norm": 0.30910956859588623, + "learning_rate": 4.985344312576473e-05, + "loss": 0.2203, + "step": 7346 + }, + { + "epoch": 0.13104198623051402, + "grad_norm": 0.28544309735298157, + "learning_rate": 4.98532747862e-05, + "loss": 0.232, + "step": 7347 + }, + { + "epoch": 0.13105982235222774, + "grad_norm": 0.30717405676841736, + "learning_rate": 4.985310635029538e-05, + "loss": 0.219, + "step": 7348 + }, + { + "epoch": 0.13107765847394143, + "grad_norm": 0.3384716212749481, + "learning_rate": 4.9852937818051545e-05, + "loss": 0.2279, + "step": 7349 + }, + { + "epoch": 0.13109549459565512, + "grad_norm": 0.23273895680904388, + "learning_rate": 4.985276918946914e-05, + "loss": 0.2464, + "step": 7350 + }, + { + "epoch": 0.1311133307173688, + "grad_norm": 0.22510305047035217, + "learning_rate": 4.9852600464548814e-05, + "loss": 0.1801, + "step": 7351 + }, + { + "epoch": 0.13113116683908252, + "grad_norm": 0.2647697329521179, + "learning_rate": 4.985243164329123e-05, + "loss": 0.2144, + "step": 7352 + }, + { + "epoch": 0.1311490029607962, + "grad_norm": 0.24195782840251923, + "learning_rate": 4.985226272569703e-05, + "loss": 0.2408, + "step": 7353 + }, + { + "epoch": 0.1311668390825099, + "grad_norm": 0.2554917633533478, + "learning_rate": 4.985209371176688e-05, + "loss": 0.2075, + "step": 7354 + }, + { + "epoch": 0.13118467520422358, + "grad_norm": 0.35380035638809204, + "learning_rate": 4.9851924601501435e-05, + "loss": 0.2639, + "step": 7355 + }, + { + "epoch": 0.1312025113259373, + "grad_norm": 0.2879059910774231, + "learning_rate": 4.985175539490134e-05, + "loss": 0.2594, + "step": 7356 + }, + { + "epoch": 0.131220347447651, + "grad_norm": 0.3154323697090149, + "learning_rate": 4.9851586091967264e-05, + "loss": 0.1967, + "step": 7357 + }, + { + "epoch": 0.13123818356936467, + "grad_norm": 0.34300410747528076, + "learning_rate": 4.985141669269985e-05, + "loss": 0.2231, + "step": 7358 + }, + { + "epoch": 0.13125601969107836, + "grad_norm": 0.2851123809814453, + "learning_rate": 4.985124719709976e-05, + "loss": 0.1936, + "step": 7359 + }, + { + "epoch": 0.13127385581279208, + "grad_norm": 0.26627442240715027, + "learning_rate": 4.9851077605167654e-05, + "loss": 0.2074, + "step": 7360 + }, + { + "epoch": 0.13129169193450577, + "grad_norm": 0.2769263982772827, + "learning_rate": 4.9850907916904193e-05, + "loss": 0.1697, + "step": 7361 + }, + { + "epoch": 0.13130952805621945, + "grad_norm": 0.3018946051597595, + "learning_rate": 4.985073813231003e-05, + "loss": 0.2341, + "step": 7362 + }, + { + "epoch": 0.13132736417793314, + "grad_norm": 0.3470059335231781, + "learning_rate": 4.985056825138582e-05, + "loss": 0.2934, + "step": 7363 + }, + { + "epoch": 0.13134520029964686, + "grad_norm": 0.25130894780158997, + "learning_rate": 4.985039827413222e-05, + "loss": 0.2158, + "step": 7364 + }, + { + "epoch": 0.13136303642136055, + "grad_norm": 0.2852213382720947, + "learning_rate": 4.98502282005499e-05, + "loss": 0.2334, + "step": 7365 + }, + { + "epoch": 0.13138087254307423, + "grad_norm": 0.30131983757019043, + "learning_rate": 4.985005803063951e-05, + "loss": 0.2024, + "step": 7366 + }, + { + "epoch": 0.13139870866478792, + "grad_norm": 0.32201939821243286, + "learning_rate": 4.9849887764401715e-05, + "loss": 0.2665, + "step": 7367 + }, + { + "epoch": 0.1314165447865016, + "grad_norm": 0.31907594203948975, + "learning_rate": 4.9849717401837165e-05, + "loss": 0.2239, + "step": 7368 + }, + { + "epoch": 0.13143438090821533, + "grad_norm": 0.23806947469711304, + "learning_rate": 4.984954694294654e-05, + "loss": 0.2133, + "step": 7369 + }, + { + "epoch": 0.131452217029929, + "grad_norm": 0.29021796584129333, + "learning_rate": 4.9849376387730475e-05, + "loss": 0.2284, + "step": 7370 + }, + { + "epoch": 0.1314700531516427, + "grad_norm": 0.2276151478290558, + "learning_rate": 4.9849205736189644e-05, + "loss": 0.1933, + "step": 7371 + }, + { + "epoch": 0.1314878892733564, + "grad_norm": 0.3081069886684418, + "learning_rate": 4.9849034988324714e-05, + "loss": 0.2603, + "step": 7372 + }, + { + "epoch": 0.1315057253950701, + "grad_norm": 0.2812741696834564, + "learning_rate": 4.984886414413634e-05, + "loss": 0.2767, + "step": 7373 + }, + { + "epoch": 0.1315235615167838, + "grad_norm": 0.2832390069961548, + "learning_rate": 4.98486932036252e-05, + "loss": 0.2416, + "step": 7374 + }, + { + "epoch": 0.13154139763849748, + "grad_norm": 0.2827257812023163, + "learning_rate": 4.984852216679192e-05, + "loss": 0.244, + "step": 7375 + }, + { + "epoch": 0.13155923376021117, + "grad_norm": 0.29743492603302, + "learning_rate": 4.9848351033637204e-05, + "loss": 0.2342, + "step": 7376 + }, + { + "epoch": 0.13157706988192489, + "grad_norm": 0.21614915132522583, + "learning_rate": 4.984817980416169e-05, + "loss": 0.2135, + "step": 7377 + }, + { + "epoch": 0.13159490600363857, + "grad_norm": 0.31758689880371094, + "learning_rate": 4.984800847836605e-05, + "loss": 0.2247, + "step": 7378 + }, + { + "epoch": 0.13161274212535226, + "grad_norm": 0.27685463428497314, + "learning_rate": 4.984783705625094e-05, + "loss": 0.2143, + "step": 7379 + }, + { + "epoch": 0.13163057824706595, + "grad_norm": 0.2874879240989685, + "learning_rate": 4.9847665537817036e-05, + "loss": 0.2186, + "step": 7380 + }, + { + "epoch": 0.13164841436877966, + "grad_norm": 0.21807777881622314, + "learning_rate": 4.9847493923065004e-05, + "loss": 0.2182, + "step": 7381 + }, + { + "epoch": 0.13166625049049335, + "grad_norm": 0.2918378710746765, + "learning_rate": 4.9847322211995494e-05, + "loss": 0.2186, + "step": 7382 + }, + { + "epoch": 0.13168408661220704, + "grad_norm": 0.36302876472473145, + "learning_rate": 4.984715040460919e-05, + "loss": 0.2428, + "step": 7383 + }, + { + "epoch": 0.13170192273392073, + "grad_norm": 0.2821221947669983, + "learning_rate": 4.984697850090674e-05, + "loss": 0.2846, + "step": 7384 + }, + { + "epoch": 0.13171975885563444, + "grad_norm": 0.348736435174942, + "learning_rate": 4.984680650088883e-05, + "loss": 0.2693, + "step": 7385 + }, + { + "epoch": 0.13173759497734813, + "grad_norm": 0.2931061089038849, + "learning_rate": 4.9846634404556106e-05, + "loss": 0.2384, + "step": 7386 + }, + { + "epoch": 0.13175543109906182, + "grad_norm": 0.23608984053134918, + "learning_rate": 4.984646221190925e-05, + "loss": 0.1999, + "step": 7387 + }, + { + "epoch": 0.1317732672207755, + "grad_norm": 0.27440783381462097, + "learning_rate": 4.9846289922948926e-05, + "loss": 0.2103, + "step": 7388 + }, + { + "epoch": 0.1317911033424892, + "grad_norm": 0.4285084009170532, + "learning_rate": 4.98461175376758e-05, + "loss": 0.2412, + "step": 7389 + }, + { + "epoch": 0.1318089394642029, + "grad_norm": 0.2917060852050781, + "learning_rate": 4.9845945056090535e-05, + "loss": 0.2005, + "step": 7390 + }, + { + "epoch": 0.1318267755859166, + "grad_norm": 0.35007357597351074, + "learning_rate": 4.984577247819381e-05, + "loss": 0.2758, + "step": 7391 + }, + { + "epoch": 0.1318446117076303, + "grad_norm": 0.3258071839809418, + "learning_rate": 4.984559980398629e-05, + "loss": 0.2322, + "step": 7392 + }, + { + "epoch": 0.13186244782934398, + "grad_norm": 0.3649417459964752, + "learning_rate": 4.984542703346865e-05, + "loss": 0.219, + "step": 7393 + }, + { + "epoch": 0.1318802839510577, + "grad_norm": 0.31614258885383606, + "learning_rate": 4.9845254166641545e-05, + "loss": 0.2399, + "step": 7394 + }, + { + "epoch": 0.13189812007277138, + "grad_norm": 0.2513994574546814, + "learning_rate": 4.984508120350566e-05, + "loss": 0.2293, + "step": 7395 + }, + { + "epoch": 0.13191595619448507, + "grad_norm": 0.31215736269950867, + "learning_rate": 4.984490814406165e-05, + "loss": 0.2178, + "step": 7396 + }, + { + "epoch": 0.13193379231619876, + "grad_norm": 0.2803609371185303, + "learning_rate": 4.9844734988310196e-05, + "loss": 0.1633, + "step": 7397 + }, + { + "epoch": 0.13195162843791247, + "grad_norm": 0.4034261405467987, + "learning_rate": 4.9844561736251975e-05, + "loss": 0.3008, + "step": 7398 + }, + { + "epoch": 0.13196946455962616, + "grad_norm": 0.2939809560775757, + "learning_rate": 4.984438838788765e-05, + "loss": 0.2614, + "step": 7399 + }, + { + "epoch": 0.13198730068133985, + "grad_norm": 0.29928505420684814, + "learning_rate": 4.9844214943217893e-05, + "loss": 0.2184, + "step": 7400 + }, + { + "epoch": 0.13200513680305354, + "grad_norm": 0.2603811025619507, + "learning_rate": 4.984404140224338e-05, + "loss": 0.2246, + "step": 7401 + }, + { + "epoch": 0.13202297292476725, + "grad_norm": 0.34333038330078125, + "learning_rate": 4.984386776496478e-05, + "loss": 0.2713, + "step": 7402 + }, + { + "epoch": 0.13204080904648094, + "grad_norm": 0.23444755375385284, + "learning_rate": 4.984369403138276e-05, + "loss": 0.2139, + "step": 7403 + }, + { + "epoch": 0.13205864516819463, + "grad_norm": 0.2562881112098694, + "learning_rate": 4.984352020149801e-05, + "loss": 0.2343, + "step": 7404 + }, + { + "epoch": 0.13207648128990832, + "grad_norm": 0.21762588620185852, + "learning_rate": 4.98433462753112e-05, + "loss": 0.2157, + "step": 7405 + }, + { + "epoch": 0.132094317411622, + "grad_norm": 0.23702339828014374, + "learning_rate": 4.984317225282299e-05, + "loss": 0.1997, + "step": 7406 + }, + { + "epoch": 0.13211215353333572, + "grad_norm": 0.2891445457935333, + "learning_rate": 4.984299813403407e-05, + "loss": 0.2365, + "step": 7407 + }, + { + "epoch": 0.1321299896550494, + "grad_norm": 0.2948136031627655, + "learning_rate": 4.98428239189451e-05, + "loss": 0.2539, + "step": 7408 + }, + { + "epoch": 0.1321478257767631, + "grad_norm": 0.4793187975883484, + "learning_rate": 4.984264960755677e-05, + "loss": 0.2017, + "step": 7409 + }, + { + "epoch": 0.13216566189847678, + "grad_norm": 0.2965013086795807, + "learning_rate": 4.984247519986975e-05, + "loss": 0.2459, + "step": 7410 + }, + { + "epoch": 0.1321834980201905, + "grad_norm": 0.3114519417285919, + "learning_rate": 4.984230069588471e-05, + "loss": 0.2457, + "step": 7411 + }, + { + "epoch": 0.1322013341419042, + "grad_norm": 0.2890632152557373, + "learning_rate": 4.9842126095602345e-05, + "loss": 0.236, + "step": 7412 + }, + { + "epoch": 0.13221917026361787, + "grad_norm": 0.2622167468070984, + "learning_rate": 4.984195139902331e-05, + "loss": 0.2601, + "step": 7413 + }, + { + "epoch": 0.13223700638533156, + "grad_norm": 0.355807900428772, + "learning_rate": 4.984177660614829e-05, + "loss": 0.3013, + "step": 7414 + }, + { + "epoch": 0.13225484250704528, + "grad_norm": 0.38276156783103943, + "learning_rate": 4.984160171697797e-05, + "loss": 0.3612, + "step": 7415 + }, + { + "epoch": 0.13227267862875897, + "grad_norm": 0.3555077016353607, + "learning_rate": 4.984142673151302e-05, + "loss": 0.2059, + "step": 7416 + }, + { + "epoch": 0.13229051475047265, + "grad_norm": 0.19492195546627045, + "learning_rate": 4.9841251649754115e-05, + "loss": 0.2003, + "step": 7417 + }, + { + "epoch": 0.13230835087218634, + "grad_norm": 0.23308365046977997, + "learning_rate": 4.984107647170194e-05, + "loss": 0.2407, + "step": 7418 + }, + { + "epoch": 0.13232618699390006, + "grad_norm": 0.4899159371852875, + "learning_rate": 4.9840901197357174e-05, + "loss": 0.2352, + "step": 7419 + }, + { + "epoch": 0.13234402311561375, + "grad_norm": 0.24222806096076965, + "learning_rate": 4.9840725826720495e-05, + "loss": 0.2122, + "step": 7420 + }, + { + "epoch": 0.13236185923732743, + "grad_norm": 0.31503376364707947, + "learning_rate": 4.984055035979258e-05, + "loss": 0.2196, + "step": 7421 + }, + { + "epoch": 0.13237969535904112, + "grad_norm": 0.3184950649738312, + "learning_rate": 4.984037479657412e-05, + "loss": 0.2498, + "step": 7422 + }, + { + "epoch": 0.13239753148075484, + "grad_norm": 0.3167721927165985, + "learning_rate": 4.984019913706578e-05, + "loss": 0.2236, + "step": 7423 + }, + { + "epoch": 0.13241536760246853, + "grad_norm": 0.26916879415512085, + "learning_rate": 4.984002338126826e-05, + "loss": 0.1883, + "step": 7424 + }, + { + "epoch": 0.1324332037241822, + "grad_norm": 0.38445138931274414, + "learning_rate": 4.983984752918221e-05, + "loss": 0.2736, + "step": 7425 + }, + { + "epoch": 0.1324510398458959, + "grad_norm": 0.22318536043167114, + "learning_rate": 4.9839671580808355e-05, + "loss": 0.202, + "step": 7426 + }, + { + "epoch": 0.1324688759676096, + "grad_norm": 0.30396726727485657, + "learning_rate": 4.983949553614734e-05, + "loss": 0.2405, + "step": 7427 + }, + { + "epoch": 0.1324867120893233, + "grad_norm": 0.3115949034690857, + "learning_rate": 4.9839319395199865e-05, + "loss": 0.2934, + "step": 7428 + }, + { + "epoch": 0.132504548211037, + "grad_norm": 0.2768704295158386, + "learning_rate": 4.98391431579666e-05, + "loss": 0.2351, + "step": 7429 + }, + { + "epoch": 0.13252238433275068, + "grad_norm": 0.2526929974555969, + "learning_rate": 4.983896682444825e-05, + "loss": 0.1906, + "step": 7430 + }, + { + "epoch": 0.13254022045446437, + "grad_norm": 0.28604450821876526, + "learning_rate": 4.983879039464548e-05, + "loss": 0.2842, + "step": 7431 + }, + { + "epoch": 0.13255805657617808, + "grad_norm": 0.2940399646759033, + "learning_rate": 4.983861386855898e-05, + "loss": 0.2227, + "step": 7432 + }, + { + "epoch": 0.13257589269789177, + "grad_norm": 0.3287406265735626, + "learning_rate": 4.983843724618943e-05, + "loss": 0.2872, + "step": 7433 + }, + { + "epoch": 0.13259372881960546, + "grad_norm": 0.41262704133987427, + "learning_rate": 4.9838260527537524e-05, + "loss": 0.2893, + "step": 7434 + }, + { + "epoch": 0.13261156494131915, + "grad_norm": 0.27647820115089417, + "learning_rate": 4.983808371260393e-05, + "loss": 0.2182, + "step": 7435 + }, + { + "epoch": 0.13262940106303286, + "grad_norm": 0.2574402987957001, + "learning_rate": 4.983790680138935e-05, + "loss": 0.1768, + "step": 7436 + }, + { + "epoch": 0.13264723718474655, + "grad_norm": 0.2551569640636444, + "learning_rate": 4.9837729793894476e-05, + "loss": 0.1964, + "step": 7437 + }, + { + "epoch": 0.13266507330646024, + "grad_norm": 0.2723551094532013, + "learning_rate": 4.983755269011998e-05, + "loss": 0.2523, + "step": 7438 + }, + { + "epoch": 0.13268290942817393, + "grad_norm": 0.32283326983451843, + "learning_rate": 4.983737549006654e-05, + "loss": 0.2537, + "step": 7439 + }, + { + "epoch": 0.13270074554988764, + "grad_norm": 0.23678213357925415, + "learning_rate": 4.983719819373486e-05, + "loss": 0.2307, + "step": 7440 + }, + { + "epoch": 0.13271858167160133, + "grad_norm": 0.31286126375198364, + "learning_rate": 4.9837020801125624e-05, + "loss": 0.1984, + "step": 7441 + }, + { + "epoch": 0.13273641779331502, + "grad_norm": 0.37980055809020996, + "learning_rate": 4.9836843312239514e-05, + "loss": 0.245, + "step": 7442 + }, + { + "epoch": 0.1327542539150287, + "grad_norm": 0.2871837019920349, + "learning_rate": 4.983666572707721e-05, + "loss": 0.2573, + "step": 7443 + }, + { + "epoch": 0.13277209003674242, + "grad_norm": 0.3354865312576294, + "learning_rate": 4.9836488045639426e-05, + "loss": 0.2576, + "step": 7444 + }, + { + "epoch": 0.1327899261584561, + "grad_norm": 0.2894785404205322, + "learning_rate": 4.983631026792683e-05, + "loss": 0.228, + "step": 7445 + }, + { + "epoch": 0.1328077622801698, + "grad_norm": 0.29659420251846313, + "learning_rate": 4.9836132393940126e-05, + "loss": 0.2526, + "step": 7446 + }, + { + "epoch": 0.1328255984018835, + "grad_norm": 0.45775067806243896, + "learning_rate": 4.983595442367999e-05, + "loss": 0.2359, + "step": 7447 + }, + { + "epoch": 0.13284343452359718, + "grad_norm": 0.2509719431400299, + "learning_rate": 4.9835776357147115e-05, + "loss": 0.2034, + "step": 7448 + }, + { + "epoch": 0.1328612706453109, + "grad_norm": 0.30304911732673645, + "learning_rate": 4.9835598194342185e-05, + "loss": 0.2317, + "step": 7449 + }, + { + "epoch": 0.13287910676702458, + "grad_norm": 0.35082000494003296, + "learning_rate": 4.983541993526591e-05, + "loss": 0.2846, + "step": 7450 + }, + { + "epoch": 0.13289694288873827, + "grad_norm": 0.33010074496269226, + "learning_rate": 4.9835241579918965e-05, + "loss": 0.197, + "step": 7451 + }, + { + "epoch": 0.13291477901045196, + "grad_norm": 0.33244559168815613, + "learning_rate": 4.9835063128302044e-05, + "loss": 0.2421, + "step": 7452 + }, + { + "epoch": 0.13293261513216567, + "grad_norm": 0.24155063927173615, + "learning_rate": 4.9834884580415845e-05, + "loss": 0.1957, + "step": 7453 + }, + { + "epoch": 0.13295045125387936, + "grad_norm": 0.2793540358543396, + "learning_rate": 4.983470593626105e-05, + "loss": 0.2112, + "step": 7454 + }, + { + "epoch": 0.13296828737559305, + "grad_norm": 0.24453821778297424, + "learning_rate": 4.983452719583837e-05, + "loss": 0.2217, + "step": 7455 + }, + { + "epoch": 0.13298612349730674, + "grad_norm": 0.34428685903549194, + "learning_rate": 4.9834348359148464e-05, + "loss": 0.2909, + "step": 7456 + }, + { + "epoch": 0.13300395961902045, + "grad_norm": 0.32234349846839905, + "learning_rate": 4.983416942619206e-05, + "loss": 0.2448, + "step": 7457 + }, + { + "epoch": 0.13302179574073414, + "grad_norm": 0.3137282431125641, + "learning_rate": 4.9833990396969834e-05, + "loss": 0.2445, + "step": 7458 + }, + { + "epoch": 0.13303963186244783, + "grad_norm": 0.36944276094436646, + "learning_rate": 4.983381127148249e-05, + "loss": 0.2005, + "step": 7459 + }, + { + "epoch": 0.13305746798416151, + "grad_norm": 0.29638609290122986, + "learning_rate": 4.983363204973071e-05, + "loss": 0.2449, + "step": 7460 + }, + { + "epoch": 0.13307530410587523, + "grad_norm": 0.23793640732765198, + "learning_rate": 4.98334527317152e-05, + "loss": 0.2209, + "step": 7461 + }, + { + "epoch": 0.13309314022758892, + "grad_norm": 0.34974244236946106, + "learning_rate": 4.9833273317436645e-05, + "loss": 0.2812, + "step": 7462 + }, + { + "epoch": 0.1331109763493026, + "grad_norm": 0.33365777134895325, + "learning_rate": 4.9833093806895745e-05, + "loss": 0.2071, + "step": 7463 + }, + { + "epoch": 0.1331288124710163, + "grad_norm": 0.3364517390727997, + "learning_rate": 4.98329142000932e-05, + "loss": 0.2148, + "step": 7464 + }, + { + "epoch": 0.13314664859273, + "grad_norm": 0.27121469378471375, + "learning_rate": 4.98327344970297e-05, + "loss": 0.1943, + "step": 7465 + }, + { + "epoch": 0.1331644847144437, + "grad_norm": 0.3351459205150604, + "learning_rate": 4.983255469770595e-05, + "loss": 0.2306, + "step": 7466 + }, + { + "epoch": 0.1331823208361574, + "grad_norm": 0.34545639157295227, + "learning_rate": 4.9832374802122626e-05, + "loss": 0.279, + "step": 7467 + }, + { + "epoch": 0.13320015695787107, + "grad_norm": 0.29428547620773315, + "learning_rate": 4.983219481028045e-05, + "loss": 0.2129, + "step": 7468 + }, + { + "epoch": 0.13321799307958476, + "grad_norm": 0.24164363741874695, + "learning_rate": 4.983201472218011e-05, + "loss": 0.2321, + "step": 7469 + }, + { + "epoch": 0.13323582920129848, + "grad_norm": 0.25894299149513245, + "learning_rate": 4.983183453782231e-05, + "loss": 0.2383, + "step": 7470 + }, + { + "epoch": 0.13325366532301217, + "grad_norm": 0.28912675380706787, + "learning_rate": 4.983165425720774e-05, + "loss": 0.2316, + "step": 7471 + }, + { + "epoch": 0.13327150144472585, + "grad_norm": 0.3120078146457672, + "learning_rate": 4.9831473880337095e-05, + "loss": 0.2464, + "step": 7472 + }, + { + "epoch": 0.13328933756643954, + "grad_norm": 0.2935456931591034, + "learning_rate": 4.983129340721109e-05, + "loss": 0.2744, + "step": 7473 + }, + { + "epoch": 0.13330717368815326, + "grad_norm": 0.2654741108417511, + "learning_rate": 4.9831112837830406e-05, + "loss": 0.1975, + "step": 7474 + }, + { + "epoch": 0.13332500980986695, + "grad_norm": 0.26337653398513794, + "learning_rate": 4.983093217219575e-05, + "loss": 0.2111, + "step": 7475 + }, + { + "epoch": 0.13334284593158063, + "grad_norm": 0.2705751359462738, + "learning_rate": 4.983075141030784e-05, + "loss": 0.219, + "step": 7476 + }, + { + "epoch": 0.13336068205329432, + "grad_norm": 0.26234325766563416, + "learning_rate": 4.9830570552167356e-05, + "loss": 0.2491, + "step": 7477 + }, + { + "epoch": 0.13337851817500804, + "grad_norm": 0.31031814217567444, + "learning_rate": 4.9830389597774996e-05, + "loss": 0.2297, + "step": 7478 + }, + { + "epoch": 0.13339635429672173, + "grad_norm": 0.23101654648780823, + "learning_rate": 4.983020854713147e-05, + "loss": 0.2085, + "step": 7479 + }, + { + "epoch": 0.1334141904184354, + "grad_norm": 0.34721609950065613, + "learning_rate": 4.983002740023749e-05, + "loss": 0.2502, + "step": 7480 + }, + { + "epoch": 0.1334320265401491, + "grad_norm": 0.2588844895362854, + "learning_rate": 4.982984615709374e-05, + "loss": 0.2182, + "step": 7481 + }, + { + "epoch": 0.13344986266186282, + "grad_norm": 0.3425954282283783, + "learning_rate": 4.982966481770093e-05, + "loss": 0.2143, + "step": 7482 + }, + { + "epoch": 0.1334676987835765, + "grad_norm": 0.297648549079895, + "learning_rate": 4.982948338205977e-05, + "loss": 0.2721, + "step": 7483 + }, + { + "epoch": 0.1334855349052902, + "grad_norm": 0.3153221011161804, + "learning_rate": 4.982930185017095e-05, + "loss": 0.2366, + "step": 7484 + }, + { + "epoch": 0.13350337102700388, + "grad_norm": 0.46999824047088623, + "learning_rate": 4.982912022203519e-05, + "loss": 0.2137, + "step": 7485 + }, + { + "epoch": 0.1335212071487176, + "grad_norm": 0.21449771523475647, + "learning_rate": 4.9828938497653165e-05, + "loss": 0.1918, + "step": 7486 + }, + { + "epoch": 0.13353904327043128, + "grad_norm": 0.27725961804389954, + "learning_rate": 4.9828756677025614e-05, + "loss": 0.2502, + "step": 7487 + }, + { + "epoch": 0.13355687939214497, + "grad_norm": 0.39225447177886963, + "learning_rate": 4.9828574760153227e-05, + "loss": 0.2417, + "step": 7488 + }, + { + "epoch": 0.13357471551385866, + "grad_norm": 0.22374741733074188, + "learning_rate": 4.98283927470367e-05, + "loss": 0.2059, + "step": 7489 + }, + { + "epoch": 0.13359255163557235, + "grad_norm": 0.29595109820365906, + "learning_rate": 4.982821063767675e-05, + "loss": 0.2066, + "step": 7490 + }, + { + "epoch": 0.13361038775728606, + "grad_norm": 0.2838314175605774, + "learning_rate": 4.982802843207408e-05, + "loss": 0.2573, + "step": 7491 + }, + { + "epoch": 0.13362822387899975, + "grad_norm": 0.18106932938098907, + "learning_rate": 4.982784613022941e-05, + "loss": 0.2278, + "step": 7492 + }, + { + "epoch": 0.13364606000071344, + "grad_norm": 0.33832937479019165, + "learning_rate": 4.9827663732143414e-05, + "loss": 0.2553, + "step": 7493 + }, + { + "epoch": 0.13366389612242713, + "grad_norm": 0.23699772357940674, + "learning_rate": 4.9827481237816824e-05, + "loss": 0.1986, + "step": 7494 + }, + { + "epoch": 0.13368173224414084, + "grad_norm": 0.30046558380126953, + "learning_rate": 4.9827298647250344e-05, + "loss": 0.2143, + "step": 7495 + }, + { + "epoch": 0.13369956836585453, + "grad_norm": 0.2665005624294281, + "learning_rate": 4.982711596044468e-05, + "loss": 0.2324, + "step": 7496 + }, + { + "epoch": 0.13371740448756822, + "grad_norm": 0.30308958888053894, + "learning_rate": 4.982693317740053e-05, + "loss": 0.2719, + "step": 7497 + }, + { + "epoch": 0.1337352406092819, + "grad_norm": 0.4038073420524597, + "learning_rate": 4.982675029811863e-05, + "loss": 0.3033, + "step": 7498 + }, + { + "epoch": 0.13375307673099562, + "grad_norm": 0.24031807482242584, + "learning_rate": 4.982656732259966e-05, + "loss": 0.1977, + "step": 7499 + }, + { + "epoch": 0.1337709128527093, + "grad_norm": 0.3382944166660309, + "learning_rate": 4.9826384250844346e-05, + "loss": 0.2559, + "step": 7500 + }, + { + "epoch": 0.133788748974423, + "grad_norm": 0.3220905363559723, + "learning_rate": 4.9826201082853385e-05, + "loss": 0.2957, + "step": 7501 + }, + { + "epoch": 0.1338065850961367, + "grad_norm": 0.3390187919139862, + "learning_rate": 4.9826017818627494e-05, + "loss": 0.2495, + "step": 7502 + }, + { + "epoch": 0.1338244212178504, + "grad_norm": 0.2769834101200104, + "learning_rate": 4.9825834458167385e-05, + "loss": 0.2327, + "step": 7503 + }, + { + "epoch": 0.1338422573395641, + "grad_norm": 0.36122724413871765, + "learning_rate": 4.9825651001473775e-05, + "loss": 0.3026, + "step": 7504 + }, + { + "epoch": 0.13386009346127778, + "grad_norm": 0.25237777829170227, + "learning_rate": 4.982546744854736e-05, + "loss": 0.2315, + "step": 7505 + }, + { + "epoch": 0.13387792958299147, + "grad_norm": 0.37935492396354675, + "learning_rate": 4.9825283799388854e-05, + "loss": 0.2855, + "step": 7506 + }, + { + "epoch": 0.13389576570470516, + "grad_norm": 0.29708775877952576, + "learning_rate": 4.982510005399897e-05, + "loss": 0.2491, + "step": 7507 + }, + { + "epoch": 0.13391360182641887, + "grad_norm": 0.4350215792655945, + "learning_rate": 4.9824916212378436e-05, + "loss": 0.286, + "step": 7508 + }, + { + "epoch": 0.13393143794813256, + "grad_norm": 0.3282392621040344, + "learning_rate": 4.982473227452795e-05, + "loss": 0.2853, + "step": 7509 + }, + { + "epoch": 0.13394927406984625, + "grad_norm": 0.6081443428993225, + "learning_rate": 4.9824548240448234e-05, + "loss": 0.2892, + "step": 7510 + }, + { + "epoch": 0.13396711019155993, + "grad_norm": 0.24356110394001007, + "learning_rate": 4.9824364110139984e-05, + "loss": 0.2276, + "step": 7511 + }, + { + "epoch": 0.13398494631327365, + "grad_norm": 0.2384781390428543, + "learning_rate": 4.9824179883603926e-05, + "loss": 0.2432, + "step": 7512 + }, + { + "epoch": 0.13400278243498734, + "grad_norm": 0.3611038029193878, + "learning_rate": 4.982399556084078e-05, + "loss": 0.3191, + "step": 7513 + }, + { + "epoch": 0.13402061855670103, + "grad_norm": 0.2158660739660263, + "learning_rate": 4.982381114185124e-05, + "loss": 0.1981, + "step": 7514 + }, + { + "epoch": 0.13403845467841471, + "grad_norm": 0.22014988958835602, + "learning_rate": 4.9823626626636045e-05, + "loss": 0.2359, + "step": 7515 + }, + { + "epoch": 0.13405629080012843, + "grad_norm": 0.26914459466934204, + "learning_rate": 4.9823442015195896e-05, + "loss": 0.2023, + "step": 7516 + }, + { + "epoch": 0.13407412692184212, + "grad_norm": 0.36341536045074463, + "learning_rate": 4.982325730753151e-05, + "loss": 0.3064, + "step": 7517 + }, + { + "epoch": 0.1340919630435558, + "grad_norm": 0.29143232107162476, + "learning_rate": 4.9823072503643606e-05, + "loss": 0.2266, + "step": 7518 + }, + { + "epoch": 0.1341097991652695, + "grad_norm": 0.3108413815498352, + "learning_rate": 4.98228876035329e-05, + "loss": 0.2183, + "step": 7519 + }, + { + "epoch": 0.1341276352869832, + "grad_norm": 0.3253674805164337, + "learning_rate": 4.9822702607200114e-05, + "loss": 0.2183, + "step": 7520 + }, + { + "epoch": 0.1341454714086969, + "grad_norm": 0.2737044095993042, + "learning_rate": 4.982251751464595e-05, + "loss": 0.2016, + "step": 7521 + }, + { + "epoch": 0.13416330753041059, + "grad_norm": 0.2607121467590332, + "learning_rate": 4.982233232587114e-05, + "loss": 0.2314, + "step": 7522 + }, + { + "epoch": 0.13418114365212427, + "grad_norm": 0.2589547634124756, + "learning_rate": 4.98221470408764e-05, + "loss": 0.224, + "step": 7523 + }, + { + "epoch": 0.134198979773838, + "grad_norm": 0.24485012888908386, + "learning_rate": 4.9821961659662434e-05, + "loss": 0.2448, + "step": 7524 + }, + { + "epoch": 0.13421681589555168, + "grad_norm": 0.3238263428211212, + "learning_rate": 4.9821776182229976e-05, + "loss": 0.2345, + "step": 7525 + }, + { + "epoch": 0.13423465201726537, + "grad_norm": 0.28708145022392273, + "learning_rate": 4.982159060857975e-05, + "loss": 0.2222, + "step": 7526 + }, + { + "epoch": 0.13425248813897905, + "grad_norm": 0.3221355378627777, + "learning_rate": 4.9821404938712454e-05, + "loss": 0.2758, + "step": 7527 + }, + { + "epoch": 0.13427032426069274, + "grad_norm": 0.31099021434783936, + "learning_rate": 4.982121917262882e-05, + "loss": 0.2624, + "step": 7528 + }, + { + "epoch": 0.13428816038240646, + "grad_norm": 0.18360298871994019, + "learning_rate": 4.982103331032957e-05, + "loss": 0.1803, + "step": 7529 + }, + { + "epoch": 0.13430599650412015, + "grad_norm": 0.2785327136516571, + "learning_rate": 4.9820847351815424e-05, + "loss": 0.2526, + "step": 7530 + }, + { + "epoch": 0.13432383262583383, + "grad_norm": 0.1713157743215561, + "learning_rate": 4.98206612970871e-05, + "loss": 0.1979, + "step": 7531 + }, + { + "epoch": 0.13434166874754752, + "grad_norm": 0.3337853252887726, + "learning_rate": 4.9820475146145317e-05, + "loss": 0.2219, + "step": 7532 + }, + { + "epoch": 0.13435950486926124, + "grad_norm": 0.4755903482437134, + "learning_rate": 4.9820288898990804e-05, + "loss": 0.2267, + "step": 7533 + }, + { + "epoch": 0.13437734099097493, + "grad_norm": 0.2882138192653656, + "learning_rate": 4.982010255562428e-05, + "loss": 0.248, + "step": 7534 + }, + { + "epoch": 0.1343951771126886, + "grad_norm": 0.3054235577583313, + "learning_rate": 4.981991611604646e-05, + "loss": 0.2103, + "step": 7535 + }, + { + "epoch": 0.1344130132344023, + "grad_norm": 0.33048173785209656, + "learning_rate": 4.981972958025807e-05, + "loss": 0.2469, + "step": 7536 + }, + { + "epoch": 0.13443084935611602, + "grad_norm": 0.25596049427986145, + "learning_rate": 4.9819542948259843e-05, + "loss": 0.209, + "step": 7537 + }, + { + "epoch": 0.1344486854778297, + "grad_norm": 0.2825992703437805, + "learning_rate": 4.981935622005249e-05, + "loss": 0.1947, + "step": 7538 + }, + { + "epoch": 0.1344665215995434, + "grad_norm": 0.28847551345825195, + "learning_rate": 4.9819169395636744e-05, + "loss": 0.2466, + "step": 7539 + }, + { + "epoch": 0.13448435772125708, + "grad_norm": 0.2828240394592285, + "learning_rate": 4.981898247501333e-05, + "loss": 0.2332, + "step": 7540 + }, + { + "epoch": 0.1345021938429708, + "grad_norm": 0.28144776821136475, + "learning_rate": 4.9818795458182955e-05, + "loss": 0.1825, + "step": 7541 + }, + { + "epoch": 0.13452002996468448, + "grad_norm": 0.29975757002830505, + "learning_rate": 4.981860834514637e-05, + "loss": 0.2202, + "step": 7542 + }, + { + "epoch": 0.13453786608639817, + "grad_norm": 0.33452877402305603, + "learning_rate": 4.9818421135904276e-05, + "loss": 0.2115, + "step": 7543 + }, + { + "epoch": 0.13455570220811186, + "grad_norm": 0.2965460419654846, + "learning_rate": 4.9818233830457414e-05, + "loss": 0.2238, + "step": 7544 + }, + { + "epoch": 0.13457353832982558, + "grad_norm": 0.41199344396591187, + "learning_rate": 4.981804642880651e-05, + "loss": 0.2071, + "step": 7545 + }, + { + "epoch": 0.13459137445153926, + "grad_norm": 0.39077064394950867, + "learning_rate": 4.981785893095228e-05, + "loss": 0.1899, + "step": 7546 + }, + { + "epoch": 0.13460921057325295, + "grad_norm": 0.30126917362213135, + "learning_rate": 4.981767133689545e-05, + "loss": 0.2044, + "step": 7547 + }, + { + "epoch": 0.13462704669496664, + "grad_norm": 0.5434653162956238, + "learning_rate": 4.981748364663677e-05, + "loss": 0.2414, + "step": 7548 + }, + { + "epoch": 0.13464488281668033, + "grad_norm": 0.35855361819267273, + "learning_rate": 4.981729586017694e-05, + "loss": 0.2518, + "step": 7549 + }, + { + "epoch": 0.13466271893839404, + "grad_norm": 0.3158375322818756, + "learning_rate": 4.9817107977516706e-05, + "loss": 0.2736, + "step": 7550 + }, + { + "epoch": 0.13468055506010773, + "grad_norm": 0.2712560296058655, + "learning_rate": 4.9816919998656784e-05, + "loss": 0.1824, + "step": 7551 + }, + { + "epoch": 0.13469839118182142, + "grad_norm": 0.35339856147766113, + "learning_rate": 4.9816731923597914e-05, + "loss": 0.2988, + "step": 7552 + }, + { + "epoch": 0.1347162273035351, + "grad_norm": 0.25982826948165894, + "learning_rate": 4.981654375234082e-05, + "loss": 0.2368, + "step": 7553 + }, + { + "epoch": 0.13473406342524882, + "grad_norm": 0.24519726634025574, + "learning_rate": 4.9816355484886237e-05, + "loss": 0.2228, + "step": 7554 + }, + { + "epoch": 0.1347518995469625, + "grad_norm": 0.2974936068058014, + "learning_rate": 4.981616712123488e-05, + "loss": 0.1965, + "step": 7555 + }, + { + "epoch": 0.1347697356686762, + "grad_norm": 0.26997777819633484, + "learning_rate": 4.981597866138749e-05, + "loss": 0.2227, + "step": 7556 + }, + { + "epoch": 0.1347875717903899, + "grad_norm": 0.2788987457752228, + "learning_rate": 4.981579010534479e-05, + "loss": 0.2542, + "step": 7557 + }, + { + "epoch": 0.1348054079121036, + "grad_norm": 0.28224748373031616, + "learning_rate": 4.9815601453107516e-05, + "loss": 0.2572, + "step": 7558 + }, + { + "epoch": 0.1348232440338173, + "grad_norm": 0.30463093519210815, + "learning_rate": 4.981541270467641e-05, + "loss": 0.2796, + "step": 7559 + }, + { + "epoch": 0.13484108015553098, + "grad_norm": 0.3485594093799591, + "learning_rate": 4.981522386005219e-05, + "loss": 0.2443, + "step": 7560 + }, + { + "epoch": 0.13485891627724467, + "grad_norm": 0.2835558354854584, + "learning_rate": 4.981503491923559e-05, + "loss": 0.2386, + "step": 7561 + }, + { + "epoch": 0.13487675239895838, + "grad_norm": 0.23558658361434937, + "learning_rate": 4.981484588222735e-05, + "loss": 0.1589, + "step": 7562 + }, + { + "epoch": 0.13489458852067207, + "grad_norm": 0.25932517647743225, + "learning_rate": 4.981465674902818e-05, + "loss": 0.1913, + "step": 7563 + }, + { + "epoch": 0.13491242464238576, + "grad_norm": 0.2826513946056366, + "learning_rate": 4.981446751963884e-05, + "loss": 0.1794, + "step": 7564 + }, + { + "epoch": 0.13493026076409945, + "grad_norm": 0.24743780493736267, + "learning_rate": 4.981427819406006e-05, + "loss": 0.2051, + "step": 7565 + }, + { + "epoch": 0.13494809688581316, + "grad_norm": 0.28147047758102417, + "learning_rate": 4.981408877229256e-05, + "loss": 0.229, + "step": 7566 + }, + { + "epoch": 0.13496593300752685, + "grad_norm": 0.2662518322467804, + "learning_rate": 4.9813899254337084e-05, + "loss": 0.2272, + "step": 7567 + }, + { + "epoch": 0.13498376912924054, + "grad_norm": 0.26303184032440186, + "learning_rate": 4.981370964019436e-05, + "loss": 0.1733, + "step": 7568 + }, + { + "epoch": 0.13500160525095423, + "grad_norm": 0.2362370789051056, + "learning_rate": 4.9813519929865125e-05, + "loss": 0.2038, + "step": 7569 + }, + { + "epoch": 0.13501944137266791, + "grad_norm": 0.2295181155204773, + "learning_rate": 4.981333012335012e-05, + "loss": 0.2052, + "step": 7570 + }, + { + "epoch": 0.13503727749438163, + "grad_norm": 0.29024702310562134, + "learning_rate": 4.981314022065008e-05, + "loss": 0.253, + "step": 7571 + }, + { + "epoch": 0.13505511361609532, + "grad_norm": 0.31106990575790405, + "learning_rate": 4.981295022176573e-05, + "loss": 0.1461, + "step": 7572 + }, + { + "epoch": 0.135072949737809, + "grad_norm": 0.31030508875846863, + "learning_rate": 4.981276012669782e-05, + "loss": 0.2392, + "step": 7573 + }, + { + "epoch": 0.1350907858595227, + "grad_norm": 0.2728276252746582, + "learning_rate": 4.9812569935447087e-05, + "loss": 0.2436, + "step": 7574 + }, + { + "epoch": 0.1351086219812364, + "grad_norm": 0.24788887798786163, + "learning_rate": 4.981237964801426e-05, + "loss": 0.1683, + "step": 7575 + }, + { + "epoch": 0.1351264581029501, + "grad_norm": 0.3057432472705841, + "learning_rate": 4.9812189264400075e-05, + "loss": 0.2349, + "step": 7576 + }, + { + "epoch": 0.13514429422466379, + "grad_norm": 0.44676804542541504, + "learning_rate": 4.981199878460528e-05, + "loss": 0.2308, + "step": 7577 + }, + { + "epoch": 0.13516213034637747, + "grad_norm": 0.41504162549972534, + "learning_rate": 4.98118082086306e-05, + "loss": 0.2349, + "step": 7578 + }, + { + "epoch": 0.1351799664680912, + "grad_norm": 0.3064153492450714, + "learning_rate": 4.98116175364768e-05, + "loss": 0.219, + "step": 7579 + }, + { + "epoch": 0.13519780258980488, + "grad_norm": 0.35112741589546204, + "learning_rate": 4.9811426768144574e-05, + "loss": 0.2371, + "step": 7580 + }, + { + "epoch": 0.13521563871151857, + "grad_norm": 0.2610100507736206, + "learning_rate": 4.981123590363471e-05, + "loss": 0.2202, + "step": 7581 + }, + { + "epoch": 0.13523347483323225, + "grad_norm": 0.3716852068901062, + "learning_rate": 4.981104494294792e-05, + "loss": 0.1981, + "step": 7582 + }, + { + "epoch": 0.13525131095494597, + "grad_norm": 0.339006632566452, + "learning_rate": 4.981085388608494e-05, + "loss": 0.2599, + "step": 7583 + }, + { + "epoch": 0.13526914707665966, + "grad_norm": 0.27571389079093933, + "learning_rate": 4.9810662733046534e-05, + "loss": 0.1884, + "step": 7584 + }, + { + "epoch": 0.13528698319837335, + "grad_norm": 0.24952851235866547, + "learning_rate": 4.9810471483833426e-05, + "loss": 0.2138, + "step": 7585 + }, + { + "epoch": 0.13530481932008703, + "grad_norm": 0.24076996743679047, + "learning_rate": 4.981028013844636e-05, + "loss": 0.2029, + "step": 7586 + }, + { + "epoch": 0.13532265544180072, + "grad_norm": 0.569031834602356, + "learning_rate": 4.9810088696886084e-05, + "loss": 0.3176, + "step": 7587 + }, + { + "epoch": 0.13534049156351444, + "grad_norm": 0.2556706368923187, + "learning_rate": 4.980989715915333e-05, + "loss": 0.2467, + "step": 7588 + }, + { + "epoch": 0.13535832768522812, + "grad_norm": 0.2645498216152191, + "learning_rate": 4.980970552524884e-05, + "loss": 0.2695, + "step": 7589 + }, + { + "epoch": 0.1353761638069418, + "grad_norm": 0.26399847865104675, + "learning_rate": 4.980951379517337e-05, + "loss": 0.2065, + "step": 7590 + }, + { + "epoch": 0.1353939999286555, + "grad_norm": 0.2496369630098343, + "learning_rate": 4.980932196892766e-05, + "loss": 0.2078, + "step": 7591 + }, + { + "epoch": 0.13541183605036922, + "grad_norm": 0.19830262660980225, + "learning_rate": 4.980913004651244e-05, + "loss": 0.2083, + "step": 7592 + }, + { + "epoch": 0.1354296721720829, + "grad_norm": 0.28163546323776245, + "learning_rate": 4.9808938027928466e-05, + "loss": 0.2439, + "step": 7593 + }, + { + "epoch": 0.1354475082937966, + "grad_norm": 0.2291107177734375, + "learning_rate": 4.980874591317648e-05, + "loss": 0.1775, + "step": 7594 + }, + { + "epoch": 0.13546534441551028, + "grad_norm": 0.2885727882385254, + "learning_rate": 4.980855370225722e-05, + "loss": 0.1927, + "step": 7595 + }, + { + "epoch": 0.135483180537224, + "grad_norm": 0.21624629199504852, + "learning_rate": 4.980836139517145e-05, + "loss": 0.1885, + "step": 7596 + }, + { + "epoch": 0.13550101665893768, + "grad_norm": 0.44790226221084595, + "learning_rate": 4.98081689919199e-05, + "loss": 0.2218, + "step": 7597 + }, + { + "epoch": 0.13551885278065137, + "grad_norm": 0.3004307746887207, + "learning_rate": 4.980797649250331e-05, + "loss": 0.2285, + "step": 7598 + }, + { + "epoch": 0.13553668890236506, + "grad_norm": 0.29081809520721436, + "learning_rate": 4.980778389692244e-05, + "loss": 0.225, + "step": 7599 + }, + { + "epoch": 0.13555452502407878, + "grad_norm": 0.2907610237598419, + "learning_rate": 4.980759120517803e-05, + "loss": 0.2349, + "step": 7600 + }, + { + "epoch": 0.13557236114579246, + "grad_norm": 0.21427428722381592, + "learning_rate": 4.980739841727083e-05, + "loss": 0.2029, + "step": 7601 + }, + { + "epoch": 0.13559019726750615, + "grad_norm": 0.42423343658447266, + "learning_rate": 4.980720553320158e-05, + "loss": 0.3132, + "step": 7602 + }, + { + "epoch": 0.13560803338921984, + "grad_norm": 0.29754114151000977, + "learning_rate": 4.9807012552971045e-05, + "loss": 0.2514, + "step": 7603 + }, + { + "epoch": 0.13562586951093356, + "grad_norm": 0.3172772526741028, + "learning_rate": 4.980681947657995e-05, + "loss": 0.2296, + "step": 7604 + }, + { + "epoch": 0.13564370563264724, + "grad_norm": 0.30040034651756287, + "learning_rate": 4.9806626304029056e-05, + "loss": 0.2615, + "step": 7605 + }, + { + "epoch": 0.13566154175436093, + "grad_norm": 0.3222219944000244, + "learning_rate": 4.9806433035319114e-05, + "loss": 0.2498, + "step": 7606 + }, + { + "epoch": 0.13567937787607462, + "grad_norm": 0.25147029757499695, + "learning_rate": 4.980623967045087e-05, + "loss": 0.2258, + "step": 7607 + }, + { + "epoch": 0.1356972139977883, + "grad_norm": 0.3038370609283447, + "learning_rate": 4.980604620942507e-05, + "loss": 0.2091, + "step": 7608 + }, + { + "epoch": 0.13571505011950202, + "grad_norm": 0.35255715250968933, + "learning_rate": 4.980585265224247e-05, + "loss": 0.1884, + "step": 7609 + }, + { + "epoch": 0.1357328862412157, + "grad_norm": 0.3303506076335907, + "learning_rate": 4.9805658998903815e-05, + "loss": 0.189, + "step": 7610 + }, + { + "epoch": 0.1357507223629294, + "grad_norm": 0.3681982755661011, + "learning_rate": 4.980546524940987e-05, + "loss": 0.2408, + "step": 7611 + }, + { + "epoch": 0.1357685584846431, + "grad_norm": 0.3078673481941223, + "learning_rate": 4.980527140376136e-05, + "loss": 0.2295, + "step": 7612 + }, + { + "epoch": 0.1357863946063568, + "grad_norm": 0.2225552201271057, + "learning_rate": 4.980507746195905e-05, + "loss": 0.2092, + "step": 7613 + }, + { + "epoch": 0.1358042307280705, + "grad_norm": 0.2969847023487091, + "learning_rate": 4.98048834240037e-05, + "loss": 0.2222, + "step": 7614 + }, + { + "epoch": 0.13582206684978418, + "grad_norm": 0.43571245670318604, + "learning_rate": 4.980468928989605e-05, + "loss": 0.2177, + "step": 7615 + }, + { + "epoch": 0.13583990297149787, + "grad_norm": 0.2678923010826111, + "learning_rate": 4.980449505963686e-05, + "loss": 0.2913, + "step": 7616 + }, + { + "epoch": 0.13585773909321158, + "grad_norm": 0.22254163026809692, + "learning_rate": 4.9804300733226875e-05, + "loss": 0.1995, + "step": 7617 + }, + { + "epoch": 0.13587557521492527, + "grad_norm": 0.29656386375427246, + "learning_rate": 4.980410631066686e-05, + "loss": 0.2393, + "step": 7618 + }, + { + "epoch": 0.13589341133663896, + "grad_norm": 0.30731868743896484, + "learning_rate": 4.980391179195756e-05, + "loss": 0.249, + "step": 7619 + }, + { + "epoch": 0.13591124745835265, + "grad_norm": 0.3457026481628418, + "learning_rate": 4.980371717709973e-05, + "loss": 0.2449, + "step": 7620 + }, + { + "epoch": 0.13592908358006636, + "grad_norm": 0.33114808797836304, + "learning_rate": 4.980352246609412e-05, + "loss": 0.2788, + "step": 7621 + }, + { + "epoch": 0.13594691970178005, + "grad_norm": 0.3019861578941345, + "learning_rate": 4.9803327658941494e-05, + "loss": 0.2316, + "step": 7622 + }, + { + "epoch": 0.13596475582349374, + "grad_norm": 0.24484869837760925, + "learning_rate": 4.9803132755642604e-05, + "loss": 0.2464, + "step": 7623 + }, + { + "epoch": 0.13598259194520743, + "grad_norm": 0.26004162430763245, + "learning_rate": 4.980293775619821e-05, + "loss": 0.2382, + "step": 7624 + }, + { + "epoch": 0.13600042806692114, + "grad_norm": 0.2276809811592102, + "learning_rate": 4.980274266060905e-05, + "loss": 0.2028, + "step": 7625 + }, + { + "epoch": 0.13601826418863483, + "grad_norm": 0.30349066853523254, + "learning_rate": 4.9802547468875906e-05, + "loss": 0.2134, + "step": 7626 + }, + { + "epoch": 0.13603610031034852, + "grad_norm": 0.4561436176300049, + "learning_rate": 4.9802352180999514e-05, + "loss": 0.3011, + "step": 7627 + }, + { + "epoch": 0.1360539364320622, + "grad_norm": 0.29373860359191895, + "learning_rate": 4.9802156796980634e-05, + "loss": 0.2635, + "step": 7628 + }, + { + "epoch": 0.1360717725537759, + "grad_norm": 0.43702778220176697, + "learning_rate": 4.980196131682004e-05, + "loss": 0.2278, + "step": 7629 + }, + { + "epoch": 0.1360896086754896, + "grad_norm": 0.2533094584941864, + "learning_rate": 4.980176574051847e-05, + "loss": 0.2398, + "step": 7630 + }, + { + "epoch": 0.1361074447972033, + "grad_norm": 0.29270872473716736, + "learning_rate": 4.9801570068076694e-05, + "loss": 0.2301, + "step": 7631 + }, + { + "epoch": 0.13612528091891699, + "grad_norm": 0.22814325988292694, + "learning_rate": 4.9801374299495464e-05, + "loss": 0.1466, + "step": 7632 + }, + { + "epoch": 0.13614311704063067, + "grad_norm": 0.2394150346517563, + "learning_rate": 4.980117843477554e-05, + "loss": 0.2213, + "step": 7633 + }, + { + "epoch": 0.1361609531623444, + "grad_norm": 0.38309991359710693, + "learning_rate": 4.980098247391768e-05, + "loss": 0.2326, + "step": 7634 + }, + { + "epoch": 0.13617878928405808, + "grad_norm": 0.35889971256256104, + "learning_rate": 4.980078641692265e-05, + "loss": 0.2485, + "step": 7635 + }, + { + "epoch": 0.13619662540577177, + "grad_norm": 0.22773735225200653, + "learning_rate": 4.9800590263791205e-05, + "loss": 0.2159, + "step": 7636 + }, + { + "epoch": 0.13621446152748545, + "grad_norm": 0.26242002844810486, + "learning_rate": 4.980039401452411e-05, + "loss": 0.2011, + "step": 7637 + }, + { + "epoch": 0.13623229764919917, + "grad_norm": 0.33753737807273865, + "learning_rate": 4.9800197669122116e-05, + "loss": 0.2791, + "step": 7638 + }, + { + "epoch": 0.13625013377091286, + "grad_norm": 0.2737777829170227, + "learning_rate": 4.980000122758599e-05, + "loss": 0.1997, + "step": 7639 + }, + { + "epoch": 0.13626796989262654, + "grad_norm": 0.2658303380012512, + "learning_rate": 4.9799804689916496e-05, + "loss": 0.2115, + "step": 7640 + }, + { + "epoch": 0.13628580601434023, + "grad_norm": 0.3175022304058075, + "learning_rate": 4.979960805611439e-05, + "loss": 0.2359, + "step": 7641 + }, + { + "epoch": 0.13630364213605395, + "grad_norm": 0.3396710455417633, + "learning_rate": 4.979941132618045e-05, + "loss": 0.2294, + "step": 7642 + }, + { + "epoch": 0.13632147825776764, + "grad_norm": 0.24313685297966003, + "learning_rate": 4.979921450011541e-05, + "loss": 0.2103, + "step": 7643 + }, + { + "epoch": 0.13633931437948132, + "grad_norm": 0.26737284660339355, + "learning_rate": 4.979901757792006e-05, + "loss": 0.2164, + "step": 7644 + }, + { + "epoch": 0.136357150501195, + "grad_norm": 0.2784193158149719, + "learning_rate": 4.979882055959515e-05, + "loss": 0.2089, + "step": 7645 + }, + { + "epoch": 0.13637498662290873, + "grad_norm": 0.3830283582210541, + "learning_rate": 4.9798623445141446e-05, + "loss": 0.2285, + "step": 7646 + }, + { + "epoch": 0.13639282274462242, + "grad_norm": 0.39115971326828003, + "learning_rate": 4.979842623455971e-05, + "loss": 0.2719, + "step": 7647 + }, + { + "epoch": 0.1364106588663361, + "grad_norm": 0.24941574037075043, + "learning_rate": 4.979822892785071e-05, + "loss": 0.2265, + "step": 7648 + }, + { + "epoch": 0.1364284949880498, + "grad_norm": 0.27254605293273926, + "learning_rate": 4.979803152501521e-05, + "loss": 0.2409, + "step": 7649 + }, + { + "epoch": 0.13644633110976348, + "grad_norm": 0.3258862793445587, + "learning_rate": 4.979783402605398e-05, + "loss": 0.2733, + "step": 7650 + }, + { + "epoch": 0.1364641672314772, + "grad_norm": 0.3291133642196655, + "learning_rate": 4.979763643096777e-05, + "loss": 0.2678, + "step": 7651 + }, + { + "epoch": 0.13648200335319088, + "grad_norm": 0.3363911211490631, + "learning_rate": 4.9797438739757366e-05, + "loss": 0.28, + "step": 7652 + }, + { + "epoch": 0.13649983947490457, + "grad_norm": 0.25408124923706055, + "learning_rate": 4.979724095242352e-05, + "loss": 0.194, + "step": 7653 + }, + { + "epoch": 0.13651767559661826, + "grad_norm": 0.3379240930080414, + "learning_rate": 4.9797043068967e-05, + "loss": 0.2662, + "step": 7654 + }, + { + "epoch": 0.13653551171833198, + "grad_norm": 0.2658340632915497, + "learning_rate": 4.979684508938858e-05, + "loss": 0.1696, + "step": 7655 + }, + { + "epoch": 0.13655334784004566, + "grad_norm": 0.29358401894569397, + "learning_rate": 4.979664701368903e-05, + "loss": 0.243, + "step": 7656 + }, + { + "epoch": 0.13657118396175935, + "grad_norm": 0.231268510222435, + "learning_rate": 4.9796448841869104e-05, + "loss": 0.2119, + "step": 7657 + }, + { + "epoch": 0.13658902008347304, + "grad_norm": 0.3786660134792328, + "learning_rate": 4.979625057392958e-05, + "loss": 0.1902, + "step": 7658 + }, + { + "epoch": 0.13660685620518676, + "grad_norm": 0.35064491629600525, + "learning_rate": 4.979605220987122e-05, + "loss": 0.2257, + "step": 7659 + }, + { + "epoch": 0.13662469232690044, + "grad_norm": 0.2789657413959503, + "learning_rate": 4.97958537496948e-05, + "loss": 0.2233, + "step": 7660 + }, + { + "epoch": 0.13664252844861413, + "grad_norm": 0.24864362180233002, + "learning_rate": 4.979565519340109e-05, + "loss": 0.2041, + "step": 7661 + }, + { + "epoch": 0.13666036457032782, + "grad_norm": 0.2274325042963028, + "learning_rate": 4.979545654099086e-05, + "loss": 0.217, + "step": 7662 + }, + { + "epoch": 0.13667820069204153, + "grad_norm": 0.6578096151351929, + "learning_rate": 4.9795257792464865e-05, + "loss": 0.251, + "step": 7663 + }, + { + "epoch": 0.13669603681375522, + "grad_norm": 0.2294447273015976, + "learning_rate": 4.979505894782389e-05, + "loss": 0.2561, + "step": 7664 + }, + { + "epoch": 0.1367138729354689, + "grad_norm": 0.3409743010997772, + "learning_rate": 4.979486000706871e-05, + "loss": 0.2623, + "step": 7665 + }, + { + "epoch": 0.1367317090571826, + "grad_norm": 0.3147277534008026, + "learning_rate": 4.979466097020008e-05, + "loss": 0.2224, + "step": 7666 + }, + { + "epoch": 0.13674954517889631, + "grad_norm": 0.28334489464759827, + "learning_rate": 4.979446183721879e-05, + "loss": 0.2328, + "step": 7667 + }, + { + "epoch": 0.13676738130061, + "grad_norm": 0.25945356488227844, + "learning_rate": 4.97942626081256e-05, + "loss": 0.2445, + "step": 7668 + }, + { + "epoch": 0.1367852174223237, + "grad_norm": 0.23040388524532318, + "learning_rate": 4.979406328292128e-05, + "loss": 0.2049, + "step": 7669 + }, + { + "epoch": 0.13680305354403738, + "grad_norm": 0.22734519839286804, + "learning_rate": 4.9793863861606606e-05, + "loss": 0.2041, + "step": 7670 + }, + { + "epoch": 0.13682088966575107, + "grad_norm": 0.3171316683292389, + "learning_rate": 4.979366434418235e-05, + "loss": 0.2241, + "step": 7671 + }, + { + "epoch": 0.13683872578746478, + "grad_norm": 0.28010791540145874, + "learning_rate": 4.97934647306493e-05, + "loss": 0.2299, + "step": 7672 + }, + { + "epoch": 0.13685656190917847, + "grad_norm": 0.317367821931839, + "learning_rate": 4.979326502100821e-05, + "loss": 0.2972, + "step": 7673 + }, + { + "epoch": 0.13687439803089216, + "grad_norm": 0.23120209574699402, + "learning_rate": 4.979306521525985e-05, + "loss": 0.2046, + "step": 7674 + }, + { + "epoch": 0.13689223415260585, + "grad_norm": 0.2688504755496979, + "learning_rate": 4.9792865313405016e-05, + "loss": 0.2804, + "step": 7675 + }, + { + "epoch": 0.13691007027431956, + "grad_norm": 0.27711114287376404, + "learning_rate": 4.9792665315444474e-05, + "loss": 0.2628, + "step": 7676 + }, + { + "epoch": 0.13692790639603325, + "grad_norm": 0.29201340675354004, + "learning_rate": 4.9792465221379005e-05, + "loss": 0.2444, + "step": 7677 + }, + { + "epoch": 0.13694574251774694, + "grad_norm": 0.3418673276901245, + "learning_rate": 4.979226503120936e-05, + "loss": 0.2169, + "step": 7678 + }, + { + "epoch": 0.13696357863946063, + "grad_norm": 0.2988224923610687, + "learning_rate": 4.979206474493635e-05, + "loss": 0.2612, + "step": 7679 + }, + { + "epoch": 0.13698141476117434, + "grad_norm": 0.28798702359199524, + "learning_rate": 4.9791864362560725e-05, + "loss": 0.1973, + "step": 7680 + }, + { + "epoch": 0.13699925088288803, + "grad_norm": 0.33267301321029663, + "learning_rate": 4.979166388408327e-05, + "loss": 0.2744, + "step": 7681 + }, + { + "epoch": 0.13701708700460172, + "grad_norm": 0.22727416455745697, + "learning_rate": 4.979146330950477e-05, + "loss": 0.2341, + "step": 7682 + }, + { + "epoch": 0.1370349231263154, + "grad_norm": 0.21520079672336578, + "learning_rate": 4.979126263882599e-05, + "loss": 0.1871, + "step": 7683 + }, + { + "epoch": 0.13705275924802912, + "grad_norm": 0.3009205162525177, + "learning_rate": 4.979106187204772e-05, + "loss": 0.2557, + "step": 7684 + }, + { + "epoch": 0.1370705953697428, + "grad_norm": 0.2567964494228363, + "learning_rate": 4.979086100917072e-05, + "loss": 0.2346, + "step": 7685 + }, + { + "epoch": 0.1370884314914565, + "grad_norm": 0.25454699993133545, + "learning_rate": 4.979066005019579e-05, + "loss": 0.2332, + "step": 7686 + }, + { + "epoch": 0.13710626761317019, + "grad_norm": 0.21537166833877563, + "learning_rate": 4.97904589951237e-05, + "loss": 0.2111, + "step": 7687 + }, + { + "epoch": 0.13712410373488387, + "grad_norm": 0.3063955307006836, + "learning_rate": 4.979025784395522e-05, + "loss": 0.2375, + "step": 7688 + }, + { + "epoch": 0.1371419398565976, + "grad_norm": 0.28575992584228516, + "learning_rate": 4.979005659669114e-05, + "loss": 0.2595, + "step": 7689 + }, + { + "epoch": 0.13715977597831128, + "grad_norm": 0.29159319400787354, + "learning_rate": 4.978985525333224e-05, + "loss": 0.2096, + "step": 7690 + }, + { + "epoch": 0.13717761210002496, + "grad_norm": 0.27432194352149963, + "learning_rate": 4.9789653813879305e-05, + "loss": 0.2223, + "step": 7691 + }, + { + "epoch": 0.13719544822173865, + "grad_norm": 0.25467804074287415, + "learning_rate": 4.9789452278333106e-05, + "loss": 0.1564, + "step": 7692 + }, + { + "epoch": 0.13721328434345237, + "grad_norm": 0.20438627898693085, + "learning_rate": 4.978925064669443e-05, + "loss": 0.1789, + "step": 7693 + }, + { + "epoch": 0.13723112046516606, + "grad_norm": 0.36302071809768677, + "learning_rate": 4.978904891896405e-05, + "loss": 0.2068, + "step": 7694 + }, + { + "epoch": 0.13724895658687974, + "grad_norm": 0.2433769255876541, + "learning_rate": 4.9788847095142754e-05, + "loss": 0.2283, + "step": 7695 + }, + { + "epoch": 0.13726679270859343, + "grad_norm": 0.27924856543540955, + "learning_rate": 4.978864517523133e-05, + "loss": 0.2749, + "step": 7696 + }, + { + "epoch": 0.13728462883030715, + "grad_norm": 0.29090026021003723, + "learning_rate": 4.9788443159230556e-05, + "loss": 0.2267, + "step": 7697 + }, + { + "epoch": 0.13730246495202084, + "grad_norm": 0.38446855545043945, + "learning_rate": 4.9788241047141216e-05, + "loss": 0.2827, + "step": 7698 + }, + { + "epoch": 0.13732030107373452, + "grad_norm": 0.2920713722705841, + "learning_rate": 4.9788038838964093e-05, + "loss": 0.2333, + "step": 7699 + }, + { + "epoch": 0.1373381371954482, + "grad_norm": 0.20829983055591583, + "learning_rate": 4.978783653469996e-05, + "loss": 0.2014, + "step": 7700 + }, + { + "epoch": 0.13735597331716193, + "grad_norm": 0.2887932360172272, + "learning_rate": 4.9787634134349614e-05, + "loss": 0.1782, + "step": 7701 + }, + { + "epoch": 0.13737380943887562, + "grad_norm": 0.3210797607898712, + "learning_rate": 4.978743163791384e-05, + "loss": 0.2622, + "step": 7702 + }, + { + "epoch": 0.1373916455605893, + "grad_norm": 0.24034082889556885, + "learning_rate": 4.978722904539343e-05, + "loss": 0.2119, + "step": 7703 + }, + { + "epoch": 0.137409481682303, + "grad_norm": 0.27551499009132385, + "learning_rate": 4.978702635678914e-05, + "loss": 0.1916, + "step": 7704 + }, + { + "epoch": 0.1374273178040167, + "grad_norm": 0.4206904172897339, + "learning_rate": 4.9786823572101786e-05, + "loss": 0.3473, + "step": 7705 + }, + { + "epoch": 0.1374451539257304, + "grad_norm": 0.2999703884124756, + "learning_rate": 4.978662069133214e-05, + "loss": 0.213, + "step": 7706 + }, + { + "epoch": 0.13746299004744408, + "grad_norm": 0.24993175268173218, + "learning_rate": 4.978641771448099e-05, + "loss": 0.2125, + "step": 7707 + }, + { + "epoch": 0.13748082616915777, + "grad_norm": 0.2945472002029419, + "learning_rate": 4.978621464154913e-05, + "loss": 0.211, + "step": 7708 + }, + { + "epoch": 0.13749866229087146, + "grad_norm": 0.2602582573890686, + "learning_rate": 4.978601147253733e-05, + "loss": 0.2164, + "step": 7709 + }, + { + "epoch": 0.13751649841258518, + "grad_norm": 0.3148273229598999, + "learning_rate": 4.97858082074464e-05, + "loss": 0.2348, + "step": 7710 + }, + { + "epoch": 0.13753433453429886, + "grad_norm": 0.22988027334213257, + "learning_rate": 4.9785604846277113e-05, + "loss": 0.2005, + "step": 7711 + }, + { + "epoch": 0.13755217065601255, + "grad_norm": 0.23377783596515656, + "learning_rate": 4.978540138903026e-05, + "loss": 0.2024, + "step": 7712 + }, + { + "epoch": 0.13757000677772624, + "grad_norm": 0.27650436758995056, + "learning_rate": 4.978519783570663e-05, + "loss": 0.1741, + "step": 7713 + }, + { + "epoch": 0.13758784289943995, + "grad_norm": 0.30671361088752747, + "learning_rate": 4.978499418630701e-05, + "loss": 0.2339, + "step": 7714 + }, + { + "epoch": 0.13760567902115364, + "grad_norm": 0.3299501836299896, + "learning_rate": 4.9784790440832196e-05, + "loss": 0.1843, + "step": 7715 + }, + { + "epoch": 0.13762351514286733, + "grad_norm": 0.25257813930511475, + "learning_rate": 4.978458659928297e-05, + "loss": 0.2184, + "step": 7716 + }, + { + "epoch": 0.13764135126458102, + "grad_norm": 0.3795832097530365, + "learning_rate": 4.9784382661660134e-05, + "loss": 0.2324, + "step": 7717 + }, + { + "epoch": 0.13765918738629473, + "grad_norm": 0.2929432690143585, + "learning_rate": 4.978417862796446e-05, + "loss": 0.2189, + "step": 7718 + }, + { + "epoch": 0.13767702350800842, + "grad_norm": 0.2690240144729614, + "learning_rate": 4.978397449819676e-05, + "loss": 0.242, + "step": 7719 + }, + { + "epoch": 0.1376948596297221, + "grad_norm": 0.25798019766807556, + "learning_rate": 4.9783770272357814e-05, + "loss": 0.2546, + "step": 7720 + }, + { + "epoch": 0.1377126957514358, + "grad_norm": 0.25778549909591675, + "learning_rate": 4.9783565950448406e-05, + "loss": 0.2008, + "step": 7721 + }, + { + "epoch": 0.13773053187314951, + "grad_norm": 0.4554760158061981, + "learning_rate": 4.978336153246934e-05, + "loss": 0.2811, + "step": 7722 + }, + { + "epoch": 0.1377483679948632, + "grad_norm": 0.28381386399269104, + "learning_rate": 4.9783157018421405e-05, + "loss": 0.2299, + "step": 7723 + }, + { + "epoch": 0.1377662041165769, + "grad_norm": 0.36203667521476746, + "learning_rate": 4.978295240830539e-05, + "loss": 0.2807, + "step": 7724 + }, + { + "epoch": 0.13778404023829058, + "grad_norm": 0.2515256404876709, + "learning_rate": 4.97827477021221e-05, + "loss": 0.1768, + "step": 7725 + }, + { + "epoch": 0.1378018763600043, + "grad_norm": 0.2921491265296936, + "learning_rate": 4.9782542899872314e-05, + "loss": 0.1776, + "step": 7726 + }, + { + "epoch": 0.13781971248171798, + "grad_norm": 0.40256834030151367, + "learning_rate": 4.9782338001556836e-05, + "loss": 0.2834, + "step": 7727 + }, + { + "epoch": 0.13783754860343167, + "grad_norm": 0.2674209177494049, + "learning_rate": 4.978213300717646e-05, + "loss": 0.2224, + "step": 7728 + }, + { + "epoch": 0.13785538472514536, + "grad_norm": 0.25338393449783325, + "learning_rate": 4.978192791673196e-05, + "loss": 0.1888, + "step": 7729 + }, + { + "epoch": 0.13787322084685905, + "grad_norm": 0.3655220866203308, + "learning_rate": 4.978172273022417e-05, + "loss": 0.2599, + "step": 7730 + }, + { + "epoch": 0.13789105696857276, + "grad_norm": 0.28567010164260864, + "learning_rate": 4.978151744765385e-05, + "loss": 0.2238, + "step": 7731 + }, + { + "epoch": 0.13790889309028645, + "grad_norm": 0.2943632900714874, + "learning_rate": 4.978131206902181e-05, + "loss": 0.2445, + "step": 7732 + }, + { + "epoch": 0.13792672921200014, + "grad_norm": 0.23185555636882782, + "learning_rate": 4.9781106594328846e-05, + "loss": 0.2087, + "step": 7733 + }, + { + "epoch": 0.13794456533371383, + "grad_norm": 0.3514593541622162, + "learning_rate": 4.978090102357575e-05, + "loss": 0.2235, + "step": 7734 + }, + { + "epoch": 0.13796240145542754, + "grad_norm": 0.26353880763053894, + "learning_rate": 4.978069535676333e-05, + "loss": 0.1979, + "step": 7735 + }, + { + "epoch": 0.13798023757714123, + "grad_norm": 0.2900046110153198, + "learning_rate": 4.978048959389238e-05, + "loss": 0.2148, + "step": 7736 + }, + { + "epoch": 0.13799807369885492, + "grad_norm": 0.23146869242191315, + "learning_rate": 4.978028373496369e-05, + "loss": 0.1841, + "step": 7737 + }, + { + "epoch": 0.1380159098205686, + "grad_norm": 0.2509578466415405, + "learning_rate": 4.978007777997805e-05, + "loss": 0.2163, + "step": 7738 + }, + { + "epoch": 0.13803374594228232, + "grad_norm": 0.30609941482543945, + "learning_rate": 4.977987172893628e-05, + "loss": 0.2252, + "step": 7739 + }, + { + "epoch": 0.138051582063996, + "grad_norm": 0.31992587447166443, + "learning_rate": 4.977966558183916e-05, + "loss": 0.2674, + "step": 7740 + }, + { + "epoch": 0.1380694181857097, + "grad_norm": 0.24072711169719696, + "learning_rate": 4.977945933868751e-05, + "loss": 0.196, + "step": 7741 + }, + { + "epoch": 0.13808725430742338, + "grad_norm": 0.31340426206588745, + "learning_rate": 4.977925299948211e-05, + "loss": 0.2514, + "step": 7742 + }, + { + "epoch": 0.1381050904291371, + "grad_norm": 0.2519441843032837, + "learning_rate": 4.977904656422376e-05, + "loss": 0.2283, + "step": 7743 + }, + { + "epoch": 0.1381229265508508, + "grad_norm": 0.3220392167568207, + "learning_rate": 4.977884003291328e-05, + "loss": 0.2451, + "step": 7744 + }, + { + "epoch": 0.13814076267256448, + "grad_norm": 0.3158110976219177, + "learning_rate": 4.9778633405551455e-05, + "loss": 0.2628, + "step": 7745 + }, + { + "epoch": 0.13815859879427816, + "grad_norm": 0.3030248284339905, + "learning_rate": 4.977842668213909e-05, + "loss": 0.189, + "step": 7746 + }, + { + "epoch": 0.13817643491599188, + "grad_norm": 0.47189828753471375, + "learning_rate": 4.977821986267698e-05, + "loss": 0.2288, + "step": 7747 + }, + { + "epoch": 0.13819427103770557, + "grad_norm": 0.34288308024406433, + "learning_rate": 4.977801294716593e-05, + "loss": 0.2301, + "step": 7748 + }, + { + "epoch": 0.13821210715941926, + "grad_norm": 0.29675331711769104, + "learning_rate": 4.9777805935606746e-05, + "loss": 0.267, + "step": 7749 + }, + { + "epoch": 0.13822994328113294, + "grad_norm": 0.2849023938179016, + "learning_rate": 4.977759882800023e-05, + "loss": 0.2276, + "step": 7750 + }, + { + "epoch": 0.13824777940284663, + "grad_norm": 0.2039736807346344, + "learning_rate": 4.977739162434718e-05, + "loss": 0.2152, + "step": 7751 + }, + { + "epoch": 0.13826561552456035, + "grad_norm": 0.3245219886302948, + "learning_rate": 4.977718432464841e-05, + "loss": 0.2591, + "step": 7752 + }, + { + "epoch": 0.13828345164627404, + "grad_norm": 0.2827920913696289, + "learning_rate": 4.977697692890471e-05, + "loss": 0.2193, + "step": 7753 + }, + { + "epoch": 0.13830128776798772, + "grad_norm": 0.26571762561798096, + "learning_rate": 4.9776769437116885e-05, + "loss": 0.2147, + "step": 7754 + }, + { + "epoch": 0.1383191238897014, + "grad_norm": 0.3618394136428833, + "learning_rate": 4.977656184928575e-05, + "loss": 0.2503, + "step": 7755 + }, + { + "epoch": 0.13833696001141513, + "grad_norm": 0.29902219772338867, + "learning_rate": 4.97763541654121e-05, + "loss": 0.2391, + "step": 7756 + }, + { + "epoch": 0.13835479613312882, + "grad_norm": 0.27607762813568115, + "learning_rate": 4.977614638549675e-05, + "loss": 0.1884, + "step": 7757 + }, + { + "epoch": 0.1383726322548425, + "grad_norm": 0.3239946961402893, + "learning_rate": 4.977593850954049e-05, + "loss": 0.2456, + "step": 7758 + }, + { + "epoch": 0.1383904683765562, + "grad_norm": 0.27721700072288513, + "learning_rate": 4.977573053754414e-05, + "loss": 0.2462, + "step": 7759 + }, + { + "epoch": 0.1384083044982699, + "grad_norm": 0.388680100440979, + "learning_rate": 4.9775522469508504e-05, + "loss": 0.3074, + "step": 7760 + }, + { + "epoch": 0.1384261406199836, + "grad_norm": 0.30190929770469666, + "learning_rate": 4.9775314305434385e-05, + "loss": 0.2169, + "step": 7761 + }, + { + "epoch": 0.13844397674169728, + "grad_norm": 0.35949015617370605, + "learning_rate": 4.977510604532259e-05, + "loss": 0.2478, + "step": 7762 + }, + { + "epoch": 0.13846181286341097, + "grad_norm": 0.6132022738456726, + "learning_rate": 4.9774897689173926e-05, + "loss": 0.2179, + "step": 7763 + }, + { + "epoch": 0.1384796489851247, + "grad_norm": 0.3205450475215912, + "learning_rate": 4.97746892369892e-05, + "loss": 0.2549, + "step": 7764 + }, + { + "epoch": 0.13849748510683837, + "grad_norm": 0.2332857996225357, + "learning_rate": 4.977448068876922e-05, + "loss": 0.2082, + "step": 7765 + }, + { + "epoch": 0.13851532122855206, + "grad_norm": 0.3490433990955353, + "learning_rate": 4.9774272044514806e-05, + "loss": 0.3034, + "step": 7766 + }, + { + "epoch": 0.13853315735026575, + "grad_norm": 0.26735496520996094, + "learning_rate": 4.977406330422675e-05, + "loss": 0.2018, + "step": 7767 + }, + { + "epoch": 0.13855099347197944, + "grad_norm": 0.34351831674575806, + "learning_rate": 4.977385446790587e-05, + "loss": 0.2448, + "step": 7768 + }, + { + "epoch": 0.13856882959369315, + "grad_norm": 0.3086816668510437, + "learning_rate": 4.977364553555296e-05, + "loss": 0.2432, + "step": 7769 + }, + { + "epoch": 0.13858666571540684, + "grad_norm": 0.25710898637771606, + "learning_rate": 4.9773436507168857e-05, + "loss": 0.2239, + "step": 7770 + }, + { + "epoch": 0.13860450183712053, + "grad_norm": 0.3609481453895569, + "learning_rate": 4.977322738275436e-05, + "loss": 0.2575, + "step": 7771 + }, + { + "epoch": 0.13862233795883422, + "grad_norm": 0.3354337513446808, + "learning_rate": 4.977301816231027e-05, + "loss": 0.2455, + "step": 7772 + }, + { + "epoch": 0.13864017408054793, + "grad_norm": 0.260863333940506, + "learning_rate": 4.977280884583741e-05, + "loss": 0.2296, + "step": 7773 + }, + { + "epoch": 0.13865801020226162, + "grad_norm": 0.43223699927330017, + "learning_rate": 4.977259943333658e-05, + "loss": 0.2654, + "step": 7774 + }, + { + "epoch": 0.1386758463239753, + "grad_norm": 0.280205100774765, + "learning_rate": 4.9772389924808605e-05, + "loss": 0.2126, + "step": 7775 + }, + { + "epoch": 0.138693682445689, + "grad_norm": 0.2625616490840912, + "learning_rate": 4.977218032025429e-05, + "loss": 0.2647, + "step": 7776 + }, + { + "epoch": 0.13871151856740271, + "grad_norm": 0.3067643642425537, + "learning_rate": 4.9771970619674446e-05, + "loss": 0.231, + "step": 7777 + }, + { + "epoch": 0.1387293546891164, + "grad_norm": 0.24626529216766357, + "learning_rate": 4.977176082306989e-05, + "loss": 0.2554, + "step": 7778 + }, + { + "epoch": 0.1387471908108301, + "grad_norm": 0.2416291981935501, + "learning_rate": 4.9771550930441426e-05, + "loss": 0.2272, + "step": 7779 + }, + { + "epoch": 0.13876502693254378, + "grad_norm": 0.35257646441459656, + "learning_rate": 4.9771340941789884e-05, + "loss": 0.2731, + "step": 7780 + }, + { + "epoch": 0.1387828630542575, + "grad_norm": 0.3278643488883972, + "learning_rate": 4.9771130857116065e-05, + "loss": 0.2786, + "step": 7781 + }, + { + "epoch": 0.13880069917597118, + "grad_norm": 0.23713479936122894, + "learning_rate": 4.977092067642078e-05, + "loss": 0.2336, + "step": 7782 + }, + { + "epoch": 0.13881853529768487, + "grad_norm": 0.2497008889913559, + "learning_rate": 4.977071039970487e-05, + "loss": 0.1848, + "step": 7783 + }, + { + "epoch": 0.13883637141939856, + "grad_norm": 0.38271504640579224, + "learning_rate": 4.9770500026969116e-05, + "loss": 0.2249, + "step": 7784 + }, + { + "epoch": 0.13885420754111227, + "grad_norm": 0.28771165013313293, + "learning_rate": 4.977028955821435e-05, + "loss": 0.2435, + "step": 7785 + }, + { + "epoch": 0.13887204366282596, + "grad_norm": 0.3053930699825287, + "learning_rate": 4.9770078993441386e-05, + "loss": 0.2176, + "step": 7786 + }, + { + "epoch": 0.13888987978453965, + "grad_norm": 0.2448943704366684, + "learning_rate": 4.9769868332651047e-05, + "loss": 0.2293, + "step": 7787 + }, + { + "epoch": 0.13890771590625334, + "grad_norm": 0.2562117576599121, + "learning_rate": 4.9769657575844136e-05, + "loss": 0.2296, + "step": 7788 + }, + { + "epoch": 0.13892555202796703, + "grad_norm": 0.2721194922924042, + "learning_rate": 4.976944672302148e-05, + "loss": 0.2138, + "step": 7789 + }, + { + "epoch": 0.13894338814968074, + "grad_norm": 0.36899375915527344, + "learning_rate": 4.976923577418389e-05, + "loss": 0.2031, + "step": 7790 + }, + { + "epoch": 0.13896122427139443, + "grad_norm": 0.26053762435913086, + "learning_rate": 4.976902472933219e-05, + "loss": 0.2323, + "step": 7791 + }, + { + "epoch": 0.13897906039310812, + "grad_norm": 0.27441224455833435, + "learning_rate": 4.97688135884672e-05, + "loss": 0.2217, + "step": 7792 + }, + { + "epoch": 0.1389968965148218, + "grad_norm": 0.31387025117874146, + "learning_rate": 4.9768602351589724e-05, + "loss": 0.2297, + "step": 7793 + }, + { + "epoch": 0.13901473263653552, + "grad_norm": 0.3134457767009735, + "learning_rate": 4.97683910187006e-05, + "loss": 0.2938, + "step": 7794 + }, + { + "epoch": 0.1390325687582492, + "grad_norm": 0.3367738723754883, + "learning_rate": 4.9768179589800634e-05, + "loss": 0.252, + "step": 7795 + }, + { + "epoch": 0.1390504048799629, + "grad_norm": 0.39340123534202576, + "learning_rate": 4.9767968064890646e-05, + "loss": 0.2329, + "step": 7796 + }, + { + "epoch": 0.13906824100167658, + "grad_norm": 0.3973371684551239, + "learning_rate": 4.976775644397146e-05, + "loss": 0.2732, + "step": 7797 + }, + { + "epoch": 0.1390860771233903, + "grad_norm": 0.22739411890506744, + "learning_rate": 4.97675447270439e-05, + "loss": 0.1817, + "step": 7798 + }, + { + "epoch": 0.139103913245104, + "grad_norm": 0.3370538353919983, + "learning_rate": 4.9767332914108776e-05, + "loss": 0.2702, + "step": 7799 + }, + { + "epoch": 0.13912174936681768, + "grad_norm": 0.2768855094909668, + "learning_rate": 4.976712100516692e-05, + "loss": 0.2037, + "step": 7800 + }, + { + "epoch": 0.13913958548853136, + "grad_norm": 0.4047466814517975, + "learning_rate": 4.976690900021915e-05, + "loss": 0.2317, + "step": 7801 + }, + { + "epoch": 0.13915742161024508, + "grad_norm": 0.25438666343688965, + "learning_rate": 4.976669689926628e-05, + "loss": 0.2062, + "step": 7802 + }, + { + "epoch": 0.13917525773195877, + "grad_norm": 0.3366991877555847, + "learning_rate": 4.9766484702309143e-05, + "loss": 0.2606, + "step": 7803 + }, + { + "epoch": 0.13919309385367246, + "grad_norm": 0.29640814661979675, + "learning_rate": 4.9766272409348555e-05, + "loss": 0.2453, + "step": 7804 + }, + { + "epoch": 0.13921092997538614, + "grad_norm": 0.24976986646652222, + "learning_rate": 4.976606002038534e-05, + "loss": 0.1908, + "step": 7805 + }, + { + "epoch": 0.13922876609709986, + "grad_norm": 0.24931728839874268, + "learning_rate": 4.9765847535420326e-05, + "loss": 0.2028, + "step": 7806 + }, + { + "epoch": 0.13924660221881355, + "grad_norm": 0.340193510055542, + "learning_rate": 4.976563495445433e-05, + "loss": 0.1855, + "step": 7807 + }, + { + "epoch": 0.13926443834052724, + "grad_norm": 0.3073176145553589, + "learning_rate": 4.9765422277488186e-05, + "loss": 0.2306, + "step": 7808 + }, + { + "epoch": 0.13928227446224092, + "grad_norm": 0.2590597867965698, + "learning_rate": 4.976520950452271e-05, + "loss": 0.2141, + "step": 7809 + }, + { + "epoch": 0.1393001105839546, + "grad_norm": 0.31174999475479126, + "learning_rate": 4.976499663555872e-05, + "loss": 0.2089, + "step": 7810 + }, + { + "epoch": 0.13931794670566833, + "grad_norm": 0.221009761095047, + "learning_rate": 4.976478367059706e-05, + "loss": 0.2174, + "step": 7811 + }, + { + "epoch": 0.13933578282738202, + "grad_norm": 0.3522317111492157, + "learning_rate": 4.976457060963854e-05, + "loss": 0.3125, + "step": 7812 + }, + { + "epoch": 0.1393536189490957, + "grad_norm": 0.5266415476799011, + "learning_rate": 4.976435745268398e-05, + "loss": 0.2487, + "step": 7813 + }, + { + "epoch": 0.1393714550708094, + "grad_norm": 0.28174644708633423, + "learning_rate": 4.976414419973424e-05, + "loss": 0.2332, + "step": 7814 + }, + { + "epoch": 0.1393892911925231, + "grad_norm": 0.2874157130718231, + "learning_rate": 4.976393085079011e-05, + "loss": 0.2329, + "step": 7815 + }, + { + "epoch": 0.1394071273142368, + "grad_norm": 0.28394871950149536, + "learning_rate": 4.976371740585243e-05, + "loss": 0.1899, + "step": 7816 + }, + { + "epoch": 0.13942496343595048, + "grad_norm": 0.3517719507217407, + "learning_rate": 4.976350386492203e-05, + "loss": 0.2816, + "step": 7817 + }, + { + "epoch": 0.13944279955766417, + "grad_norm": 0.2708662152290344, + "learning_rate": 4.976329022799974e-05, + "loss": 0.1945, + "step": 7818 + }, + { + "epoch": 0.1394606356793779, + "grad_norm": 0.42597663402557373, + "learning_rate": 4.976307649508638e-05, + "loss": 0.2259, + "step": 7819 + }, + { + "epoch": 0.13947847180109157, + "grad_norm": 0.28468161821365356, + "learning_rate": 4.976286266618279e-05, + "loss": 0.2786, + "step": 7820 + }, + { + "epoch": 0.13949630792280526, + "grad_norm": 0.3473394513130188, + "learning_rate": 4.9762648741289776e-05, + "loss": 0.225, + "step": 7821 + }, + { + "epoch": 0.13951414404451895, + "grad_norm": 0.2951386868953705, + "learning_rate": 4.976243472040819e-05, + "loss": 0.2848, + "step": 7822 + }, + { + "epoch": 0.13953198016623267, + "grad_norm": 0.246065154671669, + "learning_rate": 4.9762220603538857e-05, + "loss": 0.2411, + "step": 7823 + }, + { + "epoch": 0.13954981628794635, + "grad_norm": 0.20123934745788574, + "learning_rate": 4.97620063906826e-05, + "loss": 0.1367, + "step": 7824 + }, + { + "epoch": 0.13956765240966004, + "grad_norm": 0.2690652012825012, + "learning_rate": 4.976179208184026e-05, + "loss": 0.2411, + "step": 7825 + }, + { + "epoch": 0.13958548853137373, + "grad_norm": 0.27102038264274597, + "learning_rate": 4.9761577677012664e-05, + "loss": 0.2195, + "step": 7826 + }, + { + "epoch": 0.13960332465308745, + "grad_norm": 0.28155240416526794, + "learning_rate": 4.9761363176200634e-05, + "loss": 0.221, + "step": 7827 + }, + { + "epoch": 0.13962116077480113, + "grad_norm": 0.23990245163440704, + "learning_rate": 4.976114857940501e-05, + "loss": 0.2387, + "step": 7828 + }, + { + "epoch": 0.13963899689651482, + "grad_norm": 0.27962687611579895, + "learning_rate": 4.976093388662662e-05, + "loss": 0.222, + "step": 7829 + }, + { + "epoch": 0.1396568330182285, + "grad_norm": 0.27067622542381287, + "learning_rate": 4.97607190978663e-05, + "loss": 0.2328, + "step": 7830 + }, + { + "epoch": 0.1396746691399422, + "grad_norm": 0.2322358638048172, + "learning_rate": 4.9760504213124884e-05, + "loss": 0.2285, + "step": 7831 + }, + { + "epoch": 0.1396925052616559, + "grad_norm": 0.30207064747810364, + "learning_rate": 4.976028923240319e-05, + "loss": 0.2724, + "step": 7832 + }, + { + "epoch": 0.1397103413833696, + "grad_norm": 0.265759140253067, + "learning_rate": 4.976007415570207e-05, + "loss": 0.241, + "step": 7833 + }, + { + "epoch": 0.1397281775050833, + "grad_norm": 0.2388290911912918, + "learning_rate": 4.9759858983022355e-05, + "loss": 0.2088, + "step": 7834 + }, + { + "epoch": 0.13974601362679698, + "grad_norm": 0.22784021496772766, + "learning_rate": 4.975964371436487e-05, + "loss": 0.1872, + "step": 7835 + }, + { + "epoch": 0.1397638497485107, + "grad_norm": 0.2823319137096405, + "learning_rate": 4.975942834973045e-05, + "loss": 0.2361, + "step": 7836 + }, + { + "epoch": 0.13978168587022438, + "grad_norm": 0.3873385787010193, + "learning_rate": 4.975921288911994e-05, + "loss": 0.2146, + "step": 7837 + }, + { + "epoch": 0.13979952199193807, + "grad_norm": 0.3117041289806366, + "learning_rate": 4.975899733253417e-05, + "loss": 0.2604, + "step": 7838 + }, + { + "epoch": 0.13981735811365176, + "grad_norm": 0.21867047250270844, + "learning_rate": 4.975878167997398e-05, + "loss": 0.2108, + "step": 7839 + }, + { + "epoch": 0.13983519423536547, + "grad_norm": 0.2233334183692932, + "learning_rate": 4.975856593144019e-05, + "loss": 0.1897, + "step": 7840 + }, + { + "epoch": 0.13985303035707916, + "grad_norm": 0.29016876220703125, + "learning_rate": 4.975835008693365e-05, + "loss": 0.2198, + "step": 7841 + }, + { + "epoch": 0.13987086647879285, + "grad_norm": 0.23145824670791626, + "learning_rate": 4.9758134146455195e-05, + "loss": 0.2077, + "step": 7842 + }, + { + "epoch": 0.13988870260050654, + "grad_norm": 0.3653275966644287, + "learning_rate": 4.975791811000566e-05, + "loss": 0.2354, + "step": 7843 + }, + { + "epoch": 0.13990653872222025, + "grad_norm": 0.2752903401851654, + "learning_rate": 4.9757701977585894e-05, + "loss": 0.2082, + "step": 7844 + }, + { + "epoch": 0.13992437484393394, + "grad_norm": 0.32940614223480225, + "learning_rate": 4.975748574919671e-05, + "loss": 0.2427, + "step": 7845 + }, + { + "epoch": 0.13994221096564763, + "grad_norm": 0.3125225603580475, + "learning_rate": 4.975726942483896e-05, + "loss": 0.1942, + "step": 7846 + }, + { + "epoch": 0.13996004708736132, + "grad_norm": 0.31455323100090027, + "learning_rate": 4.975705300451349e-05, + "loss": 0.2438, + "step": 7847 + }, + { + "epoch": 0.13997788320907503, + "grad_norm": 0.46753013134002686, + "learning_rate": 4.975683648822113e-05, + "loss": 0.218, + "step": 7848 + }, + { + "epoch": 0.13999571933078872, + "grad_norm": 0.28788918256759644, + "learning_rate": 4.9756619875962716e-05, + "loss": 0.2348, + "step": 7849 + }, + { + "epoch": 0.1400135554525024, + "grad_norm": 0.21361945569515228, + "learning_rate": 4.9756403167739105e-05, + "loss": 0.1712, + "step": 7850 + }, + { + "epoch": 0.1400313915742161, + "grad_norm": 0.2623341679573059, + "learning_rate": 4.975618636355111e-05, + "loss": 0.2154, + "step": 7851 + }, + { + "epoch": 0.14004922769592978, + "grad_norm": 0.37504589557647705, + "learning_rate": 4.97559694633996e-05, + "loss": 0.2406, + "step": 7852 + }, + { + "epoch": 0.1400670638176435, + "grad_norm": 0.3687654137611389, + "learning_rate": 4.975575246728539e-05, + "loss": 0.203, + "step": 7853 + }, + { + "epoch": 0.1400848999393572, + "grad_norm": 0.3115758001804352, + "learning_rate": 4.975553537520934e-05, + "loss": 0.2691, + "step": 7854 + }, + { + "epoch": 0.14010273606107088, + "grad_norm": 0.254865437746048, + "learning_rate": 4.975531818717228e-05, + "loss": 0.1961, + "step": 7855 + }, + { + "epoch": 0.14012057218278456, + "grad_norm": 0.28733110427856445, + "learning_rate": 4.975510090317506e-05, + "loss": 0.2301, + "step": 7856 + }, + { + "epoch": 0.14013840830449828, + "grad_norm": 0.3322754502296448, + "learning_rate": 4.975488352321852e-05, + "loss": 0.2262, + "step": 7857 + }, + { + "epoch": 0.14015624442621197, + "grad_norm": 0.43327051401138306, + "learning_rate": 4.97546660473035e-05, + "loss": 0.2336, + "step": 7858 + }, + { + "epoch": 0.14017408054792566, + "grad_norm": 0.256805419921875, + "learning_rate": 4.9754448475430835e-05, + "loss": 0.1908, + "step": 7859 + }, + { + "epoch": 0.14019191666963934, + "grad_norm": 0.2885371744632721, + "learning_rate": 4.975423080760139e-05, + "loss": 0.2392, + "step": 7860 + }, + { + "epoch": 0.14020975279135306, + "grad_norm": 0.21325603127479553, + "learning_rate": 4.975401304381599e-05, + "loss": 0.1862, + "step": 7861 + }, + { + "epoch": 0.14022758891306675, + "grad_norm": 0.26652243733406067, + "learning_rate": 4.975379518407549e-05, + "loss": 0.2251, + "step": 7862 + }, + { + "epoch": 0.14024542503478044, + "grad_norm": 0.2744951844215393, + "learning_rate": 4.975357722838073e-05, + "loss": 0.24, + "step": 7863 + }, + { + "epoch": 0.14026326115649412, + "grad_norm": 0.2505877614021301, + "learning_rate": 4.9753359176732555e-05, + "loss": 0.2052, + "step": 7864 + }, + { + "epoch": 0.14028109727820784, + "grad_norm": 0.3701929450035095, + "learning_rate": 4.975314102913181e-05, + "loss": 0.2578, + "step": 7865 + }, + { + "epoch": 0.14029893339992153, + "grad_norm": 0.34019145369529724, + "learning_rate": 4.9752922785579334e-05, + "loss": 0.2317, + "step": 7866 + }, + { + "epoch": 0.14031676952163522, + "grad_norm": 0.23121345043182373, + "learning_rate": 4.975270444607599e-05, + "loss": 0.2122, + "step": 7867 + }, + { + "epoch": 0.1403346056433489, + "grad_norm": 0.20442263782024384, + "learning_rate": 4.975248601062261e-05, + "loss": 0.1995, + "step": 7868 + }, + { + "epoch": 0.1403524417650626, + "grad_norm": 0.30495911836624146, + "learning_rate": 4.975226747922005e-05, + "loss": 0.2616, + "step": 7869 + }, + { + "epoch": 0.1403702778867763, + "grad_norm": 0.3029687702655792, + "learning_rate": 4.975204885186915e-05, + "loss": 0.2401, + "step": 7870 + }, + { + "epoch": 0.14038811400849, + "grad_norm": 0.2812706530094147, + "learning_rate": 4.9751830128570754e-05, + "loss": 0.2493, + "step": 7871 + }, + { + "epoch": 0.14040595013020368, + "grad_norm": 0.2909778356552124, + "learning_rate": 4.9751611309325716e-05, + "loss": 0.2208, + "step": 7872 + }, + { + "epoch": 0.14042378625191737, + "grad_norm": 0.32993146777153015, + "learning_rate": 4.975139239413489e-05, + "loss": 0.2599, + "step": 7873 + }, + { + "epoch": 0.1404416223736311, + "grad_norm": 0.32004687190055847, + "learning_rate": 4.975117338299911e-05, + "loss": 0.2658, + "step": 7874 + }, + { + "epoch": 0.14045945849534477, + "grad_norm": 0.27156245708465576, + "learning_rate": 4.975095427591924e-05, + "loss": 0.2032, + "step": 7875 + }, + { + "epoch": 0.14047729461705846, + "grad_norm": 0.2504098415374756, + "learning_rate": 4.9750735072896117e-05, + "loss": 0.2203, + "step": 7876 + }, + { + "epoch": 0.14049513073877215, + "grad_norm": 0.2697783410549164, + "learning_rate": 4.97505157739306e-05, + "loss": 0.2577, + "step": 7877 + }, + { + "epoch": 0.14051296686048587, + "grad_norm": 0.34915924072265625, + "learning_rate": 4.975029637902353e-05, + "loss": 0.1898, + "step": 7878 + }, + { + "epoch": 0.14053080298219955, + "grad_norm": 0.3016009032726288, + "learning_rate": 4.975007688817577e-05, + "loss": 0.2329, + "step": 7879 + }, + { + "epoch": 0.14054863910391324, + "grad_norm": 0.2796032130718231, + "learning_rate": 4.974985730138816e-05, + "loss": 0.1957, + "step": 7880 + }, + { + "epoch": 0.14056647522562693, + "grad_norm": 0.5127027034759521, + "learning_rate": 4.974963761866156e-05, + "loss": 0.2184, + "step": 7881 + }, + { + "epoch": 0.14058431134734065, + "grad_norm": 0.49412110447883606, + "learning_rate": 4.974941783999681e-05, + "loss": 0.2192, + "step": 7882 + }, + { + "epoch": 0.14060214746905433, + "grad_norm": 0.21453262865543365, + "learning_rate": 4.974919796539477e-05, + "loss": 0.1925, + "step": 7883 + }, + { + "epoch": 0.14061998359076802, + "grad_norm": 0.29166239500045776, + "learning_rate": 4.974897799485629e-05, + "loss": 0.2405, + "step": 7884 + }, + { + "epoch": 0.1406378197124817, + "grad_norm": 0.3181530833244324, + "learning_rate": 4.9748757928382225e-05, + "loss": 0.2747, + "step": 7885 + }, + { + "epoch": 0.14065565583419543, + "grad_norm": 0.27765437960624695, + "learning_rate": 4.974853776597343e-05, + "loss": 0.2084, + "step": 7886 + }, + { + "epoch": 0.1406734919559091, + "grad_norm": 0.24031805992126465, + "learning_rate": 4.974831750763074e-05, + "loss": 0.22, + "step": 7887 + }, + { + "epoch": 0.1406913280776228, + "grad_norm": 0.21924948692321777, + "learning_rate": 4.974809715335504e-05, + "loss": 0.1956, + "step": 7888 + }, + { + "epoch": 0.1407091641993365, + "grad_norm": 0.278427392244339, + "learning_rate": 4.9747876703147155e-05, + "loss": 0.2104, + "step": 7889 + }, + { + "epoch": 0.14072700032105018, + "grad_norm": 0.27412453293800354, + "learning_rate": 4.974765615700796e-05, + "loss": 0.2294, + "step": 7890 + }, + { + "epoch": 0.1407448364427639, + "grad_norm": 0.2860864996910095, + "learning_rate": 4.97474355149383e-05, + "loss": 0.1772, + "step": 7891 + }, + { + "epoch": 0.14076267256447758, + "grad_norm": 0.3259214162826538, + "learning_rate": 4.9747214776939035e-05, + "loss": 0.1863, + "step": 7892 + }, + { + "epoch": 0.14078050868619127, + "grad_norm": 0.3208634555339813, + "learning_rate": 4.9746993943011014e-05, + "loss": 0.2001, + "step": 7893 + }, + { + "epoch": 0.14079834480790496, + "grad_norm": 0.2717551290988922, + "learning_rate": 4.97467730131551e-05, + "loss": 0.1538, + "step": 7894 + }, + { + "epoch": 0.14081618092961867, + "grad_norm": 0.28506383299827576, + "learning_rate": 4.9746551987372146e-05, + "loss": 0.2135, + "step": 7895 + }, + { + "epoch": 0.14083401705133236, + "grad_norm": 0.35007163882255554, + "learning_rate": 4.9746330865663014e-05, + "loss": 0.2224, + "step": 7896 + }, + { + "epoch": 0.14085185317304605, + "grad_norm": 0.2527383267879486, + "learning_rate": 4.974610964802855e-05, + "loss": 0.2415, + "step": 7897 + }, + { + "epoch": 0.14086968929475974, + "grad_norm": 0.30727824568748474, + "learning_rate": 4.974588833446962e-05, + "loss": 0.2302, + "step": 7898 + }, + { + "epoch": 0.14088752541647345, + "grad_norm": 0.3137305974960327, + "learning_rate": 4.974566692498708e-05, + "loss": 0.2473, + "step": 7899 + }, + { + "epoch": 0.14090536153818714, + "grad_norm": 0.267914354801178, + "learning_rate": 4.9745445419581785e-05, + "loss": 0.2019, + "step": 7900 + }, + { + "epoch": 0.14092319765990083, + "grad_norm": 0.31420591473579407, + "learning_rate": 4.9745223818254605e-05, + "loss": 0.2311, + "step": 7901 + }, + { + "epoch": 0.14094103378161452, + "grad_norm": 0.3154907524585724, + "learning_rate": 4.974500212100638e-05, + "loss": 0.2104, + "step": 7902 + }, + { + "epoch": 0.14095886990332823, + "grad_norm": 0.2309712916612625, + "learning_rate": 4.974478032783799e-05, + "loss": 0.1947, + "step": 7903 + }, + { + "epoch": 0.14097670602504192, + "grad_norm": 0.24380679428577423, + "learning_rate": 4.9744558438750276e-05, + "loss": 0.2208, + "step": 7904 + }, + { + "epoch": 0.1409945421467556, + "grad_norm": 0.23298384249210358, + "learning_rate": 4.9744336453744114e-05, + "loss": 0.1853, + "step": 7905 + }, + { + "epoch": 0.1410123782684693, + "grad_norm": 0.28253084421157837, + "learning_rate": 4.974411437282035e-05, + "loss": 0.2184, + "step": 7906 + }, + { + "epoch": 0.141030214390183, + "grad_norm": 0.30858737230300903, + "learning_rate": 4.974389219597986e-05, + "loss": 0.2265, + "step": 7907 + }, + { + "epoch": 0.1410480505118967, + "grad_norm": 0.36670511960983276, + "learning_rate": 4.97436699232235e-05, + "loss": 0.2287, + "step": 7908 + }, + { + "epoch": 0.1410658866336104, + "grad_norm": 0.2474767416715622, + "learning_rate": 4.974344755455212e-05, + "loss": 0.2146, + "step": 7909 + }, + { + "epoch": 0.14108372275532408, + "grad_norm": 0.27468055486679077, + "learning_rate": 4.974322508996659e-05, + "loss": 0.1993, + "step": 7910 + }, + { + "epoch": 0.14110155887703776, + "grad_norm": 0.2542925477027893, + "learning_rate": 4.9743002529467786e-05, + "loss": 0.2377, + "step": 7911 + }, + { + "epoch": 0.14111939499875148, + "grad_norm": 0.5516005158424377, + "learning_rate": 4.974277987305655e-05, + "loss": 0.294, + "step": 7912 + }, + { + "epoch": 0.14113723112046517, + "grad_norm": 0.24601012468338013, + "learning_rate": 4.9742557120733755e-05, + "loss": 0.2118, + "step": 7913 + }, + { + "epoch": 0.14115506724217886, + "grad_norm": 0.2901665270328522, + "learning_rate": 4.9742334272500256e-05, + "loss": 0.2164, + "step": 7914 + }, + { + "epoch": 0.14117290336389254, + "grad_norm": 0.25811460614204407, + "learning_rate": 4.974211132835693e-05, + "loss": 0.2268, + "step": 7915 + }, + { + "epoch": 0.14119073948560626, + "grad_norm": 0.31770452857017517, + "learning_rate": 4.974188828830464e-05, + "loss": 0.22, + "step": 7916 + }, + { + "epoch": 0.14120857560731995, + "grad_norm": 0.2723231315612793, + "learning_rate": 4.974166515234424e-05, + "loss": 0.2514, + "step": 7917 + }, + { + "epoch": 0.14122641172903364, + "grad_norm": 0.30405065417289734, + "learning_rate": 4.9741441920476596e-05, + "loss": 0.2547, + "step": 7918 + }, + { + "epoch": 0.14124424785074732, + "grad_norm": 0.26237574219703674, + "learning_rate": 4.9741218592702584e-05, + "loss": 0.2056, + "step": 7919 + }, + { + "epoch": 0.14126208397246104, + "grad_norm": 0.22551776468753815, + "learning_rate": 4.9740995169023056e-05, + "loss": 0.1652, + "step": 7920 + }, + { + "epoch": 0.14127992009417473, + "grad_norm": 0.2911512553691864, + "learning_rate": 4.9740771649438894e-05, + "loss": 0.2436, + "step": 7921 + }, + { + "epoch": 0.14129775621588841, + "grad_norm": 0.2582857310771942, + "learning_rate": 4.974054803395095e-05, + "loss": 0.2375, + "step": 7922 + }, + { + "epoch": 0.1413155923376021, + "grad_norm": 0.31233006715774536, + "learning_rate": 4.97403243225601e-05, + "loss": 0.2474, + "step": 7923 + }, + { + "epoch": 0.14133342845931582, + "grad_norm": 0.22808755934238434, + "learning_rate": 4.9740100515267206e-05, + "loss": 0.2234, + "step": 7924 + }, + { + "epoch": 0.1413512645810295, + "grad_norm": 0.2825382649898529, + "learning_rate": 4.9739876612073145e-05, + "loss": 0.2038, + "step": 7925 + }, + { + "epoch": 0.1413691007027432, + "grad_norm": 0.34420591592788696, + "learning_rate": 4.973965261297877e-05, + "loss": 0.2381, + "step": 7926 + }, + { + "epoch": 0.14138693682445688, + "grad_norm": 0.2703482508659363, + "learning_rate": 4.973942851798496e-05, + "loss": 0.227, + "step": 7927 + }, + { + "epoch": 0.1414047729461706, + "grad_norm": 0.3449389338493347, + "learning_rate": 4.973920432709257e-05, + "loss": 0.3174, + "step": 7928 + }, + { + "epoch": 0.1414226090678843, + "grad_norm": 0.2196127325296402, + "learning_rate": 4.973898004030249e-05, + "loss": 0.2498, + "step": 7929 + }, + { + "epoch": 0.14144044518959797, + "grad_norm": 0.3597094714641571, + "learning_rate": 4.973875565761558e-05, + "loss": 0.2552, + "step": 7930 + }, + { + "epoch": 0.14145828131131166, + "grad_norm": 0.2061515748500824, + "learning_rate": 4.97385311790327e-05, + "loss": 0.1896, + "step": 7931 + }, + { + "epoch": 0.14147611743302535, + "grad_norm": 0.2318653017282486, + "learning_rate": 4.973830660455474e-05, + "loss": 0.2182, + "step": 7932 + }, + { + "epoch": 0.14149395355473907, + "grad_norm": 0.2656973898410797, + "learning_rate": 4.9738081934182546e-05, + "loss": 0.1992, + "step": 7933 + }, + { + "epoch": 0.14151178967645275, + "grad_norm": 0.1819588840007782, + "learning_rate": 4.973785716791701e-05, + "loss": 0.2088, + "step": 7934 + }, + { + "epoch": 0.14152962579816644, + "grad_norm": 0.2581210434436798, + "learning_rate": 4.9737632305758996e-05, + "loss": 0.1868, + "step": 7935 + }, + { + "epoch": 0.14154746191988013, + "grad_norm": 0.25400230288505554, + "learning_rate": 4.973740734770938e-05, + "loss": 0.1907, + "step": 7936 + }, + { + "epoch": 0.14156529804159385, + "grad_norm": 0.30852246284484863, + "learning_rate": 4.9737182293769017e-05, + "loss": 0.2392, + "step": 7937 + }, + { + "epoch": 0.14158313416330753, + "grad_norm": 0.3192148506641388, + "learning_rate": 4.973695714393879e-05, + "loss": 0.2806, + "step": 7938 + }, + { + "epoch": 0.14160097028502122, + "grad_norm": 0.25666457414627075, + "learning_rate": 4.973673189821959e-05, + "loss": 0.2046, + "step": 7939 + }, + { + "epoch": 0.1416188064067349, + "grad_norm": 0.30235546827316284, + "learning_rate": 4.973650655661226e-05, + "loss": 0.2737, + "step": 7940 + }, + { + "epoch": 0.14163664252844863, + "grad_norm": 0.32992446422576904, + "learning_rate": 4.973628111911769e-05, + "loss": 0.2122, + "step": 7941 + }, + { + "epoch": 0.1416544786501623, + "grad_norm": 0.3310461938381195, + "learning_rate": 4.973605558573675e-05, + "loss": 0.2673, + "step": 7942 + }, + { + "epoch": 0.141672314771876, + "grad_norm": 0.2232588827610016, + "learning_rate": 4.9735829956470314e-05, + "loss": 0.1908, + "step": 7943 + }, + { + "epoch": 0.1416901508935897, + "grad_norm": 0.21040025353431702, + "learning_rate": 4.9735604231319266e-05, + "loss": 0.2036, + "step": 7944 + }, + { + "epoch": 0.1417079870153034, + "grad_norm": 0.30268794298171997, + "learning_rate": 4.973537841028446e-05, + "loss": 0.2791, + "step": 7945 + }, + { + "epoch": 0.1417258231370171, + "grad_norm": 0.3545149862766266, + "learning_rate": 4.9735152493366795e-05, + "loss": 0.2817, + "step": 7946 + }, + { + "epoch": 0.14174365925873078, + "grad_norm": 0.2459118813276291, + "learning_rate": 4.9734926480567126e-05, + "loss": 0.2244, + "step": 7947 + }, + { + "epoch": 0.14176149538044447, + "grad_norm": 0.34598618745803833, + "learning_rate": 4.973470037188634e-05, + "loss": 0.3079, + "step": 7948 + }, + { + "epoch": 0.14177933150215818, + "grad_norm": 0.25589612126350403, + "learning_rate": 4.9734474167325326e-05, + "loss": 0.2909, + "step": 7949 + }, + { + "epoch": 0.14179716762387187, + "grad_norm": 0.26901867985725403, + "learning_rate": 4.973424786688494e-05, + "loss": 0.1569, + "step": 7950 + }, + { + "epoch": 0.14181500374558556, + "grad_norm": 0.24347417056560516, + "learning_rate": 4.973402147056606e-05, + "loss": 0.2038, + "step": 7951 + }, + { + "epoch": 0.14183283986729925, + "grad_norm": 0.2705577611923218, + "learning_rate": 4.973379497836957e-05, + "loss": 0.2513, + "step": 7952 + }, + { + "epoch": 0.14185067598901294, + "grad_norm": 0.2895198166370392, + "learning_rate": 4.9733568390296357e-05, + "loss": 0.1899, + "step": 7953 + }, + { + "epoch": 0.14186851211072665, + "grad_norm": 0.24842306971549988, + "learning_rate": 4.973334170634728e-05, + "loss": 0.2473, + "step": 7954 + }, + { + "epoch": 0.14188634823244034, + "grad_norm": 0.18200337886810303, + "learning_rate": 4.9733114926523233e-05, + "loss": 0.1884, + "step": 7955 + }, + { + "epoch": 0.14190418435415403, + "grad_norm": 0.23742805421352386, + "learning_rate": 4.973288805082509e-05, + "loss": 0.2358, + "step": 7956 + }, + { + "epoch": 0.14192202047586772, + "grad_norm": 0.2698855400085449, + "learning_rate": 4.973266107925373e-05, + "loss": 0.2343, + "step": 7957 + }, + { + "epoch": 0.14193985659758143, + "grad_norm": 0.2611949145793915, + "learning_rate": 4.9732434011810036e-05, + "loss": 0.2233, + "step": 7958 + }, + { + "epoch": 0.14195769271929512, + "grad_norm": 0.29039445519447327, + "learning_rate": 4.973220684849488e-05, + "loss": 0.2236, + "step": 7959 + }, + { + "epoch": 0.1419755288410088, + "grad_norm": 0.2685015797615051, + "learning_rate": 4.973197958930915e-05, + "loss": 0.2361, + "step": 7960 + }, + { + "epoch": 0.1419933649627225, + "grad_norm": 0.2829175293445587, + "learning_rate": 4.9731752234253723e-05, + "loss": 0.2004, + "step": 7961 + }, + { + "epoch": 0.1420112010844362, + "grad_norm": 0.43400296568870544, + "learning_rate": 4.9731524783329485e-05, + "loss": 0.2381, + "step": 7962 + }, + { + "epoch": 0.1420290372061499, + "grad_norm": 0.2866727113723755, + "learning_rate": 4.973129723653732e-05, + "loss": 0.2553, + "step": 7963 + }, + { + "epoch": 0.1420468733278636, + "grad_norm": 0.3433106243610382, + "learning_rate": 4.97310695938781e-05, + "loss": 0.2442, + "step": 7964 + }, + { + "epoch": 0.14206470944957728, + "grad_norm": 0.2611335813999176, + "learning_rate": 4.973084185535271e-05, + "loss": 0.2685, + "step": 7965 + }, + { + "epoch": 0.142082545571291, + "grad_norm": 0.2822869122028351, + "learning_rate": 4.973061402096204e-05, + "loss": 0.2473, + "step": 7966 + }, + { + "epoch": 0.14210038169300468, + "grad_norm": 0.24985238909721375, + "learning_rate": 4.9730386090706965e-05, + "loss": 0.205, + "step": 7967 + }, + { + "epoch": 0.14211821781471837, + "grad_norm": 0.3009773790836334, + "learning_rate": 4.9730158064588375e-05, + "loss": 0.2491, + "step": 7968 + }, + { + "epoch": 0.14213605393643206, + "grad_norm": 0.31487444043159485, + "learning_rate": 4.972992994260714e-05, + "loss": 0.2181, + "step": 7969 + }, + { + "epoch": 0.14215389005814574, + "grad_norm": 0.3135034739971161, + "learning_rate": 4.9729701724764163e-05, + "loss": 0.2353, + "step": 7970 + }, + { + "epoch": 0.14217172617985946, + "grad_norm": 0.28377479314804077, + "learning_rate": 4.972947341106033e-05, + "loss": 0.2243, + "step": 7971 + }, + { + "epoch": 0.14218956230157315, + "grad_norm": 0.2449411153793335, + "learning_rate": 4.9729245001496505e-05, + "loss": 0.2249, + "step": 7972 + }, + { + "epoch": 0.14220739842328683, + "grad_norm": 0.36053651571273804, + "learning_rate": 4.972901649607359e-05, + "loss": 0.2622, + "step": 7973 + }, + { + "epoch": 0.14222523454500052, + "grad_norm": 0.24852022528648376, + "learning_rate": 4.972878789479246e-05, + "loss": 0.2031, + "step": 7974 + }, + { + "epoch": 0.14224307066671424, + "grad_norm": 0.27910172939300537, + "learning_rate": 4.972855919765402e-05, + "loss": 0.2611, + "step": 7975 + }, + { + "epoch": 0.14226090678842793, + "grad_norm": 0.32723069190979004, + "learning_rate": 4.9728330404659126e-05, + "loss": 0.1865, + "step": 7976 + }, + { + "epoch": 0.14227874291014161, + "grad_norm": 0.2713997960090637, + "learning_rate": 4.97281015158087e-05, + "loss": 0.2426, + "step": 7977 + }, + { + "epoch": 0.1422965790318553, + "grad_norm": 0.22855231165885925, + "learning_rate": 4.97278725311036e-05, + "loss": 0.2, + "step": 7978 + }, + { + "epoch": 0.14231441515356902, + "grad_norm": 0.2307584434747696, + "learning_rate": 4.9727643450544725e-05, + "loss": 0.2072, + "step": 7979 + }, + { + "epoch": 0.1423322512752827, + "grad_norm": 0.25341737270355225, + "learning_rate": 4.972741427413297e-05, + "loss": 0.1912, + "step": 7980 + }, + { + "epoch": 0.1423500873969964, + "grad_norm": 0.26412492990493774, + "learning_rate": 4.972718500186921e-05, + "loss": 0.2425, + "step": 7981 + }, + { + "epoch": 0.14236792351871008, + "grad_norm": 0.33935558795928955, + "learning_rate": 4.9726955633754344e-05, + "loss": 0.2557, + "step": 7982 + }, + { + "epoch": 0.1423857596404238, + "grad_norm": 0.240280881524086, + "learning_rate": 4.972672616978926e-05, + "loss": 0.1984, + "step": 7983 + }, + { + "epoch": 0.14240359576213749, + "grad_norm": 0.36089375615119934, + "learning_rate": 4.972649660997485e-05, + "loss": 0.2615, + "step": 7984 + }, + { + "epoch": 0.14242143188385117, + "grad_norm": 0.29607707262039185, + "learning_rate": 4.9726266954311986e-05, + "loss": 0.2109, + "step": 7985 + }, + { + "epoch": 0.14243926800556486, + "grad_norm": 0.24391315877437592, + "learning_rate": 4.972603720280158e-05, + "loss": 0.1981, + "step": 7986 + }, + { + "epoch": 0.14245710412727858, + "grad_norm": 0.23113210499286652, + "learning_rate": 4.972580735544451e-05, + "loss": 0.1997, + "step": 7987 + }, + { + "epoch": 0.14247494024899227, + "grad_norm": 0.3148064613342285, + "learning_rate": 4.9725577412241666e-05, + "loss": 0.2365, + "step": 7988 + }, + { + "epoch": 0.14249277637070595, + "grad_norm": 0.8902646899223328, + "learning_rate": 4.972534737319395e-05, + "loss": 0.2688, + "step": 7989 + }, + { + "epoch": 0.14251061249241964, + "grad_norm": 0.22924846410751343, + "learning_rate": 4.9725117238302246e-05, + "loss": 0.2286, + "step": 7990 + }, + { + "epoch": 0.14252844861413333, + "grad_norm": 0.23769497871398926, + "learning_rate": 4.972488700756745e-05, + "loss": 0.1631, + "step": 7991 + }, + { + "epoch": 0.14254628473584705, + "grad_norm": 0.31016814708709717, + "learning_rate": 4.972465668099045e-05, + "loss": 0.2464, + "step": 7992 + }, + { + "epoch": 0.14256412085756073, + "grad_norm": 0.279687762260437, + "learning_rate": 4.972442625857214e-05, + "loss": 0.2219, + "step": 7993 + }, + { + "epoch": 0.14258195697927442, + "grad_norm": 0.27037692070007324, + "learning_rate": 4.972419574031342e-05, + "loss": 0.2399, + "step": 7994 + }, + { + "epoch": 0.1425997931009881, + "grad_norm": 0.2630898058414459, + "learning_rate": 4.972396512621517e-05, + "loss": 0.2321, + "step": 7995 + }, + { + "epoch": 0.14261762922270182, + "grad_norm": 0.21923020482063293, + "learning_rate": 4.97237344162783e-05, + "loss": 0.1847, + "step": 7996 + }, + { + "epoch": 0.1426354653444155, + "grad_norm": 0.30596184730529785, + "learning_rate": 4.9723503610503693e-05, + "loss": 0.2455, + "step": 7997 + }, + { + "epoch": 0.1426533014661292, + "grad_norm": 0.33181026577949524, + "learning_rate": 4.9723272708892243e-05, + "loss": 0.1955, + "step": 7998 + }, + { + "epoch": 0.1426711375878429, + "grad_norm": 0.42547452449798584, + "learning_rate": 4.972304171144485e-05, + "loss": 0.2604, + "step": 7999 + }, + { + "epoch": 0.1426889737095566, + "grad_norm": 0.3572325110435486, + "learning_rate": 4.972281061816241e-05, + "loss": 0.2575, + "step": 8000 + }, + { + "epoch": 0.1426889737095566, + "eval_loss": 0.21591931581497192, + "eval_runtime": 106.6657, + "eval_samples_per_second": 9.6, + "eval_steps_per_second": 1.603, + "step": 8000 + }, + { + "epoch": 0.1427068098312703, + "grad_norm": 0.1958993822336197, + "learning_rate": 4.9722579429045816e-05, + "loss": 0.2162, + "step": 8001 + }, + { + "epoch": 0.14272464595298398, + "grad_norm": 0.33424729108810425, + "learning_rate": 4.9722348144095966e-05, + "loss": 0.1894, + "step": 8002 + }, + { + "epoch": 0.14274248207469767, + "grad_norm": 0.2643277645111084, + "learning_rate": 4.9722116763313756e-05, + "loss": 0.2133, + "step": 8003 + }, + { + "epoch": 0.14276031819641138, + "grad_norm": 0.33820122480392456, + "learning_rate": 4.972188528670009e-05, + "loss": 0.241, + "step": 8004 + }, + { + "epoch": 0.14277815431812507, + "grad_norm": 0.28121674060821533, + "learning_rate": 4.9721653714255844e-05, + "loss": 0.2235, + "step": 8005 + }, + { + "epoch": 0.14279599043983876, + "grad_norm": 0.31005755066871643, + "learning_rate": 4.972142204598194e-05, + "loss": 0.2206, + "step": 8006 + }, + { + "epoch": 0.14281382656155245, + "grad_norm": 0.25871145725250244, + "learning_rate": 4.9721190281879256e-05, + "loss": 0.1956, + "step": 8007 + }, + { + "epoch": 0.14283166268326616, + "grad_norm": 0.3964315950870514, + "learning_rate": 4.97209584219487e-05, + "loss": 0.2176, + "step": 8008 + }, + { + "epoch": 0.14284949880497985, + "grad_norm": 0.2921893298625946, + "learning_rate": 4.9720726466191184e-05, + "loss": 0.2438, + "step": 8009 + }, + { + "epoch": 0.14286733492669354, + "grad_norm": 0.44086408615112305, + "learning_rate": 4.972049441460759e-05, + "loss": 0.2577, + "step": 8010 + }, + { + "epoch": 0.14288517104840723, + "grad_norm": 0.32219594717025757, + "learning_rate": 4.9720262267198815e-05, + "loss": 0.1987, + "step": 8011 + }, + { + "epoch": 0.14290300717012092, + "grad_norm": 0.2792239785194397, + "learning_rate": 4.972003002396577e-05, + "loss": 0.1722, + "step": 8012 + }, + { + "epoch": 0.14292084329183463, + "grad_norm": 0.18763373792171478, + "learning_rate": 4.971979768490935e-05, + "loss": 0.2042, + "step": 8013 + }, + { + "epoch": 0.14293867941354832, + "grad_norm": 0.2346906065940857, + "learning_rate": 4.971956525003045e-05, + "loss": 0.1704, + "step": 8014 + }, + { + "epoch": 0.142956515535262, + "grad_norm": 0.29033222794532776, + "learning_rate": 4.971933271932999e-05, + "loss": 0.2116, + "step": 8015 + }, + { + "epoch": 0.1429743516569757, + "grad_norm": 0.25404953956604004, + "learning_rate": 4.971910009280885e-05, + "loss": 0.2469, + "step": 8016 + }, + { + "epoch": 0.1429921877786894, + "grad_norm": 0.28456249833106995, + "learning_rate": 4.971886737046794e-05, + "loss": 0.2165, + "step": 8017 + }, + { + "epoch": 0.1430100239004031, + "grad_norm": 0.20949894189834595, + "learning_rate": 4.971863455230816e-05, + "loss": 0.2139, + "step": 8018 + }, + { + "epoch": 0.1430278600221168, + "grad_norm": 0.2785196006298065, + "learning_rate": 4.971840163833042e-05, + "loss": 0.2459, + "step": 8019 + }, + { + "epoch": 0.14304569614383048, + "grad_norm": 0.23433707654476166, + "learning_rate": 4.9718168628535615e-05, + "loss": 0.2369, + "step": 8020 + }, + { + "epoch": 0.1430635322655442, + "grad_norm": 0.289565771818161, + "learning_rate": 4.971793552292465e-05, + "loss": 0.2314, + "step": 8021 + }, + { + "epoch": 0.14308136838725788, + "grad_norm": 0.2865285277366638, + "learning_rate": 4.9717702321498435e-05, + "loss": 0.2114, + "step": 8022 + }, + { + "epoch": 0.14309920450897157, + "grad_norm": 0.248923197388649, + "learning_rate": 4.9717469024257866e-05, + "loss": 0.2595, + "step": 8023 + }, + { + "epoch": 0.14311704063068525, + "grad_norm": 0.23164553940296173, + "learning_rate": 4.971723563120385e-05, + "loss": 0.2634, + "step": 8024 + }, + { + "epoch": 0.14313487675239897, + "grad_norm": 0.3079147934913635, + "learning_rate": 4.971700214233729e-05, + "loss": 0.2527, + "step": 8025 + }, + { + "epoch": 0.14315271287411266, + "grad_norm": 0.21224114298820496, + "learning_rate": 4.97167685576591e-05, + "loss": 0.2148, + "step": 8026 + }, + { + "epoch": 0.14317054899582635, + "grad_norm": 0.4156232476234436, + "learning_rate": 4.9716534877170167e-05, + "loss": 0.2609, + "step": 8027 + }, + { + "epoch": 0.14318838511754003, + "grad_norm": 0.20538099110126495, + "learning_rate": 4.971630110087141e-05, + "loss": 0.1823, + "step": 8028 + }, + { + "epoch": 0.14320622123925375, + "grad_norm": 0.2794420123100281, + "learning_rate": 4.971606722876374e-05, + "loss": 0.2258, + "step": 8029 + }, + { + "epoch": 0.14322405736096744, + "grad_norm": 0.2170548290014267, + "learning_rate": 4.971583326084806e-05, + "loss": 0.1697, + "step": 8030 + }, + { + "epoch": 0.14324189348268113, + "grad_norm": 0.2593088448047638, + "learning_rate": 4.971559919712526e-05, + "loss": 0.2506, + "step": 8031 + }, + { + "epoch": 0.14325972960439481, + "grad_norm": 0.3047870099544525, + "learning_rate": 4.971536503759627e-05, + "loss": 0.2401, + "step": 8032 + }, + { + "epoch": 0.1432775657261085, + "grad_norm": 0.23515376448631287, + "learning_rate": 4.971513078226199e-05, + "loss": 0.2025, + "step": 8033 + }, + { + "epoch": 0.14329540184782222, + "grad_norm": 0.2998775839805603, + "learning_rate": 4.9714896431123314e-05, + "loss": 0.2554, + "step": 8034 + }, + { + "epoch": 0.1433132379695359, + "grad_norm": 0.28107354044914246, + "learning_rate": 4.9714661984181175e-05, + "loss": 0.2352, + "step": 8035 + }, + { + "epoch": 0.1433310740912496, + "grad_norm": 0.28603890538215637, + "learning_rate": 4.971442744143647e-05, + "loss": 0.2584, + "step": 8036 + }, + { + "epoch": 0.14334891021296328, + "grad_norm": 0.2731328010559082, + "learning_rate": 4.97141928028901e-05, + "loss": 0.2455, + "step": 8037 + }, + { + "epoch": 0.143366746334677, + "grad_norm": 0.33348676562309265, + "learning_rate": 4.9713958068542994e-05, + "loss": 0.2973, + "step": 8038 + }, + { + "epoch": 0.14338458245639069, + "grad_norm": 0.3898952603340149, + "learning_rate": 4.971372323839605e-05, + "loss": 0.2013, + "step": 8039 + }, + { + "epoch": 0.14340241857810437, + "grad_norm": 0.22789378464221954, + "learning_rate": 4.971348831245017e-05, + "loss": 0.2235, + "step": 8040 + }, + { + "epoch": 0.14342025469981806, + "grad_norm": 0.3405819833278656, + "learning_rate": 4.971325329070628e-05, + "loss": 0.2106, + "step": 8041 + }, + { + "epoch": 0.14343809082153178, + "grad_norm": 0.3461797833442688, + "learning_rate": 4.9713018173165274e-05, + "loss": 0.2119, + "step": 8042 + }, + { + "epoch": 0.14345592694324547, + "grad_norm": 0.27884143590927124, + "learning_rate": 4.971278295982808e-05, + "loss": 0.2193, + "step": 8043 + }, + { + "epoch": 0.14347376306495915, + "grad_norm": 0.3599877655506134, + "learning_rate": 4.971254765069561e-05, + "loss": 0.2579, + "step": 8044 + }, + { + "epoch": 0.14349159918667284, + "grad_norm": 0.2900587022304535, + "learning_rate": 4.9712312245768766e-05, + "loss": 0.2459, + "step": 8045 + }, + { + "epoch": 0.14350943530838656, + "grad_norm": 0.35157376527786255, + "learning_rate": 4.971207674504846e-05, + "loss": 0.2461, + "step": 8046 + }, + { + "epoch": 0.14352727143010025, + "grad_norm": 0.3435586392879486, + "learning_rate": 4.971184114853562e-05, + "loss": 0.2141, + "step": 8047 + }, + { + "epoch": 0.14354510755181393, + "grad_norm": 0.32010671496391296, + "learning_rate": 4.9711605456231136e-05, + "loss": 0.2176, + "step": 8048 + }, + { + "epoch": 0.14356294367352762, + "grad_norm": 0.3224741518497467, + "learning_rate": 4.9711369668135945e-05, + "loss": 0.2333, + "step": 8049 + }, + { + "epoch": 0.1435807797952413, + "grad_norm": 0.24018892645835876, + "learning_rate": 4.971113378425094e-05, + "loss": 0.2011, + "step": 8050 + }, + { + "epoch": 0.14359861591695502, + "grad_norm": 0.3418002128601074, + "learning_rate": 4.971089780457705e-05, + "loss": 0.2582, + "step": 8051 + }, + { + "epoch": 0.1436164520386687, + "grad_norm": 0.23992377519607544, + "learning_rate": 4.9710661729115185e-05, + "loss": 0.1967, + "step": 8052 + }, + { + "epoch": 0.1436342881603824, + "grad_norm": 0.36270955204963684, + "learning_rate": 4.9710425557866266e-05, + "loss": 0.3147, + "step": 8053 + }, + { + "epoch": 0.1436521242820961, + "grad_norm": 0.23312827944755554, + "learning_rate": 4.97101892908312e-05, + "loss": 0.1954, + "step": 8054 + }, + { + "epoch": 0.1436699604038098, + "grad_norm": 0.2479989379644394, + "learning_rate": 4.97099529280109e-05, + "loss": 0.1936, + "step": 8055 + }, + { + "epoch": 0.1436877965255235, + "grad_norm": 0.4056938886642456, + "learning_rate": 4.9709716469406286e-05, + "loss": 0.2886, + "step": 8056 + }, + { + "epoch": 0.14370563264723718, + "grad_norm": 0.3030484914779663, + "learning_rate": 4.970947991501829e-05, + "loss": 0.2702, + "step": 8057 + }, + { + "epoch": 0.14372346876895087, + "grad_norm": 0.3851870894432068, + "learning_rate": 4.97092432648478e-05, + "loss": 0.2449, + "step": 8058 + }, + { + "epoch": 0.14374130489066458, + "grad_norm": 0.2610694468021393, + "learning_rate": 4.970900651889576e-05, + "loss": 0.2187, + "step": 8059 + }, + { + "epoch": 0.14375914101237827, + "grad_norm": 0.24890683591365814, + "learning_rate": 4.970876967716307e-05, + "loss": 0.2262, + "step": 8060 + }, + { + "epoch": 0.14377697713409196, + "grad_norm": 0.25924643874168396, + "learning_rate": 4.970853273965066e-05, + "loss": 0.2189, + "step": 8061 + }, + { + "epoch": 0.14379481325580565, + "grad_norm": 0.2735474705696106, + "learning_rate": 4.970829570635944e-05, + "loss": 0.2177, + "step": 8062 + }, + { + "epoch": 0.14381264937751936, + "grad_norm": 0.23441441357135773, + "learning_rate": 4.9708058577290336e-05, + "loss": 0.1935, + "step": 8063 + }, + { + "epoch": 0.14383048549923305, + "grad_norm": 0.2908290922641754, + "learning_rate": 4.970782135244426e-05, + "loss": 0.223, + "step": 8064 + }, + { + "epoch": 0.14384832162094674, + "grad_norm": 0.3465617001056671, + "learning_rate": 4.970758403182213e-05, + "loss": 0.2106, + "step": 8065 + }, + { + "epoch": 0.14386615774266043, + "grad_norm": 0.28953370451927185, + "learning_rate": 4.9707346615424876e-05, + "loss": 0.2513, + "step": 8066 + }, + { + "epoch": 0.14388399386437414, + "grad_norm": 0.24587005376815796, + "learning_rate": 4.970710910325341e-05, + "loss": 0.229, + "step": 8067 + }, + { + "epoch": 0.14390182998608783, + "grad_norm": 0.2227289229631424, + "learning_rate": 4.970687149530866e-05, + "loss": 0.2105, + "step": 8068 + }, + { + "epoch": 0.14391966610780152, + "grad_norm": 0.4306078553199768, + "learning_rate": 4.970663379159154e-05, + "loss": 0.2362, + "step": 8069 + }, + { + "epoch": 0.1439375022295152, + "grad_norm": 0.2689933478832245, + "learning_rate": 4.9706395992102974e-05, + "loss": 0.1784, + "step": 8070 + }, + { + "epoch": 0.1439553383512289, + "grad_norm": 0.27545493841171265, + "learning_rate": 4.970615809684388e-05, + "loss": 0.2588, + "step": 8071 + }, + { + "epoch": 0.1439731744729426, + "grad_norm": 0.2528339922428131, + "learning_rate": 4.970592010581519e-05, + "loss": 0.2213, + "step": 8072 + }, + { + "epoch": 0.1439910105946563, + "grad_norm": 0.27702268958091736, + "learning_rate": 4.970568201901781e-05, + "loss": 0.246, + "step": 8073 + }, + { + "epoch": 0.14400884671637, + "grad_norm": 0.23985956609249115, + "learning_rate": 4.9705443836452684e-05, + "loss": 0.2214, + "step": 8074 + }, + { + "epoch": 0.14402668283808367, + "grad_norm": 0.2800622880458832, + "learning_rate": 4.970520555812073e-05, + "loss": 0.2471, + "step": 8075 + }, + { + "epoch": 0.1440445189597974, + "grad_norm": 0.2803814113140106, + "learning_rate": 4.970496718402286e-05, + "loss": 0.2139, + "step": 8076 + }, + { + "epoch": 0.14406235508151108, + "grad_norm": 0.30929356813430786, + "learning_rate": 4.9704728714159995e-05, + "loss": 0.2391, + "step": 8077 + }, + { + "epoch": 0.14408019120322477, + "grad_norm": 0.20972399413585663, + "learning_rate": 4.970449014853308e-05, + "loss": 0.1837, + "step": 8078 + }, + { + "epoch": 0.14409802732493845, + "grad_norm": 0.33398327231407166, + "learning_rate": 4.9704251487143024e-05, + "loss": 0.2284, + "step": 8079 + }, + { + "epoch": 0.14411586344665217, + "grad_norm": 0.24110308289527893, + "learning_rate": 4.970401272999076e-05, + "loss": 0.225, + "step": 8080 + }, + { + "epoch": 0.14413369956836586, + "grad_norm": 0.2563042938709259, + "learning_rate": 4.97037738770772e-05, + "loss": 0.246, + "step": 8081 + }, + { + "epoch": 0.14415153569007955, + "grad_norm": 0.22172963619232178, + "learning_rate": 4.970353492840329e-05, + "loss": 0.2462, + "step": 8082 + }, + { + "epoch": 0.14416937181179323, + "grad_norm": 0.2786707878112793, + "learning_rate": 4.970329588396995e-05, + "loss": 0.187, + "step": 8083 + }, + { + "epoch": 0.14418720793350695, + "grad_norm": 0.35980361700057983, + "learning_rate": 4.9703056743778094e-05, + "loss": 0.2348, + "step": 8084 + }, + { + "epoch": 0.14420504405522064, + "grad_norm": 0.2369632124900818, + "learning_rate": 4.970281750782866e-05, + "loss": 0.1732, + "step": 8085 + }, + { + "epoch": 0.14422288017693433, + "grad_norm": 0.25025275349617004, + "learning_rate": 4.970257817612257e-05, + "loss": 0.2125, + "step": 8086 + }, + { + "epoch": 0.14424071629864801, + "grad_norm": 0.2807694673538208, + "learning_rate": 4.970233874866076e-05, + "loss": 0.2635, + "step": 8087 + }, + { + "epoch": 0.14425855242036173, + "grad_norm": 0.28084516525268555, + "learning_rate": 4.970209922544415e-05, + "loss": 0.2589, + "step": 8088 + }, + { + "epoch": 0.14427638854207542, + "grad_norm": 0.213764950633049, + "learning_rate": 4.970185960647368e-05, + "loss": 0.2026, + "step": 8089 + }, + { + "epoch": 0.1442942246637891, + "grad_norm": 0.20018413662910461, + "learning_rate": 4.970161989175026e-05, + "loss": 0.227, + "step": 8090 + }, + { + "epoch": 0.1443120607855028, + "grad_norm": 0.26697149872779846, + "learning_rate": 4.970138008127484e-05, + "loss": 0.2787, + "step": 8091 + }, + { + "epoch": 0.14432989690721648, + "grad_norm": 0.24063348770141602, + "learning_rate": 4.970114017504833e-05, + "loss": 0.2127, + "step": 8092 + }, + { + "epoch": 0.1443477330289302, + "grad_norm": 0.31096041202545166, + "learning_rate": 4.970090017307167e-05, + "loss": 0.1976, + "step": 8093 + }, + { + "epoch": 0.14436556915064389, + "grad_norm": 0.36545073986053467, + "learning_rate": 4.97006600753458e-05, + "loss": 0.2472, + "step": 8094 + }, + { + "epoch": 0.14438340527235757, + "grad_norm": 0.2593143880367279, + "learning_rate": 4.970041988187163e-05, + "loss": 0.2508, + "step": 8095 + }, + { + "epoch": 0.14440124139407126, + "grad_norm": 0.29695287346839905, + "learning_rate": 4.97001795926501e-05, + "loss": 0.2252, + "step": 8096 + }, + { + "epoch": 0.14441907751578498, + "grad_norm": 0.4915003776550293, + "learning_rate": 4.9699939207682147e-05, + "loss": 0.1764, + "step": 8097 + }, + { + "epoch": 0.14443691363749867, + "grad_norm": 0.2949909567832947, + "learning_rate": 4.96996987269687e-05, + "loss": 0.246, + "step": 8098 + }, + { + "epoch": 0.14445474975921235, + "grad_norm": 0.24923904240131378, + "learning_rate": 4.969945815051068e-05, + "loss": 0.2175, + "step": 8099 + }, + { + "epoch": 0.14447258588092604, + "grad_norm": 0.3403671979904175, + "learning_rate": 4.969921747830904e-05, + "loss": 0.2788, + "step": 8100 + }, + { + "epoch": 0.14449042200263976, + "grad_norm": 0.21889542043209076, + "learning_rate": 4.96989767103647e-05, + "loss": 0.1927, + "step": 8101 + }, + { + "epoch": 0.14450825812435344, + "grad_norm": 0.3265772759914398, + "learning_rate": 4.969873584667859e-05, + "loss": 0.2157, + "step": 8102 + }, + { + "epoch": 0.14452609424606713, + "grad_norm": 0.2654820680618286, + "learning_rate": 4.969849488725166e-05, + "loss": 0.2121, + "step": 8103 + }, + { + "epoch": 0.14454393036778082, + "grad_norm": 0.32656437158584595, + "learning_rate": 4.9698253832084825e-05, + "loss": 0.2258, + "step": 8104 + }, + { + "epoch": 0.14456176648949454, + "grad_norm": 0.358965128660202, + "learning_rate": 4.969801268117903e-05, + "loss": 0.2774, + "step": 8105 + }, + { + "epoch": 0.14457960261120822, + "grad_norm": 0.23094293475151062, + "learning_rate": 4.96977714345352e-05, + "loss": 0.2035, + "step": 8106 + }, + { + "epoch": 0.1445974387329219, + "grad_norm": 0.22926008701324463, + "learning_rate": 4.969753009215429e-05, + "loss": 0.1865, + "step": 8107 + }, + { + "epoch": 0.1446152748546356, + "grad_norm": 0.26557493209838867, + "learning_rate": 4.969728865403722e-05, + "loss": 0.1873, + "step": 8108 + }, + { + "epoch": 0.14463311097634932, + "grad_norm": 0.21748287975788116, + "learning_rate": 4.9697047120184926e-05, + "loss": 0.2328, + "step": 8109 + }, + { + "epoch": 0.144650947098063, + "grad_norm": 0.25423404574394226, + "learning_rate": 4.969680549059834e-05, + "loss": 0.1797, + "step": 8110 + }, + { + "epoch": 0.1446687832197767, + "grad_norm": 0.3078209161758423, + "learning_rate": 4.969656376527842e-05, + "loss": 0.203, + "step": 8111 + }, + { + "epoch": 0.14468661934149038, + "grad_norm": 0.2998126447200775, + "learning_rate": 4.969632194422608e-05, + "loss": 0.2082, + "step": 8112 + }, + { + "epoch": 0.14470445546320407, + "grad_norm": 0.3039228618144989, + "learning_rate": 4.9696080027442274e-05, + "loss": 0.2659, + "step": 8113 + }, + { + "epoch": 0.14472229158491778, + "grad_norm": 0.3542231321334839, + "learning_rate": 4.969583801492792e-05, + "loss": 0.2618, + "step": 8114 + }, + { + "epoch": 0.14474012770663147, + "grad_norm": 0.37329572439193726, + "learning_rate": 4.969559590668398e-05, + "loss": 0.2969, + "step": 8115 + }, + { + "epoch": 0.14475796382834516, + "grad_norm": 0.22694751620292664, + "learning_rate": 4.969535370271138e-05, + "loss": 0.2274, + "step": 8116 + }, + { + "epoch": 0.14477579995005885, + "grad_norm": 0.2826818823814392, + "learning_rate": 4.9695111403011054e-05, + "loss": 0.234, + "step": 8117 + }, + { + "epoch": 0.14479363607177256, + "grad_norm": 0.2796265184879303, + "learning_rate": 4.9694869007583946e-05, + "loss": 0.1865, + "step": 8118 + }, + { + "epoch": 0.14481147219348625, + "grad_norm": 0.2666398286819458, + "learning_rate": 4.9694626516430995e-05, + "loss": 0.2224, + "step": 8119 + }, + { + "epoch": 0.14482930831519994, + "grad_norm": 0.29951611161231995, + "learning_rate": 4.969438392955314e-05, + "loss": 0.2108, + "step": 8120 + }, + { + "epoch": 0.14484714443691363, + "grad_norm": 0.2740534543991089, + "learning_rate": 4.969414124695134e-05, + "loss": 0.2134, + "step": 8121 + }, + { + "epoch": 0.14486498055862734, + "grad_norm": 0.29929760098457336, + "learning_rate": 4.969389846862651e-05, + "loss": 0.2098, + "step": 8122 + }, + { + "epoch": 0.14488281668034103, + "grad_norm": 0.3080112040042877, + "learning_rate": 4.9693655594579594e-05, + "loss": 0.1933, + "step": 8123 + }, + { + "epoch": 0.14490065280205472, + "grad_norm": 0.2736315131187439, + "learning_rate": 4.969341262481155e-05, + "loss": 0.1723, + "step": 8124 + }, + { + "epoch": 0.1449184889237684, + "grad_norm": 0.26007893681526184, + "learning_rate": 4.969316955932331e-05, + "loss": 0.1735, + "step": 8125 + }, + { + "epoch": 0.14493632504548212, + "grad_norm": 0.2956458330154419, + "learning_rate": 4.96929263981158e-05, + "loss": 0.2097, + "step": 8126 + }, + { + "epoch": 0.1449541611671958, + "grad_norm": 0.2500307559967041, + "learning_rate": 4.969268314119e-05, + "loss": 0.2197, + "step": 8127 + }, + { + "epoch": 0.1449719972889095, + "grad_norm": 0.3087429106235504, + "learning_rate": 4.969243978854682e-05, + "loss": 0.2027, + "step": 8128 + }, + { + "epoch": 0.1449898334106232, + "grad_norm": 0.29210174083709717, + "learning_rate": 4.9692196340187216e-05, + "loss": 0.2017, + "step": 8129 + }, + { + "epoch": 0.1450076695323369, + "grad_norm": 0.27657708525657654, + "learning_rate": 4.969195279611213e-05, + "loss": 0.2153, + "step": 8130 + }, + { + "epoch": 0.1450255056540506, + "grad_norm": 0.3296692967414856, + "learning_rate": 4.969170915632251e-05, + "loss": 0.2376, + "step": 8131 + }, + { + "epoch": 0.14504334177576428, + "grad_norm": 0.2788555920124054, + "learning_rate": 4.969146542081929e-05, + "loss": 0.2657, + "step": 8132 + }, + { + "epoch": 0.14506117789747797, + "grad_norm": 0.3158535659313202, + "learning_rate": 4.969122158960343e-05, + "loss": 0.2651, + "step": 8133 + }, + { + "epoch": 0.14507901401919165, + "grad_norm": 0.23868532478809357, + "learning_rate": 4.969097766267587e-05, + "loss": 0.2108, + "step": 8134 + }, + { + "epoch": 0.14509685014090537, + "grad_norm": 0.46661341190338135, + "learning_rate": 4.969073364003754e-05, + "loss": 0.1925, + "step": 8135 + }, + { + "epoch": 0.14511468626261906, + "grad_norm": 0.23679344356060028, + "learning_rate": 4.9690489521689406e-05, + "loss": 0.237, + "step": 8136 + }, + { + "epoch": 0.14513252238433275, + "grad_norm": 0.2594318091869354, + "learning_rate": 4.96902453076324e-05, + "loss": 0.2708, + "step": 8137 + }, + { + "epoch": 0.14515035850604643, + "grad_norm": 0.34159931540489197, + "learning_rate": 4.9690000997867486e-05, + "loss": 0.261, + "step": 8138 + }, + { + "epoch": 0.14516819462776015, + "grad_norm": 0.32327941060066223, + "learning_rate": 4.96897565923956e-05, + "loss": 0.2556, + "step": 8139 + }, + { + "epoch": 0.14518603074947384, + "grad_norm": 0.5102737545967102, + "learning_rate": 4.968951209121768e-05, + "loss": 0.3226, + "step": 8140 + }, + { + "epoch": 0.14520386687118753, + "grad_norm": 0.24807362258434296, + "learning_rate": 4.968926749433469e-05, + "loss": 0.2064, + "step": 8141 + }, + { + "epoch": 0.1452217029929012, + "grad_norm": 0.2608893811702728, + "learning_rate": 4.9689022801747576e-05, + "loss": 0.2039, + "step": 8142 + }, + { + "epoch": 0.14523953911461493, + "grad_norm": 0.2564334571361542, + "learning_rate": 4.968877801345727e-05, + "loss": 0.2266, + "step": 8143 + }, + { + "epoch": 0.14525737523632862, + "grad_norm": 0.27542316913604736, + "learning_rate": 4.9688533129464745e-05, + "loss": 0.2699, + "step": 8144 + }, + { + "epoch": 0.1452752113580423, + "grad_norm": 0.2978516221046448, + "learning_rate": 4.968828814977093e-05, + "loss": 0.2244, + "step": 8145 + }, + { + "epoch": 0.145293047479756, + "grad_norm": 0.29370105266571045, + "learning_rate": 4.968804307437679e-05, + "loss": 0.2232, + "step": 8146 + }, + { + "epoch": 0.1453108836014697, + "grad_norm": 0.21740269660949707, + "learning_rate": 4.968779790328326e-05, + "loss": 0.1728, + "step": 8147 + }, + { + "epoch": 0.1453287197231834, + "grad_norm": 0.3440368175506592, + "learning_rate": 4.968755263649131e-05, + "loss": 0.2197, + "step": 8148 + }, + { + "epoch": 0.14534655584489709, + "grad_norm": 0.32539790868759155, + "learning_rate": 4.968730727400187e-05, + "loss": 0.2727, + "step": 8149 + }, + { + "epoch": 0.14536439196661077, + "grad_norm": 0.2705410420894623, + "learning_rate": 4.968706181581591e-05, + "loss": 0.2623, + "step": 8150 + }, + { + "epoch": 0.14538222808832446, + "grad_norm": 0.23159928619861603, + "learning_rate": 4.968681626193436e-05, + "loss": 0.1949, + "step": 8151 + }, + { + "epoch": 0.14540006421003818, + "grad_norm": 0.2809099555015564, + "learning_rate": 4.9686570612358184e-05, + "loss": 0.2459, + "step": 8152 + }, + { + "epoch": 0.14541790033175186, + "grad_norm": 0.37867191433906555, + "learning_rate": 4.968632486708834e-05, + "loss": 0.2524, + "step": 8153 + }, + { + "epoch": 0.14543573645346555, + "grad_norm": 0.1915334165096283, + "learning_rate": 4.968607902612578e-05, + "loss": 0.1841, + "step": 8154 + }, + { + "epoch": 0.14545357257517924, + "grad_norm": 0.2534467577934265, + "learning_rate": 4.968583308947144e-05, + "loss": 0.228, + "step": 8155 + }, + { + "epoch": 0.14547140869689296, + "grad_norm": 0.2592531740665436, + "learning_rate": 4.968558705712629e-05, + "loss": 0.215, + "step": 8156 + }, + { + "epoch": 0.14548924481860664, + "grad_norm": 0.23502813279628754, + "learning_rate": 4.9685340929091276e-05, + "loss": 0.1788, + "step": 8157 + }, + { + "epoch": 0.14550708094032033, + "grad_norm": 0.2555530369281769, + "learning_rate": 4.968509470536735e-05, + "loss": 0.2124, + "step": 8158 + }, + { + "epoch": 0.14552491706203402, + "grad_norm": 0.40521240234375, + "learning_rate": 4.968484838595548e-05, + "loss": 0.1925, + "step": 8159 + }, + { + "epoch": 0.14554275318374774, + "grad_norm": 0.2218252420425415, + "learning_rate": 4.9684601970856606e-05, + "loss": 0.2149, + "step": 8160 + }, + { + "epoch": 0.14556058930546142, + "grad_norm": 0.32937249541282654, + "learning_rate": 4.96843554600717e-05, + "loss": 0.2082, + "step": 8161 + }, + { + "epoch": 0.1455784254271751, + "grad_norm": 0.23088762164115906, + "learning_rate": 4.96841088536017e-05, + "loss": 0.1849, + "step": 8162 + }, + { + "epoch": 0.1455962615488888, + "grad_norm": 0.2395429164171219, + "learning_rate": 4.9683862151447556e-05, + "loss": 0.1946, + "step": 8163 + }, + { + "epoch": 0.14561409767060252, + "grad_norm": 0.2747701108455658, + "learning_rate": 4.968361535361025e-05, + "loss": 0.1932, + "step": 8164 + }, + { + "epoch": 0.1456319337923162, + "grad_norm": 0.36645421385765076, + "learning_rate": 4.9683368460090725e-05, + "loss": 0.25, + "step": 8165 + }, + { + "epoch": 0.1456497699140299, + "grad_norm": 0.22646576166152954, + "learning_rate": 4.968312147088994e-05, + "loss": 0.2238, + "step": 8166 + }, + { + "epoch": 0.14566760603574358, + "grad_norm": 0.2774210274219513, + "learning_rate": 4.968287438600884e-05, + "loss": 0.2193, + "step": 8167 + }, + { + "epoch": 0.1456854421574573, + "grad_norm": 0.2596759498119354, + "learning_rate": 4.96826272054484e-05, + "loss": 0.1979, + "step": 8168 + }, + { + "epoch": 0.14570327827917098, + "grad_norm": 0.3014243245124817, + "learning_rate": 4.9682379929209576e-05, + "loss": 0.2679, + "step": 8169 + }, + { + "epoch": 0.14572111440088467, + "grad_norm": 0.22725193202495575, + "learning_rate": 4.968213255729332e-05, + "loss": 0.2007, + "step": 8170 + }, + { + "epoch": 0.14573895052259836, + "grad_norm": 0.2004374861717224, + "learning_rate": 4.968188508970059e-05, + "loss": 0.1861, + "step": 8171 + }, + { + "epoch": 0.14575678664431205, + "grad_norm": 0.27653491497039795, + "learning_rate": 4.968163752643235e-05, + "loss": 0.2265, + "step": 8172 + }, + { + "epoch": 0.14577462276602576, + "grad_norm": 0.24558009207248688, + "learning_rate": 4.968138986748956e-05, + "loss": 0.2089, + "step": 8173 + }, + { + "epoch": 0.14579245888773945, + "grad_norm": 0.23888957500457764, + "learning_rate": 4.968114211287318e-05, + "loss": 0.2124, + "step": 8174 + }, + { + "epoch": 0.14581029500945314, + "grad_norm": 0.27592718601226807, + "learning_rate": 4.968089426258417e-05, + "loss": 0.2321, + "step": 8175 + }, + { + "epoch": 0.14582813113116683, + "grad_norm": 0.262919157743454, + "learning_rate": 4.968064631662348e-05, + "loss": 0.2093, + "step": 8176 + }, + { + "epoch": 0.14584596725288054, + "grad_norm": 0.3246628940105438, + "learning_rate": 4.9680398274992086e-05, + "loss": 0.2108, + "step": 8177 + }, + { + "epoch": 0.14586380337459423, + "grad_norm": 0.22410131990909576, + "learning_rate": 4.968015013769094e-05, + "loss": 0.2301, + "step": 8178 + }, + { + "epoch": 0.14588163949630792, + "grad_norm": 0.23250728845596313, + "learning_rate": 4.9679901904721015e-05, + "loss": 0.1826, + "step": 8179 + }, + { + "epoch": 0.1458994756180216, + "grad_norm": 0.29271814227104187, + "learning_rate": 4.9679653576083265e-05, + "loss": 0.2291, + "step": 8180 + }, + { + "epoch": 0.14591731173973532, + "grad_norm": 0.3109009861946106, + "learning_rate": 4.9679405151778646e-05, + "loss": 0.259, + "step": 8181 + }, + { + "epoch": 0.145935147861449, + "grad_norm": 0.23242051899433136, + "learning_rate": 4.967915663180813e-05, + "loss": 0.2082, + "step": 8182 + }, + { + "epoch": 0.1459529839831627, + "grad_norm": 0.3276471197605133, + "learning_rate": 4.967890801617269e-05, + "loss": 0.1778, + "step": 8183 + }, + { + "epoch": 0.1459708201048764, + "grad_norm": 0.21032224595546722, + "learning_rate": 4.9678659304873264e-05, + "loss": 0.2045, + "step": 8184 + }, + { + "epoch": 0.1459886562265901, + "grad_norm": 0.2781781256198883, + "learning_rate": 4.9678410497910845e-05, + "loss": 0.2736, + "step": 8185 + }, + { + "epoch": 0.1460064923483038, + "grad_norm": 0.32890549302101135, + "learning_rate": 4.9678161595286375e-05, + "loss": 0.2146, + "step": 8186 + }, + { + "epoch": 0.14602432847001748, + "grad_norm": 0.3681790828704834, + "learning_rate": 4.967791259700082e-05, + "loss": 0.1741, + "step": 8187 + }, + { + "epoch": 0.14604216459173117, + "grad_norm": 0.32718828320503235, + "learning_rate": 4.9677663503055164e-05, + "loss": 0.2018, + "step": 8188 + }, + { + "epoch": 0.14606000071344488, + "grad_norm": 0.3910665810108185, + "learning_rate": 4.967741431345036e-05, + "loss": 0.2411, + "step": 8189 + }, + { + "epoch": 0.14607783683515857, + "grad_norm": 0.3346827030181885, + "learning_rate": 4.967716502818737e-05, + "loss": 0.2197, + "step": 8190 + }, + { + "epoch": 0.14609567295687226, + "grad_norm": 0.24446497857570648, + "learning_rate": 4.967691564726717e-05, + "loss": 0.1766, + "step": 8191 + }, + { + "epoch": 0.14611350907858595, + "grad_norm": 0.2619584798812866, + "learning_rate": 4.9676666170690724e-05, + "loss": 0.2152, + "step": 8192 + }, + { + "epoch": 0.14613134520029963, + "grad_norm": 0.2953864634037018, + "learning_rate": 4.967641659845899e-05, + "loss": 0.2223, + "step": 8193 + }, + { + "epoch": 0.14614918132201335, + "grad_norm": 0.28771674633026123, + "learning_rate": 4.9676166930572945e-05, + "loss": 0.2306, + "step": 8194 + }, + { + "epoch": 0.14616701744372704, + "grad_norm": 0.32447558641433716, + "learning_rate": 4.967591716703355e-05, + "loss": 0.216, + "step": 8195 + }, + { + "epoch": 0.14618485356544073, + "grad_norm": 0.3141137659549713, + "learning_rate": 4.9675667307841784e-05, + "loss": 0.1908, + "step": 8196 + }, + { + "epoch": 0.1462026896871544, + "grad_norm": 0.20078648626804352, + "learning_rate": 4.9675417352998604e-05, + "loss": 0.1952, + "step": 8197 + }, + { + "epoch": 0.14622052580886813, + "grad_norm": 0.28396427631378174, + "learning_rate": 4.967516730250499e-05, + "loss": 0.2495, + "step": 8198 + }, + { + "epoch": 0.14623836193058182, + "grad_norm": 0.41971689462661743, + "learning_rate": 4.96749171563619e-05, + "loss": 0.2914, + "step": 8199 + }, + { + "epoch": 0.1462561980522955, + "grad_norm": 0.19014528393745422, + "learning_rate": 4.967466691457031e-05, + "loss": 0.193, + "step": 8200 + }, + { + "epoch": 0.1462740341740092, + "grad_norm": 0.2847881019115448, + "learning_rate": 4.967441657713118e-05, + "loss": 0.2313, + "step": 8201 + }, + { + "epoch": 0.1462918702957229, + "grad_norm": 0.32855483889579773, + "learning_rate": 4.9674166144045495e-05, + "loss": 0.2555, + "step": 8202 + }, + { + "epoch": 0.1463097064174366, + "grad_norm": 0.2652437686920166, + "learning_rate": 4.967391561531422e-05, + "loss": 0.2204, + "step": 8203 + }, + { + "epoch": 0.14632754253915028, + "grad_norm": 0.2823733389377594, + "learning_rate": 4.967366499093834e-05, + "loss": 0.1949, + "step": 8204 + }, + { + "epoch": 0.14634537866086397, + "grad_norm": 0.28654277324676514, + "learning_rate": 4.9673414270918795e-05, + "loss": 0.2389, + "step": 8205 + }, + { + "epoch": 0.1463632147825777, + "grad_norm": 0.38240379095077515, + "learning_rate": 4.9673163455256576e-05, + "loss": 0.259, + "step": 8206 + }, + { + "epoch": 0.14638105090429138, + "grad_norm": 0.2780166268348694, + "learning_rate": 4.967291254395266e-05, + "loss": 0.1985, + "step": 8207 + }, + { + "epoch": 0.14639888702600506, + "grad_norm": 0.26061293482780457, + "learning_rate": 4.9672661537008016e-05, + "loss": 0.2017, + "step": 8208 + }, + { + "epoch": 0.14641672314771875, + "grad_norm": 0.35266759991645813, + "learning_rate": 4.9672410434423605e-05, + "loss": 0.214, + "step": 8209 + }, + { + "epoch": 0.14643455926943247, + "grad_norm": 0.28910812735557556, + "learning_rate": 4.967215923620041e-05, + "loss": 0.2458, + "step": 8210 + }, + { + "epoch": 0.14645239539114616, + "grad_norm": 0.24122272431850433, + "learning_rate": 4.9671907942339414e-05, + "loss": 0.1887, + "step": 8211 + }, + { + "epoch": 0.14647023151285984, + "grad_norm": 0.2747018039226532, + "learning_rate": 4.967165655284157e-05, + "loss": 0.2361, + "step": 8212 + }, + { + "epoch": 0.14648806763457353, + "grad_norm": 0.29957759380340576, + "learning_rate": 4.967140506770787e-05, + "loss": 0.2312, + "step": 8213 + }, + { + "epoch": 0.14650590375628722, + "grad_norm": 0.3302885890007019, + "learning_rate": 4.967115348693928e-05, + "loss": 0.2114, + "step": 8214 + }, + { + "epoch": 0.14652373987800094, + "grad_norm": 0.6019874215126038, + "learning_rate": 4.9670901810536786e-05, + "loss": 0.1988, + "step": 8215 + }, + { + "epoch": 0.14654157599971462, + "grad_norm": 0.22647647559642792, + "learning_rate": 4.967065003850134e-05, + "loss": 0.2315, + "step": 8216 + }, + { + "epoch": 0.1465594121214283, + "grad_norm": 0.37151867151260376, + "learning_rate": 4.967039817083395e-05, + "loss": 0.207, + "step": 8217 + }, + { + "epoch": 0.146577248243142, + "grad_norm": 0.4023069739341736, + "learning_rate": 4.9670146207535564e-05, + "loss": 0.1972, + "step": 8218 + }, + { + "epoch": 0.14659508436485572, + "grad_norm": 0.2935321629047394, + "learning_rate": 4.9669894148607174e-05, + "loss": 0.2346, + "step": 8219 + }, + { + "epoch": 0.1466129204865694, + "grad_norm": 0.3261707127094269, + "learning_rate": 4.9669641994049754e-05, + "loss": 0.2257, + "step": 8220 + }, + { + "epoch": 0.1466307566082831, + "grad_norm": 0.26666054129600525, + "learning_rate": 4.9669389743864286e-05, + "loss": 0.195, + "step": 8221 + }, + { + "epoch": 0.14664859272999678, + "grad_norm": 0.2738647758960724, + "learning_rate": 4.966913739805173e-05, + "loss": 0.2008, + "step": 8222 + }, + { + "epoch": 0.1466664288517105, + "grad_norm": 0.2955527603626251, + "learning_rate": 4.9668884956613096e-05, + "loss": 0.2421, + "step": 8223 + }, + { + "epoch": 0.14668426497342418, + "grad_norm": 0.24047254025936127, + "learning_rate": 4.966863241954933e-05, + "loss": 0.2012, + "step": 8224 + }, + { + "epoch": 0.14670210109513787, + "grad_norm": 0.23851563036441803, + "learning_rate": 4.966837978686143e-05, + "loss": 0.2016, + "step": 8225 + }, + { + "epoch": 0.14671993721685156, + "grad_norm": 0.28972363471984863, + "learning_rate": 4.966812705855036e-05, + "loss": 0.2711, + "step": 8226 + }, + { + "epoch": 0.14673777333856527, + "grad_norm": 0.23951609432697296, + "learning_rate": 4.9667874234617116e-05, + "loss": 0.1954, + "step": 8227 + }, + { + "epoch": 0.14675560946027896, + "grad_norm": 0.24366194009780884, + "learning_rate": 4.9667621315062676e-05, + "loss": 0.2251, + "step": 8228 + }, + { + "epoch": 0.14677344558199265, + "grad_norm": 0.4206940233707428, + "learning_rate": 4.9667368299888014e-05, + "loss": 0.2047, + "step": 8229 + }, + { + "epoch": 0.14679128170370634, + "grad_norm": 0.3422740399837494, + "learning_rate": 4.966711518909411e-05, + "loss": 0.2524, + "step": 8230 + }, + { + "epoch": 0.14680911782542003, + "grad_norm": 0.41899481415748596, + "learning_rate": 4.9666861982681944e-05, + "loss": 0.2174, + "step": 8231 + }, + { + "epoch": 0.14682695394713374, + "grad_norm": 0.31520822644233704, + "learning_rate": 4.966660868065251e-05, + "loss": 0.2663, + "step": 8232 + }, + { + "epoch": 0.14684479006884743, + "grad_norm": 0.29547446966171265, + "learning_rate": 4.966635528300677e-05, + "loss": 0.2506, + "step": 8233 + }, + { + "epoch": 0.14686262619056112, + "grad_norm": 0.2587685286998749, + "learning_rate": 4.9666101789745735e-05, + "loss": 0.1836, + "step": 8234 + }, + { + "epoch": 0.1468804623122748, + "grad_norm": 0.257411926984787, + "learning_rate": 4.966584820087036e-05, + "loss": 0.2155, + "step": 8235 + }, + { + "epoch": 0.14689829843398852, + "grad_norm": 0.299124151468277, + "learning_rate": 4.9665594516381633e-05, + "loss": 0.1628, + "step": 8236 + }, + { + "epoch": 0.1469161345557022, + "grad_norm": 0.27042296528816223, + "learning_rate": 4.966534073628054e-05, + "loss": 0.2252, + "step": 8237 + }, + { + "epoch": 0.1469339706774159, + "grad_norm": 0.27317970991134644, + "learning_rate": 4.9665086860568075e-05, + "loss": 0.2202, + "step": 8238 + }, + { + "epoch": 0.1469518067991296, + "grad_norm": 0.26989591121673584, + "learning_rate": 4.9664832889245214e-05, + "loss": 0.1972, + "step": 8239 + }, + { + "epoch": 0.1469696429208433, + "grad_norm": 0.24562902748584747, + "learning_rate": 4.9664578822312947e-05, + "loss": 0.2044, + "step": 8240 + }, + { + "epoch": 0.146987479042557, + "grad_norm": 0.3071196973323822, + "learning_rate": 4.966432465977224e-05, + "loss": 0.2394, + "step": 8241 + }, + { + "epoch": 0.14700531516427068, + "grad_norm": 0.28153204917907715, + "learning_rate": 4.966407040162411e-05, + "loss": 0.157, + "step": 8242 + }, + { + "epoch": 0.14702315128598437, + "grad_norm": 0.3015204966068268, + "learning_rate": 4.9663816047869505e-05, + "loss": 0.1976, + "step": 8243 + }, + { + "epoch": 0.14704098740769808, + "grad_norm": 0.35817933082580566, + "learning_rate": 4.9663561598509445e-05, + "loss": 0.2379, + "step": 8244 + }, + { + "epoch": 0.14705882352941177, + "grad_norm": 0.4818767309188843, + "learning_rate": 4.966330705354489e-05, + "loss": 0.2459, + "step": 8245 + }, + { + "epoch": 0.14707665965112546, + "grad_norm": 0.29964667558670044, + "learning_rate": 4.966305241297685e-05, + "loss": 0.2077, + "step": 8246 + }, + { + "epoch": 0.14709449577283915, + "grad_norm": 0.2904871106147766, + "learning_rate": 4.9662797676806294e-05, + "loss": 0.1811, + "step": 8247 + }, + { + "epoch": 0.14711233189455286, + "grad_norm": 0.40816056728363037, + "learning_rate": 4.9662542845034216e-05, + "loss": 0.1932, + "step": 8248 + }, + { + "epoch": 0.14713016801626655, + "grad_norm": 0.2841852307319641, + "learning_rate": 4.96622879176616e-05, + "loss": 0.1828, + "step": 8249 + }, + { + "epoch": 0.14714800413798024, + "grad_norm": 0.26207995414733887, + "learning_rate": 4.966203289468945e-05, + "loss": 0.2171, + "step": 8250 + }, + { + "epoch": 0.14716584025969393, + "grad_norm": 0.29089948534965515, + "learning_rate": 4.9661777776118736e-05, + "loss": 0.2864, + "step": 8251 + }, + { + "epoch": 0.1471836763814076, + "grad_norm": 0.265689492225647, + "learning_rate": 4.966152256195045e-05, + "loss": 0.2163, + "step": 8252 + }, + { + "epoch": 0.14720151250312133, + "grad_norm": 0.3602275550365448, + "learning_rate": 4.966126725218558e-05, + "loss": 0.2683, + "step": 8253 + }, + { + "epoch": 0.14721934862483502, + "grad_norm": 0.24762341380119324, + "learning_rate": 4.9661011846825134e-05, + "loss": 0.1996, + "step": 8254 + }, + { + "epoch": 0.1472371847465487, + "grad_norm": 0.23596131801605225, + "learning_rate": 4.966075634587008e-05, + "loss": 0.1925, + "step": 8255 + }, + { + "epoch": 0.1472550208682624, + "grad_norm": 0.2957203984260559, + "learning_rate": 4.966050074932141e-05, + "loss": 0.2074, + "step": 8256 + }, + { + "epoch": 0.1472728569899761, + "grad_norm": 0.2252059429883957, + "learning_rate": 4.966024505718014e-05, + "loss": 0.1966, + "step": 8257 + }, + { + "epoch": 0.1472906931116898, + "grad_norm": 0.2354527711868286, + "learning_rate": 4.965998926944723e-05, + "loss": 0.1968, + "step": 8258 + }, + { + "epoch": 0.14730852923340348, + "grad_norm": 0.26098114252090454, + "learning_rate": 4.9659733386123694e-05, + "loss": 0.2569, + "step": 8259 + }, + { + "epoch": 0.14732636535511717, + "grad_norm": 0.2737635672092438, + "learning_rate": 4.9659477407210505e-05, + "loss": 0.2287, + "step": 8260 + }, + { + "epoch": 0.1473442014768309, + "grad_norm": 0.3221443295478821, + "learning_rate": 4.9659221332708665e-05, + "loss": 0.1841, + "step": 8261 + }, + { + "epoch": 0.14736203759854458, + "grad_norm": 0.3214664161205292, + "learning_rate": 4.965896516261917e-05, + "loss": 0.1934, + "step": 8262 + }, + { + "epoch": 0.14737987372025826, + "grad_norm": 0.38391464948654175, + "learning_rate": 4.965870889694301e-05, + "loss": 0.244, + "step": 8263 + }, + { + "epoch": 0.14739770984197195, + "grad_norm": 0.2779027819633484, + "learning_rate": 4.9658452535681174e-05, + "loss": 0.213, + "step": 8264 + }, + { + "epoch": 0.14741554596368567, + "grad_norm": 0.26770997047424316, + "learning_rate": 4.965819607883466e-05, + "loss": 0.1298, + "step": 8265 + }, + { + "epoch": 0.14743338208539936, + "grad_norm": 0.37342795729637146, + "learning_rate": 4.965793952640446e-05, + "loss": 0.2392, + "step": 8266 + }, + { + "epoch": 0.14745121820711304, + "grad_norm": 0.32738959789276123, + "learning_rate": 4.965768287839157e-05, + "loss": 0.2381, + "step": 8267 + }, + { + "epoch": 0.14746905432882673, + "grad_norm": 0.241044819355011, + "learning_rate": 4.965742613479699e-05, + "loss": 0.2188, + "step": 8268 + }, + { + "epoch": 0.14748689045054045, + "grad_norm": 0.25374168157577515, + "learning_rate": 4.96571692956217e-05, + "loss": 0.2453, + "step": 8269 + }, + { + "epoch": 0.14750472657225414, + "grad_norm": 0.1898292899131775, + "learning_rate": 4.9656912360866713e-05, + "loss": 0.1853, + "step": 8270 + }, + { + "epoch": 0.14752256269396782, + "grad_norm": 0.2697450518608093, + "learning_rate": 4.965665533053302e-05, + "loss": 0.1873, + "step": 8271 + }, + { + "epoch": 0.1475403988156815, + "grad_norm": 0.32905855774879456, + "learning_rate": 4.965639820462161e-05, + "loss": 0.2453, + "step": 8272 + }, + { + "epoch": 0.1475582349373952, + "grad_norm": 0.38826000690460205, + "learning_rate": 4.965614098313348e-05, + "loss": 0.2626, + "step": 8273 + }, + { + "epoch": 0.14757607105910892, + "grad_norm": 0.2632378339767456, + "learning_rate": 4.965588366606964e-05, + "loss": 0.2363, + "step": 8274 + }, + { + "epoch": 0.1475939071808226, + "grad_norm": 0.27714109420776367, + "learning_rate": 4.9655626253431076e-05, + "loss": 0.188, + "step": 8275 + }, + { + "epoch": 0.1476117433025363, + "grad_norm": 0.2937466502189636, + "learning_rate": 4.9655368745218785e-05, + "loss": 0.2229, + "step": 8276 + }, + { + "epoch": 0.14762957942424998, + "grad_norm": 0.32056376338005066, + "learning_rate": 4.9655111141433775e-05, + "loss": 0.2606, + "step": 8277 + }, + { + "epoch": 0.1476474155459637, + "grad_norm": 0.26838433742523193, + "learning_rate": 4.9654853442077035e-05, + "loss": 0.2063, + "step": 8278 + }, + { + "epoch": 0.14766525166767738, + "grad_norm": 0.35123705863952637, + "learning_rate": 4.9654595647149574e-05, + "loss": 0.2585, + "step": 8279 + }, + { + "epoch": 0.14768308778939107, + "grad_norm": 0.34772756695747375, + "learning_rate": 4.9654337756652375e-05, + "loss": 0.2656, + "step": 8280 + }, + { + "epoch": 0.14770092391110476, + "grad_norm": 0.3487825393676758, + "learning_rate": 4.9654079770586454e-05, + "loss": 0.2803, + "step": 8281 + }, + { + "epoch": 0.14771876003281847, + "grad_norm": 0.23886552453041077, + "learning_rate": 4.965382168895281e-05, + "loss": 0.2179, + "step": 8282 + }, + { + "epoch": 0.14773659615453216, + "grad_norm": 0.35290834307670593, + "learning_rate": 4.965356351175242e-05, + "loss": 0.2081, + "step": 8283 + }, + { + "epoch": 0.14775443227624585, + "grad_norm": 0.2803749144077301, + "learning_rate": 4.9653305238986316e-05, + "loss": 0.2845, + "step": 8284 + }, + { + "epoch": 0.14777226839795954, + "grad_norm": 0.4249259829521179, + "learning_rate": 4.965304687065548e-05, + "loss": 0.2111, + "step": 8285 + }, + { + "epoch": 0.14779010451967325, + "grad_norm": 0.2126653790473938, + "learning_rate": 4.965278840676093e-05, + "loss": 0.2064, + "step": 8286 + }, + { + "epoch": 0.14780794064138694, + "grad_norm": 0.28971126675605774, + "learning_rate": 4.9652529847303644e-05, + "loss": 0.1976, + "step": 8287 + }, + { + "epoch": 0.14782577676310063, + "grad_norm": 0.3742590844631195, + "learning_rate": 4.965227119228465e-05, + "loss": 0.2615, + "step": 8288 + }, + { + "epoch": 0.14784361288481432, + "grad_norm": 0.24936038255691528, + "learning_rate": 4.965201244170493e-05, + "loss": 0.2163, + "step": 8289 + }, + { + "epoch": 0.14786144900652803, + "grad_norm": 0.2307882308959961, + "learning_rate": 4.96517535955655e-05, + "loss": 0.1838, + "step": 8290 + }, + { + "epoch": 0.14787928512824172, + "grad_norm": 0.2994025945663452, + "learning_rate": 4.9651494653867356e-05, + "loss": 0.2127, + "step": 8291 + }, + { + "epoch": 0.1478971212499554, + "grad_norm": 0.21554845571517944, + "learning_rate": 4.9651235616611504e-05, + "loss": 0.1681, + "step": 8292 + }, + { + "epoch": 0.1479149573716691, + "grad_norm": 0.25892943143844604, + "learning_rate": 4.9650976483798945e-05, + "loss": 0.2094, + "step": 8293 + }, + { + "epoch": 0.14793279349338279, + "grad_norm": 0.2690183222293854, + "learning_rate": 4.965071725543069e-05, + "loss": 0.23, + "step": 8294 + }, + { + "epoch": 0.1479506296150965, + "grad_norm": 0.24941860139369965, + "learning_rate": 4.9650457931507744e-05, + "loss": 0.2253, + "step": 8295 + }, + { + "epoch": 0.1479684657368102, + "grad_norm": 0.3571094274520874, + "learning_rate": 4.965019851203111e-05, + "loss": 0.2332, + "step": 8296 + }, + { + "epoch": 0.14798630185852388, + "grad_norm": 0.29016420245170593, + "learning_rate": 4.964993899700179e-05, + "loss": 0.2428, + "step": 8297 + }, + { + "epoch": 0.14800413798023757, + "grad_norm": 0.3337816298007965, + "learning_rate": 4.964967938642079e-05, + "loss": 0.2693, + "step": 8298 + }, + { + "epoch": 0.14802197410195128, + "grad_norm": 0.2792920470237732, + "learning_rate": 4.9649419680289116e-05, + "loss": 0.2064, + "step": 8299 + }, + { + "epoch": 0.14803981022366497, + "grad_norm": 0.27864938974380493, + "learning_rate": 4.964915987860779e-05, + "loss": 0.2242, + "step": 8300 + }, + { + "epoch": 0.14805764634537866, + "grad_norm": 0.25790777802467346, + "learning_rate": 4.9648899981377796e-05, + "loss": 0.2249, + "step": 8301 + }, + { + "epoch": 0.14807548246709235, + "grad_norm": 0.35313209891319275, + "learning_rate": 4.964863998860016e-05, + "loss": 0.2099, + "step": 8302 + }, + { + "epoch": 0.14809331858880606, + "grad_norm": 0.2688464820384979, + "learning_rate": 4.964837990027587e-05, + "loss": 0.2186, + "step": 8303 + }, + { + "epoch": 0.14811115471051975, + "grad_norm": 0.3255918025970459, + "learning_rate": 4.9648119716405954e-05, + "loss": 0.2578, + "step": 8304 + }, + { + "epoch": 0.14812899083223344, + "grad_norm": 0.27829429507255554, + "learning_rate": 4.964785943699141e-05, + "loss": 0.2015, + "step": 8305 + }, + { + "epoch": 0.14814682695394712, + "grad_norm": 0.29929518699645996, + "learning_rate": 4.964759906203326e-05, + "loss": 0.2974, + "step": 8306 + }, + { + "epoch": 0.14816466307566084, + "grad_norm": 0.3907299041748047, + "learning_rate": 4.964733859153249e-05, + "loss": 0.2528, + "step": 8307 + }, + { + "epoch": 0.14818249919737453, + "grad_norm": 0.3519734740257263, + "learning_rate": 4.964707802549013e-05, + "loss": 0.1722, + "step": 8308 + }, + { + "epoch": 0.14820033531908822, + "grad_norm": 0.2800590395927429, + "learning_rate": 4.964681736390718e-05, + "loss": 0.227, + "step": 8309 + }, + { + "epoch": 0.1482181714408019, + "grad_norm": 0.2709098160266876, + "learning_rate": 4.9646556606784655e-05, + "loss": 0.2277, + "step": 8310 + }, + { + "epoch": 0.14823600756251562, + "grad_norm": 0.25083452463150024, + "learning_rate": 4.964629575412356e-05, + "loss": 0.2256, + "step": 8311 + }, + { + "epoch": 0.1482538436842293, + "grad_norm": 0.23127196729183197, + "learning_rate": 4.964603480592491e-05, + "loss": 0.2213, + "step": 8312 + }, + { + "epoch": 0.148271679805943, + "grad_norm": 0.2816026508808136, + "learning_rate": 4.964577376218972e-05, + "loss": 0.2505, + "step": 8313 + }, + { + "epoch": 0.14828951592765668, + "grad_norm": 0.2292989045381546, + "learning_rate": 4.9645512622919e-05, + "loss": 0.1753, + "step": 8314 + }, + { + "epoch": 0.14830735204937037, + "grad_norm": 0.3542279899120331, + "learning_rate": 4.964525138811376e-05, + "loss": 0.2513, + "step": 8315 + }, + { + "epoch": 0.1483251881710841, + "grad_norm": 0.2088947743177414, + "learning_rate": 4.9644990057775015e-05, + "loss": 0.2277, + "step": 8316 + }, + { + "epoch": 0.14834302429279778, + "grad_norm": 0.30650970339775085, + "learning_rate": 4.964472863190377e-05, + "loss": 0.2214, + "step": 8317 + }, + { + "epoch": 0.14836086041451146, + "grad_norm": 0.27056947350502014, + "learning_rate": 4.964446711050105e-05, + "loss": 0.2224, + "step": 8318 + }, + { + "epoch": 0.14837869653622515, + "grad_norm": 0.3205641806125641, + "learning_rate": 4.964420549356786e-05, + "loss": 0.1694, + "step": 8319 + }, + { + "epoch": 0.14839653265793887, + "grad_norm": 0.29826006293296814, + "learning_rate": 4.964394378110522e-05, + "loss": 0.2263, + "step": 8320 + }, + { + "epoch": 0.14841436877965256, + "grad_norm": 0.36686989665031433, + "learning_rate": 4.964368197311414e-05, + "loss": 0.2471, + "step": 8321 + }, + { + "epoch": 0.14843220490136624, + "grad_norm": 0.24641509354114532, + "learning_rate": 4.964342006959565e-05, + "loss": 0.1845, + "step": 8322 + }, + { + "epoch": 0.14845004102307993, + "grad_norm": 0.31873640418052673, + "learning_rate": 4.9643158070550734e-05, + "loss": 0.277, + "step": 8323 + }, + { + "epoch": 0.14846787714479365, + "grad_norm": 0.27908164262771606, + "learning_rate": 4.964289597598044e-05, + "loss": 0.2519, + "step": 8324 + }, + { + "epoch": 0.14848571326650734, + "grad_norm": 0.3104163706302643, + "learning_rate": 4.964263378588576e-05, + "loss": 0.2318, + "step": 8325 + }, + { + "epoch": 0.14850354938822102, + "grad_norm": 0.2784401476383209, + "learning_rate": 4.964237150026772e-05, + "loss": 0.2207, + "step": 8326 + }, + { + "epoch": 0.1485213855099347, + "grad_norm": 0.4940575361251831, + "learning_rate": 4.9642109119127345e-05, + "loss": 0.2583, + "step": 8327 + }, + { + "epoch": 0.14853922163164843, + "grad_norm": 0.20831309258937836, + "learning_rate": 4.9641846642465636e-05, + "loss": 0.1777, + "step": 8328 + }, + { + "epoch": 0.14855705775336212, + "grad_norm": 0.20303252339363098, + "learning_rate": 4.964158407028362e-05, + "loss": 0.2176, + "step": 8329 + }, + { + "epoch": 0.1485748938750758, + "grad_norm": 0.33788809180259705, + "learning_rate": 4.964132140258231e-05, + "loss": 0.2748, + "step": 8330 + }, + { + "epoch": 0.1485927299967895, + "grad_norm": 0.26124975085258484, + "learning_rate": 4.9641058639362735e-05, + "loss": 0.2575, + "step": 8331 + }, + { + "epoch": 0.14861056611850318, + "grad_norm": 0.2390952706336975, + "learning_rate": 4.9640795780625903e-05, + "loss": 0.215, + "step": 8332 + }, + { + "epoch": 0.1486284022402169, + "grad_norm": 0.28409543633461, + "learning_rate": 4.964053282637284e-05, + "loss": 0.2635, + "step": 8333 + }, + { + "epoch": 0.14864623836193058, + "grad_norm": 0.34645968675613403, + "learning_rate": 4.964026977660455e-05, + "loss": 0.3206, + "step": 8334 + }, + { + "epoch": 0.14866407448364427, + "grad_norm": 0.2699476480484009, + "learning_rate": 4.964000663132208e-05, + "loss": 0.2449, + "step": 8335 + }, + { + "epoch": 0.14868191060535796, + "grad_norm": 0.2880427837371826, + "learning_rate": 4.963974339052641e-05, + "loss": 0.2121, + "step": 8336 + }, + { + "epoch": 0.14869974672707167, + "grad_norm": 0.31315329670906067, + "learning_rate": 4.963948005421861e-05, + "loss": 0.2072, + "step": 8337 + }, + { + "epoch": 0.14871758284878536, + "grad_norm": 0.3538931906223297, + "learning_rate": 4.963921662239966e-05, + "loss": 0.2264, + "step": 8338 + }, + { + "epoch": 0.14873541897049905, + "grad_norm": 0.2767057418823242, + "learning_rate": 4.963895309507061e-05, + "loss": 0.2501, + "step": 8339 + }, + { + "epoch": 0.14875325509221274, + "grad_norm": 0.2637615203857422, + "learning_rate": 4.9638689472232455e-05, + "loss": 0.2457, + "step": 8340 + }, + { + "epoch": 0.14877109121392645, + "grad_norm": 0.2188502848148346, + "learning_rate": 4.963842575388623e-05, + "loss": 0.1833, + "step": 8341 + }, + { + "epoch": 0.14878892733564014, + "grad_norm": 0.2582019567489624, + "learning_rate": 4.963816194003296e-05, + "loss": 0.2158, + "step": 8342 + }, + { + "epoch": 0.14880676345735383, + "grad_norm": 0.31333044171333313, + "learning_rate": 4.963789803067366e-05, + "loss": 0.2446, + "step": 8343 + }, + { + "epoch": 0.14882459957906752, + "grad_norm": 0.25765636563301086, + "learning_rate": 4.9637634025809365e-05, + "loss": 0.2173, + "step": 8344 + }, + { + "epoch": 0.14884243570078123, + "grad_norm": 0.2975947856903076, + "learning_rate": 4.963736992544109e-05, + "loss": 0.1715, + "step": 8345 + }, + { + "epoch": 0.14886027182249492, + "grad_norm": 0.267433762550354, + "learning_rate": 4.9637105729569854e-05, + "loss": 0.2244, + "step": 8346 + }, + { + "epoch": 0.1488781079442086, + "grad_norm": 0.23204271495342255, + "learning_rate": 4.963684143819669e-05, + "loss": 0.2341, + "step": 8347 + }, + { + "epoch": 0.1488959440659223, + "grad_norm": 0.35042837262153625, + "learning_rate": 4.9636577051322616e-05, + "loss": 0.2745, + "step": 8348 + }, + { + "epoch": 0.148913780187636, + "grad_norm": 0.21348612010478973, + "learning_rate": 4.9636312568948674e-05, + "loss": 0.1458, + "step": 8349 + }, + { + "epoch": 0.1489316163093497, + "grad_norm": 0.30840039253234863, + "learning_rate": 4.963604799107586e-05, + "loss": 0.2059, + "step": 8350 + }, + { + "epoch": 0.1489494524310634, + "grad_norm": 0.25926488637924194, + "learning_rate": 4.963578331770522e-05, + "loss": 0.1971, + "step": 8351 + }, + { + "epoch": 0.14896728855277708, + "grad_norm": 0.29957008361816406, + "learning_rate": 4.9635518548837775e-05, + "loss": 0.2229, + "step": 8352 + }, + { + "epoch": 0.14898512467449077, + "grad_norm": 0.29102393984794617, + "learning_rate": 4.963525368447455e-05, + "loss": 0.2005, + "step": 8353 + }, + { + "epoch": 0.14900296079620448, + "grad_norm": 0.29465726017951965, + "learning_rate": 4.963498872461658e-05, + "loss": 0.2216, + "step": 8354 + }, + { + "epoch": 0.14902079691791817, + "grad_norm": 0.3072602152824402, + "learning_rate": 4.963472366926488e-05, + "loss": 0.2175, + "step": 8355 + }, + { + "epoch": 0.14903863303963186, + "grad_norm": 0.2767748534679413, + "learning_rate": 4.963445851842048e-05, + "loss": 0.2509, + "step": 8356 + }, + { + "epoch": 0.14905646916134555, + "grad_norm": 0.2600916028022766, + "learning_rate": 4.963419327208441e-05, + "loss": 0.1951, + "step": 8357 + }, + { + "epoch": 0.14907430528305926, + "grad_norm": 0.3140299618244171, + "learning_rate": 4.963392793025771e-05, + "loss": 0.2379, + "step": 8358 + }, + { + "epoch": 0.14909214140477295, + "grad_norm": 0.21253187954425812, + "learning_rate": 4.963366249294138e-05, + "loss": 0.2285, + "step": 8359 + }, + { + "epoch": 0.14910997752648664, + "grad_norm": 0.2381104677915573, + "learning_rate": 4.963339696013647e-05, + "loss": 0.2235, + "step": 8360 + }, + { + "epoch": 0.14912781364820032, + "grad_norm": 0.26216456294059753, + "learning_rate": 4.963313133184402e-05, + "loss": 0.1895, + "step": 8361 + }, + { + "epoch": 0.14914564976991404, + "grad_norm": 0.2712250351905823, + "learning_rate": 4.9632865608065025e-05, + "loss": 0.2727, + "step": 8362 + }, + { + "epoch": 0.14916348589162773, + "grad_norm": 0.22147129476070404, + "learning_rate": 4.963259978880055e-05, + "loss": 0.1657, + "step": 8363 + }, + { + "epoch": 0.14918132201334142, + "grad_norm": 0.39221853017807007, + "learning_rate": 4.9632333874051604e-05, + "loss": 0.2286, + "step": 8364 + }, + { + "epoch": 0.1491991581350551, + "grad_norm": 0.2558264434337616, + "learning_rate": 4.963206786381922e-05, + "loss": 0.2123, + "step": 8365 + }, + { + "epoch": 0.14921699425676882, + "grad_norm": 0.3101685643196106, + "learning_rate": 4.963180175810444e-05, + "loss": 0.1932, + "step": 8366 + }, + { + "epoch": 0.1492348303784825, + "grad_norm": 0.3007619380950928, + "learning_rate": 4.963153555690829e-05, + "loss": 0.2591, + "step": 8367 + }, + { + "epoch": 0.1492526665001962, + "grad_norm": 0.2865447998046875, + "learning_rate": 4.963126926023179e-05, + "loss": 0.2352, + "step": 8368 + }, + { + "epoch": 0.14927050262190988, + "grad_norm": 0.24834208190441132, + "learning_rate": 4.9631002868075995e-05, + "loss": 0.2512, + "step": 8369 + }, + { + "epoch": 0.1492883387436236, + "grad_norm": 0.27149853110313416, + "learning_rate": 4.9630736380441925e-05, + "loss": 0.237, + "step": 8370 + }, + { + "epoch": 0.1493061748653373, + "grad_norm": 0.26262322068214417, + "learning_rate": 4.9630469797330606e-05, + "loss": 0.2102, + "step": 8371 + }, + { + "epoch": 0.14932401098705098, + "grad_norm": 0.30290353298187256, + "learning_rate": 4.963020311874308e-05, + "loss": 0.2413, + "step": 8372 + }, + { + "epoch": 0.14934184710876466, + "grad_norm": 0.3651910722255707, + "learning_rate": 4.962993634468038e-05, + "loss": 0.2062, + "step": 8373 + }, + { + "epoch": 0.14935968323047835, + "grad_norm": 0.28044047951698303, + "learning_rate": 4.962966947514355e-05, + "loss": 0.2423, + "step": 8374 + }, + { + "epoch": 0.14937751935219207, + "grad_norm": 0.2301790416240692, + "learning_rate": 4.9629402510133604e-05, + "loss": 0.2353, + "step": 8375 + }, + { + "epoch": 0.14939535547390576, + "grad_norm": 0.4232882857322693, + "learning_rate": 4.9629135449651585e-05, + "loss": 0.2332, + "step": 8376 + }, + { + "epoch": 0.14941319159561944, + "grad_norm": 0.23570029437541962, + "learning_rate": 4.962886829369854e-05, + "loss": 0.1951, + "step": 8377 + }, + { + "epoch": 0.14943102771733313, + "grad_norm": 0.3388097882270813, + "learning_rate": 4.962860104227549e-05, + "loss": 0.2622, + "step": 8378 + }, + { + "epoch": 0.14944886383904685, + "grad_norm": 0.29123201966285706, + "learning_rate": 4.962833369538348e-05, + "loss": 0.225, + "step": 8379 + }, + { + "epoch": 0.14946669996076054, + "grad_norm": 0.2710510790348053, + "learning_rate": 4.962806625302353e-05, + "loss": 0.2378, + "step": 8380 + }, + { + "epoch": 0.14948453608247422, + "grad_norm": 0.2576589584350586, + "learning_rate": 4.96277987151967e-05, + "loss": 0.2127, + "step": 8381 + }, + { + "epoch": 0.1495023722041879, + "grad_norm": 0.2736780643463135, + "learning_rate": 4.962753108190401e-05, + "loss": 0.2326, + "step": 8382 + }, + { + "epoch": 0.14952020832590163, + "grad_norm": 0.2831859588623047, + "learning_rate": 4.962726335314651e-05, + "loss": 0.2421, + "step": 8383 + }, + { + "epoch": 0.14953804444761531, + "grad_norm": 0.27426156401634216, + "learning_rate": 4.962699552892522e-05, + "loss": 0.2091, + "step": 8384 + }, + { + "epoch": 0.149555880569329, + "grad_norm": 0.24105580151081085, + "learning_rate": 4.962672760924121e-05, + "loss": 0.2285, + "step": 8385 + }, + { + "epoch": 0.1495737166910427, + "grad_norm": 0.33097365498542786, + "learning_rate": 4.962645959409548e-05, + "loss": 0.1814, + "step": 8386 + }, + { + "epoch": 0.1495915528127564, + "grad_norm": 0.2840391993522644, + "learning_rate": 4.9626191483489094e-05, + "loss": 0.1809, + "step": 8387 + }, + { + "epoch": 0.1496093889344701, + "grad_norm": 0.25599801540374756, + "learning_rate": 4.962592327742308e-05, + "loss": 0.2323, + "step": 8388 + }, + { + "epoch": 0.14962722505618378, + "grad_norm": 0.29342347383499146, + "learning_rate": 4.962565497589848e-05, + "loss": 0.2408, + "step": 8389 + }, + { + "epoch": 0.14964506117789747, + "grad_norm": 0.25567734241485596, + "learning_rate": 4.9625386578916346e-05, + "loss": 0.263, + "step": 8390 + }, + { + "epoch": 0.1496628972996112, + "grad_norm": 0.23582299053668976, + "learning_rate": 4.96251180864777e-05, + "loss": 0.1918, + "step": 8391 + }, + { + "epoch": 0.14968073342132487, + "grad_norm": 0.31972259283065796, + "learning_rate": 4.962484949858359e-05, + "loss": 0.1738, + "step": 8392 + }, + { + "epoch": 0.14969856954303856, + "grad_norm": 0.21748530864715576, + "learning_rate": 4.9624580815235064e-05, + "loss": 0.1819, + "step": 8393 + }, + { + "epoch": 0.14971640566475225, + "grad_norm": 0.26137396693229675, + "learning_rate": 4.9624312036433155e-05, + "loss": 0.1671, + "step": 8394 + }, + { + "epoch": 0.14973424178646594, + "grad_norm": 0.3161908686161041, + "learning_rate": 4.9624043162178914e-05, + "loss": 0.1966, + "step": 8395 + }, + { + "epoch": 0.14975207790817965, + "grad_norm": 0.3270423710346222, + "learning_rate": 4.962377419247337e-05, + "loss": 0.213, + "step": 8396 + }, + { + "epoch": 0.14976991402989334, + "grad_norm": 0.25357145071029663, + "learning_rate": 4.9623505127317574e-05, + "loss": 0.2075, + "step": 8397 + }, + { + "epoch": 0.14978775015160703, + "grad_norm": 0.30495455861091614, + "learning_rate": 4.962323596671257e-05, + "loss": 0.2164, + "step": 8398 + }, + { + "epoch": 0.14980558627332072, + "grad_norm": 0.32474029064178467, + "learning_rate": 4.962296671065939e-05, + "loss": 0.2119, + "step": 8399 + }, + { + "epoch": 0.14982342239503443, + "grad_norm": 0.31686919927597046, + "learning_rate": 4.962269735915909e-05, + "loss": 0.227, + "step": 8400 + }, + { + "epoch": 0.14984125851674812, + "grad_norm": 0.27647504210472107, + "learning_rate": 4.962242791221271e-05, + "loss": 0.2417, + "step": 8401 + }, + { + "epoch": 0.1498590946384618, + "grad_norm": 0.2595807611942291, + "learning_rate": 4.9622158369821306e-05, + "loss": 0.1925, + "step": 8402 + }, + { + "epoch": 0.1498769307601755, + "grad_norm": 0.21945913136005402, + "learning_rate": 4.96218887319859e-05, + "loss": 0.1882, + "step": 8403 + }, + { + "epoch": 0.1498947668818892, + "grad_norm": 0.45307475328445435, + "learning_rate": 4.9621618998707554e-05, + "loss": 0.21, + "step": 8404 + }, + { + "epoch": 0.1499126030036029, + "grad_norm": 0.280192106962204, + "learning_rate": 4.962134916998731e-05, + "loss": 0.2048, + "step": 8405 + }, + { + "epoch": 0.1499304391253166, + "grad_norm": 0.306252121925354, + "learning_rate": 4.962107924582621e-05, + "loss": 0.2351, + "step": 8406 + }, + { + "epoch": 0.14994827524703028, + "grad_norm": 0.31264379620552063, + "learning_rate": 4.96208092262253e-05, + "loss": 0.2696, + "step": 8407 + }, + { + "epoch": 0.149966111368744, + "grad_norm": 0.24853278696537018, + "learning_rate": 4.962053911118563e-05, + "loss": 0.2416, + "step": 8408 + }, + { + "epoch": 0.14998394749045768, + "grad_norm": 0.24951770901679993, + "learning_rate": 4.962026890070825e-05, + "loss": 0.1975, + "step": 8409 + }, + { + "epoch": 0.15000178361217137, + "grad_norm": 0.3052162230014801, + "learning_rate": 4.96199985947942e-05, + "loss": 0.1841, + "step": 8410 + }, + { + "epoch": 0.15001961973388506, + "grad_norm": 0.26822376251220703, + "learning_rate": 4.9619728193444536e-05, + "loss": 0.2704, + "step": 8411 + }, + { + "epoch": 0.15003745585559874, + "grad_norm": 0.3441866636276245, + "learning_rate": 4.961945769666031e-05, + "loss": 0.2713, + "step": 8412 + }, + { + "epoch": 0.15005529197731246, + "grad_norm": 0.2854105830192566, + "learning_rate": 4.961918710444255e-05, + "loss": 0.3004, + "step": 8413 + }, + { + "epoch": 0.15007312809902615, + "grad_norm": 0.26902419328689575, + "learning_rate": 4.9618916416792324e-05, + "loss": 0.2319, + "step": 8414 + }, + { + "epoch": 0.15009096422073984, + "grad_norm": 0.2367042899131775, + "learning_rate": 4.961864563371067e-05, + "loss": 0.2256, + "step": 8415 + }, + { + "epoch": 0.15010880034245352, + "grad_norm": 0.3011890649795532, + "learning_rate": 4.9618374755198646e-05, + "loss": 0.2273, + "step": 8416 + }, + { + "epoch": 0.15012663646416724, + "grad_norm": 0.2846713066101074, + "learning_rate": 4.9618103781257295e-05, + "loss": 0.232, + "step": 8417 + }, + { + "epoch": 0.15014447258588093, + "grad_norm": 0.30780264735221863, + "learning_rate": 4.961783271188768e-05, + "loss": 0.2468, + "step": 8418 + }, + { + "epoch": 0.15016230870759462, + "grad_norm": 0.24429358541965485, + "learning_rate": 4.961756154709083e-05, + "loss": 0.2227, + "step": 8419 + }, + { + "epoch": 0.1501801448293083, + "grad_norm": 0.36014044284820557, + "learning_rate": 4.961729028686782e-05, + "loss": 0.2634, + "step": 8420 + }, + { + "epoch": 0.15019798095102202, + "grad_norm": 0.3677643835544586, + "learning_rate": 4.9617018931219686e-05, + "loss": 0.262, + "step": 8421 + }, + { + "epoch": 0.1502158170727357, + "grad_norm": 0.29291391372680664, + "learning_rate": 4.961674748014749e-05, + "loss": 0.2256, + "step": 8422 + }, + { + "epoch": 0.1502336531944494, + "grad_norm": 0.2308768481016159, + "learning_rate": 4.9616475933652264e-05, + "loss": 0.2112, + "step": 8423 + }, + { + "epoch": 0.15025148931616308, + "grad_norm": 0.21656841039657593, + "learning_rate": 4.9616204291735086e-05, + "loss": 0.2052, + "step": 8424 + }, + { + "epoch": 0.1502693254378768, + "grad_norm": 0.3243447542190552, + "learning_rate": 4.961593255439699e-05, + "loss": 0.2141, + "step": 8425 + }, + { + "epoch": 0.1502871615595905, + "grad_norm": 0.34292158484458923, + "learning_rate": 4.961566072163905e-05, + "loss": 0.2355, + "step": 8426 + }, + { + "epoch": 0.15030499768130418, + "grad_norm": 0.21832285821437836, + "learning_rate": 4.9615388793462294e-05, + "loss": 0.2206, + "step": 8427 + }, + { + "epoch": 0.15032283380301786, + "grad_norm": 0.23595459759235382, + "learning_rate": 4.96151167698678e-05, + "loss": 0.2027, + "step": 8428 + }, + { + "epoch": 0.15034066992473158, + "grad_norm": 0.3123375475406647, + "learning_rate": 4.9614844650856605e-05, + "loss": 0.2355, + "step": 8429 + }, + { + "epoch": 0.15035850604644527, + "grad_norm": 0.23871618509292603, + "learning_rate": 4.9614572436429777e-05, + "loss": 0.1664, + "step": 8430 + }, + { + "epoch": 0.15037634216815896, + "grad_norm": 0.25301218032836914, + "learning_rate": 4.961430012658835e-05, + "loss": 0.1872, + "step": 8431 + }, + { + "epoch": 0.15039417828987264, + "grad_norm": 0.3113987445831299, + "learning_rate": 4.961402772133341e-05, + "loss": 0.2002, + "step": 8432 + }, + { + "epoch": 0.15041201441158633, + "grad_norm": 0.26478689908981323, + "learning_rate": 4.9613755220665994e-05, + "loss": 0.2332, + "step": 8433 + }, + { + "epoch": 0.15042985053330005, + "grad_norm": 0.4453291893005371, + "learning_rate": 4.961348262458715e-05, + "loss": 0.2066, + "step": 8434 + }, + { + "epoch": 0.15044768665501373, + "grad_norm": 0.33681079745292664, + "learning_rate": 4.961320993309796e-05, + "loss": 0.2064, + "step": 8435 + }, + { + "epoch": 0.15046552277672742, + "grad_norm": 0.28030309081077576, + "learning_rate": 4.961293714619946e-05, + "loss": 0.2421, + "step": 8436 + }, + { + "epoch": 0.1504833588984411, + "grad_norm": 0.2608387768268585, + "learning_rate": 4.961266426389272e-05, + "loss": 0.2318, + "step": 8437 + }, + { + "epoch": 0.15050119502015483, + "grad_norm": 0.29991066455841064, + "learning_rate": 4.9612391286178784e-05, + "loss": 0.2629, + "step": 8438 + }, + { + "epoch": 0.15051903114186851, + "grad_norm": 0.34991273283958435, + "learning_rate": 4.9612118213058725e-05, + "loss": 0.267, + "step": 8439 + }, + { + "epoch": 0.1505368672635822, + "grad_norm": 0.2911750376224518, + "learning_rate": 4.961184504453359e-05, + "loss": 0.2172, + "step": 8440 + }, + { + "epoch": 0.1505547033852959, + "grad_norm": 0.3051919937133789, + "learning_rate": 4.961157178060445e-05, + "loss": 0.2735, + "step": 8441 + }, + { + "epoch": 0.1505725395070096, + "grad_norm": 0.2562267482280731, + "learning_rate": 4.9611298421272356e-05, + "loss": 0.1978, + "step": 8442 + }, + { + "epoch": 0.1505903756287233, + "grad_norm": 0.45183566212654114, + "learning_rate": 4.961102496653837e-05, + "loss": 0.1959, + "step": 8443 + }, + { + "epoch": 0.15060821175043698, + "grad_norm": 0.2823636531829834, + "learning_rate": 4.961075141640355e-05, + "loss": 0.2414, + "step": 8444 + }, + { + "epoch": 0.15062604787215067, + "grad_norm": 0.27796629071235657, + "learning_rate": 4.961047777086894e-05, + "loss": 0.2348, + "step": 8445 + }, + { + "epoch": 0.15064388399386439, + "grad_norm": 0.40828168392181396, + "learning_rate": 4.9610204029935634e-05, + "loss": 0.2373, + "step": 8446 + }, + { + "epoch": 0.15066172011557807, + "grad_norm": 0.25054875016212463, + "learning_rate": 4.9609930193604684e-05, + "loss": 0.2241, + "step": 8447 + }, + { + "epoch": 0.15067955623729176, + "grad_norm": 0.23433774709701538, + "learning_rate": 4.960965626187713e-05, + "loss": 0.1971, + "step": 8448 + }, + { + "epoch": 0.15069739235900545, + "grad_norm": 0.26952916383743286, + "learning_rate": 4.9609382234754054e-05, + "loss": 0.2388, + "step": 8449 + }, + { + "epoch": 0.15071522848071917, + "grad_norm": 0.3398025631904602, + "learning_rate": 4.9609108112236515e-05, + "loss": 0.2012, + "step": 8450 + }, + { + "epoch": 0.15073306460243285, + "grad_norm": 0.28553298115730286, + "learning_rate": 4.960883389432557e-05, + "loss": 0.2339, + "step": 8451 + }, + { + "epoch": 0.15075090072414654, + "grad_norm": 0.32566359639167786, + "learning_rate": 4.960855958102228e-05, + "loss": 0.2237, + "step": 8452 + }, + { + "epoch": 0.15076873684586023, + "grad_norm": 0.300100713968277, + "learning_rate": 4.960828517232773e-05, + "loss": 0.236, + "step": 8453 + }, + { + "epoch": 0.15078657296757392, + "grad_norm": 0.31294363737106323, + "learning_rate": 4.960801066824295e-05, + "loss": 0.2633, + "step": 8454 + }, + { + "epoch": 0.15080440908928763, + "grad_norm": 0.2673596739768982, + "learning_rate": 4.9607736068769034e-05, + "loss": 0.213, + "step": 8455 + }, + { + "epoch": 0.15082224521100132, + "grad_norm": 0.2582441568374634, + "learning_rate": 4.960746137390703e-05, + "loss": 0.2209, + "step": 8456 + }, + { + "epoch": 0.150840081332715, + "grad_norm": 0.25271373987197876, + "learning_rate": 4.9607186583658e-05, + "loss": 0.2171, + "step": 8457 + }, + { + "epoch": 0.1508579174544287, + "grad_norm": 0.22569482028484344, + "learning_rate": 4.9606911698023024e-05, + "loss": 0.2073, + "step": 8458 + }, + { + "epoch": 0.1508757535761424, + "grad_norm": 0.26054930686950684, + "learning_rate": 4.960663671700315e-05, + "loss": 0.1471, + "step": 8459 + }, + { + "epoch": 0.1508935896978561, + "grad_norm": 0.23638111352920532, + "learning_rate": 4.9606361640599464e-05, + "loss": 0.2183, + "step": 8460 + }, + { + "epoch": 0.1509114258195698, + "grad_norm": 0.4223468005657196, + "learning_rate": 4.960608646881302e-05, + "loss": 0.2378, + "step": 8461 + }, + { + "epoch": 0.15092926194128348, + "grad_norm": 0.2853187322616577, + "learning_rate": 4.960581120164488e-05, + "loss": 0.2367, + "step": 8462 + }, + { + "epoch": 0.1509470980629972, + "grad_norm": 0.2830091416835785, + "learning_rate": 4.960553583909612e-05, + "loss": 0.2337, + "step": 8463 + }, + { + "epoch": 0.15096493418471088, + "grad_norm": 0.288740873336792, + "learning_rate": 4.960526038116781e-05, + "loss": 0.1913, + "step": 8464 + }, + { + "epoch": 0.15098277030642457, + "grad_norm": 0.35857534408569336, + "learning_rate": 4.9604984827861e-05, + "loss": 0.2053, + "step": 8465 + }, + { + "epoch": 0.15100060642813826, + "grad_norm": 0.3122760057449341, + "learning_rate": 4.9604709179176777e-05, + "loss": 0.2189, + "step": 8466 + }, + { + "epoch": 0.15101844254985197, + "grad_norm": 0.2901649475097656, + "learning_rate": 4.96044334351162e-05, + "loss": 0.2098, + "step": 8467 + }, + { + "epoch": 0.15103627867156566, + "grad_norm": 0.21733441948890686, + "learning_rate": 4.9604157595680356e-05, + "loss": 0.2342, + "step": 8468 + }, + { + "epoch": 0.15105411479327935, + "grad_norm": 0.2657352387905121, + "learning_rate": 4.960388166087028e-05, + "loss": 0.1948, + "step": 8469 + }, + { + "epoch": 0.15107195091499304, + "grad_norm": 0.243385449051857, + "learning_rate": 4.960360563068707e-05, + "loss": 0.2416, + "step": 8470 + }, + { + "epoch": 0.15108978703670675, + "grad_norm": 0.39115190505981445, + "learning_rate": 4.960332950513179e-05, + "loss": 0.1748, + "step": 8471 + }, + { + "epoch": 0.15110762315842044, + "grad_norm": 0.2807091772556305, + "learning_rate": 4.9603053284205504e-05, + "loss": 0.2212, + "step": 8472 + }, + { + "epoch": 0.15112545928013413, + "grad_norm": 0.22286827862262726, + "learning_rate": 4.960277696790928e-05, + "loss": 0.2162, + "step": 8473 + }, + { + "epoch": 0.15114329540184782, + "grad_norm": 0.3109433054924011, + "learning_rate": 4.9602500556244204e-05, + "loss": 0.249, + "step": 8474 + }, + { + "epoch": 0.1511611315235615, + "grad_norm": 0.24084334075450897, + "learning_rate": 4.960222404921133e-05, + "loss": 0.166, + "step": 8475 + }, + { + "epoch": 0.15117896764527522, + "grad_norm": 0.35053345561027527, + "learning_rate": 4.960194744681174e-05, + "loss": 0.2813, + "step": 8476 + }, + { + "epoch": 0.1511968037669889, + "grad_norm": 0.30203041434288025, + "learning_rate": 4.960167074904651e-05, + "loss": 0.241, + "step": 8477 + }, + { + "epoch": 0.1512146398887026, + "grad_norm": 0.2878948450088501, + "learning_rate": 4.96013939559167e-05, + "loss": 0.225, + "step": 8478 + }, + { + "epoch": 0.15123247601041628, + "grad_norm": 0.26000845432281494, + "learning_rate": 4.96011170674234e-05, + "loss": 0.2151, + "step": 8479 + }, + { + "epoch": 0.15125031213213, + "grad_norm": 0.2977293133735657, + "learning_rate": 4.9600840083567665e-05, + "loss": 0.2014, + "step": 8480 + }, + { + "epoch": 0.1512681482538437, + "grad_norm": 0.29933255910873413, + "learning_rate": 4.960056300435058e-05, + "loss": 0.1762, + "step": 8481 + }, + { + "epoch": 0.15128598437555738, + "grad_norm": 0.2834800183773041, + "learning_rate": 4.960028582977321e-05, + "loss": 0.2121, + "step": 8482 + }, + { + "epoch": 0.15130382049727106, + "grad_norm": 0.2429545670747757, + "learning_rate": 4.960000855983664e-05, + "loss": 0.1709, + "step": 8483 + }, + { + "epoch": 0.15132165661898478, + "grad_norm": 0.31816548109054565, + "learning_rate": 4.9599731194541943e-05, + "loss": 0.2205, + "step": 8484 + }, + { + "epoch": 0.15133949274069847, + "grad_norm": 0.4332368075847626, + "learning_rate": 4.9599453733890186e-05, + "loss": 0.2089, + "step": 8485 + }, + { + "epoch": 0.15135732886241215, + "grad_norm": 0.27083703875541687, + "learning_rate": 4.9599176177882454e-05, + "loss": 0.2379, + "step": 8486 + }, + { + "epoch": 0.15137516498412584, + "grad_norm": 0.35265836119651794, + "learning_rate": 4.959889852651982e-05, + "loss": 0.216, + "step": 8487 + }, + { + "epoch": 0.15139300110583956, + "grad_norm": 0.3029022216796875, + "learning_rate": 4.959862077980335e-05, + "loss": 0.2287, + "step": 8488 + }, + { + "epoch": 0.15141083722755325, + "grad_norm": 0.2494308054447174, + "learning_rate": 4.959834293773414e-05, + "loss": 0.187, + "step": 8489 + }, + { + "epoch": 0.15142867334926693, + "grad_norm": 0.639824628829956, + "learning_rate": 4.959806500031325e-05, + "loss": 0.2199, + "step": 8490 + }, + { + "epoch": 0.15144650947098062, + "grad_norm": 0.300088107585907, + "learning_rate": 4.959778696754177e-05, + "loss": 0.2404, + "step": 8491 + }, + { + "epoch": 0.15146434559269434, + "grad_norm": 0.2985757887363434, + "learning_rate": 4.959750883942077e-05, + "loss": 0.2502, + "step": 8492 + }, + { + "epoch": 0.15148218171440803, + "grad_norm": 0.46329471468925476, + "learning_rate": 4.959723061595133e-05, + "loss": 0.2459, + "step": 8493 + }, + { + "epoch": 0.15150001783612171, + "grad_norm": 0.3448359966278076, + "learning_rate": 4.9596952297134525e-05, + "loss": 0.2431, + "step": 8494 + }, + { + "epoch": 0.1515178539578354, + "grad_norm": 0.3522936701774597, + "learning_rate": 4.959667388297144e-05, + "loss": 0.2975, + "step": 8495 + }, + { + "epoch": 0.1515356900795491, + "grad_norm": 0.27203306555747986, + "learning_rate": 4.9596395373463153e-05, + "loss": 0.202, + "step": 8496 + }, + { + "epoch": 0.1515535262012628, + "grad_norm": 0.3497348129749298, + "learning_rate": 4.959611676861074e-05, + "loss": 0.2551, + "step": 8497 + }, + { + "epoch": 0.1515713623229765, + "grad_norm": 0.2883908450603485, + "learning_rate": 4.959583806841529e-05, + "loss": 0.1995, + "step": 8498 + }, + { + "epoch": 0.15158919844469018, + "grad_norm": 0.3639731705188751, + "learning_rate": 4.959555927287787e-05, + "loss": 0.2438, + "step": 8499 + }, + { + "epoch": 0.15160703456640387, + "grad_norm": 0.30356094241142273, + "learning_rate": 4.959528038199956e-05, + "loss": 0.2124, + "step": 8500 + }, + { + "epoch": 0.15162487068811759, + "grad_norm": 0.26965057849884033, + "learning_rate": 4.959500139578146e-05, + "loss": 0.1731, + "step": 8501 + }, + { + "epoch": 0.15164270680983127, + "grad_norm": 0.31213638186454773, + "learning_rate": 4.959472231422464e-05, + "loss": 0.2119, + "step": 8502 + }, + { + "epoch": 0.15166054293154496, + "grad_norm": 0.30166947841644287, + "learning_rate": 4.9594443137330175e-05, + "loss": 0.2041, + "step": 8503 + }, + { + "epoch": 0.15167837905325865, + "grad_norm": 0.2721315622329712, + "learning_rate": 4.9594163865099156e-05, + "loss": 0.2055, + "step": 8504 + }, + { + "epoch": 0.15169621517497237, + "grad_norm": 0.45368692278862, + "learning_rate": 4.959388449753266e-05, + "loss": 0.3265, + "step": 8505 + }, + { + "epoch": 0.15171405129668605, + "grad_norm": 0.2886175215244293, + "learning_rate": 4.959360503463178e-05, + "loss": 0.1992, + "step": 8506 + }, + { + "epoch": 0.15173188741839974, + "grad_norm": 0.31494712829589844, + "learning_rate": 4.959332547639759e-05, + "loss": 0.2155, + "step": 8507 + }, + { + "epoch": 0.15174972354011343, + "grad_norm": 0.33003467321395874, + "learning_rate": 4.959304582283118e-05, + "loss": 0.2715, + "step": 8508 + }, + { + "epoch": 0.15176755966182714, + "grad_norm": 0.29312780499458313, + "learning_rate": 4.959276607393362e-05, + "loss": 0.1529, + "step": 8509 + }, + { + "epoch": 0.15178539578354083, + "grad_norm": 0.2924138903617859, + "learning_rate": 4.9592486229706016e-05, + "loss": 0.1973, + "step": 8510 + }, + { + "epoch": 0.15180323190525452, + "grad_norm": 0.3075374662876129, + "learning_rate": 4.9592206290149434e-05, + "loss": 0.2165, + "step": 8511 + }, + { + "epoch": 0.1518210680269682, + "grad_norm": 0.27690669894218445, + "learning_rate": 4.9591926255264966e-05, + "loss": 0.2144, + "step": 8512 + }, + { + "epoch": 0.1518389041486819, + "grad_norm": 0.3529800772666931, + "learning_rate": 4.95916461250537e-05, + "loss": 0.2546, + "step": 8513 + }, + { + "epoch": 0.1518567402703956, + "grad_norm": 0.2664533257484436, + "learning_rate": 4.959136589951672e-05, + "loss": 0.2501, + "step": 8514 + }, + { + "epoch": 0.1518745763921093, + "grad_norm": 0.27176034450531006, + "learning_rate": 4.959108557865512e-05, + "loss": 0.2364, + "step": 8515 + }, + { + "epoch": 0.151892412513823, + "grad_norm": 0.2882852852344513, + "learning_rate": 4.9590805162469966e-05, + "loss": 0.218, + "step": 8516 + }, + { + "epoch": 0.15191024863553668, + "grad_norm": 0.27222636342048645, + "learning_rate": 4.959052465096237e-05, + "loss": 0.221, + "step": 8517 + }, + { + "epoch": 0.1519280847572504, + "grad_norm": 0.4094749689102173, + "learning_rate": 4.95902440441334e-05, + "loss": 0.2974, + "step": 8518 + }, + { + "epoch": 0.15194592087896408, + "grad_norm": 0.26940155029296875, + "learning_rate": 4.958996334198415e-05, + "loss": 0.2223, + "step": 8519 + }, + { + "epoch": 0.15196375700067777, + "grad_norm": 0.2948804497718811, + "learning_rate": 4.95896825445157e-05, + "loss": 0.2379, + "step": 8520 + }, + { + "epoch": 0.15198159312239146, + "grad_norm": 0.3114687204360962, + "learning_rate": 4.958940165172916e-05, + "loss": 0.2183, + "step": 8521 + }, + { + "epoch": 0.15199942924410517, + "grad_norm": 0.477892130613327, + "learning_rate": 4.958912066362561e-05, + "loss": 0.2554, + "step": 8522 + }, + { + "epoch": 0.15201726536581886, + "grad_norm": 0.23657453060150146, + "learning_rate": 4.9588839580206126e-05, + "loss": 0.2047, + "step": 8523 + }, + { + "epoch": 0.15203510148753255, + "grad_norm": 0.2781616449356079, + "learning_rate": 4.958855840147181e-05, + "loss": 0.2234, + "step": 8524 + }, + { + "epoch": 0.15205293760924624, + "grad_norm": 0.26945939660072327, + "learning_rate": 4.958827712742375e-05, + "loss": 0.2486, + "step": 8525 + }, + { + "epoch": 0.15207077373095995, + "grad_norm": 0.2888774275779724, + "learning_rate": 4.958799575806303e-05, + "loss": 0.2455, + "step": 8526 + }, + { + "epoch": 0.15208860985267364, + "grad_norm": 0.24447181820869446, + "learning_rate": 4.958771429339076e-05, + "loss": 0.2261, + "step": 8527 + }, + { + "epoch": 0.15210644597438733, + "grad_norm": 0.3367098569869995, + "learning_rate": 4.9587432733408004e-05, + "loss": 0.2687, + "step": 8528 + }, + { + "epoch": 0.15212428209610102, + "grad_norm": 0.29054486751556396, + "learning_rate": 4.958715107811587e-05, + "loss": 0.21, + "step": 8529 + }, + { + "epoch": 0.15214211821781473, + "grad_norm": 0.40333035588264465, + "learning_rate": 4.958686932751545e-05, + "loss": 0.1971, + "step": 8530 + }, + { + "epoch": 0.15215995433952842, + "grad_norm": 0.38966500759124756, + "learning_rate": 4.9586587481607824e-05, + "loss": 0.2542, + "step": 8531 + }, + { + "epoch": 0.1521777904612421, + "grad_norm": 0.2995615303516388, + "learning_rate": 4.95863055403941e-05, + "loss": 0.2149, + "step": 8532 + }, + { + "epoch": 0.1521956265829558, + "grad_norm": 0.23772743344306946, + "learning_rate": 4.958602350387537e-05, + "loss": 0.1994, + "step": 8533 + }, + { + "epoch": 0.15221346270466948, + "grad_norm": 0.2952633202075958, + "learning_rate": 4.958574137205271e-05, + "loss": 0.288, + "step": 8534 + }, + { + "epoch": 0.1522312988263832, + "grad_norm": 0.22035880386829376, + "learning_rate": 4.9585459144927225e-05, + "loss": 0.2107, + "step": 8535 + }, + { + "epoch": 0.1522491349480969, + "grad_norm": 0.44636270403862, + "learning_rate": 4.9585176822500015e-05, + "loss": 0.2507, + "step": 8536 + }, + { + "epoch": 0.15226697106981057, + "grad_norm": 0.27889424562454224, + "learning_rate": 4.958489440477217e-05, + "loss": 0.2453, + "step": 8537 + }, + { + "epoch": 0.15228480719152426, + "grad_norm": 0.40132936835289, + "learning_rate": 4.958461189174477e-05, + "loss": 0.2846, + "step": 8538 + }, + { + "epoch": 0.15230264331323798, + "grad_norm": 0.20691967010498047, + "learning_rate": 4.958432928341893e-05, + "loss": 0.1841, + "step": 8539 + }, + { + "epoch": 0.15232047943495167, + "grad_norm": 0.21933163702487946, + "learning_rate": 4.958404657979574e-05, + "loss": 0.1895, + "step": 8540 + }, + { + "epoch": 0.15233831555666535, + "grad_norm": 0.25566959381103516, + "learning_rate": 4.9583763780876296e-05, + "loss": 0.2703, + "step": 8541 + }, + { + "epoch": 0.15235615167837904, + "grad_norm": 0.25642478466033936, + "learning_rate": 4.958348088666169e-05, + "loss": 0.214, + "step": 8542 + }, + { + "epoch": 0.15237398780009276, + "grad_norm": 0.2996155321598053, + "learning_rate": 4.958319789715302e-05, + "loss": 0.2258, + "step": 8543 + }, + { + "epoch": 0.15239182392180645, + "grad_norm": 0.3043200373649597, + "learning_rate": 4.958291481235139e-05, + "loss": 0.2875, + "step": 8544 + }, + { + "epoch": 0.15240966004352013, + "grad_norm": 0.4334576427936554, + "learning_rate": 4.9582631632257884e-05, + "loss": 0.2147, + "step": 8545 + }, + { + "epoch": 0.15242749616523382, + "grad_norm": 0.4442044794559479, + "learning_rate": 4.9582348356873615e-05, + "loss": 0.2965, + "step": 8546 + }, + { + "epoch": 0.15244533228694754, + "grad_norm": 0.3632372319698334, + "learning_rate": 4.958206498619966e-05, + "loss": 0.2019, + "step": 8547 + }, + { + "epoch": 0.15246316840866123, + "grad_norm": 0.3032687306404114, + "learning_rate": 4.958178152023715e-05, + "loss": 0.2347, + "step": 8548 + }, + { + "epoch": 0.15248100453037491, + "grad_norm": 0.3779764175415039, + "learning_rate": 4.958149795898715e-05, + "loss": 0.3051, + "step": 8549 + }, + { + "epoch": 0.1524988406520886, + "grad_norm": 0.3018413782119751, + "learning_rate": 4.958121430245078e-05, + "loss": 0.2445, + "step": 8550 + }, + { + "epoch": 0.15251667677380232, + "grad_norm": 0.31004467606544495, + "learning_rate": 4.9580930550629136e-05, + "loss": 0.2038, + "step": 8551 + }, + { + "epoch": 0.152534512895516, + "grad_norm": 0.2716842591762543, + "learning_rate": 4.95806467035233e-05, + "loss": 0.179, + "step": 8552 + }, + { + "epoch": 0.1525523490172297, + "grad_norm": 0.4797303080558777, + "learning_rate": 4.95803627611344e-05, + "loss": 0.2629, + "step": 8553 + }, + { + "epoch": 0.15257018513894338, + "grad_norm": 0.27326005697250366, + "learning_rate": 4.958007872346353e-05, + "loss": 0.2147, + "step": 8554 + }, + { + "epoch": 0.15258802126065707, + "grad_norm": 0.30851417779922485, + "learning_rate": 4.9579794590511777e-05, + "loss": 0.2648, + "step": 8555 + }, + { + "epoch": 0.15260585738237079, + "grad_norm": 0.332461416721344, + "learning_rate": 4.957951036228024e-05, + "loss": 0.2503, + "step": 8556 + }, + { + "epoch": 0.15262369350408447, + "grad_norm": 0.3649390637874603, + "learning_rate": 4.957922603877005e-05, + "loss": 0.2272, + "step": 8557 + }, + { + "epoch": 0.15264152962579816, + "grad_norm": 0.3612349033355713, + "learning_rate": 4.957894161998228e-05, + "loss": 0.1984, + "step": 8558 + }, + { + "epoch": 0.15265936574751185, + "grad_norm": 0.6173525452613831, + "learning_rate": 4.9578657105918044e-05, + "loss": 0.2221, + "step": 8559 + }, + { + "epoch": 0.15267720186922557, + "grad_norm": 0.2857859134674072, + "learning_rate": 4.957837249657845e-05, + "loss": 0.2298, + "step": 8560 + }, + { + "epoch": 0.15269503799093925, + "grad_norm": 0.35880544781684875, + "learning_rate": 4.957808779196459e-05, + "loss": 0.2385, + "step": 8561 + }, + { + "epoch": 0.15271287411265294, + "grad_norm": 0.3190654516220093, + "learning_rate": 4.957780299207758e-05, + "loss": 0.2148, + "step": 8562 + }, + { + "epoch": 0.15273071023436663, + "grad_norm": 0.3299749195575714, + "learning_rate": 4.9577518096918506e-05, + "loss": 0.2325, + "step": 8563 + }, + { + "epoch": 0.15274854635608034, + "grad_norm": 0.28279489278793335, + "learning_rate": 4.957723310648849e-05, + "loss": 0.2081, + "step": 8564 + }, + { + "epoch": 0.15276638247779403, + "grad_norm": 0.49218663573265076, + "learning_rate": 4.957694802078863e-05, + "loss": 0.1838, + "step": 8565 + }, + { + "epoch": 0.15278421859950772, + "grad_norm": 0.24212224781513214, + "learning_rate": 4.957666283982002e-05, + "loss": 0.2183, + "step": 8566 + }, + { + "epoch": 0.1528020547212214, + "grad_norm": 0.34885886311531067, + "learning_rate": 4.9576377563583786e-05, + "loss": 0.2463, + "step": 8567 + }, + { + "epoch": 0.15281989084293512, + "grad_norm": 0.3379933536052704, + "learning_rate": 4.9576092192081024e-05, + "loss": 0.2377, + "step": 8568 + }, + { + "epoch": 0.1528377269646488, + "grad_norm": 0.3034881353378296, + "learning_rate": 4.957580672531283e-05, + "loss": 0.2469, + "step": 8569 + }, + { + "epoch": 0.1528555630863625, + "grad_norm": 0.2592678368091583, + "learning_rate": 4.9575521163280336e-05, + "loss": 0.1855, + "step": 8570 + }, + { + "epoch": 0.1528733992080762, + "grad_norm": 0.41172993183135986, + "learning_rate": 4.9575235505984626e-05, + "loss": 0.2534, + "step": 8571 + }, + { + "epoch": 0.1528912353297899, + "grad_norm": 0.39557528495788574, + "learning_rate": 4.957494975342682e-05, + "loss": 0.2813, + "step": 8572 + }, + { + "epoch": 0.1529090714515036, + "grad_norm": 0.2432795763015747, + "learning_rate": 4.9574663905608024e-05, + "loss": 0.2221, + "step": 8573 + }, + { + "epoch": 0.15292690757321728, + "grad_norm": 0.2979724407196045, + "learning_rate": 4.9574377962529327e-05, + "loss": 0.2153, + "step": 8574 + }, + { + "epoch": 0.15294474369493097, + "grad_norm": 0.3295513987541199, + "learning_rate": 4.957409192419187e-05, + "loss": 0.2493, + "step": 8575 + }, + { + "epoch": 0.15296257981664466, + "grad_norm": 0.242191880941391, + "learning_rate": 4.957380579059673e-05, + "loss": 0.1908, + "step": 8576 + }, + { + "epoch": 0.15298041593835837, + "grad_norm": 0.22380758821964264, + "learning_rate": 4.957351956174504e-05, + "loss": 0.2212, + "step": 8577 + }, + { + "epoch": 0.15299825206007206, + "grad_norm": 0.24662764370441437, + "learning_rate": 4.9573233237637904e-05, + "loss": 0.1852, + "step": 8578 + }, + { + "epoch": 0.15301608818178575, + "grad_norm": 0.25940418243408203, + "learning_rate": 4.957294681827642e-05, + "loss": 0.2088, + "step": 8579 + }, + { + "epoch": 0.15303392430349944, + "grad_norm": 0.30162614583969116, + "learning_rate": 4.9572660303661716e-05, + "loss": 0.2102, + "step": 8580 + }, + { + "epoch": 0.15305176042521315, + "grad_norm": 0.2159837931394577, + "learning_rate": 4.957237369379489e-05, + "loss": 0.203, + "step": 8581 + }, + { + "epoch": 0.15306959654692684, + "grad_norm": 0.3033880889415741, + "learning_rate": 4.957208698867706e-05, + "loss": 0.2548, + "step": 8582 + }, + { + "epoch": 0.15308743266864053, + "grad_norm": 0.24734680354595184, + "learning_rate": 4.957180018830933e-05, + "loss": 0.194, + "step": 8583 + }, + { + "epoch": 0.15310526879035422, + "grad_norm": 0.259811669588089, + "learning_rate": 4.9571513292692804e-05, + "loss": 0.2127, + "step": 8584 + }, + { + "epoch": 0.15312310491206793, + "grad_norm": 0.23779602348804474, + "learning_rate": 4.957122630182862e-05, + "loss": 0.1758, + "step": 8585 + }, + { + "epoch": 0.15314094103378162, + "grad_norm": 0.27816519141197205, + "learning_rate": 4.957093921571787e-05, + "loss": 0.161, + "step": 8586 + }, + { + "epoch": 0.1531587771554953, + "grad_norm": 0.5385149717330933, + "learning_rate": 4.957065203436168e-05, + "loss": 0.1855, + "step": 8587 + }, + { + "epoch": 0.153176613277209, + "grad_norm": 0.48837822675704956, + "learning_rate": 4.9570364757761154e-05, + "loss": 0.2055, + "step": 8588 + }, + { + "epoch": 0.1531944493989227, + "grad_norm": 0.25603777170181274, + "learning_rate": 4.9570077385917405e-05, + "loss": 0.2018, + "step": 8589 + }, + { + "epoch": 0.1532122855206364, + "grad_norm": 0.32549938559532166, + "learning_rate": 4.956978991883156e-05, + "loss": 0.2484, + "step": 8590 + }, + { + "epoch": 0.1532301216423501, + "grad_norm": 0.24236349761486053, + "learning_rate": 4.95695023565047e-05, + "loss": 0.2225, + "step": 8591 + }, + { + "epoch": 0.15324795776406377, + "grad_norm": 0.3066392242908478, + "learning_rate": 4.956921469893798e-05, + "loss": 0.2101, + "step": 8592 + }, + { + "epoch": 0.1532657938857775, + "grad_norm": 0.3377484083175659, + "learning_rate": 4.95689269461325e-05, + "loss": 0.2184, + "step": 8593 + }, + { + "epoch": 0.15328363000749118, + "grad_norm": 0.3868796229362488, + "learning_rate": 4.956863909808936e-05, + "loss": 0.2844, + "step": 8594 + }, + { + "epoch": 0.15330146612920487, + "grad_norm": 0.2308701127767563, + "learning_rate": 4.95683511548097e-05, + "loss": 0.2011, + "step": 8595 + }, + { + "epoch": 0.15331930225091855, + "grad_norm": 0.17837509512901306, + "learning_rate": 4.9568063116294625e-05, + "loss": 0.1902, + "step": 8596 + }, + { + "epoch": 0.15333713837263224, + "grad_norm": 0.25052610039711, + "learning_rate": 4.956777498254525e-05, + "loss": 0.2347, + "step": 8597 + }, + { + "epoch": 0.15335497449434596, + "grad_norm": 0.2298162877559662, + "learning_rate": 4.9567486753562697e-05, + "loss": 0.2386, + "step": 8598 + }, + { + "epoch": 0.15337281061605965, + "grad_norm": 0.33065974712371826, + "learning_rate": 4.9567198429348075e-05, + "loss": 0.2868, + "step": 8599 + }, + { + "epoch": 0.15339064673777333, + "grad_norm": 0.229557603597641, + "learning_rate": 4.956691000990251e-05, + "loss": 0.2111, + "step": 8600 + }, + { + "epoch": 0.15340848285948702, + "grad_norm": 0.2160930037498474, + "learning_rate": 4.9566621495227115e-05, + "loss": 0.2264, + "step": 8601 + }, + { + "epoch": 0.15342631898120074, + "grad_norm": 0.25786328315734863, + "learning_rate": 4.9566332885323005e-05, + "loss": 0.1941, + "step": 8602 + }, + { + "epoch": 0.15344415510291443, + "grad_norm": 0.3277515172958374, + "learning_rate": 4.9566044180191304e-05, + "loss": 0.2087, + "step": 8603 + }, + { + "epoch": 0.1534619912246281, + "grad_norm": 0.3133716285228729, + "learning_rate": 4.956575537983314e-05, + "loss": 0.1982, + "step": 8604 + }, + { + "epoch": 0.1534798273463418, + "grad_norm": 0.3015703558921814, + "learning_rate": 4.9565466484249616e-05, + "loss": 0.2515, + "step": 8605 + }, + { + "epoch": 0.15349766346805552, + "grad_norm": 0.3521009087562561, + "learning_rate": 4.9565177493441864e-05, + "loss": 0.2298, + "step": 8606 + }, + { + "epoch": 0.1535154995897692, + "grad_norm": 0.26029446721076965, + "learning_rate": 4.9564888407411e-05, + "loss": 0.2074, + "step": 8607 + }, + { + "epoch": 0.1535333357114829, + "grad_norm": 0.4189286231994629, + "learning_rate": 4.9564599226158136e-05, + "loss": 0.2077, + "step": 8608 + }, + { + "epoch": 0.15355117183319658, + "grad_norm": 0.22298909723758698, + "learning_rate": 4.956430994968441e-05, + "loss": 0.1952, + "step": 8609 + }, + { + "epoch": 0.1535690079549103, + "grad_norm": 0.3025025725364685, + "learning_rate": 4.956402057799093e-05, + "loss": 0.2466, + "step": 8610 + }, + { + "epoch": 0.15358684407662399, + "grad_norm": 0.2786144018173218, + "learning_rate": 4.956373111107883e-05, + "loss": 0.2594, + "step": 8611 + }, + { + "epoch": 0.15360468019833767, + "grad_norm": 0.3745969235897064, + "learning_rate": 4.9563441548949205e-05, + "loss": 0.2655, + "step": 8612 + }, + { + "epoch": 0.15362251632005136, + "grad_norm": 0.2170204520225525, + "learning_rate": 4.956315189160322e-05, + "loss": 0.1745, + "step": 8613 + }, + { + "epoch": 0.15364035244176505, + "grad_norm": 0.2866086959838867, + "learning_rate": 4.956286213904196e-05, + "loss": 0.2301, + "step": 8614 + }, + { + "epoch": 0.15365818856347876, + "grad_norm": 0.5086783170700073, + "learning_rate": 4.9562572291266565e-05, + "loss": 0.2374, + "step": 8615 + }, + { + "epoch": 0.15367602468519245, + "grad_norm": 0.22385302186012268, + "learning_rate": 4.956228234827816e-05, + "loss": 0.2082, + "step": 8616 + }, + { + "epoch": 0.15369386080690614, + "grad_norm": 0.20260019600391388, + "learning_rate": 4.956199231007786e-05, + "loss": 0.1938, + "step": 8617 + }, + { + "epoch": 0.15371169692861983, + "grad_norm": 0.3071066737174988, + "learning_rate": 4.9561702176666796e-05, + "loss": 0.1806, + "step": 8618 + }, + { + "epoch": 0.15372953305033354, + "grad_norm": 0.2761852741241455, + "learning_rate": 4.956141194804609e-05, + "loss": 0.2204, + "step": 8619 + }, + { + "epoch": 0.15374736917204723, + "grad_norm": 0.31176427006721497, + "learning_rate": 4.956112162421687e-05, + "loss": 0.2642, + "step": 8620 + }, + { + "epoch": 0.15376520529376092, + "grad_norm": 0.43502193689346313, + "learning_rate": 4.956083120518026e-05, + "loss": 0.2575, + "step": 8621 + }, + { + "epoch": 0.1537830414154746, + "grad_norm": 0.25103381276130676, + "learning_rate": 4.956054069093738e-05, + "loss": 0.2386, + "step": 8622 + }, + { + "epoch": 0.15380087753718832, + "grad_norm": 0.21322041749954224, + "learning_rate": 4.956025008148937e-05, + "loss": 0.2087, + "step": 8623 + }, + { + "epoch": 0.153818713658902, + "grad_norm": 0.3090284764766693, + "learning_rate": 4.955995937683734e-05, + "loss": 0.2988, + "step": 8624 + }, + { + "epoch": 0.1538365497806157, + "grad_norm": 0.27395325899124146, + "learning_rate": 4.955966857698243e-05, + "loss": 0.226, + "step": 8625 + }, + { + "epoch": 0.1538543859023294, + "grad_norm": 0.30051928758621216, + "learning_rate": 4.9559377681925764e-05, + "loss": 0.2205, + "step": 8626 + }, + { + "epoch": 0.1538722220240431, + "grad_norm": 0.33471882343292236, + "learning_rate": 4.955908669166846e-05, + "loss": 0.192, + "step": 8627 + }, + { + "epoch": 0.1538900581457568, + "grad_norm": 0.2831500768661499, + "learning_rate": 4.955879560621166e-05, + "loss": 0.235, + "step": 8628 + }, + { + "epoch": 0.15390789426747048, + "grad_norm": 0.28826627135276794, + "learning_rate": 4.955850442555648e-05, + "loss": 0.2041, + "step": 8629 + }, + { + "epoch": 0.15392573038918417, + "grad_norm": 0.3237340450286865, + "learning_rate": 4.955821314970406e-05, + "loss": 0.2274, + "step": 8630 + }, + { + "epoch": 0.15394356651089788, + "grad_norm": 0.3359971046447754, + "learning_rate": 4.955792177865553e-05, + "loss": 0.225, + "step": 8631 + }, + { + "epoch": 0.15396140263261157, + "grad_norm": 0.22938977181911469, + "learning_rate": 4.9557630312412e-05, + "loss": 0.2399, + "step": 8632 + }, + { + "epoch": 0.15397923875432526, + "grad_norm": 0.18498247861862183, + "learning_rate": 4.955733875097461e-05, + "loss": 0.1935, + "step": 8633 + }, + { + "epoch": 0.15399707487603895, + "grad_norm": 0.29871103167533875, + "learning_rate": 4.9557047094344504e-05, + "loss": 0.2009, + "step": 8634 + }, + { + "epoch": 0.15401491099775264, + "grad_norm": 0.314211368560791, + "learning_rate": 4.95567553425228e-05, + "loss": 0.216, + "step": 8635 + }, + { + "epoch": 0.15403274711946635, + "grad_norm": 0.2440318912267685, + "learning_rate": 4.955646349551063e-05, + "loss": 0.2551, + "step": 8636 + }, + { + "epoch": 0.15405058324118004, + "grad_norm": 0.2110043466091156, + "learning_rate": 4.955617155330913e-05, + "loss": 0.1894, + "step": 8637 + }, + { + "epoch": 0.15406841936289373, + "grad_norm": 0.21493402123451233, + "learning_rate": 4.955587951591941e-05, + "loss": 0.2011, + "step": 8638 + }, + { + "epoch": 0.15408625548460742, + "grad_norm": 0.21289654076099396, + "learning_rate": 4.955558738334264e-05, + "loss": 0.2037, + "step": 8639 + }, + { + "epoch": 0.15410409160632113, + "grad_norm": 0.3226395845413208, + "learning_rate": 4.9555295155579925e-05, + "loss": 0.2882, + "step": 8640 + }, + { + "epoch": 0.15412192772803482, + "grad_norm": 0.32725533843040466, + "learning_rate": 4.95550028326324e-05, + "loss": 0.3093, + "step": 8641 + }, + { + "epoch": 0.1541397638497485, + "grad_norm": 0.3094915449619293, + "learning_rate": 4.955471041450121e-05, + "loss": 0.2093, + "step": 8642 + }, + { + "epoch": 0.1541575999714622, + "grad_norm": 0.27979058027267456, + "learning_rate": 4.955441790118748e-05, + "loss": 0.2234, + "step": 8643 + }, + { + "epoch": 0.1541754360931759, + "grad_norm": 0.22268089652061462, + "learning_rate": 4.9554125292692346e-05, + "loss": 0.2318, + "step": 8644 + }, + { + "epoch": 0.1541932722148896, + "grad_norm": 0.20991800725460052, + "learning_rate": 4.9553832589016934e-05, + "loss": 0.2301, + "step": 8645 + }, + { + "epoch": 0.1542111083366033, + "grad_norm": 0.2542620301246643, + "learning_rate": 4.9553539790162395e-05, + "loss": 0.2449, + "step": 8646 + }, + { + "epoch": 0.15422894445831697, + "grad_norm": 0.30696237087249756, + "learning_rate": 4.9553246896129854e-05, + "loss": 0.2621, + "step": 8647 + }, + { + "epoch": 0.1542467805800307, + "grad_norm": 0.29050058126449585, + "learning_rate": 4.955295390692044e-05, + "loss": 0.2537, + "step": 8648 + }, + { + "epoch": 0.15426461670174438, + "grad_norm": 0.2910822927951813, + "learning_rate": 4.9552660822535306e-05, + "loss": 0.2586, + "step": 8649 + }, + { + "epoch": 0.15428245282345807, + "grad_norm": 0.26228010654449463, + "learning_rate": 4.9552367642975575e-05, + "loss": 0.2458, + "step": 8650 + }, + { + "epoch": 0.15430028894517175, + "grad_norm": 0.24198441207408905, + "learning_rate": 4.955207436824239e-05, + "loss": 0.195, + "step": 8651 + }, + { + "epoch": 0.15431812506688547, + "grad_norm": 0.3051553964614868, + "learning_rate": 4.9551780998336885e-05, + "loss": 0.2308, + "step": 8652 + }, + { + "epoch": 0.15433596118859916, + "grad_norm": 0.33055949211120605, + "learning_rate": 4.955148753326019e-05, + "loss": 0.2798, + "step": 8653 + }, + { + "epoch": 0.15435379731031285, + "grad_norm": 0.3392762839794159, + "learning_rate": 4.9551193973013453e-05, + "loss": 0.2468, + "step": 8654 + }, + { + "epoch": 0.15437163343202653, + "grad_norm": 0.3386043608188629, + "learning_rate": 4.95509003175978e-05, + "loss": 0.231, + "step": 8655 + }, + { + "epoch": 0.15438946955374022, + "grad_norm": 0.31472453474998474, + "learning_rate": 4.955060656701439e-05, + "loss": 0.2748, + "step": 8656 + }, + { + "epoch": 0.15440730567545394, + "grad_norm": 0.2320673018693924, + "learning_rate": 4.955031272126435e-05, + "loss": 0.2129, + "step": 8657 + }, + { + "epoch": 0.15442514179716763, + "grad_norm": 0.2811758816242218, + "learning_rate": 4.955001878034881e-05, + "loss": 0.2266, + "step": 8658 + }, + { + "epoch": 0.1544429779188813, + "grad_norm": 0.32722583413124084, + "learning_rate": 4.954972474426892e-05, + "loss": 0.2144, + "step": 8659 + }, + { + "epoch": 0.154460814040595, + "grad_norm": 0.2794470489025116, + "learning_rate": 4.9549430613025824e-05, + "loss": 0.2236, + "step": 8660 + }, + { + "epoch": 0.15447865016230872, + "grad_norm": 0.2867196202278137, + "learning_rate": 4.9549136386620655e-05, + "loss": 0.2413, + "step": 8661 + }, + { + "epoch": 0.1544964862840224, + "grad_norm": 0.2107728123664856, + "learning_rate": 4.954884206505455e-05, + "loss": 0.173, + "step": 8662 + }, + { + "epoch": 0.1545143224057361, + "grad_norm": 0.21892587840557098, + "learning_rate": 4.954854764832865e-05, + "loss": 0.1816, + "step": 8663 + }, + { + "epoch": 0.15453215852744978, + "grad_norm": 0.2340811938047409, + "learning_rate": 4.9548253136444105e-05, + "loss": 0.1745, + "step": 8664 + }, + { + "epoch": 0.1545499946491635, + "grad_norm": 0.26641198992729187, + "learning_rate": 4.9547958529402056e-05, + "loss": 0.2024, + "step": 8665 + }, + { + "epoch": 0.15456783077087718, + "grad_norm": 0.40584680438041687, + "learning_rate": 4.954766382720364e-05, + "loss": 0.2996, + "step": 8666 + }, + { + "epoch": 0.15458566689259087, + "grad_norm": 0.26639845967292786, + "learning_rate": 4.954736902985e-05, + "loss": 0.2264, + "step": 8667 + }, + { + "epoch": 0.15460350301430456, + "grad_norm": 0.4409404993057251, + "learning_rate": 4.954707413734227e-05, + "loss": 0.3571, + "step": 8668 + }, + { + "epoch": 0.15462133913601828, + "grad_norm": 0.2202620804309845, + "learning_rate": 4.954677914968161e-05, + "loss": 0.1992, + "step": 8669 + }, + { + "epoch": 0.15463917525773196, + "grad_norm": 0.23582911491394043, + "learning_rate": 4.954648406686916e-05, + "loss": 0.2166, + "step": 8670 + }, + { + "epoch": 0.15465701137944565, + "grad_norm": 0.32048580050468445, + "learning_rate": 4.954618888890605e-05, + "loss": 0.2881, + "step": 8671 + }, + { + "epoch": 0.15467484750115934, + "grad_norm": 0.26223132014274597, + "learning_rate": 4.9545893615793444e-05, + "loss": 0.1733, + "step": 8672 + }, + { + "epoch": 0.15469268362287306, + "grad_norm": 0.25206106901168823, + "learning_rate": 4.9545598247532473e-05, + "loss": 0.2346, + "step": 8673 + }, + { + "epoch": 0.15471051974458674, + "grad_norm": 0.3384833037853241, + "learning_rate": 4.954530278412428e-05, + "loss": 0.2749, + "step": 8674 + }, + { + "epoch": 0.15472835586630043, + "grad_norm": 0.2313491553068161, + "learning_rate": 4.9545007225570024e-05, + "loss": 0.224, + "step": 8675 + }, + { + "epoch": 0.15474619198801412, + "grad_norm": 0.307544082403183, + "learning_rate": 4.954471157187084e-05, + "loss": 0.2255, + "step": 8676 + }, + { + "epoch": 0.1547640281097278, + "grad_norm": 0.24871240556240082, + "learning_rate": 4.954441582302787e-05, + "loss": 0.201, + "step": 8677 + }, + { + "epoch": 0.15478186423144152, + "grad_norm": 0.31050002574920654, + "learning_rate": 4.954411997904228e-05, + "loss": 0.2032, + "step": 8678 + }, + { + "epoch": 0.1547997003531552, + "grad_norm": 0.3050120174884796, + "learning_rate": 4.9543824039915185e-05, + "loss": 0.2329, + "step": 8679 + }, + { + "epoch": 0.1548175364748689, + "grad_norm": 0.31380385160446167, + "learning_rate": 4.9543528005647766e-05, + "loss": 0.2206, + "step": 8680 + }, + { + "epoch": 0.1548353725965826, + "grad_norm": 0.2867913246154785, + "learning_rate": 4.9543231876241145e-05, + "loss": 0.207, + "step": 8681 + }, + { + "epoch": 0.1548532087182963, + "grad_norm": 0.30340704321861267, + "learning_rate": 4.9542935651696496e-05, + "loss": 0.2325, + "step": 8682 + }, + { + "epoch": 0.15487104484001, + "grad_norm": 0.2605895400047302, + "learning_rate": 4.954263933201494e-05, + "loss": 0.1886, + "step": 8683 + }, + { + "epoch": 0.15488888096172368, + "grad_norm": 0.2807618975639343, + "learning_rate": 4.9542342917197636e-05, + "loss": 0.2215, + "step": 8684 + }, + { + "epoch": 0.15490671708343737, + "grad_norm": 0.25563374161720276, + "learning_rate": 4.9542046407245737e-05, + "loss": 0.2062, + "step": 8685 + }, + { + "epoch": 0.15492455320515108, + "grad_norm": 0.3591375946998596, + "learning_rate": 4.954174980216039e-05, + "loss": 0.2117, + "step": 8686 + }, + { + "epoch": 0.15494238932686477, + "grad_norm": 0.27902495861053467, + "learning_rate": 4.9541453101942744e-05, + "loss": 0.1791, + "step": 8687 + }, + { + "epoch": 0.15496022544857846, + "grad_norm": 0.26965099573135376, + "learning_rate": 4.954115630659395e-05, + "loss": 0.2185, + "step": 8688 + }, + { + "epoch": 0.15497806157029215, + "grad_norm": 0.2934694290161133, + "learning_rate": 4.954085941611516e-05, + "loss": 0.2054, + "step": 8689 + }, + { + "epoch": 0.15499589769200586, + "grad_norm": 0.31233468651771545, + "learning_rate": 4.954056243050752e-05, + "loss": 0.2922, + "step": 8690 + }, + { + "epoch": 0.15501373381371955, + "grad_norm": 0.259063184261322, + "learning_rate": 4.954026534977218e-05, + "loss": 0.2171, + "step": 8691 + }, + { + "epoch": 0.15503156993543324, + "grad_norm": 0.24388599395751953, + "learning_rate": 4.95399681739103e-05, + "loss": 0.2172, + "step": 8692 + }, + { + "epoch": 0.15504940605714693, + "grad_norm": 0.2819579541683197, + "learning_rate": 4.953967090292303e-05, + "loss": 0.2178, + "step": 8693 + }, + { + "epoch": 0.15506724217886061, + "grad_norm": 0.25104451179504395, + "learning_rate": 4.9539373536811516e-05, + "loss": 0.1745, + "step": 8694 + }, + { + "epoch": 0.15508507830057433, + "grad_norm": 0.31823596358299255, + "learning_rate": 4.953907607557692e-05, + "loss": 0.1805, + "step": 8695 + }, + { + "epoch": 0.15510291442228802, + "grad_norm": 0.2884424924850464, + "learning_rate": 4.953877851922038e-05, + "loss": 0.2039, + "step": 8696 + }, + { + "epoch": 0.1551207505440017, + "grad_norm": 0.2584998309612274, + "learning_rate": 4.9538480867743064e-05, + "loss": 0.264, + "step": 8697 + }, + { + "epoch": 0.1551385866657154, + "grad_norm": 0.3246391713619232, + "learning_rate": 4.953818312114612e-05, + "loss": 0.2266, + "step": 8698 + }, + { + "epoch": 0.1551564227874291, + "grad_norm": 0.26628631353378296, + "learning_rate": 4.9537885279430705e-05, + "loss": 0.2587, + "step": 8699 + }, + { + "epoch": 0.1551742589091428, + "grad_norm": 0.1852973997592926, + "learning_rate": 4.953758734259797e-05, + "loss": 0.1619, + "step": 8700 + }, + { + "epoch": 0.1551920950308565, + "grad_norm": 0.19855822622776031, + "learning_rate": 4.953728931064907e-05, + "loss": 0.1836, + "step": 8701 + }, + { + "epoch": 0.15520993115257017, + "grad_norm": 0.2494380623102188, + "learning_rate": 4.953699118358517e-05, + "loss": 0.2222, + "step": 8702 + }, + { + "epoch": 0.1552277672742839, + "grad_norm": 0.2721339166164398, + "learning_rate": 4.953669296140741e-05, + "loss": 0.2114, + "step": 8703 + }, + { + "epoch": 0.15524560339599758, + "grad_norm": 0.25268349051475525, + "learning_rate": 4.9536394644116954e-05, + "loss": 0.2186, + "step": 8704 + }, + { + "epoch": 0.15526343951771127, + "grad_norm": 0.3157144784927368, + "learning_rate": 4.9536096231714954e-05, + "loss": 0.2638, + "step": 8705 + }, + { + "epoch": 0.15528127563942495, + "grad_norm": 0.22628776729106903, + "learning_rate": 4.953579772420258e-05, + "loss": 0.1914, + "step": 8706 + }, + { + "epoch": 0.15529911176113867, + "grad_norm": 0.2234720140695572, + "learning_rate": 4.953549912158097e-05, + "loss": 0.1698, + "step": 8707 + }, + { + "epoch": 0.15531694788285236, + "grad_norm": 0.3426528573036194, + "learning_rate": 4.9535200423851295e-05, + "loss": 0.2192, + "step": 8708 + }, + { + "epoch": 0.15533478400456605, + "grad_norm": 0.24046680331230164, + "learning_rate": 4.9534901631014706e-05, + "loss": 0.2183, + "step": 8709 + }, + { + "epoch": 0.15535262012627973, + "grad_norm": 0.31615275144577026, + "learning_rate": 4.953460274307237e-05, + "loss": 0.2088, + "step": 8710 + }, + { + "epoch": 0.15537045624799345, + "grad_norm": 0.22079584002494812, + "learning_rate": 4.953430376002544e-05, + "loss": 0.1927, + "step": 8711 + }, + { + "epoch": 0.15538829236970714, + "grad_norm": 0.24755722284317017, + "learning_rate": 4.953400468187507e-05, + "loss": 0.2278, + "step": 8712 + }, + { + "epoch": 0.15540612849142083, + "grad_norm": 0.22813691198825836, + "learning_rate": 4.953370550862243e-05, + "loss": 0.1993, + "step": 8713 + }, + { + "epoch": 0.1554239646131345, + "grad_norm": 0.2970469892024994, + "learning_rate": 4.953340624026867e-05, + "loss": 0.1896, + "step": 8714 + }, + { + "epoch": 0.1554418007348482, + "grad_norm": 0.36117640137672424, + "learning_rate": 4.953310687681495e-05, + "loss": 0.1771, + "step": 8715 + }, + { + "epoch": 0.15545963685656192, + "grad_norm": 0.2168557047843933, + "learning_rate": 4.953280741826244e-05, + "loss": 0.1855, + "step": 8716 + }, + { + "epoch": 0.1554774729782756, + "grad_norm": 0.24977099895477295, + "learning_rate": 4.9532507864612296e-05, + "loss": 0.231, + "step": 8717 + }, + { + "epoch": 0.1554953090999893, + "grad_norm": 0.1853153258562088, + "learning_rate": 4.953220821586567e-05, + "loss": 0.1602, + "step": 8718 + }, + { + "epoch": 0.15551314522170298, + "grad_norm": 0.25104716420173645, + "learning_rate": 4.953190847202374e-05, + "loss": 0.2043, + "step": 8719 + }, + { + "epoch": 0.1555309813434167, + "grad_norm": 0.23357023298740387, + "learning_rate": 4.953160863308766e-05, + "loss": 0.1792, + "step": 8720 + }, + { + "epoch": 0.15554881746513038, + "grad_norm": 0.24363559484481812, + "learning_rate": 4.953130869905859e-05, + "loss": 0.2322, + "step": 8721 + }, + { + "epoch": 0.15556665358684407, + "grad_norm": 0.3597864508628845, + "learning_rate": 4.953100866993769e-05, + "loss": 0.2694, + "step": 8722 + }, + { + "epoch": 0.15558448970855776, + "grad_norm": 0.3142867684364319, + "learning_rate": 4.9530708545726135e-05, + "loss": 0.2261, + "step": 8723 + }, + { + "epoch": 0.15560232583027148, + "grad_norm": 0.24061615765094757, + "learning_rate": 4.953040832642507e-05, + "loss": 0.2014, + "step": 8724 + }, + { + "epoch": 0.15562016195198516, + "grad_norm": 0.294924259185791, + "learning_rate": 4.953010801203568e-05, + "loss": 0.256, + "step": 8725 + }, + { + "epoch": 0.15563799807369885, + "grad_norm": 0.35685548186302185, + "learning_rate": 4.952980760255912e-05, + "loss": 0.2613, + "step": 8726 + }, + { + "epoch": 0.15565583419541254, + "grad_norm": 0.36911293864250183, + "learning_rate": 4.952950709799655e-05, + "loss": 0.1945, + "step": 8727 + }, + { + "epoch": 0.15567367031712626, + "grad_norm": 0.27103596925735474, + "learning_rate": 4.9529206498349134e-05, + "loss": 0.2065, + "step": 8728 + }, + { + "epoch": 0.15569150643883994, + "grad_norm": 0.32939764857292175, + "learning_rate": 4.952890580361804e-05, + "loss": 0.2353, + "step": 8729 + }, + { + "epoch": 0.15570934256055363, + "grad_norm": 0.2824895977973938, + "learning_rate": 4.952860501380445e-05, + "loss": 0.256, + "step": 8730 + }, + { + "epoch": 0.15572717868226732, + "grad_norm": 0.31564801931381226, + "learning_rate": 4.95283041289095e-05, + "loss": 0.3028, + "step": 8731 + }, + { + "epoch": 0.15574501480398104, + "grad_norm": 0.2352769374847412, + "learning_rate": 4.952800314893438e-05, + "loss": 0.2005, + "step": 8732 + }, + { + "epoch": 0.15576285092569472, + "grad_norm": 0.36282840371131897, + "learning_rate": 4.952770207388024e-05, + "loss": 0.1704, + "step": 8733 + }, + { + "epoch": 0.1557806870474084, + "grad_norm": 0.2992383539676666, + "learning_rate": 4.9527400903748264e-05, + "loss": 0.2129, + "step": 8734 + }, + { + "epoch": 0.1557985231691221, + "grad_norm": 0.27786487340927124, + "learning_rate": 4.952709963853961e-05, + "loss": 0.2069, + "step": 8735 + }, + { + "epoch": 0.1558163592908358, + "grad_norm": 0.2970375120639801, + "learning_rate": 4.9526798278255435e-05, + "loss": 0.2419, + "step": 8736 + }, + { + "epoch": 0.1558341954125495, + "grad_norm": 0.2086101770401001, + "learning_rate": 4.952649682289693e-05, + "loss": 0.1731, + "step": 8737 + }, + { + "epoch": 0.1558520315342632, + "grad_norm": 0.40671852231025696, + "learning_rate": 4.952619527246525e-05, + "loss": 0.1869, + "step": 8738 + }, + { + "epoch": 0.15586986765597688, + "grad_norm": 0.2768620550632477, + "learning_rate": 4.9525893626961564e-05, + "loss": 0.2335, + "step": 8739 + }, + { + "epoch": 0.15588770377769057, + "grad_norm": 0.22819559276103973, + "learning_rate": 4.952559188638705e-05, + "loss": 0.1749, + "step": 8740 + }, + { + "epoch": 0.15590553989940428, + "grad_norm": 0.2739728093147278, + "learning_rate": 4.9525290050742855e-05, + "loss": 0.2235, + "step": 8741 + }, + { + "epoch": 0.15592337602111797, + "grad_norm": 0.3490871787071228, + "learning_rate": 4.952498812003018e-05, + "loss": 0.2354, + "step": 8742 + }, + { + "epoch": 0.15594121214283166, + "grad_norm": 0.287117600440979, + "learning_rate": 4.9524686094250175e-05, + "loss": 0.254, + "step": 8743 + }, + { + "epoch": 0.15595904826454535, + "grad_norm": 0.29424506425857544, + "learning_rate": 4.952438397340402e-05, + "loss": 0.2015, + "step": 8744 + }, + { + "epoch": 0.15597688438625906, + "grad_norm": 0.31124168634414673, + "learning_rate": 4.952408175749288e-05, + "loss": 0.2054, + "step": 8745 + }, + { + "epoch": 0.15599472050797275, + "grad_norm": 0.2352152317762375, + "learning_rate": 4.952377944651793e-05, + "loss": 0.2381, + "step": 8746 + }, + { + "epoch": 0.15601255662968644, + "grad_norm": 0.28440725803375244, + "learning_rate": 4.952347704048033e-05, + "loss": 0.239, + "step": 8747 + }, + { + "epoch": 0.15603039275140013, + "grad_norm": 0.25761833786964417, + "learning_rate": 4.952317453938127e-05, + "loss": 0.2086, + "step": 8748 + }, + { + "epoch": 0.15604822887311384, + "grad_norm": 0.21963690221309662, + "learning_rate": 4.9522871943221914e-05, + "loss": 0.1989, + "step": 8749 + }, + { + "epoch": 0.15606606499482753, + "grad_norm": 0.3100791871547699, + "learning_rate": 4.952256925200345e-05, + "loss": 0.2065, + "step": 8750 + }, + { + "epoch": 0.15608390111654122, + "grad_norm": 0.34078601002693176, + "learning_rate": 4.952226646572702e-05, + "loss": 0.2479, + "step": 8751 + }, + { + "epoch": 0.1561017372382549, + "grad_norm": 0.30791711807250977, + "learning_rate": 4.9521963584393824e-05, + "loss": 0.2446, + "step": 8752 + }, + { + "epoch": 0.15611957335996862, + "grad_norm": 0.2543742060661316, + "learning_rate": 4.952166060800503e-05, + "loss": 0.2336, + "step": 8753 + }, + { + "epoch": 0.1561374094816823, + "grad_norm": 0.23772947490215302, + "learning_rate": 4.95213575365618e-05, + "loss": 0.1987, + "step": 8754 + }, + { + "epoch": 0.156155245603396, + "grad_norm": 0.27661585807800293, + "learning_rate": 4.9521054370065324e-05, + "loss": 0.238, + "step": 8755 + }, + { + "epoch": 0.15617308172510969, + "grad_norm": 0.2552585005760193, + "learning_rate": 4.9520751108516773e-05, + "loss": 0.2462, + "step": 8756 + }, + { + "epoch": 0.15619091784682337, + "grad_norm": 0.2204442024230957, + "learning_rate": 4.9520447751917323e-05, + "loss": 0.2097, + "step": 8757 + }, + { + "epoch": 0.1562087539685371, + "grad_norm": 0.2437412589788437, + "learning_rate": 4.9520144300268146e-05, + "loss": 0.1955, + "step": 8758 + }, + { + "epoch": 0.15622659009025078, + "grad_norm": 0.3021933138370514, + "learning_rate": 4.9519840753570426e-05, + "loss": 0.3068, + "step": 8759 + }, + { + "epoch": 0.15624442621196447, + "grad_norm": 0.33262425661087036, + "learning_rate": 4.9519537111825324e-05, + "loss": 0.2585, + "step": 8760 + }, + { + "epoch": 0.15626226233367815, + "grad_norm": 0.33257779479026794, + "learning_rate": 4.951923337503404e-05, + "loss": 0.1946, + "step": 8761 + }, + { + "epoch": 0.15628009845539187, + "grad_norm": 0.2478998601436615, + "learning_rate": 4.951892954319772e-05, + "loss": 0.2019, + "step": 8762 + }, + { + "epoch": 0.15629793457710556, + "grad_norm": 0.23337683081626892, + "learning_rate": 4.9518625616317583e-05, + "loss": 0.1918, + "step": 8763 + }, + { + "epoch": 0.15631577069881925, + "grad_norm": 0.27422499656677246, + "learning_rate": 4.9518321594394767e-05, + "loss": 0.1997, + "step": 8764 + }, + { + "epoch": 0.15633360682053293, + "grad_norm": 0.2050887942314148, + "learning_rate": 4.9518017477430476e-05, + "loss": 0.1976, + "step": 8765 + }, + { + "epoch": 0.15635144294224665, + "grad_norm": 0.34559714794158936, + "learning_rate": 4.951771326542588e-05, + "loss": 0.1783, + "step": 8766 + }, + { + "epoch": 0.15636927906396034, + "grad_norm": 0.3659738600254059, + "learning_rate": 4.951740895838216e-05, + "loss": 0.1905, + "step": 8767 + }, + { + "epoch": 0.15638711518567402, + "grad_norm": 0.42823851108551025, + "learning_rate": 4.95171045563005e-05, + "loss": 0.2344, + "step": 8768 + }, + { + "epoch": 0.1564049513073877, + "grad_norm": 0.2627209424972534, + "learning_rate": 4.951680005918207e-05, + "loss": 0.2262, + "step": 8769 + }, + { + "epoch": 0.15642278742910143, + "grad_norm": 0.27858999371528625, + "learning_rate": 4.951649546702805e-05, + "loss": 0.2307, + "step": 8770 + }, + { + "epoch": 0.15644062355081512, + "grad_norm": 0.3153055012226105, + "learning_rate": 4.951619077983963e-05, + "loss": 0.2738, + "step": 8771 + }, + { + "epoch": 0.1564584596725288, + "grad_norm": 0.4113086760044098, + "learning_rate": 4.951588599761798e-05, + "loss": 0.2621, + "step": 8772 + }, + { + "epoch": 0.1564762957942425, + "grad_norm": 0.2547321319580078, + "learning_rate": 4.9515581120364295e-05, + "loss": 0.2166, + "step": 8773 + }, + { + "epoch": 0.1564941319159562, + "grad_norm": 0.28523120284080505, + "learning_rate": 4.9515276148079754e-05, + "loss": 0.2276, + "step": 8774 + }, + { + "epoch": 0.1565119680376699, + "grad_norm": 0.21435360610485077, + "learning_rate": 4.951497108076553e-05, + "loss": 0.2089, + "step": 8775 + }, + { + "epoch": 0.15652980415938358, + "grad_norm": 0.2475617676973343, + "learning_rate": 4.9514665918422815e-05, + "loss": 0.2407, + "step": 8776 + }, + { + "epoch": 0.15654764028109727, + "grad_norm": 0.2178918570280075, + "learning_rate": 4.951436066105278e-05, + "loss": 0.2188, + "step": 8777 + }, + { + "epoch": 0.15656547640281096, + "grad_norm": 0.31759148836135864, + "learning_rate": 4.951405530865663e-05, + "loss": 0.244, + "step": 8778 + }, + { + "epoch": 0.15658331252452468, + "grad_norm": 0.2582131624221802, + "learning_rate": 4.951374986123553e-05, + "loss": 0.2115, + "step": 8779 + }, + { + "epoch": 0.15660114864623836, + "grad_norm": 0.23167286813259125, + "learning_rate": 4.951344431879066e-05, + "loss": 0.1681, + "step": 8780 + }, + { + "epoch": 0.15661898476795205, + "grad_norm": 1.0556530952453613, + "learning_rate": 4.951313868132321e-05, + "loss": 0.1988, + "step": 8781 + }, + { + "epoch": 0.15663682088966574, + "grad_norm": 0.35775575041770935, + "learning_rate": 4.951283294883438e-05, + "loss": 0.3155, + "step": 8782 + }, + { + "epoch": 0.15665465701137946, + "grad_norm": 0.18764452636241913, + "learning_rate": 4.9512527121325345e-05, + "loss": 0.2122, + "step": 8783 + }, + { + "epoch": 0.15667249313309314, + "grad_norm": 0.33498260378837585, + "learning_rate": 4.9512221198797285e-05, + "loss": 0.2391, + "step": 8784 + }, + { + "epoch": 0.15669032925480683, + "grad_norm": 0.3173202574253082, + "learning_rate": 4.951191518125138e-05, + "loss": 0.2411, + "step": 8785 + }, + { + "epoch": 0.15670816537652052, + "grad_norm": 0.2543131709098816, + "learning_rate": 4.9511609068688836e-05, + "loss": 0.2016, + "step": 8786 + }, + { + "epoch": 0.15672600149823424, + "grad_norm": 0.26163366436958313, + "learning_rate": 4.951130286111082e-05, + "loss": 0.223, + "step": 8787 + }, + { + "epoch": 0.15674383761994792, + "grad_norm": 0.3173080086708069, + "learning_rate": 4.951099655851854e-05, + "loss": 0.1949, + "step": 8788 + }, + { + "epoch": 0.1567616737416616, + "grad_norm": 0.336243212223053, + "learning_rate": 4.9510690160913166e-05, + "loss": 0.2066, + "step": 8789 + }, + { + "epoch": 0.1567795098633753, + "grad_norm": 0.49204471707344055, + "learning_rate": 4.951038366829589e-05, + "loss": 0.1743, + "step": 8790 + }, + { + "epoch": 0.15679734598508901, + "grad_norm": 0.2887970805168152, + "learning_rate": 4.95100770806679e-05, + "loss": 0.1905, + "step": 8791 + }, + { + "epoch": 0.1568151821068027, + "grad_norm": 0.2841673791408539, + "learning_rate": 4.950977039803039e-05, + "loss": 0.2525, + "step": 8792 + }, + { + "epoch": 0.1568330182285164, + "grad_norm": 0.353541761636734, + "learning_rate": 4.950946362038454e-05, + "loss": 0.1895, + "step": 8793 + }, + { + "epoch": 0.15685085435023008, + "grad_norm": 0.2435857206583023, + "learning_rate": 4.9509156747731544e-05, + "loss": 0.2216, + "step": 8794 + }, + { + "epoch": 0.15686869047194377, + "grad_norm": 0.2575038969516754, + "learning_rate": 4.95088497800726e-05, + "loss": 0.2111, + "step": 8795 + }, + { + "epoch": 0.15688652659365748, + "grad_norm": 0.3675071597099304, + "learning_rate": 4.9508542717408877e-05, + "loss": 0.2019, + "step": 8796 + }, + { + "epoch": 0.15690436271537117, + "grad_norm": 0.3092038333415985, + "learning_rate": 4.950823555974158e-05, + "loss": 0.2541, + "step": 8797 + }, + { + "epoch": 0.15692219883708486, + "grad_norm": 0.23880967497825623, + "learning_rate": 4.9507928307071904e-05, + "loss": 0.2112, + "step": 8798 + }, + { + "epoch": 0.15694003495879855, + "grad_norm": 0.21204820275306702, + "learning_rate": 4.9507620959401024e-05, + "loss": 0.1796, + "step": 8799 + }, + { + "epoch": 0.15695787108051226, + "grad_norm": 0.21364718675613403, + "learning_rate": 4.950731351673015e-05, + "loss": 0.2262, + "step": 8800 + }, + { + "epoch": 0.15697570720222595, + "grad_norm": 0.2173614203929901, + "learning_rate": 4.950700597906046e-05, + "loss": 0.2114, + "step": 8801 + }, + { + "epoch": 0.15699354332393964, + "grad_norm": 0.26080960035324097, + "learning_rate": 4.950669834639315e-05, + "loss": 0.2279, + "step": 8802 + }, + { + "epoch": 0.15701137944565333, + "grad_norm": 0.3522777557373047, + "learning_rate": 4.9506390618729416e-05, + "loss": 0.2678, + "step": 8803 + }, + { + "epoch": 0.15702921556736704, + "grad_norm": 0.24790038168430328, + "learning_rate": 4.950608279607044e-05, + "loss": 0.2021, + "step": 8804 + }, + { + "epoch": 0.15704705168908073, + "grad_norm": 0.23251527547836304, + "learning_rate": 4.9505774878417434e-05, + "loss": 0.1912, + "step": 8805 + }, + { + "epoch": 0.15706488781079442, + "grad_norm": 0.2686958909034729, + "learning_rate": 4.950546686577157e-05, + "loss": 0.2239, + "step": 8806 + }, + { + "epoch": 0.1570827239325081, + "grad_norm": 0.24929532408714294, + "learning_rate": 4.9505158758134054e-05, + "loss": 0.2136, + "step": 8807 + }, + { + "epoch": 0.15710056005422182, + "grad_norm": 0.3674381375312805, + "learning_rate": 4.9504850555506085e-05, + "loss": 0.1945, + "step": 8808 + }, + { + "epoch": 0.1571183961759355, + "grad_norm": 0.35281965136528015, + "learning_rate": 4.9504542257888845e-05, + "loss": 0.1781, + "step": 8809 + }, + { + "epoch": 0.1571362322976492, + "grad_norm": 0.25824156403541565, + "learning_rate": 4.950423386528354e-05, + "loss": 0.2593, + "step": 8810 + }, + { + "epoch": 0.15715406841936289, + "grad_norm": 0.21956311166286469, + "learning_rate": 4.950392537769136e-05, + "loss": 0.1892, + "step": 8811 + }, + { + "epoch": 0.1571719045410766, + "grad_norm": 0.2829541265964508, + "learning_rate": 4.95036167951135e-05, + "loss": 0.2587, + "step": 8812 + }, + { + "epoch": 0.1571897406627903, + "grad_norm": 0.3168048560619354, + "learning_rate": 4.9503308117551164e-05, + "loss": 0.2331, + "step": 8813 + }, + { + "epoch": 0.15720757678450398, + "grad_norm": 0.3431147038936615, + "learning_rate": 4.950299934500553e-05, + "loss": 0.2068, + "step": 8814 + }, + { + "epoch": 0.15722541290621767, + "grad_norm": 0.2998479902744293, + "learning_rate": 4.950269047747782e-05, + "loss": 0.2285, + "step": 8815 + }, + { + "epoch": 0.15724324902793135, + "grad_norm": 0.2590060830116272, + "learning_rate": 4.9502381514969215e-05, + "loss": 0.2303, + "step": 8816 + }, + { + "epoch": 0.15726108514964507, + "grad_norm": 0.2475730925798416, + "learning_rate": 4.950207245748092e-05, + "loss": 0.2162, + "step": 8817 + }, + { + "epoch": 0.15727892127135876, + "grad_norm": 0.23853184282779694, + "learning_rate": 4.9501763305014125e-05, + "loss": 0.1879, + "step": 8818 + }, + { + "epoch": 0.15729675739307244, + "grad_norm": 0.30825313925743103, + "learning_rate": 4.950145405757003e-05, + "loss": 0.2148, + "step": 8819 + }, + { + "epoch": 0.15731459351478613, + "grad_norm": 0.2983599305152893, + "learning_rate": 4.9501144715149836e-05, + "loss": 0.232, + "step": 8820 + }, + { + "epoch": 0.15733242963649985, + "grad_norm": 0.3923456072807312, + "learning_rate": 4.9500835277754756e-05, + "loss": 0.2835, + "step": 8821 + }, + { + "epoch": 0.15735026575821354, + "grad_norm": 0.22664014995098114, + "learning_rate": 4.9500525745385964e-05, + "loss": 0.201, + "step": 8822 + }, + { + "epoch": 0.15736810187992722, + "grad_norm": 0.28232407569885254, + "learning_rate": 4.9500216118044674e-05, + "loss": 0.2129, + "step": 8823 + }, + { + "epoch": 0.1573859380016409, + "grad_norm": 0.2689587473869324, + "learning_rate": 4.9499906395732085e-05, + "loss": 0.2052, + "step": 8824 + }, + { + "epoch": 0.15740377412335463, + "grad_norm": 0.2931232750415802, + "learning_rate": 4.949959657844939e-05, + "loss": 0.1753, + "step": 8825 + }, + { + "epoch": 0.15742161024506832, + "grad_norm": 0.3985549807548523, + "learning_rate": 4.949928666619781e-05, + "loss": 0.2974, + "step": 8826 + }, + { + "epoch": 0.157439446366782, + "grad_norm": 0.2937762439250946, + "learning_rate": 4.9498976658978524e-05, + "loss": 0.2571, + "step": 8827 + }, + { + "epoch": 0.1574572824884957, + "grad_norm": 0.232209250330925, + "learning_rate": 4.9498666556792745e-05, + "loss": 0.1929, + "step": 8828 + }, + { + "epoch": 0.1574751186102094, + "grad_norm": 0.49438947439193726, + "learning_rate": 4.949835635964167e-05, + "loss": 0.2251, + "step": 8829 + }, + { + "epoch": 0.1574929547319231, + "grad_norm": 0.2629947066307068, + "learning_rate": 4.949804606752651e-05, + "loss": 0.2425, + "step": 8830 + }, + { + "epoch": 0.15751079085363678, + "grad_norm": 0.2666455805301666, + "learning_rate": 4.9497735680448456e-05, + "loss": 0.2106, + "step": 8831 + }, + { + "epoch": 0.15752862697535047, + "grad_norm": 0.32069793343544006, + "learning_rate": 4.949742519840872e-05, + "loss": 0.2422, + "step": 8832 + }, + { + "epoch": 0.1575464630970642, + "grad_norm": 0.25603896379470825, + "learning_rate": 4.9497114621408506e-05, + "loss": 0.2203, + "step": 8833 + }, + { + "epoch": 0.15756429921877788, + "grad_norm": 0.252055287361145, + "learning_rate": 4.9496803949449014e-05, + "loss": 0.2054, + "step": 8834 + }, + { + "epoch": 0.15758213534049156, + "grad_norm": 0.32936811447143555, + "learning_rate": 4.949649318253144e-05, + "loss": 0.2401, + "step": 8835 + }, + { + "epoch": 0.15759997146220525, + "grad_norm": 0.25010862946510315, + "learning_rate": 4.9496182320657014e-05, + "loss": 0.2547, + "step": 8836 + }, + { + "epoch": 0.15761780758391894, + "grad_norm": 0.386197954416275, + "learning_rate": 4.949587136382691e-05, + "loss": 0.2993, + "step": 8837 + }, + { + "epoch": 0.15763564370563266, + "grad_norm": 0.2307082712650299, + "learning_rate": 4.9495560312042355e-05, + "loss": 0.1814, + "step": 8838 + }, + { + "epoch": 0.15765347982734634, + "grad_norm": 0.37932249903678894, + "learning_rate": 4.9495249165304545e-05, + "loss": 0.2772, + "step": 8839 + }, + { + "epoch": 0.15767131594906003, + "grad_norm": 0.22730407118797302, + "learning_rate": 4.9494937923614694e-05, + "loss": 0.2157, + "step": 8840 + }, + { + "epoch": 0.15768915207077372, + "grad_norm": 0.22970502078533173, + "learning_rate": 4.9494626586973995e-05, + "loss": 0.2247, + "step": 8841 + }, + { + "epoch": 0.15770698819248744, + "grad_norm": 0.5269384384155273, + "learning_rate": 4.949431515538367e-05, + "loss": 0.2016, + "step": 8842 + }, + { + "epoch": 0.15772482431420112, + "grad_norm": 0.27919816970825195, + "learning_rate": 4.9494003628844916e-05, + "loss": 0.2272, + "step": 8843 + }, + { + "epoch": 0.1577426604359148, + "grad_norm": 0.26930591464042664, + "learning_rate": 4.949369200735894e-05, + "loss": 0.1959, + "step": 8844 + }, + { + "epoch": 0.1577604965576285, + "grad_norm": 0.2881614863872528, + "learning_rate": 4.949338029092696e-05, + "loss": 0.25, + "step": 8845 + }, + { + "epoch": 0.15777833267934221, + "grad_norm": 0.3047850430011749, + "learning_rate": 4.949306847955018e-05, + "loss": 0.2708, + "step": 8846 + }, + { + "epoch": 0.1577961688010559, + "grad_norm": 0.2956016957759857, + "learning_rate": 4.9492756573229806e-05, + "loss": 0.2368, + "step": 8847 + }, + { + "epoch": 0.1578140049227696, + "grad_norm": 0.27399855852127075, + "learning_rate": 4.949244457196704e-05, + "loss": 0.2314, + "step": 8848 + }, + { + "epoch": 0.15783184104448328, + "grad_norm": 0.2946315407752991, + "learning_rate": 4.9492132475763107e-05, + "loss": 0.1979, + "step": 8849 + }, + { + "epoch": 0.157849677166197, + "grad_norm": 0.28162893652915955, + "learning_rate": 4.949182028461921e-05, + "loss": 0.226, + "step": 8850 + }, + { + "epoch": 0.15786751328791068, + "grad_norm": 0.2386331856250763, + "learning_rate": 4.949150799853656e-05, + "loss": 0.2156, + "step": 8851 + }, + { + "epoch": 0.15788534940962437, + "grad_norm": 0.1961589753627777, + "learning_rate": 4.9491195617516364e-05, + "loss": 0.1616, + "step": 8852 + }, + { + "epoch": 0.15790318553133806, + "grad_norm": 0.2792550027370453, + "learning_rate": 4.949088314155983e-05, + "loss": 0.2126, + "step": 8853 + }, + { + "epoch": 0.15792102165305177, + "grad_norm": 0.2500261664390564, + "learning_rate": 4.949057057066817e-05, + "loss": 0.1943, + "step": 8854 + }, + { + "epoch": 0.15793885777476546, + "grad_norm": 0.26533347368240356, + "learning_rate": 4.9490257904842606e-05, + "loss": 0.2237, + "step": 8855 + }, + { + "epoch": 0.15795669389647915, + "grad_norm": 0.22932292520999908, + "learning_rate": 4.948994514408435e-05, + "loss": 0.1762, + "step": 8856 + }, + { + "epoch": 0.15797453001819284, + "grad_norm": 0.32544583082199097, + "learning_rate": 4.94896322883946e-05, + "loss": 0.2228, + "step": 8857 + }, + { + "epoch": 0.15799236613990653, + "grad_norm": 0.2862977981567383, + "learning_rate": 4.9489319337774573e-05, + "loss": 0.2136, + "step": 8858 + }, + { + "epoch": 0.15801020226162024, + "grad_norm": 0.35817044973373413, + "learning_rate": 4.9489006292225496e-05, + "loss": 0.1899, + "step": 8859 + }, + { + "epoch": 0.15802803838333393, + "grad_norm": 0.2702390253543854, + "learning_rate": 4.948869315174857e-05, + "loss": 0.1696, + "step": 8860 + }, + { + "epoch": 0.15804587450504762, + "grad_norm": 0.25291597843170166, + "learning_rate": 4.9488379916345004e-05, + "loss": 0.2307, + "step": 8861 + }, + { + "epoch": 0.1580637106267613, + "grad_norm": 0.27768197655677795, + "learning_rate": 4.948806658601603e-05, + "loss": 0.2101, + "step": 8862 + }, + { + "epoch": 0.15808154674847502, + "grad_norm": 0.2693616449832916, + "learning_rate": 4.948775316076285e-05, + "loss": 0.2381, + "step": 8863 + }, + { + "epoch": 0.1580993828701887, + "grad_norm": 0.29252687096595764, + "learning_rate": 4.9487439640586674e-05, + "loss": 0.2301, + "step": 8864 + }, + { + "epoch": 0.1581172189919024, + "grad_norm": 0.30311325192451477, + "learning_rate": 4.9487126025488726e-05, + "loss": 0.1958, + "step": 8865 + }, + { + "epoch": 0.15813505511361609, + "grad_norm": 0.2934456467628479, + "learning_rate": 4.9486812315470226e-05, + "loss": 0.191, + "step": 8866 + }, + { + "epoch": 0.1581528912353298, + "grad_norm": 0.2761151194572449, + "learning_rate": 4.948649851053238e-05, + "loss": 0.2083, + "step": 8867 + }, + { + "epoch": 0.1581707273570435, + "grad_norm": 0.23584337532520294, + "learning_rate": 4.948618461067641e-05, + "loss": 0.189, + "step": 8868 + }, + { + "epoch": 0.15818856347875718, + "grad_norm": 0.25781938433647156, + "learning_rate": 4.948587061590353e-05, + "loss": 0.2331, + "step": 8869 + }, + { + "epoch": 0.15820639960047087, + "grad_norm": 0.27441632747650146, + "learning_rate": 4.9485556526214955e-05, + "loss": 0.1956, + "step": 8870 + }, + { + "epoch": 0.15822423572218458, + "grad_norm": 0.29231467843055725, + "learning_rate": 4.948524234161191e-05, + "loss": 0.2419, + "step": 8871 + }, + { + "epoch": 0.15824207184389827, + "grad_norm": 0.2262069433927536, + "learning_rate": 4.94849280620956e-05, + "loss": 0.1842, + "step": 8872 + }, + { + "epoch": 0.15825990796561196, + "grad_norm": 0.38599586486816406, + "learning_rate": 4.948461368766727e-05, + "loss": 0.2499, + "step": 8873 + }, + { + "epoch": 0.15827774408732564, + "grad_norm": 0.414283812046051, + "learning_rate": 4.948429921832811e-05, + "loss": 0.1976, + "step": 8874 + }, + { + "epoch": 0.15829558020903933, + "grad_norm": 0.22861029207706451, + "learning_rate": 4.948398465407935e-05, + "loss": 0.2088, + "step": 8875 + }, + { + "epoch": 0.15831341633075305, + "grad_norm": 0.2521374821662903, + "learning_rate": 4.948366999492221e-05, + "loss": 0.1988, + "step": 8876 + }, + { + "epoch": 0.15833125245246674, + "grad_norm": 0.23936320841312408, + "learning_rate": 4.94833552408579e-05, + "loss": 0.161, + "step": 8877 + }, + { + "epoch": 0.15834908857418042, + "grad_norm": 0.28193894028663635, + "learning_rate": 4.948304039188766e-05, + "loss": 0.1982, + "step": 8878 + }, + { + "epoch": 0.1583669246958941, + "grad_norm": 0.3436919152736664, + "learning_rate": 4.9482725448012695e-05, + "loss": 0.2975, + "step": 8879 + }, + { + "epoch": 0.15838476081760783, + "grad_norm": 0.29465392231941223, + "learning_rate": 4.948241040923423e-05, + "loss": 0.2351, + "step": 8880 + }, + { + "epoch": 0.15840259693932152, + "grad_norm": 0.3796325922012329, + "learning_rate": 4.9482095275553484e-05, + "loss": 0.2295, + "step": 8881 + }, + { + "epoch": 0.1584204330610352, + "grad_norm": 0.3656231164932251, + "learning_rate": 4.948178004697169e-05, + "loss": 0.3035, + "step": 8882 + }, + { + "epoch": 0.1584382691827489, + "grad_norm": 0.26530107855796814, + "learning_rate": 4.948146472349005e-05, + "loss": 0.225, + "step": 8883 + }, + { + "epoch": 0.1584561053044626, + "grad_norm": 0.30532997846603394, + "learning_rate": 4.948114930510981e-05, + "loss": 0.2209, + "step": 8884 + }, + { + "epoch": 0.1584739414261763, + "grad_norm": 0.23127897083759308, + "learning_rate": 4.948083379183217e-05, + "loss": 0.2262, + "step": 8885 + }, + { + "epoch": 0.15849177754788998, + "grad_norm": 0.37624356150627136, + "learning_rate": 4.9480518183658364e-05, + "loss": 0.1997, + "step": 8886 + }, + { + "epoch": 0.15850961366960367, + "grad_norm": 0.2639813721179962, + "learning_rate": 4.948020248058961e-05, + "loss": 0.2645, + "step": 8887 + }, + { + "epoch": 0.1585274497913174, + "grad_norm": 0.2535322606563568, + "learning_rate": 4.947988668262714e-05, + "loss": 0.2417, + "step": 8888 + }, + { + "epoch": 0.15854528591303108, + "grad_norm": 0.25185152888298035, + "learning_rate": 4.9479570789772176e-05, + "loss": 0.1838, + "step": 8889 + }, + { + "epoch": 0.15856312203474476, + "grad_norm": 0.1991785615682602, + "learning_rate": 4.947925480202594e-05, + "loss": 0.1901, + "step": 8890 + }, + { + "epoch": 0.15858095815645845, + "grad_norm": 0.3542342782020569, + "learning_rate": 4.947893871938966e-05, + "loss": 0.2594, + "step": 8891 + }, + { + "epoch": 0.15859879427817217, + "grad_norm": 0.2714672088623047, + "learning_rate": 4.947862254186455e-05, + "loss": 0.2104, + "step": 8892 + }, + { + "epoch": 0.15861663039988586, + "grad_norm": 0.282929003238678, + "learning_rate": 4.947830626945186e-05, + "loss": 0.2695, + "step": 8893 + }, + { + "epoch": 0.15863446652159954, + "grad_norm": 0.2924908995628357, + "learning_rate": 4.947798990215278e-05, + "loss": 0.2223, + "step": 8894 + }, + { + "epoch": 0.15865230264331323, + "grad_norm": 0.34512925148010254, + "learning_rate": 4.947767343996857e-05, + "loss": 0.2111, + "step": 8895 + }, + { + "epoch": 0.15867013876502692, + "grad_norm": 0.3300144672393799, + "learning_rate": 4.947735688290044e-05, + "loss": 0.2437, + "step": 8896 + }, + { + "epoch": 0.15868797488674063, + "grad_norm": 0.2518937289714813, + "learning_rate": 4.947704023094962e-05, + "loss": 0.2384, + "step": 8897 + }, + { + "epoch": 0.15870581100845432, + "grad_norm": 0.2370232343673706, + "learning_rate": 4.947672348411734e-05, + "loss": 0.2191, + "step": 8898 + }, + { + "epoch": 0.158723647130168, + "grad_norm": 0.29083213210105896, + "learning_rate": 4.9476406642404826e-05, + "loss": 0.1869, + "step": 8899 + }, + { + "epoch": 0.1587414832518817, + "grad_norm": 0.2743731141090393, + "learning_rate": 4.9476089705813306e-05, + "loss": 0.1975, + "step": 8900 + }, + { + "epoch": 0.15875931937359541, + "grad_norm": 0.33588162064552307, + "learning_rate": 4.9475772674344005e-05, + "loss": 0.2459, + "step": 8901 + }, + { + "epoch": 0.1587771554953091, + "grad_norm": 0.33835238218307495, + "learning_rate": 4.9475455547998165e-05, + "loss": 0.2301, + "step": 8902 + }, + { + "epoch": 0.1587949916170228, + "grad_norm": 0.22800800204277039, + "learning_rate": 4.947513832677699e-05, + "loss": 0.1953, + "step": 8903 + }, + { + "epoch": 0.15881282773873648, + "grad_norm": 0.25396639108657837, + "learning_rate": 4.9474821010681736e-05, + "loss": 0.2107, + "step": 8904 + }, + { + "epoch": 0.1588306638604502, + "grad_norm": 0.24167804419994354, + "learning_rate": 4.947450359971362e-05, + "loss": 0.1658, + "step": 8905 + }, + { + "epoch": 0.15884849998216388, + "grad_norm": 0.29995760321617126, + "learning_rate": 4.947418609387387e-05, + "loss": 0.241, + "step": 8906 + }, + { + "epoch": 0.15886633610387757, + "grad_norm": 0.3503248393535614, + "learning_rate": 4.947386849316373e-05, + "loss": 0.1706, + "step": 8907 + }, + { + "epoch": 0.15888417222559126, + "grad_norm": 0.29240545630455017, + "learning_rate": 4.947355079758442e-05, + "loss": 0.2675, + "step": 8908 + }, + { + "epoch": 0.15890200834730497, + "grad_norm": 0.2589358687400818, + "learning_rate": 4.947323300713718e-05, + "loss": 0.1867, + "step": 8909 + }, + { + "epoch": 0.15891984446901866, + "grad_norm": 0.2756797969341278, + "learning_rate": 4.9472915121823226e-05, + "loss": 0.1965, + "step": 8910 + }, + { + "epoch": 0.15893768059073235, + "grad_norm": 0.4475850462913513, + "learning_rate": 4.947259714164381e-05, + "loss": 0.2285, + "step": 8911 + }, + { + "epoch": 0.15895551671244604, + "grad_norm": 0.27526500821113586, + "learning_rate": 4.947227906660015e-05, + "loss": 0.2318, + "step": 8912 + }, + { + "epoch": 0.15897335283415975, + "grad_norm": 0.2726994752883911, + "learning_rate": 4.947196089669348e-05, + "loss": 0.2049, + "step": 8913 + }, + { + "epoch": 0.15899118895587344, + "grad_norm": 0.2614360451698303, + "learning_rate": 4.9471642631925045e-05, + "loss": 0.1859, + "step": 8914 + }, + { + "epoch": 0.15900902507758713, + "grad_norm": 0.4405480623245239, + "learning_rate": 4.947132427229606e-05, + "loss": 0.2847, + "step": 8915 + }, + { + "epoch": 0.15902686119930082, + "grad_norm": 0.41811949014663696, + "learning_rate": 4.947100581780778e-05, + "loss": 0.2068, + "step": 8916 + }, + { + "epoch": 0.1590446973210145, + "grad_norm": 0.2581080198287964, + "learning_rate": 4.9470687268461426e-05, + "loss": 0.1841, + "step": 8917 + }, + { + "epoch": 0.15906253344272822, + "grad_norm": 0.255830317735672, + "learning_rate": 4.9470368624258226e-05, + "loss": 0.2252, + "step": 8918 + }, + { + "epoch": 0.1590803695644419, + "grad_norm": 0.2948690354824066, + "learning_rate": 4.9470049885199445e-05, + "loss": 0.2325, + "step": 8919 + }, + { + "epoch": 0.1590982056861556, + "grad_norm": 0.3190224766731262, + "learning_rate": 4.946973105128628e-05, + "loss": 0.1824, + "step": 8920 + }, + { + "epoch": 0.15911604180786929, + "grad_norm": 0.2818789780139923, + "learning_rate": 4.946941212252e-05, + "loss": 0.2352, + "step": 8921 + }, + { + "epoch": 0.159133877929583, + "grad_norm": 0.3696920871734619, + "learning_rate": 4.946909309890182e-05, + "loss": 0.2749, + "step": 8922 + }, + { + "epoch": 0.1591517140512967, + "grad_norm": 0.2734166979789734, + "learning_rate": 4.946877398043299e-05, + "loss": 0.1973, + "step": 8923 + }, + { + "epoch": 0.15916955017301038, + "grad_norm": 0.32807162404060364, + "learning_rate": 4.9468454767114735e-05, + "loss": 0.2532, + "step": 8924 + }, + { + "epoch": 0.15918738629472406, + "grad_norm": 0.28743776679039, + "learning_rate": 4.946813545894829e-05, + "loss": 0.2206, + "step": 8925 + }, + { + "epoch": 0.15920522241643778, + "grad_norm": 0.2819172143936157, + "learning_rate": 4.9467816055934916e-05, + "loss": 0.2572, + "step": 8926 + }, + { + "epoch": 0.15922305853815147, + "grad_norm": 0.3044775426387787, + "learning_rate": 4.946749655807583e-05, + "loss": 0.2425, + "step": 8927 + }, + { + "epoch": 0.15924089465986516, + "grad_norm": 0.292955219745636, + "learning_rate": 4.946717696537228e-05, + "loss": 0.2514, + "step": 8928 + }, + { + "epoch": 0.15925873078157884, + "grad_norm": 0.2505530118942261, + "learning_rate": 4.94668572778255e-05, + "loss": 0.2392, + "step": 8929 + }, + { + "epoch": 0.15927656690329256, + "grad_norm": 0.22183232009410858, + "learning_rate": 4.9466537495436726e-05, + "loss": 0.1846, + "step": 8930 + }, + { + "epoch": 0.15929440302500625, + "grad_norm": 0.26471900939941406, + "learning_rate": 4.946621761820721e-05, + "loss": 0.217, + "step": 8931 + }, + { + "epoch": 0.15931223914671994, + "grad_norm": 0.3160739243030548, + "learning_rate": 4.946589764613818e-05, + "loss": 0.1824, + "step": 8932 + }, + { + "epoch": 0.15933007526843362, + "grad_norm": 0.2557903826236725, + "learning_rate": 4.9465577579230884e-05, + "loss": 0.1999, + "step": 8933 + }, + { + "epoch": 0.15934791139014734, + "grad_norm": 0.24554532766342163, + "learning_rate": 4.946525741748655e-05, + "loss": 0.2405, + "step": 8934 + }, + { + "epoch": 0.15936574751186103, + "grad_norm": 0.29524701833724976, + "learning_rate": 4.9464937160906433e-05, + "loss": 0.2606, + "step": 8935 + }, + { + "epoch": 0.15938358363357472, + "grad_norm": 0.23595523834228516, + "learning_rate": 4.9464616809491774e-05, + "loss": 0.2311, + "step": 8936 + }, + { + "epoch": 0.1594014197552884, + "grad_norm": 0.268909752368927, + "learning_rate": 4.946429636324381e-05, + "loss": 0.1478, + "step": 8937 + }, + { + "epoch": 0.1594192558770021, + "grad_norm": 0.26586514711380005, + "learning_rate": 4.946397582216378e-05, + "loss": 0.2411, + "step": 8938 + }, + { + "epoch": 0.1594370919987158, + "grad_norm": 0.2992939352989197, + "learning_rate": 4.9463655186252925e-05, + "loss": 0.1826, + "step": 8939 + }, + { + "epoch": 0.1594549281204295, + "grad_norm": 0.3257649540901184, + "learning_rate": 4.94633344555125e-05, + "loss": 0.2712, + "step": 8940 + }, + { + "epoch": 0.15947276424214318, + "grad_norm": 0.3885219693183899, + "learning_rate": 4.946301362994374e-05, + "loss": 0.2507, + "step": 8941 + }, + { + "epoch": 0.15949060036385687, + "grad_norm": 0.24294568598270416, + "learning_rate": 4.946269270954789e-05, + "loss": 0.1777, + "step": 8942 + }, + { + "epoch": 0.1595084364855706, + "grad_norm": 0.33604273200035095, + "learning_rate": 4.946237169432619e-05, + "loss": 0.2024, + "step": 8943 + }, + { + "epoch": 0.15952627260728428, + "grad_norm": 0.34399721026420593, + "learning_rate": 4.94620505842799e-05, + "loss": 0.2638, + "step": 8944 + }, + { + "epoch": 0.15954410872899796, + "grad_norm": 0.2487800121307373, + "learning_rate": 4.9461729379410235e-05, + "loss": 0.2206, + "step": 8945 + }, + { + "epoch": 0.15956194485071165, + "grad_norm": 0.29507145285606384, + "learning_rate": 4.9461408079718474e-05, + "loss": 0.2007, + "step": 8946 + }, + { + "epoch": 0.15957978097242537, + "grad_norm": 0.25678563117980957, + "learning_rate": 4.946108668520584e-05, + "loss": 0.2522, + "step": 8947 + }, + { + "epoch": 0.15959761709413905, + "grad_norm": 0.42071107029914856, + "learning_rate": 4.946076519587359e-05, + "loss": 0.2379, + "step": 8948 + }, + { + "epoch": 0.15961545321585274, + "grad_norm": 0.24987351894378662, + "learning_rate": 4.946044361172296e-05, + "loss": 0.2132, + "step": 8949 + }, + { + "epoch": 0.15963328933756643, + "grad_norm": 0.28699952363967896, + "learning_rate": 4.94601219327552e-05, + "loss": 0.1839, + "step": 8950 + }, + { + "epoch": 0.15965112545928015, + "grad_norm": 0.2963704764842987, + "learning_rate": 4.945980015897157e-05, + "loss": 0.2226, + "step": 8951 + }, + { + "epoch": 0.15966896158099383, + "grad_norm": 0.2969897985458374, + "learning_rate": 4.94594782903733e-05, + "loss": 0.2807, + "step": 8952 + }, + { + "epoch": 0.15968679770270752, + "grad_norm": 0.29011672735214233, + "learning_rate": 4.9459156326961645e-05, + "loss": 0.2143, + "step": 8953 + }, + { + "epoch": 0.1597046338244212, + "grad_norm": 0.26859965920448303, + "learning_rate": 4.945883426873784e-05, + "loss": 0.2292, + "step": 8954 + }, + { + "epoch": 0.15972246994613493, + "grad_norm": 0.25487181544303894, + "learning_rate": 4.945851211570316e-05, + "loss": 0.2219, + "step": 8955 + }, + { + "epoch": 0.15974030606784861, + "grad_norm": 0.31263524293899536, + "learning_rate": 4.945818986785884e-05, + "loss": 0.1598, + "step": 8956 + }, + { + "epoch": 0.1597581421895623, + "grad_norm": 0.47453898191452026, + "learning_rate": 4.945786752520612e-05, + "loss": 0.2524, + "step": 8957 + }, + { + "epoch": 0.159775978311276, + "grad_norm": 0.3323827385902405, + "learning_rate": 4.945754508774626e-05, + "loss": 0.2851, + "step": 8958 + }, + { + "epoch": 0.15979381443298968, + "grad_norm": 0.3283613920211792, + "learning_rate": 4.945722255548051e-05, + "loss": 0.2461, + "step": 8959 + }, + { + "epoch": 0.1598116505547034, + "grad_norm": 0.29481279850006104, + "learning_rate": 4.945689992841012e-05, + "loss": 0.2443, + "step": 8960 + }, + { + "epoch": 0.15982948667641708, + "grad_norm": 0.26263076066970825, + "learning_rate": 4.9456577206536333e-05, + "loss": 0.2342, + "step": 8961 + }, + { + "epoch": 0.15984732279813077, + "grad_norm": 0.2603324353694916, + "learning_rate": 4.945625438986041e-05, + "loss": 0.2133, + "step": 8962 + }, + { + "epoch": 0.15986515891984446, + "grad_norm": 0.2865920960903168, + "learning_rate": 4.94559314783836e-05, + "loss": 0.2091, + "step": 8963 + }, + { + "epoch": 0.15988299504155817, + "grad_norm": 0.40341323614120483, + "learning_rate": 4.945560847210715e-05, + "loss": 0.2199, + "step": 8964 + }, + { + "epoch": 0.15990083116327186, + "grad_norm": 0.3902820944786072, + "learning_rate": 4.945528537103232e-05, + "loss": 0.1633, + "step": 8965 + }, + { + "epoch": 0.15991866728498555, + "grad_norm": 0.353304922580719, + "learning_rate": 4.9454962175160345e-05, + "loss": 0.27, + "step": 8966 + }, + { + "epoch": 0.15993650340669924, + "grad_norm": 0.3476138114929199, + "learning_rate": 4.94546388844925e-05, + "loss": 0.2563, + "step": 8967 + }, + { + "epoch": 0.15995433952841295, + "grad_norm": 0.2467448115348816, + "learning_rate": 4.945431549903003e-05, + "loss": 0.2318, + "step": 8968 + }, + { + "epoch": 0.15997217565012664, + "grad_norm": 0.22938476502895355, + "learning_rate": 4.945399201877418e-05, + "loss": 0.1865, + "step": 8969 + }, + { + "epoch": 0.15999001177184033, + "grad_norm": 0.2441449910402298, + "learning_rate": 4.945366844372622e-05, + "loss": 0.2405, + "step": 8970 + }, + { + "epoch": 0.16000784789355402, + "grad_norm": 0.22523269057273865, + "learning_rate": 4.945334477388739e-05, + "loss": 0.1787, + "step": 8971 + }, + { + "epoch": 0.16002568401526773, + "grad_norm": 0.26215478777885437, + "learning_rate": 4.9453021009258944e-05, + "loss": 0.2547, + "step": 8972 + }, + { + "epoch": 0.16004352013698142, + "grad_norm": 0.2577087879180908, + "learning_rate": 4.945269714984215e-05, + "loss": 0.1961, + "step": 8973 + }, + { + "epoch": 0.1600613562586951, + "grad_norm": 0.310244619846344, + "learning_rate": 4.9452373195638245e-05, + "loss": 0.2473, + "step": 8974 + }, + { + "epoch": 0.1600791923804088, + "grad_norm": 0.3036620616912842, + "learning_rate": 4.945204914664851e-05, + "loss": 0.2243, + "step": 8975 + }, + { + "epoch": 0.16009702850212248, + "grad_norm": 0.3738292455673218, + "learning_rate": 4.945172500287418e-05, + "loss": 0.1934, + "step": 8976 + }, + { + "epoch": 0.1601148646238362, + "grad_norm": 0.3114437758922577, + "learning_rate": 4.9451400764316526e-05, + "loss": 0.2377, + "step": 8977 + }, + { + "epoch": 0.1601327007455499, + "grad_norm": 0.2868841886520386, + "learning_rate": 4.945107643097679e-05, + "loss": 0.2644, + "step": 8978 + }, + { + "epoch": 0.16015053686726358, + "grad_norm": 0.3465786874294281, + "learning_rate": 4.9450752002856235e-05, + "loss": 0.2069, + "step": 8979 + }, + { + "epoch": 0.16016837298897726, + "grad_norm": 0.26838719844818115, + "learning_rate": 4.9450427479956126e-05, + "loss": 0.1961, + "step": 8980 + }, + { + "epoch": 0.16018620911069098, + "grad_norm": 0.2795334756374359, + "learning_rate": 4.9450102862277706e-05, + "loss": 0.2156, + "step": 8981 + }, + { + "epoch": 0.16020404523240467, + "grad_norm": 0.30109819769859314, + "learning_rate": 4.9449778149822255e-05, + "loss": 0.245, + "step": 8982 + }, + { + "epoch": 0.16022188135411836, + "grad_norm": 0.33533725142478943, + "learning_rate": 4.944945334259101e-05, + "loss": 0.2278, + "step": 8983 + }, + { + "epoch": 0.16023971747583204, + "grad_norm": 0.44862592220306396, + "learning_rate": 4.944912844058525e-05, + "loss": 0.1847, + "step": 8984 + }, + { + "epoch": 0.16025755359754576, + "grad_norm": 0.19114886224269867, + "learning_rate": 4.9448803443806214e-05, + "loss": 0.1796, + "step": 8985 + }, + { + "epoch": 0.16027538971925945, + "grad_norm": 0.3102222979068756, + "learning_rate": 4.944847835225517e-05, + "loss": 0.2716, + "step": 8986 + }, + { + "epoch": 0.16029322584097314, + "grad_norm": 0.21888549625873566, + "learning_rate": 4.9448153165933385e-05, + "loss": 0.2119, + "step": 8987 + }, + { + "epoch": 0.16031106196268682, + "grad_norm": 0.26361963152885437, + "learning_rate": 4.944782788484211e-05, + "loss": 0.2145, + "step": 8988 + }, + { + "epoch": 0.16032889808440054, + "grad_norm": 0.32678624987602234, + "learning_rate": 4.9447502508982616e-05, + "loss": 0.2564, + "step": 8989 + }, + { + "epoch": 0.16034673420611423, + "grad_norm": 0.25996658205986023, + "learning_rate": 4.944717703835615e-05, + "loss": 0.1867, + "step": 8990 + }, + { + "epoch": 0.16036457032782792, + "grad_norm": 0.23481029272079468, + "learning_rate": 4.944685147296399e-05, + "loss": 0.198, + "step": 8991 + }, + { + "epoch": 0.1603824064495416, + "grad_norm": 0.2839064300060272, + "learning_rate": 4.9446525812807385e-05, + "loss": 0.2736, + "step": 8992 + }, + { + "epoch": 0.16040024257125532, + "grad_norm": 0.2369505614042282, + "learning_rate": 4.94462000578876e-05, + "loss": 0.2307, + "step": 8993 + }, + { + "epoch": 0.160418078692969, + "grad_norm": 0.31546878814697266, + "learning_rate": 4.944587420820591e-05, + "loss": 0.2291, + "step": 8994 + }, + { + "epoch": 0.1604359148146827, + "grad_norm": 0.34085386991500854, + "learning_rate": 4.9445548263763564e-05, + "loss": 0.2389, + "step": 8995 + }, + { + "epoch": 0.16045375093639638, + "grad_norm": 0.285194993019104, + "learning_rate": 4.944522222456183e-05, + "loss": 0.2312, + "step": 8996 + }, + { + "epoch": 0.16047158705811007, + "grad_norm": 0.2984153628349304, + "learning_rate": 4.9444896090601965e-05, + "loss": 0.2711, + "step": 8997 + }, + { + "epoch": 0.1604894231798238, + "grad_norm": 0.19079795479774475, + "learning_rate": 4.944456986188525e-05, + "loss": 0.208, + "step": 8998 + }, + { + "epoch": 0.16050725930153747, + "grad_norm": 0.23399192094802856, + "learning_rate": 4.944424353841293e-05, + "loss": 0.1868, + "step": 8999 + }, + { + "epoch": 0.16052509542325116, + "grad_norm": 0.3381305932998657, + "learning_rate": 4.944391712018629e-05, + "loss": 0.2717, + "step": 9000 + }, + { + "epoch": 0.16052509542325116, + "eval_loss": 0.2077334076166153, + "eval_runtime": 107.6389, + "eval_samples_per_second": 9.513, + "eval_steps_per_second": 1.589, + "step": 9000 + }, + { + "epoch": 0.16054293154496485, + "grad_norm": 0.261714905500412, + "learning_rate": 4.944359060720657e-05, + "loss": 0.2021, + "step": 9001 + }, + { + "epoch": 0.16056076766667857, + "grad_norm": 0.30212825536727905, + "learning_rate": 4.9443263999475066e-05, + "loss": 0.2212, + "step": 9002 + }, + { + "epoch": 0.16057860378839225, + "grad_norm": 0.28419339656829834, + "learning_rate": 4.9442937296993016e-05, + "loss": 0.213, + "step": 9003 + }, + { + "epoch": 0.16059643991010594, + "grad_norm": 0.32400792837142944, + "learning_rate": 4.94426104997617e-05, + "loss": 0.2554, + "step": 9004 + }, + { + "epoch": 0.16061427603181963, + "grad_norm": 0.28723564743995667, + "learning_rate": 4.944228360778239e-05, + "loss": 0.235, + "step": 9005 + }, + { + "epoch": 0.16063211215353335, + "grad_norm": 0.2636263966560364, + "learning_rate": 4.944195662105634e-05, + "loss": 0.2128, + "step": 9006 + }, + { + "epoch": 0.16064994827524703, + "grad_norm": 0.26810839772224426, + "learning_rate": 4.944162953958483e-05, + "loss": 0.195, + "step": 9007 + }, + { + "epoch": 0.16066778439696072, + "grad_norm": 0.18495428562164307, + "learning_rate": 4.944130236336913e-05, + "loss": 0.1662, + "step": 9008 + }, + { + "epoch": 0.1606856205186744, + "grad_norm": 0.2508189380168915, + "learning_rate": 4.944097509241048e-05, + "loss": 0.1853, + "step": 9009 + }, + { + "epoch": 0.16070345664038813, + "grad_norm": 0.30176711082458496, + "learning_rate": 4.944064772671017e-05, + "loss": 0.1929, + "step": 9010 + }, + { + "epoch": 0.16072129276210181, + "grad_norm": 0.319826602935791, + "learning_rate": 4.9440320266269486e-05, + "loss": 0.2475, + "step": 9011 + }, + { + "epoch": 0.1607391288838155, + "grad_norm": 0.28028926253318787, + "learning_rate": 4.943999271108967e-05, + "loss": 0.2455, + "step": 9012 + }, + { + "epoch": 0.1607569650055292, + "grad_norm": 0.252003014087677, + "learning_rate": 4.9439665061172006e-05, + "loss": 0.1783, + "step": 9013 + }, + { + "epoch": 0.1607748011272429, + "grad_norm": 0.2615832984447479, + "learning_rate": 4.943933731651775e-05, + "loss": 0.1914, + "step": 9014 + }, + { + "epoch": 0.1607926372489566, + "grad_norm": 0.2725923955440521, + "learning_rate": 4.943900947712819e-05, + "loss": 0.2566, + "step": 9015 + }, + { + "epoch": 0.16081047337067028, + "grad_norm": 0.2708735167980194, + "learning_rate": 4.943868154300458e-05, + "loss": 0.2271, + "step": 9016 + }, + { + "epoch": 0.16082830949238397, + "grad_norm": 0.2694780230522156, + "learning_rate": 4.94383535141482e-05, + "loss": 0.2171, + "step": 9017 + }, + { + "epoch": 0.16084614561409766, + "grad_norm": 0.3065152168273926, + "learning_rate": 4.9438025390560324e-05, + "loss": 0.2566, + "step": 9018 + }, + { + "epoch": 0.16086398173581137, + "grad_norm": 0.27773818373680115, + "learning_rate": 4.9437697172242227e-05, + "loss": 0.2278, + "step": 9019 + }, + { + "epoch": 0.16088181785752506, + "grad_norm": 0.2347675859928131, + "learning_rate": 4.943736885919516e-05, + "loss": 0.1888, + "step": 9020 + }, + { + "epoch": 0.16089965397923875, + "grad_norm": 0.2529999911785126, + "learning_rate": 4.9437040451420426e-05, + "loss": 0.1808, + "step": 9021 + }, + { + "epoch": 0.16091749010095244, + "grad_norm": 0.2778419554233551, + "learning_rate": 4.943671194891928e-05, + "loss": 0.2236, + "step": 9022 + }, + { + "epoch": 0.16093532622266615, + "grad_norm": 0.22367282211780548, + "learning_rate": 4.943638335169299e-05, + "loss": 0.18, + "step": 9023 + }, + { + "epoch": 0.16095316234437984, + "grad_norm": 0.3940797746181488, + "learning_rate": 4.943605465974285e-05, + "loss": 0.2755, + "step": 9024 + }, + { + "epoch": 0.16097099846609353, + "grad_norm": 0.2517111599445343, + "learning_rate": 4.943572587307012e-05, + "loss": 0.1823, + "step": 9025 + }, + { + "epoch": 0.16098883458780722, + "grad_norm": 0.30851662158966064, + "learning_rate": 4.943539699167606e-05, + "loss": 0.2598, + "step": 9026 + }, + { + "epoch": 0.16100667070952093, + "grad_norm": 0.27914607524871826, + "learning_rate": 4.9435068015561984e-05, + "loss": 0.2495, + "step": 9027 + }, + { + "epoch": 0.16102450683123462, + "grad_norm": 0.3053116500377655, + "learning_rate": 4.943473894472913e-05, + "loss": 0.2166, + "step": 9028 + }, + { + "epoch": 0.1610423429529483, + "grad_norm": 0.42158517241477966, + "learning_rate": 4.943440977917879e-05, + "loss": 0.2609, + "step": 9029 + }, + { + "epoch": 0.161060179074662, + "grad_norm": 0.26858243346214294, + "learning_rate": 4.943408051891224e-05, + "loss": 0.2538, + "step": 9030 + }, + { + "epoch": 0.1610780151963757, + "grad_norm": 0.22635705769062042, + "learning_rate": 4.9433751163930766e-05, + "loss": 0.2091, + "step": 9031 + }, + { + "epoch": 0.1610958513180894, + "grad_norm": 0.21643264591693878, + "learning_rate": 4.9433421714235614e-05, + "loss": 0.1997, + "step": 9032 + }, + { + "epoch": 0.1611136874398031, + "grad_norm": 0.32239702343940735, + "learning_rate": 4.943309216982809e-05, + "loss": 0.2162, + "step": 9033 + }, + { + "epoch": 0.16113152356151678, + "grad_norm": 0.21508969366550446, + "learning_rate": 4.9432762530709464e-05, + "loss": 0.1844, + "step": 9034 + }, + { + "epoch": 0.1611493596832305, + "grad_norm": 0.2988855242729187, + "learning_rate": 4.943243279688101e-05, + "loss": 0.1863, + "step": 9035 + }, + { + "epoch": 0.16116719580494418, + "grad_norm": 0.35321035981178284, + "learning_rate": 4.9432102968344006e-05, + "loss": 0.3203, + "step": 9036 + }, + { + "epoch": 0.16118503192665787, + "grad_norm": 0.29631587862968445, + "learning_rate": 4.943177304509974e-05, + "loss": 0.2274, + "step": 9037 + }, + { + "epoch": 0.16120286804837156, + "grad_norm": 0.2676721215248108, + "learning_rate": 4.943144302714947e-05, + "loss": 0.2211, + "step": 9038 + }, + { + "epoch": 0.16122070417008524, + "grad_norm": 0.2809307277202606, + "learning_rate": 4.94311129144945e-05, + "loss": 0.1732, + "step": 9039 + }, + { + "epoch": 0.16123854029179896, + "grad_norm": 0.3126406669616699, + "learning_rate": 4.943078270713609e-05, + "loss": 0.2488, + "step": 9040 + }, + { + "epoch": 0.16125637641351265, + "grad_norm": 0.25174424052238464, + "learning_rate": 4.943045240507553e-05, + "loss": 0.1848, + "step": 9041 + }, + { + "epoch": 0.16127421253522634, + "grad_norm": 0.28735512495040894, + "learning_rate": 4.94301220083141e-05, + "loss": 0.219, + "step": 9042 + }, + { + "epoch": 0.16129204865694002, + "grad_norm": 0.2653440833091736, + "learning_rate": 4.942979151685309e-05, + "loss": 0.2425, + "step": 9043 + }, + { + "epoch": 0.16130988477865374, + "grad_norm": 0.2919001877307892, + "learning_rate": 4.942946093069375e-05, + "loss": 0.2239, + "step": 9044 + }, + { + "epoch": 0.16132772090036743, + "grad_norm": 0.3311496376991272, + "learning_rate": 4.9429130249837395e-05, + "loss": 0.2193, + "step": 9045 + }, + { + "epoch": 0.16134555702208112, + "grad_norm": 0.3010842502117157, + "learning_rate": 4.9428799474285285e-05, + "loss": 0.2165, + "step": 9046 + }, + { + "epoch": 0.1613633931437948, + "grad_norm": 0.29573285579681396, + "learning_rate": 4.942846860403872e-05, + "loss": 0.2065, + "step": 9047 + }, + { + "epoch": 0.16138122926550852, + "grad_norm": 0.36145737767219543, + "learning_rate": 4.942813763909897e-05, + "loss": 0.2555, + "step": 9048 + }, + { + "epoch": 0.1613990653872222, + "grad_norm": 0.31323644518852234, + "learning_rate": 4.942780657946732e-05, + "loss": 0.1722, + "step": 9049 + }, + { + "epoch": 0.1614169015089359, + "grad_norm": 0.19427388906478882, + "learning_rate": 4.942747542514505e-05, + "loss": 0.1989, + "step": 9050 + }, + { + "epoch": 0.16143473763064958, + "grad_norm": 0.2875831127166748, + "learning_rate": 4.9427144176133454e-05, + "loss": 0.2038, + "step": 9051 + }, + { + "epoch": 0.1614525737523633, + "grad_norm": 0.25899288058280945, + "learning_rate": 4.942681283243381e-05, + "loss": 0.2161, + "step": 9052 + }, + { + "epoch": 0.161470409874077, + "grad_norm": 0.2957363426685333, + "learning_rate": 4.9426481394047404e-05, + "loss": 0.2435, + "step": 9053 + }, + { + "epoch": 0.16148824599579067, + "grad_norm": 0.38003870844841003, + "learning_rate": 4.9426149860975514e-05, + "loss": 0.2859, + "step": 9054 + }, + { + "epoch": 0.16150608211750436, + "grad_norm": 0.2551209330558777, + "learning_rate": 4.9425818233219436e-05, + "loss": 0.2494, + "step": 9055 + }, + { + "epoch": 0.16152391823921805, + "grad_norm": 0.2630995512008667, + "learning_rate": 4.942548651078045e-05, + "loss": 0.2529, + "step": 9056 + }, + { + "epoch": 0.16154175436093177, + "grad_norm": 0.30915892124176025, + "learning_rate": 4.9425154693659834e-05, + "loss": 0.245, + "step": 9057 + }, + { + "epoch": 0.16155959048264545, + "grad_norm": 0.40903642773628235, + "learning_rate": 4.942482278185889e-05, + "loss": 0.2441, + "step": 9058 + }, + { + "epoch": 0.16157742660435914, + "grad_norm": 0.308986634016037, + "learning_rate": 4.942449077537889e-05, + "loss": 0.2346, + "step": 9059 + }, + { + "epoch": 0.16159526272607283, + "grad_norm": 0.2930994927883148, + "learning_rate": 4.942415867422114e-05, + "loss": 0.2378, + "step": 9060 + }, + { + "epoch": 0.16161309884778655, + "grad_norm": 0.30246230959892273, + "learning_rate": 4.942382647838691e-05, + "loss": 0.2361, + "step": 9061 + }, + { + "epoch": 0.16163093496950023, + "grad_norm": 0.25455141067504883, + "learning_rate": 4.9423494187877494e-05, + "loss": 0.1872, + "step": 9062 + }, + { + "epoch": 0.16164877109121392, + "grad_norm": 0.2597738206386566, + "learning_rate": 4.942316180269417e-05, + "loss": 0.2298, + "step": 9063 + }, + { + "epoch": 0.1616666072129276, + "grad_norm": 0.34653058648109436, + "learning_rate": 4.942282932283825e-05, + "loss": 0.3185, + "step": 9064 + }, + { + "epoch": 0.16168444333464133, + "grad_norm": 0.27348095178604126, + "learning_rate": 4.9422496748311e-05, + "loss": 0.1916, + "step": 9065 + }, + { + "epoch": 0.161702279456355, + "grad_norm": 0.23623839020729065, + "learning_rate": 4.942216407911371e-05, + "loss": 0.2325, + "step": 9066 + }, + { + "epoch": 0.1617201155780687, + "grad_norm": 0.21392683684825897, + "learning_rate": 4.9421831315247685e-05, + "loss": 0.2109, + "step": 9067 + }, + { + "epoch": 0.1617379516997824, + "grad_norm": 0.3376295864582062, + "learning_rate": 4.942149845671421e-05, + "loss": 0.2582, + "step": 9068 + }, + { + "epoch": 0.1617557878214961, + "grad_norm": 0.2988283038139343, + "learning_rate": 4.9421165503514566e-05, + "loss": 0.2613, + "step": 9069 + }, + { + "epoch": 0.1617736239432098, + "grad_norm": 0.23068830370903015, + "learning_rate": 4.942083245565005e-05, + "loss": 0.2025, + "step": 9070 + }, + { + "epoch": 0.16179146006492348, + "grad_norm": 0.19631221890449524, + "learning_rate": 4.9420499313121954e-05, + "loss": 0.1988, + "step": 9071 + }, + { + "epoch": 0.16180929618663717, + "grad_norm": 0.24286353588104248, + "learning_rate": 4.9420166075931576e-05, + "loss": 0.1754, + "step": 9072 + }, + { + "epoch": 0.16182713230835089, + "grad_norm": 0.34894347190856934, + "learning_rate": 4.9419832744080184e-05, + "loss": 0.2563, + "step": 9073 + }, + { + "epoch": 0.16184496843006457, + "grad_norm": 0.2975502014160156, + "learning_rate": 4.94194993175691e-05, + "loss": 0.2278, + "step": 9074 + }, + { + "epoch": 0.16186280455177826, + "grad_norm": 0.23034453392028809, + "learning_rate": 4.941916579639959e-05, + "loss": 0.1912, + "step": 9075 + }, + { + "epoch": 0.16188064067349195, + "grad_norm": 0.25986409187316895, + "learning_rate": 4.9418832180572973e-05, + "loss": 0.2014, + "step": 9076 + }, + { + "epoch": 0.16189847679520564, + "grad_norm": 0.2645118236541748, + "learning_rate": 4.9418498470090515e-05, + "loss": 0.2078, + "step": 9077 + }, + { + "epoch": 0.16191631291691935, + "grad_norm": 0.4214165210723877, + "learning_rate": 4.9418164664953534e-05, + "loss": 0.2373, + "step": 9078 + }, + { + "epoch": 0.16193414903863304, + "grad_norm": 0.24069081246852875, + "learning_rate": 4.9417830765163305e-05, + "loss": 0.222, + "step": 9079 + }, + { + "epoch": 0.16195198516034673, + "grad_norm": 0.2954716086387634, + "learning_rate": 4.9417496770721135e-05, + "loss": 0.2126, + "step": 9080 + }, + { + "epoch": 0.16196982128206042, + "grad_norm": 0.4046162962913513, + "learning_rate": 4.941716268162831e-05, + "loss": 0.3247, + "step": 9081 + }, + { + "epoch": 0.16198765740377413, + "grad_norm": 0.26268166303634644, + "learning_rate": 4.941682849788614e-05, + "loss": 0.2403, + "step": 9082 + }, + { + "epoch": 0.16200549352548782, + "grad_norm": 0.2406916320323944, + "learning_rate": 4.941649421949589e-05, + "loss": 0.2215, + "step": 9083 + }, + { + "epoch": 0.1620233296472015, + "grad_norm": 0.2639237940311432, + "learning_rate": 4.941615984645889e-05, + "loss": 0.1992, + "step": 9084 + }, + { + "epoch": 0.1620411657689152, + "grad_norm": 0.317731648683548, + "learning_rate": 4.9415825378776414e-05, + "loss": 0.2707, + "step": 9085 + }, + { + "epoch": 0.1620590018906289, + "grad_norm": 0.2969341576099396, + "learning_rate": 4.941549081644977e-05, + "loss": 0.1779, + "step": 9086 + }, + { + "epoch": 0.1620768380123426, + "grad_norm": 0.3448761999607086, + "learning_rate": 4.941515615948025e-05, + "loss": 0.1679, + "step": 9087 + }, + { + "epoch": 0.1620946741340563, + "grad_norm": 0.29252657294273376, + "learning_rate": 4.941482140786916e-05, + "loss": 0.194, + "step": 9088 + }, + { + "epoch": 0.16211251025576998, + "grad_norm": 0.227921724319458, + "learning_rate": 4.941448656161778e-05, + "loss": 0.2128, + "step": 9089 + }, + { + "epoch": 0.1621303463774837, + "grad_norm": 0.2951878309249878, + "learning_rate": 4.941415162072742e-05, + "loss": 0.2238, + "step": 9090 + }, + { + "epoch": 0.16214818249919738, + "grad_norm": 0.35384219884872437, + "learning_rate": 4.941381658519937e-05, + "loss": 0.2157, + "step": 9091 + }, + { + "epoch": 0.16216601862091107, + "grad_norm": 0.25139087438583374, + "learning_rate": 4.941348145503494e-05, + "loss": 0.1804, + "step": 9092 + }, + { + "epoch": 0.16218385474262476, + "grad_norm": 0.34730130434036255, + "learning_rate": 4.941314623023543e-05, + "loss": 0.2003, + "step": 9093 + }, + { + "epoch": 0.16220169086433847, + "grad_norm": 0.35382080078125, + "learning_rate": 4.9412810910802124e-05, + "loss": 0.2623, + "step": 9094 + }, + { + "epoch": 0.16221952698605216, + "grad_norm": 0.31246358156204224, + "learning_rate": 4.941247549673633e-05, + "loss": 0.1946, + "step": 9095 + }, + { + "epoch": 0.16223736310776585, + "grad_norm": 0.27841705083847046, + "learning_rate": 4.9412139988039356e-05, + "loss": 0.2081, + "step": 9096 + }, + { + "epoch": 0.16225519922947954, + "grad_norm": 0.28796371817588806, + "learning_rate": 4.941180438471249e-05, + "loss": 0.2546, + "step": 9097 + }, + { + "epoch": 0.16227303535119322, + "grad_norm": 0.2595517933368683, + "learning_rate": 4.9411468686757046e-05, + "loss": 0.2342, + "step": 9098 + }, + { + "epoch": 0.16229087147290694, + "grad_norm": 0.28407278656959534, + "learning_rate": 4.941113289417431e-05, + "loss": 0.2107, + "step": 9099 + }, + { + "epoch": 0.16230870759462063, + "grad_norm": 0.2532673478126526, + "learning_rate": 4.9410797006965596e-05, + "loss": 0.2051, + "step": 9100 + }, + { + "epoch": 0.16232654371633432, + "grad_norm": 0.2700106203556061, + "learning_rate": 4.9410461025132203e-05, + "loss": 0.1779, + "step": 9101 + }, + { + "epoch": 0.162344379838048, + "grad_norm": 0.26635727286338806, + "learning_rate": 4.9410124948675426e-05, + "loss": 0.2286, + "step": 9102 + }, + { + "epoch": 0.16236221595976172, + "grad_norm": 0.30911654233932495, + "learning_rate": 4.940978877759658e-05, + "loss": 0.2352, + "step": 9103 + }, + { + "epoch": 0.1623800520814754, + "grad_norm": 0.23911771178245544, + "learning_rate": 4.940945251189696e-05, + "loss": 0.188, + "step": 9104 + }, + { + "epoch": 0.1623978882031891, + "grad_norm": 0.31234505772590637, + "learning_rate": 4.940911615157787e-05, + "loss": 0.2039, + "step": 9105 + }, + { + "epoch": 0.16241572432490278, + "grad_norm": 0.3038794994354248, + "learning_rate": 4.940877969664062e-05, + "loss": 0.1957, + "step": 9106 + }, + { + "epoch": 0.1624335604466165, + "grad_norm": 0.4138329029083252, + "learning_rate": 4.94084431470865e-05, + "loss": 0.2636, + "step": 9107 + }, + { + "epoch": 0.1624513965683302, + "grad_norm": 0.29021209478378296, + "learning_rate": 4.940810650291683e-05, + "loss": 0.2374, + "step": 9108 + }, + { + "epoch": 0.16246923269004387, + "grad_norm": 0.23764866590499878, + "learning_rate": 4.9407769764132904e-05, + "loss": 0.2121, + "step": 9109 + }, + { + "epoch": 0.16248706881175756, + "grad_norm": 0.3777204155921936, + "learning_rate": 4.940743293073604e-05, + "loss": 0.2393, + "step": 9110 + }, + { + "epoch": 0.16250490493347128, + "grad_norm": 0.22937625646591187, + "learning_rate": 4.940709600272753e-05, + "loss": 0.1655, + "step": 9111 + }, + { + "epoch": 0.16252274105518497, + "grad_norm": 0.290792852640152, + "learning_rate": 4.940675898010869e-05, + "loss": 0.2217, + "step": 9112 + }, + { + "epoch": 0.16254057717689865, + "grad_norm": 0.42406579852104187, + "learning_rate": 4.940642186288082e-05, + "loss": 0.2039, + "step": 9113 + }, + { + "epoch": 0.16255841329861234, + "grad_norm": 0.38261035084724426, + "learning_rate": 4.940608465104523e-05, + "loss": 0.2746, + "step": 9114 + }, + { + "epoch": 0.16257624942032606, + "grad_norm": 0.27044060826301575, + "learning_rate": 4.940574734460323e-05, + "loss": 0.2171, + "step": 9115 + }, + { + "epoch": 0.16259408554203975, + "grad_norm": 0.27975448966026306, + "learning_rate": 4.940540994355612e-05, + "loss": 0.2406, + "step": 9116 + }, + { + "epoch": 0.16261192166375343, + "grad_norm": 0.2940988838672638, + "learning_rate": 4.9405072447905204e-05, + "loss": 0.2433, + "step": 9117 + }, + { + "epoch": 0.16262975778546712, + "grad_norm": 0.2927718758583069, + "learning_rate": 4.9404734857651804e-05, + "loss": 0.2238, + "step": 9118 + }, + { + "epoch": 0.1626475939071808, + "grad_norm": 0.22832036018371582, + "learning_rate": 4.9404397172797224e-05, + "loss": 0.219, + "step": 9119 + }, + { + "epoch": 0.16266543002889453, + "grad_norm": 0.25635266304016113, + "learning_rate": 4.940405939334277e-05, + "loss": 0.1817, + "step": 9120 + }, + { + "epoch": 0.1626832661506082, + "grad_norm": 0.27498435974121094, + "learning_rate": 4.940372151928976e-05, + "loss": 0.1861, + "step": 9121 + }, + { + "epoch": 0.1627011022723219, + "grad_norm": 0.32306236028671265, + "learning_rate": 4.9403383550639485e-05, + "loss": 0.2662, + "step": 9122 + }, + { + "epoch": 0.1627189383940356, + "grad_norm": 0.28174206614494324, + "learning_rate": 4.940304548739327e-05, + "loss": 0.2218, + "step": 9123 + }, + { + "epoch": 0.1627367745157493, + "grad_norm": 0.25538182258605957, + "learning_rate": 4.940270732955243e-05, + "loss": 0.2307, + "step": 9124 + }, + { + "epoch": 0.162754610637463, + "grad_norm": 0.27320006489753723, + "learning_rate": 4.940236907711826e-05, + "loss": 0.2023, + "step": 9125 + }, + { + "epoch": 0.16277244675917668, + "grad_norm": 0.25064462423324585, + "learning_rate": 4.940203073009209e-05, + "loss": 0.2175, + "step": 9126 + }, + { + "epoch": 0.16279028288089037, + "grad_norm": 0.23444148898124695, + "learning_rate": 4.940169228847521e-05, + "loss": 0.2256, + "step": 9127 + }, + { + "epoch": 0.16280811900260408, + "grad_norm": 0.28661078214645386, + "learning_rate": 4.940135375226894e-05, + "loss": 0.1922, + "step": 9128 + }, + { + "epoch": 0.16282595512431777, + "grad_norm": 0.22695735096931458, + "learning_rate": 4.9401015121474606e-05, + "loss": 0.2143, + "step": 9129 + }, + { + "epoch": 0.16284379124603146, + "grad_norm": 0.24954082071781158, + "learning_rate": 4.94006763960935e-05, + "loss": 0.2431, + "step": 9130 + }, + { + "epoch": 0.16286162736774515, + "grad_norm": 0.27113354206085205, + "learning_rate": 4.9400337576126945e-05, + "loss": 0.191, + "step": 9131 + }, + { + "epoch": 0.16287946348945886, + "grad_norm": 0.3145564794540405, + "learning_rate": 4.9399998661576265e-05, + "loss": 0.2514, + "step": 9132 + }, + { + "epoch": 0.16289729961117255, + "grad_norm": 0.25162550806999207, + "learning_rate": 4.9399659652442756e-05, + "loss": 0.1994, + "step": 9133 + }, + { + "epoch": 0.16291513573288624, + "grad_norm": 0.329497754573822, + "learning_rate": 4.939932054872773e-05, + "loss": 0.2718, + "step": 9134 + }, + { + "epoch": 0.16293297185459993, + "grad_norm": 0.26434576511383057, + "learning_rate": 4.939898135043251e-05, + "loss": 0.2875, + "step": 9135 + }, + { + "epoch": 0.16295080797631364, + "grad_norm": 0.42806506156921387, + "learning_rate": 4.939864205755843e-05, + "loss": 0.2814, + "step": 9136 + }, + { + "epoch": 0.16296864409802733, + "grad_norm": 0.23605118691921234, + "learning_rate": 4.9398302670106775e-05, + "loss": 0.2328, + "step": 9137 + }, + { + "epoch": 0.16298648021974102, + "grad_norm": 0.27913331985473633, + "learning_rate": 4.939796318807887e-05, + "loss": 0.2526, + "step": 9138 + }, + { + "epoch": 0.1630043163414547, + "grad_norm": 0.2679133713245392, + "learning_rate": 4.939762361147604e-05, + "loss": 0.2347, + "step": 9139 + }, + { + "epoch": 0.1630221524631684, + "grad_norm": 0.2884804606437683, + "learning_rate": 4.9397283940299585e-05, + "loss": 0.2805, + "step": 9140 + }, + { + "epoch": 0.1630399885848821, + "grad_norm": 0.24972495436668396, + "learning_rate": 4.939694417455083e-05, + "loss": 0.204, + "step": 9141 + }, + { + "epoch": 0.1630578247065958, + "grad_norm": 0.2075093686580658, + "learning_rate": 4.93966043142311e-05, + "loss": 0.2175, + "step": 9142 + }, + { + "epoch": 0.1630756608283095, + "grad_norm": 0.24308235943317413, + "learning_rate": 4.939626435934171e-05, + "loss": 0.1705, + "step": 9143 + }, + { + "epoch": 0.16309349695002318, + "grad_norm": 0.22188474237918854, + "learning_rate": 4.9395924309883966e-05, + "loss": 0.2176, + "step": 9144 + }, + { + "epoch": 0.1631113330717369, + "grad_norm": 0.23017635941505432, + "learning_rate": 4.9395584165859197e-05, + "loss": 0.2217, + "step": 9145 + }, + { + "epoch": 0.16312916919345058, + "grad_norm": 0.2875329852104187, + "learning_rate": 4.939524392726871e-05, + "loss": 0.1913, + "step": 9146 + }, + { + "epoch": 0.16314700531516427, + "grad_norm": 0.24955126643180847, + "learning_rate": 4.939490359411384e-05, + "loss": 0.217, + "step": 9147 + }, + { + "epoch": 0.16316484143687796, + "grad_norm": 0.3250877559185028, + "learning_rate": 4.939456316639589e-05, + "loss": 0.2772, + "step": 9148 + }, + { + "epoch": 0.16318267755859167, + "grad_norm": 0.22584985196590424, + "learning_rate": 4.939422264411619e-05, + "loss": 0.1957, + "step": 9149 + }, + { + "epoch": 0.16320051368030536, + "grad_norm": 0.24119976162910461, + "learning_rate": 4.939388202727606e-05, + "loss": 0.2039, + "step": 9150 + }, + { + "epoch": 0.16321834980201905, + "grad_norm": 0.5231024026870728, + "learning_rate": 4.939354131587682e-05, + "loss": 0.18, + "step": 9151 + }, + { + "epoch": 0.16323618592373274, + "grad_norm": 0.26688915491104126, + "learning_rate": 4.939320050991979e-05, + "loss": 0.2165, + "step": 9152 + }, + { + "epoch": 0.16325402204544645, + "grad_norm": 0.23372162878513336, + "learning_rate": 4.9392859609406284e-05, + "loss": 0.2203, + "step": 9153 + }, + { + "epoch": 0.16327185816716014, + "grad_norm": 0.2783568203449249, + "learning_rate": 4.939251861433763e-05, + "loss": 0.2346, + "step": 9154 + }, + { + "epoch": 0.16328969428887383, + "grad_norm": 0.27722346782684326, + "learning_rate": 4.939217752471515e-05, + "loss": 0.2122, + "step": 9155 + }, + { + "epoch": 0.16330753041058751, + "grad_norm": 0.2658185362815857, + "learning_rate": 4.9391836340540166e-05, + "loss": 0.2288, + "step": 9156 + }, + { + "epoch": 0.1633253665323012, + "grad_norm": 0.29888370633125305, + "learning_rate": 4.9391495061813994e-05, + "loss": 0.1779, + "step": 9157 + }, + { + "epoch": 0.16334320265401492, + "grad_norm": 0.448290079832077, + "learning_rate": 4.939115368853797e-05, + "loss": 0.2321, + "step": 9158 + }, + { + "epoch": 0.1633610387757286, + "grad_norm": 0.16972346603870392, + "learning_rate": 4.939081222071341e-05, + "loss": 0.1749, + "step": 9159 + }, + { + "epoch": 0.1633788748974423, + "grad_norm": 0.2823371887207031, + "learning_rate": 4.939047065834164e-05, + "loss": 0.195, + "step": 9160 + }, + { + "epoch": 0.16339671101915598, + "grad_norm": 0.3754192590713501, + "learning_rate": 4.939012900142397e-05, + "loss": 0.2495, + "step": 9161 + }, + { + "epoch": 0.1634145471408697, + "grad_norm": 0.40936192870140076, + "learning_rate": 4.938978724996174e-05, + "loss": 0.1623, + "step": 9162 + }, + { + "epoch": 0.16343238326258339, + "grad_norm": 0.27821943163871765, + "learning_rate": 4.9389445403956275e-05, + "loss": 0.2047, + "step": 9163 + }, + { + "epoch": 0.16345021938429707, + "grad_norm": 0.28809860348701477, + "learning_rate": 4.938910346340889e-05, + "loss": 0.1842, + "step": 9164 + }, + { + "epoch": 0.16346805550601076, + "grad_norm": 0.26269033551216125, + "learning_rate": 4.938876142832092e-05, + "loss": 0.1676, + "step": 9165 + }, + { + "epoch": 0.16348589162772448, + "grad_norm": 0.3284805417060852, + "learning_rate": 4.938841929869369e-05, + "loss": 0.257, + "step": 9166 + }, + { + "epoch": 0.16350372774943817, + "grad_norm": 0.27647194266319275, + "learning_rate": 4.938807707452852e-05, + "loss": 0.2268, + "step": 9167 + }, + { + "epoch": 0.16352156387115185, + "grad_norm": 0.405900776386261, + "learning_rate": 4.9387734755826745e-05, + "loss": 0.2674, + "step": 9168 + }, + { + "epoch": 0.16353939999286554, + "grad_norm": 0.2592333257198334, + "learning_rate": 4.938739234258968e-05, + "loss": 0.2214, + "step": 9169 + }, + { + "epoch": 0.16355723611457926, + "grad_norm": 0.22996890544891357, + "learning_rate": 4.938704983481866e-05, + "loss": 0.2299, + "step": 9170 + }, + { + "epoch": 0.16357507223629295, + "grad_norm": 0.3244262933731079, + "learning_rate": 4.9386707232515015e-05, + "loss": 0.2111, + "step": 9171 + }, + { + "epoch": 0.16359290835800663, + "grad_norm": 0.2678900957107544, + "learning_rate": 4.938636453568006e-05, + "loss": 0.1915, + "step": 9172 + }, + { + "epoch": 0.16361074447972032, + "grad_norm": 0.25056418776512146, + "learning_rate": 4.938602174431515e-05, + "loss": 0.1759, + "step": 9173 + }, + { + "epoch": 0.16362858060143404, + "grad_norm": 0.32993167638778687, + "learning_rate": 4.938567885842158e-05, + "loss": 0.2133, + "step": 9174 + }, + { + "epoch": 0.16364641672314773, + "grad_norm": 0.2732590138912201, + "learning_rate": 4.938533587800071e-05, + "loss": 0.2263, + "step": 9175 + }, + { + "epoch": 0.1636642528448614, + "grad_norm": 0.2834323048591614, + "learning_rate": 4.9384992803053845e-05, + "loss": 0.1817, + "step": 9176 + }, + { + "epoch": 0.1636820889665751, + "grad_norm": 0.27836138010025024, + "learning_rate": 4.9384649633582334e-05, + "loss": 0.229, + "step": 9177 + }, + { + "epoch": 0.1636999250882888, + "grad_norm": 0.261381596326828, + "learning_rate": 4.9384306369587496e-05, + "loss": 0.2145, + "step": 9178 + }, + { + "epoch": 0.1637177612100025, + "grad_norm": 0.27303367853164673, + "learning_rate": 4.9383963011070665e-05, + "loss": 0.1928, + "step": 9179 + }, + { + "epoch": 0.1637355973317162, + "grad_norm": 0.2684634029865265, + "learning_rate": 4.9383619558033164e-05, + "loss": 0.2303, + "step": 9180 + }, + { + "epoch": 0.16375343345342988, + "grad_norm": 0.28900694847106934, + "learning_rate": 4.938327601047634e-05, + "loss": 0.2435, + "step": 9181 + }, + { + "epoch": 0.16377126957514357, + "grad_norm": 0.24638424813747406, + "learning_rate": 4.9382932368401516e-05, + "loss": 0.1906, + "step": 9182 + }, + { + "epoch": 0.16378910569685728, + "grad_norm": 0.28836917877197266, + "learning_rate": 4.9382588631810025e-05, + "loss": 0.1684, + "step": 9183 + }, + { + "epoch": 0.16380694181857097, + "grad_norm": 0.24633167684078217, + "learning_rate": 4.938224480070319e-05, + "loss": 0.1984, + "step": 9184 + }, + { + "epoch": 0.16382477794028466, + "grad_norm": 0.2866842746734619, + "learning_rate": 4.9381900875082364e-05, + "loss": 0.2271, + "step": 9185 + }, + { + "epoch": 0.16384261406199835, + "grad_norm": 0.3088231086730957, + "learning_rate": 4.9381556854948864e-05, + "loss": 0.214, + "step": 9186 + }, + { + "epoch": 0.16386045018371206, + "grad_norm": 0.29733720421791077, + "learning_rate": 4.938121274030403e-05, + "loss": 0.2241, + "step": 9187 + }, + { + "epoch": 0.16387828630542575, + "grad_norm": 0.3455222249031067, + "learning_rate": 4.9380868531149193e-05, + "loss": 0.1825, + "step": 9188 + }, + { + "epoch": 0.16389612242713944, + "grad_norm": 0.2591247260570526, + "learning_rate": 4.938052422748569e-05, + "loss": 0.207, + "step": 9189 + }, + { + "epoch": 0.16391395854885313, + "grad_norm": 0.3061193823814392, + "learning_rate": 4.9380179829314856e-05, + "loss": 0.219, + "step": 9190 + }, + { + "epoch": 0.16393179467056684, + "grad_norm": 0.2616443336009979, + "learning_rate": 4.9379835336638026e-05, + "loss": 0.2156, + "step": 9191 + }, + { + "epoch": 0.16394963079228053, + "grad_norm": 0.29381993412971497, + "learning_rate": 4.9379490749456523e-05, + "loss": 0.2207, + "step": 9192 + }, + { + "epoch": 0.16396746691399422, + "grad_norm": 0.24452641606330872, + "learning_rate": 4.9379146067771705e-05, + "loss": 0.2204, + "step": 9193 + }, + { + "epoch": 0.1639853030357079, + "grad_norm": 0.26373422145843506, + "learning_rate": 4.937880129158488e-05, + "loss": 0.1731, + "step": 9194 + }, + { + "epoch": 0.16400313915742162, + "grad_norm": 0.21541623771190643, + "learning_rate": 4.937845642089742e-05, + "loss": 0.1844, + "step": 9195 + }, + { + "epoch": 0.1640209752791353, + "grad_norm": 0.27089133858680725, + "learning_rate": 4.937811145571064e-05, + "loss": 0.1617, + "step": 9196 + }, + { + "epoch": 0.164038811400849, + "grad_norm": 0.19446557760238647, + "learning_rate": 4.937776639602587e-05, + "loss": 0.177, + "step": 9197 + }, + { + "epoch": 0.1640566475225627, + "grad_norm": 0.3105425536632538, + "learning_rate": 4.937742124184447e-05, + "loss": 0.2382, + "step": 9198 + }, + { + "epoch": 0.16407448364427638, + "grad_norm": 0.2988620400428772, + "learning_rate": 4.937707599316776e-05, + "loss": 0.2219, + "step": 9199 + }, + { + "epoch": 0.1640923197659901, + "grad_norm": 0.36146506667137146, + "learning_rate": 4.9376730649997085e-05, + "loss": 0.212, + "step": 9200 + }, + { + "epoch": 0.16411015588770378, + "grad_norm": 0.21937191486358643, + "learning_rate": 4.9376385212333775e-05, + "loss": 0.1952, + "step": 9201 + }, + { + "epoch": 0.16412799200941747, + "grad_norm": 0.24174143373966217, + "learning_rate": 4.937603968017918e-05, + "loss": 0.2014, + "step": 9202 + }, + { + "epoch": 0.16414582813113116, + "grad_norm": 0.3496522009372711, + "learning_rate": 4.9375694053534636e-05, + "loss": 0.2198, + "step": 9203 + }, + { + "epoch": 0.16416366425284487, + "grad_norm": 0.2990228235721588, + "learning_rate": 4.937534833240149e-05, + "loss": 0.227, + "step": 9204 + }, + { + "epoch": 0.16418150037455856, + "grad_norm": 0.24706658720970154, + "learning_rate": 4.937500251678107e-05, + "loss": 0.2213, + "step": 9205 + }, + { + "epoch": 0.16419933649627225, + "grad_norm": 0.2717100977897644, + "learning_rate": 4.937465660667472e-05, + "loss": 0.1839, + "step": 9206 + }, + { + "epoch": 0.16421717261798593, + "grad_norm": 0.40295538306236267, + "learning_rate": 4.9374310602083785e-05, + "loss": 0.2493, + "step": 9207 + }, + { + "epoch": 0.16423500873969965, + "grad_norm": 0.35368722677230835, + "learning_rate": 4.9373964503009594e-05, + "loss": 0.2407, + "step": 9208 + }, + { + "epoch": 0.16425284486141334, + "grad_norm": 0.24363276362419128, + "learning_rate": 4.937361830945351e-05, + "loss": 0.2023, + "step": 9209 + }, + { + "epoch": 0.16427068098312703, + "grad_norm": 0.30939704179763794, + "learning_rate": 4.937327202141686e-05, + "loss": 0.2017, + "step": 9210 + }, + { + "epoch": 0.16428851710484071, + "grad_norm": 0.30299273133277893, + "learning_rate": 4.937292563890099e-05, + "loss": 0.2789, + "step": 9211 + }, + { + "epoch": 0.16430635322655443, + "grad_norm": 0.36112216114997864, + "learning_rate": 4.937257916190724e-05, + "loss": 0.2697, + "step": 9212 + }, + { + "epoch": 0.16432418934826812, + "grad_norm": 0.20341669023036957, + "learning_rate": 4.937223259043695e-05, + "loss": 0.2112, + "step": 9213 + }, + { + "epoch": 0.1643420254699818, + "grad_norm": 0.19208763539791107, + "learning_rate": 4.937188592449148e-05, + "loss": 0.1909, + "step": 9214 + }, + { + "epoch": 0.1643598615916955, + "grad_norm": 0.26996225118637085, + "learning_rate": 4.937153916407216e-05, + "loss": 0.243, + "step": 9215 + }, + { + "epoch": 0.1643776977134092, + "grad_norm": 0.24019834399223328, + "learning_rate": 4.9371192309180325e-05, + "loss": 0.2197, + "step": 9216 + }, + { + "epoch": 0.1643955338351229, + "grad_norm": 0.28452110290527344, + "learning_rate": 4.937084535981734e-05, + "loss": 0.2213, + "step": 9217 + }, + { + "epoch": 0.16441336995683659, + "grad_norm": 0.2400692254304886, + "learning_rate": 4.937049831598454e-05, + "loss": 0.1732, + "step": 9218 + }, + { + "epoch": 0.16443120607855027, + "grad_norm": 0.3314850330352783, + "learning_rate": 4.937015117768328e-05, + "loss": 0.2573, + "step": 9219 + }, + { + "epoch": 0.16444904220026396, + "grad_norm": 0.3499220013618469, + "learning_rate": 4.936980394491488e-05, + "loss": 0.2428, + "step": 9220 + }, + { + "epoch": 0.16446687832197768, + "grad_norm": 0.20323525369167328, + "learning_rate": 4.9369456617680706e-05, + "loss": 0.1806, + "step": 9221 + }, + { + "epoch": 0.16448471444369137, + "grad_norm": 0.27988317608833313, + "learning_rate": 4.936910919598211e-05, + "loss": 0.2443, + "step": 9222 + }, + { + "epoch": 0.16450255056540505, + "grad_norm": 0.33889874815940857, + "learning_rate": 4.936876167982043e-05, + "loss": 0.1574, + "step": 9223 + }, + { + "epoch": 0.16452038668711874, + "grad_norm": 0.26385030150413513, + "learning_rate": 4.936841406919701e-05, + "loss": 0.19, + "step": 9224 + }, + { + "epoch": 0.16453822280883246, + "grad_norm": 0.22243303060531616, + "learning_rate": 4.93680663641132e-05, + "loss": 0.1955, + "step": 9225 + }, + { + "epoch": 0.16455605893054615, + "grad_norm": 0.33228254318237305, + "learning_rate": 4.9367718564570344e-05, + "loss": 0.2159, + "step": 9226 + }, + { + "epoch": 0.16457389505225983, + "grad_norm": 0.40989211201667786, + "learning_rate": 4.93673706705698e-05, + "loss": 0.1839, + "step": 9227 + }, + { + "epoch": 0.16459173117397352, + "grad_norm": 0.2205306887626648, + "learning_rate": 4.9367022682112905e-05, + "loss": 0.2204, + "step": 9228 + }, + { + "epoch": 0.16460956729568724, + "grad_norm": 0.25861892104148865, + "learning_rate": 4.9366674599201026e-05, + "loss": 0.1944, + "step": 9229 + }, + { + "epoch": 0.16462740341740092, + "grad_norm": 0.297453910112381, + "learning_rate": 4.936632642183549e-05, + "loss": 0.1802, + "step": 9230 + }, + { + "epoch": 0.1646452395391146, + "grad_norm": 0.3584047853946686, + "learning_rate": 4.936597815001766e-05, + "loss": 0.1766, + "step": 9231 + }, + { + "epoch": 0.1646630756608283, + "grad_norm": 0.26932886242866516, + "learning_rate": 4.9365629783748886e-05, + "loss": 0.191, + "step": 9232 + }, + { + "epoch": 0.16468091178254202, + "grad_norm": 0.23983994126319885, + "learning_rate": 4.936528132303051e-05, + "loss": 0.1901, + "step": 9233 + }, + { + "epoch": 0.1646987479042557, + "grad_norm": 0.30796483159065247, + "learning_rate": 4.9364932767863895e-05, + "loss": 0.1925, + "step": 9234 + }, + { + "epoch": 0.1647165840259694, + "grad_norm": 0.2939542829990387, + "learning_rate": 4.936458411825038e-05, + "loss": 0.221, + "step": 9235 + }, + { + "epoch": 0.16473442014768308, + "grad_norm": 0.22121989727020264, + "learning_rate": 4.936423537419131e-05, + "loss": 0.144, + "step": 9236 + }, + { + "epoch": 0.1647522562693968, + "grad_norm": 0.40346765518188477, + "learning_rate": 4.936388653568806e-05, + "loss": 0.2865, + "step": 9237 + }, + { + "epoch": 0.16477009239111048, + "grad_norm": 0.23672229051589966, + "learning_rate": 4.936353760274198e-05, + "loss": 0.2238, + "step": 9238 + }, + { + "epoch": 0.16478792851282417, + "grad_norm": 0.2538418173789978, + "learning_rate": 4.93631885753544e-05, + "loss": 0.2314, + "step": 9239 + }, + { + "epoch": 0.16480576463453786, + "grad_norm": 0.2791252136230469, + "learning_rate": 4.9362839453526696e-05, + "loss": 0.1656, + "step": 9240 + }, + { + "epoch": 0.16482360075625155, + "grad_norm": 0.3458743095397949, + "learning_rate": 4.93624902372602e-05, + "loss": 0.2176, + "step": 9241 + }, + { + "epoch": 0.16484143687796526, + "grad_norm": 0.2711387872695923, + "learning_rate": 4.9362140926556286e-05, + "loss": 0.2272, + "step": 9242 + }, + { + "epoch": 0.16485927299967895, + "grad_norm": 0.2151830792427063, + "learning_rate": 4.93617915214163e-05, + "loss": 0.1776, + "step": 9243 + }, + { + "epoch": 0.16487710912139264, + "grad_norm": 0.22689075767993927, + "learning_rate": 4.93614420218416e-05, + "loss": 0.236, + "step": 9244 + }, + { + "epoch": 0.16489494524310633, + "grad_norm": 0.2275635004043579, + "learning_rate": 4.9361092427833525e-05, + "loss": 0.1879, + "step": 9245 + }, + { + "epoch": 0.16491278136482004, + "grad_norm": 0.25142842531204224, + "learning_rate": 4.936074273939345e-05, + "loss": 0.2016, + "step": 9246 + }, + { + "epoch": 0.16493061748653373, + "grad_norm": 0.2320721447467804, + "learning_rate": 4.936039295652272e-05, + "loss": 0.1782, + "step": 9247 + }, + { + "epoch": 0.16494845360824742, + "grad_norm": 0.24124976992607117, + "learning_rate": 4.9360043079222686e-05, + "loss": 0.1985, + "step": 9248 + }, + { + "epoch": 0.1649662897299611, + "grad_norm": 0.22440405189990997, + "learning_rate": 4.935969310749472e-05, + "loss": 0.1928, + "step": 9249 + }, + { + "epoch": 0.16498412585167482, + "grad_norm": 0.32603585720062256, + "learning_rate": 4.9359343041340166e-05, + "loss": 0.2157, + "step": 9250 + }, + { + "epoch": 0.1650019619733885, + "grad_norm": 0.31903767585754395, + "learning_rate": 4.935899288076038e-05, + "loss": 0.2224, + "step": 9251 + }, + { + "epoch": 0.1650197980951022, + "grad_norm": 0.3269581198692322, + "learning_rate": 4.935864262575674e-05, + "loss": 0.2286, + "step": 9252 + }, + { + "epoch": 0.1650376342168159, + "grad_norm": 0.359083354473114, + "learning_rate": 4.935829227633058e-05, + "loss": 0.2729, + "step": 9253 + }, + { + "epoch": 0.1650554703385296, + "grad_norm": 0.30463945865631104, + "learning_rate": 4.9357941832483255e-05, + "loss": 0.2887, + "step": 9254 + }, + { + "epoch": 0.1650733064602433, + "grad_norm": 0.3253450393676758, + "learning_rate": 4.935759129421614e-05, + "loss": 0.1632, + "step": 9255 + }, + { + "epoch": 0.16509114258195698, + "grad_norm": 0.24560873210430145, + "learning_rate": 4.9357240661530596e-05, + "loss": 0.2428, + "step": 9256 + }, + { + "epoch": 0.16510897870367067, + "grad_norm": 0.2875535488128662, + "learning_rate": 4.9356889934427975e-05, + "loss": 0.1914, + "step": 9257 + }, + { + "epoch": 0.16512681482538435, + "grad_norm": 0.2681664824485779, + "learning_rate": 4.935653911290963e-05, + "loss": 0.2437, + "step": 9258 + }, + { + "epoch": 0.16514465094709807, + "grad_norm": 0.22050854563713074, + "learning_rate": 4.935618819697693e-05, + "loss": 0.1901, + "step": 9259 + }, + { + "epoch": 0.16516248706881176, + "grad_norm": 0.2631033957004547, + "learning_rate": 4.935583718663123e-05, + "loss": 0.229, + "step": 9260 + }, + { + "epoch": 0.16518032319052545, + "grad_norm": 0.22643926739692688, + "learning_rate": 4.935548608187389e-05, + "loss": 0.2473, + "step": 9261 + }, + { + "epoch": 0.16519815931223913, + "grad_norm": 0.283899188041687, + "learning_rate": 4.935513488270628e-05, + "loss": 0.2028, + "step": 9262 + }, + { + "epoch": 0.16521599543395285, + "grad_norm": 0.33085399866104126, + "learning_rate": 4.935478358912975e-05, + "loss": 0.1895, + "step": 9263 + }, + { + "epoch": 0.16523383155566654, + "grad_norm": 0.3649313449859619, + "learning_rate": 4.935443220114567e-05, + "loss": 0.2429, + "step": 9264 + }, + { + "epoch": 0.16525166767738023, + "grad_norm": 0.23980526626110077, + "learning_rate": 4.93540807187554e-05, + "loss": 0.1669, + "step": 9265 + }, + { + "epoch": 0.16526950379909391, + "grad_norm": 0.3209800124168396, + "learning_rate": 4.93537291419603e-05, + "loss": 0.1951, + "step": 9266 + }, + { + "epoch": 0.16528733992080763, + "grad_norm": 0.3084694743156433, + "learning_rate": 4.935337747076173e-05, + "loss": 0.1937, + "step": 9267 + }, + { + "epoch": 0.16530517604252132, + "grad_norm": 0.4329877197742462, + "learning_rate": 4.935302570516106e-05, + "loss": 0.271, + "step": 9268 + }, + { + "epoch": 0.165323012164235, + "grad_norm": 0.30714884400367737, + "learning_rate": 4.935267384515966e-05, + "loss": 0.2226, + "step": 9269 + }, + { + "epoch": 0.1653408482859487, + "grad_norm": 0.2898719608783722, + "learning_rate": 4.935232189075887e-05, + "loss": 0.2108, + "step": 9270 + }, + { + "epoch": 0.1653586844076624, + "grad_norm": 0.2915104627609253, + "learning_rate": 4.935196984196008e-05, + "loss": 0.2004, + "step": 9271 + }, + { + "epoch": 0.1653765205293761, + "grad_norm": 0.2621375620365143, + "learning_rate": 4.935161769876464e-05, + "loss": 0.1941, + "step": 9272 + }, + { + "epoch": 0.16539435665108979, + "grad_norm": 0.285562127828598, + "learning_rate": 4.9351265461173926e-05, + "loss": 0.2176, + "step": 9273 + }, + { + "epoch": 0.16541219277280347, + "grad_norm": 0.32798701524734497, + "learning_rate": 4.935091312918929e-05, + "loss": 0.2213, + "step": 9274 + }, + { + "epoch": 0.1654300288945172, + "grad_norm": 0.24334648251533508, + "learning_rate": 4.93505607028121e-05, + "loss": 0.2243, + "step": 9275 + }, + { + "epoch": 0.16544786501623088, + "grad_norm": 0.3257021903991699, + "learning_rate": 4.9350208182043734e-05, + "loss": 0.2943, + "step": 9276 + }, + { + "epoch": 0.16546570113794457, + "grad_norm": 0.34047359228134155, + "learning_rate": 4.934985556688555e-05, + "loss": 0.2667, + "step": 9277 + }, + { + "epoch": 0.16548353725965825, + "grad_norm": 0.2405623346567154, + "learning_rate": 4.934950285733892e-05, + "loss": 0.2246, + "step": 9278 + }, + { + "epoch": 0.16550137338137194, + "grad_norm": 0.2989078462123871, + "learning_rate": 4.934915005340519e-05, + "loss": 0.1881, + "step": 9279 + }, + { + "epoch": 0.16551920950308566, + "grad_norm": 0.2507303059101105, + "learning_rate": 4.934879715508576e-05, + "loss": 0.2, + "step": 9280 + }, + { + "epoch": 0.16553704562479934, + "grad_norm": 0.2618032991886139, + "learning_rate": 4.934844416238197e-05, + "loss": 0.1976, + "step": 9281 + }, + { + "epoch": 0.16555488174651303, + "grad_norm": 0.31148505210876465, + "learning_rate": 4.934809107529521e-05, + "loss": 0.2888, + "step": 9282 + }, + { + "epoch": 0.16557271786822672, + "grad_norm": 0.21653443574905396, + "learning_rate": 4.934773789382684e-05, + "loss": 0.2061, + "step": 9283 + }, + { + "epoch": 0.16559055398994044, + "grad_norm": 0.2662765681743622, + "learning_rate": 4.934738461797823e-05, + "loss": 0.2127, + "step": 9284 + }, + { + "epoch": 0.16560839011165412, + "grad_norm": 0.2488943487405777, + "learning_rate": 4.934703124775074e-05, + "loss": 0.1938, + "step": 9285 + }, + { + "epoch": 0.1656262262333678, + "grad_norm": 0.24388650059700012, + "learning_rate": 4.934667778314575e-05, + "loss": 0.1947, + "step": 9286 + }, + { + "epoch": 0.1656440623550815, + "grad_norm": 0.2611672282218933, + "learning_rate": 4.934632422416463e-05, + "loss": 0.2194, + "step": 9287 + }, + { + "epoch": 0.16566189847679522, + "grad_norm": 0.32193028926849365, + "learning_rate": 4.934597057080875e-05, + "loss": 0.163, + "step": 9288 + }, + { + "epoch": 0.1656797345985089, + "grad_norm": 0.38710469007492065, + "learning_rate": 4.934561682307948e-05, + "loss": 0.215, + "step": 9289 + }, + { + "epoch": 0.1656975707202226, + "grad_norm": 0.24965006113052368, + "learning_rate": 4.9345262980978176e-05, + "loss": 0.1624, + "step": 9290 + }, + { + "epoch": 0.16571540684193628, + "grad_norm": 0.21670930087566376, + "learning_rate": 4.934490904450624e-05, + "loss": 0.1711, + "step": 9291 + }, + { + "epoch": 0.16573324296365, + "grad_norm": 0.16519004106521606, + "learning_rate": 4.9344555013665025e-05, + "loss": 0.1714, + "step": 9292 + }, + { + "epoch": 0.16575107908536368, + "grad_norm": 0.2520431876182556, + "learning_rate": 4.93442008884559e-05, + "loss": 0.2168, + "step": 9293 + }, + { + "epoch": 0.16576891520707737, + "grad_norm": 0.21662983298301697, + "learning_rate": 4.9343846668880254e-05, + "loss": 0.2019, + "step": 9294 + }, + { + "epoch": 0.16578675132879106, + "grad_norm": 0.2764663100242615, + "learning_rate": 4.9343492354939444e-05, + "loss": 0.2365, + "step": 9295 + }, + { + "epoch": 0.16580458745050478, + "grad_norm": 0.26178547739982605, + "learning_rate": 4.934313794663485e-05, + "loss": 0.1978, + "step": 9296 + }, + { + "epoch": 0.16582242357221846, + "grad_norm": 0.25380316376686096, + "learning_rate": 4.934278344396785e-05, + "loss": 0.2209, + "step": 9297 + }, + { + "epoch": 0.16584025969393215, + "grad_norm": 0.2863463759422302, + "learning_rate": 4.9342428846939815e-05, + "loss": 0.2536, + "step": 9298 + }, + { + "epoch": 0.16585809581564584, + "grad_norm": 0.29264238476753235, + "learning_rate": 4.934207415555211e-05, + "loss": 0.1947, + "step": 9299 + }, + { + "epoch": 0.16587593193735953, + "grad_norm": 0.27421027421951294, + "learning_rate": 4.934171936980612e-05, + "loss": 0.1866, + "step": 9300 + }, + { + "epoch": 0.16589376805907324, + "grad_norm": 0.24602636694908142, + "learning_rate": 4.9341364489703225e-05, + "loss": 0.1794, + "step": 9301 + }, + { + "epoch": 0.16591160418078693, + "grad_norm": 0.3985421061515808, + "learning_rate": 4.934100951524479e-05, + "loss": 0.2384, + "step": 9302 + }, + { + "epoch": 0.16592944030250062, + "grad_norm": 0.2679640054702759, + "learning_rate": 4.9340654446432185e-05, + "loss": 0.2029, + "step": 9303 + }, + { + "epoch": 0.1659472764242143, + "grad_norm": 0.319214403629303, + "learning_rate": 4.934029928326681e-05, + "loss": 0.2091, + "step": 9304 + }, + { + "epoch": 0.16596511254592802, + "grad_norm": 0.2522352337837219, + "learning_rate": 4.9339944025750024e-05, + "loss": 0.2142, + "step": 9305 + }, + { + "epoch": 0.1659829486676417, + "grad_norm": 0.4243687689304352, + "learning_rate": 4.933958867388321e-05, + "loss": 0.1222, + "step": 9306 + }, + { + "epoch": 0.1660007847893554, + "grad_norm": 0.34678542613983154, + "learning_rate": 4.933923322766774e-05, + "loss": 0.2195, + "step": 9307 + }, + { + "epoch": 0.1660186209110691, + "grad_norm": 0.2175455540418625, + "learning_rate": 4.9338877687104995e-05, + "loss": 0.1939, + "step": 9308 + }, + { + "epoch": 0.1660364570327828, + "grad_norm": 0.34130173921585083, + "learning_rate": 4.9338522052196354e-05, + "loss": 0.1974, + "step": 9309 + }, + { + "epoch": 0.1660542931544965, + "grad_norm": 0.2852207124233246, + "learning_rate": 4.9338166322943205e-05, + "loss": 0.2181, + "step": 9310 + }, + { + "epoch": 0.16607212927621018, + "grad_norm": 0.2939755320549011, + "learning_rate": 4.93378104993469e-05, + "loss": 0.2153, + "step": 9311 + }, + { + "epoch": 0.16608996539792387, + "grad_norm": 0.29781708121299744, + "learning_rate": 4.933745458140885e-05, + "loss": 0.2711, + "step": 9312 + }, + { + "epoch": 0.16610780151963758, + "grad_norm": 0.2629675567150116, + "learning_rate": 4.9337098569130416e-05, + "loss": 0.1836, + "step": 9313 + }, + { + "epoch": 0.16612563764135127, + "grad_norm": 0.28269559144973755, + "learning_rate": 4.9336742462512976e-05, + "loss": 0.2018, + "step": 9314 + }, + { + "epoch": 0.16614347376306496, + "grad_norm": 0.23797199130058289, + "learning_rate": 4.933638626155792e-05, + "loss": 0.1891, + "step": 9315 + }, + { + "epoch": 0.16616130988477865, + "grad_norm": 0.31651830673217773, + "learning_rate": 4.933602996626663e-05, + "loss": 0.219, + "step": 9316 + }, + { + "epoch": 0.16617914600649236, + "grad_norm": 0.24222183227539062, + "learning_rate": 4.9335673576640476e-05, + "loss": 0.1917, + "step": 9317 + }, + { + "epoch": 0.16619698212820605, + "grad_norm": 0.2895123362541199, + "learning_rate": 4.9335317092680856e-05, + "loss": 0.1995, + "step": 9318 + }, + { + "epoch": 0.16621481824991974, + "grad_norm": 0.32943058013916016, + "learning_rate": 4.933496051438913e-05, + "loss": 0.1907, + "step": 9319 + }, + { + "epoch": 0.16623265437163343, + "grad_norm": 0.3121800124645233, + "learning_rate": 4.9334603841766695e-05, + "loss": 0.2735, + "step": 9320 + }, + { + "epoch": 0.16625049049334711, + "grad_norm": 0.2764334976673126, + "learning_rate": 4.933424707481493e-05, + "loss": 0.1593, + "step": 9321 + }, + { + "epoch": 0.16626832661506083, + "grad_norm": 0.2478717565536499, + "learning_rate": 4.933389021353523e-05, + "loss": 0.2118, + "step": 9322 + }, + { + "epoch": 0.16628616273677452, + "grad_norm": 0.25999343395233154, + "learning_rate": 4.9333533257928954e-05, + "loss": 0.2372, + "step": 9323 + }, + { + "epoch": 0.1663039988584882, + "grad_norm": 0.2991454601287842, + "learning_rate": 4.93331762079975e-05, + "loss": 0.2344, + "step": 9324 + }, + { + "epoch": 0.1663218349802019, + "grad_norm": 0.390278160572052, + "learning_rate": 4.933281906374225e-05, + "loss": 0.2002, + "step": 9325 + }, + { + "epoch": 0.1663396711019156, + "grad_norm": 0.2732143998146057, + "learning_rate": 4.93324618251646e-05, + "loss": 0.2408, + "step": 9326 + }, + { + "epoch": 0.1663575072236293, + "grad_norm": 0.2051214575767517, + "learning_rate": 4.9332104492265915e-05, + "loss": 0.1909, + "step": 9327 + }, + { + "epoch": 0.16637534334534299, + "grad_norm": 0.39487937092781067, + "learning_rate": 4.933174706504759e-05, + "loss": 0.2202, + "step": 9328 + }, + { + "epoch": 0.16639317946705667, + "grad_norm": 0.3209646940231323, + "learning_rate": 4.9331389543511006e-05, + "loss": 0.2248, + "step": 9329 + }, + { + "epoch": 0.1664110155887704, + "grad_norm": 0.21034787595272064, + "learning_rate": 4.933103192765756e-05, + "loss": 0.1834, + "step": 9330 + }, + { + "epoch": 0.16642885171048408, + "grad_norm": 0.22905397415161133, + "learning_rate": 4.9330674217488626e-05, + "loss": 0.2089, + "step": 9331 + }, + { + "epoch": 0.16644668783219776, + "grad_norm": 0.26188793778419495, + "learning_rate": 4.9330316413005596e-05, + "loss": 0.2152, + "step": 9332 + }, + { + "epoch": 0.16646452395391145, + "grad_norm": 0.32350656390190125, + "learning_rate": 4.932995851420985e-05, + "loss": 0.2177, + "step": 9333 + }, + { + "epoch": 0.16648236007562517, + "grad_norm": 0.27724212408065796, + "learning_rate": 4.932960052110279e-05, + "loss": 0.222, + "step": 9334 + }, + { + "epoch": 0.16650019619733886, + "grad_norm": 0.26598259806632996, + "learning_rate": 4.93292424336858e-05, + "loss": 0.2034, + "step": 9335 + }, + { + "epoch": 0.16651803231905254, + "grad_norm": 0.26200130581855774, + "learning_rate": 4.932888425196025e-05, + "loss": 0.208, + "step": 9336 + }, + { + "epoch": 0.16653586844076623, + "grad_norm": 0.22515469789505005, + "learning_rate": 4.9328525975927545e-05, + "loss": 0.2054, + "step": 9337 + }, + { + "epoch": 0.16655370456247992, + "grad_norm": 0.2405278980731964, + "learning_rate": 4.932816760558907e-05, + "loss": 0.2007, + "step": 9338 + }, + { + "epoch": 0.16657154068419364, + "grad_norm": 0.2784559726715088, + "learning_rate": 4.9327809140946225e-05, + "loss": 0.2017, + "step": 9339 + }, + { + "epoch": 0.16658937680590732, + "grad_norm": 0.25360438227653503, + "learning_rate": 4.932745058200038e-05, + "loss": 0.2042, + "step": 9340 + }, + { + "epoch": 0.166607212927621, + "grad_norm": 0.24294637143611908, + "learning_rate": 4.932709192875293e-05, + "loss": 0.2125, + "step": 9341 + }, + { + "epoch": 0.1666250490493347, + "grad_norm": 0.2221154272556305, + "learning_rate": 4.9326733181205284e-05, + "loss": 0.1764, + "step": 9342 + }, + { + "epoch": 0.16664288517104842, + "grad_norm": 0.23242053389549255, + "learning_rate": 4.932637433935881e-05, + "loss": 0.1794, + "step": 9343 + }, + { + "epoch": 0.1666607212927621, + "grad_norm": 0.27005764842033386, + "learning_rate": 4.93260154032149e-05, + "loss": 0.1828, + "step": 9344 + }, + { + "epoch": 0.1666785574144758, + "grad_norm": 0.3330439329147339, + "learning_rate": 4.9325656372774966e-05, + "loss": 0.2613, + "step": 9345 + }, + { + "epoch": 0.16669639353618948, + "grad_norm": 0.29532188177108765, + "learning_rate": 4.932529724804037e-05, + "loss": 0.2227, + "step": 9346 + }, + { + "epoch": 0.1667142296579032, + "grad_norm": 0.2895061671733856, + "learning_rate": 4.9324938029012535e-05, + "loss": 0.2338, + "step": 9347 + }, + { + "epoch": 0.16673206577961688, + "grad_norm": 0.23465098440647125, + "learning_rate": 4.932457871569282e-05, + "loss": 0.1812, + "step": 9348 + }, + { + "epoch": 0.16674990190133057, + "grad_norm": 0.2588139474391937, + "learning_rate": 4.932421930808266e-05, + "loss": 0.1632, + "step": 9349 + }, + { + "epoch": 0.16676773802304426, + "grad_norm": 0.2403249442577362, + "learning_rate": 4.932385980618341e-05, + "loss": 0.1964, + "step": 9350 + }, + { + "epoch": 0.16678557414475798, + "grad_norm": 0.2389722466468811, + "learning_rate": 4.9323500209996486e-05, + "loss": 0.1993, + "step": 9351 + }, + { + "epoch": 0.16680341026647166, + "grad_norm": 0.2843533158302307, + "learning_rate": 4.932314051952327e-05, + "loss": 0.2609, + "step": 9352 + }, + { + "epoch": 0.16682124638818535, + "grad_norm": 0.30103957653045654, + "learning_rate": 4.932278073476516e-05, + "loss": 0.2318, + "step": 9353 + }, + { + "epoch": 0.16683908250989904, + "grad_norm": 0.28298187255859375, + "learning_rate": 4.932242085572355e-05, + "loss": 0.2628, + "step": 9354 + }, + { + "epoch": 0.16685691863161276, + "grad_norm": 0.3170102834701538, + "learning_rate": 4.9322060882399836e-05, + "loss": 0.2369, + "step": 9355 + }, + { + "epoch": 0.16687475475332644, + "grad_norm": 0.30096954107284546, + "learning_rate": 4.9321700814795414e-05, + "loss": 0.2139, + "step": 9356 + }, + { + "epoch": 0.16689259087504013, + "grad_norm": 0.26875007152557373, + "learning_rate": 4.9321340652911686e-05, + "loss": 0.1966, + "step": 9357 + }, + { + "epoch": 0.16691042699675382, + "grad_norm": 0.356659471988678, + "learning_rate": 4.932098039675003e-05, + "loss": 0.1979, + "step": 9358 + }, + { + "epoch": 0.1669282631184675, + "grad_norm": 0.22786852717399597, + "learning_rate": 4.932062004631186e-05, + "loss": 0.2311, + "step": 9359 + }, + { + "epoch": 0.16694609924018122, + "grad_norm": 0.2709221839904785, + "learning_rate": 4.932025960159857e-05, + "loss": 0.2024, + "step": 9360 + }, + { + "epoch": 0.1669639353618949, + "grad_norm": 0.2221805602312088, + "learning_rate": 4.931989906261155e-05, + "loss": 0.1716, + "step": 9361 + }, + { + "epoch": 0.1669817714836086, + "grad_norm": 0.2553713619709015, + "learning_rate": 4.93195384293522e-05, + "loss": 0.1863, + "step": 9362 + }, + { + "epoch": 0.1669996076053223, + "grad_norm": 0.32181254029273987, + "learning_rate": 4.931917770182192e-05, + "loss": 0.2268, + "step": 9363 + }, + { + "epoch": 0.167017443727036, + "grad_norm": 0.23647813498973846, + "learning_rate": 4.9318816880022106e-05, + "loss": 0.1974, + "step": 9364 + }, + { + "epoch": 0.1670352798487497, + "grad_norm": 0.2741914689540863, + "learning_rate": 4.931845596395416e-05, + "loss": 0.2122, + "step": 9365 + }, + { + "epoch": 0.16705311597046338, + "grad_norm": 0.3124186098575592, + "learning_rate": 4.931809495361948e-05, + "loss": 0.2283, + "step": 9366 + }, + { + "epoch": 0.16707095209217707, + "grad_norm": 0.4242728650569916, + "learning_rate": 4.9317733849019464e-05, + "loss": 0.2164, + "step": 9367 + }, + { + "epoch": 0.16708878821389078, + "grad_norm": 0.23358015716075897, + "learning_rate": 4.9317372650155514e-05, + "loss": 0.1944, + "step": 9368 + }, + { + "epoch": 0.16710662433560447, + "grad_norm": 0.24014799296855927, + "learning_rate": 4.931701135702903e-05, + "loss": 0.1961, + "step": 9369 + }, + { + "epoch": 0.16712446045731816, + "grad_norm": 0.2570462226867676, + "learning_rate": 4.93166499696414e-05, + "loss": 0.2115, + "step": 9370 + }, + { + "epoch": 0.16714229657903185, + "grad_norm": 0.24118487536907196, + "learning_rate": 4.931628848799405e-05, + "loss": 0.2014, + "step": 9371 + }, + { + "epoch": 0.16716013270074556, + "grad_norm": 0.3396017849445343, + "learning_rate": 4.931592691208836e-05, + "loss": 0.1809, + "step": 9372 + }, + { + "epoch": 0.16717796882245925, + "grad_norm": 0.37545526027679443, + "learning_rate": 4.9315565241925746e-05, + "loss": 0.2441, + "step": 9373 + }, + { + "epoch": 0.16719580494417294, + "grad_norm": 0.27863696217536926, + "learning_rate": 4.93152034775076e-05, + "loss": 0.2231, + "step": 9374 + }, + { + "epoch": 0.16721364106588663, + "grad_norm": 0.27867764234542847, + "learning_rate": 4.931484161883532e-05, + "loss": 0.2506, + "step": 9375 + }, + { + "epoch": 0.16723147718760034, + "grad_norm": 0.2119695246219635, + "learning_rate": 4.9314479665910326e-05, + "loss": 0.194, + "step": 9376 + }, + { + "epoch": 0.16724931330931403, + "grad_norm": 0.2885444462299347, + "learning_rate": 4.9314117618734e-05, + "loss": 0.2483, + "step": 9377 + }, + { + "epoch": 0.16726714943102772, + "grad_norm": 0.2301822006702423, + "learning_rate": 4.931375547730777e-05, + "loss": 0.1828, + "step": 9378 + }, + { + "epoch": 0.1672849855527414, + "grad_norm": 0.3811148405075073, + "learning_rate": 4.931339324163301e-05, + "loss": 0.2558, + "step": 9379 + }, + { + "epoch": 0.1673028216744551, + "grad_norm": 0.24624472856521606, + "learning_rate": 4.931303091171115e-05, + "loss": 0.2141, + "step": 9380 + }, + { + "epoch": 0.1673206577961688, + "grad_norm": 0.3223497271537781, + "learning_rate": 4.9312668487543584e-05, + "loss": 0.2806, + "step": 9381 + }, + { + "epoch": 0.1673384939178825, + "grad_norm": 0.31021153926849365, + "learning_rate": 4.9312305969131716e-05, + "loss": 0.2328, + "step": 9382 + }, + { + "epoch": 0.16735633003959619, + "grad_norm": 0.2613779306411743, + "learning_rate": 4.9311943356476956e-05, + "loss": 0.2023, + "step": 9383 + }, + { + "epoch": 0.16737416616130987, + "grad_norm": 0.3111892640590668, + "learning_rate": 4.93115806495807e-05, + "loss": 0.2871, + "step": 9384 + }, + { + "epoch": 0.1673920022830236, + "grad_norm": 0.3307771384716034, + "learning_rate": 4.931121784844437e-05, + "loss": 0.2521, + "step": 9385 + }, + { + "epoch": 0.16740983840473728, + "grad_norm": 0.2541097104549408, + "learning_rate": 4.931085495306935e-05, + "loss": 0.209, + "step": 9386 + }, + { + "epoch": 0.16742767452645096, + "grad_norm": 0.29149866104125977, + "learning_rate": 4.9310491963457074e-05, + "loss": 0.2009, + "step": 9387 + }, + { + "epoch": 0.16744551064816465, + "grad_norm": 0.2406076043844223, + "learning_rate": 4.9310128879608924e-05, + "loss": 0.189, + "step": 9388 + }, + { + "epoch": 0.16746334676987837, + "grad_norm": 0.3291756212711334, + "learning_rate": 4.9309765701526325e-05, + "loss": 0.2816, + "step": 9389 + }, + { + "epoch": 0.16748118289159206, + "grad_norm": 0.1909375935792923, + "learning_rate": 4.9309402429210674e-05, + "loss": 0.1709, + "step": 9390 + }, + { + "epoch": 0.16749901901330574, + "grad_norm": 0.23396065831184387, + "learning_rate": 4.9309039062663374e-05, + "loss": 0.1379, + "step": 9391 + }, + { + "epoch": 0.16751685513501943, + "grad_norm": 0.2677765190601349, + "learning_rate": 4.9308675601885853e-05, + "loss": 0.2063, + "step": 9392 + }, + { + "epoch": 0.16753469125673315, + "grad_norm": 0.3470064401626587, + "learning_rate": 4.930831204687951e-05, + "loss": 0.2567, + "step": 9393 + }, + { + "epoch": 0.16755252737844684, + "grad_norm": 0.2177836298942566, + "learning_rate": 4.930794839764575e-05, + "loss": 0.194, + "step": 9394 + }, + { + "epoch": 0.16757036350016052, + "grad_norm": 0.26476576924324036, + "learning_rate": 4.930758465418599e-05, + "loss": 0.2522, + "step": 9395 + }, + { + "epoch": 0.1675881996218742, + "grad_norm": 0.19955436885356903, + "learning_rate": 4.9307220816501634e-05, + "loss": 0.18, + "step": 9396 + }, + { + "epoch": 0.16760603574358793, + "grad_norm": 0.41708457469940186, + "learning_rate": 4.93068568845941e-05, + "loss": 0.2566, + "step": 9397 + }, + { + "epoch": 0.16762387186530162, + "grad_norm": 0.2114209681749344, + "learning_rate": 4.930649285846478e-05, + "loss": 0.1735, + "step": 9398 + }, + { + "epoch": 0.1676417079870153, + "grad_norm": 0.2221326380968094, + "learning_rate": 4.930612873811511e-05, + "loss": 0.2334, + "step": 9399 + }, + { + "epoch": 0.167659544108729, + "grad_norm": 0.22841550409793854, + "learning_rate": 4.930576452354649e-05, + "loss": 0.2472, + "step": 9400 + }, + { + "epoch": 0.16767738023044268, + "grad_norm": 0.2357344925403595, + "learning_rate": 4.930540021476032e-05, + "loss": 0.1603, + "step": 9401 + }, + { + "epoch": 0.1676952163521564, + "grad_norm": 0.33083274960517883, + "learning_rate": 4.9305035811758035e-05, + "loss": 0.1738, + "step": 9402 + }, + { + "epoch": 0.16771305247387008, + "grad_norm": 0.2918514311313629, + "learning_rate": 4.930467131454104e-05, + "loss": 0.2345, + "step": 9403 + }, + { + "epoch": 0.16773088859558377, + "grad_norm": 0.3187107443809509, + "learning_rate": 4.930430672311074e-05, + "loss": 0.2117, + "step": 9404 + }, + { + "epoch": 0.16774872471729746, + "grad_norm": 0.21408715844154358, + "learning_rate": 4.9303942037468545e-05, + "loss": 0.208, + "step": 9405 + }, + { + "epoch": 0.16776656083901118, + "grad_norm": 0.2944090962409973, + "learning_rate": 4.9303577257615886e-05, + "loss": 0.2434, + "step": 9406 + }, + { + "epoch": 0.16778439696072486, + "grad_norm": 0.29843562841415405, + "learning_rate": 4.930321238355416e-05, + "loss": 0.1755, + "step": 9407 + }, + { + "epoch": 0.16780223308243855, + "grad_norm": 0.26216739416122437, + "learning_rate": 4.930284741528479e-05, + "loss": 0.1796, + "step": 9408 + }, + { + "epoch": 0.16782006920415224, + "grad_norm": 0.27436238527297974, + "learning_rate": 4.93024823528092e-05, + "loss": 0.2511, + "step": 9409 + }, + { + "epoch": 0.16783790532586595, + "grad_norm": 0.23143106698989868, + "learning_rate": 4.930211719612878e-05, + "loss": 0.2406, + "step": 9410 + }, + { + "epoch": 0.16785574144757964, + "grad_norm": 0.21253831684589386, + "learning_rate": 4.930175194524497e-05, + "loss": 0.1675, + "step": 9411 + }, + { + "epoch": 0.16787357756929333, + "grad_norm": 0.2958551347255707, + "learning_rate": 4.9301386600159174e-05, + "loss": 0.2471, + "step": 9412 + }, + { + "epoch": 0.16789141369100702, + "grad_norm": 0.2436182200908661, + "learning_rate": 4.9301021160872806e-05, + "loss": 0.1959, + "step": 9413 + }, + { + "epoch": 0.16790924981272073, + "grad_norm": 0.4054587185382843, + "learning_rate": 4.930065562738729e-05, + "loss": 0.2134, + "step": 9414 + }, + { + "epoch": 0.16792708593443442, + "grad_norm": 0.2374972403049469, + "learning_rate": 4.930028999970404e-05, + "loss": 0.1883, + "step": 9415 + }, + { + "epoch": 0.1679449220561481, + "grad_norm": 0.3502541780471802, + "learning_rate": 4.929992427782447e-05, + "loss": 0.2559, + "step": 9416 + }, + { + "epoch": 0.1679627581778618, + "grad_norm": 0.2598343789577484, + "learning_rate": 4.9299558461750006e-05, + "loss": 0.2454, + "step": 9417 + }, + { + "epoch": 0.16798059429957551, + "grad_norm": 0.4071826934814453, + "learning_rate": 4.929919255148205e-05, + "loss": 0.2169, + "step": 9418 + }, + { + "epoch": 0.1679984304212892, + "grad_norm": 0.247068390250206, + "learning_rate": 4.929882654702205e-05, + "loss": 0.2036, + "step": 9419 + }, + { + "epoch": 0.1680162665430029, + "grad_norm": 0.2935871481895447, + "learning_rate": 4.929846044837139e-05, + "loss": 0.1573, + "step": 9420 + }, + { + "epoch": 0.16803410266471658, + "grad_norm": 0.40864014625549316, + "learning_rate": 4.929809425553151e-05, + "loss": 0.2359, + "step": 9421 + }, + { + "epoch": 0.16805193878643027, + "grad_norm": 0.2923342287540436, + "learning_rate": 4.929772796850382e-05, + "loss": 0.1903, + "step": 9422 + }, + { + "epoch": 0.16806977490814398, + "grad_norm": 0.3476279079914093, + "learning_rate": 4.9297361587289745e-05, + "loss": 0.1812, + "step": 9423 + }, + { + "epoch": 0.16808761102985767, + "grad_norm": 0.3068753778934479, + "learning_rate": 4.9296995111890707e-05, + "loss": 0.1785, + "step": 9424 + }, + { + "epoch": 0.16810544715157136, + "grad_norm": 0.2985677719116211, + "learning_rate": 4.929662854230813e-05, + "loss": 0.2138, + "step": 9425 + }, + { + "epoch": 0.16812328327328505, + "grad_norm": 0.2700032889842987, + "learning_rate": 4.929626187854342e-05, + "loss": 0.2015, + "step": 9426 + }, + { + "epoch": 0.16814111939499876, + "grad_norm": 0.31977972388267517, + "learning_rate": 4.929589512059801e-05, + "loss": 0.2467, + "step": 9427 + }, + { + "epoch": 0.16815895551671245, + "grad_norm": 0.28535133600234985, + "learning_rate": 4.929552826847332e-05, + "loss": 0.2166, + "step": 9428 + }, + { + "epoch": 0.16817679163842614, + "grad_norm": 0.4397616386413574, + "learning_rate": 4.9295161322170766e-05, + "loss": 0.2034, + "step": 9429 + }, + { + "epoch": 0.16819462776013983, + "grad_norm": 0.245170459151268, + "learning_rate": 4.929479428169178e-05, + "loss": 0.2384, + "step": 9430 + }, + { + "epoch": 0.16821246388185354, + "grad_norm": 0.3002978265285492, + "learning_rate": 4.929442714703778e-05, + "loss": 0.1601, + "step": 9431 + }, + { + "epoch": 0.16823030000356723, + "grad_norm": 0.3409084677696228, + "learning_rate": 4.929405991821019e-05, + "loss": 0.1642, + "step": 9432 + }, + { + "epoch": 0.16824813612528092, + "grad_norm": 0.2634219229221344, + "learning_rate": 4.9293692595210435e-05, + "loss": 0.27, + "step": 9433 + }, + { + "epoch": 0.1682659722469946, + "grad_norm": 0.3201228678226471, + "learning_rate": 4.929332517803993e-05, + "loss": 0.2406, + "step": 9434 + }, + { + "epoch": 0.16828380836870832, + "grad_norm": 0.21339882910251617, + "learning_rate": 4.9292957666700113e-05, + "loss": 0.1946, + "step": 9435 + }, + { + "epoch": 0.168301644490422, + "grad_norm": 0.2794226109981537, + "learning_rate": 4.929259006119239e-05, + "loss": 0.2107, + "step": 9436 + }, + { + "epoch": 0.1683194806121357, + "grad_norm": 0.2748399078845978, + "learning_rate": 4.929222236151821e-05, + "loss": 0.2781, + "step": 9437 + }, + { + "epoch": 0.16833731673384938, + "grad_norm": 0.293453186750412, + "learning_rate": 4.929185456767898e-05, + "loss": 0.1632, + "step": 9438 + }, + { + "epoch": 0.16835515285556307, + "grad_norm": 0.31061115860939026, + "learning_rate": 4.929148667967613e-05, + "loss": 0.1794, + "step": 9439 + }, + { + "epoch": 0.1683729889772768, + "grad_norm": 0.35332250595092773, + "learning_rate": 4.9291118697511096e-05, + "loss": 0.1995, + "step": 9440 + }, + { + "epoch": 0.16839082509899048, + "grad_norm": 0.4127959907054901, + "learning_rate": 4.929075062118529e-05, + "loss": 0.2784, + "step": 9441 + }, + { + "epoch": 0.16840866122070416, + "grad_norm": 0.24581199884414673, + "learning_rate": 4.929038245070015e-05, + "loss": 0.1807, + "step": 9442 + }, + { + "epoch": 0.16842649734241785, + "grad_norm": 0.2739531993865967, + "learning_rate": 4.929001418605709e-05, + "loss": 0.263, + "step": 9443 + }, + { + "epoch": 0.16844433346413157, + "grad_norm": 0.23609992861747742, + "learning_rate": 4.928964582725755e-05, + "loss": 0.1981, + "step": 9444 + }, + { + "epoch": 0.16846216958584526, + "grad_norm": 0.2891889810562134, + "learning_rate": 4.928927737430296e-05, + "loss": 0.2169, + "step": 9445 + }, + { + "epoch": 0.16848000570755894, + "grad_norm": 0.2761732339859009, + "learning_rate": 4.928890882719472e-05, + "loss": 0.2447, + "step": 9446 + }, + { + "epoch": 0.16849784182927263, + "grad_norm": 0.23232130706310272, + "learning_rate": 4.9288540185934307e-05, + "loss": 0.1814, + "step": 9447 + }, + { + "epoch": 0.16851567795098635, + "grad_norm": 0.23173119127750397, + "learning_rate": 4.928817145052311e-05, + "loss": 0.2003, + "step": 9448 + }, + { + "epoch": 0.16853351407270004, + "grad_norm": 0.2666068971157074, + "learning_rate": 4.928780262096257e-05, + "loss": 0.2011, + "step": 9449 + }, + { + "epoch": 0.16855135019441372, + "grad_norm": 0.2718893885612488, + "learning_rate": 4.928743369725412e-05, + "loss": 0.2074, + "step": 9450 + }, + { + "epoch": 0.1685691863161274, + "grad_norm": 0.3470384180545807, + "learning_rate": 4.9287064679399184e-05, + "loss": 0.1858, + "step": 9451 + }, + { + "epoch": 0.16858702243784113, + "grad_norm": 0.3397728502750397, + "learning_rate": 4.928669556739921e-05, + "loss": 0.2754, + "step": 9452 + }, + { + "epoch": 0.16860485855955482, + "grad_norm": 0.2192605584859848, + "learning_rate": 4.92863263612556e-05, + "loss": 0.2343, + "step": 9453 + }, + { + "epoch": 0.1686226946812685, + "grad_norm": 0.2916926145553589, + "learning_rate": 4.928595706096981e-05, + "loss": 0.2249, + "step": 9454 + }, + { + "epoch": 0.1686405308029822, + "grad_norm": 0.3425137996673584, + "learning_rate": 4.928558766654326e-05, + "loss": 0.26, + "step": 9455 + }, + { + "epoch": 0.1686583669246959, + "grad_norm": 0.23316988348960876, + "learning_rate": 4.928521817797739e-05, + "loss": 0.1864, + "step": 9456 + }, + { + "epoch": 0.1686762030464096, + "grad_norm": 0.34133097529411316, + "learning_rate": 4.928484859527362e-05, + "loss": 0.2389, + "step": 9457 + }, + { + "epoch": 0.16869403916812328, + "grad_norm": 0.34185847640037537, + "learning_rate": 4.9284478918433385e-05, + "loss": 0.172, + "step": 9458 + }, + { + "epoch": 0.16871187528983697, + "grad_norm": 0.34840038418769836, + "learning_rate": 4.928410914745813e-05, + "loss": 0.2312, + "step": 9459 + }, + { + "epoch": 0.16872971141155066, + "grad_norm": 0.4255453944206238, + "learning_rate": 4.928373928234928e-05, + "loss": 0.2366, + "step": 9460 + }, + { + "epoch": 0.16874754753326437, + "grad_norm": 0.2785493731498718, + "learning_rate": 4.9283369323108265e-05, + "loss": 0.219, + "step": 9461 + }, + { + "epoch": 0.16876538365497806, + "grad_norm": 0.21891143918037415, + "learning_rate": 4.928299926973653e-05, + "loss": 0.1987, + "step": 9462 + }, + { + "epoch": 0.16878321977669175, + "grad_norm": 0.2747426927089691, + "learning_rate": 4.9282629122235495e-05, + "loss": 0.233, + "step": 9463 + }, + { + "epoch": 0.16880105589840544, + "grad_norm": 0.266500860452652, + "learning_rate": 4.928225888060661e-05, + "loss": 0.1999, + "step": 9464 + }, + { + "epoch": 0.16881889202011915, + "grad_norm": 0.308432936668396, + "learning_rate": 4.9281888544851294e-05, + "loss": 0.2153, + "step": 9465 + }, + { + "epoch": 0.16883672814183284, + "grad_norm": 0.3269672691822052, + "learning_rate": 4.9281518114970996e-05, + "loss": 0.1926, + "step": 9466 + }, + { + "epoch": 0.16885456426354653, + "grad_norm": 0.30079373717308044, + "learning_rate": 4.928114759096715e-05, + "loss": 0.226, + "step": 9467 + }, + { + "epoch": 0.16887240038526022, + "grad_norm": 0.41163352131843567, + "learning_rate": 4.9280776972841184e-05, + "loss": 0.2061, + "step": 9468 + }, + { + "epoch": 0.16889023650697393, + "grad_norm": 0.24012619256973267, + "learning_rate": 4.9280406260594545e-05, + "loss": 0.1878, + "step": 9469 + }, + { + "epoch": 0.16890807262868762, + "grad_norm": 0.24753014743328094, + "learning_rate": 4.928003545422866e-05, + "loss": 0.2083, + "step": 9470 + }, + { + "epoch": 0.1689259087504013, + "grad_norm": 0.24844524264335632, + "learning_rate": 4.9279664553744974e-05, + "loss": 0.1917, + "step": 9471 + }, + { + "epoch": 0.168943744872115, + "grad_norm": 0.2850281298160553, + "learning_rate": 4.927929355914492e-05, + "loss": 0.2142, + "step": 9472 + }, + { + "epoch": 0.1689615809938287, + "grad_norm": 0.35059431195259094, + "learning_rate": 4.927892247042994e-05, + "loss": 0.1987, + "step": 9473 + }, + { + "epoch": 0.1689794171155424, + "grad_norm": 0.22267958521842957, + "learning_rate": 4.9278551287601484e-05, + "loss": 0.2018, + "step": 9474 + }, + { + "epoch": 0.1689972532372561, + "grad_norm": 0.23396602272987366, + "learning_rate": 4.927818001066096e-05, + "loss": 0.2111, + "step": 9475 + }, + { + "epoch": 0.16901508935896978, + "grad_norm": 0.3829486668109894, + "learning_rate": 4.927780863960984e-05, + "loss": 0.1948, + "step": 9476 + }, + { + "epoch": 0.1690329254806835, + "grad_norm": 0.3118758201599121, + "learning_rate": 4.927743717444953e-05, + "loss": 0.1669, + "step": 9477 + }, + { + "epoch": 0.16905076160239718, + "grad_norm": 0.3104802966117859, + "learning_rate": 4.92770656151815e-05, + "loss": 0.204, + "step": 9478 + }, + { + "epoch": 0.16906859772411087, + "grad_norm": 0.3120456337928772, + "learning_rate": 4.9276693961807175e-05, + "loss": 0.2165, + "step": 9479 + }, + { + "epoch": 0.16908643384582456, + "grad_norm": 0.23897507786750793, + "learning_rate": 4.9276322214327994e-05, + "loss": 0.1911, + "step": 9480 + }, + { + "epoch": 0.16910426996753825, + "grad_norm": 0.3869253098964691, + "learning_rate": 4.927595037274542e-05, + "loss": 0.2543, + "step": 9481 + }, + { + "epoch": 0.16912210608925196, + "grad_norm": 0.46600019931793213, + "learning_rate": 4.927557843706086e-05, + "loss": 0.2489, + "step": 9482 + }, + { + "epoch": 0.16913994221096565, + "grad_norm": 0.26517653465270996, + "learning_rate": 4.9275206407275784e-05, + "loss": 0.2443, + "step": 9483 + }, + { + "epoch": 0.16915777833267934, + "grad_norm": 0.20588456094264984, + "learning_rate": 4.927483428339162e-05, + "loss": 0.1974, + "step": 9484 + }, + { + "epoch": 0.16917561445439303, + "grad_norm": 0.2681596279144287, + "learning_rate": 4.927446206540981e-05, + "loss": 0.2339, + "step": 9485 + }, + { + "epoch": 0.16919345057610674, + "grad_norm": 0.2703278660774231, + "learning_rate": 4.927408975333181e-05, + "loss": 0.2253, + "step": 9486 + }, + { + "epoch": 0.16921128669782043, + "grad_norm": 0.3100188076496124, + "learning_rate": 4.927371734715905e-05, + "loss": 0.2053, + "step": 9487 + }, + { + "epoch": 0.16922912281953412, + "grad_norm": 0.2778243124485016, + "learning_rate": 4.927334484689298e-05, + "loss": 0.2267, + "step": 9488 + }, + { + "epoch": 0.1692469589412478, + "grad_norm": 0.27329525351524353, + "learning_rate": 4.927297225253503e-05, + "loss": 0.1882, + "step": 9489 + }, + { + "epoch": 0.16926479506296152, + "grad_norm": 0.3512638509273529, + "learning_rate": 4.927259956408667e-05, + "loss": 0.2209, + "step": 9490 + }, + { + "epoch": 0.1692826311846752, + "grad_norm": 0.23508402705192566, + "learning_rate": 4.927222678154932e-05, + "loss": 0.2112, + "step": 9491 + }, + { + "epoch": 0.1693004673063889, + "grad_norm": 0.24606750905513763, + "learning_rate": 4.927185390492445e-05, + "loss": 0.22, + "step": 9492 + }, + { + "epoch": 0.16931830342810258, + "grad_norm": 0.2985594570636749, + "learning_rate": 4.927148093421348e-05, + "loss": 0.1979, + "step": 9493 + }, + { + "epoch": 0.1693361395498163, + "grad_norm": 0.4615451395511627, + "learning_rate": 4.927110786941786e-05, + "loss": 0.2043, + "step": 9494 + }, + { + "epoch": 0.16935397567153, + "grad_norm": 0.2040075808763504, + "learning_rate": 4.927073471053906e-05, + "loss": 0.214, + "step": 9495 + }, + { + "epoch": 0.16937181179324368, + "grad_norm": 0.2320394665002823, + "learning_rate": 4.9270361457578496e-05, + "loss": 0.231, + "step": 9496 + }, + { + "epoch": 0.16938964791495736, + "grad_norm": 0.17597275972366333, + "learning_rate": 4.926998811053763e-05, + "loss": 0.1999, + "step": 9497 + }, + { + "epoch": 0.16940748403667108, + "grad_norm": 0.2441011518239975, + "learning_rate": 4.9269614669417916e-05, + "loss": 0.1758, + "step": 9498 + }, + { + "epoch": 0.16942532015838477, + "grad_norm": 0.23479744791984558, + "learning_rate": 4.926924113422079e-05, + "loss": 0.2184, + "step": 9499 + }, + { + "epoch": 0.16944315628009846, + "grad_norm": 0.2610747516155243, + "learning_rate": 4.92688675049477e-05, + "loss": 0.2098, + "step": 9500 + }, + { + "epoch": 0.16946099240181214, + "grad_norm": 0.2525850236415863, + "learning_rate": 4.92684937816001e-05, + "loss": 0.1759, + "step": 9501 + }, + { + "epoch": 0.16947882852352583, + "grad_norm": 0.30792132019996643, + "learning_rate": 4.9268119964179436e-05, + "loss": 0.2316, + "step": 9502 + }, + { + "epoch": 0.16949666464523955, + "grad_norm": 0.31012576818466187, + "learning_rate": 4.926774605268715e-05, + "loss": 0.2273, + "step": 9503 + }, + { + "epoch": 0.16951450076695324, + "grad_norm": 0.2511320412158966, + "learning_rate": 4.9267372047124704e-05, + "loss": 0.1761, + "step": 9504 + }, + { + "epoch": 0.16953233688866692, + "grad_norm": 0.27153435349464417, + "learning_rate": 4.926699794749354e-05, + "loss": 0.2474, + "step": 9505 + }, + { + "epoch": 0.1695501730103806, + "grad_norm": 0.3075025975704193, + "learning_rate": 4.9266623753795114e-05, + "loss": 0.2206, + "step": 9506 + }, + { + "epoch": 0.16956800913209433, + "grad_norm": 0.29549577832221985, + "learning_rate": 4.926624946603087e-05, + "loss": 0.2054, + "step": 9507 + }, + { + "epoch": 0.16958584525380802, + "grad_norm": 0.33228665590286255, + "learning_rate": 4.926587508420227e-05, + "loss": 0.2292, + "step": 9508 + }, + { + "epoch": 0.1696036813755217, + "grad_norm": 0.30444657802581787, + "learning_rate": 4.926550060831074e-05, + "loss": 0.1954, + "step": 9509 + }, + { + "epoch": 0.1696215174972354, + "grad_norm": 0.2660526931285858, + "learning_rate": 4.9265126038357767e-05, + "loss": 0.192, + "step": 9510 + }, + { + "epoch": 0.1696393536189491, + "grad_norm": 0.33575963973999023, + "learning_rate": 4.926475137434478e-05, + "loss": 0.219, + "step": 9511 + }, + { + "epoch": 0.1696571897406628, + "grad_norm": 0.38330385088920593, + "learning_rate": 4.926437661627323e-05, + "loss": 0.259, + "step": 9512 + }, + { + "epoch": 0.16967502586237648, + "grad_norm": 0.26194047927856445, + "learning_rate": 4.926400176414458e-05, + "loss": 0.2118, + "step": 9513 + }, + { + "epoch": 0.16969286198409017, + "grad_norm": 0.37973135709762573, + "learning_rate": 4.9263626817960274e-05, + "loss": 0.2161, + "step": 9514 + }, + { + "epoch": 0.1697106981058039, + "grad_norm": 0.2892526388168335, + "learning_rate": 4.926325177772177e-05, + "loss": 0.2438, + "step": 9515 + }, + { + "epoch": 0.16972853422751757, + "grad_norm": 0.2314528524875641, + "learning_rate": 4.926287664343052e-05, + "loss": 0.2021, + "step": 9516 + }, + { + "epoch": 0.16974637034923126, + "grad_norm": 0.27935171127319336, + "learning_rate": 4.926250141508799e-05, + "loss": 0.2135, + "step": 9517 + }, + { + "epoch": 0.16976420647094495, + "grad_norm": 0.3179842233657837, + "learning_rate": 4.926212609269562e-05, + "loss": 0.2195, + "step": 9518 + }, + { + "epoch": 0.16978204259265864, + "grad_norm": 0.31895214319229126, + "learning_rate": 4.926175067625487e-05, + "loss": 0.1969, + "step": 9519 + }, + { + "epoch": 0.16979987871437235, + "grad_norm": 0.24014034867286682, + "learning_rate": 4.926137516576719e-05, + "loss": 0.201, + "step": 9520 + }, + { + "epoch": 0.16981771483608604, + "grad_norm": 0.3354727029800415, + "learning_rate": 4.926099956123404e-05, + "loss": 0.2192, + "step": 9521 + }, + { + "epoch": 0.16983555095779973, + "grad_norm": 0.4670446515083313, + "learning_rate": 4.926062386265689e-05, + "loss": 0.2549, + "step": 9522 + }, + { + "epoch": 0.16985338707951342, + "grad_norm": 0.22529637813568115, + "learning_rate": 4.926024807003716e-05, + "loss": 0.2162, + "step": 9523 + }, + { + "epoch": 0.16987122320122713, + "grad_norm": 0.28081199526786804, + "learning_rate": 4.925987218337635e-05, + "loss": 0.249, + "step": 9524 + }, + { + "epoch": 0.16988905932294082, + "grad_norm": 0.21510134637355804, + "learning_rate": 4.925949620267589e-05, + "loss": 0.1642, + "step": 9525 + }, + { + "epoch": 0.1699068954446545, + "grad_norm": 0.23330473899841309, + "learning_rate": 4.925912012793724e-05, + "loss": 0.1789, + "step": 9526 + }, + { + "epoch": 0.1699247315663682, + "grad_norm": 0.2610885798931122, + "learning_rate": 4.9258743959161855e-05, + "loss": 0.2157, + "step": 9527 + }, + { + "epoch": 0.1699425676880819, + "grad_norm": 0.1969381719827652, + "learning_rate": 4.925836769635121e-05, + "loss": 0.2244, + "step": 9528 + }, + { + "epoch": 0.1699604038097956, + "grad_norm": 0.289852112531662, + "learning_rate": 4.9257991339506754e-05, + "loss": 0.2412, + "step": 9529 + }, + { + "epoch": 0.1699782399315093, + "grad_norm": 0.32819539308547974, + "learning_rate": 4.925761488862994e-05, + "loss": 0.2586, + "step": 9530 + }, + { + "epoch": 0.16999607605322298, + "grad_norm": 0.3206731677055359, + "learning_rate": 4.9257238343722233e-05, + "loss": 0.23, + "step": 9531 + }, + { + "epoch": 0.1700139121749367, + "grad_norm": 0.28485774993896484, + "learning_rate": 4.925686170478509e-05, + "loss": 0.1996, + "step": 9532 + }, + { + "epoch": 0.17003174829665038, + "grad_norm": 0.3539358377456665, + "learning_rate": 4.9256484971819984e-05, + "loss": 0.2612, + "step": 9533 + }, + { + "epoch": 0.17004958441836407, + "grad_norm": 0.23671498894691467, + "learning_rate": 4.9256108144828356e-05, + "loss": 0.1673, + "step": 9534 + }, + { + "epoch": 0.17006742054007776, + "grad_norm": 0.3517628014087677, + "learning_rate": 4.925573122381167e-05, + "loss": 0.2546, + "step": 9535 + }, + { + "epoch": 0.17008525666179147, + "grad_norm": 0.24914629757404327, + "learning_rate": 4.9255354208771406e-05, + "loss": 0.1779, + "step": 9536 + }, + { + "epoch": 0.17010309278350516, + "grad_norm": 0.46196451783180237, + "learning_rate": 4.9254977099709e-05, + "loss": 0.1753, + "step": 9537 + }, + { + "epoch": 0.17012092890521885, + "grad_norm": 0.21600893139839172, + "learning_rate": 4.925459989662593e-05, + "loss": 0.2025, + "step": 9538 + }, + { + "epoch": 0.17013876502693254, + "grad_norm": 0.290483295917511, + "learning_rate": 4.925422259952365e-05, + "loss": 0.2166, + "step": 9539 + }, + { + "epoch": 0.17015660114864622, + "grad_norm": 0.38096922636032104, + "learning_rate": 4.925384520840363e-05, + "loss": 0.23, + "step": 9540 + }, + { + "epoch": 0.17017443727035994, + "grad_norm": 0.2617246210575104, + "learning_rate": 4.925346772326733e-05, + "loss": 0.1999, + "step": 9541 + }, + { + "epoch": 0.17019227339207363, + "grad_norm": 0.26019787788391113, + "learning_rate": 4.925309014411621e-05, + "loss": 0.2342, + "step": 9542 + }, + { + "epoch": 0.17021010951378732, + "grad_norm": 0.2577129900455475, + "learning_rate": 4.925271247095173e-05, + "loss": 0.1996, + "step": 9543 + }, + { + "epoch": 0.170227945635501, + "grad_norm": 0.31563353538513184, + "learning_rate": 4.925233470377537e-05, + "loss": 0.1928, + "step": 9544 + }, + { + "epoch": 0.17024578175721472, + "grad_norm": 0.3143184781074524, + "learning_rate": 4.9251956842588574e-05, + "loss": 0.2377, + "step": 9545 + }, + { + "epoch": 0.1702636178789284, + "grad_norm": 0.22418881952762604, + "learning_rate": 4.925157888739282e-05, + "loss": 0.1899, + "step": 9546 + }, + { + "epoch": 0.1702814540006421, + "grad_norm": 0.40662509202957153, + "learning_rate": 4.9251200838189585e-05, + "loss": 0.1679, + "step": 9547 + }, + { + "epoch": 0.17029929012235578, + "grad_norm": 0.1995219886302948, + "learning_rate": 4.92508226949803e-05, + "loss": 0.1832, + "step": 9548 + }, + { + "epoch": 0.1703171262440695, + "grad_norm": 0.24337337911128998, + "learning_rate": 4.925044445776646e-05, + "loss": 0.2133, + "step": 9549 + }, + { + "epoch": 0.1703349623657832, + "grad_norm": 0.35046741366386414, + "learning_rate": 4.925006612654952e-05, + "loss": 0.2338, + "step": 9550 + }, + { + "epoch": 0.17035279848749688, + "grad_norm": 0.264070600271225, + "learning_rate": 4.924968770133094e-05, + "loss": 0.1708, + "step": 9551 + }, + { + "epoch": 0.17037063460921056, + "grad_norm": 0.2133130431175232, + "learning_rate": 4.92493091821122e-05, + "loss": 0.1851, + "step": 9552 + }, + { + "epoch": 0.17038847073092428, + "grad_norm": 0.28974494338035583, + "learning_rate": 4.924893056889477e-05, + "loss": 0.2417, + "step": 9553 + }, + { + "epoch": 0.17040630685263797, + "grad_norm": 0.2992122173309326, + "learning_rate": 4.9248551861680106e-05, + "loss": 0.2528, + "step": 9554 + }, + { + "epoch": 0.17042414297435166, + "grad_norm": 0.22294269502162933, + "learning_rate": 4.924817306046967e-05, + "loss": 0.2189, + "step": 9555 + }, + { + "epoch": 0.17044197909606534, + "grad_norm": 0.3522729277610779, + "learning_rate": 4.924779416526495e-05, + "loss": 0.2156, + "step": 9556 + }, + { + "epoch": 0.17045981521777906, + "grad_norm": 0.25028425455093384, + "learning_rate": 4.92474151760674e-05, + "loss": 0.2003, + "step": 9557 + }, + { + "epoch": 0.17047765133949275, + "grad_norm": 0.34904852509498596, + "learning_rate": 4.92470360928785e-05, + "loss": 0.1727, + "step": 9558 + }, + { + "epoch": 0.17049548746120644, + "grad_norm": 0.2682909667491913, + "learning_rate": 4.924665691569971e-05, + "loss": 0.1896, + "step": 9559 + }, + { + "epoch": 0.17051332358292012, + "grad_norm": 0.3951048254966736, + "learning_rate": 4.92462776445325e-05, + "loss": 0.2083, + "step": 9560 + }, + { + "epoch": 0.1705311597046338, + "grad_norm": 0.28123539686203003, + "learning_rate": 4.924589827937835e-05, + "loss": 0.2124, + "step": 9561 + }, + { + "epoch": 0.17054899582634753, + "grad_norm": 0.2618462145328522, + "learning_rate": 4.9245518820238724e-05, + "loss": 0.2757, + "step": 9562 + }, + { + "epoch": 0.17056683194806121, + "grad_norm": 0.23455531895160675, + "learning_rate": 4.9245139267115086e-05, + "loss": 0.2121, + "step": 9563 + }, + { + "epoch": 0.1705846680697749, + "grad_norm": 0.2572258412837982, + "learning_rate": 4.9244759620008915e-05, + "loss": 0.2257, + "step": 9564 + }, + { + "epoch": 0.1706025041914886, + "grad_norm": 0.2199053168296814, + "learning_rate": 4.9244379878921686e-05, + "loss": 0.2304, + "step": 9565 + }, + { + "epoch": 0.1706203403132023, + "grad_norm": 0.2827913761138916, + "learning_rate": 4.9244000043854865e-05, + "loss": 0.2141, + "step": 9566 + }, + { + "epoch": 0.170638176434916, + "grad_norm": 0.224032923579216, + "learning_rate": 4.924362011480993e-05, + "loss": 0.2007, + "step": 9567 + }, + { + "epoch": 0.17065601255662968, + "grad_norm": 0.2857799828052521, + "learning_rate": 4.9243240091788345e-05, + "loss": 0.1962, + "step": 9568 + }, + { + "epoch": 0.17067384867834337, + "grad_norm": 0.29973313212394714, + "learning_rate": 4.924285997479159e-05, + "loss": 0.2459, + "step": 9569 + }, + { + "epoch": 0.1706916848000571, + "grad_norm": 0.3320271670818329, + "learning_rate": 4.924247976382114e-05, + "loss": 0.2526, + "step": 9570 + }, + { + "epoch": 0.17070952092177077, + "grad_norm": 0.2696791887283325, + "learning_rate": 4.924209945887846e-05, + "loss": 0.1771, + "step": 9571 + }, + { + "epoch": 0.17072735704348446, + "grad_norm": 0.31899896264076233, + "learning_rate": 4.924171905996503e-05, + "loss": 0.2624, + "step": 9572 + }, + { + "epoch": 0.17074519316519815, + "grad_norm": 0.2947900891304016, + "learning_rate": 4.924133856708232e-05, + "loss": 0.1881, + "step": 9573 + }, + { + "epoch": 0.17076302928691187, + "grad_norm": 0.25045454502105713, + "learning_rate": 4.924095798023182e-05, + "loss": 0.2334, + "step": 9574 + }, + { + "epoch": 0.17078086540862555, + "grad_norm": 0.3101300597190857, + "learning_rate": 4.924057729941499e-05, + "loss": 0.2129, + "step": 9575 + }, + { + "epoch": 0.17079870153033924, + "grad_norm": 0.2634217143058777, + "learning_rate": 4.92401965246333e-05, + "loss": 0.227, + "step": 9576 + }, + { + "epoch": 0.17081653765205293, + "grad_norm": 0.4140169620513916, + "learning_rate": 4.923981565588824e-05, + "loss": 0.1585, + "step": 9577 + }, + { + "epoch": 0.17083437377376665, + "grad_norm": 0.3452723026275635, + "learning_rate": 4.92394346931813e-05, + "loss": 0.2064, + "step": 9578 + }, + { + "epoch": 0.17085220989548033, + "grad_norm": 0.46921178698539734, + "learning_rate": 4.923905363651392e-05, + "loss": 0.2256, + "step": 9579 + }, + { + "epoch": 0.17087004601719402, + "grad_norm": 0.366527259349823, + "learning_rate": 4.9238672485887606e-05, + "loss": 0.2719, + "step": 9580 + }, + { + "epoch": 0.1708878821389077, + "grad_norm": 0.3229540288448334, + "learning_rate": 4.923829124130382e-05, + "loss": 0.2027, + "step": 9581 + }, + { + "epoch": 0.1709057182606214, + "grad_norm": 0.2804720401763916, + "learning_rate": 4.9237909902764045e-05, + "loss": 0.2224, + "step": 9582 + }, + { + "epoch": 0.1709235543823351, + "grad_norm": 0.28612813353538513, + "learning_rate": 4.923752847026976e-05, + "loss": 0.2143, + "step": 9583 + }, + { + "epoch": 0.1709413905040488, + "grad_norm": 0.3355180323123932, + "learning_rate": 4.923714694382245e-05, + "loss": 0.1881, + "step": 9584 + }, + { + "epoch": 0.1709592266257625, + "grad_norm": 0.3040127754211426, + "learning_rate": 4.9236765323423587e-05, + "loss": 0.2305, + "step": 9585 + }, + { + "epoch": 0.17097706274747618, + "grad_norm": 0.29329100251197815, + "learning_rate": 4.9236383609074635e-05, + "loss": 0.295, + "step": 9586 + }, + { + "epoch": 0.1709948988691899, + "grad_norm": 0.2795793116092682, + "learning_rate": 4.9236001800777105e-05, + "loss": 0.2712, + "step": 9587 + }, + { + "epoch": 0.17101273499090358, + "grad_norm": 0.2780926525592804, + "learning_rate": 4.923561989853246e-05, + "loss": 0.2072, + "step": 9588 + }, + { + "epoch": 0.17103057111261727, + "grad_norm": 0.2929146885871887, + "learning_rate": 4.923523790234217e-05, + "loss": 0.212, + "step": 9589 + }, + { + "epoch": 0.17104840723433096, + "grad_norm": 0.27513065934181213, + "learning_rate": 4.923485581220774e-05, + "loss": 0.2167, + "step": 9590 + }, + { + "epoch": 0.17106624335604467, + "grad_norm": 0.2884487807750702, + "learning_rate": 4.923447362813063e-05, + "loss": 0.2433, + "step": 9591 + }, + { + "epoch": 0.17108407947775836, + "grad_norm": 0.41791215538978577, + "learning_rate": 4.923409135011233e-05, + "loss": 0.2539, + "step": 9592 + }, + { + "epoch": 0.17110191559947205, + "grad_norm": 0.38883331418037415, + "learning_rate": 4.923370897815433e-05, + "loss": 0.1835, + "step": 9593 + }, + { + "epoch": 0.17111975172118574, + "grad_norm": 0.2700742483139038, + "learning_rate": 4.9233326512258096e-05, + "loss": 0.2083, + "step": 9594 + }, + { + "epoch": 0.17113758784289945, + "grad_norm": 0.27792683243751526, + "learning_rate": 4.923294395242513e-05, + "loss": 0.2313, + "step": 9595 + }, + { + "epoch": 0.17115542396461314, + "grad_norm": 0.26268866658210754, + "learning_rate": 4.9232561298656885e-05, + "loss": 0.186, + "step": 9596 + }, + { + "epoch": 0.17117326008632683, + "grad_norm": 0.2692398130893707, + "learning_rate": 4.923217855095488e-05, + "loss": 0.1767, + "step": 9597 + }, + { + "epoch": 0.17119109620804052, + "grad_norm": 0.41611286997795105, + "learning_rate": 4.9231795709320574e-05, + "loss": 0.2298, + "step": 9598 + }, + { + "epoch": 0.17120893232975423, + "grad_norm": 0.27804091572761536, + "learning_rate": 4.923141277375546e-05, + "loss": 0.2003, + "step": 9599 + }, + { + "epoch": 0.17122676845146792, + "grad_norm": 0.2379477322101593, + "learning_rate": 4.923102974426101e-05, + "loss": 0.1769, + "step": 9600 + }, + { + "epoch": 0.1712446045731816, + "grad_norm": 0.2880176901817322, + "learning_rate": 4.9230646620838736e-05, + "loss": 0.2089, + "step": 9601 + }, + { + "epoch": 0.1712624406948953, + "grad_norm": 0.36888349056243896, + "learning_rate": 4.9230263403490095e-05, + "loss": 0.1884, + "step": 9602 + }, + { + "epoch": 0.17128027681660898, + "grad_norm": 0.2614987790584564, + "learning_rate": 4.92298800922166e-05, + "loss": 0.2062, + "step": 9603 + }, + { + "epoch": 0.1712981129383227, + "grad_norm": 0.30136510729789734, + "learning_rate": 4.9229496687019705e-05, + "loss": 0.2217, + "step": 9604 + }, + { + "epoch": 0.1713159490600364, + "grad_norm": 0.2660360038280487, + "learning_rate": 4.922911318790092e-05, + "loss": 0.2118, + "step": 9605 + }, + { + "epoch": 0.17133378518175008, + "grad_norm": 0.3715563714504242, + "learning_rate": 4.9228729594861716e-05, + "loss": 0.2264, + "step": 9606 + }, + { + "epoch": 0.17135162130346376, + "grad_norm": 0.27642402052879333, + "learning_rate": 4.9228345907903595e-05, + "loss": 0.2029, + "step": 9607 + }, + { + "epoch": 0.17136945742517748, + "grad_norm": 0.21892401576042175, + "learning_rate": 4.922796212702804e-05, + "loss": 0.2302, + "step": 9608 + }, + { + "epoch": 0.17138729354689117, + "grad_norm": 0.23819518089294434, + "learning_rate": 4.922757825223653e-05, + "loss": 0.1883, + "step": 9609 + }, + { + "epoch": 0.17140512966860486, + "grad_norm": 0.2200346291065216, + "learning_rate": 4.9227194283530556e-05, + "loss": 0.1766, + "step": 9610 + }, + { + "epoch": 0.17142296579031854, + "grad_norm": 0.37342411279678345, + "learning_rate": 4.9226810220911615e-05, + "loss": 0.2511, + "step": 9611 + }, + { + "epoch": 0.17144080191203226, + "grad_norm": 0.3035658895969391, + "learning_rate": 4.922642606438118e-05, + "loss": 0.243, + "step": 9612 + }, + { + "epoch": 0.17145863803374595, + "grad_norm": 0.27800440788269043, + "learning_rate": 4.922604181394076e-05, + "loss": 0.2063, + "step": 9613 + }, + { + "epoch": 0.17147647415545964, + "grad_norm": 0.22545850276947021, + "learning_rate": 4.9225657469591835e-05, + "loss": 0.1697, + "step": 9614 + }, + { + "epoch": 0.17149431027717332, + "grad_norm": 0.25454989075660706, + "learning_rate": 4.9225273031335886e-05, + "loss": 0.2046, + "step": 9615 + }, + { + "epoch": 0.17151214639888704, + "grad_norm": 0.2833195626735687, + "learning_rate": 4.922488849917442e-05, + "loss": 0.2352, + "step": 9616 + }, + { + "epoch": 0.17152998252060073, + "grad_norm": 0.24875544011592865, + "learning_rate": 4.9224503873108915e-05, + "loss": 0.2119, + "step": 9617 + }, + { + "epoch": 0.17154781864231441, + "grad_norm": 0.3346499800682068, + "learning_rate": 4.922411915314087e-05, + "loss": 0.2307, + "step": 9618 + }, + { + "epoch": 0.1715656547640281, + "grad_norm": 0.2831631302833557, + "learning_rate": 4.922373433927176e-05, + "loss": 0.2582, + "step": 9619 + }, + { + "epoch": 0.1715834908857418, + "grad_norm": 0.2854125201702118, + "learning_rate": 4.922334943150311e-05, + "loss": 0.2082, + "step": 9620 + }, + { + "epoch": 0.1716013270074555, + "grad_norm": 0.3862302303314209, + "learning_rate": 4.922296442983638e-05, + "loss": 0.2358, + "step": 9621 + }, + { + "epoch": 0.1716191631291692, + "grad_norm": 0.2591022849082947, + "learning_rate": 4.922257933427307e-05, + "loss": 0.2111, + "step": 9622 + }, + { + "epoch": 0.17163699925088288, + "grad_norm": 0.2876119017601013, + "learning_rate": 4.922219414481468e-05, + "loss": 0.2475, + "step": 9623 + }, + { + "epoch": 0.17165483537259657, + "grad_norm": 0.24456492066383362, + "learning_rate": 4.92218088614627e-05, + "loss": 0.2087, + "step": 9624 + }, + { + "epoch": 0.17167267149431029, + "grad_norm": 0.31571707129478455, + "learning_rate": 4.922142348421862e-05, + "loss": 0.2666, + "step": 9625 + }, + { + "epoch": 0.17169050761602397, + "grad_norm": 0.39414793252944946, + "learning_rate": 4.922103801308394e-05, + "loss": 0.2134, + "step": 9626 + }, + { + "epoch": 0.17170834373773766, + "grad_norm": 0.4650692939758301, + "learning_rate": 4.9220652448060144e-05, + "loss": 0.2645, + "step": 9627 + }, + { + "epoch": 0.17172617985945135, + "grad_norm": 0.2746374011039734, + "learning_rate": 4.922026678914874e-05, + "loss": 0.2298, + "step": 9628 + }, + { + "epoch": 0.17174401598116507, + "grad_norm": 0.2523934543132782, + "learning_rate": 4.9219881036351226e-05, + "loss": 0.2227, + "step": 9629 + }, + { + "epoch": 0.17176185210287875, + "grad_norm": 0.2765379548072815, + "learning_rate": 4.921949518966907e-05, + "loss": 0.2579, + "step": 9630 + }, + { + "epoch": 0.17177968822459244, + "grad_norm": 0.27917927503585815, + "learning_rate": 4.921910924910379e-05, + "loss": 0.1976, + "step": 9631 + }, + { + "epoch": 0.17179752434630613, + "grad_norm": 0.28405430912971497, + "learning_rate": 4.9218723214656885e-05, + "loss": 0.2388, + "step": 9632 + }, + { + "epoch": 0.17181536046801985, + "grad_norm": 0.3750477731227875, + "learning_rate": 4.921833708632984e-05, + "loss": 0.1991, + "step": 9633 + }, + { + "epoch": 0.17183319658973353, + "grad_norm": 0.4381203353404999, + "learning_rate": 4.921795086412416e-05, + "loss": 0.2514, + "step": 9634 + }, + { + "epoch": 0.17185103271144722, + "grad_norm": 0.3596727252006531, + "learning_rate": 4.921756454804133e-05, + "loss": 0.2688, + "step": 9635 + }, + { + "epoch": 0.1718688688331609, + "grad_norm": 0.28607720136642456, + "learning_rate": 4.921717813808286e-05, + "loss": 0.2055, + "step": 9636 + }, + { + "epoch": 0.17188670495487463, + "grad_norm": 0.32283514738082886, + "learning_rate": 4.9216791634250236e-05, + "loss": 0.2551, + "step": 9637 + }, + { + "epoch": 0.1719045410765883, + "grad_norm": 0.2867508828639984, + "learning_rate": 4.921640503654497e-05, + "loss": 0.2092, + "step": 9638 + }, + { + "epoch": 0.171922377198302, + "grad_norm": 0.25139811635017395, + "learning_rate": 4.9216018344968554e-05, + "loss": 0.1677, + "step": 9639 + }, + { + "epoch": 0.1719402133200157, + "grad_norm": 0.1929500699043274, + "learning_rate": 4.921563155952249e-05, + "loss": 0.1734, + "step": 9640 + }, + { + "epoch": 0.17195804944172938, + "grad_norm": 0.22774456441402435, + "learning_rate": 4.921524468020827e-05, + "loss": 0.1664, + "step": 9641 + }, + { + "epoch": 0.1719758855634431, + "grad_norm": 0.2855418026447296, + "learning_rate": 4.92148577070274e-05, + "loss": 0.2406, + "step": 9642 + }, + { + "epoch": 0.17199372168515678, + "grad_norm": 0.2703842222690582, + "learning_rate": 4.921447063998137e-05, + "loss": 0.2062, + "step": 9643 + }, + { + "epoch": 0.17201155780687047, + "grad_norm": 0.5362157225608826, + "learning_rate": 4.92140834790717e-05, + "loss": 0.2785, + "step": 9644 + }, + { + "epoch": 0.17202939392858416, + "grad_norm": 0.21611298620700836, + "learning_rate": 4.921369622429987e-05, + "loss": 0.2096, + "step": 9645 + }, + { + "epoch": 0.17204723005029787, + "grad_norm": 0.24363186955451965, + "learning_rate": 4.92133088756674e-05, + "loss": 0.223, + "step": 9646 + }, + { + "epoch": 0.17206506617201156, + "grad_norm": 0.22381004691123962, + "learning_rate": 4.9212921433175775e-05, + "loss": 0.1981, + "step": 9647 + }, + { + "epoch": 0.17208290229372525, + "grad_norm": 0.3383592963218689, + "learning_rate": 4.9212533896826505e-05, + "loss": 0.2343, + "step": 9648 + }, + { + "epoch": 0.17210073841543894, + "grad_norm": 0.2760606110095978, + "learning_rate": 4.9212146266621084e-05, + "loss": 0.1813, + "step": 9649 + }, + { + "epoch": 0.17211857453715265, + "grad_norm": 0.3160358965396881, + "learning_rate": 4.921175854256103e-05, + "loss": 0.2898, + "step": 9650 + }, + { + "epoch": 0.17213641065886634, + "grad_norm": 0.23598331212997437, + "learning_rate": 4.921137072464784e-05, + "loss": 0.187, + "step": 9651 + }, + { + "epoch": 0.17215424678058003, + "grad_norm": 0.2617507576942444, + "learning_rate": 4.9210982812883e-05, + "loss": 0.2486, + "step": 9652 + }, + { + "epoch": 0.17217208290229372, + "grad_norm": 0.3494400680065155, + "learning_rate": 4.921059480726805e-05, + "loss": 0.197, + "step": 9653 + }, + { + "epoch": 0.17218991902400743, + "grad_norm": 0.26766592264175415, + "learning_rate": 4.9210206707804456e-05, + "loss": 0.2161, + "step": 9654 + }, + { + "epoch": 0.17220775514572112, + "grad_norm": 0.2603318691253662, + "learning_rate": 4.920981851449374e-05, + "loss": 0.234, + "step": 9655 + }, + { + "epoch": 0.1722255912674348, + "grad_norm": 0.2517866790294647, + "learning_rate": 4.9209430227337415e-05, + "loss": 0.2215, + "step": 9656 + }, + { + "epoch": 0.1722434273891485, + "grad_norm": 0.21566170454025269, + "learning_rate": 4.9209041846336965e-05, + "loss": 0.1644, + "step": 9657 + }, + { + "epoch": 0.1722612635108622, + "grad_norm": 0.31020960211753845, + "learning_rate": 4.920865337149392e-05, + "loss": 0.2661, + "step": 9658 + }, + { + "epoch": 0.1722790996325759, + "grad_norm": 0.24349386990070343, + "learning_rate": 4.9208264802809766e-05, + "loss": 0.2034, + "step": 9659 + }, + { + "epoch": 0.1722969357542896, + "grad_norm": 0.29610615968704224, + "learning_rate": 4.920787614028601e-05, + "loss": 0.2042, + "step": 9660 + }, + { + "epoch": 0.17231477187600328, + "grad_norm": 0.3083944618701935, + "learning_rate": 4.920748738392418e-05, + "loss": 0.1733, + "step": 9661 + }, + { + "epoch": 0.17233260799771696, + "grad_norm": 0.3282853364944458, + "learning_rate": 4.920709853372576e-05, + "loss": 0.2216, + "step": 9662 + }, + { + "epoch": 0.17235044411943068, + "grad_norm": 0.333893358707428, + "learning_rate": 4.9206709589692265e-05, + "loss": 0.2421, + "step": 9663 + }, + { + "epoch": 0.17236828024114437, + "grad_norm": 0.32559463381767273, + "learning_rate": 4.9206320551825206e-05, + "loss": 0.2711, + "step": 9664 + }, + { + "epoch": 0.17238611636285806, + "grad_norm": 0.26360490918159485, + "learning_rate": 4.920593142012608e-05, + "loss": 0.2177, + "step": 9665 + }, + { + "epoch": 0.17240395248457174, + "grad_norm": 0.2548786997795105, + "learning_rate": 4.920554219459641e-05, + "loss": 0.2029, + "step": 9666 + }, + { + "epoch": 0.17242178860628546, + "grad_norm": 0.25616228580474854, + "learning_rate": 4.920515287523769e-05, + "loss": 0.204, + "step": 9667 + }, + { + "epoch": 0.17243962472799915, + "grad_norm": 0.30703577399253845, + "learning_rate": 4.920476346205145e-05, + "loss": 0.2718, + "step": 9668 + }, + { + "epoch": 0.17245746084971283, + "grad_norm": 0.31625422835350037, + "learning_rate": 4.9204373955039174e-05, + "loss": 0.2013, + "step": 9669 + }, + { + "epoch": 0.17247529697142652, + "grad_norm": 0.22914358973503113, + "learning_rate": 4.9203984354202393e-05, + "loss": 0.1981, + "step": 9670 + }, + { + "epoch": 0.17249313309314024, + "grad_norm": 0.3993740379810333, + "learning_rate": 4.9203594659542605e-05, + "loss": 0.2443, + "step": 9671 + }, + { + "epoch": 0.17251096921485393, + "grad_norm": 0.36154547333717346, + "learning_rate": 4.920320487106133e-05, + "loss": 0.2278, + "step": 9672 + }, + { + "epoch": 0.17252880533656761, + "grad_norm": 0.2840382158756256, + "learning_rate": 4.920281498876007e-05, + "loss": 0.1785, + "step": 9673 + }, + { + "epoch": 0.1725466414582813, + "grad_norm": 0.2900967597961426, + "learning_rate": 4.9202425012640344e-05, + "loss": 0.2081, + "step": 9674 + }, + { + "epoch": 0.17256447757999502, + "grad_norm": 0.4035519063472748, + "learning_rate": 4.920203494270365e-05, + "loss": 0.2242, + "step": 9675 + }, + { + "epoch": 0.1725823137017087, + "grad_norm": 0.26229366660118103, + "learning_rate": 4.9201644778951516e-05, + "loss": 0.1973, + "step": 9676 + }, + { + "epoch": 0.1726001498234224, + "grad_norm": 0.24187491834163666, + "learning_rate": 4.9201254521385446e-05, + "loss": 0.2187, + "step": 9677 + }, + { + "epoch": 0.17261798594513608, + "grad_norm": 0.30757611989974976, + "learning_rate": 4.920086417000695e-05, + "loss": 0.2842, + "step": 9678 + }, + { + "epoch": 0.1726358220668498, + "grad_norm": 0.44807401299476624, + "learning_rate": 4.9200473724817555e-05, + "loss": 0.2652, + "step": 9679 + }, + { + "epoch": 0.17265365818856349, + "grad_norm": 0.3205125331878662, + "learning_rate": 4.920008318581876e-05, + "loss": 0.2036, + "step": 9680 + }, + { + "epoch": 0.17267149431027717, + "grad_norm": 0.36433398723602295, + "learning_rate": 4.919969255301209e-05, + "loss": 0.1986, + "step": 9681 + }, + { + "epoch": 0.17268933043199086, + "grad_norm": 0.2050919532775879, + "learning_rate": 4.9199301826399046e-05, + "loss": 0.1881, + "step": 9682 + }, + { + "epoch": 0.17270716655370455, + "grad_norm": 0.430519700050354, + "learning_rate": 4.9198911005981155e-05, + "loss": 0.2074, + "step": 9683 + }, + { + "epoch": 0.17272500267541827, + "grad_norm": 0.32934385538101196, + "learning_rate": 4.919852009175992e-05, + "loss": 0.212, + "step": 9684 + }, + { + "epoch": 0.17274283879713195, + "grad_norm": 0.2581077814102173, + "learning_rate": 4.919812908373686e-05, + "loss": 0.1845, + "step": 9685 + }, + { + "epoch": 0.17276067491884564, + "grad_norm": 0.32681527733802795, + "learning_rate": 4.9197737981913504e-05, + "loss": 0.2646, + "step": 9686 + }, + { + "epoch": 0.17277851104055933, + "grad_norm": 0.2916010916233063, + "learning_rate": 4.919734678629136e-05, + "loss": 0.24, + "step": 9687 + }, + { + "epoch": 0.17279634716227305, + "grad_norm": 0.2567000389099121, + "learning_rate": 4.919695549687193e-05, + "loss": 0.2023, + "step": 9688 + }, + { + "epoch": 0.17281418328398673, + "grad_norm": 0.3838261365890503, + "learning_rate": 4.9196564113656755e-05, + "loss": 0.2245, + "step": 9689 + }, + { + "epoch": 0.17283201940570042, + "grad_norm": 0.2686448097229004, + "learning_rate": 4.919617263664734e-05, + "loss": 0.1613, + "step": 9690 + }, + { + "epoch": 0.1728498555274141, + "grad_norm": 0.3094487190246582, + "learning_rate": 4.91957810658452e-05, + "loss": 0.1849, + "step": 9691 + }, + { + "epoch": 0.17286769164912782, + "grad_norm": 0.28329363465309143, + "learning_rate": 4.919538940125185e-05, + "loss": 0.2214, + "step": 9692 + }, + { + "epoch": 0.1728855277708415, + "grad_norm": 0.24060699343681335, + "learning_rate": 4.9194997642868826e-05, + "loss": 0.1737, + "step": 9693 + }, + { + "epoch": 0.1729033638925552, + "grad_norm": 0.26238158345222473, + "learning_rate": 4.9194605790697625e-05, + "loss": 0.1841, + "step": 9694 + }, + { + "epoch": 0.1729212000142689, + "grad_norm": 0.20694534480571747, + "learning_rate": 4.919421384473977e-05, + "loss": 0.165, + "step": 9695 + }, + { + "epoch": 0.1729390361359826, + "grad_norm": 0.27723363041877747, + "learning_rate": 4.9193821804996797e-05, + "loss": 0.1745, + "step": 9696 + }, + { + "epoch": 0.1729568722576963, + "grad_norm": 0.2610498070716858, + "learning_rate": 4.919342967147021e-05, + "loss": 0.2022, + "step": 9697 + }, + { + "epoch": 0.17297470837940998, + "grad_norm": 0.4002273678779602, + "learning_rate": 4.9193037444161536e-05, + "loss": 0.168, + "step": 9698 + }, + { + "epoch": 0.17299254450112367, + "grad_norm": 0.2577795088291168, + "learning_rate": 4.919264512307229e-05, + "loss": 0.1844, + "step": 9699 + }, + { + "epoch": 0.17301038062283736, + "grad_norm": 0.20801451802253723, + "learning_rate": 4.9192252708204e-05, + "loss": 0.1781, + "step": 9700 + }, + { + "epoch": 0.17302821674455107, + "grad_norm": 0.2729775309562683, + "learning_rate": 4.9191860199558174e-05, + "loss": 0.2275, + "step": 9701 + }, + { + "epoch": 0.17304605286626476, + "grad_norm": 0.3288865089416504, + "learning_rate": 4.919146759713634e-05, + "loss": 0.2058, + "step": 9702 + }, + { + "epoch": 0.17306388898797845, + "grad_norm": 0.24803242087364197, + "learning_rate": 4.9191074900940034e-05, + "loss": 0.2055, + "step": 9703 + }, + { + "epoch": 0.17308172510969214, + "grad_norm": 0.3133016526699066, + "learning_rate": 4.919068211097076e-05, + "loss": 0.2303, + "step": 9704 + }, + { + "epoch": 0.17309956123140585, + "grad_norm": 0.27445313334465027, + "learning_rate": 4.919028922723005e-05, + "loss": 0.1938, + "step": 9705 + }, + { + "epoch": 0.17311739735311954, + "grad_norm": 0.244804248213768, + "learning_rate": 4.9189896249719425e-05, + "loss": 0.2076, + "step": 9706 + }, + { + "epoch": 0.17313523347483323, + "grad_norm": 0.3342616558074951, + "learning_rate": 4.918950317844041e-05, + "loss": 0.2105, + "step": 9707 + }, + { + "epoch": 0.17315306959654692, + "grad_norm": 0.35800743103027344, + "learning_rate": 4.918911001339451e-05, + "loss": 0.2382, + "step": 9708 + }, + { + "epoch": 0.17317090571826063, + "grad_norm": 0.42986416816711426, + "learning_rate": 4.918871675458328e-05, + "loss": 0.2172, + "step": 9709 + }, + { + "epoch": 0.17318874183997432, + "grad_norm": 0.2927144169807434, + "learning_rate": 4.918832340200823e-05, + "loss": 0.2592, + "step": 9710 + }, + { + "epoch": 0.173206577961688, + "grad_norm": 0.27397555112838745, + "learning_rate": 4.9187929955670875e-05, + "loss": 0.2169, + "step": 9711 + }, + { + "epoch": 0.1732244140834017, + "grad_norm": 0.2059878408908844, + "learning_rate": 4.9187536415572756e-05, + "loss": 0.1558, + "step": 9712 + }, + { + "epoch": 0.1732422502051154, + "grad_norm": 0.4312272071838379, + "learning_rate": 4.918714278171539e-05, + "loss": 0.1652, + "step": 9713 + }, + { + "epoch": 0.1732600863268291, + "grad_norm": 0.27202436327934265, + "learning_rate": 4.91867490541003e-05, + "loss": 0.2212, + "step": 9714 + }, + { + "epoch": 0.1732779224485428, + "grad_norm": 0.3257104158401489, + "learning_rate": 4.918635523272902e-05, + "loss": 0.2062, + "step": 9715 + }, + { + "epoch": 0.17329575857025648, + "grad_norm": 0.3060513436794281, + "learning_rate": 4.9185961317603074e-05, + "loss": 0.2417, + "step": 9716 + }, + { + "epoch": 0.1733135946919702, + "grad_norm": 0.5047610998153687, + "learning_rate": 4.918556730872399e-05, + "loss": 0.1946, + "step": 9717 + }, + { + "epoch": 0.17333143081368388, + "grad_norm": 0.21300067007541656, + "learning_rate": 4.918517320609329e-05, + "loss": 0.1606, + "step": 9718 + }, + { + "epoch": 0.17334926693539757, + "grad_norm": 0.2857724726200104, + "learning_rate": 4.918477900971251e-05, + "loss": 0.2572, + "step": 9719 + }, + { + "epoch": 0.17336710305711125, + "grad_norm": 0.22725330293178558, + "learning_rate": 4.9184384719583165e-05, + "loss": 0.1677, + "step": 9720 + }, + { + "epoch": 0.17338493917882494, + "grad_norm": 0.2861236333847046, + "learning_rate": 4.91839903357068e-05, + "loss": 0.2036, + "step": 9721 + }, + { + "epoch": 0.17340277530053866, + "grad_norm": 0.26781216263771057, + "learning_rate": 4.918359585808493e-05, + "loss": 0.1937, + "step": 9722 + }, + { + "epoch": 0.17342061142225235, + "grad_norm": 0.3556922674179077, + "learning_rate": 4.91832012867191e-05, + "loss": 0.1568, + "step": 9723 + }, + { + "epoch": 0.17343844754396603, + "grad_norm": 0.27563250064849854, + "learning_rate": 4.9182806621610825e-05, + "loss": 0.2472, + "step": 9724 + }, + { + "epoch": 0.17345628366567972, + "grad_norm": 0.3423020541667938, + "learning_rate": 4.918241186276163e-05, + "loss": 0.1946, + "step": 9725 + }, + { + "epoch": 0.17347411978739344, + "grad_norm": 0.334852010011673, + "learning_rate": 4.9182017010173067e-05, + "loss": 0.1823, + "step": 9726 + }, + { + "epoch": 0.17349195590910713, + "grad_norm": 0.3619869351387024, + "learning_rate": 4.9181622063846644e-05, + "loss": 0.2479, + "step": 9727 + }, + { + "epoch": 0.17350979203082081, + "grad_norm": 0.37388190627098083, + "learning_rate": 4.91812270237839e-05, + "loss": 0.2479, + "step": 9728 + }, + { + "epoch": 0.1735276281525345, + "grad_norm": 0.2579343020915985, + "learning_rate": 4.918083188998638e-05, + "loss": 0.2453, + "step": 9729 + }, + { + "epoch": 0.17354546427424822, + "grad_norm": 0.2560495436191559, + "learning_rate": 4.918043666245559e-05, + "loss": 0.215, + "step": 9730 + }, + { + "epoch": 0.1735633003959619, + "grad_norm": 0.35689017176628113, + "learning_rate": 4.918004134119308e-05, + "loss": 0.2404, + "step": 9731 + }, + { + "epoch": 0.1735811365176756, + "grad_norm": 0.3746400475502014, + "learning_rate": 4.917964592620039e-05, + "loss": 0.2109, + "step": 9732 + }, + { + "epoch": 0.17359897263938928, + "grad_norm": 0.23508204519748688, + "learning_rate": 4.917925041747903e-05, + "loss": 0.178, + "step": 9733 + }, + { + "epoch": 0.173616808761103, + "grad_norm": 0.36890703439712524, + "learning_rate": 4.9178854815030543e-05, + "loss": 0.2254, + "step": 9734 + }, + { + "epoch": 0.17363464488281669, + "grad_norm": 0.321085125207901, + "learning_rate": 4.917845911885647e-05, + "loss": 0.215, + "step": 9735 + }, + { + "epoch": 0.17365248100453037, + "grad_norm": 0.3148922622203827, + "learning_rate": 4.917806332895833e-05, + "loss": 0.2464, + "step": 9736 + }, + { + "epoch": 0.17367031712624406, + "grad_norm": 0.2999875545501709, + "learning_rate": 4.9177667445337674e-05, + "loss": 0.2248, + "step": 9737 + }, + { + "epoch": 0.17368815324795778, + "grad_norm": 0.23620140552520752, + "learning_rate": 4.9177271467996025e-05, + "loss": 0.2219, + "step": 9738 + }, + { + "epoch": 0.17370598936967147, + "grad_norm": 0.24593178927898407, + "learning_rate": 4.9176875396934925e-05, + "loss": 0.1783, + "step": 9739 + }, + { + "epoch": 0.17372382549138515, + "grad_norm": 0.31903770565986633, + "learning_rate": 4.91764792321559e-05, + "loss": 0.2441, + "step": 9740 + }, + { + "epoch": 0.17374166161309884, + "grad_norm": 0.26004692912101746, + "learning_rate": 4.91760829736605e-05, + "loss": 0.2603, + "step": 9741 + }, + { + "epoch": 0.17375949773481253, + "grad_norm": 0.29169684648513794, + "learning_rate": 4.917568662145024e-05, + "loss": 0.2386, + "step": 9742 + }, + { + "epoch": 0.17377733385652624, + "grad_norm": 0.36901918053627014, + "learning_rate": 4.9175290175526676e-05, + "loss": 0.1731, + "step": 9743 + }, + { + "epoch": 0.17379516997823993, + "grad_norm": 0.35805922746658325, + "learning_rate": 4.9174893635891333e-05, + "loss": 0.2204, + "step": 9744 + }, + { + "epoch": 0.17381300609995362, + "grad_norm": 0.5005916357040405, + "learning_rate": 4.9174497002545754e-05, + "loss": 0.2167, + "step": 9745 + }, + { + "epoch": 0.1738308422216673, + "grad_norm": 0.5021663904190063, + "learning_rate": 4.9174100275491477e-05, + "loss": 0.2218, + "step": 9746 + }, + { + "epoch": 0.17384867834338102, + "grad_norm": 0.350599080324173, + "learning_rate": 4.917370345473004e-05, + "loss": 0.3009, + "step": 9747 + }, + { + "epoch": 0.1738665144650947, + "grad_norm": 0.305866539478302, + "learning_rate": 4.917330654026297e-05, + "loss": 0.1872, + "step": 9748 + }, + { + "epoch": 0.1738843505868084, + "grad_norm": 0.3086687922477722, + "learning_rate": 4.917290953209183e-05, + "loss": 0.2482, + "step": 9749 + }, + { + "epoch": 0.1739021867085221, + "grad_norm": 0.34263765811920166, + "learning_rate": 4.9172512430218134e-05, + "loss": 0.2292, + "step": 9750 + }, + { + "epoch": 0.1739200228302358, + "grad_norm": 0.24874238669872284, + "learning_rate": 4.9172115234643425e-05, + "loss": 0.1788, + "step": 9751 + }, + { + "epoch": 0.1739378589519495, + "grad_norm": 0.3459385633468628, + "learning_rate": 4.917171794536925e-05, + "loss": 0.2341, + "step": 9752 + }, + { + "epoch": 0.17395569507366318, + "grad_norm": 0.27681493759155273, + "learning_rate": 4.9171320562397164e-05, + "loss": 0.2004, + "step": 9753 + }, + { + "epoch": 0.17397353119537687, + "grad_norm": 0.2709489166736603, + "learning_rate": 4.9170923085728676e-05, + "loss": 0.2219, + "step": 9754 + }, + { + "epoch": 0.17399136731709058, + "grad_norm": 0.20481276512145996, + "learning_rate": 4.9170525515365346e-05, + "loss": 0.1868, + "step": 9755 + }, + { + "epoch": 0.17400920343880427, + "grad_norm": 0.23008392751216888, + "learning_rate": 4.9170127851308715e-05, + "loss": 0.1872, + "step": 9756 + }, + { + "epoch": 0.17402703956051796, + "grad_norm": 0.3555377125740051, + "learning_rate": 4.9169730093560305e-05, + "loss": 0.2022, + "step": 9757 + }, + { + "epoch": 0.17404487568223165, + "grad_norm": 0.2266392856836319, + "learning_rate": 4.916933224212169e-05, + "loss": 0.2072, + "step": 9758 + }, + { + "epoch": 0.17406271180394536, + "grad_norm": 0.25155770778656006, + "learning_rate": 4.9168934296994386e-05, + "loss": 0.1901, + "step": 9759 + }, + { + "epoch": 0.17408054792565905, + "grad_norm": 0.247994065284729, + "learning_rate": 4.9168536258179946e-05, + "loss": 0.1749, + "step": 9760 + }, + { + "epoch": 0.17409838404737274, + "grad_norm": 0.21949608623981476, + "learning_rate": 4.916813812567992e-05, + "loss": 0.1891, + "step": 9761 + }, + { + "epoch": 0.17411622016908643, + "grad_norm": 0.2974618077278137, + "learning_rate": 4.916773989949584e-05, + "loss": 0.2353, + "step": 9762 + }, + { + "epoch": 0.17413405629080012, + "grad_norm": 0.35332390666007996, + "learning_rate": 4.9167341579629245e-05, + "loss": 0.2217, + "step": 9763 + }, + { + "epoch": 0.17415189241251383, + "grad_norm": 0.3531196117401123, + "learning_rate": 4.916694316608169e-05, + "loss": 0.2382, + "step": 9764 + }, + { + "epoch": 0.17416972853422752, + "grad_norm": 0.4063680171966553, + "learning_rate": 4.9166544658854717e-05, + "loss": 0.1759, + "step": 9765 + }, + { + "epoch": 0.1741875646559412, + "grad_norm": 0.2495127022266388, + "learning_rate": 4.916614605794988e-05, + "loss": 0.1589, + "step": 9766 + }, + { + "epoch": 0.1742054007776549, + "grad_norm": 0.369315505027771, + "learning_rate": 4.9165747363368696e-05, + "loss": 0.2126, + "step": 9767 + }, + { + "epoch": 0.1742232368993686, + "grad_norm": 0.28651097416877747, + "learning_rate": 4.916534857511274e-05, + "loss": 0.1435, + "step": 9768 + }, + { + "epoch": 0.1742410730210823, + "grad_norm": 0.3717736601829529, + "learning_rate": 4.916494969318355e-05, + "loss": 0.2426, + "step": 9769 + }, + { + "epoch": 0.174258909142796, + "grad_norm": 0.39979088306427, + "learning_rate": 4.916455071758266e-05, + "loss": 0.2567, + "step": 9770 + }, + { + "epoch": 0.17427674526450967, + "grad_norm": 0.24432671070098877, + "learning_rate": 4.916415164831163e-05, + "loss": 0.194, + "step": 9771 + }, + { + "epoch": 0.1742945813862234, + "grad_norm": 0.39139798283576965, + "learning_rate": 4.9163752485372e-05, + "loss": 0.1943, + "step": 9772 + }, + { + "epoch": 0.17431241750793708, + "grad_norm": 0.3452328145503998, + "learning_rate": 4.916335322876532e-05, + "loss": 0.1947, + "step": 9773 + }, + { + "epoch": 0.17433025362965077, + "grad_norm": 0.3301299810409546, + "learning_rate": 4.9162953878493135e-05, + "loss": 0.2463, + "step": 9774 + }, + { + "epoch": 0.17434808975136445, + "grad_norm": 0.35334160923957825, + "learning_rate": 4.9162554434556996e-05, + "loss": 0.1241, + "step": 9775 + }, + { + "epoch": 0.17436592587307817, + "grad_norm": 0.2236124724149704, + "learning_rate": 4.9162154896958454e-05, + "loss": 0.211, + "step": 9776 + }, + { + "epoch": 0.17438376199479186, + "grad_norm": 0.23963242769241333, + "learning_rate": 4.916175526569905e-05, + "loss": 0.2092, + "step": 9777 + }, + { + "epoch": 0.17440159811650555, + "grad_norm": 0.36571216583251953, + "learning_rate": 4.916135554078034e-05, + "loss": 0.2641, + "step": 9778 + }, + { + "epoch": 0.17441943423821923, + "grad_norm": 0.2502310574054718, + "learning_rate": 4.9160955722203875e-05, + "loss": 0.217, + "step": 9779 + }, + { + "epoch": 0.17443727035993295, + "grad_norm": 0.2844808101654053, + "learning_rate": 4.916055580997119e-05, + "loss": 0.182, + "step": 9780 + }, + { + "epoch": 0.17445510648164664, + "grad_norm": 0.2635055184364319, + "learning_rate": 4.916015580408386e-05, + "loss": 0.2371, + "step": 9781 + }, + { + "epoch": 0.17447294260336033, + "grad_norm": 0.2510976493358612, + "learning_rate": 4.915975570454341e-05, + "loss": 0.2193, + "step": 9782 + }, + { + "epoch": 0.174490778725074, + "grad_norm": 0.2558085024356842, + "learning_rate": 4.9159355511351404e-05, + "loss": 0.1912, + "step": 9783 + }, + { + "epoch": 0.1745086148467877, + "grad_norm": 0.2219114452600479, + "learning_rate": 4.9158955224509395e-05, + "loss": 0.2049, + "step": 9784 + }, + { + "epoch": 0.17452645096850142, + "grad_norm": 0.23177124559879303, + "learning_rate": 4.9158554844018934e-05, + "loss": 0.1954, + "step": 9785 + }, + { + "epoch": 0.1745442870902151, + "grad_norm": 0.303877055644989, + "learning_rate": 4.915815436988156e-05, + "loss": 0.224, + "step": 9786 + }, + { + "epoch": 0.1745621232119288, + "grad_norm": 0.3392929434776306, + "learning_rate": 4.915775380209884e-05, + "loss": 0.2373, + "step": 9787 + }, + { + "epoch": 0.17457995933364248, + "grad_norm": 0.30069419741630554, + "learning_rate": 4.915735314067233e-05, + "loss": 0.2341, + "step": 9788 + }, + { + "epoch": 0.1745977954553562, + "grad_norm": 0.2342492789030075, + "learning_rate": 4.915695238560357e-05, + "loss": 0.1689, + "step": 9789 + }, + { + "epoch": 0.17461563157706989, + "grad_norm": 0.28799715638160706, + "learning_rate": 4.915655153689412e-05, + "loss": 0.1907, + "step": 9790 + }, + { + "epoch": 0.17463346769878357, + "grad_norm": 0.23690320551395416, + "learning_rate": 4.915615059454553e-05, + "loss": 0.2085, + "step": 9791 + }, + { + "epoch": 0.17465130382049726, + "grad_norm": 0.35785195231437683, + "learning_rate": 4.915574955855936e-05, + "loss": 0.1599, + "step": 9792 + }, + { + "epoch": 0.17466913994221098, + "grad_norm": 0.3013131022453308, + "learning_rate": 4.915534842893716e-05, + "loss": 0.2044, + "step": 9793 + }, + { + "epoch": 0.17468697606392466, + "grad_norm": 0.23619958758354187, + "learning_rate": 4.9154947205680485e-05, + "loss": 0.2028, + "step": 9794 + }, + { + "epoch": 0.17470481218563835, + "grad_norm": 0.21933041512966156, + "learning_rate": 4.9154545888790894e-05, + "loss": 0.1891, + "step": 9795 + }, + { + "epoch": 0.17472264830735204, + "grad_norm": 0.3393298089504242, + "learning_rate": 4.9154144478269935e-05, + "loss": 0.176, + "step": 9796 + }, + { + "epoch": 0.17474048442906576, + "grad_norm": 0.37045788764953613, + "learning_rate": 4.9153742974119174e-05, + "loss": 0.2525, + "step": 9797 + }, + { + "epoch": 0.17475832055077944, + "grad_norm": 0.3073008358478546, + "learning_rate": 4.915334137634017e-05, + "loss": 0.2141, + "step": 9798 + }, + { + "epoch": 0.17477615667249313, + "grad_norm": 0.22247996926307678, + "learning_rate": 4.9152939684934465e-05, + "loss": 0.2147, + "step": 9799 + }, + { + "epoch": 0.17479399279420682, + "grad_norm": 0.24229590594768524, + "learning_rate": 4.915253789990363e-05, + "loss": 0.125, + "step": 9800 + }, + { + "epoch": 0.1748118289159205, + "grad_norm": 0.20457574725151062, + "learning_rate": 4.915213602124921e-05, + "loss": 0.1612, + "step": 9801 + }, + { + "epoch": 0.17482966503763422, + "grad_norm": 0.3060280382633209, + "learning_rate": 4.915173404897277e-05, + "loss": 0.1487, + "step": 9802 + }, + { + "epoch": 0.1748475011593479, + "grad_norm": 0.28459033370018005, + "learning_rate": 4.915133198307586e-05, + "loss": 0.2066, + "step": 9803 + }, + { + "epoch": 0.1748653372810616, + "grad_norm": 0.28581908345222473, + "learning_rate": 4.9150929823560055e-05, + "loss": 0.264, + "step": 9804 + }, + { + "epoch": 0.1748831734027753, + "grad_norm": 0.33398017287254333, + "learning_rate": 4.9150527570426895e-05, + "loss": 0.1408, + "step": 9805 + }, + { + "epoch": 0.174901009524489, + "grad_norm": 0.28528156876564026, + "learning_rate": 4.915012522367796e-05, + "loss": 0.2305, + "step": 9806 + }, + { + "epoch": 0.1749188456462027, + "grad_norm": 0.30481860041618347, + "learning_rate": 4.9149722783314794e-05, + "loss": 0.2607, + "step": 9807 + }, + { + "epoch": 0.17493668176791638, + "grad_norm": 0.2669420838356018, + "learning_rate": 4.9149320249338956e-05, + "loss": 0.2067, + "step": 9808 + }, + { + "epoch": 0.17495451788963007, + "grad_norm": 0.21455597877502441, + "learning_rate": 4.914891762175202e-05, + "loss": 0.2008, + "step": 9809 + }, + { + "epoch": 0.17497235401134378, + "grad_norm": 0.24768298864364624, + "learning_rate": 4.914851490055554e-05, + "loss": 0.2354, + "step": 9810 + }, + { + "epoch": 0.17499019013305747, + "grad_norm": 0.30151158571243286, + "learning_rate": 4.914811208575107e-05, + "loss": 0.1939, + "step": 9811 + }, + { + "epoch": 0.17500802625477116, + "grad_norm": 0.3170900046825409, + "learning_rate": 4.914770917734018e-05, + "loss": 0.2702, + "step": 9812 + }, + { + "epoch": 0.17502586237648485, + "grad_norm": 0.3405921757221222, + "learning_rate": 4.914730617532443e-05, + "loss": 0.1717, + "step": 9813 + }, + { + "epoch": 0.17504369849819856, + "grad_norm": 0.2955649495124817, + "learning_rate": 4.914690307970538e-05, + "loss": 0.2227, + "step": 9814 + }, + { + "epoch": 0.17506153461991225, + "grad_norm": 0.25658777356147766, + "learning_rate": 4.914649989048459e-05, + "loss": 0.2099, + "step": 9815 + }, + { + "epoch": 0.17507937074162594, + "grad_norm": 0.37147191166877747, + "learning_rate": 4.914609660766363e-05, + "loss": 0.1566, + "step": 9816 + }, + { + "epoch": 0.17509720686333963, + "grad_norm": 0.2716359794139862, + "learning_rate": 4.914569323124405e-05, + "loss": 0.2056, + "step": 9817 + }, + { + "epoch": 0.17511504298505334, + "grad_norm": 0.25064435601234436, + "learning_rate": 4.9145289761227434e-05, + "loss": 0.228, + "step": 9818 + }, + { + "epoch": 0.17513287910676703, + "grad_norm": 0.27970239520072937, + "learning_rate": 4.9144886197615334e-05, + "loss": 0.2028, + "step": 9819 + }, + { + "epoch": 0.17515071522848072, + "grad_norm": 0.280620276927948, + "learning_rate": 4.914448254040931e-05, + "loss": 0.2091, + "step": 9820 + }, + { + "epoch": 0.1751685513501944, + "grad_norm": 0.7011124491691589, + "learning_rate": 4.914407878961094e-05, + "loss": 0.2389, + "step": 9821 + }, + { + "epoch": 0.1751863874719081, + "grad_norm": 0.31332695484161377, + "learning_rate": 4.9143674945221776e-05, + "loss": 0.1929, + "step": 9822 + }, + { + "epoch": 0.1752042235936218, + "grad_norm": 0.23674538731575012, + "learning_rate": 4.9143271007243394e-05, + "loss": 0.1476, + "step": 9823 + }, + { + "epoch": 0.1752220597153355, + "grad_norm": 0.24798457324504852, + "learning_rate": 4.914286697567736e-05, + "loss": 0.1976, + "step": 9824 + }, + { + "epoch": 0.1752398958370492, + "grad_norm": 0.2565896511077881, + "learning_rate": 4.9142462850525225e-05, + "loss": 0.1953, + "step": 9825 + }, + { + "epoch": 0.17525773195876287, + "grad_norm": 0.35916703939437866, + "learning_rate": 4.914205863178857e-05, + "loss": 0.2273, + "step": 9826 + }, + { + "epoch": 0.1752755680804766, + "grad_norm": 0.2522268295288086, + "learning_rate": 4.914165431946895e-05, + "loss": 0.2018, + "step": 9827 + }, + { + "epoch": 0.17529340420219028, + "grad_norm": 0.2315714806318283, + "learning_rate": 4.9141249913567945e-05, + "loss": 0.1948, + "step": 9828 + }, + { + "epoch": 0.17531124032390397, + "grad_norm": 0.2239033430814743, + "learning_rate": 4.914084541408712e-05, + "loss": 0.1864, + "step": 9829 + }, + { + "epoch": 0.17532907644561765, + "grad_norm": 0.26858869194984436, + "learning_rate": 4.914044082102803e-05, + "loss": 0.2481, + "step": 9830 + }, + { + "epoch": 0.17534691256733137, + "grad_norm": 0.3861125111579895, + "learning_rate": 4.9140036134392266e-05, + "loss": 0.3285, + "step": 9831 + }, + { + "epoch": 0.17536474868904506, + "grad_norm": 0.3304365575313568, + "learning_rate": 4.9139631354181376e-05, + "loss": 0.2086, + "step": 9832 + }, + { + "epoch": 0.17538258481075875, + "grad_norm": 0.24423335492610931, + "learning_rate": 4.913922648039694e-05, + "loss": 0.2261, + "step": 9833 + }, + { + "epoch": 0.17540042093247243, + "grad_norm": 0.2607804834842682, + "learning_rate": 4.913882151304052e-05, + "loss": 0.2353, + "step": 9834 + }, + { + "epoch": 0.17541825705418615, + "grad_norm": 0.29175958037376404, + "learning_rate": 4.91384164521137e-05, + "loss": 0.2022, + "step": 9835 + }, + { + "epoch": 0.17543609317589984, + "grad_norm": 0.25276750326156616, + "learning_rate": 4.9138011297618036e-05, + "loss": 0.2201, + "step": 9836 + }, + { + "epoch": 0.17545392929761353, + "grad_norm": 0.308044970035553, + "learning_rate": 4.9137606049555105e-05, + "loss": 0.1551, + "step": 9837 + }, + { + "epoch": 0.1754717654193272, + "grad_norm": 0.24433742463588715, + "learning_rate": 4.9137200707926476e-05, + "loss": 0.1797, + "step": 9838 + }, + { + "epoch": 0.17548960154104093, + "grad_norm": 0.1911228746175766, + "learning_rate": 4.913679527273371e-05, + "loss": 0.1688, + "step": 9839 + }, + { + "epoch": 0.17550743766275462, + "grad_norm": 0.28093472123146057, + "learning_rate": 4.913638974397841e-05, + "loss": 0.2009, + "step": 9840 + }, + { + "epoch": 0.1755252737844683, + "grad_norm": 0.27896586060523987, + "learning_rate": 4.913598412166211e-05, + "loss": 0.1798, + "step": 9841 + }, + { + "epoch": 0.175543109906182, + "grad_norm": 0.20525510609149933, + "learning_rate": 4.9135578405786404e-05, + "loss": 0.1791, + "step": 9842 + }, + { + "epoch": 0.17556094602789568, + "grad_norm": 0.32102063298225403, + "learning_rate": 4.913517259635286e-05, + "loss": 0.1919, + "step": 9843 + }, + { + "epoch": 0.1755787821496094, + "grad_norm": 0.34964942932128906, + "learning_rate": 4.913476669336305e-05, + "loss": 0.2439, + "step": 9844 + }, + { + "epoch": 0.17559661827132308, + "grad_norm": 0.33790168166160583, + "learning_rate": 4.913436069681855e-05, + "loss": 0.2195, + "step": 9845 + }, + { + "epoch": 0.17561445439303677, + "grad_norm": 0.2906341850757599, + "learning_rate": 4.913395460672093e-05, + "loss": 0.2097, + "step": 9846 + }, + { + "epoch": 0.17563229051475046, + "grad_norm": 0.35009142756462097, + "learning_rate": 4.9133548423071765e-05, + "loss": 0.2547, + "step": 9847 + }, + { + "epoch": 0.17565012663646418, + "grad_norm": 0.2403053641319275, + "learning_rate": 4.913314214587263e-05, + "loss": 0.1929, + "step": 9848 + }, + { + "epoch": 0.17566796275817786, + "grad_norm": 0.2491363137960434, + "learning_rate": 4.913273577512511e-05, + "loss": 0.1813, + "step": 9849 + }, + { + "epoch": 0.17568579887989155, + "grad_norm": 0.3104622960090637, + "learning_rate": 4.9132329310830764e-05, + "loss": 0.2335, + "step": 9850 + }, + { + "epoch": 0.17570363500160524, + "grad_norm": 0.3666343688964844, + "learning_rate": 4.913192275299118e-05, + "loss": 0.2267, + "step": 9851 + }, + { + "epoch": 0.17572147112331896, + "grad_norm": 0.2799401879310608, + "learning_rate": 4.9131516101607923e-05, + "loss": 0.246, + "step": 9852 + }, + { + "epoch": 0.17573930724503264, + "grad_norm": 0.316927045583725, + "learning_rate": 4.913110935668258e-05, + "loss": 0.2505, + "step": 9853 + }, + { + "epoch": 0.17575714336674633, + "grad_norm": 0.277109831571579, + "learning_rate": 4.9130702518216715e-05, + "loss": 0.2282, + "step": 9854 + }, + { + "epoch": 0.17577497948846002, + "grad_norm": 0.2826685309410095, + "learning_rate": 4.913029558621192e-05, + "loss": 0.2573, + "step": 9855 + }, + { + "epoch": 0.17579281561017374, + "grad_norm": 0.32976076006889343, + "learning_rate": 4.9129888560669755e-05, + "loss": 0.2104, + "step": 9856 + }, + { + "epoch": 0.17581065173188742, + "grad_norm": 0.214805006980896, + "learning_rate": 4.912948144159182e-05, + "loss": 0.2194, + "step": 9857 + }, + { + "epoch": 0.1758284878536011, + "grad_norm": 0.36780327558517456, + "learning_rate": 4.912907422897967e-05, + "loss": 0.1797, + "step": 9858 + }, + { + "epoch": 0.1758463239753148, + "grad_norm": 0.30478477478027344, + "learning_rate": 4.91286669228349e-05, + "loss": 0.229, + "step": 9859 + }, + { + "epoch": 0.17586416009702852, + "grad_norm": 0.23440249264240265, + "learning_rate": 4.912825952315908e-05, + "loss": 0.1987, + "step": 9860 + }, + { + "epoch": 0.1758819962187422, + "grad_norm": 0.3610669672489166, + "learning_rate": 4.91278520299538e-05, + "loss": 0.2409, + "step": 9861 + }, + { + "epoch": 0.1758998323404559, + "grad_norm": 0.26612475514411926, + "learning_rate": 4.912744444322063e-05, + "loss": 0.2077, + "step": 9862 + }, + { + "epoch": 0.17591766846216958, + "grad_norm": 0.28233879804611206, + "learning_rate": 4.9127036762961144e-05, + "loss": 0.2312, + "step": 9863 + }, + { + "epoch": 0.17593550458388327, + "grad_norm": 0.23805402219295502, + "learning_rate": 4.9126628989176936e-05, + "loss": 0.1833, + "step": 9864 + }, + { + "epoch": 0.17595334070559698, + "grad_norm": 0.32132792472839355, + "learning_rate": 4.912622112186958e-05, + "loss": 0.2008, + "step": 9865 + }, + { + "epoch": 0.17597117682731067, + "grad_norm": 0.3585755228996277, + "learning_rate": 4.9125813161040654e-05, + "loss": 0.2293, + "step": 9866 + }, + { + "epoch": 0.17598901294902436, + "grad_norm": 0.3080814778804779, + "learning_rate": 4.912540510669175e-05, + "loss": 0.2971, + "step": 9867 + }, + { + "epoch": 0.17600684907073805, + "grad_norm": 0.33391812443733215, + "learning_rate": 4.912499695882444e-05, + "loss": 0.2254, + "step": 9868 + }, + { + "epoch": 0.17602468519245176, + "grad_norm": 0.2802722156047821, + "learning_rate": 4.912458871744031e-05, + "loss": 0.2181, + "step": 9869 + }, + { + "epoch": 0.17604252131416545, + "grad_norm": 0.24930240213871002, + "learning_rate": 4.912418038254094e-05, + "loss": 0.2178, + "step": 9870 + }, + { + "epoch": 0.17606035743587914, + "grad_norm": 0.25119373202323914, + "learning_rate": 4.9123771954127917e-05, + "loss": 0.1792, + "step": 9871 + }, + { + "epoch": 0.17607819355759283, + "grad_norm": 0.339108943939209, + "learning_rate": 4.912336343220283e-05, + "loss": 0.1594, + "step": 9872 + }, + { + "epoch": 0.17609602967930654, + "grad_norm": 0.3193516135215759, + "learning_rate": 4.912295481676724e-05, + "loss": 0.2209, + "step": 9873 + }, + { + "epoch": 0.17611386580102023, + "grad_norm": 0.28487035632133484, + "learning_rate": 4.9122546107822756e-05, + "loss": 0.2196, + "step": 9874 + }, + { + "epoch": 0.17613170192273392, + "grad_norm": 0.3017159700393677, + "learning_rate": 4.912213730537094e-05, + "loss": 0.1577, + "step": 9875 + }, + { + "epoch": 0.1761495380444476, + "grad_norm": 0.36447104811668396, + "learning_rate": 4.91217284094134e-05, + "loss": 0.2475, + "step": 9876 + }, + { + "epoch": 0.17616737416616132, + "grad_norm": 0.3028956949710846, + "learning_rate": 4.91213194199517e-05, + "loss": 0.2058, + "step": 9877 + }, + { + "epoch": 0.176185210287875, + "grad_norm": 0.21894517540931702, + "learning_rate": 4.912091033698744e-05, + "loss": 0.1685, + "step": 9878 + }, + { + "epoch": 0.1762030464095887, + "grad_norm": 0.24029791355133057, + "learning_rate": 4.9120501160522204e-05, + "loss": 0.1988, + "step": 9879 + }, + { + "epoch": 0.1762208825313024, + "grad_norm": 0.39944303035736084, + "learning_rate": 4.9120091890557565e-05, + "loss": 0.2806, + "step": 9880 + }, + { + "epoch": 0.17623871865301607, + "grad_norm": 0.33641016483306885, + "learning_rate": 4.9119682527095126e-05, + "loss": 0.1639, + "step": 9881 + }, + { + "epoch": 0.1762565547747298, + "grad_norm": 0.3470108211040497, + "learning_rate": 4.9119273070136465e-05, + "loss": 0.2263, + "step": 9882 + }, + { + "epoch": 0.17627439089644348, + "grad_norm": 0.33996477723121643, + "learning_rate": 4.911886351968317e-05, + "loss": 0.1611, + "step": 9883 + }, + { + "epoch": 0.17629222701815717, + "grad_norm": 0.3267709016799927, + "learning_rate": 4.911845387573683e-05, + "loss": 0.2091, + "step": 9884 + }, + { + "epoch": 0.17631006313987085, + "grad_norm": 0.33487847447395325, + "learning_rate": 4.9118044138299033e-05, + "loss": 0.2619, + "step": 9885 + }, + { + "epoch": 0.17632789926158457, + "grad_norm": 0.29962268471717834, + "learning_rate": 4.911763430737136e-05, + "loss": 0.2807, + "step": 9886 + }, + { + "epoch": 0.17634573538329826, + "grad_norm": 0.3162447214126587, + "learning_rate": 4.911722438295542e-05, + "loss": 0.2039, + "step": 9887 + }, + { + "epoch": 0.17636357150501195, + "grad_norm": 0.32478970289230347, + "learning_rate": 4.9116814365052784e-05, + "loss": 0.2566, + "step": 9888 + }, + { + "epoch": 0.17638140762672563, + "grad_norm": 0.2739572525024414, + "learning_rate": 4.9116404253665037e-05, + "loss": 0.2126, + "step": 9889 + }, + { + "epoch": 0.17639924374843935, + "grad_norm": 0.26051992177963257, + "learning_rate": 4.911599404879379e-05, + "loss": 0.2655, + "step": 9890 + }, + { + "epoch": 0.17641707987015304, + "grad_norm": 0.26293838024139404, + "learning_rate": 4.911558375044061e-05, + "loss": 0.2522, + "step": 9891 + }, + { + "epoch": 0.17643491599186673, + "grad_norm": 0.29335278272628784, + "learning_rate": 4.9115173358607105e-05, + "loss": 0.1907, + "step": 9892 + }, + { + "epoch": 0.1764527521135804, + "grad_norm": 0.26845383644104004, + "learning_rate": 4.911476287329486e-05, + "loss": 0.1828, + "step": 9893 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.30866870284080505, + "learning_rate": 4.911435229450546e-05, + "loss": 0.2407, + "step": 9894 + }, + { + "epoch": 0.17648842435700782, + "grad_norm": 0.24196955561637878, + "learning_rate": 4.91139416222405e-05, + "loss": 0.235, + "step": 9895 + }, + { + "epoch": 0.1765062604787215, + "grad_norm": 0.328097939491272, + "learning_rate": 4.9113530856501575e-05, + "loss": 0.1971, + "step": 9896 + }, + { + "epoch": 0.1765240966004352, + "grad_norm": 0.47702354192733765, + "learning_rate": 4.9113119997290284e-05, + "loss": 0.3136, + "step": 9897 + }, + { + "epoch": 0.1765419327221489, + "grad_norm": 0.2346765697002411, + "learning_rate": 4.911270904460821e-05, + "loss": 0.1882, + "step": 9898 + }, + { + "epoch": 0.1765597688438626, + "grad_norm": 0.2766011357307434, + "learning_rate": 4.911229799845694e-05, + "loss": 0.2342, + "step": 9899 + }, + { + "epoch": 0.17657760496557628, + "grad_norm": 0.24140343070030212, + "learning_rate": 4.9111886858838074e-05, + "loss": 0.2126, + "step": 9900 + }, + { + "epoch": 0.17659544108728997, + "grad_norm": 0.29035061597824097, + "learning_rate": 4.911147562575321e-05, + "loss": 0.1827, + "step": 9901 + }, + { + "epoch": 0.17661327720900366, + "grad_norm": 0.4507565200328827, + "learning_rate": 4.9111064299203946e-05, + "loss": 0.2255, + "step": 9902 + }, + { + "epoch": 0.17663111333071738, + "grad_norm": 0.2248343527317047, + "learning_rate": 4.911065287919186e-05, + "loss": 0.2027, + "step": 9903 + }, + { + "epoch": 0.17664894945243106, + "grad_norm": 0.23875249922275543, + "learning_rate": 4.911024136571856e-05, + "loss": 0.2375, + "step": 9904 + }, + { + "epoch": 0.17666678557414475, + "grad_norm": 0.3332054316997528, + "learning_rate": 4.910982975878563e-05, + "loss": 0.2603, + "step": 9905 + }, + { + "epoch": 0.17668462169585844, + "grad_norm": 0.30601832270622253, + "learning_rate": 4.910941805839468e-05, + "loss": 0.253, + "step": 9906 + }, + { + "epoch": 0.17670245781757216, + "grad_norm": 0.2917705476284027, + "learning_rate": 4.91090062645473e-05, + "loss": 0.2622, + "step": 9907 + }, + { + "epoch": 0.17672029393928584, + "grad_norm": 0.31659191846847534, + "learning_rate": 4.910859437724508e-05, + "loss": 0.226, + "step": 9908 + }, + { + "epoch": 0.17673813006099953, + "grad_norm": 0.23532883822917938, + "learning_rate": 4.910818239648962e-05, + "loss": 0.1455, + "step": 9909 + }, + { + "epoch": 0.17675596618271322, + "grad_norm": 0.24646751582622528, + "learning_rate": 4.910777032228252e-05, + "loss": 0.2111, + "step": 9910 + }, + { + "epoch": 0.17677380230442694, + "grad_norm": 0.2357739359140396, + "learning_rate": 4.910735815462538e-05, + "loss": 0.2006, + "step": 9911 + }, + { + "epoch": 0.17679163842614062, + "grad_norm": 0.23476216197013855, + "learning_rate": 4.910694589351979e-05, + "loss": 0.1557, + "step": 9912 + }, + { + "epoch": 0.1768094745478543, + "grad_norm": 0.30611780285835266, + "learning_rate": 4.910653353896735e-05, + "loss": 0.1899, + "step": 9913 + }, + { + "epoch": 0.176827310669568, + "grad_norm": 0.3897404968738556, + "learning_rate": 4.9106121090969656e-05, + "loss": 0.2269, + "step": 9914 + }, + { + "epoch": 0.17684514679128172, + "grad_norm": 0.42323267459869385, + "learning_rate": 4.910570854952832e-05, + "loss": 0.2138, + "step": 9915 + }, + { + "epoch": 0.1768629829129954, + "grad_norm": 0.304720938205719, + "learning_rate": 4.9105295914644925e-05, + "loss": 0.229, + "step": 9916 + }, + { + "epoch": 0.1768808190347091, + "grad_norm": 0.3408209979534149, + "learning_rate": 4.9104883186321083e-05, + "loss": 0.1995, + "step": 9917 + }, + { + "epoch": 0.17689865515642278, + "grad_norm": 0.4280528724193573, + "learning_rate": 4.910447036455839e-05, + "loss": 0.2414, + "step": 9918 + }, + { + "epoch": 0.1769164912781365, + "grad_norm": 0.30985012650489807, + "learning_rate": 4.910405744935843e-05, + "loss": 0.216, + "step": 9919 + }, + { + "epoch": 0.17693432739985018, + "grad_norm": 0.37898531556129456, + "learning_rate": 4.910364444072283e-05, + "loss": 0.217, + "step": 9920 + }, + { + "epoch": 0.17695216352156387, + "grad_norm": 0.4527721405029297, + "learning_rate": 4.910323133865318e-05, + "loss": 0.2087, + "step": 9921 + }, + { + "epoch": 0.17696999964327756, + "grad_norm": 0.3203611969947815, + "learning_rate": 4.910281814315108e-05, + "loss": 0.2423, + "step": 9922 + }, + { + "epoch": 0.17698783576499125, + "grad_norm": 0.3138802647590637, + "learning_rate": 4.910240485421812e-05, + "loss": 0.2013, + "step": 9923 + }, + { + "epoch": 0.17700567188670496, + "grad_norm": 0.2812381386756897, + "learning_rate": 4.9101991471855926e-05, + "loss": 0.2333, + "step": 9924 + }, + { + "epoch": 0.17702350800841865, + "grad_norm": 0.28126227855682373, + "learning_rate": 4.9101577996066085e-05, + "loss": 0.1604, + "step": 9925 + }, + { + "epoch": 0.17704134413013234, + "grad_norm": 0.29395556449890137, + "learning_rate": 4.91011644268502e-05, + "loss": 0.2431, + "step": 9926 + }, + { + "epoch": 0.17705918025184603, + "grad_norm": 0.3164561688899994, + "learning_rate": 4.910075076420988e-05, + "loss": 0.2506, + "step": 9927 + }, + { + "epoch": 0.17707701637355974, + "grad_norm": 0.25460055470466614, + "learning_rate": 4.910033700814673e-05, + "loss": 0.1756, + "step": 9928 + }, + { + "epoch": 0.17709485249527343, + "grad_norm": 0.3828481137752533, + "learning_rate": 4.9099923158662346e-05, + "loss": 0.1819, + "step": 9929 + }, + { + "epoch": 0.17711268861698712, + "grad_norm": 0.3121713697910309, + "learning_rate": 4.909950921575834e-05, + "loss": 0.1679, + "step": 9930 + }, + { + "epoch": 0.1771305247387008, + "grad_norm": 0.24948590993881226, + "learning_rate": 4.9099095179436305e-05, + "loss": 0.2032, + "step": 9931 + }, + { + "epoch": 0.17714836086041452, + "grad_norm": 0.35499536991119385, + "learning_rate": 4.909868104969786e-05, + "loss": 0.2371, + "step": 9932 + }, + { + "epoch": 0.1771661969821282, + "grad_norm": 0.3047904074192047, + "learning_rate": 4.90982668265446e-05, + "loss": 0.2402, + "step": 9933 + }, + { + "epoch": 0.1771840331038419, + "grad_norm": 0.2975825369358063, + "learning_rate": 4.909785250997814e-05, + "loss": 0.1914, + "step": 9934 + }, + { + "epoch": 0.17720186922555559, + "grad_norm": 0.382649689912796, + "learning_rate": 4.909743810000008e-05, + "loss": 0.2156, + "step": 9935 + }, + { + "epoch": 0.1772197053472693, + "grad_norm": 0.33098697662353516, + "learning_rate": 4.9097023596612024e-05, + "loss": 0.2515, + "step": 9936 + }, + { + "epoch": 0.177237541468983, + "grad_norm": 0.2597023844718933, + "learning_rate": 4.9096608999815575e-05, + "loss": 0.1875, + "step": 9937 + }, + { + "epoch": 0.17725537759069668, + "grad_norm": 0.3416253626346588, + "learning_rate": 4.909619430961235e-05, + "loss": 0.2768, + "step": 9938 + }, + { + "epoch": 0.17727321371241037, + "grad_norm": 0.2221720963716507, + "learning_rate": 4.909577952600396e-05, + "loss": 0.2055, + "step": 9939 + }, + { + "epoch": 0.17729104983412408, + "grad_norm": 0.325078547000885, + "learning_rate": 4.9095364648992e-05, + "loss": 0.2108, + "step": 9940 + }, + { + "epoch": 0.17730888595583777, + "grad_norm": 0.2751140594482422, + "learning_rate": 4.9094949678578095e-05, + "loss": 0.1819, + "step": 9941 + }, + { + "epoch": 0.17732672207755146, + "grad_norm": 0.3418734669685364, + "learning_rate": 4.909453461476383e-05, + "loss": 0.2249, + "step": 9942 + }, + { + "epoch": 0.17734455819926515, + "grad_norm": 0.28655219078063965, + "learning_rate": 4.9094119457550834e-05, + "loss": 0.2366, + "step": 9943 + }, + { + "epoch": 0.17736239432097883, + "grad_norm": 0.25974804162979126, + "learning_rate": 4.9093704206940705e-05, + "loss": 0.2074, + "step": 9944 + }, + { + "epoch": 0.17738023044269255, + "grad_norm": 0.31325864791870117, + "learning_rate": 4.909328886293506e-05, + "loss": 0.2014, + "step": 9945 + }, + { + "epoch": 0.17739806656440624, + "grad_norm": 0.27111369371414185, + "learning_rate": 4.909287342553551e-05, + "loss": 0.2426, + "step": 9946 + }, + { + "epoch": 0.17741590268611993, + "grad_norm": 0.35351452231407166, + "learning_rate": 4.909245789474365e-05, + "loss": 0.2444, + "step": 9947 + }, + { + "epoch": 0.1774337388078336, + "grad_norm": 0.27385374903678894, + "learning_rate": 4.909204227056111e-05, + "loss": 0.1869, + "step": 9948 + }, + { + "epoch": 0.17745157492954733, + "grad_norm": 0.3400082588195801, + "learning_rate": 4.909162655298949e-05, + "loss": 0.2009, + "step": 9949 + }, + { + "epoch": 0.17746941105126102, + "grad_norm": 0.30281776189804077, + "learning_rate": 4.90912107420304e-05, + "loss": 0.2664, + "step": 9950 + }, + { + "epoch": 0.1774872471729747, + "grad_norm": 0.3745337128639221, + "learning_rate": 4.909079483768547e-05, + "loss": 0.247, + "step": 9951 + }, + { + "epoch": 0.1775050832946884, + "grad_norm": 0.38315799832344055, + "learning_rate": 4.9090378839956285e-05, + "loss": 0.2328, + "step": 9952 + }, + { + "epoch": 0.1775229194164021, + "grad_norm": 0.30493584275245667, + "learning_rate": 4.908996274884448e-05, + "loss": 0.1915, + "step": 9953 + }, + { + "epoch": 0.1775407555381158, + "grad_norm": 0.26232585310935974, + "learning_rate": 4.908954656435165e-05, + "loss": 0.2097, + "step": 9954 + }, + { + "epoch": 0.17755859165982948, + "grad_norm": 0.35629040002822876, + "learning_rate": 4.9089130286479424e-05, + "loss": 0.1797, + "step": 9955 + }, + { + "epoch": 0.17757642778154317, + "grad_norm": 0.27860766649246216, + "learning_rate": 4.908871391522941e-05, + "loss": 0.2176, + "step": 9956 + }, + { + "epoch": 0.1775942639032569, + "grad_norm": 0.2514987587928772, + "learning_rate": 4.908829745060321e-05, + "loss": 0.1759, + "step": 9957 + }, + { + "epoch": 0.17761210002497058, + "grad_norm": 0.3102920353412628, + "learning_rate": 4.9087880892602466e-05, + "loss": 0.2128, + "step": 9958 + }, + { + "epoch": 0.17762993614668426, + "grad_norm": 0.24939922988414764, + "learning_rate": 4.9087464241228765e-05, + "loss": 0.1897, + "step": 9959 + }, + { + "epoch": 0.17764777226839795, + "grad_norm": 0.326032429933548, + "learning_rate": 4.908704749648374e-05, + "loss": 0.2266, + "step": 9960 + }, + { + "epoch": 0.17766560839011167, + "grad_norm": 0.3501764237880707, + "learning_rate": 4.908663065836899e-05, + "loss": 0.2301, + "step": 9961 + }, + { + "epoch": 0.17768344451182536, + "grad_norm": 0.23558185994625092, + "learning_rate": 4.908621372688614e-05, + "loss": 0.2011, + "step": 9962 + }, + { + "epoch": 0.17770128063353904, + "grad_norm": 0.30565160512924194, + "learning_rate": 4.9085796702036817e-05, + "loss": 0.1886, + "step": 9963 + }, + { + "epoch": 0.17771911675525273, + "grad_norm": 0.2684495449066162, + "learning_rate": 4.908537958382262e-05, + "loss": 0.1652, + "step": 9964 + }, + { + "epoch": 0.17773695287696642, + "grad_norm": 0.63982093334198, + "learning_rate": 4.908496237224518e-05, + "loss": 0.1859, + "step": 9965 + }, + { + "epoch": 0.17775478899868014, + "grad_norm": 0.3628515899181366, + "learning_rate": 4.9084545067306096e-05, + "loss": 0.2356, + "step": 9966 + }, + { + "epoch": 0.17777262512039382, + "grad_norm": 0.38683274388313293, + "learning_rate": 4.9084127669007005e-05, + "loss": 0.2076, + "step": 9967 + }, + { + "epoch": 0.1777904612421075, + "grad_norm": 0.2974708378314972, + "learning_rate": 4.9083710177349515e-05, + "loss": 0.2423, + "step": 9968 + }, + { + "epoch": 0.1778082973638212, + "grad_norm": 0.4412638247013092, + "learning_rate": 4.908329259233525e-05, + "loss": 0.2604, + "step": 9969 + }, + { + "epoch": 0.17782613348553492, + "grad_norm": 0.29436057806015015, + "learning_rate": 4.9082874913965815e-05, + "loss": 0.1318, + "step": 9970 + }, + { + "epoch": 0.1778439696072486, + "grad_norm": 0.4153296947479248, + "learning_rate": 4.9082457142242845e-05, + "loss": 0.1986, + "step": 9971 + }, + { + "epoch": 0.1778618057289623, + "grad_norm": 0.26473212242126465, + "learning_rate": 4.908203927716796e-05, + "loss": 0.2138, + "step": 9972 + }, + { + "epoch": 0.17787964185067598, + "grad_norm": 0.34248077869415283, + "learning_rate": 4.908162131874276e-05, + "loss": 0.2183, + "step": 9973 + }, + { + "epoch": 0.1778974779723897, + "grad_norm": 0.2785325348377228, + "learning_rate": 4.908120326696888e-05, + "loss": 0.1947, + "step": 9974 + }, + { + "epoch": 0.17791531409410338, + "grad_norm": 0.2864294648170471, + "learning_rate": 4.908078512184795e-05, + "loss": 0.1924, + "step": 9975 + }, + { + "epoch": 0.17793315021581707, + "grad_norm": 0.584975004196167, + "learning_rate": 4.908036688338157e-05, + "loss": 0.1898, + "step": 9976 + }, + { + "epoch": 0.17795098633753076, + "grad_norm": 0.25670674443244934, + "learning_rate": 4.907994855157138e-05, + "loss": 0.1677, + "step": 9977 + }, + { + "epoch": 0.17796882245924447, + "grad_norm": 0.24501079320907593, + "learning_rate": 4.9079530126418975e-05, + "loss": 0.1651, + "step": 9978 + }, + { + "epoch": 0.17798665858095816, + "grad_norm": 0.29451894760131836, + "learning_rate": 4.907911160792601e-05, + "loss": 0.2531, + "step": 9979 + }, + { + "epoch": 0.17800449470267185, + "grad_norm": 0.3314656615257263, + "learning_rate": 4.907869299609408e-05, + "loss": 0.2115, + "step": 9980 + }, + { + "epoch": 0.17802233082438554, + "grad_norm": 0.2793998718261719, + "learning_rate": 4.907827429092483e-05, + "loss": 0.2081, + "step": 9981 + }, + { + "epoch": 0.17804016694609923, + "grad_norm": 0.2776472866535187, + "learning_rate": 4.907785549241987e-05, + "loss": 0.2439, + "step": 9982 + }, + { + "epoch": 0.17805800306781294, + "grad_norm": 0.2417910248041153, + "learning_rate": 4.907743660058082e-05, + "loss": 0.2136, + "step": 9983 + }, + { + "epoch": 0.17807583918952663, + "grad_norm": 0.2986113727092743, + "learning_rate": 4.907701761540931e-05, + "loss": 0.2501, + "step": 9984 + }, + { + "epoch": 0.17809367531124032, + "grad_norm": 0.2245412915945053, + "learning_rate": 4.907659853690698e-05, + "loss": 0.2006, + "step": 9985 + }, + { + "epoch": 0.178111511432954, + "grad_norm": 0.2672586143016815, + "learning_rate": 4.907617936507543e-05, + "loss": 0.2609, + "step": 9986 + }, + { + "epoch": 0.17812934755466772, + "grad_norm": 0.2883603572845459, + "learning_rate": 4.907576009991628e-05, + "loss": 0.2264, + "step": 9987 + }, + { + "epoch": 0.1781471836763814, + "grad_norm": 0.253579318523407, + "learning_rate": 4.907534074143118e-05, + "loss": 0.218, + "step": 9988 + }, + { + "epoch": 0.1781650197980951, + "grad_norm": 0.2399081587791443, + "learning_rate": 4.9074921289621745e-05, + "loss": 0.2039, + "step": 9989 + }, + { + "epoch": 0.17818285591980879, + "grad_norm": 0.2588498890399933, + "learning_rate": 4.9074501744489596e-05, + "loss": 0.1713, + "step": 9990 + }, + { + "epoch": 0.1782006920415225, + "grad_norm": 0.2734178602695465, + "learning_rate": 4.907408210603636e-05, + "loss": 0.1993, + "step": 9991 + }, + { + "epoch": 0.1782185281632362, + "grad_norm": 0.2253803014755249, + "learning_rate": 4.9073662374263676e-05, + "loss": 0.2069, + "step": 9992 + }, + { + "epoch": 0.17823636428494988, + "grad_norm": 0.2691268026828766, + "learning_rate": 4.9073242549173145e-05, + "loss": 0.2307, + "step": 9993 + }, + { + "epoch": 0.17825420040666357, + "grad_norm": 0.38317370414733887, + "learning_rate": 4.907282263076643e-05, + "loss": 0.1795, + "step": 9994 + }, + { + "epoch": 0.17827203652837728, + "grad_norm": 0.31615981459617615, + "learning_rate": 4.907240261904513e-05, + "loss": 0.2273, + "step": 9995 + }, + { + "epoch": 0.17828987265009097, + "grad_norm": 0.319078654050827, + "learning_rate": 4.907198251401089e-05, + "loss": 0.165, + "step": 9996 + }, + { + "epoch": 0.17830770877180466, + "grad_norm": 0.31444546580314636, + "learning_rate": 4.907156231566532e-05, + "loss": 0.1997, + "step": 9997 + }, + { + "epoch": 0.17832554489351835, + "grad_norm": 0.30600807070732117, + "learning_rate": 4.907114202401008e-05, + "loss": 0.2123, + "step": 9998 + }, + { + "epoch": 0.17834338101523206, + "grad_norm": 0.3261357247829437, + "learning_rate": 4.907072163904676e-05, + "loss": 0.2054, + "step": 9999 + }, + { + "epoch": 0.17836121713694575, + "grad_norm": 0.34018146991729736, + "learning_rate": 4.907030116077702e-05, + "loss": 0.2091, + "step": 10000 + }, + { + "epoch": 0.17836121713694575, + "eval_loss": 0.20239697396755219, + "eval_runtime": 106.9886, + "eval_samples_per_second": 9.571, + "eval_steps_per_second": 1.598, + "step": 10000 + }, + { + "epoch": 0.17837905325865944, + "grad_norm": 0.40059399604797363, + "learning_rate": 4.906988058920247e-05, + "loss": 0.2009, + "step": 10001 + }, + { + "epoch": 0.17839688938037312, + "grad_norm": 0.2519589960575104, + "learning_rate": 4.9069459924324754e-05, + "loss": 0.2261, + "step": 10002 + }, + { + "epoch": 0.1784147255020868, + "grad_norm": 0.3123573064804077, + "learning_rate": 4.90690391661455e-05, + "loss": 0.2269, + "step": 10003 + }, + { + "epoch": 0.17843256162380053, + "grad_norm": 0.319833904504776, + "learning_rate": 4.906861831466634e-05, + "loss": 0.1499, + "step": 10004 + }, + { + "epoch": 0.17845039774551422, + "grad_norm": 0.31864285469055176, + "learning_rate": 4.90681973698889e-05, + "loss": 0.2233, + "step": 10005 + }, + { + "epoch": 0.1784682338672279, + "grad_norm": 0.38282573223114014, + "learning_rate": 4.906777633181481e-05, + "loss": 0.1963, + "step": 10006 + }, + { + "epoch": 0.1784860699889416, + "grad_norm": 0.2704548239707947, + "learning_rate": 4.9067355200445706e-05, + "loss": 0.1841, + "step": 10007 + }, + { + "epoch": 0.1785039061106553, + "grad_norm": 0.33108240365982056, + "learning_rate": 4.906693397578322e-05, + "loss": 0.2074, + "step": 10008 + }, + { + "epoch": 0.178521742232369, + "grad_norm": 0.3164540231227875, + "learning_rate": 4.9066512657829e-05, + "loss": 0.2126, + "step": 10009 + }, + { + "epoch": 0.17853957835408268, + "grad_norm": 0.302031010389328, + "learning_rate": 4.906609124658464e-05, + "loss": 0.1862, + "step": 10010 + }, + { + "epoch": 0.17855741447579637, + "grad_norm": 0.23535902798175812, + "learning_rate": 4.906566974205182e-05, + "loss": 0.1513, + "step": 10011 + }, + { + "epoch": 0.1785752505975101, + "grad_norm": 0.28384605050086975, + "learning_rate": 4.9065248144232144e-05, + "loss": 0.2258, + "step": 10012 + }, + { + "epoch": 0.17859308671922378, + "grad_norm": 0.36058810353279114, + "learning_rate": 4.906482645312726e-05, + "loss": 0.1631, + "step": 10013 + }, + { + "epoch": 0.17861092284093746, + "grad_norm": 0.20574362576007843, + "learning_rate": 4.906440466873878e-05, + "loss": 0.2168, + "step": 10014 + }, + { + "epoch": 0.17862875896265115, + "grad_norm": 0.23759403824806213, + "learning_rate": 4.9063982791068377e-05, + "loss": 0.1958, + "step": 10015 + }, + { + "epoch": 0.17864659508436487, + "grad_norm": 0.2946540117263794, + "learning_rate": 4.906356082011765e-05, + "loss": 0.2133, + "step": 10016 + }, + { + "epoch": 0.17866443120607856, + "grad_norm": 0.26436474919319153, + "learning_rate": 4.906313875588826e-05, + "loss": 0.1651, + "step": 10017 + }, + { + "epoch": 0.17868226732779224, + "grad_norm": 0.3037585914134979, + "learning_rate": 4.906271659838182e-05, + "loss": 0.184, + "step": 10018 + }, + { + "epoch": 0.17870010344950593, + "grad_norm": 0.4259195327758789, + "learning_rate": 4.906229434759999e-05, + "loss": 0.2351, + "step": 10019 + }, + { + "epoch": 0.17871793957121965, + "grad_norm": 0.30201098322868347, + "learning_rate": 4.9061872003544395e-05, + "loss": 0.2287, + "step": 10020 + }, + { + "epoch": 0.17873577569293334, + "grad_norm": 0.31891515851020813, + "learning_rate": 4.9061449566216675e-05, + "loss": 0.2143, + "step": 10021 + }, + { + "epoch": 0.17875361181464702, + "grad_norm": 0.2972477972507477, + "learning_rate": 4.906102703561846e-05, + "loss": 0.1957, + "step": 10022 + }, + { + "epoch": 0.1787714479363607, + "grad_norm": 0.2640993297100067, + "learning_rate": 4.9060604411751396e-05, + "loss": 0.1779, + "step": 10023 + }, + { + "epoch": 0.1787892840580744, + "grad_norm": 0.21846936643123627, + "learning_rate": 4.9060181694617123e-05, + "loss": 0.1865, + "step": 10024 + }, + { + "epoch": 0.17880712017978811, + "grad_norm": 0.28230080008506775, + "learning_rate": 4.905975888421727e-05, + "loss": 0.2395, + "step": 10025 + }, + { + "epoch": 0.1788249563015018, + "grad_norm": 0.22010251879692078, + "learning_rate": 4.905933598055349e-05, + "loss": 0.1691, + "step": 10026 + }, + { + "epoch": 0.1788427924232155, + "grad_norm": 0.3181862533092499, + "learning_rate": 4.90589129836274e-05, + "loss": 0.2341, + "step": 10027 + }, + { + "epoch": 0.17886062854492918, + "grad_norm": 0.3197450041770935, + "learning_rate": 4.9058489893440664e-05, + "loss": 0.2166, + "step": 10028 + }, + { + "epoch": 0.1788784646666429, + "grad_norm": 0.21327778697013855, + "learning_rate": 4.905806670999491e-05, + "loss": 0.1914, + "step": 10029 + }, + { + "epoch": 0.17889630078835658, + "grad_norm": 0.20844419300556183, + "learning_rate": 4.9057643433291776e-05, + "loss": 0.1759, + "step": 10030 + }, + { + "epoch": 0.17891413691007027, + "grad_norm": 0.21894694864749908, + "learning_rate": 4.9057220063332914e-05, + "loss": 0.2039, + "step": 10031 + }, + { + "epoch": 0.17893197303178396, + "grad_norm": 0.25392717123031616, + "learning_rate": 4.905679660011996e-05, + "loss": 0.2122, + "step": 10032 + }, + { + "epoch": 0.17894980915349767, + "grad_norm": 0.2689146399497986, + "learning_rate": 4.9056373043654546e-05, + "loss": 0.2041, + "step": 10033 + }, + { + "epoch": 0.17896764527521136, + "grad_norm": 0.2774428427219391, + "learning_rate": 4.905594939393831e-05, + "loss": 0.2238, + "step": 10034 + }, + { + "epoch": 0.17898548139692505, + "grad_norm": 0.23594151437282562, + "learning_rate": 4.905552565097293e-05, + "loss": 0.1941, + "step": 10035 + }, + { + "epoch": 0.17900331751863874, + "grad_norm": 0.24410341680049896, + "learning_rate": 4.905510181476001e-05, + "loss": 0.1933, + "step": 10036 + }, + { + "epoch": 0.17902115364035245, + "grad_norm": 0.3463740944862366, + "learning_rate": 4.905467788530121e-05, + "loss": 0.2435, + "step": 10037 + }, + { + "epoch": 0.17903898976206614, + "grad_norm": 0.20690517127513885, + "learning_rate": 4.905425386259817e-05, + "loss": 0.1466, + "step": 10038 + }, + { + "epoch": 0.17905682588377983, + "grad_norm": 0.44511640071868896, + "learning_rate": 4.905382974665254e-05, + "loss": 0.2505, + "step": 10039 + }, + { + "epoch": 0.17907466200549352, + "grad_norm": 0.38292738795280457, + "learning_rate": 4.9053405537465946e-05, + "loss": 0.2146, + "step": 10040 + }, + { + "epoch": 0.17909249812720723, + "grad_norm": 0.2860642075538635, + "learning_rate": 4.905298123504005e-05, + "loss": 0.2118, + "step": 10041 + }, + { + "epoch": 0.17911033424892092, + "grad_norm": 0.2530413269996643, + "learning_rate": 4.905255683937649e-05, + "loss": 0.1631, + "step": 10042 + }, + { + "epoch": 0.1791281703706346, + "grad_norm": 0.24118545651435852, + "learning_rate": 4.9052132350476916e-05, + "loss": 0.2145, + "step": 10043 + }, + { + "epoch": 0.1791460064923483, + "grad_norm": 0.2952115535736084, + "learning_rate": 4.9051707768342966e-05, + "loss": 0.2511, + "step": 10044 + }, + { + "epoch": 0.17916384261406199, + "grad_norm": 0.26165199279785156, + "learning_rate": 4.905128309297629e-05, + "loss": 0.1825, + "step": 10045 + }, + { + "epoch": 0.1791816787357757, + "grad_norm": 0.35493025183677673, + "learning_rate": 4.905085832437853e-05, + "loss": 0.286, + "step": 10046 + }, + { + "epoch": 0.1791995148574894, + "grad_norm": 0.21601396799087524, + "learning_rate": 4.905043346255135e-05, + "loss": 0.1767, + "step": 10047 + }, + { + "epoch": 0.17921735097920308, + "grad_norm": 0.3951280117034912, + "learning_rate": 4.905000850749637e-05, + "loss": 0.187, + "step": 10048 + }, + { + "epoch": 0.17923518710091677, + "grad_norm": 0.19908763468265533, + "learning_rate": 4.904958345921525e-05, + "loss": 0.2055, + "step": 10049 + }, + { + "epoch": 0.17925302322263048, + "grad_norm": 0.3498621881008148, + "learning_rate": 4.904915831770964e-05, + "loss": 0.2689, + "step": 10050 + }, + { + "epoch": 0.17927085934434417, + "grad_norm": 0.19761058688163757, + "learning_rate": 4.904873308298119e-05, + "loss": 0.2118, + "step": 10051 + }, + { + "epoch": 0.17928869546605786, + "grad_norm": 0.20359058678150177, + "learning_rate": 4.9048307755031544e-05, + "loss": 0.184, + "step": 10052 + }, + { + "epoch": 0.17930653158777154, + "grad_norm": 0.2612164318561554, + "learning_rate": 4.904788233386235e-05, + "loss": 0.229, + "step": 10053 + }, + { + "epoch": 0.17932436770948526, + "grad_norm": 0.4159994423389435, + "learning_rate": 4.904745681947526e-05, + "loss": 0.2232, + "step": 10054 + }, + { + "epoch": 0.17934220383119895, + "grad_norm": 0.26582232117652893, + "learning_rate": 4.904703121187192e-05, + "loss": 0.1779, + "step": 10055 + }, + { + "epoch": 0.17936003995291264, + "grad_norm": 0.2905386686325073, + "learning_rate": 4.904660551105398e-05, + "loss": 0.1675, + "step": 10056 + }, + { + "epoch": 0.17937787607462632, + "grad_norm": 0.2359733134508133, + "learning_rate": 4.904617971702309e-05, + "loss": 0.192, + "step": 10057 + }, + { + "epoch": 0.17939571219634004, + "grad_norm": 0.3787361681461334, + "learning_rate": 4.904575382978091e-05, + "loss": 0.2654, + "step": 10058 + }, + { + "epoch": 0.17941354831805373, + "grad_norm": 0.2540598511695862, + "learning_rate": 4.904532784932907e-05, + "loss": 0.2457, + "step": 10059 + }, + { + "epoch": 0.17943138443976742, + "grad_norm": 0.24077114462852478, + "learning_rate": 4.904490177566925e-05, + "loss": 0.1961, + "step": 10060 + }, + { + "epoch": 0.1794492205614811, + "grad_norm": 0.3214060664176941, + "learning_rate": 4.9044475608803074e-05, + "loss": 0.1926, + "step": 10061 + }, + { + "epoch": 0.17946705668319482, + "grad_norm": 0.2988150417804718, + "learning_rate": 4.904404934873221e-05, + "loss": 0.2524, + "step": 10062 + }, + { + "epoch": 0.1794848928049085, + "grad_norm": 0.2745426297187805, + "learning_rate": 4.9043622995458306e-05, + "loss": 0.1605, + "step": 10063 + }, + { + "epoch": 0.1795027289266222, + "grad_norm": 0.20333455502986908, + "learning_rate": 4.904319654898302e-05, + "loss": 0.1994, + "step": 10064 + }, + { + "epoch": 0.17952056504833588, + "grad_norm": 0.28882116079330444, + "learning_rate": 4.904277000930799e-05, + "loss": 0.2462, + "step": 10065 + }, + { + "epoch": 0.17953840117004957, + "grad_norm": 0.2296663224697113, + "learning_rate": 4.9042343376434887e-05, + "loss": 0.2299, + "step": 10066 + }, + { + "epoch": 0.1795562372917633, + "grad_norm": 0.22529172897338867, + "learning_rate": 4.904191665036535e-05, + "loss": 0.1718, + "step": 10067 + }, + { + "epoch": 0.17957407341347698, + "grad_norm": 0.3339504897594452, + "learning_rate": 4.904148983110105e-05, + "loss": 0.2199, + "step": 10068 + }, + { + "epoch": 0.17959190953519066, + "grad_norm": 0.22459593415260315, + "learning_rate": 4.904106291864362e-05, + "loss": 0.207, + "step": 10069 + }, + { + "epoch": 0.17960974565690435, + "grad_norm": 0.3362443745136261, + "learning_rate": 4.904063591299474e-05, + "loss": 0.2692, + "step": 10070 + }, + { + "epoch": 0.17962758177861807, + "grad_norm": 0.28358936309814453, + "learning_rate": 4.904020881415604e-05, + "loss": 0.2137, + "step": 10071 + }, + { + "epoch": 0.17964541790033176, + "grad_norm": 0.28809306025505066, + "learning_rate": 4.9039781622129185e-05, + "loss": 0.2255, + "step": 10072 + }, + { + "epoch": 0.17966325402204544, + "grad_norm": 0.3335660398006439, + "learning_rate": 4.903935433691584e-05, + "loss": 0.2067, + "step": 10073 + }, + { + "epoch": 0.17968109014375913, + "grad_norm": 0.9616582989692688, + "learning_rate": 4.903892695851766e-05, + "loss": 0.2188, + "step": 10074 + }, + { + "epoch": 0.17969892626547285, + "grad_norm": 0.32171082496643066, + "learning_rate": 4.9038499486936296e-05, + "loss": 0.2343, + "step": 10075 + }, + { + "epoch": 0.17971676238718653, + "grad_norm": 0.3304702341556549, + "learning_rate": 4.90380719221734e-05, + "loss": 0.2034, + "step": 10076 + }, + { + "epoch": 0.17973459850890022, + "grad_norm": 0.2655666768550873, + "learning_rate": 4.9037644264230634e-05, + "loss": 0.2003, + "step": 10077 + }, + { + "epoch": 0.1797524346306139, + "grad_norm": 0.3717540204524994, + "learning_rate": 4.903721651310966e-05, + "loss": 0.2121, + "step": 10078 + }, + { + "epoch": 0.17977027075232763, + "grad_norm": 0.2529218792915344, + "learning_rate": 4.903678866881213e-05, + "loss": 0.2095, + "step": 10079 + }, + { + "epoch": 0.17978810687404131, + "grad_norm": 0.2826620936393738, + "learning_rate": 4.9036360731339706e-05, + "loss": 0.1921, + "step": 10080 + }, + { + "epoch": 0.179805942995755, + "grad_norm": 0.25628387928009033, + "learning_rate": 4.903593270069404e-05, + "loss": 0.2037, + "step": 10081 + }, + { + "epoch": 0.1798237791174687, + "grad_norm": 0.22990234196186066, + "learning_rate": 4.90355045768768e-05, + "loss": 0.1803, + "step": 10082 + }, + { + "epoch": 0.17984161523918238, + "grad_norm": 0.24044960737228394, + "learning_rate": 4.903507635988965e-05, + "loss": 0.1575, + "step": 10083 + }, + { + "epoch": 0.1798594513608961, + "grad_norm": 0.24302862584590912, + "learning_rate": 4.903464804973424e-05, + "loss": 0.2132, + "step": 10084 + }, + { + "epoch": 0.17987728748260978, + "grad_norm": 0.5112969279289246, + "learning_rate": 4.903421964641223e-05, + "loss": 0.1884, + "step": 10085 + }, + { + "epoch": 0.17989512360432347, + "grad_norm": 0.28698021173477173, + "learning_rate": 4.903379114992528e-05, + "loss": 0.194, + "step": 10086 + }, + { + "epoch": 0.17991295972603716, + "grad_norm": 0.28727683424949646, + "learning_rate": 4.9033362560275066e-05, + "loss": 0.2345, + "step": 10087 + }, + { + "epoch": 0.17993079584775087, + "grad_norm": 0.2771809995174408, + "learning_rate": 4.903293387746323e-05, + "loss": 0.2315, + "step": 10088 + }, + { + "epoch": 0.17994863196946456, + "grad_norm": 0.30761608481407166, + "learning_rate": 4.9032505101491436e-05, + "loss": 0.2544, + "step": 10089 + }, + { + "epoch": 0.17996646809117825, + "grad_norm": 0.3372572958469391, + "learning_rate": 4.903207623236136e-05, + "loss": 0.1898, + "step": 10090 + }, + { + "epoch": 0.17998430421289194, + "grad_norm": 0.29283350706100464, + "learning_rate": 4.9031647270074655e-05, + "loss": 0.1629, + "step": 10091 + }, + { + "epoch": 0.18000214033460565, + "grad_norm": 0.2555445730686188, + "learning_rate": 4.903121821463299e-05, + "loss": 0.1916, + "step": 10092 + }, + { + "epoch": 0.18001997645631934, + "grad_norm": 0.25533509254455566, + "learning_rate": 4.903078906603801e-05, + "loss": 0.2242, + "step": 10093 + }, + { + "epoch": 0.18003781257803303, + "grad_norm": 0.33725500106811523, + "learning_rate": 4.90303598242914e-05, + "loss": 0.2191, + "step": 10094 + }, + { + "epoch": 0.18005564869974672, + "grad_norm": 0.28250038623809814, + "learning_rate": 4.902993048939482e-05, + "loss": 0.2508, + "step": 10095 + }, + { + "epoch": 0.18007348482146043, + "grad_norm": 0.2518704831600189, + "learning_rate": 4.902950106134992e-05, + "loss": 0.1816, + "step": 10096 + }, + { + "epoch": 0.18009132094317412, + "grad_norm": 0.32761144638061523, + "learning_rate": 4.902907154015838e-05, + "loss": 0.2432, + "step": 10097 + }, + { + "epoch": 0.1801091570648878, + "grad_norm": 0.27961263060569763, + "learning_rate": 4.902864192582185e-05, + "loss": 0.1908, + "step": 10098 + }, + { + "epoch": 0.1801269931866015, + "grad_norm": 0.22646716237068176, + "learning_rate": 4.902821221834202e-05, + "loss": 0.2251, + "step": 10099 + }, + { + "epoch": 0.1801448293083152, + "grad_norm": 0.30442455410957336, + "learning_rate": 4.902778241772053e-05, + "loss": 0.1881, + "step": 10100 + }, + { + "epoch": 0.1801626654300289, + "grad_norm": 0.34533753991127014, + "learning_rate": 4.9027352523959056e-05, + "loss": 0.2673, + "step": 10101 + }, + { + "epoch": 0.1801805015517426, + "grad_norm": 0.23440693318843842, + "learning_rate": 4.902692253705927e-05, + "loss": 0.2069, + "step": 10102 + }, + { + "epoch": 0.18019833767345628, + "grad_norm": 0.31848660111427307, + "learning_rate": 4.9026492457022834e-05, + "loss": 0.2152, + "step": 10103 + }, + { + "epoch": 0.18021617379516996, + "grad_norm": 0.17989078164100647, + "learning_rate": 4.9026062283851404e-05, + "loss": 0.1923, + "step": 10104 + }, + { + "epoch": 0.18023400991688368, + "grad_norm": 0.24270272254943848, + "learning_rate": 4.9025632017546675e-05, + "loss": 0.1616, + "step": 10105 + }, + { + "epoch": 0.18025184603859737, + "grad_norm": 0.3410046100616455, + "learning_rate": 4.902520165811029e-05, + "loss": 0.2518, + "step": 10106 + }, + { + "epoch": 0.18026968216031106, + "grad_norm": 0.20635966956615448, + "learning_rate": 4.902477120554392e-05, + "loss": 0.1945, + "step": 10107 + }, + { + "epoch": 0.18028751828202474, + "grad_norm": 0.24014919996261597, + "learning_rate": 4.9024340659849244e-05, + "loss": 0.2093, + "step": 10108 + }, + { + "epoch": 0.18030535440373846, + "grad_norm": 0.2237083911895752, + "learning_rate": 4.902391002102792e-05, + "loss": 0.1716, + "step": 10109 + }, + { + "epoch": 0.18032319052545215, + "grad_norm": 0.24098485708236694, + "learning_rate": 4.902347928908163e-05, + "loss": 0.1786, + "step": 10110 + }, + { + "epoch": 0.18034102664716584, + "grad_norm": 0.2782251238822937, + "learning_rate": 4.902304846401204e-05, + "loss": 0.1624, + "step": 10111 + }, + { + "epoch": 0.18035886276887952, + "grad_norm": 0.2588271498680115, + "learning_rate": 4.9022617545820815e-05, + "loss": 0.2122, + "step": 10112 + }, + { + "epoch": 0.18037669889059324, + "grad_norm": 0.3134334683418274, + "learning_rate": 4.9022186534509626e-05, + "loss": 0.2113, + "step": 10113 + }, + { + "epoch": 0.18039453501230693, + "grad_norm": 0.24441859126091003, + "learning_rate": 4.902175543008014e-05, + "loss": 0.2083, + "step": 10114 + }, + { + "epoch": 0.18041237113402062, + "grad_norm": 0.25083234906196594, + "learning_rate": 4.902132423253404e-05, + "loss": 0.1869, + "step": 10115 + }, + { + "epoch": 0.1804302072557343, + "grad_norm": 0.23445671796798706, + "learning_rate": 4.9020892941872985e-05, + "loss": 0.1789, + "step": 10116 + }, + { + "epoch": 0.18044804337744802, + "grad_norm": 0.2497745305299759, + "learning_rate": 4.9020461558098655e-05, + "loss": 0.1765, + "step": 10117 + }, + { + "epoch": 0.1804658794991617, + "grad_norm": 0.31935933232307434, + "learning_rate": 4.902003008121272e-05, + "loss": 0.2159, + "step": 10118 + }, + { + "epoch": 0.1804837156208754, + "grad_norm": 0.2808605134487152, + "learning_rate": 4.9019598511216844e-05, + "loss": 0.2251, + "step": 10119 + }, + { + "epoch": 0.18050155174258908, + "grad_norm": 0.28707271814346313, + "learning_rate": 4.901916684811272e-05, + "loss": 0.1909, + "step": 10120 + }, + { + "epoch": 0.1805193878643028, + "grad_norm": 0.2506902515888214, + "learning_rate": 4.9018735091902005e-05, + "loss": 0.1728, + "step": 10121 + }, + { + "epoch": 0.1805372239860165, + "grad_norm": 0.3061169385910034, + "learning_rate": 4.901830324258638e-05, + "loss": 0.2428, + "step": 10122 + }, + { + "epoch": 0.18055506010773018, + "grad_norm": 0.18775995075702667, + "learning_rate": 4.901787130016751e-05, + "loss": 0.213, + "step": 10123 + }, + { + "epoch": 0.18057289622944386, + "grad_norm": 0.20270133018493652, + "learning_rate": 4.901743926464708e-05, + "loss": 0.2191, + "step": 10124 + }, + { + "epoch": 0.18059073235115755, + "grad_norm": 0.307338148355484, + "learning_rate": 4.9017007136026763e-05, + "loss": 0.1448, + "step": 10125 + }, + { + "epoch": 0.18060856847287127, + "grad_norm": 0.26945802569389343, + "learning_rate": 4.9016574914308224e-05, + "loss": 0.1872, + "step": 10126 + }, + { + "epoch": 0.18062640459458496, + "grad_norm": 0.46143609285354614, + "learning_rate": 4.901614259949315e-05, + "loss": 0.2883, + "step": 10127 + }, + { + "epoch": 0.18064424071629864, + "grad_norm": 0.2881545126438141, + "learning_rate": 4.9015710191583206e-05, + "loss": 0.2017, + "step": 10128 + }, + { + "epoch": 0.18066207683801233, + "grad_norm": 0.23882745206356049, + "learning_rate": 4.901527769058008e-05, + "loss": 0.1722, + "step": 10129 + }, + { + "epoch": 0.18067991295972605, + "grad_norm": 0.29898542165756226, + "learning_rate": 4.901484509648544e-05, + "loss": 0.2052, + "step": 10130 + }, + { + "epoch": 0.18069774908143973, + "grad_norm": 0.26189887523651123, + "learning_rate": 4.9014412409300966e-05, + "loss": 0.1923, + "step": 10131 + }, + { + "epoch": 0.18071558520315342, + "grad_norm": 0.29160553216934204, + "learning_rate": 4.901397962902834e-05, + "loss": 0.2391, + "step": 10132 + }, + { + "epoch": 0.1807334213248671, + "grad_norm": 0.39594241976737976, + "learning_rate": 4.9013546755669236e-05, + "loss": 0.2531, + "step": 10133 + }, + { + "epoch": 0.18075125744658083, + "grad_norm": 0.2338830679655075, + "learning_rate": 4.901311378922532e-05, + "loss": 0.2227, + "step": 10134 + }, + { + "epoch": 0.18076909356829451, + "grad_norm": 0.32091307640075684, + "learning_rate": 4.901268072969829e-05, + "loss": 0.2462, + "step": 10135 + }, + { + "epoch": 0.1807869296900082, + "grad_norm": 0.27749884128570557, + "learning_rate": 4.9012247577089815e-05, + "loss": 0.205, + "step": 10136 + }, + { + "epoch": 0.1808047658117219, + "grad_norm": 0.2812494933605194, + "learning_rate": 4.9011814331401575e-05, + "loss": 0.2368, + "step": 10137 + }, + { + "epoch": 0.1808226019334356, + "grad_norm": 0.24878400564193726, + "learning_rate": 4.901138099263525e-05, + "loss": 0.2003, + "step": 10138 + }, + { + "epoch": 0.1808404380551493, + "grad_norm": 0.33797627687454224, + "learning_rate": 4.901094756079251e-05, + "loss": 0.257, + "step": 10139 + }, + { + "epoch": 0.18085827417686298, + "grad_norm": 0.32351264357566833, + "learning_rate": 4.901051403587506e-05, + "loss": 0.2164, + "step": 10140 + }, + { + "epoch": 0.18087611029857667, + "grad_norm": 0.2124016135931015, + "learning_rate": 4.901008041788455e-05, + "loss": 0.1737, + "step": 10141 + }, + { + "epoch": 0.18089394642029039, + "grad_norm": 0.22003863751888275, + "learning_rate": 4.900964670682268e-05, + "loss": 0.185, + "step": 10142 + }, + { + "epoch": 0.18091178254200407, + "grad_norm": 0.269803524017334, + "learning_rate": 4.900921290269113e-05, + "loss": 0.2116, + "step": 10143 + }, + { + "epoch": 0.18092961866371776, + "grad_norm": 0.3971666097640991, + "learning_rate": 4.900877900549158e-05, + "loss": 0.2402, + "step": 10144 + }, + { + "epoch": 0.18094745478543145, + "grad_norm": 0.2891790568828583, + "learning_rate": 4.90083450152257e-05, + "loss": 0.1826, + "step": 10145 + }, + { + "epoch": 0.18096529090714514, + "grad_norm": 0.44094592332839966, + "learning_rate": 4.900791093189519e-05, + "loss": 0.2428, + "step": 10146 + }, + { + "epoch": 0.18098312702885885, + "grad_norm": 0.3213162422180176, + "learning_rate": 4.900747675550172e-05, + "loss": 0.2035, + "step": 10147 + }, + { + "epoch": 0.18100096315057254, + "grad_norm": 0.3137926459312439, + "learning_rate": 4.900704248604698e-05, + "loss": 0.1753, + "step": 10148 + }, + { + "epoch": 0.18101879927228623, + "grad_norm": 0.3039524555206299, + "learning_rate": 4.900660812353266e-05, + "loss": 0.2506, + "step": 10149 + }, + { + "epoch": 0.18103663539399992, + "grad_norm": 0.32710862159729004, + "learning_rate": 4.900617366796043e-05, + "loss": 0.1822, + "step": 10150 + }, + { + "epoch": 0.18105447151571363, + "grad_norm": 0.3087460398674011, + "learning_rate": 4.900573911933197e-05, + "loss": 0.1597, + "step": 10151 + }, + { + "epoch": 0.18107230763742732, + "grad_norm": 0.2529021203517914, + "learning_rate": 4.900530447764899e-05, + "loss": 0.2498, + "step": 10152 + }, + { + "epoch": 0.181090143759141, + "grad_norm": 0.3058854937553406, + "learning_rate": 4.900486974291315e-05, + "loss": 0.2046, + "step": 10153 + }, + { + "epoch": 0.1811079798808547, + "grad_norm": 0.4110319912433624, + "learning_rate": 4.9004434915126144e-05, + "loss": 0.1871, + "step": 10154 + }, + { + "epoch": 0.1811258160025684, + "grad_norm": 0.24077773094177246, + "learning_rate": 4.900399999428966e-05, + "loss": 0.2313, + "step": 10155 + }, + { + "epoch": 0.1811436521242821, + "grad_norm": 0.37185460329055786, + "learning_rate": 4.900356498040538e-05, + "loss": 0.2642, + "step": 10156 + }, + { + "epoch": 0.1811614882459958, + "grad_norm": 0.252057820558548, + "learning_rate": 4.900312987347498e-05, + "loss": 0.1816, + "step": 10157 + }, + { + "epoch": 0.18117932436770948, + "grad_norm": 0.3014383912086487, + "learning_rate": 4.900269467350018e-05, + "loss": 0.2613, + "step": 10158 + }, + { + "epoch": 0.1811971604894232, + "grad_norm": 0.3230541944503784, + "learning_rate": 4.900225938048263e-05, + "loss": 0.2529, + "step": 10159 + }, + { + "epoch": 0.18121499661113688, + "grad_norm": 0.4753343462944031, + "learning_rate": 4.900182399442404e-05, + "loss": 0.1648, + "step": 10160 + }, + { + "epoch": 0.18123283273285057, + "grad_norm": 0.3304778039455414, + "learning_rate": 4.9001388515326085e-05, + "loss": 0.1849, + "step": 10161 + }, + { + "epoch": 0.18125066885456426, + "grad_norm": 0.22614115476608276, + "learning_rate": 4.900095294319046e-05, + "loss": 0.2063, + "step": 10162 + }, + { + "epoch": 0.18126850497627794, + "grad_norm": 0.23606637120246887, + "learning_rate": 4.900051727801885e-05, + "loss": 0.214, + "step": 10163 + }, + { + "epoch": 0.18128634109799166, + "grad_norm": 0.28194326162338257, + "learning_rate": 4.900008151981295e-05, + "loss": 0.2027, + "step": 10164 + }, + { + "epoch": 0.18130417721970535, + "grad_norm": 0.337079793214798, + "learning_rate": 4.899964566857444e-05, + "loss": 0.204, + "step": 10165 + }, + { + "epoch": 0.18132201334141904, + "grad_norm": 0.31034016609191895, + "learning_rate": 4.899920972430502e-05, + "loss": 0.2246, + "step": 10166 + }, + { + "epoch": 0.18133984946313272, + "grad_norm": 0.32246097922325134, + "learning_rate": 4.899877368700637e-05, + "loss": 0.1863, + "step": 10167 + }, + { + "epoch": 0.18135768558484644, + "grad_norm": 0.3250961899757385, + "learning_rate": 4.8998337556680186e-05, + "loss": 0.1845, + "step": 10168 + }, + { + "epoch": 0.18137552170656013, + "grad_norm": 0.3406941592693329, + "learning_rate": 4.8997901333328156e-05, + "loss": 0.2148, + "step": 10169 + }, + { + "epoch": 0.18139335782827382, + "grad_norm": 0.27561306953430176, + "learning_rate": 4.899746501695197e-05, + "loss": 0.2462, + "step": 10170 + }, + { + "epoch": 0.1814111939499875, + "grad_norm": 0.2547914385795593, + "learning_rate": 4.8997028607553316e-05, + "loss": 0.2442, + "step": 10171 + }, + { + "epoch": 0.18142903007170122, + "grad_norm": 0.24809367954730988, + "learning_rate": 4.89965921051339e-05, + "loss": 0.258, + "step": 10172 + }, + { + "epoch": 0.1814468661934149, + "grad_norm": 0.39914289116859436, + "learning_rate": 4.899615550969541e-05, + "loss": 0.2125, + "step": 10173 + }, + { + "epoch": 0.1814647023151286, + "grad_norm": 0.23397384583950043, + "learning_rate": 4.8995718821239525e-05, + "loss": 0.2111, + "step": 10174 + }, + { + "epoch": 0.18148253843684228, + "grad_norm": 0.28204038739204407, + "learning_rate": 4.8995282039767945e-05, + "loss": 0.2132, + "step": 10175 + }, + { + "epoch": 0.181500374558556, + "grad_norm": 0.26229846477508545, + "learning_rate": 4.899484516528236e-05, + "loss": 0.1764, + "step": 10176 + }, + { + "epoch": 0.1815182106802697, + "grad_norm": 0.182540163397789, + "learning_rate": 4.899440819778448e-05, + "loss": 0.1721, + "step": 10177 + }, + { + "epoch": 0.18153604680198338, + "grad_norm": 0.3202981948852539, + "learning_rate": 4.899397113727597e-05, + "loss": 0.2177, + "step": 10178 + }, + { + "epoch": 0.18155388292369706, + "grad_norm": 0.219949871301651, + "learning_rate": 4.8993533983758554e-05, + "loss": 0.1655, + "step": 10179 + }, + { + "epoch": 0.18157171904541078, + "grad_norm": 0.275773286819458, + "learning_rate": 4.8993096737233915e-05, + "loss": 0.2187, + "step": 10180 + }, + { + "epoch": 0.18158955516712447, + "grad_norm": 0.2234012335538864, + "learning_rate": 4.899265939770374e-05, + "loss": 0.1435, + "step": 10181 + }, + { + "epoch": 0.18160739128883815, + "grad_norm": 0.2315882295370102, + "learning_rate": 4.899222196516973e-05, + "loss": 0.198, + "step": 10182 + }, + { + "epoch": 0.18162522741055184, + "grad_norm": 0.2787816524505615, + "learning_rate": 4.899178443963358e-05, + "loss": 0.1807, + "step": 10183 + }, + { + "epoch": 0.18164306353226553, + "grad_norm": 0.2763764560222626, + "learning_rate": 4.899134682109699e-05, + "loss": 0.1814, + "step": 10184 + }, + { + "epoch": 0.18166089965397925, + "grad_norm": 0.31553885340690613, + "learning_rate": 4.8990909109561655e-05, + "loss": 0.2394, + "step": 10185 + }, + { + "epoch": 0.18167873577569293, + "grad_norm": 0.27082380652427673, + "learning_rate": 4.899047130502926e-05, + "loss": 0.2341, + "step": 10186 + }, + { + "epoch": 0.18169657189740662, + "grad_norm": 0.48682701587677, + "learning_rate": 4.899003340750152e-05, + "loss": 0.2737, + "step": 10187 + }, + { + "epoch": 0.1817144080191203, + "grad_norm": 0.30625849962234497, + "learning_rate": 4.8989595416980126e-05, + "loss": 0.2232, + "step": 10188 + }, + { + "epoch": 0.18173224414083403, + "grad_norm": 0.30230265855789185, + "learning_rate": 4.898915733346677e-05, + "loss": 0.2432, + "step": 10189 + }, + { + "epoch": 0.18175008026254771, + "grad_norm": 0.34137627482414246, + "learning_rate": 4.898871915696316e-05, + "loss": 0.2677, + "step": 10190 + }, + { + "epoch": 0.1817679163842614, + "grad_norm": 0.20676808059215546, + "learning_rate": 4.898828088747099e-05, + "loss": 0.2142, + "step": 10191 + }, + { + "epoch": 0.1817857525059751, + "grad_norm": 0.26453015208244324, + "learning_rate": 4.8987842524991956e-05, + "loss": 0.2176, + "step": 10192 + }, + { + "epoch": 0.1818035886276888, + "grad_norm": 0.35668259859085083, + "learning_rate": 4.898740406952775e-05, + "loss": 0.2253, + "step": 10193 + }, + { + "epoch": 0.1818214247494025, + "grad_norm": 0.275824636220932, + "learning_rate": 4.8986965521080095e-05, + "loss": 0.2202, + "step": 10194 + }, + { + "epoch": 0.18183926087111618, + "grad_norm": 0.2264062613248825, + "learning_rate": 4.898652687965067e-05, + "loss": 0.1946, + "step": 10195 + }, + { + "epoch": 0.18185709699282987, + "grad_norm": 0.3434748649597168, + "learning_rate": 4.898608814524118e-05, + "loss": 0.2257, + "step": 10196 + }, + { + "epoch": 0.18187493311454359, + "grad_norm": 0.49679169058799744, + "learning_rate": 4.898564931785333e-05, + "loss": 0.2124, + "step": 10197 + }, + { + "epoch": 0.18189276923625727, + "grad_norm": 0.20148178935050964, + "learning_rate": 4.8985210397488825e-05, + "loss": 0.1772, + "step": 10198 + }, + { + "epoch": 0.18191060535797096, + "grad_norm": 0.2926502227783203, + "learning_rate": 4.898477138414935e-05, + "loss": 0.2321, + "step": 10199 + }, + { + "epoch": 0.18192844147968465, + "grad_norm": 0.29967305064201355, + "learning_rate": 4.898433227783662e-05, + "loss": 0.188, + "step": 10200 + }, + { + "epoch": 0.18194627760139837, + "grad_norm": 0.2732852101325989, + "learning_rate": 4.8983893078552336e-05, + "loss": 0.1824, + "step": 10201 + }, + { + "epoch": 0.18196411372311205, + "grad_norm": 0.2793472111225128, + "learning_rate": 4.898345378629819e-05, + "loss": 0.1622, + "step": 10202 + }, + { + "epoch": 0.18198194984482574, + "grad_norm": 0.29475104808807373, + "learning_rate": 4.898301440107591e-05, + "loss": 0.2124, + "step": 10203 + }, + { + "epoch": 0.18199978596653943, + "grad_norm": 0.28337574005126953, + "learning_rate": 4.898257492288718e-05, + "loss": 0.2164, + "step": 10204 + }, + { + "epoch": 0.18201762208825312, + "grad_norm": 0.34891387820243835, + "learning_rate": 4.898213535173369e-05, + "loss": 0.2182, + "step": 10205 + }, + { + "epoch": 0.18203545820996683, + "grad_norm": 0.326509028673172, + "learning_rate": 4.898169568761718e-05, + "loss": 0.2226, + "step": 10206 + }, + { + "epoch": 0.18205329433168052, + "grad_norm": 0.2673135995864868, + "learning_rate": 4.898125593053932e-05, + "loss": 0.1923, + "step": 10207 + }, + { + "epoch": 0.1820711304533942, + "grad_norm": 0.3589719831943512, + "learning_rate": 4.8980816080501836e-05, + "loss": 0.3074, + "step": 10208 + }, + { + "epoch": 0.1820889665751079, + "grad_norm": 0.24968448281288147, + "learning_rate": 4.8980376137506425e-05, + "loss": 0.172, + "step": 10209 + }, + { + "epoch": 0.1821068026968216, + "grad_norm": 0.29482802748680115, + "learning_rate": 4.897993610155479e-05, + "loss": 0.2223, + "step": 10210 + }, + { + "epoch": 0.1821246388185353, + "grad_norm": 0.3561578392982483, + "learning_rate": 4.8979495972648645e-05, + "loss": 0.2261, + "step": 10211 + }, + { + "epoch": 0.182142474940249, + "grad_norm": 0.27590808272361755, + "learning_rate": 4.897905575078969e-05, + "loss": 0.2053, + "step": 10212 + }, + { + "epoch": 0.18216031106196268, + "grad_norm": 0.2933117747306824, + "learning_rate": 4.8978615435979635e-05, + "loss": 0.2206, + "step": 10213 + }, + { + "epoch": 0.1821781471836764, + "grad_norm": 0.2966252565383911, + "learning_rate": 4.897817502822018e-05, + "loss": 0.199, + "step": 10214 + }, + { + "epoch": 0.18219598330539008, + "grad_norm": 0.29027706384658813, + "learning_rate": 4.897773452751304e-05, + "loss": 0.1623, + "step": 10215 + }, + { + "epoch": 0.18221381942710377, + "grad_norm": 0.271456241607666, + "learning_rate": 4.897729393385992e-05, + "loss": 0.1994, + "step": 10216 + }, + { + "epoch": 0.18223165554881746, + "grad_norm": 0.33194246888160706, + "learning_rate": 4.8976853247262524e-05, + "loss": 0.2635, + "step": 10217 + }, + { + "epoch": 0.18224949167053117, + "grad_norm": 0.3467560112476349, + "learning_rate": 4.897641246772257e-05, + "loss": 0.2193, + "step": 10218 + }, + { + "epoch": 0.18226732779224486, + "grad_norm": 0.2434745728969574, + "learning_rate": 4.897597159524175e-05, + "loss": 0.2056, + "step": 10219 + }, + { + "epoch": 0.18228516391395855, + "grad_norm": 0.3007213771343231, + "learning_rate": 4.8975530629821784e-05, + "loss": 0.2594, + "step": 10220 + }, + { + "epoch": 0.18230300003567224, + "grad_norm": 0.265828013420105, + "learning_rate": 4.8975089571464386e-05, + "loss": 0.2334, + "step": 10221 + }, + { + "epoch": 0.18232083615738595, + "grad_norm": 0.2535021901130676, + "learning_rate": 4.8974648420171264e-05, + "loss": 0.2272, + "step": 10222 + }, + { + "epoch": 0.18233867227909964, + "grad_norm": 0.3300265371799469, + "learning_rate": 4.897420717594412e-05, + "loss": 0.212, + "step": 10223 + }, + { + "epoch": 0.18235650840081333, + "grad_norm": 0.40497615933418274, + "learning_rate": 4.897376583878467e-05, + "loss": 0.2154, + "step": 10224 + }, + { + "epoch": 0.18237434452252702, + "grad_norm": 0.28284552693367004, + "learning_rate": 4.8973324408694617e-05, + "loss": 0.2036, + "step": 10225 + }, + { + "epoch": 0.1823921806442407, + "grad_norm": 0.2925427556037903, + "learning_rate": 4.897288288567568e-05, + "loss": 0.2122, + "step": 10226 + }, + { + "epoch": 0.18241001676595442, + "grad_norm": 0.4144842028617859, + "learning_rate": 4.8972441269729576e-05, + "loss": 0.2253, + "step": 10227 + }, + { + "epoch": 0.1824278528876681, + "grad_norm": 0.267392098903656, + "learning_rate": 4.8971999560858e-05, + "loss": 0.2377, + "step": 10228 + }, + { + "epoch": 0.1824456890093818, + "grad_norm": 0.33622896671295166, + "learning_rate": 4.897155775906268e-05, + "loss": 0.2393, + "step": 10229 + }, + { + "epoch": 0.18246352513109548, + "grad_norm": 0.2719358205795288, + "learning_rate": 4.897111586434532e-05, + "loss": 0.1879, + "step": 10230 + }, + { + "epoch": 0.1824813612528092, + "grad_norm": 0.22177031636238098, + "learning_rate": 4.8970673876707643e-05, + "loss": 0.1907, + "step": 10231 + }, + { + "epoch": 0.1824991973745229, + "grad_norm": 0.2801766097545624, + "learning_rate": 4.8970231796151345e-05, + "loss": 0.1349, + "step": 10232 + }, + { + "epoch": 0.18251703349623657, + "grad_norm": 0.32964321970939636, + "learning_rate": 4.8969789622678155e-05, + "loss": 0.2377, + "step": 10233 + }, + { + "epoch": 0.18253486961795026, + "grad_norm": 0.26035168766975403, + "learning_rate": 4.896934735628978e-05, + "loss": 0.233, + "step": 10234 + }, + { + "epoch": 0.18255270573966398, + "grad_norm": 0.30898937582969666, + "learning_rate": 4.8968904996987936e-05, + "loss": 0.2615, + "step": 10235 + }, + { + "epoch": 0.18257054186137767, + "grad_norm": 0.3576295077800751, + "learning_rate": 4.896846254477434e-05, + "loss": 0.1934, + "step": 10236 + }, + { + "epoch": 0.18258837798309135, + "grad_norm": 0.2590436339378357, + "learning_rate": 4.89680199996507e-05, + "loss": 0.2266, + "step": 10237 + }, + { + "epoch": 0.18260621410480504, + "grad_norm": 0.25483256578445435, + "learning_rate": 4.896757736161874e-05, + "loss": 0.1919, + "step": 10238 + }, + { + "epoch": 0.18262405022651876, + "grad_norm": 0.3020104467868805, + "learning_rate": 4.896713463068017e-05, + "loss": 0.2111, + "step": 10239 + }, + { + "epoch": 0.18264188634823245, + "grad_norm": 0.23808705806732178, + "learning_rate": 4.896669180683671e-05, + "loss": 0.2008, + "step": 10240 + }, + { + "epoch": 0.18265972246994613, + "grad_norm": 0.37742194533348083, + "learning_rate": 4.8966248890090075e-05, + "loss": 0.2515, + "step": 10241 + }, + { + "epoch": 0.18267755859165982, + "grad_norm": 0.3267439901828766, + "learning_rate": 4.896580588044198e-05, + "loss": 0.2342, + "step": 10242 + }, + { + "epoch": 0.18269539471337354, + "grad_norm": 0.25319239497184753, + "learning_rate": 4.896536277789414e-05, + "loss": 0.206, + "step": 10243 + }, + { + "epoch": 0.18271323083508723, + "grad_norm": 0.4356033504009247, + "learning_rate": 4.896491958244828e-05, + "loss": 0.2916, + "step": 10244 + }, + { + "epoch": 0.1827310669568009, + "grad_norm": 0.4053899347782135, + "learning_rate": 4.896447629410612e-05, + "loss": 0.2414, + "step": 10245 + }, + { + "epoch": 0.1827489030785146, + "grad_norm": 0.28319886326789856, + "learning_rate": 4.8964032912869364e-05, + "loss": 0.1892, + "step": 10246 + }, + { + "epoch": 0.1827667392002283, + "grad_norm": 0.23461323976516724, + "learning_rate": 4.8963589438739746e-05, + "loss": 0.1816, + "step": 10247 + }, + { + "epoch": 0.182784575321942, + "grad_norm": 0.528769850730896, + "learning_rate": 4.896314587171897e-05, + "loss": 0.2609, + "step": 10248 + }, + { + "epoch": 0.1828024114436557, + "grad_norm": 0.300067663192749, + "learning_rate": 4.896270221180878e-05, + "loss": 0.2166, + "step": 10249 + }, + { + "epoch": 0.18282024756536938, + "grad_norm": 0.28619349002838135, + "learning_rate": 4.896225845901087e-05, + "loss": 0.2043, + "step": 10250 + }, + { + "epoch": 0.18283808368708307, + "grad_norm": 0.2763802409172058, + "learning_rate": 4.896181461332696e-05, + "loss": 0.1843, + "step": 10251 + }, + { + "epoch": 0.18285591980879679, + "grad_norm": 0.30844414234161377, + "learning_rate": 4.896137067475879e-05, + "loss": 0.2434, + "step": 10252 + }, + { + "epoch": 0.18287375593051047, + "grad_norm": 0.32825011014938354, + "learning_rate": 4.896092664330808e-05, + "loss": 0.2587, + "step": 10253 + }, + { + "epoch": 0.18289159205222416, + "grad_norm": 0.2965489327907562, + "learning_rate": 4.896048251897652e-05, + "loss": 0.2666, + "step": 10254 + }, + { + "epoch": 0.18290942817393785, + "grad_norm": 0.2501685619354248, + "learning_rate": 4.896003830176588e-05, + "loss": 0.2091, + "step": 10255 + }, + { + "epoch": 0.18292726429565156, + "grad_norm": 0.26090285181999207, + "learning_rate": 4.895959399167784e-05, + "loss": 0.2426, + "step": 10256 + }, + { + "epoch": 0.18294510041736525, + "grad_norm": 0.2957301139831543, + "learning_rate": 4.895914958871414e-05, + "loss": 0.2544, + "step": 10257 + }, + { + "epoch": 0.18296293653907894, + "grad_norm": 0.21864773333072662, + "learning_rate": 4.895870509287651e-05, + "loss": 0.2117, + "step": 10258 + }, + { + "epoch": 0.18298077266079263, + "grad_norm": 0.26415616273880005, + "learning_rate": 4.8958260504166654e-05, + "loss": 0.2288, + "step": 10259 + }, + { + "epoch": 0.18299860878250634, + "grad_norm": 0.27247998118400574, + "learning_rate": 4.8957815822586304e-05, + "loss": 0.2024, + "step": 10260 + }, + { + "epoch": 0.18301644490422003, + "grad_norm": 0.2134028524160385, + "learning_rate": 4.895737104813719e-05, + "loss": 0.1938, + "step": 10261 + }, + { + "epoch": 0.18303428102593372, + "grad_norm": 0.22990845143795013, + "learning_rate": 4.895692618082103e-05, + "loss": 0.2045, + "step": 10262 + }, + { + "epoch": 0.1830521171476474, + "grad_norm": 0.3156064748764038, + "learning_rate": 4.895648122063955e-05, + "loss": 0.2366, + "step": 10263 + }, + { + "epoch": 0.1830699532693611, + "grad_norm": 0.2991919219493866, + "learning_rate": 4.8956036167594476e-05, + "loss": 0.249, + "step": 10264 + }, + { + "epoch": 0.1830877893910748, + "grad_norm": 0.2450971007347107, + "learning_rate": 4.895559102168754e-05, + "loss": 0.2049, + "step": 10265 + }, + { + "epoch": 0.1831056255127885, + "grad_norm": 0.302469402551651, + "learning_rate": 4.895514578292044e-05, + "loss": 0.237, + "step": 10266 + }, + { + "epoch": 0.1831234616345022, + "grad_norm": 0.3612455129623413, + "learning_rate": 4.8954700451294933e-05, + "loss": 0.2851, + "step": 10267 + }, + { + "epoch": 0.18314129775621588, + "grad_norm": 0.4785887897014618, + "learning_rate": 4.8954255026812737e-05, + "loss": 0.3032, + "step": 10268 + }, + { + "epoch": 0.1831591338779296, + "grad_norm": 0.31391844153404236, + "learning_rate": 4.895380950947557e-05, + "loss": 0.2405, + "step": 10269 + }, + { + "epoch": 0.18317696999964328, + "grad_norm": 0.27950945496559143, + "learning_rate": 4.895336389928516e-05, + "loss": 0.28, + "step": 10270 + }, + { + "epoch": 0.18319480612135697, + "grad_norm": 0.36554351449012756, + "learning_rate": 4.895291819624324e-05, + "loss": 0.2614, + "step": 10271 + }, + { + "epoch": 0.18321264224307066, + "grad_norm": 0.2411484569311142, + "learning_rate": 4.895247240035154e-05, + "loss": 0.1321, + "step": 10272 + }, + { + "epoch": 0.18323047836478437, + "grad_norm": 0.31390002369880676, + "learning_rate": 4.895202651161178e-05, + "loss": 0.2091, + "step": 10273 + }, + { + "epoch": 0.18324831448649806, + "grad_norm": 0.3174966275691986, + "learning_rate": 4.8951580530025696e-05, + "loss": 0.218, + "step": 10274 + }, + { + "epoch": 0.18326615060821175, + "grad_norm": 0.25548991560935974, + "learning_rate": 4.895113445559501e-05, + "loss": 0.2092, + "step": 10275 + }, + { + "epoch": 0.18328398672992544, + "grad_norm": 0.20418310165405273, + "learning_rate": 4.8950688288321456e-05, + "loss": 0.1875, + "step": 10276 + }, + { + "epoch": 0.18330182285163915, + "grad_norm": 0.2449674755334854, + "learning_rate": 4.895024202820676e-05, + "loss": 0.2224, + "step": 10277 + }, + { + "epoch": 0.18331965897335284, + "grad_norm": 0.28462448716163635, + "learning_rate": 4.8949795675252656e-05, + "loss": 0.2413, + "step": 10278 + }, + { + "epoch": 0.18333749509506653, + "grad_norm": 0.29260769486427307, + "learning_rate": 4.894934922946087e-05, + "loss": 0.2203, + "step": 10279 + }, + { + "epoch": 0.18335533121678022, + "grad_norm": 0.2456568330526352, + "learning_rate": 4.894890269083314e-05, + "loss": 0.2272, + "step": 10280 + }, + { + "epoch": 0.18337316733849393, + "grad_norm": 0.3558341860771179, + "learning_rate": 4.894845605937118e-05, + "loss": 0.1961, + "step": 10281 + }, + { + "epoch": 0.18339100346020762, + "grad_norm": 0.23935003578662872, + "learning_rate": 4.894800933507675e-05, + "loss": 0.2079, + "step": 10282 + }, + { + "epoch": 0.1834088395819213, + "grad_norm": 0.30422544479370117, + "learning_rate": 4.894756251795155e-05, + "loss": 0.195, + "step": 10283 + }, + { + "epoch": 0.183426675703635, + "grad_norm": 0.24865223467350006, + "learning_rate": 4.894711560799733e-05, + "loss": 0.1899, + "step": 10284 + }, + { + "epoch": 0.18344451182534868, + "grad_norm": 0.20240098237991333, + "learning_rate": 4.8946668605215824e-05, + "loss": 0.1628, + "step": 10285 + }, + { + "epoch": 0.1834623479470624, + "grad_norm": 0.25108715891838074, + "learning_rate": 4.894622150960875e-05, + "loss": 0.1887, + "step": 10286 + }, + { + "epoch": 0.1834801840687761, + "grad_norm": 0.29093337059020996, + "learning_rate": 4.894577432117786e-05, + "loss": 0.1975, + "step": 10287 + }, + { + "epoch": 0.18349802019048977, + "grad_norm": 0.23927940428256989, + "learning_rate": 4.894532703992487e-05, + "loss": 0.1879, + "step": 10288 + }, + { + "epoch": 0.18351585631220346, + "grad_norm": 0.23843155801296234, + "learning_rate": 4.894487966585153e-05, + "loss": 0.2221, + "step": 10289 + }, + { + "epoch": 0.18353369243391718, + "grad_norm": 0.2642580568790436, + "learning_rate": 4.894443219895957e-05, + "loss": 0.2257, + "step": 10290 + }, + { + "epoch": 0.18355152855563087, + "grad_norm": 0.2536928951740265, + "learning_rate": 4.8943984639250704e-05, + "loss": 0.2286, + "step": 10291 + }, + { + "epoch": 0.18356936467734455, + "grad_norm": 0.2608407437801361, + "learning_rate": 4.894353698672669e-05, + "loss": 0.1545, + "step": 10292 + }, + { + "epoch": 0.18358720079905824, + "grad_norm": 0.3679421842098236, + "learning_rate": 4.8943089241389264e-05, + "loss": 0.2385, + "step": 10293 + }, + { + "epoch": 0.18360503692077196, + "grad_norm": 0.22421351075172424, + "learning_rate": 4.894264140324015e-05, + "loss": 0.1818, + "step": 10294 + }, + { + "epoch": 0.18362287304248565, + "grad_norm": 0.2700023949146271, + "learning_rate": 4.894219347228109e-05, + "loss": 0.1828, + "step": 10295 + }, + { + "epoch": 0.18364070916419933, + "grad_norm": 0.2802733778953552, + "learning_rate": 4.8941745448513814e-05, + "loss": 0.204, + "step": 10296 + }, + { + "epoch": 0.18365854528591302, + "grad_norm": 0.29686352610588074, + "learning_rate": 4.8941297331940066e-05, + "loss": 0.1993, + "step": 10297 + }, + { + "epoch": 0.18367638140762674, + "grad_norm": 0.3029332160949707, + "learning_rate": 4.894084912256158e-05, + "loss": 0.2077, + "step": 10298 + }, + { + "epoch": 0.18369421752934043, + "grad_norm": 0.2746410667896271, + "learning_rate": 4.8940400820380097e-05, + "loss": 0.1405, + "step": 10299 + }, + { + "epoch": 0.1837120536510541, + "grad_norm": 0.2549181878566742, + "learning_rate": 4.893995242539735e-05, + "loss": 0.2129, + "step": 10300 + }, + { + "epoch": 0.1837298897727678, + "grad_norm": 0.23572704195976257, + "learning_rate": 4.893950393761508e-05, + "loss": 0.152, + "step": 10301 + }, + { + "epoch": 0.18374772589448152, + "grad_norm": 0.2893717288970947, + "learning_rate": 4.893905535703502e-05, + "loss": 0.2152, + "step": 10302 + }, + { + "epoch": 0.1837655620161952, + "grad_norm": 0.4165283143520355, + "learning_rate": 4.8938606683658915e-05, + "loss": 0.3003, + "step": 10303 + }, + { + "epoch": 0.1837833981379089, + "grad_norm": 0.22356657683849335, + "learning_rate": 4.8938157917488505e-05, + "loss": 0.1867, + "step": 10304 + }, + { + "epoch": 0.18380123425962258, + "grad_norm": 0.26489782333374023, + "learning_rate": 4.893770905852553e-05, + "loss": 0.1968, + "step": 10305 + }, + { + "epoch": 0.18381907038133627, + "grad_norm": 0.20398347079753876, + "learning_rate": 4.893726010677172e-05, + "loss": 0.1736, + "step": 10306 + }, + { + "epoch": 0.18383690650304998, + "grad_norm": 0.22316183149814606, + "learning_rate": 4.893681106222882e-05, + "loss": 0.1809, + "step": 10307 + }, + { + "epoch": 0.18385474262476367, + "grad_norm": 0.34353235363960266, + "learning_rate": 4.893636192489858e-05, + "loss": 0.2134, + "step": 10308 + }, + { + "epoch": 0.18387257874647736, + "grad_norm": 0.2776363492012024, + "learning_rate": 4.8935912694782725e-05, + "loss": 0.1773, + "step": 10309 + }, + { + "epoch": 0.18389041486819105, + "grad_norm": 0.348707377910614, + "learning_rate": 4.893546337188302e-05, + "loss": 0.2195, + "step": 10310 + }, + { + "epoch": 0.18390825098990476, + "grad_norm": 0.2620590925216675, + "learning_rate": 4.8935013956201176e-05, + "loss": 0.1788, + "step": 10311 + }, + { + "epoch": 0.18392608711161845, + "grad_norm": 0.21259760856628418, + "learning_rate": 4.8934564447738965e-05, + "loss": 0.1605, + "step": 10312 + }, + { + "epoch": 0.18394392323333214, + "grad_norm": 0.3143465518951416, + "learning_rate": 4.8934114846498105e-05, + "loss": 0.1483, + "step": 10313 + }, + { + "epoch": 0.18396175935504583, + "grad_norm": 0.3004714846611023, + "learning_rate": 4.893366515248034e-05, + "loss": 0.2323, + "step": 10314 + }, + { + "epoch": 0.18397959547675954, + "grad_norm": 0.2775789499282837, + "learning_rate": 4.893321536568744e-05, + "loss": 0.1946, + "step": 10315 + }, + { + "epoch": 0.18399743159847323, + "grad_norm": 0.23492038249969482, + "learning_rate": 4.893276548612114e-05, + "loss": 0.2067, + "step": 10316 + }, + { + "epoch": 0.18401526772018692, + "grad_norm": 0.3655635416507721, + "learning_rate": 4.8932315513783155e-05, + "loss": 0.2117, + "step": 10317 + }, + { + "epoch": 0.1840331038419006, + "grad_norm": 0.31022271513938904, + "learning_rate": 4.893186544867525e-05, + "loss": 0.2344, + "step": 10318 + }, + { + "epoch": 0.18405093996361432, + "grad_norm": 0.3456069827079773, + "learning_rate": 4.8931415290799175e-05, + "loss": 0.201, + "step": 10319 + }, + { + "epoch": 0.184068776085328, + "grad_norm": 0.3017507493495941, + "learning_rate": 4.893096504015667e-05, + "loss": 0.2253, + "step": 10320 + }, + { + "epoch": 0.1840866122070417, + "grad_norm": 0.21177324652671814, + "learning_rate": 4.8930514696749475e-05, + "loss": 0.1705, + "step": 10321 + }, + { + "epoch": 0.1841044483287554, + "grad_norm": 0.27520138025283813, + "learning_rate": 4.893006426057934e-05, + "loss": 0.2337, + "step": 10322 + }, + { + "epoch": 0.1841222844504691, + "grad_norm": 0.2814891040325165, + "learning_rate": 4.8929613731648014e-05, + "loss": 0.2062, + "step": 10323 + }, + { + "epoch": 0.1841401205721828, + "grad_norm": 0.24566584825515747, + "learning_rate": 4.8929163109957234e-05, + "loss": 0.2111, + "step": 10324 + }, + { + "epoch": 0.18415795669389648, + "grad_norm": 0.30118003487586975, + "learning_rate": 4.892871239550876e-05, + "loss": 0.2353, + "step": 10325 + }, + { + "epoch": 0.18417579281561017, + "grad_norm": 0.2540930211544037, + "learning_rate": 4.8928261588304325e-05, + "loss": 0.2879, + "step": 10326 + }, + { + "epoch": 0.18419362893732386, + "grad_norm": 0.34676486253738403, + "learning_rate": 4.8927810688345685e-05, + "loss": 0.2383, + "step": 10327 + }, + { + "epoch": 0.18421146505903757, + "grad_norm": 0.25336676836013794, + "learning_rate": 4.892735969563459e-05, + "loss": 0.2008, + "step": 10328 + }, + { + "epoch": 0.18422930118075126, + "grad_norm": 0.32810625433921814, + "learning_rate": 4.892690861017278e-05, + "loss": 0.1595, + "step": 10329 + }, + { + "epoch": 0.18424713730246495, + "grad_norm": 0.2761939764022827, + "learning_rate": 4.892645743196202e-05, + "loss": 0.1878, + "step": 10330 + }, + { + "epoch": 0.18426497342417864, + "grad_norm": 0.3160751163959503, + "learning_rate": 4.892600616100403e-05, + "loss": 0.238, + "step": 10331 + }, + { + "epoch": 0.18428280954589235, + "grad_norm": 0.2797072231769562, + "learning_rate": 4.892555479730059e-05, + "loss": 0.1414, + "step": 10332 + }, + { + "epoch": 0.18430064566760604, + "grad_norm": 0.22800606489181519, + "learning_rate": 4.8925103340853436e-05, + "loss": 0.1621, + "step": 10333 + }, + { + "epoch": 0.18431848178931973, + "grad_norm": 0.2722718417644501, + "learning_rate": 4.892465179166431e-05, + "loss": 0.2118, + "step": 10334 + }, + { + "epoch": 0.18433631791103341, + "grad_norm": 0.40476706624031067, + "learning_rate": 4.8924200149734976e-05, + "loss": 0.1782, + "step": 10335 + }, + { + "epoch": 0.18435415403274713, + "grad_norm": 0.20445409417152405, + "learning_rate": 4.892374841506717e-05, + "loss": 0.1783, + "step": 10336 + }, + { + "epoch": 0.18437199015446082, + "grad_norm": 0.25002533197402954, + "learning_rate": 4.892329658766266e-05, + "loss": 0.2073, + "step": 10337 + }, + { + "epoch": 0.1843898262761745, + "grad_norm": 0.256468266248703, + "learning_rate": 4.892284466752319e-05, + "loss": 0.1964, + "step": 10338 + }, + { + "epoch": 0.1844076623978882, + "grad_norm": 0.3192954659461975, + "learning_rate": 4.892239265465051e-05, + "loss": 0.1902, + "step": 10339 + }, + { + "epoch": 0.1844254985196019, + "grad_norm": 0.3462444245815277, + "learning_rate": 4.8921940549046376e-05, + "loss": 0.2478, + "step": 10340 + }, + { + "epoch": 0.1844433346413156, + "grad_norm": 0.314211905002594, + "learning_rate": 4.892148835071253e-05, + "loss": 0.2892, + "step": 10341 + }, + { + "epoch": 0.1844611707630293, + "grad_norm": 0.2857251763343811, + "learning_rate": 4.8921036059650737e-05, + "loss": 0.1764, + "step": 10342 + }, + { + "epoch": 0.18447900688474297, + "grad_norm": 0.2825760245323181, + "learning_rate": 4.8920583675862755e-05, + "loss": 0.1645, + "step": 10343 + }, + { + "epoch": 0.18449684300645666, + "grad_norm": 0.3393782377243042, + "learning_rate": 4.892013119935032e-05, + "loss": 0.2655, + "step": 10344 + }, + { + "epoch": 0.18451467912817038, + "grad_norm": 0.24785509705543518, + "learning_rate": 4.8919678630115194e-05, + "loss": 0.2039, + "step": 10345 + }, + { + "epoch": 0.18453251524988407, + "grad_norm": 0.21270188689231873, + "learning_rate": 4.891922596815913e-05, + "loss": 0.1796, + "step": 10346 + }, + { + "epoch": 0.18455035137159775, + "grad_norm": 0.25588127970695496, + "learning_rate": 4.891877321348389e-05, + "loss": 0.1828, + "step": 10347 + }, + { + "epoch": 0.18456818749331144, + "grad_norm": 0.2840774357318878, + "learning_rate": 4.891832036609122e-05, + "loss": 0.197, + "step": 10348 + }, + { + "epoch": 0.18458602361502516, + "grad_norm": 0.343094140291214, + "learning_rate": 4.891786742598289e-05, + "loss": 0.2261, + "step": 10349 + }, + { + "epoch": 0.18460385973673885, + "grad_norm": 0.22164809703826904, + "learning_rate": 4.8917414393160634e-05, + "loss": 0.186, + "step": 10350 + }, + { + "epoch": 0.18462169585845253, + "grad_norm": 0.34998172521591187, + "learning_rate": 4.891696126762622e-05, + "loss": 0.1992, + "step": 10351 + }, + { + "epoch": 0.18463953198016622, + "grad_norm": 0.25199243426322937, + "learning_rate": 4.8916508049381404e-05, + "loss": 0.1758, + "step": 10352 + }, + { + "epoch": 0.18465736810187994, + "grad_norm": 0.23775465786457062, + "learning_rate": 4.891605473842794e-05, + "loss": 0.2022, + "step": 10353 + }, + { + "epoch": 0.18467520422359363, + "grad_norm": 0.27597668766975403, + "learning_rate": 4.891560133476759e-05, + "loss": 0.2727, + "step": 10354 + }, + { + "epoch": 0.1846930403453073, + "grad_norm": 0.3825770318508148, + "learning_rate": 4.8915147838402106e-05, + "loss": 0.185, + "step": 10355 + }, + { + "epoch": 0.184710876467021, + "grad_norm": 0.5336138606071472, + "learning_rate": 4.891469424933326e-05, + "loss": 0.2208, + "step": 10356 + }, + { + "epoch": 0.18472871258873472, + "grad_norm": 0.396625280380249, + "learning_rate": 4.891424056756279e-05, + "loss": 0.2225, + "step": 10357 + }, + { + "epoch": 0.1847465487104484, + "grad_norm": 0.22673653066158295, + "learning_rate": 4.891378679309247e-05, + "loss": 0.1938, + "step": 10358 + }, + { + "epoch": 0.1847643848321621, + "grad_norm": 0.24844437837600708, + "learning_rate": 4.891333292592404e-05, + "loss": 0.2051, + "step": 10359 + }, + { + "epoch": 0.18478222095387578, + "grad_norm": 0.549529492855072, + "learning_rate": 4.891287896605928e-05, + "loss": 0.229, + "step": 10360 + }, + { + "epoch": 0.1848000570755895, + "grad_norm": 0.3561497926712036, + "learning_rate": 4.891242491349994e-05, + "loss": 0.1734, + "step": 10361 + }, + { + "epoch": 0.18481789319730318, + "grad_norm": 0.36185210943222046, + "learning_rate": 4.89119707682478e-05, + "loss": 0.2414, + "step": 10362 + }, + { + "epoch": 0.18483572931901687, + "grad_norm": 0.3784153163433075, + "learning_rate": 4.891151653030458e-05, + "loss": 0.2199, + "step": 10363 + }, + { + "epoch": 0.18485356544073056, + "grad_norm": 0.2495015263557434, + "learning_rate": 4.891106219967206e-05, + "loss": 0.1791, + "step": 10364 + }, + { + "epoch": 0.18487140156244425, + "grad_norm": 0.2821410298347473, + "learning_rate": 4.8910607776352024e-05, + "loss": 0.1975, + "step": 10365 + }, + { + "epoch": 0.18488923768415796, + "grad_norm": 0.2477578967809677, + "learning_rate": 4.8910153260346204e-05, + "loss": 0.2251, + "step": 10366 + }, + { + "epoch": 0.18490707380587165, + "grad_norm": 0.2333185076713562, + "learning_rate": 4.8909698651656366e-05, + "loss": 0.1984, + "step": 10367 + }, + { + "epoch": 0.18492490992758534, + "grad_norm": 0.2450585812330246, + "learning_rate": 4.890924395028429e-05, + "loss": 0.2067, + "step": 10368 + }, + { + "epoch": 0.18494274604929903, + "grad_norm": 0.2756058871746063, + "learning_rate": 4.8908789156231715e-05, + "loss": 0.2216, + "step": 10369 + }, + { + "epoch": 0.18496058217101274, + "grad_norm": 0.2147826999425888, + "learning_rate": 4.890833426950042e-05, + "loss": 0.1968, + "step": 10370 + }, + { + "epoch": 0.18497841829272643, + "grad_norm": 0.23656854033470154, + "learning_rate": 4.8907879290092165e-05, + "loss": 0.2101, + "step": 10371 + }, + { + "epoch": 0.18499625441444012, + "grad_norm": 0.21040819585323334, + "learning_rate": 4.8907424218008714e-05, + "loss": 0.2268, + "step": 10372 + }, + { + "epoch": 0.1850140905361538, + "grad_norm": 0.2578413188457489, + "learning_rate": 4.890696905325183e-05, + "loss": 0.1864, + "step": 10373 + }, + { + "epoch": 0.18503192665786752, + "grad_norm": 0.25228288769721985, + "learning_rate": 4.890651379582327e-05, + "loss": 0.1558, + "step": 10374 + }, + { + "epoch": 0.1850497627795812, + "grad_norm": 0.2664131224155426, + "learning_rate": 4.890605844572481e-05, + "loss": 0.2014, + "step": 10375 + }, + { + "epoch": 0.1850675989012949, + "grad_norm": 0.28589069843292236, + "learning_rate": 4.890560300295821e-05, + "loss": 0.1808, + "step": 10376 + }, + { + "epoch": 0.1850854350230086, + "grad_norm": 0.31567105650901794, + "learning_rate": 4.890514746752524e-05, + "loss": 0.2233, + "step": 10377 + }, + { + "epoch": 0.1851032711447223, + "grad_norm": 0.36673760414123535, + "learning_rate": 4.890469183942765e-05, + "loss": 0.2316, + "step": 10378 + }, + { + "epoch": 0.185121107266436, + "grad_norm": 0.31213730573654175, + "learning_rate": 4.8904236118667226e-05, + "loss": 0.2195, + "step": 10379 + }, + { + "epoch": 0.18513894338814968, + "grad_norm": 0.3055059611797333, + "learning_rate": 4.890378030524573e-05, + "loss": 0.2288, + "step": 10380 + }, + { + "epoch": 0.18515677950986337, + "grad_norm": 0.256708025932312, + "learning_rate": 4.8903324399164916e-05, + "loss": 0.2355, + "step": 10381 + }, + { + "epoch": 0.18517461563157708, + "grad_norm": 0.2967715859413147, + "learning_rate": 4.8902868400426574e-05, + "loss": 0.2373, + "step": 10382 + }, + { + "epoch": 0.18519245175329077, + "grad_norm": 0.3068571984767914, + "learning_rate": 4.890241230903245e-05, + "loss": 0.2203, + "step": 10383 + }, + { + "epoch": 0.18521028787500446, + "grad_norm": 0.27461376786231995, + "learning_rate": 4.890195612498432e-05, + "loss": 0.1891, + "step": 10384 + }, + { + "epoch": 0.18522812399671815, + "grad_norm": 0.26489153504371643, + "learning_rate": 4.890149984828395e-05, + "loss": 0.1708, + "step": 10385 + }, + { + "epoch": 0.18524596011843183, + "grad_norm": 0.33622679114341736, + "learning_rate": 4.890104347893312e-05, + "loss": 0.2821, + "step": 10386 + }, + { + "epoch": 0.18526379624014555, + "grad_norm": 0.25586479902267456, + "learning_rate": 4.890058701693358e-05, + "loss": 0.1965, + "step": 10387 + }, + { + "epoch": 0.18528163236185924, + "grad_norm": 0.27159854769706726, + "learning_rate": 4.8900130462287115e-05, + "loss": 0.2123, + "step": 10388 + }, + { + "epoch": 0.18529946848357293, + "grad_norm": 0.3056102991104126, + "learning_rate": 4.8899673814995486e-05, + "loss": 0.1632, + "step": 10389 + }, + { + "epoch": 0.18531730460528661, + "grad_norm": 0.33112654089927673, + "learning_rate": 4.889921707506047e-05, + "loss": 0.2297, + "step": 10390 + }, + { + "epoch": 0.18533514072700033, + "grad_norm": 0.3449118137359619, + "learning_rate": 4.889876024248384e-05, + "loss": 0.223, + "step": 10391 + }, + { + "epoch": 0.18535297684871402, + "grad_norm": 0.2861473560333252, + "learning_rate": 4.889830331726735e-05, + "loss": 0.2286, + "step": 10392 + }, + { + "epoch": 0.1853708129704277, + "grad_norm": 0.3719724118709564, + "learning_rate": 4.889784629941279e-05, + "loss": 0.206, + "step": 10393 + }, + { + "epoch": 0.1853886490921414, + "grad_norm": 0.4821029305458069, + "learning_rate": 4.889738918892192e-05, + "loss": 0.1915, + "step": 10394 + }, + { + "epoch": 0.1854064852138551, + "grad_norm": 0.2707119286060333, + "learning_rate": 4.8896931985796515e-05, + "loss": 0.2132, + "step": 10395 + }, + { + "epoch": 0.1854243213355688, + "grad_norm": 0.2872961759567261, + "learning_rate": 4.889647469003835e-05, + "loss": 0.1871, + "step": 10396 + }, + { + "epoch": 0.18544215745728249, + "grad_norm": 0.21720540523529053, + "learning_rate": 4.8896017301649196e-05, + "loss": 0.1443, + "step": 10397 + }, + { + "epoch": 0.18545999357899617, + "grad_norm": 0.30704933404922485, + "learning_rate": 4.889555982063082e-05, + "loss": 0.2134, + "step": 10398 + }, + { + "epoch": 0.1854778297007099, + "grad_norm": 0.3383081555366516, + "learning_rate": 4.8895102246985007e-05, + "loss": 0.2082, + "step": 10399 + }, + { + "epoch": 0.18549566582242358, + "grad_norm": 0.2674878239631653, + "learning_rate": 4.889464458071352e-05, + "loss": 0.1919, + "step": 10400 + }, + { + "epoch": 0.18551350194413727, + "grad_norm": 0.3940347135066986, + "learning_rate": 4.8894186821818144e-05, + "loss": 0.2138, + "step": 10401 + }, + { + "epoch": 0.18553133806585095, + "grad_norm": 0.2133244276046753, + "learning_rate": 4.889372897030065e-05, + "loss": 0.2427, + "step": 10402 + }, + { + "epoch": 0.18554917418756467, + "grad_norm": 0.2724769115447998, + "learning_rate": 4.88932710261628e-05, + "loss": 0.1914, + "step": 10403 + }, + { + "epoch": 0.18556701030927836, + "grad_norm": 0.35610491037368774, + "learning_rate": 4.889281298940638e-05, + "loss": 0.211, + "step": 10404 + }, + { + "epoch": 0.18558484643099205, + "grad_norm": 0.3148233890533447, + "learning_rate": 4.889235486003316e-05, + "loss": 0.2114, + "step": 10405 + }, + { + "epoch": 0.18560268255270573, + "grad_norm": 0.24598360061645508, + "learning_rate": 4.889189663804493e-05, + "loss": 0.2118, + "step": 10406 + }, + { + "epoch": 0.18562051867441942, + "grad_norm": 0.2542036175727844, + "learning_rate": 4.8891438323443456e-05, + "loss": 0.209, + "step": 10407 + }, + { + "epoch": 0.18563835479613314, + "grad_norm": 0.343126118183136, + "learning_rate": 4.889097991623052e-05, + "loss": 0.2053, + "step": 10408 + }, + { + "epoch": 0.18565619091784683, + "grad_norm": 0.2791293263435364, + "learning_rate": 4.889052141640788e-05, + "loss": 0.1894, + "step": 10409 + }, + { + "epoch": 0.1856740270395605, + "grad_norm": 0.3229711949825287, + "learning_rate": 4.889006282397733e-05, + "loss": 0.2333, + "step": 10410 + }, + { + "epoch": 0.1856918631612742, + "grad_norm": 0.22067569196224213, + "learning_rate": 4.888960413894066e-05, + "loss": 0.2048, + "step": 10411 + }, + { + "epoch": 0.18570969928298792, + "grad_norm": 0.27574804425239563, + "learning_rate": 4.8889145361299616e-05, + "loss": 0.1934, + "step": 10412 + }, + { + "epoch": 0.1857275354047016, + "grad_norm": 0.21312962472438812, + "learning_rate": 4.8888686491056e-05, + "loss": 0.1792, + "step": 10413 + }, + { + "epoch": 0.1857453715264153, + "grad_norm": 0.2817594110965729, + "learning_rate": 4.888822752821159e-05, + "loss": 0.1818, + "step": 10414 + }, + { + "epoch": 0.18576320764812898, + "grad_norm": 0.2855500280857086, + "learning_rate": 4.8887768472768155e-05, + "loss": 0.178, + "step": 10415 + }, + { + "epoch": 0.1857810437698427, + "grad_norm": 0.23362240195274353, + "learning_rate": 4.8887309324727475e-05, + "loss": 0.2257, + "step": 10416 + }, + { + "epoch": 0.18579887989155638, + "grad_norm": 0.22333790361881256, + "learning_rate": 4.888685008409134e-05, + "loss": 0.1452, + "step": 10417 + }, + { + "epoch": 0.18581671601327007, + "grad_norm": 0.33676812052726746, + "learning_rate": 4.8886390750861524e-05, + "loss": 0.1794, + "step": 10418 + }, + { + "epoch": 0.18583455213498376, + "grad_norm": 0.25902023911476135, + "learning_rate": 4.88859313250398e-05, + "loss": 0.2096, + "step": 10419 + }, + { + "epoch": 0.18585238825669748, + "grad_norm": 0.23255212604999542, + "learning_rate": 4.8885471806627957e-05, + "loss": 0.2005, + "step": 10420 + }, + { + "epoch": 0.18587022437841116, + "grad_norm": 0.26637881994247437, + "learning_rate": 4.888501219562778e-05, + "loss": 0.1722, + "step": 10421 + }, + { + "epoch": 0.18588806050012485, + "grad_norm": 0.30216068029403687, + "learning_rate": 4.8884552492041044e-05, + "loss": 0.1606, + "step": 10422 + }, + { + "epoch": 0.18590589662183854, + "grad_norm": 0.28722715377807617, + "learning_rate": 4.888409269586953e-05, + "loss": 0.2134, + "step": 10423 + }, + { + "epoch": 0.18592373274355226, + "grad_norm": 0.41661202907562256, + "learning_rate": 4.888363280711503e-05, + "loss": 0.2097, + "step": 10424 + }, + { + "epoch": 0.18594156886526594, + "grad_norm": 0.26423731446266174, + "learning_rate": 4.888317282577932e-05, + "loss": 0.1949, + "step": 10425 + }, + { + "epoch": 0.18595940498697963, + "grad_norm": 0.35436776280403137, + "learning_rate": 4.888271275186418e-05, + "loss": 0.2011, + "step": 10426 + }, + { + "epoch": 0.18597724110869332, + "grad_norm": 0.2574150562286377, + "learning_rate": 4.88822525853714e-05, + "loss": 0.2063, + "step": 10427 + }, + { + "epoch": 0.185995077230407, + "grad_norm": 0.26777106523513794, + "learning_rate": 4.8881792326302755e-05, + "loss": 0.1655, + "step": 10428 + }, + { + "epoch": 0.18601291335212072, + "grad_norm": 0.2430422157049179, + "learning_rate": 4.888133197466004e-05, + "loss": 0.1554, + "step": 10429 + }, + { + "epoch": 0.1860307494738344, + "grad_norm": 0.2989281117916107, + "learning_rate": 4.888087153044503e-05, + "loss": 0.2006, + "step": 10430 + }, + { + "epoch": 0.1860485855955481, + "grad_norm": 0.2649911344051361, + "learning_rate": 4.888041099365951e-05, + "loss": 0.2234, + "step": 10431 + }, + { + "epoch": 0.1860664217172618, + "grad_norm": 0.31233587861061096, + "learning_rate": 4.887995036430527e-05, + "loss": 0.2529, + "step": 10432 + }, + { + "epoch": 0.1860842578389755, + "grad_norm": 0.28314629197120667, + "learning_rate": 4.8879489642384104e-05, + "loss": 0.1864, + "step": 10433 + }, + { + "epoch": 0.1861020939606892, + "grad_norm": 0.3037908375263214, + "learning_rate": 4.8879028827897776e-05, + "loss": 0.24, + "step": 10434 + }, + { + "epoch": 0.18611993008240288, + "grad_norm": 0.2743125259876251, + "learning_rate": 4.8878567920848094e-05, + "loss": 0.1829, + "step": 10435 + }, + { + "epoch": 0.18613776620411657, + "grad_norm": 0.2598416209220886, + "learning_rate": 4.8878106921236833e-05, + "loss": 0.2357, + "step": 10436 + }, + { + "epoch": 0.18615560232583028, + "grad_norm": 0.3465643525123596, + "learning_rate": 4.8877645829065783e-05, + "loss": 0.2195, + "step": 10437 + }, + { + "epoch": 0.18617343844754397, + "grad_norm": 0.2014244794845581, + "learning_rate": 4.8877184644336726e-05, + "loss": 0.194, + "step": 10438 + }, + { + "epoch": 0.18619127456925766, + "grad_norm": 0.25579357147216797, + "learning_rate": 4.887672336705146e-05, + "loss": 0.1877, + "step": 10439 + }, + { + "epoch": 0.18620911069097135, + "grad_norm": 0.28658202290534973, + "learning_rate": 4.887626199721177e-05, + "loss": 0.2202, + "step": 10440 + }, + { + "epoch": 0.18622694681268506, + "grad_norm": 0.31616172194480896, + "learning_rate": 4.887580053481943e-05, + "loss": 0.2048, + "step": 10441 + }, + { + "epoch": 0.18624478293439875, + "grad_norm": 0.2259247601032257, + "learning_rate": 4.887533897987625e-05, + "loss": 0.1727, + "step": 10442 + }, + { + "epoch": 0.18626261905611244, + "grad_norm": 0.3351108431816101, + "learning_rate": 4.887487733238401e-05, + "loss": 0.2405, + "step": 10443 + }, + { + "epoch": 0.18628045517782613, + "grad_norm": 0.26033225655555725, + "learning_rate": 4.8874415592344494e-05, + "loss": 0.1762, + "step": 10444 + }, + { + "epoch": 0.18629829129953981, + "grad_norm": 0.2744556665420532, + "learning_rate": 4.887395375975951e-05, + "loss": 0.2253, + "step": 10445 + }, + { + "epoch": 0.18631612742125353, + "grad_norm": 0.32611531019210815, + "learning_rate": 4.887349183463082e-05, + "loss": 0.2745, + "step": 10446 + }, + { + "epoch": 0.18633396354296722, + "grad_norm": 0.2906774878501892, + "learning_rate": 4.887302981696024e-05, + "loss": 0.2273, + "step": 10447 + }, + { + "epoch": 0.1863517996646809, + "grad_norm": 0.2538030445575714, + "learning_rate": 4.887256770674954e-05, + "loss": 0.1846, + "step": 10448 + }, + { + "epoch": 0.1863696357863946, + "grad_norm": 0.35917553305625916, + "learning_rate": 4.887210550400053e-05, + "loss": 0.2289, + "step": 10449 + }, + { + "epoch": 0.1863874719081083, + "grad_norm": 0.23671753704547882, + "learning_rate": 4.887164320871499e-05, + "loss": 0.1749, + "step": 10450 + }, + { + "epoch": 0.186405308029822, + "grad_norm": 0.27708324790000916, + "learning_rate": 4.887118082089472e-05, + "loss": 0.2244, + "step": 10451 + }, + { + "epoch": 0.18642314415153569, + "grad_norm": 0.36812689900398254, + "learning_rate": 4.887071834054151e-05, + "loss": 0.1962, + "step": 10452 + }, + { + "epoch": 0.18644098027324937, + "grad_norm": 0.28726956248283386, + "learning_rate": 4.8870255767657146e-05, + "loss": 0.2299, + "step": 10453 + }, + { + "epoch": 0.1864588163949631, + "grad_norm": 0.23517903685569763, + "learning_rate": 4.886979310224343e-05, + "loss": 0.1669, + "step": 10454 + }, + { + "epoch": 0.18647665251667678, + "grad_norm": 0.2091180831193924, + "learning_rate": 4.886933034430215e-05, + "loss": 0.1511, + "step": 10455 + }, + { + "epoch": 0.18649448863839047, + "grad_norm": 0.3165168762207031, + "learning_rate": 4.88688674938351e-05, + "loss": 0.1985, + "step": 10456 + }, + { + "epoch": 0.18651232476010415, + "grad_norm": 0.341937392950058, + "learning_rate": 4.886840455084408e-05, + "loss": 0.1943, + "step": 10457 + }, + { + "epoch": 0.18653016088181787, + "grad_norm": 0.46690675616264343, + "learning_rate": 4.886794151533087e-05, + "loss": 0.2097, + "step": 10458 + }, + { + "epoch": 0.18654799700353156, + "grad_norm": 0.39278659224510193, + "learning_rate": 4.886747838729728e-05, + "loss": 0.2673, + "step": 10459 + }, + { + "epoch": 0.18656583312524525, + "grad_norm": 0.2433737814426422, + "learning_rate": 4.88670151667451e-05, + "loss": 0.1925, + "step": 10460 + }, + { + "epoch": 0.18658366924695893, + "grad_norm": 0.2127729207277298, + "learning_rate": 4.8866551853676135e-05, + "loss": 0.1718, + "step": 10461 + }, + { + "epoch": 0.18660150536867265, + "grad_norm": 0.23966027796268463, + "learning_rate": 4.886608844809216e-05, + "loss": 0.1774, + "step": 10462 + }, + { + "epoch": 0.18661934149038634, + "grad_norm": 0.37410762906074524, + "learning_rate": 4.8865624949994993e-05, + "loss": 0.1917, + "step": 10463 + }, + { + "epoch": 0.18663717761210002, + "grad_norm": 0.26284417510032654, + "learning_rate": 4.886516135938641e-05, + "loss": 0.2002, + "step": 10464 + }, + { + "epoch": 0.1866550137338137, + "grad_norm": 0.3009563088417053, + "learning_rate": 4.886469767626823e-05, + "loss": 0.2417, + "step": 10465 + }, + { + "epoch": 0.1866728498555274, + "grad_norm": 0.43802765011787415, + "learning_rate": 4.8864233900642234e-05, + "loss": 0.2612, + "step": 10466 + }, + { + "epoch": 0.18669068597724112, + "grad_norm": 0.29394182562828064, + "learning_rate": 4.8863770032510225e-05, + "loss": 0.2576, + "step": 10467 + }, + { + "epoch": 0.1867085220989548, + "grad_norm": 0.364239364862442, + "learning_rate": 4.8863306071874e-05, + "loss": 0.2408, + "step": 10468 + }, + { + "epoch": 0.1867263582206685, + "grad_norm": 0.24073238670825958, + "learning_rate": 4.8862842018735356e-05, + "loss": 0.261, + "step": 10469 + }, + { + "epoch": 0.18674419434238218, + "grad_norm": 0.2975509464740753, + "learning_rate": 4.88623778730961e-05, + "loss": 0.211, + "step": 10470 + }, + { + "epoch": 0.1867620304640959, + "grad_norm": 0.25481876730918884, + "learning_rate": 4.8861913634958025e-05, + "loss": 0.2289, + "step": 10471 + }, + { + "epoch": 0.18677986658580958, + "grad_norm": 0.2389741837978363, + "learning_rate": 4.8861449304322926e-05, + "loss": 0.1965, + "step": 10472 + }, + { + "epoch": 0.18679770270752327, + "grad_norm": 0.39479899406433105, + "learning_rate": 4.8860984881192615e-05, + "loss": 0.216, + "step": 10473 + }, + { + "epoch": 0.18681553882923696, + "grad_norm": 0.2663244605064392, + "learning_rate": 4.8860520365568875e-05, + "loss": 0.2175, + "step": 10474 + }, + { + "epoch": 0.18683337495095068, + "grad_norm": 0.39912599325180054, + "learning_rate": 4.886005575745353e-05, + "loss": 0.2101, + "step": 10475 + }, + { + "epoch": 0.18685121107266436, + "grad_norm": 0.29135116934776306, + "learning_rate": 4.885959105684835e-05, + "loss": 0.193, + "step": 10476 + }, + { + "epoch": 0.18686904719437805, + "grad_norm": 0.26923084259033203, + "learning_rate": 4.8859126263755176e-05, + "loss": 0.2592, + "step": 10477 + }, + { + "epoch": 0.18688688331609174, + "grad_norm": 0.27490487694740295, + "learning_rate": 4.8858661378175776e-05, + "loss": 0.2032, + "step": 10478 + }, + { + "epoch": 0.18690471943780546, + "grad_norm": 0.29819801449775696, + "learning_rate": 4.8858196400111966e-05, + "loss": 0.1763, + "step": 10479 + }, + { + "epoch": 0.18692255555951914, + "grad_norm": 0.2618018090724945, + "learning_rate": 4.885773132956554e-05, + "loss": 0.2239, + "step": 10480 + }, + { + "epoch": 0.18694039168123283, + "grad_norm": 0.2373030036687851, + "learning_rate": 4.8857266166538317e-05, + "loss": 0.2069, + "step": 10481 + }, + { + "epoch": 0.18695822780294652, + "grad_norm": 0.24133995175361633, + "learning_rate": 4.885680091103208e-05, + "loss": 0.1897, + "step": 10482 + }, + { + "epoch": 0.18697606392466024, + "grad_norm": 0.26540258526802063, + "learning_rate": 4.885633556304865e-05, + "loss": 0.2141, + "step": 10483 + }, + { + "epoch": 0.18699390004637392, + "grad_norm": 0.2913873493671417, + "learning_rate": 4.885587012258983e-05, + "loss": 0.1931, + "step": 10484 + }, + { + "epoch": 0.1870117361680876, + "grad_norm": 0.2755669355392456, + "learning_rate": 4.88554045896574e-05, + "loss": 0.2272, + "step": 10485 + }, + { + "epoch": 0.1870295722898013, + "grad_norm": 0.26733720302581787, + "learning_rate": 4.88549389642532e-05, + "loss": 0.2255, + "step": 10486 + }, + { + "epoch": 0.187047408411515, + "grad_norm": 0.24001014232635498, + "learning_rate": 4.8854473246379005e-05, + "loss": 0.1721, + "step": 10487 + }, + { + "epoch": 0.1870652445332287, + "grad_norm": 0.24600258469581604, + "learning_rate": 4.885400743603664e-05, + "loss": 0.2012, + "step": 10488 + }, + { + "epoch": 0.1870830806549424, + "grad_norm": 0.32753440737724304, + "learning_rate": 4.88535415332279e-05, + "loss": 0.1993, + "step": 10489 + }, + { + "epoch": 0.18710091677665608, + "grad_norm": 0.2985244393348694, + "learning_rate": 4.8853075537954596e-05, + "loss": 0.2165, + "step": 10490 + }, + { + "epoch": 0.18711875289836977, + "grad_norm": 0.27764397859573364, + "learning_rate": 4.8852609450218535e-05, + "loss": 0.2443, + "step": 10491 + }, + { + "epoch": 0.18713658902008348, + "grad_norm": 0.24802055954933167, + "learning_rate": 4.8852143270021524e-05, + "loss": 0.1829, + "step": 10492 + }, + { + "epoch": 0.18715442514179717, + "grad_norm": 0.3216690421104431, + "learning_rate": 4.885167699736536e-05, + "loss": 0.1525, + "step": 10493 + }, + { + "epoch": 0.18717226126351086, + "grad_norm": 0.3482573628425598, + "learning_rate": 4.885121063225185e-05, + "loss": 0.1983, + "step": 10494 + }, + { + "epoch": 0.18719009738522455, + "grad_norm": 0.3186511993408203, + "learning_rate": 4.885074417468283e-05, + "loss": 0.2242, + "step": 10495 + }, + { + "epoch": 0.18720793350693826, + "grad_norm": 0.22009779512882233, + "learning_rate": 4.885027762466007e-05, + "loss": 0.1687, + "step": 10496 + }, + { + "epoch": 0.18722576962865195, + "grad_norm": 0.25250473618507385, + "learning_rate": 4.8849810982185404e-05, + "loss": 0.1785, + "step": 10497 + }, + { + "epoch": 0.18724360575036564, + "grad_norm": 0.2682039141654968, + "learning_rate": 4.8849344247260634e-05, + "loss": 0.178, + "step": 10498 + }, + { + "epoch": 0.18726144187207933, + "grad_norm": 0.35088911652565, + "learning_rate": 4.884887741988757e-05, + "loss": 0.1735, + "step": 10499 + }, + { + "epoch": 0.18727927799379304, + "grad_norm": 0.28017309308052063, + "learning_rate": 4.884841050006802e-05, + "loss": 0.1913, + "step": 10500 + }, + { + "epoch": 0.18729711411550673, + "grad_norm": 0.2931995093822479, + "learning_rate": 4.884794348780378e-05, + "loss": 0.2281, + "step": 10501 + }, + { + "epoch": 0.18731495023722042, + "grad_norm": 0.25336283445358276, + "learning_rate": 4.884747638309669e-05, + "loss": 0.1918, + "step": 10502 + }, + { + "epoch": 0.1873327863589341, + "grad_norm": 0.40355825424194336, + "learning_rate": 4.8847009185948546e-05, + "loss": 0.1647, + "step": 10503 + }, + { + "epoch": 0.18735062248064782, + "grad_norm": 0.31773319840431213, + "learning_rate": 4.884654189636115e-05, + "loss": 0.2862, + "step": 10504 + }, + { + "epoch": 0.1873684586023615, + "grad_norm": 0.25512444972991943, + "learning_rate": 4.8846074514336324e-05, + "loss": 0.2549, + "step": 10505 + }, + { + "epoch": 0.1873862947240752, + "grad_norm": 0.30692002177238464, + "learning_rate": 4.8845607039875876e-05, + "loss": 0.2474, + "step": 10506 + }, + { + "epoch": 0.18740413084578889, + "grad_norm": 0.25550419092178345, + "learning_rate": 4.8845139472981616e-05, + "loss": 0.2519, + "step": 10507 + }, + { + "epoch": 0.18742196696750257, + "grad_norm": 0.23434126377105713, + "learning_rate": 4.884467181365536e-05, + "loss": 0.1703, + "step": 10508 + }, + { + "epoch": 0.1874398030892163, + "grad_norm": 0.3422868549823761, + "learning_rate": 4.8844204061898925e-05, + "loss": 0.2269, + "step": 10509 + }, + { + "epoch": 0.18745763921092998, + "grad_norm": 0.24813294410705566, + "learning_rate": 4.884373621771412e-05, + "loss": 0.1789, + "step": 10510 + }, + { + "epoch": 0.18747547533264367, + "grad_norm": 0.19203995168209076, + "learning_rate": 4.884326828110276e-05, + "loss": 0.2068, + "step": 10511 + }, + { + "epoch": 0.18749331145435735, + "grad_norm": 0.27889353036880493, + "learning_rate": 4.8842800252066644e-05, + "loss": 0.2654, + "step": 10512 + }, + { + "epoch": 0.18751114757607107, + "grad_norm": 0.22642937302589417, + "learning_rate": 4.884233213060761e-05, + "loss": 0.1717, + "step": 10513 + }, + { + "epoch": 0.18752898369778476, + "grad_norm": 0.303641140460968, + "learning_rate": 4.884186391672746e-05, + "loss": 0.1824, + "step": 10514 + }, + { + "epoch": 0.18754681981949844, + "grad_norm": 0.2523878514766693, + "learning_rate": 4.884139561042801e-05, + "loss": 0.2431, + "step": 10515 + }, + { + "epoch": 0.18756465594121213, + "grad_norm": 0.3550451397895813, + "learning_rate": 4.8840927211711076e-05, + "loss": 0.2396, + "step": 10516 + }, + { + "epoch": 0.18758249206292585, + "grad_norm": 0.19239379465579987, + "learning_rate": 4.8840458720578476e-05, + "loss": 0.2078, + "step": 10517 + }, + { + "epoch": 0.18760032818463954, + "grad_norm": 0.38046181201934814, + "learning_rate": 4.883999013703202e-05, + "loss": 0.2256, + "step": 10518 + }, + { + "epoch": 0.18761816430635322, + "grad_norm": 0.2400396317243576, + "learning_rate": 4.883952146107353e-05, + "loss": 0.1923, + "step": 10519 + }, + { + "epoch": 0.1876360004280669, + "grad_norm": 0.2525434195995331, + "learning_rate": 4.8839052692704825e-05, + "loss": 0.2219, + "step": 10520 + }, + { + "epoch": 0.18765383654978063, + "grad_norm": 0.24584171175956726, + "learning_rate": 4.883858383192771e-05, + "loss": 0.1575, + "step": 10521 + }, + { + "epoch": 0.18767167267149432, + "grad_norm": 0.39342108368873596, + "learning_rate": 4.883811487874402e-05, + "loss": 0.265, + "step": 10522 + }, + { + "epoch": 0.187689508793208, + "grad_norm": 0.2990253269672394, + "learning_rate": 4.883764583315556e-05, + "loss": 0.2416, + "step": 10523 + }, + { + "epoch": 0.1877073449149217, + "grad_norm": 0.30369022488594055, + "learning_rate": 4.883717669516414e-05, + "loss": 0.2062, + "step": 10524 + }, + { + "epoch": 0.18772518103663538, + "grad_norm": 0.29360026121139526, + "learning_rate": 4.8836707464771605e-05, + "loss": 0.1785, + "step": 10525 + }, + { + "epoch": 0.1877430171583491, + "grad_norm": 0.2826521098613739, + "learning_rate": 4.883623814197975e-05, + "loss": 0.208, + "step": 10526 + }, + { + "epoch": 0.18776085328006278, + "grad_norm": 0.23486946523189545, + "learning_rate": 4.883576872679041e-05, + "loss": 0.17, + "step": 10527 + }, + { + "epoch": 0.18777868940177647, + "grad_norm": 0.3598060607910156, + "learning_rate": 4.883529921920539e-05, + "loss": 0.2452, + "step": 10528 + }, + { + "epoch": 0.18779652552349016, + "grad_norm": 0.2967919707298279, + "learning_rate": 4.883482961922653e-05, + "loss": 0.1489, + "step": 10529 + }, + { + "epoch": 0.18781436164520388, + "grad_norm": 0.30657777190208435, + "learning_rate": 4.883435992685562e-05, + "loss": 0.1991, + "step": 10530 + }, + { + "epoch": 0.18783219776691756, + "grad_norm": 0.23573394119739532, + "learning_rate": 4.883389014209452e-05, + "loss": 0.1742, + "step": 10531 + }, + { + "epoch": 0.18785003388863125, + "grad_norm": 0.40680450201034546, + "learning_rate": 4.8833420264945015e-05, + "loss": 0.1742, + "step": 10532 + }, + { + "epoch": 0.18786787001034494, + "grad_norm": 0.39763349294662476, + "learning_rate": 4.883295029540894e-05, + "loss": 0.1765, + "step": 10533 + }, + { + "epoch": 0.18788570613205866, + "grad_norm": 0.30162012577056885, + "learning_rate": 4.8832480233488124e-05, + "loss": 0.1913, + "step": 10534 + }, + { + "epoch": 0.18790354225377234, + "grad_norm": 0.3297789394855499, + "learning_rate": 4.8832010079184384e-05, + "loss": 0.224, + "step": 10535 + }, + { + "epoch": 0.18792137837548603, + "grad_norm": 0.3167688846588135, + "learning_rate": 4.883153983249954e-05, + "loss": 0.2162, + "step": 10536 + }, + { + "epoch": 0.18793921449719972, + "grad_norm": 0.23432107269763947, + "learning_rate": 4.8831069493435414e-05, + "loss": 0.169, + "step": 10537 + }, + { + "epoch": 0.18795705061891343, + "grad_norm": 0.27008378505706787, + "learning_rate": 4.883059906199384e-05, + "loss": 0.2298, + "step": 10538 + }, + { + "epoch": 0.18797488674062712, + "grad_norm": 0.34587347507476807, + "learning_rate": 4.883012853817662e-05, + "loss": 0.2241, + "step": 10539 + }, + { + "epoch": 0.1879927228623408, + "grad_norm": 0.3272530436515808, + "learning_rate": 4.8829657921985605e-05, + "loss": 0.2656, + "step": 10540 + }, + { + "epoch": 0.1880105589840545, + "grad_norm": 0.30216947197914124, + "learning_rate": 4.8829187213422603e-05, + "loss": 0.187, + "step": 10541 + }, + { + "epoch": 0.18802839510576821, + "grad_norm": 0.2211897075176239, + "learning_rate": 4.8828716412489425e-05, + "loss": 0.1697, + "step": 10542 + }, + { + "epoch": 0.1880462312274819, + "grad_norm": 0.224198117852211, + "learning_rate": 4.8828245519187935e-05, + "loss": 0.2007, + "step": 10543 + }, + { + "epoch": 0.1880640673491956, + "grad_norm": 0.395285964012146, + "learning_rate": 4.882777453351992e-05, + "loss": 0.2465, + "step": 10544 + }, + { + "epoch": 0.18808190347090928, + "grad_norm": 0.20634904503822327, + "learning_rate": 4.8827303455487225e-05, + "loss": 0.1731, + "step": 10545 + }, + { + "epoch": 0.18809973959262297, + "grad_norm": 0.3251771628856659, + "learning_rate": 4.882683228509167e-05, + "loss": 0.2284, + "step": 10546 + }, + { + "epoch": 0.18811757571433668, + "grad_norm": 0.258035808801651, + "learning_rate": 4.882636102233509e-05, + "loss": 0.1689, + "step": 10547 + }, + { + "epoch": 0.18813541183605037, + "grad_norm": 0.3538181781768799, + "learning_rate": 4.88258896672193e-05, + "loss": 0.2385, + "step": 10548 + }, + { + "epoch": 0.18815324795776406, + "grad_norm": 0.2677677869796753, + "learning_rate": 4.8825418219746135e-05, + "loss": 0.1968, + "step": 10549 + }, + { + "epoch": 0.18817108407947775, + "grad_norm": 0.21104206144809723, + "learning_rate": 4.882494667991742e-05, + "loss": 0.1717, + "step": 10550 + }, + { + "epoch": 0.18818892020119146, + "grad_norm": 0.2170165479183197, + "learning_rate": 4.8824475047734974e-05, + "loss": 0.1801, + "step": 10551 + }, + { + "epoch": 0.18820675632290515, + "grad_norm": 0.2783610224723816, + "learning_rate": 4.882400332320065e-05, + "loss": 0.19, + "step": 10552 + }, + { + "epoch": 0.18822459244461884, + "grad_norm": 0.2591277062892914, + "learning_rate": 4.8823531506316244e-05, + "loss": 0.2198, + "step": 10553 + }, + { + "epoch": 0.18824242856633253, + "grad_norm": 0.21704983711242676, + "learning_rate": 4.882305959708361e-05, + "loss": 0.2005, + "step": 10554 + }, + { + "epoch": 0.18826026468804624, + "grad_norm": 0.2296086698770523, + "learning_rate": 4.882258759550457e-05, + "loss": 0.1808, + "step": 10555 + }, + { + "epoch": 0.18827810080975993, + "grad_norm": 0.26934829354286194, + "learning_rate": 4.882211550158095e-05, + "loss": 0.2394, + "step": 10556 + }, + { + "epoch": 0.18829593693147362, + "grad_norm": 0.38297170400619507, + "learning_rate": 4.8821643315314585e-05, + "loss": 0.1776, + "step": 10557 + }, + { + "epoch": 0.1883137730531873, + "grad_norm": 0.2702253460884094, + "learning_rate": 4.8821171036707304e-05, + "loss": 0.2196, + "step": 10558 + }, + { + "epoch": 0.18833160917490102, + "grad_norm": 0.2383365035057068, + "learning_rate": 4.882069866576093e-05, + "loss": 0.2077, + "step": 10559 + }, + { + "epoch": 0.1883494452966147, + "grad_norm": 0.20856791734695435, + "learning_rate": 4.8820226202477305e-05, + "loss": 0.1629, + "step": 10560 + }, + { + "epoch": 0.1883672814183284, + "grad_norm": 0.3456088602542877, + "learning_rate": 4.881975364685826e-05, + "loss": 0.2566, + "step": 10561 + }, + { + "epoch": 0.18838511754004209, + "grad_norm": 0.2895427346229553, + "learning_rate": 4.8819280998905616e-05, + "loss": 0.2019, + "step": 10562 + }, + { + "epoch": 0.1884029536617558, + "grad_norm": 0.2795896530151367, + "learning_rate": 4.8818808258621205e-05, + "loss": 0.208, + "step": 10563 + }, + { + "epoch": 0.1884207897834695, + "grad_norm": 0.3548523783683777, + "learning_rate": 4.881833542600688e-05, + "loss": 0.2041, + "step": 10564 + }, + { + "epoch": 0.18843862590518318, + "grad_norm": 0.27151885628700256, + "learning_rate": 4.8817862501064456e-05, + "loss": 0.2094, + "step": 10565 + }, + { + "epoch": 0.18845646202689686, + "grad_norm": 0.25382769107818604, + "learning_rate": 4.881738948379577e-05, + "loss": 0.2424, + "step": 10566 + }, + { + "epoch": 0.18847429814861055, + "grad_norm": 0.21698448061943054, + "learning_rate": 4.8816916374202656e-05, + "loss": 0.2004, + "step": 10567 + }, + { + "epoch": 0.18849213427032427, + "grad_norm": 0.32958438992500305, + "learning_rate": 4.881644317228695e-05, + "loss": 0.2452, + "step": 10568 + }, + { + "epoch": 0.18850997039203796, + "grad_norm": 0.24666844308376312, + "learning_rate": 4.8815969878050484e-05, + "loss": 0.18, + "step": 10569 + }, + { + "epoch": 0.18852780651375164, + "grad_norm": 0.2679874300956726, + "learning_rate": 4.8815496491495085e-05, + "loss": 0.1445, + "step": 10570 + }, + { + "epoch": 0.18854564263546533, + "grad_norm": 0.4095442593097687, + "learning_rate": 4.881502301262261e-05, + "loss": 0.3187, + "step": 10571 + }, + { + "epoch": 0.18856347875717905, + "grad_norm": 0.2798469662666321, + "learning_rate": 4.8814549441434865e-05, + "loss": 0.2063, + "step": 10572 + }, + { + "epoch": 0.18858131487889274, + "grad_norm": 0.2988153398036957, + "learning_rate": 4.881407577793371e-05, + "loss": 0.269, + "step": 10573 + }, + { + "epoch": 0.18859915100060642, + "grad_norm": 0.22357898950576782, + "learning_rate": 4.881360202212097e-05, + "loss": 0.1896, + "step": 10574 + }, + { + "epoch": 0.1886169871223201, + "grad_norm": 0.32179194688796997, + "learning_rate": 4.881312817399848e-05, + "loss": 0.2574, + "step": 10575 + }, + { + "epoch": 0.18863482324403383, + "grad_norm": 0.24870289862155914, + "learning_rate": 4.881265423356809e-05, + "loss": 0.2105, + "step": 10576 + }, + { + "epoch": 0.18865265936574752, + "grad_norm": 0.3383040130138397, + "learning_rate": 4.8812180200831626e-05, + "loss": 0.2095, + "step": 10577 + }, + { + "epoch": 0.1886704954874612, + "grad_norm": 0.22676241397857666, + "learning_rate": 4.881170607579092e-05, + "loss": 0.1704, + "step": 10578 + }, + { + "epoch": 0.1886883316091749, + "grad_norm": 0.2661273777484894, + "learning_rate": 4.8811231858447823e-05, + "loss": 0.1785, + "step": 10579 + }, + { + "epoch": 0.1887061677308886, + "grad_norm": 0.27105018496513367, + "learning_rate": 4.8810757548804165e-05, + "loss": 0.2322, + "step": 10580 + }, + { + "epoch": 0.1887240038526023, + "grad_norm": 0.3106338679790497, + "learning_rate": 4.881028314686179e-05, + "loss": 0.1691, + "step": 10581 + }, + { + "epoch": 0.18874183997431598, + "grad_norm": 0.2899042069911957, + "learning_rate": 4.8809808652622525e-05, + "loss": 0.2155, + "step": 10582 + }, + { + "epoch": 0.18875967609602967, + "grad_norm": 0.2647222876548767, + "learning_rate": 4.880933406608823e-05, + "loss": 0.2058, + "step": 10583 + }, + { + "epoch": 0.1887775122177434, + "grad_norm": 0.2770458161830902, + "learning_rate": 4.880885938726072e-05, + "loss": 0.1951, + "step": 10584 + }, + { + "epoch": 0.18879534833945708, + "grad_norm": 0.3557983934879303, + "learning_rate": 4.880838461614186e-05, + "loss": 0.289, + "step": 10585 + }, + { + "epoch": 0.18881318446117076, + "grad_norm": 0.2803858518600464, + "learning_rate": 4.880790975273347e-05, + "loss": 0.2157, + "step": 10586 + }, + { + "epoch": 0.18883102058288445, + "grad_norm": 0.22735588252544403, + "learning_rate": 4.88074347970374e-05, + "loss": 0.1677, + "step": 10587 + }, + { + "epoch": 0.18884885670459814, + "grad_norm": 0.3329298198223114, + "learning_rate": 4.8806959749055484e-05, + "loss": 0.2207, + "step": 10588 + }, + { + "epoch": 0.18886669282631185, + "grad_norm": 0.22475937008857727, + "learning_rate": 4.880648460878958e-05, + "loss": 0.2078, + "step": 10589 + }, + { + "epoch": 0.18888452894802554, + "grad_norm": 0.2685500979423523, + "learning_rate": 4.880600937624151e-05, + "loss": 0.1901, + "step": 10590 + }, + { + "epoch": 0.18890236506973923, + "grad_norm": 0.25866085290908813, + "learning_rate": 4.880553405141313e-05, + "loss": 0.1974, + "step": 10591 + }, + { + "epoch": 0.18892020119145292, + "grad_norm": 0.2405107468366623, + "learning_rate": 4.880505863430628e-05, + "loss": 0.1733, + "step": 10592 + }, + { + "epoch": 0.18893803731316663, + "grad_norm": 0.29471492767333984, + "learning_rate": 4.88045831249228e-05, + "loss": 0.1963, + "step": 10593 + }, + { + "epoch": 0.18895587343488032, + "grad_norm": 0.3194786608219147, + "learning_rate": 4.880410752326453e-05, + "loss": 0.1858, + "step": 10594 + }, + { + "epoch": 0.188973709556594, + "grad_norm": 0.21729514002799988, + "learning_rate": 4.8803631829333326e-05, + "loss": 0.1892, + "step": 10595 + }, + { + "epoch": 0.1889915456783077, + "grad_norm": 0.4171539545059204, + "learning_rate": 4.880315604313101e-05, + "loss": 0.2813, + "step": 10596 + }, + { + "epoch": 0.18900938180002141, + "grad_norm": 0.3281741440296173, + "learning_rate": 4.8802680164659456e-05, + "loss": 0.238, + "step": 10597 + }, + { + "epoch": 0.1890272179217351, + "grad_norm": 0.29790347814559937, + "learning_rate": 4.880220419392048e-05, + "loss": 0.1899, + "step": 10598 + }, + { + "epoch": 0.1890450540434488, + "grad_norm": 0.28304579854011536, + "learning_rate": 4.880172813091595e-05, + "loss": 0.2499, + "step": 10599 + }, + { + "epoch": 0.18906289016516248, + "grad_norm": 0.39394810795783997, + "learning_rate": 4.8801251975647686e-05, + "loss": 0.2015, + "step": 10600 + }, + { + "epoch": 0.1890807262868762, + "grad_norm": 0.2597050368785858, + "learning_rate": 4.8800775728117565e-05, + "loss": 0.2577, + "step": 10601 + }, + { + "epoch": 0.18909856240858988, + "grad_norm": 0.22351546585559845, + "learning_rate": 4.880029938832741e-05, + "loss": 0.1908, + "step": 10602 + }, + { + "epoch": 0.18911639853030357, + "grad_norm": 0.2598824203014374, + "learning_rate": 4.879982295627907e-05, + "loss": 0.1949, + "step": 10603 + }, + { + "epoch": 0.18913423465201726, + "grad_norm": 0.20953436195850372, + "learning_rate": 4.87993464319744e-05, + "loss": 0.1562, + "step": 10604 + }, + { + "epoch": 0.18915207077373097, + "grad_norm": 0.24557650089263916, + "learning_rate": 4.879886981541524e-05, + "loss": 0.2513, + "step": 10605 + }, + { + "epoch": 0.18916990689544466, + "grad_norm": 0.2735897898674011, + "learning_rate": 4.8798393106603444e-05, + "loss": 0.2241, + "step": 10606 + }, + { + "epoch": 0.18918774301715835, + "grad_norm": 0.3022579848766327, + "learning_rate": 4.879791630554086e-05, + "loss": 0.221, + "step": 10607 + }, + { + "epoch": 0.18920557913887204, + "grad_norm": 0.3795880973339081, + "learning_rate": 4.879743941222932e-05, + "loss": 0.2504, + "step": 10608 + }, + { + "epoch": 0.18922341526058573, + "grad_norm": 0.2794035077095032, + "learning_rate": 4.87969624266707e-05, + "loss": 0.2317, + "step": 10609 + }, + { + "epoch": 0.18924125138229944, + "grad_norm": 0.23743732273578644, + "learning_rate": 4.879648534886683e-05, + "loss": 0.2037, + "step": 10610 + }, + { + "epoch": 0.18925908750401313, + "grad_norm": 0.3326530158519745, + "learning_rate": 4.8796008178819565e-05, + "loss": 0.2405, + "step": 10611 + }, + { + "epoch": 0.18927692362572682, + "grad_norm": 0.24557062983512878, + "learning_rate": 4.8795530916530746e-05, + "loss": 0.1831, + "step": 10612 + }, + { + "epoch": 0.1892947597474405, + "grad_norm": 0.26500362157821655, + "learning_rate": 4.8795053562002235e-05, + "loss": 0.2482, + "step": 10613 + }, + { + "epoch": 0.18931259586915422, + "grad_norm": 0.2623896598815918, + "learning_rate": 4.879457611523588e-05, + "loss": 0.2006, + "step": 10614 + }, + { + "epoch": 0.1893304319908679, + "grad_norm": 0.25139978528022766, + "learning_rate": 4.879409857623353e-05, + "loss": 0.1826, + "step": 10615 + }, + { + "epoch": 0.1893482681125816, + "grad_norm": 0.28829190135002136, + "learning_rate": 4.879362094499703e-05, + "loss": 0.1951, + "step": 10616 + }, + { + "epoch": 0.18936610423429528, + "grad_norm": 0.2570495009422302, + "learning_rate": 4.8793143221528236e-05, + "loss": 0.2105, + "step": 10617 + }, + { + "epoch": 0.189383940356009, + "grad_norm": 0.2040308564901352, + "learning_rate": 4.8792665405829005e-05, + "loss": 0.123, + "step": 10618 + }, + { + "epoch": 0.1894017764777227, + "grad_norm": 0.3173140585422516, + "learning_rate": 4.879218749790119e-05, + "loss": 0.2562, + "step": 10619 + }, + { + "epoch": 0.18941961259943638, + "grad_norm": 0.23079805076122284, + "learning_rate": 4.879170949774663e-05, + "loss": 0.1872, + "step": 10620 + }, + { + "epoch": 0.18943744872115006, + "grad_norm": 0.2190384864807129, + "learning_rate": 4.87912314053672e-05, + "loss": 0.1654, + "step": 10621 + }, + { + "epoch": 0.18945528484286378, + "grad_norm": 0.3155633807182312, + "learning_rate": 4.879075322076473e-05, + "loss": 0.1621, + "step": 10622 + }, + { + "epoch": 0.18947312096457747, + "grad_norm": 0.24207884073257446, + "learning_rate": 4.879027494394108e-05, + "loss": 0.219, + "step": 10623 + }, + { + "epoch": 0.18949095708629116, + "grad_norm": 0.2664487063884735, + "learning_rate": 4.878979657489811e-05, + "loss": 0.2105, + "step": 10624 + }, + { + "epoch": 0.18950879320800484, + "grad_norm": 0.2868267297744751, + "learning_rate": 4.8789318113637676e-05, + "loss": 0.2023, + "step": 10625 + }, + { + "epoch": 0.18952662932971853, + "grad_norm": 0.24286139011383057, + "learning_rate": 4.878883956016163e-05, + "loss": 0.1904, + "step": 10626 + }, + { + "epoch": 0.18954446545143225, + "grad_norm": 0.27517902851104736, + "learning_rate": 4.878836091447182e-05, + "loss": 0.2367, + "step": 10627 + }, + { + "epoch": 0.18956230157314594, + "grad_norm": 0.22094179689884186, + "learning_rate": 4.878788217657011e-05, + "loss": 0.2, + "step": 10628 + }, + { + "epoch": 0.18958013769485962, + "grad_norm": 0.3139047622680664, + "learning_rate": 4.878740334645835e-05, + "loss": 0.1842, + "step": 10629 + }, + { + "epoch": 0.1895979738165733, + "grad_norm": 0.3774126172065735, + "learning_rate": 4.87869244241384e-05, + "loss": 0.2002, + "step": 10630 + }, + { + "epoch": 0.18961580993828703, + "grad_norm": 0.2085423320531845, + "learning_rate": 4.878644540961212e-05, + "loss": 0.1746, + "step": 10631 + }, + { + "epoch": 0.18963364606000072, + "grad_norm": 0.26533958315849304, + "learning_rate": 4.878596630288135e-05, + "loss": 0.1814, + "step": 10632 + }, + { + "epoch": 0.1896514821817144, + "grad_norm": 0.23908816277980804, + "learning_rate": 4.8785487103947965e-05, + "loss": 0.1822, + "step": 10633 + }, + { + "epoch": 0.1896693183034281, + "grad_norm": 0.22133202850818634, + "learning_rate": 4.878500781281381e-05, + "loss": 0.2197, + "step": 10634 + }, + { + "epoch": 0.1896871544251418, + "grad_norm": 0.2445531040430069, + "learning_rate": 4.878452842948076e-05, + "loss": 0.2065, + "step": 10635 + }, + { + "epoch": 0.1897049905468555, + "grad_norm": 0.31081560254096985, + "learning_rate": 4.878404895395067e-05, + "loss": 0.2119, + "step": 10636 + }, + { + "epoch": 0.18972282666856918, + "grad_norm": 0.32412323355674744, + "learning_rate": 4.8783569386225374e-05, + "loss": 0.2276, + "step": 10637 + }, + { + "epoch": 0.18974066279028287, + "grad_norm": 0.3143625259399414, + "learning_rate": 4.878308972630676e-05, + "loss": 0.2369, + "step": 10638 + }, + { + "epoch": 0.1897584989119966, + "grad_norm": 0.3914739787578583, + "learning_rate": 4.878260997419667e-05, + "loss": 0.2123, + "step": 10639 + }, + { + "epoch": 0.18977633503371028, + "grad_norm": 0.3520326018333435, + "learning_rate": 4.878213012989697e-05, + "loss": 0.2435, + "step": 10640 + }, + { + "epoch": 0.18979417115542396, + "grad_norm": 0.38156238198280334, + "learning_rate": 4.878165019340952e-05, + "loss": 0.2352, + "step": 10641 + }, + { + "epoch": 0.18981200727713765, + "grad_norm": 0.30638787150382996, + "learning_rate": 4.878117016473618e-05, + "loss": 0.2642, + "step": 10642 + }, + { + "epoch": 0.18982984339885137, + "grad_norm": 0.3258053660392761, + "learning_rate": 4.878069004387882e-05, + "loss": 0.2064, + "step": 10643 + }, + { + "epoch": 0.18984767952056505, + "grad_norm": 0.23376846313476562, + "learning_rate": 4.878020983083928e-05, + "loss": 0.2156, + "step": 10644 + }, + { + "epoch": 0.18986551564227874, + "grad_norm": 0.185083270072937, + "learning_rate": 4.8779729525619434e-05, + "loss": 0.1586, + "step": 10645 + }, + { + "epoch": 0.18988335176399243, + "grad_norm": 0.26105907559394836, + "learning_rate": 4.877924912822115e-05, + "loss": 0.2215, + "step": 10646 + }, + { + "epoch": 0.18990118788570612, + "grad_norm": 0.33682021498680115, + "learning_rate": 4.8778768638646275e-05, + "loss": 0.2209, + "step": 10647 + }, + { + "epoch": 0.18991902400741983, + "grad_norm": 0.3240136504173279, + "learning_rate": 4.877828805689669e-05, + "loss": 0.1778, + "step": 10648 + }, + { + "epoch": 0.18993686012913352, + "grad_norm": 0.24435627460479736, + "learning_rate": 4.8777807382974236e-05, + "loss": 0.2121, + "step": 10649 + }, + { + "epoch": 0.1899546962508472, + "grad_norm": 0.21817460656166077, + "learning_rate": 4.877732661688079e-05, + "loss": 0.2128, + "step": 10650 + }, + { + "epoch": 0.1899725323725609, + "grad_norm": 0.4411410391330719, + "learning_rate": 4.8776845758618225e-05, + "loss": 0.1674, + "step": 10651 + }, + { + "epoch": 0.18999036849427461, + "grad_norm": 0.28367194533348083, + "learning_rate": 4.8776364808188387e-05, + "loss": 0.1967, + "step": 10652 + }, + { + "epoch": 0.1900082046159883, + "grad_norm": 0.212859645485878, + "learning_rate": 4.8775883765593144e-05, + "loss": 0.1815, + "step": 10653 + }, + { + "epoch": 0.190026040737702, + "grad_norm": 0.22487853467464447, + "learning_rate": 4.8775402630834367e-05, + "loss": 0.201, + "step": 10654 + }, + { + "epoch": 0.19004387685941568, + "grad_norm": 0.40863654017448425, + "learning_rate": 4.877492140391391e-05, + "loss": 0.2182, + "step": 10655 + }, + { + "epoch": 0.1900617129811294, + "grad_norm": 0.25813040137290955, + "learning_rate": 4.877444008483366e-05, + "loss": 0.2106, + "step": 10656 + }, + { + "epoch": 0.19007954910284308, + "grad_norm": 0.2469097226858139, + "learning_rate": 4.877395867359545e-05, + "loss": 0.1879, + "step": 10657 + }, + { + "epoch": 0.19009738522455677, + "grad_norm": 0.29186534881591797, + "learning_rate": 4.877347717020118e-05, + "loss": 0.2304, + "step": 10658 + }, + { + "epoch": 0.19011522134627046, + "grad_norm": 0.30523616075515747, + "learning_rate": 4.87729955746527e-05, + "loss": 0.2028, + "step": 10659 + }, + { + "epoch": 0.19013305746798417, + "grad_norm": 0.22320972383022308, + "learning_rate": 4.877251388695188e-05, + "loss": 0.179, + "step": 10660 + }, + { + "epoch": 0.19015089358969786, + "grad_norm": 0.23349139094352722, + "learning_rate": 4.8772032107100575e-05, + "loss": 0.1955, + "step": 10661 + }, + { + "epoch": 0.19016872971141155, + "grad_norm": 0.30190032720565796, + "learning_rate": 4.877155023510067e-05, + "loss": 0.192, + "step": 10662 + }, + { + "epoch": 0.19018656583312524, + "grad_norm": 0.2792718708515167, + "learning_rate": 4.8771068270954026e-05, + "loss": 0.2449, + "step": 10663 + }, + { + "epoch": 0.19020440195483895, + "grad_norm": 0.291419118642807, + "learning_rate": 4.877058621466251e-05, + "loss": 0.1789, + "step": 10664 + }, + { + "epoch": 0.19022223807655264, + "grad_norm": 0.3131239116191864, + "learning_rate": 4.877010406622799e-05, + "loss": 0.2704, + "step": 10665 + }, + { + "epoch": 0.19024007419826633, + "grad_norm": 0.28706690669059753, + "learning_rate": 4.876962182565234e-05, + "loss": 0.2338, + "step": 10666 + }, + { + "epoch": 0.19025791031998002, + "grad_norm": 0.36330926418304443, + "learning_rate": 4.8769139492937424e-05, + "loss": 0.1698, + "step": 10667 + }, + { + "epoch": 0.1902757464416937, + "grad_norm": 0.24396871030330658, + "learning_rate": 4.876865706808511e-05, + "loss": 0.2072, + "step": 10668 + }, + { + "epoch": 0.19029358256340742, + "grad_norm": 0.2736376225948334, + "learning_rate": 4.876817455109728e-05, + "loss": 0.2225, + "step": 10669 + }, + { + "epoch": 0.1903114186851211, + "grad_norm": 0.25717228651046753, + "learning_rate": 4.876769194197579e-05, + "loss": 0.1949, + "step": 10670 + }, + { + "epoch": 0.1903292548068348, + "grad_norm": 0.31902459263801575, + "learning_rate": 4.876720924072252e-05, + "loss": 0.2415, + "step": 10671 + }, + { + "epoch": 0.19034709092854848, + "grad_norm": 0.2533029317855835, + "learning_rate": 4.876672644733934e-05, + "loss": 0.1688, + "step": 10672 + }, + { + "epoch": 0.1903649270502622, + "grad_norm": 0.6672841906547546, + "learning_rate": 4.876624356182811e-05, + "loss": 0.1843, + "step": 10673 + }, + { + "epoch": 0.1903827631719759, + "grad_norm": 0.30146417021751404, + "learning_rate": 4.876576058419072e-05, + "loss": 0.2155, + "step": 10674 + }, + { + "epoch": 0.19040059929368958, + "grad_norm": 0.5877314805984497, + "learning_rate": 4.876527751442903e-05, + "loss": 0.3144, + "step": 10675 + }, + { + "epoch": 0.19041843541540326, + "grad_norm": 0.4620359539985657, + "learning_rate": 4.876479435254492e-05, + "loss": 0.2325, + "step": 10676 + }, + { + "epoch": 0.19043627153711698, + "grad_norm": 0.2622694969177246, + "learning_rate": 4.8764311098540256e-05, + "loss": 0.1864, + "step": 10677 + }, + { + "epoch": 0.19045410765883067, + "grad_norm": 0.3003308176994324, + "learning_rate": 4.876382775241691e-05, + "loss": 0.2028, + "step": 10678 + }, + { + "epoch": 0.19047194378054436, + "grad_norm": 0.33290034532546997, + "learning_rate": 4.876334431417677e-05, + "loss": 0.2158, + "step": 10679 + }, + { + "epoch": 0.19048977990225804, + "grad_norm": 0.3635729253292084, + "learning_rate": 4.8762860783821695e-05, + "loss": 0.2039, + "step": 10680 + }, + { + "epoch": 0.19050761602397176, + "grad_norm": 0.23168057203292847, + "learning_rate": 4.876237716135356e-05, + "loss": 0.1456, + "step": 10681 + }, + { + "epoch": 0.19052545214568545, + "grad_norm": 0.37110477685928345, + "learning_rate": 4.8761893446774256e-05, + "loss": 0.2385, + "step": 10682 + }, + { + "epoch": 0.19054328826739914, + "grad_norm": 0.2669340670108795, + "learning_rate": 4.876140964008563e-05, + "loss": 0.1782, + "step": 10683 + }, + { + "epoch": 0.19056112438911282, + "grad_norm": 0.3143084645271301, + "learning_rate": 4.876092574128958e-05, + "loss": 0.1606, + "step": 10684 + }, + { + "epoch": 0.19057896051082654, + "grad_norm": 0.2638698220252991, + "learning_rate": 4.876044175038797e-05, + "loss": 0.2126, + "step": 10685 + }, + { + "epoch": 0.19059679663254023, + "grad_norm": 0.3711576461791992, + "learning_rate": 4.87599576673827e-05, + "loss": 0.1979, + "step": 10686 + }, + { + "epoch": 0.19061463275425392, + "grad_norm": 0.397236704826355, + "learning_rate": 4.875947349227561e-05, + "loss": 0.2516, + "step": 10687 + }, + { + "epoch": 0.1906324688759676, + "grad_norm": 0.2776789367198944, + "learning_rate": 4.8758989225068596e-05, + "loss": 0.2108, + "step": 10688 + }, + { + "epoch": 0.1906503049976813, + "grad_norm": 0.36340075731277466, + "learning_rate": 4.8758504865763544e-05, + "loss": 0.2122, + "step": 10689 + }, + { + "epoch": 0.190668141119395, + "grad_norm": 0.34707874059677124, + "learning_rate": 4.875802041436231e-05, + "loss": 0.2305, + "step": 10690 + }, + { + "epoch": 0.1906859772411087, + "grad_norm": 0.28226712346076965, + "learning_rate": 4.8757535870866785e-05, + "loss": 0.1888, + "step": 10691 + }, + { + "epoch": 0.19070381336282238, + "grad_norm": 0.3021794855594635, + "learning_rate": 4.875705123527885e-05, + "loss": 0.2266, + "step": 10692 + }, + { + "epoch": 0.19072164948453607, + "grad_norm": 0.2521364390850067, + "learning_rate": 4.875656650760038e-05, + "loss": 0.191, + "step": 10693 + }, + { + "epoch": 0.1907394856062498, + "grad_norm": 0.18075856566429138, + "learning_rate": 4.875608168783324e-05, + "loss": 0.1905, + "step": 10694 + }, + { + "epoch": 0.19075732172796347, + "grad_norm": 0.26127690076828003, + "learning_rate": 4.875559677597934e-05, + "loss": 0.2108, + "step": 10695 + }, + { + "epoch": 0.19077515784967716, + "grad_norm": 0.27619415521621704, + "learning_rate": 4.8755111772040526e-05, + "loss": 0.2049, + "step": 10696 + }, + { + "epoch": 0.19079299397139085, + "grad_norm": 0.37192365527153015, + "learning_rate": 4.87546266760187e-05, + "loss": 0.1842, + "step": 10697 + }, + { + "epoch": 0.19081083009310457, + "grad_norm": 0.285893976688385, + "learning_rate": 4.8754141487915745e-05, + "loss": 0.1877, + "step": 10698 + }, + { + "epoch": 0.19082866621481825, + "grad_norm": 0.37313127517700195, + "learning_rate": 4.875365620773352e-05, + "loss": 0.201, + "step": 10699 + }, + { + "epoch": 0.19084650233653194, + "grad_norm": 0.3476355969905853, + "learning_rate": 4.8753170835473926e-05, + "loss": 0.2817, + "step": 10700 + }, + { + "epoch": 0.19086433845824563, + "grad_norm": 0.2675214409828186, + "learning_rate": 4.875268537113884e-05, + "loss": 0.194, + "step": 10701 + }, + { + "epoch": 0.19088217457995935, + "grad_norm": 0.31153154373168945, + "learning_rate": 4.8752199814730134e-05, + "loss": 0.2034, + "step": 10702 + }, + { + "epoch": 0.19090001070167303, + "grad_norm": 0.23315644264221191, + "learning_rate": 4.8751714166249706e-05, + "loss": 0.1858, + "step": 10703 + }, + { + "epoch": 0.19091784682338672, + "grad_norm": 0.36079779267311096, + "learning_rate": 4.875122842569943e-05, + "loss": 0.264, + "step": 10704 + }, + { + "epoch": 0.1909356829451004, + "grad_norm": 0.2607985734939575, + "learning_rate": 4.875074259308119e-05, + "loss": 0.1662, + "step": 10705 + }, + { + "epoch": 0.19095351906681413, + "grad_norm": 0.4972494840621948, + "learning_rate": 4.875025666839686e-05, + "loss": 0.2623, + "step": 10706 + }, + { + "epoch": 0.1909713551885278, + "grad_norm": 0.3176359236240387, + "learning_rate": 4.874977065164834e-05, + "loss": 0.199, + "step": 10707 + }, + { + "epoch": 0.1909891913102415, + "grad_norm": 0.22909832000732422, + "learning_rate": 4.8749284542837504e-05, + "loss": 0.1805, + "step": 10708 + }, + { + "epoch": 0.1910070274319552, + "grad_norm": 0.23171298205852509, + "learning_rate": 4.874879834196623e-05, + "loss": 0.2299, + "step": 10709 + }, + { + "epoch": 0.19102486355366888, + "grad_norm": 0.318135142326355, + "learning_rate": 4.8748312049036426e-05, + "loss": 0.1971, + "step": 10710 + }, + { + "epoch": 0.1910426996753826, + "grad_norm": 0.2622092366218567, + "learning_rate": 4.8747825664049954e-05, + "loss": 0.2292, + "step": 10711 + }, + { + "epoch": 0.19106053579709628, + "grad_norm": 0.23834381997585297, + "learning_rate": 4.8747339187008707e-05, + "loss": 0.2478, + "step": 10712 + }, + { + "epoch": 0.19107837191880997, + "grad_norm": 0.276817262172699, + "learning_rate": 4.8746852617914575e-05, + "loss": 0.1794, + "step": 10713 + }, + { + "epoch": 0.19109620804052366, + "grad_norm": 0.30297133326530457, + "learning_rate": 4.8746365956769436e-05, + "loss": 0.2156, + "step": 10714 + }, + { + "epoch": 0.19111404416223737, + "grad_norm": 0.25306639075279236, + "learning_rate": 4.874587920357518e-05, + "loss": 0.2202, + "step": 10715 + }, + { + "epoch": 0.19113188028395106, + "grad_norm": 0.27077245712280273, + "learning_rate": 4.87453923583337e-05, + "loss": 0.1987, + "step": 10716 + }, + { + "epoch": 0.19114971640566475, + "grad_norm": 0.32376712560653687, + "learning_rate": 4.874490542104687e-05, + "loss": 0.2671, + "step": 10717 + }, + { + "epoch": 0.19116755252737844, + "grad_norm": 0.2347438484430313, + "learning_rate": 4.8744418391716597e-05, + "loss": 0.2045, + "step": 10718 + }, + { + "epoch": 0.19118538864909215, + "grad_norm": 0.23465541005134583, + "learning_rate": 4.8743931270344745e-05, + "loss": 0.1664, + "step": 10719 + }, + { + "epoch": 0.19120322477080584, + "grad_norm": 0.26362845301628113, + "learning_rate": 4.8743444056933216e-05, + "loss": 0.1581, + "step": 10720 + }, + { + "epoch": 0.19122106089251953, + "grad_norm": 0.30119380354881287, + "learning_rate": 4.87429567514839e-05, + "loss": 0.1942, + "step": 10721 + }, + { + "epoch": 0.19123889701423322, + "grad_norm": 0.3214512765407562, + "learning_rate": 4.874246935399869e-05, + "loss": 0.1876, + "step": 10722 + }, + { + "epoch": 0.19125673313594693, + "grad_norm": 0.3232775330543518, + "learning_rate": 4.874198186447946e-05, + "loss": 0.1692, + "step": 10723 + }, + { + "epoch": 0.19127456925766062, + "grad_norm": 0.2662023603916168, + "learning_rate": 4.874149428292811e-05, + "loss": 0.2053, + "step": 10724 + }, + { + "epoch": 0.1912924053793743, + "grad_norm": 0.323077529668808, + "learning_rate": 4.8741006609346527e-05, + "loss": 0.2443, + "step": 10725 + }, + { + "epoch": 0.191310241501088, + "grad_norm": 0.34338510036468506, + "learning_rate": 4.87405188437366e-05, + "loss": 0.1505, + "step": 10726 + }, + { + "epoch": 0.19132807762280168, + "grad_norm": 0.22434459626674652, + "learning_rate": 4.874003098610023e-05, + "loss": 0.1771, + "step": 10727 + }, + { + "epoch": 0.1913459137445154, + "grad_norm": 0.29782113432884216, + "learning_rate": 4.87395430364393e-05, + "loss": 0.2559, + "step": 10728 + }, + { + "epoch": 0.1913637498662291, + "grad_norm": 0.2826301157474518, + "learning_rate": 4.873905499475569e-05, + "loss": 0.2342, + "step": 10729 + }, + { + "epoch": 0.19138158598794278, + "grad_norm": 0.3040046989917755, + "learning_rate": 4.8738566861051324e-05, + "loss": 0.1915, + "step": 10730 + }, + { + "epoch": 0.19139942210965646, + "grad_norm": 0.23923376202583313, + "learning_rate": 4.873807863532806e-05, + "loss": 0.186, + "step": 10731 + }, + { + "epoch": 0.19141725823137018, + "grad_norm": 0.4032547175884247, + "learning_rate": 4.8737590317587806e-05, + "loss": 0.1615, + "step": 10732 + }, + { + "epoch": 0.19143509435308387, + "grad_norm": 0.33723071217536926, + "learning_rate": 4.873710190783245e-05, + "loss": 0.2841, + "step": 10733 + }, + { + "epoch": 0.19145293047479756, + "grad_norm": 0.339765727519989, + "learning_rate": 4.8736613406063894e-05, + "loss": 0.215, + "step": 10734 + }, + { + "epoch": 0.19147076659651124, + "grad_norm": 0.3320823907852173, + "learning_rate": 4.8736124812284025e-05, + "loss": 0.2184, + "step": 10735 + }, + { + "epoch": 0.19148860271822496, + "grad_norm": 0.4221637547016144, + "learning_rate": 4.8735636126494735e-05, + "loss": 0.2394, + "step": 10736 + }, + { + "epoch": 0.19150643883993865, + "grad_norm": 0.2771061956882477, + "learning_rate": 4.873514734869793e-05, + "loss": 0.2106, + "step": 10737 + }, + { + "epoch": 0.19152427496165234, + "grad_norm": 0.2547479569911957, + "learning_rate": 4.873465847889549e-05, + "loss": 0.2227, + "step": 10738 + }, + { + "epoch": 0.19154211108336602, + "grad_norm": 0.5282972455024719, + "learning_rate": 4.8734169517089315e-05, + "loss": 0.1985, + "step": 10739 + }, + { + "epoch": 0.19155994720507974, + "grad_norm": 0.2373201847076416, + "learning_rate": 4.873368046328129e-05, + "loss": 0.2075, + "step": 10740 + }, + { + "epoch": 0.19157778332679343, + "grad_norm": 0.24604111909866333, + "learning_rate": 4.873319131747334e-05, + "loss": 0.186, + "step": 10741 + }, + { + "epoch": 0.19159561944850712, + "grad_norm": 0.25536486506462097, + "learning_rate": 4.873270207966734e-05, + "loss": 0.197, + "step": 10742 + }, + { + "epoch": 0.1916134555702208, + "grad_norm": 0.28811123967170715, + "learning_rate": 4.8732212749865183e-05, + "loss": 0.1741, + "step": 10743 + }, + { + "epoch": 0.19163129169193452, + "grad_norm": 0.21727687120437622, + "learning_rate": 4.873172332806878e-05, + "loss": 0.194, + "step": 10744 + }, + { + "epoch": 0.1916491278136482, + "grad_norm": 0.3271524906158447, + "learning_rate": 4.873123381428002e-05, + "loss": 0.2366, + "step": 10745 + }, + { + "epoch": 0.1916669639353619, + "grad_norm": 0.22577716410160065, + "learning_rate": 4.8730744208500803e-05, + "loss": 0.2071, + "step": 10746 + }, + { + "epoch": 0.19168480005707558, + "grad_norm": 0.3227449357509613, + "learning_rate": 4.8730254510733014e-05, + "loss": 0.164, + "step": 10747 + }, + { + "epoch": 0.19170263617878927, + "grad_norm": 0.24243931472301483, + "learning_rate": 4.8729764720978565e-05, + "loss": 0.1931, + "step": 10748 + }, + { + "epoch": 0.191720472300503, + "grad_norm": 0.3302162289619446, + "learning_rate": 4.872927483923936e-05, + "loss": 0.22, + "step": 10749 + }, + { + "epoch": 0.19173830842221667, + "grad_norm": 0.2613740563392639, + "learning_rate": 4.872878486551728e-05, + "loss": 0.1605, + "step": 10750 + }, + { + "epoch": 0.19175614454393036, + "grad_norm": 0.22148790955543518, + "learning_rate": 4.8728294799814244e-05, + "loss": 0.1747, + "step": 10751 + }, + { + "epoch": 0.19177398066564405, + "grad_norm": 0.23619091510772705, + "learning_rate": 4.872780464213214e-05, + "loss": 0.1598, + "step": 10752 + }, + { + "epoch": 0.19179181678735777, + "grad_norm": 0.24416673183441162, + "learning_rate": 4.872731439247287e-05, + "loss": 0.1775, + "step": 10753 + }, + { + "epoch": 0.19180965290907145, + "grad_norm": 0.24513179063796997, + "learning_rate": 4.872682405083833e-05, + "loss": 0.1532, + "step": 10754 + }, + { + "epoch": 0.19182748903078514, + "grad_norm": 0.216408833861351, + "learning_rate": 4.872633361723043e-05, + "loss": 0.2088, + "step": 10755 + }, + { + "epoch": 0.19184532515249883, + "grad_norm": 0.347248911857605, + "learning_rate": 4.8725843091651057e-05, + "loss": 0.2144, + "step": 10756 + }, + { + "epoch": 0.19186316127421255, + "grad_norm": 0.3666917085647583, + "learning_rate": 4.872535247410213e-05, + "loss": 0.2423, + "step": 10757 + }, + { + "epoch": 0.19188099739592623, + "grad_norm": 0.22781166434288025, + "learning_rate": 4.872486176458554e-05, + "loss": 0.2025, + "step": 10758 + }, + { + "epoch": 0.19189883351763992, + "grad_norm": 0.19777579605579376, + "learning_rate": 4.8724370963103195e-05, + "loss": 0.1969, + "step": 10759 + }, + { + "epoch": 0.1919166696393536, + "grad_norm": 0.3431404232978821, + "learning_rate": 4.872388006965699e-05, + "loss": 0.241, + "step": 10760 + }, + { + "epoch": 0.19193450576106733, + "grad_norm": 0.3505854904651642, + "learning_rate": 4.8723389084248836e-05, + "loss": 0.2034, + "step": 10761 + }, + { + "epoch": 0.191952341882781, + "grad_norm": 0.3567541241645813, + "learning_rate": 4.872289800688063e-05, + "loss": 0.1626, + "step": 10762 + }, + { + "epoch": 0.1919701780044947, + "grad_norm": 0.34400448203086853, + "learning_rate": 4.872240683755427e-05, + "loss": 0.1878, + "step": 10763 + }, + { + "epoch": 0.1919880141262084, + "grad_norm": 0.2739449441432953, + "learning_rate": 4.8721915576271676e-05, + "loss": 0.1989, + "step": 10764 + }, + { + "epoch": 0.1920058502479221, + "grad_norm": 0.3268420398235321, + "learning_rate": 4.872142422303474e-05, + "loss": 0.227, + "step": 10765 + }, + { + "epoch": 0.1920236863696358, + "grad_norm": 0.2755216062068939, + "learning_rate": 4.8720932777845376e-05, + "loss": 0.2013, + "step": 10766 + }, + { + "epoch": 0.19204152249134948, + "grad_norm": 0.42162027955055237, + "learning_rate": 4.872044124070548e-05, + "loss": 0.1884, + "step": 10767 + }, + { + "epoch": 0.19205935861306317, + "grad_norm": 0.267325222492218, + "learning_rate": 4.871994961161695e-05, + "loss": 0.1785, + "step": 10768 + }, + { + "epoch": 0.19207719473477686, + "grad_norm": 0.22796767950057983, + "learning_rate": 4.871945789058172e-05, + "loss": 0.1904, + "step": 10769 + }, + { + "epoch": 0.19209503085649057, + "grad_norm": 0.32255902886390686, + "learning_rate": 4.871896607760168e-05, + "loss": 0.1818, + "step": 10770 + }, + { + "epoch": 0.19211286697820426, + "grad_norm": 0.3439323306083679, + "learning_rate": 4.8718474172678725e-05, + "loss": 0.1981, + "step": 10771 + }, + { + "epoch": 0.19213070309991795, + "grad_norm": 0.23410923779010773, + "learning_rate": 4.8717982175814774e-05, + "loss": 0.2099, + "step": 10772 + }, + { + "epoch": 0.19214853922163164, + "grad_norm": 0.2560957670211792, + "learning_rate": 4.871749008701173e-05, + "loss": 0.1981, + "step": 10773 + }, + { + "epoch": 0.19216637534334535, + "grad_norm": 0.23217159509658813, + "learning_rate": 4.871699790627151e-05, + "loss": 0.2204, + "step": 10774 + }, + { + "epoch": 0.19218421146505904, + "grad_norm": 0.2590517997741699, + "learning_rate": 4.871650563359601e-05, + "loss": 0.1566, + "step": 10775 + }, + { + "epoch": 0.19220204758677273, + "grad_norm": 0.3200806677341461, + "learning_rate": 4.871601326898714e-05, + "loss": 0.2173, + "step": 10776 + }, + { + "epoch": 0.19221988370848642, + "grad_norm": 0.2960502505302429, + "learning_rate": 4.8715520812446816e-05, + "loss": 0.1738, + "step": 10777 + }, + { + "epoch": 0.19223771983020013, + "grad_norm": 0.26086127758026123, + "learning_rate": 4.871502826397694e-05, + "loss": 0.2041, + "step": 10778 + }, + { + "epoch": 0.19225555595191382, + "grad_norm": 0.2736360430717468, + "learning_rate": 4.871453562357943e-05, + "loss": 0.1971, + "step": 10779 + }, + { + "epoch": 0.1922733920736275, + "grad_norm": 0.2267794907093048, + "learning_rate": 4.8714042891256175e-05, + "loss": 0.1632, + "step": 10780 + }, + { + "epoch": 0.1922912281953412, + "grad_norm": 0.21866673231124878, + "learning_rate": 4.8713550067009106e-05, + "loss": 0.1901, + "step": 10781 + }, + { + "epoch": 0.1923090643170549, + "grad_norm": 0.2071189284324646, + "learning_rate": 4.871305715084013e-05, + "loss": 0.1645, + "step": 10782 + }, + { + "epoch": 0.1923269004387686, + "grad_norm": 0.2618585228919983, + "learning_rate": 4.8712564142751154e-05, + "loss": 0.1955, + "step": 10783 + }, + { + "epoch": 0.1923447365604823, + "grad_norm": 0.2709810137748718, + "learning_rate": 4.871207104274409e-05, + "loss": 0.2486, + "step": 10784 + }, + { + "epoch": 0.19236257268219598, + "grad_norm": 0.3615318834781647, + "learning_rate": 4.8711577850820845e-05, + "loss": 0.2187, + "step": 10785 + }, + { + "epoch": 0.1923804088039097, + "grad_norm": 0.3363041579723358, + "learning_rate": 4.8711084566983334e-05, + "loss": 0.2602, + "step": 10786 + }, + { + "epoch": 0.19239824492562338, + "grad_norm": 0.3892727792263031, + "learning_rate": 4.8710591191233466e-05, + "loss": 0.1825, + "step": 10787 + }, + { + "epoch": 0.19241608104733707, + "grad_norm": 0.2107529491186142, + "learning_rate": 4.871009772357317e-05, + "loss": 0.2102, + "step": 10788 + }, + { + "epoch": 0.19243391716905076, + "grad_norm": 0.3997096121311188, + "learning_rate": 4.870960416400434e-05, + "loss": 0.1961, + "step": 10789 + }, + { + "epoch": 0.19245175329076444, + "grad_norm": 0.25473466515541077, + "learning_rate": 4.870911051252889e-05, + "loss": 0.2156, + "step": 10790 + }, + { + "epoch": 0.19246958941247816, + "grad_norm": 0.3052992522716522, + "learning_rate": 4.870861676914874e-05, + "loss": 0.1973, + "step": 10791 + }, + { + "epoch": 0.19248742553419185, + "grad_norm": 0.28740665316581726, + "learning_rate": 4.870812293386581e-05, + "loss": 0.2368, + "step": 10792 + }, + { + "epoch": 0.19250526165590554, + "grad_norm": 0.26118016242980957, + "learning_rate": 4.8707629006682e-05, + "loss": 0.16, + "step": 10793 + }, + { + "epoch": 0.19252309777761922, + "grad_norm": 0.3979540169239044, + "learning_rate": 4.870713498759924e-05, + "loss": 0.197, + "step": 10794 + }, + { + "epoch": 0.19254093389933294, + "grad_norm": 0.24741658568382263, + "learning_rate": 4.8706640876619423e-05, + "loss": 0.2398, + "step": 10795 + }, + { + "epoch": 0.19255877002104663, + "grad_norm": 0.2383471131324768, + "learning_rate": 4.870614667374449e-05, + "loss": 0.2067, + "step": 10796 + }, + { + "epoch": 0.19257660614276031, + "grad_norm": 0.20983032882213593, + "learning_rate": 4.870565237897634e-05, + "loss": 0.1858, + "step": 10797 + }, + { + "epoch": 0.192594442264474, + "grad_norm": 0.33932432532310486, + "learning_rate": 4.87051579923169e-05, + "loss": 0.2122, + "step": 10798 + }, + { + "epoch": 0.19261227838618772, + "grad_norm": 0.3419553339481354, + "learning_rate": 4.8704663513768065e-05, + "loss": 0.2347, + "step": 10799 + }, + { + "epoch": 0.1926301145079014, + "grad_norm": 0.226662278175354, + "learning_rate": 4.870416894333178e-05, + "loss": 0.162, + "step": 10800 + }, + { + "epoch": 0.1926479506296151, + "grad_norm": 0.2669966220855713, + "learning_rate": 4.8703674281009944e-05, + "loss": 0.1794, + "step": 10801 + }, + { + "epoch": 0.19266578675132878, + "grad_norm": 0.2578684687614441, + "learning_rate": 4.870317952680448e-05, + "loss": 0.2104, + "step": 10802 + }, + { + "epoch": 0.1926836228730425, + "grad_norm": 0.36353614926338196, + "learning_rate": 4.87026846807173e-05, + "loss": 0.2095, + "step": 10803 + }, + { + "epoch": 0.1927014589947562, + "grad_norm": 0.26796650886535645, + "learning_rate": 4.8702189742750336e-05, + "loss": 0.1554, + "step": 10804 + }, + { + "epoch": 0.19271929511646987, + "grad_norm": 0.2691427767276764, + "learning_rate": 4.87016947129055e-05, + "loss": 0.2155, + "step": 10805 + }, + { + "epoch": 0.19273713123818356, + "grad_norm": 0.43507060408592224, + "learning_rate": 4.87011995911847e-05, + "loss": 0.2378, + "step": 10806 + }, + { + "epoch": 0.19275496735989725, + "grad_norm": 0.2600294351577759, + "learning_rate": 4.870070437758987e-05, + "loss": 0.2423, + "step": 10807 + }, + { + "epoch": 0.19277280348161097, + "grad_norm": 0.2781378924846649, + "learning_rate": 4.870020907212293e-05, + "loss": 0.146, + "step": 10808 + }, + { + "epoch": 0.19279063960332465, + "grad_norm": 0.28758978843688965, + "learning_rate": 4.869971367478578e-05, + "loss": 0.2175, + "step": 10809 + }, + { + "epoch": 0.19280847572503834, + "grad_norm": 0.23192574083805084, + "learning_rate": 4.8699218185580364e-05, + "loss": 0.1984, + "step": 10810 + }, + { + "epoch": 0.19282631184675203, + "grad_norm": 0.3394330143928528, + "learning_rate": 4.8698722604508585e-05, + "loss": 0.2212, + "step": 10811 + }, + { + "epoch": 0.19284414796846575, + "grad_norm": 0.2767375409603119, + "learning_rate": 4.869822693157238e-05, + "loss": 0.1984, + "step": 10812 + }, + { + "epoch": 0.19286198409017943, + "grad_norm": 0.36856672167778015, + "learning_rate": 4.869773116677365e-05, + "loss": 0.2307, + "step": 10813 + }, + { + "epoch": 0.19287982021189312, + "grad_norm": 0.2712098956108093, + "learning_rate": 4.869723531011434e-05, + "loss": 0.1482, + "step": 10814 + }, + { + "epoch": 0.1928976563336068, + "grad_norm": 0.29187947511672974, + "learning_rate": 4.8696739361596364e-05, + "loss": 0.2319, + "step": 10815 + }, + { + "epoch": 0.19291549245532053, + "grad_norm": 0.3189451992511749, + "learning_rate": 4.8696243321221633e-05, + "loss": 0.2065, + "step": 10816 + }, + { + "epoch": 0.1929333285770342, + "grad_norm": 0.31867697834968567, + "learning_rate": 4.869574718899208e-05, + "loss": 0.1791, + "step": 10817 + }, + { + "epoch": 0.1929511646987479, + "grad_norm": 0.36782655119895935, + "learning_rate": 4.8695250964909634e-05, + "loss": 0.2715, + "step": 10818 + }, + { + "epoch": 0.1929690008204616, + "grad_norm": 0.44313058257102966, + "learning_rate": 4.86947546489762e-05, + "loss": 0.183, + "step": 10819 + }, + { + "epoch": 0.1929868369421753, + "grad_norm": 0.29162415862083435, + "learning_rate": 4.869425824119373e-05, + "loss": 0.2108, + "step": 10820 + }, + { + "epoch": 0.193004673063889, + "grad_norm": 0.27659785747528076, + "learning_rate": 4.8693761741564116e-05, + "loss": 0.1941, + "step": 10821 + }, + { + "epoch": 0.19302250918560268, + "grad_norm": 0.5724294185638428, + "learning_rate": 4.86932651500893e-05, + "loss": 0.2551, + "step": 10822 + }, + { + "epoch": 0.19304034530731637, + "grad_norm": 0.22013120353221893, + "learning_rate": 4.869276846677121e-05, + "loss": 0.1602, + "step": 10823 + }, + { + "epoch": 0.19305818142903008, + "grad_norm": 0.34596380591392517, + "learning_rate": 4.8692271691611755e-05, + "loss": 0.2071, + "step": 10824 + }, + { + "epoch": 0.19307601755074377, + "grad_norm": 0.274842232465744, + "learning_rate": 4.869177482461288e-05, + "loss": 0.1678, + "step": 10825 + }, + { + "epoch": 0.19309385367245746, + "grad_norm": 0.26954323053359985, + "learning_rate": 4.869127786577651e-05, + "loss": 0.2212, + "step": 10826 + }, + { + "epoch": 0.19311168979417115, + "grad_norm": 0.36622151732444763, + "learning_rate": 4.869078081510455e-05, + "loss": 0.2136, + "step": 10827 + }, + { + "epoch": 0.19312952591588484, + "grad_norm": 0.1847706139087677, + "learning_rate": 4.869028367259896e-05, + "loss": 0.1542, + "step": 10828 + }, + { + "epoch": 0.19314736203759855, + "grad_norm": 0.4318757653236389, + "learning_rate": 4.868978643826163e-05, + "loss": 0.1855, + "step": 10829 + }, + { + "epoch": 0.19316519815931224, + "grad_norm": 0.30360129475593567, + "learning_rate": 4.8689289112094515e-05, + "loss": 0.222, + "step": 10830 + }, + { + "epoch": 0.19318303428102593, + "grad_norm": 0.3232423663139343, + "learning_rate": 4.8688791694099525e-05, + "loss": 0.1796, + "step": 10831 + }, + { + "epoch": 0.19320087040273962, + "grad_norm": 0.2598733901977539, + "learning_rate": 4.868829418427861e-05, + "loss": 0.2433, + "step": 10832 + }, + { + "epoch": 0.19321870652445333, + "grad_norm": 0.23770968616008759, + "learning_rate": 4.8687796582633673e-05, + "loss": 0.2249, + "step": 10833 + }, + { + "epoch": 0.19323654264616702, + "grad_norm": 0.21962356567382812, + "learning_rate": 4.8687298889166655e-05, + "loss": 0.1789, + "step": 10834 + }, + { + "epoch": 0.1932543787678807, + "grad_norm": 0.26733145117759705, + "learning_rate": 4.868680110387949e-05, + "loss": 0.16, + "step": 10835 + }, + { + "epoch": 0.1932722148895944, + "grad_norm": 0.3937084674835205, + "learning_rate": 4.868630322677411e-05, + "loss": 0.323, + "step": 10836 + }, + { + "epoch": 0.1932900510113081, + "grad_norm": 0.26207637786865234, + "learning_rate": 4.868580525785242e-05, + "loss": 0.2092, + "step": 10837 + }, + { + "epoch": 0.1933078871330218, + "grad_norm": 0.20126092433929443, + "learning_rate": 4.868530719711638e-05, + "loss": 0.1836, + "step": 10838 + }, + { + "epoch": 0.1933257232547355, + "grad_norm": 0.2982984185218811, + "learning_rate": 4.868480904456791e-05, + "loss": 0.2397, + "step": 10839 + }, + { + "epoch": 0.19334355937644918, + "grad_norm": 0.27005136013031006, + "learning_rate": 4.868431080020893e-05, + "loss": 0.233, + "step": 10840 + }, + { + "epoch": 0.1933613954981629, + "grad_norm": 0.24865023791790009, + "learning_rate": 4.868381246404139e-05, + "loss": 0.1481, + "step": 10841 + }, + { + "epoch": 0.19337923161987658, + "grad_norm": 0.3362480401992798, + "learning_rate": 4.8683314036067205e-05, + "loss": 0.268, + "step": 10842 + }, + { + "epoch": 0.19339706774159027, + "grad_norm": 0.40264660120010376, + "learning_rate": 4.868281551628833e-05, + "loss": 0.2347, + "step": 10843 + }, + { + "epoch": 0.19341490386330396, + "grad_norm": 0.21482892334461212, + "learning_rate": 4.868231690470667e-05, + "loss": 0.19, + "step": 10844 + }, + { + "epoch": 0.19343273998501767, + "grad_norm": 0.2526087760925293, + "learning_rate": 4.868181820132417e-05, + "loss": 0.2067, + "step": 10845 + }, + { + "epoch": 0.19345057610673136, + "grad_norm": 0.18698081374168396, + "learning_rate": 4.868131940614277e-05, + "loss": 0.1784, + "step": 10846 + }, + { + "epoch": 0.19346841222844505, + "grad_norm": 0.23443618416786194, + "learning_rate": 4.868082051916438e-05, + "loss": 0.2057, + "step": 10847 + }, + { + "epoch": 0.19348624835015873, + "grad_norm": 0.28047892451286316, + "learning_rate": 4.8680321540390974e-05, + "loss": 0.2106, + "step": 10848 + }, + { + "epoch": 0.19350408447187242, + "grad_norm": 0.3395068645477295, + "learning_rate": 4.8679822469824444e-05, + "loss": 0.2003, + "step": 10849 + }, + { + "epoch": 0.19352192059358614, + "grad_norm": 0.2478107362985611, + "learning_rate": 4.867932330746675e-05, + "loss": 0.2144, + "step": 10850 + }, + { + "epoch": 0.19353975671529983, + "grad_norm": 0.2190067023038864, + "learning_rate": 4.867882405331983e-05, + "loss": 0.1752, + "step": 10851 + }, + { + "epoch": 0.19355759283701351, + "grad_norm": 0.2671342194080353, + "learning_rate": 4.867832470738559e-05, + "loss": 0.2254, + "step": 10852 + }, + { + "epoch": 0.1935754289587272, + "grad_norm": 0.30748844146728516, + "learning_rate": 4.8677825269666e-05, + "loss": 0.2, + "step": 10853 + }, + { + "epoch": 0.19359326508044092, + "grad_norm": 0.20674552023410797, + "learning_rate": 4.8677325740162974e-05, + "loss": 0.1793, + "step": 10854 + }, + { + "epoch": 0.1936111012021546, + "grad_norm": 0.32860302925109863, + "learning_rate": 4.867682611887846e-05, + "loss": 0.1456, + "step": 10855 + }, + { + "epoch": 0.1936289373238683, + "grad_norm": 0.3131978213787079, + "learning_rate": 4.8676326405814384e-05, + "loss": 0.2181, + "step": 10856 + }, + { + "epoch": 0.19364677344558198, + "grad_norm": 0.25286146998405457, + "learning_rate": 4.86758266009727e-05, + "loss": 0.2157, + "step": 10857 + }, + { + "epoch": 0.1936646095672957, + "grad_norm": 0.36881959438323975, + "learning_rate": 4.8675326704355325e-05, + "loss": 0.3224, + "step": 10858 + }, + { + "epoch": 0.19368244568900939, + "grad_norm": 0.23446522653102875, + "learning_rate": 4.8674826715964216e-05, + "loss": 0.1333, + "step": 10859 + }, + { + "epoch": 0.19370028181072307, + "grad_norm": 0.24140134453773499, + "learning_rate": 4.8674326635801294e-05, + "loss": 0.225, + "step": 10860 + }, + { + "epoch": 0.19371811793243676, + "grad_norm": 0.22906368970870972, + "learning_rate": 4.867382646386851e-05, + "loss": 0.165, + "step": 10861 + }, + { + "epoch": 0.19373595405415048, + "grad_norm": 0.2945690155029297, + "learning_rate": 4.867332620016779e-05, + "loss": 0.2849, + "step": 10862 + }, + { + "epoch": 0.19375379017586417, + "grad_norm": 0.23159514367580414, + "learning_rate": 4.867282584470109e-05, + "loss": 0.2187, + "step": 10863 + }, + { + "epoch": 0.19377162629757785, + "grad_norm": 0.210427388548851, + "learning_rate": 4.867232539747033e-05, + "loss": 0.1433, + "step": 10864 + }, + { + "epoch": 0.19378946241929154, + "grad_norm": 0.3701137602329254, + "learning_rate": 4.867182485847747e-05, + "loss": 0.1759, + "step": 10865 + }, + { + "epoch": 0.19380729854100526, + "grad_norm": 0.3354681432247162, + "learning_rate": 4.8671324227724444e-05, + "loss": 0.2063, + "step": 10866 + }, + { + "epoch": 0.19382513466271895, + "grad_norm": 0.2675134837627411, + "learning_rate": 4.867082350521318e-05, + "loss": 0.2005, + "step": 10867 + }, + { + "epoch": 0.19384297078443263, + "grad_norm": 0.29437896609306335, + "learning_rate": 4.867032269094563e-05, + "loss": 0.2271, + "step": 10868 + }, + { + "epoch": 0.19386080690614632, + "grad_norm": 0.304161936044693, + "learning_rate": 4.866982178492374e-05, + "loss": 0.1886, + "step": 10869 + }, + { + "epoch": 0.19387864302786, + "grad_norm": 0.28409087657928467, + "learning_rate": 4.866932078714944e-05, + "loss": 0.1967, + "step": 10870 + }, + { + "epoch": 0.19389647914957372, + "grad_norm": 0.2600594460964203, + "learning_rate": 4.866881969762468e-05, + "loss": 0.1916, + "step": 10871 + }, + { + "epoch": 0.1939143152712874, + "grad_norm": 0.5094261169433594, + "learning_rate": 4.86683185163514e-05, + "loss": 0.2283, + "step": 10872 + }, + { + "epoch": 0.1939321513930011, + "grad_norm": 0.24648918211460114, + "learning_rate": 4.8667817243331534e-05, + "loss": 0.1862, + "step": 10873 + }, + { + "epoch": 0.1939499875147148, + "grad_norm": 0.2584965229034424, + "learning_rate": 4.8667315878567044e-05, + "loss": 0.1896, + "step": 10874 + }, + { + "epoch": 0.1939678236364285, + "grad_norm": 0.2909288704395294, + "learning_rate": 4.866681442205986e-05, + "loss": 0.2344, + "step": 10875 + }, + { + "epoch": 0.1939856597581422, + "grad_norm": 0.24164675176143646, + "learning_rate": 4.866631287381193e-05, + "loss": 0.1786, + "step": 10876 + }, + { + "epoch": 0.19400349587985588, + "grad_norm": 0.28780633211135864, + "learning_rate": 4.86658112338252e-05, + "loss": 0.1794, + "step": 10877 + }, + { + "epoch": 0.19402133200156957, + "grad_norm": 0.285815954208374, + "learning_rate": 4.866530950210161e-05, + "loss": 0.1922, + "step": 10878 + }, + { + "epoch": 0.19403916812328328, + "grad_norm": 0.26834774017333984, + "learning_rate": 4.86648076786431e-05, + "loss": 0.2192, + "step": 10879 + }, + { + "epoch": 0.19405700424499697, + "grad_norm": 0.2725721597671509, + "learning_rate": 4.866430576345163e-05, + "loss": 0.1714, + "step": 10880 + }, + { + "epoch": 0.19407484036671066, + "grad_norm": 0.33946189284324646, + "learning_rate": 4.866380375652914e-05, + "loss": 0.24, + "step": 10881 + }, + { + "epoch": 0.19409267648842435, + "grad_norm": 0.241267591714859, + "learning_rate": 4.8663301657877556e-05, + "loss": 0.2099, + "step": 10882 + }, + { + "epoch": 0.19411051261013806, + "grad_norm": 0.3695162236690521, + "learning_rate": 4.866279946749886e-05, + "loss": 0.2413, + "step": 10883 + }, + { + "epoch": 0.19412834873185175, + "grad_norm": 0.29236093163490295, + "learning_rate": 4.8662297185394975e-05, + "loss": 0.1886, + "step": 10884 + }, + { + "epoch": 0.19414618485356544, + "grad_norm": 0.36731311678886414, + "learning_rate": 4.866179481156785e-05, + "loss": 0.1874, + "step": 10885 + }, + { + "epoch": 0.19416402097527913, + "grad_norm": 0.22661186754703522, + "learning_rate": 4.8661292346019436e-05, + "loss": 0.1745, + "step": 10886 + }, + { + "epoch": 0.19418185709699284, + "grad_norm": 0.29941847920417786, + "learning_rate": 4.866078978875168e-05, + "loss": 0.1862, + "step": 10887 + }, + { + "epoch": 0.19419969321870653, + "grad_norm": 0.22813382744789124, + "learning_rate": 4.866028713976654e-05, + "loss": 0.2094, + "step": 10888 + }, + { + "epoch": 0.19421752934042022, + "grad_norm": 0.34261658787727356, + "learning_rate": 4.865978439906595e-05, + "loss": 0.2389, + "step": 10889 + }, + { + "epoch": 0.1942353654621339, + "grad_norm": 0.2346302717924118, + "learning_rate": 4.865928156665186e-05, + "loss": 0.1827, + "step": 10890 + }, + { + "epoch": 0.1942532015838476, + "grad_norm": 0.23762480914592743, + "learning_rate": 4.865877864252622e-05, + "loss": 0.1724, + "step": 10891 + }, + { + "epoch": 0.1942710377055613, + "grad_norm": 0.2964513301849365, + "learning_rate": 4.865827562669099e-05, + "loss": 0.1865, + "step": 10892 + }, + { + "epoch": 0.194288873827275, + "grad_norm": 0.3174099624156952, + "learning_rate": 4.8657772519148117e-05, + "loss": 0.2023, + "step": 10893 + }, + { + "epoch": 0.1943067099489887, + "grad_norm": 0.36556562781333923, + "learning_rate": 4.865726931989954e-05, + "loss": 0.2465, + "step": 10894 + }, + { + "epoch": 0.19432454607070238, + "grad_norm": 0.255138099193573, + "learning_rate": 4.865676602894721e-05, + "loss": 0.2122, + "step": 10895 + }, + { + "epoch": 0.1943423821924161, + "grad_norm": 0.3078806400299072, + "learning_rate": 4.865626264629309e-05, + "loss": 0.2798, + "step": 10896 + }, + { + "epoch": 0.19436021831412978, + "grad_norm": 0.223884716629982, + "learning_rate": 4.865575917193913e-05, + "loss": 0.1669, + "step": 10897 + }, + { + "epoch": 0.19437805443584347, + "grad_norm": 0.3409411907196045, + "learning_rate": 4.865525560588727e-05, + "loss": 0.2308, + "step": 10898 + }, + { + "epoch": 0.19439589055755715, + "grad_norm": 0.30786609649658203, + "learning_rate": 4.865475194813947e-05, + "loss": 0.2074, + "step": 10899 + }, + { + "epoch": 0.19441372667927087, + "grad_norm": 0.22870858013629913, + "learning_rate": 4.8654248198697684e-05, + "loss": 0.2159, + "step": 10900 + }, + { + "epoch": 0.19443156280098456, + "grad_norm": 0.27009400725364685, + "learning_rate": 4.865374435756386e-05, + "loss": 0.1898, + "step": 10901 + }, + { + "epoch": 0.19444939892269825, + "grad_norm": 0.22941553592681885, + "learning_rate": 4.8653240424739955e-05, + "loss": 0.183, + "step": 10902 + }, + { + "epoch": 0.19446723504441193, + "grad_norm": 0.28045961260795593, + "learning_rate": 4.8652736400227914e-05, + "loss": 0.1481, + "step": 10903 + }, + { + "epoch": 0.19448507116612565, + "grad_norm": 0.2851707637310028, + "learning_rate": 4.8652232284029706e-05, + "loss": 0.2099, + "step": 10904 + }, + { + "epoch": 0.19450290728783934, + "grad_norm": 0.3628099262714386, + "learning_rate": 4.8651728076147276e-05, + "loss": 0.225, + "step": 10905 + }, + { + "epoch": 0.19452074340955303, + "grad_norm": 0.24372237920761108, + "learning_rate": 4.865122377658257e-05, + "loss": 0.1887, + "step": 10906 + }, + { + "epoch": 0.19453857953126671, + "grad_norm": 0.30829527974128723, + "learning_rate": 4.8650719385337565e-05, + "loss": 0.2443, + "step": 10907 + }, + { + "epoch": 0.1945564156529804, + "grad_norm": 0.3270097076892853, + "learning_rate": 4.865021490241419e-05, + "loss": 0.2493, + "step": 10908 + }, + { + "epoch": 0.19457425177469412, + "grad_norm": 0.2885185182094574, + "learning_rate": 4.8649710327814426e-05, + "loss": 0.2135, + "step": 10909 + }, + { + "epoch": 0.1945920878964078, + "grad_norm": 0.31050482392311096, + "learning_rate": 4.86492056615402e-05, + "loss": 0.2236, + "step": 10910 + }, + { + "epoch": 0.1946099240181215, + "grad_norm": 0.30919358134269714, + "learning_rate": 4.8648700903593505e-05, + "loss": 0.2012, + "step": 10911 + }, + { + "epoch": 0.19462776013983518, + "grad_norm": 0.25236862897872925, + "learning_rate": 4.8648196053976255e-05, + "loss": 0.2225, + "step": 10912 + }, + { + "epoch": 0.1946455962615489, + "grad_norm": 0.23284150660037994, + "learning_rate": 4.864769111269045e-05, + "loss": 0.1403, + "step": 10913 + }, + { + "epoch": 0.19466343238326259, + "grad_norm": 0.3003937304019928, + "learning_rate": 4.8647186079738014e-05, + "loss": 0.2178, + "step": 10914 + }, + { + "epoch": 0.19468126850497627, + "grad_norm": 0.363349586725235, + "learning_rate": 4.864668095512092e-05, + "loss": 0.1869, + "step": 10915 + }, + { + "epoch": 0.19469910462668996, + "grad_norm": 0.28343427181243896, + "learning_rate": 4.8646175738841124e-05, + "loss": 0.1852, + "step": 10916 + }, + { + "epoch": 0.19471694074840368, + "grad_norm": 0.24704958498477936, + "learning_rate": 4.864567043090059e-05, + "loss": 0.2422, + "step": 10917 + }, + { + "epoch": 0.19473477687011737, + "grad_norm": 0.2634768486022949, + "learning_rate": 4.864516503130126e-05, + "loss": 0.2002, + "step": 10918 + }, + { + "epoch": 0.19475261299183105, + "grad_norm": 0.33541154861450195, + "learning_rate": 4.8644659540045113e-05, + "loss": 0.2285, + "step": 10919 + }, + { + "epoch": 0.19477044911354474, + "grad_norm": 0.21996474266052246, + "learning_rate": 4.864415395713409e-05, + "loss": 0.1815, + "step": 10920 + }, + { + "epoch": 0.19478828523525846, + "grad_norm": 0.5477680563926697, + "learning_rate": 4.8643648282570165e-05, + "loss": 0.2558, + "step": 10921 + }, + { + "epoch": 0.19480612135697215, + "grad_norm": 0.33574825525283813, + "learning_rate": 4.8643142516355286e-05, + "loss": 0.1647, + "step": 10922 + }, + { + "epoch": 0.19482395747868583, + "grad_norm": 0.28537821769714355, + "learning_rate": 4.864263665849143e-05, + "loss": 0.1943, + "step": 10923 + }, + { + "epoch": 0.19484179360039952, + "grad_norm": 0.3535176217556, + "learning_rate": 4.864213070898055e-05, + "loss": 0.1775, + "step": 10924 + }, + { + "epoch": 0.19485962972211324, + "grad_norm": 0.28527507185935974, + "learning_rate": 4.864162466782459e-05, + "loss": 0.2236, + "step": 10925 + }, + { + "epoch": 0.19487746584382692, + "grad_norm": 0.3154849410057068, + "learning_rate": 4.864111853502554e-05, + "loss": 0.2291, + "step": 10926 + }, + { + "epoch": 0.1948953019655406, + "grad_norm": 0.3148837089538574, + "learning_rate": 4.8640612310585355e-05, + "loss": 0.1943, + "step": 10927 + }, + { + "epoch": 0.1949131380872543, + "grad_norm": 0.2947532534599304, + "learning_rate": 4.864010599450598e-05, + "loss": 0.2268, + "step": 10928 + }, + { + "epoch": 0.194930974208968, + "grad_norm": 0.2506810128688812, + "learning_rate": 4.863959958678939e-05, + "loss": 0.2585, + "step": 10929 + }, + { + "epoch": 0.1949488103306817, + "grad_norm": 0.31015411019325256, + "learning_rate": 4.863909308743755e-05, + "loss": 0.1536, + "step": 10930 + }, + { + "epoch": 0.1949666464523954, + "grad_norm": 0.19003814458847046, + "learning_rate": 4.863858649645242e-05, + "loss": 0.1376, + "step": 10931 + }, + { + "epoch": 0.19498448257410908, + "grad_norm": 0.23797856271266937, + "learning_rate": 4.863807981383597e-05, + "loss": 0.2068, + "step": 10932 + }, + { + "epoch": 0.19500231869582277, + "grad_norm": 0.27664363384246826, + "learning_rate": 4.863757303959015e-05, + "loss": 0.1614, + "step": 10933 + }, + { + "epoch": 0.19502015481753648, + "grad_norm": 0.2264014333486557, + "learning_rate": 4.863706617371693e-05, + "loss": 0.1998, + "step": 10934 + }, + { + "epoch": 0.19503799093925017, + "grad_norm": 0.31272202730178833, + "learning_rate": 4.863655921621829e-05, + "loss": 0.196, + "step": 10935 + }, + { + "epoch": 0.19505582706096386, + "grad_norm": 0.21604971587657928, + "learning_rate": 4.863605216709617e-05, + "loss": 0.1825, + "step": 10936 + }, + { + "epoch": 0.19507366318267755, + "grad_norm": 0.27598392963409424, + "learning_rate": 4.863554502635256e-05, + "loss": 0.2422, + "step": 10937 + }, + { + "epoch": 0.19509149930439126, + "grad_norm": 0.30639636516571045, + "learning_rate": 4.8635037793989405e-05, + "loss": 0.2215, + "step": 10938 + }, + { + "epoch": 0.19510933542610495, + "grad_norm": 0.27988162636756897, + "learning_rate": 4.8634530470008674e-05, + "loss": 0.2001, + "step": 10939 + }, + { + "epoch": 0.19512717154781864, + "grad_norm": 0.35650813579559326, + "learning_rate": 4.863402305441236e-05, + "loss": 0.2067, + "step": 10940 + }, + { + "epoch": 0.19514500766953233, + "grad_norm": 0.2776621878147125, + "learning_rate": 4.863351554720239e-05, + "loss": 0.2096, + "step": 10941 + }, + { + "epoch": 0.19516284379124604, + "grad_norm": 0.2894197106361389, + "learning_rate": 4.863300794838076e-05, + "loss": 0.234, + "step": 10942 + }, + { + "epoch": 0.19518067991295973, + "grad_norm": 0.27866825461387634, + "learning_rate": 4.863250025794943e-05, + "loss": 0.1422, + "step": 10943 + }, + { + "epoch": 0.19519851603467342, + "grad_norm": 0.24755118787288666, + "learning_rate": 4.8631992475910363e-05, + "loss": 0.1817, + "step": 10944 + }, + { + "epoch": 0.1952163521563871, + "grad_norm": 0.28582048416137695, + "learning_rate": 4.863148460226554e-05, + "loss": 0.1802, + "step": 10945 + }, + { + "epoch": 0.19523418827810082, + "grad_norm": 0.3617633581161499, + "learning_rate": 4.863097663701691e-05, + "loss": 0.2156, + "step": 10946 + }, + { + "epoch": 0.1952520243998145, + "grad_norm": 0.35809874534606934, + "learning_rate": 4.8630468580166456e-05, + "loss": 0.2128, + "step": 10947 + }, + { + "epoch": 0.1952698605215282, + "grad_norm": 0.3512871265411377, + "learning_rate": 4.862996043171614e-05, + "loss": 0.276, + "step": 10948 + }, + { + "epoch": 0.1952876966432419, + "grad_norm": 0.26752448081970215, + "learning_rate": 4.8629452191667945e-05, + "loss": 0.1536, + "step": 10949 + }, + { + "epoch": 0.19530553276495558, + "grad_norm": 0.27539655566215515, + "learning_rate": 4.8628943860023826e-05, + "loss": 0.2174, + "step": 10950 + }, + { + "epoch": 0.1953233688866693, + "grad_norm": 0.2687312364578247, + "learning_rate": 4.862843543678576e-05, + "loss": 0.1843, + "step": 10951 + }, + { + "epoch": 0.19534120500838298, + "grad_norm": 0.3138580918312073, + "learning_rate": 4.8627926921955715e-05, + "loss": 0.2082, + "step": 10952 + }, + { + "epoch": 0.19535904113009667, + "grad_norm": 0.3139511048793793, + "learning_rate": 4.862741831553567e-05, + "loss": 0.2334, + "step": 10953 + }, + { + "epoch": 0.19537687725181035, + "grad_norm": 0.28194954991340637, + "learning_rate": 4.8626909617527586e-05, + "loss": 0.234, + "step": 10954 + }, + { + "epoch": 0.19539471337352407, + "grad_norm": 0.2429482489824295, + "learning_rate": 4.862640082793344e-05, + "loss": 0.2133, + "step": 10955 + }, + { + "epoch": 0.19541254949523776, + "grad_norm": 0.29499930143356323, + "learning_rate": 4.862589194675521e-05, + "loss": 0.2284, + "step": 10956 + }, + { + "epoch": 0.19543038561695145, + "grad_norm": 0.2928573787212372, + "learning_rate": 4.862538297399486e-05, + "loss": 0.2361, + "step": 10957 + }, + { + "epoch": 0.19544822173866513, + "grad_norm": 0.31073400378227234, + "learning_rate": 4.862487390965436e-05, + "loss": 0.1667, + "step": 10958 + }, + { + "epoch": 0.19546605786037885, + "grad_norm": 0.3647032082080841, + "learning_rate": 4.862436475373569e-05, + "loss": 0.2649, + "step": 10959 + }, + { + "epoch": 0.19548389398209254, + "grad_norm": 0.26545149087905884, + "learning_rate": 4.8623855506240826e-05, + "loss": 0.2153, + "step": 10960 + }, + { + "epoch": 0.19550173010380623, + "grad_norm": 0.2558302581310272, + "learning_rate": 4.862334616717175e-05, + "loss": 0.1868, + "step": 10961 + }, + { + "epoch": 0.19551956622551991, + "grad_norm": 0.23132599890232086, + "learning_rate": 4.862283673653041e-05, + "loss": 0.1707, + "step": 10962 + }, + { + "epoch": 0.19553740234723363, + "grad_norm": 0.3207215368747711, + "learning_rate": 4.8622327214318795e-05, + "loss": 0.2362, + "step": 10963 + }, + { + "epoch": 0.19555523846894732, + "grad_norm": 0.29617559909820557, + "learning_rate": 4.862181760053889e-05, + "loss": 0.189, + "step": 10964 + }, + { + "epoch": 0.195573074590661, + "grad_norm": 0.22181539237499237, + "learning_rate": 4.8621307895192646e-05, + "loss": 0.1409, + "step": 10965 + }, + { + "epoch": 0.1955909107123747, + "grad_norm": 0.3290722370147705, + "learning_rate": 4.8620798098282075e-05, + "loss": 0.2132, + "step": 10966 + }, + { + "epoch": 0.1956087468340884, + "grad_norm": 0.23870375752449036, + "learning_rate": 4.862028820980912e-05, + "loss": 0.2184, + "step": 10967 + }, + { + "epoch": 0.1956265829558021, + "grad_norm": 0.2592199146747589, + "learning_rate": 4.861977822977577e-05, + "loss": 0.2276, + "step": 10968 + }, + { + "epoch": 0.19564441907751579, + "grad_norm": 0.2996153235435486, + "learning_rate": 4.8619268158184e-05, + "loss": 0.244, + "step": 10969 + }, + { + "epoch": 0.19566225519922947, + "grad_norm": 0.2985076308250427, + "learning_rate": 4.861875799503579e-05, + "loss": 0.156, + "step": 10970 + }, + { + "epoch": 0.19568009132094316, + "grad_norm": 0.28612715005874634, + "learning_rate": 4.8618247740333125e-05, + "loss": 0.1653, + "step": 10971 + }, + { + "epoch": 0.19569792744265688, + "grad_norm": 0.23777885735034943, + "learning_rate": 4.861773739407796e-05, + "loss": 0.1621, + "step": 10972 + }, + { + "epoch": 0.19571576356437057, + "grad_norm": 0.2995002269744873, + "learning_rate": 4.86172269562723e-05, + "loss": 0.2022, + "step": 10973 + }, + { + "epoch": 0.19573359968608425, + "grad_norm": 0.4907090663909912, + "learning_rate": 4.86167164269181e-05, + "loss": 0.2436, + "step": 10974 + }, + { + "epoch": 0.19575143580779794, + "grad_norm": 0.2928963601589203, + "learning_rate": 4.861620580601736e-05, + "loss": 0.2402, + "step": 10975 + }, + { + "epoch": 0.19576927192951166, + "grad_norm": 0.30040040612220764, + "learning_rate": 4.8615695093572044e-05, + "loss": 0.2323, + "step": 10976 + }, + { + "epoch": 0.19578710805122534, + "grad_norm": 0.3200577199459076, + "learning_rate": 4.861518428958414e-05, + "loss": 0.1874, + "step": 10977 + }, + { + "epoch": 0.19580494417293903, + "grad_norm": 0.3254393935203552, + "learning_rate": 4.8614673394055624e-05, + "loss": 0.2132, + "step": 10978 + }, + { + "epoch": 0.19582278029465272, + "grad_norm": 0.3281918466091156, + "learning_rate": 4.861416240698848e-05, + "loss": 0.231, + "step": 10979 + }, + { + "epoch": 0.19584061641636644, + "grad_norm": 0.2855898439884186, + "learning_rate": 4.8613651328384676e-05, + "loss": 0.2164, + "step": 10980 + }, + { + "epoch": 0.19585845253808012, + "grad_norm": 0.3017638325691223, + "learning_rate": 4.861314015824622e-05, + "loss": 0.2394, + "step": 10981 + }, + { + "epoch": 0.1958762886597938, + "grad_norm": 0.24240683019161224, + "learning_rate": 4.8612628896575063e-05, + "loss": 0.2354, + "step": 10982 + }, + { + "epoch": 0.1958941247815075, + "grad_norm": 0.25195175409317017, + "learning_rate": 4.861211754337321e-05, + "loss": 0.2011, + "step": 10983 + }, + { + "epoch": 0.19591196090322122, + "grad_norm": 0.2800824046134949, + "learning_rate": 4.861160609864263e-05, + "loss": 0.2427, + "step": 10984 + }, + { + "epoch": 0.1959297970249349, + "grad_norm": 0.24269692599773407, + "learning_rate": 4.8611094562385306e-05, + "loss": 0.1372, + "step": 10985 + }, + { + "epoch": 0.1959476331466486, + "grad_norm": 0.23989631235599518, + "learning_rate": 4.8610582934603234e-05, + "loss": 0.2211, + "step": 10986 + }, + { + "epoch": 0.19596546926836228, + "grad_norm": 0.30193716287612915, + "learning_rate": 4.861007121529838e-05, + "loss": 0.216, + "step": 10987 + }, + { + "epoch": 0.19598330539007597, + "grad_norm": 0.45588362216949463, + "learning_rate": 4.860955940447274e-05, + "loss": 0.163, + "step": 10988 + }, + { + "epoch": 0.19600114151178968, + "grad_norm": 0.2539214789867401, + "learning_rate": 4.860904750212829e-05, + "loss": 0.1891, + "step": 10989 + }, + { + "epoch": 0.19601897763350337, + "grad_norm": 0.2454519271850586, + "learning_rate": 4.8608535508267015e-05, + "loss": 0.1247, + "step": 10990 + }, + { + "epoch": 0.19603681375521706, + "grad_norm": 0.3297535479068756, + "learning_rate": 4.860802342289091e-05, + "loss": 0.2363, + "step": 10991 + }, + { + "epoch": 0.19605464987693075, + "grad_norm": 0.2104528844356537, + "learning_rate": 4.8607511246001944e-05, + "loss": 0.1695, + "step": 10992 + }, + { + "epoch": 0.19607248599864446, + "grad_norm": 0.21681469678878784, + "learning_rate": 4.860699897760212e-05, + "loss": 0.2314, + "step": 10993 + }, + { + "epoch": 0.19609032212035815, + "grad_norm": 0.29959338903427124, + "learning_rate": 4.860648661769341e-05, + "loss": 0.1895, + "step": 10994 + }, + { + "epoch": 0.19610815824207184, + "grad_norm": 0.28645989298820496, + "learning_rate": 4.86059741662778e-05, + "loss": 0.2517, + "step": 10995 + }, + { + "epoch": 0.19612599436378553, + "grad_norm": 0.3185204267501831, + "learning_rate": 4.860546162335728e-05, + "loss": 0.1633, + "step": 10996 + }, + { + "epoch": 0.19614383048549924, + "grad_norm": 0.24247121810913086, + "learning_rate": 4.8604948988933854e-05, + "loss": 0.1895, + "step": 10997 + }, + { + "epoch": 0.19616166660721293, + "grad_norm": 0.34270310401916504, + "learning_rate": 4.860443626300948e-05, + "loss": 0.2418, + "step": 10998 + }, + { + "epoch": 0.19617950272892662, + "grad_norm": 0.2494347244501114, + "learning_rate": 4.8603923445586154e-05, + "loss": 0.1486, + "step": 10999 + }, + { + "epoch": 0.1961973388506403, + "grad_norm": 0.4187646806240082, + "learning_rate": 4.860341053666587e-05, + "loss": 0.1794, + "step": 11000 + }, + { + "epoch": 0.1961973388506403, + "eval_loss": 0.19454918801784515, + "eval_runtime": 107.5552, + "eval_samples_per_second": 9.521, + "eval_steps_per_second": 1.59, + "step": 11000 + }, + { + "epoch": 0.19621517497235402, + "grad_norm": 0.25480276346206665, + "learning_rate": 4.860289753625062e-05, + "loss": 0.2163, + "step": 11001 + }, + { + "epoch": 0.1962330110940677, + "grad_norm": 0.2538530230522156, + "learning_rate": 4.860238444434239e-05, + "loss": 0.2152, + "step": 11002 + }, + { + "epoch": 0.1962508472157814, + "grad_norm": 0.32229483127593994, + "learning_rate": 4.8601871260943164e-05, + "loss": 0.1846, + "step": 11003 + }, + { + "epoch": 0.1962686833374951, + "grad_norm": 0.2602200210094452, + "learning_rate": 4.860135798605492e-05, + "loss": 0.2004, + "step": 11004 + }, + { + "epoch": 0.1962865194592088, + "grad_norm": 0.2437405288219452, + "learning_rate": 4.860084461967967e-05, + "loss": 0.1985, + "step": 11005 + }, + { + "epoch": 0.1963043555809225, + "grad_norm": 0.23054543137550354, + "learning_rate": 4.8600331161819405e-05, + "loss": 0.2062, + "step": 11006 + }, + { + "epoch": 0.19632219170263618, + "grad_norm": 0.36326172947883606, + "learning_rate": 4.8599817612476095e-05, + "loss": 0.2494, + "step": 11007 + }, + { + "epoch": 0.19634002782434987, + "grad_norm": 0.34366416931152344, + "learning_rate": 4.859930397165174e-05, + "loss": 0.2261, + "step": 11008 + }, + { + "epoch": 0.19635786394606355, + "grad_norm": 0.2851044535636902, + "learning_rate": 4.8598790239348335e-05, + "loss": 0.1639, + "step": 11009 + }, + { + "epoch": 0.19637570006777727, + "grad_norm": 0.2594095766544342, + "learning_rate": 4.859827641556787e-05, + "loss": 0.1978, + "step": 11010 + }, + { + "epoch": 0.19639353618949096, + "grad_norm": 0.23761868476867676, + "learning_rate": 4.859776250031233e-05, + "loss": 0.2054, + "step": 11011 + }, + { + "epoch": 0.19641137231120465, + "grad_norm": 0.26732778549194336, + "learning_rate": 4.859724849358371e-05, + "loss": 0.2, + "step": 11012 + }, + { + "epoch": 0.19642920843291833, + "grad_norm": 0.29227781295776367, + "learning_rate": 4.859673439538401e-05, + "loss": 0.1499, + "step": 11013 + }, + { + "epoch": 0.19644704455463205, + "grad_norm": 0.21788150072097778, + "learning_rate": 4.8596220205715214e-05, + "loss": 0.1688, + "step": 11014 + }, + { + "epoch": 0.19646488067634574, + "grad_norm": 0.3312310576438904, + "learning_rate": 4.859570592457932e-05, + "loss": 0.2355, + "step": 11015 + }, + { + "epoch": 0.19648271679805943, + "grad_norm": 0.27505844831466675, + "learning_rate": 4.859519155197832e-05, + "loss": 0.2236, + "step": 11016 + }, + { + "epoch": 0.1965005529197731, + "grad_norm": 0.4549323320388794, + "learning_rate": 4.859467708791421e-05, + "loss": 0.1763, + "step": 11017 + }, + { + "epoch": 0.19651838904148683, + "grad_norm": 0.33471062779426575, + "learning_rate": 4.859416253238898e-05, + "loss": 0.1952, + "step": 11018 + }, + { + "epoch": 0.19653622516320052, + "grad_norm": 0.2178431898355484, + "learning_rate": 4.859364788540463e-05, + "loss": 0.2121, + "step": 11019 + }, + { + "epoch": 0.1965540612849142, + "grad_norm": 0.2079540640115738, + "learning_rate": 4.859313314696315e-05, + "loss": 0.1889, + "step": 11020 + }, + { + "epoch": 0.1965718974066279, + "grad_norm": 0.2850393056869507, + "learning_rate": 4.859261831706653e-05, + "loss": 0.1715, + "step": 11021 + }, + { + "epoch": 0.1965897335283416, + "grad_norm": 0.3005395531654358, + "learning_rate": 4.8592103395716785e-05, + "loss": 0.2188, + "step": 11022 + }, + { + "epoch": 0.1966075696500553, + "grad_norm": 0.2339594066143036, + "learning_rate": 4.859158838291589e-05, + "loss": 0.1895, + "step": 11023 + }, + { + "epoch": 0.19662540577176899, + "grad_norm": 0.22571700811386108, + "learning_rate": 4.859107327866585e-05, + "loss": 0.1794, + "step": 11024 + }, + { + "epoch": 0.19664324189348267, + "grad_norm": 0.23732657730579376, + "learning_rate": 4.859055808296867e-05, + "loss": 0.2179, + "step": 11025 + }, + { + "epoch": 0.1966610780151964, + "grad_norm": 0.23061764240264893, + "learning_rate": 4.859004279582633e-05, + "loss": 0.184, + "step": 11026 + }, + { + "epoch": 0.19667891413691008, + "grad_norm": 0.34119805693626404, + "learning_rate": 4.858952741724084e-05, + "loss": 0.1919, + "step": 11027 + }, + { + "epoch": 0.19669675025862376, + "grad_norm": 0.270794540643692, + "learning_rate": 4.8589011947214206e-05, + "loss": 0.2249, + "step": 11028 + }, + { + "epoch": 0.19671458638033745, + "grad_norm": 0.3803497552871704, + "learning_rate": 4.858849638574839e-05, + "loss": 0.3063, + "step": 11029 + }, + { + "epoch": 0.19673242250205114, + "grad_norm": 0.24121667444705963, + "learning_rate": 4.858798073284544e-05, + "loss": 0.2079, + "step": 11030 + }, + { + "epoch": 0.19675025862376486, + "grad_norm": 0.2138267159461975, + "learning_rate": 4.8587464988507314e-05, + "loss": 0.1569, + "step": 11031 + }, + { + "epoch": 0.19676809474547854, + "grad_norm": 0.24966634809970856, + "learning_rate": 4.858694915273603e-05, + "loss": 0.2027, + "step": 11032 + }, + { + "epoch": 0.19678593086719223, + "grad_norm": 0.2865935266017914, + "learning_rate": 4.8586433225533584e-05, + "loss": 0.2174, + "step": 11033 + }, + { + "epoch": 0.19680376698890592, + "grad_norm": 0.24785466492176056, + "learning_rate": 4.858591720690198e-05, + "loss": 0.2075, + "step": 11034 + }, + { + "epoch": 0.19682160311061964, + "grad_norm": 0.36719441413879395, + "learning_rate": 4.858540109684321e-05, + "loss": 0.2419, + "step": 11035 + }, + { + "epoch": 0.19683943923233332, + "grad_norm": 0.25197044014930725, + "learning_rate": 4.8584884895359286e-05, + "loss": 0.1649, + "step": 11036 + }, + { + "epoch": 0.196857275354047, + "grad_norm": 0.24637635052204132, + "learning_rate": 4.858436860245219e-05, + "loss": 0.2739, + "step": 11037 + }, + { + "epoch": 0.1968751114757607, + "grad_norm": 0.26137208938598633, + "learning_rate": 4.858385221812395e-05, + "loss": 0.2079, + "step": 11038 + }, + { + "epoch": 0.19689294759747442, + "grad_norm": 0.3005337715148926, + "learning_rate": 4.858333574237654e-05, + "loss": 0.2485, + "step": 11039 + }, + { + "epoch": 0.1969107837191881, + "grad_norm": 0.2986501157283783, + "learning_rate": 4.858281917521198e-05, + "loss": 0.2404, + "step": 11040 + }, + { + "epoch": 0.1969286198409018, + "grad_norm": 0.2938983142375946, + "learning_rate": 4.858230251663227e-05, + "loss": 0.1532, + "step": 11041 + }, + { + "epoch": 0.19694645596261548, + "grad_norm": 0.3248229920864105, + "learning_rate": 4.858178576663941e-05, + "loss": 0.2509, + "step": 11042 + }, + { + "epoch": 0.1969642920843292, + "grad_norm": 0.22464273869991302, + "learning_rate": 4.85812689252354e-05, + "loss": 0.1826, + "step": 11043 + }, + { + "epoch": 0.19698212820604288, + "grad_norm": 0.3778996467590332, + "learning_rate": 4.858075199242225e-05, + "loss": 0.1933, + "step": 11044 + }, + { + "epoch": 0.19699996432775657, + "grad_norm": 0.2337600588798523, + "learning_rate": 4.8580234968201965e-05, + "loss": 0.1796, + "step": 11045 + }, + { + "epoch": 0.19701780044947026, + "grad_norm": 0.35485145449638367, + "learning_rate": 4.857971785257654e-05, + "loss": 0.2053, + "step": 11046 + }, + { + "epoch": 0.19703563657118398, + "grad_norm": 0.27883970737457275, + "learning_rate": 4.8579200645547984e-05, + "loss": 0.2144, + "step": 11047 + }, + { + "epoch": 0.19705347269289766, + "grad_norm": 0.25961363315582275, + "learning_rate": 4.8578683347118306e-05, + "loss": 0.2338, + "step": 11048 + }, + { + "epoch": 0.19707130881461135, + "grad_norm": 0.28233227133750916, + "learning_rate": 4.8578165957289504e-05, + "loss": 0.2079, + "step": 11049 + }, + { + "epoch": 0.19708914493632504, + "grad_norm": 0.3279772400856018, + "learning_rate": 4.8577648476063584e-05, + "loss": 0.2071, + "step": 11050 + }, + { + "epoch": 0.19710698105803873, + "grad_norm": 0.3648107647895813, + "learning_rate": 4.857713090344256e-05, + "loss": 0.1887, + "step": 11051 + }, + { + "epoch": 0.19712481717975244, + "grad_norm": 0.3489072322845459, + "learning_rate": 4.857661323942843e-05, + "loss": 0.2542, + "step": 11052 + }, + { + "epoch": 0.19714265330146613, + "grad_norm": 0.3205341398715973, + "learning_rate": 4.857609548402321e-05, + "loss": 0.199, + "step": 11053 + }, + { + "epoch": 0.19716048942317982, + "grad_norm": 0.22784355282783508, + "learning_rate": 4.85755776372289e-05, + "loss": 0.2137, + "step": 11054 + }, + { + "epoch": 0.1971783255448935, + "grad_norm": 0.2208159863948822, + "learning_rate": 4.85750596990475e-05, + "loss": 0.1949, + "step": 11055 + }, + { + "epoch": 0.19719616166660722, + "grad_norm": 0.2629421353340149, + "learning_rate": 4.857454166948103e-05, + "loss": 0.2038, + "step": 11056 + }, + { + "epoch": 0.1972139977883209, + "grad_norm": 0.4527952969074249, + "learning_rate": 4.85740235485315e-05, + "loss": 0.2401, + "step": 11057 + }, + { + "epoch": 0.1972318339100346, + "grad_norm": 0.27207663655281067, + "learning_rate": 4.85735053362009e-05, + "loss": 0.1897, + "step": 11058 + }, + { + "epoch": 0.1972496700317483, + "grad_norm": 0.2801261842250824, + "learning_rate": 4.8572987032491264e-05, + "loss": 0.1671, + "step": 11059 + }, + { + "epoch": 0.197267506153462, + "grad_norm": 0.2612663507461548, + "learning_rate": 4.857246863740458e-05, + "loss": 0.1913, + "step": 11060 + }, + { + "epoch": 0.1972853422751757, + "grad_norm": 0.32024240493774414, + "learning_rate": 4.857195015094287e-05, + "loss": 0.2919, + "step": 11061 + }, + { + "epoch": 0.19730317839688938, + "grad_norm": 0.24168513715267181, + "learning_rate": 4.857143157310814e-05, + "loss": 0.1842, + "step": 11062 + }, + { + "epoch": 0.19732101451860307, + "grad_norm": 0.551415205001831, + "learning_rate": 4.85709129039024e-05, + "loss": 0.2532, + "step": 11063 + }, + { + "epoch": 0.19733885064031678, + "grad_norm": 0.26950013637542725, + "learning_rate": 4.857039414332766e-05, + "loss": 0.1991, + "step": 11064 + }, + { + "epoch": 0.19735668676203047, + "grad_norm": 0.31064918637275696, + "learning_rate": 4.8569875291385936e-05, + "loss": 0.19, + "step": 11065 + }, + { + "epoch": 0.19737452288374416, + "grad_norm": 0.27538514137268066, + "learning_rate": 4.856935634807923e-05, + "loss": 0.1863, + "step": 11066 + }, + { + "epoch": 0.19739235900545785, + "grad_norm": 0.27919042110443115, + "learning_rate": 4.856883731340955e-05, + "loss": 0.2486, + "step": 11067 + }, + { + "epoch": 0.19741019512717156, + "grad_norm": 0.27432864904403687, + "learning_rate": 4.8568318187378924e-05, + "loss": 0.211, + "step": 11068 + }, + { + "epoch": 0.19742803124888525, + "grad_norm": 0.2728572189807892, + "learning_rate": 4.856779896998936e-05, + "loss": 0.1357, + "step": 11069 + }, + { + "epoch": 0.19744586737059894, + "grad_norm": 0.2934436500072479, + "learning_rate": 4.856727966124286e-05, + "loss": 0.2306, + "step": 11070 + }, + { + "epoch": 0.19746370349231263, + "grad_norm": 0.2735760509967804, + "learning_rate": 4.856676026114145e-05, + "loss": 0.2001, + "step": 11071 + }, + { + "epoch": 0.1974815396140263, + "grad_norm": 0.28950780630111694, + "learning_rate": 4.8566240769687135e-05, + "loss": 0.2104, + "step": 11072 + }, + { + "epoch": 0.19749937573574003, + "grad_norm": 0.2395309954881668, + "learning_rate": 4.856572118688193e-05, + "loss": 0.2276, + "step": 11073 + }, + { + "epoch": 0.19751721185745372, + "grad_norm": 0.18597102165222168, + "learning_rate": 4.856520151272785e-05, + "loss": 0.1351, + "step": 11074 + }, + { + "epoch": 0.1975350479791674, + "grad_norm": 0.3088156580924988, + "learning_rate": 4.8564681747226914e-05, + "loss": 0.174, + "step": 11075 + }, + { + "epoch": 0.1975528841008811, + "grad_norm": 0.23534858226776123, + "learning_rate": 4.856416189038113e-05, + "loss": 0.1774, + "step": 11076 + }, + { + "epoch": 0.1975707202225948, + "grad_norm": 0.2968669831752777, + "learning_rate": 4.8563641942192514e-05, + "loss": 0.2016, + "step": 11077 + }, + { + "epoch": 0.1975885563443085, + "grad_norm": 0.2954857051372528, + "learning_rate": 4.856312190266309e-05, + "loss": 0.2246, + "step": 11078 + }, + { + "epoch": 0.19760639246602218, + "grad_norm": 0.2747255265712738, + "learning_rate": 4.856260177179486e-05, + "loss": 0.1979, + "step": 11079 + }, + { + "epoch": 0.19762422858773587, + "grad_norm": 0.218937948346138, + "learning_rate": 4.8562081549589855e-05, + "loss": 0.189, + "step": 11080 + }, + { + "epoch": 0.1976420647094496, + "grad_norm": 0.2704882323741913, + "learning_rate": 4.856156123605007e-05, + "loss": 0.1866, + "step": 11081 + }, + { + "epoch": 0.19765990083116328, + "grad_norm": 0.23084577918052673, + "learning_rate": 4.856104083117755e-05, + "loss": 0.209, + "step": 11082 + }, + { + "epoch": 0.19767773695287696, + "grad_norm": 0.3036177158355713, + "learning_rate": 4.856052033497429e-05, + "loss": 0.2205, + "step": 11083 + }, + { + "epoch": 0.19769557307459065, + "grad_norm": 0.2827765941619873, + "learning_rate": 4.8559999747442316e-05, + "loss": 0.1875, + "step": 11084 + }, + { + "epoch": 0.19771340919630437, + "grad_norm": 0.2353384643793106, + "learning_rate": 4.8559479068583645e-05, + "loss": 0.1756, + "step": 11085 + }, + { + "epoch": 0.19773124531801806, + "grad_norm": 0.34830811619758606, + "learning_rate": 4.85589582984003e-05, + "loss": 0.1886, + "step": 11086 + }, + { + "epoch": 0.19774908143973174, + "grad_norm": 0.253828763961792, + "learning_rate": 4.85584374368943e-05, + "loss": 0.2047, + "step": 11087 + }, + { + "epoch": 0.19776691756144543, + "grad_norm": 0.2931230664253235, + "learning_rate": 4.855791648406765e-05, + "loss": 0.2165, + "step": 11088 + }, + { + "epoch": 0.19778475368315912, + "grad_norm": 0.30035024881362915, + "learning_rate": 4.855739543992238e-05, + "loss": 0.2276, + "step": 11089 + }, + { + "epoch": 0.19780258980487284, + "grad_norm": 0.27374139428138733, + "learning_rate": 4.8556874304460516e-05, + "loss": 0.1931, + "step": 11090 + }, + { + "epoch": 0.19782042592658652, + "grad_norm": 0.2854543924331665, + "learning_rate": 4.855635307768406e-05, + "loss": 0.2248, + "step": 11091 + }, + { + "epoch": 0.1978382620483002, + "grad_norm": 0.27571243047714233, + "learning_rate": 4.8555831759595056e-05, + "loss": 0.2106, + "step": 11092 + }, + { + "epoch": 0.1978560981700139, + "grad_norm": 0.2587672770023346, + "learning_rate": 4.8555310350195506e-05, + "loss": 0.2361, + "step": 11093 + }, + { + "epoch": 0.19787393429172762, + "grad_norm": 0.3703326880931854, + "learning_rate": 4.855478884948744e-05, + "loss": 0.141, + "step": 11094 + }, + { + "epoch": 0.1978917704134413, + "grad_norm": 0.32847192883491516, + "learning_rate": 4.8554267257472876e-05, + "loss": 0.2245, + "step": 11095 + }, + { + "epoch": 0.197909606535155, + "grad_norm": 0.2797450125217438, + "learning_rate": 4.855374557415383e-05, + "loss": 0.2239, + "step": 11096 + }, + { + "epoch": 0.19792744265686868, + "grad_norm": 0.2755650579929352, + "learning_rate": 4.855322379953233e-05, + "loss": 0.2082, + "step": 11097 + }, + { + "epoch": 0.1979452787785824, + "grad_norm": 0.2720598876476288, + "learning_rate": 4.855270193361041e-05, + "loss": 0.206, + "step": 11098 + }, + { + "epoch": 0.19796311490029608, + "grad_norm": 0.2276594340801239, + "learning_rate": 4.855217997639008e-05, + "loss": 0.1982, + "step": 11099 + }, + { + "epoch": 0.19798095102200977, + "grad_norm": 0.31687310338020325, + "learning_rate": 4.855165792787336e-05, + "loss": 0.1331, + "step": 11100 + }, + { + "epoch": 0.19799878714372346, + "grad_norm": 0.285958468914032, + "learning_rate": 4.855113578806228e-05, + "loss": 0.1516, + "step": 11101 + }, + { + "epoch": 0.19801662326543717, + "grad_norm": 0.2214995175600052, + "learning_rate": 4.855061355695887e-05, + "loss": 0.1714, + "step": 11102 + }, + { + "epoch": 0.19803445938715086, + "grad_norm": 0.3054547905921936, + "learning_rate": 4.8550091234565144e-05, + "loss": 0.1881, + "step": 11103 + }, + { + "epoch": 0.19805229550886455, + "grad_norm": 0.24299995601177216, + "learning_rate": 4.854956882088313e-05, + "loss": 0.2094, + "step": 11104 + }, + { + "epoch": 0.19807013163057824, + "grad_norm": 0.35838520526885986, + "learning_rate": 4.854904631591486e-05, + "loss": 0.2222, + "step": 11105 + }, + { + "epoch": 0.19808796775229195, + "grad_norm": 0.29429590702056885, + "learning_rate": 4.854852371966234e-05, + "loss": 0.2435, + "step": 11106 + }, + { + "epoch": 0.19810580387400564, + "grad_norm": 0.28344273567199707, + "learning_rate": 4.854800103212762e-05, + "loss": 0.2233, + "step": 11107 + }, + { + "epoch": 0.19812363999571933, + "grad_norm": 0.35314199328422546, + "learning_rate": 4.8547478253312706e-05, + "loss": 0.1924, + "step": 11108 + }, + { + "epoch": 0.19814147611743302, + "grad_norm": 0.31719276309013367, + "learning_rate": 4.854695538321964e-05, + "loss": 0.2063, + "step": 11109 + }, + { + "epoch": 0.1981593122391467, + "grad_norm": 0.327848345041275, + "learning_rate": 4.854643242185044e-05, + "loss": 0.2478, + "step": 11110 + }, + { + "epoch": 0.19817714836086042, + "grad_norm": 0.26491832733154297, + "learning_rate": 4.854590936920713e-05, + "loss": 0.2171, + "step": 11111 + }, + { + "epoch": 0.1981949844825741, + "grad_norm": 0.27557340264320374, + "learning_rate": 4.8545386225291756e-05, + "loss": 0.2171, + "step": 11112 + }, + { + "epoch": 0.1982128206042878, + "grad_norm": 0.37003302574157715, + "learning_rate": 4.854486299010632e-05, + "loss": 0.226, + "step": 11113 + }, + { + "epoch": 0.1982306567260015, + "grad_norm": 0.23664388060569763, + "learning_rate": 4.854433966365287e-05, + "loss": 0.1684, + "step": 11114 + }, + { + "epoch": 0.1982484928477152, + "grad_norm": 0.23957613110542297, + "learning_rate": 4.854381624593342e-05, + "loss": 0.1899, + "step": 11115 + }, + { + "epoch": 0.1982663289694289, + "grad_norm": 0.26655709743499756, + "learning_rate": 4.8543292736950016e-05, + "loss": 0.2046, + "step": 11116 + }, + { + "epoch": 0.19828416509114258, + "grad_norm": 0.29274481534957886, + "learning_rate": 4.854276913670467e-05, + "loss": 0.1896, + "step": 11117 + }, + { + "epoch": 0.19830200121285627, + "grad_norm": 0.3711560070514679, + "learning_rate": 4.854224544519942e-05, + "loss": 0.2031, + "step": 11118 + }, + { + "epoch": 0.19831983733456998, + "grad_norm": 0.2939615249633789, + "learning_rate": 4.85417216624363e-05, + "loss": 0.229, + "step": 11119 + }, + { + "epoch": 0.19833767345628367, + "grad_norm": 0.2645339369773865, + "learning_rate": 4.854119778841734e-05, + "loss": 0.2124, + "step": 11120 + }, + { + "epoch": 0.19835550957799736, + "grad_norm": 0.2945692837238312, + "learning_rate": 4.854067382314456e-05, + "loss": 0.2609, + "step": 11121 + }, + { + "epoch": 0.19837334569971105, + "grad_norm": 0.3351791203022003, + "learning_rate": 4.854014976661999e-05, + "loss": 0.2441, + "step": 11122 + }, + { + "epoch": 0.19839118182142476, + "grad_norm": 0.24094338715076447, + "learning_rate": 4.853962561884568e-05, + "loss": 0.2036, + "step": 11123 + }, + { + "epoch": 0.19840901794313845, + "grad_norm": 0.24937154352664948, + "learning_rate": 4.853910137982365e-05, + "loss": 0.1894, + "step": 11124 + }, + { + "epoch": 0.19842685406485214, + "grad_norm": 0.2517138719558716, + "learning_rate": 4.853857704955593e-05, + "loss": 0.2204, + "step": 11125 + }, + { + "epoch": 0.19844469018656583, + "grad_norm": 0.2192293107509613, + "learning_rate": 4.853805262804455e-05, + "loss": 0.1849, + "step": 11126 + }, + { + "epoch": 0.19846252630827954, + "grad_norm": 0.2045575976371765, + "learning_rate": 4.8537528115291556e-05, + "loss": 0.1599, + "step": 11127 + }, + { + "epoch": 0.19848036242999323, + "grad_norm": 0.352490097284317, + "learning_rate": 4.853700351129897e-05, + "loss": 0.2454, + "step": 11128 + }, + { + "epoch": 0.19849819855170692, + "grad_norm": 0.21434324979782104, + "learning_rate": 4.853647881606883e-05, + "loss": 0.1742, + "step": 11129 + }, + { + "epoch": 0.1985160346734206, + "grad_norm": 0.306272953748703, + "learning_rate": 4.853595402960317e-05, + "loss": 0.213, + "step": 11130 + }, + { + "epoch": 0.1985338707951343, + "grad_norm": 0.2694195806980133, + "learning_rate": 4.8535429151904025e-05, + "loss": 0.1842, + "step": 11131 + }, + { + "epoch": 0.198551706916848, + "grad_norm": 0.2298295497894287, + "learning_rate": 4.853490418297343e-05, + "loss": 0.1591, + "step": 11132 + }, + { + "epoch": 0.1985695430385617, + "grad_norm": 0.32406559586524963, + "learning_rate": 4.853437912281341e-05, + "loss": 0.1695, + "step": 11133 + }, + { + "epoch": 0.19858737916027538, + "grad_norm": 0.2561782896518707, + "learning_rate": 4.853385397142601e-05, + "loss": 0.22, + "step": 11134 + }, + { + "epoch": 0.19860521528198907, + "grad_norm": 0.30626562237739563, + "learning_rate": 4.8533328728813265e-05, + "loss": 0.2258, + "step": 11135 + }, + { + "epoch": 0.1986230514037028, + "grad_norm": 0.2491305023431778, + "learning_rate": 4.853280339497721e-05, + "loss": 0.2172, + "step": 11136 + }, + { + "epoch": 0.19864088752541648, + "grad_norm": 0.21116437017917633, + "learning_rate": 4.853227796991988e-05, + "loss": 0.1982, + "step": 11137 + }, + { + "epoch": 0.19865872364713016, + "grad_norm": 0.310923308134079, + "learning_rate": 4.853175245364331e-05, + "loss": 0.2385, + "step": 11138 + }, + { + "epoch": 0.19867655976884385, + "grad_norm": 0.29891157150268555, + "learning_rate": 4.8531226846149544e-05, + "loss": 0.2239, + "step": 11139 + }, + { + "epoch": 0.19869439589055757, + "grad_norm": 0.33879512548446655, + "learning_rate": 4.8530701147440615e-05, + "loss": 0.2059, + "step": 11140 + }, + { + "epoch": 0.19871223201227126, + "grad_norm": 0.23478348553180695, + "learning_rate": 4.8530175357518556e-05, + "loss": 0.1603, + "step": 11141 + }, + { + "epoch": 0.19873006813398494, + "grad_norm": 0.31598207354545593, + "learning_rate": 4.852964947638542e-05, + "loss": 0.2302, + "step": 11142 + }, + { + "epoch": 0.19874790425569863, + "grad_norm": 0.3208726942539215, + "learning_rate": 4.852912350404323e-05, + "loss": 0.2221, + "step": 11143 + }, + { + "epoch": 0.19876574037741235, + "grad_norm": 0.2035573273897171, + "learning_rate": 4.852859744049403e-05, + "loss": 0.1876, + "step": 11144 + }, + { + "epoch": 0.19878357649912604, + "grad_norm": 0.29464900493621826, + "learning_rate": 4.8528071285739864e-05, + "loss": 0.2111, + "step": 11145 + }, + { + "epoch": 0.19880141262083972, + "grad_norm": 0.17972739040851593, + "learning_rate": 4.852754503978276e-05, + "loss": 0.176, + "step": 11146 + }, + { + "epoch": 0.1988192487425534, + "grad_norm": 0.25738292932510376, + "learning_rate": 4.8527018702624774e-05, + "loss": 0.2385, + "step": 11147 + }, + { + "epoch": 0.19883708486426713, + "grad_norm": 0.2520672380924225, + "learning_rate": 4.852649227426793e-05, + "loss": 0.2049, + "step": 11148 + }, + { + "epoch": 0.19885492098598082, + "grad_norm": 0.17404726147651672, + "learning_rate": 4.852596575471428e-05, + "loss": 0.1774, + "step": 11149 + }, + { + "epoch": 0.1988727571076945, + "grad_norm": 0.2706785798072815, + "learning_rate": 4.8525439143965855e-05, + "loss": 0.213, + "step": 11150 + }, + { + "epoch": 0.1988905932294082, + "grad_norm": 0.7308741211891174, + "learning_rate": 4.852491244202471e-05, + "loss": 0.2105, + "step": 11151 + }, + { + "epoch": 0.19890842935112188, + "grad_norm": 0.2657563090324402, + "learning_rate": 4.852438564889288e-05, + "loss": 0.1833, + "step": 11152 + }, + { + "epoch": 0.1989262654728356, + "grad_norm": 0.21944956481456757, + "learning_rate": 4.85238587645724e-05, + "loss": 0.188, + "step": 11153 + }, + { + "epoch": 0.19894410159454928, + "grad_norm": 0.21879857778549194, + "learning_rate": 4.852333178906532e-05, + "loss": 0.219, + "step": 11154 + }, + { + "epoch": 0.19896193771626297, + "grad_norm": 0.2715228796005249, + "learning_rate": 4.8522804722373685e-05, + "loss": 0.1822, + "step": 11155 + }, + { + "epoch": 0.19897977383797666, + "grad_norm": 0.23303835093975067, + "learning_rate": 4.852227756449953e-05, + "loss": 0.2435, + "step": 11156 + }, + { + "epoch": 0.19899760995969037, + "grad_norm": 0.3210408091545105, + "learning_rate": 4.8521750315444905e-05, + "loss": 0.1564, + "step": 11157 + }, + { + "epoch": 0.19901544608140406, + "grad_norm": 0.3169405162334442, + "learning_rate": 4.8521222975211854e-05, + "loss": 0.2404, + "step": 11158 + }, + { + "epoch": 0.19903328220311775, + "grad_norm": 0.20615223050117493, + "learning_rate": 4.852069554380242e-05, + "loss": 0.2015, + "step": 11159 + }, + { + "epoch": 0.19905111832483144, + "grad_norm": 0.3027874231338501, + "learning_rate": 4.852016802121864e-05, + "loss": 0.2397, + "step": 11160 + }, + { + "epoch": 0.19906895444654515, + "grad_norm": 0.2798895239830017, + "learning_rate": 4.851964040746256e-05, + "loss": 0.1947, + "step": 11161 + }, + { + "epoch": 0.19908679056825884, + "grad_norm": 0.31407061219215393, + "learning_rate": 4.851911270253625e-05, + "loss": 0.2067, + "step": 11162 + }, + { + "epoch": 0.19910462668997253, + "grad_norm": 0.2413611114025116, + "learning_rate": 4.851858490644172e-05, + "loss": 0.1678, + "step": 11163 + }, + { + "epoch": 0.19912246281168622, + "grad_norm": 0.28156372904777527, + "learning_rate": 4.8518057019181035e-05, + "loss": 0.1594, + "step": 11164 + }, + { + "epoch": 0.19914029893339993, + "grad_norm": 0.4610389471054077, + "learning_rate": 4.851752904075624e-05, + "loss": 0.2219, + "step": 11165 + }, + { + "epoch": 0.19915813505511362, + "grad_norm": 0.2356463074684143, + "learning_rate": 4.851700097116938e-05, + "loss": 0.1838, + "step": 11166 + }, + { + "epoch": 0.1991759711768273, + "grad_norm": 0.34358465671539307, + "learning_rate": 4.8516472810422495e-05, + "loss": 0.3043, + "step": 11167 + }, + { + "epoch": 0.199193807298541, + "grad_norm": 0.17977052927017212, + "learning_rate": 4.851594455851764e-05, + "loss": 0.1664, + "step": 11168 + }, + { + "epoch": 0.19921164342025469, + "grad_norm": 0.3159478008747101, + "learning_rate": 4.8515416215456874e-05, + "loss": 0.2261, + "step": 11169 + }, + { + "epoch": 0.1992294795419684, + "grad_norm": 0.20334972441196442, + "learning_rate": 4.851488778124222e-05, + "loss": 0.1678, + "step": 11170 + }, + { + "epoch": 0.1992473156636821, + "grad_norm": 0.40189042687416077, + "learning_rate": 4.851435925587575e-05, + "loss": 0.2146, + "step": 11171 + }, + { + "epoch": 0.19926515178539578, + "grad_norm": 0.47486233711242676, + "learning_rate": 4.8513830639359495e-05, + "loss": 0.2367, + "step": 11172 + }, + { + "epoch": 0.19928298790710947, + "grad_norm": 0.2458191215991974, + "learning_rate": 4.8513301931695515e-05, + "loss": 0.2109, + "step": 11173 + }, + { + "epoch": 0.19930082402882318, + "grad_norm": 0.2704322934150696, + "learning_rate": 4.851277313288585e-05, + "loss": 0.1861, + "step": 11174 + }, + { + "epoch": 0.19931866015053687, + "grad_norm": 0.26017236709594727, + "learning_rate": 4.851224424293256e-05, + "loss": 0.1901, + "step": 11175 + }, + { + "epoch": 0.19933649627225056, + "grad_norm": 0.25227510929107666, + "learning_rate": 4.8511715261837684e-05, + "loss": 0.2146, + "step": 11176 + }, + { + "epoch": 0.19935433239396425, + "grad_norm": 0.3166466951370239, + "learning_rate": 4.851118618960328e-05, + "loss": 0.1695, + "step": 11177 + }, + { + "epoch": 0.19937216851567796, + "grad_norm": 0.42788925766944885, + "learning_rate": 4.851065702623141e-05, + "loss": 0.1623, + "step": 11178 + }, + { + "epoch": 0.19939000463739165, + "grad_norm": 0.25692322850227356, + "learning_rate": 4.8510127771724104e-05, + "loss": 0.2137, + "step": 11179 + }, + { + "epoch": 0.19940784075910534, + "grad_norm": 0.2887136936187744, + "learning_rate": 4.8509598426083426e-05, + "loss": 0.2074, + "step": 11180 + }, + { + "epoch": 0.19942567688081902, + "grad_norm": 0.23590153455734253, + "learning_rate": 4.850906898931142e-05, + "loss": 0.1661, + "step": 11181 + }, + { + "epoch": 0.19944351300253274, + "grad_norm": 0.2689376175403595, + "learning_rate": 4.8508539461410144e-05, + "loss": 0.1732, + "step": 11182 + }, + { + "epoch": 0.19946134912424643, + "grad_norm": 0.2713359296321869, + "learning_rate": 4.8508009842381654e-05, + "loss": 0.1991, + "step": 11183 + }, + { + "epoch": 0.19947918524596012, + "grad_norm": 0.4445231258869171, + "learning_rate": 4.850748013222799e-05, + "loss": 0.2334, + "step": 11184 + }, + { + "epoch": 0.1994970213676738, + "grad_norm": 0.35040053725242615, + "learning_rate": 4.850695033095122e-05, + "loss": 0.2546, + "step": 11185 + }, + { + "epoch": 0.19951485748938752, + "grad_norm": 0.28394970297813416, + "learning_rate": 4.850642043855339e-05, + "loss": 0.2435, + "step": 11186 + }, + { + "epoch": 0.1995326936111012, + "grad_norm": 0.24912366271018982, + "learning_rate": 4.8505890455036554e-05, + "loss": 0.1939, + "step": 11187 + }, + { + "epoch": 0.1995505297328149, + "grad_norm": 0.3353257179260254, + "learning_rate": 4.850536038040276e-05, + "loss": 0.1978, + "step": 11188 + }, + { + "epoch": 0.19956836585452858, + "grad_norm": 0.22216598689556122, + "learning_rate": 4.8504830214654085e-05, + "loss": 0.1926, + "step": 11189 + }, + { + "epoch": 0.19958620197624227, + "grad_norm": 0.3079867660999298, + "learning_rate": 4.850429995779257e-05, + "loss": 0.1991, + "step": 11190 + }, + { + "epoch": 0.199604038097956, + "grad_norm": 0.35655754804611206, + "learning_rate": 4.850376960982026e-05, + "loss": 0.2763, + "step": 11191 + }, + { + "epoch": 0.19962187421966968, + "grad_norm": 0.38088274002075195, + "learning_rate": 4.850323917073922e-05, + "loss": 0.2186, + "step": 11192 + }, + { + "epoch": 0.19963971034138336, + "grad_norm": 0.26722249388694763, + "learning_rate": 4.8502708640551517e-05, + "loss": 0.2097, + "step": 11193 + }, + { + "epoch": 0.19965754646309705, + "grad_norm": 0.3426783084869385, + "learning_rate": 4.850217801925919e-05, + "loss": 0.2566, + "step": 11194 + }, + { + "epoch": 0.19967538258481077, + "grad_norm": 0.41486701369285583, + "learning_rate": 4.8501647306864314e-05, + "loss": 0.2343, + "step": 11195 + }, + { + "epoch": 0.19969321870652446, + "grad_norm": 0.30593550205230713, + "learning_rate": 4.8501116503368925e-05, + "loss": 0.2194, + "step": 11196 + }, + { + "epoch": 0.19971105482823814, + "grad_norm": 0.27238717675209045, + "learning_rate": 4.8500585608775095e-05, + "loss": 0.2066, + "step": 11197 + }, + { + "epoch": 0.19972889094995183, + "grad_norm": 0.3307051956653595, + "learning_rate": 4.8500054623084884e-05, + "loss": 0.2675, + "step": 11198 + }, + { + "epoch": 0.19974672707166555, + "grad_norm": 0.2783980965614319, + "learning_rate": 4.849952354630034e-05, + "loss": 0.2019, + "step": 11199 + }, + { + "epoch": 0.19976456319337924, + "grad_norm": 0.3086075186729431, + "learning_rate": 4.8498992378423525e-05, + "loss": 0.21, + "step": 11200 + }, + { + "epoch": 0.19978239931509292, + "grad_norm": 0.3717106878757477, + "learning_rate": 4.8498461119456504e-05, + "loss": 0.1922, + "step": 11201 + }, + { + "epoch": 0.1998002354368066, + "grad_norm": 0.2317422479391098, + "learning_rate": 4.849792976940132e-05, + "loss": 0.232, + "step": 11202 + }, + { + "epoch": 0.19981807155852033, + "grad_norm": 0.3480771780014038, + "learning_rate": 4.849739832826006e-05, + "loss": 0.1527, + "step": 11203 + }, + { + "epoch": 0.19983590768023402, + "grad_norm": 0.2628558576107025, + "learning_rate": 4.8496866796034754e-05, + "loss": 0.195, + "step": 11204 + }, + { + "epoch": 0.1998537438019477, + "grad_norm": 0.2192603349685669, + "learning_rate": 4.849633517272748e-05, + "loss": 0.1893, + "step": 11205 + }, + { + "epoch": 0.1998715799236614, + "grad_norm": 0.26822423934936523, + "learning_rate": 4.849580345834031e-05, + "loss": 0.1787, + "step": 11206 + }, + { + "epoch": 0.1998894160453751, + "grad_norm": 0.2718605697154999, + "learning_rate": 4.849527165287527e-05, + "loss": 0.1806, + "step": 11207 + }, + { + "epoch": 0.1999072521670888, + "grad_norm": 0.28898414969444275, + "learning_rate": 4.849473975633445e-05, + "loss": 0.1951, + "step": 11208 + }, + { + "epoch": 0.19992508828880248, + "grad_norm": 0.3359219431877136, + "learning_rate": 4.8494207768719906e-05, + "loss": 0.1756, + "step": 11209 + }, + { + "epoch": 0.19994292441051617, + "grad_norm": 0.3178466856479645, + "learning_rate": 4.84936756900337e-05, + "loss": 0.2605, + "step": 11210 + }, + { + "epoch": 0.19996076053222986, + "grad_norm": 0.4385218918323517, + "learning_rate": 4.849314352027789e-05, + "loss": 0.1936, + "step": 11211 + }, + { + "epoch": 0.19997859665394357, + "grad_norm": 0.2442820966243744, + "learning_rate": 4.849261125945454e-05, + "loss": 0.2093, + "step": 11212 + }, + { + "epoch": 0.19999643277565726, + "grad_norm": 0.32601553201675415, + "learning_rate": 4.849207890756572e-05, + "loss": 0.1856, + "step": 11213 + }, + { + "epoch": 0.20001426889737095, + "grad_norm": 0.2076176404953003, + "learning_rate": 4.849154646461348e-05, + "loss": 0.2047, + "step": 11214 + }, + { + "epoch": 0.20003210501908464, + "grad_norm": 0.2987322509288788, + "learning_rate": 4.849101393059989e-05, + "loss": 0.2296, + "step": 11215 + }, + { + "epoch": 0.20004994114079835, + "grad_norm": 0.24282674491405487, + "learning_rate": 4.849048130552703e-05, + "loss": 0.1968, + "step": 11216 + }, + { + "epoch": 0.20006777726251204, + "grad_norm": 0.27987241744995117, + "learning_rate": 4.8489948589396935e-05, + "loss": 0.2309, + "step": 11217 + }, + { + "epoch": 0.20008561338422573, + "grad_norm": 0.28088998794555664, + "learning_rate": 4.84894157822117e-05, + "loss": 0.219, + "step": 11218 + }, + { + "epoch": 0.20010344950593942, + "grad_norm": 0.321716845035553, + "learning_rate": 4.8488882883973375e-05, + "loss": 0.251, + "step": 11219 + }, + { + "epoch": 0.20012128562765313, + "grad_norm": 0.2137146294116974, + "learning_rate": 4.848834989468402e-05, + "loss": 0.1197, + "step": 11220 + }, + { + "epoch": 0.20013912174936682, + "grad_norm": 0.27422183752059937, + "learning_rate": 4.848781681434571e-05, + "loss": 0.184, + "step": 11221 + }, + { + "epoch": 0.2001569578710805, + "grad_norm": 0.3200380504131317, + "learning_rate": 4.848728364296051e-05, + "loss": 0.1888, + "step": 11222 + }, + { + "epoch": 0.2001747939927942, + "grad_norm": 0.2924257218837738, + "learning_rate": 4.8486750380530495e-05, + "loss": 0.2313, + "step": 11223 + }, + { + "epoch": 0.2001926301145079, + "grad_norm": 0.2781852185726166, + "learning_rate": 4.848621702705771e-05, + "loss": 0.2546, + "step": 11224 + }, + { + "epoch": 0.2002104662362216, + "grad_norm": 0.20512135326862335, + "learning_rate": 4.848568358254424e-05, + "loss": 0.1845, + "step": 11225 + }, + { + "epoch": 0.2002283023579353, + "grad_norm": 0.21732193231582642, + "learning_rate": 4.848515004699216e-05, + "loss": 0.1698, + "step": 11226 + }, + { + "epoch": 0.20024613847964898, + "grad_norm": 0.4090214669704437, + "learning_rate": 4.8484616420403516e-05, + "loss": 0.1995, + "step": 11227 + }, + { + "epoch": 0.2002639746013627, + "grad_norm": 0.5717278718948364, + "learning_rate": 4.8484082702780387e-05, + "loss": 0.2029, + "step": 11228 + }, + { + "epoch": 0.20028181072307638, + "grad_norm": 0.43718910217285156, + "learning_rate": 4.8483548894124844e-05, + "loss": 0.247, + "step": 11229 + }, + { + "epoch": 0.20029964684479007, + "grad_norm": 0.27847278118133545, + "learning_rate": 4.8483014994438955e-05, + "loss": 0.2494, + "step": 11230 + }, + { + "epoch": 0.20031748296650376, + "grad_norm": 0.2626401484012604, + "learning_rate": 4.848248100372479e-05, + "loss": 0.1916, + "step": 11231 + }, + { + "epoch": 0.20033531908821745, + "grad_norm": 0.23114930093288422, + "learning_rate": 4.848194692198442e-05, + "loss": 0.222, + "step": 11232 + }, + { + "epoch": 0.20035315520993116, + "grad_norm": 0.3235883116722107, + "learning_rate": 4.8481412749219906e-05, + "loss": 0.1538, + "step": 11233 + }, + { + "epoch": 0.20037099133164485, + "grad_norm": 0.24516287446022034, + "learning_rate": 4.8480878485433334e-05, + "loss": 0.2268, + "step": 11234 + }, + { + "epoch": 0.20038882745335854, + "grad_norm": 0.2867223620414734, + "learning_rate": 4.848034413062676e-05, + "loss": 0.219, + "step": 11235 + }, + { + "epoch": 0.20040666357507222, + "grad_norm": 0.3025898337364197, + "learning_rate": 4.8479809684802266e-05, + "loss": 0.1612, + "step": 11236 + }, + { + "epoch": 0.20042449969678594, + "grad_norm": 0.2771890461444855, + "learning_rate": 4.8479275147961924e-05, + "loss": 0.2332, + "step": 11237 + }, + { + "epoch": 0.20044233581849963, + "grad_norm": 0.2824143171310425, + "learning_rate": 4.847874052010779e-05, + "loss": 0.2284, + "step": 11238 + }, + { + "epoch": 0.20046017194021332, + "grad_norm": 0.2517559230327606, + "learning_rate": 4.847820580124196e-05, + "loss": 0.1842, + "step": 11239 + }, + { + "epoch": 0.200478008061927, + "grad_norm": 0.24260075390338898, + "learning_rate": 4.8477670991366484e-05, + "loss": 0.225, + "step": 11240 + }, + { + "epoch": 0.20049584418364072, + "grad_norm": 0.46930018067359924, + "learning_rate": 4.847713609048346e-05, + "loss": 0.2576, + "step": 11241 + }, + { + "epoch": 0.2005136803053544, + "grad_norm": 0.23439662158489227, + "learning_rate": 4.8476601098594945e-05, + "loss": 0.1545, + "step": 11242 + }, + { + "epoch": 0.2005315164270681, + "grad_norm": 0.3082413971424103, + "learning_rate": 4.847606601570301e-05, + "loss": 0.2586, + "step": 11243 + }, + { + "epoch": 0.20054935254878178, + "grad_norm": 0.3749242424964905, + "learning_rate": 4.847553084180974e-05, + "loss": 0.231, + "step": 11244 + }, + { + "epoch": 0.2005671886704955, + "grad_norm": 0.35473912954330444, + "learning_rate": 4.8474995576917195e-05, + "loss": 0.2241, + "step": 11245 + }, + { + "epoch": 0.2005850247922092, + "grad_norm": 0.22156821191310883, + "learning_rate": 4.847446022102746e-05, + "loss": 0.2013, + "step": 11246 + }, + { + "epoch": 0.20060286091392288, + "grad_norm": 0.32568418979644775, + "learning_rate": 4.847392477414262e-05, + "loss": 0.1538, + "step": 11247 + }, + { + "epoch": 0.20062069703563656, + "grad_norm": 0.3184446096420288, + "learning_rate": 4.8473389236264735e-05, + "loss": 0.2534, + "step": 11248 + }, + { + "epoch": 0.20063853315735028, + "grad_norm": 0.2477748543024063, + "learning_rate": 4.847285360739589e-05, + "loss": 0.1544, + "step": 11249 + }, + { + "epoch": 0.20065636927906397, + "grad_norm": 0.24207846820354462, + "learning_rate": 4.847231788753815e-05, + "loss": 0.219, + "step": 11250 + }, + { + "epoch": 0.20067420540077766, + "grad_norm": 0.30859509110450745, + "learning_rate": 4.84717820766936e-05, + "loss": 0.1844, + "step": 11251 + }, + { + "epoch": 0.20069204152249134, + "grad_norm": 0.2695724070072174, + "learning_rate": 4.847124617486432e-05, + "loss": 0.1656, + "step": 11252 + }, + { + "epoch": 0.20070987764420503, + "grad_norm": 0.2993283271789551, + "learning_rate": 4.8470710182052375e-05, + "loss": 0.1897, + "step": 11253 + }, + { + "epoch": 0.20072771376591875, + "grad_norm": 0.354107141494751, + "learning_rate": 4.847017409825986e-05, + "loss": 0.1842, + "step": 11254 + }, + { + "epoch": 0.20074554988763244, + "grad_norm": 0.31370314955711365, + "learning_rate": 4.8469637923488833e-05, + "loss": 0.1657, + "step": 11255 + }, + { + "epoch": 0.20076338600934612, + "grad_norm": 0.30259957909584045, + "learning_rate": 4.8469101657741395e-05, + "loss": 0.2054, + "step": 11256 + }, + { + "epoch": 0.2007812221310598, + "grad_norm": 0.22491726279258728, + "learning_rate": 4.846856530101961e-05, + "loss": 0.1949, + "step": 11257 + }, + { + "epoch": 0.20079905825277353, + "grad_norm": 0.32310980558395386, + "learning_rate": 4.8468028853325556e-05, + "loss": 0.1853, + "step": 11258 + }, + { + "epoch": 0.20081689437448721, + "grad_norm": 0.25922152400016785, + "learning_rate": 4.8467492314661316e-05, + "loss": 0.1868, + "step": 11259 + }, + { + "epoch": 0.2008347304962009, + "grad_norm": 0.25198090076446533, + "learning_rate": 4.846695568502898e-05, + "loss": 0.1529, + "step": 11260 + }, + { + "epoch": 0.2008525666179146, + "grad_norm": 0.2648249864578247, + "learning_rate": 4.8466418964430606e-05, + "loss": 0.2072, + "step": 11261 + }, + { + "epoch": 0.2008704027396283, + "grad_norm": 0.2596457302570343, + "learning_rate": 4.846588215286829e-05, + "loss": 0.1783, + "step": 11262 + }, + { + "epoch": 0.200888238861342, + "grad_norm": 0.47238320112228394, + "learning_rate": 4.846534525034412e-05, + "loss": 0.1971, + "step": 11263 + }, + { + "epoch": 0.20090607498305568, + "grad_norm": 0.37923213839530945, + "learning_rate": 4.846480825686016e-05, + "loss": 0.2222, + "step": 11264 + }, + { + "epoch": 0.20092391110476937, + "grad_norm": 0.23220795392990112, + "learning_rate": 4.846427117241849e-05, + "loss": 0.2088, + "step": 11265 + }, + { + "epoch": 0.2009417472264831, + "grad_norm": 0.33572810888290405, + "learning_rate": 4.846373399702121e-05, + "loss": 0.2196, + "step": 11266 + }, + { + "epoch": 0.20095958334819677, + "grad_norm": 0.27861925959587097, + "learning_rate": 4.8463196730670396e-05, + "loss": 0.188, + "step": 11267 + }, + { + "epoch": 0.20097741946991046, + "grad_norm": 0.3227293789386749, + "learning_rate": 4.8462659373368126e-05, + "loss": 0.1758, + "step": 11268 + }, + { + "epoch": 0.20099525559162415, + "grad_norm": 0.3112263083457947, + "learning_rate": 4.846212192511648e-05, + "loss": 0.2377, + "step": 11269 + }, + { + "epoch": 0.20101309171333784, + "grad_norm": 0.2563783526420593, + "learning_rate": 4.846158438591755e-05, + "loss": 0.1988, + "step": 11270 + }, + { + "epoch": 0.20103092783505155, + "grad_norm": 0.49856215715408325, + "learning_rate": 4.846104675577341e-05, + "loss": 0.2516, + "step": 11271 + }, + { + "epoch": 0.20104876395676524, + "grad_norm": 0.36224600672721863, + "learning_rate": 4.8460509034686154e-05, + "loss": 0.2016, + "step": 11272 + }, + { + "epoch": 0.20106660007847893, + "grad_norm": 0.29951876401901245, + "learning_rate": 4.8459971222657864e-05, + "loss": 0.2289, + "step": 11273 + }, + { + "epoch": 0.20108443620019262, + "grad_norm": 0.29126256704330444, + "learning_rate": 4.845943331969062e-05, + "loss": 0.1926, + "step": 11274 + }, + { + "epoch": 0.20110227232190633, + "grad_norm": 0.2900295853614807, + "learning_rate": 4.845889532578651e-05, + "loss": 0.1862, + "step": 11275 + }, + { + "epoch": 0.20112010844362002, + "grad_norm": 0.22529807686805725, + "learning_rate": 4.8458357240947615e-05, + "loss": 0.1833, + "step": 11276 + }, + { + "epoch": 0.2011379445653337, + "grad_norm": 0.30294477939605713, + "learning_rate": 4.845781906517603e-05, + "loss": 0.209, + "step": 11277 + }, + { + "epoch": 0.2011557806870474, + "grad_norm": 0.23169225454330444, + "learning_rate": 4.8457280798473836e-05, + "loss": 0.159, + "step": 11278 + }, + { + "epoch": 0.2011736168087611, + "grad_norm": 0.17531831562519073, + "learning_rate": 4.845674244084312e-05, + "loss": 0.1469, + "step": 11279 + }, + { + "epoch": 0.2011914529304748, + "grad_norm": 0.39195239543914795, + "learning_rate": 4.8456203992285966e-05, + "loss": 0.2589, + "step": 11280 + }, + { + "epoch": 0.2012092890521885, + "grad_norm": 0.40377548336982727, + "learning_rate": 4.845566545280447e-05, + "loss": 0.2756, + "step": 11281 + }, + { + "epoch": 0.20122712517390218, + "grad_norm": 0.22159047424793243, + "learning_rate": 4.84551268224007e-05, + "loss": 0.1906, + "step": 11282 + }, + { + "epoch": 0.2012449612956159, + "grad_norm": 0.2438051849603653, + "learning_rate": 4.845458810107677e-05, + "loss": 0.2253, + "step": 11283 + }, + { + "epoch": 0.20126279741732958, + "grad_norm": 0.36674851179122925, + "learning_rate": 4.845404928883475e-05, + "loss": 0.2272, + "step": 11284 + }, + { + "epoch": 0.20128063353904327, + "grad_norm": 0.35977867245674133, + "learning_rate": 4.8453510385676734e-05, + "loss": 0.1991, + "step": 11285 + }, + { + "epoch": 0.20129846966075696, + "grad_norm": 0.2580593526363373, + "learning_rate": 4.845297139160482e-05, + "loss": 0.1738, + "step": 11286 + }, + { + "epoch": 0.20131630578247067, + "grad_norm": 0.21098247170448303, + "learning_rate": 4.845243230662107e-05, + "loss": 0.1881, + "step": 11287 + }, + { + "epoch": 0.20133414190418436, + "grad_norm": 0.23772303760051727, + "learning_rate": 4.845189313072761e-05, + "loss": 0.1936, + "step": 11288 + }, + { + "epoch": 0.20135197802589805, + "grad_norm": 0.24394717812538147, + "learning_rate": 4.8451353863926504e-05, + "loss": 0.2178, + "step": 11289 + }, + { + "epoch": 0.20136981414761174, + "grad_norm": 0.36160221695899963, + "learning_rate": 4.8450814506219854e-05, + "loss": 0.1865, + "step": 11290 + }, + { + "epoch": 0.20138765026932542, + "grad_norm": 0.3158969581127167, + "learning_rate": 4.845027505760975e-05, + "loss": 0.2043, + "step": 11291 + }, + { + "epoch": 0.20140548639103914, + "grad_norm": 0.3311038315296173, + "learning_rate": 4.8449735518098274e-05, + "loss": 0.2591, + "step": 11292 + }, + { + "epoch": 0.20142332251275283, + "grad_norm": 0.2837072014808655, + "learning_rate": 4.844919588768752e-05, + "loss": 0.2724, + "step": 11293 + }, + { + "epoch": 0.20144115863446652, + "grad_norm": 0.32290181517601013, + "learning_rate": 4.8448656166379594e-05, + "loss": 0.2593, + "step": 11294 + }, + { + "epoch": 0.2014589947561802, + "grad_norm": 0.335918128490448, + "learning_rate": 4.844811635417657e-05, + "loss": 0.2, + "step": 11295 + }, + { + "epoch": 0.20147683087789392, + "grad_norm": 0.25452184677124023, + "learning_rate": 4.844757645108055e-05, + "loss": 0.2229, + "step": 11296 + }, + { + "epoch": 0.2014946669996076, + "grad_norm": 0.2768148183822632, + "learning_rate": 4.844703645709363e-05, + "loss": 0.2139, + "step": 11297 + }, + { + "epoch": 0.2015125031213213, + "grad_norm": 0.30912166833877563, + "learning_rate": 4.8446496372217895e-05, + "loss": 0.2014, + "step": 11298 + }, + { + "epoch": 0.20153033924303498, + "grad_norm": 0.25039976835250854, + "learning_rate": 4.8445956196455444e-05, + "loss": 0.2202, + "step": 11299 + }, + { + "epoch": 0.2015481753647487, + "grad_norm": 0.21830569207668304, + "learning_rate": 4.844541592980837e-05, + "loss": 0.171, + "step": 11300 + }, + { + "epoch": 0.2015660114864624, + "grad_norm": 0.213765487074852, + "learning_rate": 4.8444875572278755e-05, + "loss": 0.1579, + "step": 11301 + }, + { + "epoch": 0.20158384760817608, + "grad_norm": 0.2506726086139679, + "learning_rate": 4.8444335123868725e-05, + "loss": 0.1681, + "step": 11302 + }, + { + "epoch": 0.20160168372988976, + "grad_norm": 0.2370816171169281, + "learning_rate": 4.844379458458034e-05, + "loss": 0.1813, + "step": 11303 + }, + { + "epoch": 0.20161951985160348, + "grad_norm": 0.23096415400505066, + "learning_rate": 4.8443253954415714e-05, + "loss": 0.1626, + "step": 11304 + }, + { + "epoch": 0.20163735597331717, + "grad_norm": 0.2860069274902344, + "learning_rate": 4.8442713233376935e-05, + "loss": 0.2174, + "step": 11305 + }, + { + "epoch": 0.20165519209503086, + "grad_norm": 0.3346981406211853, + "learning_rate": 4.8442172421466104e-05, + "loss": 0.2084, + "step": 11306 + }, + { + "epoch": 0.20167302821674454, + "grad_norm": 0.1893150955438614, + "learning_rate": 4.844163151868531e-05, + "loss": 0.1844, + "step": 11307 + }, + { + "epoch": 0.20169086433845826, + "grad_norm": 0.24594847857952118, + "learning_rate": 4.844109052503667e-05, + "loss": 0.1797, + "step": 11308 + }, + { + "epoch": 0.20170870046017195, + "grad_norm": 0.2367420196533203, + "learning_rate": 4.844054944052225e-05, + "loss": 0.1805, + "step": 11309 + }, + { + "epoch": 0.20172653658188563, + "grad_norm": 0.3526380658149719, + "learning_rate": 4.8440008265144175e-05, + "loss": 0.1925, + "step": 11310 + }, + { + "epoch": 0.20174437270359932, + "grad_norm": 0.3324848711490631, + "learning_rate": 4.8439466998904535e-05, + "loss": 0.1427, + "step": 11311 + }, + { + "epoch": 0.201762208825313, + "grad_norm": 0.23890207707881927, + "learning_rate": 4.843892564180542e-05, + "loss": 0.1623, + "step": 11312 + }, + { + "epoch": 0.20178004494702673, + "grad_norm": 0.35688701272010803, + "learning_rate": 4.8438384193848935e-05, + "loss": 0.1902, + "step": 11313 + }, + { + "epoch": 0.20179788106874041, + "grad_norm": 0.4197233319282532, + "learning_rate": 4.843784265503718e-05, + "loss": 0.1918, + "step": 11314 + }, + { + "epoch": 0.2018157171904541, + "grad_norm": 0.23810192942619324, + "learning_rate": 4.843730102537224e-05, + "loss": 0.155, + "step": 11315 + }, + { + "epoch": 0.2018335533121678, + "grad_norm": 0.23382043838500977, + "learning_rate": 4.8436759304856236e-05, + "loss": 0.217, + "step": 11316 + }, + { + "epoch": 0.2018513894338815, + "grad_norm": 0.27680182456970215, + "learning_rate": 4.843621749349126e-05, + "loss": 0.2037, + "step": 11317 + }, + { + "epoch": 0.2018692255555952, + "grad_norm": 0.2820006310939789, + "learning_rate": 4.8435675591279405e-05, + "loss": 0.2018, + "step": 11318 + }, + { + "epoch": 0.20188706167730888, + "grad_norm": 0.22262385487556458, + "learning_rate": 4.843513359822278e-05, + "loss": 0.1734, + "step": 11319 + }, + { + "epoch": 0.20190489779902257, + "grad_norm": 0.29639485478401184, + "learning_rate": 4.843459151432349e-05, + "loss": 0.185, + "step": 11320 + }, + { + "epoch": 0.20192273392073629, + "grad_norm": 0.30568423867225647, + "learning_rate": 4.843404933958362e-05, + "loss": 0.1997, + "step": 11321 + }, + { + "epoch": 0.20194057004244997, + "grad_norm": 0.27424970269203186, + "learning_rate": 4.843350707400528e-05, + "loss": 0.1931, + "step": 11322 + }, + { + "epoch": 0.20195840616416366, + "grad_norm": 0.2373412847518921, + "learning_rate": 4.843296471759058e-05, + "loss": 0.1727, + "step": 11323 + }, + { + "epoch": 0.20197624228587735, + "grad_norm": 0.23920440673828125, + "learning_rate": 4.8432422270341605e-05, + "loss": 0.1505, + "step": 11324 + }, + { + "epoch": 0.20199407840759107, + "grad_norm": 0.3613201081752777, + "learning_rate": 4.843187973226048e-05, + "loss": 0.1801, + "step": 11325 + }, + { + "epoch": 0.20201191452930475, + "grad_norm": 0.3097452223300934, + "learning_rate": 4.843133710334928e-05, + "loss": 0.2277, + "step": 11326 + }, + { + "epoch": 0.20202975065101844, + "grad_norm": 0.29457807540893555, + "learning_rate": 4.843079438361014e-05, + "loss": 0.1892, + "step": 11327 + }, + { + "epoch": 0.20204758677273213, + "grad_norm": 0.25454843044281006, + "learning_rate": 4.843025157304514e-05, + "loss": 0.1835, + "step": 11328 + }, + { + "epoch": 0.20206542289444585, + "grad_norm": 0.2775270640850067, + "learning_rate": 4.842970867165639e-05, + "loss": 0.2404, + "step": 11329 + }, + { + "epoch": 0.20208325901615953, + "grad_norm": 0.37864527106285095, + "learning_rate": 4.8429165679446006e-05, + "loss": 0.1556, + "step": 11330 + }, + { + "epoch": 0.20210109513787322, + "grad_norm": 0.19824384152889252, + "learning_rate": 4.842862259641608e-05, + "loss": 0.1706, + "step": 11331 + }, + { + "epoch": 0.2021189312595869, + "grad_norm": 0.20984798669815063, + "learning_rate": 4.842807942256872e-05, + "loss": 0.1834, + "step": 11332 + }, + { + "epoch": 0.2021367673813006, + "grad_norm": 0.2893177270889282, + "learning_rate": 4.8427536157906025e-05, + "loss": 0.225, + "step": 11333 + }, + { + "epoch": 0.2021546035030143, + "grad_norm": 0.27121877670288086, + "learning_rate": 4.8426992802430124e-05, + "loss": 0.2244, + "step": 11334 + }, + { + "epoch": 0.202172439624728, + "grad_norm": 0.35227641463279724, + "learning_rate": 4.84264493561431e-05, + "loss": 0.1842, + "step": 11335 + }, + { + "epoch": 0.2021902757464417, + "grad_norm": 0.3382189869880676, + "learning_rate": 4.842590581904706e-05, + "loss": 0.1874, + "step": 11336 + }, + { + "epoch": 0.20220811186815538, + "grad_norm": 0.2463979572057724, + "learning_rate": 4.842536219114413e-05, + "loss": 0.2219, + "step": 11337 + }, + { + "epoch": 0.2022259479898691, + "grad_norm": 0.2901211678981781, + "learning_rate": 4.8424818472436394e-05, + "loss": 0.2135, + "step": 11338 + }, + { + "epoch": 0.20224378411158278, + "grad_norm": 0.312142550945282, + "learning_rate": 4.842427466292598e-05, + "loss": 0.2433, + "step": 11339 + }, + { + "epoch": 0.20226162023329647, + "grad_norm": 0.2258642017841339, + "learning_rate": 4.8423730762614985e-05, + "loss": 0.2135, + "step": 11340 + }, + { + "epoch": 0.20227945635501016, + "grad_norm": 0.267499715089798, + "learning_rate": 4.8423186771505516e-05, + "loss": 0.1737, + "step": 11341 + }, + { + "epoch": 0.20229729247672387, + "grad_norm": 0.26048150658607483, + "learning_rate": 4.8422642689599685e-05, + "loss": 0.1797, + "step": 11342 + }, + { + "epoch": 0.20231512859843756, + "grad_norm": 0.22089146077632904, + "learning_rate": 4.8422098516899606e-05, + "loss": 0.2012, + "step": 11343 + }, + { + "epoch": 0.20233296472015125, + "grad_norm": 0.21420103311538696, + "learning_rate": 4.8421554253407374e-05, + "loss": 0.1441, + "step": 11344 + }, + { + "epoch": 0.20235080084186494, + "grad_norm": 0.2936931550502777, + "learning_rate": 4.8421009899125115e-05, + "loss": 0.1853, + "step": 11345 + }, + { + "epoch": 0.20236863696357865, + "grad_norm": 0.27731451392173767, + "learning_rate": 4.842046545405493e-05, + "loss": 0.2182, + "step": 11346 + }, + { + "epoch": 0.20238647308529234, + "grad_norm": 0.2916032373905182, + "learning_rate": 4.8419920918198936e-05, + "loss": 0.1762, + "step": 11347 + }, + { + "epoch": 0.20240430920700603, + "grad_norm": 0.28585314750671387, + "learning_rate": 4.841937629155924e-05, + "loss": 0.2415, + "step": 11348 + }, + { + "epoch": 0.20242214532871972, + "grad_norm": 0.28881019353866577, + "learning_rate": 4.841883157413795e-05, + "loss": 0.1883, + "step": 11349 + }, + { + "epoch": 0.20243998145043343, + "grad_norm": 0.27883437275886536, + "learning_rate": 4.841828676593718e-05, + "loss": 0.2013, + "step": 11350 + }, + { + "epoch": 0.20245781757214712, + "grad_norm": 0.24770846962928772, + "learning_rate": 4.841774186695904e-05, + "loss": 0.1986, + "step": 11351 + }, + { + "epoch": 0.2024756536938608, + "grad_norm": 0.40508878231048584, + "learning_rate": 4.841719687720565e-05, + "loss": 0.206, + "step": 11352 + }, + { + "epoch": 0.2024934898155745, + "grad_norm": 0.15239794552326202, + "learning_rate": 4.841665179667911e-05, + "loss": 0.1421, + "step": 11353 + }, + { + "epoch": 0.20251132593728818, + "grad_norm": 0.3014697730541229, + "learning_rate": 4.8416106625381544e-05, + "loss": 0.2713, + "step": 11354 + }, + { + "epoch": 0.2025291620590019, + "grad_norm": 0.3859409987926483, + "learning_rate": 4.8415561363315055e-05, + "loss": 0.2337, + "step": 11355 + }, + { + "epoch": 0.2025469981807156, + "grad_norm": 0.2989046275615692, + "learning_rate": 4.841501601048177e-05, + "loss": 0.1813, + "step": 11356 + }, + { + "epoch": 0.20256483430242928, + "grad_norm": 0.27538689970970154, + "learning_rate": 4.841447056688379e-05, + "loss": 0.1773, + "step": 11357 + }, + { + "epoch": 0.20258267042414296, + "grad_norm": 0.25325432419776917, + "learning_rate": 4.8413925032523235e-05, + "loss": 0.1901, + "step": 11358 + }, + { + "epoch": 0.20260050654585668, + "grad_norm": 0.21702085435390472, + "learning_rate": 4.841337940740222e-05, + "loss": 0.2, + "step": 11359 + }, + { + "epoch": 0.20261834266757037, + "grad_norm": 0.3129924237728119, + "learning_rate": 4.841283369152287e-05, + "loss": 0.2324, + "step": 11360 + }, + { + "epoch": 0.20263617878928405, + "grad_norm": 0.24327322840690613, + "learning_rate": 4.841228788488728e-05, + "loss": 0.2297, + "step": 11361 + }, + { + "epoch": 0.20265401491099774, + "grad_norm": 0.29003897309303284, + "learning_rate": 4.841174198749758e-05, + "loss": 0.2204, + "step": 11362 + }, + { + "epoch": 0.20267185103271146, + "grad_norm": 0.2230042666196823, + "learning_rate": 4.841119599935588e-05, + "loss": 0.192, + "step": 11363 + }, + { + "epoch": 0.20268968715442515, + "grad_norm": 0.2523375451564789, + "learning_rate": 4.8410649920464294e-05, + "loss": 0.2427, + "step": 11364 + }, + { + "epoch": 0.20270752327613883, + "grad_norm": 0.3380890488624573, + "learning_rate": 4.8410103750824954e-05, + "loss": 0.2331, + "step": 11365 + }, + { + "epoch": 0.20272535939785252, + "grad_norm": 0.2913780212402344, + "learning_rate": 4.840955749043996e-05, + "loss": 0.1787, + "step": 11366 + }, + { + "epoch": 0.20274319551956624, + "grad_norm": 0.21512164175510406, + "learning_rate": 4.8409011139311435e-05, + "loss": 0.1946, + "step": 11367 + }, + { + "epoch": 0.20276103164127993, + "grad_norm": 0.2979514002799988, + "learning_rate": 4.84084646974415e-05, + "loss": 0.1874, + "step": 11368 + }, + { + "epoch": 0.20277886776299361, + "grad_norm": 0.21054485440254211, + "learning_rate": 4.840791816483227e-05, + "loss": 0.1914, + "step": 11369 + }, + { + "epoch": 0.2027967038847073, + "grad_norm": 0.2549343705177307, + "learning_rate": 4.8407371541485856e-05, + "loss": 0.2033, + "step": 11370 + }, + { + "epoch": 0.202814540006421, + "grad_norm": 0.3070774972438812, + "learning_rate": 4.840682482740439e-05, + "loss": 0.2618, + "step": 11371 + }, + { + "epoch": 0.2028323761281347, + "grad_norm": 0.2668624222278595, + "learning_rate": 4.8406278022589993e-05, + "loss": 0.1912, + "step": 11372 + }, + { + "epoch": 0.2028502122498484, + "grad_norm": 0.2944768965244293, + "learning_rate": 4.840573112704477e-05, + "loss": 0.216, + "step": 11373 + }, + { + "epoch": 0.20286804837156208, + "grad_norm": 0.3573305606842041, + "learning_rate": 4.840518414077086e-05, + "loss": 0.1576, + "step": 11374 + }, + { + "epoch": 0.20288588449327577, + "grad_norm": 0.29191821813583374, + "learning_rate": 4.840463706377036e-05, + "loss": 0.1903, + "step": 11375 + }, + { + "epoch": 0.20290372061498949, + "grad_norm": 0.3394958972930908, + "learning_rate": 4.8404089896045414e-05, + "loss": 0.193, + "step": 11376 + }, + { + "epoch": 0.20292155673670317, + "grad_norm": 0.3412723243236542, + "learning_rate": 4.840354263759813e-05, + "loss": 0.2274, + "step": 11377 + }, + { + "epoch": 0.20293939285841686, + "grad_norm": 0.42917343974113464, + "learning_rate": 4.8402995288430626e-05, + "loss": 0.2389, + "step": 11378 + }, + { + "epoch": 0.20295722898013055, + "grad_norm": 0.30133911967277527, + "learning_rate": 4.840244784854503e-05, + "loss": 0.2173, + "step": 11379 + }, + { + "epoch": 0.20297506510184427, + "grad_norm": 0.2544538378715515, + "learning_rate": 4.840190031794346e-05, + "loss": 0.1813, + "step": 11380 + }, + { + "epoch": 0.20299290122355795, + "grad_norm": 0.3277254104614258, + "learning_rate": 4.840135269662805e-05, + "loss": 0.2397, + "step": 11381 + }, + { + "epoch": 0.20301073734527164, + "grad_norm": 0.26831668615341187, + "learning_rate": 4.8400804984600913e-05, + "loss": 0.2141, + "step": 11382 + }, + { + "epoch": 0.20302857346698533, + "grad_norm": 0.30280405282974243, + "learning_rate": 4.8400257181864175e-05, + "loss": 0.2337, + "step": 11383 + }, + { + "epoch": 0.20304640958869904, + "grad_norm": 0.24623818695545197, + "learning_rate": 4.8399709288419944e-05, + "loss": 0.1875, + "step": 11384 + }, + { + "epoch": 0.20306424571041273, + "grad_norm": 0.34140992164611816, + "learning_rate": 4.839916130427037e-05, + "loss": 0.1983, + "step": 11385 + }, + { + "epoch": 0.20308208183212642, + "grad_norm": 0.26018744707107544, + "learning_rate": 4.839861322941757e-05, + "loss": 0.1615, + "step": 11386 + }, + { + "epoch": 0.2030999179538401, + "grad_norm": 0.22534151375293732, + "learning_rate": 4.839806506386365e-05, + "loss": 0.1914, + "step": 11387 + }, + { + "epoch": 0.20311775407555382, + "grad_norm": 0.2761533558368683, + "learning_rate": 4.8397516807610756e-05, + "loss": 0.2203, + "step": 11388 + }, + { + "epoch": 0.2031355901972675, + "grad_norm": 0.2777502238750458, + "learning_rate": 4.8396968460661006e-05, + "loss": 0.2011, + "step": 11389 + }, + { + "epoch": 0.2031534263189812, + "grad_norm": 0.28777870535850525, + "learning_rate": 4.839642002301652e-05, + "loss": 0.204, + "step": 11390 + }, + { + "epoch": 0.2031712624406949, + "grad_norm": 0.38688167929649353, + "learning_rate": 4.8395871494679434e-05, + "loss": 0.3018, + "step": 11391 + }, + { + "epoch": 0.20318909856240858, + "grad_norm": 0.22930848598480225, + "learning_rate": 4.8395322875651874e-05, + "loss": 0.2044, + "step": 11392 + }, + { + "epoch": 0.2032069346841223, + "grad_norm": 0.3254350423812866, + "learning_rate": 4.839477416593595e-05, + "loss": 0.216, + "step": 11393 + }, + { + "epoch": 0.20322477080583598, + "grad_norm": 0.2998526096343994, + "learning_rate": 4.839422536553381e-05, + "loss": 0.1831, + "step": 11394 + }, + { + "epoch": 0.20324260692754967, + "grad_norm": 0.511393129825592, + "learning_rate": 4.839367647444757e-05, + "loss": 0.2312, + "step": 11395 + }, + { + "epoch": 0.20326044304926336, + "grad_norm": 0.23622164130210876, + "learning_rate": 4.839312749267936e-05, + "loss": 0.2112, + "step": 11396 + }, + { + "epoch": 0.20327827917097707, + "grad_norm": 0.4600674510002136, + "learning_rate": 4.83925784202313e-05, + "loss": 0.1679, + "step": 11397 + }, + { + "epoch": 0.20329611529269076, + "grad_norm": 0.257719486951828, + "learning_rate": 4.8392029257105534e-05, + "loss": 0.238, + "step": 11398 + }, + { + "epoch": 0.20331395141440445, + "grad_norm": 0.2990468740463257, + "learning_rate": 4.839148000330419e-05, + "loss": 0.2321, + "step": 11399 + }, + { + "epoch": 0.20333178753611814, + "grad_norm": 0.2650798261165619, + "learning_rate": 4.839093065882938e-05, + "loss": 0.2051, + "step": 11400 + }, + { + "epoch": 0.20334962365783185, + "grad_norm": 0.2789836823940277, + "learning_rate": 4.8390381223683246e-05, + "loss": 0.2093, + "step": 11401 + }, + { + "epoch": 0.20336745977954554, + "grad_norm": 0.21605423092842102, + "learning_rate": 4.838983169786792e-05, + "loss": 0.1904, + "step": 11402 + }, + { + "epoch": 0.20338529590125923, + "grad_norm": 0.2877443730831146, + "learning_rate": 4.8389282081385526e-05, + "loss": 0.2026, + "step": 11403 + }, + { + "epoch": 0.20340313202297292, + "grad_norm": 0.24442289769649506, + "learning_rate": 4.838873237423819e-05, + "loss": 0.1872, + "step": 11404 + }, + { + "epoch": 0.20342096814468663, + "grad_norm": 0.3187635838985443, + "learning_rate": 4.838818257642806e-05, + "loss": 0.2663, + "step": 11405 + }, + { + "epoch": 0.20343880426640032, + "grad_norm": 0.32398679852485657, + "learning_rate": 4.838763268795725e-05, + "loss": 0.2051, + "step": 11406 + }, + { + "epoch": 0.203456640388114, + "grad_norm": 0.264431893825531, + "learning_rate": 4.8387082708827894e-05, + "loss": 0.2238, + "step": 11407 + }, + { + "epoch": 0.2034744765098277, + "grad_norm": 0.261491984128952, + "learning_rate": 4.838653263904214e-05, + "loss": 0.1814, + "step": 11408 + }, + { + "epoch": 0.2034923126315414, + "grad_norm": 0.2942681908607483, + "learning_rate": 4.83859824786021e-05, + "loss": 0.2062, + "step": 11409 + }, + { + "epoch": 0.2035101487532551, + "grad_norm": 0.5667554140090942, + "learning_rate": 4.8385432227509906e-05, + "loss": 0.2474, + "step": 11410 + }, + { + "epoch": 0.2035279848749688, + "grad_norm": 0.27233177423477173, + "learning_rate": 4.8384881885767716e-05, + "loss": 0.1866, + "step": 11411 + }, + { + "epoch": 0.20354582099668247, + "grad_norm": 0.24171605706214905, + "learning_rate": 4.838433145337764e-05, + "loss": 0.2231, + "step": 11412 + }, + { + "epoch": 0.20356365711839616, + "grad_norm": 0.19546818733215332, + "learning_rate": 4.838378093034182e-05, + "loss": 0.1653, + "step": 11413 + }, + { + "epoch": 0.20358149324010988, + "grad_norm": 0.21581801772117615, + "learning_rate": 4.838323031666238e-05, + "loss": 0.1527, + "step": 11414 + }, + { + "epoch": 0.20359932936182357, + "grad_norm": 0.26809418201446533, + "learning_rate": 4.838267961234147e-05, + "loss": 0.1789, + "step": 11415 + }, + { + "epoch": 0.20361716548353725, + "grad_norm": 0.4005754292011261, + "learning_rate": 4.838212881738122e-05, + "loss": 0.2106, + "step": 11416 + }, + { + "epoch": 0.20363500160525094, + "grad_norm": 0.2776334583759308, + "learning_rate": 4.838157793178376e-05, + "loss": 0.1961, + "step": 11417 + }, + { + "epoch": 0.20365283772696466, + "grad_norm": 0.24265480041503906, + "learning_rate": 4.838102695555123e-05, + "loss": 0.1628, + "step": 11418 + }, + { + "epoch": 0.20367067384867835, + "grad_norm": 0.2787797152996063, + "learning_rate": 4.838047588868576e-05, + "loss": 0.2372, + "step": 11419 + }, + { + "epoch": 0.20368850997039203, + "grad_norm": 0.3233875632286072, + "learning_rate": 4.8379924731189496e-05, + "loss": 0.2295, + "step": 11420 + }, + { + "epoch": 0.20370634609210572, + "grad_norm": 0.2317197322845459, + "learning_rate": 4.837937348306456e-05, + "loss": 0.1612, + "step": 11421 + }, + { + "epoch": 0.20372418221381944, + "grad_norm": 0.21986952424049377, + "learning_rate": 4.837882214431311e-05, + "loss": 0.1661, + "step": 11422 + }, + { + "epoch": 0.20374201833553313, + "grad_norm": 0.33218953013420105, + "learning_rate": 4.837827071493726e-05, + "loss": 0.2106, + "step": 11423 + }, + { + "epoch": 0.20375985445724681, + "grad_norm": 0.2304145246744156, + "learning_rate": 4.837771919493916e-05, + "loss": 0.1509, + "step": 11424 + }, + { + "epoch": 0.2037776905789605, + "grad_norm": 0.3058910667896271, + "learning_rate": 4.837716758432095e-05, + "loss": 0.1587, + "step": 11425 + }, + { + "epoch": 0.20379552670067422, + "grad_norm": 0.20500940084457397, + "learning_rate": 4.837661588308476e-05, + "loss": 0.1558, + "step": 11426 + }, + { + "epoch": 0.2038133628223879, + "grad_norm": 0.3538636565208435, + "learning_rate": 4.8376064091232734e-05, + "loss": 0.1576, + "step": 11427 + }, + { + "epoch": 0.2038311989441016, + "grad_norm": 0.26476800441741943, + "learning_rate": 4.8375512208767e-05, + "loss": 0.1957, + "step": 11428 + }, + { + "epoch": 0.20384903506581528, + "grad_norm": 0.36764729022979736, + "learning_rate": 4.8374960235689724e-05, + "loss": 0.1987, + "step": 11429 + }, + { + "epoch": 0.203866871187529, + "grad_norm": 0.3004645109176636, + "learning_rate": 4.837440817200302e-05, + "loss": 0.2302, + "step": 11430 + }, + { + "epoch": 0.20388470730924269, + "grad_norm": 0.21339671313762665, + "learning_rate": 4.837385601770904e-05, + "loss": 0.212, + "step": 11431 + }, + { + "epoch": 0.20390254343095637, + "grad_norm": 0.26658669114112854, + "learning_rate": 4.837330377280992e-05, + "loss": 0.1547, + "step": 11432 + }, + { + "epoch": 0.20392037955267006, + "grad_norm": 0.2382674217224121, + "learning_rate": 4.83727514373078e-05, + "loss": 0.1728, + "step": 11433 + }, + { + "epoch": 0.20393821567438375, + "grad_norm": 0.2619294822216034, + "learning_rate": 4.8372199011204824e-05, + "loss": 0.2034, + "step": 11434 + }, + { + "epoch": 0.20395605179609747, + "grad_norm": 0.2552444040775299, + "learning_rate": 4.837164649450313e-05, + "loss": 0.2182, + "step": 11435 + }, + { + "epoch": 0.20397388791781115, + "grad_norm": 0.38829824328422546, + "learning_rate": 4.837109388720486e-05, + "loss": 0.2658, + "step": 11436 + }, + { + "epoch": 0.20399172403952484, + "grad_norm": 0.2558879256248474, + "learning_rate": 4.837054118931217e-05, + "loss": 0.2176, + "step": 11437 + }, + { + "epoch": 0.20400956016123853, + "grad_norm": 0.262234091758728, + "learning_rate": 4.8369988400827185e-05, + "loss": 0.1894, + "step": 11438 + }, + { + "epoch": 0.20402739628295224, + "grad_norm": 0.294992059469223, + "learning_rate": 4.836943552175204e-05, + "loss": 0.212, + "step": 11439 + }, + { + "epoch": 0.20404523240466593, + "grad_norm": 0.339077889919281, + "learning_rate": 4.836888255208891e-05, + "loss": 0.1808, + "step": 11440 + }, + { + "epoch": 0.20406306852637962, + "grad_norm": 0.28222477436065674, + "learning_rate": 4.836832949183991e-05, + "loss": 0.2125, + "step": 11441 + }, + { + "epoch": 0.2040809046480933, + "grad_norm": 0.32367995381355286, + "learning_rate": 4.836777634100719e-05, + "loss": 0.2416, + "step": 11442 + }, + { + "epoch": 0.20409874076980702, + "grad_norm": 0.2875728905200958, + "learning_rate": 4.8367223099592904e-05, + "loss": 0.1786, + "step": 11443 + }, + { + "epoch": 0.2041165768915207, + "grad_norm": 0.24228376150131226, + "learning_rate": 4.8366669767599194e-05, + "loss": 0.1962, + "step": 11444 + }, + { + "epoch": 0.2041344130132344, + "grad_norm": 0.2372943013906479, + "learning_rate": 4.836611634502819e-05, + "loss": 0.1839, + "step": 11445 + }, + { + "epoch": 0.2041522491349481, + "grad_norm": 0.2930713891983032, + "learning_rate": 4.836556283188206e-05, + "loss": 0.1448, + "step": 11446 + }, + { + "epoch": 0.2041700852566618, + "grad_norm": 0.3082902729511261, + "learning_rate": 4.836500922816294e-05, + "loss": 0.1671, + "step": 11447 + }, + { + "epoch": 0.2041879213783755, + "grad_norm": 0.29287946224212646, + "learning_rate": 4.8364455533872965e-05, + "loss": 0.213, + "step": 11448 + }, + { + "epoch": 0.20420575750008918, + "grad_norm": 0.3938713073730469, + "learning_rate": 4.83639017490143e-05, + "loss": 0.2069, + "step": 11449 + }, + { + "epoch": 0.20422359362180287, + "grad_norm": 0.25268763303756714, + "learning_rate": 4.836334787358907e-05, + "loss": 0.1887, + "step": 11450 + }, + { + "epoch": 0.20424142974351656, + "grad_norm": 0.3099231719970703, + "learning_rate": 4.836279390759944e-05, + "loss": 0.1988, + "step": 11451 + }, + { + "epoch": 0.20425926586523027, + "grad_norm": 0.2578817903995514, + "learning_rate": 4.8362239851047554e-05, + "loss": 0.1446, + "step": 11452 + }, + { + "epoch": 0.20427710198694396, + "grad_norm": 0.33942461013793945, + "learning_rate": 4.8361685703935554e-05, + "loss": 0.2132, + "step": 11453 + }, + { + "epoch": 0.20429493810865765, + "grad_norm": 0.3227539658546448, + "learning_rate": 4.8361131466265595e-05, + "loss": 0.1603, + "step": 11454 + }, + { + "epoch": 0.20431277423037134, + "grad_norm": 0.2488083392381668, + "learning_rate": 4.836057713803982e-05, + "loss": 0.2075, + "step": 11455 + }, + { + "epoch": 0.20433061035208505, + "grad_norm": 0.2665003836154938, + "learning_rate": 4.836002271926037e-05, + "loss": 0.2125, + "step": 11456 + }, + { + "epoch": 0.20434844647379874, + "grad_norm": 0.2565172016620636, + "learning_rate": 4.8359468209929414e-05, + "loss": 0.1748, + "step": 11457 + }, + { + "epoch": 0.20436628259551243, + "grad_norm": 0.2896265983581543, + "learning_rate": 4.83589136100491e-05, + "loss": 0.2044, + "step": 11458 + }, + { + "epoch": 0.20438411871722612, + "grad_norm": 0.3849414587020874, + "learning_rate": 4.835835891962155e-05, + "loss": 0.2037, + "step": 11459 + }, + { + "epoch": 0.20440195483893983, + "grad_norm": 0.30482566356658936, + "learning_rate": 4.8357804138648943e-05, + "loss": 0.272, + "step": 11460 + }, + { + "epoch": 0.20441979096065352, + "grad_norm": 0.4008617699146271, + "learning_rate": 4.835724926713342e-05, + "loss": 0.3139, + "step": 11461 + }, + { + "epoch": 0.2044376270823672, + "grad_norm": 0.3116397559642792, + "learning_rate": 4.8356694305077125e-05, + "loss": 0.2907, + "step": 11462 + }, + { + "epoch": 0.2044554632040809, + "grad_norm": 0.23187680542469025, + "learning_rate": 4.835613925248222e-05, + "loss": 0.1781, + "step": 11463 + }, + { + "epoch": 0.2044732993257946, + "grad_norm": 0.2739063501358032, + "learning_rate": 4.8355584109350854e-05, + "loss": 0.2504, + "step": 11464 + }, + { + "epoch": 0.2044911354475083, + "grad_norm": 0.2777095437049866, + "learning_rate": 4.8355028875685175e-05, + "loss": 0.2163, + "step": 11465 + }, + { + "epoch": 0.204508971569222, + "grad_norm": 0.20674772560596466, + "learning_rate": 4.835447355148734e-05, + "loss": 0.1669, + "step": 11466 + }, + { + "epoch": 0.20452680769093567, + "grad_norm": 0.2952086329460144, + "learning_rate": 4.8353918136759494e-05, + "loss": 0.2259, + "step": 11467 + }, + { + "epoch": 0.2045446438126494, + "grad_norm": 0.34441298246383667, + "learning_rate": 4.83533626315038e-05, + "loss": 0.1494, + "step": 11468 + }, + { + "epoch": 0.20456247993436308, + "grad_norm": 0.2142394334077835, + "learning_rate": 4.83528070357224e-05, + "loss": 0.1515, + "step": 11469 + }, + { + "epoch": 0.20458031605607677, + "grad_norm": 0.3051239848136902, + "learning_rate": 4.835225134941746e-05, + "loss": 0.2263, + "step": 11470 + }, + { + "epoch": 0.20459815217779045, + "grad_norm": 0.3721875846385956, + "learning_rate": 4.8351695572591134e-05, + "loss": 0.2559, + "step": 11471 + }, + { + "epoch": 0.20461598829950414, + "grad_norm": 0.28584057092666626, + "learning_rate": 4.835113970524556e-05, + "loss": 0.2043, + "step": 11472 + }, + { + "epoch": 0.20463382442121786, + "grad_norm": 0.2517910599708557, + "learning_rate": 4.8350583747382914e-05, + "loss": 0.1781, + "step": 11473 + }, + { + "epoch": 0.20465166054293155, + "grad_norm": 0.32056596875190735, + "learning_rate": 4.835002769900533e-05, + "loss": 0.2394, + "step": 11474 + }, + { + "epoch": 0.20466949666464523, + "grad_norm": 0.3380945920944214, + "learning_rate": 4.834947156011498e-05, + "loss": 0.2438, + "step": 11475 + }, + { + "epoch": 0.20468733278635892, + "grad_norm": 0.25351786613464355, + "learning_rate": 4.834891533071401e-05, + "loss": 0.2389, + "step": 11476 + }, + { + "epoch": 0.20470516890807264, + "grad_norm": 0.2545580565929413, + "learning_rate": 4.834835901080458e-05, + "loss": 0.2039, + "step": 11477 + }, + { + "epoch": 0.20472300502978633, + "grad_norm": 0.2410781979560852, + "learning_rate": 4.834780260038885e-05, + "loss": 0.2047, + "step": 11478 + }, + { + "epoch": 0.2047408411515, + "grad_norm": 0.25942933559417725, + "learning_rate": 4.8347246099468966e-05, + "loss": 0.2017, + "step": 11479 + }, + { + "epoch": 0.2047586772732137, + "grad_norm": 0.24382233619689941, + "learning_rate": 4.8346689508047095e-05, + "loss": 0.1566, + "step": 11480 + }, + { + "epoch": 0.20477651339492742, + "grad_norm": 0.2222810685634613, + "learning_rate": 4.83461328261254e-05, + "loss": 0.1633, + "step": 11481 + }, + { + "epoch": 0.2047943495166411, + "grad_norm": 0.30445748567581177, + "learning_rate": 4.8345576053706023e-05, + "loss": 0.2015, + "step": 11482 + }, + { + "epoch": 0.2048121856383548, + "grad_norm": 0.24044805765151978, + "learning_rate": 4.834501919079113e-05, + "loss": 0.2317, + "step": 11483 + }, + { + "epoch": 0.20483002176006848, + "grad_norm": 0.409867525100708, + "learning_rate": 4.8344462237382874e-05, + "loss": 0.2042, + "step": 11484 + }, + { + "epoch": 0.2048478578817822, + "grad_norm": 0.242264062166214, + "learning_rate": 4.834390519348343e-05, + "loss": 0.1738, + "step": 11485 + }, + { + "epoch": 0.20486569400349589, + "grad_norm": 0.35216033458709717, + "learning_rate": 4.834334805909494e-05, + "loss": 0.2183, + "step": 11486 + }, + { + "epoch": 0.20488353012520957, + "grad_norm": 0.2909255027770996, + "learning_rate": 4.8342790834219575e-05, + "loss": 0.196, + "step": 11487 + }, + { + "epoch": 0.20490136624692326, + "grad_norm": 0.2488517016172409, + "learning_rate": 4.834223351885949e-05, + "loss": 0.1587, + "step": 11488 + }, + { + "epoch": 0.20491920236863698, + "grad_norm": 0.36704373359680176, + "learning_rate": 4.834167611301684e-05, + "loss": 0.2038, + "step": 11489 + }, + { + "epoch": 0.20493703849035066, + "grad_norm": 0.32630297541618347, + "learning_rate": 4.8341118616693796e-05, + "loss": 0.1769, + "step": 11490 + }, + { + "epoch": 0.20495487461206435, + "grad_norm": 0.31875577569007874, + "learning_rate": 4.8340561029892514e-05, + "loss": 0.2206, + "step": 11491 + }, + { + "epoch": 0.20497271073377804, + "grad_norm": 0.26939448714256287, + "learning_rate": 4.834000335261516e-05, + "loss": 0.1627, + "step": 11492 + }, + { + "epoch": 0.20499054685549173, + "grad_norm": 0.22917385399341583, + "learning_rate": 4.8339445584863887e-05, + "loss": 0.1589, + "step": 11493 + }, + { + "epoch": 0.20500838297720544, + "grad_norm": 0.5053426623344421, + "learning_rate": 4.833888772664086e-05, + "loss": 0.2124, + "step": 11494 + }, + { + "epoch": 0.20502621909891913, + "grad_norm": 0.29969319701194763, + "learning_rate": 4.8338329777948246e-05, + "loss": 0.2134, + "step": 11495 + }, + { + "epoch": 0.20504405522063282, + "grad_norm": 0.27430886030197144, + "learning_rate": 4.83377717387882e-05, + "loss": 0.2095, + "step": 11496 + }, + { + "epoch": 0.2050618913423465, + "grad_norm": 0.2871789336204529, + "learning_rate": 4.833721360916289e-05, + "loss": 0.1714, + "step": 11497 + }, + { + "epoch": 0.20507972746406022, + "grad_norm": 0.2735663056373596, + "learning_rate": 4.8336655389074485e-05, + "loss": 0.2492, + "step": 11498 + }, + { + "epoch": 0.2050975635857739, + "grad_norm": 0.3597601056098938, + "learning_rate": 4.833609707852514e-05, + "loss": 0.1995, + "step": 11499 + }, + { + "epoch": 0.2051153997074876, + "grad_norm": 0.31295526027679443, + "learning_rate": 4.833553867751703e-05, + "loss": 0.16, + "step": 11500 + }, + { + "epoch": 0.2051332358292013, + "grad_norm": 0.2865414619445801, + "learning_rate": 4.83349801860523e-05, + "loss": 0.1707, + "step": 11501 + }, + { + "epoch": 0.205151071950915, + "grad_norm": 0.3860272467136383, + "learning_rate": 4.833442160413314e-05, + "loss": 0.3146, + "step": 11502 + }, + { + "epoch": 0.2051689080726287, + "grad_norm": 0.2656128406524658, + "learning_rate": 4.83338629317617e-05, + "loss": 0.2305, + "step": 11503 + }, + { + "epoch": 0.20518674419434238, + "grad_norm": 0.33289384841918945, + "learning_rate": 4.833330416894014e-05, + "loss": 0.2213, + "step": 11504 + }, + { + "epoch": 0.20520458031605607, + "grad_norm": 0.26756981015205383, + "learning_rate": 4.833274531567064e-05, + "loss": 0.2241, + "step": 11505 + }, + { + "epoch": 0.20522241643776978, + "grad_norm": 0.25366178154945374, + "learning_rate": 4.833218637195536e-05, + "loss": 0.2162, + "step": 11506 + }, + { + "epoch": 0.20524025255948347, + "grad_norm": 0.2395322471857071, + "learning_rate": 4.833162733779647e-05, + "loss": 0.2169, + "step": 11507 + }, + { + "epoch": 0.20525808868119716, + "grad_norm": 0.27153098583221436, + "learning_rate": 4.833106821319613e-05, + "loss": 0.1839, + "step": 11508 + }, + { + "epoch": 0.20527592480291085, + "grad_norm": 0.2954888343811035, + "learning_rate": 4.833050899815651e-05, + "loss": 0.212, + "step": 11509 + }, + { + "epoch": 0.20529376092462456, + "grad_norm": 0.18858271837234497, + "learning_rate": 4.832994969267978e-05, + "loss": 0.1443, + "step": 11510 + }, + { + "epoch": 0.20531159704633825, + "grad_norm": 0.37491849064826965, + "learning_rate": 4.832939029676811e-05, + "loss": 0.2055, + "step": 11511 + }, + { + "epoch": 0.20532943316805194, + "grad_norm": 0.22919854521751404, + "learning_rate": 4.832883081042366e-05, + "loss": 0.196, + "step": 11512 + }, + { + "epoch": 0.20534726928976563, + "grad_norm": 0.2858999967575073, + "learning_rate": 4.83282712336486e-05, + "loss": 0.1831, + "step": 11513 + }, + { + "epoch": 0.20536510541147932, + "grad_norm": 0.24436570703983307, + "learning_rate": 4.8327711566445116e-05, + "loss": 0.2245, + "step": 11514 + }, + { + "epoch": 0.20538294153319303, + "grad_norm": 0.32226887345314026, + "learning_rate": 4.832715180881536e-05, + "loss": 0.1983, + "step": 11515 + }, + { + "epoch": 0.20540077765490672, + "grad_norm": 0.2781701982021332, + "learning_rate": 4.832659196076151e-05, + "loss": 0.2024, + "step": 11516 + }, + { + "epoch": 0.2054186137766204, + "grad_norm": 0.27700164914131165, + "learning_rate": 4.832603202228573e-05, + "loss": 0.197, + "step": 11517 + }, + { + "epoch": 0.2054364498983341, + "grad_norm": 0.21660085022449493, + "learning_rate": 4.8325471993390195e-05, + "loss": 0.184, + "step": 11518 + }, + { + "epoch": 0.2054542860200478, + "grad_norm": 0.2504339814186096, + "learning_rate": 4.832491187407706e-05, + "loss": 0.1552, + "step": 11519 + }, + { + "epoch": 0.2054721221417615, + "grad_norm": 0.31692907214164734, + "learning_rate": 4.832435166434853e-05, + "loss": 0.1871, + "step": 11520 + }, + { + "epoch": 0.2054899582634752, + "grad_norm": 0.24089834094047546, + "learning_rate": 4.832379136420675e-05, + "loss": 0.1874, + "step": 11521 + }, + { + "epoch": 0.20550779438518887, + "grad_norm": 0.2773512303829193, + "learning_rate": 4.832323097365389e-05, + "loss": 0.185, + "step": 11522 + }, + { + "epoch": 0.2055256305069026, + "grad_norm": 0.3746817409992218, + "learning_rate": 4.8322670492692145e-05, + "loss": 0.2556, + "step": 11523 + }, + { + "epoch": 0.20554346662861628, + "grad_norm": 0.25191113352775574, + "learning_rate": 4.832210992132367e-05, + "loss": 0.2098, + "step": 11524 + }, + { + "epoch": 0.20556130275032997, + "grad_norm": 0.3493598997592926, + "learning_rate": 4.8321549259550636e-05, + "loss": 0.1926, + "step": 11525 + }, + { + "epoch": 0.20557913887204365, + "grad_norm": 0.24469542503356934, + "learning_rate": 4.832098850737522e-05, + "loss": 0.1969, + "step": 11526 + }, + { + "epoch": 0.20559697499375737, + "grad_norm": 0.2523654103279114, + "learning_rate": 4.832042766479961e-05, + "loss": 0.1761, + "step": 11527 + }, + { + "epoch": 0.20561481111547106, + "grad_norm": 0.2907570004463196, + "learning_rate": 4.8319866731825955e-05, + "loss": 0.2217, + "step": 11528 + }, + { + "epoch": 0.20563264723718475, + "grad_norm": 0.23885464668273926, + "learning_rate": 4.831930570845645e-05, + "loss": 0.1395, + "step": 11529 + }, + { + "epoch": 0.20565048335889843, + "grad_norm": 0.2950996160507202, + "learning_rate": 4.831874459469326e-05, + "loss": 0.1898, + "step": 11530 + }, + { + "epoch": 0.20566831948061215, + "grad_norm": 0.24162188172340393, + "learning_rate": 4.831818339053856e-05, + "loss": 0.1902, + "step": 11531 + }, + { + "epoch": 0.20568615560232584, + "grad_norm": 0.2928565442562103, + "learning_rate": 4.831762209599453e-05, + "loss": 0.2269, + "step": 11532 + }, + { + "epoch": 0.20570399172403953, + "grad_norm": 0.23402349650859833, + "learning_rate": 4.8317060711063345e-05, + "loss": 0.1853, + "step": 11533 + }, + { + "epoch": 0.2057218278457532, + "grad_norm": 0.28118249773979187, + "learning_rate": 4.831649923574717e-05, + "loss": 0.2051, + "step": 11534 + }, + { + "epoch": 0.2057396639674669, + "grad_norm": 0.35088911652565, + "learning_rate": 4.831593767004821e-05, + "loss": 0.1741, + "step": 11535 + }, + { + "epoch": 0.20575750008918062, + "grad_norm": 0.4024946689605713, + "learning_rate": 4.8315376013968606e-05, + "loss": 0.2107, + "step": 11536 + }, + { + "epoch": 0.2057753362108943, + "grad_norm": 0.3792724311351776, + "learning_rate": 4.8314814267510554e-05, + "loss": 0.2424, + "step": 11537 + }, + { + "epoch": 0.205793172332608, + "grad_norm": 0.3693736493587494, + "learning_rate": 4.8314252430676234e-05, + "loss": 0.2483, + "step": 11538 + }, + { + "epoch": 0.20581100845432168, + "grad_norm": 0.240234836935997, + "learning_rate": 4.831369050346781e-05, + "loss": 0.1542, + "step": 11539 + }, + { + "epoch": 0.2058288445760354, + "grad_norm": 0.24847690761089325, + "learning_rate": 4.8313128485887474e-05, + "loss": 0.2104, + "step": 11540 + }, + { + "epoch": 0.20584668069774908, + "grad_norm": 0.2729666531085968, + "learning_rate": 4.83125663779374e-05, + "loss": 0.2241, + "step": 11541 + }, + { + "epoch": 0.20586451681946277, + "grad_norm": 0.4091082811355591, + "learning_rate": 4.8312004179619766e-05, + "loss": 0.224, + "step": 11542 + }, + { + "epoch": 0.20588235294117646, + "grad_norm": 0.3012774586677551, + "learning_rate": 4.831144189093676e-05, + "loss": 0.2253, + "step": 11543 + }, + { + "epoch": 0.20590018906289018, + "grad_norm": 0.25534123182296753, + "learning_rate": 4.8310879511890546e-05, + "loss": 0.2236, + "step": 11544 + }, + { + "epoch": 0.20591802518460386, + "grad_norm": 0.21891318261623383, + "learning_rate": 4.8310317042483314e-05, + "loss": 0.1361, + "step": 11545 + }, + { + "epoch": 0.20593586130631755, + "grad_norm": 0.34830811619758606, + "learning_rate": 4.830975448271724e-05, + "loss": 0.2053, + "step": 11546 + }, + { + "epoch": 0.20595369742803124, + "grad_norm": 0.3653646409511566, + "learning_rate": 4.830919183259451e-05, + "loss": 0.1926, + "step": 11547 + }, + { + "epoch": 0.20597153354974496, + "grad_norm": 0.24574635922908783, + "learning_rate": 4.83086290921173e-05, + "loss": 0.1726, + "step": 11548 + }, + { + "epoch": 0.20598936967145864, + "grad_norm": 0.5150383710861206, + "learning_rate": 4.830806626128779e-05, + "loss": 0.1787, + "step": 11549 + }, + { + "epoch": 0.20600720579317233, + "grad_norm": 0.2258683145046234, + "learning_rate": 4.830750334010817e-05, + "loss": 0.1716, + "step": 11550 + }, + { + "epoch": 0.20602504191488602, + "grad_norm": 0.3205817639827728, + "learning_rate": 4.8306940328580614e-05, + "loss": 0.2262, + "step": 11551 + }, + { + "epoch": 0.2060428780365997, + "grad_norm": 0.30511584877967834, + "learning_rate": 4.8306377226707304e-05, + "loss": 0.1959, + "step": 11552 + }, + { + "epoch": 0.20606071415831342, + "grad_norm": 0.4284152388572693, + "learning_rate": 4.830581403449043e-05, + "loss": 0.239, + "step": 11553 + }, + { + "epoch": 0.2060785502800271, + "grad_norm": 0.40363168716430664, + "learning_rate": 4.830525075193218e-05, + "loss": 0.1761, + "step": 11554 + }, + { + "epoch": 0.2060963864017408, + "grad_norm": 0.3364522159099579, + "learning_rate": 4.830468737903471e-05, + "loss": 0.2448, + "step": 11555 + }, + { + "epoch": 0.2061142225234545, + "grad_norm": 0.3168528974056244, + "learning_rate": 4.830412391580024e-05, + "loss": 0.1427, + "step": 11556 + }, + { + "epoch": 0.2061320586451682, + "grad_norm": 0.2818145453929901, + "learning_rate": 4.830356036223093e-05, + "loss": 0.2116, + "step": 11557 + }, + { + "epoch": 0.2061498947668819, + "grad_norm": 0.2617489695549011, + "learning_rate": 4.8302996718328965e-05, + "loss": 0.2497, + "step": 11558 + }, + { + "epoch": 0.20616773088859558, + "grad_norm": 0.2629631459712982, + "learning_rate": 4.830243298409655e-05, + "loss": 0.2034, + "step": 11559 + }, + { + "epoch": 0.20618556701030927, + "grad_norm": 0.24528156220912933, + "learning_rate": 4.830186915953584e-05, + "loss": 0.2211, + "step": 11560 + }, + { + "epoch": 0.20620340313202298, + "grad_norm": 0.2015800029039383, + "learning_rate": 4.830130524464904e-05, + "loss": 0.1736, + "step": 11561 + }, + { + "epoch": 0.20622123925373667, + "grad_norm": 0.3551153242588043, + "learning_rate": 4.830074123943834e-05, + "loss": 0.2888, + "step": 11562 + }, + { + "epoch": 0.20623907537545036, + "grad_norm": 0.3136424124240875, + "learning_rate": 4.830017714390592e-05, + "loss": 0.1885, + "step": 11563 + }, + { + "epoch": 0.20625691149716405, + "grad_norm": 0.32659777998924255, + "learning_rate": 4.829961295805396e-05, + "loss": 0.2399, + "step": 11564 + }, + { + "epoch": 0.20627474761887776, + "grad_norm": 0.2697278559207916, + "learning_rate": 4.8299048681884654e-05, + "loss": 0.2096, + "step": 11565 + }, + { + "epoch": 0.20629258374059145, + "grad_norm": 0.3694291114807129, + "learning_rate": 4.8298484315400194e-05, + "loss": 0.2328, + "step": 11566 + }, + { + "epoch": 0.20631041986230514, + "grad_norm": 0.23069000244140625, + "learning_rate": 4.8297919858602756e-05, + "loss": 0.1868, + "step": 11567 + }, + { + "epoch": 0.20632825598401883, + "grad_norm": 0.28382018208503723, + "learning_rate": 4.8297355311494536e-05, + "loss": 0.2109, + "step": 11568 + }, + { + "epoch": 0.20634609210573254, + "grad_norm": 0.28953275084495544, + "learning_rate": 4.829679067407772e-05, + "loss": 0.1856, + "step": 11569 + }, + { + "epoch": 0.20636392822744623, + "grad_norm": 0.26258930563926697, + "learning_rate": 4.8296225946354494e-05, + "loss": 0.2311, + "step": 11570 + }, + { + "epoch": 0.20638176434915992, + "grad_norm": 0.34757038950920105, + "learning_rate": 4.829566112832705e-05, + "loss": 0.1777, + "step": 11571 + }, + { + "epoch": 0.2063996004708736, + "grad_norm": 0.220563605427742, + "learning_rate": 4.8295096219997584e-05, + "loss": 0.1982, + "step": 11572 + }, + { + "epoch": 0.2064174365925873, + "grad_norm": 0.33619746565818787, + "learning_rate": 4.8294531221368274e-05, + "loss": 0.173, + "step": 11573 + }, + { + "epoch": 0.206435272714301, + "grad_norm": 0.3179114758968353, + "learning_rate": 4.8293966132441315e-05, + "loss": 0.1851, + "step": 11574 + }, + { + "epoch": 0.2064531088360147, + "grad_norm": 0.2530944347381592, + "learning_rate": 4.8293400953218896e-05, + "loss": 0.2173, + "step": 11575 + }, + { + "epoch": 0.2064709449577284, + "grad_norm": 0.260496586561203, + "learning_rate": 4.829283568370321e-05, + "loss": 0.1996, + "step": 11576 + }, + { + "epoch": 0.20648878107944207, + "grad_norm": 0.2651001214981079, + "learning_rate": 4.829227032389646e-05, + "loss": 0.2357, + "step": 11577 + }, + { + "epoch": 0.2065066172011558, + "grad_norm": 0.2797831594944, + "learning_rate": 4.8291704873800816e-05, + "loss": 0.2425, + "step": 11578 + }, + { + "epoch": 0.20652445332286948, + "grad_norm": 0.2605598568916321, + "learning_rate": 4.829113933341847e-05, + "loss": 0.2215, + "step": 11579 + }, + { + "epoch": 0.20654228944458317, + "grad_norm": 0.18506459891796112, + "learning_rate": 4.829057370275163e-05, + "loss": 0.1737, + "step": 11580 + }, + { + "epoch": 0.20656012556629685, + "grad_norm": 0.33147522807121277, + "learning_rate": 4.829000798180249e-05, + "loss": 0.231, + "step": 11581 + }, + { + "epoch": 0.20657796168801057, + "grad_norm": 0.23073697090148926, + "learning_rate": 4.828944217057323e-05, + "loss": 0.1762, + "step": 11582 + }, + { + "epoch": 0.20659579780972426, + "grad_norm": 0.2841963768005371, + "learning_rate": 4.828887626906604e-05, + "loss": 0.1965, + "step": 11583 + }, + { + "epoch": 0.20661363393143795, + "grad_norm": 0.23192636668682098, + "learning_rate": 4.828831027728313e-05, + "loss": 0.2083, + "step": 11584 + }, + { + "epoch": 0.20663147005315163, + "grad_norm": 0.3724508285522461, + "learning_rate": 4.828774419522669e-05, + "loss": 0.2254, + "step": 11585 + }, + { + "epoch": 0.20664930617486535, + "grad_norm": 0.3201790153980255, + "learning_rate": 4.8287178022898906e-05, + "loss": 0.2069, + "step": 11586 + }, + { + "epoch": 0.20666714229657904, + "grad_norm": 0.20933395624160767, + "learning_rate": 4.8286611760301974e-05, + "loss": 0.1612, + "step": 11587 + }, + { + "epoch": 0.20668497841829273, + "grad_norm": 0.30476272106170654, + "learning_rate": 4.8286045407438096e-05, + "loss": 0.2198, + "step": 11588 + }, + { + "epoch": 0.2067028145400064, + "grad_norm": 0.2761203646659851, + "learning_rate": 4.828547896430946e-05, + "loss": 0.1914, + "step": 11589 + }, + { + "epoch": 0.20672065066172013, + "grad_norm": 0.3802258372306824, + "learning_rate": 4.828491243091827e-05, + "loss": 0.2512, + "step": 11590 + }, + { + "epoch": 0.20673848678343382, + "grad_norm": 0.22877644002437592, + "learning_rate": 4.8284345807266706e-05, + "loss": 0.1768, + "step": 11591 + }, + { + "epoch": 0.2067563229051475, + "grad_norm": 0.2869945168495178, + "learning_rate": 4.8283779093356986e-05, + "loss": 0.215, + "step": 11592 + }, + { + "epoch": 0.2067741590268612, + "grad_norm": 0.3268187642097473, + "learning_rate": 4.828321228919129e-05, + "loss": 0.2584, + "step": 11593 + }, + { + "epoch": 0.20679199514857488, + "grad_norm": 0.3540118634700775, + "learning_rate": 4.828264539477183e-05, + "loss": 0.1475, + "step": 11594 + }, + { + "epoch": 0.2068098312702886, + "grad_norm": 0.3449714481830597, + "learning_rate": 4.828207841010078e-05, + "loss": 0.2457, + "step": 11595 + }, + { + "epoch": 0.20682766739200228, + "grad_norm": 0.2848774194717407, + "learning_rate": 4.828151133518037e-05, + "loss": 0.1825, + "step": 11596 + }, + { + "epoch": 0.20684550351371597, + "grad_norm": 0.3935084640979767, + "learning_rate": 4.828094417001277e-05, + "loss": 0.1661, + "step": 11597 + }, + { + "epoch": 0.20686333963542966, + "grad_norm": 0.2558848261833191, + "learning_rate": 4.82803769146002e-05, + "loss": 0.1652, + "step": 11598 + }, + { + "epoch": 0.20688117575714338, + "grad_norm": 0.38729530572891235, + "learning_rate": 4.827980956894484e-05, + "loss": 0.2168, + "step": 11599 + }, + { + "epoch": 0.20689901187885706, + "grad_norm": 0.22212867438793182, + "learning_rate": 4.82792421330489e-05, + "loss": 0.1866, + "step": 11600 + }, + { + "epoch": 0.20691684800057075, + "grad_norm": 0.3084488809108734, + "learning_rate": 4.827867460691456e-05, + "loss": 0.2304, + "step": 11601 + }, + { + "epoch": 0.20693468412228444, + "grad_norm": 0.2829030454158783, + "learning_rate": 4.8278106990544056e-05, + "loss": 0.1894, + "step": 11602 + }, + { + "epoch": 0.20695252024399816, + "grad_norm": 0.32071101665496826, + "learning_rate": 4.8277539283939566e-05, + "loss": 0.1733, + "step": 11603 + }, + { + "epoch": 0.20697035636571184, + "grad_norm": 0.2623653709888458, + "learning_rate": 4.82769714871033e-05, + "loss": 0.1994, + "step": 11604 + }, + { + "epoch": 0.20698819248742553, + "grad_norm": 0.2787366509437561, + "learning_rate": 4.827640360003745e-05, + "loss": 0.2468, + "step": 11605 + }, + { + "epoch": 0.20700602860913922, + "grad_norm": 0.20443102717399597, + "learning_rate": 4.827583562274421e-05, + "loss": 0.2222, + "step": 11606 + }, + { + "epoch": 0.20702386473085294, + "grad_norm": 0.2780798673629761, + "learning_rate": 4.82752675552258e-05, + "loss": 0.2002, + "step": 11607 + }, + { + "epoch": 0.20704170085256662, + "grad_norm": 0.2777286469936371, + "learning_rate": 4.827469939748441e-05, + "loss": 0.2184, + "step": 11608 + }, + { + "epoch": 0.2070595369742803, + "grad_norm": 0.31433841586112976, + "learning_rate": 4.827413114952225e-05, + "loss": 0.1627, + "step": 11609 + }, + { + "epoch": 0.207077373095994, + "grad_norm": 0.4145423471927643, + "learning_rate": 4.827356281134152e-05, + "loss": 0.1787, + "step": 11610 + }, + { + "epoch": 0.20709520921770772, + "grad_norm": 0.36890047788619995, + "learning_rate": 4.8272994382944416e-05, + "loss": 0.2169, + "step": 11611 + }, + { + "epoch": 0.2071130453394214, + "grad_norm": 0.23022206127643585, + "learning_rate": 4.827242586433315e-05, + "loss": 0.1788, + "step": 11612 + }, + { + "epoch": 0.2071308814611351, + "grad_norm": 0.28333592414855957, + "learning_rate": 4.8271857255509925e-05, + "loss": 0.1839, + "step": 11613 + }, + { + "epoch": 0.20714871758284878, + "grad_norm": 0.3183026611804962, + "learning_rate": 4.827128855647694e-05, + "loss": 0.226, + "step": 11614 + }, + { + "epoch": 0.20716655370456247, + "grad_norm": 0.29933854937553406, + "learning_rate": 4.827071976723641e-05, + "loss": 0.226, + "step": 11615 + }, + { + "epoch": 0.20718438982627618, + "grad_norm": 0.3120318651199341, + "learning_rate": 4.8270150887790533e-05, + "loss": 0.2724, + "step": 11616 + }, + { + "epoch": 0.20720222594798987, + "grad_norm": 0.24469727277755737, + "learning_rate": 4.82695819181415e-05, + "loss": 0.1706, + "step": 11617 + }, + { + "epoch": 0.20722006206970356, + "grad_norm": 0.2674146294593811, + "learning_rate": 4.8269012858291554e-05, + "loss": 0.193, + "step": 11618 + }, + { + "epoch": 0.20723789819141725, + "grad_norm": 0.3089778423309326, + "learning_rate": 4.826844370824286e-05, + "loss": 0.1956, + "step": 11619 + }, + { + "epoch": 0.20725573431313096, + "grad_norm": 0.2859371602535248, + "learning_rate": 4.8267874467997644e-05, + "loss": 0.2225, + "step": 11620 + }, + { + "epoch": 0.20727357043484465, + "grad_norm": 0.28218528628349304, + "learning_rate": 4.826730513755811e-05, + "loss": 0.1991, + "step": 11621 + }, + { + "epoch": 0.20729140655655834, + "grad_norm": 0.2642873525619507, + "learning_rate": 4.826673571692646e-05, + "loss": 0.1814, + "step": 11622 + }, + { + "epoch": 0.20730924267827203, + "grad_norm": 0.3275371491909027, + "learning_rate": 4.826616620610492e-05, + "loss": 0.2512, + "step": 11623 + }, + { + "epoch": 0.20732707879998574, + "grad_norm": 0.3450893759727478, + "learning_rate": 4.8265596605095675e-05, + "loss": 0.2632, + "step": 11624 + }, + { + "epoch": 0.20734491492169943, + "grad_norm": 0.2679916024208069, + "learning_rate": 4.8265026913900944e-05, + "loss": 0.221, + "step": 11625 + }, + { + "epoch": 0.20736275104341312, + "grad_norm": 0.24468261003494263, + "learning_rate": 4.8264457132522934e-05, + "loss": 0.1941, + "step": 11626 + }, + { + "epoch": 0.2073805871651268, + "grad_norm": 0.328224241733551, + "learning_rate": 4.8263887260963846e-05, + "loss": 0.2185, + "step": 11627 + }, + { + "epoch": 0.20739842328684052, + "grad_norm": 0.23001453280448914, + "learning_rate": 4.826331729922591e-05, + "loss": 0.1848, + "step": 11628 + }, + { + "epoch": 0.2074162594085542, + "grad_norm": 0.3273157775402069, + "learning_rate": 4.826274724731131e-05, + "loss": 0.2393, + "step": 11629 + }, + { + "epoch": 0.2074340955302679, + "grad_norm": 0.3130827248096466, + "learning_rate": 4.8262177105222265e-05, + "loss": 0.2294, + "step": 11630 + }, + { + "epoch": 0.20745193165198159, + "grad_norm": 0.21297885477542877, + "learning_rate": 4.8261606872960994e-05, + "loss": 0.1678, + "step": 11631 + }, + { + "epoch": 0.20746976777369527, + "grad_norm": 0.28783902525901794, + "learning_rate": 4.826103655052969e-05, + "loss": 0.2266, + "step": 11632 + }, + { + "epoch": 0.207487603895409, + "grad_norm": 0.2523465156555176, + "learning_rate": 4.826046613793059e-05, + "loss": 0.2163, + "step": 11633 + }, + { + "epoch": 0.20750544001712268, + "grad_norm": 0.2851579189300537, + "learning_rate": 4.825989563516588e-05, + "loss": 0.2245, + "step": 11634 + }, + { + "epoch": 0.20752327613883637, + "grad_norm": 0.19099538028240204, + "learning_rate": 4.825932504223778e-05, + "loss": 0.1646, + "step": 11635 + }, + { + "epoch": 0.20754111226055005, + "grad_norm": 0.32935628294944763, + "learning_rate": 4.825875435914851e-05, + "loss": 0.2004, + "step": 11636 + }, + { + "epoch": 0.20755894838226377, + "grad_norm": 0.3129284083843231, + "learning_rate": 4.8258183585900264e-05, + "loss": 0.2332, + "step": 11637 + }, + { + "epoch": 0.20757678450397746, + "grad_norm": 0.3103378117084503, + "learning_rate": 4.825761272249527e-05, + "loss": 0.265, + "step": 11638 + }, + { + "epoch": 0.20759462062569115, + "grad_norm": 0.2517412602901459, + "learning_rate": 4.825704176893574e-05, + "loss": 0.2157, + "step": 11639 + }, + { + "epoch": 0.20761245674740483, + "grad_norm": 0.3071826696395874, + "learning_rate": 4.8256470725223876e-05, + "loss": 0.174, + "step": 11640 + }, + { + "epoch": 0.20763029286911855, + "grad_norm": 0.29822486639022827, + "learning_rate": 4.825589959136191e-05, + "loss": 0.2022, + "step": 11641 + }, + { + "epoch": 0.20764812899083224, + "grad_norm": 0.37174782156944275, + "learning_rate": 4.825532836735204e-05, + "loss": 0.226, + "step": 11642 + }, + { + "epoch": 0.20766596511254592, + "grad_norm": 0.26324722170829773, + "learning_rate": 4.825475705319648e-05, + "loss": 0.2499, + "step": 11643 + }, + { + "epoch": 0.2076838012342596, + "grad_norm": 0.2306855171918869, + "learning_rate": 4.8254185648897455e-05, + "loss": 0.1702, + "step": 11644 + }, + { + "epoch": 0.20770163735597333, + "grad_norm": 0.2013261318206787, + "learning_rate": 4.825361415445717e-05, + "loss": 0.1269, + "step": 11645 + }, + { + "epoch": 0.20771947347768702, + "grad_norm": 0.31223180890083313, + "learning_rate": 4.8253042569877846e-05, + "loss": 0.1959, + "step": 11646 + }, + { + "epoch": 0.2077373095994007, + "grad_norm": 0.22666801512241364, + "learning_rate": 4.825247089516171e-05, + "loss": 0.1778, + "step": 11647 + }, + { + "epoch": 0.2077551457211144, + "grad_norm": 0.386925607919693, + "learning_rate": 4.825189913031095e-05, + "loss": 0.2875, + "step": 11648 + }, + { + "epoch": 0.2077729818428281, + "grad_norm": 0.3336915075778961, + "learning_rate": 4.82513272753278e-05, + "loss": 0.248, + "step": 11649 + }, + { + "epoch": 0.2077908179645418, + "grad_norm": 0.3365747332572937, + "learning_rate": 4.825075533021448e-05, + "loss": 0.1855, + "step": 11650 + }, + { + "epoch": 0.20780865408625548, + "grad_norm": 0.3874429762363434, + "learning_rate": 4.82501832949732e-05, + "loss": 0.1836, + "step": 11651 + }, + { + "epoch": 0.20782649020796917, + "grad_norm": 0.2855224311351776, + "learning_rate": 4.824961116960618e-05, + "loss": 0.2393, + "step": 11652 + }, + { + "epoch": 0.20784432632968286, + "grad_norm": 0.2667388319969177, + "learning_rate": 4.8249038954115634e-05, + "loss": 0.1979, + "step": 11653 + }, + { + "epoch": 0.20786216245139658, + "grad_norm": 0.35553085803985596, + "learning_rate": 4.824846664850379e-05, + "loss": 0.209, + "step": 11654 + }, + { + "epoch": 0.20787999857311026, + "grad_norm": 0.25695550441741943, + "learning_rate": 4.824789425277285e-05, + "loss": 0.2067, + "step": 11655 + }, + { + "epoch": 0.20789783469482395, + "grad_norm": 0.2398696094751358, + "learning_rate": 4.824732176692505e-05, + "loss": 0.2035, + "step": 11656 + }, + { + "epoch": 0.20791567081653764, + "grad_norm": 0.2993643581867218, + "learning_rate": 4.824674919096259e-05, + "loss": 0.2223, + "step": 11657 + }, + { + "epoch": 0.20793350693825136, + "grad_norm": 0.3369992673397064, + "learning_rate": 4.8246176524887717e-05, + "loss": 0.1716, + "step": 11658 + }, + { + "epoch": 0.20795134305996504, + "grad_norm": 0.4276783764362335, + "learning_rate": 4.8245603768702626e-05, + "loss": 0.2464, + "step": 11659 + }, + { + "epoch": 0.20796917918167873, + "grad_norm": 0.21072588860988617, + "learning_rate": 4.824503092240955e-05, + "loss": 0.2085, + "step": 11660 + }, + { + "epoch": 0.20798701530339242, + "grad_norm": 0.379810631275177, + "learning_rate": 4.82444579860107e-05, + "loss": 0.2069, + "step": 11661 + }, + { + "epoch": 0.20800485142510614, + "grad_norm": 0.19832254946231842, + "learning_rate": 4.82438849595083e-05, + "loss": 0.1803, + "step": 11662 + }, + { + "epoch": 0.20802268754681982, + "grad_norm": 0.250905305147171, + "learning_rate": 4.824331184290458e-05, + "loss": 0.2263, + "step": 11663 + }, + { + "epoch": 0.2080405236685335, + "grad_norm": 0.2300119251012802, + "learning_rate": 4.824273863620176e-05, + "loss": 0.1935, + "step": 11664 + }, + { + "epoch": 0.2080583597902472, + "grad_norm": 0.2527783513069153, + "learning_rate": 4.8242165339402044e-05, + "loss": 0.2127, + "step": 11665 + }, + { + "epoch": 0.20807619591196092, + "grad_norm": 0.21021036803722382, + "learning_rate": 4.824159195250768e-05, + "loss": 0.18, + "step": 11666 + }, + { + "epoch": 0.2080940320336746, + "grad_norm": 0.3651825487613678, + "learning_rate": 4.824101847552087e-05, + "loss": 0.227, + "step": 11667 + }, + { + "epoch": 0.2081118681553883, + "grad_norm": 0.2813571095466614, + "learning_rate": 4.824044490844385e-05, + "loss": 0.2167, + "step": 11668 + }, + { + "epoch": 0.20812970427710198, + "grad_norm": 0.24911226332187653, + "learning_rate": 4.8239871251278835e-05, + "loss": 0.2084, + "step": 11669 + }, + { + "epoch": 0.2081475403988157, + "grad_norm": 0.2716328799724579, + "learning_rate": 4.8239297504028056e-05, + "loss": 0.2106, + "step": 11670 + }, + { + "epoch": 0.20816537652052938, + "grad_norm": 0.2395828366279602, + "learning_rate": 4.823872366669373e-05, + "loss": 0.1997, + "step": 11671 + }, + { + "epoch": 0.20818321264224307, + "grad_norm": 0.23245181143283844, + "learning_rate": 4.823814973927809e-05, + "loss": 0.2127, + "step": 11672 + }, + { + "epoch": 0.20820104876395676, + "grad_norm": 0.33587050437927246, + "learning_rate": 4.823757572178334e-05, + "loss": 0.2432, + "step": 11673 + }, + { + "epoch": 0.20821888488567045, + "grad_norm": 0.2422908991575241, + "learning_rate": 4.823700161421174e-05, + "loss": 0.196, + "step": 11674 + }, + { + "epoch": 0.20823672100738416, + "grad_norm": 0.2858142554759979, + "learning_rate": 4.823642741656549e-05, + "loss": 0.2166, + "step": 11675 + }, + { + "epoch": 0.20825455712909785, + "grad_norm": 0.30445921421051025, + "learning_rate": 4.823585312884682e-05, + "loss": 0.1284, + "step": 11676 + }, + { + "epoch": 0.20827239325081154, + "grad_norm": 0.2217496782541275, + "learning_rate": 4.823527875105796e-05, + "loss": 0.2001, + "step": 11677 + }, + { + "epoch": 0.20829022937252523, + "grad_norm": 0.6687961220741272, + "learning_rate": 4.823470428320113e-05, + "loss": 0.2046, + "step": 11678 + }, + { + "epoch": 0.20830806549423894, + "grad_norm": 0.2101033627986908, + "learning_rate": 4.823412972527856e-05, + "loss": 0.1793, + "step": 11679 + }, + { + "epoch": 0.20832590161595263, + "grad_norm": 0.24649551510810852, + "learning_rate": 4.8233555077292484e-05, + "loss": 0.1993, + "step": 11680 + }, + { + "epoch": 0.20834373773766632, + "grad_norm": 0.29888081550598145, + "learning_rate": 4.823298033924512e-05, + "loss": 0.1844, + "step": 11681 + }, + { + "epoch": 0.20836157385938, + "grad_norm": 0.25010573863983154, + "learning_rate": 4.82324055111387e-05, + "loss": 0.2259, + "step": 11682 + }, + { + "epoch": 0.20837940998109372, + "grad_norm": 0.3335828185081482, + "learning_rate": 4.823183059297546e-05, + "loss": 0.2108, + "step": 11683 + }, + { + "epoch": 0.2083972461028074, + "grad_norm": 0.2499372661113739, + "learning_rate": 4.823125558475761e-05, + "loss": 0.1706, + "step": 11684 + }, + { + "epoch": 0.2084150822245211, + "grad_norm": 0.2664951682090759, + "learning_rate": 4.823068048648739e-05, + "loss": 0.1762, + "step": 11685 + }, + { + "epoch": 0.20843291834623479, + "grad_norm": 0.34128519892692566, + "learning_rate": 4.8230105298167034e-05, + "loss": 0.2474, + "step": 11686 + }, + { + "epoch": 0.2084507544679485, + "grad_norm": 0.2763022184371948, + "learning_rate": 4.822953001979876e-05, + "loss": 0.1702, + "step": 11687 + }, + { + "epoch": 0.2084685905896622, + "grad_norm": 0.23775845766067505, + "learning_rate": 4.822895465138481e-05, + "loss": 0.2059, + "step": 11688 + }, + { + "epoch": 0.20848642671137588, + "grad_norm": 0.2731362581253052, + "learning_rate": 4.822837919292741e-05, + "loss": 0.2283, + "step": 11689 + }, + { + "epoch": 0.20850426283308957, + "grad_norm": 0.22162795066833496, + "learning_rate": 4.8227803644428786e-05, + "loss": 0.1441, + "step": 11690 + }, + { + "epoch": 0.20852209895480328, + "grad_norm": 0.24297797679901123, + "learning_rate": 4.8227228005891176e-05, + "loss": 0.2219, + "step": 11691 + }, + { + "epoch": 0.20853993507651697, + "grad_norm": 0.26579803228378296, + "learning_rate": 4.822665227731681e-05, + "loss": 0.1966, + "step": 11692 + }, + { + "epoch": 0.20855777119823066, + "grad_norm": 0.26142609119415283, + "learning_rate": 4.822607645870791e-05, + "loss": 0.2179, + "step": 11693 + }, + { + "epoch": 0.20857560731994434, + "grad_norm": 0.2581977844238281, + "learning_rate": 4.8225500550066714e-05, + "loss": 0.2093, + "step": 11694 + }, + { + "epoch": 0.20859344344165803, + "grad_norm": 0.4347952902317047, + "learning_rate": 4.822492455139546e-05, + "loss": 0.1717, + "step": 11695 + }, + { + "epoch": 0.20861127956337175, + "grad_norm": 0.21105089783668518, + "learning_rate": 4.822434846269638e-05, + "loss": 0.1807, + "step": 11696 + }, + { + "epoch": 0.20862911568508544, + "grad_norm": 0.2665728032588959, + "learning_rate": 4.82237722839717e-05, + "loss": 0.2004, + "step": 11697 + }, + { + "epoch": 0.20864695180679912, + "grad_norm": 0.3084055483341217, + "learning_rate": 4.822319601522366e-05, + "loss": 0.1922, + "step": 11698 + }, + { + "epoch": 0.2086647879285128, + "grad_norm": 0.2810305953025818, + "learning_rate": 4.822261965645449e-05, + "loss": 0.1687, + "step": 11699 + }, + { + "epoch": 0.20868262405022653, + "grad_norm": 0.37187448143959045, + "learning_rate": 4.822204320766642e-05, + "loss": 0.1858, + "step": 11700 + }, + { + "epoch": 0.20870046017194022, + "grad_norm": 0.39211082458496094, + "learning_rate": 4.82214666688617e-05, + "loss": 0.2323, + "step": 11701 + }, + { + "epoch": 0.2087182962936539, + "grad_norm": 0.25368061661720276, + "learning_rate": 4.822089004004255e-05, + "loss": 0.1602, + "step": 11702 + }, + { + "epoch": 0.2087361324153676, + "grad_norm": 0.40686118602752686, + "learning_rate": 4.822031332121121e-05, + "loss": 0.2694, + "step": 11703 + }, + { + "epoch": 0.2087539685370813, + "grad_norm": 0.2769964933395386, + "learning_rate": 4.821973651236992e-05, + "loss": 0.1704, + "step": 11704 + }, + { + "epoch": 0.208771804658795, + "grad_norm": 0.32989227771759033, + "learning_rate": 4.82191596135209e-05, + "loss": 0.1781, + "step": 11705 + }, + { + "epoch": 0.20878964078050868, + "grad_norm": 0.32637158036231995, + "learning_rate": 4.8218582624666406e-05, + "loss": 0.1668, + "step": 11706 + }, + { + "epoch": 0.20880747690222237, + "grad_norm": 0.2789934575557709, + "learning_rate": 4.821800554580866e-05, + "loss": 0.2033, + "step": 11707 + }, + { + "epoch": 0.2088253130239361, + "grad_norm": 0.28855159878730774, + "learning_rate": 4.8217428376949916e-05, + "loss": 0.2071, + "step": 11708 + }, + { + "epoch": 0.20884314914564978, + "grad_norm": 0.21203669905662537, + "learning_rate": 4.82168511180924e-05, + "loss": 0.1961, + "step": 11709 + }, + { + "epoch": 0.20886098526736346, + "grad_norm": 0.25486427545547485, + "learning_rate": 4.821627376923834e-05, + "loss": 0.189, + "step": 11710 + }, + { + "epoch": 0.20887882138907715, + "grad_norm": 0.24313603341579437, + "learning_rate": 4.8215696330389994e-05, + "loss": 0.2032, + "step": 11711 + }, + { + "epoch": 0.20889665751079087, + "grad_norm": 0.2916683852672577, + "learning_rate": 4.821511880154958e-05, + "loss": 0.1774, + "step": 11712 + }, + { + "epoch": 0.20891449363250456, + "grad_norm": 0.29036620259284973, + "learning_rate": 4.8214541182719355e-05, + "loss": 0.2242, + "step": 11713 + }, + { + "epoch": 0.20893232975421824, + "grad_norm": 0.3922606110572815, + "learning_rate": 4.8213963473901546e-05, + "loss": 0.2757, + "step": 11714 + }, + { + "epoch": 0.20895016587593193, + "grad_norm": 0.2874530553817749, + "learning_rate": 4.82133856750984e-05, + "loss": 0.2539, + "step": 11715 + }, + { + "epoch": 0.20896800199764562, + "grad_norm": 0.21845576167106628, + "learning_rate": 4.821280778631215e-05, + "loss": 0.1825, + "step": 11716 + }, + { + "epoch": 0.20898583811935934, + "grad_norm": 0.30289462208747864, + "learning_rate": 4.821222980754504e-05, + "loss": 0.1981, + "step": 11717 + }, + { + "epoch": 0.20900367424107302, + "grad_norm": 0.29000192880630493, + "learning_rate": 4.821165173879932e-05, + "loss": 0.2164, + "step": 11718 + }, + { + "epoch": 0.2090215103627867, + "grad_norm": 0.2553132176399231, + "learning_rate": 4.8211073580077215e-05, + "loss": 0.2171, + "step": 11719 + }, + { + "epoch": 0.2090393464845004, + "grad_norm": 0.3072320520877838, + "learning_rate": 4.821049533138097e-05, + "loss": 0.2263, + "step": 11720 + }, + { + "epoch": 0.20905718260621411, + "grad_norm": 0.27789634466171265, + "learning_rate": 4.820991699271282e-05, + "loss": 0.188, + "step": 11721 + }, + { + "epoch": 0.2090750187279278, + "grad_norm": 0.21587303280830383, + "learning_rate": 4.820933856407503e-05, + "loss": 0.1947, + "step": 11722 + }, + { + "epoch": 0.2090928548496415, + "grad_norm": 0.3030090630054474, + "learning_rate": 4.820876004546982e-05, + "loss": 0.2017, + "step": 11723 + }, + { + "epoch": 0.20911069097135518, + "grad_norm": 0.19152145087718964, + "learning_rate": 4.8208181436899443e-05, + "loss": 0.1621, + "step": 11724 + }, + { + "epoch": 0.2091285270930689, + "grad_norm": 0.2322104126214981, + "learning_rate": 4.8207602738366136e-05, + "loss": 0.1784, + "step": 11725 + }, + { + "epoch": 0.20914636321478258, + "grad_norm": 0.24238301813602448, + "learning_rate": 4.820702394987214e-05, + "loss": 0.192, + "step": 11726 + }, + { + "epoch": 0.20916419933649627, + "grad_norm": 0.25751087069511414, + "learning_rate": 4.8206445071419714e-05, + "loss": 0.1923, + "step": 11727 + }, + { + "epoch": 0.20918203545820996, + "grad_norm": 0.3497230112552643, + "learning_rate": 4.820586610301109e-05, + "loss": 0.1868, + "step": 11728 + }, + { + "epoch": 0.20919987157992367, + "grad_norm": 0.26342979073524475, + "learning_rate": 4.820528704464851e-05, + "loss": 0.2047, + "step": 11729 + }, + { + "epoch": 0.20921770770163736, + "grad_norm": 0.3436564803123474, + "learning_rate": 4.8204707896334224e-05, + "loss": 0.1599, + "step": 11730 + }, + { + "epoch": 0.20923554382335105, + "grad_norm": 0.1855187714099884, + "learning_rate": 4.820412865807048e-05, + "loss": 0.1529, + "step": 11731 + }, + { + "epoch": 0.20925337994506474, + "grad_norm": 0.29702046513557434, + "learning_rate": 4.820354932985951e-05, + "loss": 0.2061, + "step": 11732 + }, + { + "epoch": 0.20927121606677843, + "grad_norm": 0.242684468626976, + "learning_rate": 4.8202969911703575e-05, + "loss": 0.182, + "step": 11733 + }, + { + "epoch": 0.20928905218849214, + "grad_norm": 0.3582392930984497, + "learning_rate": 4.8202390403604915e-05, + "loss": 0.2258, + "step": 11734 + }, + { + "epoch": 0.20930688831020583, + "grad_norm": 0.18719659745693207, + "learning_rate": 4.8201810805565776e-05, + "loss": 0.1584, + "step": 11735 + }, + { + "epoch": 0.20932472443191952, + "grad_norm": 0.24418845772743225, + "learning_rate": 4.8201231117588393e-05, + "loss": 0.1744, + "step": 11736 + }, + { + "epoch": 0.2093425605536332, + "grad_norm": 0.41438546776771545, + "learning_rate": 4.820065133967504e-05, + "loss": 0.2297, + "step": 11737 + }, + { + "epoch": 0.20936039667534692, + "grad_norm": 0.2501727044582367, + "learning_rate": 4.820007147182794e-05, + "loss": 0.2315, + "step": 11738 + }, + { + "epoch": 0.2093782327970606, + "grad_norm": 0.27792489528656006, + "learning_rate": 4.8199491514049354e-05, + "loss": 0.2057, + "step": 11739 + }, + { + "epoch": 0.2093960689187743, + "grad_norm": 0.45859941840171814, + "learning_rate": 4.8198911466341525e-05, + "loss": 0.1545, + "step": 11740 + }, + { + "epoch": 0.20941390504048799, + "grad_norm": 0.4077008068561554, + "learning_rate": 4.81983313287067e-05, + "loss": 0.2182, + "step": 11741 + }, + { + "epoch": 0.2094317411622017, + "grad_norm": 0.24512937664985657, + "learning_rate": 4.819775110114714e-05, + "loss": 0.1992, + "step": 11742 + }, + { + "epoch": 0.2094495772839154, + "grad_norm": 0.3170695900917053, + "learning_rate": 4.8197170783665075e-05, + "loss": 0.1599, + "step": 11743 + }, + { + "epoch": 0.20946741340562908, + "grad_norm": 0.28509825468063354, + "learning_rate": 4.819659037626276e-05, + "loss": 0.2171, + "step": 11744 + }, + { + "epoch": 0.20948524952734277, + "grad_norm": 0.2481437474489212, + "learning_rate": 4.819600987894246e-05, + "loss": 0.2093, + "step": 11745 + }, + { + "epoch": 0.20950308564905648, + "grad_norm": 0.23982587456703186, + "learning_rate": 4.8195429291706406e-05, + "loss": 0.2145, + "step": 11746 + }, + { + "epoch": 0.20952092177077017, + "grad_norm": 0.33134645223617554, + "learning_rate": 4.819484861455687e-05, + "loss": 0.2098, + "step": 11747 + }, + { + "epoch": 0.20953875789248386, + "grad_norm": 0.20616430044174194, + "learning_rate": 4.8194267847496074e-05, + "loss": 0.2048, + "step": 11748 + }, + { + "epoch": 0.20955659401419754, + "grad_norm": 0.17882724106311798, + "learning_rate": 4.8193686990526286e-05, + "loss": 0.1799, + "step": 11749 + }, + { + "epoch": 0.20957443013591126, + "grad_norm": 0.2518196702003479, + "learning_rate": 4.8193106043649763e-05, + "loss": 0.2014, + "step": 11750 + }, + { + "epoch": 0.20959226625762495, + "grad_norm": 0.23048117756843567, + "learning_rate": 4.819252500686875e-05, + "loss": 0.2102, + "step": 11751 + }, + { + "epoch": 0.20961010237933864, + "grad_norm": 0.2745598256587982, + "learning_rate": 4.81919438801855e-05, + "loss": 0.1998, + "step": 11752 + }, + { + "epoch": 0.20962793850105232, + "grad_norm": 0.26343855261802673, + "learning_rate": 4.819136266360226e-05, + "loss": 0.2124, + "step": 11753 + }, + { + "epoch": 0.209645774622766, + "grad_norm": 0.31484368443489075, + "learning_rate": 4.8190781357121295e-05, + "loss": 0.2256, + "step": 11754 + }, + { + "epoch": 0.20966361074447973, + "grad_norm": 0.29990607500076294, + "learning_rate": 4.819019996074484e-05, + "loss": 0.2108, + "step": 11755 + }, + { + "epoch": 0.20968144686619342, + "grad_norm": 0.19660401344299316, + "learning_rate": 4.8189618474475175e-05, + "loss": 0.1694, + "step": 11756 + }, + { + "epoch": 0.2096992829879071, + "grad_norm": 0.31754928827285767, + "learning_rate": 4.818903689831453e-05, + "loss": 0.2387, + "step": 11757 + }, + { + "epoch": 0.2097171191096208, + "grad_norm": 0.19113314151763916, + "learning_rate": 4.8188455232265174e-05, + "loss": 0.1914, + "step": 11758 + }, + { + "epoch": 0.2097349552313345, + "grad_norm": 0.26601749658584595, + "learning_rate": 4.818787347632935e-05, + "loss": 0.2073, + "step": 11759 + }, + { + "epoch": 0.2097527913530482, + "grad_norm": 0.31796202063560486, + "learning_rate": 4.8187291630509324e-05, + "loss": 0.2152, + "step": 11760 + }, + { + "epoch": 0.20977062747476188, + "grad_norm": 0.2584232687950134, + "learning_rate": 4.8186709694807346e-05, + "loss": 0.1779, + "step": 11761 + }, + { + "epoch": 0.20978846359647557, + "grad_norm": 0.32587021589279175, + "learning_rate": 4.818612766922567e-05, + "loss": 0.1862, + "step": 11762 + }, + { + "epoch": 0.2098062997181893, + "grad_norm": 0.277302622795105, + "learning_rate": 4.8185545553766564e-05, + "loss": 0.2069, + "step": 11763 + }, + { + "epoch": 0.20982413583990298, + "grad_norm": 0.38512516021728516, + "learning_rate": 4.818496334843227e-05, + "loss": 0.2824, + "step": 11764 + }, + { + "epoch": 0.20984197196161666, + "grad_norm": 0.24719849228858948, + "learning_rate": 4.818438105322505e-05, + "loss": 0.1492, + "step": 11765 + }, + { + "epoch": 0.20985980808333035, + "grad_norm": 0.30014151334762573, + "learning_rate": 4.818379866814716e-05, + "loss": 0.1974, + "step": 11766 + }, + { + "epoch": 0.20987764420504407, + "grad_norm": 0.23975475132465363, + "learning_rate": 4.8183216193200856e-05, + "loss": 0.2285, + "step": 11767 + }, + { + "epoch": 0.20989548032675776, + "grad_norm": 0.26746538281440735, + "learning_rate": 4.8182633628388406e-05, + "loss": 0.1758, + "step": 11768 + }, + { + "epoch": 0.20991331644847144, + "grad_norm": 0.20747677981853485, + "learning_rate": 4.8182050973712055e-05, + "loss": 0.1621, + "step": 11769 + }, + { + "epoch": 0.20993115257018513, + "grad_norm": 0.2552240490913391, + "learning_rate": 4.818146822917407e-05, + "loss": 0.1566, + "step": 11770 + }, + { + "epoch": 0.20994898869189885, + "grad_norm": 0.21534845232963562, + "learning_rate": 4.818088539477671e-05, + "loss": 0.1422, + "step": 11771 + }, + { + "epoch": 0.20996682481361253, + "grad_norm": 0.2540239095687866, + "learning_rate": 4.818030247052223e-05, + "loss": 0.2022, + "step": 11772 + }, + { + "epoch": 0.20998466093532622, + "grad_norm": 0.24961107969284058, + "learning_rate": 4.817971945641289e-05, + "loss": 0.2003, + "step": 11773 + }, + { + "epoch": 0.2100024970570399, + "grad_norm": 0.3151911795139313, + "learning_rate": 4.817913635245096e-05, + "loss": 0.2056, + "step": 11774 + }, + { + "epoch": 0.2100203331787536, + "grad_norm": 0.34245213866233826, + "learning_rate": 4.817855315863868e-05, + "loss": 0.1899, + "step": 11775 + }, + { + "epoch": 0.21003816930046731, + "grad_norm": 0.2559554874897003, + "learning_rate": 4.8177969874978326e-05, + "loss": 0.1803, + "step": 11776 + }, + { + "epoch": 0.210056005422181, + "grad_norm": 0.25878554582595825, + "learning_rate": 4.8177386501472154e-05, + "loss": 0.186, + "step": 11777 + }, + { + "epoch": 0.2100738415438947, + "grad_norm": 0.22685648500919342, + "learning_rate": 4.817680303812243e-05, + "loss": 0.2176, + "step": 11778 + }, + { + "epoch": 0.21009167766560838, + "grad_norm": 0.22430676221847534, + "learning_rate": 4.8176219484931404e-05, + "loss": 0.1763, + "step": 11779 + }, + { + "epoch": 0.2101095137873221, + "grad_norm": 0.3075858950614929, + "learning_rate": 4.817563584190136e-05, + "loss": 0.2276, + "step": 11780 + }, + { + "epoch": 0.21012734990903578, + "grad_norm": 0.3472638428211212, + "learning_rate": 4.817505210903454e-05, + "loss": 0.1572, + "step": 11781 + }, + { + "epoch": 0.21014518603074947, + "grad_norm": 0.3664585053920746, + "learning_rate": 4.8174468286333216e-05, + "loss": 0.1972, + "step": 11782 + }, + { + "epoch": 0.21016302215246316, + "grad_norm": 0.2868519723415375, + "learning_rate": 4.8173884373799644e-05, + "loss": 0.2241, + "step": 11783 + }, + { + "epoch": 0.21018085827417687, + "grad_norm": 0.2659103572368622, + "learning_rate": 4.81733003714361e-05, + "loss": 0.2196, + "step": 11784 + }, + { + "epoch": 0.21019869439589056, + "grad_norm": 0.4240790605545044, + "learning_rate": 4.817271627924483e-05, + "loss": 0.2387, + "step": 11785 + }, + { + "epoch": 0.21021653051760425, + "grad_norm": 0.28208059072494507, + "learning_rate": 4.817213209722812e-05, + "loss": 0.2099, + "step": 11786 + }, + { + "epoch": 0.21023436663931794, + "grad_norm": 0.34443867206573486, + "learning_rate": 4.8171547825388206e-05, + "loss": 0.211, + "step": 11787 + }, + { + "epoch": 0.21025220276103165, + "grad_norm": 0.27742916345596313, + "learning_rate": 4.817096346372738e-05, + "loss": 0.2465, + "step": 11788 + }, + { + "epoch": 0.21027003888274534, + "grad_norm": 0.2281203269958496, + "learning_rate": 4.8170379012247905e-05, + "loss": 0.2177, + "step": 11789 + }, + { + "epoch": 0.21028787500445903, + "grad_norm": 0.235040083527565, + "learning_rate": 4.8169794470952024e-05, + "loss": 0.1531, + "step": 11790 + }, + { + "epoch": 0.21030571112617272, + "grad_norm": 0.21724410355091095, + "learning_rate": 4.816920983984202e-05, + "loss": 0.1489, + "step": 11791 + }, + { + "epoch": 0.21032354724788643, + "grad_norm": 0.5910566449165344, + "learning_rate": 4.816862511892016e-05, + "loss": 0.2799, + "step": 11792 + }, + { + "epoch": 0.21034138336960012, + "grad_norm": 0.3476826846599579, + "learning_rate": 4.81680403081887e-05, + "loss": 0.2637, + "step": 11793 + }, + { + "epoch": 0.2103592194913138, + "grad_norm": 0.24049030244350433, + "learning_rate": 4.816745540764992e-05, + "loss": 0.2091, + "step": 11794 + }, + { + "epoch": 0.2103770556130275, + "grad_norm": 0.23804958164691925, + "learning_rate": 4.816687041730608e-05, + "loss": 0.1957, + "step": 11795 + }, + { + "epoch": 0.21039489173474119, + "grad_norm": 0.2845692038536072, + "learning_rate": 4.816628533715945e-05, + "loss": 0.2095, + "step": 11796 + }, + { + "epoch": 0.2104127278564549, + "grad_norm": 0.32254090905189514, + "learning_rate": 4.816570016721229e-05, + "loss": 0.2591, + "step": 11797 + }, + { + "epoch": 0.2104305639781686, + "grad_norm": 0.2692525088787079, + "learning_rate": 4.816511490746689e-05, + "loss": 0.2275, + "step": 11798 + }, + { + "epoch": 0.21044840009988228, + "grad_norm": 0.2805209457874298, + "learning_rate": 4.8164529557925487e-05, + "loss": 0.202, + "step": 11799 + }, + { + "epoch": 0.21046623622159596, + "grad_norm": 0.2221936285495758, + "learning_rate": 4.8163944118590374e-05, + "loss": 0.1844, + "step": 11800 + }, + { + "epoch": 0.21048407234330968, + "grad_norm": 0.276731014251709, + "learning_rate": 4.816335858946381e-05, + "loss": 0.2538, + "step": 11801 + }, + { + "epoch": 0.21050190846502337, + "grad_norm": 0.33310019969940186, + "learning_rate": 4.8162772970548073e-05, + "loss": 0.2104, + "step": 11802 + }, + { + "epoch": 0.21051974458673706, + "grad_norm": 0.395674467086792, + "learning_rate": 4.8162187261845425e-05, + "loss": 0.2155, + "step": 11803 + }, + { + "epoch": 0.21053758070845074, + "grad_norm": 0.2592020332813263, + "learning_rate": 4.8161601463358145e-05, + "loss": 0.2121, + "step": 11804 + }, + { + "epoch": 0.21055541683016446, + "grad_norm": 0.33151042461395264, + "learning_rate": 4.816101557508849e-05, + "loss": 0.2532, + "step": 11805 + }, + { + "epoch": 0.21057325295187815, + "grad_norm": 0.2417377531528473, + "learning_rate": 4.8160429597038735e-05, + "loss": 0.2056, + "step": 11806 + }, + { + "epoch": 0.21059108907359184, + "grad_norm": 0.3523246943950653, + "learning_rate": 4.8159843529211166e-05, + "loss": 0.2051, + "step": 11807 + }, + { + "epoch": 0.21060892519530552, + "grad_norm": 0.300409734249115, + "learning_rate": 4.815925737160804e-05, + "loss": 0.1907, + "step": 11808 + }, + { + "epoch": 0.21062676131701924, + "grad_norm": 0.2804553806781769, + "learning_rate": 4.8158671124231636e-05, + "loss": 0.2201, + "step": 11809 + }, + { + "epoch": 0.21064459743873293, + "grad_norm": 0.297488272190094, + "learning_rate": 4.8158084787084216e-05, + "loss": 0.1939, + "step": 11810 + }, + { + "epoch": 0.21066243356044662, + "grad_norm": 0.26098382472991943, + "learning_rate": 4.815749836016807e-05, + "loss": 0.2015, + "step": 11811 + }, + { + "epoch": 0.2106802696821603, + "grad_norm": 0.3780629634857178, + "learning_rate": 4.8156911843485454e-05, + "loss": 0.1788, + "step": 11812 + }, + { + "epoch": 0.210698105803874, + "grad_norm": 0.26268333196640015, + "learning_rate": 4.815632523703866e-05, + "loss": 0.2065, + "step": 11813 + }, + { + "epoch": 0.2107159419255877, + "grad_norm": 0.3022797107696533, + "learning_rate": 4.815573854082994e-05, + "loss": 0.2288, + "step": 11814 + }, + { + "epoch": 0.2107337780473014, + "grad_norm": 0.2933153808116913, + "learning_rate": 4.815515175486159e-05, + "loss": 0.2226, + "step": 11815 + }, + { + "epoch": 0.21075161416901508, + "grad_norm": 0.22890408337116241, + "learning_rate": 4.8154564879135865e-05, + "loss": 0.179, + "step": 11816 + }, + { + "epoch": 0.21076945029072877, + "grad_norm": 0.3245255649089813, + "learning_rate": 4.815397791365506e-05, + "loss": 0.1807, + "step": 11817 + }, + { + "epoch": 0.2107872864124425, + "grad_norm": 0.27721354365348816, + "learning_rate": 4.815339085842143e-05, + "loss": 0.1805, + "step": 11818 + }, + { + "epoch": 0.21080512253415618, + "grad_norm": 0.2621777057647705, + "learning_rate": 4.815280371343726e-05, + "loss": 0.2, + "step": 11819 + }, + { + "epoch": 0.21082295865586986, + "grad_norm": 0.3462750017642975, + "learning_rate": 4.815221647870483e-05, + "loss": 0.2016, + "step": 11820 + }, + { + "epoch": 0.21084079477758355, + "grad_norm": 0.24098993837833405, + "learning_rate": 4.815162915422641e-05, + "loss": 0.1859, + "step": 11821 + }, + { + "epoch": 0.21085863089929727, + "grad_norm": 0.34345999360084534, + "learning_rate": 4.815104174000428e-05, + "loss": 0.2116, + "step": 11822 + }, + { + "epoch": 0.21087646702101095, + "grad_norm": 0.27372217178344727, + "learning_rate": 4.815045423604072e-05, + "loss": 0.2217, + "step": 11823 + }, + { + "epoch": 0.21089430314272464, + "grad_norm": 0.3180995583534241, + "learning_rate": 4.8149866642338e-05, + "loss": 0.162, + "step": 11824 + }, + { + "epoch": 0.21091213926443833, + "grad_norm": 0.27679339051246643, + "learning_rate": 4.81492789588984e-05, + "loss": 0.1638, + "step": 11825 + }, + { + "epoch": 0.21092997538615205, + "grad_norm": 0.31787386536598206, + "learning_rate": 4.81486911857242e-05, + "loss": 0.2216, + "step": 11826 + }, + { + "epoch": 0.21094781150786573, + "grad_norm": 0.35636964440345764, + "learning_rate": 4.814810332281768e-05, + "loss": 0.1713, + "step": 11827 + }, + { + "epoch": 0.21096564762957942, + "grad_norm": 0.29187142848968506, + "learning_rate": 4.814751537018112e-05, + "loss": 0.1877, + "step": 11828 + }, + { + "epoch": 0.2109834837512931, + "grad_norm": 0.22472573816776276, + "learning_rate": 4.814692732781678e-05, + "loss": 0.141, + "step": 11829 + }, + { + "epoch": 0.21100131987300683, + "grad_norm": 0.31143370270729065, + "learning_rate": 4.814633919572697e-05, + "loss": 0.1929, + "step": 11830 + }, + { + "epoch": 0.21101915599472051, + "grad_norm": 0.39636850357055664, + "learning_rate": 4.814575097391395e-05, + "loss": 0.2463, + "step": 11831 + }, + { + "epoch": 0.2110369921164342, + "grad_norm": 0.3290325999259949, + "learning_rate": 4.8145162662380006e-05, + "loss": 0.2384, + "step": 11832 + }, + { + "epoch": 0.2110548282381479, + "grad_norm": 0.3592544496059418, + "learning_rate": 4.814457426112742e-05, + "loss": 0.2279, + "step": 11833 + }, + { + "epoch": 0.21107266435986158, + "grad_norm": 0.24955043196678162, + "learning_rate": 4.8143985770158464e-05, + "loss": 0.2159, + "step": 11834 + }, + { + "epoch": 0.2110905004815753, + "grad_norm": 0.28856950998306274, + "learning_rate": 4.8143397189475425e-05, + "loss": 0.2151, + "step": 11835 + }, + { + "epoch": 0.21110833660328898, + "grad_norm": 0.3253841698169708, + "learning_rate": 4.814280851908059e-05, + "loss": 0.2263, + "step": 11836 + }, + { + "epoch": 0.21112617272500267, + "grad_norm": 0.2808425724506378, + "learning_rate": 4.8142219758976235e-05, + "loss": 0.2373, + "step": 11837 + }, + { + "epoch": 0.21114400884671636, + "grad_norm": 0.25354379415512085, + "learning_rate": 4.8141630909164646e-05, + "loss": 0.1603, + "step": 11838 + }, + { + "epoch": 0.21116184496843007, + "grad_norm": 0.27182918787002563, + "learning_rate": 4.814104196964809e-05, + "loss": 0.2015, + "step": 11839 + }, + { + "epoch": 0.21117968109014376, + "grad_norm": 0.23462463915348053, + "learning_rate": 4.814045294042887e-05, + "loss": 0.1674, + "step": 11840 + }, + { + "epoch": 0.21119751721185745, + "grad_norm": 0.24965649843215942, + "learning_rate": 4.8139863821509265e-05, + "loss": 0.2268, + "step": 11841 + }, + { + "epoch": 0.21121535333357114, + "grad_norm": 0.3184555470943451, + "learning_rate": 4.8139274612891564e-05, + "loss": 0.237, + "step": 11842 + }, + { + "epoch": 0.21123318945528485, + "grad_norm": 0.3308340907096863, + "learning_rate": 4.813868531457803e-05, + "loss": 0.2417, + "step": 11843 + }, + { + "epoch": 0.21125102557699854, + "grad_norm": 0.22181598842144012, + "learning_rate": 4.813809592657096e-05, + "loss": 0.1977, + "step": 11844 + }, + { + "epoch": 0.21126886169871223, + "grad_norm": 0.278463751077652, + "learning_rate": 4.8137506448872636e-05, + "loss": 0.2269, + "step": 11845 + }, + { + "epoch": 0.21128669782042592, + "grad_norm": 0.27575892210006714, + "learning_rate": 4.8136916881485355e-05, + "loss": 0.2041, + "step": 11846 + }, + { + "epoch": 0.21130453394213963, + "grad_norm": 0.3143797814846039, + "learning_rate": 4.8136327224411394e-05, + "loss": 0.1607, + "step": 11847 + }, + { + "epoch": 0.21132237006385332, + "grad_norm": 0.2297838181257248, + "learning_rate": 4.8135737477653035e-05, + "loss": 0.1843, + "step": 11848 + }, + { + "epoch": 0.211340206185567, + "grad_norm": 0.25091153383255005, + "learning_rate": 4.8135147641212555e-05, + "loss": 0.1848, + "step": 11849 + }, + { + "epoch": 0.2113580423072807, + "grad_norm": 0.2786402702331543, + "learning_rate": 4.8134557715092266e-05, + "loss": 0.149, + "step": 11850 + }, + { + "epoch": 0.2113758784289944, + "grad_norm": 0.294948548078537, + "learning_rate": 4.8133967699294436e-05, + "loss": 0.2547, + "step": 11851 + }, + { + "epoch": 0.2113937145507081, + "grad_norm": 0.2742416262626648, + "learning_rate": 4.813337759382136e-05, + "loss": 0.193, + "step": 11852 + }, + { + "epoch": 0.2114115506724218, + "grad_norm": 0.2357669323682785, + "learning_rate": 4.8132787398675314e-05, + "loss": 0.2058, + "step": 11853 + }, + { + "epoch": 0.21142938679413548, + "grad_norm": 0.40839940309524536, + "learning_rate": 4.81321971138586e-05, + "loss": 0.213, + "step": 11854 + }, + { + "epoch": 0.21144722291584916, + "grad_norm": 0.24076344072818756, + "learning_rate": 4.81316067393735e-05, + "loss": 0.1451, + "step": 11855 + }, + { + "epoch": 0.21146505903756288, + "grad_norm": 0.2290666699409485, + "learning_rate": 4.813101627522231e-05, + "loss": 0.1757, + "step": 11856 + }, + { + "epoch": 0.21148289515927657, + "grad_norm": 0.32733190059661865, + "learning_rate": 4.81304257214073e-05, + "loss": 0.1944, + "step": 11857 + }, + { + "epoch": 0.21150073128099026, + "grad_norm": 0.2146921306848526, + "learning_rate": 4.8129835077930775e-05, + "loss": 0.1594, + "step": 11858 + }, + { + "epoch": 0.21151856740270394, + "grad_norm": 0.2122744619846344, + "learning_rate": 4.812924434479502e-05, + "loss": 0.1781, + "step": 11859 + }, + { + "epoch": 0.21153640352441766, + "grad_norm": 0.3274732530117035, + "learning_rate": 4.8128653522002326e-05, + "loss": 0.218, + "step": 11860 + }, + { + "epoch": 0.21155423964613135, + "grad_norm": 0.26031941175460815, + "learning_rate": 4.812806260955498e-05, + "loss": 0.1755, + "step": 11861 + }, + { + "epoch": 0.21157207576784504, + "grad_norm": 0.48987406492233276, + "learning_rate": 4.812747160745528e-05, + "loss": 0.2388, + "step": 11862 + }, + { + "epoch": 0.21158991188955872, + "grad_norm": 0.26355957984924316, + "learning_rate": 4.812688051570551e-05, + "loss": 0.2151, + "step": 11863 + }, + { + "epoch": 0.21160774801127244, + "grad_norm": 0.2524564564228058, + "learning_rate": 4.812628933430797e-05, + "loss": 0.1913, + "step": 11864 + }, + { + "epoch": 0.21162558413298613, + "grad_norm": 0.28413692116737366, + "learning_rate": 4.812569806326493e-05, + "loss": 0.1949, + "step": 11865 + }, + { + "epoch": 0.21164342025469982, + "grad_norm": 0.3151012659072876, + "learning_rate": 4.812510670257871e-05, + "loss": 0.1919, + "step": 11866 + }, + { + "epoch": 0.2116612563764135, + "grad_norm": 0.23169133067131042, + "learning_rate": 4.8124515252251586e-05, + "loss": 0.2148, + "step": 11867 + }, + { + "epoch": 0.21167909249812722, + "grad_norm": 0.32055649161338806, + "learning_rate": 4.812392371228585e-05, + "loss": 0.2344, + "step": 11868 + }, + { + "epoch": 0.2116969286198409, + "grad_norm": 0.2602478265762329, + "learning_rate": 4.81233320826838e-05, + "loss": 0.1945, + "step": 11869 + }, + { + "epoch": 0.2117147647415546, + "grad_norm": 0.2460232824087143, + "learning_rate": 4.812274036344773e-05, + "loss": 0.1679, + "step": 11870 + }, + { + "epoch": 0.21173260086326828, + "grad_norm": 0.26596733927726746, + "learning_rate": 4.812214855457994e-05, + "loss": 0.2208, + "step": 11871 + }, + { + "epoch": 0.211750436984982, + "grad_norm": 0.3579787611961365, + "learning_rate": 4.81215566560827e-05, + "loss": 0.1963, + "step": 11872 + }, + { + "epoch": 0.2117682731066957, + "grad_norm": 0.24507632851600647, + "learning_rate": 4.8120964667958334e-05, + "loss": 0.1884, + "step": 11873 + }, + { + "epoch": 0.21178610922840937, + "grad_norm": 0.410803884267807, + "learning_rate": 4.812037259020912e-05, + "loss": 0.2362, + "step": 11874 + }, + { + "epoch": 0.21180394535012306, + "grad_norm": 0.30791109800338745, + "learning_rate": 4.811978042283735e-05, + "loss": 0.1933, + "step": 11875 + }, + { + "epoch": 0.21182178147183675, + "grad_norm": 0.33453458547592163, + "learning_rate": 4.811918816584533e-05, + "loss": 0.217, + "step": 11876 + }, + { + "epoch": 0.21183961759355047, + "grad_norm": 0.2710801064968109, + "learning_rate": 4.811859581923535e-05, + "loss": 0.1559, + "step": 11877 + }, + { + "epoch": 0.21185745371526415, + "grad_norm": 0.2979060709476471, + "learning_rate": 4.811800338300971e-05, + "loss": 0.2213, + "step": 11878 + }, + { + "epoch": 0.21187528983697784, + "grad_norm": 0.42303067445755005, + "learning_rate": 4.81174108571707e-05, + "loss": 0.2194, + "step": 11879 + }, + { + "epoch": 0.21189312595869153, + "grad_norm": 0.19875451922416687, + "learning_rate": 4.811681824172063e-05, + "loss": 0.188, + "step": 11880 + }, + { + "epoch": 0.21191096208040525, + "grad_norm": 0.2997405529022217, + "learning_rate": 4.811622553666178e-05, + "loss": 0.2203, + "step": 11881 + }, + { + "epoch": 0.21192879820211893, + "grad_norm": 0.2618153393268585, + "learning_rate": 4.8115632741996455e-05, + "loss": 0.1761, + "step": 11882 + }, + { + "epoch": 0.21194663432383262, + "grad_norm": 0.2891547977924347, + "learning_rate": 4.8115039857726965e-05, + "loss": 0.2172, + "step": 11883 + }, + { + "epoch": 0.2119644704455463, + "grad_norm": 0.28733545541763306, + "learning_rate": 4.811444688385558e-05, + "loss": 0.2071, + "step": 11884 + }, + { + "epoch": 0.21198230656726003, + "grad_norm": 0.46259891986846924, + "learning_rate": 4.8113853820384625e-05, + "loss": 0.2229, + "step": 11885 + }, + { + "epoch": 0.21200014268897371, + "grad_norm": 0.43129998445510864, + "learning_rate": 4.811326066731639e-05, + "loss": 0.1536, + "step": 11886 + }, + { + "epoch": 0.2120179788106874, + "grad_norm": 0.2170029580593109, + "learning_rate": 4.8112667424653165e-05, + "loss": 0.1908, + "step": 11887 + }, + { + "epoch": 0.2120358149324011, + "grad_norm": 0.3331025242805481, + "learning_rate": 4.811207409239727e-05, + "loss": 0.1963, + "step": 11888 + }, + { + "epoch": 0.2120536510541148, + "grad_norm": 0.42987164855003357, + "learning_rate": 4.811148067055099e-05, + "loss": 0.232, + "step": 11889 + }, + { + "epoch": 0.2120714871758285, + "grad_norm": 0.45533299446105957, + "learning_rate": 4.811088715911662e-05, + "loss": 0.1773, + "step": 11890 + }, + { + "epoch": 0.21208932329754218, + "grad_norm": 0.23405250906944275, + "learning_rate": 4.8110293558096484e-05, + "loss": 0.1527, + "step": 11891 + }, + { + "epoch": 0.21210715941925587, + "grad_norm": 0.29416489601135254, + "learning_rate": 4.810969986749286e-05, + "loss": 0.2106, + "step": 11892 + }, + { + "epoch": 0.21212499554096959, + "grad_norm": 0.2741703391075134, + "learning_rate": 4.8109106087308056e-05, + "loss": 0.2402, + "step": 11893 + }, + { + "epoch": 0.21214283166268327, + "grad_norm": 0.286952406167984, + "learning_rate": 4.8108512217544375e-05, + "loss": 0.2211, + "step": 11894 + }, + { + "epoch": 0.21216066778439696, + "grad_norm": 0.28501641750335693, + "learning_rate": 4.810791825820412e-05, + "loss": 0.1733, + "step": 11895 + }, + { + "epoch": 0.21217850390611065, + "grad_norm": 0.2270091474056244, + "learning_rate": 4.8107324209289595e-05, + "loss": 0.1887, + "step": 11896 + }, + { + "epoch": 0.21219634002782434, + "grad_norm": 0.30119457840919495, + "learning_rate": 4.81067300708031e-05, + "loss": 0.1958, + "step": 11897 + }, + { + "epoch": 0.21221417614953805, + "grad_norm": 0.2292444109916687, + "learning_rate": 4.810613584274693e-05, + "loss": 0.1988, + "step": 11898 + }, + { + "epoch": 0.21223201227125174, + "grad_norm": 0.2498779594898224, + "learning_rate": 4.810554152512341e-05, + "loss": 0.2417, + "step": 11899 + }, + { + "epoch": 0.21224984839296543, + "grad_norm": 0.2860521376132965, + "learning_rate": 4.8104947117934826e-05, + "loss": 0.2432, + "step": 11900 + }, + { + "epoch": 0.21226768451467912, + "grad_norm": 0.25130772590637207, + "learning_rate": 4.8104352621183486e-05, + "loss": 0.1743, + "step": 11901 + }, + { + "epoch": 0.21228552063639283, + "grad_norm": 0.3735920488834381, + "learning_rate": 4.810375803487169e-05, + "loss": 0.164, + "step": 11902 + }, + { + "epoch": 0.21230335675810652, + "grad_norm": 0.24691012501716614, + "learning_rate": 4.810316335900176e-05, + "loss": 0.1604, + "step": 11903 + }, + { + "epoch": 0.2123211928798202, + "grad_norm": 0.32348644733428955, + "learning_rate": 4.810256859357598e-05, + "loss": 0.2053, + "step": 11904 + }, + { + "epoch": 0.2123390290015339, + "grad_norm": 0.32618266344070435, + "learning_rate": 4.8101973738596664e-05, + "loss": 0.1986, + "step": 11905 + }, + { + "epoch": 0.2123568651232476, + "grad_norm": 0.2829591631889343, + "learning_rate": 4.810137879406612e-05, + "loss": 0.1656, + "step": 11906 + }, + { + "epoch": 0.2123747012449613, + "grad_norm": 0.243828684091568, + "learning_rate": 4.8100783759986657e-05, + "loss": 0.1787, + "step": 11907 + }, + { + "epoch": 0.212392537366675, + "grad_norm": 0.2694132924079895, + "learning_rate": 4.810018863636058e-05, + "loss": 0.188, + "step": 11908 + }, + { + "epoch": 0.21241037348838868, + "grad_norm": 0.3744795024394989, + "learning_rate": 4.809959342319018e-05, + "loss": 0.2002, + "step": 11909 + }, + { + "epoch": 0.2124282096101024, + "grad_norm": 0.28664690256118774, + "learning_rate": 4.809899812047779e-05, + "loss": 0.1884, + "step": 11910 + }, + { + "epoch": 0.21244604573181608, + "grad_norm": 0.2437635362148285, + "learning_rate": 4.80984027282257e-05, + "loss": 0.1947, + "step": 11911 + }, + { + "epoch": 0.21246388185352977, + "grad_norm": 0.3605238199234009, + "learning_rate": 4.8097807246436224e-05, + "loss": 0.2133, + "step": 11912 + }, + { + "epoch": 0.21248171797524346, + "grad_norm": 0.26123175024986267, + "learning_rate": 4.809721167511167e-05, + "loss": 0.1601, + "step": 11913 + }, + { + "epoch": 0.21249955409695714, + "grad_norm": 0.3012862801551819, + "learning_rate": 4.8096616014254345e-05, + "loss": 0.1922, + "step": 11914 + }, + { + "epoch": 0.21251739021867086, + "grad_norm": 0.2996641993522644, + "learning_rate": 4.809602026386656e-05, + "loss": 0.1709, + "step": 11915 + }, + { + "epoch": 0.21253522634038455, + "grad_norm": 0.29815566539764404, + "learning_rate": 4.809542442395062e-05, + "loss": 0.2033, + "step": 11916 + }, + { + "epoch": 0.21255306246209824, + "grad_norm": 0.27722781896591187, + "learning_rate": 4.809482849450885e-05, + "loss": 0.2296, + "step": 11917 + }, + { + "epoch": 0.21257089858381192, + "grad_norm": 0.29742431640625, + "learning_rate": 4.809423247554353e-05, + "loss": 0.2146, + "step": 11918 + }, + { + "epoch": 0.21258873470552564, + "grad_norm": 0.25360652804374695, + "learning_rate": 4.8093636367057e-05, + "loss": 0.193, + "step": 11919 + }, + { + "epoch": 0.21260657082723933, + "grad_norm": 0.33742067217826843, + "learning_rate": 4.809304016905156e-05, + "loss": 0.19, + "step": 11920 + }, + { + "epoch": 0.21262440694895302, + "grad_norm": 0.2800961136817932, + "learning_rate": 4.8092443881529514e-05, + "loss": 0.2151, + "step": 11921 + }, + { + "epoch": 0.2126422430706667, + "grad_norm": 0.38956162333488464, + "learning_rate": 4.8091847504493184e-05, + "loss": 0.2057, + "step": 11922 + }, + { + "epoch": 0.21266007919238042, + "grad_norm": 0.24416503310203552, + "learning_rate": 4.809125103794487e-05, + "loss": 0.1762, + "step": 11923 + }, + { + "epoch": 0.2126779153140941, + "grad_norm": 0.27841487526893616, + "learning_rate": 4.80906544818869e-05, + "loss": 0.2577, + "step": 11924 + }, + { + "epoch": 0.2126957514358078, + "grad_norm": 0.26839733123779297, + "learning_rate": 4.8090057836321577e-05, + "loss": 0.1725, + "step": 11925 + }, + { + "epoch": 0.21271358755752148, + "grad_norm": 0.27964547276496887, + "learning_rate": 4.808946110125121e-05, + "loss": 0.1925, + "step": 11926 + }, + { + "epoch": 0.2127314236792352, + "grad_norm": 0.28022778034210205, + "learning_rate": 4.808886427667812e-05, + "loss": 0.2106, + "step": 11927 + }, + { + "epoch": 0.2127492598009489, + "grad_norm": 0.36025023460388184, + "learning_rate": 4.8088267362604615e-05, + "loss": 0.2445, + "step": 11928 + }, + { + "epoch": 0.21276709592266257, + "grad_norm": 0.16520865261554718, + "learning_rate": 4.8087670359033016e-05, + "loss": 0.1755, + "step": 11929 + }, + { + "epoch": 0.21278493204437626, + "grad_norm": 0.22348365187644958, + "learning_rate": 4.808707326596563e-05, + "loss": 0.1452, + "step": 11930 + }, + { + "epoch": 0.21280276816608998, + "grad_norm": 0.2352830022573471, + "learning_rate": 4.8086476083404775e-05, + "loss": 0.2214, + "step": 11931 + }, + { + "epoch": 0.21282060428780367, + "grad_norm": 0.2535237967967987, + "learning_rate": 4.808587881135276e-05, + "loss": 0.2172, + "step": 11932 + }, + { + "epoch": 0.21283844040951735, + "grad_norm": 0.22616231441497803, + "learning_rate": 4.8085281449811913e-05, + "loss": 0.1837, + "step": 11933 + }, + { + "epoch": 0.21285627653123104, + "grad_norm": 0.2927800416946411, + "learning_rate": 4.8084683998784535e-05, + "loss": 0.1422, + "step": 11934 + }, + { + "epoch": 0.21287411265294473, + "grad_norm": 0.30131295323371887, + "learning_rate": 4.808408645827295e-05, + "loss": 0.2458, + "step": 11935 + }, + { + "epoch": 0.21289194877465845, + "grad_norm": 0.2846871018409729, + "learning_rate": 4.808348882827948e-05, + "loss": 0.2268, + "step": 11936 + }, + { + "epoch": 0.21290978489637213, + "grad_norm": 0.22972151637077332, + "learning_rate": 4.8082891108806426e-05, + "loss": 0.1559, + "step": 11937 + }, + { + "epoch": 0.21292762101808582, + "grad_norm": 0.28498220443725586, + "learning_rate": 4.808229329985612e-05, + "loss": 0.222, + "step": 11938 + }, + { + "epoch": 0.2129454571397995, + "grad_norm": 0.2661803960800171, + "learning_rate": 4.8081695401430866e-05, + "loss": 0.2112, + "step": 11939 + }, + { + "epoch": 0.21296329326151323, + "grad_norm": 0.24602225422859192, + "learning_rate": 4.808109741353299e-05, + "loss": 0.2333, + "step": 11940 + }, + { + "epoch": 0.2129811293832269, + "grad_norm": 0.2584832012653351, + "learning_rate": 4.808049933616481e-05, + "loss": 0.1862, + "step": 11941 + }, + { + "epoch": 0.2129989655049406, + "grad_norm": 0.23810431361198425, + "learning_rate": 4.8079901169328645e-05, + "loss": 0.1852, + "step": 11942 + }, + { + "epoch": 0.2130168016266543, + "grad_norm": 0.4730599522590637, + "learning_rate": 4.8079302913026807e-05, + "loss": 0.2304, + "step": 11943 + }, + { + "epoch": 0.213034637748368, + "grad_norm": 0.20296625792980194, + "learning_rate": 4.8078704567261626e-05, + "loss": 0.1609, + "step": 11944 + }, + { + "epoch": 0.2130524738700817, + "grad_norm": 0.20573608577251434, + "learning_rate": 4.807810613203541e-05, + "loss": 0.1551, + "step": 11945 + }, + { + "epoch": 0.21307030999179538, + "grad_norm": 0.2590511441230774, + "learning_rate": 4.807750760735048e-05, + "loss": 0.1581, + "step": 11946 + }, + { + "epoch": 0.21308814611350907, + "grad_norm": 0.2788696885108948, + "learning_rate": 4.8076908993209166e-05, + "loss": 0.19, + "step": 11947 + }, + { + "epoch": 0.21310598223522279, + "grad_norm": 0.2824763357639313, + "learning_rate": 4.807631028961378e-05, + "loss": 0.2314, + "step": 11948 + }, + { + "epoch": 0.21312381835693647, + "grad_norm": 0.30209213495254517, + "learning_rate": 4.807571149656664e-05, + "loss": 0.1945, + "step": 11949 + }, + { + "epoch": 0.21314165447865016, + "grad_norm": 0.3376280665397644, + "learning_rate": 4.8075112614070084e-05, + "loss": 0.2085, + "step": 11950 + }, + { + "epoch": 0.21315949060036385, + "grad_norm": 0.26649901270866394, + "learning_rate": 4.807451364212642e-05, + "loss": 0.2408, + "step": 11951 + }, + { + "epoch": 0.21317732672207756, + "grad_norm": 0.2890677750110626, + "learning_rate": 4.807391458073796e-05, + "loss": 0.1671, + "step": 11952 + }, + { + "epoch": 0.21319516284379125, + "grad_norm": 0.1995590478181839, + "learning_rate": 4.8073315429907046e-05, + "loss": 0.1647, + "step": 11953 + }, + { + "epoch": 0.21321299896550494, + "grad_norm": 0.35758885741233826, + "learning_rate": 4.8072716189635995e-05, + "loss": 0.1911, + "step": 11954 + }, + { + "epoch": 0.21323083508721863, + "grad_norm": 0.28260499238967896, + "learning_rate": 4.807211685992712e-05, + "loss": 0.1685, + "step": 11955 + }, + { + "epoch": 0.21324867120893232, + "grad_norm": 0.257367879152298, + "learning_rate": 4.8071517440782754e-05, + "loss": 0.1681, + "step": 11956 + }, + { + "epoch": 0.21326650733064603, + "grad_norm": 0.24316635727882385, + "learning_rate": 4.8070917932205214e-05, + "loss": 0.1854, + "step": 11957 + }, + { + "epoch": 0.21328434345235972, + "grad_norm": 0.4340924918651581, + "learning_rate": 4.807031833419683e-05, + "loss": 0.2457, + "step": 11958 + }, + { + "epoch": 0.2133021795740734, + "grad_norm": 0.27530351281166077, + "learning_rate": 4.806971864675993e-05, + "loss": 0.2573, + "step": 11959 + }, + { + "epoch": 0.2133200156957871, + "grad_norm": 0.2605418562889099, + "learning_rate": 4.8069118869896826e-05, + "loss": 0.2114, + "step": 11960 + }, + { + "epoch": 0.2133378518175008, + "grad_norm": 0.2141258865594864, + "learning_rate": 4.806851900360985e-05, + "loss": 0.1921, + "step": 11961 + }, + { + "epoch": 0.2133556879392145, + "grad_norm": 0.19329896569252014, + "learning_rate": 4.8067919047901326e-05, + "loss": 0.1408, + "step": 11962 + }, + { + "epoch": 0.2133735240609282, + "grad_norm": 0.3235015869140625, + "learning_rate": 4.8067319002773584e-05, + "loss": 0.1239, + "step": 11963 + }, + { + "epoch": 0.21339136018264188, + "grad_norm": 0.35358837246894836, + "learning_rate": 4.8066718868228945e-05, + "loss": 0.2181, + "step": 11964 + }, + { + "epoch": 0.2134091963043556, + "grad_norm": 0.20358708500862122, + "learning_rate": 4.806611864426974e-05, + "loss": 0.1683, + "step": 11965 + }, + { + "epoch": 0.21342703242606928, + "grad_norm": 0.275892049074173, + "learning_rate": 4.8065518330898285e-05, + "loss": 0.2148, + "step": 11966 + }, + { + "epoch": 0.21344486854778297, + "grad_norm": 0.3201720118522644, + "learning_rate": 4.806491792811691e-05, + "loss": 0.2338, + "step": 11967 + }, + { + "epoch": 0.21346270466949666, + "grad_norm": 0.1990060806274414, + "learning_rate": 4.8064317435927966e-05, + "loss": 0.1785, + "step": 11968 + }, + { + "epoch": 0.21348054079121037, + "grad_norm": 0.39237022399902344, + "learning_rate": 4.8063716854333743e-05, + "loss": 0.2709, + "step": 11969 + }, + { + "epoch": 0.21349837691292406, + "grad_norm": 0.2247430980205536, + "learning_rate": 4.8063116183336596e-05, + "loss": 0.1909, + "step": 11970 + }, + { + "epoch": 0.21351621303463775, + "grad_norm": 0.3619181215763092, + "learning_rate": 4.806251542293885e-05, + "loss": 0.1805, + "step": 11971 + }, + { + "epoch": 0.21353404915635144, + "grad_norm": 0.29609400033950806, + "learning_rate": 4.806191457314282e-05, + "loss": 0.2129, + "step": 11972 + }, + { + "epoch": 0.21355188527806515, + "grad_norm": 0.2318839281797409, + "learning_rate": 4.806131363395084e-05, + "loss": 0.1717, + "step": 11973 + }, + { + "epoch": 0.21356972139977884, + "grad_norm": 0.2564683258533478, + "learning_rate": 4.806071260536526e-05, + "loss": 0.1953, + "step": 11974 + }, + { + "epoch": 0.21358755752149253, + "grad_norm": 0.2644842863082886, + "learning_rate": 4.8060111487388376e-05, + "loss": 0.1874, + "step": 11975 + }, + { + "epoch": 0.21360539364320622, + "grad_norm": 0.3433472812175751, + "learning_rate": 4.805951028002254e-05, + "loss": 0.2004, + "step": 11976 + }, + { + "epoch": 0.2136232297649199, + "grad_norm": 0.2824678421020508, + "learning_rate": 4.8058908983270076e-05, + "loss": 0.2133, + "step": 11977 + }, + { + "epoch": 0.21364106588663362, + "grad_norm": 0.3142847716808319, + "learning_rate": 4.8058307597133324e-05, + "loss": 0.2046, + "step": 11978 + }, + { + "epoch": 0.2136589020083473, + "grad_norm": 0.24438883364200592, + "learning_rate": 4.80577061216146e-05, + "loss": 0.2191, + "step": 11979 + }, + { + "epoch": 0.213676738130061, + "grad_norm": 0.3002220690250397, + "learning_rate": 4.805710455671625e-05, + "loss": 0.1785, + "step": 11980 + }, + { + "epoch": 0.21369457425177468, + "grad_norm": 0.35666581988334656, + "learning_rate": 4.805650290244059e-05, + "loss": 0.1978, + "step": 11981 + }, + { + "epoch": 0.2137124103734884, + "grad_norm": 0.2793177664279938, + "learning_rate": 4.805590115878997e-05, + "loss": 0.1631, + "step": 11982 + }, + { + "epoch": 0.2137302464952021, + "grad_norm": 0.2895793318748474, + "learning_rate": 4.8055299325766714e-05, + "loss": 0.1832, + "step": 11983 + }, + { + "epoch": 0.21374808261691577, + "grad_norm": 0.19846618175506592, + "learning_rate": 4.805469740337315e-05, + "loss": 0.1969, + "step": 11984 + }, + { + "epoch": 0.21376591873862946, + "grad_norm": 0.286077618598938, + "learning_rate": 4.805409539161162e-05, + "loss": 0.2073, + "step": 11985 + }, + { + "epoch": 0.21378375486034318, + "grad_norm": 0.2080845832824707, + "learning_rate": 4.8053493290484444e-05, + "loss": 0.2, + "step": 11986 + }, + { + "epoch": 0.21380159098205687, + "grad_norm": 0.23820963501930237, + "learning_rate": 4.805289109999398e-05, + "loss": 0.1443, + "step": 11987 + }, + { + "epoch": 0.21381942710377055, + "grad_norm": 0.3149418234825134, + "learning_rate": 4.805228882014254e-05, + "loss": 0.2416, + "step": 11988 + }, + { + "epoch": 0.21383726322548424, + "grad_norm": 0.3047579526901245, + "learning_rate": 4.8051686450932465e-05, + "loss": 0.1769, + "step": 11989 + }, + { + "epoch": 0.21385509934719796, + "grad_norm": 0.4249333143234253, + "learning_rate": 4.805108399236609e-05, + "loss": 0.2322, + "step": 11990 + }, + { + "epoch": 0.21387293546891165, + "grad_norm": 0.2610306739807129, + "learning_rate": 4.805048144444576e-05, + "loss": 0.2014, + "step": 11991 + }, + { + "epoch": 0.21389077159062533, + "grad_norm": 0.2611127197742462, + "learning_rate": 4.80498788071738e-05, + "loss": 0.204, + "step": 11992 + }, + { + "epoch": 0.21390860771233902, + "grad_norm": 0.22319549322128296, + "learning_rate": 4.8049276080552544e-05, + "loss": 0.2071, + "step": 11993 + }, + { + "epoch": 0.21392644383405274, + "grad_norm": 0.3226379156112671, + "learning_rate": 4.804867326458433e-05, + "loss": 0.2498, + "step": 11994 + }, + { + "epoch": 0.21394427995576643, + "grad_norm": 0.31403273344039917, + "learning_rate": 4.804807035927151e-05, + "loss": 0.201, + "step": 11995 + }, + { + "epoch": 0.2139621160774801, + "grad_norm": 0.26563963294029236, + "learning_rate": 4.80474673646164e-05, + "loss": 0.2283, + "step": 11996 + }, + { + "epoch": 0.2139799521991938, + "grad_norm": 0.32253947854042053, + "learning_rate": 4.804686428062135e-05, + "loss": 0.2401, + "step": 11997 + }, + { + "epoch": 0.2139977883209075, + "grad_norm": 0.2164137363433838, + "learning_rate": 4.804626110728869e-05, + "loss": 0.15, + "step": 11998 + }, + { + "epoch": 0.2140156244426212, + "grad_norm": 0.27956709265708923, + "learning_rate": 4.804565784462076e-05, + "loss": 0.1889, + "step": 11999 + }, + { + "epoch": 0.2140334605643349, + "grad_norm": 0.20151473581790924, + "learning_rate": 4.8045054492619914e-05, + "loss": 0.196, + "step": 12000 + }, + { + "epoch": 0.2140334605643349, + "eval_loss": 0.19065994024276733, + "eval_runtime": 106.7888, + "eval_samples_per_second": 9.589, + "eval_steps_per_second": 1.601, + "step": 12000 + }, + { + "epoch": 0.21405129668604858, + "grad_norm": 0.1964709311723709, + "learning_rate": 4.8044451051288465e-05, + "loss": 0.1239, + "step": 12001 + }, + { + "epoch": 0.21406913280776227, + "grad_norm": 0.3363712430000305, + "learning_rate": 4.804384752062876e-05, + "loss": 0.2575, + "step": 12002 + }, + { + "epoch": 0.21408696892947598, + "grad_norm": 0.26239413022994995, + "learning_rate": 4.804324390064315e-05, + "loss": 0.2051, + "step": 12003 + }, + { + "epoch": 0.21410480505118967, + "grad_norm": 0.20325836539268494, + "learning_rate": 4.804264019133397e-05, + "loss": 0.1709, + "step": 12004 + }, + { + "epoch": 0.21412264117290336, + "grad_norm": 0.3335111141204834, + "learning_rate": 4.804203639270356e-05, + "loss": 0.2097, + "step": 12005 + }, + { + "epoch": 0.21414047729461705, + "grad_norm": 0.3034976124763489, + "learning_rate": 4.804143250475425e-05, + "loss": 0.1847, + "step": 12006 + }, + { + "epoch": 0.21415831341633076, + "grad_norm": 0.26121431589126587, + "learning_rate": 4.80408285274884e-05, + "loss": 0.1882, + "step": 12007 + }, + { + "epoch": 0.21417614953804445, + "grad_norm": 0.32125476002693176, + "learning_rate": 4.804022446090834e-05, + "loss": 0.1768, + "step": 12008 + }, + { + "epoch": 0.21419398565975814, + "grad_norm": 0.2741241455078125, + "learning_rate": 4.8039620305016396e-05, + "loss": 0.1964, + "step": 12009 + }, + { + "epoch": 0.21421182178147183, + "grad_norm": 0.1951020509004593, + "learning_rate": 4.8039016059814935e-05, + "loss": 0.159, + "step": 12010 + }, + { + "epoch": 0.21422965790318554, + "grad_norm": 0.25098711252212524, + "learning_rate": 4.803841172530629e-05, + "loss": 0.2214, + "step": 12011 + }, + { + "epoch": 0.21424749402489923, + "grad_norm": 0.3764820992946625, + "learning_rate": 4.803780730149281e-05, + "loss": 0.2713, + "step": 12012 + }, + { + "epoch": 0.21426533014661292, + "grad_norm": 0.2963665723800659, + "learning_rate": 4.803720278837683e-05, + "loss": 0.1738, + "step": 12013 + }, + { + "epoch": 0.2142831662683266, + "grad_norm": 0.19593137502670288, + "learning_rate": 4.803659818596069e-05, + "loss": 0.1776, + "step": 12014 + }, + { + "epoch": 0.2143010023900403, + "grad_norm": 0.3092212378978729, + "learning_rate": 4.803599349424675e-05, + "loss": 0.1467, + "step": 12015 + }, + { + "epoch": 0.214318838511754, + "grad_norm": 0.24250073730945587, + "learning_rate": 4.803538871323733e-05, + "loss": 0.2136, + "step": 12016 + }, + { + "epoch": 0.2143366746334677, + "grad_norm": 0.2425922006368637, + "learning_rate": 4.803478384293479e-05, + "loss": 0.2097, + "step": 12017 + }, + { + "epoch": 0.2143545107551814, + "grad_norm": 0.24515973031520844, + "learning_rate": 4.803417888334148e-05, + "loss": 0.1966, + "step": 12018 + }, + { + "epoch": 0.21437234687689508, + "grad_norm": 0.24052035808563232, + "learning_rate": 4.803357383445972e-05, + "loss": 0.1853, + "step": 12019 + }, + { + "epoch": 0.2143901829986088, + "grad_norm": 0.24403268098831177, + "learning_rate": 4.803296869629188e-05, + "loss": 0.1928, + "step": 12020 + }, + { + "epoch": 0.21440801912032248, + "grad_norm": 0.33883607387542725, + "learning_rate": 4.80323634688403e-05, + "loss": 0.1631, + "step": 12021 + }, + { + "epoch": 0.21442585524203617, + "grad_norm": 0.259956032037735, + "learning_rate": 4.803175815210733e-05, + "loss": 0.1825, + "step": 12022 + }, + { + "epoch": 0.21444369136374986, + "grad_norm": 0.36655721068382263, + "learning_rate": 4.803115274609531e-05, + "loss": 0.2128, + "step": 12023 + }, + { + "epoch": 0.21446152748546357, + "grad_norm": 0.2695331573486328, + "learning_rate": 4.803054725080658e-05, + "loss": 0.2105, + "step": 12024 + }, + { + "epoch": 0.21447936360717726, + "grad_norm": 0.24818134307861328, + "learning_rate": 4.802994166624349e-05, + "loss": 0.2043, + "step": 12025 + }, + { + "epoch": 0.21449719972889095, + "grad_norm": 0.2615300118923187, + "learning_rate": 4.8029335992408406e-05, + "loss": 0.2032, + "step": 12026 + }, + { + "epoch": 0.21451503585060464, + "grad_norm": 0.20985738933086395, + "learning_rate": 4.8028730229303655e-05, + "loss": 0.1569, + "step": 12027 + }, + { + "epoch": 0.21453287197231835, + "grad_norm": 0.30844759941101074, + "learning_rate": 4.802812437693159e-05, + "loss": 0.2262, + "step": 12028 + }, + { + "epoch": 0.21455070809403204, + "grad_norm": 0.2554391920566559, + "learning_rate": 4.802751843529456e-05, + "loss": 0.2213, + "step": 12029 + }, + { + "epoch": 0.21456854421574573, + "grad_norm": 0.2754734456539154, + "learning_rate": 4.802691240439492e-05, + "loss": 0.193, + "step": 12030 + }, + { + "epoch": 0.21458638033745941, + "grad_norm": 0.38308730721473694, + "learning_rate": 4.802630628423501e-05, + "loss": 0.1922, + "step": 12031 + }, + { + "epoch": 0.21460421645917313, + "grad_norm": 0.3109648525714874, + "learning_rate": 4.8025700074817184e-05, + "loss": 0.1987, + "step": 12032 + }, + { + "epoch": 0.21462205258088682, + "grad_norm": 0.2678164541721344, + "learning_rate": 4.8025093776143794e-05, + "loss": 0.2009, + "step": 12033 + }, + { + "epoch": 0.2146398887026005, + "grad_norm": 0.32864266633987427, + "learning_rate": 4.802448738821719e-05, + "loss": 0.2062, + "step": 12034 + }, + { + "epoch": 0.2146577248243142, + "grad_norm": 0.30044132471084595, + "learning_rate": 4.8023880911039715e-05, + "loss": 0.2207, + "step": 12035 + }, + { + "epoch": 0.21467556094602788, + "grad_norm": 0.27375471591949463, + "learning_rate": 4.802327434461373e-05, + "loss": 0.1924, + "step": 12036 + }, + { + "epoch": 0.2146933970677416, + "grad_norm": 0.3943275809288025, + "learning_rate": 4.802266768894158e-05, + "loss": 0.2027, + "step": 12037 + }, + { + "epoch": 0.2147112331894553, + "grad_norm": 0.3643236756324768, + "learning_rate": 4.802206094402561e-05, + "loss": 0.2858, + "step": 12038 + }, + { + "epoch": 0.21472906931116897, + "grad_norm": 0.27129387855529785, + "learning_rate": 4.802145410986819e-05, + "loss": 0.2144, + "step": 12039 + }, + { + "epoch": 0.21474690543288266, + "grad_norm": 0.2571626603603363, + "learning_rate": 4.802084718647166e-05, + "loss": 0.201, + "step": 12040 + }, + { + "epoch": 0.21476474155459638, + "grad_norm": 0.2622687816619873, + "learning_rate": 4.802024017383838e-05, + "loss": 0.2093, + "step": 12041 + }, + { + "epoch": 0.21478257767631007, + "grad_norm": 0.2342766672372818, + "learning_rate": 4.8019633071970696e-05, + "loss": 0.1853, + "step": 12042 + }, + { + "epoch": 0.21480041379802375, + "grad_norm": 0.338860422372818, + "learning_rate": 4.801902588087096e-05, + "loss": 0.1861, + "step": 12043 + }, + { + "epoch": 0.21481824991973744, + "grad_norm": 0.3265077471733093, + "learning_rate": 4.801841860054153e-05, + "loss": 0.1804, + "step": 12044 + }, + { + "epoch": 0.21483608604145116, + "grad_norm": 0.23601371049880981, + "learning_rate": 4.801781123098476e-05, + "loss": 0.2038, + "step": 12045 + }, + { + "epoch": 0.21485392216316485, + "grad_norm": 0.2651790380477905, + "learning_rate": 4.8017203772203e-05, + "loss": 0.2151, + "step": 12046 + }, + { + "epoch": 0.21487175828487853, + "grad_norm": 0.26052558422088623, + "learning_rate": 4.8016596224198616e-05, + "loss": 0.2408, + "step": 12047 + }, + { + "epoch": 0.21488959440659222, + "grad_norm": 0.3609578013420105, + "learning_rate": 4.801598858697395e-05, + "loss": 0.2196, + "step": 12048 + }, + { + "epoch": 0.21490743052830594, + "grad_norm": 0.24095414578914642, + "learning_rate": 4.8015380860531366e-05, + "loss": 0.2276, + "step": 12049 + }, + { + "epoch": 0.21492526665001963, + "grad_norm": 0.24156628549098969, + "learning_rate": 4.801477304487321e-05, + "loss": 0.1881, + "step": 12050 + }, + { + "epoch": 0.2149431027717333, + "grad_norm": 0.2123507857322693, + "learning_rate": 4.801416514000186e-05, + "loss": 0.1919, + "step": 12051 + }, + { + "epoch": 0.214960938893447, + "grad_norm": 0.21883635222911835, + "learning_rate": 4.801355714591964e-05, + "loss": 0.2207, + "step": 12052 + }, + { + "epoch": 0.21497877501516072, + "grad_norm": 0.2538836896419525, + "learning_rate": 4.801294906262893e-05, + "loss": 0.2068, + "step": 12053 + }, + { + "epoch": 0.2149966111368744, + "grad_norm": 0.22579586505889893, + "learning_rate": 4.801234089013208e-05, + "loss": 0.1817, + "step": 12054 + }, + { + "epoch": 0.2150144472585881, + "grad_norm": 0.20638296008110046, + "learning_rate": 4.801173262843145e-05, + "loss": 0.1581, + "step": 12055 + }, + { + "epoch": 0.21503228338030178, + "grad_norm": 0.34238219261169434, + "learning_rate": 4.8011124277529394e-05, + "loss": 0.2141, + "step": 12056 + }, + { + "epoch": 0.21505011950201547, + "grad_norm": 0.2788119614124298, + "learning_rate": 4.801051583742827e-05, + "loss": 0.2361, + "step": 12057 + }, + { + "epoch": 0.21506795562372918, + "grad_norm": 0.36585521697998047, + "learning_rate": 4.800990730813045e-05, + "loss": 0.1913, + "step": 12058 + }, + { + "epoch": 0.21508579174544287, + "grad_norm": 0.37219300866127014, + "learning_rate": 4.800929868963827e-05, + "loss": 0.2789, + "step": 12059 + }, + { + "epoch": 0.21510362786715656, + "grad_norm": 0.27912479639053345, + "learning_rate": 4.800868998195411e-05, + "loss": 0.2172, + "step": 12060 + }, + { + "epoch": 0.21512146398887025, + "grad_norm": 0.3401962220668793, + "learning_rate": 4.800808118508032e-05, + "loss": 0.2169, + "step": 12061 + }, + { + "epoch": 0.21513930011058396, + "grad_norm": 0.29472339153289795, + "learning_rate": 4.800747229901925e-05, + "loss": 0.2179, + "step": 12062 + }, + { + "epoch": 0.21515713623229765, + "grad_norm": 0.24006521701812744, + "learning_rate": 4.800686332377329e-05, + "loss": 0.1951, + "step": 12063 + }, + { + "epoch": 0.21517497235401134, + "grad_norm": 0.2039320319890976, + "learning_rate": 4.800625425934476e-05, + "loss": 0.1813, + "step": 12064 + }, + { + "epoch": 0.21519280847572503, + "grad_norm": 0.26818224787712097, + "learning_rate": 4.8005645105736054e-05, + "loss": 0.2396, + "step": 12065 + }, + { + "epoch": 0.21521064459743874, + "grad_norm": 0.3269694149494171, + "learning_rate": 4.8005035862949524e-05, + "loss": 0.1883, + "step": 12066 + }, + { + "epoch": 0.21522848071915243, + "grad_norm": 0.3098810911178589, + "learning_rate": 4.800442653098752e-05, + "loss": 0.2599, + "step": 12067 + }, + { + "epoch": 0.21524631684086612, + "grad_norm": 0.25813785195350647, + "learning_rate": 4.8003817109852424e-05, + "loss": 0.1805, + "step": 12068 + }, + { + "epoch": 0.2152641529625798, + "grad_norm": 0.27266553044319153, + "learning_rate": 4.800320759954658e-05, + "loss": 0.1648, + "step": 12069 + }, + { + "epoch": 0.21528198908429352, + "grad_norm": 0.23762328922748566, + "learning_rate": 4.8002598000072365e-05, + "loss": 0.2227, + "step": 12070 + }, + { + "epoch": 0.2152998252060072, + "grad_norm": 0.23860234022140503, + "learning_rate": 4.800198831143212e-05, + "loss": 0.1918, + "step": 12071 + }, + { + "epoch": 0.2153176613277209, + "grad_norm": 0.19405187666416168, + "learning_rate": 4.800137853362824e-05, + "loss": 0.1624, + "step": 12072 + }, + { + "epoch": 0.2153354974494346, + "grad_norm": 0.29371708631515503, + "learning_rate": 4.800076866666306e-05, + "loss": 0.2006, + "step": 12073 + }, + { + "epoch": 0.2153533335711483, + "grad_norm": 0.20635518431663513, + "learning_rate": 4.8000158710538975e-05, + "loss": 0.1952, + "step": 12074 + }, + { + "epoch": 0.215371169692862, + "grad_norm": 0.19975915551185608, + "learning_rate": 4.799954866525831e-05, + "loss": 0.1702, + "step": 12075 + }, + { + "epoch": 0.21538900581457568, + "grad_norm": 0.23310358822345734, + "learning_rate": 4.799893853082347e-05, + "loss": 0.1834, + "step": 12076 + }, + { + "epoch": 0.21540684193628937, + "grad_norm": 0.2859938442707062, + "learning_rate": 4.799832830723678e-05, + "loss": 0.2155, + "step": 12077 + }, + { + "epoch": 0.21542467805800306, + "grad_norm": 0.2594512104988098, + "learning_rate": 4.799771799450065e-05, + "loss": 0.1832, + "step": 12078 + }, + { + "epoch": 0.21544251417971677, + "grad_norm": 0.2855367362499237, + "learning_rate": 4.79971075926174e-05, + "loss": 0.249, + "step": 12079 + }, + { + "epoch": 0.21546035030143046, + "grad_norm": 0.31071969866752625, + "learning_rate": 4.7996497101589424e-05, + "loss": 0.1791, + "step": 12080 + }, + { + "epoch": 0.21547818642314415, + "grad_norm": 0.36608681082725525, + "learning_rate": 4.7995886521419084e-05, + "loss": 0.2383, + "step": 12081 + }, + { + "epoch": 0.21549602254485783, + "grad_norm": 0.2994576096534729, + "learning_rate": 4.799527585210875e-05, + "loss": 0.1858, + "step": 12082 + }, + { + "epoch": 0.21551385866657155, + "grad_norm": 0.22747214138507843, + "learning_rate": 4.7994665093660784e-05, + "loss": 0.1565, + "step": 12083 + }, + { + "epoch": 0.21553169478828524, + "grad_norm": 0.32047033309936523, + "learning_rate": 4.799405424607755e-05, + "loss": 0.2137, + "step": 12084 + }, + { + "epoch": 0.21554953090999893, + "grad_norm": 0.2919504940509796, + "learning_rate": 4.799344330936142e-05, + "loss": 0.2513, + "step": 12085 + }, + { + "epoch": 0.21556736703171261, + "grad_norm": 0.2635859549045563, + "learning_rate": 4.799283228351476e-05, + "loss": 0.172, + "step": 12086 + }, + { + "epoch": 0.21558520315342633, + "grad_norm": 0.27573275566101074, + "learning_rate": 4.799222116853994e-05, + "loss": 0.2189, + "step": 12087 + }, + { + "epoch": 0.21560303927514002, + "grad_norm": 0.4691026508808136, + "learning_rate": 4.799160996443934e-05, + "loss": 0.2742, + "step": 12088 + }, + { + "epoch": 0.2156208753968537, + "grad_norm": 0.26830360293388367, + "learning_rate": 4.799099867121531e-05, + "loss": 0.2154, + "step": 12089 + }, + { + "epoch": 0.2156387115185674, + "grad_norm": 0.15114668011665344, + "learning_rate": 4.799038728887023e-05, + "loss": 0.1332, + "step": 12090 + }, + { + "epoch": 0.2156565476402811, + "grad_norm": 0.2538500726222992, + "learning_rate": 4.798977581740647e-05, + "loss": 0.2175, + "step": 12091 + }, + { + "epoch": 0.2156743837619948, + "grad_norm": 0.23798954486846924, + "learning_rate": 4.798916425682639e-05, + "loss": 0.2165, + "step": 12092 + }, + { + "epoch": 0.21569221988370849, + "grad_norm": 0.3038810193538666, + "learning_rate": 4.798855260713238e-05, + "loss": 0.2489, + "step": 12093 + }, + { + "epoch": 0.21571005600542217, + "grad_norm": 0.21753878891468048, + "learning_rate": 4.79879408683268e-05, + "loss": 0.1744, + "step": 12094 + }, + { + "epoch": 0.21572789212713586, + "grad_norm": 0.6992783546447754, + "learning_rate": 4.798732904041201e-05, + "loss": 0.1578, + "step": 12095 + }, + { + "epoch": 0.21574572824884958, + "grad_norm": 0.23858462274074554, + "learning_rate": 4.79867171233904e-05, + "loss": 0.1804, + "step": 12096 + }, + { + "epoch": 0.21576356437056327, + "grad_norm": 0.2669230103492737, + "learning_rate": 4.798610511726434e-05, + "loss": 0.1789, + "step": 12097 + }, + { + "epoch": 0.21578140049227695, + "grad_norm": 0.2066022753715515, + "learning_rate": 4.798549302203619e-05, + "loss": 0.1746, + "step": 12098 + }, + { + "epoch": 0.21579923661399064, + "grad_norm": 0.20645281672477722, + "learning_rate": 4.7984880837708335e-05, + "loss": 0.1541, + "step": 12099 + }, + { + "epoch": 0.21581707273570436, + "grad_norm": 0.28157174587249756, + "learning_rate": 4.7984268564283144e-05, + "loss": 0.2116, + "step": 12100 + }, + { + "epoch": 0.21583490885741805, + "grad_norm": 0.4844302535057068, + "learning_rate": 4.798365620176298e-05, + "loss": 0.2843, + "step": 12101 + }, + { + "epoch": 0.21585274497913173, + "grad_norm": 0.26500019431114197, + "learning_rate": 4.7983043750150236e-05, + "loss": 0.1995, + "step": 12102 + }, + { + "epoch": 0.21587058110084542, + "grad_norm": 0.2732619643211365, + "learning_rate": 4.7982431209447275e-05, + "loss": 0.1719, + "step": 12103 + }, + { + "epoch": 0.21588841722255914, + "grad_norm": 0.2765786051750183, + "learning_rate": 4.7981818579656465e-05, + "loss": 0.1721, + "step": 12104 + }, + { + "epoch": 0.21590625334427282, + "grad_norm": 0.3647291660308838, + "learning_rate": 4.79812058607802e-05, + "loss": 0.2315, + "step": 12105 + }, + { + "epoch": 0.2159240894659865, + "grad_norm": 0.2344547063112259, + "learning_rate": 4.798059305282083e-05, + "loss": 0.2106, + "step": 12106 + }, + { + "epoch": 0.2159419255877002, + "grad_norm": 0.3204667568206787, + "learning_rate": 4.797998015578076e-05, + "loss": 0.233, + "step": 12107 + }, + { + "epoch": 0.21595976170941392, + "grad_norm": 0.28090086579322815, + "learning_rate": 4.797936716966234e-05, + "loss": 0.2314, + "step": 12108 + }, + { + "epoch": 0.2159775978311276, + "grad_norm": 0.2808286249637604, + "learning_rate": 4.797875409446796e-05, + "loss": 0.1293, + "step": 12109 + }, + { + "epoch": 0.2159954339528413, + "grad_norm": 0.35246843099594116, + "learning_rate": 4.7978140930199985e-05, + "loss": 0.2188, + "step": 12110 + }, + { + "epoch": 0.21601327007455498, + "grad_norm": 0.35542526841163635, + "learning_rate": 4.797752767686081e-05, + "loss": 0.1862, + "step": 12111 + }, + { + "epoch": 0.2160311061962687, + "grad_norm": 0.40284082293510437, + "learning_rate": 4.797691433445279e-05, + "loss": 0.2124, + "step": 12112 + }, + { + "epoch": 0.21604894231798238, + "grad_norm": 0.22678276896476746, + "learning_rate": 4.7976300902978325e-05, + "loss": 0.175, + "step": 12113 + }, + { + "epoch": 0.21606677843969607, + "grad_norm": 0.2312001883983612, + "learning_rate": 4.797568738243978e-05, + "loss": 0.2215, + "step": 12114 + }, + { + "epoch": 0.21608461456140976, + "grad_norm": 0.17601428925991058, + "learning_rate": 4.797507377283953e-05, + "loss": 0.1576, + "step": 12115 + }, + { + "epoch": 0.21610245068312345, + "grad_norm": 0.3084196150302887, + "learning_rate": 4.7974460074179964e-05, + "loss": 0.2023, + "step": 12116 + }, + { + "epoch": 0.21612028680483716, + "grad_norm": 0.3049376904964447, + "learning_rate": 4.797384628646345e-05, + "loss": 0.2035, + "step": 12117 + }, + { + "epoch": 0.21613812292655085, + "grad_norm": 0.2835913896560669, + "learning_rate": 4.797323240969238e-05, + "loss": 0.2013, + "step": 12118 + }, + { + "epoch": 0.21615595904826454, + "grad_norm": 0.21759581565856934, + "learning_rate": 4.797261844386912e-05, + "loss": 0.1578, + "step": 12119 + }, + { + "epoch": 0.21617379516997823, + "grad_norm": 0.31380537152290344, + "learning_rate": 4.7972004388996064e-05, + "loss": 0.1689, + "step": 12120 + }, + { + "epoch": 0.21619163129169194, + "grad_norm": 0.5979520082473755, + "learning_rate": 4.797139024507558e-05, + "loss": 0.1988, + "step": 12121 + }, + { + "epoch": 0.21620946741340563, + "grad_norm": 0.29887494444847107, + "learning_rate": 4.797077601211005e-05, + "loss": 0.1988, + "step": 12122 + }, + { + "epoch": 0.21622730353511932, + "grad_norm": 0.24623669683933258, + "learning_rate": 4.7970161690101856e-05, + "loss": 0.1765, + "step": 12123 + }, + { + "epoch": 0.216245139656833, + "grad_norm": 0.27386224269866943, + "learning_rate": 4.796954727905339e-05, + "loss": 0.2408, + "step": 12124 + }, + { + "epoch": 0.21626297577854672, + "grad_norm": 0.256830096244812, + "learning_rate": 4.796893277896702e-05, + "loss": 0.1673, + "step": 12125 + }, + { + "epoch": 0.2162808119002604, + "grad_norm": 0.2863677144050598, + "learning_rate": 4.796831818984514e-05, + "loss": 0.2012, + "step": 12126 + }, + { + "epoch": 0.2162986480219741, + "grad_norm": 0.24352280795574188, + "learning_rate": 4.796770351169012e-05, + "loss": 0.2075, + "step": 12127 + }, + { + "epoch": 0.2163164841436878, + "grad_norm": 0.2724657356739044, + "learning_rate": 4.796708874450435e-05, + "loss": 0.1693, + "step": 12128 + }, + { + "epoch": 0.2163343202654015, + "grad_norm": 0.21111701428890228, + "learning_rate": 4.796647388829021e-05, + "loss": 0.1456, + "step": 12129 + }, + { + "epoch": 0.2163521563871152, + "grad_norm": 0.29900088906288147, + "learning_rate": 4.796585894305009e-05, + "loss": 0.1979, + "step": 12130 + }, + { + "epoch": 0.21636999250882888, + "grad_norm": 0.37574273347854614, + "learning_rate": 4.796524390878636e-05, + "loss": 0.2705, + "step": 12131 + }, + { + "epoch": 0.21638782863054257, + "grad_norm": 0.2196299433708191, + "learning_rate": 4.796462878550142e-05, + "loss": 0.2057, + "step": 12132 + }, + { + "epoch": 0.21640566475225628, + "grad_norm": 0.4287925660610199, + "learning_rate": 4.7964013573197643e-05, + "loss": 0.1545, + "step": 12133 + }, + { + "epoch": 0.21642350087396997, + "grad_norm": 0.3357856869697571, + "learning_rate": 4.796339827187742e-05, + "loss": 0.1947, + "step": 12134 + }, + { + "epoch": 0.21644133699568366, + "grad_norm": 0.323546439409256, + "learning_rate": 4.7962782881543135e-05, + "loss": 0.2179, + "step": 12135 + }, + { + "epoch": 0.21645917311739735, + "grad_norm": 0.3660182058811188, + "learning_rate": 4.7962167402197165e-05, + "loss": 0.1956, + "step": 12136 + }, + { + "epoch": 0.21647700923911103, + "grad_norm": 0.5602064728736877, + "learning_rate": 4.796155183384191e-05, + "loss": 0.2135, + "step": 12137 + }, + { + "epoch": 0.21649484536082475, + "grad_norm": 0.23747332394123077, + "learning_rate": 4.796093617647975e-05, + "loss": 0.1801, + "step": 12138 + }, + { + "epoch": 0.21651268148253844, + "grad_norm": 0.30389052629470825, + "learning_rate": 4.796032043011307e-05, + "loss": 0.2015, + "step": 12139 + }, + { + "epoch": 0.21653051760425213, + "grad_norm": 0.29486486315727234, + "learning_rate": 4.795970459474426e-05, + "loss": 0.2369, + "step": 12140 + }, + { + "epoch": 0.21654835372596581, + "grad_norm": 0.24094629287719727, + "learning_rate": 4.7959088670375695e-05, + "loss": 0.1953, + "step": 12141 + }, + { + "epoch": 0.21656618984767953, + "grad_norm": 0.331759512424469, + "learning_rate": 4.795847265700978e-05, + "loss": 0.1681, + "step": 12142 + }, + { + "epoch": 0.21658402596939322, + "grad_norm": 0.22682291269302368, + "learning_rate": 4.7957856554648893e-05, + "loss": 0.1985, + "step": 12143 + }, + { + "epoch": 0.2166018620911069, + "grad_norm": 0.2786453068256378, + "learning_rate": 4.795724036329543e-05, + "loss": 0.2236, + "step": 12144 + }, + { + "epoch": 0.2166196982128206, + "grad_norm": 0.19285887479782104, + "learning_rate": 4.795662408295177e-05, + "loss": 0.174, + "step": 12145 + }, + { + "epoch": 0.2166375343345343, + "grad_norm": 0.27381888031959534, + "learning_rate": 4.795600771362031e-05, + "loss": 0.1749, + "step": 12146 + }, + { + "epoch": 0.216655370456248, + "grad_norm": 0.26288434863090515, + "learning_rate": 4.795539125530343e-05, + "loss": 0.196, + "step": 12147 + }, + { + "epoch": 0.21667320657796169, + "grad_norm": 0.2533511817455292, + "learning_rate": 4.795477470800353e-05, + "loss": 0.1775, + "step": 12148 + }, + { + "epoch": 0.21669104269967537, + "grad_norm": 0.3898746073246002, + "learning_rate": 4.795415807172299e-05, + "loss": 0.2122, + "step": 12149 + }, + { + "epoch": 0.2167088788213891, + "grad_norm": 0.3281749188899994, + "learning_rate": 4.7953541346464204e-05, + "loss": 0.2089, + "step": 12150 + }, + { + "epoch": 0.21672671494310278, + "grad_norm": 0.3043719232082367, + "learning_rate": 4.795292453222957e-05, + "loss": 0.1722, + "step": 12151 + }, + { + "epoch": 0.21674455106481647, + "grad_norm": 0.32433322072029114, + "learning_rate": 4.7952307629021463e-05, + "loss": 0.197, + "step": 12152 + }, + { + "epoch": 0.21676238718653015, + "grad_norm": 0.2759004533290863, + "learning_rate": 4.7951690636842294e-05, + "loss": 0.2065, + "step": 12153 + }, + { + "epoch": 0.21678022330824387, + "grad_norm": 0.48696210980415344, + "learning_rate": 4.795107355569445e-05, + "loss": 0.1324, + "step": 12154 + }, + { + "epoch": 0.21679805942995756, + "grad_norm": 0.3025630712509155, + "learning_rate": 4.79504563855803e-05, + "loss": 0.2084, + "step": 12155 + }, + { + "epoch": 0.21681589555167124, + "grad_norm": 0.236316978931427, + "learning_rate": 4.794983912650227e-05, + "loss": 0.1819, + "step": 12156 + }, + { + "epoch": 0.21683373167338493, + "grad_norm": 0.2562558948993683, + "learning_rate": 4.7949221778462726e-05, + "loss": 0.1969, + "step": 12157 + }, + { + "epoch": 0.21685156779509862, + "grad_norm": 0.20902837812900543, + "learning_rate": 4.7948604341464075e-05, + "loss": 0.1526, + "step": 12158 + }, + { + "epoch": 0.21686940391681234, + "grad_norm": 0.2542819678783417, + "learning_rate": 4.7947986815508716e-05, + "loss": 0.1609, + "step": 12159 + }, + { + "epoch": 0.21688724003852602, + "grad_norm": 0.2710743248462677, + "learning_rate": 4.794736920059902e-05, + "loss": 0.1999, + "step": 12160 + }, + { + "epoch": 0.2169050761602397, + "grad_norm": 0.4025109112262726, + "learning_rate": 4.794675149673741e-05, + "loss": 0.1791, + "step": 12161 + }, + { + "epoch": 0.2169229122819534, + "grad_norm": 0.26331812143325806, + "learning_rate": 4.794613370392625e-05, + "loss": 0.1838, + "step": 12162 + }, + { + "epoch": 0.21694074840366712, + "grad_norm": 0.4019258916378021, + "learning_rate": 4.794551582216796e-05, + "loss": 0.2056, + "step": 12163 + }, + { + "epoch": 0.2169585845253808, + "grad_norm": 0.31794071197509766, + "learning_rate": 4.794489785146493e-05, + "loss": 0.2645, + "step": 12164 + }, + { + "epoch": 0.2169764206470945, + "grad_norm": 0.29514479637145996, + "learning_rate": 4.794427979181955e-05, + "loss": 0.1921, + "step": 12165 + }, + { + "epoch": 0.21699425676880818, + "grad_norm": 0.26694512367248535, + "learning_rate": 4.7943661643234204e-05, + "loss": 0.2074, + "step": 12166 + }, + { + "epoch": 0.2170120928905219, + "grad_norm": 0.28734442591667175, + "learning_rate": 4.794304340571131e-05, + "loss": 0.2038, + "step": 12167 + }, + { + "epoch": 0.21702992901223558, + "grad_norm": 0.38088735938072205, + "learning_rate": 4.794242507925325e-05, + "loss": 0.1931, + "step": 12168 + }, + { + "epoch": 0.21704776513394927, + "grad_norm": 0.24369138479232788, + "learning_rate": 4.794180666386243e-05, + "loss": 0.1778, + "step": 12169 + }, + { + "epoch": 0.21706560125566296, + "grad_norm": 0.28930190205574036, + "learning_rate": 4.794118815954125e-05, + "loss": 0.2593, + "step": 12170 + }, + { + "epoch": 0.21708343737737668, + "grad_norm": 0.28787702322006226, + "learning_rate": 4.794056956629209e-05, + "loss": 0.1811, + "step": 12171 + }, + { + "epoch": 0.21710127349909036, + "grad_norm": 0.39515286684036255, + "learning_rate": 4.7939950884117366e-05, + "loss": 0.1736, + "step": 12172 + }, + { + "epoch": 0.21711910962080405, + "grad_norm": 0.3337680399417877, + "learning_rate": 4.793933211301947e-05, + "loss": 0.2176, + "step": 12173 + }, + { + "epoch": 0.21713694574251774, + "grad_norm": 0.24080854654312134, + "learning_rate": 4.79387132530008e-05, + "loss": 0.2172, + "step": 12174 + }, + { + "epoch": 0.21715478186423146, + "grad_norm": 0.28199151158332825, + "learning_rate": 4.793809430406375e-05, + "loss": 0.1989, + "step": 12175 + }, + { + "epoch": 0.21717261798594514, + "grad_norm": 0.37244999408721924, + "learning_rate": 4.7937475266210724e-05, + "loss": 0.2353, + "step": 12176 + }, + { + "epoch": 0.21719045410765883, + "grad_norm": 0.2573454976081848, + "learning_rate": 4.7936856139444125e-05, + "loss": 0.1933, + "step": 12177 + }, + { + "epoch": 0.21720829022937252, + "grad_norm": 0.2606930136680603, + "learning_rate": 4.793623692376635e-05, + "loss": 0.2353, + "step": 12178 + }, + { + "epoch": 0.2172261263510862, + "grad_norm": 0.2101358026266098, + "learning_rate": 4.793561761917979e-05, + "loss": 0.1799, + "step": 12179 + }, + { + "epoch": 0.21724396247279992, + "grad_norm": 0.26700419187545776, + "learning_rate": 4.793499822568687e-05, + "loss": 0.1509, + "step": 12180 + }, + { + "epoch": 0.2172617985945136, + "grad_norm": 0.3781948387622833, + "learning_rate": 4.7934378743289964e-05, + "loss": 0.1807, + "step": 12181 + }, + { + "epoch": 0.2172796347162273, + "grad_norm": 0.2628592252731323, + "learning_rate": 4.793375917199149e-05, + "loss": 0.1672, + "step": 12182 + }, + { + "epoch": 0.217297470837941, + "grad_norm": 0.3336634635925293, + "learning_rate": 4.793313951179383e-05, + "loss": 0.2129, + "step": 12183 + }, + { + "epoch": 0.2173153069596547, + "grad_norm": 0.4094424545764923, + "learning_rate": 4.793251976269942e-05, + "loss": 0.1769, + "step": 12184 + }, + { + "epoch": 0.2173331430813684, + "grad_norm": 0.3108433187007904, + "learning_rate": 4.793189992471063e-05, + "loss": 0.1407, + "step": 12185 + }, + { + "epoch": 0.21735097920308208, + "grad_norm": 0.29962775111198425, + "learning_rate": 4.793127999782988e-05, + "loss": 0.2472, + "step": 12186 + }, + { + "epoch": 0.21736881532479577, + "grad_norm": 0.2748205065727234, + "learning_rate": 4.793065998205957e-05, + "loss": 0.1938, + "step": 12187 + }, + { + "epoch": 0.21738665144650948, + "grad_norm": 0.3648146986961365, + "learning_rate": 4.79300398774021e-05, + "loss": 0.2381, + "step": 12188 + }, + { + "epoch": 0.21740448756822317, + "grad_norm": 0.29264548420906067, + "learning_rate": 4.792941968385988e-05, + "loss": 0.1776, + "step": 12189 + }, + { + "epoch": 0.21742232368993686, + "grad_norm": 0.30888015031814575, + "learning_rate": 4.7928799401435305e-05, + "loss": 0.2233, + "step": 12190 + }, + { + "epoch": 0.21744015981165055, + "grad_norm": 0.23494990170001984, + "learning_rate": 4.792817903013078e-05, + "loss": 0.2076, + "step": 12191 + }, + { + "epoch": 0.21745799593336426, + "grad_norm": 0.3458961844444275, + "learning_rate": 4.792755856994872e-05, + "loss": 0.2022, + "step": 12192 + }, + { + "epoch": 0.21747583205507795, + "grad_norm": 0.1945781409740448, + "learning_rate": 4.7926938020891526e-05, + "loss": 0.1732, + "step": 12193 + }, + { + "epoch": 0.21749366817679164, + "grad_norm": 0.3091394603252411, + "learning_rate": 4.7926317382961595e-05, + "loss": 0.2258, + "step": 12194 + }, + { + "epoch": 0.21751150429850533, + "grad_norm": 0.456118106842041, + "learning_rate": 4.792569665616135e-05, + "loss": 0.1607, + "step": 12195 + }, + { + "epoch": 0.21752934042021901, + "grad_norm": 0.3005441129207611, + "learning_rate": 4.792507584049317e-05, + "loss": 0.2279, + "step": 12196 + }, + { + "epoch": 0.21754717654193273, + "grad_norm": 0.262269526720047, + "learning_rate": 4.792445493595949e-05, + "loss": 0.1413, + "step": 12197 + }, + { + "epoch": 0.21756501266364642, + "grad_norm": 0.30201366543769836, + "learning_rate": 4.79238339425627e-05, + "loss": 0.2252, + "step": 12198 + }, + { + "epoch": 0.2175828487853601, + "grad_norm": 0.23367714881896973, + "learning_rate": 4.7923212860305223e-05, + "loss": 0.2042, + "step": 12199 + }, + { + "epoch": 0.2176006849070738, + "grad_norm": 0.2498628795146942, + "learning_rate": 4.7922591689189445e-05, + "loss": 0.1889, + "step": 12200 + }, + { + "epoch": 0.2176185210287875, + "grad_norm": 0.3375679552555084, + "learning_rate": 4.792197042921778e-05, + "loss": 0.1977, + "step": 12201 + }, + { + "epoch": 0.2176363571505012, + "grad_norm": 0.2772034704685211, + "learning_rate": 4.7921349080392655e-05, + "loss": 0.1729, + "step": 12202 + }, + { + "epoch": 0.21765419327221489, + "grad_norm": 0.2688341438770294, + "learning_rate": 4.792072764271645e-05, + "loss": 0.2039, + "step": 12203 + }, + { + "epoch": 0.21767202939392857, + "grad_norm": 0.2971316874027252, + "learning_rate": 4.79201061161916e-05, + "loss": 0.2044, + "step": 12204 + }, + { + "epoch": 0.2176898655156423, + "grad_norm": 0.28549161553382874, + "learning_rate": 4.7919484500820485e-05, + "loss": 0.1834, + "step": 12205 + }, + { + "epoch": 0.21770770163735598, + "grad_norm": 0.23461858928203583, + "learning_rate": 4.791886279660555e-05, + "loss": 0.1859, + "step": 12206 + }, + { + "epoch": 0.21772553775906967, + "grad_norm": 0.26371443271636963, + "learning_rate": 4.791824100354918e-05, + "loss": 0.1898, + "step": 12207 + }, + { + "epoch": 0.21774337388078335, + "grad_norm": 0.3226472735404968, + "learning_rate": 4.791761912165379e-05, + "loss": 0.1528, + "step": 12208 + }, + { + "epoch": 0.21776121000249707, + "grad_norm": 0.3185974359512329, + "learning_rate": 4.79169971509218e-05, + "loss": 0.218, + "step": 12209 + }, + { + "epoch": 0.21777904612421076, + "grad_norm": 0.24560217559337616, + "learning_rate": 4.7916375091355606e-05, + "loss": 0.166, + "step": 12210 + }, + { + "epoch": 0.21779688224592444, + "grad_norm": 0.32225093245506287, + "learning_rate": 4.791575294295763e-05, + "loss": 0.2163, + "step": 12211 + }, + { + "epoch": 0.21781471836763813, + "grad_norm": 0.28764331340789795, + "learning_rate": 4.791513070573028e-05, + "loss": 0.2047, + "step": 12212 + }, + { + "epoch": 0.21783255448935185, + "grad_norm": 0.33743715286254883, + "learning_rate": 4.791450837967597e-05, + "loss": 0.2239, + "step": 12213 + }, + { + "epoch": 0.21785039061106554, + "grad_norm": 0.24375684559345245, + "learning_rate": 4.791388596479711e-05, + "loss": 0.1919, + "step": 12214 + }, + { + "epoch": 0.21786822673277922, + "grad_norm": 0.28009992837905884, + "learning_rate": 4.791326346109611e-05, + "loss": 0.1917, + "step": 12215 + }, + { + "epoch": 0.2178860628544929, + "grad_norm": 0.23669345676898956, + "learning_rate": 4.7912640868575396e-05, + "loss": 0.1318, + "step": 12216 + }, + { + "epoch": 0.2179038989762066, + "grad_norm": 0.23772671818733215, + "learning_rate": 4.7912018187237365e-05, + "loss": 0.1475, + "step": 12217 + }, + { + "epoch": 0.21792173509792032, + "grad_norm": 0.2695966958999634, + "learning_rate": 4.791139541708444e-05, + "loss": 0.2337, + "step": 12218 + }, + { + "epoch": 0.217939571219634, + "grad_norm": 0.3321371376514435, + "learning_rate": 4.791077255811904e-05, + "loss": 0.2339, + "step": 12219 + }, + { + "epoch": 0.2179574073413477, + "grad_norm": 0.334119975566864, + "learning_rate": 4.7910149610343564e-05, + "loss": 0.2019, + "step": 12220 + }, + { + "epoch": 0.21797524346306138, + "grad_norm": 0.8641327619552612, + "learning_rate": 4.790952657376043e-05, + "loss": 0.2221, + "step": 12221 + }, + { + "epoch": 0.2179930795847751, + "grad_norm": 0.2770143747329712, + "learning_rate": 4.7908903448372065e-05, + "loss": 0.1945, + "step": 12222 + }, + { + "epoch": 0.21801091570648878, + "grad_norm": 0.2931188642978668, + "learning_rate": 4.790828023418088e-05, + "loss": 0.1901, + "step": 12223 + }, + { + "epoch": 0.21802875182820247, + "grad_norm": 0.2954394817352295, + "learning_rate": 4.790765693118929e-05, + "loss": 0.2452, + "step": 12224 + }, + { + "epoch": 0.21804658794991616, + "grad_norm": 0.3034812808036804, + "learning_rate": 4.7907033539399706e-05, + "loss": 0.1653, + "step": 12225 + }, + { + "epoch": 0.21806442407162988, + "grad_norm": 0.2628895938396454, + "learning_rate": 4.790641005881455e-05, + "loss": 0.1726, + "step": 12226 + }, + { + "epoch": 0.21808226019334356, + "grad_norm": 0.21256977319717407, + "learning_rate": 4.790578648943623e-05, + "loss": 0.1834, + "step": 12227 + }, + { + "epoch": 0.21810009631505725, + "grad_norm": 0.2179006040096283, + "learning_rate": 4.7905162831267183e-05, + "loss": 0.1966, + "step": 12228 + }, + { + "epoch": 0.21811793243677094, + "grad_norm": 0.2746255397796631, + "learning_rate": 4.79045390843098e-05, + "loss": 0.1546, + "step": 12229 + }, + { + "epoch": 0.21813576855848466, + "grad_norm": 0.29825106263160706, + "learning_rate": 4.790391524856652e-05, + "loss": 0.1767, + "step": 12230 + }, + { + "epoch": 0.21815360468019834, + "grad_norm": 0.31748199462890625, + "learning_rate": 4.790329132403975e-05, + "loss": 0.231, + "step": 12231 + }, + { + "epoch": 0.21817144080191203, + "grad_norm": 0.35434678196907043, + "learning_rate": 4.790266731073192e-05, + "loss": 0.2346, + "step": 12232 + }, + { + "epoch": 0.21818927692362572, + "grad_norm": 0.23999528586864471, + "learning_rate": 4.790204320864544e-05, + "loss": 0.1707, + "step": 12233 + }, + { + "epoch": 0.21820711304533943, + "grad_norm": 0.22765415906906128, + "learning_rate": 4.7901419017782725e-05, + "loss": 0.1612, + "step": 12234 + }, + { + "epoch": 0.21822494916705312, + "grad_norm": 0.3081587851047516, + "learning_rate": 4.7900794738146195e-05, + "loss": 0.1684, + "step": 12235 + }, + { + "epoch": 0.2182427852887668, + "grad_norm": 0.2984144389629364, + "learning_rate": 4.790017036973828e-05, + "loss": 0.2263, + "step": 12236 + }, + { + "epoch": 0.2182606214104805, + "grad_norm": 0.32591012120246887, + "learning_rate": 4.78995459125614e-05, + "loss": 0.2128, + "step": 12237 + }, + { + "epoch": 0.2182784575321942, + "grad_norm": 0.29073888063430786, + "learning_rate": 4.7898921366617964e-05, + "loss": 0.197, + "step": 12238 + }, + { + "epoch": 0.2182962936539079, + "grad_norm": 0.42854738235473633, + "learning_rate": 4.7898296731910407e-05, + "loss": 0.2, + "step": 12239 + }, + { + "epoch": 0.2183141297756216, + "grad_norm": 0.287572979927063, + "learning_rate": 4.789767200844114e-05, + "loss": 0.17, + "step": 12240 + }, + { + "epoch": 0.21833196589733528, + "grad_norm": 0.3488253355026245, + "learning_rate": 4.789704719621259e-05, + "loss": 0.2905, + "step": 12241 + }, + { + "epoch": 0.21834980201904897, + "grad_norm": 0.3265188932418823, + "learning_rate": 4.789642229522717e-05, + "loss": 0.1688, + "step": 12242 + }, + { + "epoch": 0.21836763814076268, + "grad_norm": 0.33907297253608704, + "learning_rate": 4.789579730548731e-05, + "loss": 0.2235, + "step": 12243 + }, + { + "epoch": 0.21838547426247637, + "grad_norm": 0.2541789710521698, + "learning_rate": 4.7895172226995436e-05, + "loss": 0.2259, + "step": 12244 + }, + { + "epoch": 0.21840331038419006, + "grad_norm": 0.4170028269290924, + "learning_rate": 4.789454705975397e-05, + "loss": 0.2484, + "step": 12245 + }, + { + "epoch": 0.21842114650590375, + "grad_norm": 0.3981008529663086, + "learning_rate": 4.789392180376532e-05, + "loss": 0.1641, + "step": 12246 + }, + { + "epoch": 0.21843898262761746, + "grad_norm": 0.33615782856941223, + "learning_rate": 4.7893296459031935e-05, + "loss": 0.2295, + "step": 12247 + }, + { + "epoch": 0.21845681874933115, + "grad_norm": 0.27407175302505493, + "learning_rate": 4.7892671025556214e-05, + "loss": 0.2229, + "step": 12248 + }, + { + "epoch": 0.21847465487104484, + "grad_norm": 0.2900344431400299, + "learning_rate": 4.78920455033406e-05, + "loss": 0.2087, + "step": 12249 + }, + { + "epoch": 0.21849249099275853, + "grad_norm": 0.4448204040527344, + "learning_rate": 4.7891419892387515e-05, + "loss": 0.225, + "step": 12250 + }, + { + "epoch": 0.21851032711447224, + "grad_norm": 0.27972525358200073, + "learning_rate": 4.7890794192699375e-05, + "loss": 0.2287, + "step": 12251 + }, + { + "epoch": 0.21852816323618593, + "grad_norm": 0.28016993403434753, + "learning_rate": 4.7890168404278604e-05, + "loss": 0.2018, + "step": 12252 + }, + { + "epoch": 0.21854599935789962, + "grad_norm": 0.2946826219558716, + "learning_rate": 4.788954252712764e-05, + "loss": 0.1597, + "step": 12253 + }, + { + "epoch": 0.2185638354796133, + "grad_norm": 0.2103988528251648, + "learning_rate": 4.788891656124891e-05, + "loss": 0.1728, + "step": 12254 + }, + { + "epoch": 0.21858167160132702, + "grad_norm": 0.2620214521884918, + "learning_rate": 4.788829050664483e-05, + "loss": 0.1533, + "step": 12255 + }, + { + "epoch": 0.2185995077230407, + "grad_norm": 0.29502132534980774, + "learning_rate": 4.788766436331782e-05, + "loss": 0.2134, + "step": 12256 + }, + { + "epoch": 0.2186173438447544, + "grad_norm": 0.3599487841129303, + "learning_rate": 4.7887038131270335e-05, + "loss": 0.1665, + "step": 12257 + }, + { + "epoch": 0.21863517996646809, + "grad_norm": 0.23317238688468933, + "learning_rate": 4.788641181050477e-05, + "loss": 0.1911, + "step": 12258 + }, + { + "epoch": 0.21865301608818177, + "grad_norm": 0.1978743076324463, + "learning_rate": 4.7885785401023575e-05, + "loss": 0.1875, + "step": 12259 + }, + { + "epoch": 0.2186708522098955, + "grad_norm": 0.3456242084503174, + "learning_rate": 4.7885158902829175e-05, + "loss": 0.2045, + "step": 12260 + }, + { + "epoch": 0.21868868833160918, + "grad_norm": 0.3013564646244049, + "learning_rate": 4.788453231592399e-05, + "loss": 0.1876, + "step": 12261 + }, + { + "epoch": 0.21870652445332286, + "grad_norm": 0.3238040506839752, + "learning_rate": 4.7883905640310455e-05, + "loss": 0.2449, + "step": 12262 + }, + { + "epoch": 0.21872436057503655, + "grad_norm": 0.2792685925960541, + "learning_rate": 4.7883278875990994e-05, + "loss": 0.197, + "step": 12263 + }, + { + "epoch": 0.21874219669675027, + "grad_norm": 0.24670548737049103, + "learning_rate": 4.788265202296805e-05, + "loss": 0.2046, + "step": 12264 + }, + { + "epoch": 0.21876003281846396, + "grad_norm": 0.2170332372188568, + "learning_rate": 4.788202508124403e-05, + "loss": 0.1863, + "step": 12265 + }, + { + "epoch": 0.21877786894017764, + "grad_norm": 0.2853853106498718, + "learning_rate": 4.788139805082139e-05, + "loss": 0.1468, + "step": 12266 + }, + { + "epoch": 0.21879570506189133, + "grad_norm": 0.28167521953582764, + "learning_rate": 4.788077093170253e-05, + "loss": 0.1929, + "step": 12267 + }, + { + "epoch": 0.21881354118360505, + "grad_norm": 0.26518744230270386, + "learning_rate": 4.788014372388992e-05, + "loss": 0.1677, + "step": 12268 + }, + { + "epoch": 0.21883137730531874, + "grad_norm": 0.2601465582847595, + "learning_rate": 4.787951642738596e-05, + "loss": 0.2303, + "step": 12269 + }, + { + "epoch": 0.21884921342703242, + "grad_norm": 0.2647935152053833, + "learning_rate": 4.78788890421931e-05, + "loss": 0.1886, + "step": 12270 + }, + { + "epoch": 0.2188670495487461, + "grad_norm": 0.3722890317440033, + "learning_rate": 4.787826156831375e-05, + "loss": 0.2433, + "step": 12271 + }, + { + "epoch": 0.21888488567045983, + "grad_norm": 0.41759246587753296, + "learning_rate": 4.7877634005750375e-05, + "loss": 0.193, + "step": 12272 + }, + { + "epoch": 0.21890272179217352, + "grad_norm": 0.23161475360393524, + "learning_rate": 4.787700635450537e-05, + "loss": 0.1773, + "step": 12273 + }, + { + "epoch": 0.2189205579138872, + "grad_norm": 0.3327459990978241, + "learning_rate": 4.7876378614581195e-05, + "loss": 0.231, + "step": 12274 + }, + { + "epoch": 0.2189383940356009, + "grad_norm": 0.2902162969112396, + "learning_rate": 4.787575078598028e-05, + "loss": 0.1908, + "step": 12275 + }, + { + "epoch": 0.21895623015731458, + "grad_norm": 0.2756287157535553, + "learning_rate": 4.787512286870505e-05, + "loss": 0.2331, + "step": 12276 + }, + { + "epoch": 0.2189740662790283, + "grad_norm": 0.3340472877025604, + "learning_rate": 4.787449486275794e-05, + "loss": 0.2683, + "step": 12277 + }, + { + "epoch": 0.21899190240074198, + "grad_norm": 0.2721349596977234, + "learning_rate": 4.787386676814139e-05, + "loss": 0.1819, + "step": 12278 + }, + { + "epoch": 0.21900973852245567, + "grad_norm": 0.3155156373977661, + "learning_rate": 4.787323858485783e-05, + "loss": 0.199, + "step": 12279 + }, + { + "epoch": 0.21902757464416936, + "grad_norm": 0.3061648905277252, + "learning_rate": 4.7872610312909706e-05, + "loss": 0.2095, + "step": 12280 + }, + { + "epoch": 0.21904541076588308, + "grad_norm": 0.35271456837654114, + "learning_rate": 4.787198195229943e-05, + "loss": 0.2197, + "step": 12281 + }, + { + "epoch": 0.21906324688759676, + "grad_norm": 0.28183358907699585, + "learning_rate": 4.787135350302946e-05, + "loss": 0.2137, + "step": 12282 + }, + { + "epoch": 0.21908108300931045, + "grad_norm": 0.3016718924045563, + "learning_rate": 4.787072496510223e-05, + "loss": 0.2259, + "step": 12283 + }, + { + "epoch": 0.21909891913102414, + "grad_norm": 0.19393084943294525, + "learning_rate": 4.787009633852016e-05, + "loss": 0.1584, + "step": 12284 + }, + { + "epoch": 0.21911675525273785, + "grad_norm": 0.2776353061199188, + "learning_rate": 4.78694676232857e-05, + "loss": 0.2061, + "step": 12285 + }, + { + "epoch": 0.21913459137445154, + "grad_norm": 0.26373741030693054, + "learning_rate": 4.786883881940129e-05, + "loss": 0.2597, + "step": 12286 + }, + { + "epoch": 0.21915242749616523, + "grad_norm": 0.24344190955162048, + "learning_rate": 4.7868209926869355e-05, + "loss": 0.1913, + "step": 12287 + }, + { + "epoch": 0.21917026361787892, + "grad_norm": 0.22540917992591858, + "learning_rate": 4.786758094569235e-05, + "loss": 0.1767, + "step": 12288 + }, + { + "epoch": 0.21918809973959263, + "grad_norm": 0.1850290447473526, + "learning_rate": 4.78669518758727e-05, + "loss": 0.1381, + "step": 12289 + }, + { + "epoch": 0.21920593586130632, + "grad_norm": 0.30858656764030457, + "learning_rate": 4.786632271741284e-05, + "loss": 0.1993, + "step": 12290 + }, + { + "epoch": 0.21922377198302, + "grad_norm": 0.253150075674057, + "learning_rate": 4.786569347031522e-05, + "loss": 0.1938, + "step": 12291 + }, + { + "epoch": 0.2192416081047337, + "grad_norm": 0.23767095804214478, + "learning_rate": 4.786506413458227e-05, + "loss": 0.1879, + "step": 12292 + }, + { + "epoch": 0.21925944422644741, + "grad_norm": 0.3637741208076477, + "learning_rate": 4.786443471021644e-05, + "loss": 0.1946, + "step": 12293 + }, + { + "epoch": 0.2192772803481611, + "grad_norm": 0.27433422207832336, + "learning_rate": 4.786380519722016e-05, + "loss": 0.2105, + "step": 12294 + }, + { + "epoch": 0.2192951164698748, + "grad_norm": 0.22182419896125793, + "learning_rate": 4.7863175595595877e-05, + "loss": 0.136, + "step": 12295 + }, + { + "epoch": 0.21931295259158848, + "grad_norm": 0.25109902024269104, + "learning_rate": 4.7862545905346024e-05, + "loss": 0.1871, + "step": 12296 + }, + { + "epoch": 0.21933078871330217, + "grad_norm": 0.2733488380908966, + "learning_rate": 4.786191612647305e-05, + "loss": 0.1789, + "step": 12297 + }, + { + "epoch": 0.21934862483501588, + "grad_norm": 0.2835102677345276, + "learning_rate": 4.786128625897939e-05, + "loss": 0.2043, + "step": 12298 + }, + { + "epoch": 0.21936646095672957, + "grad_norm": 0.31644755601882935, + "learning_rate": 4.786065630286749e-05, + "loss": 0.2068, + "step": 12299 + }, + { + "epoch": 0.21938429707844326, + "grad_norm": 0.2616254985332489, + "learning_rate": 4.786002625813979e-05, + "loss": 0.2211, + "step": 12300 + }, + { + "epoch": 0.21940213320015695, + "grad_norm": 0.3720972239971161, + "learning_rate": 4.785939612479874e-05, + "loss": 0.1697, + "step": 12301 + }, + { + "epoch": 0.21941996932187066, + "grad_norm": 0.28673508763313293, + "learning_rate": 4.785876590284677e-05, + "loss": 0.2045, + "step": 12302 + }, + { + "epoch": 0.21943780544358435, + "grad_norm": 0.31360381841659546, + "learning_rate": 4.785813559228632e-05, + "loss": 0.1938, + "step": 12303 + }, + { + "epoch": 0.21945564156529804, + "grad_norm": 0.21911442279815674, + "learning_rate": 4.7857505193119854e-05, + "loss": 0.1566, + "step": 12304 + }, + { + "epoch": 0.21947347768701173, + "grad_norm": 0.21743826568126678, + "learning_rate": 4.7856874705349795e-05, + "loss": 0.1601, + "step": 12305 + }, + { + "epoch": 0.21949131380872544, + "grad_norm": 0.2897314131259918, + "learning_rate": 4.78562441289786e-05, + "loss": 0.2092, + "step": 12306 + }, + { + "epoch": 0.21950914993043913, + "grad_norm": 0.22341065108776093, + "learning_rate": 4.7855613464008706e-05, + "loss": 0.1722, + "step": 12307 + }, + { + "epoch": 0.21952698605215282, + "grad_norm": 0.2659430503845215, + "learning_rate": 4.785498271044256e-05, + "loss": 0.1837, + "step": 12308 + }, + { + "epoch": 0.2195448221738665, + "grad_norm": 0.22382672131061554, + "learning_rate": 4.785435186828261e-05, + "loss": 0.1463, + "step": 12309 + }, + { + "epoch": 0.21956265829558022, + "grad_norm": 0.39409008622169495, + "learning_rate": 4.7853720937531296e-05, + "loss": 0.2372, + "step": 12310 + }, + { + "epoch": 0.2195804944172939, + "grad_norm": 0.2905671000480652, + "learning_rate": 4.7853089918191075e-05, + "loss": 0.2056, + "step": 12311 + }, + { + "epoch": 0.2195983305390076, + "grad_norm": 0.2422112375497818, + "learning_rate": 4.785245881026437e-05, + "loss": 0.1488, + "step": 12312 + }, + { + "epoch": 0.21961616666072128, + "grad_norm": 0.23518626391887665, + "learning_rate": 4.785182761375365e-05, + "loss": 0.1402, + "step": 12313 + }, + { + "epoch": 0.219634002782435, + "grad_norm": 0.2504526674747467, + "learning_rate": 4.785119632866135e-05, + "loss": 0.1965, + "step": 12314 + }, + { + "epoch": 0.2196518389041487, + "grad_norm": 0.31703639030456543, + "learning_rate": 4.785056495498992e-05, + "loss": 0.2529, + "step": 12315 + }, + { + "epoch": 0.21966967502586238, + "grad_norm": 0.2896729111671448, + "learning_rate": 4.784993349274181e-05, + "loss": 0.1814, + "step": 12316 + }, + { + "epoch": 0.21968751114757606, + "grad_norm": 0.2588949203491211, + "learning_rate": 4.784930194191947e-05, + "loss": 0.2313, + "step": 12317 + }, + { + "epoch": 0.21970534726928975, + "grad_norm": 0.27120110392570496, + "learning_rate": 4.784867030252533e-05, + "loss": 0.212, + "step": 12318 + }, + { + "epoch": 0.21972318339100347, + "grad_norm": 0.3410847783088684, + "learning_rate": 4.7848038574561866e-05, + "loss": 0.2028, + "step": 12319 + }, + { + "epoch": 0.21974101951271716, + "grad_norm": 0.2402489185333252, + "learning_rate": 4.784740675803151e-05, + "loss": 0.1879, + "step": 12320 + }, + { + "epoch": 0.21975885563443084, + "grad_norm": 0.2496604025363922, + "learning_rate": 4.784677485293671e-05, + "loss": 0.185, + "step": 12321 + }, + { + "epoch": 0.21977669175614453, + "grad_norm": 0.3286452889442444, + "learning_rate": 4.7846142859279916e-05, + "loss": 0.2043, + "step": 12322 + }, + { + "epoch": 0.21979452787785825, + "grad_norm": 0.3150593638420105, + "learning_rate": 4.7845510777063596e-05, + "loss": 0.188, + "step": 12323 + }, + { + "epoch": 0.21981236399957194, + "grad_norm": 0.3060225248336792, + "learning_rate": 4.784487860629018e-05, + "loss": 0.1816, + "step": 12324 + }, + { + "epoch": 0.21983020012128562, + "grad_norm": 0.20153634250164032, + "learning_rate": 4.784424634696212e-05, + "loss": 0.166, + "step": 12325 + }, + { + "epoch": 0.2198480362429993, + "grad_norm": 0.2629064619541168, + "learning_rate": 4.784361399908187e-05, + "loss": 0.2131, + "step": 12326 + }, + { + "epoch": 0.21986587236471303, + "grad_norm": 0.35216501355171204, + "learning_rate": 4.7842981562651885e-05, + "loss": 0.1888, + "step": 12327 + }, + { + "epoch": 0.21988370848642672, + "grad_norm": 0.3123012185096741, + "learning_rate": 4.784234903767461e-05, + "loss": 0.1586, + "step": 12328 + }, + { + "epoch": 0.2199015446081404, + "grad_norm": 0.23378488421440125, + "learning_rate": 4.7841716424152504e-05, + "loss": 0.179, + "step": 12329 + }, + { + "epoch": 0.2199193807298541, + "grad_norm": 0.27993661165237427, + "learning_rate": 4.784108372208802e-05, + "loss": 0.2027, + "step": 12330 + }, + { + "epoch": 0.2199372168515678, + "grad_norm": 0.2942934036254883, + "learning_rate": 4.7840450931483596e-05, + "loss": 0.237, + "step": 12331 + }, + { + "epoch": 0.2199550529732815, + "grad_norm": 0.24330751597881317, + "learning_rate": 4.783981805234171e-05, + "loss": 0.1687, + "step": 12332 + }, + { + "epoch": 0.21997288909499518, + "grad_norm": 0.2552853524684906, + "learning_rate": 4.7839185084664785e-05, + "loss": 0.1772, + "step": 12333 + }, + { + "epoch": 0.21999072521670887, + "grad_norm": 0.2882727086544037, + "learning_rate": 4.7838552028455294e-05, + "loss": 0.1679, + "step": 12334 + }, + { + "epoch": 0.2200085613384226, + "grad_norm": 0.37794631719589233, + "learning_rate": 4.7837918883715695e-05, + "loss": 0.1799, + "step": 12335 + }, + { + "epoch": 0.22002639746013627, + "grad_norm": 0.3529682457447052, + "learning_rate": 4.7837285650448434e-05, + "loss": 0.1801, + "step": 12336 + }, + { + "epoch": 0.22004423358184996, + "grad_norm": 0.34412533044815063, + "learning_rate": 4.7836652328655964e-05, + "loss": 0.1926, + "step": 12337 + }, + { + "epoch": 0.22006206970356365, + "grad_norm": 0.3385235667228699, + "learning_rate": 4.783601891834074e-05, + "loss": 0.1939, + "step": 12338 + }, + { + "epoch": 0.22007990582527734, + "grad_norm": 0.28697073459625244, + "learning_rate": 4.7835385419505215e-05, + "loss": 0.1902, + "step": 12339 + }, + { + "epoch": 0.22009774194699105, + "grad_norm": 0.2434438318014145, + "learning_rate": 4.783475183215185e-05, + "loss": 0.234, + "step": 12340 + }, + { + "epoch": 0.22011557806870474, + "grad_norm": 0.20635941624641418, + "learning_rate": 4.78341181562831e-05, + "loss": 0.17, + "step": 12341 + }, + { + "epoch": 0.22013341419041843, + "grad_norm": 0.2844592332839966, + "learning_rate": 4.783348439190143e-05, + "loss": 0.2128, + "step": 12342 + }, + { + "epoch": 0.22015125031213212, + "grad_norm": 0.3016209900379181, + "learning_rate": 4.7832850539009284e-05, + "loss": 0.177, + "step": 12343 + }, + { + "epoch": 0.22016908643384583, + "grad_norm": 0.3935510814189911, + "learning_rate": 4.7832216597609123e-05, + "loss": 0.2482, + "step": 12344 + }, + { + "epoch": 0.22018692255555952, + "grad_norm": 0.29393836855888367, + "learning_rate": 4.7831582567703405e-05, + "loss": 0.213, + "step": 12345 + }, + { + "epoch": 0.2202047586772732, + "grad_norm": 0.21075181663036346, + "learning_rate": 4.783094844929458e-05, + "loss": 0.1891, + "step": 12346 + }, + { + "epoch": 0.2202225947989869, + "grad_norm": 0.2374795824289322, + "learning_rate": 4.783031424238512e-05, + "loss": 0.2152, + "step": 12347 + }, + { + "epoch": 0.22024043092070061, + "grad_norm": 0.3203499913215637, + "learning_rate": 4.782967994697748e-05, + "loss": 0.2363, + "step": 12348 + }, + { + "epoch": 0.2202582670424143, + "grad_norm": 0.29125335812568665, + "learning_rate": 4.782904556307411e-05, + "loss": 0.1946, + "step": 12349 + }, + { + "epoch": 0.220276103164128, + "grad_norm": 0.29094862937927246, + "learning_rate": 4.7828411090677473e-05, + "loss": 0.1917, + "step": 12350 + }, + { + "epoch": 0.22029393928584168, + "grad_norm": 0.22975733876228333, + "learning_rate": 4.7827776529790036e-05, + "loss": 0.157, + "step": 12351 + }, + { + "epoch": 0.2203117754075554, + "grad_norm": 0.29881027340888977, + "learning_rate": 4.782714188041425e-05, + "loss": 0.2171, + "step": 12352 + }, + { + "epoch": 0.22032961152926908, + "grad_norm": 0.32889387011528015, + "learning_rate": 4.7826507142552575e-05, + "loss": 0.2454, + "step": 12353 + }, + { + "epoch": 0.22034744765098277, + "grad_norm": 0.34230297803878784, + "learning_rate": 4.782587231620748e-05, + "loss": 0.218, + "step": 12354 + }, + { + "epoch": 0.22036528377269646, + "grad_norm": 0.3229703903198242, + "learning_rate": 4.7825237401381414e-05, + "loss": 0.195, + "step": 12355 + }, + { + "epoch": 0.22038311989441017, + "grad_norm": 0.2649451196193695, + "learning_rate": 4.7824602398076844e-05, + "loss": 0.2257, + "step": 12356 + }, + { + "epoch": 0.22040095601612386, + "grad_norm": 0.26519548892974854, + "learning_rate": 4.782396730629623e-05, + "loss": 0.1385, + "step": 12357 + }, + { + "epoch": 0.22041879213783755, + "grad_norm": 0.26670214533805847, + "learning_rate": 4.782333212604204e-05, + "loss": 0.2327, + "step": 12358 + }, + { + "epoch": 0.22043662825955124, + "grad_norm": 0.3590201437473297, + "learning_rate": 4.782269685731674e-05, + "loss": 0.2465, + "step": 12359 + }, + { + "epoch": 0.22045446438126493, + "grad_norm": 0.20097680389881134, + "learning_rate": 4.7822061500122774e-05, + "loss": 0.1764, + "step": 12360 + }, + { + "epoch": 0.22047230050297864, + "grad_norm": 0.8733730912208557, + "learning_rate": 4.782142605446261e-05, + "loss": 0.2678, + "step": 12361 + }, + { + "epoch": 0.22049013662469233, + "grad_norm": 0.30836859345436096, + "learning_rate": 4.782079052033873e-05, + "loss": 0.1088, + "step": 12362 + }, + { + "epoch": 0.22050797274640602, + "grad_norm": 0.3191524147987366, + "learning_rate": 4.782015489775358e-05, + "loss": 0.1871, + "step": 12363 + }, + { + "epoch": 0.2205258088681197, + "grad_norm": 0.34677889943122864, + "learning_rate": 4.781951918670962e-05, + "loss": 0.2062, + "step": 12364 + }, + { + "epoch": 0.22054364498983342, + "grad_norm": 0.28938624262809753, + "learning_rate": 4.781888338720933e-05, + "loss": 0.2099, + "step": 12365 + }, + { + "epoch": 0.2205614811115471, + "grad_norm": 0.5357033610343933, + "learning_rate": 4.781824749925516e-05, + "loss": 0.2763, + "step": 12366 + }, + { + "epoch": 0.2205793172332608, + "grad_norm": 0.2821754217147827, + "learning_rate": 4.7817611522849584e-05, + "loss": 0.2164, + "step": 12367 + }, + { + "epoch": 0.22059715335497448, + "grad_norm": 0.25177887082099915, + "learning_rate": 4.781697545799507e-05, + "loss": 0.2464, + "step": 12368 + }, + { + "epoch": 0.2206149894766882, + "grad_norm": 0.24871040880680084, + "learning_rate": 4.781633930469407e-05, + "loss": 0.1728, + "step": 12369 + }, + { + "epoch": 0.2206328255984019, + "grad_norm": 0.26730161905288696, + "learning_rate": 4.781570306294907e-05, + "loss": 0.2026, + "step": 12370 + }, + { + "epoch": 0.22065066172011558, + "grad_norm": 0.23566238582134247, + "learning_rate": 4.781506673276251e-05, + "loss": 0.1298, + "step": 12371 + }, + { + "epoch": 0.22066849784182926, + "grad_norm": 0.3828012943267822, + "learning_rate": 4.781443031413688e-05, + "loss": 0.1918, + "step": 12372 + }, + { + "epoch": 0.22068633396354298, + "grad_norm": 0.2641248404979706, + "learning_rate": 4.7813793807074636e-05, + "loss": 0.1923, + "step": 12373 + }, + { + "epoch": 0.22070417008525667, + "grad_norm": 0.2882792055606842, + "learning_rate": 4.781315721157825e-05, + "loss": 0.2229, + "step": 12374 + }, + { + "epoch": 0.22072200620697036, + "grad_norm": 0.2525346875190735, + "learning_rate": 4.781252052765019e-05, + "loss": 0.1523, + "step": 12375 + }, + { + "epoch": 0.22073984232868404, + "grad_norm": 0.4984354078769684, + "learning_rate": 4.781188375529292e-05, + "loss": 0.1894, + "step": 12376 + }, + { + "epoch": 0.22075767845039773, + "grad_norm": 0.29797837138175964, + "learning_rate": 4.78112468945089e-05, + "loss": 0.2333, + "step": 12377 + }, + { + "epoch": 0.22077551457211145, + "grad_norm": 0.34396496415138245, + "learning_rate": 4.781060994530062e-05, + "loss": 0.2196, + "step": 12378 + }, + { + "epoch": 0.22079335069382514, + "grad_norm": 0.3002025783061981, + "learning_rate": 4.780997290767053e-05, + "loss": 0.2182, + "step": 12379 + }, + { + "epoch": 0.22081118681553882, + "grad_norm": 0.32589346170425415, + "learning_rate": 4.780933578162111e-05, + "loss": 0.2119, + "step": 12380 + }, + { + "epoch": 0.2208290229372525, + "grad_norm": 0.3294805586338043, + "learning_rate": 4.780869856715482e-05, + "loss": 0.2759, + "step": 12381 + }, + { + "epoch": 0.22084685905896623, + "grad_norm": 0.20914940536022186, + "learning_rate": 4.7808061264274145e-05, + "loss": 0.1827, + "step": 12382 + }, + { + "epoch": 0.22086469518067992, + "grad_norm": 0.1981377750635147, + "learning_rate": 4.7807423872981546e-05, + "loss": 0.1795, + "step": 12383 + }, + { + "epoch": 0.2208825313023936, + "grad_norm": 0.26864659786224365, + "learning_rate": 4.780678639327949e-05, + "loss": 0.1919, + "step": 12384 + }, + { + "epoch": 0.2209003674241073, + "grad_norm": 0.26839518547058105, + "learning_rate": 4.780614882517045e-05, + "loss": 0.195, + "step": 12385 + }, + { + "epoch": 0.220918203545821, + "grad_norm": 0.27852749824523926, + "learning_rate": 4.7805511168656916e-05, + "loss": 0.1043, + "step": 12386 + }, + { + "epoch": 0.2209360396675347, + "grad_norm": 0.33031165599823, + "learning_rate": 4.7804873423741323e-05, + "loss": 0.2583, + "step": 12387 + }, + { + "epoch": 0.22095387578924838, + "grad_norm": 0.23721639811992645, + "learning_rate": 4.780423559042618e-05, + "loss": 0.1855, + "step": 12388 + }, + { + "epoch": 0.22097171191096207, + "grad_norm": 0.2521204650402069, + "learning_rate": 4.780359766871394e-05, + "loss": 0.1738, + "step": 12389 + }, + { + "epoch": 0.2209895480326758, + "grad_norm": 0.26155155897140503, + "learning_rate": 4.780295965860707e-05, + "loss": 0.2106, + "step": 12390 + }, + { + "epoch": 0.22100738415438947, + "grad_norm": 0.29395127296447754, + "learning_rate": 4.7802321560108064e-05, + "loss": 0.1983, + "step": 12391 + }, + { + "epoch": 0.22102522027610316, + "grad_norm": 0.35063931345939636, + "learning_rate": 4.780168337321938e-05, + "loss": 0.2406, + "step": 12392 + }, + { + "epoch": 0.22104305639781685, + "grad_norm": 0.3468126654624939, + "learning_rate": 4.78010450979435e-05, + "loss": 0.1558, + "step": 12393 + }, + { + "epoch": 0.22106089251953057, + "grad_norm": 0.3247314989566803, + "learning_rate": 4.780040673428289e-05, + "loss": 0.1939, + "step": 12394 + }, + { + "epoch": 0.22107872864124425, + "grad_norm": 0.29034194350242615, + "learning_rate": 4.779976828224002e-05, + "loss": 0.1906, + "step": 12395 + }, + { + "epoch": 0.22109656476295794, + "grad_norm": 0.2510344386100769, + "learning_rate": 4.779912974181738e-05, + "loss": 0.1675, + "step": 12396 + }, + { + "epoch": 0.22111440088467163, + "grad_norm": 0.2595363259315491, + "learning_rate": 4.779849111301744e-05, + "loss": 0.1586, + "step": 12397 + }, + { + "epoch": 0.22113223700638532, + "grad_norm": 0.34143128991127014, + "learning_rate": 4.779785239584268e-05, + "loss": 0.1739, + "step": 12398 + }, + { + "epoch": 0.22115007312809903, + "grad_norm": 0.2366911768913269, + "learning_rate": 4.779721359029556e-05, + "loss": 0.1999, + "step": 12399 + }, + { + "epoch": 0.22116790924981272, + "grad_norm": 0.37339404225349426, + "learning_rate": 4.7796574696378574e-05, + "loss": 0.1865, + "step": 12400 + }, + { + "epoch": 0.2211857453715264, + "grad_norm": 0.3215400278568268, + "learning_rate": 4.7795935714094186e-05, + "loss": 0.2001, + "step": 12401 + }, + { + "epoch": 0.2212035814932401, + "grad_norm": 0.21011440455913544, + "learning_rate": 4.7795296643444874e-05, + "loss": 0.1924, + "step": 12402 + }, + { + "epoch": 0.2212214176149538, + "grad_norm": 0.33960291743278503, + "learning_rate": 4.7794657484433126e-05, + "loss": 0.2013, + "step": 12403 + }, + { + "epoch": 0.2212392537366675, + "grad_norm": 0.2527913749217987, + "learning_rate": 4.7794018237061414e-05, + "loss": 0.1877, + "step": 12404 + }, + { + "epoch": 0.2212570898583812, + "grad_norm": 0.22020575404167175, + "learning_rate": 4.7793378901332206e-05, + "loss": 0.177, + "step": 12405 + }, + { + "epoch": 0.22127492598009488, + "grad_norm": 0.2744147777557373, + "learning_rate": 4.7792739477247996e-05, + "loss": 0.1796, + "step": 12406 + }, + { + "epoch": 0.2212927621018086, + "grad_norm": 0.29843688011169434, + "learning_rate": 4.779209996481125e-05, + "loss": 0.1583, + "step": 12407 + }, + { + "epoch": 0.22131059822352228, + "grad_norm": 0.34808576107025146, + "learning_rate": 4.779146036402445e-05, + "loss": 0.1641, + "step": 12408 + }, + { + "epoch": 0.22132843434523597, + "grad_norm": 0.3092830777168274, + "learning_rate": 4.7790820674890094e-05, + "loss": 0.2148, + "step": 12409 + }, + { + "epoch": 0.22134627046694966, + "grad_norm": 0.3606700301170349, + "learning_rate": 4.779018089741063e-05, + "loss": 0.2408, + "step": 12410 + }, + { + "epoch": 0.22136410658866337, + "grad_norm": 0.28641277551651, + "learning_rate": 4.778954103158856e-05, + "loss": 0.2272, + "step": 12411 + }, + { + "epoch": 0.22138194271037706, + "grad_norm": 0.2776421904563904, + "learning_rate": 4.778890107742635e-05, + "loss": 0.2094, + "step": 12412 + }, + { + "epoch": 0.22139977883209075, + "grad_norm": 0.2324228435754776, + "learning_rate": 4.778826103492649e-05, + "loss": 0.182, + "step": 12413 + }, + { + "epoch": 0.22141761495380444, + "grad_norm": 0.27369239926338196, + "learning_rate": 4.778762090409147e-05, + "loss": 0.19, + "step": 12414 + }, + { + "epoch": 0.22143545107551815, + "grad_norm": 0.23946613073349, + "learning_rate": 4.778698068492376e-05, + "loss": 0.1773, + "step": 12415 + }, + { + "epoch": 0.22145328719723184, + "grad_norm": 0.30583590269088745, + "learning_rate": 4.778634037742583e-05, + "loss": 0.194, + "step": 12416 + }, + { + "epoch": 0.22147112331894553, + "grad_norm": 0.2545165419578552, + "learning_rate": 4.778569998160018e-05, + "loss": 0.1727, + "step": 12417 + }, + { + "epoch": 0.22148895944065922, + "grad_norm": 0.37317144870758057, + "learning_rate": 4.778505949744929e-05, + "loss": 0.2441, + "step": 12418 + }, + { + "epoch": 0.2215067955623729, + "grad_norm": 0.30775630474090576, + "learning_rate": 4.778441892497564e-05, + "loss": 0.2023, + "step": 12419 + }, + { + "epoch": 0.22152463168408662, + "grad_norm": 0.24987727403640747, + "learning_rate": 4.7783778264181704e-05, + "loss": 0.2301, + "step": 12420 + }, + { + "epoch": 0.2215424678058003, + "grad_norm": 0.2718750834465027, + "learning_rate": 4.778313751506998e-05, + "loss": 0.2274, + "step": 12421 + }, + { + "epoch": 0.221560303927514, + "grad_norm": 0.28519824147224426, + "learning_rate": 4.7782496677642954e-05, + "loss": 0.1384, + "step": 12422 + }, + { + "epoch": 0.22157814004922768, + "grad_norm": 0.3110770285129547, + "learning_rate": 4.7781855751903084e-05, + "loss": 0.2168, + "step": 12423 + }, + { + "epoch": 0.2215959761709414, + "grad_norm": 0.26959535479545593, + "learning_rate": 4.778121473785289e-05, + "loss": 0.181, + "step": 12424 + }, + { + "epoch": 0.2216138122926551, + "grad_norm": 0.2882334887981415, + "learning_rate": 4.778057363549483e-05, + "loss": 0.2345, + "step": 12425 + }, + { + "epoch": 0.22163164841436878, + "grad_norm": 0.36539745330810547, + "learning_rate": 4.77799324448314e-05, + "loss": 0.2083, + "step": 12426 + }, + { + "epoch": 0.22164948453608246, + "grad_norm": 0.3765331506729126, + "learning_rate": 4.7779291165865084e-05, + "loss": 0.2543, + "step": 12427 + }, + { + "epoch": 0.22166732065779618, + "grad_norm": 0.34074121713638306, + "learning_rate": 4.7778649798598374e-05, + "loss": 0.2185, + "step": 12428 + }, + { + "epoch": 0.22168515677950987, + "grad_norm": 0.28381723165512085, + "learning_rate": 4.7778008343033745e-05, + "loss": 0.1947, + "step": 12429 + }, + { + "epoch": 0.22170299290122356, + "grad_norm": 0.2325785756111145, + "learning_rate": 4.777736679917368e-05, + "loss": 0.1974, + "step": 12430 + }, + { + "epoch": 0.22172082902293724, + "grad_norm": 0.2748464345932007, + "learning_rate": 4.777672516702069e-05, + "loss": 0.1967, + "step": 12431 + }, + { + "epoch": 0.22173866514465096, + "grad_norm": 0.34121984243392944, + "learning_rate": 4.7776083446577235e-05, + "loss": 0.2096, + "step": 12432 + }, + { + "epoch": 0.22175650126636465, + "grad_norm": 0.2746466398239136, + "learning_rate": 4.777544163784582e-05, + "loss": 0.2211, + "step": 12433 + }, + { + "epoch": 0.22177433738807834, + "grad_norm": 0.3871321380138397, + "learning_rate": 4.777479974082893e-05, + "loss": 0.1872, + "step": 12434 + }, + { + "epoch": 0.22179217350979202, + "grad_norm": 0.38841402530670166, + "learning_rate": 4.777415775552904e-05, + "loss": 0.2271, + "step": 12435 + }, + { + "epoch": 0.22181000963150574, + "grad_norm": 0.21632587909698486, + "learning_rate": 4.7773515681948645e-05, + "loss": 0.1863, + "step": 12436 + }, + { + "epoch": 0.22182784575321943, + "grad_norm": 0.25671523809432983, + "learning_rate": 4.777287352009024e-05, + "loss": 0.1826, + "step": 12437 + }, + { + "epoch": 0.22184568187493311, + "grad_norm": 0.31478574872016907, + "learning_rate": 4.777223126995633e-05, + "loss": 0.2155, + "step": 12438 + }, + { + "epoch": 0.2218635179966468, + "grad_norm": 0.2308403104543686, + "learning_rate": 4.777158893154937e-05, + "loss": 0.1729, + "step": 12439 + }, + { + "epoch": 0.2218813541183605, + "grad_norm": 0.31062284111976624, + "learning_rate": 4.777094650487187e-05, + "loss": 0.1945, + "step": 12440 + }, + { + "epoch": 0.2218991902400742, + "grad_norm": 0.2726714015007019, + "learning_rate": 4.7770303989926315e-05, + "loss": 0.2605, + "step": 12441 + }, + { + "epoch": 0.2219170263617879, + "grad_norm": 0.3029727339744568, + "learning_rate": 4.77696613867152e-05, + "loss": 0.1779, + "step": 12442 + }, + { + "epoch": 0.22193486248350158, + "grad_norm": 0.1870802640914917, + "learning_rate": 4.776901869524101e-05, + "loss": 0.1698, + "step": 12443 + }, + { + "epoch": 0.22195269860521527, + "grad_norm": 0.3114982843399048, + "learning_rate": 4.776837591550624e-05, + "loss": 0.1462, + "step": 12444 + }, + { + "epoch": 0.221970534726929, + "grad_norm": 0.2691832482814789, + "learning_rate": 4.776773304751338e-05, + "loss": 0.1868, + "step": 12445 + }, + { + "epoch": 0.22198837084864267, + "grad_norm": 0.24110494554042816, + "learning_rate": 4.776709009126492e-05, + "loss": 0.1902, + "step": 12446 + }, + { + "epoch": 0.22200620697035636, + "grad_norm": 0.38330644369125366, + "learning_rate": 4.776644704676336e-05, + "loss": 0.2426, + "step": 12447 + }, + { + "epoch": 0.22202404309207005, + "grad_norm": 0.336843341588974, + "learning_rate": 4.7765803914011194e-05, + "loss": 0.2182, + "step": 12448 + }, + { + "epoch": 0.22204187921378377, + "grad_norm": 0.31483957171440125, + "learning_rate": 4.77651606930109e-05, + "loss": 0.2141, + "step": 12449 + }, + { + "epoch": 0.22205971533549745, + "grad_norm": 0.30925506353378296, + "learning_rate": 4.776451738376499e-05, + "loss": 0.1692, + "step": 12450 + }, + { + "epoch": 0.22207755145721114, + "grad_norm": 0.3509606122970581, + "learning_rate": 4.776387398627594e-05, + "loss": 0.1632, + "step": 12451 + }, + { + "epoch": 0.22209538757892483, + "grad_norm": 0.22312790155410767, + "learning_rate": 4.776323050054625e-05, + "loss": 0.1781, + "step": 12452 + }, + { + "epoch": 0.22211322370063855, + "grad_norm": 0.2647301256656647, + "learning_rate": 4.776258692657842e-05, + "loss": 0.2245, + "step": 12453 + }, + { + "epoch": 0.22213105982235223, + "grad_norm": 0.2997078001499176, + "learning_rate": 4.776194326437494e-05, + "loss": 0.2456, + "step": 12454 + }, + { + "epoch": 0.22214889594406592, + "grad_norm": 0.25826555490493774, + "learning_rate": 4.776129951393831e-05, + "loss": 0.1465, + "step": 12455 + }, + { + "epoch": 0.2221667320657796, + "grad_norm": 0.24937480688095093, + "learning_rate": 4.776065567527102e-05, + "loss": 0.1995, + "step": 12456 + }, + { + "epoch": 0.2221845681874933, + "grad_norm": 0.2960071563720703, + "learning_rate": 4.7760011748375564e-05, + "loss": 0.1851, + "step": 12457 + }, + { + "epoch": 0.222202404309207, + "grad_norm": 0.2337069809436798, + "learning_rate": 4.775936773325444e-05, + "loss": 0.1983, + "step": 12458 + }, + { + "epoch": 0.2222202404309207, + "grad_norm": 0.23904629051685333, + "learning_rate": 4.775872362991015e-05, + "loss": 0.2313, + "step": 12459 + }, + { + "epoch": 0.2222380765526344, + "grad_norm": 0.3666958808898926, + "learning_rate": 4.775807943834518e-05, + "loss": 0.2248, + "step": 12460 + }, + { + "epoch": 0.22225591267434808, + "grad_norm": 0.2366989105939865, + "learning_rate": 4.775743515856205e-05, + "loss": 0.2115, + "step": 12461 + }, + { + "epoch": 0.2222737487960618, + "grad_norm": 0.24880069494247437, + "learning_rate": 4.775679079056323e-05, + "loss": 0.2274, + "step": 12462 + }, + { + "epoch": 0.22229158491777548, + "grad_norm": 0.20895929634571075, + "learning_rate": 4.775614633435123e-05, + "loss": 0.1992, + "step": 12463 + }, + { + "epoch": 0.22230942103948917, + "grad_norm": 0.23697924613952637, + "learning_rate": 4.7755501789928544e-05, + "loss": 0.2289, + "step": 12464 + }, + { + "epoch": 0.22232725716120286, + "grad_norm": 0.292427122592926, + "learning_rate": 4.775485715729767e-05, + "loss": 0.2371, + "step": 12465 + }, + { + "epoch": 0.22234509328291657, + "grad_norm": 0.28915947675704956, + "learning_rate": 4.775421243646112e-05, + "loss": 0.2467, + "step": 12466 + }, + { + "epoch": 0.22236292940463026, + "grad_norm": 0.4016871750354767, + "learning_rate": 4.775356762742138e-05, + "loss": 0.2503, + "step": 12467 + }, + { + "epoch": 0.22238076552634395, + "grad_norm": 0.301969438791275, + "learning_rate": 4.775292273018095e-05, + "loss": 0.1695, + "step": 12468 + }, + { + "epoch": 0.22239860164805764, + "grad_norm": 0.2205217480659485, + "learning_rate": 4.775227774474234e-05, + "loss": 0.1614, + "step": 12469 + }, + { + "epoch": 0.22241643776977135, + "grad_norm": 0.30096668004989624, + "learning_rate": 4.775163267110804e-05, + "loss": 0.2206, + "step": 12470 + }, + { + "epoch": 0.22243427389148504, + "grad_norm": 0.29696354269981384, + "learning_rate": 4.7750987509280554e-05, + "loss": 0.2207, + "step": 12471 + }, + { + "epoch": 0.22245211001319873, + "grad_norm": 0.22098317742347717, + "learning_rate": 4.775034225926238e-05, + "loss": 0.19, + "step": 12472 + }, + { + "epoch": 0.22246994613491242, + "grad_norm": 0.26950424909591675, + "learning_rate": 4.774969692105602e-05, + "loss": 0.1691, + "step": 12473 + }, + { + "epoch": 0.22248778225662613, + "grad_norm": 0.33573946356773376, + "learning_rate": 4.774905149466398e-05, + "loss": 0.2512, + "step": 12474 + }, + { + "epoch": 0.22250561837833982, + "grad_norm": 0.22495293617248535, + "learning_rate": 4.7748405980088764e-05, + "loss": 0.182, + "step": 12475 + }, + { + "epoch": 0.2225234545000535, + "grad_norm": 0.4752029776573181, + "learning_rate": 4.7747760377332864e-05, + "loss": 0.1851, + "step": 12476 + }, + { + "epoch": 0.2225412906217672, + "grad_norm": 0.2621522843837738, + "learning_rate": 4.774711468639879e-05, + "loss": 0.2193, + "step": 12477 + }, + { + "epoch": 0.22255912674348088, + "grad_norm": 0.19988669455051422, + "learning_rate": 4.7746468907289035e-05, + "loss": 0.1945, + "step": 12478 + }, + { + "epoch": 0.2225769628651946, + "grad_norm": 0.2635309398174286, + "learning_rate": 4.774582304000612e-05, + "loss": 0.2111, + "step": 12479 + }, + { + "epoch": 0.2225947989869083, + "grad_norm": 0.2650481164455414, + "learning_rate": 4.774517708455254e-05, + "loss": 0.162, + "step": 12480 + }, + { + "epoch": 0.22261263510862198, + "grad_norm": 0.24110287427902222, + "learning_rate": 4.774453104093079e-05, + "loss": 0.2034, + "step": 12481 + }, + { + "epoch": 0.22263047123033566, + "grad_norm": 0.22477370500564575, + "learning_rate": 4.774388490914339e-05, + "loss": 0.1907, + "step": 12482 + }, + { + "epoch": 0.22264830735204938, + "grad_norm": 0.26015955209732056, + "learning_rate": 4.774323868919283e-05, + "loss": 0.1822, + "step": 12483 + }, + { + "epoch": 0.22266614347376307, + "grad_norm": 0.26121658086776733, + "learning_rate": 4.774259238108162e-05, + "loss": 0.1732, + "step": 12484 + }, + { + "epoch": 0.22268397959547676, + "grad_norm": 0.23913203179836273, + "learning_rate": 4.774194598481227e-05, + "loss": 0.2201, + "step": 12485 + }, + { + "epoch": 0.22270181571719044, + "grad_norm": 0.3138361871242523, + "learning_rate": 4.774129950038728e-05, + "loss": 0.1932, + "step": 12486 + }, + { + "epoch": 0.22271965183890416, + "grad_norm": 0.30828264355659485, + "learning_rate": 4.774065292780916e-05, + "loss": 0.2261, + "step": 12487 + }, + { + "epoch": 0.22273748796061785, + "grad_norm": 0.30259039998054504, + "learning_rate": 4.7740006267080415e-05, + "loss": 0.1323, + "step": 12488 + }, + { + "epoch": 0.22275532408233154, + "grad_norm": 0.2695466876029968, + "learning_rate": 4.7739359518203556e-05, + "loss": 0.2086, + "step": 12489 + }, + { + "epoch": 0.22277316020404522, + "grad_norm": 0.2501279413700104, + "learning_rate": 4.773871268118109e-05, + "loss": 0.2003, + "step": 12490 + }, + { + "epoch": 0.22279099632575894, + "grad_norm": 0.27336621284484863, + "learning_rate": 4.7738065756015504e-05, + "loss": 0.2233, + "step": 12491 + }, + { + "epoch": 0.22280883244747263, + "grad_norm": 0.2427319586277008, + "learning_rate": 4.773741874270933e-05, + "loss": 0.1901, + "step": 12492 + }, + { + "epoch": 0.22282666856918631, + "grad_norm": 0.2651156187057495, + "learning_rate": 4.773677164126507e-05, + "loss": 0.1881, + "step": 12493 + }, + { + "epoch": 0.2228445046909, + "grad_norm": 0.26140034198760986, + "learning_rate": 4.773612445168523e-05, + "loss": 0.1733, + "step": 12494 + }, + { + "epoch": 0.22286234081261372, + "grad_norm": 0.30591756105422974, + "learning_rate": 4.7735477173972315e-05, + "loss": 0.1733, + "step": 12495 + }, + { + "epoch": 0.2228801769343274, + "grad_norm": 0.2474837601184845, + "learning_rate": 4.773482980812884e-05, + "loss": 0.1775, + "step": 12496 + }, + { + "epoch": 0.2228980130560411, + "grad_norm": 0.30363044142723083, + "learning_rate": 4.773418235415731e-05, + "loss": 0.1746, + "step": 12497 + }, + { + "epoch": 0.22291584917775478, + "grad_norm": 0.2405654489994049, + "learning_rate": 4.773353481206024e-05, + "loss": 0.1915, + "step": 12498 + }, + { + "epoch": 0.22293368529946847, + "grad_norm": 0.2663644850254059, + "learning_rate": 4.7732887181840135e-05, + "loss": 0.2052, + "step": 12499 + }, + { + "epoch": 0.22295152142118219, + "grad_norm": 0.22968687117099762, + "learning_rate": 4.773223946349951e-05, + "loss": 0.1872, + "step": 12500 + }, + { + "epoch": 0.22296935754289587, + "grad_norm": 0.2624761462211609, + "learning_rate": 4.7731591657040866e-05, + "loss": 0.1883, + "step": 12501 + }, + { + "epoch": 0.22298719366460956, + "grad_norm": 0.23904605209827423, + "learning_rate": 4.773094376246673e-05, + "loss": 0.1906, + "step": 12502 + }, + { + "epoch": 0.22300502978632325, + "grad_norm": 0.2636966407299042, + "learning_rate": 4.773029577977961e-05, + "loss": 0.193, + "step": 12503 + }, + { + "epoch": 0.22302286590803697, + "grad_norm": 0.33867472410202026, + "learning_rate": 4.772964770898199e-05, + "loss": 0.1561, + "step": 12504 + }, + { + "epoch": 0.22304070202975065, + "grad_norm": 0.3870651423931122, + "learning_rate": 4.7728999550076424e-05, + "loss": 0.1675, + "step": 12505 + }, + { + "epoch": 0.22305853815146434, + "grad_norm": 0.34564855694770813, + "learning_rate": 4.772835130306541e-05, + "loss": 0.1897, + "step": 12506 + }, + { + "epoch": 0.22307637427317803, + "grad_norm": 0.19082261621952057, + "learning_rate": 4.772770296795144e-05, + "loss": 0.1563, + "step": 12507 + }, + { + "epoch": 0.22309421039489175, + "grad_norm": 0.32542216777801514, + "learning_rate": 4.772705454473705e-05, + "loss": 0.2012, + "step": 12508 + }, + { + "epoch": 0.22311204651660543, + "grad_norm": 0.19786059856414795, + "learning_rate": 4.7726406033424754e-05, + "loss": 0.1493, + "step": 12509 + }, + { + "epoch": 0.22312988263831912, + "grad_norm": 0.30620458722114563, + "learning_rate": 4.772575743401705e-05, + "loss": 0.2117, + "step": 12510 + }, + { + "epoch": 0.2231477187600328, + "grad_norm": 0.22794589400291443, + "learning_rate": 4.7725108746516466e-05, + "loss": 0.1536, + "step": 12511 + }, + { + "epoch": 0.22316555488174653, + "grad_norm": 0.27520862221717834, + "learning_rate": 4.7724459970925503e-05, + "loss": 0.1601, + "step": 12512 + }, + { + "epoch": 0.2231833910034602, + "grad_norm": 0.27101796865463257, + "learning_rate": 4.77238111072467e-05, + "loss": 0.1833, + "step": 12513 + }, + { + "epoch": 0.2232012271251739, + "grad_norm": 0.24299657344818115, + "learning_rate": 4.772316215548254e-05, + "loss": 0.1955, + "step": 12514 + }, + { + "epoch": 0.2232190632468876, + "grad_norm": 0.28873303532600403, + "learning_rate": 4.772251311563557e-05, + "loss": 0.2151, + "step": 12515 + }, + { + "epoch": 0.2232368993686013, + "grad_norm": 0.2779642343521118, + "learning_rate": 4.772186398770828e-05, + "loss": 0.2124, + "step": 12516 + }, + { + "epoch": 0.223254735490315, + "grad_norm": 0.32345345616340637, + "learning_rate": 4.77212147717032e-05, + "loss": 0.2303, + "step": 12517 + }, + { + "epoch": 0.22327257161202868, + "grad_norm": 0.2563976049423218, + "learning_rate": 4.7720565467622844e-05, + "loss": 0.1996, + "step": 12518 + }, + { + "epoch": 0.22329040773374237, + "grad_norm": 0.4193630814552307, + "learning_rate": 4.771991607546973e-05, + "loss": 0.1548, + "step": 12519 + }, + { + "epoch": 0.22330824385545606, + "grad_norm": 0.2679010033607483, + "learning_rate": 4.771926659524637e-05, + "loss": 0.2244, + "step": 12520 + }, + { + "epoch": 0.22332607997716977, + "grad_norm": 0.2944014072418213, + "learning_rate": 4.771861702695529e-05, + "loss": 0.1156, + "step": 12521 + }, + { + "epoch": 0.22334391609888346, + "grad_norm": 0.2515939176082611, + "learning_rate": 4.7717967370599e-05, + "loss": 0.1958, + "step": 12522 + }, + { + "epoch": 0.22336175222059715, + "grad_norm": 0.25248903036117554, + "learning_rate": 4.771731762618003e-05, + "loss": 0.1916, + "step": 12523 + }, + { + "epoch": 0.22337958834231084, + "grad_norm": 0.2968897521495819, + "learning_rate": 4.7716667793700886e-05, + "loss": 0.2102, + "step": 12524 + }, + { + "epoch": 0.22339742446402455, + "grad_norm": 0.31653228402137756, + "learning_rate": 4.7716017873164085e-05, + "loss": 0.2381, + "step": 12525 + }, + { + "epoch": 0.22341526058573824, + "grad_norm": 0.28874629735946655, + "learning_rate": 4.7715367864572164e-05, + "loss": 0.1976, + "step": 12526 + }, + { + "epoch": 0.22343309670745193, + "grad_norm": 0.2989388108253479, + "learning_rate": 4.7714717767927624e-05, + "loss": 0.1361, + "step": 12527 + }, + { + "epoch": 0.22345093282916562, + "grad_norm": 0.2818071246147156, + "learning_rate": 4.7714067583232993e-05, + "loss": 0.1458, + "step": 12528 + }, + { + "epoch": 0.22346876895087933, + "grad_norm": 0.3405362665653229, + "learning_rate": 4.771341731049079e-05, + "loss": 0.2437, + "step": 12529 + }, + { + "epoch": 0.22348660507259302, + "grad_norm": 0.227031409740448, + "learning_rate": 4.771276694970355e-05, + "loss": 0.1537, + "step": 12530 + }, + { + "epoch": 0.2235044411943067, + "grad_norm": 0.21507854759693146, + "learning_rate": 4.771211650087377e-05, + "loss": 0.2011, + "step": 12531 + }, + { + "epoch": 0.2235222773160204, + "grad_norm": 0.28749793767929077, + "learning_rate": 4.771146596400398e-05, + "loss": 0.2227, + "step": 12532 + }, + { + "epoch": 0.2235401134377341, + "grad_norm": 0.20286786556243896, + "learning_rate": 4.77108153390967e-05, + "loss": 0.1821, + "step": 12533 + }, + { + "epoch": 0.2235579495594478, + "grad_norm": 0.2581403851509094, + "learning_rate": 4.771016462615446e-05, + "loss": 0.191, + "step": 12534 + }, + { + "epoch": 0.2235757856811615, + "grad_norm": 0.2919760048389435, + "learning_rate": 4.7709513825179785e-05, + "loss": 0.2362, + "step": 12535 + }, + { + "epoch": 0.22359362180287518, + "grad_norm": 0.25975555181503296, + "learning_rate": 4.770886293617518e-05, + "loss": 0.2548, + "step": 12536 + }, + { + "epoch": 0.2236114579245889, + "grad_norm": 0.3117729723453522, + "learning_rate": 4.770821195914319e-05, + "loss": 0.2579, + "step": 12537 + }, + { + "epoch": 0.22362929404630258, + "grad_norm": 0.20171460509300232, + "learning_rate": 4.770756089408632e-05, + "loss": 0.1327, + "step": 12538 + }, + { + "epoch": 0.22364713016801627, + "grad_norm": 0.2525087893009186, + "learning_rate": 4.77069097410071e-05, + "loss": 0.1552, + "step": 12539 + }, + { + "epoch": 0.22366496628972996, + "grad_norm": 0.3625781238079071, + "learning_rate": 4.770625849990806e-05, + "loss": 0.2671, + "step": 12540 + }, + { + "epoch": 0.22368280241144364, + "grad_norm": 0.3005658686161041, + "learning_rate": 4.770560717079171e-05, + "loss": 0.1834, + "step": 12541 + }, + { + "epoch": 0.22370063853315736, + "grad_norm": 0.23191936314105988, + "learning_rate": 4.77049557536606e-05, + "loss": 0.1706, + "step": 12542 + }, + { + "epoch": 0.22371847465487105, + "grad_norm": 0.316008597612381, + "learning_rate": 4.7704304248517225e-05, + "loss": 0.183, + "step": 12543 + }, + { + "epoch": 0.22373631077658473, + "grad_norm": 0.27352115511894226, + "learning_rate": 4.770365265536413e-05, + "loss": 0.1931, + "step": 12544 + }, + { + "epoch": 0.22375414689829842, + "grad_norm": 0.28434449434280396, + "learning_rate": 4.770300097420384e-05, + "loss": 0.2155, + "step": 12545 + }, + { + "epoch": 0.22377198302001214, + "grad_norm": 0.2685566246509552, + "learning_rate": 4.770234920503887e-05, + "loss": 0.1824, + "step": 12546 + }, + { + "epoch": 0.22378981914172583, + "grad_norm": 0.25530388951301575, + "learning_rate": 4.770169734787175e-05, + "loss": 0.1808, + "step": 12547 + }, + { + "epoch": 0.22380765526343951, + "grad_norm": 0.21423391997814178, + "learning_rate": 4.770104540270502e-05, + "loss": 0.1425, + "step": 12548 + }, + { + "epoch": 0.2238254913851532, + "grad_norm": 0.2734600305557251, + "learning_rate": 4.77003933695412e-05, + "loss": 0.191, + "step": 12549 + }, + { + "epoch": 0.22384332750686692, + "grad_norm": 0.2774091958999634, + "learning_rate": 4.769974124838281e-05, + "loss": 0.1139, + "step": 12550 + }, + { + "epoch": 0.2238611636285806, + "grad_norm": 0.2846418619155884, + "learning_rate": 4.769908903923238e-05, + "loss": 0.1406, + "step": 12551 + }, + { + "epoch": 0.2238789997502943, + "grad_norm": 0.25708311796188354, + "learning_rate": 4.769843674209244e-05, + "loss": 0.1881, + "step": 12552 + }, + { + "epoch": 0.22389683587200798, + "grad_norm": 0.2455696016550064, + "learning_rate": 4.769778435696552e-05, + "loss": 0.1705, + "step": 12553 + }, + { + "epoch": 0.2239146719937217, + "grad_norm": 0.17797338962554932, + "learning_rate": 4.769713188385415e-05, + "loss": 0.1583, + "step": 12554 + }, + { + "epoch": 0.22393250811543539, + "grad_norm": 0.23763418197631836, + "learning_rate": 4.769647932276086e-05, + "loss": 0.1589, + "step": 12555 + }, + { + "epoch": 0.22395034423714907, + "grad_norm": 0.30871322751045227, + "learning_rate": 4.769582667368817e-05, + "loss": 0.2244, + "step": 12556 + }, + { + "epoch": 0.22396818035886276, + "grad_norm": 0.25054746866226196, + "learning_rate": 4.769517393663863e-05, + "loss": 0.2029, + "step": 12557 + }, + { + "epoch": 0.22398601648057645, + "grad_norm": 0.24173471331596375, + "learning_rate": 4.7694521111614744e-05, + "loss": 0.1945, + "step": 12558 + }, + { + "epoch": 0.22400385260229017, + "grad_norm": 0.29892677068710327, + "learning_rate": 4.769386819861906e-05, + "loss": 0.2288, + "step": 12559 + }, + { + "epoch": 0.22402168872400385, + "grad_norm": 0.22668752074241638, + "learning_rate": 4.769321519765411e-05, + "loss": 0.2058, + "step": 12560 + }, + { + "epoch": 0.22403952484571754, + "grad_norm": 0.22964948415756226, + "learning_rate": 4.7692562108722406e-05, + "loss": 0.1918, + "step": 12561 + }, + { + "epoch": 0.22405736096743123, + "grad_norm": 0.2790120840072632, + "learning_rate": 4.76919089318265e-05, + "loss": 0.1506, + "step": 12562 + }, + { + "epoch": 0.22407519708914495, + "grad_norm": 0.25331994891166687, + "learning_rate": 4.769125566696893e-05, + "loss": 0.1688, + "step": 12563 + }, + { + "epoch": 0.22409303321085863, + "grad_norm": 0.3228544592857361, + "learning_rate": 4.76906023141522e-05, + "loss": 0.2381, + "step": 12564 + }, + { + "epoch": 0.22411086933257232, + "grad_norm": 0.3493567109107971, + "learning_rate": 4.768994887337887e-05, + "loss": 0.191, + "step": 12565 + }, + { + "epoch": 0.224128705454286, + "grad_norm": 0.2573733925819397, + "learning_rate": 4.768929534465145e-05, + "loss": 0.2453, + "step": 12566 + }, + { + "epoch": 0.22414654157599972, + "grad_norm": 0.26742616295814514, + "learning_rate": 4.768864172797249e-05, + "loss": 0.1768, + "step": 12567 + }, + { + "epoch": 0.2241643776977134, + "grad_norm": 0.2692878246307373, + "learning_rate": 4.768798802334452e-05, + "loss": 0.1718, + "step": 12568 + }, + { + "epoch": 0.2241822138194271, + "grad_norm": 0.31958574056625366, + "learning_rate": 4.768733423077007e-05, + "loss": 0.2664, + "step": 12569 + }, + { + "epoch": 0.2242000499411408, + "grad_norm": 0.20087982714176178, + "learning_rate": 4.768668035025168e-05, + "loss": 0.1579, + "step": 12570 + }, + { + "epoch": 0.2242178860628545, + "grad_norm": 0.2572091817855835, + "learning_rate": 4.768602638179187e-05, + "loss": 0.1833, + "step": 12571 + }, + { + "epoch": 0.2242357221845682, + "grad_norm": 0.23120689392089844, + "learning_rate": 4.7685372325393195e-05, + "loss": 0.1457, + "step": 12572 + }, + { + "epoch": 0.22425355830628188, + "grad_norm": 0.2992466390132904, + "learning_rate": 4.768471818105819e-05, + "loss": 0.229, + "step": 12573 + }, + { + "epoch": 0.22427139442799557, + "grad_norm": 0.3062695860862732, + "learning_rate": 4.7684063948789365e-05, + "loss": 0.2456, + "step": 12574 + }, + { + "epoch": 0.22428923054970928, + "grad_norm": 0.35636234283447266, + "learning_rate": 4.768340962858928e-05, + "loss": 0.1896, + "step": 12575 + }, + { + "epoch": 0.22430706667142297, + "grad_norm": 0.25400036573410034, + "learning_rate": 4.768275522046047e-05, + "loss": 0.1699, + "step": 12576 + }, + { + "epoch": 0.22432490279313666, + "grad_norm": 0.47366049885749817, + "learning_rate": 4.768210072440546e-05, + "loss": 0.189, + "step": 12577 + }, + { + "epoch": 0.22434273891485035, + "grad_norm": 0.21881255507469177, + "learning_rate": 4.76814461404268e-05, + "loss": 0.1683, + "step": 12578 + }, + { + "epoch": 0.22436057503656404, + "grad_norm": 0.29892686009407043, + "learning_rate": 4.768079146852701e-05, + "loss": 0.1935, + "step": 12579 + }, + { + "epoch": 0.22437841115827775, + "grad_norm": 0.3141668140888214, + "learning_rate": 4.768013670870865e-05, + "loss": 0.2034, + "step": 12580 + }, + { + "epoch": 0.22439624727999144, + "grad_norm": 0.36283034086227417, + "learning_rate": 4.767948186097424e-05, + "loss": 0.241, + "step": 12581 + }, + { + "epoch": 0.22441408340170513, + "grad_norm": 0.2098165899515152, + "learning_rate": 4.767882692532632e-05, + "loss": 0.168, + "step": 12582 + }, + { + "epoch": 0.22443191952341882, + "grad_norm": 0.2089463323354721, + "learning_rate": 4.767817190176745e-05, + "loss": 0.1973, + "step": 12583 + }, + { + "epoch": 0.22444975564513253, + "grad_norm": 0.2082255780696869, + "learning_rate": 4.767751679030015e-05, + "loss": 0.1568, + "step": 12584 + }, + { + "epoch": 0.22446759176684622, + "grad_norm": 0.18804092705249786, + "learning_rate": 4.767686159092695e-05, + "loss": 0.1833, + "step": 12585 + }, + { + "epoch": 0.2244854278885599, + "grad_norm": 0.2836175560951233, + "learning_rate": 4.767620630365041e-05, + "loss": 0.1972, + "step": 12586 + }, + { + "epoch": 0.2245032640102736, + "grad_norm": 0.28468000888824463, + "learning_rate": 4.7675550928473056e-05, + "loss": 0.2064, + "step": 12587 + }, + { + "epoch": 0.2245211001319873, + "grad_norm": 0.25813114643096924, + "learning_rate": 4.767489546539744e-05, + "loss": 0.1701, + "step": 12588 + }, + { + "epoch": 0.224538936253701, + "grad_norm": 0.24302178621292114, + "learning_rate": 4.76742399144261e-05, + "loss": 0.1777, + "step": 12589 + }, + { + "epoch": 0.2245567723754147, + "grad_norm": 0.4876336455345154, + "learning_rate": 4.767358427556157e-05, + "loss": 0.1935, + "step": 12590 + }, + { + "epoch": 0.22457460849712838, + "grad_norm": 0.2273038625717163, + "learning_rate": 4.7672928548806394e-05, + "loss": 0.1488, + "step": 12591 + }, + { + "epoch": 0.2245924446188421, + "grad_norm": 0.26203829050064087, + "learning_rate": 4.767227273416313e-05, + "loss": 0.204, + "step": 12592 + }, + { + "epoch": 0.22461028074055578, + "grad_norm": 0.2577301561832428, + "learning_rate": 4.767161683163429e-05, + "loss": 0.1955, + "step": 12593 + }, + { + "epoch": 0.22462811686226947, + "grad_norm": 0.3304455280303955, + "learning_rate": 4.767096084122244e-05, + "loss": 0.2291, + "step": 12594 + }, + { + "epoch": 0.22464595298398315, + "grad_norm": 0.20761314034461975, + "learning_rate": 4.767030476293011e-05, + "loss": 0.1697, + "step": 12595 + }, + { + "epoch": 0.22466378910569687, + "grad_norm": 0.32202431559562683, + "learning_rate": 4.766964859675985e-05, + "loss": 0.2, + "step": 12596 + }, + { + "epoch": 0.22468162522741056, + "grad_norm": 0.32054996490478516, + "learning_rate": 4.766899234271421e-05, + "loss": 0.1984, + "step": 12597 + }, + { + "epoch": 0.22469946134912425, + "grad_norm": 0.22482392191886902, + "learning_rate": 4.766833600079572e-05, + "loss": 0.1116, + "step": 12598 + }, + { + "epoch": 0.22471729747083793, + "grad_norm": 0.2975764572620392, + "learning_rate": 4.7667679571006926e-05, + "loss": 0.1901, + "step": 12599 + }, + { + "epoch": 0.22473513359255162, + "grad_norm": 0.2900422513484955, + "learning_rate": 4.766702305335039e-05, + "loss": 0.2051, + "step": 12600 + }, + { + "epoch": 0.22475296971426534, + "grad_norm": 0.23242852091789246, + "learning_rate": 4.766636644782864e-05, + "loss": 0.2115, + "step": 12601 + }, + { + "epoch": 0.22477080583597903, + "grad_norm": 0.3072696626186371, + "learning_rate": 4.7665709754444214e-05, + "loss": 0.1842, + "step": 12602 + }, + { + "epoch": 0.22478864195769271, + "grad_norm": 0.20726057887077332, + "learning_rate": 4.766505297319968e-05, + "loss": 0.2156, + "step": 12603 + }, + { + "epoch": 0.2248064780794064, + "grad_norm": 0.2729635536670685, + "learning_rate": 4.7664396104097564e-05, + "loss": 0.163, + "step": 12604 + }, + { + "epoch": 0.22482431420112012, + "grad_norm": 0.3674893081188202, + "learning_rate": 4.766373914714043e-05, + "loss": 0.2338, + "step": 12605 + }, + { + "epoch": 0.2248421503228338, + "grad_norm": 0.3503851592540741, + "learning_rate": 4.76630821023308e-05, + "loss": 0.1877, + "step": 12606 + }, + { + "epoch": 0.2248599864445475, + "grad_norm": 0.3252289891242981, + "learning_rate": 4.766242496967125e-05, + "loss": 0.1552, + "step": 12607 + }, + { + "epoch": 0.22487782256626118, + "grad_norm": 0.2753390669822693, + "learning_rate": 4.766176774916431e-05, + "loss": 0.2123, + "step": 12608 + }, + { + "epoch": 0.2248956586879749, + "grad_norm": 0.20343218743801117, + "learning_rate": 4.766111044081253e-05, + "loss": 0.1371, + "step": 12609 + }, + { + "epoch": 0.22491349480968859, + "grad_norm": 0.2973015606403351, + "learning_rate": 4.766045304461846e-05, + "loss": 0.1767, + "step": 12610 + }, + { + "epoch": 0.22493133093140227, + "grad_norm": 0.24229112267494202, + "learning_rate": 4.765979556058464e-05, + "loss": 0.2021, + "step": 12611 + }, + { + "epoch": 0.22494916705311596, + "grad_norm": 0.22636985778808594, + "learning_rate": 4.7659137988713635e-05, + "loss": 0.1984, + "step": 12612 + }, + { + "epoch": 0.22496700317482968, + "grad_norm": 0.3754446506500244, + "learning_rate": 4.765848032900798e-05, + "loss": 0.2447, + "step": 12613 + }, + { + "epoch": 0.22498483929654337, + "grad_norm": 0.30979442596435547, + "learning_rate": 4.765782258147023e-05, + "loss": 0.2429, + "step": 12614 + }, + { + "epoch": 0.22500267541825705, + "grad_norm": 0.35259950160980225, + "learning_rate": 4.7657164746102944e-05, + "loss": 0.1906, + "step": 12615 + }, + { + "epoch": 0.22502051153997074, + "grad_norm": 0.3277779221534729, + "learning_rate": 4.765650682290865e-05, + "loss": 0.1577, + "step": 12616 + }, + { + "epoch": 0.22503834766168446, + "grad_norm": 0.25805234909057617, + "learning_rate": 4.765584881188991e-05, + "loss": 0.2183, + "step": 12617 + }, + { + "epoch": 0.22505618378339814, + "grad_norm": 0.3252462148666382, + "learning_rate": 4.765519071304928e-05, + "loss": 0.1116, + "step": 12618 + }, + { + "epoch": 0.22507401990511183, + "grad_norm": 0.39523595571517944, + "learning_rate": 4.7654532526389306e-05, + "loss": 0.2342, + "step": 12619 + }, + { + "epoch": 0.22509185602682552, + "grad_norm": 0.25782206654548645, + "learning_rate": 4.765387425191254e-05, + "loss": 0.1651, + "step": 12620 + }, + { + "epoch": 0.2251096921485392, + "grad_norm": 0.22414354979991913, + "learning_rate": 4.765321588962153e-05, + "loss": 0.1737, + "step": 12621 + }, + { + "epoch": 0.22512752827025292, + "grad_norm": 0.29870933294296265, + "learning_rate": 4.765255743951883e-05, + "loss": 0.2362, + "step": 12622 + }, + { + "epoch": 0.2251453643919666, + "grad_norm": 0.23767724633216858, + "learning_rate": 4.7651898901606994e-05, + "loss": 0.2038, + "step": 12623 + }, + { + "epoch": 0.2251632005136803, + "grad_norm": 0.23248374462127686, + "learning_rate": 4.765124027588858e-05, + "loss": 0.1979, + "step": 12624 + }, + { + "epoch": 0.225181036635394, + "grad_norm": 0.2406260371208191, + "learning_rate": 4.765058156236613e-05, + "loss": 0.1775, + "step": 12625 + }, + { + "epoch": 0.2251988727571077, + "grad_norm": 0.23524776101112366, + "learning_rate": 4.76499227610422e-05, + "loss": 0.1654, + "step": 12626 + }, + { + "epoch": 0.2252167088788214, + "grad_norm": 0.25309523940086365, + "learning_rate": 4.7649263871919355e-05, + "loss": 0.183, + "step": 12627 + }, + { + "epoch": 0.22523454500053508, + "grad_norm": 0.22781570255756378, + "learning_rate": 4.764860489500014e-05, + "loss": 0.2338, + "step": 12628 + }, + { + "epoch": 0.22525238112224877, + "grad_norm": 0.3151392936706543, + "learning_rate": 4.76479458302871e-05, + "loss": 0.2314, + "step": 12629 + }, + { + "epoch": 0.22527021724396248, + "grad_norm": 0.1762322336435318, + "learning_rate": 4.7647286677782803e-05, + "loss": 0.1768, + "step": 12630 + }, + { + "epoch": 0.22528805336567617, + "grad_norm": 0.24711892008781433, + "learning_rate": 4.76466274374898e-05, + "loss": 0.1755, + "step": 12631 + }, + { + "epoch": 0.22530588948738986, + "grad_norm": 0.27915292978286743, + "learning_rate": 4.764596810941065e-05, + "loss": 0.2159, + "step": 12632 + }, + { + "epoch": 0.22532372560910355, + "grad_norm": 0.2858349680900574, + "learning_rate": 4.7645308693547905e-05, + "loss": 0.1921, + "step": 12633 + }, + { + "epoch": 0.22534156173081726, + "grad_norm": 0.251296728849411, + "learning_rate": 4.7644649189904125e-05, + "loss": 0.242, + "step": 12634 + }, + { + "epoch": 0.22535939785253095, + "grad_norm": 0.2733573019504547, + "learning_rate": 4.7643989598481866e-05, + "loss": 0.1767, + "step": 12635 + }, + { + "epoch": 0.22537723397424464, + "grad_norm": 0.3312844932079315, + "learning_rate": 4.7643329919283676e-05, + "loss": 0.2093, + "step": 12636 + }, + { + "epoch": 0.22539507009595833, + "grad_norm": 0.3516193926334381, + "learning_rate": 4.764267015231212e-05, + "loss": 0.2128, + "step": 12637 + }, + { + "epoch": 0.22541290621767202, + "grad_norm": 0.24420495331287384, + "learning_rate": 4.764201029756975e-05, + "loss": 0.1676, + "step": 12638 + }, + { + "epoch": 0.22543074233938573, + "grad_norm": 0.22111408412456512, + "learning_rate": 4.764135035505913e-05, + "loss": 0.1143, + "step": 12639 + }, + { + "epoch": 0.22544857846109942, + "grad_norm": 0.38935935497283936, + "learning_rate": 4.764069032478282e-05, + "loss": 0.3068, + "step": 12640 + }, + { + "epoch": 0.2254664145828131, + "grad_norm": 0.3000342547893524, + "learning_rate": 4.764003020674337e-05, + "loss": 0.1927, + "step": 12641 + }, + { + "epoch": 0.2254842507045268, + "grad_norm": 0.32595810294151306, + "learning_rate": 4.7639370000943345e-05, + "loss": 0.1914, + "step": 12642 + }, + { + "epoch": 0.2255020868262405, + "grad_norm": 0.344596266746521, + "learning_rate": 4.76387097073853e-05, + "loss": 0.1982, + "step": 12643 + }, + { + "epoch": 0.2255199229479542, + "grad_norm": 0.22488991916179657, + "learning_rate": 4.7638049326071805e-05, + "loss": 0.202, + "step": 12644 + }, + { + "epoch": 0.2255377590696679, + "grad_norm": 0.23531319200992584, + "learning_rate": 4.7637388857005404e-05, + "loss": 0.182, + "step": 12645 + }, + { + "epoch": 0.22555559519138157, + "grad_norm": 0.32740113139152527, + "learning_rate": 4.7636728300188674e-05, + "loss": 0.174, + "step": 12646 + }, + { + "epoch": 0.2255734313130953, + "grad_norm": 0.219620943069458, + "learning_rate": 4.7636067655624154e-05, + "loss": 0.1723, + "step": 12647 + }, + { + "epoch": 0.22559126743480898, + "grad_norm": 0.217381089925766, + "learning_rate": 4.763540692331443e-05, + "loss": 0.1572, + "step": 12648 + }, + { + "epoch": 0.22560910355652267, + "grad_norm": 0.271856427192688, + "learning_rate": 4.763474610326204e-05, + "loss": 0.2022, + "step": 12649 + }, + { + "epoch": 0.22562693967823635, + "grad_norm": 0.29291120171546936, + "learning_rate": 4.763408519546956e-05, + "loss": 0.2298, + "step": 12650 + }, + { + "epoch": 0.22564477579995007, + "grad_norm": 0.2949700355529785, + "learning_rate": 4.7633424199939555e-05, + "loss": 0.2076, + "step": 12651 + }, + { + "epoch": 0.22566261192166376, + "grad_norm": 0.26310575008392334, + "learning_rate": 4.7632763116674575e-05, + "loss": 0.1599, + "step": 12652 + }, + { + "epoch": 0.22568044804337745, + "grad_norm": 0.2513583302497864, + "learning_rate": 4.763210194567719e-05, + "loss": 0.1971, + "step": 12653 + }, + { + "epoch": 0.22569828416509113, + "grad_norm": 0.2546949088573456, + "learning_rate": 4.763144068694995e-05, + "loss": 0.2049, + "step": 12654 + }, + { + "epoch": 0.22571612028680485, + "grad_norm": 0.27113422751426697, + "learning_rate": 4.763077934049544e-05, + "loss": 0.1984, + "step": 12655 + }, + { + "epoch": 0.22573395640851854, + "grad_norm": 0.2423454374074936, + "learning_rate": 4.763011790631621e-05, + "loss": 0.2333, + "step": 12656 + }, + { + "epoch": 0.22575179253023223, + "grad_norm": 0.31101346015930176, + "learning_rate": 4.7629456384414826e-05, + "loss": 0.2029, + "step": 12657 + }, + { + "epoch": 0.22576962865194591, + "grad_norm": 0.3214392066001892, + "learning_rate": 4.7628794774793855e-05, + "loss": 0.2725, + "step": 12658 + }, + { + "epoch": 0.2257874647736596, + "grad_norm": 0.2528858482837677, + "learning_rate": 4.762813307745586e-05, + "loss": 0.1879, + "step": 12659 + }, + { + "epoch": 0.22580530089537332, + "grad_norm": 0.2589768171310425, + "learning_rate": 4.76274712924034e-05, + "loss": 0.18, + "step": 12660 + }, + { + "epoch": 0.225823137017087, + "grad_norm": 0.3322522044181824, + "learning_rate": 4.7626809419639056e-05, + "loss": 0.2285, + "step": 12661 + }, + { + "epoch": 0.2258409731388007, + "grad_norm": 0.3407336175441742, + "learning_rate": 4.762614745916538e-05, + "loss": 0.2062, + "step": 12662 + }, + { + "epoch": 0.22585880926051438, + "grad_norm": 0.25700899958610535, + "learning_rate": 4.762548541098494e-05, + "loss": 0.1855, + "step": 12663 + }, + { + "epoch": 0.2258766453822281, + "grad_norm": 0.2184104025363922, + "learning_rate": 4.7624823275100304e-05, + "loss": 0.201, + "step": 12664 + }, + { + "epoch": 0.22589448150394179, + "grad_norm": 0.22487549483776093, + "learning_rate": 4.762416105151404e-05, + "loss": 0.2114, + "step": 12665 + }, + { + "epoch": 0.22591231762565547, + "grad_norm": 0.28850653767585754, + "learning_rate": 4.762349874022871e-05, + "loss": 0.2195, + "step": 12666 + }, + { + "epoch": 0.22593015374736916, + "grad_norm": 0.22642260789871216, + "learning_rate": 4.7622836341246894e-05, + "loss": 0.1728, + "step": 12667 + }, + { + "epoch": 0.22594798986908288, + "grad_norm": 0.2599693834781647, + "learning_rate": 4.762217385457114e-05, + "loss": 0.2182, + "step": 12668 + }, + { + "epoch": 0.22596582599079656, + "grad_norm": 0.2706993818283081, + "learning_rate": 4.762151128020404e-05, + "loss": 0.2161, + "step": 12669 + }, + { + "epoch": 0.22598366211251025, + "grad_norm": 0.16822826862335205, + "learning_rate": 4.7620848618148126e-05, + "loss": 0.1426, + "step": 12670 + }, + { + "epoch": 0.22600149823422394, + "grad_norm": 0.19447705149650574, + "learning_rate": 4.762018586840601e-05, + "loss": 0.1589, + "step": 12671 + }, + { + "epoch": 0.22601933435593766, + "grad_norm": 0.2862134575843811, + "learning_rate": 4.761952303098023e-05, + "loss": 0.1942, + "step": 12672 + }, + { + "epoch": 0.22603717047765134, + "grad_norm": 0.2801419198513031, + "learning_rate": 4.7618860105873375e-05, + "loss": 0.181, + "step": 12673 + }, + { + "epoch": 0.22605500659936503, + "grad_norm": 0.2613162100315094, + "learning_rate": 4.7618197093088e-05, + "loss": 0.1955, + "step": 12674 + }, + { + "epoch": 0.22607284272107872, + "grad_norm": 0.24245457351207733, + "learning_rate": 4.761753399262668e-05, + "loss": 0.2491, + "step": 12675 + }, + { + "epoch": 0.22609067884279244, + "grad_norm": 0.3227483630180359, + "learning_rate": 4.761687080449199e-05, + "loss": 0.2287, + "step": 12676 + }, + { + "epoch": 0.22610851496450612, + "grad_norm": 0.41901516914367676, + "learning_rate": 4.7616207528686496e-05, + "loss": 0.2599, + "step": 12677 + }, + { + "epoch": 0.2261263510862198, + "grad_norm": 0.2581169605255127, + "learning_rate": 4.761554416521278e-05, + "loss": 0.2277, + "step": 12678 + }, + { + "epoch": 0.2261441872079335, + "grad_norm": 0.24728922545909882, + "learning_rate": 4.761488071407339e-05, + "loss": 0.1854, + "step": 12679 + }, + { + "epoch": 0.2261620233296472, + "grad_norm": 0.25558969378471375, + "learning_rate": 4.761421717527091e-05, + "loss": 0.1577, + "step": 12680 + }, + { + "epoch": 0.2261798594513609, + "grad_norm": 0.28904178738594055, + "learning_rate": 4.761355354880792e-05, + "loss": 0.1776, + "step": 12681 + }, + { + "epoch": 0.2261976955730746, + "grad_norm": 0.27921637892723083, + "learning_rate": 4.761288983468699e-05, + "loss": 0.165, + "step": 12682 + }, + { + "epoch": 0.22621553169478828, + "grad_norm": 0.24143558740615845, + "learning_rate": 4.761222603291068e-05, + "loss": 0.1766, + "step": 12683 + }, + { + "epoch": 0.22623336781650197, + "grad_norm": 0.3705345392227173, + "learning_rate": 4.761156214348158e-05, + "loss": 0.2192, + "step": 12684 + }, + { + "epoch": 0.22625120393821568, + "grad_norm": 0.26557087898254395, + "learning_rate": 4.761089816640225e-05, + "loss": 0.1727, + "step": 12685 + }, + { + "epoch": 0.22626904005992937, + "grad_norm": 0.28113749623298645, + "learning_rate": 4.761023410167527e-05, + "loss": 0.1716, + "step": 12686 + }, + { + "epoch": 0.22628687618164306, + "grad_norm": 0.32993751764297485, + "learning_rate": 4.760956994930321e-05, + "loss": 0.2177, + "step": 12687 + }, + { + "epoch": 0.22630471230335675, + "grad_norm": 0.24255356192588806, + "learning_rate": 4.760890570928865e-05, + "loss": 0.1975, + "step": 12688 + }, + { + "epoch": 0.22632254842507046, + "grad_norm": 0.24917371571063995, + "learning_rate": 4.7608241381634165e-05, + "loss": 0.2161, + "step": 12689 + }, + { + "epoch": 0.22634038454678415, + "grad_norm": 0.27715256810188293, + "learning_rate": 4.7607576966342324e-05, + "loss": 0.207, + "step": 12690 + }, + { + "epoch": 0.22635822066849784, + "grad_norm": 0.3086145222187042, + "learning_rate": 4.7606912463415716e-05, + "loss": 0.2003, + "step": 12691 + }, + { + "epoch": 0.22637605679021153, + "grad_norm": 0.29022637009620667, + "learning_rate": 4.76062478728569e-05, + "loss": 0.1795, + "step": 12692 + }, + { + "epoch": 0.22639389291192524, + "grad_norm": 0.28601041436195374, + "learning_rate": 4.7605583194668457e-05, + "loss": 0.2078, + "step": 12693 + }, + { + "epoch": 0.22641172903363893, + "grad_norm": 0.28157880902290344, + "learning_rate": 4.760491842885297e-05, + "loss": 0.2023, + "step": 12694 + }, + { + "epoch": 0.22642956515535262, + "grad_norm": 0.32047608494758606, + "learning_rate": 4.760425357541301e-05, + "loss": 0.2534, + "step": 12695 + }, + { + "epoch": 0.2264474012770663, + "grad_norm": 0.2760776877403259, + "learning_rate": 4.760358863435115e-05, + "loss": 0.1793, + "step": 12696 + }, + { + "epoch": 0.22646523739878002, + "grad_norm": 0.22464804351329803, + "learning_rate": 4.760292360566998e-05, + "loss": 0.2103, + "step": 12697 + }, + { + "epoch": 0.2264830735204937, + "grad_norm": 0.30228474736213684, + "learning_rate": 4.7602258489372074e-05, + "loss": 0.2179, + "step": 12698 + }, + { + "epoch": 0.2265009096422074, + "grad_norm": 0.3229995369911194, + "learning_rate": 4.760159328546e-05, + "loss": 0.2361, + "step": 12699 + }, + { + "epoch": 0.2265187457639211, + "grad_norm": 0.3854117691516876, + "learning_rate": 4.760092799393635e-05, + "loss": 0.2036, + "step": 12700 + }, + { + "epoch": 0.22653658188563477, + "grad_norm": 0.22347381711006165, + "learning_rate": 4.76002626148037e-05, + "loss": 0.1794, + "step": 12701 + }, + { + "epoch": 0.2265544180073485, + "grad_norm": 0.27365759015083313, + "learning_rate": 4.7599597148064614e-05, + "loss": 0.1925, + "step": 12702 + }, + { + "epoch": 0.22657225412906218, + "grad_norm": 0.28982412815093994, + "learning_rate": 4.7598931593721694e-05, + "loss": 0.182, + "step": 12703 + }, + { + "epoch": 0.22659009025077587, + "grad_norm": 0.3106169104576111, + "learning_rate": 4.759826595177751e-05, + "loss": 0.1153, + "step": 12704 + }, + { + "epoch": 0.22660792637248955, + "grad_norm": 0.33322378993034363, + "learning_rate": 4.759760022223464e-05, + "loss": 0.2377, + "step": 12705 + }, + { + "epoch": 0.22662576249420327, + "grad_norm": 0.30550891160964966, + "learning_rate": 4.759693440509566e-05, + "loss": 0.2241, + "step": 12706 + }, + { + "epoch": 0.22664359861591696, + "grad_norm": 0.2674667239189148, + "learning_rate": 4.759626850036317e-05, + "loss": 0.1504, + "step": 12707 + }, + { + "epoch": 0.22666143473763065, + "grad_norm": 0.2966086268424988, + "learning_rate": 4.7595602508039724e-05, + "loss": 0.1877, + "step": 12708 + }, + { + "epoch": 0.22667927085934433, + "grad_norm": 0.30703243613243103, + "learning_rate": 4.759493642812793e-05, + "loss": 0.1762, + "step": 12709 + }, + { + "epoch": 0.22669710698105805, + "grad_norm": 0.3626992404460907, + "learning_rate": 4.7594270260630355e-05, + "loss": 0.2374, + "step": 12710 + }, + { + "epoch": 0.22671494310277174, + "grad_norm": 0.19699066877365112, + "learning_rate": 4.7593604005549586e-05, + "loss": 0.2244, + "step": 12711 + }, + { + "epoch": 0.22673277922448543, + "grad_norm": 0.2944333553314209, + "learning_rate": 4.75929376628882e-05, + "loss": 0.154, + "step": 12712 + }, + { + "epoch": 0.2267506153461991, + "grad_norm": 0.3691287338733673, + "learning_rate": 4.759227123264879e-05, + "loss": 0.1914, + "step": 12713 + }, + { + "epoch": 0.22676845146791283, + "grad_norm": 0.2353912591934204, + "learning_rate": 4.7591604714833924e-05, + "loss": 0.1987, + "step": 12714 + }, + { + "epoch": 0.22678628758962652, + "grad_norm": 0.24651099741458893, + "learning_rate": 4.75909381094462e-05, + "loss": 0.2126, + "step": 12715 + }, + { + "epoch": 0.2268041237113402, + "grad_norm": 0.422842800617218, + "learning_rate": 4.7590271416488206e-05, + "loss": 0.1782, + "step": 12716 + }, + { + "epoch": 0.2268219598330539, + "grad_norm": 0.3569892942905426, + "learning_rate": 4.75896046359625e-05, + "loss": 0.1954, + "step": 12717 + }, + { + "epoch": 0.2268397959547676, + "grad_norm": 0.19210238754749298, + "learning_rate": 4.7588937767871697e-05, + "loss": 0.1507, + "step": 12718 + }, + { + "epoch": 0.2268576320764813, + "grad_norm": 0.26525914669036865, + "learning_rate": 4.758827081221837e-05, + "loss": 0.1736, + "step": 12719 + }, + { + "epoch": 0.22687546819819499, + "grad_norm": 0.2761824429035187, + "learning_rate": 4.75876037690051e-05, + "loss": 0.1757, + "step": 12720 + }, + { + "epoch": 0.22689330431990867, + "grad_norm": 0.26994580030441284, + "learning_rate": 4.758693663823448e-05, + "loss": 0.1805, + "step": 12721 + }, + { + "epoch": 0.22691114044162236, + "grad_norm": 0.2924158275127411, + "learning_rate": 4.758626941990909e-05, + "loss": 0.1973, + "step": 12722 + }, + { + "epoch": 0.22692897656333608, + "grad_norm": 0.2680577337741852, + "learning_rate": 4.758560211403151e-05, + "loss": 0.1803, + "step": 12723 + }, + { + "epoch": 0.22694681268504976, + "grad_norm": 0.30167117714881897, + "learning_rate": 4.758493472060435e-05, + "loss": 0.2549, + "step": 12724 + }, + { + "epoch": 0.22696464880676345, + "grad_norm": 0.21846014261245728, + "learning_rate": 4.7584267239630175e-05, + "loss": 0.181, + "step": 12725 + }, + { + "epoch": 0.22698248492847714, + "grad_norm": 0.2149186134338379, + "learning_rate": 4.758359967111158e-05, + "loss": 0.1836, + "step": 12726 + }, + { + "epoch": 0.22700032105019086, + "grad_norm": 0.4402591288089752, + "learning_rate": 4.7582932015051154e-05, + "loss": 0.2634, + "step": 12727 + }, + { + "epoch": 0.22701815717190454, + "grad_norm": 0.19699132442474365, + "learning_rate": 4.758226427145148e-05, + "loss": 0.1991, + "step": 12728 + }, + { + "epoch": 0.22703599329361823, + "grad_norm": 0.25451552867889404, + "learning_rate": 4.758159644031515e-05, + "loss": 0.2104, + "step": 12729 + }, + { + "epoch": 0.22705382941533192, + "grad_norm": 0.2836599349975586, + "learning_rate": 4.758092852164476e-05, + "loss": 0.1335, + "step": 12730 + }, + { + "epoch": 0.22707166553704564, + "grad_norm": 0.2872105836868286, + "learning_rate": 4.7580260515442886e-05, + "loss": 0.1806, + "step": 12731 + }, + { + "epoch": 0.22708950165875932, + "grad_norm": 0.2629055678844452, + "learning_rate": 4.757959242171213e-05, + "loss": 0.1859, + "step": 12732 + }, + { + "epoch": 0.227107337780473, + "grad_norm": 0.2944047152996063, + "learning_rate": 4.757892424045506e-05, + "loss": 0.2199, + "step": 12733 + }, + { + "epoch": 0.2271251739021867, + "grad_norm": 0.43053120374679565, + "learning_rate": 4.757825597167429e-05, + "loss": 0.1515, + "step": 12734 + }, + { + "epoch": 0.22714301002390042, + "grad_norm": 0.24039088189601898, + "learning_rate": 4.7577587615372405e-05, + "loss": 0.1928, + "step": 12735 + }, + { + "epoch": 0.2271608461456141, + "grad_norm": 0.2977534234523773, + "learning_rate": 4.7576919171551996e-05, + "loss": 0.201, + "step": 12736 + }, + { + "epoch": 0.2271786822673278, + "grad_norm": 0.2886347472667694, + "learning_rate": 4.7576250640215634e-05, + "loss": 0.219, + "step": 12737 + }, + { + "epoch": 0.22719651838904148, + "grad_norm": 0.25993335247039795, + "learning_rate": 4.757558202136594e-05, + "loss": 0.1613, + "step": 12738 + }, + { + "epoch": 0.22721435451075517, + "grad_norm": 0.22002577781677246, + "learning_rate": 4.757491331500549e-05, + "loss": 0.1665, + "step": 12739 + }, + { + "epoch": 0.22723219063246888, + "grad_norm": 0.2938578724861145, + "learning_rate": 4.757424452113688e-05, + "loss": 0.1938, + "step": 12740 + }, + { + "epoch": 0.22725002675418257, + "grad_norm": 0.33019351959228516, + "learning_rate": 4.75735756397627e-05, + "loss": 0.1867, + "step": 12741 + }, + { + "epoch": 0.22726786287589626, + "grad_norm": 0.2709706425666809, + "learning_rate": 4.7572906670885544e-05, + "loss": 0.2353, + "step": 12742 + }, + { + "epoch": 0.22728569899760995, + "grad_norm": 0.23868712782859802, + "learning_rate": 4.7572237614508e-05, + "loss": 0.207, + "step": 12743 + }, + { + "epoch": 0.22730353511932366, + "grad_norm": 0.19620831310749054, + "learning_rate": 4.757156847063268e-05, + "loss": 0.1959, + "step": 12744 + }, + { + "epoch": 0.22732137124103735, + "grad_norm": 0.2820335030555725, + "learning_rate": 4.7570899239262155e-05, + "loss": 0.1799, + "step": 12745 + }, + { + "epoch": 0.22733920736275104, + "grad_norm": 0.2542852759361267, + "learning_rate": 4.757022992039903e-05, + "loss": 0.1962, + "step": 12746 + }, + { + "epoch": 0.22735704348446473, + "grad_norm": 0.2287750542163849, + "learning_rate": 4.7569560514045895e-05, + "loss": 0.1549, + "step": 12747 + }, + { + "epoch": 0.22737487960617844, + "grad_norm": 0.2620689868927002, + "learning_rate": 4.7568891020205354e-05, + "loss": 0.1945, + "step": 12748 + }, + { + "epoch": 0.22739271572789213, + "grad_norm": 0.30139219760894775, + "learning_rate": 4.756822143887999e-05, + "loss": 0.2162, + "step": 12749 + }, + { + "epoch": 0.22741055184960582, + "grad_norm": 0.23995572328567505, + "learning_rate": 4.7567551770072416e-05, + "loss": 0.1731, + "step": 12750 + }, + { + "epoch": 0.2274283879713195, + "grad_norm": 0.23730701208114624, + "learning_rate": 4.756688201378521e-05, + "loss": 0.1883, + "step": 12751 + }, + { + "epoch": 0.22744622409303322, + "grad_norm": 0.27638038992881775, + "learning_rate": 4.756621217002097e-05, + "loss": 0.2125, + "step": 12752 + }, + { + "epoch": 0.2274640602147469, + "grad_norm": 0.3324611485004425, + "learning_rate": 4.75655422387823e-05, + "loss": 0.1992, + "step": 12753 + }, + { + "epoch": 0.2274818963364606, + "grad_norm": 0.2876867353916168, + "learning_rate": 4.75648722200718e-05, + "loss": 0.2105, + "step": 12754 + }, + { + "epoch": 0.2274997324581743, + "grad_norm": 0.2931671738624573, + "learning_rate": 4.756420211389206e-05, + "loss": 0.1459, + "step": 12755 + }, + { + "epoch": 0.227517568579888, + "grad_norm": 0.3282272517681122, + "learning_rate": 4.7563531920245675e-05, + "loss": 0.1789, + "step": 12756 + }, + { + "epoch": 0.2275354047016017, + "grad_norm": 0.24702872335910797, + "learning_rate": 4.7562861639135254e-05, + "loss": 0.2027, + "step": 12757 + }, + { + "epoch": 0.22755324082331538, + "grad_norm": 0.32239800691604614, + "learning_rate": 4.756219127056338e-05, + "loss": 0.1549, + "step": 12758 + }, + { + "epoch": 0.22757107694502907, + "grad_norm": 0.3866209387779236, + "learning_rate": 4.756152081453267e-05, + "loss": 0.2548, + "step": 12759 + }, + { + "epoch": 0.22758891306674275, + "grad_norm": 0.27214887738227844, + "learning_rate": 4.75608502710457e-05, + "loss": 0.2139, + "step": 12760 + }, + { + "epoch": 0.22760674918845647, + "grad_norm": 0.3311125338077545, + "learning_rate": 4.75601796401051e-05, + "loss": 0.2604, + "step": 12761 + }, + { + "epoch": 0.22762458531017016, + "grad_norm": 0.23066891729831696, + "learning_rate": 4.7559508921713436e-05, + "loss": 0.1856, + "step": 12762 + }, + { + "epoch": 0.22764242143188385, + "grad_norm": 0.2128305584192276, + "learning_rate": 4.755883811587333e-05, + "loss": 0.1963, + "step": 12763 + }, + { + "epoch": 0.22766025755359753, + "grad_norm": 0.31742042303085327, + "learning_rate": 4.755816722258737e-05, + "loss": 0.1727, + "step": 12764 + }, + { + "epoch": 0.22767809367531125, + "grad_norm": 0.27673065662384033, + "learning_rate": 4.7557496241858165e-05, + "loss": 0.2091, + "step": 12765 + }, + { + "epoch": 0.22769592979702494, + "grad_norm": 0.459807425737381, + "learning_rate": 4.7556825173688314e-05, + "loss": 0.2079, + "step": 12766 + }, + { + "epoch": 0.22771376591873863, + "grad_norm": 0.4824574887752533, + "learning_rate": 4.7556154018080424e-05, + "loss": 0.1712, + "step": 12767 + }, + { + "epoch": 0.2277316020404523, + "grad_norm": 0.3288426995277405, + "learning_rate": 4.7555482775037084e-05, + "loss": 0.1877, + "step": 12768 + }, + { + "epoch": 0.22774943816216603, + "grad_norm": 0.23207063972949982, + "learning_rate": 4.7554811444560896e-05, + "loss": 0.1687, + "step": 12769 + }, + { + "epoch": 0.22776727428387972, + "grad_norm": 0.22554905712604523, + "learning_rate": 4.755414002665448e-05, + "loss": 0.2006, + "step": 12770 + }, + { + "epoch": 0.2277851104055934, + "grad_norm": 0.33833837509155273, + "learning_rate": 4.7553468521320424e-05, + "loss": 0.2134, + "step": 12771 + }, + { + "epoch": 0.2278029465273071, + "grad_norm": 0.28130850195884705, + "learning_rate": 4.755279692856134e-05, + "loss": 0.1603, + "step": 12772 + }, + { + "epoch": 0.2278207826490208, + "grad_norm": 0.2707313299179077, + "learning_rate": 4.755212524837981e-05, + "loss": 0.218, + "step": 12773 + }, + { + "epoch": 0.2278386187707345, + "grad_norm": 0.2821885049343109, + "learning_rate": 4.755145348077847e-05, + "loss": 0.1581, + "step": 12774 + }, + { + "epoch": 0.22785645489244818, + "grad_norm": 0.2544398903846741, + "learning_rate": 4.75507816257599e-05, + "loss": 0.1915, + "step": 12775 + }, + { + "epoch": 0.22787429101416187, + "grad_norm": 0.2084919959306717, + "learning_rate": 4.755010968332671e-05, + "loss": 0.1642, + "step": 12776 + }, + { + "epoch": 0.2278921271358756, + "grad_norm": 0.25685441493988037, + "learning_rate": 4.754943765348151e-05, + "loss": 0.1984, + "step": 12777 + }, + { + "epoch": 0.22790996325758928, + "grad_norm": 0.2325342744588852, + "learning_rate": 4.75487655362269e-05, + "loss": 0.2185, + "step": 12778 + }, + { + "epoch": 0.22792779937930296, + "grad_norm": 0.21224354207515717, + "learning_rate": 4.754809333156548e-05, + "loss": 0.1796, + "step": 12779 + }, + { + "epoch": 0.22794563550101665, + "grad_norm": 0.32012009620666504, + "learning_rate": 4.754742103949987e-05, + "loss": 0.2174, + "step": 12780 + }, + { + "epoch": 0.22796347162273034, + "grad_norm": 0.26159098744392395, + "learning_rate": 4.754674866003267e-05, + "loss": 0.126, + "step": 12781 + }, + { + "epoch": 0.22798130774444406, + "grad_norm": 0.22669540345668793, + "learning_rate": 4.7546076193166477e-05, + "loss": 0.2208, + "step": 12782 + }, + { + "epoch": 0.22799914386615774, + "grad_norm": 0.28815677762031555, + "learning_rate": 4.754540363890391e-05, + "loss": 0.2018, + "step": 12783 + }, + { + "epoch": 0.22801697998787143, + "grad_norm": 0.40440696477890015, + "learning_rate": 4.754473099724758e-05, + "loss": 0.2128, + "step": 12784 + }, + { + "epoch": 0.22803481610958512, + "grad_norm": 0.26720649003982544, + "learning_rate": 4.754405826820008e-05, + "loss": 0.2146, + "step": 12785 + }, + { + "epoch": 0.22805265223129884, + "grad_norm": 0.2865886092185974, + "learning_rate": 4.754338545176401e-05, + "loss": 0.1743, + "step": 12786 + }, + { + "epoch": 0.22807048835301252, + "grad_norm": 0.2756912410259247, + "learning_rate": 4.7542712547942e-05, + "loss": 0.2153, + "step": 12787 + }, + { + "epoch": 0.2280883244747262, + "grad_norm": 0.2839483320713043, + "learning_rate": 4.7542039556736663e-05, + "loss": 0.2104, + "step": 12788 + }, + { + "epoch": 0.2281061605964399, + "grad_norm": 0.30994048714637756, + "learning_rate": 4.7541366478150585e-05, + "loss": 0.2056, + "step": 12789 + }, + { + "epoch": 0.22812399671815362, + "grad_norm": 0.2848181128501892, + "learning_rate": 4.754069331218638e-05, + "loss": 0.1902, + "step": 12790 + }, + { + "epoch": 0.2281418328398673, + "grad_norm": 0.3385939598083496, + "learning_rate": 4.754002005884667e-05, + "loss": 0.2294, + "step": 12791 + }, + { + "epoch": 0.228159668961581, + "grad_norm": 0.2165912538766861, + "learning_rate": 4.7539346718134055e-05, + "loss": 0.1937, + "step": 12792 + }, + { + "epoch": 0.22817750508329468, + "grad_norm": 0.29350340366363525, + "learning_rate": 4.7538673290051144e-05, + "loss": 0.2353, + "step": 12793 + }, + { + "epoch": 0.2281953412050084, + "grad_norm": 0.310447096824646, + "learning_rate": 4.753799977460055e-05, + "loss": 0.2461, + "step": 12794 + }, + { + "epoch": 0.22821317732672208, + "grad_norm": 0.24659083783626556, + "learning_rate": 4.753732617178489e-05, + "loss": 0.2246, + "step": 12795 + }, + { + "epoch": 0.22823101344843577, + "grad_norm": 0.35219600796699524, + "learning_rate": 4.753665248160677e-05, + "loss": 0.1628, + "step": 12796 + }, + { + "epoch": 0.22824884957014946, + "grad_norm": 0.25911155343055725, + "learning_rate": 4.75359787040688e-05, + "loss": 0.1808, + "step": 12797 + }, + { + "epoch": 0.22826668569186317, + "grad_norm": 0.23166437447071075, + "learning_rate": 4.753530483917359e-05, + "loss": 0.1914, + "step": 12798 + }, + { + "epoch": 0.22828452181357686, + "grad_norm": 0.3343127369880676, + "learning_rate": 4.753463088692376e-05, + "loss": 0.1493, + "step": 12799 + }, + { + "epoch": 0.22830235793529055, + "grad_norm": 0.3121708333492279, + "learning_rate": 4.7533956847321916e-05, + "loss": 0.1674, + "step": 12800 + }, + { + "epoch": 0.22832019405700424, + "grad_norm": 0.30146902799606323, + "learning_rate": 4.753328272037066e-05, + "loss": 0.1684, + "step": 12801 + }, + { + "epoch": 0.22833803017871793, + "grad_norm": 0.2249869406223297, + "learning_rate": 4.7532608506072636e-05, + "loss": 0.1633, + "step": 12802 + }, + { + "epoch": 0.22835586630043164, + "grad_norm": 0.3040216565132141, + "learning_rate": 4.753193420443043e-05, + "loss": 0.258, + "step": 12803 + }, + { + "epoch": 0.22837370242214533, + "grad_norm": 0.24092234671115875, + "learning_rate": 4.7531259815446666e-05, + "loss": 0.1817, + "step": 12804 + }, + { + "epoch": 0.22839153854385902, + "grad_norm": 0.27227193117141724, + "learning_rate": 4.753058533912396e-05, + "loss": 0.1663, + "step": 12805 + }, + { + "epoch": 0.2284093746655727, + "grad_norm": 0.37233835458755493, + "learning_rate": 4.752991077546491e-05, + "loss": 0.1546, + "step": 12806 + }, + { + "epoch": 0.22842721078728642, + "grad_norm": 0.3068927526473999, + "learning_rate": 4.752923612447216e-05, + "loss": 0.1746, + "step": 12807 + }, + { + "epoch": 0.2284450469090001, + "grad_norm": 0.28168389201164246, + "learning_rate": 4.7528561386148305e-05, + "loss": 0.2291, + "step": 12808 + }, + { + "epoch": 0.2284628830307138, + "grad_norm": 0.29312944412231445, + "learning_rate": 4.752788656049596e-05, + "loss": 0.2322, + "step": 12809 + }, + { + "epoch": 0.22848071915242749, + "grad_norm": 0.4435538053512573, + "learning_rate": 4.7527211647517757e-05, + "loss": 0.2544, + "step": 12810 + }, + { + "epoch": 0.2284985552741412, + "grad_norm": 0.41616931557655334, + "learning_rate": 4.7526536647216294e-05, + "loss": 0.1661, + "step": 12811 + }, + { + "epoch": 0.2285163913958549, + "grad_norm": 0.23465916514396667, + "learning_rate": 4.7525861559594185e-05, + "loss": 0.1807, + "step": 12812 + }, + { + "epoch": 0.22853422751756858, + "grad_norm": 0.2502564787864685, + "learning_rate": 4.752518638465407e-05, + "loss": 0.1787, + "step": 12813 + }, + { + "epoch": 0.22855206363928227, + "grad_norm": 0.33122512698173523, + "learning_rate": 4.752451112239854e-05, + "loss": 0.16, + "step": 12814 + }, + { + "epoch": 0.22856989976099598, + "grad_norm": 0.24872533977031708, + "learning_rate": 4.752383577283024e-05, + "loss": 0.1655, + "step": 12815 + }, + { + "epoch": 0.22858773588270967, + "grad_norm": 0.3210082948207855, + "learning_rate": 4.752316033595177e-05, + "loss": 0.187, + "step": 12816 + }, + { + "epoch": 0.22860557200442336, + "grad_norm": 0.3571023941040039, + "learning_rate": 4.752248481176574e-05, + "loss": 0.2349, + "step": 12817 + }, + { + "epoch": 0.22862340812613705, + "grad_norm": 0.3391280770301819, + "learning_rate": 4.752180920027479e-05, + "loss": 0.1791, + "step": 12818 + }, + { + "epoch": 0.22864124424785076, + "grad_norm": 0.25080451369285583, + "learning_rate": 4.752113350148153e-05, + "loss": 0.1973, + "step": 12819 + }, + { + "epoch": 0.22865908036956445, + "grad_norm": 0.25983667373657227, + "learning_rate": 4.7520457715388566e-05, + "loss": 0.1727, + "step": 12820 + }, + { + "epoch": 0.22867691649127814, + "grad_norm": 0.4141266644001007, + "learning_rate": 4.751978184199854e-05, + "loss": 0.2127, + "step": 12821 + }, + { + "epoch": 0.22869475261299183, + "grad_norm": 0.2053796350955963, + "learning_rate": 4.751910588131406e-05, + "loss": 0.1704, + "step": 12822 + }, + { + "epoch": 0.2287125887347055, + "grad_norm": 0.20918521285057068, + "learning_rate": 4.7518429833337754e-05, + "loss": 0.1617, + "step": 12823 + }, + { + "epoch": 0.22873042485641923, + "grad_norm": 0.2291272133588791, + "learning_rate": 4.751775369807222e-05, + "loss": 0.18, + "step": 12824 + }, + { + "epoch": 0.22874826097813292, + "grad_norm": 0.31480973958969116, + "learning_rate": 4.751707747552011e-05, + "loss": 0.1797, + "step": 12825 + }, + { + "epoch": 0.2287660970998466, + "grad_norm": 0.30694177746772766, + "learning_rate": 4.751640116568402e-05, + "loss": 0.2233, + "step": 12826 + }, + { + "epoch": 0.2287839332215603, + "grad_norm": 0.2968466281890869, + "learning_rate": 4.7515724768566595e-05, + "loss": 0.2103, + "step": 12827 + }, + { + "epoch": 0.228801769343274, + "grad_norm": 0.2747965455055237, + "learning_rate": 4.751504828417043e-05, + "loss": 0.1908, + "step": 12828 + }, + { + "epoch": 0.2288196054649877, + "grad_norm": 0.28568482398986816, + "learning_rate": 4.751437171249817e-05, + "loss": 0.2047, + "step": 12829 + }, + { + "epoch": 0.22883744158670138, + "grad_norm": 0.3147212564945221, + "learning_rate": 4.751369505355242e-05, + "loss": 0.1898, + "step": 12830 + }, + { + "epoch": 0.22885527770841507, + "grad_norm": 0.28301647305488586, + "learning_rate": 4.751301830733582e-05, + "loss": 0.2023, + "step": 12831 + }, + { + "epoch": 0.2288731138301288, + "grad_norm": 0.2519884705543518, + "learning_rate": 4.751234147385099e-05, + "loss": 0.1737, + "step": 12832 + }, + { + "epoch": 0.22889094995184248, + "grad_norm": 0.40235304832458496, + "learning_rate": 4.7511664553100544e-05, + "loss": 0.2205, + "step": 12833 + }, + { + "epoch": 0.22890878607355616, + "grad_norm": 0.21696004271507263, + "learning_rate": 4.7510987545087105e-05, + "loss": 0.1797, + "step": 12834 + }, + { + "epoch": 0.22892662219526985, + "grad_norm": 0.29813551902770996, + "learning_rate": 4.751031044981331e-05, + "loss": 0.2039, + "step": 12835 + }, + { + "epoch": 0.22894445831698357, + "grad_norm": 0.22463171184062958, + "learning_rate": 4.7509633267281775e-05, + "loss": 0.203, + "step": 12836 + }, + { + "epoch": 0.22896229443869726, + "grad_norm": 0.29856470227241516, + "learning_rate": 4.750895599749513e-05, + "loss": 0.1771, + "step": 12837 + }, + { + "epoch": 0.22898013056041094, + "grad_norm": 0.32924017310142517, + "learning_rate": 4.750827864045599e-05, + "loss": 0.1918, + "step": 12838 + }, + { + "epoch": 0.22899796668212463, + "grad_norm": 0.3483649790287018, + "learning_rate": 4.7507601196167e-05, + "loss": 0.2634, + "step": 12839 + }, + { + "epoch": 0.22901580280383832, + "grad_norm": 0.37428799271583557, + "learning_rate": 4.7506923664630765e-05, + "loss": 0.2369, + "step": 12840 + }, + { + "epoch": 0.22903363892555204, + "grad_norm": 0.3149733543395996, + "learning_rate": 4.7506246045849916e-05, + "loss": 0.2341, + "step": 12841 + }, + { + "epoch": 0.22905147504726572, + "grad_norm": 0.23539508879184723, + "learning_rate": 4.750556833982709e-05, + "loss": 0.1631, + "step": 12842 + }, + { + "epoch": 0.2290693111689794, + "grad_norm": 0.3394657373428345, + "learning_rate": 4.750489054656491e-05, + "loss": 0.2337, + "step": 12843 + }, + { + "epoch": 0.2290871472906931, + "grad_norm": 0.26157131791114807, + "learning_rate": 4.7504212666065996e-05, + "loss": 0.2209, + "step": 12844 + }, + { + "epoch": 0.22910498341240682, + "grad_norm": 0.3097561299800873, + "learning_rate": 4.750353469833298e-05, + "loss": 0.1824, + "step": 12845 + }, + { + "epoch": 0.2291228195341205, + "grad_norm": 0.2788170874118805, + "learning_rate": 4.75028566433685e-05, + "loss": 0.1674, + "step": 12846 + }, + { + "epoch": 0.2291406556558342, + "grad_norm": 0.23734356462955475, + "learning_rate": 4.7502178501175165e-05, + "loss": 0.1911, + "step": 12847 + }, + { + "epoch": 0.22915849177754788, + "grad_norm": 0.33548101782798767, + "learning_rate": 4.750150027175562e-05, + "loss": 0.2315, + "step": 12848 + }, + { + "epoch": 0.2291763278992616, + "grad_norm": 0.22799284756183624, + "learning_rate": 4.750082195511248e-05, + "loss": 0.1867, + "step": 12849 + }, + { + "epoch": 0.22919416402097528, + "grad_norm": 0.329924076795578, + "learning_rate": 4.75001435512484e-05, + "loss": 0.2362, + "step": 12850 + }, + { + "epoch": 0.22921200014268897, + "grad_norm": 0.26934555172920227, + "learning_rate": 4.749946506016598e-05, + "loss": 0.2227, + "step": 12851 + }, + { + "epoch": 0.22922983626440266, + "grad_norm": 0.28445518016815186, + "learning_rate": 4.7498786481867864e-05, + "loss": 0.1908, + "step": 12852 + }, + { + "epoch": 0.22924767238611637, + "grad_norm": 0.23385927081108093, + "learning_rate": 4.749810781635668e-05, + "loss": 0.1717, + "step": 12853 + }, + { + "epoch": 0.22926550850783006, + "grad_norm": 0.3572027087211609, + "learning_rate": 4.749742906363506e-05, + "loss": 0.2461, + "step": 12854 + }, + { + "epoch": 0.22928334462954375, + "grad_norm": 0.26154258847236633, + "learning_rate": 4.7496750223705635e-05, + "loss": 0.1837, + "step": 12855 + }, + { + "epoch": 0.22930118075125744, + "grad_norm": 0.27525046467781067, + "learning_rate": 4.749607129657104e-05, + "loss": 0.1844, + "step": 12856 + }, + { + "epoch": 0.22931901687297115, + "grad_norm": 0.3137851357460022, + "learning_rate": 4.7495392282233896e-05, + "loss": 0.1366, + "step": 12857 + }, + { + "epoch": 0.22933685299468484, + "grad_norm": 0.32831886410713196, + "learning_rate": 4.749471318069685e-05, + "loss": 0.207, + "step": 12858 + }, + { + "epoch": 0.22935468911639853, + "grad_norm": 0.3689045011997223, + "learning_rate": 4.7494033991962514e-05, + "loss": 0.2653, + "step": 12859 + }, + { + "epoch": 0.22937252523811222, + "grad_norm": 0.36577335000038147, + "learning_rate": 4.7493354716033545e-05, + "loss": 0.2443, + "step": 12860 + }, + { + "epoch": 0.2293903613598259, + "grad_norm": 0.4057885706424713, + "learning_rate": 4.7492675352912556e-05, + "loss": 0.1787, + "step": 12861 + }, + { + "epoch": 0.22940819748153962, + "grad_norm": 0.24981991946697235, + "learning_rate": 4.7491995902602196e-05, + "loss": 0.1903, + "step": 12862 + }, + { + "epoch": 0.2294260336032533, + "grad_norm": 0.23707671463489532, + "learning_rate": 4.749131636510509e-05, + "loss": 0.184, + "step": 12863 + }, + { + "epoch": 0.229443869724967, + "grad_norm": 0.242751806974411, + "learning_rate": 4.7490636740423863e-05, + "loss": 0.1633, + "step": 12864 + }, + { + "epoch": 0.22946170584668069, + "grad_norm": 0.34951525926589966, + "learning_rate": 4.748995702856117e-05, + "loss": 0.24, + "step": 12865 + }, + { + "epoch": 0.2294795419683944, + "grad_norm": 0.2674331068992615, + "learning_rate": 4.748927722951963e-05, + "loss": 0.1761, + "step": 12866 + }, + { + "epoch": 0.2294973780901081, + "grad_norm": 0.34835100173950195, + "learning_rate": 4.748859734330189e-05, + "loss": 0.244, + "step": 12867 + }, + { + "epoch": 0.22951521421182178, + "grad_norm": 0.2509762942790985, + "learning_rate": 4.748791736991058e-05, + "loss": 0.1699, + "step": 12868 + }, + { + "epoch": 0.22953305033353547, + "grad_norm": 0.2583219110965729, + "learning_rate": 4.7487237309348334e-05, + "loss": 0.1958, + "step": 12869 + }, + { + "epoch": 0.22955088645524918, + "grad_norm": 0.24803723394870758, + "learning_rate": 4.7486557161617785e-05, + "loss": 0.1898, + "step": 12870 + }, + { + "epoch": 0.22956872257696287, + "grad_norm": 0.23758569359779358, + "learning_rate": 4.7485876926721576e-05, + "loss": 0.2047, + "step": 12871 + }, + { + "epoch": 0.22958655869867656, + "grad_norm": 0.30725789070129395, + "learning_rate": 4.748519660466234e-05, + "loss": 0.217, + "step": 12872 + }, + { + "epoch": 0.22960439482039025, + "grad_norm": 0.29835864901542664, + "learning_rate": 4.748451619544272e-05, + "loss": 0.1959, + "step": 12873 + }, + { + "epoch": 0.22962223094210396, + "grad_norm": 0.26595067977905273, + "learning_rate": 4.748383569906535e-05, + "loss": 0.1975, + "step": 12874 + }, + { + "epoch": 0.22964006706381765, + "grad_norm": 0.3447571098804474, + "learning_rate": 4.7483155115532865e-05, + "loss": 0.237, + "step": 12875 + }, + { + "epoch": 0.22965790318553134, + "grad_norm": 0.2722747325897217, + "learning_rate": 4.74824744448479e-05, + "loss": 0.212, + "step": 12876 + }, + { + "epoch": 0.22967573930724502, + "grad_norm": 0.207061767578125, + "learning_rate": 4.748179368701311e-05, + "loss": 0.1841, + "step": 12877 + }, + { + "epoch": 0.22969357542895874, + "grad_norm": 0.31938791275024414, + "learning_rate": 4.7481112842031104e-05, + "loss": 0.1613, + "step": 12878 + }, + { + "epoch": 0.22971141155067243, + "grad_norm": 0.2799973785877228, + "learning_rate": 4.7480431909904556e-05, + "loss": 0.1937, + "step": 12879 + }, + { + "epoch": 0.22972924767238612, + "grad_norm": 0.22087237238883972, + "learning_rate": 4.7479750890636085e-05, + "loss": 0.1444, + "step": 12880 + }, + { + "epoch": 0.2297470837940998, + "grad_norm": 0.35536837577819824, + "learning_rate": 4.7479069784228333e-05, + "loss": 0.1714, + "step": 12881 + }, + { + "epoch": 0.2297649199158135, + "grad_norm": 0.3145037889480591, + "learning_rate": 4.747838859068395e-05, + "loss": 0.2284, + "step": 12882 + }, + { + "epoch": 0.2297827560375272, + "grad_norm": 0.2892024517059326, + "learning_rate": 4.747770731000556e-05, + "loss": 0.2337, + "step": 12883 + }, + { + "epoch": 0.2298005921592409, + "grad_norm": 0.29131457209587097, + "learning_rate": 4.747702594219582e-05, + "loss": 0.1857, + "step": 12884 + }, + { + "epoch": 0.22981842828095458, + "grad_norm": 0.32974138855934143, + "learning_rate": 4.747634448725736e-05, + "loss": 0.2029, + "step": 12885 + }, + { + "epoch": 0.22983626440266827, + "grad_norm": 0.27079445123672485, + "learning_rate": 4.747566294519283e-05, + "loss": 0.2205, + "step": 12886 + }, + { + "epoch": 0.229854100524382, + "grad_norm": 0.25136810541152954, + "learning_rate": 4.747498131600486e-05, + "loss": 0.2119, + "step": 12887 + }, + { + "epoch": 0.22987193664609568, + "grad_norm": 0.3759794533252716, + "learning_rate": 4.747429959969611e-05, + "loss": 0.2434, + "step": 12888 + }, + { + "epoch": 0.22988977276780936, + "grad_norm": 0.34786808490753174, + "learning_rate": 4.7473617796269204e-05, + "loss": 0.2014, + "step": 12889 + }, + { + "epoch": 0.22990760888952305, + "grad_norm": 0.23923760652542114, + "learning_rate": 4.74729359057268e-05, + "loss": 0.1901, + "step": 12890 + }, + { + "epoch": 0.22992544501123677, + "grad_norm": 0.336771696805954, + "learning_rate": 4.747225392807153e-05, + "loss": 0.2576, + "step": 12891 + }, + { + "epoch": 0.22994328113295046, + "grad_norm": 0.28458261489868164, + "learning_rate": 4.7471571863306045e-05, + "loss": 0.2122, + "step": 12892 + }, + { + "epoch": 0.22996111725466414, + "grad_norm": 0.28829190135002136, + "learning_rate": 4.747088971143298e-05, + "loss": 0.1759, + "step": 12893 + }, + { + "epoch": 0.22997895337637783, + "grad_norm": 0.1950722336769104, + "learning_rate": 4.7470207472454985e-05, + "loss": 0.1724, + "step": 12894 + }, + { + "epoch": 0.22999678949809155, + "grad_norm": 0.30134278535842896, + "learning_rate": 4.746952514637471e-05, + "loss": 0.2178, + "step": 12895 + }, + { + "epoch": 0.23001462561980524, + "grad_norm": 0.2403934895992279, + "learning_rate": 4.74688427331948e-05, + "loss": 0.1623, + "step": 12896 + }, + { + "epoch": 0.23003246174151892, + "grad_norm": 0.3011443316936493, + "learning_rate": 4.746816023291788e-05, + "loss": 0.1716, + "step": 12897 + }, + { + "epoch": 0.2300502978632326, + "grad_norm": 0.2563803195953369, + "learning_rate": 4.746747764554662e-05, + "loss": 0.1566, + "step": 12898 + }, + { + "epoch": 0.23006813398494633, + "grad_norm": 0.5843698978424072, + "learning_rate": 4.746679497108366e-05, + "loss": 0.209, + "step": 12899 + }, + { + "epoch": 0.23008597010666001, + "grad_norm": 0.3100547790527344, + "learning_rate": 4.746611220953164e-05, + "loss": 0.2086, + "step": 12900 + }, + { + "epoch": 0.2301038062283737, + "grad_norm": 0.4501771926879883, + "learning_rate": 4.746542936089321e-05, + "loss": 0.1677, + "step": 12901 + }, + { + "epoch": 0.2301216423500874, + "grad_norm": 0.24397575855255127, + "learning_rate": 4.746474642517101e-05, + "loss": 0.1961, + "step": 12902 + }, + { + "epoch": 0.23013947847180108, + "grad_norm": 0.25141119956970215, + "learning_rate": 4.74640634023677e-05, + "loss": 0.1715, + "step": 12903 + }, + { + "epoch": 0.2301573145935148, + "grad_norm": 0.2494877725839615, + "learning_rate": 4.746338029248592e-05, + "loss": 0.1348, + "step": 12904 + }, + { + "epoch": 0.23017515071522848, + "grad_norm": 0.3292723000049591, + "learning_rate": 4.746269709552832e-05, + "loss": 0.1579, + "step": 12905 + }, + { + "epoch": 0.23019298683694217, + "grad_norm": 0.2648920714855194, + "learning_rate": 4.746201381149755e-05, + "loss": 0.1587, + "step": 12906 + }, + { + "epoch": 0.23021082295865586, + "grad_norm": 0.1770116090774536, + "learning_rate": 4.746133044039625e-05, + "loss": 0.1682, + "step": 12907 + }, + { + "epoch": 0.23022865908036957, + "grad_norm": 0.28378725051879883, + "learning_rate": 4.746064698222708e-05, + "loss": 0.115, + "step": 12908 + }, + { + "epoch": 0.23024649520208326, + "grad_norm": 0.23963265120983124, + "learning_rate": 4.745996343699268e-05, + "loss": 0.174, + "step": 12909 + }, + { + "epoch": 0.23026433132379695, + "grad_norm": 0.3272024989128113, + "learning_rate": 4.745927980469571e-05, + "loss": 0.1827, + "step": 12910 + }, + { + "epoch": 0.23028216744551064, + "grad_norm": 0.26129600405693054, + "learning_rate": 4.7458596085338816e-05, + "loss": 0.1964, + "step": 12911 + }, + { + "epoch": 0.23030000356722435, + "grad_norm": 0.2599860429763794, + "learning_rate": 4.745791227892464e-05, + "loss": 0.1631, + "step": 12912 + }, + { + "epoch": 0.23031783968893804, + "grad_norm": 0.2960149943828583, + "learning_rate": 4.745722838545584e-05, + "loss": 0.1618, + "step": 12913 + }, + { + "epoch": 0.23033567581065173, + "grad_norm": 0.29296499490737915, + "learning_rate": 4.7456544404935074e-05, + "loss": 0.205, + "step": 12914 + }, + { + "epoch": 0.23035351193236542, + "grad_norm": 0.23233060538768768, + "learning_rate": 4.7455860337364974e-05, + "loss": 0.1898, + "step": 12915 + }, + { + "epoch": 0.23037134805407913, + "grad_norm": 0.28250375390052795, + "learning_rate": 4.745517618274821e-05, + "loss": 0.1923, + "step": 12916 + }, + { + "epoch": 0.23038918417579282, + "grad_norm": 0.3355405926704407, + "learning_rate": 4.745449194108743e-05, + "loss": 0.2139, + "step": 12917 + }, + { + "epoch": 0.2304070202975065, + "grad_norm": 0.3375043272972107, + "learning_rate": 4.745380761238528e-05, + "loss": 0.222, + "step": 12918 + }, + { + "epoch": 0.2304248564192202, + "grad_norm": 0.24035699665546417, + "learning_rate": 4.7453123196644415e-05, + "loss": 0.1886, + "step": 12919 + }, + { + "epoch": 0.23044269254093389, + "grad_norm": 0.2645739018917084, + "learning_rate": 4.7452438693867493e-05, + "loss": 0.2325, + "step": 12920 + }, + { + "epoch": 0.2304605286626476, + "grad_norm": 0.31312644481658936, + "learning_rate": 4.7451754104057164e-05, + "loss": 0.2007, + "step": 12921 + }, + { + "epoch": 0.2304783647843613, + "grad_norm": 0.21971257030963898, + "learning_rate": 4.7451069427216075e-05, + "loss": 0.1921, + "step": 12922 + }, + { + "epoch": 0.23049620090607498, + "grad_norm": 0.2702762484550476, + "learning_rate": 4.745038466334689e-05, + "loss": 0.2327, + "step": 12923 + }, + { + "epoch": 0.23051403702778867, + "grad_norm": 0.33635812997817993, + "learning_rate": 4.744969981245226e-05, + "loss": 0.258, + "step": 12924 + }, + { + "epoch": 0.23053187314950238, + "grad_norm": 0.3847140073776245, + "learning_rate": 4.7449014874534844e-05, + "loss": 0.1502, + "step": 12925 + }, + { + "epoch": 0.23054970927121607, + "grad_norm": 0.2540401816368103, + "learning_rate": 4.744832984959729e-05, + "loss": 0.1938, + "step": 12926 + }, + { + "epoch": 0.23056754539292976, + "grad_norm": 0.3007860779762268, + "learning_rate": 4.7447644737642264e-05, + "loss": 0.1571, + "step": 12927 + }, + { + "epoch": 0.23058538151464344, + "grad_norm": 0.235521137714386, + "learning_rate": 4.7446959538672395e-05, + "loss": 0.1882, + "step": 12928 + }, + { + "epoch": 0.23060321763635716, + "grad_norm": 0.2145262509584427, + "learning_rate": 4.744627425269037e-05, + "loss": 0.1477, + "step": 12929 + }, + { + "epoch": 0.23062105375807085, + "grad_norm": 0.22935424745082855, + "learning_rate": 4.744558887969883e-05, + "loss": 0.1882, + "step": 12930 + }, + { + "epoch": 0.23063888987978454, + "grad_norm": 0.3163914680480957, + "learning_rate": 4.744490341970044e-05, + "loss": 0.1726, + "step": 12931 + }, + { + "epoch": 0.23065672600149822, + "grad_norm": 0.3141555190086365, + "learning_rate": 4.744421787269785e-05, + "loss": 0.23, + "step": 12932 + }, + { + "epoch": 0.23067456212321194, + "grad_norm": 0.335625559091568, + "learning_rate": 4.744353223869372e-05, + "loss": 0.1428, + "step": 12933 + }, + { + "epoch": 0.23069239824492563, + "grad_norm": 0.2633238136768341, + "learning_rate": 4.74428465176907e-05, + "loss": 0.1786, + "step": 12934 + }, + { + "epoch": 0.23071023436663932, + "grad_norm": 0.31745925545692444, + "learning_rate": 4.744216070969146e-05, + "loss": 0.2171, + "step": 12935 + }, + { + "epoch": 0.230728070488353, + "grad_norm": 0.20374995470046997, + "learning_rate": 4.744147481469866e-05, + "loss": 0.1751, + "step": 12936 + }, + { + "epoch": 0.23074590661006672, + "grad_norm": 0.2697664797306061, + "learning_rate": 4.744078883271494e-05, + "loss": 0.164, + "step": 12937 + }, + { + "epoch": 0.2307637427317804, + "grad_norm": 0.3518921732902527, + "learning_rate": 4.7440102763742983e-05, + "loss": 0.1947, + "step": 12938 + }, + { + "epoch": 0.2307815788534941, + "grad_norm": 0.3237800598144531, + "learning_rate": 4.743941660778544e-05, + "loss": 0.2331, + "step": 12939 + }, + { + "epoch": 0.23079941497520778, + "grad_norm": 0.2582206130027771, + "learning_rate": 4.7438730364844953e-05, + "loss": 0.2031, + "step": 12940 + }, + { + "epoch": 0.23081725109692147, + "grad_norm": 0.29988721013069153, + "learning_rate": 4.743804403492421e-05, + "loss": 0.1564, + "step": 12941 + }, + { + "epoch": 0.2308350872186352, + "grad_norm": 0.25897279381752014, + "learning_rate": 4.743735761802585e-05, + "loss": 0.182, + "step": 12942 + }, + { + "epoch": 0.23085292334034888, + "grad_norm": 0.23908762633800507, + "learning_rate": 4.743667111415255e-05, + "loss": 0.2118, + "step": 12943 + }, + { + "epoch": 0.23087075946206256, + "grad_norm": 0.274428129196167, + "learning_rate": 4.743598452330695e-05, + "loss": 0.2126, + "step": 12944 + }, + { + "epoch": 0.23088859558377625, + "grad_norm": 0.27773234248161316, + "learning_rate": 4.743529784549174e-05, + "loss": 0.1647, + "step": 12945 + }, + { + "epoch": 0.23090643170548997, + "grad_norm": 0.29228320717811584, + "learning_rate": 4.743461108070956e-05, + "loss": 0.1776, + "step": 12946 + }, + { + "epoch": 0.23092426782720366, + "grad_norm": 0.29485487937927246, + "learning_rate": 4.743392422896308e-05, + "loss": 0.1855, + "step": 12947 + }, + { + "epoch": 0.23094210394891734, + "grad_norm": 0.2691425681114197, + "learning_rate": 4.743323729025496e-05, + "loss": 0.203, + "step": 12948 + }, + { + "epoch": 0.23095994007063103, + "grad_norm": 0.24132020771503448, + "learning_rate": 4.743255026458786e-05, + "loss": 0.2317, + "step": 12949 + }, + { + "epoch": 0.23097777619234475, + "grad_norm": 0.24747806787490845, + "learning_rate": 4.7431863151964454e-05, + "loss": 0.1768, + "step": 12950 + }, + { + "epoch": 0.23099561231405843, + "grad_norm": 0.24529355764389038, + "learning_rate": 4.74311759523874e-05, + "loss": 0.1878, + "step": 12951 + }, + { + "epoch": 0.23101344843577212, + "grad_norm": 0.2427300214767456, + "learning_rate": 4.7430488665859356e-05, + "loss": 0.1797, + "step": 12952 + }, + { + "epoch": 0.2310312845574858, + "grad_norm": 0.24497056007385254, + "learning_rate": 4.7429801292382994e-05, + "loss": 0.1618, + "step": 12953 + }, + { + "epoch": 0.23104912067919953, + "grad_norm": 0.24052974581718445, + "learning_rate": 4.742911383196097e-05, + "loss": 0.2116, + "step": 12954 + }, + { + "epoch": 0.23106695680091321, + "grad_norm": 0.3020787537097931, + "learning_rate": 4.742842628459596e-05, + "loss": 0.22, + "step": 12955 + }, + { + "epoch": 0.2310847929226269, + "grad_norm": 0.228055939078331, + "learning_rate": 4.742773865029062e-05, + "loss": 0.2189, + "step": 12956 + }, + { + "epoch": 0.2311026290443406, + "grad_norm": 0.3163982331752777, + "learning_rate": 4.742705092904762e-05, + "loss": 0.1655, + "step": 12957 + }, + { + "epoch": 0.2311204651660543, + "grad_norm": 0.27987125515937805, + "learning_rate": 4.742636312086962e-05, + "loss": 0.1843, + "step": 12958 + }, + { + "epoch": 0.231138301287768, + "grad_norm": 0.28908318281173706, + "learning_rate": 4.742567522575929e-05, + "loss": 0.2074, + "step": 12959 + }, + { + "epoch": 0.23115613740948168, + "grad_norm": 0.3305481970310211, + "learning_rate": 4.742498724371931e-05, + "loss": 0.2145, + "step": 12960 + }, + { + "epoch": 0.23117397353119537, + "grad_norm": 0.22384819388389587, + "learning_rate": 4.7424299174752326e-05, + "loss": 0.1745, + "step": 12961 + }, + { + "epoch": 0.23119180965290906, + "grad_norm": 0.3761615455150604, + "learning_rate": 4.742361101886101e-05, + "loss": 0.2431, + "step": 12962 + }, + { + "epoch": 0.23120964577462277, + "grad_norm": 0.27289512753486633, + "learning_rate": 4.742292277604803e-05, + "loss": 0.1987, + "step": 12963 + }, + { + "epoch": 0.23122748189633646, + "grad_norm": 0.24646975100040436, + "learning_rate": 4.7422234446316074e-05, + "loss": 0.1656, + "step": 12964 + }, + { + "epoch": 0.23124531801805015, + "grad_norm": 0.4282931685447693, + "learning_rate": 4.7421546029667775e-05, + "loss": 0.1718, + "step": 12965 + }, + { + "epoch": 0.23126315413976384, + "grad_norm": 0.269220232963562, + "learning_rate": 4.7420857526105825e-05, + "loss": 0.1874, + "step": 12966 + }, + { + "epoch": 0.23128099026147755, + "grad_norm": 0.31382322311401367, + "learning_rate": 4.7420168935632895e-05, + "loss": 0.2035, + "step": 12967 + }, + { + "epoch": 0.23129882638319124, + "grad_norm": 0.29719898104667664, + "learning_rate": 4.741948025825164e-05, + "loss": 0.2168, + "step": 12968 + }, + { + "epoch": 0.23131666250490493, + "grad_norm": 0.22965769469738007, + "learning_rate": 4.741879149396473e-05, + "loss": 0.1878, + "step": 12969 + }, + { + "epoch": 0.23133449862661862, + "grad_norm": 0.32541218400001526, + "learning_rate": 4.7418102642774846e-05, + "loss": 0.238, + "step": 12970 + }, + { + "epoch": 0.23135233474833233, + "grad_norm": 0.37302398681640625, + "learning_rate": 4.7417413704684656e-05, + "loss": 0.1912, + "step": 12971 + }, + { + "epoch": 0.23137017087004602, + "grad_norm": 0.3561212122440338, + "learning_rate": 4.741672467969682e-05, + "loss": 0.2219, + "step": 12972 + }, + { + "epoch": 0.2313880069917597, + "grad_norm": 0.23414330184459686, + "learning_rate": 4.741603556781403e-05, + "loss": 0.2264, + "step": 12973 + }, + { + "epoch": 0.2314058431134734, + "grad_norm": 0.27014073729515076, + "learning_rate": 4.741534636903893e-05, + "loss": 0.2182, + "step": 12974 + }, + { + "epoch": 0.2314236792351871, + "grad_norm": 0.6651800274848938, + "learning_rate": 4.741465708337421e-05, + "loss": 0.2513, + "step": 12975 + }, + { + "epoch": 0.2314415153569008, + "grad_norm": 0.21229223906993866, + "learning_rate": 4.741396771082254e-05, + "loss": 0.1925, + "step": 12976 + }, + { + "epoch": 0.2314593514786145, + "grad_norm": 0.35066893696784973, + "learning_rate": 4.7413278251386594e-05, + "loss": 0.2169, + "step": 12977 + }, + { + "epoch": 0.23147718760032818, + "grad_norm": 0.26448994874954224, + "learning_rate": 4.7412588705069034e-05, + "loss": 0.1784, + "step": 12978 + }, + { + "epoch": 0.2314950237220419, + "grad_norm": 0.2352786809206009, + "learning_rate": 4.741189907187254e-05, + "loss": 0.2064, + "step": 12979 + }, + { + "epoch": 0.23151285984375558, + "grad_norm": 0.2799092233181, + "learning_rate": 4.741120935179978e-05, + "loss": 0.1786, + "step": 12980 + }, + { + "epoch": 0.23153069596546927, + "grad_norm": 0.21805337071418762, + "learning_rate": 4.7410519544853437e-05, + "loss": 0.1719, + "step": 12981 + }, + { + "epoch": 0.23154853208718296, + "grad_norm": 0.21304574608802795, + "learning_rate": 4.740982965103618e-05, + "loss": 0.1599, + "step": 12982 + }, + { + "epoch": 0.23156636820889664, + "grad_norm": 0.2549423277378082, + "learning_rate": 4.7409139670350683e-05, + "loss": 0.1628, + "step": 12983 + }, + { + "epoch": 0.23158420433061036, + "grad_norm": 0.24077706038951874, + "learning_rate": 4.740844960279962e-05, + "loss": 0.2012, + "step": 12984 + }, + { + "epoch": 0.23160204045232405, + "grad_norm": 0.25259336829185486, + "learning_rate": 4.740775944838567e-05, + "loss": 0.1884, + "step": 12985 + }, + { + "epoch": 0.23161987657403774, + "grad_norm": 0.32633882761001587, + "learning_rate": 4.74070692071115e-05, + "loss": 0.2058, + "step": 12986 + }, + { + "epoch": 0.23163771269575142, + "grad_norm": 0.25864505767822266, + "learning_rate": 4.740637887897979e-05, + "loss": 0.1927, + "step": 12987 + }, + { + "epoch": 0.23165554881746514, + "grad_norm": 0.3024556338787079, + "learning_rate": 4.7405688463993217e-05, + "loss": 0.2116, + "step": 12988 + }, + { + "epoch": 0.23167338493917883, + "grad_norm": 0.24341925978660583, + "learning_rate": 4.7404997962154465e-05, + "loss": 0.1732, + "step": 12989 + }, + { + "epoch": 0.23169122106089252, + "grad_norm": 0.32996392250061035, + "learning_rate": 4.740430737346619e-05, + "loss": 0.1795, + "step": 12990 + }, + { + "epoch": 0.2317090571826062, + "grad_norm": 0.25818613171577454, + "learning_rate": 4.740361669793109e-05, + "loss": 0.2149, + "step": 12991 + }, + { + "epoch": 0.23172689330431992, + "grad_norm": 0.29291197657585144, + "learning_rate": 4.7402925935551836e-05, + "loss": 0.1878, + "step": 12992 + }, + { + "epoch": 0.2317447294260336, + "grad_norm": 0.2562229335308075, + "learning_rate": 4.740223508633109e-05, + "loss": 0.1389, + "step": 12993 + }, + { + "epoch": 0.2317625655477473, + "grad_norm": 0.26252058148384094, + "learning_rate": 4.7401544150271557e-05, + "loss": 0.1662, + "step": 12994 + }, + { + "epoch": 0.23178040166946098, + "grad_norm": 0.3297211527824402, + "learning_rate": 4.74008531273759e-05, + "loss": 0.1865, + "step": 12995 + }, + { + "epoch": 0.2317982377911747, + "grad_norm": 0.27339687943458557, + "learning_rate": 4.7400162017646796e-05, + "loss": 0.2139, + "step": 12996 + }, + { + "epoch": 0.2318160739128884, + "grad_norm": 0.3054243326187134, + "learning_rate": 4.739947082108692e-05, + "loss": 0.2196, + "step": 12997 + }, + { + "epoch": 0.23183391003460208, + "grad_norm": 0.4263780415058136, + "learning_rate": 4.739877953769897e-05, + "loss": 0.1996, + "step": 12998 + }, + { + "epoch": 0.23185174615631576, + "grad_norm": 0.20971044898033142, + "learning_rate": 4.739808816748561e-05, + "loss": 0.1824, + "step": 12999 + }, + { + "epoch": 0.23186958227802948, + "grad_norm": 0.4023171067237854, + "learning_rate": 4.7397396710449525e-05, + "loss": 0.23, + "step": 13000 + }, + { + "epoch": 0.23186958227802948, + "eval_loss": 0.18585029244422913, + "eval_runtime": 106.8316, + "eval_samples_per_second": 9.585, + "eval_steps_per_second": 1.601, + "step": 13000 + }, + { + "epoch": 0.23188741839974317, + "grad_norm": 0.3185226321220398, + "learning_rate": 4.739670516659339e-05, + "loss": 0.2028, + "step": 13001 + }, + { + "epoch": 0.23190525452145686, + "grad_norm": 0.26754093170166016, + "learning_rate": 4.7396013535919894e-05, + "loss": 0.1318, + "step": 13002 + }, + { + "epoch": 0.23192309064317054, + "grad_norm": 0.3592711389064789, + "learning_rate": 4.7395321818431715e-05, + "loss": 0.2066, + "step": 13003 + }, + { + "epoch": 0.23194092676488423, + "grad_norm": 0.22774259746074677, + "learning_rate": 4.7394630014131536e-05, + "loss": 0.1387, + "step": 13004 + }, + { + "epoch": 0.23195876288659795, + "grad_norm": 0.2407887727022171, + "learning_rate": 4.739393812302203e-05, + "loss": 0.1774, + "step": 13005 + }, + { + "epoch": 0.23197659900831163, + "grad_norm": 0.24861784279346466, + "learning_rate": 4.739324614510589e-05, + "loss": 0.1211, + "step": 13006 + }, + { + "epoch": 0.23199443513002532, + "grad_norm": 0.31071850657463074, + "learning_rate": 4.739255408038579e-05, + "loss": 0.1912, + "step": 13007 + }, + { + "epoch": 0.232012271251739, + "grad_norm": 0.272596538066864, + "learning_rate": 4.7391861928864424e-05, + "loss": 0.1657, + "step": 13008 + }, + { + "epoch": 0.23203010737345273, + "grad_norm": 0.37417080998420715, + "learning_rate": 4.7391169690544454e-05, + "loss": 0.253, + "step": 13009 + }, + { + "epoch": 0.23204794349516641, + "grad_norm": 0.3753811717033386, + "learning_rate": 4.7390477365428584e-05, + "loss": 0.2186, + "step": 13010 + }, + { + "epoch": 0.2320657796168801, + "grad_norm": 0.4284873902797699, + "learning_rate": 4.738978495351949e-05, + "loss": 0.2663, + "step": 13011 + }, + { + "epoch": 0.2320836157385938, + "grad_norm": 0.33204787969589233, + "learning_rate": 4.738909245481986e-05, + "loss": 0.1966, + "step": 13012 + }, + { + "epoch": 0.2321014518603075, + "grad_norm": 0.28605467081069946, + "learning_rate": 4.738839986933237e-05, + "loss": 0.2157, + "step": 13013 + }, + { + "epoch": 0.2321192879820212, + "grad_norm": 0.3401779234409332, + "learning_rate": 4.7387707197059714e-05, + "loss": 0.17, + "step": 13014 + }, + { + "epoch": 0.23213712410373488, + "grad_norm": 0.24983623623847961, + "learning_rate": 4.738701443800456e-05, + "loss": 0.1968, + "step": 13015 + }, + { + "epoch": 0.23215496022544857, + "grad_norm": 0.2762739956378937, + "learning_rate": 4.738632159216962e-05, + "loss": 0.188, + "step": 13016 + }, + { + "epoch": 0.23217279634716229, + "grad_norm": 0.41999778151512146, + "learning_rate": 4.738562865955756e-05, + "loss": 0.1928, + "step": 13017 + }, + { + "epoch": 0.23219063246887597, + "grad_norm": 0.34503859281539917, + "learning_rate": 4.738493564017107e-05, + "loss": 0.2027, + "step": 13018 + }, + { + "epoch": 0.23220846859058966, + "grad_norm": 0.24494142830371857, + "learning_rate": 4.7384242534012844e-05, + "loss": 0.1644, + "step": 13019 + }, + { + "epoch": 0.23222630471230335, + "grad_norm": 0.27364447712898254, + "learning_rate": 4.738354934108556e-05, + "loss": 0.2003, + "step": 13020 + }, + { + "epoch": 0.23224414083401704, + "grad_norm": 0.3707612454891205, + "learning_rate": 4.73828560613919e-05, + "loss": 0.2234, + "step": 13021 + }, + { + "epoch": 0.23226197695573075, + "grad_norm": 0.27612432837486267, + "learning_rate": 4.738216269493457e-05, + "loss": 0.213, + "step": 13022 + }, + { + "epoch": 0.23227981307744444, + "grad_norm": 0.30849191546440125, + "learning_rate": 4.738146924171624e-05, + "loss": 0.1851, + "step": 13023 + }, + { + "epoch": 0.23229764919915813, + "grad_norm": 0.21203866600990295, + "learning_rate": 4.738077570173961e-05, + "loss": 0.1864, + "step": 13024 + }, + { + "epoch": 0.23231548532087182, + "grad_norm": 0.3886302709579468, + "learning_rate": 4.738008207500736e-05, + "loss": 0.2098, + "step": 13025 + }, + { + "epoch": 0.23233332144258553, + "grad_norm": 0.22119519114494324, + "learning_rate": 4.737938836152218e-05, + "loss": 0.1629, + "step": 13026 + }, + { + "epoch": 0.23235115756429922, + "grad_norm": 0.2599148750305176, + "learning_rate": 4.737869456128676e-05, + "loss": 0.1937, + "step": 13027 + }, + { + "epoch": 0.2323689936860129, + "grad_norm": 0.5678700804710388, + "learning_rate": 4.73780006743038e-05, + "loss": 0.2066, + "step": 13028 + }, + { + "epoch": 0.2323868298077266, + "grad_norm": 0.29183149337768555, + "learning_rate": 4.737730670057597e-05, + "loss": 0.2033, + "step": 13029 + }, + { + "epoch": 0.2324046659294403, + "grad_norm": 0.31361421942710876, + "learning_rate": 4.737661264010598e-05, + "loss": 0.2253, + "step": 13030 + }, + { + "epoch": 0.232422502051154, + "grad_norm": 0.42441412806510925, + "learning_rate": 4.7375918492896506e-05, + "loss": 0.1599, + "step": 13031 + }, + { + "epoch": 0.2324403381728677, + "grad_norm": 0.29404038190841675, + "learning_rate": 4.737522425895024e-05, + "loss": 0.2201, + "step": 13032 + }, + { + "epoch": 0.23245817429458138, + "grad_norm": 0.2951338291168213, + "learning_rate": 4.7374529938269886e-05, + "loss": 0.2216, + "step": 13033 + }, + { + "epoch": 0.2324760104162951, + "grad_norm": 0.34812551736831665, + "learning_rate": 4.737383553085811e-05, + "loss": 0.1922, + "step": 13034 + }, + { + "epoch": 0.23249384653800878, + "grad_norm": 0.26205068826675415, + "learning_rate": 4.737314103671763e-05, + "loss": 0.1671, + "step": 13035 + }, + { + "epoch": 0.23251168265972247, + "grad_norm": 0.2434355914592743, + "learning_rate": 4.737244645585113e-05, + "loss": 0.2038, + "step": 13036 + }, + { + "epoch": 0.23252951878143616, + "grad_norm": 0.2926849126815796, + "learning_rate": 4.73717517882613e-05, + "loss": 0.2421, + "step": 13037 + }, + { + "epoch": 0.23254735490314987, + "grad_norm": 0.2663050889968872, + "learning_rate": 4.737105703395083e-05, + "loss": 0.1713, + "step": 13038 + }, + { + "epoch": 0.23256519102486356, + "grad_norm": 0.25067776441574097, + "learning_rate": 4.737036219292241e-05, + "loss": 0.1919, + "step": 13039 + }, + { + "epoch": 0.23258302714657725, + "grad_norm": 0.2642282545566559, + "learning_rate": 4.736966726517875e-05, + "loss": 0.1491, + "step": 13040 + }, + { + "epoch": 0.23260086326829094, + "grad_norm": 0.2530624568462372, + "learning_rate": 4.7368972250722535e-05, + "loss": 0.1728, + "step": 13041 + }, + { + "epoch": 0.23261869939000462, + "grad_norm": 0.29371771216392517, + "learning_rate": 4.736827714955645e-05, + "loss": 0.2156, + "step": 13042 + }, + { + "epoch": 0.23263653551171834, + "grad_norm": 0.23769794404506683, + "learning_rate": 4.73675819616832e-05, + "loss": 0.1477, + "step": 13043 + }, + { + "epoch": 0.23265437163343203, + "grad_norm": 0.4349018633365631, + "learning_rate": 4.736688668710548e-05, + "loss": 0.2286, + "step": 13044 + }, + { + "epoch": 0.23267220775514572, + "grad_norm": 0.3502461314201355, + "learning_rate": 4.736619132582598e-05, + "loss": 0.194, + "step": 13045 + }, + { + "epoch": 0.2326900438768594, + "grad_norm": 0.34425291419029236, + "learning_rate": 4.7365495877847395e-05, + "loss": 0.1814, + "step": 13046 + }, + { + "epoch": 0.23270787999857312, + "grad_norm": 0.26752737164497375, + "learning_rate": 4.736480034317242e-05, + "loss": 0.1916, + "step": 13047 + }, + { + "epoch": 0.2327257161202868, + "grad_norm": 0.22934582829475403, + "learning_rate": 4.736410472180376e-05, + "loss": 0.18, + "step": 13048 + }, + { + "epoch": 0.2327435522420005, + "grad_norm": 0.2358386367559433, + "learning_rate": 4.7363409013744105e-05, + "loss": 0.1748, + "step": 13049 + }, + { + "epoch": 0.23276138836371418, + "grad_norm": 0.33683323860168457, + "learning_rate": 4.736271321899615e-05, + "loss": 0.2094, + "step": 13050 + }, + { + "epoch": 0.2327792244854279, + "grad_norm": 0.36347466707229614, + "learning_rate": 4.73620173375626e-05, + "loss": 0.2289, + "step": 13051 + }, + { + "epoch": 0.2327970606071416, + "grad_norm": 0.3697187602519989, + "learning_rate": 4.7361321369446147e-05, + "loss": 0.2265, + "step": 13052 + }, + { + "epoch": 0.23281489672885528, + "grad_norm": 0.45988330245018005, + "learning_rate": 4.7360625314649474e-05, + "loss": 0.219, + "step": 13053 + }, + { + "epoch": 0.23283273285056896, + "grad_norm": 0.25618958473205566, + "learning_rate": 4.735992917317531e-05, + "loss": 0.142, + "step": 13054 + }, + { + "epoch": 0.23285056897228268, + "grad_norm": 0.23562154173851013, + "learning_rate": 4.735923294502633e-05, + "loss": 0.1655, + "step": 13055 + }, + { + "epoch": 0.23286840509399637, + "grad_norm": 0.25488919019699097, + "learning_rate": 4.7358536630205255e-05, + "loss": 0.1917, + "step": 13056 + }, + { + "epoch": 0.23288624121571005, + "grad_norm": 0.2664492726325989, + "learning_rate": 4.735784022871476e-05, + "loss": 0.2198, + "step": 13057 + }, + { + "epoch": 0.23290407733742374, + "grad_norm": 0.4016435444355011, + "learning_rate": 4.735714374055755e-05, + "loss": 0.1594, + "step": 13058 + }, + { + "epoch": 0.23292191345913746, + "grad_norm": 0.19750067591667175, + "learning_rate": 4.735644716573633e-05, + "loss": 0.1749, + "step": 13059 + }, + { + "epoch": 0.23293974958085115, + "grad_norm": 0.24430808424949646, + "learning_rate": 4.73557505042538e-05, + "loss": 0.231, + "step": 13060 + }, + { + "epoch": 0.23295758570256483, + "grad_norm": 0.319717139005661, + "learning_rate": 4.735505375611266e-05, + "loss": 0.2144, + "step": 13061 + }, + { + "epoch": 0.23297542182427852, + "grad_norm": 0.30825549364089966, + "learning_rate": 4.735435692131561e-05, + "loss": 0.1976, + "step": 13062 + }, + { + "epoch": 0.2329932579459922, + "grad_norm": 0.31341513991355896, + "learning_rate": 4.735365999986535e-05, + "loss": 0.1904, + "step": 13063 + }, + { + "epoch": 0.23301109406770593, + "grad_norm": 0.32323968410491943, + "learning_rate": 4.735296299176459e-05, + "loss": 0.2164, + "step": 13064 + }, + { + "epoch": 0.23302893018941961, + "grad_norm": 0.2604954242706299, + "learning_rate": 4.735226589701602e-05, + "loss": 0.2051, + "step": 13065 + }, + { + "epoch": 0.2330467663111333, + "grad_norm": 0.23131580650806427, + "learning_rate": 4.7351568715622347e-05, + "loss": 0.2007, + "step": 13066 + }, + { + "epoch": 0.233064602432847, + "grad_norm": 0.23712332546710968, + "learning_rate": 4.735087144758628e-05, + "loss": 0.1675, + "step": 13067 + }, + { + "epoch": 0.2330824385545607, + "grad_norm": 0.3033466935157776, + "learning_rate": 4.7350174092910504e-05, + "loss": 0.2001, + "step": 13068 + }, + { + "epoch": 0.2331002746762744, + "grad_norm": 0.3434779644012451, + "learning_rate": 4.734947665159774e-05, + "loss": 0.2133, + "step": 13069 + }, + { + "epoch": 0.23311811079798808, + "grad_norm": 0.2736477255821228, + "learning_rate": 4.734877912365069e-05, + "loss": 0.198, + "step": 13070 + }, + { + "epoch": 0.23313594691970177, + "grad_norm": 0.3114743232727051, + "learning_rate": 4.734808150907204e-05, + "loss": 0.2444, + "step": 13071 + }, + { + "epoch": 0.23315378304141549, + "grad_norm": 0.24049529433250427, + "learning_rate": 4.734738380786452e-05, + "loss": 0.1613, + "step": 13072 + }, + { + "epoch": 0.23317161916312917, + "grad_norm": 0.20932382345199585, + "learning_rate": 4.734668602003082e-05, + "loss": 0.1679, + "step": 13073 + }, + { + "epoch": 0.23318945528484286, + "grad_norm": 0.22552795708179474, + "learning_rate": 4.734598814557364e-05, + "loss": 0.1074, + "step": 13074 + }, + { + "epoch": 0.23320729140655655, + "grad_norm": 0.3333311676979065, + "learning_rate": 4.73452901844957e-05, + "loss": 0.2534, + "step": 13075 + }, + { + "epoch": 0.23322512752827027, + "grad_norm": 0.18414105474948883, + "learning_rate": 4.7344592136799696e-05, + "loss": 0.1579, + "step": 13076 + }, + { + "epoch": 0.23324296364998395, + "grad_norm": 0.2633228302001953, + "learning_rate": 4.734389400248833e-05, + "loss": 0.1584, + "step": 13077 + }, + { + "epoch": 0.23326079977169764, + "grad_norm": 0.2603215277194977, + "learning_rate": 4.734319578156431e-05, + "loss": 0.203, + "step": 13078 + }, + { + "epoch": 0.23327863589341133, + "grad_norm": 0.23938825726509094, + "learning_rate": 4.7342497474030355e-05, + "loss": 0.202, + "step": 13079 + }, + { + "epoch": 0.23329647201512504, + "grad_norm": 0.3025229275226593, + "learning_rate": 4.734179907988916e-05, + "loss": 0.2084, + "step": 13080 + }, + { + "epoch": 0.23331430813683873, + "grad_norm": 0.2092043161392212, + "learning_rate": 4.7341100599143436e-05, + "loss": 0.1863, + "step": 13081 + }, + { + "epoch": 0.23333214425855242, + "grad_norm": 0.25694260001182556, + "learning_rate": 4.7340402031795886e-05, + "loss": 0.1711, + "step": 13082 + }, + { + "epoch": 0.2333499803802661, + "grad_norm": 0.23915836215019226, + "learning_rate": 4.733970337784922e-05, + "loss": 0.1939, + "step": 13083 + }, + { + "epoch": 0.2333678165019798, + "grad_norm": 0.4468158185482025, + "learning_rate": 4.733900463730616e-05, + "loss": 0.2067, + "step": 13084 + }, + { + "epoch": 0.2333856526236935, + "grad_norm": 0.7035412192344666, + "learning_rate": 4.733830581016939e-05, + "loss": 0.1528, + "step": 13085 + }, + { + "epoch": 0.2334034887454072, + "grad_norm": 0.2828312814235687, + "learning_rate": 4.733760689644164e-05, + "loss": 0.1913, + "step": 13086 + }, + { + "epoch": 0.2334213248671209, + "grad_norm": 0.3584742546081543, + "learning_rate": 4.7336907896125605e-05, + "loss": 0.1756, + "step": 13087 + }, + { + "epoch": 0.23343916098883458, + "grad_norm": 0.3478483557701111, + "learning_rate": 4.7336208809224e-05, + "loss": 0.1625, + "step": 13088 + }, + { + "epoch": 0.2334569971105483, + "grad_norm": 0.2671332359313965, + "learning_rate": 4.733550963573954e-05, + "loss": 0.1842, + "step": 13089 + }, + { + "epoch": 0.23347483323226198, + "grad_norm": 0.2902505695819855, + "learning_rate": 4.733481037567492e-05, + "loss": 0.2057, + "step": 13090 + }, + { + "epoch": 0.23349266935397567, + "grad_norm": 0.4179877042770386, + "learning_rate": 4.733411102903287e-05, + "loss": 0.1954, + "step": 13091 + }, + { + "epoch": 0.23351050547568936, + "grad_norm": 0.3280573785305023, + "learning_rate": 4.7333411595816094e-05, + "loss": 0.2002, + "step": 13092 + }, + { + "epoch": 0.23352834159740307, + "grad_norm": 0.33432939648628235, + "learning_rate": 4.733271207602729e-05, + "loss": 0.2547, + "step": 13093 + }, + { + "epoch": 0.23354617771911676, + "grad_norm": 0.3041699230670929, + "learning_rate": 4.733201246966919e-05, + "loss": 0.1653, + "step": 13094 + }, + { + "epoch": 0.23356401384083045, + "grad_norm": 0.3179178535938263, + "learning_rate": 4.73313127767445e-05, + "loss": 0.2096, + "step": 13095 + }, + { + "epoch": 0.23358184996254414, + "grad_norm": 0.27879855036735535, + "learning_rate": 4.733061299725591e-05, + "loss": 0.2239, + "step": 13096 + }, + { + "epoch": 0.23359968608425785, + "grad_norm": 0.6498161554336548, + "learning_rate": 4.7329913131206174e-05, + "loss": 0.2571, + "step": 13097 + }, + { + "epoch": 0.23361752220597154, + "grad_norm": 0.32810789346694946, + "learning_rate": 4.7329213178597964e-05, + "loss": 0.2375, + "step": 13098 + }, + { + "epoch": 0.23363535832768523, + "grad_norm": 0.28718990087509155, + "learning_rate": 4.732851313943402e-05, + "loss": 0.2159, + "step": 13099 + }, + { + "epoch": 0.23365319444939892, + "grad_norm": 0.3003918528556824, + "learning_rate": 4.732781301371705e-05, + "loss": 0.1594, + "step": 13100 + }, + { + "epoch": 0.2336710305711126, + "grad_norm": 0.22157765924930573, + "learning_rate": 4.7327112801449756e-05, + "loss": 0.1874, + "step": 13101 + }, + { + "epoch": 0.23368886669282632, + "grad_norm": 0.27124059200286865, + "learning_rate": 4.732641250263487e-05, + "loss": 0.1905, + "step": 13102 + }, + { + "epoch": 0.23370670281454, + "grad_norm": 0.3923420011997223, + "learning_rate": 4.732571211727509e-05, + "loss": 0.1689, + "step": 13103 + }, + { + "epoch": 0.2337245389362537, + "grad_norm": 0.22294719517230988, + "learning_rate": 4.732501164537314e-05, + "loss": 0.2173, + "step": 13104 + }, + { + "epoch": 0.23374237505796738, + "grad_norm": 0.25595393776893616, + "learning_rate": 4.732431108693174e-05, + "loss": 0.1879, + "step": 13105 + }, + { + "epoch": 0.2337602111796811, + "grad_norm": 0.3274551033973694, + "learning_rate": 4.73236104419536e-05, + "loss": 0.2778, + "step": 13106 + }, + { + "epoch": 0.2337780473013948, + "grad_norm": 0.3030029535293579, + "learning_rate": 4.732290971044143e-05, + "loss": 0.1698, + "step": 13107 + }, + { + "epoch": 0.23379588342310847, + "grad_norm": 0.21548080444335938, + "learning_rate": 4.732220889239795e-05, + "loss": 0.1683, + "step": 13108 + }, + { + "epoch": 0.23381371954482216, + "grad_norm": 0.2954399883747101, + "learning_rate": 4.7321507987825886e-05, + "loss": 0.1847, + "step": 13109 + }, + { + "epoch": 0.23383155566653588, + "grad_norm": 0.25482800602912903, + "learning_rate": 4.732080699672794e-05, + "loss": 0.2194, + "step": 13110 + }, + { + "epoch": 0.23384939178824957, + "grad_norm": 0.23453563451766968, + "learning_rate": 4.732010591910685e-05, + "loss": 0.1323, + "step": 13111 + }, + { + "epoch": 0.23386722790996325, + "grad_norm": 0.28474459052085876, + "learning_rate": 4.73194047549653e-05, + "loss": 0.2208, + "step": 13112 + }, + { + "epoch": 0.23388506403167694, + "grad_norm": 0.39361241459846497, + "learning_rate": 4.731870350430604e-05, + "loss": 0.1788, + "step": 13113 + }, + { + "epoch": 0.23390290015339066, + "grad_norm": 0.25086504220962524, + "learning_rate": 4.731800216713178e-05, + "loss": 0.1546, + "step": 13114 + }, + { + "epoch": 0.23392073627510435, + "grad_norm": 0.20546072721481323, + "learning_rate": 4.7317300743445224e-05, + "loss": 0.1503, + "step": 13115 + }, + { + "epoch": 0.23393857239681803, + "grad_norm": 0.20666436851024628, + "learning_rate": 4.7316599233249114e-05, + "loss": 0.1756, + "step": 13116 + }, + { + "epoch": 0.23395640851853172, + "grad_norm": 0.27438873052597046, + "learning_rate": 4.731589763654615e-05, + "loss": 0.2358, + "step": 13117 + }, + { + "epoch": 0.23397424464024544, + "grad_norm": 0.44955042004585266, + "learning_rate": 4.731519595333906e-05, + "loss": 0.1772, + "step": 13118 + }, + { + "epoch": 0.23399208076195913, + "grad_norm": 0.25977659225463867, + "learning_rate": 4.731449418363057e-05, + "loss": 0.2078, + "step": 13119 + }, + { + "epoch": 0.2340099168836728, + "grad_norm": 0.2971154749393463, + "learning_rate": 4.731379232742339e-05, + "loss": 0.1657, + "step": 13120 + }, + { + "epoch": 0.2340277530053865, + "grad_norm": 0.2584383189678192, + "learning_rate": 4.7313090384720236e-05, + "loss": 0.1963, + "step": 13121 + }, + { + "epoch": 0.2340455891271002, + "grad_norm": 0.25320371985435486, + "learning_rate": 4.731238835552384e-05, + "loss": 0.1343, + "step": 13122 + }, + { + "epoch": 0.2340634252488139, + "grad_norm": 0.41943204402923584, + "learning_rate": 4.731168623983693e-05, + "loss": 0.2122, + "step": 13123 + }, + { + "epoch": 0.2340812613705276, + "grad_norm": 0.24102890491485596, + "learning_rate": 4.7310984037662206e-05, + "loss": 0.2281, + "step": 13124 + }, + { + "epoch": 0.23409909749224128, + "grad_norm": 0.34076613187789917, + "learning_rate": 4.731028174900242e-05, + "loss": 0.216, + "step": 13125 + }, + { + "epoch": 0.23411693361395497, + "grad_norm": 0.2129395753145218, + "learning_rate": 4.730957937386026e-05, + "loss": 0.1598, + "step": 13126 + }, + { + "epoch": 0.23413476973566869, + "grad_norm": 0.226332888007164, + "learning_rate": 4.730887691223846e-05, + "loss": 0.199, + "step": 13127 + }, + { + "epoch": 0.23415260585738237, + "grad_norm": 0.26704511046409607, + "learning_rate": 4.730817436413976e-05, + "loss": 0.2175, + "step": 13128 + }, + { + "epoch": 0.23417044197909606, + "grad_norm": 0.26819998025894165, + "learning_rate": 4.730747172956687e-05, + "loss": 0.1581, + "step": 13129 + }, + { + "epoch": 0.23418827810080975, + "grad_norm": 0.42599111795425415, + "learning_rate": 4.7306769008522514e-05, + "loss": 0.1799, + "step": 13130 + }, + { + "epoch": 0.23420611422252346, + "grad_norm": 0.3520090878009796, + "learning_rate": 4.7306066201009414e-05, + "loss": 0.2521, + "step": 13131 + }, + { + "epoch": 0.23422395034423715, + "grad_norm": 0.22327473759651184, + "learning_rate": 4.7305363307030295e-05, + "loss": 0.1989, + "step": 13132 + }, + { + "epoch": 0.23424178646595084, + "grad_norm": 0.27530860900878906, + "learning_rate": 4.730466032658788e-05, + "loss": 0.1714, + "step": 13133 + }, + { + "epoch": 0.23425962258766453, + "grad_norm": 0.22513854503631592, + "learning_rate": 4.730395725968491e-05, + "loss": 0.1487, + "step": 13134 + }, + { + "epoch": 0.23427745870937824, + "grad_norm": 0.2687302827835083, + "learning_rate": 4.730325410632409e-05, + "loss": 0.1795, + "step": 13135 + }, + { + "epoch": 0.23429529483109193, + "grad_norm": 0.5469068884849548, + "learning_rate": 4.730255086650816e-05, + "loss": 0.1836, + "step": 13136 + }, + { + "epoch": 0.23431313095280562, + "grad_norm": 0.23095735907554626, + "learning_rate": 4.730184754023984e-05, + "loss": 0.1678, + "step": 13137 + }, + { + "epoch": 0.2343309670745193, + "grad_norm": 0.31276312470436096, + "learning_rate": 4.730114412752185e-05, + "loss": 0.2264, + "step": 13138 + }, + { + "epoch": 0.23434880319623302, + "grad_norm": 0.32502320408821106, + "learning_rate": 4.7300440628356926e-05, + "loss": 0.2478, + "step": 13139 + }, + { + "epoch": 0.2343666393179467, + "grad_norm": 0.34323054552078247, + "learning_rate": 4.729973704274779e-05, + "loss": 0.1917, + "step": 13140 + }, + { + "epoch": 0.2343844754396604, + "grad_norm": 0.30752578377723694, + "learning_rate": 4.729903337069717e-05, + "loss": 0.1974, + "step": 13141 + }, + { + "epoch": 0.2344023115613741, + "grad_norm": 0.24402940273284912, + "learning_rate": 4.72983296122078e-05, + "loss": 0.1883, + "step": 13142 + }, + { + "epoch": 0.23442014768308778, + "grad_norm": 0.2565370500087738, + "learning_rate": 4.72976257672824e-05, + "loss": 0.1759, + "step": 13143 + }, + { + "epoch": 0.2344379838048015, + "grad_norm": 0.2792462110519409, + "learning_rate": 4.72969218359237e-05, + "loss": 0.1586, + "step": 13144 + }, + { + "epoch": 0.23445581992651518, + "grad_norm": 0.23141470551490784, + "learning_rate": 4.729621781813443e-05, + "loss": 0.1625, + "step": 13145 + }, + { + "epoch": 0.23447365604822887, + "grad_norm": 0.3027353882789612, + "learning_rate": 4.729551371391732e-05, + "loss": 0.1638, + "step": 13146 + }, + { + "epoch": 0.23449149216994256, + "grad_norm": 0.3027116358280182, + "learning_rate": 4.72948095232751e-05, + "loss": 0.2218, + "step": 13147 + }, + { + "epoch": 0.23450932829165627, + "grad_norm": 0.331464946269989, + "learning_rate": 4.7294105246210494e-05, + "loss": 0.2074, + "step": 13148 + }, + { + "epoch": 0.23452716441336996, + "grad_norm": 0.28187480568885803, + "learning_rate": 4.7293400882726235e-05, + "loss": 0.2212, + "step": 13149 + }, + { + "epoch": 0.23454500053508365, + "grad_norm": 0.20477347075939178, + "learning_rate": 4.729269643282506e-05, + "loss": 0.1635, + "step": 13150 + }, + { + "epoch": 0.23456283665679734, + "grad_norm": 0.21685895323753357, + "learning_rate": 4.729199189650969e-05, + "loss": 0.1618, + "step": 13151 + }, + { + "epoch": 0.23458067277851105, + "grad_norm": 0.30709367990493774, + "learning_rate": 4.7291287273782865e-05, + "loss": 0.19, + "step": 13152 + }, + { + "epoch": 0.23459850890022474, + "grad_norm": 0.3493216037750244, + "learning_rate": 4.729058256464731e-05, + "loss": 0.1984, + "step": 13153 + }, + { + "epoch": 0.23461634502193843, + "grad_norm": 0.27642378211021423, + "learning_rate": 4.728987776910575e-05, + "loss": 0.1888, + "step": 13154 + }, + { + "epoch": 0.23463418114365212, + "grad_norm": 0.3807377219200134, + "learning_rate": 4.7289172887160934e-05, + "loss": 0.1897, + "step": 13155 + }, + { + "epoch": 0.23465201726536583, + "grad_norm": 0.327101469039917, + "learning_rate": 4.7288467918815584e-05, + "loss": 0.2586, + "step": 13156 + }, + { + "epoch": 0.23466985338707952, + "grad_norm": 0.3536888062953949, + "learning_rate": 4.7287762864072425e-05, + "loss": 0.1901, + "step": 13157 + }, + { + "epoch": 0.2346876895087932, + "grad_norm": 0.32119861245155334, + "learning_rate": 4.728705772293421e-05, + "loss": 0.2284, + "step": 13158 + }, + { + "epoch": 0.2347055256305069, + "grad_norm": 0.24259814620018005, + "learning_rate": 4.7286352495403656e-05, + "loss": 0.2237, + "step": 13159 + }, + { + "epoch": 0.2347233617522206, + "grad_norm": 0.24325969815254211, + "learning_rate": 4.7285647181483506e-05, + "loss": 0.1606, + "step": 13160 + }, + { + "epoch": 0.2347411978739343, + "grad_norm": 0.2513333559036255, + "learning_rate": 4.7284941781176485e-05, + "loss": 0.2097, + "step": 13161 + }, + { + "epoch": 0.234759033995648, + "grad_norm": 0.5247678756713867, + "learning_rate": 4.728423629448534e-05, + "loss": 0.229, + "step": 13162 + }, + { + "epoch": 0.23477687011736167, + "grad_norm": 0.975025475025177, + "learning_rate": 4.7283530721412795e-05, + "loss": 0.1968, + "step": 13163 + }, + { + "epoch": 0.23479470623907536, + "grad_norm": 0.22964558005332947, + "learning_rate": 4.728282506196159e-05, + "loss": 0.204, + "step": 13164 + }, + { + "epoch": 0.23481254236078908, + "grad_norm": 0.29146355390548706, + "learning_rate": 4.728211931613445e-05, + "loss": 0.1612, + "step": 13165 + }, + { + "epoch": 0.23483037848250277, + "grad_norm": 0.3188380300998688, + "learning_rate": 4.7281413483934134e-05, + "loss": 0.1968, + "step": 13166 + }, + { + "epoch": 0.23484821460421645, + "grad_norm": 0.38089612126350403, + "learning_rate": 4.728070756536335e-05, + "loss": 0.173, + "step": 13167 + }, + { + "epoch": 0.23486605072593014, + "grad_norm": 0.3769915699958801, + "learning_rate": 4.728000156042486e-05, + "loss": 0.2067, + "step": 13168 + }, + { + "epoch": 0.23488388684764386, + "grad_norm": 0.2790173292160034, + "learning_rate": 4.727929546912138e-05, + "loss": 0.2152, + "step": 13169 + }, + { + "epoch": 0.23490172296935755, + "grad_norm": 0.2992670238018036, + "learning_rate": 4.7278589291455656e-05, + "loss": 0.1843, + "step": 13170 + }, + { + "epoch": 0.23491955909107123, + "grad_norm": 0.2138238102197647, + "learning_rate": 4.727788302743043e-05, + "loss": 0.1673, + "step": 13171 + }, + { + "epoch": 0.23493739521278492, + "grad_norm": 0.3130911588668823, + "learning_rate": 4.727717667704843e-05, + "loss": 0.2143, + "step": 13172 + }, + { + "epoch": 0.23495523133449864, + "grad_norm": 0.2582393288612366, + "learning_rate": 4.72764702403124e-05, + "loss": 0.1538, + "step": 13173 + }, + { + "epoch": 0.23497306745621233, + "grad_norm": 0.27510881423950195, + "learning_rate": 4.727576371722508e-05, + "loss": 0.1735, + "step": 13174 + }, + { + "epoch": 0.234990903577926, + "grad_norm": 0.40130189061164856, + "learning_rate": 4.7275057107789205e-05, + "loss": 0.3251, + "step": 13175 + }, + { + "epoch": 0.2350087396996397, + "grad_norm": 0.4659244120121002, + "learning_rate": 4.727435041200752e-05, + "loss": 0.2313, + "step": 13176 + }, + { + "epoch": 0.23502657582135342, + "grad_norm": 0.31697192788124084, + "learning_rate": 4.727364362988275e-05, + "loss": 0.2322, + "step": 13177 + }, + { + "epoch": 0.2350444119430671, + "grad_norm": 0.24357974529266357, + "learning_rate": 4.7272936761417647e-05, + "loss": 0.1556, + "step": 13178 + }, + { + "epoch": 0.2350622480647808, + "grad_norm": 0.31785544753074646, + "learning_rate": 4.727222980661495e-05, + "loss": 0.2079, + "step": 13179 + }, + { + "epoch": 0.23508008418649448, + "grad_norm": 0.3436828851699829, + "learning_rate": 4.727152276547739e-05, + "loss": 0.2003, + "step": 13180 + }, + { + "epoch": 0.2350979203082082, + "grad_norm": 0.24924665689468384, + "learning_rate": 4.727081563800773e-05, + "loss": 0.2049, + "step": 13181 + }, + { + "epoch": 0.23511575642992188, + "grad_norm": 0.27374500036239624, + "learning_rate": 4.727010842420869e-05, + "loss": 0.144, + "step": 13182 + }, + { + "epoch": 0.23513359255163557, + "grad_norm": 0.2758798897266388, + "learning_rate": 4.726940112408301e-05, + "loss": 0.187, + "step": 13183 + }, + { + "epoch": 0.23515142867334926, + "grad_norm": 0.34047555923461914, + "learning_rate": 4.726869373763345e-05, + "loss": 0.189, + "step": 13184 + }, + { + "epoch": 0.23516926479506295, + "grad_norm": 0.2357666790485382, + "learning_rate": 4.726798626486274e-05, + "loss": 0.1756, + "step": 13185 + }, + { + "epoch": 0.23518710091677666, + "grad_norm": 0.3062521517276764, + "learning_rate": 4.726727870577362e-05, + "loss": 0.2419, + "step": 13186 + }, + { + "epoch": 0.23520493703849035, + "grad_norm": 0.3334413766860962, + "learning_rate": 4.7266571060368844e-05, + "loss": 0.2351, + "step": 13187 + }, + { + "epoch": 0.23522277316020404, + "grad_norm": 0.20553076267242432, + "learning_rate": 4.7265863328651145e-05, + "loss": 0.1504, + "step": 13188 + }, + { + "epoch": 0.23524060928191773, + "grad_norm": 0.2524462640285492, + "learning_rate": 4.726515551062327e-05, + "loss": 0.1784, + "step": 13189 + }, + { + "epoch": 0.23525844540363144, + "grad_norm": 0.2631435692310333, + "learning_rate": 4.726444760628795e-05, + "loss": 0.1875, + "step": 13190 + }, + { + "epoch": 0.23527628152534513, + "grad_norm": 0.27673405408859253, + "learning_rate": 4.726373961564796e-05, + "loss": 0.184, + "step": 13191 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.24376653134822845, + "learning_rate": 4.726303153870602e-05, + "loss": 0.1721, + "step": 13192 + }, + { + "epoch": 0.2353119537687725, + "grad_norm": 0.340243935585022, + "learning_rate": 4.726232337546487e-05, + "loss": 0.198, + "step": 13193 + }, + { + "epoch": 0.23532978989048622, + "grad_norm": 0.3555149435997009, + "learning_rate": 4.726161512592727e-05, + "loss": 0.2535, + "step": 13194 + }, + { + "epoch": 0.2353476260121999, + "grad_norm": 0.22092889249324799, + "learning_rate": 4.726090679009597e-05, + "loss": 0.1779, + "step": 13195 + }, + { + "epoch": 0.2353654621339136, + "grad_norm": 0.4123993217945099, + "learning_rate": 4.72601983679737e-05, + "loss": 0.2064, + "step": 13196 + }, + { + "epoch": 0.2353832982556273, + "grad_norm": 0.25728389620780945, + "learning_rate": 4.725948985956321e-05, + "loss": 0.1713, + "step": 13197 + }, + { + "epoch": 0.235401134377341, + "grad_norm": 0.2735288739204407, + "learning_rate": 4.7258781264867254e-05, + "loss": 0.2049, + "step": 13198 + }, + { + "epoch": 0.2354189704990547, + "grad_norm": 0.22504852712154388, + "learning_rate": 4.7258072583888566e-05, + "loss": 0.1938, + "step": 13199 + }, + { + "epoch": 0.23543680662076838, + "grad_norm": 0.25486525893211365, + "learning_rate": 4.7257363816629904e-05, + "loss": 0.1789, + "step": 13200 + }, + { + "epoch": 0.23545464274248207, + "grad_norm": 0.25982269644737244, + "learning_rate": 4.7256654963094024e-05, + "loss": 0.1896, + "step": 13201 + }, + { + "epoch": 0.23547247886419576, + "grad_norm": 0.2815377116203308, + "learning_rate": 4.725594602328365e-05, + "loss": 0.1669, + "step": 13202 + }, + { + "epoch": 0.23549031498590947, + "grad_norm": 0.31810420751571655, + "learning_rate": 4.725523699720155e-05, + "loss": 0.2062, + "step": 13203 + }, + { + "epoch": 0.23550815110762316, + "grad_norm": 0.24188999831676483, + "learning_rate": 4.725452788485046e-05, + "loss": 0.1543, + "step": 13204 + }, + { + "epoch": 0.23552598722933685, + "grad_norm": 0.321250319480896, + "learning_rate": 4.725381868623313e-05, + "loss": 0.1962, + "step": 13205 + }, + { + "epoch": 0.23554382335105054, + "grad_norm": 0.31936588883399963, + "learning_rate": 4.725310940135231e-05, + "loss": 0.2321, + "step": 13206 + }, + { + "epoch": 0.23556165947276425, + "grad_norm": 0.2989104688167572, + "learning_rate": 4.725240003021077e-05, + "loss": 0.1666, + "step": 13207 + }, + { + "epoch": 0.23557949559447794, + "grad_norm": 0.27593496441841125, + "learning_rate": 4.725169057281123e-05, + "loss": 0.1981, + "step": 13208 + }, + { + "epoch": 0.23559733171619163, + "grad_norm": 0.22347703576087952, + "learning_rate": 4.7250981029156446e-05, + "loss": 0.243, + "step": 13209 + }, + { + "epoch": 0.23561516783790531, + "grad_norm": 0.2823682129383087, + "learning_rate": 4.725027139924918e-05, + "loss": 0.2232, + "step": 13210 + }, + { + "epoch": 0.23563300395961903, + "grad_norm": 0.3040268123149872, + "learning_rate": 4.724956168309218e-05, + "loss": 0.217, + "step": 13211 + }, + { + "epoch": 0.23565084008133272, + "grad_norm": 0.2807352840900421, + "learning_rate": 4.724885188068819e-05, + "loss": 0.2244, + "step": 13212 + }, + { + "epoch": 0.2356686762030464, + "grad_norm": 0.28299254179000854, + "learning_rate": 4.7248141992039965e-05, + "loss": 0.2017, + "step": 13213 + }, + { + "epoch": 0.2356865123247601, + "grad_norm": 0.2473675012588501, + "learning_rate": 4.724743201715026e-05, + "loss": 0.1866, + "step": 13214 + }, + { + "epoch": 0.2357043484464738, + "grad_norm": 0.34206148982048035, + "learning_rate": 4.724672195602182e-05, + "loss": 0.2068, + "step": 13215 + }, + { + "epoch": 0.2357221845681875, + "grad_norm": 0.2382190227508545, + "learning_rate": 4.7246011808657406e-05, + "loss": 0.1943, + "step": 13216 + }, + { + "epoch": 0.2357400206899012, + "grad_norm": 0.3492027819156647, + "learning_rate": 4.724530157505978e-05, + "loss": 0.2146, + "step": 13217 + }, + { + "epoch": 0.23575785681161487, + "grad_norm": 0.23248231410980225, + "learning_rate": 4.724459125523166e-05, + "loss": 0.1774, + "step": 13218 + }, + { + "epoch": 0.2357756929333286, + "grad_norm": 0.3013003468513489, + "learning_rate": 4.724388084917583e-05, + "loss": 0.1592, + "step": 13219 + }, + { + "epoch": 0.23579352905504228, + "grad_norm": 0.36341676115989685, + "learning_rate": 4.7243170356895035e-05, + "loss": 0.1935, + "step": 13220 + }, + { + "epoch": 0.23581136517675597, + "grad_norm": 0.218472421169281, + "learning_rate": 4.724245977839202e-05, + "loss": 0.1904, + "step": 13221 + }, + { + "epoch": 0.23582920129846965, + "grad_norm": 0.4290112555027008, + "learning_rate": 4.7241749113669564e-05, + "loss": 0.1722, + "step": 13222 + }, + { + "epoch": 0.23584703742018334, + "grad_norm": 0.23001818358898163, + "learning_rate": 4.72410383627304e-05, + "loss": 0.1774, + "step": 13223 + }, + { + "epoch": 0.23586487354189706, + "grad_norm": 0.31045591831207275, + "learning_rate": 4.7240327525577286e-05, + "loss": 0.1402, + "step": 13224 + }, + { + "epoch": 0.23588270966361075, + "grad_norm": 0.23984810709953308, + "learning_rate": 4.7239616602212986e-05, + "loss": 0.1876, + "step": 13225 + }, + { + "epoch": 0.23590054578532443, + "grad_norm": 0.2721925973892212, + "learning_rate": 4.723890559264025e-05, + "loss": 0.2085, + "step": 13226 + }, + { + "epoch": 0.23591838190703812, + "grad_norm": 0.25344839692115784, + "learning_rate": 4.723819449686183e-05, + "loss": 0.1613, + "step": 13227 + }, + { + "epoch": 0.23593621802875184, + "grad_norm": 0.42632994055747986, + "learning_rate": 4.723748331488049e-05, + "loss": 0.2236, + "step": 13228 + }, + { + "epoch": 0.23595405415046553, + "grad_norm": 0.24490170180797577, + "learning_rate": 4.723677204669899e-05, + "loss": 0.1702, + "step": 13229 + }, + { + "epoch": 0.2359718902721792, + "grad_norm": 0.36458730697631836, + "learning_rate": 4.723606069232007e-05, + "loss": 0.1747, + "step": 13230 + }, + { + "epoch": 0.2359897263938929, + "grad_norm": 0.25357580184936523, + "learning_rate": 4.7235349251746505e-05, + "loss": 0.2037, + "step": 13231 + }, + { + "epoch": 0.23600756251560662, + "grad_norm": 0.23994719982147217, + "learning_rate": 4.7234637724981054e-05, + "loss": 0.1721, + "step": 13232 + }, + { + "epoch": 0.2360253986373203, + "grad_norm": 0.26088428497314453, + "learning_rate": 4.723392611202646e-05, + "loss": 0.1838, + "step": 13233 + }, + { + "epoch": 0.236043234759034, + "grad_norm": 0.3496204912662506, + "learning_rate": 4.7233214412885484e-05, + "loss": 0.2484, + "step": 13234 + }, + { + "epoch": 0.23606107088074768, + "grad_norm": 0.33100613951683044, + "learning_rate": 4.72325026275609e-05, + "loss": 0.1766, + "step": 13235 + }, + { + "epoch": 0.2360789070024614, + "grad_norm": 0.20524318516254425, + "learning_rate": 4.723179075605545e-05, + "loss": 0.1552, + "step": 13236 + }, + { + "epoch": 0.23609674312417508, + "grad_norm": 0.24219731986522675, + "learning_rate": 4.7231078798371896e-05, + "loss": 0.1978, + "step": 13237 + }, + { + "epoch": 0.23611457924588877, + "grad_norm": 0.4554736614227295, + "learning_rate": 4.723036675451301e-05, + "loss": 0.2012, + "step": 13238 + }, + { + "epoch": 0.23613241536760246, + "grad_norm": 0.30054935812950134, + "learning_rate": 4.7229654624481546e-05, + "loss": 0.1947, + "step": 13239 + }, + { + "epoch": 0.23615025148931618, + "grad_norm": 0.3623890280723572, + "learning_rate": 4.722894240828026e-05, + "loss": 0.1769, + "step": 13240 + }, + { + "epoch": 0.23616808761102986, + "grad_norm": 0.39175453782081604, + "learning_rate": 4.722823010591192e-05, + "loss": 0.1848, + "step": 13241 + }, + { + "epoch": 0.23618592373274355, + "grad_norm": 0.27913540601730347, + "learning_rate": 4.7227517717379275e-05, + "loss": 0.1751, + "step": 13242 + }, + { + "epoch": 0.23620375985445724, + "grad_norm": 0.2484302967786789, + "learning_rate": 4.72268052426851e-05, + "loss": 0.1674, + "step": 13243 + }, + { + "epoch": 0.23622159597617093, + "grad_norm": 0.2830994129180908, + "learning_rate": 4.7226092681832144e-05, + "loss": 0.2658, + "step": 13244 + }, + { + "epoch": 0.23623943209788464, + "grad_norm": 0.310022234916687, + "learning_rate": 4.722538003482318e-05, + "loss": 0.1573, + "step": 13245 + }, + { + "epoch": 0.23625726821959833, + "grad_norm": 0.28544631600379944, + "learning_rate": 4.7224667301660964e-05, + "loss": 0.1661, + "step": 13246 + }, + { + "epoch": 0.23627510434131202, + "grad_norm": 0.3461221158504486, + "learning_rate": 4.7223954482348266e-05, + "loss": 0.182, + "step": 13247 + }, + { + "epoch": 0.2362929404630257, + "grad_norm": 0.3091566860675812, + "learning_rate": 4.7223241576887846e-05, + "loss": 0.2046, + "step": 13248 + }, + { + "epoch": 0.23631077658473942, + "grad_norm": 0.283374160528183, + "learning_rate": 4.722252858528246e-05, + "loss": 0.1829, + "step": 13249 + }, + { + "epoch": 0.2363286127064531, + "grad_norm": 0.2626781165599823, + "learning_rate": 4.722181550753488e-05, + "loss": 0.2059, + "step": 13250 + }, + { + "epoch": 0.2363464488281668, + "grad_norm": 0.2459656298160553, + "learning_rate": 4.722110234364787e-05, + "loss": 0.193, + "step": 13251 + }, + { + "epoch": 0.2363642849498805, + "grad_norm": 0.3419763445854187, + "learning_rate": 4.722038909362419e-05, + "loss": 0.1987, + "step": 13252 + }, + { + "epoch": 0.2363821210715942, + "grad_norm": 0.2317509651184082, + "learning_rate": 4.721967575746661e-05, + "loss": 0.1803, + "step": 13253 + }, + { + "epoch": 0.2363999571933079, + "grad_norm": 0.2360445111989975, + "learning_rate": 4.721896233517788e-05, + "loss": 0.1838, + "step": 13254 + }, + { + "epoch": 0.23641779331502158, + "grad_norm": 0.22321978211402893, + "learning_rate": 4.72182488267608e-05, + "loss": 0.1199, + "step": 13255 + }, + { + "epoch": 0.23643562943673527, + "grad_norm": 0.2774013578891754, + "learning_rate": 4.72175352322181e-05, + "loss": 0.1518, + "step": 13256 + }, + { + "epoch": 0.23645346555844898, + "grad_norm": 0.25381648540496826, + "learning_rate": 4.721682155155256e-05, + "loss": 0.1644, + "step": 13257 + }, + { + "epoch": 0.23647130168016267, + "grad_norm": 0.3024599850177765, + "learning_rate": 4.721610778476695e-05, + "loss": 0.1874, + "step": 13258 + }, + { + "epoch": 0.23648913780187636, + "grad_norm": 0.3168708384037018, + "learning_rate": 4.7215393931864025e-05, + "loss": 0.1743, + "step": 13259 + }, + { + "epoch": 0.23650697392359005, + "grad_norm": 0.21824829280376434, + "learning_rate": 4.721467999284657e-05, + "loss": 0.1178, + "step": 13260 + }, + { + "epoch": 0.23652481004530376, + "grad_norm": 0.3472624719142914, + "learning_rate": 4.721396596771734e-05, + "loss": 0.2007, + "step": 13261 + }, + { + "epoch": 0.23654264616701745, + "grad_norm": 0.2955172657966614, + "learning_rate": 4.72132518564791e-05, + "loss": 0.1753, + "step": 13262 + }, + { + "epoch": 0.23656048228873114, + "grad_norm": 0.2570713758468628, + "learning_rate": 4.721253765913462e-05, + "loss": 0.1423, + "step": 13263 + }, + { + "epoch": 0.23657831841044483, + "grad_norm": 0.24804842472076416, + "learning_rate": 4.7211823375686695e-05, + "loss": 0.1539, + "step": 13264 + }, + { + "epoch": 0.23659615453215851, + "grad_norm": 0.2627914547920227, + "learning_rate": 4.721110900613805e-05, + "loss": 0.1908, + "step": 13265 + }, + { + "epoch": 0.23661399065387223, + "grad_norm": 0.25864657759666443, + "learning_rate": 4.721039455049148e-05, + "loss": 0.2274, + "step": 13266 + }, + { + "epoch": 0.23663182677558592, + "grad_norm": 0.24527081847190857, + "learning_rate": 4.7209680008749744e-05, + "loss": 0.174, + "step": 13267 + }, + { + "epoch": 0.2366496628972996, + "grad_norm": 0.2619577646255493, + "learning_rate": 4.720896538091563e-05, + "loss": 0.1532, + "step": 13268 + }, + { + "epoch": 0.2366674990190133, + "grad_norm": 0.2521253526210785, + "learning_rate": 4.720825066699189e-05, + "loss": 0.219, + "step": 13269 + }, + { + "epoch": 0.236685335140727, + "grad_norm": 0.2410418689250946, + "learning_rate": 4.72075358669813e-05, + "loss": 0.167, + "step": 13270 + }, + { + "epoch": 0.2367031712624407, + "grad_norm": 0.39443933963775635, + "learning_rate": 4.720682098088662e-05, + "loss": 0.2411, + "step": 13271 + }, + { + "epoch": 0.23672100738415439, + "grad_norm": 0.2510274648666382, + "learning_rate": 4.720610600871065e-05, + "loss": 0.1984, + "step": 13272 + }, + { + "epoch": 0.23673884350586807, + "grad_norm": 0.23060636222362518, + "learning_rate": 4.720539095045613e-05, + "loss": 0.1753, + "step": 13273 + }, + { + "epoch": 0.2367566796275818, + "grad_norm": 0.2736614942550659, + "learning_rate": 4.7204675806125854e-05, + "loss": 0.2107, + "step": 13274 + }, + { + "epoch": 0.23677451574929548, + "grad_norm": 0.2579177916049957, + "learning_rate": 4.720396057572258e-05, + "loss": 0.2351, + "step": 13275 + }, + { + "epoch": 0.23679235187100917, + "grad_norm": 0.21524077653884888, + "learning_rate": 4.720324525924908e-05, + "loss": 0.1999, + "step": 13276 + }, + { + "epoch": 0.23681018799272285, + "grad_norm": 0.23854389786720276, + "learning_rate": 4.7202529856708144e-05, + "loss": 0.1631, + "step": 13277 + }, + { + "epoch": 0.23682802411443657, + "grad_norm": 0.2408314198255539, + "learning_rate": 4.720181436810253e-05, + "loss": 0.1863, + "step": 13278 + }, + { + "epoch": 0.23684586023615026, + "grad_norm": 0.2868008315563202, + "learning_rate": 4.720109879343502e-05, + "loss": 0.1997, + "step": 13279 + }, + { + "epoch": 0.23686369635786395, + "grad_norm": 0.2984357476234436, + "learning_rate": 4.7200383132708375e-05, + "loss": 0.1433, + "step": 13280 + }, + { + "epoch": 0.23688153247957763, + "grad_norm": 0.3249823749065399, + "learning_rate": 4.7199667385925386e-05, + "loss": 0.2213, + "step": 13281 + }, + { + "epoch": 0.23689936860129132, + "grad_norm": 0.23707419633865356, + "learning_rate": 4.719895155308881e-05, + "loss": 0.1882, + "step": 13282 + }, + { + "epoch": 0.23691720472300504, + "grad_norm": 0.35891905426979065, + "learning_rate": 4.7198235634201425e-05, + "loss": 0.2058, + "step": 13283 + }, + { + "epoch": 0.23693504084471873, + "grad_norm": 0.4096406102180481, + "learning_rate": 4.719751962926602e-05, + "loss": 0.1501, + "step": 13284 + }, + { + "epoch": 0.2369528769664324, + "grad_norm": 0.2608923614025116, + "learning_rate": 4.719680353828537e-05, + "loss": 0.2068, + "step": 13285 + }, + { + "epoch": 0.2369707130881461, + "grad_norm": 0.35470229387283325, + "learning_rate": 4.7196087361262233e-05, + "loss": 0.1599, + "step": 13286 + }, + { + "epoch": 0.23698854920985982, + "grad_norm": 0.3483262062072754, + "learning_rate": 4.7195371098199395e-05, + "loss": 0.2137, + "step": 13287 + }, + { + "epoch": 0.2370063853315735, + "grad_norm": 0.3126085102558136, + "learning_rate": 4.719465474909963e-05, + "loss": 0.2526, + "step": 13288 + }, + { + "epoch": 0.2370242214532872, + "grad_norm": 0.22941049933433533, + "learning_rate": 4.7193938313965724e-05, + "loss": 0.2025, + "step": 13289 + }, + { + "epoch": 0.23704205757500088, + "grad_norm": 0.32364124059677124, + "learning_rate": 4.719322179280045e-05, + "loss": 0.2427, + "step": 13290 + }, + { + "epoch": 0.2370598936967146, + "grad_norm": 0.3139931261539459, + "learning_rate": 4.7192505185606575e-05, + "loss": 0.1877, + "step": 13291 + }, + { + "epoch": 0.23707772981842828, + "grad_norm": 0.20971724390983582, + "learning_rate": 4.719178849238689e-05, + "loss": 0.1841, + "step": 13292 + }, + { + "epoch": 0.23709556594014197, + "grad_norm": 0.2499406784772873, + "learning_rate": 4.719107171314416e-05, + "loss": 0.2355, + "step": 13293 + }, + { + "epoch": 0.23711340206185566, + "grad_norm": 0.259162038564682, + "learning_rate": 4.719035484788119e-05, + "loss": 0.1946, + "step": 13294 + }, + { + "epoch": 0.23713123818356938, + "grad_norm": 0.2665821611881256, + "learning_rate": 4.718963789660073e-05, + "loss": 0.2125, + "step": 13295 + }, + { + "epoch": 0.23714907430528306, + "grad_norm": 0.20773674547672272, + "learning_rate": 4.7188920859305566e-05, + "loss": 0.1398, + "step": 13296 + }, + { + "epoch": 0.23716691042699675, + "grad_norm": 0.3031942546367645, + "learning_rate": 4.718820373599848e-05, + "loss": 0.1848, + "step": 13297 + }, + { + "epoch": 0.23718474654871044, + "grad_norm": 0.4091244637966156, + "learning_rate": 4.718748652668226e-05, + "loss": 0.2595, + "step": 13298 + }, + { + "epoch": 0.23720258267042416, + "grad_norm": 0.32161083817481995, + "learning_rate": 4.7186769231359666e-05, + "loss": 0.2169, + "step": 13299 + }, + { + "epoch": 0.23722041879213784, + "grad_norm": 0.2657436728477478, + "learning_rate": 4.71860518500335e-05, + "loss": 0.2408, + "step": 13300 + }, + { + "epoch": 0.23723825491385153, + "grad_norm": 0.2363685667514801, + "learning_rate": 4.718533438270654e-05, + "loss": 0.1973, + "step": 13301 + }, + { + "epoch": 0.23725609103556522, + "grad_norm": 0.20735211670398712, + "learning_rate": 4.718461682938155e-05, + "loss": 0.1518, + "step": 13302 + }, + { + "epoch": 0.2372739271572789, + "grad_norm": 0.45115360617637634, + "learning_rate": 4.718389919006133e-05, + "loss": 0.1951, + "step": 13303 + }, + { + "epoch": 0.23729176327899262, + "grad_norm": 0.534774899482727, + "learning_rate": 4.718318146474865e-05, + "loss": 0.2071, + "step": 13304 + }, + { + "epoch": 0.2373095994007063, + "grad_norm": 0.34899401664733887, + "learning_rate": 4.71824636534463e-05, + "loss": 0.171, + "step": 13305 + }, + { + "epoch": 0.23732743552242, + "grad_norm": 0.2766760587692261, + "learning_rate": 4.718174575615706e-05, + "loss": 0.2194, + "step": 13306 + }, + { + "epoch": 0.2373452716441337, + "grad_norm": 0.3018389344215393, + "learning_rate": 4.718102777288371e-05, + "loss": 0.171, + "step": 13307 + }, + { + "epoch": 0.2373631077658474, + "grad_norm": 0.38258108496665955, + "learning_rate": 4.718030970362904e-05, + "loss": 0.1739, + "step": 13308 + }, + { + "epoch": 0.2373809438875611, + "grad_norm": 0.2464757263660431, + "learning_rate": 4.717959154839582e-05, + "loss": 0.2393, + "step": 13309 + }, + { + "epoch": 0.23739878000927478, + "grad_norm": 0.36514636874198914, + "learning_rate": 4.7178873307186855e-05, + "loss": 0.1899, + "step": 13310 + }, + { + "epoch": 0.23741661613098847, + "grad_norm": 0.2671298086643219, + "learning_rate": 4.7178154980004905e-05, + "loss": 0.1968, + "step": 13311 + }, + { + "epoch": 0.23743445225270218, + "grad_norm": 0.2568267583847046, + "learning_rate": 4.717743656685277e-05, + "loss": 0.1628, + "step": 13312 + }, + { + "epoch": 0.23745228837441587, + "grad_norm": 0.2959321439266205, + "learning_rate": 4.7176718067733235e-05, + "loss": 0.1864, + "step": 13313 + }, + { + "epoch": 0.23747012449612956, + "grad_norm": 0.27243509888648987, + "learning_rate": 4.717599948264908e-05, + "loss": 0.2192, + "step": 13314 + }, + { + "epoch": 0.23748796061784325, + "grad_norm": 0.2892591655254364, + "learning_rate": 4.7175280811603084e-05, + "loss": 0.2184, + "step": 13315 + }, + { + "epoch": 0.23750579673955696, + "grad_norm": 0.29562073945999146, + "learning_rate": 4.7174562054598046e-05, + "loss": 0.1802, + "step": 13316 + }, + { + "epoch": 0.23752363286127065, + "grad_norm": 0.28665128350257874, + "learning_rate": 4.717384321163675e-05, + "loss": 0.1569, + "step": 13317 + }, + { + "epoch": 0.23754146898298434, + "grad_norm": 0.32626378536224365, + "learning_rate": 4.717312428272197e-05, + "loss": 0.2696, + "step": 13318 + }, + { + "epoch": 0.23755930510469803, + "grad_norm": 0.2862628400325775, + "learning_rate": 4.7172405267856514e-05, + "loss": 0.1877, + "step": 13319 + }, + { + "epoch": 0.23757714122641174, + "grad_norm": 0.31922516226768494, + "learning_rate": 4.717168616704315e-05, + "loss": 0.2933, + "step": 13320 + }, + { + "epoch": 0.23759497734812543, + "grad_norm": 0.26153549551963806, + "learning_rate": 4.717096698028467e-05, + "loss": 0.1627, + "step": 13321 + }, + { + "epoch": 0.23761281346983912, + "grad_norm": 0.2663293778896332, + "learning_rate": 4.717024770758387e-05, + "loss": 0.2204, + "step": 13322 + }, + { + "epoch": 0.2376306495915528, + "grad_norm": 0.24172881245613098, + "learning_rate": 4.716952834894353e-05, + "loss": 0.1635, + "step": 13323 + }, + { + "epoch": 0.2376484857132665, + "grad_norm": 0.34882476925849915, + "learning_rate": 4.716880890436644e-05, + "loss": 0.1801, + "step": 13324 + }, + { + "epoch": 0.2376663218349802, + "grad_norm": 0.22524629533290863, + "learning_rate": 4.7168089373855396e-05, + "loss": 0.2157, + "step": 13325 + }, + { + "epoch": 0.2376841579566939, + "grad_norm": 0.3022221624851227, + "learning_rate": 4.716736975741317e-05, + "loss": 0.2246, + "step": 13326 + }, + { + "epoch": 0.23770199407840759, + "grad_norm": 0.2862691283226013, + "learning_rate": 4.716665005504257e-05, + "loss": 0.1842, + "step": 13327 + }, + { + "epoch": 0.23771983020012127, + "grad_norm": 0.25495675206184387, + "learning_rate": 4.716593026674638e-05, + "loss": 0.2096, + "step": 13328 + }, + { + "epoch": 0.237737666321835, + "grad_norm": 0.26478201150894165, + "learning_rate": 4.716521039252738e-05, + "loss": 0.1761, + "step": 13329 + }, + { + "epoch": 0.23775550244354868, + "grad_norm": 0.23873406648635864, + "learning_rate": 4.7164490432388376e-05, + "loss": 0.1483, + "step": 13330 + }, + { + "epoch": 0.23777333856526237, + "grad_norm": 0.29290691018104553, + "learning_rate": 4.716377038633215e-05, + "loss": 0.1502, + "step": 13331 + }, + { + "epoch": 0.23779117468697605, + "grad_norm": 0.39081674814224243, + "learning_rate": 4.71630502543615e-05, + "loss": 0.1972, + "step": 13332 + }, + { + "epoch": 0.23780901080868977, + "grad_norm": 0.34899893403053284, + "learning_rate": 4.7162330036479205e-05, + "loss": 0.1831, + "step": 13333 + }, + { + "epoch": 0.23782684693040346, + "grad_norm": 0.30515649914741516, + "learning_rate": 4.7161609732688064e-05, + "loss": 0.1556, + "step": 13334 + }, + { + "epoch": 0.23784468305211715, + "grad_norm": 0.2563779056072235, + "learning_rate": 4.716088934299087e-05, + "loss": 0.1454, + "step": 13335 + }, + { + "epoch": 0.23786251917383083, + "grad_norm": 0.34990549087524414, + "learning_rate": 4.716016886739042e-05, + "loss": 0.2211, + "step": 13336 + }, + { + "epoch": 0.23788035529554455, + "grad_norm": 0.3291740417480469, + "learning_rate": 4.7159448305889495e-05, + "loss": 0.1685, + "step": 13337 + }, + { + "epoch": 0.23789819141725824, + "grad_norm": 0.3897515535354614, + "learning_rate": 4.7158727658490894e-05, + "loss": 0.1689, + "step": 13338 + }, + { + "epoch": 0.23791602753897192, + "grad_norm": 0.2098303884267807, + "learning_rate": 4.715800692519742e-05, + "loss": 0.1567, + "step": 13339 + }, + { + "epoch": 0.2379338636606856, + "grad_norm": 0.36193516850471497, + "learning_rate": 4.715728610601185e-05, + "loss": 0.223, + "step": 13340 + }, + { + "epoch": 0.23795169978239933, + "grad_norm": 0.2321547120809555, + "learning_rate": 4.7156565200936984e-05, + "loss": 0.1562, + "step": 13341 + }, + { + "epoch": 0.23796953590411302, + "grad_norm": 0.33449116349220276, + "learning_rate": 4.715584420997563e-05, + "loss": 0.2231, + "step": 13342 + }, + { + "epoch": 0.2379873720258267, + "grad_norm": 0.7138961553573608, + "learning_rate": 4.715512313313056e-05, + "loss": 0.1844, + "step": 13343 + }, + { + "epoch": 0.2380052081475404, + "grad_norm": 0.29571333527565, + "learning_rate": 4.71544019704046e-05, + "loss": 0.2249, + "step": 13344 + }, + { + "epoch": 0.23802304426925408, + "grad_norm": 0.2784079909324646, + "learning_rate": 4.7153680721800496e-05, + "loss": 0.216, + "step": 13345 + }, + { + "epoch": 0.2380408803909678, + "grad_norm": 0.37449702620506287, + "learning_rate": 4.715295938732109e-05, + "loss": 0.2098, + "step": 13346 + }, + { + "epoch": 0.23805871651268148, + "grad_norm": 0.27389535307884216, + "learning_rate": 4.715223796696917e-05, + "loss": 0.2029, + "step": 13347 + }, + { + "epoch": 0.23807655263439517, + "grad_norm": 0.38538825511932373, + "learning_rate": 4.715151646074752e-05, + "loss": 0.1832, + "step": 13348 + }, + { + "epoch": 0.23809438875610886, + "grad_norm": 0.2619556784629822, + "learning_rate": 4.715079486865893e-05, + "loss": 0.193, + "step": 13349 + }, + { + "epoch": 0.23811222487782258, + "grad_norm": 0.21136973798274994, + "learning_rate": 4.7150073190706216e-05, + "loss": 0.1423, + "step": 13350 + }, + { + "epoch": 0.23813006099953626, + "grad_norm": 0.4439290761947632, + "learning_rate": 4.714935142689217e-05, + "loss": 0.2909, + "step": 13351 + }, + { + "epoch": 0.23814789712124995, + "grad_norm": 0.3308385908603668, + "learning_rate": 4.7148629577219584e-05, + "loss": 0.201, + "step": 13352 + }, + { + "epoch": 0.23816573324296364, + "grad_norm": 0.2681209146976471, + "learning_rate": 4.714790764169126e-05, + "loss": 0.1539, + "step": 13353 + }, + { + "epoch": 0.23818356936467736, + "grad_norm": 0.20436906814575195, + "learning_rate": 4.714718562031e-05, + "loss": 0.1623, + "step": 13354 + }, + { + "epoch": 0.23820140548639104, + "grad_norm": 0.39103105664253235, + "learning_rate": 4.71464635130786e-05, + "loss": 0.1803, + "step": 13355 + }, + { + "epoch": 0.23821924160810473, + "grad_norm": 0.2353384643793106, + "learning_rate": 4.714574131999985e-05, + "loss": 0.2236, + "step": 13356 + }, + { + "epoch": 0.23823707772981842, + "grad_norm": 0.3549315631389618, + "learning_rate": 4.714501904107657e-05, + "loss": 0.1785, + "step": 13357 + }, + { + "epoch": 0.23825491385153214, + "grad_norm": 0.4552212059497833, + "learning_rate": 4.714429667631154e-05, + "loss": 0.1652, + "step": 13358 + }, + { + "epoch": 0.23827274997324582, + "grad_norm": 0.2647263705730438, + "learning_rate": 4.7143574225707564e-05, + "loss": 0.1553, + "step": 13359 + }, + { + "epoch": 0.2382905860949595, + "grad_norm": 0.27400949597358704, + "learning_rate": 4.7142851689267455e-05, + "loss": 0.1914, + "step": 13360 + }, + { + "epoch": 0.2383084222166732, + "grad_norm": 0.2867042124271393, + "learning_rate": 4.7142129066993994e-05, + "loss": 0.219, + "step": 13361 + }, + { + "epoch": 0.23832625833838691, + "grad_norm": 0.25492849946022034, + "learning_rate": 4.714140635889e-05, + "loss": 0.1751, + "step": 13362 + }, + { + "epoch": 0.2383440944601006, + "grad_norm": 0.24152772128582, + "learning_rate": 4.7140683564958265e-05, + "loss": 0.2044, + "step": 13363 + }, + { + "epoch": 0.2383619305818143, + "grad_norm": 0.2686931788921356, + "learning_rate": 4.71399606852016e-05, + "loss": 0.1816, + "step": 13364 + }, + { + "epoch": 0.23837976670352798, + "grad_norm": 0.32738491892814636, + "learning_rate": 4.71392377196228e-05, + "loss": 0.1516, + "step": 13365 + }, + { + "epoch": 0.23839760282524167, + "grad_norm": 0.3190734088420868, + "learning_rate": 4.713851466822465e-05, + "loss": 0.1692, + "step": 13366 + }, + { + "epoch": 0.23841543894695538, + "grad_norm": 0.231252059340477, + "learning_rate": 4.713779153100999e-05, + "loss": 0.2009, + "step": 13367 + }, + { + "epoch": 0.23843327506866907, + "grad_norm": 0.2271817922592163, + "learning_rate": 4.7137068307981605e-05, + "loss": 0.1553, + "step": 13368 + }, + { + "epoch": 0.23845111119038276, + "grad_norm": 0.48560476303100586, + "learning_rate": 4.713634499914229e-05, + "loss": 0.2394, + "step": 13369 + }, + { + "epoch": 0.23846894731209645, + "grad_norm": 0.21157796680927277, + "learning_rate": 4.713562160449485e-05, + "loss": 0.1988, + "step": 13370 + }, + { + "epoch": 0.23848678343381016, + "grad_norm": 0.2157321274280548, + "learning_rate": 4.71348981240421e-05, + "loss": 0.2204, + "step": 13371 + }, + { + "epoch": 0.23850461955552385, + "grad_norm": 0.23527267575263977, + "learning_rate": 4.7134174557786845e-05, + "loss": 0.1991, + "step": 13372 + }, + { + "epoch": 0.23852245567723754, + "grad_norm": 0.23589536547660828, + "learning_rate": 4.7133450905731885e-05, + "loss": 0.1622, + "step": 13373 + }, + { + "epoch": 0.23854029179895123, + "grad_norm": 0.33560240268707275, + "learning_rate": 4.7132727167880017e-05, + "loss": 0.2223, + "step": 13374 + }, + { + "epoch": 0.23855812792066494, + "grad_norm": 0.2951493263244629, + "learning_rate": 4.713200334423406e-05, + "loss": 0.1923, + "step": 13375 + }, + { + "epoch": 0.23857596404237863, + "grad_norm": 0.2860085964202881, + "learning_rate": 4.7131279434796814e-05, + "loss": 0.2174, + "step": 13376 + }, + { + "epoch": 0.23859380016409232, + "grad_norm": 0.19068557024002075, + "learning_rate": 4.713055543957108e-05, + "loss": 0.1551, + "step": 13377 + }, + { + "epoch": 0.238611636285806, + "grad_norm": 0.3185647428035736, + "learning_rate": 4.7129831358559674e-05, + "loss": 0.1422, + "step": 13378 + }, + { + "epoch": 0.23862947240751972, + "grad_norm": 0.2435835599899292, + "learning_rate": 4.71291071917654e-05, + "loss": 0.1594, + "step": 13379 + }, + { + "epoch": 0.2386473085292334, + "grad_norm": 0.2497451901435852, + "learning_rate": 4.712838293919106e-05, + "loss": 0.1956, + "step": 13380 + }, + { + "epoch": 0.2386651446509471, + "grad_norm": 0.3476564288139343, + "learning_rate": 4.7127658600839465e-05, + "loss": 0.1992, + "step": 13381 + }, + { + "epoch": 0.23868298077266079, + "grad_norm": 0.2773374617099762, + "learning_rate": 4.7126934176713425e-05, + "loss": 0.1769, + "step": 13382 + }, + { + "epoch": 0.23870081689437447, + "grad_norm": 0.26434776186943054, + "learning_rate": 4.712620966681574e-05, + "loss": 0.1946, + "step": 13383 + }, + { + "epoch": 0.2387186530160882, + "grad_norm": 0.2497277408838272, + "learning_rate": 4.712548507114922e-05, + "loss": 0.2064, + "step": 13384 + }, + { + "epoch": 0.23873648913780188, + "grad_norm": 0.3211526572704315, + "learning_rate": 4.712476038971669e-05, + "loss": 0.2179, + "step": 13385 + }, + { + "epoch": 0.23875432525951557, + "grad_norm": 0.35197022557258606, + "learning_rate": 4.712403562252094e-05, + "loss": 0.1532, + "step": 13386 + }, + { + "epoch": 0.23877216138122925, + "grad_norm": 0.31447118520736694, + "learning_rate": 4.7123310769564795e-05, + "loss": 0.2028, + "step": 13387 + }, + { + "epoch": 0.23878999750294297, + "grad_norm": 0.4230057895183563, + "learning_rate": 4.7122585830851054e-05, + "loss": 0.2099, + "step": 13388 + }, + { + "epoch": 0.23880783362465666, + "grad_norm": 0.2894974946975708, + "learning_rate": 4.7121860806382526e-05, + "loss": 0.162, + "step": 13389 + }, + { + "epoch": 0.23882566974637034, + "grad_norm": 0.1895712912082672, + "learning_rate": 4.712113569616202e-05, + "loss": 0.1744, + "step": 13390 + }, + { + "epoch": 0.23884350586808403, + "grad_norm": 0.3055501878261566, + "learning_rate": 4.712041050019236e-05, + "loss": 0.2189, + "step": 13391 + }, + { + "epoch": 0.23886134198979775, + "grad_norm": 0.22077152132987976, + "learning_rate": 4.711968521847634e-05, + "loss": 0.1747, + "step": 13392 + }, + { + "epoch": 0.23887917811151144, + "grad_norm": 0.2606711983680725, + "learning_rate": 4.711895985101679e-05, + "loss": 0.1814, + "step": 13393 + }, + { + "epoch": 0.23889701423322512, + "grad_norm": 0.2561998665332794, + "learning_rate": 4.71182343978165e-05, + "loss": 0.1778, + "step": 13394 + }, + { + "epoch": 0.2389148503549388, + "grad_norm": 0.3184613287448883, + "learning_rate": 4.71175088588783e-05, + "loss": 0.2359, + "step": 13395 + }, + { + "epoch": 0.23893268647665253, + "grad_norm": 0.25398892164230347, + "learning_rate": 4.7116783234205006e-05, + "loss": 0.178, + "step": 13396 + }, + { + "epoch": 0.23895052259836622, + "grad_norm": 0.24392369389533997, + "learning_rate": 4.7116057523799405e-05, + "loss": 0.1541, + "step": 13397 + }, + { + "epoch": 0.2389683587200799, + "grad_norm": 0.23380355536937714, + "learning_rate": 4.711533172766434e-05, + "loss": 0.1415, + "step": 13398 + }, + { + "epoch": 0.2389861948417936, + "grad_norm": 0.25240951776504517, + "learning_rate": 4.7114605845802606e-05, + "loss": 0.1495, + "step": 13399 + }, + { + "epoch": 0.2390040309635073, + "grad_norm": 0.4615921676158905, + "learning_rate": 4.711387987821701e-05, + "loss": 0.245, + "step": 13400 + }, + { + "epoch": 0.239021867085221, + "grad_norm": 0.3106546401977539, + "learning_rate": 4.711315382491039e-05, + "loss": 0.1752, + "step": 13401 + }, + { + "epoch": 0.23903970320693468, + "grad_norm": 0.27275028824806213, + "learning_rate": 4.711242768588555e-05, + "loss": 0.1914, + "step": 13402 + }, + { + "epoch": 0.23905753932864837, + "grad_norm": 0.2753840684890747, + "learning_rate": 4.711170146114531e-05, + "loss": 0.1445, + "step": 13403 + }, + { + "epoch": 0.23907537545036206, + "grad_norm": 0.2281634360551834, + "learning_rate": 4.711097515069246e-05, + "loss": 0.2016, + "step": 13404 + }, + { + "epoch": 0.23909321157207578, + "grad_norm": 0.24798743426799774, + "learning_rate": 4.711024875452984e-05, + "loss": 0.1813, + "step": 13405 + }, + { + "epoch": 0.23911104769378946, + "grad_norm": 0.32124871015548706, + "learning_rate": 4.7109522272660265e-05, + "loss": 0.164, + "step": 13406 + }, + { + "epoch": 0.23912888381550315, + "grad_norm": 0.29859793186187744, + "learning_rate": 4.710879570508654e-05, + "loss": 0.2564, + "step": 13407 + }, + { + "epoch": 0.23914671993721684, + "grad_norm": 0.33276113867759705, + "learning_rate": 4.7108069051811486e-05, + "loss": 0.1657, + "step": 13408 + }, + { + "epoch": 0.23916455605893056, + "grad_norm": 0.28450921177864075, + "learning_rate": 4.710734231283792e-05, + "loss": 0.1949, + "step": 13409 + }, + { + "epoch": 0.23918239218064424, + "grad_norm": 0.5976627469062805, + "learning_rate": 4.7106615488168664e-05, + "loss": 0.2967, + "step": 13410 + }, + { + "epoch": 0.23920022830235793, + "grad_norm": 0.22599709033966064, + "learning_rate": 4.7105888577806526e-05, + "loss": 0.1392, + "step": 13411 + }, + { + "epoch": 0.23921806442407162, + "grad_norm": 0.33586129546165466, + "learning_rate": 4.710516158175433e-05, + "loss": 0.1978, + "step": 13412 + }, + { + "epoch": 0.23923590054578533, + "grad_norm": 0.2430362105369568, + "learning_rate": 4.71044345000149e-05, + "loss": 0.1672, + "step": 13413 + }, + { + "epoch": 0.23925373666749902, + "grad_norm": 0.3307092785835266, + "learning_rate": 4.710370733259104e-05, + "loss": 0.2427, + "step": 13414 + }, + { + "epoch": 0.2392715727892127, + "grad_norm": 0.3205855190753937, + "learning_rate": 4.710298007948558e-05, + "loss": 0.1744, + "step": 13415 + }, + { + "epoch": 0.2392894089109264, + "grad_norm": 0.2526535987854004, + "learning_rate": 4.7102252740701324e-05, + "loss": 0.1953, + "step": 13416 + }, + { + "epoch": 0.23930724503264011, + "grad_norm": 0.3021654486656189, + "learning_rate": 4.710152531624111e-05, + "loss": 0.1945, + "step": 13417 + }, + { + "epoch": 0.2393250811543538, + "grad_norm": 0.3173452615737915, + "learning_rate": 4.710079780610776e-05, + "loss": 0.1755, + "step": 13418 + }, + { + "epoch": 0.2393429172760675, + "grad_norm": 0.293372243642807, + "learning_rate": 4.710007021030407e-05, + "loss": 0.2034, + "step": 13419 + }, + { + "epoch": 0.23936075339778118, + "grad_norm": 0.34677833318710327, + "learning_rate": 4.709934252883288e-05, + "loss": 0.2264, + "step": 13420 + }, + { + "epoch": 0.2393785895194949, + "grad_norm": 0.3412090539932251, + "learning_rate": 4.709861476169701e-05, + "loss": 0.1467, + "step": 13421 + }, + { + "epoch": 0.23939642564120858, + "grad_norm": 0.3284223973751068, + "learning_rate": 4.709788690889927e-05, + "loss": 0.1861, + "step": 13422 + }, + { + "epoch": 0.23941426176292227, + "grad_norm": 0.2922373414039612, + "learning_rate": 4.709715897044249e-05, + "loss": 0.16, + "step": 13423 + }, + { + "epoch": 0.23943209788463596, + "grad_norm": 0.250119149684906, + "learning_rate": 4.709643094632949e-05, + "loss": 0.1501, + "step": 13424 + }, + { + "epoch": 0.23944993400634965, + "grad_norm": 0.32594916224479675, + "learning_rate": 4.7095702836563094e-05, + "loss": 0.1938, + "step": 13425 + }, + { + "epoch": 0.23946777012806336, + "grad_norm": 0.365202397108078, + "learning_rate": 4.709497464114612e-05, + "loss": 0.1598, + "step": 13426 + }, + { + "epoch": 0.23948560624977705, + "grad_norm": 0.22981515526771545, + "learning_rate": 4.709424636008139e-05, + "loss": 0.1764, + "step": 13427 + }, + { + "epoch": 0.23950344237149074, + "grad_norm": 0.23167452216148376, + "learning_rate": 4.709351799337173e-05, + "loss": 0.1622, + "step": 13428 + }, + { + "epoch": 0.23952127849320443, + "grad_norm": 0.526567816734314, + "learning_rate": 4.709278954101997e-05, + "loss": 0.2555, + "step": 13429 + }, + { + "epoch": 0.23953911461491814, + "grad_norm": 0.25232312083244324, + "learning_rate": 4.709206100302892e-05, + "loss": 0.1451, + "step": 13430 + }, + { + "epoch": 0.23955695073663183, + "grad_norm": 0.3796832263469696, + "learning_rate": 4.709133237940142e-05, + "loss": 0.2461, + "step": 13431 + }, + { + "epoch": 0.23957478685834552, + "grad_norm": 0.20113413035869598, + "learning_rate": 4.7090603670140275e-05, + "loss": 0.2, + "step": 13432 + }, + { + "epoch": 0.2395926229800592, + "grad_norm": 0.27989864349365234, + "learning_rate": 4.708987487524833e-05, + "loss": 0.1512, + "step": 13433 + }, + { + "epoch": 0.23961045910177292, + "grad_norm": 0.2920389175415039, + "learning_rate": 4.708914599472839e-05, + "loss": 0.2306, + "step": 13434 + }, + { + "epoch": 0.2396282952234866, + "grad_norm": 0.2332906424999237, + "learning_rate": 4.70884170285833e-05, + "loss": 0.1965, + "step": 13435 + }, + { + "epoch": 0.2396461313452003, + "grad_norm": 0.27118706703186035, + "learning_rate": 4.7087687976815875e-05, + "loss": 0.1589, + "step": 13436 + }, + { + "epoch": 0.23966396746691399, + "grad_norm": 0.36195898056030273, + "learning_rate": 4.708695883942894e-05, + "loss": 0.2286, + "step": 13437 + }, + { + "epoch": 0.2396818035886277, + "grad_norm": 0.3250676393508911, + "learning_rate": 4.708622961642532e-05, + "loss": 0.1648, + "step": 13438 + }, + { + "epoch": 0.2396996397103414, + "grad_norm": 0.27408838272094727, + "learning_rate": 4.708550030780786e-05, + "loss": 0.1946, + "step": 13439 + }, + { + "epoch": 0.23971747583205508, + "grad_norm": 0.2470027208328247, + "learning_rate": 4.708477091357936e-05, + "loss": 0.1832, + "step": 13440 + }, + { + "epoch": 0.23973531195376876, + "grad_norm": 0.24596144258975983, + "learning_rate": 4.708404143374266e-05, + "loss": 0.1635, + "step": 13441 + }, + { + "epoch": 0.23975314807548248, + "grad_norm": 0.2868826389312744, + "learning_rate": 4.7083311868300596e-05, + "loss": 0.1957, + "step": 13442 + }, + { + "epoch": 0.23977098419719617, + "grad_norm": 0.21057447791099548, + "learning_rate": 4.7082582217255975e-05, + "loss": 0.1766, + "step": 13443 + }, + { + "epoch": 0.23978882031890986, + "grad_norm": 0.2822345495223999, + "learning_rate": 4.708185248061165e-05, + "loss": 0.208, + "step": 13444 + }, + { + "epoch": 0.23980665644062354, + "grad_norm": 0.3563338816165924, + "learning_rate": 4.708112265837044e-05, + "loss": 0.2117, + "step": 13445 + }, + { + "epoch": 0.23982449256233723, + "grad_norm": 0.3101353943347931, + "learning_rate": 4.708039275053516e-05, + "loss": 0.2296, + "step": 13446 + }, + { + "epoch": 0.23984232868405095, + "grad_norm": 0.22491110861301422, + "learning_rate": 4.7079662757108655e-05, + "loss": 0.1631, + "step": 13447 + }, + { + "epoch": 0.23986016480576464, + "grad_norm": 0.2542402148246765, + "learning_rate": 4.707893267809376e-05, + "loss": 0.1587, + "step": 13448 + }, + { + "epoch": 0.23987800092747832, + "grad_norm": 0.3075088560581207, + "learning_rate": 4.7078202513493285e-05, + "loss": 0.1872, + "step": 13449 + }, + { + "epoch": 0.239895837049192, + "grad_norm": 0.3424496352672577, + "learning_rate": 4.707747226331007e-05, + "loss": 0.1851, + "step": 13450 + }, + { + "epoch": 0.23991367317090573, + "grad_norm": 0.26897549629211426, + "learning_rate": 4.707674192754696e-05, + "loss": 0.1397, + "step": 13451 + }, + { + "epoch": 0.23993150929261942, + "grad_norm": 0.29516106843948364, + "learning_rate": 4.707601150620676e-05, + "loss": 0.1486, + "step": 13452 + }, + { + "epoch": 0.2399493454143331, + "grad_norm": 0.24099019169807434, + "learning_rate": 4.707528099929233e-05, + "loss": 0.152, + "step": 13453 + }, + { + "epoch": 0.2399671815360468, + "grad_norm": 0.23713742196559906, + "learning_rate": 4.707455040680647e-05, + "loss": 0.1645, + "step": 13454 + }, + { + "epoch": 0.2399850176577605, + "grad_norm": 0.2817937135696411, + "learning_rate": 4.707381972875204e-05, + "loss": 0.1982, + "step": 13455 + }, + { + "epoch": 0.2400028537794742, + "grad_norm": 0.23757711052894592, + "learning_rate": 4.707308896513185e-05, + "loss": 0.1767, + "step": 13456 + }, + { + "epoch": 0.24002068990118788, + "grad_norm": 0.329540491104126, + "learning_rate": 4.707235811594875e-05, + "loss": 0.2195, + "step": 13457 + }, + { + "epoch": 0.24003852602290157, + "grad_norm": 0.27588650584220886, + "learning_rate": 4.707162718120557e-05, + "loss": 0.2181, + "step": 13458 + }, + { + "epoch": 0.2400563621446153, + "grad_norm": 0.3009127676486969, + "learning_rate": 4.7070896160905136e-05, + "loss": 0.1929, + "step": 13459 + }, + { + "epoch": 0.24007419826632898, + "grad_norm": 0.2650068700313568, + "learning_rate": 4.7070165055050284e-05, + "loss": 0.192, + "step": 13460 + }, + { + "epoch": 0.24009203438804266, + "grad_norm": 0.41197818517684937, + "learning_rate": 4.706943386364385e-05, + "loss": 0.1791, + "step": 13461 + }, + { + "epoch": 0.24010987050975635, + "grad_norm": 0.23184789717197418, + "learning_rate": 4.7068702586688675e-05, + "loss": 0.1753, + "step": 13462 + }, + { + "epoch": 0.24012770663147007, + "grad_norm": 0.43426835536956787, + "learning_rate": 4.7067971224187576e-05, + "loss": 0.1491, + "step": 13463 + }, + { + "epoch": 0.24014554275318375, + "grad_norm": 0.2904857397079468, + "learning_rate": 4.70672397761434e-05, + "loss": 0.215, + "step": 13464 + }, + { + "epoch": 0.24016337887489744, + "grad_norm": 0.322457879781723, + "learning_rate": 4.7066508242558993e-05, + "loss": 0.2209, + "step": 13465 + }, + { + "epoch": 0.24018121499661113, + "grad_norm": 0.28330883383750916, + "learning_rate": 4.706577662343716e-05, + "loss": 0.1696, + "step": 13466 + }, + { + "epoch": 0.24019905111832482, + "grad_norm": 0.18052725493907928, + "learning_rate": 4.706504491878077e-05, + "loss": 0.1727, + "step": 13467 + }, + { + "epoch": 0.24021688724003853, + "grad_norm": 0.24228636920452118, + "learning_rate": 4.7064313128592644e-05, + "loss": 0.1801, + "step": 13468 + }, + { + "epoch": 0.24023472336175222, + "grad_norm": 0.24460303783416748, + "learning_rate": 4.706358125287561e-05, + "loss": 0.1913, + "step": 13469 + }, + { + "epoch": 0.2402525594834659, + "grad_norm": 0.296675443649292, + "learning_rate": 4.7062849291632516e-05, + "loss": 0.2138, + "step": 13470 + }, + { + "epoch": 0.2402703956051796, + "grad_norm": 0.23877549171447754, + "learning_rate": 4.7062117244866205e-05, + "loss": 0.188, + "step": 13471 + }, + { + "epoch": 0.24028823172689331, + "grad_norm": 0.2380308359861374, + "learning_rate": 4.7061385112579503e-05, + "loss": 0.2084, + "step": 13472 + }, + { + "epoch": 0.240306067848607, + "grad_norm": 0.25362640619277954, + "learning_rate": 4.706065289477525e-05, + "loss": 0.1483, + "step": 13473 + }, + { + "epoch": 0.2403239039703207, + "grad_norm": 0.23467028141021729, + "learning_rate": 4.7059920591456295e-05, + "loss": 0.1305, + "step": 13474 + }, + { + "epoch": 0.24034174009203438, + "grad_norm": 0.2669031023979187, + "learning_rate": 4.705918820262546e-05, + "loss": 0.1764, + "step": 13475 + }, + { + "epoch": 0.2403595762137481, + "grad_norm": 0.3469369113445282, + "learning_rate": 4.70584557282856e-05, + "loss": 0.2136, + "step": 13476 + }, + { + "epoch": 0.24037741233546178, + "grad_norm": 0.3109643757343292, + "learning_rate": 4.705772316843955e-05, + "loss": 0.2137, + "step": 13477 + }, + { + "epoch": 0.24039524845717547, + "grad_norm": 0.3049291670322418, + "learning_rate": 4.7056990523090136e-05, + "loss": 0.1494, + "step": 13478 + }, + { + "epoch": 0.24041308457888916, + "grad_norm": 0.3413618206977844, + "learning_rate": 4.705625779224021e-05, + "loss": 0.1783, + "step": 13479 + }, + { + "epoch": 0.24043092070060287, + "grad_norm": 0.3412415683269501, + "learning_rate": 4.7055524975892614e-05, + "loss": 0.1698, + "step": 13480 + }, + { + "epoch": 0.24044875682231656, + "grad_norm": 0.28880491852760315, + "learning_rate": 4.705479207405018e-05, + "loss": 0.1929, + "step": 13481 + }, + { + "epoch": 0.24046659294403025, + "grad_norm": 0.3133130669593811, + "learning_rate": 4.7054059086715766e-05, + "loss": 0.1947, + "step": 13482 + }, + { + "epoch": 0.24048442906574394, + "grad_norm": 0.3986121118068695, + "learning_rate": 4.705332601389219e-05, + "loss": 0.1756, + "step": 13483 + }, + { + "epoch": 0.24050226518745763, + "grad_norm": 0.19810420274734497, + "learning_rate": 4.705259285558231e-05, + "loss": 0.1548, + "step": 13484 + }, + { + "epoch": 0.24052010130917134, + "grad_norm": 0.3069475591182709, + "learning_rate": 4.7051859611788964e-05, + "loss": 0.1922, + "step": 13485 + }, + { + "epoch": 0.24053793743088503, + "grad_norm": 0.2690931260585785, + "learning_rate": 4.705112628251499e-05, + "loss": 0.2428, + "step": 13486 + }, + { + "epoch": 0.24055577355259872, + "grad_norm": 0.3050071597099304, + "learning_rate": 4.7050392867763236e-05, + "loss": 0.1767, + "step": 13487 + }, + { + "epoch": 0.2405736096743124, + "grad_norm": 0.34742656350135803, + "learning_rate": 4.7049659367536546e-05, + "loss": 0.2042, + "step": 13488 + }, + { + "epoch": 0.24059144579602612, + "grad_norm": 0.40350329875946045, + "learning_rate": 4.704892578183776e-05, + "loss": 0.2179, + "step": 13489 + }, + { + "epoch": 0.2406092819177398, + "grad_norm": 0.2532975673675537, + "learning_rate": 4.7048192110669726e-05, + "loss": 0.2364, + "step": 13490 + }, + { + "epoch": 0.2406271180394535, + "grad_norm": 0.2537136673927307, + "learning_rate": 4.704745835403528e-05, + "loss": 0.1617, + "step": 13491 + }, + { + "epoch": 0.24064495416116718, + "grad_norm": 0.21060170233249664, + "learning_rate": 4.704672451193727e-05, + "loss": 0.1526, + "step": 13492 + }, + { + "epoch": 0.2406627902828809, + "grad_norm": 0.27835801243782043, + "learning_rate": 4.704599058437854e-05, + "loss": 0.1425, + "step": 13493 + }, + { + "epoch": 0.2406806264045946, + "grad_norm": 0.29193437099456787, + "learning_rate": 4.704525657136194e-05, + "loss": 0.163, + "step": 13494 + }, + { + "epoch": 0.24069846252630828, + "grad_norm": 0.38879725337028503, + "learning_rate": 4.704452247289031e-05, + "loss": 0.2023, + "step": 13495 + }, + { + "epoch": 0.24071629864802196, + "grad_norm": 0.26055172085762024, + "learning_rate": 4.7043788288966495e-05, + "loss": 0.2143, + "step": 13496 + }, + { + "epoch": 0.24073413476973568, + "grad_norm": 0.278653085231781, + "learning_rate": 4.704305401959334e-05, + "loss": 0.1887, + "step": 13497 + }, + { + "epoch": 0.24075197089144937, + "grad_norm": 0.2415715754032135, + "learning_rate": 4.70423196647737e-05, + "loss": 0.1534, + "step": 13498 + }, + { + "epoch": 0.24076980701316306, + "grad_norm": 0.27306032180786133, + "learning_rate": 4.704158522451041e-05, + "loss": 0.1953, + "step": 13499 + }, + { + "epoch": 0.24078764313487674, + "grad_norm": 0.33374711871147156, + "learning_rate": 4.7040850698806324e-05, + "loss": 0.2288, + "step": 13500 + }, + { + "epoch": 0.24080547925659046, + "grad_norm": 0.28850099444389343, + "learning_rate": 4.704011608766429e-05, + "loss": 0.1811, + "step": 13501 + }, + { + "epoch": 0.24082331537830415, + "grad_norm": 0.2672603726387024, + "learning_rate": 4.703938139108716e-05, + "loss": 0.24, + "step": 13502 + }, + { + "epoch": 0.24084115150001784, + "grad_norm": 0.4115087687969208, + "learning_rate": 4.703864660907776e-05, + "loss": 0.2354, + "step": 13503 + }, + { + "epoch": 0.24085898762173152, + "grad_norm": 0.24012894928455353, + "learning_rate": 4.703791174163897e-05, + "loss": 0.1711, + "step": 13504 + }, + { + "epoch": 0.2408768237434452, + "grad_norm": 0.3480866253376007, + "learning_rate": 4.703717678877362e-05, + "loss": 0.2557, + "step": 13505 + }, + { + "epoch": 0.24089465986515893, + "grad_norm": 0.21841463446617126, + "learning_rate": 4.7036441750484555e-05, + "loss": 0.2063, + "step": 13506 + }, + { + "epoch": 0.24091249598687262, + "grad_norm": 0.192271426320076, + "learning_rate": 4.703570662677463e-05, + "loss": 0.1887, + "step": 13507 + }, + { + "epoch": 0.2409303321085863, + "grad_norm": 0.2641043961048126, + "learning_rate": 4.70349714176467e-05, + "loss": 0.2066, + "step": 13508 + }, + { + "epoch": 0.2409481682303, + "grad_norm": 0.25226837396621704, + "learning_rate": 4.703423612310361e-05, + "loss": 0.1988, + "step": 13509 + }, + { + "epoch": 0.2409660043520137, + "grad_norm": 0.22991840541362762, + "learning_rate": 4.703350074314821e-05, + "loss": 0.1692, + "step": 13510 + }, + { + "epoch": 0.2409838404737274, + "grad_norm": 0.24048392474651337, + "learning_rate": 4.703276527778335e-05, + "loss": 0.157, + "step": 13511 + }, + { + "epoch": 0.24100167659544108, + "grad_norm": 0.3038395941257477, + "learning_rate": 4.703202972701188e-05, + "loss": 0.1583, + "step": 13512 + }, + { + "epoch": 0.24101951271715477, + "grad_norm": 0.5107954144477844, + "learning_rate": 4.7031294090836655e-05, + "loss": 0.1829, + "step": 13513 + }, + { + "epoch": 0.2410373488388685, + "grad_norm": 0.2879059314727783, + "learning_rate": 4.7030558369260525e-05, + "loss": 0.1498, + "step": 13514 + }, + { + "epoch": 0.24105518496058218, + "grad_norm": 0.2327626496553421, + "learning_rate": 4.7029822562286344e-05, + "loss": 0.1626, + "step": 13515 + }, + { + "epoch": 0.24107302108229586, + "grad_norm": 0.31451553106307983, + "learning_rate": 4.702908666991696e-05, + "loss": 0.1328, + "step": 13516 + }, + { + "epoch": 0.24109085720400955, + "grad_norm": 0.2627813518047333, + "learning_rate": 4.702835069215522e-05, + "loss": 0.1384, + "step": 13517 + }, + { + "epoch": 0.24110869332572327, + "grad_norm": 0.4106541574001312, + "learning_rate": 4.702761462900399e-05, + "loss": 0.1711, + "step": 13518 + }, + { + "epoch": 0.24112652944743695, + "grad_norm": 0.30055779218673706, + "learning_rate": 4.702687848046612e-05, + "loss": 0.1599, + "step": 13519 + }, + { + "epoch": 0.24114436556915064, + "grad_norm": 0.40765902400016785, + "learning_rate": 4.702614224654446e-05, + "loss": 0.2343, + "step": 13520 + }, + { + "epoch": 0.24116220169086433, + "grad_norm": 0.3206610381603241, + "learning_rate": 4.7025405927241864e-05, + "loss": 0.2456, + "step": 13521 + }, + { + "epoch": 0.24118003781257805, + "grad_norm": 0.35464996099472046, + "learning_rate": 4.702466952256119e-05, + "loss": 0.2533, + "step": 13522 + }, + { + "epoch": 0.24119787393429173, + "grad_norm": 0.36289629340171814, + "learning_rate": 4.702393303250529e-05, + "loss": 0.1751, + "step": 13523 + }, + { + "epoch": 0.24121571005600542, + "grad_norm": 0.3485252857208252, + "learning_rate": 4.702319645707701e-05, + "loss": 0.2052, + "step": 13524 + }, + { + "epoch": 0.2412335461777191, + "grad_norm": 0.19893619418144226, + "learning_rate": 4.702245979627922e-05, + "loss": 0.1604, + "step": 13525 + }, + { + "epoch": 0.2412513822994328, + "grad_norm": 0.5408480167388916, + "learning_rate": 4.702172305011477e-05, + "loss": 0.1686, + "step": 13526 + }, + { + "epoch": 0.24126921842114651, + "grad_norm": 0.40812939405441284, + "learning_rate": 4.702098621858651e-05, + "loss": 0.2073, + "step": 13527 + }, + { + "epoch": 0.2412870545428602, + "grad_norm": 0.36756691336631775, + "learning_rate": 4.7020249301697315e-05, + "loss": 0.2467, + "step": 13528 + }, + { + "epoch": 0.2413048906645739, + "grad_norm": 0.35905441641807556, + "learning_rate": 4.701951229945002e-05, + "loss": 0.1918, + "step": 13529 + }, + { + "epoch": 0.24132272678628758, + "grad_norm": 0.3223443031311035, + "learning_rate": 4.701877521184749e-05, + "loss": 0.1879, + "step": 13530 + }, + { + "epoch": 0.2413405629080013, + "grad_norm": 0.31790000200271606, + "learning_rate": 4.701803803889259e-05, + "loss": 0.2048, + "step": 13531 + }, + { + "epoch": 0.24135839902971498, + "grad_norm": 0.227671816945076, + "learning_rate": 4.701730078058816e-05, + "loss": 0.169, + "step": 13532 + }, + { + "epoch": 0.24137623515142867, + "grad_norm": 0.20597079396247864, + "learning_rate": 4.7016563436937065e-05, + "loss": 0.1484, + "step": 13533 + }, + { + "epoch": 0.24139407127314236, + "grad_norm": 0.30770665407180786, + "learning_rate": 4.701582600794217e-05, + "loss": 0.2302, + "step": 13534 + }, + { + "epoch": 0.24141190739485607, + "grad_norm": 0.3436049222946167, + "learning_rate": 4.701508849360633e-05, + "loss": 0.2524, + "step": 13535 + }, + { + "epoch": 0.24142974351656976, + "grad_norm": 0.23202307522296906, + "learning_rate": 4.70143508939324e-05, + "loss": 0.2159, + "step": 13536 + }, + { + "epoch": 0.24144757963828345, + "grad_norm": 0.323688268661499, + "learning_rate": 4.701361320892325e-05, + "loss": 0.157, + "step": 13537 + }, + { + "epoch": 0.24146541575999714, + "grad_norm": 0.294297993183136, + "learning_rate": 4.701287543858173e-05, + "loss": 0.1976, + "step": 13538 + }, + { + "epoch": 0.24148325188171085, + "grad_norm": 0.28315097093582153, + "learning_rate": 4.70121375829107e-05, + "loss": 0.1812, + "step": 13539 + }, + { + "epoch": 0.24150108800342454, + "grad_norm": 0.27391791343688965, + "learning_rate": 4.701139964191302e-05, + "loss": 0.1991, + "step": 13540 + }, + { + "epoch": 0.24151892412513823, + "grad_norm": 0.3426937162876129, + "learning_rate": 4.7010661615591556e-05, + "loss": 0.1885, + "step": 13541 + }, + { + "epoch": 0.24153676024685192, + "grad_norm": 0.20778916776180267, + "learning_rate": 4.700992350394916e-05, + "loss": 0.2054, + "step": 13542 + }, + { + "epoch": 0.24155459636856563, + "grad_norm": 0.3177711069583893, + "learning_rate": 4.7009185306988704e-05, + "loss": 0.184, + "step": 13543 + }, + { + "epoch": 0.24157243249027932, + "grad_norm": 0.2847771942615509, + "learning_rate": 4.7008447024713044e-05, + "loss": 0.2483, + "step": 13544 + }, + { + "epoch": 0.241590268611993, + "grad_norm": 0.2583253085613251, + "learning_rate": 4.700770865712504e-05, + "loss": 0.1702, + "step": 13545 + }, + { + "epoch": 0.2416081047337067, + "grad_norm": 0.2858251631259918, + "learning_rate": 4.700697020422755e-05, + "loss": 0.1952, + "step": 13546 + }, + { + "epoch": 0.24162594085542038, + "grad_norm": 0.29717332124710083, + "learning_rate": 4.7006231666023445e-05, + "loss": 0.2201, + "step": 13547 + }, + { + "epoch": 0.2416437769771341, + "grad_norm": 0.28044217824935913, + "learning_rate": 4.700549304251559e-05, + "loss": 0.1922, + "step": 13548 + }, + { + "epoch": 0.2416616130988478, + "grad_norm": 0.2241005003452301, + "learning_rate": 4.7004754333706846e-05, + "loss": 0.1351, + "step": 13549 + }, + { + "epoch": 0.24167944922056148, + "grad_norm": 0.288309246301651, + "learning_rate": 4.700401553960007e-05, + "loss": 0.2256, + "step": 13550 + }, + { + "epoch": 0.24169728534227516, + "grad_norm": 0.3168530762195587, + "learning_rate": 4.7003276660198125e-05, + "loss": 0.1539, + "step": 13551 + }, + { + "epoch": 0.24171512146398888, + "grad_norm": 0.37794947624206543, + "learning_rate": 4.7002537695503887e-05, + "loss": 0.2087, + "step": 13552 + }, + { + "epoch": 0.24173295758570257, + "grad_norm": 0.27659884095191956, + "learning_rate": 4.700179864552021e-05, + "loss": 0.1749, + "step": 13553 + }, + { + "epoch": 0.24175079370741626, + "grad_norm": 0.3827894628047943, + "learning_rate": 4.7001059510249965e-05, + "loss": 0.2402, + "step": 13554 + }, + { + "epoch": 0.24176862982912994, + "grad_norm": 0.221628800034523, + "learning_rate": 4.7000320289696014e-05, + "loss": 0.1403, + "step": 13555 + }, + { + "epoch": 0.24178646595084366, + "grad_norm": 0.28615614771842957, + "learning_rate": 4.699958098386122e-05, + "loss": 0.1782, + "step": 13556 + }, + { + "epoch": 0.24180430207255735, + "grad_norm": 0.2050405740737915, + "learning_rate": 4.699884159274845e-05, + "loss": 0.1849, + "step": 13557 + }, + { + "epoch": 0.24182213819427104, + "grad_norm": 0.21709440648555756, + "learning_rate": 4.699810211636059e-05, + "loss": 0.1412, + "step": 13558 + }, + { + "epoch": 0.24183997431598472, + "grad_norm": 0.2735121250152588, + "learning_rate": 4.6997362554700465e-05, + "loss": 0.1911, + "step": 13559 + }, + { + "epoch": 0.24185781043769844, + "grad_norm": 0.29296016693115234, + "learning_rate": 4.699662290777098e-05, + "loss": 0.1577, + "step": 13560 + }, + { + "epoch": 0.24187564655941213, + "grad_norm": 0.3813689053058624, + "learning_rate": 4.699588317557498e-05, + "loss": 0.1932, + "step": 13561 + }, + { + "epoch": 0.24189348268112582, + "grad_norm": 0.24916289746761322, + "learning_rate": 4.6995143358115336e-05, + "loss": 0.184, + "step": 13562 + }, + { + "epoch": 0.2419113188028395, + "grad_norm": 0.3230383098125458, + "learning_rate": 4.6994403455394925e-05, + "loss": 0.1603, + "step": 13563 + }, + { + "epoch": 0.2419291549245532, + "grad_norm": 0.1946956068277359, + "learning_rate": 4.699366346741661e-05, + "loss": 0.1439, + "step": 13564 + }, + { + "epoch": 0.2419469910462669, + "grad_norm": 0.2937822937965393, + "learning_rate": 4.699292339418326e-05, + "loss": 0.1985, + "step": 13565 + }, + { + "epoch": 0.2419648271679806, + "grad_norm": 0.31975552439689636, + "learning_rate": 4.699218323569774e-05, + "loss": 0.2309, + "step": 13566 + }, + { + "epoch": 0.24198266328969428, + "grad_norm": 0.26577362418174744, + "learning_rate": 4.699144299196292e-05, + "loss": 0.1946, + "step": 13567 + }, + { + "epoch": 0.24200049941140797, + "grad_norm": 0.2582256495952606, + "learning_rate": 4.6990702662981676e-05, + "loss": 0.1907, + "step": 13568 + }, + { + "epoch": 0.2420183355331217, + "grad_norm": 0.34865236282348633, + "learning_rate": 4.698996224875687e-05, + "loss": 0.1794, + "step": 13569 + }, + { + "epoch": 0.24203617165483537, + "grad_norm": 0.3029707074165344, + "learning_rate": 4.698922174929138e-05, + "loss": 0.1558, + "step": 13570 + }, + { + "epoch": 0.24205400777654906, + "grad_norm": 0.2675391733646393, + "learning_rate": 4.6988481164588063e-05, + "loss": 0.1616, + "step": 13571 + }, + { + "epoch": 0.24207184389826275, + "grad_norm": 0.30130940675735474, + "learning_rate": 4.6987740494649806e-05, + "loss": 0.1947, + "step": 13572 + }, + { + "epoch": 0.24208968001997647, + "grad_norm": 0.22134047746658325, + "learning_rate": 4.698699973947947e-05, + "loss": 0.1947, + "step": 13573 + }, + { + "epoch": 0.24210751614169015, + "grad_norm": 0.24363790452480316, + "learning_rate": 4.698625889907993e-05, + "loss": 0.151, + "step": 13574 + }, + { + "epoch": 0.24212535226340384, + "grad_norm": 0.26265788078308105, + "learning_rate": 4.698551797345405e-05, + "loss": 0.1948, + "step": 13575 + }, + { + "epoch": 0.24214318838511753, + "grad_norm": 0.21863479912281036, + "learning_rate": 4.698477696260472e-05, + "loss": 0.1597, + "step": 13576 + }, + { + "epoch": 0.24216102450683125, + "grad_norm": 0.31729671359062195, + "learning_rate": 4.6984035866534795e-05, + "loss": 0.2143, + "step": 13577 + }, + { + "epoch": 0.24217886062854493, + "grad_norm": 0.27609825134277344, + "learning_rate": 4.698329468524715e-05, + "loss": 0.1699, + "step": 13578 + }, + { + "epoch": 0.24219669675025862, + "grad_norm": 0.24096006155014038, + "learning_rate": 4.698255341874467e-05, + "loss": 0.1863, + "step": 13579 + }, + { + "epoch": 0.2422145328719723, + "grad_norm": 0.3234200179576874, + "learning_rate": 4.698181206703022e-05, + "loss": 0.1626, + "step": 13580 + }, + { + "epoch": 0.24223236899368603, + "grad_norm": 0.2748022973537445, + "learning_rate": 4.698107063010667e-05, + "loss": 0.198, + "step": 13581 + }, + { + "epoch": 0.2422502051153997, + "grad_norm": 0.2667080760002136, + "learning_rate": 4.6980329107976895e-05, + "loss": 0.1745, + "step": 13582 + }, + { + "epoch": 0.2422680412371134, + "grad_norm": 0.29853636026382446, + "learning_rate": 4.697958750064378e-05, + "loss": 0.2101, + "step": 13583 + }, + { + "epoch": 0.2422858773588271, + "grad_norm": 0.3241555392742157, + "learning_rate": 4.697884580811019e-05, + "loss": 0.1543, + "step": 13584 + }, + { + "epoch": 0.24230371348054078, + "grad_norm": 0.2748297452926636, + "learning_rate": 4.6978104030379e-05, + "loss": 0.2242, + "step": 13585 + }, + { + "epoch": 0.2423215496022545, + "grad_norm": 0.26102542877197266, + "learning_rate": 4.6977362167453085e-05, + "loss": 0.1627, + "step": 13586 + }, + { + "epoch": 0.24233938572396818, + "grad_norm": 0.32901981472969055, + "learning_rate": 4.6976620219335334e-05, + "loss": 0.151, + "step": 13587 + }, + { + "epoch": 0.24235722184568187, + "grad_norm": 0.31791335344314575, + "learning_rate": 4.6975878186028607e-05, + "loss": 0.2245, + "step": 13588 + }, + { + "epoch": 0.24237505796739556, + "grad_norm": 0.35270991921424866, + "learning_rate": 4.697513606753578e-05, + "loss": 0.178, + "step": 13589 + }, + { + "epoch": 0.24239289408910927, + "grad_norm": 0.2793395221233368, + "learning_rate": 4.697439386385975e-05, + "loss": 0.2045, + "step": 13590 + }, + { + "epoch": 0.24241073021082296, + "grad_norm": 0.21825000643730164, + "learning_rate": 4.697365157500336e-05, + "loss": 0.1787, + "step": 13591 + }, + { + "epoch": 0.24242856633253665, + "grad_norm": 0.33723846077919006, + "learning_rate": 4.697290920096952e-05, + "loss": 0.22, + "step": 13592 + }, + { + "epoch": 0.24244640245425034, + "grad_norm": 0.24851396679878235, + "learning_rate": 4.6972166741761095e-05, + "loss": 0.199, + "step": 13593 + }, + { + "epoch": 0.24246423857596405, + "grad_norm": 0.2603556215763092, + "learning_rate": 4.6971424197380965e-05, + "loss": 0.166, + "step": 13594 + }, + { + "epoch": 0.24248207469767774, + "grad_norm": 0.25877392292022705, + "learning_rate": 4.6970681567832e-05, + "loss": 0.2086, + "step": 13595 + }, + { + "epoch": 0.24249991081939143, + "grad_norm": 0.26149800419807434, + "learning_rate": 4.6969938853117086e-05, + "loss": 0.2017, + "step": 13596 + }, + { + "epoch": 0.24251774694110512, + "grad_norm": 0.3259674608707428, + "learning_rate": 4.6969196053239104e-05, + "loss": 0.1815, + "step": 13597 + }, + { + "epoch": 0.24253558306281883, + "grad_norm": 0.2884747087955475, + "learning_rate": 4.6968453168200924e-05, + "loss": 0.201, + "step": 13598 + }, + { + "epoch": 0.24255341918453252, + "grad_norm": 0.29727765917778015, + "learning_rate": 4.696771019800543e-05, + "loss": 0.1526, + "step": 13599 + }, + { + "epoch": 0.2425712553062462, + "grad_norm": 0.3936905860900879, + "learning_rate": 4.6966967142655516e-05, + "loss": 0.2052, + "step": 13600 + }, + { + "epoch": 0.2425890914279599, + "grad_norm": 0.3130809962749481, + "learning_rate": 4.696622400215404e-05, + "loss": 0.1494, + "step": 13601 + }, + { + "epoch": 0.2426069275496736, + "grad_norm": 0.3812466263771057, + "learning_rate": 4.6965480776503897e-05, + "loss": 0.1655, + "step": 13602 + }, + { + "epoch": 0.2426247636713873, + "grad_norm": 0.21273046731948853, + "learning_rate": 4.6964737465707966e-05, + "loss": 0.1838, + "step": 13603 + }, + { + "epoch": 0.242642599793101, + "grad_norm": 0.2797996997833252, + "learning_rate": 4.696399406976912e-05, + "loss": 0.1663, + "step": 13604 + }, + { + "epoch": 0.24266043591481468, + "grad_norm": 0.35725197196006775, + "learning_rate": 4.696325058869025e-05, + "loss": 0.2026, + "step": 13605 + }, + { + "epoch": 0.24267827203652836, + "grad_norm": 0.28045418858528137, + "learning_rate": 4.696250702247423e-05, + "loss": 0.2055, + "step": 13606 + }, + { + "epoch": 0.24269610815824208, + "grad_norm": 0.2977827191352844, + "learning_rate": 4.6961763371123956e-05, + "loss": 0.1798, + "step": 13607 + }, + { + "epoch": 0.24271394427995577, + "grad_norm": 0.3112426996231079, + "learning_rate": 4.69610196346423e-05, + "loss": 0.2341, + "step": 13608 + }, + { + "epoch": 0.24273178040166946, + "grad_norm": 0.3510107696056366, + "learning_rate": 4.696027581303215e-05, + "loss": 0.2423, + "step": 13609 + }, + { + "epoch": 0.24274961652338314, + "grad_norm": 0.3482983112335205, + "learning_rate": 4.6959531906296375e-05, + "loss": 0.2287, + "step": 13610 + }, + { + "epoch": 0.24276745264509686, + "grad_norm": 0.43641147017478943, + "learning_rate": 4.695878791443788e-05, + "loss": 0.2492, + "step": 13611 + }, + { + "epoch": 0.24278528876681055, + "grad_norm": 0.24481116235256195, + "learning_rate": 4.695804383745953e-05, + "loss": 0.1702, + "step": 13612 + }, + { + "epoch": 0.24280312488852424, + "grad_norm": 0.3286797106266022, + "learning_rate": 4.695729967536422e-05, + "loss": 0.1842, + "step": 13613 + }, + { + "epoch": 0.24282096101023792, + "grad_norm": 0.5271211862564087, + "learning_rate": 4.6956555428154833e-05, + "loss": 0.1755, + "step": 13614 + }, + { + "epoch": 0.24283879713195164, + "grad_norm": 0.39401528239250183, + "learning_rate": 4.6955811095834255e-05, + "loss": 0.2669, + "step": 13615 + }, + { + "epoch": 0.24285663325366533, + "grad_norm": 0.3487893044948578, + "learning_rate": 4.695506667840537e-05, + "loss": 0.2894, + "step": 13616 + }, + { + "epoch": 0.24287446937537902, + "grad_norm": 0.2682149410247803, + "learning_rate": 4.695432217587107e-05, + "loss": 0.2305, + "step": 13617 + }, + { + "epoch": 0.2428923054970927, + "grad_norm": 0.23941993713378906, + "learning_rate": 4.695357758823423e-05, + "loss": 0.2008, + "step": 13618 + }, + { + "epoch": 0.24291014161880642, + "grad_norm": 0.3209134638309479, + "learning_rate": 4.6952832915497736e-05, + "loss": 0.1904, + "step": 13619 + }, + { + "epoch": 0.2429279777405201, + "grad_norm": 0.36643916368484497, + "learning_rate": 4.695208815766448e-05, + "loss": 0.2104, + "step": 13620 + }, + { + "epoch": 0.2429458138622338, + "grad_norm": 0.24296219646930695, + "learning_rate": 4.695134331473735e-05, + "loss": 0.192, + "step": 13621 + }, + { + "epoch": 0.24296364998394748, + "grad_norm": 0.24099262058734894, + "learning_rate": 4.695059838671923e-05, + "loss": 0.1731, + "step": 13622 + }, + { + "epoch": 0.2429814861056612, + "grad_norm": 0.30778101086616516, + "learning_rate": 4.694985337361302e-05, + "loss": 0.1963, + "step": 13623 + }, + { + "epoch": 0.2429993222273749, + "grad_norm": 0.21597592532634735, + "learning_rate": 4.694910827542158e-05, + "loss": 0.1422, + "step": 13624 + }, + { + "epoch": 0.24301715834908857, + "grad_norm": 0.27674752473831177, + "learning_rate": 4.694836309214783e-05, + "loss": 0.2272, + "step": 13625 + }, + { + "epoch": 0.24303499447080226, + "grad_norm": 0.29003921151161194, + "learning_rate": 4.6947617823794636e-05, + "loss": 0.1958, + "step": 13626 + }, + { + "epoch": 0.24305283059251595, + "grad_norm": 0.2573041021823883, + "learning_rate": 4.69468724703649e-05, + "loss": 0.159, + "step": 13627 + }, + { + "epoch": 0.24307066671422967, + "grad_norm": 0.3151061236858368, + "learning_rate": 4.69461270318615e-05, + "loss": 0.1654, + "step": 13628 + }, + { + "epoch": 0.24308850283594335, + "grad_norm": 0.3935214877128601, + "learning_rate": 4.6945381508287335e-05, + "loss": 0.1914, + "step": 13629 + }, + { + "epoch": 0.24310633895765704, + "grad_norm": 0.26115089654922485, + "learning_rate": 4.6944635899645294e-05, + "loss": 0.1855, + "step": 13630 + }, + { + "epoch": 0.24312417507937073, + "grad_norm": 0.33393970131874084, + "learning_rate": 4.6943890205938255e-05, + "loss": 0.214, + "step": 13631 + }, + { + "epoch": 0.24314201120108445, + "grad_norm": 0.21798068284988403, + "learning_rate": 4.6943144427169125e-05, + "loss": 0.1698, + "step": 13632 + }, + { + "epoch": 0.24315984732279813, + "grad_norm": 0.33182114362716675, + "learning_rate": 4.69423985633408e-05, + "loss": 0.236, + "step": 13633 + }, + { + "epoch": 0.24317768344451182, + "grad_norm": 0.22838178277015686, + "learning_rate": 4.6941652614456145e-05, + "loss": 0.1958, + "step": 13634 + }, + { + "epoch": 0.2431955195662255, + "grad_norm": 0.3527720868587494, + "learning_rate": 4.694090658051806e-05, + "loss": 0.1748, + "step": 13635 + }, + { + "epoch": 0.24321335568793923, + "grad_norm": 0.22157545387744904, + "learning_rate": 4.694016046152946e-05, + "loss": 0.124, + "step": 13636 + }, + { + "epoch": 0.2432311918096529, + "grad_norm": 0.29315274953842163, + "learning_rate": 4.693941425749321e-05, + "loss": 0.2178, + "step": 13637 + }, + { + "epoch": 0.2432490279313666, + "grad_norm": 0.26105692982673645, + "learning_rate": 4.693866796841222e-05, + "loss": 0.2202, + "step": 13638 + }, + { + "epoch": 0.2432668640530803, + "grad_norm": 0.2692136764526367, + "learning_rate": 4.693792159428937e-05, + "loss": 0.202, + "step": 13639 + }, + { + "epoch": 0.243284700174794, + "grad_norm": 0.24614101648330688, + "learning_rate": 4.693717513512755e-05, + "loss": 0.1453, + "step": 13640 + }, + { + "epoch": 0.2433025362965077, + "grad_norm": 0.22633449733257294, + "learning_rate": 4.693642859092968e-05, + "loss": 0.1601, + "step": 13641 + }, + { + "epoch": 0.24332037241822138, + "grad_norm": 0.2577258050441742, + "learning_rate": 4.693568196169862e-05, + "loss": 0.2034, + "step": 13642 + }, + { + "epoch": 0.24333820853993507, + "grad_norm": 0.41488316655158997, + "learning_rate": 4.69349352474373e-05, + "loss": 0.1547, + "step": 13643 + }, + { + "epoch": 0.24335604466164878, + "grad_norm": 0.22476181387901306, + "learning_rate": 4.6934188448148574e-05, + "loss": 0.1864, + "step": 13644 + }, + { + "epoch": 0.24337388078336247, + "grad_norm": 0.38418251276016235, + "learning_rate": 4.693344156383537e-05, + "loss": 0.2008, + "step": 13645 + }, + { + "epoch": 0.24339171690507616, + "grad_norm": 0.32155823707580566, + "learning_rate": 4.693269459450057e-05, + "loss": 0.1858, + "step": 13646 + }, + { + "epoch": 0.24340955302678985, + "grad_norm": 0.29565367102622986, + "learning_rate": 4.693194754014707e-05, + "loss": 0.2004, + "step": 13647 + }, + { + "epoch": 0.24342738914850354, + "grad_norm": 0.341865211725235, + "learning_rate": 4.693120040077776e-05, + "loss": 0.182, + "step": 13648 + }, + { + "epoch": 0.24344522527021725, + "grad_norm": 0.28684622049331665, + "learning_rate": 4.693045317639555e-05, + "loss": 0.1776, + "step": 13649 + }, + { + "epoch": 0.24346306139193094, + "grad_norm": 0.36310359835624695, + "learning_rate": 4.692970586700333e-05, + "loss": 0.1937, + "step": 13650 + }, + { + "epoch": 0.24348089751364463, + "grad_norm": 0.3165016770362854, + "learning_rate": 4.692895847260399e-05, + "loss": 0.2052, + "step": 13651 + }, + { + "epoch": 0.24349873363535832, + "grad_norm": 0.31170836091041565, + "learning_rate": 4.6928210993200425e-05, + "loss": 0.2146, + "step": 13652 + }, + { + "epoch": 0.24351656975707203, + "grad_norm": 0.2886374592781067, + "learning_rate": 4.692746342879556e-05, + "loss": 0.2198, + "step": 13653 + }, + { + "epoch": 0.24353440587878572, + "grad_norm": 0.2144119143486023, + "learning_rate": 4.6926715779392264e-05, + "loss": 0.1618, + "step": 13654 + }, + { + "epoch": 0.2435522420004994, + "grad_norm": 0.29546231031417847, + "learning_rate": 4.692596804499344e-05, + "loss": 0.2019, + "step": 13655 + }, + { + "epoch": 0.2435700781222131, + "grad_norm": 0.27046895027160645, + "learning_rate": 4.6925220225602e-05, + "loss": 0.1928, + "step": 13656 + }, + { + "epoch": 0.2435879142439268, + "grad_norm": 0.2773810923099518, + "learning_rate": 4.6924472321220824e-05, + "loss": 0.1639, + "step": 13657 + }, + { + "epoch": 0.2436057503656405, + "grad_norm": 0.17827634513378143, + "learning_rate": 4.692372433185282e-05, + "loss": 0.1166, + "step": 13658 + }, + { + "epoch": 0.2436235864873542, + "grad_norm": 0.30844777822494507, + "learning_rate": 4.6922976257500895e-05, + "loss": 0.1979, + "step": 13659 + }, + { + "epoch": 0.24364142260906788, + "grad_norm": 0.29913416504859924, + "learning_rate": 4.692222809816794e-05, + "loss": 0.1864, + "step": 13660 + }, + { + "epoch": 0.2436592587307816, + "grad_norm": 0.32545220851898193, + "learning_rate": 4.692147985385686e-05, + "loss": 0.2756, + "step": 13661 + }, + { + "epoch": 0.24367709485249528, + "grad_norm": 0.3084665834903717, + "learning_rate": 4.692073152457055e-05, + "loss": 0.2001, + "step": 13662 + }, + { + "epoch": 0.24369493097420897, + "grad_norm": 0.2230866402387619, + "learning_rate": 4.691998311031192e-05, + "loss": 0.1805, + "step": 13663 + }, + { + "epoch": 0.24371276709592266, + "grad_norm": 0.29065218567848206, + "learning_rate": 4.691923461108385e-05, + "loss": 0.1487, + "step": 13664 + }, + { + "epoch": 0.24373060321763634, + "grad_norm": 0.3638111352920532, + "learning_rate": 4.691848602688926e-05, + "loss": 0.2004, + "step": 13665 + }, + { + "epoch": 0.24374843933935006, + "grad_norm": 0.48927006125450134, + "learning_rate": 4.6917737357731064e-05, + "loss": 0.1687, + "step": 13666 + }, + { + "epoch": 0.24376627546106375, + "grad_norm": 0.3269551396369934, + "learning_rate": 4.6916988603612136e-05, + "loss": 0.1619, + "step": 13667 + }, + { + "epoch": 0.24378411158277744, + "grad_norm": 0.2890358865261078, + "learning_rate": 4.6916239764535384e-05, + "loss": 0.1635, + "step": 13668 + }, + { + "epoch": 0.24380194770449112, + "grad_norm": 0.3172226846218109, + "learning_rate": 4.691549084050372e-05, + "loss": 0.1855, + "step": 13669 + }, + { + "epoch": 0.24381978382620484, + "grad_norm": 0.32188624143600464, + "learning_rate": 4.6914741831520046e-05, + "loss": 0.2119, + "step": 13670 + }, + { + "epoch": 0.24383761994791853, + "grad_norm": 0.4289701282978058, + "learning_rate": 4.691399273758727e-05, + "loss": 0.2465, + "step": 13671 + }, + { + "epoch": 0.24385545606963221, + "grad_norm": 0.3131117522716522, + "learning_rate": 4.6913243558708286e-05, + "loss": 0.1885, + "step": 13672 + }, + { + "epoch": 0.2438732921913459, + "grad_norm": 0.20149429142475128, + "learning_rate": 4.6912494294886e-05, + "loss": 0.1781, + "step": 13673 + }, + { + "epoch": 0.24389112831305962, + "grad_norm": 0.2202911376953125, + "learning_rate": 4.6911744946123314e-05, + "loss": 0.1956, + "step": 13674 + }, + { + "epoch": 0.2439089644347733, + "grad_norm": 0.21662744879722595, + "learning_rate": 4.691099551242314e-05, + "loss": 0.1519, + "step": 13675 + }, + { + "epoch": 0.243926800556487, + "grad_norm": 0.3102678954601288, + "learning_rate": 4.6910245993788385e-05, + "loss": 0.2248, + "step": 13676 + }, + { + "epoch": 0.24394463667820068, + "grad_norm": 0.2462979555130005, + "learning_rate": 4.6909496390221944e-05, + "loss": 0.1847, + "step": 13677 + }, + { + "epoch": 0.2439624727999144, + "grad_norm": 0.2958288788795471, + "learning_rate": 4.690874670172672e-05, + "loss": 0.1552, + "step": 13678 + }, + { + "epoch": 0.2439803089216281, + "grad_norm": 0.30526313185691833, + "learning_rate": 4.690799692830564e-05, + "loss": 0.1745, + "step": 13679 + }, + { + "epoch": 0.24399814504334177, + "grad_norm": 0.32857024669647217, + "learning_rate": 4.690724706996159e-05, + "loss": 0.2149, + "step": 13680 + }, + { + "epoch": 0.24401598116505546, + "grad_norm": 0.2644297778606415, + "learning_rate": 4.690649712669748e-05, + "loss": 0.1757, + "step": 13681 + }, + { + "epoch": 0.24403381728676918, + "grad_norm": 0.2937260568141937, + "learning_rate": 4.690574709851623e-05, + "loss": 0.1948, + "step": 13682 + }, + { + "epoch": 0.24405165340848287, + "grad_norm": 0.22789674997329712, + "learning_rate": 4.690499698542074e-05, + "loss": 0.2055, + "step": 13683 + }, + { + "epoch": 0.24406948953019655, + "grad_norm": 0.31945011019706726, + "learning_rate": 4.69042467874139e-05, + "loss": 0.182, + "step": 13684 + }, + { + "epoch": 0.24408732565191024, + "grad_norm": 0.19396725296974182, + "learning_rate": 4.690349650449864e-05, + "loss": 0.15, + "step": 13685 + }, + { + "epoch": 0.24410516177362393, + "grad_norm": 0.429905503988266, + "learning_rate": 4.690274613667787e-05, + "loss": 0.2581, + "step": 13686 + }, + { + "epoch": 0.24412299789533765, + "grad_norm": 0.21720905601978302, + "learning_rate": 4.690199568395449e-05, + "loss": 0.1727, + "step": 13687 + }, + { + "epoch": 0.24414083401705133, + "grad_norm": 0.2965554893016815, + "learning_rate": 4.690124514633141e-05, + "loss": 0.2113, + "step": 13688 + }, + { + "epoch": 0.24415867013876502, + "grad_norm": 0.24341322481632233, + "learning_rate": 4.690049452381153e-05, + "loss": 0.1892, + "step": 13689 + }, + { + "epoch": 0.2441765062604787, + "grad_norm": 0.2477460503578186, + "learning_rate": 4.689974381639778e-05, + "loss": 0.1516, + "step": 13690 + }, + { + "epoch": 0.24419434238219243, + "grad_norm": 0.39323747158050537, + "learning_rate": 4.689899302409305e-05, + "loss": 0.1356, + "step": 13691 + }, + { + "epoch": 0.2442121785039061, + "grad_norm": 0.31786781549453735, + "learning_rate": 4.6898242146900266e-05, + "loss": 0.2181, + "step": 13692 + }, + { + "epoch": 0.2442300146256198, + "grad_norm": 0.29771488904953003, + "learning_rate": 4.689749118482233e-05, + "loss": 0.2113, + "step": 13693 + }, + { + "epoch": 0.2442478507473335, + "grad_norm": 0.21941302716732025, + "learning_rate": 4.689674013786216e-05, + "loss": 0.1809, + "step": 13694 + }, + { + "epoch": 0.2442656868690472, + "grad_norm": 0.2876521944999695, + "learning_rate": 4.689598900602266e-05, + "loss": 0.1833, + "step": 13695 + }, + { + "epoch": 0.2442835229907609, + "grad_norm": 0.21469078958034515, + "learning_rate": 4.6895237789306736e-05, + "loss": 0.156, + "step": 13696 + }, + { + "epoch": 0.24430135911247458, + "grad_norm": 0.26704058051109314, + "learning_rate": 4.6894486487717314e-05, + "loss": 0.2246, + "step": 13697 + }, + { + "epoch": 0.24431919523418827, + "grad_norm": 0.2957056164741516, + "learning_rate": 4.68937351012573e-05, + "loss": 0.1857, + "step": 13698 + }, + { + "epoch": 0.24433703135590198, + "grad_norm": 0.31056469678878784, + "learning_rate": 4.689298362992961e-05, + "loss": 0.1978, + "step": 13699 + }, + { + "epoch": 0.24435486747761567, + "grad_norm": 0.2240130454301834, + "learning_rate": 4.6892232073737144e-05, + "loss": 0.1579, + "step": 13700 + }, + { + "epoch": 0.24437270359932936, + "grad_norm": 0.25795117020606995, + "learning_rate": 4.6891480432682836e-05, + "loss": 0.2125, + "step": 13701 + }, + { + "epoch": 0.24439053972104305, + "grad_norm": 0.22394831478595734, + "learning_rate": 4.689072870676958e-05, + "loss": 0.1604, + "step": 13702 + }, + { + "epoch": 0.24440837584275676, + "grad_norm": 0.2760259211063385, + "learning_rate": 4.688997689600031e-05, + "loss": 0.1932, + "step": 13703 + }, + { + "epoch": 0.24442621196447045, + "grad_norm": 0.21913674473762512, + "learning_rate": 4.688922500037792e-05, + "loss": 0.1583, + "step": 13704 + }, + { + "epoch": 0.24444404808618414, + "grad_norm": 0.2677614986896515, + "learning_rate": 4.688847301990533e-05, + "loss": 0.1855, + "step": 13705 + }, + { + "epoch": 0.24446188420789783, + "grad_norm": 0.29969218373298645, + "learning_rate": 4.688772095458547e-05, + "loss": 0.2287, + "step": 13706 + }, + { + "epoch": 0.24447972032961152, + "grad_norm": 0.3517477512359619, + "learning_rate": 4.688696880442124e-05, + "loss": 0.1335, + "step": 13707 + }, + { + "epoch": 0.24449755645132523, + "grad_norm": 0.3408452272415161, + "learning_rate": 4.688621656941555e-05, + "loss": 0.1944, + "step": 13708 + }, + { + "epoch": 0.24451539257303892, + "grad_norm": 0.2209720015525818, + "learning_rate": 4.688546424957133e-05, + "loss": 0.1603, + "step": 13709 + }, + { + "epoch": 0.2445332286947526, + "grad_norm": 0.45468583703041077, + "learning_rate": 4.68847118448915e-05, + "loss": 0.2845, + "step": 13710 + }, + { + "epoch": 0.2445510648164663, + "grad_norm": 0.2596586048603058, + "learning_rate": 4.6883959355378956e-05, + "loss": 0.2129, + "step": 13711 + }, + { + "epoch": 0.24456890093818, + "grad_norm": 0.21649125218391418, + "learning_rate": 4.6883206781036634e-05, + "loss": 0.1755, + "step": 13712 + }, + { + "epoch": 0.2445867370598937, + "grad_norm": 0.28443682193756104, + "learning_rate": 4.6882454121867446e-05, + "loss": 0.2265, + "step": 13713 + }, + { + "epoch": 0.2446045731816074, + "grad_norm": 0.25232720375061035, + "learning_rate": 4.688170137787431e-05, + "loss": 0.1328, + "step": 13714 + }, + { + "epoch": 0.24462240930332108, + "grad_norm": 0.31689730286598206, + "learning_rate": 4.688094854906013e-05, + "loss": 0.1763, + "step": 13715 + }, + { + "epoch": 0.2446402454250348, + "grad_norm": 0.3855779767036438, + "learning_rate": 4.6880195635427846e-05, + "loss": 0.1935, + "step": 13716 + }, + { + "epoch": 0.24465808154674848, + "grad_norm": 0.3011952042579651, + "learning_rate": 4.687944263698037e-05, + "loss": 0.2933, + "step": 13717 + }, + { + "epoch": 0.24467591766846217, + "grad_norm": 0.24400511384010315, + "learning_rate": 4.687868955372061e-05, + "loss": 0.1732, + "step": 13718 + }, + { + "epoch": 0.24469375379017586, + "grad_norm": 0.25264862179756165, + "learning_rate": 4.687793638565149e-05, + "loss": 0.2059, + "step": 13719 + }, + { + "epoch": 0.24471158991188957, + "grad_norm": 0.2522881031036377, + "learning_rate": 4.687718313277594e-05, + "loss": 0.1687, + "step": 13720 + }, + { + "epoch": 0.24472942603360326, + "grad_norm": 0.32159167528152466, + "learning_rate": 4.687642979509687e-05, + "loss": 0.1804, + "step": 13721 + }, + { + "epoch": 0.24474726215531695, + "grad_norm": 0.24564985930919647, + "learning_rate": 4.68756763726172e-05, + "loss": 0.1819, + "step": 13722 + }, + { + "epoch": 0.24476509827703063, + "grad_norm": 0.25257232785224915, + "learning_rate": 4.687492286533985e-05, + "loss": 0.1674, + "step": 13723 + }, + { + "epoch": 0.24478293439874435, + "grad_norm": 0.27753975987434387, + "learning_rate": 4.687416927326775e-05, + "loss": 0.1861, + "step": 13724 + }, + { + "epoch": 0.24480077052045804, + "grad_norm": 0.4845978915691376, + "learning_rate": 4.687341559640381e-05, + "loss": 0.1874, + "step": 13725 + }, + { + "epoch": 0.24481860664217173, + "grad_norm": 0.24164879322052002, + "learning_rate": 4.687266183475096e-05, + "loss": 0.2065, + "step": 13726 + }, + { + "epoch": 0.24483644276388541, + "grad_norm": 0.2179027944803238, + "learning_rate": 4.6871907988312114e-05, + "loss": 0.1812, + "step": 13727 + }, + { + "epoch": 0.2448542788855991, + "grad_norm": 0.31235405802726746, + "learning_rate": 4.6871154057090204e-05, + "loss": 0.2161, + "step": 13728 + }, + { + "epoch": 0.24487211500731282, + "grad_norm": 0.31012001633644104, + "learning_rate": 4.6870400041088136e-05, + "loss": 0.2249, + "step": 13729 + }, + { + "epoch": 0.2448899511290265, + "grad_norm": 0.2521165609359741, + "learning_rate": 4.686964594030885e-05, + "loss": 0.1872, + "step": 13730 + }, + { + "epoch": 0.2449077872507402, + "grad_norm": 0.2586023509502411, + "learning_rate": 4.686889175475527e-05, + "loss": 0.1946, + "step": 13731 + }, + { + "epoch": 0.24492562337245388, + "grad_norm": 0.2127128690481186, + "learning_rate": 4.6868137484430307e-05, + "loss": 0.1601, + "step": 13732 + }, + { + "epoch": 0.2449434594941676, + "grad_norm": 0.2974259555339813, + "learning_rate": 4.686738312933688e-05, + "loss": 0.182, + "step": 13733 + }, + { + "epoch": 0.24496129561588129, + "grad_norm": 0.31503793597221375, + "learning_rate": 4.686662868947794e-05, + "loss": 0.2267, + "step": 13734 + }, + { + "epoch": 0.24497913173759497, + "grad_norm": 0.2340630441904068, + "learning_rate": 4.686587416485638e-05, + "loss": 0.1891, + "step": 13735 + }, + { + "epoch": 0.24499696785930866, + "grad_norm": 0.4805942177772522, + "learning_rate": 4.686511955547515e-05, + "loss": 0.1731, + "step": 13736 + }, + { + "epoch": 0.24501480398102238, + "grad_norm": 0.21227367222309113, + "learning_rate": 4.6864364861337165e-05, + "loss": 0.1764, + "step": 13737 + }, + { + "epoch": 0.24503264010273607, + "grad_norm": 0.26857635378837585, + "learning_rate": 4.686361008244534e-05, + "loss": 0.1785, + "step": 13738 + }, + { + "epoch": 0.24505047622444975, + "grad_norm": 0.3060723543167114, + "learning_rate": 4.686285521880263e-05, + "loss": 0.1383, + "step": 13739 + }, + { + "epoch": 0.24506831234616344, + "grad_norm": 0.2117903083562851, + "learning_rate": 4.686210027041192e-05, + "loss": 0.1427, + "step": 13740 + }, + { + "epoch": 0.24508614846787716, + "grad_norm": 0.2373329997062683, + "learning_rate": 4.686134523727617e-05, + "loss": 0.1934, + "step": 13741 + }, + { + "epoch": 0.24510398458959085, + "grad_norm": 0.3884005844593048, + "learning_rate": 4.686059011939829e-05, + "loss": 0.225, + "step": 13742 + }, + { + "epoch": 0.24512182071130453, + "grad_norm": 0.2836616635322571, + "learning_rate": 4.685983491678122e-05, + "loss": 0.1636, + "step": 13743 + }, + { + "epoch": 0.24513965683301822, + "grad_norm": 0.24956491589546204, + "learning_rate": 4.685907962942787e-05, + "loss": 0.1826, + "step": 13744 + }, + { + "epoch": 0.2451574929547319, + "grad_norm": 0.2174319326877594, + "learning_rate": 4.685832425734118e-05, + "loss": 0.1401, + "step": 13745 + }, + { + "epoch": 0.24517532907644563, + "grad_norm": 0.35470327734947205, + "learning_rate": 4.6857568800524085e-05, + "loss": 0.2204, + "step": 13746 + }, + { + "epoch": 0.2451931651981593, + "grad_norm": 0.2872637212276459, + "learning_rate": 4.6856813258979494e-05, + "loss": 0.1851, + "step": 13747 + }, + { + "epoch": 0.245211001319873, + "grad_norm": 0.32511037588119507, + "learning_rate": 4.685605763271035e-05, + "loss": 0.2241, + "step": 13748 + }, + { + "epoch": 0.2452288374415867, + "grad_norm": 0.34901243448257446, + "learning_rate": 4.685530192171958e-05, + "loss": 0.1742, + "step": 13749 + }, + { + "epoch": 0.2452466735633004, + "grad_norm": 0.23517319560050964, + "learning_rate": 4.68545461260101e-05, + "loss": 0.1704, + "step": 13750 + }, + { + "epoch": 0.2452645096850141, + "grad_norm": 0.2833958864212036, + "learning_rate": 4.685379024558486e-05, + "loss": 0.1947, + "step": 13751 + }, + { + "epoch": 0.24528234580672778, + "grad_norm": 0.3658854365348816, + "learning_rate": 4.685303428044678e-05, + "loss": 0.2113, + "step": 13752 + }, + { + "epoch": 0.24530018192844147, + "grad_norm": 0.3165271282196045, + "learning_rate": 4.685227823059879e-05, + "loss": 0.2736, + "step": 13753 + }, + { + "epoch": 0.24531801805015518, + "grad_norm": 0.22832855582237244, + "learning_rate": 4.685152209604382e-05, + "loss": 0.1839, + "step": 13754 + }, + { + "epoch": 0.24533585417186887, + "grad_norm": 0.20947571098804474, + "learning_rate": 4.685076587678481e-05, + "loss": 0.2235, + "step": 13755 + }, + { + "epoch": 0.24535369029358256, + "grad_norm": 0.3125641644001007, + "learning_rate": 4.685000957282468e-05, + "loss": 0.2254, + "step": 13756 + }, + { + "epoch": 0.24537152641529625, + "grad_norm": 0.2678644061088562, + "learning_rate": 4.6849253184166366e-05, + "loss": 0.2227, + "step": 13757 + }, + { + "epoch": 0.24538936253700996, + "grad_norm": 0.20531684160232544, + "learning_rate": 4.68484967108128e-05, + "loss": 0.177, + "step": 13758 + }, + { + "epoch": 0.24540719865872365, + "grad_norm": 0.288449764251709, + "learning_rate": 4.684774015276692e-05, + "loss": 0.1627, + "step": 13759 + }, + { + "epoch": 0.24542503478043734, + "grad_norm": 0.2778984606266022, + "learning_rate": 4.684698351003164e-05, + "loss": 0.196, + "step": 13760 + }, + { + "epoch": 0.24544287090215103, + "grad_norm": 0.22770462930202484, + "learning_rate": 4.6846226782609915e-05, + "loss": 0.174, + "step": 13761 + }, + { + "epoch": 0.24546070702386474, + "grad_norm": 0.3416846990585327, + "learning_rate": 4.6845469970504675e-05, + "loss": 0.1793, + "step": 13762 + }, + { + "epoch": 0.24547854314557843, + "grad_norm": 0.2990380525588989, + "learning_rate": 4.684471307371884e-05, + "loss": 0.1673, + "step": 13763 + }, + { + "epoch": 0.24549637926729212, + "grad_norm": 0.32841476798057556, + "learning_rate": 4.6843956092255346e-05, + "loss": 0.1477, + "step": 13764 + }, + { + "epoch": 0.2455142153890058, + "grad_norm": 0.2731004059314728, + "learning_rate": 4.6843199026117146e-05, + "loss": 0.182, + "step": 13765 + }, + { + "epoch": 0.2455320515107195, + "grad_norm": 0.2715333104133606, + "learning_rate": 4.684244187530716e-05, + "loss": 0.1863, + "step": 13766 + }, + { + "epoch": 0.2455498876324332, + "grad_norm": 0.2028142809867859, + "learning_rate": 4.684168463982832e-05, + "loss": 0.1641, + "step": 13767 + }, + { + "epoch": 0.2455677237541469, + "grad_norm": 0.23987728357315063, + "learning_rate": 4.684092731968357e-05, + "loss": 0.1654, + "step": 13768 + }, + { + "epoch": 0.2455855598758606, + "grad_norm": 0.23922580480575562, + "learning_rate": 4.684016991487584e-05, + "loss": 0.198, + "step": 13769 + }, + { + "epoch": 0.24560339599757428, + "grad_norm": 0.2323124259710312, + "learning_rate": 4.683941242540807e-05, + "loss": 0.142, + "step": 13770 + }, + { + "epoch": 0.245621232119288, + "grad_norm": 0.2378569096326828, + "learning_rate": 4.68386548512832e-05, + "loss": 0.2355, + "step": 13771 + }, + { + "epoch": 0.24563906824100168, + "grad_norm": 0.23443584144115448, + "learning_rate": 4.6837897192504154e-05, + "loss": 0.2043, + "step": 13772 + }, + { + "epoch": 0.24565690436271537, + "grad_norm": 0.2111978828907013, + "learning_rate": 4.6837139449073876e-05, + "loss": 0.1934, + "step": 13773 + }, + { + "epoch": 0.24567474048442905, + "grad_norm": 0.3395061790943146, + "learning_rate": 4.6836381620995306e-05, + "loss": 0.2197, + "step": 13774 + }, + { + "epoch": 0.24569257660614277, + "grad_norm": 0.2784058153629303, + "learning_rate": 4.683562370827138e-05, + "loss": 0.2545, + "step": 13775 + }, + { + "epoch": 0.24571041272785646, + "grad_norm": 0.291057288646698, + "learning_rate": 4.683486571090503e-05, + "loss": 0.2525, + "step": 13776 + }, + { + "epoch": 0.24572824884957015, + "grad_norm": 0.3297308385372162, + "learning_rate": 4.68341076288992e-05, + "loss": 0.1446, + "step": 13777 + }, + { + "epoch": 0.24574608497128383, + "grad_norm": 0.3100256621837616, + "learning_rate": 4.6833349462256825e-05, + "loss": 0.1808, + "step": 13778 + }, + { + "epoch": 0.24576392109299755, + "grad_norm": 0.2140001505613327, + "learning_rate": 4.6832591210980855e-05, + "loss": 0.2233, + "step": 13779 + }, + { + "epoch": 0.24578175721471124, + "grad_norm": 0.17805612087249756, + "learning_rate": 4.683183287507421e-05, + "loss": 0.1771, + "step": 13780 + }, + { + "epoch": 0.24579959333642493, + "grad_norm": 0.23784080147743225, + "learning_rate": 4.683107445453985e-05, + "loss": 0.18, + "step": 13781 + }, + { + "epoch": 0.24581742945813861, + "grad_norm": 0.23661647737026215, + "learning_rate": 4.6830315949380696e-05, + "loss": 0.1696, + "step": 13782 + }, + { + "epoch": 0.24583526557985233, + "grad_norm": 0.22108282148838043, + "learning_rate": 4.6829557359599705e-05, + "loss": 0.1479, + "step": 13783 + }, + { + "epoch": 0.24585310170156602, + "grad_norm": 0.327790766954422, + "learning_rate": 4.682879868519981e-05, + "loss": 0.1682, + "step": 13784 + }, + { + "epoch": 0.2458709378232797, + "grad_norm": 0.3368685245513916, + "learning_rate": 4.682803992618395e-05, + "loss": 0.2027, + "step": 13785 + }, + { + "epoch": 0.2458887739449934, + "grad_norm": 0.30198389291763306, + "learning_rate": 4.682728108255506e-05, + "loss": 0.1334, + "step": 13786 + }, + { + "epoch": 0.24590661006670708, + "grad_norm": 0.2669535279273987, + "learning_rate": 4.68265221543161e-05, + "loss": 0.2092, + "step": 13787 + }, + { + "epoch": 0.2459244461884208, + "grad_norm": 0.27204829454421997, + "learning_rate": 4.682576314147e-05, + "loss": 0.206, + "step": 13788 + }, + { + "epoch": 0.24594228231013449, + "grad_norm": 0.30816584825515747, + "learning_rate": 4.68250040440197e-05, + "loss": 0.2155, + "step": 13789 + }, + { + "epoch": 0.24596011843184817, + "grad_norm": 0.30811265110969543, + "learning_rate": 4.6824244861968156e-05, + "loss": 0.1887, + "step": 13790 + }, + { + "epoch": 0.24597795455356186, + "grad_norm": 0.23519901931285858, + "learning_rate": 4.682348559531829e-05, + "loss": 0.2166, + "step": 13791 + }, + { + "epoch": 0.24599579067527558, + "grad_norm": 0.3085532486438751, + "learning_rate": 4.682272624407306e-05, + "loss": 0.2236, + "step": 13792 + }, + { + "epoch": 0.24601362679698927, + "grad_norm": 0.24834729731082916, + "learning_rate": 4.682196680823541e-05, + "loss": 0.1837, + "step": 13793 + }, + { + "epoch": 0.24603146291870295, + "grad_norm": 0.2953791618347168, + "learning_rate": 4.6821207287808274e-05, + "loss": 0.182, + "step": 13794 + }, + { + "epoch": 0.24604929904041664, + "grad_norm": 0.3037715256214142, + "learning_rate": 4.68204476827946e-05, + "loss": 0.2271, + "step": 13795 + }, + { + "epoch": 0.24606713516213036, + "grad_norm": 0.23621883988380432, + "learning_rate": 4.681968799319734e-05, + "loss": 0.1707, + "step": 13796 + }, + { + "epoch": 0.24608497128384405, + "grad_norm": 0.5135491490364075, + "learning_rate": 4.681892821901943e-05, + "loss": 0.1852, + "step": 13797 + }, + { + "epoch": 0.24610280740555773, + "grad_norm": 0.21563830971717834, + "learning_rate": 4.681816836026381e-05, + "loss": 0.1778, + "step": 13798 + }, + { + "epoch": 0.24612064352727142, + "grad_norm": 0.2344902902841568, + "learning_rate": 4.681740841693345e-05, + "loss": 0.1944, + "step": 13799 + }, + { + "epoch": 0.24613847964898514, + "grad_norm": 0.20485229790210724, + "learning_rate": 4.681664838903127e-05, + "loss": 0.1774, + "step": 13800 + }, + { + "epoch": 0.24615631577069882, + "grad_norm": 0.25551292300224304, + "learning_rate": 4.681588827656023e-05, + "loss": 0.1759, + "step": 13801 + }, + { + "epoch": 0.2461741518924125, + "grad_norm": 0.2572571933269501, + "learning_rate": 4.681512807952326e-05, + "loss": 0.2027, + "step": 13802 + }, + { + "epoch": 0.2461919880141262, + "grad_norm": 0.2675979733467102, + "learning_rate": 4.681436779792333e-05, + "loss": 0.1601, + "step": 13803 + }, + { + "epoch": 0.24620982413583992, + "grad_norm": 0.27787625789642334, + "learning_rate": 4.681360743176337e-05, + "loss": 0.1713, + "step": 13804 + }, + { + "epoch": 0.2462276602575536, + "grad_norm": 0.2584262788295746, + "learning_rate": 4.6812846981046346e-05, + "loss": 0.1799, + "step": 13805 + }, + { + "epoch": 0.2462454963792673, + "grad_norm": 0.3029763400554657, + "learning_rate": 4.6812086445775185e-05, + "loss": 0.2036, + "step": 13806 + }, + { + "epoch": 0.24626333250098098, + "grad_norm": 0.46581169962882996, + "learning_rate": 4.6811325825952844e-05, + "loss": 0.16, + "step": 13807 + }, + { + "epoch": 0.24628116862269467, + "grad_norm": 0.3240189850330353, + "learning_rate": 4.681056512158227e-05, + "loss": 0.1892, + "step": 13808 + }, + { + "epoch": 0.24629900474440838, + "grad_norm": 0.24673877656459808, + "learning_rate": 4.680980433266641e-05, + "loss": 0.1674, + "step": 13809 + }, + { + "epoch": 0.24631684086612207, + "grad_norm": 0.20339319109916687, + "learning_rate": 4.6809043459208216e-05, + "loss": 0.0996, + "step": 13810 + }, + { + "epoch": 0.24633467698783576, + "grad_norm": 0.22289863228797913, + "learning_rate": 4.680828250121064e-05, + "loss": 0.1752, + "step": 13811 + }, + { + "epoch": 0.24635251310954945, + "grad_norm": 0.31208154559135437, + "learning_rate": 4.680752145867663e-05, + "loss": 0.1882, + "step": 13812 + }, + { + "epoch": 0.24637034923126316, + "grad_norm": 0.2521425783634186, + "learning_rate": 4.680676033160913e-05, + "loss": 0.1818, + "step": 13813 + }, + { + "epoch": 0.24638818535297685, + "grad_norm": 0.25903889536857605, + "learning_rate": 4.6805999120011093e-05, + "loss": 0.1709, + "step": 13814 + }, + { + "epoch": 0.24640602147469054, + "grad_norm": 0.32050594687461853, + "learning_rate": 4.680523782388548e-05, + "loss": 0.1938, + "step": 13815 + }, + { + "epoch": 0.24642385759640423, + "grad_norm": 0.31170520186424255, + "learning_rate": 4.680447644323523e-05, + "loss": 0.2025, + "step": 13816 + }, + { + "epoch": 0.24644169371811794, + "grad_norm": 0.2918287217617035, + "learning_rate": 4.68037149780633e-05, + "loss": 0.1677, + "step": 13817 + }, + { + "epoch": 0.24645952983983163, + "grad_norm": 0.3026082515716553, + "learning_rate": 4.680295342837263e-05, + "loss": 0.1649, + "step": 13818 + }, + { + "epoch": 0.24647736596154532, + "grad_norm": 0.2618713974952698, + "learning_rate": 4.680219179416619e-05, + "loss": 0.1933, + "step": 13819 + }, + { + "epoch": 0.246495202083259, + "grad_norm": 0.31092211604118347, + "learning_rate": 4.680143007544693e-05, + "loss": 0.2254, + "step": 13820 + }, + { + "epoch": 0.24651303820497272, + "grad_norm": 0.27318161725997925, + "learning_rate": 4.6800668272217795e-05, + "loss": 0.1807, + "step": 13821 + }, + { + "epoch": 0.2465308743266864, + "grad_norm": 0.2453472763299942, + "learning_rate": 4.6799906384481735e-05, + "loss": 0.1732, + "step": 13822 + }, + { + "epoch": 0.2465487104484001, + "grad_norm": 0.32099649310112, + "learning_rate": 4.679914441224171e-05, + "loss": 0.1928, + "step": 13823 + }, + { + "epoch": 0.2465665465701138, + "grad_norm": 0.35580796003341675, + "learning_rate": 4.679838235550067e-05, + "loss": 0.1881, + "step": 13824 + }, + { + "epoch": 0.2465843826918275, + "grad_norm": 0.32416588068008423, + "learning_rate": 4.6797620214261574e-05, + "loss": 0.1656, + "step": 13825 + }, + { + "epoch": 0.2466022188135412, + "grad_norm": 0.28813648223876953, + "learning_rate": 4.679685798852738e-05, + "loss": 0.1873, + "step": 13826 + }, + { + "epoch": 0.24662005493525488, + "grad_norm": 0.30523252487182617, + "learning_rate": 4.679609567830102e-05, + "loss": 0.1882, + "step": 13827 + }, + { + "epoch": 0.24663789105696857, + "grad_norm": 0.3084180951118469, + "learning_rate": 4.679533328358547e-05, + "loss": 0.2246, + "step": 13828 + }, + { + "epoch": 0.24665572717868225, + "grad_norm": 0.3172524869441986, + "learning_rate": 4.6794570804383685e-05, + "loss": 0.2964, + "step": 13829 + }, + { + "epoch": 0.24667356330039597, + "grad_norm": 0.2175578624010086, + "learning_rate": 4.679380824069862e-05, + "loss": 0.1844, + "step": 13830 + }, + { + "epoch": 0.24669139942210966, + "grad_norm": 0.28899720311164856, + "learning_rate": 4.6793045592533225e-05, + "loss": 0.2349, + "step": 13831 + }, + { + "epoch": 0.24670923554382335, + "grad_norm": 0.3691122829914093, + "learning_rate": 4.679228285989045e-05, + "loss": 0.2098, + "step": 13832 + }, + { + "epoch": 0.24672707166553703, + "grad_norm": 0.24319787323474884, + "learning_rate": 4.679152004277327e-05, + "loss": 0.2177, + "step": 13833 + }, + { + "epoch": 0.24674490778725075, + "grad_norm": 0.34025055170059204, + "learning_rate": 4.6790757141184626e-05, + "loss": 0.2257, + "step": 13834 + }, + { + "epoch": 0.24676274390896444, + "grad_norm": 0.22080552577972412, + "learning_rate": 4.678999415512748e-05, + "loss": 0.1809, + "step": 13835 + }, + { + "epoch": 0.24678058003067813, + "grad_norm": 0.2829233407974243, + "learning_rate": 4.678923108460479e-05, + "loss": 0.2224, + "step": 13836 + }, + { + "epoch": 0.24679841615239181, + "grad_norm": 0.2799453139305115, + "learning_rate": 4.678846792961952e-05, + "loss": 0.1787, + "step": 13837 + }, + { + "epoch": 0.24681625227410553, + "grad_norm": 0.25913748145103455, + "learning_rate": 4.678770469017462e-05, + "loss": 0.2047, + "step": 13838 + }, + { + "epoch": 0.24683408839581922, + "grad_norm": 0.22933343052864075, + "learning_rate": 4.678694136627305e-05, + "loss": 0.188, + "step": 13839 + }, + { + "epoch": 0.2468519245175329, + "grad_norm": 0.21951298415660858, + "learning_rate": 4.678617795791777e-05, + "loss": 0.1366, + "step": 13840 + }, + { + "epoch": 0.2468697606392466, + "grad_norm": 0.24158070981502533, + "learning_rate": 4.678541446511174e-05, + "loss": 0.2009, + "step": 13841 + }, + { + "epoch": 0.2468875967609603, + "grad_norm": 0.20856155455112457, + "learning_rate": 4.6784650887857926e-05, + "loss": 0.181, + "step": 13842 + }, + { + "epoch": 0.246905432882674, + "grad_norm": 0.27052634954452515, + "learning_rate": 4.6783887226159276e-05, + "loss": 0.1397, + "step": 13843 + }, + { + "epoch": 0.24692326900438769, + "grad_norm": 0.2849443256855011, + "learning_rate": 4.678312348001875e-05, + "loss": 0.1978, + "step": 13844 + }, + { + "epoch": 0.24694110512610137, + "grad_norm": 0.26534637808799744, + "learning_rate": 4.678235964943932e-05, + "loss": 0.1231, + "step": 13845 + }, + { + "epoch": 0.24695894124781506, + "grad_norm": 0.2178906798362732, + "learning_rate": 4.678159573442394e-05, + "loss": 0.1849, + "step": 13846 + }, + { + "epoch": 0.24697677736952878, + "grad_norm": 0.2549191415309906, + "learning_rate": 4.6780831734975566e-05, + "loss": 0.1632, + "step": 13847 + }, + { + "epoch": 0.24699461349124247, + "grad_norm": 0.31294724345207214, + "learning_rate": 4.678006765109717e-05, + "loss": 0.2104, + "step": 13848 + }, + { + "epoch": 0.24701244961295615, + "grad_norm": 0.41884496808052063, + "learning_rate": 4.677930348279171e-05, + "loss": 0.1686, + "step": 13849 + }, + { + "epoch": 0.24703028573466984, + "grad_norm": 0.2828907370567322, + "learning_rate": 4.6778539230062144e-05, + "loss": 0.2075, + "step": 13850 + }, + { + "epoch": 0.24704812185638356, + "grad_norm": 0.2417198121547699, + "learning_rate": 4.6777774892911443e-05, + "loss": 0.2292, + "step": 13851 + }, + { + "epoch": 0.24706595797809724, + "grad_norm": 0.2362624704837799, + "learning_rate": 4.6777010471342566e-05, + "loss": 0.208, + "step": 13852 + }, + { + "epoch": 0.24708379409981093, + "grad_norm": 0.2034156620502472, + "learning_rate": 4.677624596535847e-05, + "loss": 0.1718, + "step": 13853 + }, + { + "epoch": 0.24710163022152462, + "grad_norm": 0.2332681566476822, + "learning_rate": 4.6775481374962113e-05, + "loss": 0.1626, + "step": 13854 + }, + { + "epoch": 0.24711946634323834, + "grad_norm": 0.42489734292030334, + "learning_rate": 4.677471670015649e-05, + "loss": 0.2102, + "step": 13855 + }, + { + "epoch": 0.24713730246495202, + "grad_norm": 0.24739526212215424, + "learning_rate": 4.677395194094453e-05, + "loss": 0.1599, + "step": 13856 + }, + { + "epoch": 0.2471551385866657, + "grad_norm": 0.2923029363155365, + "learning_rate": 4.677318709732922e-05, + "loss": 0.1866, + "step": 13857 + }, + { + "epoch": 0.2471729747083794, + "grad_norm": 0.34048333764076233, + "learning_rate": 4.677242216931351e-05, + "loss": 0.234, + "step": 13858 + }, + { + "epoch": 0.24719081083009312, + "grad_norm": 0.22982923686504364, + "learning_rate": 4.677165715690038e-05, + "loss": 0.1714, + "step": 13859 + }, + { + "epoch": 0.2472086469518068, + "grad_norm": 0.24919088184833527, + "learning_rate": 4.6770892060092774e-05, + "loss": 0.1907, + "step": 13860 + }, + { + "epoch": 0.2472264830735205, + "grad_norm": 0.2675471007823944, + "learning_rate": 4.6770126878893684e-05, + "loss": 0.1524, + "step": 13861 + }, + { + "epoch": 0.24724431919523418, + "grad_norm": 1.0161223411560059, + "learning_rate": 4.676936161330606e-05, + "loss": 0.2187, + "step": 13862 + }, + { + "epoch": 0.2472621553169479, + "grad_norm": 0.26066574454307556, + "learning_rate": 4.676859626333287e-05, + "loss": 0.1955, + "step": 13863 + }, + { + "epoch": 0.24727999143866158, + "grad_norm": 0.2394704967737198, + "learning_rate": 4.6767830828977076e-05, + "loss": 0.2172, + "step": 13864 + }, + { + "epoch": 0.24729782756037527, + "grad_norm": 0.22838497161865234, + "learning_rate": 4.676706531024166e-05, + "loss": 0.1336, + "step": 13865 + }, + { + "epoch": 0.24731566368208896, + "grad_norm": 0.5352074503898621, + "learning_rate": 4.6766299707129576e-05, + "loss": 0.2227, + "step": 13866 + }, + { + "epoch": 0.24733349980380265, + "grad_norm": 0.26973477005958557, + "learning_rate": 4.6765534019643796e-05, + "loss": 0.1862, + "step": 13867 + }, + { + "epoch": 0.24735133592551636, + "grad_norm": 0.26896733045578003, + "learning_rate": 4.6764768247787294e-05, + "loss": 0.239, + "step": 13868 + }, + { + "epoch": 0.24736917204723005, + "grad_norm": 0.2494468241930008, + "learning_rate": 4.676400239156303e-05, + "loss": 0.2067, + "step": 13869 + }, + { + "epoch": 0.24738700816894374, + "grad_norm": 0.33378568291664124, + "learning_rate": 4.676323645097398e-05, + "loss": 0.2509, + "step": 13870 + }, + { + "epoch": 0.24740484429065743, + "grad_norm": 0.22868886590003967, + "learning_rate": 4.67624704260231e-05, + "loss": 0.1771, + "step": 13871 + }, + { + "epoch": 0.24742268041237114, + "grad_norm": 0.23843562602996826, + "learning_rate": 4.676170431671337e-05, + "loss": 0.1774, + "step": 13872 + }, + { + "epoch": 0.24744051653408483, + "grad_norm": 0.22828061878681183, + "learning_rate": 4.6760938123047763e-05, + "loss": 0.2106, + "step": 13873 + }, + { + "epoch": 0.24745835265579852, + "grad_norm": 0.23694480955600739, + "learning_rate": 4.676017184502924e-05, + "loss": 0.1786, + "step": 13874 + }, + { + "epoch": 0.2474761887775122, + "grad_norm": 0.2634585201740265, + "learning_rate": 4.675940548266078e-05, + "loss": 0.177, + "step": 13875 + }, + { + "epoch": 0.24749402489922592, + "grad_norm": 0.23659314215183258, + "learning_rate": 4.675863903594534e-05, + "loss": 0.1845, + "step": 13876 + }, + { + "epoch": 0.2475118610209396, + "grad_norm": 0.42080968618392944, + "learning_rate": 4.6757872504885906e-05, + "loss": 0.1954, + "step": 13877 + }, + { + "epoch": 0.2475296971426533, + "grad_norm": 0.24054062366485596, + "learning_rate": 4.675710588948544e-05, + "loss": 0.1877, + "step": 13878 + }, + { + "epoch": 0.247547533264367, + "grad_norm": 0.3301849663257599, + "learning_rate": 4.6756339189746925e-05, + "loss": 0.2279, + "step": 13879 + }, + { + "epoch": 0.2475653693860807, + "grad_norm": 0.2731056213378906, + "learning_rate": 4.675557240567332e-05, + "loss": 0.1846, + "step": 13880 + }, + { + "epoch": 0.2475832055077944, + "grad_norm": 0.26883235573768616, + "learning_rate": 4.67548055372676e-05, + "loss": 0.1695, + "step": 13881 + }, + { + "epoch": 0.24760104162950808, + "grad_norm": 0.2931753695011139, + "learning_rate": 4.675403858453274e-05, + "loss": 0.192, + "step": 13882 + }, + { + "epoch": 0.24761887775122177, + "grad_norm": 0.2757262885570526, + "learning_rate": 4.675327154747171e-05, + "loss": 0.192, + "step": 13883 + }, + { + "epoch": 0.24763671387293548, + "grad_norm": 0.2826712131500244, + "learning_rate": 4.67525044260875e-05, + "loss": 0.2447, + "step": 13884 + }, + { + "epoch": 0.24765454999464917, + "grad_norm": 0.2587839663028717, + "learning_rate": 4.6751737220383054e-05, + "loss": 0.1586, + "step": 13885 + }, + { + "epoch": 0.24767238611636286, + "grad_norm": 0.3076946437358856, + "learning_rate": 4.675096993036137e-05, + "loss": 0.2106, + "step": 13886 + }, + { + "epoch": 0.24769022223807655, + "grad_norm": 0.30966705083847046, + "learning_rate": 4.675020255602541e-05, + "loss": 0.1426, + "step": 13887 + }, + { + "epoch": 0.24770805835979023, + "grad_norm": 0.2736508548259735, + "learning_rate": 4.674943509737815e-05, + "loss": 0.1293, + "step": 13888 + }, + { + "epoch": 0.24772589448150395, + "grad_norm": 0.2839398682117462, + "learning_rate": 4.6748667554422575e-05, + "loss": 0.1608, + "step": 13889 + }, + { + "epoch": 0.24774373060321764, + "grad_norm": 0.2591922879219055, + "learning_rate": 4.674789992716165e-05, + "loss": 0.2201, + "step": 13890 + }, + { + "epoch": 0.24776156672493133, + "grad_norm": 0.3511675000190735, + "learning_rate": 4.674713221559836e-05, + "loss": 0.2231, + "step": 13891 + }, + { + "epoch": 0.247779402846645, + "grad_norm": 0.21819651126861572, + "learning_rate": 4.674636441973566e-05, + "loss": 0.1782, + "step": 13892 + }, + { + "epoch": 0.24779723896835873, + "grad_norm": 0.28787264227867126, + "learning_rate": 4.6745596539576546e-05, + "loss": 0.208, + "step": 13893 + }, + { + "epoch": 0.24781507509007242, + "grad_norm": 0.2524046301841736, + "learning_rate": 4.6744828575124e-05, + "loss": 0.1806, + "step": 13894 + }, + { + "epoch": 0.2478329112117861, + "grad_norm": 0.27405714988708496, + "learning_rate": 4.674406052638097e-05, + "loss": 0.2005, + "step": 13895 + }, + { + "epoch": 0.2478507473334998, + "grad_norm": 0.2722206115722656, + "learning_rate": 4.674329239335046e-05, + "loss": 0.2047, + "step": 13896 + }, + { + "epoch": 0.2478685834552135, + "grad_norm": 0.200779527425766, + "learning_rate": 4.674252417603544e-05, + "loss": 0.1557, + "step": 13897 + }, + { + "epoch": 0.2478864195769272, + "grad_norm": 0.32716962695121765, + "learning_rate": 4.6741755874438885e-05, + "loss": 0.2071, + "step": 13898 + }, + { + "epoch": 0.24790425569864089, + "grad_norm": 0.3389391601085663, + "learning_rate": 4.674098748856378e-05, + "loss": 0.1776, + "step": 13899 + }, + { + "epoch": 0.24792209182035457, + "grad_norm": 0.2137138843536377, + "learning_rate": 4.674021901841309e-05, + "loss": 0.176, + "step": 13900 + }, + { + "epoch": 0.2479399279420683, + "grad_norm": 0.29083147644996643, + "learning_rate": 4.673945046398981e-05, + "loss": 0.1618, + "step": 13901 + }, + { + "epoch": 0.24795776406378198, + "grad_norm": 0.3967060446739197, + "learning_rate": 4.6738681825296904e-05, + "loss": 0.2201, + "step": 13902 + }, + { + "epoch": 0.24797560018549566, + "grad_norm": 0.29259899258613586, + "learning_rate": 4.673791310233737e-05, + "loss": 0.2026, + "step": 13903 + }, + { + "epoch": 0.24799343630720935, + "grad_norm": 0.3274999260902405, + "learning_rate": 4.6737144295114164e-05, + "loss": 0.209, + "step": 13904 + }, + { + "epoch": 0.24801127242892307, + "grad_norm": 0.3241322636604309, + "learning_rate": 4.673637540363028e-05, + "loss": 0.1981, + "step": 13905 + }, + { + "epoch": 0.24802910855063676, + "grad_norm": 0.24350115656852722, + "learning_rate": 4.6735606427888705e-05, + "loss": 0.198, + "step": 13906 + }, + { + "epoch": 0.24804694467235044, + "grad_norm": 0.21662767231464386, + "learning_rate": 4.6734837367892416e-05, + "loss": 0.1729, + "step": 13907 + }, + { + "epoch": 0.24806478079406413, + "grad_norm": 0.30358198285102844, + "learning_rate": 4.6734068223644375e-05, + "loss": 0.1966, + "step": 13908 + }, + { + "epoch": 0.24808261691577782, + "grad_norm": 0.24385471642017365, + "learning_rate": 4.673329899514759e-05, + "loss": 0.1605, + "step": 13909 + }, + { + "epoch": 0.24810045303749154, + "grad_norm": 0.24042540788650513, + "learning_rate": 4.673252968240503e-05, + "loss": 0.1512, + "step": 13910 + }, + { + "epoch": 0.24811828915920522, + "grad_norm": 0.22144049406051636, + "learning_rate": 4.673176028541968e-05, + "loss": 0.193, + "step": 13911 + }, + { + "epoch": 0.2481361252809189, + "grad_norm": 0.24732889235019684, + "learning_rate": 4.6730990804194516e-05, + "loss": 0.2004, + "step": 13912 + }, + { + "epoch": 0.2481539614026326, + "grad_norm": 0.28715449571609497, + "learning_rate": 4.673022123873253e-05, + "loss": 0.2223, + "step": 13913 + }, + { + "epoch": 0.24817179752434632, + "grad_norm": 0.2218928039073944, + "learning_rate": 4.67294515890367e-05, + "loss": 0.1211, + "step": 13914 + }, + { + "epoch": 0.24818963364606, + "grad_norm": 0.2684513032436371, + "learning_rate": 4.672868185511001e-05, + "loss": 0.2213, + "step": 13915 + }, + { + "epoch": 0.2482074697677737, + "grad_norm": 0.333819717168808, + "learning_rate": 4.672791203695545e-05, + "loss": 0.2066, + "step": 13916 + }, + { + "epoch": 0.24822530588948738, + "grad_norm": 0.34282979369163513, + "learning_rate": 4.672714213457599e-05, + "loss": 0.1565, + "step": 13917 + }, + { + "epoch": 0.2482431420112011, + "grad_norm": 0.2980656325817108, + "learning_rate": 4.672637214797463e-05, + "loss": 0.2044, + "step": 13918 + }, + { + "epoch": 0.24826097813291478, + "grad_norm": 0.36185529828071594, + "learning_rate": 4.672560207715434e-05, + "loss": 0.2064, + "step": 13919 + }, + { + "epoch": 0.24827881425462847, + "grad_norm": 0.27268537878990173, + "learning_rate": 4.672483192211812e-05, + "loss": 0.1905, + "step": 13920 + }, + { + "epoch": 0.24829665037634216, + "grad_norm": 0.26096203923225403, + "learning_rate": 4.672406168286894e-05, + "loss": 0.1015, + "step": 13921 + }, + { + "epoch": 0.24831448649805588, + "grad_norm": 0.3976256847381592, + "learning_rate": 4.672329135940979e-05, + "loss": 0.1928, + "step": 13922 + }, + { + "epoch": 0.24833232261976956, + "grad_norm": 0.26295849680900574, + "learning_rate": 4.6722520951743675e-05, + "loss": 0.1487, + "step": 13923 + }, + { + "epoch": 0.24835015874148325, + "grad_norm": 0.3481968939304352, + "learning_rate": 4.672175045987356e-05, + "loss": 0.2325, + "step": 13924 + }, + { + "epoch": 0.24836799486319694, + "grad_norm": 0.37518441677093506, + "learning_rate": 4.6720979883802435e-05, + "loss": 0.246, + "step": 13925 + }, + { + "epoch": 0.24838583098491063, + "grad_norm": 0.36704546213150024, + "learning_rate": 4.672020922353329e-05, + "loss": 0.2161, + "step": 13926 + }, + { + "epoch": 0.24840366710662434, + "grad_norm": 0.2759122848510742, + "learning_rate": 4.671943847906911e-05, + "loss": 0.1594, + "step": 13927 + }, + { + "epoch": 0.24842150322833803, + "grad_norm": 0.33444783091545105, + "learning_rate": 4.67186676504129e-05, + "loss": 0.2176, + "step": 13928 + }, + { + "epoch": 0.24843933935005172, + "grad_norm": 0.2398654967546463, + "learning_rate": 4.671789673756761e-05, + "loss": 0.1841, + "step": 13929 + }, + { + "epoch": 0.2484571754717654, + "grad_norm": 0.30182909965515137, + "learning_rate": 4.671712574053626e-05, + "loss": 0.1439, + "step": 13930 + }, + { + "epoch": 0.24847501159347912, + "grad_norm": 0.279514878988266, + "learning_rate": 4.671635465932184e-05, + "loss": 0.1532, + "step": 13931 + }, + { + "epoch": 0.2484928477151928, + "grad_norm": 0.28951379656791687, + "learning_rate": 4.671558349392732e-05, + "loss": 0.1933, + "step": 13932 + }, + { + "epoch": 0.2485106838369065, + "grad_norm": 0.31557130813598633, + "learning_rate": 4.671481224435569e-05, + "loss": 0.21, + "step": 13933 + }, + { + "epoch": 0.2485285199586202, + "grad_norm": 0.24612878262996674, + "learning_rate": 4.6714040910609956e-05, + "loss": 0.2274, + "step": 13934 + }, + { + "epoch": 0.2485463560803339, + "grad_norm": 0.25948524475097656, + "learning_rate": 4.67132694926931e-05, + "loss": 0.2098, + "step": 13935 + }, + { + "epoch": 0.2485641922020476, + "grad_norm": 0.24691280722618103, + "learning_rate": 4.671249799060812e-05, + "loss": 0.1869, + "step": 13936 + }, + { + "epoch": 0.24858202832376128, + "grad_norm": 0.2153436243534088, + "learning_rate": 4.6711726404357984e-05, + "loss": 0.1731, + "step": 13937 + }, + { + "epoch": 0.24859986444547497, + "grad_norm": 0.23904675245285034, + "learning_rate": 4.671095473394571e-05, + "loss": 0.193, + "step": 13938 + }, + { + "epoch": 0.24861770056718868, + "grad_norm": 0.26901721954345703, + "learning_rate": 4.6710182979374266e-05, + "loss": 0.224, + "step": 13939 + }, + { + "epoch": 0.24863553668890237, + "grad_norm": 0.23426665365695953, + "learning_rate": 4.670941114064666e-05, + "loss": 0.1704, + "step": 13940 + }, + { + "epoch": 0.24865337281061606, + "grad_norm": 0.26688480377197266, + "learning_rate": 4.670863921776588e-05, + "loss": 0.1688, + "step": 13941 + }, + { + "epoch": 0.24867120893232975, + "grad_norm": 0.37944281101226807, + "learning_rate": 4.670786721073491e-05, + "loss": 0.1455, + "step": 13942 + }, + { + "epoch": 0.24868904505404346, + "grad_norm": 0.3439227342605591, + "learning_rate": 4.6707095119556754e-05, + "loss": 0.1205, + "step": 13943 + }, + { + "epoch": 0.24870688117575715, + "grad_norm": 0.31361326575279236, + "learning_rate": 4.670632294423439e-05, + "loss": 0.1954, + "step": 13944 + }, + { + "epoch": 0.24872471729747084, + "grad_norm": 0.2773951590061188, + "learning_rate": 4.6705550684770835e-05, + "loss": 0.2428, + "step": 13945 + }, + { + "epoch": 0.24874255341918453, + "grad_norm": 0.2939739227294922, + "learning_rate": 4.670477834116906e-05, + "loss": 0.1485, + "step": 13946 + }, + { + "epoch": 0.2487603895408982, + "grad_norm": 0.2697181701660156, + "learning_rate": 4.6704005913432076e-05, + "loss": 0.1402, + "step": 13947 + }, + { + "epoch": 0.24877822566261193, + "grad_norm": 0.3270324468612671, + "learning_rate": 4.6703233401562864e-05, + "loss": 0.2131, + "step": 13948 + }, + { + "epoch": 0.24879606178432562, + "grad_norm": 0.21917343139648438, + "learning_rate": 4.6702460805564416e-05, + "loss": 0.1686, + "step": 13949 + }, + { + "epoch": 0.2488138979060393, + "grad_norm": 0.2303187996149063, + "learning_rate": 4.6701688125439746e-05, + "loss": 0.1687, + "step": 13950 + }, + { + "epoch": 0.248831734027753, + "grad_norm": 0.2199353575706482, + "learning_rate": 4.670091536119183e-05, + "loss": 0.1237, + "step": 13951 + }, + { + "epoch": 0.2488495701494667, + "grad_norm": 0.22015100717544556, + "learning_rate": 4.6700142512823676e-05, + "loss": 0.1531, + "step": 13952 + }, + { + "epoch": 0.2488674062711804, + "grad_norm": 0.43796634674072266, + "learning_rate": 4.669936958033827e-05, + "loss": 0.2684, + "step": 13953 + }, + { + "epoch": 0.24888524239289408, + "grad_norm": 0.2625996768474579, + "learning_rate": 4.669859656373862e-05, + "loss": 0.1964, + "step": 13954 + }, + { + "epoch": 0.24890307851460777, + "grad_norm": 0.2555631101131439, + "learning_rate": 4.669782346302771e-05, + "loss": 0.2439, + "step": 13955 + }, + { + "epoch": 0.2489209146363215, + "grad_norm": 0.3084341287612915, + "learning_rate": 4.669705027820854e-05, + "loss": 0.1401, + "step": 13956 + }, + { + "epoch": 0.24893875075803518, + "grad_norm": 0.30264008045196533, + "learning_rate": 4.669627700928411e-05, + "loss": 0.1897, + "step": 13957 + }, + { + "epoch": 0.24895658687974886, + "grad_norm": 0.18788209557533264, + "learning_rate": 4.669550365625742e-05, + "loss": 0.1675, + "step": 13958 + }, + { + "epoch": 0.24897442300146255, + "grad_norm": 0.3613806664943695, + "learning_rate": 4.669473021913146e-05, + "loss": 0.1809, + "step": 13959 + }, + { + "epoch": 0.24899225912317627, + "grad_norm": 0.26294445991516113, + "learning_rate": 4.6693956697909236e-05, + "loss": 0.1808, + "step": 13960 + }, + { + "epoch": 0.24901009524488996, + "grad_norm": 0.2537631690502167, + "learning_rate": 4.669318309259374e-05, + "loss": 0.2106, + "step": 13961 + }, + { + "epoch": 0.24902793136660364, + "grad_norm": 0.32182762026786804, + "learning_rate": 4.669240940318797e-05, + "loss": 0.1997, + "step": 13962 + }, + { + "epoch": 0.24904576748831733, + "grad_norm": 0.304286926984787, + "learning_rate": 4.669163562969494e-05, + "loss": 0.231, + "step": 13963 + }, + { + "epoch": 0.24906360361003105, + "grad_norm": 0.23288606107234955, + "learning_rate": 4.669086177211763e-05, + "loss": 0.1375, + "step": 13964 + }, + { + "epoch": 0.24908143973174474, + "grad_norm": 0.2737966477870941, + "learning_rate": 4.6690087830459053e-05, + "loss": 0.1584, + "step": 13965 + }, + { + "epoch": 0.24909927585345842, + "grad_norm": 0.28110888600349426, + "learning_rate": 4.6689313804722204e-05, + "loss": 0.1887, + "step": 13966 + }, + { + "epoch": 0.2491171119751721, + "grad_norm": 0.3648214340209961, + "learning_rate": 4.6688539694910084e-05, + "loss": 0.1903, + "step": 13967 + }, + { + "epoch": 0.2491349480968858, + "grad_norm": 0.3305388391017914, + "learning_rate": 4.668776550102568e-05, + "loss": 0.1804, + "step": 13968 + }, + { + "epoch": 0.24915278421859952, + "grad_norm": 0.3478490710258484, + "learning_rate": 4.668699122307202e-05, + "loss": 0.1593, + "step": 13969 + }, + { + "epoch": 0.2491706203403132, + "grad_norm": 0.35136494040489197, + "learning_rate": 4.668621686105209e-05, + "loss": 0.2062, + "step": 13970 + }, + { + "epoch": 0.2491884564620269, + "grad_norm": 0.2550809681415558, + "learning_rate": 4.6685442414968895e-05, + "loss": 0.2109, + "step": 13971 + }, + { + "epoch": 0.24920629258374058, + "grad_norm": 0.3216593563556671, + "learning_rate": 4.668466788482543e-05, + "loss": 0.1921, + "step": 13972 + }, + { + "epoch": 0.2492241287054543, + "grad_norm": 0.23060789704322815, + "learning_rate": 4.66838932706247e-05, + "loss": 0.1681, + "step": 13973 + }, + { + "epoch": 0.24924196482716798, + "grad_norm": 0.4060821235179901, + "learning_rate": 4.668311857236972e-05, + "loss": 0.1633, + "step": 13974 + }, + { + "epoch": 0.24925980094888167, + "grad_norm": 0.253292441368103, + "learning_rate": 4.668234379006348e-05, + "loss": 0.1831, + "step": 13975 + }, + { + "epoch": 0.24927763707059536, + "grad_norm": 0.25369203090667725, + "learning_rate": 4.668156892370898e-05, + "loss": 0.1615, + "step": 13976 + }, + { + "epoch": 0.24929547319230907, + "grad_norm": 0.31818199157714844, + "learning_rate": 4.668079397330923e-05, + "loss": 0.21, + "step": 13977 + }, + { + "epoch": 0.24931330931402276, + "grad_norm": 0.3466501235961914, + "learning_rate": 4.6680018938867246e-05, + "loss": 0.2102, + "step": 13978 + }, + { + "epoch": 0.24933114543573645, + "grad_norm": 0.2604408264160156, + "learning_rate": 4.667924382038601e-05, + "loss": 0.1582, + "step": 13979 + }, + { + "epoch": 0.24934898155745014, + "grad_norm": 0.24600160121917725, + "learning_rate": 4.6678468617868545e-05, + "loss": 0.1945, + "step": 13980 + }, + { + "epoch": 0.24936681767916385, + "grad_norm": 0.22015896439552307, + "learning_rate": 4.667769333131784e-05, + "loss": 0.1889, + "step": 13981 + }, + { + "epoch": 0.24938465380087754, + "grad_norm": 0.20485515892505646, + "learning_rate": 4.667691796073691e-05, + "loss": 0.1903, + "step": 13982 + }, + { + "epoch": 0.24940248992259123, + "grad_norm": 0.37513837218284607, + "learning_rate": 4.667614250612876e-05, + "loss": 0.2642, + "step": 13983 + }, + { + "epoch": 0.24942032604430492, + "grad_norm": 0.23538362979888916, + "learning_rate": 4.6675366967496405e-05, + "loss": 0.1712, + "step": 13984 + }, + { + "epoch": 0.24943816216601863, + "grad_norm": 0.2649373710155487, + "learning_rate": 4.6674591344842824e-05, + "loss": 0.2313, + "step": 13985 + }, + { + "epoch": 0.24945599828773232, + "grad_norm": 0.2700726389884949, + "learning_rate": 4.667381563817105e-05, + "loss": 0.1567, + "step": 13986 + }, + { + "epoch": 0.249473834409446, + "grad_norm": 0.3040487468242645, + "learning_rate": 4.667303984748408e-05, + "loss": 0.1894, + "step": 13987 + }, + { + "epoch": 0.2494916705311597, + "grad_norm": 0.26262998580932617, + "learning_rate": 4.6672263972784925e-05, + "loss": 0.1648, + "step": 13988 + }, + { + "epoch": 0.2495095066528734, + "grad_norm": 0.3377794623374939, + "learning_rate": 4.667148801407658e-05, + "loss": 0.2279, + "step": 13989 + }, + { + "epoch": 0.2495273427745871, + "grad_norm": 0.3334151804447174, + "learning_rate": 4.667071197136207e-05, + "loss": 0.2328, + "step": 13990 + }, + { + "epoch": 0.2495451788963008, + "grad_norm": 0.30604588985443115, + "learning_rate": 4.6669935844644397e-05, + "loss": 0.158, + "step": 13991 + }, + { + "epoch": 0.24956301501801448, + "grad_norm": 0.28032323718070984, + "learning_rate": 4.6669159633926564e-05, + "loss": 0.1416, + "step": 13992 + }, + { + "epoch": 0.24958085113972817, + "grad_norm": 0.264448344707489, + "learning_rate": 4.6668383339211585e-05, + "loss": 0.1688, + "step": 13993 + }, + { + "epoch": 0.24959868726144188, + "grad_norm": 0.29496899247169495, + "learning_rate": 4.6667606960502474e-05, + "loss": 0.1994, + "step": 13994 + }, + { + "epoch": 0.24961652338315557, + "grad_norm": 0.27877846360206604, + "learning_rate": 4.6666830497802226e-05, + "loss": 0.166, + "step": 13995 + }, + { + "epoch": 0.24963435950486926, + "grad_norm": 0.2546389698982239, + "learning_rate": 4.6666053951113864e-05, + "loss": 0.2118, + "step": 13996 + }, + { + "epoch": 0.24965219562658295, + "grad_norm": 0.28670454025268555, + "learning_rate": 4.666527732044039e-05, + "loss": 0.211, + "step": 13997 + }, + { + "epoch": 0.24967003174829666, + "grad_norm": 0.3268781900405884, + "learning_rate": 4.6664500605784825e-05, + "loss": 0.1489, + "step": 13998 + }, + { + "epoch": 0.24968786787001035, + "grad_norm": 0.2760900855064392, + "learning_rate": 4.6663723807150165e-05, + "loss": 0.1437, + "step": 13999 + }, + { + "epoch": 0.24970570399172404, + "grad_norm": 0.29700177907943726, + "learning_rate": 4.666294692453943e-05, + "loss": 0.1863, + "step": 14000 + }, + { + "epoch": 0.24970570399172404, + "eval_loss": 0.18124181032180786, + "eval_runtime": 107.1097, + "eval_samples_per_second": 9.56, + "eval_steps_per_second": 1.596, + "step": 14000 + }, + { + "epoch": 0.24972354011343773, + "grad_norm": 0.2769380211830139, + "learning_rate": 4.6662169957955636e-05, + "loss": 0.1687, + "step": 14001 + }, + { + "epoch": 0.24974137623515144, + "grad_norm": 0.19689103960990906, + "learning_rate": 4.666139290740179e-05, + "loss": 0.1459, + "step": 14002 + }, + { + "epoch": 0.24975921235686513, + "grad_norm": 0.27558207511901855, + "learning_rate": 4.66606157728809e-05, + "loss": 0.179, + "step": 14003 + }, + { + "epoch": 0.24977704847857882, + "grad_norm": 0.3142256438732147, + "learning_rate": 4.665983855439598e-05, + "loss": 0.2099, + "step": 14004 + }, + { + "epoch": 0.2497948846002925, + "grad_norm": 0.25825726985931396, + "learning_rate": 4.665906125195004e-05, + "loss": 0.1936, + "step": 14005 + }, + { + "epoch": 0.24981272072200622, + "grad_norm": 0.32794031500816345, + "learning_rate": 4.665828386554611e-05, + "loss": 0.2615, + "step": 14006 + }, + { + "epoch": 0.2498305568437199, + "grad_norm": 0.2211904674768448, + "learning_rate": 4.665750639518719e-05, + "loss": 0.1725, + "step": 14007 + }, + { + "epoch": 0.2498483929654336, + "grad_norm": 0.3203650116920471, + "learning_rate": 4.6656728840876285e-05, + "loss": 0.2209, + "step": 14008 + }, + { + "epoch": 0.24986622908714728, + "grad_norm": 0.2985434830188751, + "learning_rate": 4.665595120261643e-05, + "loss": 0.2001, + "step": 14009 + }, + { + "epoch": 0.24988406520886097, + "grad_norm": 0.2967106103897095, + "learning_rate": 4.665517348041062e-05, + "loss": 0.2341, + "step": 14010 + }, + { + "epoch": 0.2499019013305747, + "grad_norm": 0.278253972530365, + "learning_rate": 4.665439567426188e-05, + "loss": 0.2365, + "step": 14011 + }, + { + "epoch": 0.24991973745228838, + "grad_norm": 0.23545487225055695, + "learning_rate": 4.6653617784173226e-05, + "loss": 0.1705, + "step": 14012 + }, + { + "epoch": 0.24993757357400206, + "grad_norm": 0.26917949318885803, + "learning_rate": 4.6652839810147666e-05, + "loss": 0.2017, + "step": 14013 + }, + { + "epoch": 0.24995540969571575, + "grad_norm": 0.28392860293388367, + "learning_rate": 4.665206175218822e-05, + "loss": 0.1958, + "step": 14014 + }, + { + "epoch": 0.24997324581742947, + "grad_norm": 0.3414732813835144, + "learning_rate": 4.6651283610297916e-05, + "loss": 0.2426, + "step": 14015 + }, + { + "epoch": 0.24999108193914316, + "grad_norm": 0.2396143674850464, + "learning_rate": 4.665050538447975e-05, + "loss": 0.1976, + "step": 14016 + }, + { + "epoch": 0.25000891806085684, + "grad_norm": 0.30143946409225464, + "learning_rate": 4.664972707473674e-05, + "loss": 0.1727, + "step": 14017 + }, + { + "epoch": 0.25002675418257053, + "grad_norm": 0.30427926778793335, + "learning_rate": 4.664894868107192e-05, + "loss": 0.212, + "step": 14018 + }, + { + "epoch": 0.2500445903042842, + "grad_norm": 0.25081542134284973, + "learning_rate": 4.66481702034883e-05, + "loss": 0.1948, + "step": 14019 + }, + { + "epoch": 0.2500624264259979, + "grad_norm": 0.28087326884269714, + "learning_rate": 4.664739164198889e-05, + "loss": 0.1889, + "step": 14020 + }, + { + "epoch": 0.25008026254771165, + "grad_norm": 0.30892935395240784, + "learning_rate": 4.664661299657671e-05, + "loss": 0.1885, + "step": 14021 + }, + { + "epoch": 0.25009809866942534, + "grad_norm": 0.2081720232963562, + "learning_rate": 4.6645834267254785e-05, + "loss": 0.1487, + "step": 14022 + }, + { + "epoch": 0.250115934791139, + "grad_norm": 0.2211052030324936, + "learning_rate": 4.6645055454026135e-05, + "loss": 0.1445, + "step": 14023 + }, + { + "epoch": 0.2501337709128527, + "grad_norm": 0.24615950882434845, + "learning_rate": 4.664427655689376e-05, + "loss": 0.2184, + "step": 14024 + }, + { + "epoch": 0.2501516070345664, + "grad_norm": 0.24498583376407623, + "learning_rate": 4.664349757586071e-05, + "loss": 0.2185, + "step": 14025 + }, + { + "epoch": 0.2501694431562801, + "grad_norm": 0.32786017656326294, + "learning_rate": 4.664271851092998e-05, + "loss": 0.2017, + "step": 14026 + }, + { + "epoch": 0.2501872792779938, + "grad_norm": 0.22174333035945892, + "learning_rate": 4.66419393621046e-05, + "loss": 0.1657, + "step": 14027 + }, + { + "epoch": 0.25020511539970747, + "grad_norm": 0.3068290948867798, + "learning_rate": 4.664116012938758e-05, + "loss": 0.1638, + "step": 14028 + }, + { + "epoch": 0.25022295152142116, + "grad_norm": 0.26966479420661926, + "learning_rate": 4.664038081278196e-05, + "loss": 0.2089, + "step": 14029 + }, + { + "epoch": 0.2502407876431349, + "grad_norm": 0.29627472162246704, + "learning_rate": 4.663960141229075e-05, + "loss": 0.2112, + "step": 14030 + }, + { + "epoch": 0.2502586237648486, + "grad_norm": 0.3220474421977997, + "learning_rate": 4.6638821927916966e-05, + "loss": 0.2046, + "step": 14031 + }, + { + "epoch": 0.2502764598865623, + "grad_norm": 0.30703917145729065, + "learning_rate": 4.663804235966363e-05, + "loss": 0.1178, + "step": 14032 + }, + { + "epoch": 0.25029429600827596, + "grad_norm": 0.24785824120044708, + "learning_rate": 4.663726270753377e-05, + "loss": 0.1493, + "step": 14033 + }, + { + "epoch": 0.25031213212998965, + "grad_norm": 0.27839118242263794, + "learning_rate": 4.663648297153041e-05, + "loss": 0.2109, + "step": 14034 + }, + { + "epoch": 0.25032996825170334, + "grad_norm": 0.2667781412601471, + "learning_rate": 4.663570315165657e-05, + "loss": 0.1948, + "step": 14035 + }, + { + "epoch": 0.250347804373417, + "grad_norm": 0.29322579503059387, + "learning_rate": 4.663492324791527e-05, + "loss": 0.2001, + "step": 14036 + }, + { + "epoch": 0.2503656404951307, + "grad_norm": 0.206663116812706, + "learning_rate": 4.6634143260309534e-05, + "loss": 0.1887, + "step": 14037 + }, + { + "epoch": 0.25038347661684446, + "grad_norm": 0.19180206954479218, + "learning_rate": 4.663336318884239e-05, + "loss": 0.1769, + "step": 14038 + }, + { + "epoch": 0.25040131273855815, + "grad_norm": 0.31328073143959045, + "learning_rate": 4.6632583033516855e-05, + "loss": 0.2141, + "step": 14039 + }, + { + "epoch": 0.25041914886027183, + "grad_norm": 0.2851082682609558, + "learning_rate": 4.663180279433595e-05, + "loss": 0.156, + "step": 14040 + }, + { + "epoch": 0.2504369849819855, + "grad_norm": 0.34165433049201965, + "learning_rate": 4.663102247130272e-05, + "loss": 0.2319, + "step": 14041 + }, + { + "epoch": 0.2504548211036992, + "grad_norm": 0.25247126817703247, + "learning_rate": 4.663024206442017e-05, + "loss": 0.2174, + "step": 14042 + }, + { + "epoch": 0.2504726572254129, + "grad_norm": 0.31844043731689453, + "learning_rate": 4.662946157369133e-05, + "loss": 0.2612, + "step": 14043 + }, + { + "epoch": 0.2504904933471266, + "grad_norm": 0.321954607963562, + "learning_rate": 4.6628680999119226e-05, + "loss": 0.2259, + "step": 14044 + }, + { + "epoch": 0.2505083294688403, + "grad_norm": 0.23858575522899628, + "learning_rate": 4.662790034070689e-05, + "loss": 0.2276, + "step": 14045 + }, + { + "epoch": 0.250526165590554, + "grad_norm": 0.43849530816078186, + "learning_rate": 4.662711959845733e-05, + "loss": 0.2025, + "step": 14046 + }, + { + "epoch": 0.2505440017122677, + "grad_norm": 0.350673109292984, + "learning_rate": 4.662633877237359e-05, + "loss": 0.2841, + "step": 14047 + }, + { + "epoch": 0.2505618378339814, + "grad_norm": 0.2753816843032837, + "learning_rate": 4.6625557862458697e-05, + "loss": 0.2114, + "step": 14048 + }, + { + "epoch": 0.2505796739556951, + "grad_norm": 0.26957908272743225, + "learning_rate": 4.662477686871567e-05, + "loss": 0.229, + "step": 14049 + }, + { + "epoch": 0.25059751007740877, + "grad_norm": 0.34246590733528137, + "learning_rate": 4.6623995791147535e-05, + "loss": 0.2168, + "step": 14050 + }, + { + "epoch": 0.25061534619912246, + "grad_norm": 0.2836030423641205, + "learning_rate": 4.662321462975733e-05, + "loss": 0.1862, + "step": 14051 + }, + { + "epoch": 0.25063318232083615, + "grad_norm": 0.24299070239067078, + "learning_rate": 4.662243338454807e-05, + "loss": 0.1729, + "step": 14052 + }, + { + "epoch": 0.25065101844254983, + "grad_norm": 0.24866726994514465, + "learning_rate": 4.662165205552279e-05, + "loss": 0.2179, + "step": 14053 + }, + { + "epoch": 0.2506688545642635, + "grad_norm": 0.23637741804122925, + "learning_rate": 4.6620870642684525e-05, + "loss": 0.1675, + "step": 14054 + }, + { + "epoch": 0.25068669068597726, + "grad_norm": 0.3448881506919861, + "learning_rate": 4.6620089146036294e-05, + "loss": 0.1557, + "step": 14055 + }, + { + "epoch": 0.25070452680769095, + "grad_norm": 0.2475176602602005, + "learning_rate": 4.6619307565581126e-05, + "loss": 0.1809, + "step": 14056 + }, + { + "epoch": 0.25072236292940464, + "grad_norm": 0.3104894459247589, + "learning_rate": 4.661852590132206e-05, + "loss": 0.1852, + "step": 14057 + }, + { + "epoch": 0.25074019905111833, + "grad_norm": 0.23213335871696472, + "learning_rate": 4.661774415326212e-05, + "loss": 0.1819, + "step": 14058 + }, + { + "epoch": 0.250758035172832, + "grad_norm": 0.29872894287109375, + "learning_rate": 4.661696232140434e-05, + "loss": 0.1499, + "step": 14059 + }, + { + "epoch": 0.2507758712945457, + "grad_norm": 0.4917716085910797, + "learning_rate": 4.661618040575174e-05, + "loss": 0.18, + "step": 14060 + }, + { + "epoch": 0.2507937074162594, + "grad_norm": 0.2520187497138977, + "learning_rate": 4.661539840630736e-05, + "loss": 0.1463, + "step": 14061 + }, + { + "epoch": 0.2508115435379731, + "grad_norm": 0.3358801305294037, + "learning_rate": 4.661461632307424e-05, + "loss": 0.2269, + "step": 14062 + }, + { + "epoch": 0.2508293796596868, + "grad_norm": 0.29628220200538635, + "learning_rate": 4.6613834156055396e-05, + "loss": 0.2343, + "step": 14063 + }, + { + "epoch": 0.2508472157814005, + "grad_norm": 0.22381003201007843, + "learning_rate": 4.661305190525387e-05, + "loss": 0.157, + "step": 14064 + }, + { + "epoch": 0.2508650519031142, + "grad_norm": 0.2660071551799774, + "learning_rate": 4.661226957067268e-05, + "loss": 0.1248, + "step": 14065 + }, + { + "epoch": 0.2508828880248279, + "grad_norm": 0.28703537583351135, + "learning_rate": 4.661148715231487e-05, + "loss": 0.1663, + "step": 14066 + }, + { + "epoch": 0.2509007241465416, + "grad_norm": 2.0280601978302, + "learning_rate": 4.661070465018348e-05, + "loss": 0.183, + "step": 14067 + }, + { + "epoch": 0.25091856026825526, + "grad_norm": 0.25482022762298584, + "learning_rate": 4.660992206428153e-05, + "loss": 0.1899, + "step": 14068 + }, + { + "epoch": 0.25093639638996895, + "grad_norm": 0.27000340819358826, + "learning_rate": 4.660913939461206e-05, + "loss": 0.2151, + "step": 14069 + }, + { + "epoch": 0.25095423251168264, + "grad_norm": 0.26275894045829773, + "learning_rate": 4.6608356641178095e-05, + "loss": 0.2011, + "step": 14070 + }, + { + "epoch": 0.25097206863339633, + "grad_norm": 0.3071376383304596, + "learning_rate": 4.6607573803982684e-05, + "loss": 0.1868, + "step": 14071 + }, + { + "epoch": 0.25098990475511007, + "grad_norm": 0.24079711735248566, + "learning_rate": 4.660679088302885e-05, + "loss": 0.1954, + "step": 14072 + }, + { + "epoch": 0.25100774087682376, + "grad_norm": 0.3508301079273224, + "learning_rate": 4.6606007878319634e-05, + "loss": 0.2373, + "step": 14073 + }, + { + "epoch": 0.25102557699853745, + "grad_norm": 0.2862575650215149, + "learning_rate": 4.660522478985807e-05, + "loss": 0.2352, + "step": 14074 + }, + { + "epoch": 0.25104341312025114, + "grad_norm": 0.295369029045105, + "learning_rate": 4.6604441617647185e-05, + "loss": 0.2459, + "step": 14075 + }, + { + "epoch": 0.2510612492419648, + "grad_norm": 0.22300460934638977, + "learning_rate": 4.660365836169003e-05, + "loss": 0.1982, + "step": 14076 + }, + { + "epoch": 0.2510790853636785, + "grad_norm": 0.3116849958896637, + "learning_rate": 4.660287502198963e-05, + "loss": 0.1929, + "step": 14077 + }, + { + "epoch": 0.2510969214853922, + "grad_norm": 0.2994225323200226, + "learning_rate": 4.660209159854902e-05, + "loss": 0.2041, + "step": 14078 + }, + { + "epoch": 0.2511147576071059, + "grad_norm": 0.2952130436897278, + "learning_rate": 4.660130809137125e-05, + "loss": 0.1701, + "step": 14079 + }, + { + "epoch": 0.25113259372881963, + "grad_norm": 0.18220709264278412, + "learning_rate": 4.6600524500459355e-05, + "loss": 0.1484, + "step": 14080 + }, + { + "epoch": 0.2511504298505333, + "grad_norm": 0.24857710301876068, + "learning_rate": 4.6599740825816354e-05, + "loss": 0.2074, + "step": 14081 + }, + { + "epoch": 0.251168265972247, + "grad_norm": 0.25650152564048767, + "learning_rate": 4.6598957067445305e-05, + "loss": 0.1814, + "step": 14082 + }, + { + "epoch": 0.2511861020939607, + "grad_norm": 0.20584620535373688, + "learning_rate": 4.659817322534924e-05, + "loss": 0.1903, + "step": 14083 + }, + { + "epoch": 0.2512039382156744, + "grad_norm": 0.2543705403804779, + "learning_rate": 4.659738929953119e-05, + "loss": 0.2189, + "step": 14084 + }, + { + "epoch": 0.25122177433738807, + "grad_norm": 0.28457480669021606, + "learning_rate": 4.65966052899942e-05, + "loss": 0.2304, + "step": 14085 + }, + { + "epoch": 0.25123961045910176, + "grad_norm": 0.23300664126873016, + "learning_rate": 4.659582119674131e-05, + "loss": 0.1641, + "step": 14086 + }, + { + "epoch": 0.25125744658081545, + "grad_norm": 0.271808385848999, + "learning_rate": 4.6595037019775554e-05, + "loss": 0.1733, + "step": 14087 + }, + { + "epoch": 0.25127528270252913, + "grad_norm": 0.44355881214141846, + "learning_rate": 4.6594252759099974e-05, + "loss": 0.1575, + "step": 14088 + }, + { + "epoch": 0.2512931188242429, + "grad_norm": 0.28535714745521545, + "learning_rate": 4.6593468414717624e-05, + "loss": 0.1157, + "step": 14089 + }, + { + "epoch": 0.25131095494595657, + "grad_norm": 0.32193076610565186, + "learning_rate": 4.6592683986631524e-05, + "loss": 0.1914, + "step": 14090 + }, + { + "epoch": 0.25132879106767025, + "grad_norm": 0.25828805565834045, + "learning_rate": 4.6591899474844726e-05, + "loss": 0.1729, + "step": 14091 + }, + { + "epoch": 0.25134662718938394, + "grad_norm": 0.32732248306274414, + "learning_rate": 4.6591114879360265e-05, + "loss": 0.2501, + "step": 14092 + }, + { + "epoch": 0.25136446331109763, + "grad_norm": 0.21630723774433136, + "learning_rate": 4.659033020018119e-05, + "loss": 0.1578, + "step": 14093 + }, + { + "epoch": 0.2513822994328113, + "grad_norm": 0.23651619255542755, + "learning_rate": 4.6589545437310535e-05, + "loss": 0.1892, + "step": 14094 + }, + { + "epoch": 0.251400135554525, + "grad_norm": 0.20079255104064941, + "learning_rate": 4.6588760590751346e-05, + "loss": 0.1582, + "step": 14095 + }, + { + "epoch": 0.2514179716762387, + "grad_norm": 0.2668065130710602, + "learning_rate": 4.658797566050666e-05, + "loss": 0.1923, + "step": 14096 + }, + { + "epoch": 0.25143580779795244, + "grad_norm": 0.28694504499435425, + "learning_rate": 4.658719064657952e-05, + "loss": 0.207, + "step": 14097 + }, + { + "epoch": 0.2514536439196661, + "grad_norm": 0.23585495352745056, + "learning_rate": 4.658640554897299e-05, + "loss": 0.1879, + "step": 14098 + }, + { + "epoch": 0.2514714800413798, + "grad_norm": 0.30941495299339294, + "learning_rate": 4.658562036769009e-05, + "loss": 0.1921, + "step": 14099 + }, + { + "epoch": 0.2514893161630935, + "grad_norm": 0.284712553024292, + "learning_rate": 4.658483510273386e-05, + "loss": 0.216, + "step": 14100 + }, + { + "epoch": 0.2515071522848072, + "grad_norm": 0.217615008354187, + "learning_rate": 4.658404975410736e-05, + "loss": 0.1759, + "step": 14101 + }, + { + "epoch": 0.2515249884065209, + "grad_norm": 0.3259997069835663, + "learning_rate": 4.6583264321813634e-05, + "loss": 0.2496, + "step": 14102 + }, + { + "epoch": 0.25154282452823457, + "grad_norm": 0.31865620613098145, + "learning_rate": 4.658247880585572e-05, + "loss": 0.1605, + "step": 14103 + }, + { + "epoch": 0.25156066064994825, + "grad_norm": 0.23795956373214722, + "learning_rate": 4.6581693206236655e-05, + "loss": 0.1717, + "step": 14104 + }, + { + "epoch": 0.251578496771662, + "grad_norm": 0.27040642499923706, + "learning_rate": 4.65809075229595e-05, + "loss": 0.1955, + "step": 14105 + }, + { + "epoch": 0.2515963328933757, + "grad_norm": 0.4033662974834442, + "learning_rate": 4.6580121756027296e-05, + "loss": 0.2579, + "step": 14106 + }, + { + "epoch": 0.2516141690150894, + "grad_norm": 0.34568798542022705, + "learning_rate": 4.657933590544308e-05, + "loss": 0.213, + "step": 14107 + }, + { + "epoch": 0.25163200513680306, + "grad_norm": 0.35102951526641846, + "learning_rate": 4.6578549971209904e-05, + "loss": 0.2091, + "step": 14108 + }, + { + "epoch": 0.25164984125851675, + "grad_norm": 0.44054707884788513, + "learning_rate": 4.6577763953330824e-05, + "loss": 0.1845, + "step": 14109 + }, + { + "epoch": 0.25166767738023044, + "grad_norm": 0.18548522889614105, + "learning_rate": 4.6576977851808866e-05, + "loss": 0.1632, + "step": 14110 + }, + { + "epoch": 0.2516855135019441, + "grad_norm": 0.28369206190109253, + "learning_rate": 4.6576191666647095e-05, + "loss": 0.1716, + "step": 14111 + }, + { + "epoch": 0.2517033496236578, + "grad_norm": 0.24996811151504517, + "learning_rate": 4.657540539784856e-05, + "loss": 0.1563, + "step": 14112 + }, + { + "epoch": 0.2517211857453715, + "grad_norm": 0.2981433570384979, + "learning_rate": 4.657461904541629e-05, + "loss": 0.1759, + "step": 14113 + }, + { + "epoch": 0.25173902186708524, + "grad_norm": 0.3042716681957245, + "learning_rate": 4.657383260935335e-05, + "loss": 0.1837, + "step": 14114 + }, + { + "epoch": 0.25175685798879893, + "grad_norm": 0.21691346168518066, + "learning_rate": 4.657304608966278e-05, + "loss": 0.1632, + "step": 14115 + }, + { + "epoch": 0.2517746941105126, + "grad_norm": 0.22383587062358856, + "learning_rate": 4.6572259486347645e-05, + "loss": 0.1602, + "step": 14116 + }, + { + "epoch": 0.2517925302322263, + "grad_norm": 0.24241840839385986, + "learning_rate": 4.657147279941097e-05, + "loss": 0.1841, + "step": 14117 + }, + { + "epoch": 0.25181036635394, + "grad_norm": 0.22580492496490479, + "learning_rate": 4.657068602885582e-05, + "loss": 0.1798, + "step": 14118 + }, + { + "epoch": 0.2518282024756537, + "grad_norm": 0.3285813629627228, + "learning_rate": 4.656989917468524e-05, + "loss": 0.2322, + "step": 14119 + }, + { + "epoch": 0.25184603859736737, + "grad_norm": 0.28502634167671204, + "learning_rate": 4.656911223690228e-05, + "loss": 0.2135, + "step": 14120 + }, + { + "epoch": 0.25186387471908106, + "grad_norm": 0.28449690341949463, + "learning_rate": 4.656832521550999e-05, + "loss": 0.1913, + "step": 14121 + }, + { + "epoch": 0.2518817108407948, + "grad_norm": 0.3513454794883728, + "learning_rate": 4.656753811051142e-05, + "loss": 0.133, + "step": 14122 + }, + { + "epoch": 0.2518995469625085, + "grad_norm": 0.34927311539649963, + "learning_rate": 4.656675092190963e-05, + "loss": 0.252, + "step": 14123 + }, + { + "epoch": 0.2519173830842222, + "grad_norm": 0.24004197120666504, + "learning_rate": 4.6565963649707664e-05, + "loss": 0.1867, + "step": 14124 + }, + { + "epoch": 0.25193521920593587, + "grad_norm": 0.3098396062850952, + "learning_rate": 4.656517629390856e-05, + "loss": 0.1596, + "step": 14125 + }, + { + "epoch": 0.25195305532764956, + "grad_norm": 0.2269553691148758, + "learning_rate": 4.6564388854515404e-05, + "loss": 0.1666, + "step": 14126 + }, + { + "epoch": 0.25197089144936324, + "grad_norm": 0.3178076446056366, + "learning_rate": 4.656360133153122e-05, + "loss": 0.2216, + "step": 14127 + }, + { + "epoch": 0.25198872757107693, + "grad_norm": 0.2884099781513214, + "learning_rate": 4.6562813724959063e-05, + "loss": 0.2332, + "step": 14128 + }, + { + "epoch": 0.2520065636927906, + "grad_norm": 0.2716696858406067, + "learning_rate": 4.6562026034802006e-05, + "loss": 0.1289, + "step": 14129 + }, + { + "epoch": 0.2520243998145043, + "grad_norm": 0.24681685864925385, + "learning_rate": 4.656123826106308e-05, + "loss": 0.1841, + "step": 14130 + }, + { + "epoch": 0.25204223593621805, + "grad_norm": 0.3825761079788208, + "learning_rate": 4.656045040374535e-05, + "loss": 0.1923, + "step": 14131 + }, + { + "epoch": 0.25206007205793174, + "grad_norm": 0.21614547073841095, + "learning_rate": 4.655966246285187e-05, + "loss": 0.1855, + "step": 14132 + }, + { + "epoch": 0.2520779081796454, + "grad_norm": 0.3005830943584442, + "learning_rate": 4.6558874438385684e-05, + "loss": 0.1698, + "step": 14133 + }, + { + "epoch": 0.2520957443013591, + "grad_norm": 0.2826884984970093, + "learning_rate": 4.655808633034986e-05, + "loss": 0.1551, + "step": 14134 + }, + { + "epoch": 0.2521135804230728, + "grad_norm": 0.24026356637477875, + "learning_rate": 4.655729813874745e-05, + "loss": 0.1626, + "step": 14135 + }, + { + "epoch": 0.2521314165447865, + "grad_norm": 0.3220154941082001, + "learning_rate": 4.6556509863581496e-05, + "loss": 0.1735, + "step": 14136 + }, + { + "epoch": 0.2521492526665002, + "grad_norm": 0.24585796892642975, + "learning_rate": 4.655572150485508e-05, + "loss": 0.1848, + "step": 14137 + }, + { + "epoch": 0.25216708878821387, + "grad_norm": 0.27170753479003906, + "learning_rate": 4.6554933062571226e-05, + "loss": 0.1278, + "step": 14138 + }, + { + "epoch": 0.2521849249099276, + "grad_norm": 0.2308691442012787, + "learning_rate": 4.655414453673302e-05, + "loss": 0.1817, + "step": 14139 + }, + { + "epoch": 0.2522027610316413, + "grad_norm": 0.30388322472572327, + "learning_rate": 4.65533559273435e-05, + "loss": 0.153, + "step": 14140 + }, + { + "epoch": 0.252220597153355, + "grad_norm": 0.32387250661849976, + "learning_rate": 4.655256723440573e-05, + "loss": 0.1787, + "step": 14141 + }, + { + "epoch": 0.2522384332750687, + "grad_norm": 0.23165884613990784, + "learning_rate": 4.655177845792276e-05, + "loss": 0.1319, + "step": 14142 + }, + { + "epoch": 0.25225626939678236, + "grad_norm": 0.22295813262462616, + "learning_rate": 4.655098959789765e-05, + "loss": 0.2123, + "step": 14143 + }, + { + "epoch": 0.25227410551849605, + "grad_norm": 0.23038272559642792, + "learning_rate": 4.6550200654333474e-05, + "loss": 0.1977, + "step": 14144 + }, + { + "epoch": 0.25229194164020974, + "grad_norm": 0.2907399535179138, + "learning_rate": 4.6549411627233266e-05, + "loss": 0.1679, + "step": 14145 + }, + { + "epoch": 0.2523097777619234, + "grad_norm": 0.26177433133125305, + "learning_rate": 4.6548622516600106e-05, + "loss": 0.1844, + "step": 14146 + }, + { + "epoch": 0.25232761388363717, + "grad_norm": 0.2839551568031311, + "learning_rate": 4.6547833322437036e-05, + "loss": 0.2156, + "step": 14147 + }, + { + "epoch": 0.25234545000535086, + "grad_norm": 0.22857435047626495, + "learning_rate": 4.6547044044747125e-05, + "loss": 0.1947, + "step": 14148 + }, + { + "epoch": 0.25236328612706455, + "grad_norm": 0.3288918137550354, + "learning_rate": 4.6546254683533416e-05, + "loss": 0.1721, + "step": 14149 + }, + { + "epoch": 0.25238112224877823, + "grad_norm": 0.2554515302181244, + "learning_rate": 4.6545465238799e-05, + "loss": 0.2082, + "step": 14150 + }, + { + "epoch": 0.2523989583704919, + "grad_norm": 0.2436758279800415, + "learning_rate": 4.654467571054691e-05, + "loss": 0.1507, + "step": 14151 + }, + { + "epoch": 0.2524167944922056, + "grad_norm": 0.21374143660068512, + "learning_rate": 4.654388609878022e-05, + "loss": 0.1662, + "step": 14152 + }, + { + "epoch": 0.2524346306139193, + "grad_norm": 0.2745634913444519, + "learning_rate": 4.654309640350198e-05, + "loss": 0.1645, + "step": 14153 + }, + { + "epoch": 0.252452466735633, + "grad_norm": 0.2654988467693329, + "learning_rate": 4.654230662471526e-05, + "loss": 0.1494, + "step": 14154 + }, + { + "epoch": 0.2524703028573467, + "grad_norm": 0.2651924788951874, + "learning_rate": 4.654151676242312e-05, + "loss": 0.1743, + "step": 14155 + }, + { + "epoch": 0.2524881389790604, + "grad_norm": 0.2538650333881378, + "learning_rate": 4.654072681662862e-05, + "loss": 0.15, + "step": 14156 + }, + { + "epoch": 0.2525059751007741, + "grad_norm": 0.2426513284444809, + "learning_rate": 4.653993678733483e-05, + "loss": 0.1764, + "step": 14157 + }, + { + "epoch": 0.2525238112224878, + "grad_norm": 0.2860927879810333, + "learning_rate": 4.65391466745448e-05, + "loss": 0.1428, + "step": 14158 + }, + { + "epoch": 0.2525416473442015, + "grad_norm": 0.3454885184764862, + "learning_rate": 4.653835647826159e-05, + "loss": 0.1676, + "step": 14159 + }, + { + "epoch": 0.25255948346591517, + "grad_norm": 0.2563824951648712, + "learning_rate": 4.653756619848828e-05, + "loss": 0.211, + "step": 14160 + }, + { + "epoch": 0.25257731958762886, + "grad_norm": 0.21312856674194336, + "learning_rate": 4.653677583522793e-05, + "loss": 0.1752, + "step": 14161 + }, + { + "epoch": 0.25259515570934254, + "grad_norm": 0.4886600375175476, + "learning_rate": 4.6535985388483586e-05, + "loss": 0.1739, + "step": 14162 + }, + { + "epoch": 0.25261299183105623, + "grad_norm": 0.2462834119796753, + "learning_rate": 4.653519485825833e-05, + "loss": 0.1785, + "step": 14163 + }, + { + "epoch": 0.25263082795277, + "grad_norm": 0.3622860908508301, + "learning_rate": 4.653440424455522e-05, + "loss": 0.2362, + "step": 14164 + }, + { + "epoch": 0.25264866407448366, + "grad_norm": 0.2675084173679352, + "learning_rate": 4.653361354737732e-05, + "loss": 0.2223, + "step": 14165 + }, + { + "epoch": 0.25266650019619735, + "grad_norm": 0.23243822157382965, + "learning_rate": 4.65328227667277e-05, + "loss": 0.1972, + "step": 14166 + }, + { + "epoch": 0.25268433631791104, + "grad_norm": 0.22301937639713287, + "learning_rate": 4.653203190260942e-05, + "loss": 0.1758, + "step": 14167 + }, + { + "epoch": 0.25270217243962473, + "grad_norm": 0.25945422053337097, + "learning_rate": 4.6531240955025544e-05, + "loss": 0.1572, + "step": 14168 + }, + { + "epoch": 0.2527200085613384, + "grad_norm": 0.4287818372249603, + "learning_rate": 4.6530449923979146e-05, + "loss": 0.1891, + "step": 14169 + }, + { + "epoch": 0.2527378446830521, + "grad_norm": 0.28560107946395874, + "learning_rate": 4.6529658809473285e-05, + "loss": 0.191, + "step": 14170 + }, + { + "epoch": 0.2527556808047658, + "grad_norm": 0.30198612809181213, + "learning_rate": 4.6528867611511036e-05, + "loss": 0.1361, + "step": 14171 + }, + { + "epoch": 0.2527735169264795, + "grad_norm": 0.2874630093574524, + "learning_rate": 4.652807633009546e-05, + "loss": 0.18, + "step": 14172 + }, + { + "epoch": 0.2527913530481932, + "grad_norm": 0.4322587549686432, + "learning_rate": 4.652728496522962e-05, + "loss": 0.1552, + "step": 14173 + }, + { + "epoch": 0.2528091891699069, + "grad_norm": 0.23630595207214355, + "learning_rate": 4.6526493516916584e-05, + "loss": 0.1443, + "step": 14174 + }, + { + "epoch": 0.2528270252916206, + "grad_norm": 0.35395070910453796, + "learning_rate": 4.6525701985159433e-05, + "loss": 0.1965, + "step": 14175 + }, + { + "epoch": 0.2528448614133343, + "grad_norm": 0.32523298263549805, + "learning_rate": 4.6524910369961216e-05, + "loss": 0.2032, + "step": 14176 + }, + { + "epoch": 0.252862697535048, + "grad_norm": 0.18269342184066772, + "learning_rate": 4.652411867132502e-05, + "loss": 0.1205, + "step": 14177 + }, + { + "epoch": 0.25288053365676166, + "grad_norm": 0.24428625404834747, + "learning_rate": 4.652332688925391e-05, + "loss": 0.1807, + "step": 14178 + }, + { + "epoch": 0.25289836977847535, + "grad_norm": 0.2848491966724396, + "learning_rate": 4.652253502375095e-05, + "loss": 0.1517, + "step": 14179 + }, + { + "epoch": 0.25291620590018904, + "grad_norm": 0.3086334764957428, + "learning_rate": 4.65217430748192e-05, + "loss": 0.2058, + "step": 14180 + }, + { + "epoch": 0.2529340420219028, + "grad_norm": 0.3114648461341858, + "learning_rate": 4.6520951042461745e-05, + "loss": 0.2672, + "step": 14181 + }, + { + "epoch": 0.25295187814361647, + "grad_norm": 0.2515789568424225, + "learning_rate": 4.652015892668166e-05, + "loss": 0.1688, + "step": 14182 + }, + { + "epoch": 0.25296971426533016, + "grad_norm": 0.21046918630599976, + "learning_rate": 4.651936672748199e-05, + "loss": 0.1516, + "step": 14183 + }, + { + "epoch": 0.25298755038704385, + "grad_norm": 0.3786681294441223, + "learning_rate": 4.651857444486583e-05, + "loss": 0.1811, + "step": 14184 + }, + { + "epoch": 0.25300538650875753, + "grad_norm": 0.29727619886398315, + "learning_rate": 4.6517782078836244e-05, + "loss": 0.2105, + "step": 14185 + }, + { + "epoch": 0.2530232226304712, + "grad_norm": 0.26006007194519043, + "learning_rate": 4.65169896293963e-05, + "loss": 0.1873, + "step": 14186 + }, + { + "epoch": 0.2530410587521849, + "grad_norm": 0.2879183888435364, + "learning_rate": 4.6516197096549076e-05, + "loss": 0.1966, + "step": 14187 + }, + { + "epoch": 0.2530588948738986, + "grad_norm": 0.2707684636116028, + "learning_rate": 4.651540448029764e-05, + "loss": 0.2031, + "step": 14188 + }, + { + "epoch": 0.2530767309956123, + "grad_norm": 0.26511630415916443, + "learning_rate": 4.6514611780645067e-05, + "loss": 0.1419, + "step": 14189 + }, + { + "epoch": 0.25309456711732603, + "grad_norm": 0.38170453906059265, + "learning_rate": 4.651381899759442e-05, + "loss": 0.1947, + "step": 14190 + }, + { + "epoch": 0.2531124032390397, + "grad_norm": 0.2836182415485382, + "learning_rate": 4.6513026131148786e-05, + "loss": 0.1683, + "step": 14191 + }, + { + "epoch": 0.2531302393607534, + "grad_norm": 0.2315625697374344, + "learning_rate": 4.651223318131123e-05, + "loss": 0.1733, + "step": 14192 + }, + { + "epoch": 0.2531480754824671, + "grad_norm": 0.2781814634799957, + "learning_rate": 4.651144014808483e-05, + "loss": 0.1726, + "step": 14193 + }, + { + "epoch": 0.2531659116041808, + "grad_norm": 0.27597060799598694, + "learning_rate": 4.651064703147266e-05, + "loss": 0.1941, + "step": 14194 + }, + { + "epoch": 0.25318374772589447, + "grad_norm": 0.2447982281446457, + "learning_rate": 4.650985383147779e-05, + "loss": 0.1858, + "step": 14195 + }, + { + "epoch": 0.25320158384760816, + "grad_norm": 0.2601865231990814, + "learning_rate": 4.65090605481033e-05, + "loss": 0.1952, + "step": 14196 + }, + { + "epoch": 0.25321941996932185, + "grad_norm": 0.26416856050491333, + "learning_rate": 4.650826718135226e-05, + "loss": 0.1723, + "step": 14197 + }, + { + "epoch": 0.2532372560910356, + "grad_norm": 0.299405962228775, + "learning_rate": 4.650747373122775e-05, + "loss": 0.2659, + "step": 14198 + }, + { + "epoch": 0.2532550922127493, + "grad_norm": 0.2368491291999817, + "learning_rate": 4.650668019773283e-05, + "loss": 0.18, + "step": 14199 + }, + { + "epoch": 0.25327292833446297, + "grad_norm": 0.375325471162796, + "learning_rate": 4.6505886580870604e-05, + "loss": 0.1724, + "step": 14200 + }, + { + "epoch": 0.25329076445617665, + "grad_norm": 0.22692859172821045, + "learning_rate": 4.650509288064413e-05, + "loss": 0.1803, + "step": 14201 + }, + { + "epoch": 0.25330860057789034, + "grad_norm": 0.29675620794296265, + "learning_rate": 4.650429909705649e-05, + "loss": 0.2412, + "step": 14202 + }, + { + "epoch": 0.25332643669960403, + "grad_norm": 0.33006274700164795, + "learning_rate": 4.650350523011076e-05, + "loss": 0.1499, + "step": 14203 + }, + { + "epoch": 0.2533442728213177, + "grad_norm": 0.30536776781082153, + "learning_rate": 4.650271127981001e-05, + "loss": 0.2154, + "step": 14204 + }, + { + "epoch": 0.2533621089430314, + "grad_norm": 0.3404679000377655, + "learning_rate": 4.650191724615733e-05, + "loss": 0.1753, + "step": 14205 + }, + { + "epoch": 0.25337994506474515, + "grad_norm": 0.20536650717258453, + "learning_rate": 4.650112312915579e-05, + "loss": 0.1947, + "step": 14206 + }, + { + "epoch": 0.25339778118645884, + "grad_norm": 0.2819133698940277, + "learning_rate": 4.650032892880847e-05, + "loss": 0.1656, + "step": 14207 + }, + { + "epoch": 0.2534156173081725, + "grad_norm": 0.29955440759658813, + "learning_rate": 4.649953464511845e-05, + "loss": 0.1229, + "step": 14208 + }, + { + "epoch": 0.2534334534298862, + "grad_norm": 0.293802410364151, + "learning_rate": 4.6498740278088816e-05, + "loss": 0.1391, + "step": 14209 + }, + { + "epoch": 0.2534512895515999, + "grad_norm": 0.2326391488313675, + "learning_rate": 4.6497945827722626e-05, + "loss": 0.1332, + "step": 14210 + }, + { + "epoch": 0.2534691256733136, + "grad_norm": 0.2509083151817322, + "learning_rate": 4.6497151294022976e-05, + "loss": 0.1685, + "step": 14211 + }, + { + "epoch": 0.2534869617950273, + "grad_norm": 0.43427351117134094, + "learning_rate": 4.6496356676992944e-05, + "loss": 0.2319, + "step": 14212 + }, + { + "epoch": 0.25350479791674096, + "grad_norm": 0.2087259441614151, + "learning_rate": 4.649556197663562e-05, + "loss": 0.1308, + "step": 14213 + }, + { + "epoch": 0.25352263403845465, + "grad_norm": 0.2480999231338501, + "learning_rate": 4.649476719295406e-05, + "loss": 0.1598, + "step": 14214 + }, + { + "epoch": 0.2535404701601684, + "grad_norm": 0.22258156538009644, + "learning_rate": 4.6493972325951366e-05, + "loss": 0.1759, + "step": 14215 + }, + { + "epoch": 0.2535583062818821, + "grad_norm": 0.38022181391716003, + "learning_rate": 4.6493177375630605e-05, + "loss": 0.1472, + "step": 14216 + }, + { + "epoch": 0.2535761424035958, + "grad_norm": 0.1586291342973709, + "learning_rate": 4.6492382341994865e-05, + "loss": 0.1396, + "step": 14217 + }, + { + "epoch": 0.25359397852530946, + "grad_norm": 0.3604293465614319, + "learning_rate": 4.6491587225047227e-05, + "loss": 0.1781, + "step": 14218 + }, + { + "epoch": 0.25361181464702315, + "grad_norm": 0.28630000352859497, + "learning_rate": 4.649079202479078e-05, + "loss": 0.2377, + "step": 14219 + }, + { + "epoch": 0.25362965076873684, + "grad_norm": 0.2291271686553955, + "learning_rate": 4.64899967412286e-05, + "loss": 0.1684, + "step": 14220 + }, + { + "epoch": 0.2536474868904505, + "grad_norm": 0.2845189869403839, + "learning_rate": 4.6489201374363766e-05, + "loss": 0.1737, + "step": 14221 + }, + { + "epoch": 0.2536653230121642, + "grad_norm": 0.25370433926582336, + "learning_rate": 4.6488405924199364e-05, + "loss": 0.2223, + "step": 14222 + }, + { + "epoch": 0.25368315913387796, + "grad_norm": 0.27321720123291016, + "learning_rate": 4.6487610390738487e-05, + "loss": 0.1486, + "step": 14223 + }, + { + "epoch": 0.25370099525559164, + "grad_norm": 0.3075837790966034, + "learning_rate": 4.6486814773984204e-05, + "loss": 0.2256, + "step": 14224 + }, + { + "epoch": 0.25371883137730533, + "grad_norm": 0.2625052034854889, + "learning_rate": 4.648601907393961e-05, + "loss": 0.2179, + "step": 14225 + }, + { + "epoch": 0.253736667499019, + "grad_norm": 0.25153347849845886, + "learning_rate": 4.6485223290607785e-05, + "loss": 0.188, + "step": 14226 + }, + { + "epoch": 0.2537545036207327, + "grad_norm": 0.3153325319290161, + "learning_rate": 4.648442742399181e-05, + "loss": 0.2053, + "step": 14227 + }, + { + "epoch": 0.2537723397424464, + "grad_norm": 0.3147667348384857, + "learning_rate": 4.648363147409477e-05, + "loss": 0.2391, + "step": 14228 + }, + { + "epoch": 0.2537901758641601, + "grad_norm": 0.3161238729953766, + "learning_rate": 4.648283544091976e-05, + "loss": 0.2056, + "step": 14229 + }, + { + "epoch": 0.25380801198587377, + "grad_norm": 0.22965151071548462, + "learning_rate": 4.648203932446986e-05, + "loss": 0.1516, + "step": 14230 + }, + { + "epoch": 0.25382584810758746, + "grad_norm": 0.2387286126613617, + "learning_rate": 4.6481243124748155e-05, + "loss": 0.1715, + "step": 14231 + }, + { + "epoch": 0.2538436842293012, + "grad_norm": 0.24368348717689514, + "learning_rate": 4.648044684175773e-05, + "loss": 0.1908, + "step": 14232 + }, + { + "epoch": 0.2538615203510149, + "grad_norm": 0.2694433927536011, + "learning_rate": 4.647965047550168e-05, + "loss": 0.191, + "step": 14233 + }, + { + "epoch": 0.2538793564727286, + "grad_norm": 0.2929813861846924, + "learning_rate": 4.6478854025983075e-05, + "loss": 0.2187, + "step": 14234 + }, + { + "epoch": 0.25389719259444227, + "grad_norm": 0.2125709056854248, + "learning_rate": 4.6478057493205026e-05, + "loss": 0.1898, + "step": 14235 + }, + { + "epoch": 0.25391502871615595, + "grad_norm": 0.36264047026634216, + "learning_rate": 4.64772608771706e-05, + "loss": 0.2234, + "step": 14236 + }, + { + "epoch": 0.25393286483786964, + "grad_norm": 0.2551672160625458, + "learning_rate": 4.647646417788289e-05, + "loss": 0.1913, + "step": 14237 + }, + { + "epoch": 0.25395070095958333, + "grad_norm": 0.2613697946071625, + "learning_rate": 4.6475667395344994e-05, + "loss": 0.169, + "step": 14238 + }, + { + "epoch": 0.253968537081297, + "grad_norm": 0.4206896424293518, + "learning_rate": 4.647487052955999e-05, + "loss": 0.2123, + "step": 14239 + }, + { + "epoch": 0.25398637320301076, + "grad_norm": 0.34507039189338684, + "learning_rate": 4.647407358053097e-05, + "loss": 0.2134, + "step": 14240 + }, + { + "epoch": 0.25400420932472445, + "grad_norm": 0.3244677484035492, + "learning_rate": 4.647327654826104e-05, + "loss": 0.1939, + "step": 14241 + }, + { + "epoch": 0.25402204544643814, + "grad_norm": 0.29289910197257996, + "learning_rate": 4.6472479432753255e-05, + "loss": 0.1963, + "step": 14242 + }, + { + "epoch": 0.2540398815681518, + "grad_norm": 0.2476482391357422, + "learning_rate": 4.647168223401073e-05, + "loss": 0.1624, + "step": 14243 + }, + { + "epoch": 0.2540577176898655, + "grad_norm": 0.24047306180000305, + "learning_rate": 4.6470884952036544e-05, + "loss": 0.2102, + "step": 14244 + }, + { + "epoch": 0.2540755538115792, + "grad_norm": 0.33967599272727966, + "learning_rate": 4.6470087586833796e-05, + "loss": 0.2317, + "step": 14245 + }, + { + "epoch": 0.2540933899332929, + "grad_norm": 0.24407899379730225, + "learning_rate": 4.646929013840556e-05, + "loss": 0.1793, + "step": 14246 + }, + { + "epoch": 0.2541112260550066, + "grad_norm": 0.29077231884002686, + "learning_rate": 4.646849260675496e-05, + "loss": 0.1998, + "step": 14247 + }, + { + "epoch": 0.25412906217672027, + "grad_norm": 0.34494513273239136, + "learning_rate": 4.646769499188506e-05, + "loss": 0.1575, + "step": 14248 + }, + { + "epoch": 0.254146898298434, + "grad_norm": 0.6917784810066223, + "learning_rate": 4.646689729379896e-05, + "loss": 0.1663, + "step": 14249 + }, + { + "epoch": 0.2541647344201477, + "grad_norm": 0.23537231981754303, + "learning_rate": 4.646609951249975e-05, + "loss": 0.1447, + "step": 14250 + }, + { + "epoch": 0.2541825705418614, + "grad_norm": 0.2726292908191681, + "learning_rate": 4.6465301647990525e-05, + "loss": 0.1618, + "step": 14251 + }, + { + "epoch": 0.2542004066635751, + "grad_norm": 0.22526873648166656, + "learning_rate": 4.6464503700274376e-05, + "loss": 0.2046, + "step": 14252 + }, + { + "epoch": 0.25421824278528876, + "grad_norm": 0.23197723925113678, + "learning_rate": 4.64637056693544e-05, + "loss": 0.1549, + "step": 14253 + }, + { + "epoch": 0.25423607890700245, + "grad_norm": 0.27988550066947937, + "learning_rate": 4.646290755523368e-05, + "loss": 0.1789, + "step": 14254 + }, + { + "epoch": 0.25425391502871614, + "grad_norm": 0.32018154859542847, + "learning_rate": 4.646210935791533e-05, + "loss": 0.2266, + "step": 14255 + }, + { + "epoch": 0.2542717511504298, + "grad_norm": 0.24952074885368347, + "learning_rate": 4.646131107740242e-05, + "loss": 0.1817, + "step": 14256 + }, + { + "epoch": 0.25428958727214357, + "grad_norm": 0.31356120109558105, + "learning_rate": 4.6460512713698055e-05, + "loss": 0.1792, + "step": 14257 + }, + { + "epoch": 0.25430742339385726, + "grad_norm": 0.37971311807632446, + "learning_rate": 4.6459714266805346e-05, + "loss": 0.2357, + "step": 14258 + }, + { + "epoch": 0.25432525951557095, + "grad_norm": 0.3118349015712738, + "learning_rate": 4.645891573672736e-05, + "loss": 0.2425, + "step": 14259 + }, + { + "epoch": 0.25434309563728463, + "grad_norm": 0.25742360949516296, + "learning_rate": 4.64581171234672e-05, + "loss": 0.195, + "step": 14260 + }, + { + "epoch": 0.2543609317589983, + "grad_norm": 0.2648634910583496, + "learning_rate": 4.6457318427027977e-05, + "loss": 0.1821, + "step": 14261 + }, + { + "epoch": 0.254378767880712, + "grad_norm": 0.2841801047325134, + "learning_rate": 4.645651964741277e-05, + "loss": 0.1772, + "step": 14262 + }, + { + "epoch": 0.2543966040024257, + "grad_norm": 0.41761597990989685, + "learning_rate": 4.645572078462469e-05, + "loss": 0.1754, + "step": 14263 + }, + { + "epoch": 0.2544144401241394, + "grad_norm": 0.22964833676815033, + "learning_rate": 4.645492183866682e-05, + "loss": 0.1489, + "step": 14264 + }, + { + "epoch": 0.25443227624585313, + "grad_norm": 0.4118020832538605, + "learning_rate": 4.645412280954226e-05, + "loss": 0.2494, + "step": 14265 + }, + { + "epoch": 0.2544501123675668, + "grad_norm": 0.35143670439720154, + "learning_rate": 4.645332369725411e-05, + "loss": 0.2322, + "step": 14266 + }, + { + "epoch": 0.2544679484892805, + "grad_norm": 0.29790523648262024, + "learning_rate": 4.6452524501805474e-05, + "loss": 0.2103, + "step": 14267 + }, + { + "epoch": 0.2544857846109942, + "grad_norm": 0.26203373074531555, + "learning_rate": 4.6451725223199446e-05, + "loss": 0.1729, + "step": 14268 + }, + { + "epoch": 0.2545036207327079, + "grad_norm": 0.2693018317222595, + "learning_rate": 4.645092586143911e-05, + "loss": 0.2157, + "step": 14269 + }, + { + "epoch": 0.25452145685442157, + "grad_norm": 0.2562446892261505, + "learning_rate": 4.645012641652759e-05, + "loss": 0.193, + "step": 14270 + }, + { + "epoch": 0.25453929297613526, + "grad_norm": 0.2537136673927307, + "learning_rate": 4.6449326888467956e-05, + "loss": 0.1669, + "step": 14271 + }, + { + "epoch": 0.25455712909784894, + "grad_norm": 0.2820917069911957, + "learning_rate": 4.6448527277263335e-05, + "loss": 0.2135, + "step": 14272 + }, + { + "epoch": 0.25457496521956263, + "grad_norm": 0.3061208128929138, + "learning_rate": 4.644772758291681e-05, + "loss": 0.1796, + "step": 14273 + }, + { + "epoch": 0.2545928013412764, + "grad_norm": 0.32747864723205566, + "learning_rate": 4.6446927805431484e-05, + "loss": 0.182, + "step": 14274 + }, + { + "epoch": 0.25461063746299006, + "grad_norm": 0.34844040870666504, + "learning_rate": 4.6446127944810456e-05, + "loss": 0.2114, + "step": 14275 + }, + { + "epoch": 0.25462847358470375, + "grad_norm": 0.3345884680747986, + "learning_rate": 4.644532800105684e-05, + "loss": 0.1389, + "step": 14276 + }, + { + "epoch": 0.25464630970641744, + "grad_norm": 0.2675981819629669, + "learning_rate": 4.644452797417371e-05, + "loss": 0.184, + "step": 14277 + }, + { + "epoch": 0.2546641458281311, + "grad_norm": 0.2570849657058716, + "learning_rate": 4.644372786416419e-05, + "loss": 0.2201, + "step": 14278 + }, + { + "epoch": 0.2546819819498448, + "grad_norm": 0.8513707518577576, + "learning_rate": 4.6442927671031376e-05, + "loss": 0.4233, + "step": 14279 + }, + { + "epoch": 0.2546998180715585, + "grad_norm": 0.23098179697990417, + "learning_rate": 4.644212739477837e-05, + "loss": 0.1789, + "step": 14280 + }, + { + "epoch": 0.2547176541932722, + "grad_norm": 0.27836576104164124, + "learning_rate": 4.6441327035408274e-05, + "loss": 0.1846, + "step": 14281 + }, + { + "epoch": 0.25473549031498594, + "grad_norm": 0.34817731380462646, + "learning_rate": 4.644052659292418e-05, + "loss": 0.2512, + "step": 14282 + }, + { + "epoch": 0.2547533264366996, + "grad_norm": 0.24810580909252167, + "learning_rate": 4.6439726067329205e-05, + "loss": 0.1333, + "step": 14283 + }, + { + "epoch": 0.2547711625584133, + "grad_norm": 0.2629035413265228, + "learning_rate": 4.6438925458626445e-05, + "loss": 0.1593, + "step": 14284 + }, + { + "epoch": 0.254788998680127, + "grad_norm": 0.2998231053352356, + "learning_rate": 4.6438124766819006e-05, + "loss": 0.1861, + "step": 14285 + }, + { + "epoch": 0.2548068348018407, + "grad_norm": 0.3157520294189453, + "learning_rate": 4.643732399190999e-05, + "loss": 0.1973, + "step": 14286 + }, + { + "epoch": 0.2548246709235544, + "grad_norm": 0.30597442388534546, + "learning_rate": 4.643652313390251e-05, + "loss": 0.2054, + "step": 14287 + }, + { + "epoch": 0.25484250704526806, + "grad_norm": 0.2914910316467285, + "learning_rate": 4.643572219279965e-05, + "loss": 0.1648, + "step": 14288 + }, + { + "epoch": 0.25486034316698175, + "grad_norm": 0.24053938686847687, + "learning_rate": 4.643492116860453e-05, + "loss": 0.2136, + "step": 14289 + }, + { + "epoch": 0.25487817928869544, + "grad_norm": 0.29190152883529663, + "learning_rate": 4.643412006132026e-05, + "loss": 0.2144, + "step": 14290 + }, + { + "epoch": 0.2548960154104092, + "grad_norm": 0.3769576847553253, + "learning_rate": 4.643331887094993e-05, + "loss": 0.289, + "step": 14291 + }, + { + "epoch": 0.25491385153212287, + "grad_norm": 0.26358842849731445, + "learning_rate": 4.6432517597496654e-05, + "loss": 0.1982, + "step": 14292 + }, + { + "epoch": 0.25493168765383656, + "grad_norm": 0.26545488834381104, + "learning_rate": 4.643171624096354e-05, + "loss": 0.1866, + "step": 14293 + }, + { + "epoch": 0.25494952377555025, + "grad_norm": 0.2900102138519287, + "learning_rate": 4.643091480135369e-05, + "loss": 0.1699, + "step": 14294 + }, + { + "epoch": 0.25496735989726393, + "grad_norm": 0.33532676100730896, + "learning_rate": 4.643011327867021e-05, + "loss": 0.2373, + "step": 14295 + }, + { + "epoch": 0.2549851960189776, + "grad_norm": 0.40710389614105225, + "learning_rate": 4.6429311672916214e-05, + "loss": 0.2046, + "step": 14296 + }, + { + "epoch": 0.2550030321406913, + "grad_norm": 0.21490029990673065, + "learning_rate": 4.642850998409481e-05, + "loss": 0.1755, + "step": 14297 + }, + { + "epoch": 0.255020868262405, + "grad_norm": 0.2678086459636688, + "learning_rate": 4.6427708212209087e-05, + "loss": 0.2155, + "step": 14298 + }, + { + "epoch": 0.25503870438411874, + "grad_norm": 0.24348019063472748, + "learning_rate": 4.642690635726217e-05, + "loss": 0.2043, + "step": 14299 + }, + { + "epoch": 0.25505654050583243, + "grad_norm": 0.35054877400398254, + "learning_rate": 4.642610441925717e-05, + "loss": 0.1627, + "step": 14300 + }, + { + "epoch": 0.2550743766275461, + "grad_norm": 0.3422543704509735, + "learning_rate": 4.642530239819718e-05, + "loss": 0.2355, + "step": 14301 + }, + { + "epoch": 0.2550922127492598, + "grad_norm": 0.2594767212867737, + "learning_rate": 4.6424500294085315e-05, + "loss": 0.1627, + "step": 14302 + }, + { + "epoch": 0.2551100488709735, + "grad_norm": 0.23776432871818542, + "learning_rate": 4.64236981069247e-05, + "loss": 0.1653, + "step": 14303 + }, + { + "epoch": 0.2551278849926872, + "grad_norm": 0.39940017461776733, + "learning_rate": 4.642289583671842e-05, + "loss": 0.205, + "step": 14304 + }, + { + "epoch": 0.25514572111440087, + "grad_norm": 0.2671494483947754, + "learning_rate": 4.6422093483469606e-05, + "loss": 0.1934, + "step": 14305 + }, + { + "epoch": 0.25516355723611456, + "grad_norm": 0.42209678888320923, + "learning_rate": 4.642129104718135e-05, + "loss": 0.1695, + "step": 14306 + }, + { + "epoch": 0.2551813933578283, + "grad_norm": 0.36727413535118103, + "learning_rate": 4.642048852785678e-05, + "loss": 0.2219, + "step": 14307 + }, + { + "epoch": 0.255199229479542, + "grad_norm": 0.21790213882923126, + "learning_rate": 4.641968592549899e-05, + "loss": 0.1695, + "step": 14308 + }, + { + "epoch": 0.2552170656012557, + "grad_norm": 0.3148146867752075, + "learning_rate": 4.64188832401111e-05, + "loss": 0.2148, + "step": 14309 + }, + { + "epoch": 0.25523490172296937, + "grad_norm": 0.23866064846515656, + "learning_rate": 4.641808047169623e-05, + "loss": 0.1901, + "step": 14310 + }, + { + "epoch": 0.25525273784468305, + "grad_norm": 0.33334094285964966, + "learning_rate": 4.641727762025747e-05, + "loss": 0.1811, + "step": 14311 + }, + { + "epoch": 0.25527057396639674, + "grad_norm": 0.385278582572937, + "learning_rate": 4.641647468579795e-05, + "loss": 0.2473, + "step": 14312 + }, + { + "epoch": 0.25528841008811043, + "grad_norm": 0.1848461627960205, + "learning_rate": 4.6415671668320784e-05, + "loss": 0.1515, + "step": 14313 + }, + { + "epoch": 0.2553062462098241, + "grad_norm": 0.29272663593292236, + "learning_rate": 4.6414868567829076e-05, + "loss": 0.2557, + "step": 14314 + }, + { + "epoch": 0.2553240823315378, + "grad_norm": 0.3228347599506378, + "learning_rate": 4.641406538432593e-05, + "loss": 0.1904, + "step": 14315 + }, + { + "epoch": 0.25534191845325155, + "grad_norm": 0.3674481511116028, + "learning_rate": 4.641326211781448e-05, + "loss": 0.2122, + "step": 14316 + }, + { + "epoch": 0.25535975457496524, + "grad_norm": 0.26000264286994934, + "learning_rate": 4.641245876829783e-05, + "loss": 0.1987, + "step": 14317 + }, + { + "epoch": 0.2553775906966789, + "grad_norm": 0.252028226852417, + "learning_rate": 4.6411655335779085e-05, + "loss": 0.1511, + "step": 14318 + }, + { + "epoch": 0.2553954268183926, + "grad_norm": 0.23151366412639618, + "learning_rate": 4.641085182026138e-05, + "loss": 0.1802, + "step": 14319 + }, + { + "epoch": 0.2554132629401063, + "grad_norm": 0.6375302672386169, + "learning_rate": 4.6410048221747814e-05, + "loss": 0.1879, + "step": 14320 + }, + { + "epoch": 0.25543109906182, + "grad_norm": 0.2893012762069702, + "learning_rate": 4.6409244540241507e-05, + "loss": 0.2059, + "step": 14321 + }, + { + "epoch": 0.2554489351835337, + "grad_norm": 0.2172849476337433, + "learning_rate": 4.640844077574557e-05, + "loss": 0.1663, + "step": 14322 + }, + { + "epoch": 0.25546677130524736, + "grad_norm": 0.3070853650569916, + "learning_rate": 4.640763692826312e-05, + "loss": 0.1468, + "step": 14323 + }, + { + "epoch": 0.2554846074269611, + "grad_norm": 0.33351191878318787, + "learning_rate": 4.6406832997797275e-05, + "loss": 0.2312, + "step": 14324 + }, + { + "epoch": 0.2555024435486748, + "grad_norm": 0.2882409691810608, + "learning_rate": 4.640602898435116e-05, + "loss": 0.1617, + "step": 14325 + }, + { + "epoch": 0.2555202796703885, + "grad_norm": 0.29604238271713257, + "learning_rate": 4.640522488792788e-05, + "loss": 0.2165, + "step": 14326 + }, + { + "epoch": 0.25553811579210217, + "grad_norm": 0.9195283055305481, + "learning_rate": 4.640442070853056e-05, + "loss": 0.1609, + "step": 14327 + }, + { + "epoch": 0.25555595191381586, + "grad_norm": 0.3302413821220398, + "learning_rate": 4.64036164461623e-05, + "loss": 0.197, + "step": 14328 + }, + { + "epoch": 0.25557378803552955, + "grad_norm": 0.2042795717716217, + "learning_rate": 4.6402812100826243e-05, + "loss": 0.145, + "step": 14329 + }, + { + "epoch": 0.25559162415724324, + "grad_norm": 0.27050572633743286, + "learning_rate": 4.640200767252549e-05, + "loss": 0.229, + "step": 14330 + }, + { + "epoch": 0.2556094602789569, + "grad_norm": 0.2793087661266327, + "learning_rate": 4.640120316126316e-05, + "loss": 0.1239, + "step": 14331 + }, + { + "epoch": 0.2556272964006706, + "grad_norm": 0.19903309643268585, + "learning_rate": 4.640039856704238e-05, + "loss": 0.1472, + "step": 14332 + }, + { + "epoch": 0.25564513252238436, + "grad_norm": 0.26694461703300476, + "learning_rate": 4.6399593889866254e-05, + "loss": 0.1934, + "step": 14333 + }, + { + "epoch": 0.25566296864409804, + "grad_norm": 0.2252333164215088, + "learning_rate": 4.639878912973792e-05, + "loss": 0.1846, + "step": 14334 + }, + { + "epoch": 0.25568080476581173, + "grad_norm": 0.24357199668884277, + "learning_rate": 4.639798428666049e-05, + "loss": 0.1945, + "step": 14335 + }, + { + "epoch": 0.2556986408875254, + "grad_norm": 0.26161786913871765, + "learning_rate": 4.639717936063707e-05, + "loss": 0.2121, + "step": 14336 + }, + { + "epoch": 0.2557164770092391, + "grad_norm": 0.4109809696674347, + "learning_rate": 4.6396374351670804e-05, + "loss": 0.2219, + "step": 14337 + }, + { + "epoch": 0.2557343131309528, + "grad_norm": 0.26998665928840637, + "learning_rate": 4.63955692597648e-05, + "loss": 0.221, + "step": 14338 + }, + { + "epoch": 0.2557521492526665, + "grad_norm": 0.18102248013019562, + "learning_rate": 4.639476408492217e-05, + "loss": 0.1586, + "step": 14339 + }, + { + "epoch": 0.25576998537438017, + "grad_norm": 0.18463407456874847, + "learning_rate": 4.639395882714606e-05, + "loss": 0.1348, + "step": 14340 + }, + { + "epoch": 0.2557878214960939, + "grad_norm": 0.2712778151035309, + "learning_rate": 4.639315348643957e-05, + "loss": 0.1969, + "step": 14341 + }, + { + "epoch": 0.2558056576178076, + "grad_norm": 0.24200566112995148, + "learning_rate": 4.6392348062805824e-05, + "loss": 0.1914, + "step": 14342 + }, + { + "epoch": 0.2558234937395213, + "grad_norm": 0.30416885018348694, + "learning_rate": 4.6391542556247945e-05, + "loss": 0.1787, + "step": 14343 + }, + { + "epoch": 0.255841329861235, + "grad_norm": 0.3629536032676697, + "learning_rate": 4.6390736966769065e-05, + "loss": 0.2515, + "step": 14344 + }, + { + "epoch": 0.25585916598294867, + "grad_norm": 0.3306543529033661, + "learning_rate": 4.63899312943723e-05, + "loss": 0.2223, + "step": 14345 + }, + { + "epoch": 0.25587700210466235, + "grad_norm": 0.24814771115779877, + "learning_rate": 4.6389125539060774e-05, + "loss": 0.1712, + "step": 14346 + }, + { + "epoch": 0.25589483822637604, + "grad_norm": 0.28987741470336914, + "learning_rate": 4.638831970083761e-05, + "loss": 0.2323, + "step": 14347 + }, + { + "epoch": 0.25591267434808973, + "grad_norm": 0.239081472158432, + "learning_rate": 4.638751377970593e-05, + "loss": 0.1363, + "step": 14348 + }, + { + "epoch": 0.2559305104698034, + "grad_norm": 0.24164652824401855, + "learning_rate": 4.6386707775668856e-05, + "loss": 0.1956, + "step": 14349 + }, + { + "epoch": 0.25594834659151716, + "grad_norm": 0.2343667596578598, + "learning_rate": 4.6385901688729525e-05, + "loss": 0.1861, + "step": 14350 + }, + { + "epoch": 0.25596618271323085, + "grad_norm": 0.27585670351982117, + "learning_rate": 4.6385095518891046e-05, + "loss": 0.1813, + "step": 14351 + }, + { + "epoch": 0.25598401883494454, + "grad_norm": 0.29659050703048706, + "learning_rate": 4.6384289266156555e-05, + "loss": 0.1842, + "step": 14352 + }, + { + "epoch": 0.2560018549566582, + "grad_norm": 0.2685989737510681, + "learning_rate": 4.6383482930529164e-05, + "loss": 0.1583, + "step": 14353 + }, + { + "epoch": 0.2560196910783719, + "grad_norm": 0.292593777179718, + "learning_rate": 4.6382676512012016e-05, + "loss": 0.2134, + "step": 14354 + }, + { + "epoch": 0.2560375272000856, + "grad_norm": 0.3552425503730774, + "learning_rate": 4.638187001060823e-05, + "loss": 0.2069, + "step": 14355 + }, + { + "epoch": 0.2560553633217993, + "grad_norm": 0.23550207912921906, + "learning_rate": 4.6381063426320926e-05, + "loss": 0.151, + "step": 14356 + }, + { + "epoch": 0.256073199443513, + "grad_norm": 0.31950822472572327, + "learning_rate": 4.638025675915323e-05, + "loss": 0.1915, + "step": 14357 + }, + { + "epoch": 0.2560910355652267, + "grad_norm": 0.2374105006456375, + "learning_rate": 4.637945000910828e-05, + "loss": 0.1816, + "step": 14358 + }, + { + "epoch": 0.2561088716869404, + "grad_norm": 0.3021567165851593, + "learning_rate": 4.637864317618921e-05, + "loss": 0.1837, + "step": 14359 + }, + { + "epoch": 0.2561267078086541, + "grad_norm": 0.26406076550483704, + "learning_rate": 4.637783626039912e-05, + "loss": 0.2214, + "step": 14360 + }, + { + "epoch": 0.2561445439303678, + "grad_norm": 0.31598490476608276, + "learning_rate": 4.6377029261741156e-05, + "loss": 0.214, + "step": 14361 + }, + { + "epoch": 0.2561623800520815, + "grad_norm": 0.3069753050804138, + "learning_rate": 4.637622218021844e-05, + "loss": 0.1505, + "step": 14362 + }, + { + "epoch": 0.25618021617379516, + "grad_norm": 0.3438122272491455, + "learning_rate": 4.637541501583411e-05, + "loss": 0.2167, + "step": 14363 + }, + { + "epoch": 0.25619805229550885, + "grad_norm": 0.24852631986141205, + "learning_rate": 4.637460776859128e-05, + "loss": 0.1926, + "step": 14364 + }, + { + "epoch": 0.25621588841722254, + "grad_norm": 0.20911794900894165, + "learning_rate": 4.63738004384931e-05, + "loss": 0.1647, + "step": 14365 + }, + { + "epoch": 0.2562337245389363, + "grad_norm": 0.21022996306419373, + "learning_rate": 4.637299302554268e-05, + "loss": 0.1727, + "step": 14366 + }, + { + "epoch": 0.25625156066064997, + "grad_norm": 0.42533010244369507, + "learning_rate": 4.6372185529743155e-05, + "loss": 0.1711, + "step": 14367 + }, + { + "epoch": 0.25626939678236366, + "grad_norm": 0.2320965677499771, + "learning_rate": 4.637137795109766e-05, + "loss": 0.1642, + "step": 14368 + }, + { + "epoch": 0.25628723290407734, + "grad_norm": 0.27223172783851624, + "learning_rate": 4.6370570289609324e-05, + "loss": 0.128, + "step": 14369 + }, + { + "epoch": 0.25630506902579103, + "grad_norm": 0.23713983595371246, + "learning_rate": 4.636976254528127e-05, + "loss": 0.1496, + "step": 14370 + }, + { + "epoch": 0.2563229051475047, + "grad_norm": 0.28813818097114563, + "learning_rate": 4.6368954718116644e-05, + "loss": 0.1962, + "step": 14371 + }, + { + "epoch": 0.2563407412692184, + "grad_norm": 0.39149922132492065, + "learning_rate": 4.6368146808118566e-05, + "loss": 0.2297, + "step": 14372 + }, + { + "epoch": 0.2563585773909321, + "grad_norm": 0.24483299255371094, + "learning_rate": 4.6367338815290174e-05, + "loss": 0.2152, + "step": 14373 + }, + { + "epoch": 0.2563764135126458, + "grad_norm": 0.2821146249771118, + "learning_rate": 4.636653073963459e-05, + "loss": 0.2063, + "step": 14374 + }, + { + "epoch": 0.25639424963435953, + "grad_norm": 0.25984206795692444, + "learning_rate": 4.636572258115496e-05, + "loss": 0.1158, + "step": 14375 + }, + { + "epoch": 0.2564120857560732, + "grad_norm": 0.34578901529312134, + "learning_rate": 4.6364914339854405e-05, + "loss": 0.2325, + "step": 14376 + }, + { + "epoch": 0.2564299218777869, + "grad_norm": 0.3061266839504242, + "learning_rate": 4.636410601573606e-05, + "loss": 0.1775, + "step": 14377 + }, + { + "epoch": 0.2564477579995006, + "grad_norm": 0.2818215787410736, + "learning_rate": 4.636329760880306e-05, + "loss": 0.175, + "step": 14378 + }, + { + "epoch": 0.2564655941212143, + "grad_norm": 0.3151984214782715, + "learning_rate": 4.636248911905855e-05, + "loss": 0.1827, + "step": 14379 + }, + { + "epoch": 0.25648343024292797, + "grad_norm": 0.29902344942092896, + "learning_rate": 4.636168054650565e-05, + "loss": 0.1939, + "step": 14380 + }, + { + "epoch": 0.25650126636464166, + "grad_norm": 0.30194783210754395, + "learning_rate": 4.636087189114749e-05, + "loss": 0.187, + "step": 14381 + }, + { + "epoch": 0.25651910248635534, + "grad_norm": 0.3023608326911926, + "learning_rate": 4.6360063152987224e-05, + "loss": 0.2081, + "step": 14382 + }, + { + "epoch": 0.2565369386080691, + "grad_norm": 0.17881596088409424, + "learning_rate": 4.6359254332027967e-05, + "loss": 0.176, + "step": 14383 + }, + { + "epoch": 0.2565547747297828, + "grad_norm": 0.2737482190132141, + "learning_rate": 4.6358445428272865e-05, + "loss": 0.2216, + "step": 14384 + }, + { + "epoch": 0.25657261085149646, + "grad_norm": 0.34418344497680664, + "learning_rate": 4.6357636441725056e-05, + "loss": 0.1315, + "step": 14385 + }, + { + "epoch": 0.25659044697321015, + "grad_norm": 0.352762371301651, + "learning_rate": 4.6356827372387664e-05, + "loss": 0.1617, + "step": 14386 + }, + { + "epoch": 0.25660828309492384, + "grad_norm": 0.2598668932914734, + "learning_rate": 4.635601822026384e-05, + "loss": 0.1769, + "step": 14387 + }, + { + "epoch": 0.2566261192166375, + "grad_norm": 0.22755123674869537, + "learning_rate": 4.6355208985356716e-05, + "loss": 0.1632, + "step": 14388 + }, + { + "epoch": 0.2566439553383512, + "grad_norm": 0.24085275828838348, + "learning_rate": 4.635439966766942e-05, + "loss": 0.1666, + "step": 14389 + }, + { + "epoch": 0.2566617914600649, + "grad_norm": 0.41727307438850403, + "learning_rate": 4.63535902672051e-05, + "loss": 0.2247, + "step": 14390 + }, + { + "epoch": 0.2566796275817786, + "grad_norm": 0.2603924572467804, + "learning_rate": 4.635278078396688e-05, + "loss": 0.2157, + "step": 14391 + }, + { + "epoch": 0.25669746370349233, + "grad_norm": 0.24139870703220367, + "learning_rate": 4.6351971217957915e-05, + "loss": 0.177, + "step": 14392 + }, + { + "epoch": 0.256715299825206, + "grad_norm": 0.21795283257961273, + "learning_rate": 4.6351161569181323e-05, + "loss": 0.2322, + "step": 14393 + }, + { + "epoch": 0.2567331359469197, + "grad_norm": 0.29549121856689453, + "learning_rate": 4.635035183764027e-05, + "loss": 0.179, + "step": 14394 + }, + { + "epoch": 0.2567509720686334, + "grad_norm": 0.36327677965164185, + "learning_rate": 4.634954202333787e-05, + "loss": 0.2085, + "step": 14395 + }, + { + "epoch": 0.2567688081903471, + "grad_norm": 0.2972378134727478, + "learning_rate": 4.634873212627727e-05, + "loss": 0.1834, + "step": 14396 + }, + { + "epoch": 0.2567866443120608, + "grad_norm": 0.2354905903339386, + "learning_rate": 4.6347922146461616e-05, + "loss": 0.192, + "step": 14397 + }, + { + "epoch": 0.25680448043377446, + "grad_norm": 0.22473953664302826, + "learning_rate": 4.634711208389404e-05, + "loss": 0.2009, + "step": 14398 + }, + { + "epoch": 0.25682231655548815, + "grad_norm": 0.3361295461654663, + "learning_rate": 4.634630193857768e-05, + "loss": 0.1629, + "step": 14399 + }, + { + "epoch": 0.2568401526772019, + "grad_norm": 0.23022682964801788, + "learning_rate": 4.6345491710515686e-05, + "loss": 0.1571, + "step": 14400 + }, + { + "epoch": 0.2568579887989156, + "grad_norm": 0.21497762203216553, + "learning_rate": 4.63446813997112e-05, + "loss": 0.1869, + "step": 14401 + }, + { + "epoch": 0.25687582492062927, + "grad_norm": 0.331078439950943, + "learning_rate": 4.6343871006167344e-05, + "loss": 0.1123, + "step": 14402 + }, + { + "epoch": 0.25689366104234296, + "grad_norm": 0.307820200920105, + "learning_rate": 4.634306052988728e-05, + "loss": 0.1748, + "step": 14403 + }, + { + "epoch": 0.25691149716405665, + "grad_norm": 0.2890065908432007, + "learning_rate": 4.6342249970874144e-05, + "loss": 0.2082, + "step": 14404 + }, + { + "epoch": 0.25692933328577033, + "grad_norm": 0.37075090408325195, + "learning_rate": 4.634143932913107e-05, + "loss": 0.2356, + "step": 14405 + }, + { + "epoch": 0.256947169407484, + "grad_norm": 0.27535951137542725, + "learning_rate": 4.634062860466121e-05, + "loss": 0.1945, + "step": 14406 + }, + { + "epoch": 0.2569650055291977, + "grad_norm": 0.21282683312892914, + "learning_rate": 4.6339817797467696e-05, + "loss": 0.1408, + "step": 14407 + }, + { + "epoch": 0.25698284165091145, + "grad_norm": 0.30652379989624023, + "learning_rate": 4.633900690755368e-05, + "loss": 0.2446, + "step": 14408 + }, + { + "epoch": 0.25700067777262514, + "grad_norm": 0.2307320386171341, + "learning_rate": 4.63381959349223e-05, + "loss": 0.206, + "step": 14409 + }, + { + "epoch": 0.25701851389433883, + "grad_norm": 0.28244829177856445, + "learning_rate": 4.633738487957671e-05, + "loss": 0.1916, + "step": 14410 + }, + { + "epoch": 0.2570363500160525, + "grad_norm": 0.26034751534461975, + "learning_rate": 4.633657374152005e-05, + "loss": 0.1675, + "step": 14411 + }, + { + "epoch": 0.2570541861377662, + "grad_norm": 0.2954724431037903, + "learning_rate": 4.633576252075546e-05, + "loss": 0.1631, + "step": 14412 + }, + { + "epoch": 0.2570720222594799, + "grad_norm": 0.2681712806224823, + "learning_rate": 4.633495121728607e-05, + "loss": 0.1928, + "step": 14413 + }, + { + "epoch": 0.2570898583811936, + "grad_norm": 0.39349910616874695, + "learning_rate": 4.633413983111505e-05, + "loss": 0.1165, + "step": 14414 + }, + { + "epoch": 0.25710769450290727, + "grad_norm": 0.22883769869804382, + "learning_rate": 4.6333328362245535e-05, + "loss": 0.1484, + "step": 14415 + }, + { + "epoch": 0.25712553062462096, + "grad_norm": 0.3104557991027832, + "learning_rate": 4.633251681068067e-05, + "loss": 0.2302, + "step": 14416 + }, + { + "epoch": 0.2571433667463347, + "grad_norm": 0.34116917848587036, + "learning_rate": 4.633170517642361e-05, + "loss": 0.2161, + "step": 14417 + }, + { + "epoch": 0.2571612028680484, + "grad_norm": 0.23141366243362427, + "learning_rate": 4.633089345947749e-05, + "loss": 0.1492, + "step": 14418 + }, + { + "epoch": 0.2571790389897621, + "grad_norm": 0.2699582576751709, + "learning_rate": 4.633008165984545e-05, + "loss": 0.1906, + "step": 14419 + }, + { + "epoch": 0.25719687511147576, + "grad_norm": 0.2940155565738678, + "learning_rate": 4.632926977753065e-05, + "loss": 0.1785, + "step": 14420 + }, + { + "epoch": 0.25721471123318945, + "grad_norm": 0.21754339337348938, + "learning_rate": 4.632845781253624e-05, + "loss": 0.154, + "step": 14421 + }, + { + "epoch": 0.25723254735490314, + "grad_norm": 0.23799261450767517, + "learning_rate": 4.6327645764865354e-05, + "loss": 0.1213, + "step": 14422 + }, + { + "epoch": 0.25725038347661683, + "grad_norm": 0.3053324818611145, + "learning_rate": 4.632683363452115e-05, + "loss": 0.2201, + "step": 14423 + }, + { + "epoch": 0.2572682195983305, + "grad_norm": 0.36479657888412476, + "learning_rate": 4.632602142150677e-05, + "loss": 0.2298, + "step": 14424 + }, + { + "epoch": 0.25728605572004426, + "grad_norm": 0.33901506662368774, + "learning_rate": 4.632520912582537e-05, + "loss": 0.2352, + "step": 14425 + }, + { + "epoch": 0.25730389184175795, + "grad_norm": 0.35692498087882996, + "learning_rate": 4.632439674748009e-05, + "loss": 0.2259, + "step": 14426 + }, + { + "epoch": 0.25732172796347164, + "grad_norm": 0.29226434230804443, + "learning_rate": 4.6323584286474086e-05, + "loss": 0.1966, + "step": 14427 + }, + { + "epoch": 0.2573395640851853, + "grad_norm": 0.2755231559276581, + "learning_rate": 4.63227717428105e-05, + "loss": 0.1094, + "step": 14428 + }, + { + "epoch": 0.257357400206899, + "grad_norm": 0.3111165165901184, + "learning_rate": 4.632195911649249e-05, + "loss": 0.2024, + "step": 14429 + }, + { + "epoch": 0.2573752363286127, + "grad_norm": 0.3994561731815338, + "learning_rate": 4.6321146407523196e-05, + "loss": 0.19, + "step": 14430 + }, + { + "epoch": 0.2573930724503264, + "grad_norm": 0.33900538086891174, + "learning_rate": 4.6320333615905786e-05, + "loss": 0.1407, + "step": 14431 + }, + { + "epoch": 0.2574109085720401, + "grad_norm": 0.33996281027793884, + "learning_rate": 4.631952074164339e-05, + "loss": 0.2368, + "step": 14432 + }, + { + "epoch": 0.25742874469375376, + "grad_norm": 0.23042534291744232, + "learning_rate": 4.631870778473917e-05, + "loss": 0.1789, + "step": 14433 + }, + { + "epoch": 0.2574465808154675, + "grad_norm": 0.26778653264045715, + "learning_rate": 4.631789474519628e-05, + "loss": 0.2056, + "step": 14434 + }, + { + "epoch": 0.2574644169371812, + "grad_norm": 0.2652350962162018, + "learning_rate": 4.631708162301786e-05, + "loss": 0.1423, + "step": 14435 + }, + { + "epoch": 0.2574822530588949, + "grad_norm": 0.2772522568702698, + "learning_rate": 4.631626841820707e-05, + "loss": 0.1752, + "step": 14436 + }, + { + "epoch": 0.25750008918060857, + "grad_norm": 0.36391860246658325, + "learning_rate": 4.631545513076706e-05, + "loss": 0.2135, + "step": 14437 + }, + { + "epoch": 0.25751792530232226, + "grad_norm": 0.20596438646316528, + "learning_rate": 4.6314641760700995e-05, + "loss": 0.1469, + "step": 14438 + }, + { + "epoch": 0.25753576142403595, + "grad_norm": 0.33062639832496643, + "learning_rate": 4.6313828308012005e-05, + "loss": 0.1792, + "step": 14439 + }, + { + "epoch": 0.25755359754574964, + "grad_norm": 0.48378390073776245, + "learning_rate": 4.631301477270326e-05, + "loss": 0.2645, + "step": 14440 + }, + { + "epoch": 0.2575714336674633, + "grad_norm": 0.3870197534561157, + "learning_rate": 4.63122011547779e-05, + "loss": 0.2003, + "step": 14441 + }, + { + "epoch": 0.25758926978917707, + "grad_norm": 0.2974582612514496, + "learning_rate": 4.63113874542391e-05, + "loss": 0.1891, + "step": 14442 + }, + { + "epoch": 0.25760710591089075, + "grad_norm": 0.2086382806301117, + "learning_rate": 4.631057367109e-05, + "loss": 0.1559, + "step": 14443 + }, + { + "epoch": 0.25762494203260444, + "grad_norm": 0.30199331045150757, + "learning_rate": 4.630975980533374e-05, + "loss": 0.2184, + "step": 14444 + }, + { + "epoch": 0.25764277815431813, + "grad_norm": 0.4113059639930725, + "learning_rate": 4.6308945856973505e-05, + "loss": 0.2554, + "step": 14445 + }, + { + "epoch": 0.2576606142760318, + "grad_norm": 0.2280125916004181, + "learning_rate": 4.630813182601244e-05, + "loss": 0.1617, + "step": 14446 + }, + { + "epoch": 0.2576784503977455, + "grad_norm": 0.2942078709602356, + "learning_rate": 4.6307317712453686e-05, + "loss": 0.2219, + "step": 14447 + }, + { + "epoch": 0.2576962865194592, + "grad_norm": 0.2534199655056, + "learning_rate": 4.630650351630041e-05, + "loss": 0.1639, + "step": 14448 + }, + { + "epoch": 0.2577141226411729, + "grad_norm": 0.30035653710365295, + "learning_rate": 4.630568923755577e-05, + "loss": 0.1793, + "step": 14449 + }, + { + "epoch": 0.25773195876288657, + "grad_norm": 0.2677864730358124, + "learning_rate": 4.630487487622292e-05, + "loss": 0.2001, + "step": 14450 + }, + { + "epoch": 0.2577497948846003, + "grad_norm": 0.24835999310016632, + "learning_rate": 4.6304060432305016e-05, + "loss": 0.1675, + "step": 14451 + }, + { + "epoch": 0.257767631006314, + "grad_norm": 0.2797510623931885, + "learning_rate": 4.630324590580522e-05, + "loss": 0.1328, + "step": 14452 + }, + { + "epoch": 0.2577854671280277, + "grad_norm": 0.28699424862861633, + "learning_rate": 4.6302431296726684e-05, + "loss": 0.1877, + "step": 14453 + }, + { + "epoch": 0.2578033032497414, + "grad_norm": 0.28767064213752747, + "learning_rate": 4.630161660507256e-05, + "loss": 0.1907, + "step": 14454 + }, + { + "epoch": 0.25782113937145507, + "grad_norm": 0.2483808547258377, + "learning_rate": 4.630080183084602e-05, + "loss": 0.1943, + "step": 14455 + }, + { + "epoch": 0.25783897549316875, + "grad_norm": 0.26998236775398254, + "learning_rate": 4.6299986974050216e-05, + "loss": 0.1855, + "step": 14456 + }, + { + "epoch": 0.25785681161488244, + "grad_norm": 0.36403292417526245, + "learning_rate": 4.62991720346883e-05, + "loss": 0.1955, + "step": 14457 + }, + { + "epoch": 0.25787464773659613, + "grad_norm": 0.244973823428154, + "learning_rate": 4.629835701276344e-05, + "loss": 0.1955, + "step": 14458 + }, + { + "epoch": 0.2578924838583099, + "grad_norm": 0.3270580470561981, + "learning_rate": 4.629754190827878e-05, + "loss": 0.2258, + "step": 14459 + }, + { + "epoch": 0.25791031998002356, + "grad_norm": 0.2520008981227875, + "learning_rate": 4.629672672123751e-05, + "loss": 0.1627, + "step": 14460 + }, + { + "epoch": 0.25792815610173725, + "grad_norm": 0.2631997764110565, + "learning_rate": 4.629591145164276e-05, + "loss": 0.1792, + "step": 14461 + }, + { + "epoch": 0.25794599222345094, + "grad_norm": 0.2808324992656708, + "learning_rate": 4.629509609949771e-05, + "loss": 0.2423, + "step": 14462 + }, + { + "epoch": 0.2579638283451646, + "grad_norm": 0.25615525245666504, + "learning_rate": 4.62942806648055e-05, + "loss": 0.2088, + "step": 14463 + }, + { + "epoch": 0.2579816644668783, + "grad_norm": 0.25935521721839905, + "learning_rate": 4.629346514756931e-05, + "loss": 0.1888, + "step": 14464 + }, + { + "epoch": 0.257999500588592, + "grad_norm": 0.2394031137228012, + "learning_rate": 4.62926495477923e-05, + "loss": 0.1637, + "step": 14465 + }, + { + "epoch": 0.2580173367103057, + "grad_norm": 0.2117544412612915, + "learning_rate": 4.629183386547762e-05, + "loss": 0.1785, + "step": 14466 + }, + { + "epoch": 0.25803517283201943, + "grad_norm": 0.2255794256925583, + "learning_rate": 4.629101810062844e-05, + "loss": 0.2073, + "step": 14467 + }, + { + "epoch": 0.2580530089537331, + "grad_norm": 0.23256143927574158, + "learning_rate": 4.6290202253247915e-05, + "loss": 0.1647, + "step": 14468 + }, + { + "epoch": 0.2580708450754468, + "grad_norm": 0.25897982716560364, + "learning_rate": 4.628938632333922e-05, + "loss": 0.1574, + "step": 14469 + }, + { + "epoch": 0.2580886811971605, + "grad_norm": 0.26567527651786804, + "learning_rate": 4.628857031090551e-05, + "loss": 0.1861, + "step": 14470 + }, + { + "epoch": 0.2581065173188742, + "grad_norm": 0.29175615310668945, + "learning_rate": 4.628775421594995e-05, + "loss": 0.216, + "step": 14471 + }, + { + "epoch": 0.2581243534405879, + "grad_norm": 0.24691833555698395, + "learning_rate": 4.628693803847569e-05, + "loss": 0.1951, + "step": 14472 + }, + { + "epoch": 0.25814218956230156, + "grad_norm": 0.2945387661457062, + "learning_rate": 4.628612177848592e-05, + "loss": 0.1745, + "step": 14473 + }, + { + "epoch": 0.25816002568401525, + "grad_norm": 0.21037498116493225, + "learning_rate": 4.6285305435983785e-05, + "loss": 0.1821, + "step": 14474 + }, + { + "epoch": 0.25817786180572894, + "grad_norm": 0.45623844861984253, + "learning_rate": 4.6284489010972455e-05, + "loss": 0.18, + "step": 14475 + }, + { + "epoch": 0.2581956979274427, + "grad_norm": 0.2608366012573242, + "learning_rate": 4.628367250345509e-05, + "loss": 0.1741, + "step": 14476 + }, + { + "epoch": 0.25821353404915637, + "grad_norm": 0.20687773823738098, + "learning_rate": 4.628285591343486e-05, + "loss": 0.1601, + "step": 14477 + }, + { + "epoch": 0.25823137017087006, + "grad_norm": 0.4461628496646881, + "learning_rate": 4.6282039240914935e-05, + "loss": 0.1804, + "step": 14478 + }, + { + "epoch": 0.25824920629258374, + "grad_norm": 0.29956209659576416, + "learning_rate": 4.628122248589847e-05, + "loss": 0.1869, + "step": 14479 + }, + { + "epoch": 0.25826704241429743, + "grad_norm": 0.2962927520275116, + "learning_rate": 4.628040564838864e-05, + "loss": 0.1573, + "step": 14480 + }, + { + "epoch": 0.2582848785360111, + "grad_norm": 0.3870565593242645, + "learning_rate": 4.627958872838861e-05, + "loss": 0.1638, + "step": 14481 + }, + { + "epoch": 0.2583027146577248, + "grad_norm": 0.32134172320365906, + "learning_rate": 4.627877172590154e-05, + "loss": 0.2385, + "step": 14482 + }, + { + "epoch": 0.2583205507794385, + "grad_norm": 0.20753470063209534, + "learning_rate": 4.62779546409306e-05, + "loss": 0.1623, + "step": 14483 + }, + { + "epoch": 0.25833838690115224, + "grad_norm": 0.330765962600708, + "learning_rate": 4.627713747347896e-05, + "loss": 0.2135, + "step": 14484 + }, + { + "epoch": 0.2583562230228659, + "grad_norm": 0.45332831144332886, + "learning_rate": 4.6276320223549793e-05, + "loss": 0.1918, + "step": 14485 + }, + { + "epoch": 0.2583740591445796, + "grad_norm": 0.29866304993629456, + "learning_rate": 4.627550289114625e-05, + "loss": 0.2422, + "step": 14486 + }, + { + "epoch": 0.2583918952662933, + "grad_norm": 0.26704224944114685, + "learning_rate": 4.627468547627152e-05, + "loss": 0.1693, + "step": 14487 + }, + { + "epoch": 0.258409731388007, + "grad_norm": 0.21081265807151794, + "learning_rate": 4.627386797892875e-05, + "loss": 0.2097, + "step": 14488 + }, + { + "epoch": 0.2584275675097207, + "grad_norm": 0.26098814606666565, + "learning_rate": 4.627305039912112e-05, + "loss": 0.1999, + "step": 14489 + }, + { + "epoch": 0.25844540363143437, + "grad_norm": 0.34457314014434814, + "learning_rate": 4.6272232736851804e-05, + "loss": 0.2224, + "step": 14490 + }, + { + "epoch": 0.25846323975314806, + "grad_norm": 0.3143419027328491, + "learning_rate": 4.6271414992123976e-05, + "loss": 0.1551, + "step": 14491 + }, + { + "epoch": 0.25848107587486174, + "grad_norm": 0.27008283138275146, + "learning_rate": 4.6270597164940777e-05, + "loss": 0.1689, + "step": 14492 + }, + { + "epoch": 0.2584989119965755, + "grad_norm": 0.30262690782546997, + "learning_rate": 4.626977925530541e-05, + "loss": 0.1857, + "step": 14493 + }, + { + "epoch": 0.2585167481182892, + "grad_norm": 0.2565249502658844, + "learning_rate": 4.626896126322103e-05, + "loss": 0.209, + "step": 14494 + }, + { + "epoch": 0.25853458424000286, + "grad_norm": 0.25042498111724854, + "learning_rate": 4.626814318869081e-05, + "loss": 0.1541, + "step": 14495 + }, + { + "epoch": 0.25855242036171655, + "grad_norm": 0.2768356502056122, + "learning_rate": 4.6267325031717926e-05, + "loss": 0.2116, + "step": 14496 + }, + { + "epoch": 0.25857025648343024, + "grad_norm": 0.2424658238887787, + "learning_rate": 4.626650679230553e-05, + "loss": 0.1552, + "step": 14497 + }, + { + "epoch": 0.2585880926051439, + "grad_norm": 0.23272937536239624, + "learning_rate": 4.626568847045682e-05, + "loss": 0.2005, + "step": 14498 + }, + { + "epoch": 0.2586059287268576, + "grad_norm": 0.30662715435028076, + "learning_rate": 4.626487006617496e-05, + "loss": 0.16, + "step": 14499 + }, + { + "epoch": 0.2586237648485713, + "grad_norm": 0.276518851518631, + "learning_rate": 4.626405157946311e-05, + "loss": 0.1994, + "step": 14500 + }, + { + "epoch": 0.25864160097028505, + "grad_norm": 0.2522293031215668, + "learning_rate": 4.6263233010324456e-05, + "loss": 0.1682, + "step": 14501 + }, + { + "epoch": 0.25865943709199873, + "grad_norm": 0.2535803020000458, + "learning_rate": 4.6262414358762165e-05, + "loss": 0.1662, + "step": 14502 + }, + { + "epoch": 0.2586772732137124, + "grad_norm": 0.2939068675041199, + "learning_rate": 4.626159562477941e-05, + "loss": 0.2031, + "step": 14503 + }, + { + "epoch": 0.2586951093354261, + "grad_norm": 0.2839019000530243, + "learning_rate": 4.626077680837937e-05, + "loss": 0.1746, + "step": 14504 + }, + { + "epoch": 0.2587129454571398, + "grad_norm": 0.2303832322359085, + "learning_rate": 4.625995790956522e-05, + "loss": 0.1915, + "step": 14505 + }, + { + "epoch": 0.2587307815788535, + "grad_norm": 0.2729906439781189, + "learning_rate": 4.625913892834012e-05, + "loss": 0.1714, + "step": 14506 + }, + { + "epoch": 0.2587486177005672, + "grad_norm": 0.24131199717521667, + "learning_rate": 4.625831986470726e-05, + "loss": 0.183, + "step": 14507 + }, + { + "epoch": 0.25876645382228086, + "grad_norm": 0.372177392244339, + "learning_rate": 4.625750071866981e-05, + "loss": 0.2103, + "step": 14508 + }, + { + "epoch": 0.2587842899439946, + "grad_norm": 0.22164888679981232, + "learning_rate": 4.6256681490230945e-05, + "loss": 0.1672, + "step": 14509 + }, + { + "epoch": 0.2588021260657083, + "grad_norm": 0.3263988196849823, + "learning_rate": 4.625586217939384e-05, + "loss": 0.2026, + "step": 14510 + }, + { + "epoch": 0.258819962187422, + "grad_norm": 0.3316977620124817, + "learning_rate": 4.6255042786161675e-05, + "loss": 0.1813, + "step": 14511 + }, + { + "epoch": 0.25883779830913567, + "grad_norm": 0.3202001750469208, + "learning_rate": 4.625422331053762e-05, + "loss": 0.1789, + "step": 14512 + }, + { + "epoch": 0.25885563443084936, + "grad_norm": 0.23884595930576324, + "learning_rate": 4.6253403752524855e-05, + "loss": 0.1991, + "step": 14513 + }, + { + "epoch": 0.25887347055256305, + "grad_norm": 0.26711705327033997, + "learning_rate": 4.625258411212656e-05, + "loss": 0.1802, + "step": 14514 + }, + { + "epoch": 0.25889130667427673, + "grad_norm": 0.23574720323085785, + "learning_rate": 4.62517643893459e-05, + "loss": 0.1384, + "step": 14515 + }, + { + "epoch": 0.2589091427959904, + "grad_norm": 0.33867430686950684, + "learning_rate": 4.625094458418607e-05, + "loss": 0.1882, + "step": 14516 + }, + { + "epoch": 0.2589269789177041, + "grad_norm": 0.29728835821151733, + "learning_rate": 4.6250124696650235e-05, + "loss": 0.2051, + "step": 14517 + }, + { + "epoch": 0.25894481503941785, + "grad_norm": 0.24823889136314392, + "learning_rate": 4.624930472674158e-05, + "loss": 0.2024, + "step": 14518 + }, + { + "epoch": 0.25896265116113154, + "grad_norm": 0.3470822274684906, + "learning_rate": 4.624848467446328e-05, + "loss": 0.1622, + "step": 14519 + }, + { + "epoch": 0.25898048728284523, + "grad_norm": 0.20998916029930115, + "learning_rate": 4.6247664539818504e-05, + "loss": 0.1778, + "step": 14520 + }, + { + "epoch": 0.2589983234045589, + "grad_norm": 0.2235182821750641, + "learning_rate": 4.6246844322810456e-05, + "loss": 0.217, + "step": 14521 + }, + { + "epoch": 0.2590161595262726, + "grad_norm": 0.23888911306858063, + "learning_rate": 4.624602402344229e-05, + "loss": 0.1934, + "step": 14522 + }, + { + "epoch": 0.2590339956479863, + "grad_norm": 0.2445191740989685, + "learning_rate": 4.6245203641717206e-05, + "loss": 0.2209, + "step": 14523 + }, + { + "epoch": 0.2590518317697, + "grad_norm": 0.2210816591978073, + "learning_rate": 4.624438317763837e-05, + "loss": 0.1818, + "step": 14524 + }, + { + "epoch": 0.25906966789141367, + "grad_norm": 0.2667429745197296, + "learning_rate": 4.624356263120897e-05, + "loss": 0.1709, + "step": 14525 + }, + { + "epoch": 0.2590875040131274, + "grad_norm": 0.2377333641052246, + "learning_rate": 4.6242742002432176e-05, + "loss": 0.2153, + "step": 14526 + }, + { + "epoch": 0.2591053401348411, + "grad_norm": 0.2568122446537018, + "learning_rate": 4.6241921291311184e-05, + "loss": 0.1403, + "step": 14527 + }, + { + "epoch": 0.2591231762565548, + "grad_norm": 0.35701045393943787, + "learning_rate": 4.6241100497849165e-05, + "loss": 0.2133, + "step": 14528 + }, + { + "epoch": 0.2591410123782685, + "grad_norm": 0.3056379556655884, + "learning_rate": 4.624027962204931e-05, + "loss": 0.1688, + "step": 14529 + }, + { + "epoch": 0.25915884849998216, + "grad_norm": 0.29644814133644104, + "learning_rate": 4.623945866391479e-05, + "loss": 0.2025, + "step": 14530 + }, + { + "epoch": 0.25917668462169585, + "grad_norm": 0.3030856251716614, + "learning_rate": 4.623863762344879e-05, + "loss": 0.1644, + "step": 14531 + }, + { + "epoch": 0.25919452074340954, + "grad_norm": 0.4780627191066742, + "learning_rate": 4.6237816500654494e-05, + "loss": 0.2265, + "step": 14532 + }, + { + "epoch": 0.25921235686512323, + "grad_norm": 0.23053552210330963, + "learning_rate": 4.62369952955351e-05, + "loss": 0.1829, + "step": 14533 + }, + { + "epoch": 0.2592301929868369, + "grad_norm": 0.22041228413581848, + "learning_rate": 4.6236174008093764e-05, + "loss": 0.1929, + "step": 14534 + }, + { + "epoch": 0.25924802910855066, + "grad_norm": 0.23523174226284027, + "learning_rate": 4.623535263833368e-05, + "loss": 0.1761, + "step": 14535 + }, + { + "epoch": 0.25926586523026435, + "grad_norm": 0.26908382773399353, + "learning_rate": 4.623453118625804e-05, + "loss": 0.1995, + "step": 14536 + }, + { + "epoch": 0.25928370135197804, + "grad_norm": 0.2717522978782654, + "learning_rate": 4.6233709651870026e-05, + "loss": 0.1488, + "step": 14537 + }, + { + "epoch": 0.2593015374736917, + "grad_norm": 0.2552168071269989, + "learning_rate": 4.623288803517282e-05, + "loss": 0.2113, + "step": 14538 + }, + { + "epoch": 0.2593193735954054, + "grad_norm": 0.32829946279525757, + "learning_rate": 4.6232066336169604e-05, + "loss": 0.1177, + "step": 14539 + }, + { + "epoch": 0.2593372097171191, + "grad_norm": 0.2889467477798462, + "learning_rate": 4.623124455486357e-05, + "loss": 0.2252, + "step": 14540 + }, + { + "epoch": 0.2593550458388328, + "grad_norm": 0.22341616451740265, + "learning_rate": 4.6230422691257893e-05, + "loss": 0.1672, + "step": 14541 + }, + { + "epoch": 0.2593728819605465, + "grad_norm": 0.30590370297431946, + "learning_rate": 4.622960074535576e-05, + "loss": 0.2161, + "step": 14542 + }, + { + "epoch": 0.2593907180822602, + "grad_norm": 0.15577496588230133, + "learning_rate": 4.622877871716037e-05, + "loss": 0.1424, + "step": 14543 + }, + { + "epoch": 0.2594085542039739, + "grad_norm": 0.2397838532924652, + "learning_rate": 4.6227956606674905e-05, + "loss": 0.2068, + "step": 14544 + }, + { + "epoch": 0.2594263903256876, + "grad_norm": 0.3154611587524414, + "learning_rate": 4.622713441390254e-05, + "loss": 0.2012, + "step": 14545 + }, + { + "epoch": 0.2594442264474013, + "grad_norm": 0.20436833798885345, + "learning_rate": 4.6226312138846475e-05, + "loss": 0.1531, + "step": 14546 + }, + { + "epoch": 0.25946206256911497, + "grad_norm": 0.3715524673461914, + "learning_rate": 4.622548978150989e-05, + "loss": 0.1785, + "step": 14547 + }, + { + "epoch": 0.25947989869082866, + "grad_norm": 0.2255830615758896, + "learning_rate": 4.622466734189598e-05, + "loss": 0.1252, + "step": 14548 + }, + { + "epoch": 0.25949773481254235, + "grad_norm": 0.28741219639778137, + "learning_rate": 4.6223844820007924e-05, + "loss": 0.1994, + "step": 14549 + }, + { + "epoch": 0.25951557093425603, + "grad_norm": 0.254725843667984, + "learning_rate": 4.622302221584891e-05, + "loss": 0.1266, + "step": 14550 + }, + { + "epoch": 0.2595334070559697, + "grad_norm": 0.22727371752262115, + "learning_rate": 4.6222199529422145e-05, + "loss": 0.1739, + "step": 14551 + }, + { + "epoch": 0.25955124317768347, + "grad_norm": 0.18915487825870514, + "learning_rate": 4.622137676073079e-05, + "loss": 0.1363, + "step": 14552 + }, + { + "epoch": 0.25956907929939715, + "grad_norm": 0.21335235238075256, + "learning_rate": 4.6220553909778065e-05, + "loss": 0.1738, + "step": 14553 + }, + { + "epoch": 0.25958691542111084, + "grad_norm": 0.28781911730766296, + "learning_rate": 4.621973097656713e-05, + "loss": 0.1828, + "step": 14554 + }, + { + "epoch": 0.25960475154282453, + "grad_norm": 0.2517540156841278, + "learning_rate": 4.62189079611012e-05, + "loss": 0.186, + "step": 14555 + }, + { + "epoch": 0.2596225876645382, + "grad_norm": 0.265123188495636, + "learning_rate": 4.621808486338345e-05, + "loss": 0.1472, + "step": 14556 + }, + { + "epoch": 0.2596404237862519, + "grad_norm": 0.3527012765407562, + "learning_rate": 4.621726168341707e-05, + "loss": 0.2345, + "step": 14557 + }, + { + "epoch": 0.2596582599079656, + "grad_norm": 0.2715505361557007, + "learning_rate": 4.621643842120526e-05, + "loss": 0.2043, + "step": 14558 + }, + { + "epoch": 0.2596760960296793, + "grad_norm": 0.30191344022750854, + "learning_rate": 4.6215615076751207e-05, + "loss": 0.1913, + "step": 14559 + }, + { + "epoch": 0.259693932151393, + "grad_norm": 0.2725018560886383, + "learning_rate": 4.62147916500581e-05, + "loss": 0.1931, + "step": 14560 + }, + { + "epoch": 0.2597117682731067, + "grad_norm": 0.2341236174106598, + "learning_rate": 4.6213968141129134e-05, + "loss": 0.1782, + "step": 14561 + }, + { + "epoch": 0.2597296043948204, + "grad_norm": 0.24975833296775818, + "learning_rate": 4.6213144549967495e-05, + "loss": 0.1879, + "step": 14562 + }, + { + "epoch": 0.2597474405165341, + "grad_norm": 0.24866192042827606, + "learning_rate": 4.6212320876576385e-05, + "loss": 0.1764, + "step": 14563 + }, + { + "epoch": 0.2597652766382478, + "grad_norm": 0.338969886302948, + "learning_rate": 4.6211497120958996e-05, + "loss": 0.2346, + "step": 14564 + }, + { + "epoch": 0.25978311275996147, + "grad_norm": 0.2714281678199768, + "learning_rate": 4.6210673283118514e-05, + "loss": 0.1867, + "step": 14565 + }, + { + "epoch": 0.25980094888167515, + "grad_norm": 0.2358761727809906, + "learning_rate": 4.620984936305814e-05, + "loss": 0.2094, + "step": 14566 + }, + { + "epoch": 0.25981878500338884, + "grad_norm": 0.2552592158317566, + "learning_rate": 4.6209025360781066e-05, + "loss": 0.2277, + "step": 14567 + }, + { + "epoch": 0.2598366211251026, + "grad_norm": 0.30145397782325745, + "learning_rate": 4.620820127629048e-05, + "loss": 0.1831, + "step": 14568 + }, + { + "epoch": 0.2598544572468163, + "grad_norm": 0.23235675692558289, + "learning_rate": 4.620737710958958e-05, + "loss": 0.182, + "step": 14569 + }, + { + "epoch": 0.25987229336852996, + "grad_norm": 0.5347310900688171, + "learning_rate": 4.620655286068156e-05, + "loss": 0.2004, + "step": 14570 + }, + { + "epoch": 0.25989012949024365, + "grad_norm": 0.4997529685497284, + "learning_rate": 4.620572852956963e-05, + "loss": 0.1591, + "step": 14571 + }, + { + "epoch": 0.25990796561195734, + "grad_norm": 0.23414045572280884, + "learning_rate": 4.620490411625695e-05, + "loss": 0.1612, + "step": 14572 + }, + { + "epoch": 0.259925801733671, + "grad_norm": 0.1868872344493866, + "learning_rate": 4.620407962074676e-05, + "loss": 0.1527, + "step": 14573 + }, + { + "epoch": 0.2599436378553847, + "grad_norm": 0.391832560300827, + "learning_rate": 4.620325504304221e-05, + "loss": 0.2218, + "step": 14574 + }, + { + "epoch": 0.2599614739770984, + "grad_norm": 0.2683939039707184, + "learning_rate": 4.620243038314654e-05, + "loss": 0.1845, + "step": 14575 + }, + { + "epoch": 0.2599793100988121, + "grad_norm": 0.25123825669288635, + "learning_rate": 4.6201605641062915e-05, + "loss": 0.1849, + "step": 14576 + }, + { + "epoch": 0.25999714622052583, + "grad_norm": 0.3053325116634369, + "learning_rate": 4.6200780816794554e-05, + "loss": 0.1912, + "step": 14577 + }, + { + "epoch": 0.2600149823422395, + "grad_norm": 0.25136125087738037, + "learning_rate": 4.619995591034464e-05, + "loss": 0.1653, + "step": 14578 + }, + { + "epoch": 0.2600328184639532, + "grad_norm": 0.27196529507637024, + "learning_rate": 4.619913092171637e-05, + "loss": 0.2125, + "step": 14579 + }, + { + "epoch": 0.2600506545856669, + "grad_norm": 0.27855661511421204, + "learning_rate": 4.619830585091295e-05, + "loss": 0.1745, + "step": 14580 + }, + { + "epoch": 0.2600684907073806, + "grad_norm": 0.26694926619529724, + "learning_rate": 4.6197480697937576e-05, + "loss": 0.2441, + "step": 14581 + }, + { + "epoch": 0.26008632682909427, + "grad_norm": 0.23903785645961761, + "learning_rate": 4.619665546279345e-05, + "loss": 0.2099, + "step": 14582 + }, + { + "epoch": 0.26010416295080796, + "grad_norm": 0.2713642120361328, + "learning_rate": 4.6195830145483754e-05, + "loss": 0.1685, + "step": 14583 + }, + { + "epoch": 0.26012199907252165, + "grad_norm": 0.27828171849250793, + "learning_rate": 4.61950047460117e-05, + "loss": 0.2328, + "step": 14584 + }, + { + "epoch": 0.2601398351942354, + "grad_norm": 0.24836130440235138, + "learning_rate": 4.61941792643805e-05, + "loss": 0.1673, + "step": 14585 + }, + { + "epoch": 0.2601576713159491, + "grad_norm": 0.2757941484451294, + "learning_rate": 4.619335370059333e-05, + "loss": 0.1338, + "step": 14586 + }, + { + "epoch": 0.26017550743766277, + "grad_norm": 0.2903648912906647, + "learning_rate": 4.619252805465341e-05, + "loss": 0.1709, + "step": 14587 + }, + { + "epoch": 0.26019334355937646, + "grad_norm": 0.223250150680542, + "learning_rate": 4.619170232656393e-05, + "loss": 0.1934, + "step": 14588 + }, + { + "epoch": 0.26021117968109014, + "grad_norm": 0.19557170569896698, + "learning_rate": 4.619087651632808e-05, + "loss": 0.1606, + "step": 14589 + }, + { + "epoch": 0.26022901580280383, + "grad_norm": 0.23488689959049225, + "learning_rate": 4.619005062394909e-05, + "loss": 0.1489, + "step": 14590 + }, + { + "epoch": 0.2602468519245175, + "grad_norm": 0.30712801218032837, + "learning_rate": 4.618922464943014e-05, + "loss": 0.1975, + "step": 14591 + }, + { + "epoch": 0.2602646880462312, + "grad_norm": 0.4634312689304352, + "learning_rate": 4.618839859277443e-05, + "loss": 0.2035, + "step": 14592 + }, + { + "epoch": 0.2602825241679449, + "grad_norm": 0.3740062713623047, + "learning_rate": 4.618757245398517e-05, + "loss": 0.2023, + "step": 14593 + }, + { + "epoch": 0.26030036028965864, + "grad_norm": 0.22786946594715118, + "learning_rate": 4.618674623306557e-05, + "loss": 0.1703, + "step": 14594 + }, + { + "epoch": 0.2603181964113723, + "grad_norm": 0.2764323055744171, + "learning_rate": 4.618591993001882e-05, + "loss": 0.2282, + "step": 14595 + }, + { + "epoch": 0.260336032533086, + "grad_norm": 0.2611202299594879, + "learning_rate": 4.618509354484812e-05, + "loss": 0.1974, + "step": 14596 + }, + { + "epoch": 0.2603538686547997, + "grad_norm": 0.25055521726608276, + "learning_rate": 4.618426707755669e-05, + "loss": 0.1967, + "step": 14597 + }, + { + "epoch": 0.2603717047765134, + "grad_norm": 0.23774002492427826, + "learning_rate": 4.618344052814772e-05, + "loss": 0.2276, + "step": 14598 + }, + { + "epoch": 0.2603895408982271, + "grad_norm": 0.3096005916595459, + "learning_rate": 4.618261389662442e-05, + "loss": 0.2224, + "step": 14599 + }, + { + "epoch": 0.26040737701994077, + "grad_norm": 0.2499011605978012, + "learning_rate": 4.6181787182989986e-05, + "loss": 0.2315, + "step": 14600 + }, + { + "epoch": 0.26042521314165445, + "grad_norm": 0.3614709675312042, + "learning_rate": 4.6180960387247635e-05, + "loss": 0.1578, + "step": 14601 + }, + { + "epoch": 0.2604430492633682, + "grad_norm": 0.3674885034561157, + "learning_rate": 4.6180133509400565e-05, + "loss": 0.2514, + "step": 14602 + }, + { + "epoch": 0.2604608853850819, + "grad_norm": 0.26196131110191345, + "learning_rate": 4.617930654945199e-05, + "loss": 0.1972, + "step": 14603 + }, + { + "epoch": 0.2604787215067956, + "grad_norm": 0.20775820314884186, + "learning_rate": 4.6178479507405086e-05, + "loss": 0.1799, + "step": 14604 + }, + { + "epoch": 0.26049655762850926, + "grad_norm": 0.20737087726593018, + "learning_rate": 4.6177652383263095e-05, + "loss": 0.1558, + "step": 14605 + }, + { + "epoch": 0.26051439375022295, + "grad_norm": 0.2366851568222046, + "learning_rate": 4.617682517702921e-05, + "loss": 0.1748, + "step": 14606 + }, + { + "epoch": 0.26053222987193664, + "grad_norm": 0.37857529520988464, + "learning_rate": 4.6175997888706634e-05, + "loss": 0.1785, + "step": 14607 + }, + { + "epoch": 0.2605500659936503, + "grad_norm": 0.31405702233314514, + "learning_rate": 4.617517051829857e-05, + "loss": 0.1392, + "step": 14608 + }, + { + "epoch": 0.260567902115364, + "grad_norm": 0.26305896043777466, + "learning_rate": 4.6174343065808247e-05, + "loss": 0.1513, + "step": 14609 + }, + { + "epoch": 0.26058573823707776, + "grad_norm": 0.24315159022808075, + "learning_rate": 4.617351553123884e-05, + "loss": 0.1626, + "step": 14610 + }, + { + "epoch": 0.26060357435879145, + "grad_norm": 0.27016130089759827, + "learning_rate": 4.617268791459358e-05, + "loss": 0.1375, + "step": 14611 + }, + { + "epoch": 0.26062141048050513, + "grad_norm": 0.2838382422924042, + "learning_rate": 4.617186021587567e-05, + "loss": 0.1815, + "step": 14612 + }, + { + "epoch": 0.2606392466022188, + "grad_norm": 0.30013149976730347, + "learning_rate": 4.6171032435088316e-05, + "loss": 0.1915, + "step": 14613 + }, + { + "epoch": 0.2606570827239325, + "grad_norm": 0.22592492401599884, + "learning_rate": 4.617020457223473e-05, + "loss": 0.181, + "step": 14614 + }, + { + "epoch": 0.2606749188456462, + "grad_norm": 0.4308866262435913, + "learning_rate": 4.6169376627318116e-05, + "loss": 0.1961, + "step": 14615 + }, + { + "epoch": 0.2606927549673599, + "grad_norm": 0.3728789985179901, + "learning_rate": 4.616854860034169e-05, + "loss": 0.1659, + "step": 14616 + }, + { + "epoch": 0.2607105910890736, + "grad_norm": 0.24493689835071564, + "learning_rate": 4.616772049130866e-05, + "loss": 0.2097, + "step": 14617 + }, + { + "epoch": 0.26072842721078726, + "grad_norm": 0.31610414385795593, + "learning_rate": 4.6166892300222234e-05, + "loss": 0.2272, + "step": 14618 + }, + { + "epoch": 0.260746263332501, + "grad_norm": 0.31300243735313416, + "learning_rate": 4.616606402708561e-05, + "loss": 0.2174, + "step": 14619 + }, + { + "epoch": 0.2607640994542147, + "grad_norm": 0.25682532787323, + "learning_rate": 4.616523567190203e-05, + "loss": 0.1827, + "step": 14620 + }, + { + "epoch": 0.2607819355759284, + "grad_norm": 0.26203516125679016, + "learning_rate": 4.616440723467468e-05, + "loss": 0.2041, + "step": 14621 + }, + { + "epoch": 0.26079977169764207, + "grad_norm": 0.17587150633335114, + "learning_rate": 4.616357871540677e-05, + "loss": 0.1749, + "step": 14622 + }, + { + "epoch": 0.26081760781935576, + "grad_norm": 0.2796902358531952, + "learning_rate": 4.6162750114101526e-05, + "loss": 0.1966, + "step": 14623 + }, + { + "epoch": 0.26083544394106944, + "grad_norm": 0.2103869915008545, + "learning_rate": 4.616192143076214e-05, + "loss": 0.1573, + "step": 14624 + }, + { + "epoch": 0.26085328006278313, + "grad_norm": 0.24828559160232544, + "learning_rate": 4.616109266539186e-05, + "loss": 0.1716, + "step": 14625 + }, + { + "epoch": 0.2608711161844968, + "grad_norm": 0.28482159972190857, + "learning_rate": 4.6160263817993864e-05, + "loss": 0.1845, + "step": 14626 + }, + { + "epoch": 0.26088895230621056, + "grad_norm": 0.20587415993213654, + "learning_rate": 4.615943488857137e-05, + "loss": 0.1297, + "step": 14627 + }, + { + "epoch": 0.26090678842792425, + "grad_norm": 0.24430646002292633, + "learning_rate": 4.615860587712762e-05, + "loss": 0.1856, + "step": 14628 + }, + { + "epoch": 0.26092462454963794, + "grad_norm": 0.24143333733081818, + "learning_rate": 4.6157776783665784e-05, + "loss": 0.2147, + "step": 14629 + }, + { + "epoch": 0.26094246067135163, + "grad_norm": 0.2912479639053345, + "learning_rate": 4.6156947608189104e-05, + "loss": 0.198, + "step": 14630 + }, + { + "epoch": 0.2609602967930653, + "grad_norm": 0.2395295649766922, + "learning_rate": 4.615611835070079e-05, + "loss": 0.1227, + "step": 14631 + }, + { + "epoch": 0.260978132914779, + "grad_norm": 0.29983842372894287, + "learning_rate": 4.615528901120405e-05, + "loss": 0.1272, + "step": 14632 + }, + { + "epoch": 0.2609959690364927, + "grad_norm": 0.30038511753082275, + "learning_rate": 4.615445958970211e-05, + "loss": 0.1831, + "step": 14633 + }, + { + "epoch": 0.2610138051582064, + "grad_norm": 0.27793991565704346, + "learning_rate": 4.615363008619817e-05, + "loss": 0.1801, + "step": 14634 + }, + { + "epoch": 0.26103164127992007, + "grad_norm": 0.3724186420440674, + "learning_rate": 4.6152800500695456e-05, + "loss": 0.184, + "step": 14635 + }, + { + "epoch": 0.2610494774016338, + "grad_norm": 0.2901003658771515, + "learning_rate": 4.615197083319719e-05, + "loss": 0.2038, + "step": 14636 + }, + { + "epoch": 0.2610673135233475, + "grad_norm": 0.39684247970581055, + "learning_rate": 4.615114108370657e-05, + "loss": 0.1443, + "step": 14637 + }, + { + "epoch": 0.2610851496450612, + "grad_norm": 0.26540371775627136, + "learning_rate": 4.615031125222682e-05, + "loss": 0.1815, + "step": 14638 + }, + { + "epoch": 0.2611029857667749, + "grad_norm": 0.24698083102703094, + "learning_rate": 4.614948133876117e-05, + "loss": 0.1761, + "step": 14639 + }, + { + "epoch": 0.26112082188848856, + "grad_norm": 0.3243093192577362, + "learning_rate": 4.6148651343312815e-05, + "loss": 0.1703, + "step": 14640 + }, + { + "epoch": 0.26113865801020225, + "grad_norm": 0.2990565598011017, + "learning_rate": 4.614782126588498e-05, + "loss": 0.1899, + "step": 14641 + }, + { + "epoch": 0.26115649413191594, + "grad_norm": 0.2859545946121216, + "learning_rate": 4.61469911064809e-05, + "loss": 0.1969, + "step": 14642 + }, + { + "epoch": 0.2611743302536296, + "grad_norm": 0.30889734625816345, + "learning_rate": 4.614616086510377e-05, + "loss": 0.2243, + "step": 14643 + }, + { + "epoch": 0.26119216637534337, + "grad_norm": 0.26461878418922424, + "learning_rate": 4.6145330541756814e-05, + "loss": 0.2149, + "step": 14644 + }, + { + "epoch": 0.26121000249705706, + "grad_norm": 0.2526128590106964, + "learning_rate": 4.614450013644326e-05, + "loss": 0.173, + "step": 14645 + }, + { + "epoch": 0.26122783861877075, + "grad_norm": 0.30586960911750793, + "learning_rate": 4.614366964916631e-05, + "loss": 0.2102, + "step": 14646 + }, + { + "epoch": 0.26124567474048443, + "grad_norm": 0.3145003020763397, + "learning_rate": 4.614283907992921e-05, + "loss": 0.113, + "step": 14647 + }, + { + "epoch": 0.2612635108621981, + "grad_norm": 0.4077993631362915, + "learning_rate": 4.6142008428735154e-05, + "loss": 0.2165, + "step": 14648 + }, + { + "epoch": 0.2612813469839118, + "grad_norm": 0.2266421616077423, + "learning_rate": 4.614117769558737e-05, + "loss": 0.1591, + "step": 14649 + }, + { + "epoch": 0.2612991831056255, + "grad_norm": 0.34424862265586853, + "learning_rate": 4.614034688048908e-05, + "loss": 0.2193, + "step": 14650 + }, + { + "epoch": 0.2613170192273392, + "grad_norm": 0.4223160147666931, + "learning_rate": 4.6139515983443506e-05, + "loss": 0.2259, + "step": 14651 + }, + { + "epoch": 0.2613348553490529, + "grad_norm": 0.2760487198829651, + "learning_rate": 4.613868500445386e-05, + "loss": 0.1846, + "step": 14652 + }, + { + "epoch": 0.2613526914707666, + "grad_norm": 0.2091551423072815, + "learning_rate": 4.613785394352337e-05, + "loss": 0.1528, + "step": 14653 + }, + { + "epoch": 0.2613705275924803, + "grad_norm": 0.27764102816581726, + "learning_rate": 4.613702280065527e-05, + "loss": 0.2424, + "step": 14654 + }, + { + "epoch": 0.261388363714194, + "grad_norm": 0.22962181270122528, + "learning_rate": 4.613619157585276e-05, + "loss": 0.168, + "step": 14655 + }, + { + "epoch": 0.2614061998359077, + "grad_norm": 0.2940472662448883, + "learning_rate": 4.613536026911907e-05, + "loss": 0.2298, + "step": 14656 + }, + { + "epoch": 0.26142403595762137, + "grad_norm": 0.23704254627227783, + "learning_rate": 4.613452888045743e-05, + "loss": 0.148, + "step": 14657 + }, + { + "epoch": 0.26144187207933506, + "grad_norm": 0.28653737902641296, + "learning_rate": 4.6133697409871044e-05, + "loss": 0.1515, + "step": 14658 + }, + { + "epoch": 0.26145970820104875, + "grad_norm": 0.2383522242307663, + "learning_rate": 4.613286585736316e-05, + "loss": 0.1776, + "step": 14659 + }, + { + "epoch": 0.26147754432276243, + "grad_norm": 0.3766253590583801, + "learning_rate": 4.613203422293698e-05, + "loss": 0.1528, + "step": 14660 + }, + { + "epoch": 0.2614953804444762, + "grad_norm": 0.31416067481040955, + "learning_rate": 4.613120250659575e-05, + "loss": 0.1836, + "step": 14661 + }, + { + "epoch": 0.26151321656618987, + "grad_norm": 0.3291376829147339, + "learning_rate": 4.6130370708342665e-05, + "loss": 0.135, + "step": 14662 + }, + { + "epoch": 0.26153105268790355, + "grad_norm": 0.3606354892253876, + "learning_rate": 4.612953882818097e-05, + "loss": 0.177, + "step": 14663 + }, + { + "epoch": 0.26154888880961724, + "grad_norm": 0.36071377992630005, + "learning_rate": 4.612870686611389e-05, + "loss": 0.1847, + "step": 14664 + }, + { + "epoch": 0.26156672493133093, + "grad_norm": 0.3293820321559906, + "learning_rate": 4.6127874822144644e-05, + "loss": 0.1932, + "step": 14665 + }, + { + "epoch": 0.2615845610530446, + "grad_norm": 0.2977519929409027, + "learning_rate": 4.6127042696276453e-05, + "loss": 0.199, + "step": 14666 + }, + { + "epoch": 0.2616023971747583, + "grad_norm": 0.2657562792301178, + "learning_rate": 4.612621048851255e-05, + "loss": 0.1939, + "step": 14667 + }, + { + "epoch": 0.261620233296472, + "grad_norm": 0.19356130063533783, + "learning_rate": 4.6125378198856153e-05, + "loss": 0.1556, + "step": 14668 + }, + { + "epoch": 0.26163806941818574, + "grad_norm": 0.2492857277393341, + "learning_rate": 4.612454582731051e-05, + "loss": 0.1617, + "step": 14669 + }, + { + "epoch": 0.2616559055398994, + "grad_norm": 0.25919419527053833, + "learning_rate": 4.612371337387881e-05, + "loss": 0.1589, + "step": 14670 + }, + { + "epoch": 0.2616737416616131, + "grad_norm": 0.2730581760406494, + "learning_rate": 4.612288083856431e-05, + "loss": 0.2175, + "step": 14671 + }, + { + "epoch": 0.2616915777833268, + "grad_norm": 0.3098990023136139, + "learning_rate": 4.6122048221370226e-05, + "loss": 0.1714, + "step": 14672 + }, + { + "epoch": 0.2617094139050405, + "grad_norm": 0.3315688669681549, + "learning_rate": 4.6121215522299796e-05, + "loss": 0.1677, + "step": 14673 + }, + { + "epoch": 0.2617272500267542, + "grad_norm": 0.22881069779396057, + "learning_rate": 4.612038274135624e-05, + "loss": 0.1676, + "step": 14674 + }, + { + "epoch": 0.26174508614846786, + "grad_norm": 0.2312840223312378, + "learning_rate": 4.611954987854278e-05, + "loss": 0.1618, + "step": 14675 + }, + { + "epoch": 0.26176292227018155, + "grad_norm": 0.407091349363327, + "learning_rate": 4.611871693386264e-05, + "loss": 0.2257, + "step": 14676 + }, + { + "epoch": 0.26178075839189524, + "grad_norm": 0.317665159702301, + "learning_rate": 4.611788390731907e-05, + "loss": 0.1842, + "step": 14677 + }, + { + "epoch": 0.261798594513609, + "grad_norm": 0.2729001045227051, + "learning_rate": 4.61170507989153e-05, + "loss": 0.2042, + "step": 14678 + }, + { + "epoch": 0.26181643063532267, + "grad_norm": 0.26450785994529724, + "learning_rate": 4.611621760865453e-05, + "loss": 0.2182, + "step": 14679 + }, + { + "epoch": 0.26183426675703636, + "grad_norm": 0.24056072533130646, + "learning_rate": 4.6115384336540005e-05, + "loss": 0.1514, + "step": 14680 + }, + { + "epoch": 0.26185210287875005, + "grad_norm": 0.35893580317497253, + "learning_rate": 4.611455098257497e-05, + "loss": 0.2196, + "step": 14681 + }, + { + "epoch": 0.26186993900046374, + "grad_norm": 0.23031428456306458, + "learning_rate": 4.611371754676264e-05, + "loss": 0.1942, + "step": 14682 + }, + { + "epoch": 0.2618877751221774, + "grad_norm": 0.17973680794239044, + "learning_rate": 4.611288402910624e-05, + "loss": 0.1529, + "step": 14683 + }, + { + "epoch": 0.2619056112438911, + "grad_norm": 0.25546473264694214, + "learning_rate": 4.611205042960901e-05, + "loss": 0.1887, + "step": 14684 + }, + { + "epoch": 0.2619234473656048, + "grad_norm": 0.3437134325504303, + "learning_rate": 4.611121674827419e-05, + "loss": 0.2135, + "step": 14685 + }, + { + "epoch": 0.26194128348731854, + "grad_norm": 0.2834162414073944, + "learning_rate": 4.611038298510499e-05, + "loss": 0.1867, + "step": 14686 + }, + { + "epoch": 0.26195911960903223, + "grad_norm": 0.25561559200286865, + "learning_rate": 4.610954914010467e-05, + "loss": 0.174, + "step": 14687 + }, + { + "epoch": 0.2619769557307459, + "grad_norm": 0.2823486924171448, + "learning_rate": 4.610871521327644e-05, + "loss": 0.2226, + "step": 14688 + }, + { + "epoch": 0.2619947918524596, + "grad_norm": 0.3704608976840973, + "learning_rate": 4.610788120462354e-05, + "loss": 0.2391, + "step": 14689 + }, + { + "epoch": 0.2620126279741733, + "grad_norm": 0.39324644207954407, + "learning_rate": 4.6107047114149195e-05, + "loss": 0.174, + "step": 14690 + }, + { + "epoch": 0.262030464095887, + "grad_norm": 0.23658964037895203, + "learning_rate": 4.6106212941856655e-05, + "loss": 0.1857, + "step": 14691 + }, + { + "epoch": 0.26204830021760067, + "grad_norm": 0.25668588280677795, + "learning_rate": 4.6105378687749134e-05, + "loss": 0.1974, + "step": 14692 + }, + { + "epoch": 0.26206613633931436, + "grad_norm": 0.23021358251571655, + "learning_rate": 4.610454435182988e-05, + "loss": 0.2043, + "step": 14693 + }, + { + "epoch": 0.26208397246102805, + "grad_norm": 0.3021219074726105, + "learning_rate": 4.610370993410212e-05, + "loss": 0.1289, + "step": 14694 + }, + { + "epoch": 0.2621018085827418, + "grad_norm": 0.3289857506752014, + "learning_rate": 4.6102875434569095e-05, + "loss": 0.2288, + "step": 14695 + }, + { + "epoch": 0.2621196447044555, + "grad_norm": 0.2981726825237274, + "learning_rate": 4.610204085323404e-05, + "loss": 0.1573, + "step": 14696 + }, + { + "epoch": 0.26213748082616917, + "grad_norm": 0.23678918182849884, + "learning_rate": 4.6101206190100175e-05, + "loss": 0.1646, + "step": 14697 + }, + { + "epoch": 0.26215531694788285, + "grad_norm": 0.31751492619514465, + "learning_rate": 4.610037144517076e-05, + "loss": 0.1601, + "step": 14698 + }, + { + "epoch": 0.26217315306959654, + "grad_norm": 0.27791768312454224, + "learning_rate": 4.609953661844901e-05, + "loss": 0.1805, + "step": 14699 + }, + { + "epoch": 0.26219098919131023, + "grad_norm": 0.2689625024795532, + "learning_rate": 4.609870170993817e-05, + "loss": 0.175, + "step": 14700 + }, + { + "epoch": 0.2622088253130239, + "grad_norm": 0.3398139774799347, + "learning_rate": 4.6097866719641474e-05, + "loss": 0.1685, + "step": 14701 + }, + { + "epoch": 0.2622266614347376, + "grad_norm": 0.28132590651512146, + "learning_rate": 4.6097031647562164e-05, + "loss": 0.1978, + "step": 14702 + }, + { + "epoch": 0.26224449755645135, + "grad_norm": 0.1833999902009964, + "learning_rate": 4.609619649370346e-05, + "loss": 0.1689, + "step": 14703 + }, + { + "epoch": 0.26226233367816504, + "grad_norm": 0.2901715040206909, + "learning_rate": 4.6095361258068624e-05, + "loss": 0.1558, + "step": 14704 + }, + { + "epoch": 0.2622801697998787, + "grad_norm": 0.27557581663131714, + "learning_rate": 4.6094525940660885e-05, + "loss": 0.1686, + "step": 14705 + }, + { + "epoch": 0.2622980059215924, + "grad_norm": 0.26411107182502747, + "learning_rate": 4.6093690541483465e-05, + "loss": 0.1837, + "step": 14706 + }, + { + "epoch": 0.2623158420433061, + "grad_norm": 0.2018648236989975, + "learning_rate": 4.609285506053962e-05, + "loss": 0.1646, + "step": 14707 + }, + { + "epoch": 0.2623336781650198, + "grad_norm": 0.28513509035110474, + "learning_rate": 4.609201949783259e-05, + "loss": 0.1896, + "step": 14708 + }, + { + "epoch": 0.2623515142867335, + "grad_norm": 0.1935337334871292, + "learning_rate": 4.609118385336559e-05, + "loss": 0.1637, + "step": 14709 + }, + { + "epoch": 0.26236935040844717, + "grad_norm": 0.36007270216941833, + "learning_rate": 4.6090348127141895e-05, + "loss": 0.2552, + "step": 14710 + }, + { + "epoch": 0.26238718653016085, + "grad_norm": 0.3790152966976166, + "learning_rate": 4.608951231916472e-05, + "loss": 0.1821, + "step": 14711 + }, + { + "epoch": 0.2624050226518746, + "grad_norm": 0.2814227342605591, + "learning_rate": 4.6088676429437314e-05, + "loss": 0.1775, + "step": 14712 + }, + { + "epoch": 0.2624228587735883, + "grad_norm": 0.22261036932468414, + "learning_rate": 4.608784045796291e-05, + "loss": 0.1873, + "step": 14713 + }, + { + "epoch": 0.262440694895302, + "grad_norm": 0.2574446201324463, + "learning_rate": 4.608700440474475e-05, + "loss": 0.1596, + "step": 14714 + }, + { + "epoch": 0.26245853101701566, + "grad_norm": 0.2813780605792999, + "learning_rate": 4.608616826978609e-05, + "loss": 0.2092, + "step": 14715 + }, + { + "epoch": 0.26247636713872935, + "grad_norm": 0.26358798146247864, + "learning_rate": 4.6085332053090146e-05, + "loss": 0.1605, + "step": 14716 + }, + { + "epoch": 0.26249420326044304, + "grad_norm": 0.3280124366283417, + "learning_rate": 4.608449575466018e-05, + "loss": 0.2244, + "step": 14717 + }, + { + "epoch": 0.2625120393821567, + "grad_norm": 0.3204196095466614, + "learning_rate": 4.6083659374499424e-05, + "loss": 0.1648, + "step": 14718 + }, + { + "epoch": 0.2625298755038704, + "grad_norm": 0.24182642996311188, + "learning_rate": 4.608282291261112e-05, + "loss": 0.1921, + "step": 14719 + }, + { + "epoch": 0.26254771162558416, + "grad_norm": 0.3450903296470642, + "learning_rate": 4.608198636899851e-05, + "loss": 0.2257, + "step": 14720 + }, + { + "epoch": 0.26256554774729784, + "grad_norm": 0.30469459295272827, + "learning_rate": 4.608114974366485e-05, + "loss": 0.129, + "step": 14721 + }, + { + "epoch": 0.26258338386901153, + "grad_norm": 0.32286977767944336, + "learning_rate": 4.608031303661337e-05, + "loss": 0.1124, + "step": 14722 + }, + { + "epoch": 0.2626012199907252, + "grad_norm": 0.20940642058849335, + "learning_rate": 4.60794762478473e-05, + "loss": 0.1242, + "step": 14723 + }, + { + "epoch": 0.2626190561124389, + "grad_norm": 0.2885874807834625, + "learning_rate": 4.607863937736991e-05, + "loss": 0.1918, + "step": 14724 + }, + { + "epoch": 0.2626368922341526, + "grad_norm": 0.25926509499549866, + "learning_rate": 4.6077802425184444e-05, + "loss": 0.2039, + "step": 14725 + }, + { + "epoch": 0.2626547283558663, + "grad_norm": 0.2674572467803955, + "learning_rate": 4.6076965391294124e-05, + "loss": 0.15, + "step": 14726 + }, + { + "epoch": 0.26267256447758, + "grad_norm": 0.282661497592926, + "learning_rate": 4.6076128275702205e-05, + "loss": 0.1749, + "step": 14727 + }, + { + "epoch": 0.2626904005992937, + "grad_norm": 0.37727436423301697, + "learning_rate": 4.6075291078411945e-05, + "loss": 0.2124, + "step": 14728 + }, + { + "epoch": 0.2627082367210074, + "grad_norm": 0.2580815553665161, + "learning_rate": 4.6074453799426564e-05, + "loss": 0.192, + "step": 14729 + }, + { + "epoch": 0.2627260728427211, + "grad_norm": 0.3615904748439789, + "learning_rate": 4.607361643874932e-05, + "loss": 0.1766, + "step": 14730 + }, + { + "epoch": 0.2627439089644348, + "grad_norm": 0.2709302306175232, + "learning_rate": 4.607277899638347e-05, + "loss": 0.1882, + "step": 14731 + }, + { + "epoch": 0.26276174508614847, + "grad_norm": 0.24105305969715118, + "learning_rate": 4.607194147233225e-05, + "loss": 0.1433, + "step": 14732 + }, + { + "epoch": 0.26277958120786216, + "grad_norm": 0.20890876650810242, + "learning_rate": 4.607110386659891e-05, + "loss": 0.1813, + "step": 14733 + }, + { + "epoch": 0.26279741732957584, + "grad_norm": 0.26178038120269775, + "learning_rate": 4.607026617918668e-05, + "loss": 0.1533, + "step": 14734 + }, + { + "epoch": 0.26281525345128953, + "grad_norm": 0.23858247697353363, + "learning_rate": 4.606942841009883e-05, + "loss": 0.1827, + "step": 14735 + }, + { + "epoch": 0.2628330895730032, + "grad_norm": 0.2497948259115219, + "learning_rate": 4.60685905593386e-05, + "loss": 0.1655, + "step": 14736 + }, + { + "epoch": 0.26285092569471696, + "grad_norm": 0.23599396646022797, + "learning_rate": 4.606775262690923e-05, + "loss": 0.19, + "step": 14737 + }, + { + "epoch": 0.26286876181643065, + "grad_norm": 0.2434827983379364, + "learning_rate": 4.6066914612813974e-05, + "loss": 0.1701, + "step": 14738 + }, + { + "epoch": 0.26288659793814434, + "grad_norm": 0.2567709982395172, + "learning_rate": 4.606607651705609e-05, + "loss": 0.1936, + "step": 14739 + }, + { + "epoch": 0.262904434059858, + "grad_norm": 0.3965451121330261, + "learning_rate": 4.606523833963881e-05, + "loss": 0.2325, + "step": 14740 + }, + { + "epoch": 0.2629222701815717, + "grad_norm": 0.1909569650888443, + "learning_rate": 4.6064400080565395e-05, + "loss": 0.1411, + "step": 14741 + }, + { + "epoch": 0.2629401063032854, + "grad_norm": 0.2688005566596985, + "learning_rate": 4.606356173983908e-05, + "loss": 0.1833, + "step": 14742 + }, + { + "epoch": 0.2629579424249991, + "grad_norm": 0.2818892002105713, + "learning_rate": 4.6062723317463136e-05, + "loss": 0.1848, + "step": 14743 + }, + { + "epoch": 0.2629757785467128, + "grad_norm": 0.24099214375019073, + "learning_rate": 4.6061884813440796e-05, + "loss": 0.1594, + "step": 14744 + }, + { + "epoch": 0.2629936146684265, + "grad_norm": 0.266245037317276, + "learning_rate": 4.6061046227775316e-05, + "loss": 0.2316, + "step": 14745 + }, + { + "epoch": 0.2630114507901402, + "grad_norm": 0.2762731909751892, + "learning_rate": 4.606020756046995e-05, + "loss": 0.1739, + "step": 14746 + }, + { + "epoch": 0.2630292869118539, + "grad_norm": 0.31140822172164917, + "learning_rate": 4.605936881152794e-05, + "loss": 0.2014, + "step": 14747 + }, + { + "epoch": 0.2630471230335676, + "grad_norm": 0.2451024204492569, + "learning_rate": 4.605852998095255e-05, + "loss": 0.1918, + "step": 14748 + }, + { + "epoch": 0.2630649591552813, + "grad_norm": 0.4069274365901947, + "learning_rate": 4.605769106874702e-05, + "loss": 0.1374, + "step": 14749 + }, + { + "epoch": 0.26308279527699496, + "grad_norm": 0.2511003315448761, + "learning_rate": 4.60568520749146e-05, + "loss": 0.1413, + "step": 14750 + }, + { + "epoch": 0.26310063139870865, + "grad_norm": 0.226557195186615, + "learning_rate": 4.605601299945856e-05, + "loss": 0.2155, + "step": 14751 + }, + { + "epoch": 0.26311846752042234, + "grad_norm": 0.262915700674057, + "learning_rate": 4.605517384238214e-05, + "loss": 0.1531, + "step": 14752 + }, + { + "epoch": 0.263136303642136, + "grad_norm": 0.38432836532592773, + "learning_rate": 4.6054334603688584e-05, + "loss": 0.1539, + "step": 14753 + }, + { + "epoch": 0.26315413976384977, + "grad_norm": 0.3766404092311859, + "learning_rate": 4.605349528338116e-05, + "loss": 0.1755, + "step": 14754 + }, + { + "epoch": 0.26317197588556346, + "grad_norm": 0.2324799746274948, + "learning_rate": 4.6052655881463126e-05, + "loss": 0.1566, + "step": 14755 + }, + { + "epoch": 0.26318981200727715, + "grad_norm": 0.3256889581680298, + "learning_rate": 4.6051816397937714e-05, + "loss": 0.1893, + "step": 14756 + }, + { + "epoch": 0.26320764812899083, + "grad_norm": 0.2917213439941406, + "learning_rate": 4.605097683280819e-05, + "loss": 0.1898, + "step": 14757 + }, + { + "epoch": 0.2632254842507045, + "grad_norm": 0.34756094217300415, + "learning_rate": 4.605013718607782e-05, + "loss": 0.2098, + "step": 14758 + }, + { + "epoch": 0.2632433203724182, + "grad_norm": 0.3783144950866699, + "learning_rate": 4.6049297457749844e-05, + "loss": 0.2609, + "step": 14759 + }, + { + "epoch": 0.2632611564941319, + "grad_norm": 0.4534520208835602, + "learning_rate": 4.6048457647827515e-05, + "loss": 0.1686, + "step": 14760 + }, + { + "epoch": 0.2632789926158456, + "grad_norm": 0.24218110740184784, + "learning_rate": 4.60476177563141e-05, + "loss": 0.2359, + "step": 14761 + }, + { + "epoch": 0.26329682873755933, + "grad_norm": 0.2579290568828583, + "learning_rate": 4.604677778321285e-05, + "loss": 0.1819, + "step": 14762 + }, + { + "epoch": 0.263314664859273, + "grad_norm": 0.39508339762687683, + "learning_rate": 4.6045937728527014e-05, + "loss": 0.2125, + "step": 14763 + }, + { + "epoch": 0.2633325009809867, + "grad_norm": 0.3167424499988556, + "learning_rate": 4.604509759225986e-05, + "loss": 0.151, + "step": 14764 + }, + { + "epoch": 0.2633503371027004, + "grad_norm": 0.23372012376785278, + "learning_rate": 4.6044257374414636e-05, + "loss": 0.2015, + "step": 14765 + }, + { + "epoch": 0.2633681732244141, + "grad_norm": 0.36966976523399353, + "learning_rate": 4.6043417074994596e-05, + "loss": 0.1999, + "step": 14766 + }, + { + "epoch": 0.26338600934612777, + "grad_norm": 0.22495873272418976, + "learning_rate": 4.6042576694003014e-05, + "loss": 0.1575, + "step": 14767 + }, + { + "epoch": 0.26340384546784146, + "grad_norm": 0.3362847566604614, + "learning_rate": 4.6041736231443135e-05, + "loss": 0.2179, + "step": 14768 + }, + { + "epoch": 0.26342168158955515, + "grad_norm": 0.38222536444664, + "learning_rate": 4.604089568731821e-05, + "loss": 0.1585, + "step": 14769 + }, + { + "epoch": 0.2634395177112689, + "grad_norm": 0.3842891752719879, + "learning_rate": 4.604005506163152e-05, + "loss": 0.2175, + "step": 14770 + }, + { + "epoch": 0.2634573538329826, + "grad_norm": 0.33944371342658997, + "learning_rate": 4.6039214354386296e-05, + "loss": 0.1776, + "step": 14771 + }, + { + "epoch": 0.26347518995469627, + "grad_norm": 0.28608548641204834, + "learning_rate": 4.6038373565585816e-05, + "loss": 0.1851, + "step": 14772 + }, + { + "epoch": 0.26349302607640995, + "grad_norm": 0.2767927348613739, + "learning_rate": 4.603753269523333e-05, + "loss": 0.169, + "step": 14773 + }, + { + "epoch": 0.26351086219812364, + "grad_norm": 0.22000938653945923, + "learning_rate": 4.603669174333211e-05, + "loss": 0.1863, + "step": 14774 + }, + { + "epoch": 0.26352869831983733, + "grad_norm": 0.2553474009037018, + "learning_rate": 4.60358507098854e-05, + "loss": 0.1396, + "step": 14775 + }, + { + "epoch": 0.263546534441551, + "grad_norm": 0.2102203667163849, + "learning_rate": 4.603500959489647e-05, + "loss": 0.1612, + "step": 14776 + }, + { + "epoch": 0.2635643705632647, + "grad_norm": 0.3251095712184906, + "learning_rate": 4.603416839836857e-05, + "loss": 0.2209, + "step": 14777 + }, + { + "epoch": 0.2635822066849784, + "grad_norm": 0.23818355798721313, + "learning_rate": 4.603332712030498e-05, + "loss": 0.1646, + "step": 14778 + }, + { + "epoch": 0.26360004280669214, + "grad_norm": 0.36736127734184265, + "learning_rate": 4.603248576070894e-05, + "loss": 0.1883, + "step": 14779 + }, + { + "epoch": 0.2636178789284058, + "grad_norm": 0.39555251598358154, + "learning_rate": 4.603164431958373e-05, + "loss": 0.1586, + "step": 14780 + }, + { + "epoch": 0.2636357150501195, + "grad_norm": 0.3390832245349884, + "learning_rate": 4.6030802796932594e-05, + "loss": 0.1649, + "step": 14781 + }, + { + "epoch": 0.2636535511718332, + "grad_norm": 0.21361882984638214, + "learning_rate": 4.602996119275881e-05, + "loss": 0.169, + "step": 14782 + }, + { + "epoch": 0.2636713872935469, + "grad_norm": 0.2697640657424927, + "learning_rate": 4.602911950706563e-05, + "loss": 0.1596, + "step": 14783 + }, + { + "epoch": 0.2636892234152606, + "grad_norm": 0.23366592824459076, + "learning_rate": 4.6028277739856315e-05, + "loss": 0.1736, + "step": 14784 + }, + { + "epoch": 0.26370705953697426, + "grad_norm": 0.32984569668769836, + "learning_rate": 4.602743589113413e-05, + "loss": 0.1447, + "step": 14785 + }, + { + "epoch": 0.26372489565868795, + "grad_norm": 0.2873001992702484, + "learning_rate": 4.6026593960902356e-05, + "loss": 0.2081, + "step": 14786 + }, + { + "epoch": 0.2637427317804017, + "grad_norm": 0.3492843806743622, + "learning_rate": 4.602575194916423e-05, + "loss": 0.1993, + "step": 14787 + }, + { + "epoch": 0.2637605679021154, + "grad_norm": 0.3649522364139557, + "learning_rate": 4.6024909855923024e-05, + "loss": 0.2071, + "step": 14788 + }, + { + "epoch": 0.26377840402382907, + "grad_norm": 0.23045486211776733, + "learning_rate": 4.6024067681182014e-05, + "loss": 0.1525, + "step": 14789 + }, + { + "epoch": 0.26379624014554276, + "grad_norm": 0.23058190941810608, + "learning_rate": 4.602322542494446e-05, + "loss": 0.1891, + "step": 14790 + }, + { + "epoch": 0.26381407626725645, + "grad_norm": 0.3273070454597473, + "learning_rate": 4.602238308721362e-05, + "loss": 0.2048, + "step": 14791 + }, + { + "epoch": 0.26383191238897014, + "grad_norm": 0.3690316081047058, + "learning_rate": 4.602154066799275e-05, + "loss": 0.2785, + "step": 14792 + }, + { + "epoch": 0.2638497485106838, + "grad_norm": 0.3431062698364258, + "learning_rate": 4.602069816728514e-05, + "loss": 0.1687, + "step": 14793 + }, + { + "epoch": 0.2638675846323975, + "grad_norm": 0.24640703201293945, + "learning_rate": 4.601985558509404e-05, + "loss": 0.2048, + "step": 14794 + }, + { + "epoch": 0.2638854207541112, + "grad_norm": 0.23419922590255737, + "learning_rate": 4.601901292142272e-05, + "loss": 0.1385, + "step": 14795 + }, + { + "epoch": 0.26390325687582494, + "grad_norm": 0.21126295626163483, + "learning_rate": 4.6018170176274445e-05, + "loss": 0.1834, + "step": 14796 + }, + { + "epoch": 0.26392109299753863, + "grad_norm": 0.23499855399131775, + "learning_rate": 4.601732734965248e-05, + "loss": 0.1665, + "step": 14797 + }, + { + "epoch": 0.2639389291192523, + "grad_norm": 0.641426146030426, + "learning_rate": 4.6016484441560103e-05, + "loss": 0.174, + "step": 14798 + }, + { + "epoch": 0.263956765240966, + "grad_norm": 0.31874483823776245, + "learning_rate": 4.601564145200057e-05, + "loss": 0.2092, + "step": 14799 + }, + { + "epoch": 0.2639746013626797, + "grad_norm": 0.41482603549957275, + "learning_rate": 4.601479838097715e-05, + "loss": 0.2215, + "step": 14800 + }, + { + "epoch": 0.2639924374843934, + "grad_norm": 0.3151341378688812, + "learning_rate": 4.6013955228493115e-05, + "loss": 0.2024, + "step": 14801 + }, + { + "epoch": 0.26401027360610707, + "grad_norm": 0.24253544211387634, + "learning_rate": 4.6013111994551736e-05, + "loss": 0.1869, + "step": 14802 + }, + { + "epoch": 0.26402810972782076, + "grad_norm": 0.3533554673194885, + "learning_rate": 4.601226867915627e-05, + "loss": 0.1484, + "step": 14803 + }, + { + "epoch": 0.2640459458495345, + "grad_norm": 0.29354128241539, + "learning_rate": 4.6011425282309996e-05, + "loss": 0.173, + "step": 14804 + }, + { + "epoch": 0.2640637819712482, + "grad_norm": 0.25246462225914, + "learning_rate": 4.601058180401619e-05, + "loss": 0.1489, + "step": 14805 + }, + { + "epoch": 0.2640816180929619, + "grad_norm": 0.30939918756484985, + "learning_rate": 4.600973824427809e-05, + "loss": 0.1946, + "step": 14806 + }, + { + "epoch": 0.26409945421467557, + "grad_norm": 0.2330443263053894, + "learning_rate": 4.600889460309901e-05, + "loss": 0.1423, + "step": 14807 + }, + { + "epoch": 0.26411729033638925, + "grad_norm": 0.24471040070056915, + "learning_rate": 4.6008050880482184e-05, + "loss": 0.1779, + "step": 14808 + }, + { + "epoch": 0.26413512645810294, + "grad_norm": 0.31686288118362427, + "learning_rate": 4.60072070764309e-05, + "loss": 0.2071, + "step": 14809 + }, + { + "epoch": 0.26415296257981663, + "grad_norm": 0.30483901500701904, + "learning_rate": 4.600636319094843e-05, + "loss": 0.1811, + "step": 14810 + }, + { + "epoch": 0.2641707987015303, + "grad_norm": 0.2643861770629883, + "learning_rate": 4.600551922403804e-05, + "loss": 0.162, + "step": 14811 + }, + { + "epoch": 0.264188634823244, + "grad_norm": 0.33887606859207153, + "learning_rate": 4.6004675175702994e-05, + "loss": 0.2067, + "step": 14812 + }, + { + "epoch": 0.26420647094495775, + "grad_norm": 0.22127081453800201, + "learning_rate": 4.6003831045946584e-05, + "loss": 0.182, + "step": 14813 + }, + { + "epoch": 0.26422430706667144, + "grad_norm": 0.27946820855140686, + "learning_rate": 4.6002986834772066e-05, + "loss": 0.2271, + "step": 14814 + }, + { + "epoch": 0.2642421431883851, + "grad_norm": 0.3012564480304718, + "learning_rate": 4.600214254218271e-05, + "loss": 0.1568, + "step": 14815 + }, + { + "epoch": 0.2642599793100988, + "grad_norm": 0.2642149031162262, + "learning_rate": 4.6001298168181804e-05, + "loss": 0.1863, + "step": 14816 + }, + { + "epoch": 0.2642778154318125, + "grad_norm": 0.291229248046875, + "learning_rate": 4.600045371277262e-05, + "loss": 0.2003, + "step": 14817 + }, + { + "epoch": 0.2642956515535262, + "grad_norm": 0.26569968461990356, + "learning_rate": 4.599960917595841e-05, + "loss": 0.1767, + "step": 14818 + }, + { + "epoch": 0.2643134876752399, + "grad_norm": 0.4270315170288086, + "learning_rate": 4.599876455774246e-05, + "loss": 0.2409, + "step": 14819 + }, + { + "epoch": 0.26433132379695357, + "grad_norm": 0.27255186438560486, + "learning_rate": 4.5997919858128056e-05, + "loss": 0.1935, + "step": 14820 + }, + { + "epoch": 0.2643491599186673, + "grad_norm": 0.2789528965950012, + "learning_rate": 4.599707507711846e-05, + "loss": 0.1568, + "step": 14821 + }, + { + "epoch": 0.264366996040381, + "grad_norm": 0.27071553468704224, + "learning_rate": 4.5996230214716944e-05, + "loss": 0.1357, + "step": 14822 + }, + { + "epoch": 0.2643848321620947, + "grad_norm": 0.3277747631072998, + "learning_rate": 4.599538527092679e-05, + "loss": 0.1625, + "step": 14823 + }, + { + "epoch": 0.2644026682838084, + "grad_norm": 0.28402459621429443, + "learning_rate": 4.599454024575127e-05, + "loss": 0.1828, + "step": 14824 + }, + { + "epoch": 0.26442050440552206, + "grad_norm": 0.29564547538757324, + "learning_rate": 4.599369513919367e-05, + "loss": 0.192, + "step": 14825 + }, + { + "epoch": 0.26443834052723575, + "grad_norm": 0.30252617597579956, + "learning_rate": 4.5992849951257246e-05, + "loss": 0.1997, + "step": 14826 + }, + { + "epoch": 0.26445617664894944, + "grad_norm": 0.29845741391181946, + "learning_rate": 4.599200468194529e-05, + "loss": 0.2428, + "step": 14827 + }, + { + "epoch": 0.2644740127706631, + "grad_norm": 0.23279666900634766, + "learning_rate": 4.599115933126107e-05, + "loss": 0.1603, + "step": 14828 + }, + { + "epoch": 0.26449184889237687, + "grad_norm": 0.3187519907951355, + "learning_rate": 4.599031389920787e-05, + "loss": 0.1812, + "step": 14829 + }, + { + "epoch": 0.26450968501409056, + "grad_norm": 0.4159899055957794, + "learning_rate": 4.598946838578896e-05, + "loss": 0.1809, + "step": 14830 + }, + { + "epoch": 0.26452752113580424, + "grad_norm": 0.2733418345451355, + "learning_rate": 4.598862279100762e-05, + "loss": 0.2004, + "step": 14831 + }, + { + "epoch": 0.26454535725751793, + "grad_norm": 0.2938709557056427, + "learning_rate": 4.598777711486714e-05, + "loss": 0.1865, + "step": 14832 + }, + { + "epoch": 0.2645631933792316, + "grad_norm": 0.2521997094154358, + "learning_rate": 4.598693135737078e-05, + "loss": 0.1584, + "step": 14833 + }, + { + "epoch": 0.2645810295009453, + "grad_norm": 0.3254789710044861, + "learning_rate": 4.598608551852181e-05, + "loss": 0.2071, + "step": 14834 + }, + { + "epoch": 0.264598865622659, + "grad_norm": 0.2943840026855469, + "learning_rate": 4.598523959832355e-05, + "loss": 0.1723, + "step": 14835 + }, + { + "epoch": 0.2646167017443727, + "grad_norm": 0.2356782704591751, + "learning_rate": 4.598439359677924e-05, + "loss": 0.1964, + "step": 14836 + }, + { + "epoch": 0.2646345378660864, + "grad_norm": 0.3495713174343109, + "learning_rate": 4.598354751389217e-05, + "loss": 0.2269, + "step": 14837 + }, + { + "epoch": 0.2646523739878001, + "grad_norm": 0.22048158943653107, + "learning_rate": 4.598270134966562e-05, + "loss": 0.163, + "step": 14838 + }, + { + "epoch": 0.2646702101095138, + "grad_norm": 0.3752993047237396, + "learning_rate": 4.598185510410289e-05, + "loss": 0.1806, + "step": 14839 + }, + { + "epoch": 0.2646880462312275, + "grad_norm": 0.29053422808647156, + "learning_rate": 4.5981008777207225e-05, + "loss": 0.1866, + "step": 14840 + }, + { + "epoch": 0.2647058823529412, + "grad_norm": 0.23643746972084045, + "learning_rate": 4.598016236898193e-05, + "loss": 0.1688, + "step": 14841 + }, + { + "epoch": 0.26472371847465487, + "grad_norm": 0.2562626302242279, + "learning_rate": 4.597931587943029e-05, + "loss": 0.1992, + "step": 14842 + }, + { + "epoch": 0.26474155459636856, + "grad_norm": 0.30713966488838196, + "learning_rate": 4.597846930855556e-05, + "loss": 0.2131, + "step": 14843 + }, + { + "epoch": 0.26475939071808224, + "grad_norm": 0.2793387770652771, + "learning_rate": 4.597762265636104e-05, + "loss": 0.1766, + "step": 14844 + }, + { + "epoch": 0.26477722683979593, + "grad_norm": 0.23357027769088745, + "learning_rate": 4.5976775922850014e-05, + "loss": 0.2249, + "step": 14845 + }, + { + "epoch": 0.2647950629615097, + "grad_norm": 0.2755261957645416, + "learning_rate": 4.597592910802575e-05, + "loss": 0.1462, + "step": 14846 + }, + { + "epoch": 0.26481289908322336, + "grad_norm": 0.35624149441719055, + "learning_rate": 4.597508221189155e-05, + "loss": 0.1886, + "step": 14847 + }, + { + "epoch": 0.26483073520493705, + "grad_norm": 0.2380143105983734, + "learning_rate": 4.597423523445068e-05, + "loss": 0.1912, + "step": 14848 + }, + { + "epoch": 0.26484857132665074, + "grad_norm": 0.2834615111351013, + "learning_rate": 4.597338817570643e-05, + "loss": 0.1752, + "step": 14849 + }, + { + "epoch": 0.2648664074483644, + "grad_norm": 0.26098644733428955, + "learning_rate": 4.597254103566209e-05, + "loss": 0.1943, + "step": 14850 + }, + { + "epoch": 0.2648842435700781, + "grad_norm": 0.3464111089706421, + "learning_rate": 4.5971693814320934e-05, + "loss": 0.2229, + "step": 14851 + }, + { + "epoch": 0.2649020796917918, + "grad_norm": 0.2722216844558716, + "learning_rate": 4.597084651168625e-05, + "loss": 0.2095, + "step": 14852 + }, + { + "epoch": 0.2649199158135055, + "grad_norm": 0.25106334686279297, + "learning_rate": 4.596999912776132e-05, + "loss": 0.1757, + "step": 14853 + }, + { + "epoch": 0.2649377519352192, + "grad_norm": 0.2816717028617859, + "learning_rate": 4.5969151662549435e-05, + "loss": 0.1591, + "step": 14854 + }, + { + "epoch": 0.2649555880569329, + "grad_norm": 0.2325526475906372, + "learning_rate": 4.596830411605387e-05, + "loss": 0.142, + "step": 14855 + }, + { + "epoch": 0.2649734241786466, + "grad_norm": 0.29756152629852295, + "learning_rate": 4.596745648827792e-05, + "loss": 0.195, + "step": 14856 + }, + { + "epoch": 0.2649912603003603, + "grad_norm": 0.31311625242233276, + "learning_rate": 4.596660877922486e-05, + "loss": 0.1641, + "step": 14857 + }, + { + "epoch": 0.265009096422074, + "grad_norm": 0.36467182636260986, + "learning_rate": 4.596576098889799e-05, + "loss": 0.2028, + "step": 14858 + }, + { + "epoch": 0.2650269325437877, + "grad_norm": 0.2742144465446472, + "learning_rate": 4.596491311730059e-05, + "loss": 0.1742, + "step": 14859 + }, + { + "epoch": 0.26504476866550136, + "grad_norm": 0.3176640272140503, + "learning_rate": 4.596406516443594e-05, + "loss": 0.1841, + "step": 14860 + }, + { + "epoch": 0.26506260478721505, + "grad_norm": 0.23231923580169678, + "learning_rate": 4.596321713030733e-05, + "loss": 0.144, + "step": 14861 + }, + { + "epoch": 0.26508044090892874, + "grad_norm": 0.18834541738033295, + "learning_rate": 4.596236901491806e-05, + "loss": 0.1796, + "step": 14862 + }, + { + "epoch": 0.2650982770306425, + "grad_norm": 0.35587966442108154, + "learning_rate": 4.5961520818271407e-05, + "loss": 0.2077, + "step": 14863 + }, + { + "epoch": 0.26511611315235617, + "grad_norm": 0.2952534854412079, + "learning_rate": 4.596067254037065e-05, + "loss": 0.2131, + "step": 14864 + }, + { + "epoch": 0.26513394927406986, + "grad_norm": 0.21149951219558716, + "learning_rate": 4.595982418121909e-05, + "loss": 0.1828, + "step": 14865 + }, + { + "epoch": 0.26515178539578355, + "grad_norm": 0.26897165179252625, + "learning_rate": 4.595897574082002e-05, + "loss": 0.1506, + "step": 14866 + }, + { + "epoch": 0.26516962151749723, + "grad_norm": 0.32345300912857056, + "learning_rate": 4.595812721917672e-05, + "loss": 0.1937, + "step": 14867 + }, + { + "epoch": 0.2651874576392109, + "grad_norm": 0.23734283447265625, + "learning_rate": 4.5957278616292466e-05, + "loss": 0.1758, + "step": 14868 + }, + { + "epoch": 0.2652052937609246, + "grad_norm": 0.24701447784900665, + "learning_rate": 4.595642993217057e-05, + "loss": 0.1862, + "step": 14869 + }, + { + "epoch": 0.2652231298826383, + "grad_norm": 0.28314924240112305, + "learning_rate": 4.595558116681432e-05, + "loss": 0.1851, + "step": 14870 + }, + { + "epoch": 0.26524096600435204, + "grad_norm": 0.3213120996952057, + "learning_rate": 4.595473232022699e-05, + "loss": 0.2177, + "step": 14871 + }, + { + "epoch": 0.26525880212606573, + "grad_norm": 0.225111186504364, + "learning_rate": 4.5953883392411886e-05, + "loss": 0.1938, + "step": 14872 + }, + { + "epoch": 0.2652766382477794, + "grad_norm": 0.2745521366596222, + "learning_rate": 4.5953034383372294e-05, + "loss": 0.1955, + "step": 14873 + }, + { + "epoch": 0.2652944743694931, + "grad_norm": 0.29025590419769287, + "learning_rate": 4.595218529311149e-05, + "loss": 0.1437, + "step": 14874 + }, + { + "epoch": 0.2653123104912068, + "grad_norm": 0.43301862478256226, + "learning_rate": 4.595133612163279e-05, + "loss": 0.1717, + "step": 14875 + }, + { + "epoch": 0.2653301466129205, + "grad_norm": 0.37087392807006836, + "learning_rate": 4.595048686893948e-05, + "loss": 0.2419, + "step": 14876 + }, + { + "epoch": 0.26534798273463417, + "grad_norm": 0.272806316614151, + "learning_rate": 4.594963753503484e-05, + "loss": 0.1218, + "step": 14877 + }, + { + "epoch": 0.26536581885634786, + "grad_norm": 0.22667503356933594, + "learning_rate": 4.594878811992217e-05, + "loss": 0.15, + "step": 14878 + }, + { + "epoch": 0.26538365497806155, + "grad_norm": 0.2920016944408417, + "learning_rate": 4.594793862360477e-05, + "loss": 0.2233, + "step": 14879 + }, + { + "epoch": 0.2654014910997753, + "grad_norm": 0.37423068284988403, + "learning_rate": 4.594708904608591e-05, + "loss": 0.1635, + "step": 14880 + }, + { + "epoch": 0.265419327221489, + "grad_norm": 0.26330453157424927, + "learning_rate": 4.5946239387368906e-05, + "loss": 0.2019, + "step": 14881 + }, + { + "epoch": 0.26543716334320266, + "grad_norm": 0.3152129650115967, + "learning_rate": 4.594538964745704e-05, + "loss": 0.1831, + "step": 14882 + }, + { + "epoch": 0.26545499946491635, + "grad_norm": 0.2315073013305664, + "learning_rate": 4.5944539826353614e-05, + "loss": 0.1675, + "step": 14883 + }, + { + "epoch": 0.26547283558663004, + "grad_norm": 0.41672590374946594, + "learning_rate": 4.5943689924061915e-05, + "loss": 0.2603, + "step": 14884 + }, + { + "epoch": 0.26549067170834373, + "grad_norm": 0.5349219441413879, + "learning_rate": 4.594283994058524e-05, + "loss": 0.2172, + "step": 14885 + }, + { + "epoch": 0.2655085078300574, + "grad_norm": 0.23204335570335388, + "learning_rate": 4.5941989875926886e-05, + "loss": 0.1608, + "step": 14886 + }, + { + "epoch": 0.2655263439517711, + "grad_norm": 0.2690698802471161, + "learning_rate": 4.594113973009014e-05, + "loss": 0.1486, + "step": 14887 + }, + { + "epoch": 0.26554418007348485, + "grad_norm": 0.32761090993881226, + "learning_rate": 4.5940289503078313e-05, + "loss": 0.2036, + "step": 14888 + }, + { + "epoch": 0.26556201619519854, + "grad_norm": 0.315785676240921, + "learning_rate": 4.593943919489469e-05, + "loss": 0.2312, + "step": 14889 + }, + { + "epoch": 0.2655798523169122, + "grad_norm": 0.2315066009759903, + "learning_rate": 4.593858880554256e-05, + "loss": 0.1903, + "step": 14890 + }, + { + "epoch": 0.2655976884386259, + "grad_norm": 0.270658016204834, + "learning_rate": 4.593773833502524e-05, + "loss": 0.1951, + "step": 14891 + }, + { + "epoch": 0.2656155245603396, + "grad_norm": 0.26397615671157837, + "learning_rate": 4.5936887783346005e-05, + "loss": 0.1643, + "step": 14892 + }, + { + "epoch": 0.2656333606820533, + "grad_norm": 0.22954022884368896, + "learning_rate": 4.593603715050816e-05, + "loss": 0.1552, + "step": 14893 + }, + { + "epoch": 0.265651196803767, + "grad_norm": 0.2739209234714508, + "learning_rate": 4.593518643651501e-05, + "loss": 0.1594, + "step": 14894 + }, + { + "epoch": 0.26566903292548066, + "grad_norm": 0.24630504846572876, + "learning_rate": 4.593433564136984e-05, + "loss": 0.2015, + "step": 14895 + }, + { + "epoch": 0.26568686904719435, + "grad_norm": 0.26596322655677795, + "learning_rate": 4.593348476507596e-05, + "loss": 0.1862, + "step": 14896 + }, + { + "epoch": 0.2657047051689081, + "grad_norm": 0.21672040224075317, + "learning_rate": 4.5932633807636664e-05, + "loss": 0.1406, + "step": 14897 + }, + { + "epoch": 0.2657225412906218, + "grad_norm": 0.21635863184928894, + "learning_rate": 4.593178276905525e-05, + "loss": 0.1915, + "step": 14898 + }, + { + "epoch": 0.26574037741233547, + "grad_norm": 0.3071916699409485, + "learning_rate": 4.593093164933501e-05, + "loss": 0.2027, + "step": 14899 + }, + { + "epoch": 0.26575821353404916, + "grad_norm": 0.26271358132362366, + "learning_rate": 4.5930080448479254e-05, + "loss": 0.1926, + "step": 14900 + }, + { + "epoch": 0.26577604965576285, + "grad_norm": 0.18739892542362213, + "learning_rate": 4.592922916649128e-05, + "loss": 0.1722, + "step": 14901 + }, + { + "epoch": 0.26579388577747654, + "grad_norm": 0.23717336356639862, + "learning_rate": 4.592837780337438e-05, + "loss": 0.2023, + "step": 14902 + }, + { + "epoch": 0.2658117218991902, + "grad_norm": 0.23732641339302063, + "learning_rate": 4.592752635913186e-05, + "loss": 0.151, + "step": 14903 + }, + { + "epoch": 0.2658295580209039, + "grad_norm": 0.2488204389810562, + "learning_rate": 4.5926674833767026e-05, + "loss": 0.1573, + "step": 14904 + }, + { + "epoch": 0.26584739414261765, + "grad_norm": 0.29487520456314087, + "learning_rate": 4.592582322728316e-05, + "loss": 0.1747, + "step": 14905 + }, + { + "epoch": 0.26586523026433134, + "grad_norm": 0.2556423842906952, + "learning_rate": 4.592497153968358e-05, + "loss": 0.1888, + "step": 14906 + }, + { + "epoch": 0.26588306638604503, + "grad_norm": 0.27322086691856384, + "learning_rate": 4.592411977097159e-05, + "loss": 0.2006, + "step": 14907 + }, + { + "epoch": 0.2659009025077587, + "grad_norm": 0.31154635548591614, + "learning_rate": 4.592326792115048e-05, + "loss": 0.2188, + "step": 14908 + }, + { + "epoch": 0.2659187386294724, + "grad_norm": 0.2314896583557129, + "learning_rate": 4.5922415990223556e-05, + "loss": 0.2032, + "step": 14909 + }, + { + "epoch": 0.2659365747511861, + "grad_norm": 0.265146940946579, + "learning_rate": 4.5921563978194125e-05, + "loss": 0.156, + "step": 14910 + }, + { + "epoch": 0.2659544108728998, + "grad_norm": 0.2430349886417389, + "learning_rate": 4.5920711885065485e-05, + "loss": 0.2064, + "step": 14911 + }, + { + "epoch": 0.26597224699461347, + "grad_norm": 0.22456717491149902, + "learning_rate": 4.591985971084094e-05, + "loss": 0.1786, + "step": 14912 + }, + { + "epoch": 0.26599008311632716, + "grad_norm": 0.45749056339263916, + "learning_rate": 4.5919007455523786e-05, + "loss": 0.2117, + "step": 14913 + }, + { + "epoch": 0.2660079192380409, + "grad_norm": 0.25490471720695496, + "learning_rate": 4.591815511911734e-05, + "loss": 0.1512, + "step": 14914 + }, + { + "epoch": 0.2660257553597546, + "grad_norm": 0.21868674457073212, + "learning_rate": 4.5917302701624896e-05, + "loss": 0.1433, + "step": 14915 + }, + { + "epoch": 0.2660435914814683, + "grad_norm": 0.2644864022731781, + "learning_rate": 4.591645020304977e-05, + "loss": 0.2045, + "step": 14916 + }, + { + "epoch": 0.26606142760318197, + "grad_norm": 0.35034647583961487, + "learning_rate": 4.591559762339526e-05, + "loss": 0.2566, + "step": 14917 + }, + { + "epoch": 0.26607926372489565, + "grad_norm": 0.3079293668270111, + "learning_rate": 4.591474496266466e-05, + "loss": 0.1552, + "step": 14918 + }, + { + "epoch": 0.26609709984660934, + "grad_norm": 0.27546125650405884, + "learning_rate": 4.5913892220861285e-05, + "loss": 0.1505, + "step": 14919 + }, + { + "epoch": 0.26611493596832303, + "grad_norm": 0.2736878991127014, + "learning_rate": 4.5913039397988445e-05, + "loss": 0.1617, + "step": 14920 + }, + { + "epoch": 0.2661327720900367, + "grad_norm": 0.22687847912311554, + "learning_rate": 4.591218649404944e-05, + "loss": 0.1834, + "step": 14921 + }, + { + "epoch": 0.26615060821175046, + "grad_norm": 0.2042544186115265, + "learning_rate": 4.591133350904758e-05, + "loss": 0.1951, + "step": 14922 + }, + { + "epoch": 0.26616844433346415, + "grad_norm": 0.30572453141212463, + "learning_rate": 4.591048044298617e-05, + "loss": 0.2301, + "step": 14923 + }, + { + "epoch": 0.26618628045517784, + "grad_norm": 0.28840720653533936, + "learning_rate": 4.5909627295868506e-05, + "loss": 0.1893, + "step": 14924 + }, + { + "epoch": 0.2662041165768915, + "grad_norm": 0.34331366419792175, + "learning_rate": 4.5908774067697915e-05, + "loss": 0.1541, + "step": 14925 + }, + { + "epoch": 0.2662219526986052, + "grad_norm": 0.35308146476745605, + "learning_rate": 4.590792075847769e-05, + "loss": 0.1943, + "step": 14926 + }, + { + "epoch": 0.2662397888203189, + "grad_norm": 0.2985835373401642, + "learning_rate": 4.590706736821114e-05, + "loss": 0.2134, + "step": 14927 + }, + { + "epoch": 0.2662576249420326, + "grad_norm": 0.2805352210998535, + "learning_rate": 4.590621389690158e-05, + "loss": 0.2112, + "step": 14928 + }, + { + "epoch": 0.2662754610637463, + "grad_norm": 0.28096508979797363, + "learning_rate": 4.5905360344552315e-05, + "loss": 0.1767, + "step": 14929 + }, + { + "epoch": 0.26629329718546, + "grad_norm": 0.2708877623081207, + "learning_rate": 4.5904506711166644e-05, + "loss": 0.2033, + "step": 14930 + }, + { + "epoch": 0.2663111333071737, + "grad_norm": 0.22918345034122467, + "learning_rate": 4.590365299674789e-05, + "loss": 0.1471, + "step": 14931 + }, + { + "epoch": 0.2663289694288874, + "grad_norm": 0.31197041273117065, + "learning_rate": 4.590279920129936e-05, + "loss": 0.2, + "step": 14932 + }, + { + "epoch": 0.2663468055506011, + "grad_norm": 0.25469622015953064, + "learning_rate": 4.590194532482436e-05, + "loss": 0.2023, + "step": 14933 + }, + { + "epoch": 0.2663646416723148, + "grad_norm": 0.2631215751171112, + "learning_rate": 4.59010913673262e-05, + "loss": 0.1821, + "step": 14934 + }, + { + "epoch": 0.26638247779402846, + "grad_norm": 0.2114010453224182, + "learning_rate": 4.5900237328808194e-05, + "loss": 0.1653, + "step": 14935 + }, + { + "epoch": 0.26640031391574215, + "grad_norm": 0.34483587741851807, + "learning_rate": 4.589938320927364e-05, + "loss": 0.2025, + "step": 14936 + }, + { + "epoch": 0.26641815003745584, + "grad_norm": 0.22635510563850403, + "learning_rate": 4.589852900872586e-05, + "loss": 0.1765, + "step": 14937 + }, + { + "epoch": 0.2664359861591695, + "grad_norm": 0.43880748748779297, + "learning_rate": 4.589767472716817e-05, + "loss": 0.1904, + "step": 14938 + }, + { + "epoch": 0.26645382228088327, + "grad_norm": 0.2123252898454666, + "learning_rate": 4.5896820364603874e-05, + "loss": 0.1758, + "step": 14939 + }, + { + "epoch": 0.26647165840259696, + "grad_norm": 0.35721907019615173, + "learning_rate": 4.5895965921036285e-05, + "loss": 0.2411, + "step": 14940 + }, + { + "epoch": 0.26648949452431064, + "grad_norm": 0.31588804721832275, + "learning_rate": 4.589511139646871e-05, + "loss": 0.1815, + "step": 14941 + }, + { + "epoch": 0.26650733064602433, + "grad_norm": 0.275342732667923, + "learning_rate": 4.589425679090446e-05, + "loss": 0.1408, + "step": 14942 + }, + { + "epoch": 0.266525166767738, + "grad_norm": 0.23242709040641785, + "learning_rate": 4.589340210434687e-05, + "loss": 0.133, + "step": 14943 + }, + { + "epoch": 0.2665430028894517, + "grad_norm": 0.268547385931015, + "learning_rate": 4.589254733679923e-05, + "loss": 0.1946, + "step": 14944 + }, + { + "epoch": 0.2665608390111654, + "grad_norm": 0.27724701166152954, + "learning_rate": 4.589169248826486e-05, + "loss": 0.1901, + "step": 14945 + }, + { + "epoch": 0.2665786751328791, + "grad_norm": 0.3501250445842743, + "learning_rate": 4.589083755874708e-05, + "loss": 0.2006, + "step": 14946 + }, + { + "epoch": 0.2665965112545928, + "grad_norm": 0.22979986667633057, + "learning_rate": 4.588998254824919e-05, + "loss": 0.1394, + "step": 14947 + }, + { + "epoch": 0.2666143473763065, + "grad_norm": 0.3628884255886078, + "learning_rate": 4.588912745677452e-05, + "loss": 0.255, + "step": 14948 + }, + { + "epoch": 0.2666321834980202, + "grad_norm": 0.22819317877292633, + "learning_rate": 4.588827228432637e-05, + "loss": 0.1608, + "step": 14949 + }, + { + "epoch": 0.2666500196197339, + "grad_norm": 0.29848426580429077, + "learning_rate": 4.5887417030908067e-05, + "loss": 0.201, + "step": 14950 + }, + { + "epoch": 0.2666678557414476, + "grad_norm": 0.25532805919647217, + "learning_rate": 4.588656169652292e-05, + "loss": 0.1674, + "step": 14951 + }, + { + "epoch": 0.26668569186316127, + "grad_norm": 0.2834542989730835, + "learning_rate": 4.5885706281174244e-05, + "loss": 0.1935, + "step": 14952 + }, + { + "epoch": 0.26670352798487496, + "grad_norm": 0.2601635158061981, + "learning_rate": 4.588485078486536e-05, + "loss": 0.1945, + "step": 14953 + }, + { + "epoch": 0.26672136410658864, + "grad_norm": 0.2619258463382721, + "learning_rate": 4.588399520759957e-05, + "loss": 0.1418, + "step": 14954 + }, + { + "epoch": 0.26673920022830233, + "grad_norm": 0.34515440464019775, + "learning_rate": 4.588313954938022e-05, + "loss": 0.186, + "step": 14955 + }, + { + "epoch": 0.2667570363500161, + "grad_norm": 0.22710682451725006, + "learning_rate": 4.588228381021059e-05, + "loss": 0.1874, + "step": 14956 + }, + { + "epoch": 0.26677487247172976, + "grad_norm": 0.20383551716804504, + "learning_rate": 4.5881427990094025e-05, + "loss": 0.1646, + "step": 14957 + }, + { + "epoch": 0.26679270859344345, + "grad_norm": 0.244561105966568, + "learning_rate": 4.5880572089033835e-05, + "loss": 0.1938, + "step": 14958 + }, + { + "epoch": 0.26681054471515714, + "grad_norm": 0.2626098394393921, + "learning_rate": 4.587971610703333e-05, + "loss": 0.177, + "step": 14959 + }, + { + "epoch": 0.2668283808368708, + "grad_norm": 0.29114869236946106, + "learning_rate": 4.587886004409584e-05, + "loss": 0.2041, + "step": 14960 + }, + { + "epoch": 0.2668462169585845, + "grad_norm": 0.2355378419160843, + "learning_rate": 4.587800390022467e-05, + "loss": 0.1844, + "step": 14961 + }, + { + "epoch": 0.2668640530802982, + "grad_norm": 0.24582120776176453, + "learning_rate": 4.587714767542315e-05, + "loss": 0.2125, + "step": 14962 + }, + { + "epoch": 0.2668818892020119, + "grad_norm": 0.2756332755088806, + "learning_rate": 4.587629136969459e-05, + "loss": 0.1706, + "step": 14963 + }, + { + "epoch": 0.26689972532372563, + "grad_norm": 0.2964188754558563, + "learning_rate": 4.587543498304232e-05, + "loss": 0.1788, + "step": 14964 + }, + { + "epoch": 0.2669175614454393, + "grad_norm": 0.3456663191318512, + "learning_rate": 4.5874578515469655e-05, + "loss": 0.2347, + "step": 14965 + }, + { + "epoch": 0.266935397567153, + "grad_norm": 0.24593006074428558, + "learning_rate": 4.587372196697991e-05, + "loss": 0.1323, + "step": 14966 + }, + { + "epoch": 0.2669532336888667, + "grad_norm": 0.2978859841823578, + "learning_rate": 4.587286533757641e-05, + "loss": 0.1704, + "step": 14967 + }, + { + "epoch": 0.2669710698105804, + "grad_norm": 0.38560938835144043, + "learning_rate": 4.5872008627262476e-05, + "loss": 0.1671, + "step": 14968 + }, + { + "epoch": 0.2669889059322941, + "grad_norm": 0.28182223439216614, + "learning_rate": 4.587115183604143e-05, + "loss": 0.1983, + "step": 14969 + }, + { + "epoch": 0.26700674205400776, + "grad_norm": 0.3404568135738373, + "learning_rate": 4.587029496391658e-05, + "loss": 0.1613, + "step": 14970 + }, + { + "epoch": 0.26702457817572145, + "grad_norm": 0.31590014696121216, + "learning_rate": 4.586943801089126e-05, + "loss": 0.1919, + "step": 14971 + }, + { + "epoch": 0.2670424142974352, + "grad_norm": 0.29793787002563477, + "learning_rate": 4.58685809769688e-05, + "loss": 0.1424, + "step": 14972 + }, + { + "epoch": 0.2670602504191489, + "grad_norm": 0.28106406331062317, + "learning_rate": 4.586772386215251e-05, + "loss": 0.1901, + "step": 14973 + }, + { + "epoch": 0.26707808654086257, + "grad_norm": 0.2836782932281494, + "learning_rate": 4.586686666644571e-05, + "loss": 0.231, + "step": 14974 + }, + { + "epoch": 0.26709592266257626, + "grad_norm": 0.2543480694293976, + "learning_rate": 4.586600938985174e-05, + "loss": 0.1756, + "step": 14975 + }, + { + "epoch": 0.26711375878428995, + "grad_norm": 0.3113311529159546, + "learning_rate": 4.5865152032373895e-05, + "loss": 0.1819, + "step": 14976 + }, + { + "epoch": 0.26713159490600363, + "grad_norm": 0.2961636483669281, + "learning_rate": 4.586429459401552e-05, + "loss": 0.1429, + "step": 14977 + }, + { + "epoch": 0.2671494310277173, + "grad_norm": 0.2527402639389038, + "learning_rate": 4.586343707477994e-05, + "loss": 0.143, + "step": 14978 + }, + { + "epoch": 0.267167267149431, + "grad_norm": 0.3453488051891327, + "learning_rate": 4.586257947467046e-05, + "loss": 0.1915, + "step": 14979 + }, + { + "epoch": 0.2671851032711447, + "grad_norm": 0.23477394878864288, + "learning_rate": 4.586172179369042e-05, + "loss": 0.1219, + "step": 14980 + }, + { + "epoch": 0.26720293939285844, + "grad_norm": 0.193343847990036, + "learning_rate": 4.586086403184314e-05, + "loss": 0.1531, + "step": 14981 + }, + { + "epoch": 0.26722077551457213, + "grad_norm": 0.28575000166893005, + "learning_rate": 4.5860006189131955e-05, + "loss": 0.206, + "step": 14982 + }, + { + "epoch": 0.2672386116362858, + "grad_norm": 0.3874417841434479, + "learning_rate": 4.585914826556017e-05, + "loss": 0.1523, + "step": 14983 + }, + { + "epoch": 0.2672564477579995, + "grad_norm": 0.22443947196006775, + "learning_rate": 4.5858290261131124e-05, + "loss": 0.1754, + "step": 14984 + }, + { + "epoch": 0.2672742838797132, + "grad_norm": 0.37375736236572266, + "learning_rate": 4.5857432175848146e-05, + "loss": 0.1619, + "step": 14985 + }, + { + "epoch": 0.2672921200014269, + "grad_norm": 0.21425506472587585, + "learning_rate": 4.585657400971455e-05, + "loss": 0.1754, + "step": 14986 + }, + { + "epoch": 0.26730995612314057, + "grad_norm": 0.2879641652107239, + "learning_rate": 4.585571576273368e-05, + "loss": 0.1696, + "step": 14987 + }, + { + "epoch": 0.26732779224485426, + "grad_norm": 0.23936870694160461, + "learning_rate": 4.5854857434908846e-05, + "loss": 0.184, + "step": 14988 + }, + { + "epoch": 0.267345628366568, + "grad_norm": 0.4232967793941498, + "learning_rate": 4.585399902624338e-05, + "loss": 0.2345, + "step": 14989 + }, + { + "epoch": 0.2673634644882817, + "grad_norm": 0.35915425419807434, + "learning_rate": 4.5853140536740614e-05, + "loss": 0.2141, + "step": 14990 + }, + { + "epoch": 0.2673813006099954, + "grad_norm": 0.3376728892326355, + "learning_rate": 4.585228196640387e-05, + "loss": 0.1973, + "step": 14991 + }, + { + "epoch": 0.26739913673170906, + "grad_norm": 0.26291733980178833, + "learning_rate": 4.585142331523647e-05, + "loss": 0.1653, + "step": 14992 + }, + { + "epoch": 0.26741697285342275, + "grad_norm": 0.2760443687438965, + "learning_rate": 4.585056458324177e-05, + "loss": 0.1761, + "step": 14993 + }, + { + "epoch": 0.26743480897513644, + "grad_norm": 0.2993004024028778, + "learning_rate": 4.584970577042307e-05, + "loss": 0.2215, + "step": 14994 + }, + { + "epoch": 0.26745264509685013, + "grad_norm": 0.23597221076488495, + "learning_rate": 4.584884687678371e-05, + "loss": 0.177, + "step": 14995 + }, + { + "epoch": 0.2674704812185638, + "grad_norm": 0.24855457246303558, + "learning_rate": 4.584798790232702e-05, + "loss": 0.218, + "step": 14996 + }, + { + "epoch": 0.2674883173402775, + "grad_norm": 0.26287418603897095, + "learning_rate": 4.584712884705633e-05, + "loss": 0.1404, + "step": 14997 + }, + { + "epoch": 0.26750615346199125, + "grad_norm": 0.24942946434020996, + "learning_rate": 4.5846269710974963e-05, + "loss": 0.1757, + "step": 14998 + }, + { + "epoch": 0.26752398958370494, + "grad_norm": 0.29295971989631653, + "learning_rate": 4.584541049408626e-05, + "loss": 0.2236, + "step": 14999 + }, + { + "epoch": 0.2675418257054186, + "grad_norm": 0.2524368166923523, + "learning_rate": 4.584455119639354e-05, + "loss": 0.1359, + "step": 15000 + }, + { + "epoch": 0.2675418257054186, + "eval_loss": 0.17883256077766418, + "eval_runtime": 107.1337, + "eval_samples_per_second": 9.558, + "eval_steps_per_second": 1.596, + "step": 15000 + }, + { + "epoch": 0.2675596618271323, + "grad_norm": 0.20551513135433197, + "learning_rate": 4.584369181790015e-05, + "loss": 0.1701, + "step": 15001 + }, + { + "epoch": 0.267577497948846, + "grad_norm": 0.33774229884147644, + "learning_rate": 4.58428323586094e-05, + "loss": 0.1623, + "step": 15002 + }, + { + "epoch": 0.2675953340705597, + "grad_norm": 0.24167729914188385, + "learning_rate": 4.584197281852464e-05, + "loss": 0.1653, + "step": 15003 + }, + { + "epoch": 0.2676131701922734, + "grad_norm": 0.29225096106529236, + "learning_rate": 4.584111319764919e-05, + "loss": 0.1321, + "step": 15004 + }, + { + "epoch": 0.26763100631398706, + "grad_norm": 0.2736901640892029, + "learning_rate": 4.584025349598639e-05, + "loss": 0.181, + "step": 15005 + }, + { + "epoch": 0.2676488424357008, + "grad_norm": 0.20935440063476562, + "learning_rate": 4.5839393713539567e-05, + "loss": 0.1692, + "step": 15006 + }, + { + "epoch": 0.2676666785574145, + "grad_norm": 0.2919246256351471, + "learning_rate": 4.583853385031206e-05, + "loss": 0.2066, + "step": 15007 + }, + { + "epoch": 0.2676845146791282, + "grad_norm": 0.2584631145000458, + "learning_rate": 4.58376739063072e-05, + "loss": 0.2106, + "step": 15008 + }, + { + "epoch": 0.26770235080084187, + "grad_norm": 0.3070624768733978, + "learning_rate": 4.5836813881528313e-05, + "loss": 0.165, + "step": 15009 + }, + { + "epoch": 0.26772018692255556, + "grad_norm": 0.28218236565589905, + "learning_rate": 4.583595377597874e-05, + "loss": 0.1462, + "step": 15010 + }, + { + "epoch": 0.26773802304426925, + "grad_norm": 0.2848483920097351, + "learning_rate": 4.5835093589661815e-05, + "loss": 0.1612, + "step": 15011 + }, + { + "epoch": 0.26775585916598293, + "grad_norm": 0.3022254705429077, + "learning_rate": 4.583423332258087e-05, + "loss": 0.1627, + "step": 15012 + }, + { + "epoch": 0.2677736952876966, + "grad_norm": 0.25852662324905396, + "learning_rate": 4.583337297473924e-05, + "loss": 0.1949, + "step": 15013 + }, + { + "epoch": 0.2677915314094103, + "grad_norm": 0.3122718334197998, + "learning_rate": 4.5832512546140266e-05, + "loss": 0.1799, + "step": 15014 + }, + { + "epoch": 0.26780936753112405, + "grad_norm": 0.3488292992115021, + "learning_rate": 4.583165203678728e-05, + "loss": 0.1884, + "step": 15015 + }, + { + "epoch": 0.26782720365283774, + "grad_norm": 0.25483936071395874, + "learning_rate": 4.58307914466836e-05, + "loss": 0.205, + "step": 15016 + }, + { + "epoch": 0.26784503977455143, + "grad_norm": 0.3567766845226288, + "learning_rate": 4.582993077583259e-05, + "loss": 0.2572, + "step": 15017 + }, + { + "epoch": 0.2678628758962651, + "grad_norm": 0.27144569158554077, + "learning_rate": 4.582907002423757e-05, + "loss": 0.2435, + "step": 15018 + }, + { + "epoch": 0.2678807120179788, + "grad_norm": 0.35764238238334656, + "learning_rate": 4.582820919190188e-05, + "loss": 0.1753, + "step": 15019 + }, + { + "epoch": 0.2678985481396925, + "grad_norm": 0.24760153889656067, + "learning_rate": 4.5827348278828866e-05, + "loss": 0.15, + "step": 15020 + }, + { + "epoch": 0.2679163842614062, + "grad_norm": 0.23590341210365295, + "learning_rate": 4.582648728502185e-05, + "loss": 0.1474, + "step": 15021 + }, + { + "epoch": 0.26793422038311987, + "grad_norm": 0.3167073428630829, + "learning_rate": 4.582562621048417e-05, + "loss": 0.1975, + "step": 15022 + }, + { + "epoch": 0.2679520565048336, + "grad_norm": 0.1918695569038391, + "learning_rate": 4.5824765055219175e-05, + "loss": 0.132, + "step": 15023 + }, + { + "epoch": 0.2679698926265473, + "grad_norm": 0.2530979812145233, + "learning_rate": 4.5823903819230204e-05, + "loss": 0.1491, + "step": 15024 + }, + { + "epoch": 0.267987728748261, + "grad_norm": 0.3486992120742798, + "learning_rate": 4.5823042502520585e-05, + "loss": 0.2427, + "step": 15025 + }, + { + "epoch": 0.2680055648699747, + "grad_norm": 0.4126201868057251, + "learning_rate": 4.582218110509366e-05, + "loss": 0.2455, + "step": 15026 + }, + { + "epoch": 0.26802340099168837, + "grad_norm": 0.22779715061187744, + "learning_rate": 4.582131962695277e-05, + "loss": 0.1879, + "step": 15027 + }, + { + "epoch": 0.26804123711340205, + "grad_norm": 0.27447912096977234, + "learning_rate": 4.582045806810125e-05, + "loss": 0.2182, + "step": 15028 + }, + { + "epoch": 0.26805907323511574, + "grad_norm": 0.27242714166641235, + "learning_rate": 4.581959642854245e-05, + "loss": 0.147, + "step": 15029 + }, + { + "epoch": 0.26807690935682943, + "grad_norm": 0.22823967039585114, + "learning_rate": 4.5818734708279696e-05, + "loss": 0.141, + "step": 15030 + }, + { + "epoch": 0.2680947454785432, + "grad_norm": 0.29453587532043457, + "learning_rate": 4.581787290731634e-05, + "loss": 0.1765, + "step": 15031 + }, + { + "epoch": 0.26811258160025686, + "grad_norm": 0.4526078999042511, + "learning_rate": 4.581701102565572e-05, + "loss": 0.1799, + "step": 15032 + }, + { + "epoch": 0.26813041772197055, + "grad_norm": 0.3336029350757599, + "learning_rate": 4.5816149063301175e-05, + "loss": 0.2432, + "step": 15033 + }, + { + "epoch": 0.26814825384368424, + "grad_norm": 0.45929092168807983, + "learning_rate": 4.581528702025604e-05, + "loss": 0.2501, + "step": 15034 + }, + { + "epoch": 0.2681660899653979, + "grad_norm": 0.2934766411781311, + "learning_rate": 4.581442489652367e-05, + "loss": 0.2089, + "step": 15035 + }, + { + "epoch": 0.2681839260871116, + "grad_norm": 0.1741962432861328, + "learning_rate": 4.58135626921074e-05, + "loss": 0.1378, + "step": 15036 + }, + { + "epoch": 0.2682017622088253, + "grad_norm": 0.3707410991191864, + "learning_rate": 4.581270040701057e-05, + "loss": 0.2242, + "step": 15037 + }, + { + "epoch": 0.268219598330539, + "grad_norm": 0.21595703065395355, + "learning_rate": 4.581183804123652e-05, + "loss": 0.1724, + "step": 15038 + }, + { + "epoch": 0.2682374344522527, + "grad_norm": 0.25424498319625854, + "learning_rate": 4.5810975594788606e-05, + "loss": 0.1751, + "step": 15039 + }, + { + "epoch": 0.2682552705739664, + "grad_norm": 0.2081408053636551, + "learning_rate": 4.581011306767016e-05, + "loss": 0.1529, + "step": 15040 + }, + { + "epoch": 0.2682731066956801, + "grad_norm": 0.2633403241634369, + "learning_rate": 4.580925045988453e-05, + "loss": 0.1471, + "step": 15041 + }, + { + "epoch": 0.2682909428173938, + "grad_norm": 0.2900846302509308, + "learning_rate": 4.580838777143506e-05, + "loss": 0.1743, + "step": 15042 + }, + { + "epoch": 0.2683087789391075, + "grad_norm": 0.3206022381782532, + "learning_rate": 4.580752500232508e-05, + "loss": 0.1675, + "step": 15043 + }, + { + "epoch": 0.26832661506082117, + "grad_norm": 0.26511016488075256, + "learning_rate": 4.580666215255796e-05, + "loss": 0.1284, + "step": 15044 + }, + { + "epoch": 0.26834445118253486, + "grad_norm": 0.27598950266838074, + "learning_rate": 4.5805799222137025e-05, + "loss": 0.2067, + "step": 15045 + }, + { + "epoch": 0.26836228730424855, + "grad_norm": 0.3592627942562103, + "learning_rate": 4.580493621106562e-05, + "loss": 0.2508, + "step": 15046 + }, + { + "epoch": 0.26838012342596224, + "grad_norm": 0.27034735679626465, + "learning_rate": 4.58040731193471e-05, + "loss": 0.2061, + "step": 15047 + }, + { + "epoch": 0.268397959547676, + "grad_norm": 0.26650625467300415, + "learning_rate": 4.5803209946984814e-05, + "loss": 0.1492, + "step": 15048 + }, + { + "epoch": 0.26841579566938967, + "grad_norm": 0.28974902629852295, + "learning_rate": 4.58023466939821e-05, + "loss": 0.1454, + "step": 15049 + }, + { + "epoch": 0.26843363179110336, + "grad_norm": 0.4166126549243927, + "learning_rate": 4.58014833603423e-05, + "loss": 0.2169, + "step": 15050 + }, + { + "epoch": 0.26845146791281704, + "grad_norm": 0.21722766757011414, + "learning_rate": 4.5800619946068766e-05, + "loss": 0.1607, + "step": 15051 + }, + { + "epoch": 0.26846930403453073, + "grad_norm": 0.2337527573108673, + "learning_rate": 4.579975645116484e-05, + "loss": 0.2084, + "step": 15052 + }, + { + "epoch": 0.2684871401562444, + "grad_norm": 0.3176339566707611, + "learning_rate": 4.579889287563389e-05, + "loss": 0.2024, + "step": 15053 + }, + { + "epoch": 0.2685049762779581, + "grad_norm": 0.30657845735549927, + "learning_rate": 4.579802921947924e-05, + "loss": 0.2619, + "step": 15054 + }, + { + "epoch": 0.2685228123996718, + "grad_norm": 0.24663479626178741, + "learning_rate": 4.579716548270424e-05, + "loss": 0.1725, + "step": 15055 + }, + { + "epoch": 0.2685406485213855, + "grad_norm": 0.24212126433849335, + "learning_rate": 4.5796301665312244e-05, + "loss": 0.1862, + "step": 15056 + }, + { + "epoch": 0.2685584846430992, + "grad_norm": 0.3466230034828186, + "learning_rate": 4.579543776730661e-05, + "loss": 0.2001, + "step": 15057 + }, + { + "epoch": 0.2685763207648129, + "grad_norm": 0.2574392557144165, + "learning_rate": 4.579457378869066e-05, + "loss": 0.1948, + "step": 15058 + }, + { + "epoch": 0.2685941568865266, + "grad_norm": 0.3334366977214813, + "learning_rate": 4.5793709729467776e-05, + "loss": 0.1918, + "step": 15059 + }, + { + "epoch": 0.2686119930082403, + "grad_norm": 0.3107346296310425, + "learning_rate": 4.579284558964129e-05, + "loss": 0.1651, + "step": 15060 + }, + { + "epoch": 0.268629829129954, + "grad_norm": 0.2981800138950348, + "learning_rate": 4.5791981369214546e-05, + "loss": 0.2316, + "step": 15061 + }, + { + "epoch": 0.26864766525166767, + "grad_norm": 0.351725697517395, + "learning_rate": 4.57911170681909e-05, + "loss": 0.1866, + "step": 15062 + }, + { + "epoch": 0.26866550137338135, + "grad_norm": 0.5415006279945374, + "learning_rate": 4.5790252686573705e-05, + "loss": 0.232, + "step": 15063 + }, + { + "epoch": 0.26868333749509504, + "grad_norm": 0.259854257106781, + "learning_rate": 4.5789388224366315e-05, + "loss": 0.1502, + "step": 15064 + }, + { + "epoch": 0.2687011736168088, + "grad_norm": 0.3185490369796753, + "learning_rate": 4.578852368157207e-05, + "loss": 0.2438, + "step": 15065 + }, + { + "epoch": 0.2687190097385225, + "grad_norm": 0.40953490138053894, + "learning_rate": 4.578765905819432e-05, + "loss": 0.2136, + "step": 15066 + }, + { + "epoch": 0.26873684586023616, + "grad_norm": 0.22106356918811798, + "learning_rate": 4.578679435423644e-05, + "loss": 0.2012, + "step": 15067 + }, + { + "epoch": 0.26875468198194985, + "grad_norm": 0.2520470917224884, + "learning_rate": 4.578592956970176e-05, + "loss": 0.1761, + "step": 15068 + }, + { + "epoch": 0.26877251810366354, + "grad_norm": 0.33525723218917847, + "learning_rate": 4.578506470459363e-05, + "loss": 0.2346, + "step": 15069 + }, + { + "epoch": 0.2687903542253772, + "grad_norm": 0.267304927110672, + "learning_rate": 4.578419975891542e-05, + "loss": 0.2033, + "step": 15070 + }, + { + "epoch": 0.2688081903470909, + "grad_norm": 0.2740553319454193, + "learning_rate": 4.578333473267047e-05, + "loss": 0.1558, + "step": 15071 + }, + { + "epoch": 0.2688260264688046, + "grad_norm": 0.3099888563156128, + "learning_rate": 4.578246962586213e-05, + "loss": 0.1418, + "step": 15072 + }, + { + "epoch": 0.26884386259051835, + "grad_norm": 0.3670021593570709, + "learning_rate": 4.5781604438493764e-05, + "loss": 0.1844, + "step": 15073 + }, + { + "epoch": 0.26886169871223203, + "grad_norm": 0.2935854494571686, + "learning_rate": 4.578073917056872e-05, + "loss": 0.1374, + "step": 15074 + }, + { + "epoch": 0.2688795348339457, + "grad_norm": 0.24012885987758636, + "learning_rate": 4.577987382209036e-05, + "loss": 0.2023, + "step": 15075 + }, + { + "epoch": 0.2688973709556594, + "grad_norm": 0.2169635146856308, + "learning_rate": 4.5779008393062026e-05, + "loss": 0.1584, + "step": 15076 + }, + { + "epoch": 0.2689152070773731, + "grad_norm": 0.31807398796081543, + "learning_rate": 4.577814288348708e-05, + "loss": 0.1896, + "step": 15077 + }, + { + "epoch": 0.2689330431990868, + "grad_norm": 0.2244410216808319, + "learning_rate": 4.577727729336888e-05, + "loss": 0.157, + "step": 15078 + }, + { + "epoch": 0.2689508793208005, + "grad_norm": 0.30356472730636597, + "learning_rate": 4.577641162271077e-05, + "loss": 0.1837, + "step": 15079 + }, + { + "epoch": 0.26896871544251416, + "grad_norm": 0.37213581800460815, + "learning_rate": 4.5775545871516115e-05, + "loss": 0.2653, + "step": 15080 + }, + { + "epoch": 0.26898655156422785, + "grad_norm": 0.4679724872112274, + "learning_rate": 4.577468003978827e-05, + "loss": 0.2525, + "step": 15081 + }, + { + "epoch": 0.2690043876859416, + "grad_norm": 0.2462843358516693, + "learning_rate": 4.577381412753059e-05, + "loss": 0.173, + "step": 15082 + }, + { + "epoch": 0.2690222238076553, + "grad_norm": 0.21170172095298767, + "learning_rate": 4.577294813474643e-05, + "loss": 0.1549, + "step": 15083 + }, + { + "epoch": 0.26904005992936897, + "grad_norm": 0.29097288846969604, + "learning_rate": 4.577208206143915e-05, + "loss": 0.2141, + "step": 15084 + }, + { + "epoch": 0.26905789605108266, + "grad_norm": 0.3203429579734802, + "learning_rate": 4.57712159076121e-05, + "loss": 0.2536, + "step": 15085 + }, + { + "epoch": 0.26907573217279634, + "grad_norm": 0.3312043249607086, + "learning_rate": 4.577034967326865e-05, + "loss": 0.2015, + "step": 15086 + }, + { + "epoch": 0.26909356829451003, + "grad_norm": 0.24537771940231323, + "learning_rate": 4.576948335841215e-05, + "loss": 0.1288, + "step": 15087 + }, + { + "epoch": 0.2691114044162237, + "grad_norm": 0.2414066642522812, + "learning_rate": 4.576861696304595e-05, + "loss": 0.1836, + "step": 15088 + }, + { + "epoch": 0.2691292405379374, + "grad_norm": 0.2669928967952728, + "learning_rate": 4.576775048717343e-05, + "loss": 0.1354, + "step": 15089 + }, + { + "epoch": 0.26914707665965115, + "grad_norm": 0.2919745445251465, + "learning_rate": 4.576688393079793e-05, + "loss": 0.231, + "step": 15090 + }, + { + "epoch": 0.26916491278136484, + "grad_norm": 0.6345043182373047, + "learning_rate": 4.576601729392281e-05, + "loss": 0.2241, + "step": 15091 + }, + { + "epoch": 0.26918274890307853, + "grad_norm": 0.30328649282455444, + "learning_rate": 4.5765150576551444e-05, + "loss": 0.1834, + "step": 15092 + }, + { + "epoch": 0.2692005850247922, + "grad_norm": 0.2644284665584564, + "learning_rate": 4.576428377868718e-05, + "loss": 0.1738, + "step": 15093 + }, + { + "epoch": 0.2692184211465059, + "grad_norm": 0.3032969534397125, + "learning_rate": 4.5763416900333376e-05, + "loss": 0.201, + "step": 15094 + }, + { + "epoch": 0.2692362572682196, + "grad_norm": 0.2551535964012146, + "learning_rate": 4.5762549941493406e-05, + "loss": 0.1375, + "step": 15095 + }, + { + "epoch": 0.2692540933899333, + "grad_norm": 0.27850770950317383, + "learning_rate": 4.576168290217061e-05, + "loss": 0.2196, + "step": 15096 + }, + { + "epoch": 0.26927192951164697, + "grad_norm": 0.22138451039791107, + "learning_rate": 4.576081578236836e-05, + "loss": 0.1201, + "step": 15097 + }, + { + "epoch": 0.26928976563336066, + "grad_norm": 0.3289961516857147, + "learning_rate": 4.575994858209003e-05, + "loss": 0.1409, + "step": 15098 + }, + { + "epoch": 0.2693076017550744, + "grad_norm": 0.26809120178222656, + "learning_rate": 4.575908130133896e-05, + "loss": 0.2042, + "step": 15099 + }, + { + "epoch": 0.2693254378767881, + "grad_norm": 0.3855358064174652, + "learning_rate": 4.575821394011852e-05, + "loss": 0.2078, + "step": 15100 + }, + { + "epoch": 0.2693432739985018, + "grad_norm": 0.20090390741825104, + "learning_rate": 4.5757346498432075e-05, + "loss": 0.1927, + "step": 15101 + }, + { + "epoch": 0.26936111012021546, + "grad_norm": 0.32857489585876465, + "learning_rate": 4.5756478976282993e-05, + "loss": 0.1751, + "step": 15102 + }, + { + "epoch": 0.26937894624192915, + "grad_norm": 0.26462826132774353, + "learning_rate": 4.575561137367462e-05, + "loss": 0.1746, + "step": 15103 + }, + { + "epoch": 0.26939678236364284, + "grad_norm": 0.3513377010822296, + "learning_rate": 4.5754743690610324e-05, + "loss": 0.2266, + "step": 15104 + }, + { + "epoch": 0.2694146184853565, + "grad_norm": 0.2739737927913666, + "learning_rate": 4.575387592709348e-05, + "loss": 0.1854, + "step": 15105 + }, + { + "epoch": 0.2694324546070702, + "grad_norm": 0.2547748386859894, + "learning_rate": 4.575300808312744e-05, + "loss": 0.1389, + "step": 15106 + }, + { + "epoch": 0.26945029072878396, + "grad_norm": 0.2273988425731659, + "learning_rate": 4.5752140158715585e-05, + "loss": 0.1672, + "step": 15107 + }, + { + "epoch": 0.26946812685049765, + "grad_norm": 0.24950018525123596, + "learning_rate": 4.5751272153861265e-05, + "loss": 0.1691, + "step": 15108 + }, + { + "epoch": 0.26948596297221133, + "grad_norm": 0.273456871509552, + "learning_rate": 4.5750404068567845e-05, + "loss": 0.1877, + "step": 15109 + }, + { + "epoch": 0.269503799093925, + "grad_norm": 0.3427591919898987, + "learning_rate": 4.574953590283868e-05, + "loss": 0.2205, + "step": 15110 + }, + { + "epoch": 0.2695216352156387, + "grad_norm": 0.2696007788181305, + "learning_rate": 4.574866765667716e-05, + "loss": 0.1362, + "step": 15111 + }, + { + "epoch": 0.2695394713373524, + "grad_norm": 0.2822125554084778, + "learning_rate": 4.574779933008663e-05, + "loss": 0.1755, + "step": 15112 + }, + { + "epoch": 0.2695573074590661, + "grad_norm": 0.29417094588279724, + "learning_rate": 4.5746930923070474e-05, + "loss": 0.174, + "step": 15113 + }, + { + "epoch": 0.2695751435807798, + "grad_norm": 0.32152101397514343, + "learning_rate": 4.5746062435632035e-05, + "loss": 0.161, + "step": 15114 + }, + { + "epoch": 0.26959297970249346, + "grad_norm": 0.42608800530433655, + "learning_rate": 4.57451938677747e-05, + "loss": 0.1339, + "step": 15115 + }, + { + "epoch": 0.2696108158242072, + "grad_norm": 0.4274521768093109, + "learning_rate": 4.574432521950183e-05, + "loss": 0.2179, + "step": 15116 + }, + { + "epoch": 0.2696286519459209, + "grad_norm": 0.20234228670597076, + "learning_rate": 4.5743456490816785e-05, + "loss": 0.1807, + "step": 15117 + }, + { + "epoch": 0.2696464880676346, + "grad_norm": 0.18298442661762238, + "learning_rate": 4.5742587681722944e-05, + "loss": 0.1327, + "step": 15118 + }, + { + "epoch": 0.26966432418934827, + "grad_norm": 0.2521079480648041, + "learning_rate": 4.5741718792223667e-05, + "loss": 0.1772, + "step": 15119 + }, + { + "epoch": 0.26968216031106196, + "grad_norm": 0.2876557409763336, + "learning_rate": 4.574084982232232e-05, + "loss": 0.189, + "step": 15120 + }, + { + "epoch": 0.26969999643277565, + "grad_norm": 0.26367124915122986, + "learning_rate": 4.5739980772022275e-05, + "loss": 0.1821, + "step": 15121 + }, + { + "epoch": 0.26971783255448933, + "grad_norm": 0.2989930510520935, + "learning_rate": 4.573911164132691e-05, + "loss": 0.2458, + "step": 15122 + }, + { + "epoch": 0.269735668676203, + "grad_norm": 0.4386133849620819, + "learning_rate": 4.5738242430239574e-05, + "loss": 0.2609, + "step": 15123 + }, + { + "epoch": 0.26975350479791677, + "grad_norm": 0.2555815875530243, + "learning_rate": 4.5737373138763654e-05, + "loss": 0.1937, + "step": 15124 + }, + { + "epoch": 0.26977134091963045, + "grad_norm": 0.25332406163215637, + "learning_rate": 4.573650376690252e-05, + "loss": 0.1808, + "step": 15125 + }, + { + "epoch": 0.26978917704134414, + "grad_norm": 0.27824124693870544, + "learning_rate": 4.573563431465953e-05, + "loss": 0.1418, + "step": 15126 + }, + { + "epoch": 0.26980701316305783, + "grad_norm": 0.3003767132759094, + "learning_rate": 4.573476478203805e-05, + "loss": 0.2325, + "step": 15127 + }, + { + "epoch": 0.2698248492847715, + "grad_norm": 0.21168194711208344, + "learning_rate": 4.573389516904147e-05, + "loss": 0.1574, + "step": 15128 + }, + { + "epoch": 0.2698426854064852, + "grad_norm": 0.28841474652290344, + "learning_rate": 4.573302547567315e-05, + "loss": 0.1638, + "step": 15129 + }, + { + "epoch": 0.2698605215281989, + "grad_norm": 0.24812036752700806, + "learning_rate": 4.573215570193646e-05, + "loss": 0.1493, + "step": 15130 + }, + { + "epoch": 0.2698783576499126, + "grad_norm": 0.3030625581741333, + "learning_rate": 4.573128584783477e-05, + "loss": 0.1714, + "step": 15131 + }, + { + "epoch": 0.2698961937716263, + "grad_norm": 0.23205551505088806, + "learning_rate": 4.573041591337146e-05, + "loss": 0.1574, + "step": 15132 + }, + { + "epoch": 0.26991402989334, + "grad_norm": 0.201072558760643, + "learning_rate": 4.5729545898549904e-05, + "loss": 0.1601, + "step": 15133 + }, + { + "epoch": 0.2699318660150537, + "grad_norm": 0.49314627051353455, + "learning_rate": 4.5728675803373454e-05, + "loss": 0.1971, + "step": 15134 + }, + { + "epoch": 0.2699497021367674, + "grad_norm": 0.5175096392631531, + "learning_rate": 4.572780562784551e-05, + "loss": 0.1851, + "step": 15135 + }, + { + "epoch": 0.2699675382584811, + "grad_norm": 0.23637612164020538, + "learning_rate": 4.572693537196942e-05, + "loss": 0.1976, + "step": 15136 + }, + { + "epoch": 0.26998537438019476, + "grad_norm": 0.2552511394023895, + "learning_rate": 4.572606503574859e-05, + "loss": 0.211, + "step": 15137 + }, + { + "epoch": 0.27000321050190845, + "grad_norm": 0.2515599727630615, + "learning_rate": 4.5725194619186354e-05, + "loss": 0.1816, + "step": 15138 + }, + { + "epoch": 0.27002104662362214, + "grad_norm": 0.4017189145088196, + "learning_rate": 4.572432412228612e-05, + "loss": 0.1752, + "step": 15139 + }, + { + "epoch": 0.27003888274533583, + "grad_norm": 0.24660325050354004, + "learning_rate": 4.5723453545051236e-05, + "loss": 0.1855, + "step": 15140 + }, + { + "epoch": 0.27005671886704957, + "grad_norm": 0.25051072239875793, + "learning_rate": 4.5722582887485085e-05, + "loss": 0.2097, + "step": 15141 + }, + { + "epoch": 0.27007455498876326, + "grad_norm": 0.3942745625972748, + "learning_rate": 4.572171214959106e-05, + "loss": 0.1987, + "step": 15142 + }, + { + "epoch": 0.27009239111047695, + "grad_norm": 0.3423575758934021, + "learning_rate": 4.572084133137251e-05, + "loss": 0.1915, + "step": 15143 + }, + { + "epoch": 0.27011022723219064, + "grad_norm": 0.20359322428703308, + "learning_rate": 4.5719970432832834e-05, + "loss": 0.1607, + "step": 15144 + }, + { + "epoch": 0.2701280633539043, + "grad_norm": 0.5687680840492249, + "learning_rate": 4.571909945397539e-05, + "loss": 0.2197, + "step": 15145 + }, + { + "epoch": 0.270145899475618, + "grad_norm": 0.27103379368782043, + "learning_rate": 4.571822839480355e-05, + "loss": 0.2201, + "step": 15146 + }, + { + "epoch": 0.2701637355973317, + "grad_norm": 0.2533745765686035, + "learning_rate": 4.5717357255320714e-05, + "loss": 0.2099, + "step": 15147 + }, + { + "epoch": 0.2701815717190454, + "grad_norm": 0.3978731632232666, + "learning_rate": 4.5716486035530244e-05, + "loss": 0.2324, + "step": 15148 + }, + { + "epoch": 0.27019940784075913, + "grad_norm": 0.2688053548336029, + "learning_rate": 4.571561473543552e-05, + "loss": 0.1882, + "step": 15149 + }, + { + "epoch": 0.2702172439624728, + "grad_norm": 0.24664700031280518, + "learning_rate": 4.571474335503991e-05, + "loss": 0.1757, + "step": 15150 + }, + { + "epoch": 0.2702350800841865, + "grad_norm": 0.2718179225921631, + "learning_rate": 4.57138718943468e-05, + "loss": 0.1958, + "step": 15151 + }, + { + "epoch": 0.2702529162059002, + "grad_norm": 0.245918408036232, + "learning_rate": 4.571300035335958e-05, + "loss": 0.1562, + "step": 15152 + }, + { + "epoch": 0.2702707523276139, + "grad_norm": 0.23125310242176056, + "learning_rate": 4.5712128732081614e-05, + "loss": 0.1108, + "step": 15153 + }, + { + "epoch": 0.27028858844932757, + "grad_norm": 0.3100898563861847, + "learning_rate": 4.571125703051627e-05, + "loss": 0.1576, + "step": 15154 + }, + { + "epoch": 0.27030642457104126, + "grad_norm": 0.2776975631713867, + "learning_rate": 4.571038524866695e-05, + "loss": 0.1511, + "step": 15155 + }, + { + "epoch": 0.27032426069275495, + "grad_norm": 0.25399282574653625, + "learning_rate": 4.570951338653703e-05, + "loss": 0.2046, + "step": 15156 + }, + { + "epoch": 0.27034209681446864, + "grad_norm": 0.3336469233036041, + "learning_rate": 4.570864144412987e-05, + "loss": 0.1764, + "step": 15157 + }, + { + "epoch": 0.2703599329361824, + "grad_norm": 0.3277715742588043, + "learning_rate": 4.570776942144888e-05, + "loss": 0.1828, + "step": 15158 + }, + { + "epoch": 0.27037776905789607, + "grad_norm": 0.3869439959526062, + "learning_rate": 4.570689731849741e-05, + "loss": 0.1889, + "step": 15159 + }, + { + "epoch": 0.27039560517960975, + "grad_norm": 0.2036662995815277, + "learning_rate": 4.570602513527886e-05, + "loss": 0.1538, + "step": 15160 + }, + { + "epoch": 0.27041344130132344, + "grad_norm": 0.27623727917671204, + "learning_rate": 4.57051528717966e-05, + "loss": 0.1605, + "step": 15161 + }, + { + "epoch": 0.27043127742303713, + "grad_norm": 0.24336528778076172, + "learning_rate": 4.5704280528054015e-05, + "loss": 0.1821, + "step": 15162 + }, + { + "epoch": 0.2704491135447508, + "grad_norm": 0.23564158380031586, + "learning_rate": 4.570340810405449e-05, + "loss": 0.1986, + "step": 15163 + }, + { + "epoch": 0.2704669496664645, + "grad_norm": 0.23940153419971466, + "learning_rate": 4.570253559980141e-05, + "loss": 0.1807, + "step": 15164 + }, + { + "epoch": 0.2704847857881782, + "grad_norm": 0.253069669008255, + "learning_rate": 4.570166301529815e-05, + "loss": 0.1841, + "step": 15165 + }, + { + "epoch": 0.27050262190989194, + "grad_norm": 0.3073471784591675, + "learning_rate": 4.570079035054808e-05, + "loss": 0.1857, + "step": 15166 + }, + { + "epoch": 0.2705204580316056, + "grad_norm": 0.3081522285938263, + "learning_rate": 4.5699917605554614e-05, + "loss": 0.2263, + "step": 15167 + }, + { + "epoch": 0.2705382941533193, + "grad_norm": 0.29746657609939575, + "learning_rate": 4.569904478032111e-05, + "loss": 0.1909, + "step": 15168 + }, + { + "epoch": 0.270556130275033, + "grad_norm": 0.3431158661842346, + "learning_rate": 4.569817187485096e-05, + "loss": 0.1552, + "step": 15169 + }, + { + "epoch": 0.2705739663967467, + "grad_norm": 0.37991803884506226, + "learning_rate": 4.5697298889147556e-05, + "loss": 0.2265, + "step": 15170 + }, + { + "epoch": 0.2705918025184604, + "grad_norm": 0.2834690809249878, + "learning_rate": 4.569642582321426e-05, + "loss": 0.1491, + "step": 15171 + }, + { + "epoch": 0.27060963864017407, + "grad_norm": 0.2840344309806824, + "learning_rate": 4.5695552677054474e-05, + "loss": 0.1902, + "step": 15172 + }, + { + "epoch": 0.27062747476188775, + "grad_norm": 0.39299482107162476, + "learning_rate": 4.569467945067158e-05, + "loss": 0.2496, + "step": 15173 + }, + { + "epoch": 0.27064531088360144, + "grad_norm": 0.24964050948619843, + "learning_rate": 4.569380614406896e-05, + "loss": 0.1858, + "step": 15174 + }, + { + "epoch": 0.2706631470053152, + "grad_norm": 0.2006838023662567, + "learning_rate": 4.5692932757249994e-05, + "loss": 0.1675, + "step": 15175 + }, + { + "epoch": 0.2706809831270289, + "grad_norm": 0.33417633175849915, + "learning_rate": 4.569205929021808e-05, + "loss": 0.1562, + "step": 15176 + }, + { + "epoch": 0.27069881924874256, + "grad_norm": 0.46648111939430237, + "learning_rate": 4.5691185742976596e-05, + "loss": 0.1778, + "step": 15177 + }, + { + "epoch": 0.27071665537045625, + "grad_norm": 0.26930585503578186, + "learning_rate": 4.5690312115528936e-05, + "loss": 0.2047, + "step": 15178 + }, + { + "epoch": 0.27073449149216994, + "grad_norm": 0.24441000819206238, + "learning_rate": 4.5689438407878464e-05, + "loss": 0.1911, + "step": 15179 + }, + { + "epoch": 0.2707523276138836, + "grad_norm": 0.23304849863052368, + "learning_rate": 4.5688564620028595e-05, + "loss": 0.2294, + "step": 15180 + }, + { + "epoch": 0.2707701637355973, + "grad_norm": 0.24206621944904327, + "learning_rate": 4.56876907519827e-05, + "loss": 0.1727, + "step": 15181 + }, + { + "epoch": 0.270787999857311, + "grad_norm": 0.23368129134178162, + "learning_rate": 4.568681680374417e-05, + "loss": 0.1525, + "step": 15182 + }, + { + "epoch": 0.27080583597902474, + "grad_norm": 0.16269347071647644, + "learning_rate": 4.568594277531639e-05, + "loss": 0.1354, + "step": 15183 + }, + { + "epoch": 0.27082367210073843, + "grad_norm": 0.27379167079925537, + "learning_rate": 4.5685068666702756e-05, + "loss": 0.2198, + "step": 15184 + }, + { + "epoch": 0.2708415082224521, + "grad_norm": 0.5590804815292358, + "learning_rate": 4.568419447790666e-05, + "loss": 0.2431, + "step": 15185 + }, + { + "epoch": 0.2708593443441658, + "grad_norm": 0.33675411343574524, + "learning_rate": 4.568332020893147e-05, + "loss": 0.167, + "step": 15186 + }, + { + "epoch": 0.2708771804658795, + "grad_norm": 0.28820088505744934, + "learning_rate": 4.568244585978059e-05, + "loss": 0.2604, + "step": 15187 + }, + { + "epoch": 0.2708950165875932, + "grad_norm": 0.29879653453826904, + "learning_rate": 4.56815714304574e-05, + "loss": 0.2001, + "step": 15188 + }, + { + "epoch": 0.2709128527093069, + "grad_norm": 0.27476903796195984, + "learning_rate": 4.5680696920965304e-05, + "loss": 0.2205, + "step": 15189 + }, + { + "epoch": 0.27093068883102056, + "grad_norm": 0.32600149512290955, + "learning_rate": 4.5679822331307684e-05, + "loss": 0.1543, + "step": 15190 + }, + { + "epoch": 0.2709485249527343, + "grad_norm": 0.38172951340675354, + "learning_rate": 4.567894766148792e-05, + "loss": 0.2909, + "step": 15191 + }, + { + "epoch": 0.270966361074448, + "grad_norm": 0.24965259432792664, + "learning_rate": 4.567807291150943e-05, + "loss": 0.1667, + "step": 15192 + }, + { + "epoch": 0.2709841971961617, + "grad_norm": 0.36808347702026367, + "learning_rate": 4.567719808137558e-05, + "loss": 0.1784, + "step": 15193 + }, + { + "epoch": 0.27100203331787537, + "grad_norm": 0.2753356993198395, + "learning_rate": 4.5676323171089764e-05, + "loss": 0.1845, + "step": 15194 + }, + { + "epoch": 0.27101986943958906, + "grad_norm": 0.2599482536315918, + "learning_rate": 4.567544818065538e-05, + "loss": 0.1796, + "step": 15195 + }, + { + "epoch": 0.27103770556130274, + "grad_norm": 0.2779335677623749, + "learning_rate": 4.567457311007582e-05, + "loss": 0.2152, + "step": 15196 + }, + { + "epoch": 0.27105554168301643, + "grad_norm": 0.27003762125968933, + "learning_rate": 4.5673697959354464e-05, + "loss": 0.1888, + "step": 15197 + }, + { + "epoch": 0.2710733778047301, + "grad_norm": 0.16075967252254486, + "learning_rate": 4.567282272849473e-05, + "loss": 0.149, + "step": 15198 + }, + { + "epoch": 0.2710912139264438, + "grad_norm": 0.28088581562042236, + "learning_rate": 4.5671947417499986e-05, + "loss": 0.1578, + "step": 15199 + }, + { + "epoch": 0.27110905004815755, + "grad_norm": 0.4064401090145111, + "learning_rate": 4.567107202637364e-05, + "loss": 0.1791, + "step": 15200 + }, + { + "epoch": 0.27112688616987124, + "grad_norm": 0.3230174481868744, + "learning_rate": 4.567019655511907e-05, + "loss": 0.2062, + "step": 15201 + }, + { + "epoch": 0.2711447222915849, + "grad_norm": 0.33291521668434143, + "learning_rate": 4.566932100373968e-05, + "loss": 0.1666, + "step": 15202 + }, + { + "epoch": 0.2711625584132986, + "grad_norm": 0.2896880805492401, + "learning_rate": 4.5668445372238876e-05, + "loss": 0.2153, + "step": 15203 + }, + { + "epoch": 0.2711803945350123, + "grad_norm": 0.28267180919647217, + "learning_rate": 4.566756966062002e-05, + "loss": 0.1751, + "step": 15204 + }, + { + "epoch": 0.271198230656726, + "grad_norm": 0.32702046632766724, + "learning_rate": 4.566669386888655e-05, + "loss": 0.1712, + "step": 15205 + }, + { + "epoch": 0.2712160667784397, + "grad_norm": 0.27298182249069214, + "learning_rate": 4.566581799704182e-05, + "loss": 0.1673, + "step": 15206 + }, + { + "epoch": 0.27123390290015337, + "grad_norm": 0.3961528539657593, + "learning_rate": 4.566494204508923e-05, + "loss": 0.2041, + "step": 15207 + }, + { + "epoch": 0.2712517390218671, + "grad_norm": 0.26520565152168274, + "learning_rate": 4.56640660130322e-05, + "loss": 0.1816, + "step": 15208 + }, + { + "epoch": 0.2712695751435808, + "grad_norm": 0.24210646748542786, + "learning_rate": 4.566318990087412e-05, + "loss": 0.1788, + "step": 15209 + }, + { + "epoch": 0.2712874112652945, + "grad_norm": 0.2602328360080719, + "learning_rate": 4.566231370861838e-05, + "loss": 0.1949, + "step": 15210 + }, + { + "epoch": 0.2713052473870082, + "grad_norm": 0.3372642397880554, + "learning_rate": 4.566143743626836e-05, + "loss": 0.1502, + "step": 15211 + }, + { + "epoch": 0.27132308350872186, + "grad_norm": 0.36982017755508423, + "learning_rate": 4.566056108382748e-05, + "loss": 0.1561, + "step": 15212 + }, + { + "epoch": 0.27134091963043555, + "grad_norm": 0.22503159940242767, + "learning_rate": 4.565968465129913e-05, + "loss": 0.1762, + "step": 15213 + }, + { + "epoch": 0.27135875575214924, + "grad_norm": 0.3456066846847534, + "learning_rate": 4.565880813868671e-05, + "loss": 0.2427, + "step": 15214 + }, + { + "epoch": 0.2713765918738629, + "grad_norm": 0.2989911437034607, + "learning_rate": 4.565793154599361e-05, + "loss": 0.183, + "step": 15215 + }, + { + "epoch": 0.2713944279955766, + "grad_norm": 0.3059474527835846, + "learning_rate": 4.5657054873223234e-05, + "loss": 0.1904, + "step": 15216 + }, + { + "epoch": 0.27141226411729036, + "grad_norm": 0.24165715277194977, + "learning_rate": 4.565617812037898e-05, + "loss": 0.1909, + "step": 15217 + }, + { + "epoch": 0.27143010023900405, + "grad_norm": 0.2177954465150833, + "learning_rate": 4.565530128746424e-05, + "loss": 0.2142, + "step": 15218 + }, + { + "epoch": 0.27144793636071773, + "grad_norm": 0.2765772342681885, + "learning_rate": 4.565442437448242e-05, + "loss": 0.1707, + "step": 15219 + }, + { + "epoch": 0.2714657724824314, + "grad_norm": 0.31171122193336487, + "learning_rate": 4.565354738143692e-05, + "loss": 0.1785, + "step": 15220 + }, + { + "epoch": 0.2714836086041451, + "grad_norm": 0.22197510302066803, + "learning_rate": 4.5652670308331135e-05, + "loss": 0.1732, + "step": 15221 + }, + { + "epoch": 0.2715014447258588, + "grad_norm": 0.2603869140148163, + "learning_rate": 4.5651793155168463e-05, + "loss": 0.1393, + "step": 15222 + }, + { + "epoch": 0.2715192808475725, + "grad_norm": 0.3855699896812439, + "learning_rate": 4.5650915921952315e-05, + "loss": 0.2192, + "step": 15223 + }, + { + "epoch": 0.2715371169692862, + "grad_norm": 0.2805880308151245, + "learning_rate": 4.5650038608686084e-05, + "loss": 0.1452, + "step": 15224 + }, + { + "epoch": 0.2715549530909999, + "grad_norm": 0.23132579028606415, + "learning_rate": 4.564916121537317e-05, + "loss": 0.216, + "step": 15225 + }, + { + "epoch": 0.2715727892127136, + "grad_norm": 0.2955576479434967, + "learning_rate": 4.564828374201697e-05, + "loss": 0.1325, + "step": 15226 + }, + { + "epoch": 0.2715906253344273, + "grad_norm": 0.38246214389801025, + "learning_rate": 4.564740618862089e-05, + "loss": 0.2069, + "step": 15227 + }, + { + "epoch": 0.271608461456141, + "grad_norm": 0.26435205340385437, + "learning_rate": 4.564652855518834e-05, + "loss": 0.1954, + "step": 15228 + }, + { + "epoch": 0.27162629757785467, + "grad_norm": 0.2857755124568939, + "learning_rate": 4.564565084172271e-05, + "loss": 0.1077, + "step": 15229 + }, + { + "epoch": 0.27164413369956836, + "grad_norm": 0.2121935486793518, + "learning_rate": 4.5644773048227406e-05, + "loss": 0.1745, + "step": 15230 + }, + { + "epoch": 0.27166196982128205, + "grad_norm": 0.2694613039493561, + "learning_rate": 4.564389517470583e-05, + "loss": 0.1895, + "step": 15231 + }, + { + "epoch": 0.27167980594299573, + "grad_norm": 0.32408225536346436, + "learning_rate": 4.564301722116139e-05, + "loss": 0.187, + "step": 15232 + }, + { + "epoch": 0.2716976420647095, + "grad_norm": 0.2772509753704071, + "learning_rate": 4.5642139187597484e-05, + "loss": 0.1702, + "step": 15233 + }, + { + "epoch": 0.27171547818642316, + "grad_norm": 0.2935529351234436, + "learning_rate": 4.564126107401751e-05, + "loss": 0.1639, + "step": 15234 + }, + { + "epoch": 0.27173331430813685, + "grad_norm": 0.33130311965942383, + "learning_rate": 4.564038288042489e-05, + "loss": 0.1792, + "step": 15235 + }, + { + "epoch": 0.27175115042985054, + "grad_norm": 0.30109965801239014, + "learning_rate": 4.5639504606823016e-05, + "loss": 0.1646, + "step": 15236 + }, + { + "epoch": 0.27176898655156423, + "grad_norm": 0.3628794848918915, + "learning_rate": 4.563862625321529e-05, + "loss": 0.2068, + "step": 15237 + }, + { + "epoch": 0.2717868226732779, + "grad_norm": 0.21711215376853943, + "learning_rate": 4.563774781960511e-05, + "loss": 0.1474, + "step": 15238 + }, + { + "epoch": 0.2718046587949916, + "grad_norm": 0.3367788791656494, + "learning_rate": 4.563686930599591e-05, + "loss": 0.1977, + "step": 15239 + }, + { + "epoch": 0.2718224949167053, + "grad_norm": 0.374269962310791, + "learning_rate": 4.5635990712391064e-05, + "loss": 0.1788, + "step": 15240 + }, + { + "epoch": 0.271840331038419, + "grad_norm": 0.33344024419784546, + "learning_rate": 4.5635112038794e-05, + "loss": 0.2051, + "step": 15241 + }, + { + "epoch": 0.2718581671601327, + "grad_norm": 0.27690979838371277, + "learning_rate": 4.5634233285208104e-05, + "loss": 0.1796, + "step": 15242 + }, + { + "epoch": 0.2718760032818464, + "grad_norm": 0.37408357858657837, + "learning_rate": 4.56333544516368e-05, + "loss": 0.1933, + "step": 15243 + }, + { + "epoch": 0.2718938394035601, + "grad_norm": 0.27190762758255005, + "learning_rate": 4.5632475538083486e-05, + "loss": 0.2238, + "step": 15244 + }, + { + "epoch": 0.2719116755252738, + "grad_norm": 0.28789904713630676, + "learning_rate": 4.563159654455157e-05, + "loss": 0.1419, + "step": 15245 + }, + { + "epoch": 0.2719295116469875, + "grad_norm": 0.2511211931705475, + "learning_rate": 4.563071747104446e-05, + "loss": 0.2109, + "step": 15246 + }, + { + "epoch": 0.27194734776870116, + "grad_norm": 0.3084287941455841, + "learning_rate": 4.5629838317565566e-05, + "loss": 0.2023, + "step": 15247 + }, + { + "epoch": 0.27196518389041485, + "grad_norm": 0.41966989636421204, + "learning_rate": 4.5628959084118294e-05, + "loss": 0.1518, + "step": 15248 + }, + { + "epoch": 0.27198302001212854, + "grad_norm": 0.2750261127948761, + "learning_rate": 4.562807977070604e-05, + "loss": 0.1979, + "step": 15249 + }, + { + "epoch": 0.2720008561338423, + "grad_norm": 0.3373103141784668, + "learning_rate": 4.5627200377332235e-05, + "loss": 0.1892, + "step": 15250 + }, + { + "epoch": 0.27201869225555597, + "grad_norm": 0.3220573663711548, + "learning_rate": 4.562632090400028e-05, + "loss": 0.1906, + "step": 15251 + }, + { + "epoch": 0.27203652837726966, + "grad_norm": 0.25796929001808167, + "learning_rate": 4.562544135071357e-05, + "loss": 0.197, + "step": 15252 + }, + { + "epoch": 0.27205436449898335, + "grad_norm": 0.30647778511047363, + "learning_rate": 4.5624561717475535e-05, + "loss": 0.184, + "step": 15253 + }, + { + "epoch": 0.27207220062069704, + "grad_norm": 0.2855292856693268, + "learning_rate": 4.562368200428957e-05, + "loss": 0.1901, + "step": 15254 + }, + { + "epoch": 0.2720900367424107, + "grad_norm": 0.295500248670578, + "learning_rate": 4.56228022111591e-05, + "loss": 0.165, + "step": 15255 + }, + { + "epoch": 0.2721078728641244, + "grad_norm": 0.3099755346775055, + "learning_rate": 4.5621922338087513e-05, + "loss": 0.1815, + "step": 15256 + }, + { + "epoch": 0.2721257089858381, + "grad_norm": 0.28054386377334595, + "learning_rate": 4.562104238507824e-05, + "loss": 0.1618, + "step": 15257 + }, + { + "epoch": 0.2721435451075518, + "grad_norm": 0.32812485098838806, + "learning_rate": 4.562016235213468e-05, + "loss": 0.1871, + "step": 15258 + }, + { + "epoch": 0.27216138122926553, + "grad_norm": 0.35932666063308716, + "learning_rate": 4.561928223926025e-05, + "loss": 0.158, + "step": 15259 + }, + { + "epoch": 0.2721792173509792, + "grad_norm": 0.3113170862197876, + "learning_rate": 4.561840204645836e-05, + "loss": 0.1981, + "step": 15260 + }, + { + "epoch": 0.2721970534726929, + "grad_norm": 0.2167622447013855, + "learning_rate": 4.5617521773732416e-05, + "loss": 0.1643, + "step": 15261 + }, + { + "epoch": 0.2722148895944066, + "grad_norm": 0.22766174376010895, + "learning_rate": 4.561664142108585e-05, + "loss": 0.1486, + "step": 15262 + }, + { + "epoch": 0.2722327257161203, + "grad_norm": 0.2789682149887085, + "learning_rate": 4.561576098852206e-05, + "loss": 0.2049, + "step": 15263 + }, + { + "epoch": 0.27225056183783397, + "grad_norm": 0.2753037214279175, + "learning_rate": 4.561488047604445e-05, + "loss": 0.1539, + "step": 15264 + }, + { + "epoch": 0.27226839795954766, + "grad_norm": 0.3413105607032776, + "learning_rate": 4.561399988365645e-05, + "loss": 0.1999, + "step": 15265 + }, + { + "epoch": 0.27228623408126135, + "grad_norm": 0.31180474162101746, + "learning_rate": 4.5613119211361464e-05, + "loss": 0.1791, + "step": 15266 + }, + { + "epoch": 0.2723040702029751, + "grad_norm": 0.2991024851799011, + "learning_rate": 4.56122384591629e-05, + "loss": 0.1634, + "step": 15267 + }, + { + "epoch": 0.2723219063246888, + "grad_norm": 0.2772076427936554, + "learning_rate": 4.56113576270642e-05, + "loss": 0.1863, + "step": 15268 + }, + { + "epoch": 0.27233974244640247, + "grad_norm": 0.3939161002635956, + "learning_rate": 4.5610476715068745e-05, + "loss": 0.229, + "step": 15269 + }, + { + "epoch": 0.27235757856811615, + "grad_norm": 0.2530250549316406, + "learning_rate": 4.560959572317996e-05, + "loss": 0.1737, + "step": 15270 + }, + { + "epoch": 0.27237541468982984, + "grad_norm": 0.29270896315574646, + "learning_rate": 4.560871465140128e-05, + "loss": 0.1937, + "step": 15271 + }, + { + "epoch": 0.27239325081154353, + "grad_norm": 0.25215375423431396, + "learning_rate": 4.5607833499736094e-05, + "loss": 0.1648, + "step": 15272 + }, + { + "epoch": 0.2724110869332572, + "grad_norm": 0.2080620378255844, + "learning_rate": 4.5606952268187823e-05, + "loss": 0.1888, + "step": 15273 + }, + { + "epoch": 0.2724289230549709, + "grad_norm": 0.26196444034576416, + "learning_rate": 4.5606070956759894e-05, + "loss": 0.1699, + "step": 15274 + }, + { + "epoch": 0.2724467591766846, + "grad_norm": 0.214683398604393, + "learning_rate": 4.560518956545572e-05, + "loss": 0.1599, + "step": 15275 + }, + { + "epoch": 0.27246459529839834, + "grad_norm": 0.2834594249725342, + "learning_rate": 4.5604308094278706e-05, + "loss": 0.1734, + "step": 15276 + }, + { + "epoch": 0.272482431420112, + "grad_norm": 0.3563289940357208, + "learning_rate": 4.5603426543232284e-05, + "loss": 0.1581, + "step": 15277 + }, + { + "epoch": 0.2725002675418257, + "grad_norm": 0.23578090965747833, + "learning_rate": 4.5602544912319865e-05, + "loss": 0.1598, + "step": 15278 + }, + { + "epoch": 0.2725181036635394, + "grad_norm": 0.24709348380565643, + "learning_rate": 4.560166320154486e-05, + "loss": 0.1961, + "step": 15279 + }, + { + "epoch": 0.2725359397852531, + "grad_norm": 0.240645632147789, + "learning_rate": 4.56007814109107e-05, + "loss": 0.1769, + "step": 15280 + }, + { + "epoch": 0.2725537759069668, + "grad_norm": 0.37232837080955505, + "learning_rate": 4.559989954042079e-05, + "loss": 0.1247, + "step": 15281 + }, + { + "epoch": 0.27257161202868047, + "grad_norm": 0.3669794499874115, + "learning_rate": 4.559901759007855e-05, + "loss": 0.2142, + "step": 15282 + }, + { + "epoch": 0.27258944815039415, + "grad_norm": 0.2820383310317993, + "learning_rate": 4.559813555988741e-05, + "loss": 0.1648, + "step": 15283 + }, + { + "epoch": 0.2726072842721079, + "grad_norm": 0.2757430970668793, + "learning_rate": 4.5597253449850785e-05, + "loss": 0.2069, + "step": 15284 + }, + { + "epoch": 0.2726251203938216, + "grad_norm": 0.2687505781650543, + "learning_rate": 4.559637125997209e-05, + "loss": 0.1682, + "step": 15285 + }, + { + "epoch": 0.2726429565155353, + "grad_norm": 0.2668740451335907, + "learning_rate": 4.559548899025474e-05, + "loss": 0.1528, + "step": 15286 + }, + { + "epoch": 0.27266079263724896, + "grad_norm": 0.22336526215076447, + "learning_rate": 4.559460664070217e-05, + "loss": 0.1671, + "step": 15287 + }, + { + "epoch": 0.27267862875896265, + "grad_norm": 0.29438310861587524, + "learning_rate": 4.5593724211317775e-05, + "loss": 0.1762, + "step": 15288 + }, + { + "epoch": 0.27269646488067634, + "grad_norm": 0.3443536162376404, + "learning_rate": 4.5592841702105014e-05, + "loss": 0.1756, + "step": 15289 + }, + { + "epoch": 0.27271430100239, + "grad_norm": 0.3066597580909729, + "learning_rate": 4.559195911306727e-05, + "loss": 0.2097, + "step": 15290 + }, + { + "epoch": 0.2727321371241037, + "grad_norm": 0.3857879638671875, + "learning_rate": 4.559107644420799e-05, + "loss": 0.2785, + "step": 15291 + }, + { + "epoch": 0.27274997324581746, + "grad_norm": 0.3418494760990143, + "learning_rate": 4.559019369553058e-05, + "loss": 0.1861, + "step": 15292 + }, + { + "epoch": 0.27276780936753114, + "grad_norm": 0.27305635809898376, + "learning_rate": 4.558931086703847e-05, + "loss": 0.2264, + "step": 15293 + }, + { + "epoch": 0.27278564548924483, + "grad_norm": 0.2765255272388458, + "learning_rate": 4.558842795873508e-05, + "loss": 0.177, + "step": 15294 + }, + { + "epoch": 0.2728034816109585, + "grad_norm": 0.32875490188598633, + "learning_rate": 4.5587544970623833e-05, + "loss": 0.1948, + "step": 15295 + }, + { + "epoch": 0.2728213177326722, + "grad_norm": 0.29781773686408997, + "learning_rate": 4.558666190270815e-05, + "loss": 0.1923, + "step": 15296 + }, + { + "epoch": 0.2728391538543859, + "grad_norm": 0.2744108736515045, + "learning_rate": 4.558577875499146e-05, + "loss": 0.1761, + "step": 15297 + }, + { + "epoch": 0.2728569899760996, + "grad_norm": 0.24439026415348053, + "learning_rate": 4.5584895527477175e-05, + "loss": 0.1542, + "step": 15298 + }, + { + "epoch": 0.2728748260978133, + "grad_norm": 0.28602665662765503, + "learning_rate": 4.558401222016873e-05, + "loss": 0.1762, + "step": 15299 + }, + { + "epoch": 0.27289266221952696, + "grad_norm": 0.21261803805828094, + "learning_rate": 4.558312883306953e-05, + "loss": 0.1741, + "step": 15300 + }, + { + "epoch": 0.2729104983412407, + "grad_norm": 0.329238623380661, + "learning_rate": 4.558224536618303e-05, + "loss": 0.1951, + "step": 15301 + }, + { + "epoch": 0.2729283344629544, + "grad_norm": 0.20495079457759857, + "learning_rate": 4.558136181951263e-05, + "loss": 0.147, + "step": 15302 + }, + { + "epoch": 0.2729461705846681, + "grad_norm": 0.25885844230651855, + "learning_rate": 4.558047819306177e-05, + "loss": 0.1913, + "step": 15303 + }, + { + "epoch": 0.27296400670638177, + "grad_norm": 0.22849741578102112, + "learning_rate": 4.557959448683386e-05, + "loss": 0.0968, + "step": 15304 + }, + { + "epoch": 0.27298184282809546, + "grad_norm": 0.3090474605560303, + "learning_rate": 4.557871070083234e-05, + "loss": 0.1379, + "step": 15305 + }, + { + "epoch": 0.27299967894980914, + "grad_norm": 0.3825876712799072, + "learning_rate": 4.5577826835060625e-05, + "loss": 0.2259, + "step": 15306 + }, + { + "epoch": 0.27301751507152283, + "grad_norm": 0.218611940741539, + "learning_rate": 4.557694288952215e-05, + "loss": 0.1691, + "step": 15307 + }, + { + "epoch": 0.2730353511932365, + "grad_norm": 0.24547012150287628, + "learning_rate": 4.557605886422033e-05, + "loss": 0.1994, + "step": 15308 + }, + { + "epoch": 0.27305318731495026, + "grad_norm": 0.3120046555995941, + "learning_rate": 4.5575174759158604e-05, + "loss": 0.2648, + "step": 15309 + }, + { + "epoch": 0.27307102343666395, + "grad_norm": 0.2578418254852295, + "learning_rate": 4.5574290574340395e-05, + "loss": 0.2256, + "step": 15310 + }, + { + "epoch": 0.27308885955837764, + "grad_norm": 0.2318110316991806, + "learning_rate": 4.557340630976913e-05, + "loss": 0.1761, + "step": 15311 + }, + { + "epoch": 0.2731066956800913, + "grad_norm": 0.2423015534877777, + "learning_rate": 4.5572521965448237e-05, + "loss": 0.1689, + "step": 15312 + }, + { + "epoch": 0.273124531801805, + "grad_norm": 0.1946888267993927, + "learning_rate": 4.557163754138114e-05, + "loss": 0.1528, + "step": 15313 + }, + { + "epoch": 0.2731423679235187, + "grad_norm": 0.29533758759498596, + "learning_rate": 4.557075303757127e-05, + "loss": 0.2106, + "step": 15314 + }, + { + "epoch": 0.2731602040452324, + "grad_norm": 0.26054847240448, + "learning_rate": 4.556986845402206e-05, + "loss": 0.174, + "step": 15315 + }, + { + "epoch": 0.2731780401669461, + "grad_norm": 0.27883899211883545, + "learning_rate": 4.556898379073693e-05, + "loss": 0.2354, + "step": 15316 + }, + { + "epoch": 0.27319587628865977, + "grad_norm": 0.43866193294525146, + "learning_rate": 4.556809904771933e-05, + "loss": 0.2317, + "step": 15317 + }, + { + "epoch": 0.2732137124103735, + "grad_norm": 0.28204163908958435, + "learning_rate": 4.5567214224972653e-05, + "loss": 0.2131, + "step": 15318 + }, + { + "epoch": 0.2732315485320872, + "grad_norm": 0.3200967311859131, + "learning_rate": 4.5566329322500365e-05, + "loss": 0.1531, + "step": 15319 + }, + { + "epoch": 0.2732493846538009, + "grad_norm": 0.2608736753463745, + "learning_rate": 4.556544434030587e-05, + "loss": 0.1412, + "step": 15320 + }, + { + "epoch": 0.2732672207755146, + "grad_norm": 0.30826514959335327, + "learning_rate": 4.556455927839261e-05, + "loss": 0.2068, + "step": 15321 + }, + { + "epoch": 0.27328505689722826, + "grad_norm": 0.47373446822166443, + "learning_rate": 4.556367413676402e-05, + "loss": 0.1911, + "step": 15322 + }, + { + "epoch": 0.27330289301894195, + "grad_norm": 0.5966895222663879, + "learning_rate": 4.556278891542354e-05, + "loss": 0.216, + "step": 15323 + }, + { + "epoch": 0.27332072914065564, + "grad_norm": 0.26217979192733765, + "learning_rate": 4.556190361437457e-05, + "loss": 0.1843, + "step": 15324 + }, + { + "epoch": 0.2733385652623693, + "grad_norm": 0.3170042634010315, + "learning_rate": 4.5561018233620566e-05, + "loss": 0.1974, + "step": 15325 + }, + { + "epoch": 0.27335640138408307, + "grad_norm": 0.4135138988494873, + "learning_rate": 4.556013277316495e-05, + "loss": 0.1779, + "step": 15326 + }, + { + "epoch": 0.27337423750579676, + "grad_norm": 0.17939463257789612, + "learning_rate": 4.555924723301116e-05, + "loss": 0.1834, + "step": 15327 + }, + { + "epoch": 0.27339207362751045, + "grad_norm": 0.2103712409734726, + "learning_rate": 4.555836161316263e-05, + "loss": 0.2001, + "step": 15328 + }, + { + "epoch": 0.27340990974922413, + "grad_norm": 0.30819979310035706, + "learning_rate": 4.5557475913622785e-05, + "loss": 0.146, + "step": 15329 + }, + { + "epoch": 0.2734277458709378, + "grad_norm": 0.3447093069553375, + "learning_rate": 4.5556590134395075e-05, + "loss": 0.1514, + "step": 15330 + }, + { + "epoch": 0.2734455819926515, + "grad_norm": 0.23838426172733307, + "learning_rate": 4.555570427548291e-05, + "loss": 0.187, + "step": 15331 + }, + { + "epoch": 0.2734634181143652, + "grad_norm": 0.25891539454460144, + "learning_rate": 4.555481833688973e-05, + "loss": 0.1719, + "step": 15332 + }, + { + "epoch": 0.2734812542360789, + "grad_norm": 0.3054523766040802, + "learning_rate": 4.5553932318618984e-05, + "loss": 0.1706, + "step": 15333 + }, + { + "epoch": 0.27349909035779263, + "grad_norm": 0.36198678612709045, + "learning_rate": 4.55530462206741e-05, + "loss": 0.2099, + "step": 15334 + }, + { + "epoch": 0.2735169264795063, + "grad_norm": 0.2814268469810486, + "learning_rate": 4.55521600430585e-05, + "loss": 0.1516, + "step": 15335 + }, + { + "epoch": 0.27353476260122, + "grad_norm": 0.3233660161495209, + "learning_rate": 4.555127378577564e-05, + "loss": 0.1371, + "step": 15336 + }, + { + "epoch": 0.2735525987229337, + "grad_norm": 0.35173270106315613, + "learning_rate": 4.5550387448828936e-05, + "loss": 0.2267, + "step": 15337 + }, + { + "epoch": 0.2735704348446474, + "grad_norm": 0.23610329627990723, + "learning_rate": 4.5549501032221836e-05, + "loss": 0.197, + "step": 15338 + }, + { + "epoch": 0.27358827096636107, + "grad_norm": 0.26902562379837036, + "learning_rate": 4.554861453595777e-05, + "loss": 0.1797, + "step": 15339 + }, + { + "epoch": 0.27360610708807476, + "grad_norm": 0.28548333048820496, + "learning_rate": 4.5547727960040185e-05, + "loss": 0.1887, + "step": 15340 + }, + { + "epoch": 0.27362394320978844, + "grad_norm": 0.2753860652446747, + "learning_rate": 4.55468413044725e-05, + "loss": 0.2439, + "step": 15341 + }, + { + "epoch": 0.27364177933150213, + "grad_norm": 0.25360167026519775, + "learning_rate": 4.5545954569258163e-05, + "loss": 0.1943, + "step": 15342 + }, + { + "epoch": 0.2736596154532159, + "grad_norm": 0.26458024978637695, + "learning_rate": 4.554506775440062e-05, + "loss": 0.1502, + "step": 15343 + }, + { + "epoch": 0.27367745157492956, + "grad_norm": 0.19540803134441376, + "learning_rate": 4.554418085990328e-05, + "loss": 0.1574, + "step": 15344 + }, + { + "epoch": 0.27369528769664325, + "grad_norm": 0.428216814994812, + "learning_rate": 4.554329388576961e-05, + "loss": 0.1838, + "step": 15345 + }, + { + "epoch": 0.27371312381835694, + "grad_norm": 0.24133911728858948, + "learning_rate": 4.5542406832003035e-05, + "loss": 0.2111, + "step": 15346 + }, + { + "epoch": 0.27373095994007063, + "grad_norm": 0.37168097496032715, + "learning_rate": 4.554151969860701e-05, + "loss": 0.2317, + "step": 15347 + }, + { + "epoch": 0.2737487960617843, + "grad_norm": 0.248093381524086, + "learning_rate": 4.5540632485584944e-05, + "loss": 0.1809, + "step": 15348 + }, + { + "epoch": 0.273766632183498, + "grad_norm": 0.303943395614624, + "learning_rate": 4.55397451929403e-05, + "loss": 0.1953, + "step": 15349 + }, + { + "epoch": 0.2737844683052117, + "grad_norm": 0.30130651593208313, + "learning_rate": 4.5538857820676495e-05, + "loss": 0.178, + "step": 15350 + }, + { + "epoch": 0.27380230442692544, + "grad_norm": 0.24781788885593414, + "learning_rate": 4.5537970368796995e-05, + "loss": 0.1645, + "step": 15351 + }, + { + "epoch": 0.2738201405486391, + "grad_norm": 0.30300620198249817, + "learning_rate": 4.553708283730523e-05, + "loss": 0.205, + "step": 15352 + }, + { + "epoch": 0.2738379766703528, + "grad_norm": 0.2765817642211914, + "learning_rate": 4.5536195226204634e-05, + "loss": 0.1982, + "step": 15353 + }, + { + "epoch": 0.2738558127920665, + "grad_norm": 0.23178695142269135, + "learning_rate": 4.553530753549865e-05, + "loss": 0.1983, + "step": 15354 + }, + { + "epoch": 0.2738736489137802, + "grad_norm": 0.34527337551116943, + "learning_rate": 4.553441976519073e-05, + "loss": 0.1876, + "step": 15355 + }, + { + "epoch": 0.2738914850354939, + "grad_norm": 0.5574958920478821, + "learning_rate": 4.55335319152843e-05, + "loss": 0.252, + "step": 15356 + }, + { + "epoch": 0.27390932115720756, + "grad_norm": 0.27578604221343994, + "learning_rate": 4.553264398578281e-05, + "loss": 0.2463, + "step": 15357 + }, + { + "epoch": 0.27392715727892125, + "grad_norm": 0.2662498652935028, + "learning_rate": 4.55317559766897e-05, + "loss": 0.2237, + "step": 15358 + }, + { + "epoch": 0.27394499340063494, + "grad_norm": 0.25444018840789795, + "learning_rate": 4.553086788800841e-05, + "loss": 0.1478, + "step": 15359 + }, + { + "epoch": 0.2739628295223487, + "grad_norm": 0.31608253717422485, + "learning_rate": 4.552997971974239e-05, + "loss": 0.1769, + "step": 15360 + }, + { + "epoch": 0.27398066564406237, + "grad_norm": 0.20223009586334229, + "learning_rate": 4.552909147189507e-05, + "loss": 0.1619, + "step": 15361 + }, + { + "epoch": 0.27399850176577606, + "grad_norm": 0.353495717048645, + "learning_rate": 4.55282031444699e-05, + "loss": 0.1505, + "step": 15362 + }, + { + "epoch": 0.27401633788748975, + "grad_norm": 0.35909226536750793, + "learning_rate": 4.552731473747034e-05, + "loss": 0.2422, + "step": 15363 + }, + { + "epoch": 0.27403417400920344, + "grad_norm": 0.2929408848285675, + "learning_rate": 4.552642625089981e-05, + "loss": 0.1617, + "step": 15364 + }, + { + "epoch": 0.2740520101309171, + "grad_norm": 0.27406468987464905, + "learning_rate": 4.5525537684761754e-05, + "loss": 0.2184, + "step": 15365 + }, + { + "epoch": 0.2740698462526308, + "grad_norm": 0.27739614248275757, + "learning_rate": 4.552464903905964e-05, + "loss": 0.2242, + "step": 15366 + }, + { + "epoch": 0.2740876823743445, + "grad_norm": 0.28953036665916443, + "learning_rate": 4.5523760313796884e-05, + "loss": 0.174, + "step": 15367 + }, + { + "epoch": 0.27410551849605824, + "grad_norm": 0.23054704070091248, + "learning_rate": 4.552287150897695e-05, + "loss": 0.1622, + "step": 15368 + }, + { + "epoch": 0.27412335461777193, + "grad_norm": 0.2942816913127899, + "learning_rate": 4.5521982624603274e-05, + "loss": 0.2083, + "step": 15369 + }, + { + "epoch": 0.2741411907394856, + "grad_norm": 0.28185173869132996, + "learning_rate": 4.552109366067931e-05, + "loss": 0.2161, + "step": 15370 + }, + { + "epoch": 0.2741590268611993, + "grad_norm": 0.29522964358329773, + "learning_rate": 4.5520204617208496e-05, + "loss": 0.1486, + "step": 15371 + }, + { + "epoch": 0.274176862982913, + "grad_norm": 0.22718365490436554, + "learning_rate": 4.551931549419428e-05, + "loss": 0.1784, + "step": 15372 + }, + { + "epoch": 0.2741946991046267, + "grad_norm": 0.32544374465942383, + "learning_rate": 4.551842629164012e-05, + "loss": 0.1805, + "step": 15373 + }, + { + "epoch": 0.27421253522634037, + "grad_norm": 0.1977655440568924, + "learning_rate": 4.5517537009549436e-05, + "loss": 0.1325, + "step": 15374 + }, + { + "epoch": 0.27423037134805406, + "grad_norm": 0.3883865773677826, + "learning_rate": 4.551664764792571e-05, + "loss": 0.2036, + "step": 15375 + }, + { + "epoch": 0.27424820746976775, + "grad_norm": 0.23773835599422455, + "learning_rate": 4.5515758206772364e-05, + "loss": 0.1717, + "step": 15376 + }, + { + "epoch": 0.2742660435914815, + "grad_norm": 0.2205808162689209, + "learning_rate": 4.551486868609285e-05, + "loss": 0.1158, + "step": 15377 + }, + { + "epoch": 0.2742838797131952, + "grad_norm": 0.2984471619129181, + "learning_rate": 4.5513979085890626e-05, + "loss": 0.1635, + "step": 15378 + }, + { + "epoch": 0.27430171583490887, + "grad_norm": 0.270412415266037, + "learning_rate": 4.551308940616912e-05, + "loss": 0.195, + "step": 15379 + }, + { + "epoch": 0.27431955195662255, + "grad_norm": 0.24950411915779114, + "learning_rate": 4.5512199646931807e-05, + "loss": 0.1818, + "step": 15380 + }, + { + "epoch": 0.27433738807833624, + "grad_norm": 0.25902676582336426, + "learning_rate": 4.5511309808182125e-05, + "loss": 0.1581, + "step": 15381 + }, + { + "epoch": 0.27435522420004993, + "grad_norm": 0.24803592264652252, + "learning_rate": 4.551041988992352e-05, + "loss": 0.1702, + "step": 15382 + }, + { + "epoch": 0.2743730603217636, + "grad_norm": 0.2569483518600464, + "learning_rate": 4.5509529892159435e-05, + "loss": 0.1739, + "step": 15383 + }, + { + "epoch": 0.2743908964434773, + "grad_norm": 0.2723129987716675, + "learning_rate": 4.550863981489333e-05, + "loss": 0.236, + "step": 15384 + }, + { + "epoch": 0.27440873256519105, + "grad_norm": 0.21242138743400574, + "learning_rate": 4.550774965812866e-05, + "loss": 0.1555, + "step": 15385 + }, + { + "epoch": 0.27442656868690474, + "grad_norm": 0.3287215232849121, + "learning_rate": 4.550685942186887e-05, + "loss": 0.2197, + "step": 15386 + }, + { + "epoch": 0.2744444048086184, + "grad_norm": 0.23314982652664185, + "learning_rate": 4.550596910611741e-05, + "loss": 0.1672, + "step": 15387 + }, + { + "epoch": 0.2744622409303321, + "grad_norm": 0.35281679034233093, + "learning_rate": 4.5505078710877726e-05, + "loss": 0.2133, + "step": 15388 + }, + { + "epoch": 0.2744800770520458, + "grad_norm": 0.31634366512298584, + "learning_rate": 4.550418823615327e-05, + "loss": 0.1904, + "step": 15389 + }, + { + "epoch": 0.2744979131737595, + "grad_norm": 0.22860662639141083, + "learning_rate": 4.550329768194751e-05, + "loss": 0.162, + "step": 15390 + }, + { + "epoch": 0.2745157492954732, + "grad_norm": 0.25230541825294495, + "learning_rate": 4.5502407048263875e-05, + "loss": 0.1578, + "step": 15391 + }, + { + "epoch": 0.27453358541718687, + "grad_norm": 0.30084991455078125, + "learning_rate": 4.550151633510584e-05, + "loss": 0.1828, + "step": 15392 + }, + { + "epoch": 0.2745514215389006, + "grad_norm": 0.2516980469226837, + "learning_rate": 4.550062554247684e-05, + "loss": 0.1682, + "step": 15393 + }, + { + "epoch": 0.2745692576606143, + "grad_norm": 0.39343249797821045, + "learning_rate": 4.549973467038034e-05, + "loss": 0.1658, + "step": 15394 + }, + { + "epoch": 0.274587093782328, + "grad_norm": 0.23363885283470154, + "learning_rate": 4.549884371881978e-05, + "loss": 0.1896, + "step": 15395 + }, + { + "epoch": 0.2746049299040417, + "grad_norm": 0.24483560025691986, + "learning_rate": 4.549795268779863e-05, + "loss": 0.1577, + "step": 15396 + }, + { + "epoch": 0.27462276602575536, + "grad_norm": 0.2151115983724594, + "learning_rate": 4.549706157732033e-05, + "loss": 0.1572, + "step": 15397 + }, + { + "epoch": 0.27464060214746905, + "grad_norm": 0.3536268174648285, + "learning_rate": 4.549617038738835e-05, + "loss": 0.1829, + "step": 15398 + }, + { + "epoch": 0.27465843826918274, + "grad_norm": 0.2710700035095215, + "learning_rate": 4.5495279118006123e-05, + "loss": 0.1582, + "step": 15399 + }, + { + "epoch": 0.2746762743908964, + "grad_norm": 0.25989460945129395, + "learning_rate": 4.549438776917712e-05, + "loss": 0.1704, + "step": 15400 + }, + { + "epoch": 0.2746941105126101, + "grad_norm": 0.43564146757125854, + "learning_rate": 4.54934963409048e-05, + "loss": 0.2206, + "step": 15401 + }, + { + "epoch": 0.27471194663432386, + "grad_norm": 0.26961979269981384, + "learning_rate": 4.549260483319259e-05, + "loss": 0.1404, + "step": 15402 + }, + { + "epoch": 0.27472978275603754, + "grad_norm": 0.2905063033103943, + "learning_rate": 4.5491713246043975e-05, + "loss": 0.2108, + "step": 15403 + }, + { + "epoch": 0.27474761887775123, + "grad_norm": 0.25492873787879944, + "learning_rate": 4.549082157946241e-05, + "loss": 0.1747, + "step": 15404 + }, + { + "epoch": 0.2747654549994649, + "grad_norm": 0.35250574350357056, + "learning_rate": 4.548992983345133e-05, + "loss": 0.1797, + "step": 15405 + }, + { + "epoch": 0.2747832911211786, + "grad_norm": 0.35146665573120117, + "learning_rate": 4.5489038008014214e-05, + "loss": 0.1605, + "step": 15406 + }, + { + "epoch": 0.2748011272428923, + "grad_norm": 0.2627125680446625, + "learning_rate": 4.5488146103154506e-05, + "loss": 0.1681, + "step": 15407 + }, + { + "epoch": 0.274818963364606, + "grad_norm": 0.2415877878665924, + "learning_rate": 4.548725411887567e-05, + "loss": 0.1982, + "step": 15408 + }, + { + "epoch": 0.27483679948631967, + "grad_norm": 0.2854273319244385, + "learning_rate": 4.5486362055181155e-05, + "loss": 0.1669, + "step": 15409 + }, + { + "epoch": 0.2748546356080334, + "grad_norm": 0.29682010412216187, + "learning_rate": 4.548546991207444e-05, + "loss": 0.1937, + "step": 15410 + }, + { + "epoch": 0.2748724717297471, + "grad_norm": 0.3292175829410553, + "learning_rate": 4.548457768955895e-05, + "loss": 0.2037, + "step": 15411 + }, + { + "epoch": 0.2748903078514608, + "grad_norm": 0.3024621605873108, + "learning_rate": 4.548368538763817e-05, + "loss": 0.1943, + "step": 15412 + }, + { + "epoch": 0.2749081439731745, + "grad_norm": 0.3101816475391388, + "learning_rate": 4.548279300631555e-05, + "loss": 0.1604, + "step": 15413 + }, + { + "epoch": 0.27492598009488817, + "grad_norm": 0.30465930700302124, + "learning_rate": 4.548190054559455e-05, + "loss": 0.2189, + "step": 15414 + }, + { + "epoch": 0.27494381621660186, + "grad_norm": 0.26891347765922546, + "learning_rate": 4.548100800547863e-05, + "loss": 0.2279, + "step": 15415 + }, + { + "epoch": 0.27496165233831554, + "grad_norm": 0.2779199779033661, + "learning_rate": 4.548011538597124e-05, + "loss": 0.1995, + "step": 15416 + }, + { + "epoch": 0.27497948846002923, + "grad_norm": 0.45538026094436646, + "learning_rate": 4.547922268707586e-05, + "loss": 0.1889, + "step": 15417 + }, + { + "epoch": 0.2749973245817429, + "grad_norm": 0.6173567771911621, + "learning_rate": 4.547832990879594e-05, + "loss": 0.1916, + "step": 15418 + }, + { + "epoch": 0.27501516070345666, + "grad_norm": 0.23143932223320007, + "learning_rate": 4.547743705113494e-05, + "loss": 0.1951, + "step": 15419 + }, + { + "epoch": 0.27503299682517035, + "grad_norm": 0.24382197856903076, + "learning_rate": 4.547654411409632e-05, + "loss": 0.1634, + "step": 15420 + }, + { + "epoch": 0.27505083294688404, + "grad_norm": 0.28351467847824097, + "learning_rate": 4.5475651097683534e-05, + "loss": 0.1678, + "step": 15421 + }, + { + "epoch": 0.2750686690685977, + "grad_norm": 0.2747480571269989, + "learning_rate": 4.547475800190006e-05, + "loss": 0.1621, + "step": 15422 + }, + { + "epoch": 0.2750865051903114, + "grad_norm": 0.29847392439842224, + "learning_rate": 4.5473864826749354e-05, + "loss": 0.221, + "step": 15423 + }, + { + "epoch": 0.2751043413120251, + "grad_norm": 0.22456280887126923, + "learning_rate": 4.547297157223488e-05, + "loss": 0.2083, + "step": 15424 + }, + { + "epoch": 0.2751221774337388, + "grad_norm": 0.2684950828552246, + "learning_rate": 4.547207823836009e-05, + "loss": 0.2121, + "step": 15425 + }, + { + "epoch": 0.2751400135554525, + "grad_norm": 0.31478646397590637, + "learning_rate": 4.5471184825128454e-05, + "loss": 0.1617, + "step": 15426 + }, + { + "epoch": 0.2751578496771662, + "grad_norm": 0.24621573090553284, + "learning_rate": 4.5470291332543434e-05, + "loss": 0.1471, + "step": 15427 + }, + { + "epoch": 0.2751756857988799, + "grad_norm": 0.2672198712825775, + "learning_rate": 4.54693977606085e-05, + "loss": 0.1447, + "step": 15428 + }, + { + "epoch": 0.2751935219205936, + "grad_norm": 0.32548636198043823, + "learning_rate": 4.54685041093271e-05, + "loss": 0.2174, + "step": 15429 + }, + { + "epoch": 0.2752113580423073, + "grad_norm": 0.2495693415403366, + "learning_rate": 4.546761037870272e-05, + "loss": 0.176, + "step": 15430 + }, + { + "epoch": 0.275229194164021, + "grad_norm": 0.27465543150901794, + "learning_rate": 4.546671656873881e-05, + "loss": 0.1838, + "step": 15431 + }, + { + "epoch": 0.27524703028573466, + "grad_norm": 0.3896167278289795, + "learning_rate": 4.546582267943883e-05, + "loss": 0.2049, + "step": 15432 + }, + { + "epoch": 0.27526486640744835, + "grad_norm": 0.33411529660224915, + "learning_rate": 4.546492871080627e-05, + "loss": 0.188, + "step": 15433 + }, + { + "epoch": 0.27528270252916204, + "grad_norm": 0.3175964653491974, + "learning_rate": 4.546403466284456e-05, + "loss": 0.2119, + "step": 15434 + }, + { + "epoch": 0.2753005386508758, + "grad_norm": 0.2661793529987335, + "learning_rate": 4.546314053555719e-05, + "loss": 0.1817, + "step": 15435 + }, + { + "epoch": 0.27531837477258947, + "grad_norm": 0.36906781792640686, + "learning_rate": 4.546224632894762e-05, + "loss": 0.2069, + "step": 15436 + }, + { + "epoch": 0.27533621089430316, + "grad_norm": 0.27329039573669434, + "learning_rate": 4.546135204301931e-05, + "loss": 0.1883, + "step": 15437 + }, + { + "epoch": 0.27535404701601685, + "grad_norm": 0.2112458050251007, + "learning_rate": 4.5460457677775746e-05, + "loss": 0.1551, + "step": 15438 + }, + { + "epoch": 0.27537188313773053, + "grad_norm": 0.2697179913520813, + "learning_rate": 4.545956323322037e-05, + "loss": 0.2061, + "step": 15439 + }, + { + "epoch": 0.2753897192594442, + "grad_norm": 0.302211195230484, + "learning_rate": 4.5458668709356664e-05, + "loss": 0.1756, + "step": 15440 + }, + { + "epoch": 0.2754075553811579, + "grad_norm": 0.26720723509788513, + "learning_rate": 4.545777410618809e-05, + "loss": 0.1657, + "step": 15441 + }, + { + "epoch": 0.2754253915028716, + "grad_norm": 0.30111950635910034, + "learning_rate": 4.5456879423718126e-05, + "loss": 0.2163, + "step": 15442 + }, + { + "epoch": 0.2754432276245853, + "grad_norm": 0.24537427723407745, + "learning_rate": 4.545598466195022e-05, + "loss": 0.1667, + "step": 15443 + }, + { + "epoch": 0.27546106374629903, + "grad_norm": 0.3326100707054138, + "learning_rate": 4.5455089820887853e-05, + "loss": 0.2425, + "step": 15444 + }, + { + "epoch": 0.2754788998680127, + "grad_norm": 0.2835599184036255, + "learning_rate": 4.5454194900534495e-05, + "loss": 0.2094, + "step": 15445 + }, + { + "epoch": 0.2754967359897264, + "grad_norm": 0.250205397605896, + "learning_rate": 4.545329990089362e-05, + "loss": 0.162, + "step": 15446 + }, + { + "epoch": 0.2755145721114401, + "grad_norm": 0.34966766834259033, + "learning_rate": 4.545240482196868e-05, + "loss": 0.1729, + "step": 15447 + }, + { + "epoch": 0.2755324082331538, + "grad_norm": 0.3049478530883789, + "learning_rate": 4.545150966376317e-05, + "loss": 0.1948, + "step": 15448 + }, + { + "epoch": 0.27555024435486747, + "grad_norm": 0.22935433685779572, + "learning_rate": 4.545061442628054e-05, + "loss": 0.2233, + "step": 15449 + }, + { + "epoch": 0.27556808047658116, + "grad_norm": 0.19765464961528778, + "learning_rate": 4.5449719109524245e-05, + "loss": 0.1745, + "step": 15450 + }, + { + "epoch": 0.27558591659829484, + "grad_norm": 0.21852731704711914, + "learning_rate": 4.54488237134978e-05, + "loss": 0.1762, + "step": 15451 + }, + { + "epoch": 0.2756037527200086, + "grad_norm": 0.31670308113098145, + "learning_rate": 4.544792823820464e-05, + "loss": 0.1639, + "step": 15452 + }, + { + "epoch": 0.2756215888417223, + "grad_norm": 0.28105905652046204, + "learning_rate": 4.544703268364825e-05, + "loss": 0.2124, + "step": 15453 + }, + { + "epoch": 0.27563942496343596, + "grad_norm": 0.26987016201019287, + "learning_rate": 4.544613704983209e-05, + "loss": 0.1766, + "step": 15454 + }, + { + "epoch": 0.27565726108514965, + "grad_norm": 0.2731398046016693, + "learning_rate": 4.544524133675966e-05, + "loss": 0.166, + "step": 15455 + }, + { + "epoch": 0.27567509720686334, + "grad_norm": 0.26097047328948975, + "learning_rate": 4.5444345544434395e-05, + "loss": 0.1943, + "step": 15456 + }, + { + "epoch": 0.27569293332857703, + "grad_norm": 0.31116968393325806, + "learning_rate": 4.544344967285979e-05, + "loss": 0.2096, + "step": 15457 + }, + { + "epoch": 0.2757107694502907, + "grad_norm": 0.26668640971183777, + "learning_rate": 4.5442553722039327e-05, + "loss": 0.1619, + "step": 15458 + }, + { + "epoch": 0.2757286055720044, + "grad_norm": 0.3781563639640808, + "learning_rate": 4.544165769197645e-05, + "loss": 0.1345, + "step": 15459 + }, + { + "epoch": 0.2757464416937181, + "grad_norm": 0.247103750705719, + "learning_rate": 4.544076158267465e-05, + "loss": 0.1285, + "step": 15460 + }, + { + "epoch": 0.27576427781543184, + "grad_norm": 0.28072378039360046, + "learning_rate": 4.54398653941374e-05, + "loss": 0.1613, + "step": 15461 + }, + { + "epoch": 0.2757821139371455, + "grad_norm": 0.2977953553199768, + "learning_rate": 4.543896912636817e-05, + "loss": 0.1716, + "step": 15462 + }, + { + "epoch": 0.2757999500588592, + "grad_norm": 0.2700483798980713, + "learning_rate": 4.543807277937044e-05, + "loss": 0.1382, + "step": 15463 + }, + { + "epoch": 0.2758177861805729, + "grad_norm": 0.31720682978630066, + "learning_rate": 4.543717635314768e-05, + "loss": 0.1381, + "step": 15464 + }, + { + "epoch": 0.2758356223022866, + "grad_norm": 0.2514737844467163, + "learning_rate": 4.5436279847703364e-05, + "loss": 0.182, + "step": 15465 + }, + { + "epoch": 0.2758534584240003, + "grad_norm": 0.40441063046455383, + "learning_rate": 4.5435383263040975e-05, + "loss": 0.1906, + "step": 15466 + }, + { + "epoch": 0.27587129454571396, + "grad_norm": 0.3640082776546478, + "learning_rate": 4.5434486599163977e-05, + "loss": 0.1983, + "step": 15467 + }, + { + "epoch": 0.27588913066742765, + "grad_norm": 0.2816970646381378, + "learning_rate": 4.543358985607585e-05, + "loss": 0.1765, + "step": 15468 + }, + { + "epoch": 0.2759069667891414, + "grad_norm": 0.3926188349723816, + "learning_rate": 4.543269303378007e-05, + "loss": 0.1824, + "step": 15469 + }, + { + "epoch": 0.2759248029108551, + "grad_norm": 0.2605321407318115, + "learning_rate": 4.5431796132280116e-05, + "loss": 0.1742, + "step": 15470 + }, + { + "epoch": 0.27594263903256877, + "grad_norm": 0.2637878358364105, + "learning_rate": 4.543089915157946e-05, + "loss": 0.1741, + "step": 15471 + }, + { + "epoch": 0.27596047515428246, + "grad_norm": 0.23655986785888672, + "learning_rate": 4.543000209168159e-05, + "loss": 0.1625, + "step": 15472 + }, + { + "epoch": 0.27597831127599615, + "grad_norm": 0.28845351934432983, + "learning_rate": 4.5429104952589976e-05, + "loss": 0.1911, + "step": 15473 + }, + { + "epoch": 0.27599614739770983, + "grad_norm": 0.30237093567848206, + "learning_rate": 4.5428207734308094e-05, + "loss": 0.1581, + "step": 15474 + }, + { + "epoch": 0.2760139835194235, + "grad_norm": 0.28601041436195374, + "learning_rate": 4.542731043683942e-05, + "loss": 0.1492, + "step": 15475 + }, + { + "epoch": 0.2760318196411372, + "grad_norm": 0.30665090680122375, + "learning_rate": 4.5426413060187435e-05, + "loss": 0.1735, + "step": 15476 + }, + { + "epoch": 0.2760496557628509, + "grad_norm": 0.3497749865055084, + "learning_rate": 4.542551560435562e-05, + "loss": 0.194, + "step": 15477 + }, + { + "epoch": 0.27606749188456464, + "grad_norm": 0.2997104227542877, + "learning_rate": 4.542461806934745e-05, + "loss": 0.1253, + "step": 15478 + }, + { + "epoch": 0.27608532800627833, + "grad_norm": 0.27530500292778015, + "learning_rate": 4.542372045516641e-05, + "loss": 0.2024, + "step": 15479 + }, + { + "epoch": 0.276103164127992, + "grad_norm": 0.30195263028144836, + "learning_rate": 4.5422822761815966e-05, + "loss": 0.1515, + "step": 15480 + }, + { + "epoch": 0.2761210002497057, + "grad_norm": 0.38357216119766235, + "learning_rate": 4.542192498929961e-05, + "loss": 0.1988, + "step": 15481 + }, + { + "epoch": 0.2761388363714194, + "grad_norm": 0.32098227739334106, + "learning_rate": 4.542102713762082e-05, + "loss": 0.1824, + "step": 15482 + }, + { + "epoch": 0.2761566724931331, + "grad_norm": 0.19651976227760315, + "learning_rate": 4.542012920678308e-05, + "loss": 0.1555, + "step": 15483 + }, + { + "epoch": 0.27617450861484677, + "grad_norm": 0.23746676743030548, + "learning_rate": 4.541923119678987e-05, + "loss": 0.1517, + "step": 15484 + }, + { + "epoch": 0.27619234473656046, + "grad_norm": 0.3970274031162262, + "learning_rate": 4.5418333107644656e-05, + "loss": 0.1928, + "step": 15485 + }, + { + "epoch": 0.2762101808582742, + "grad_norm": 0.3195289969444275, + "learning_rate": 4.5417434939350936e-05, + "loss": 0.2204, + "step": 15486 + }, + { + "epoch": 0.2762280169799879, + "grad_norm": 0.3507113456726074, + "learning_rate": 4.541653669191219e-05, + "loss": 0.1888, + "step": 15487 + }, + { + "epoch": 0.2762458531017016, + "grad_norm": 0.32902273535728455, + "learning_rate": 4.541563836533189e-05, + "loss": 0.2118, + "step": 15488 + }, + { + "epoch": 0.27626368922341527, + "grad_norm": 0.36764469742774963, + "learning_rate": 4.541473995961353e-05, + "loss": 0.1885, + "step": 15489 + }, + { + "epoch": 0.27628152534512895, + "grad_norm": 0.26530030369758606, + "learning_rate": 4.5413841474760586e-05, + "loss": 0.1571, + "step": 15490 + }, + { + "epoch": 0.27629936146684264, + "grad_norm": 0.22426503896713257, + "learning_rate": 4.5412942910776535e-05, + "loss": 0.1767, + "step": 15491 + }, + { + "epoch": 0.27631719758855633, + "grad_norm": 0.24545851349830627, + "learning_rate": 4.541204426766486e-05, + "loss": 0.1417, + "step": 15492 + }, + { + "epoch": 0.27633503371027, + "grad_norm": 0.26633673906326294, + "learning_rate": 4.541114554542907e-05, + "loss": 0.162, + "step": 15493 + }, + { + "epoch": 0.27635286983198376, + "grad_norm": 0.262577086687088, + "learning_rate": 4.541024674407262e-05, + "loss": 0.1978, + "step": 15494 + }, + { + "epoch": 0.27637070595369745, + "grad_norm": 0.33074212074279785, + "learning_rate": 4.540934786359901e-05, + "loss": 0.2299, + "step": 15495 + }, + { + "epoch": 0.27638854207541114, + "grad_norm": 0.28678977489471436, + "learning_rate": 4.540844890401171e-05, + "loss": 0.1768, + "step": 15496 + }, + { + "epoch": 0.2764063781971248, + "grad_norm": 0.35482659935951233, + "learning_rate": 4.5407549865314225e-05, + "loss": 0.2379, + "step": 15497 + }, + { + "epoch": 0.2764242143188385, + "grad_norm": 0.2754009962081909, + "learning_rate": 4.540665074751003e-05, + "loss": 0.2044, + "step": 15498 + }, + { + "epoch": 0.2764420504405522, + "grad_norm": 0.21383485198020935, + "learning_rate": 4.540575155060259e-05, + "loss": 0.1742, + "step": 15499 + }, + { + "epoch": 0.2764598865622659, + "grad_norm": 0.21688519418239594, + "learning_rate": 4.540485227459542e-05, + "loss": 0.1672, + "step": 15500 + }, + { + "epoch": 0.2764777226839796, + "grad_norm": 0.2701101005077362, + "learning_rate": 4.5403952919492e-05, + "loss": 0.1964, + "step": 15501 + }, + { + "epoch": 0.27649555880569326, + "grad_norm": 0.21089604496955872, + "learning_rate": 4.54030534852958e-05, + "loss": 0.1828, + "step": 15502 + }, + { + "epoch": 0.276513394927407, + "grad_norm": 0.2518959939479828, + "learning_rate": 4.540215397201032e-05, + "loss": 0.176, + "step": 15503 + }, + { + "epoch": 0.2765312310491207, + "grad_norm": 0.1921340972185135, + "learning_rate": 4.540125437963905e-05, + "loss": 0.1622, + "step": 15504 + }, + { + "epoch": 0.2765490671708344, + "grad_norm": 0.22540751099586487, + "learning_rate": 4.540035470818547e-05, + "loss": 0.1558, + "step": 15505 + }, + { + "epoch": 0.27656690329254807, + "grad_norm": 0.31669074296951294, + "learning_rate": 4.539945495765307e-05, + "loss": 0.2498, + "step": 15506 + }, + { + "epoch": 0.27658473941426176, + "grad_norm": 0.25251826643943787, + "learning_rate": 4.539855512804534e-05, + "loss": 0.183, + "step": 15507 + }, + { + "epoch": 0.27660257553597545, + "grad_norm": 0.2978300452232361, + "learning_rate": 4.5397655219365756e-05, + "loss": 0.1113, + "step": 15508 + }, + { + "epoch": 0.27662041165768914, + "grad_norm": 0.2912184000015259, + "learning_rate": 4.5396755231617814e-05, + "loss": 0.1499, + "step": 15509 + }, + { + "epoch": 0.2766382477794028, + "grad_norm": 0.25617778301239014, + "learning_rate": 4.539585516480501e-05, + "loss": 0.1744, + "step": 15510 + }, + { + "epoch": 0.27665608390111657, + "grad_norm": 0.3530201315879822, + "learning_rate": 4.539495501893083e-05, + "loss": 0.1697, + "step": 15511 + }, + { + "epoch": 0.27667392002283026, + "grad_norm": 0.25744280219078064, + "learning_rate": 4.539405479399875e-05, + "loss": 0.1812, + "step": 15512 + }, + { + "epoch": 0.27669175614454394, + "grad_norm": 0.3635775148868561, + "learning_rate": 4.539315449001228e-05, + "loss": 0.2089, + "step": 15513 + }, + { + "epoch": 0.27670959226625763, + "grad_norm": 0.2810732424259186, + "learning_rate": 4.53922541069749e-05, + "loss": 0.1799, + "step": 15514 + }, + { + "epoch": 0.2767274283879713, + "grad_norm": 0.2714241147041321, + "learning_rate": 4.539135364489009e-05, + "loss": 0.2276, + "step": 15515 + }, + { + "epoch": 0.276745264509685, + "grad_norm": 0.22411157190799713, + "learning_rate": 4.539045310376136e-05, + "loss": 0.1452, + "step": 15516 + }, + { + "epoch": 0.2767631006313987, + "grad_norm": 0.31750720739364624, + "learning_rate": 4.538955248359219e-05, + "loss": 0.1752, + "step": 15517 + }, + { + "epoch": 0.2767809367531124, + "grad_norm": 0.2537008225917816, + "learning_rate": 4.5388651784386066e-05, + "loss": 0.1992, + "step": 15518 + }, + { + "epoch": 0.27679877287482607, + "grad_norm": 0.2894752025604248, + "learning_rate": 4.538775100614649e-05, + "loss": 0.2175, + "step": 15519 + }, + { + "epoch": 0.2768166089965398, + "grad_norm": 0.22802633047103882, + "learning_rate": 4.5386850148876944e-05, + "loss": 0.1735, + "step": 15520 + }, + { + "epoch": 0.2768344451182535, + "grad_norm": 0.2781098484992981, + "learning_rate": 4.538594921258094e-05, + "loss": 0.1879, + "step": 15521 + }, + { + "epoch": 0.2768522812399672, + "grad_norm": 0.2885425388813019, + "learning_rate": 4.538504819726194e-05, + "loss": 0.1765, + "step": 15522 + }, + { + "epoch": 0.2768701173616809, + "grad_norm": 0.2876160144805908, + "learning_rate": 4.5384147102923454e-05, + "loss": 0.2395, + "step": 15523 + }, + { + "epoch": 0.27688795348339457, + "grad_norm": 0.3026312291622162, + "learning_rate": 4.538324592956898e-05, + "loss": 0.175, + "step": 15524 + }, + { + "epoch": 0.27690578960510825, + "grad_norm": 0.23294112086296082, + "learning_rate": 4.5382344677202e-05, + "loss": 0.2215, + "step": 15525 + }, + { + "epoch": 0.27692362572682194, + "grad_norm": 0.27057570219039917, + "learning_rate": 4.538144334582601e-05, + "loss": 0.1385, + "step": 15526 + }, + { + "epoch": 0.27694146184853563, + "grad_norm": 0.23334906995296478, + "learning_rate": 4.5380541935444514e-05, + "loss": 0.1769, + "step": 15527 + }, + { + "epoch": 0.2769592979702494, + "grad_norm": 0.3130313456058502, + "learning_rate": 4.5379640446061e-05, + "loss": 0.2065, + "step": 15528 + }, + { + "epoch": 0.27697713409196306, + "grad_norm": 0.2886606454849243, + "learning_rate": 4.537873887767895e-05, + "loss": 0.1943, + "step": 15529 + }, + { + "epoch": 0.27699497021367675, + "grad_norm": 0.22477389872074127, + "learning_rate": 4.537783723030188e-05, + "loss": 0.1631, + "step": 15530 + }, + { + "epoch": 0.27701280633539044, + "grad_norm": 0.2547028362751007, + "learning_rate": 4.5376935503933265e-05, + "loss": 0.1769, + "step": 15531 + }, + { + "epoch": 0.2770306424571041, + "grad_norm": 0.25012221932411194, + "learning_rate": 4.537603369857662e-05, + "loss": 0.1017, + "step": 15532 + }, + { + "epoch": 0.2770484785788178, + "grad_norm": 0.26341575384140015, + "learning_rate": 4.5375131814235415e-05, + "loss": 0.1959, + "step": 15533 + }, + { + "epoch": 0.2770663147005315, + "grad_norm": 0.21279609203338623, + "learning_rate": 4.5374229850913174e-05, + "loss": 0.1385, + "step": 15534 + }, + { + "epoch": 0.2770841508222452, + "grad_norm": 0.2950650751590729, + "learning_rate": 4.537332780861338e-05, + "loss": 0.1786, + "step": 15535 + }, + { + "epoch": 0.2771019869439589, + "grad_norm": 0.27218860387802124, + "learning_rate": 4.537242568733952e-05, + "loss": 0.176, + "step": 15536 + }, + { + "epoch": 0.2771198230656726, + "grad_norm": 0.22721554338932037, + "learning_rate": 4.537152348709512e-05, + "loss": 0.1416, + "step": 15537 + }, + { + "epoch": 0.2771376591873863, + "grad_norm": 0.2503151595592499, + "learning_rate": 4.537062120788365e-05, + "loss": 0.1197, + "step": 15538 + }, + { + "epoch": 0.2771554953091, + "grad_norm": 0.32465431094169617, + "learning_rate": 4.536971884970862e-05, + "loss": 0.2091, + "step": 15539 + }, + { + "epoch": 0.2771733314308137, + "grad_norm": 0.4016757607460022, + "learning_rate": 4.5368816412573515e-05, + "loss": 0.1305, + "step": 15540 + }, + { + "epoch": 0.2771911675525274, + "grad_norm": 0.24984855949878693, + "learning_rate": 4.536791389648185e-05, + "loss": 0.1744, + "step": 15541 + }, + { + "epoch": 0.27720900367424106, + "grad_norm": 0.18738994002342224, + "learning_rate": 4.536701130143711e-05, + "loss": 0.1325, + "step": 15542 + }, + { + "epoch": 0.27722683979595475, + "grad_norm": 0.30558136105537415, + "learning_rate": 4.536610862744281e-05, + "loss": 0.1701, + "step": 15543 + }, + { + "epoch": 0.27724467591766844, + "grad_norm": 0.24732215702533722, + "learning_rate": 4.536520587450243e-05, + "loss": 0.1781, + "step": 15544 + }, + { + "epoch": 0.2772625120393822, + "grad_norm": 0.26350006461143494, + "learning_rate": 4.536430304261948e-05, + "loss": 0.1562, + "step": 15545 + }, + { + "epoch": 0.27728034816109587, + "grad_norm": 0.20535536110401154, + "learning_rate": 4.536340013179746e-05, + "loss": 0.1432, + "step": 15546 + }, + { + "epoch": 0.27729818428280956, + "grad_norm": 0.33243194222450256, + "learning_rate": 4.536249714203986e-05, + "loss": 0.218, + "step": 15547 + }, + { + "epoch": 0.27731602040452324, + "grad_norm": 0.20961210131645203, + "learning_rate": 4.53615940733502e-05, + "loss": 0.1231, + "step": 15548 + }, + { + "epoch": 0.27733385652623693, + "grad_norm": 0.2833684980869293, + "learning_rate": 4.5360690925731964e-05, + "loss": 0.1159, + "step": 15549 + }, + { + "epoch": 0.2773516926479506, + "grad_norm": 0.2917201519012451, + "learning_rate": 4.5359787699188656e-05, + "loss": 0.1843, + "step": 15550 + }, + { + "epoch": 0.2773695287696643, + "grad_norm": 0.4551113545894623, + "learning_rate": 4.535888439372378e-05, + "loss": 0.1996, + "step": 15551 + }, + { + "epoch": 0.277387364891378, + "grad_norm": 0.2640388011932373, + "learning_rate": 4.535798100934083e-05, + "loss": 0.1745, + "step": 15552 + }, + { + "epoch": 0.27740520101309174, + "grad_norm": 0.23592965304851532, + "learning_rate": 4.5357077546043323e-05, + "loss": 0.1919, + "step": 15553 + }, + { + "epoch": 0.27742303713480543, + "grad_norm": 0.2902146279811859, + "learning_rate": 4.535617400383475e-05, + "loss": 0.1736, + "step": 15554 + }, + { + "epoch": 0.2774408732565191, + "grad_norm": 0.3253636956214905, + "learning_rate": 4.535527038271862e-05, + "loss": 0.2047, + "step": 15555 + }, + { + "epoch": 0.2774587093782328, + "grad_norm": 0.25144073367118835, + "learning_rate": 4.5354366682698426e-05, + "loss": 0.2093, + "step": 15556 + }, + { + "epoch": 0.2774765454999465, + "grad_norm": 0.20756272971630096, + "learning_rate": 4.535346290377768e-05, + "loss": 0.1816, + "step": 15557 + }, + { + "epoch": 0.2774943816216602, + "grad_norm": 0.22271545231342316, + "learning_rate": 4.535255904595988e-05, + "loss": 0.1624, + "step": 15558 + }, + { + "epoch": 0.27751221774337387, + "grad_norm": 0.23617413640022278, + "learning_rate": 4.5351655109248526e-05, + "loss": 0.1784, + "step": 15559 + }, + { + "epoch": 0.27753005386508756, + "grad_norm": 0.34566089510917664, + "learning_rate": 4.535075109364713e-05, + "loss": 0.2234, + "step": 15560 + }, + { + "epoch": 0.27754788998680124, + "grad_norm": 0.19219085574150085, + "learning_rate": 4.5349846999159194e-05, + "loss": 0.1348, + "step": 15561 + }, + { + "epoch": 0.277565726108515, + "grad_norm": 0.2243836373090744, + "learning_rate": 4.534894282578822e-05, + "loss": 0.1744, + "step": 15562 + }, + { + "epoch": 0.2775835622302287, + "grad_norm": 0.3449123501777649, + "learning_rate": 4.534803857353772e-05, + "loss": 0.1953, + "step": 15563 + }, + { + "epoch": 0.27760139835194236, + "grad_norm": 0.24644972383975983, + "learning_rate": 4.53471342424112e-05, + "loss": 0.1751, + "step": 15564 + }, + { + "epoch": 0.27761923447365605, + "grad_norm": 0.26189693808555603, + "learning_rate": 4.534622983241215e-05, + "loss": 0.2029, + "step": 15565 + }, + { + "epoch": 0.27763707059536974, + "grad_norm": 0.35615599155426025, + "learning_rate": 4.534532534354409e-05, + "loss": 0.228, + "step": 15566 + }, + { + "epoch": 0.2776549067170834, + "grad_norm": 0.2755829393863678, + "learning_rate": 4.534442077581053e-05, + "loss": 0.194, + "step": 15567 + }, + { + "epoch": 0.2776727428387971, + "grad_norm": 0.23754830658435822, + "learning_rate": 4.534351612921496e-05, + "loss": 0.1795, + "step": 15568 + }, + { + "epoch": 0.2776905789605108, + "grad_norm": 0.2579927146434784, + "learning_rate": 4.534261140376089e-05, + "loss": 0.1785, + "step": 15569 + }, + { + "epoch": 0.27770841508222455, + "grad_norm": 0.2636464834213257, + "learning_rate": 4.534170659945184e-05, + "loss": 0.2161, + "step": 15570 + }, + { + "epoch": 0.27772625120393823, + "grad_norm": 0.2908783257007599, + "learning_rate": 4.5340801716291305e-05, + "loss": 0.1814, + "step": 15571 + }, + { + "epoch": 0.2777440873256519, + "grad_norm": 0.37598586082458496, + "learning_rate": 4.53398967542828e-05, + "loss": 0.1054, + "step": 15572 + }, + { + "epoch": 0.2777619234473656, + "grad_norm": 0.26995182037353516, + "learning_rate": 4.533899171342983e-05, + "loss": 0.1885, + "step": 15573 + }, + { + "epoch": 0.2777797595690793, + "grad_norm": 0.2756546139717102, + "learning_rate": 4.5338086593735904e-05, + "loss": 0.1493, + "step": 15574 + }, + { + "epoch": 0.277797595690793, + "grad_norm": 0.23395726084709167, + "learning_rate": 4.533718139520452e-05, + "loss": 0.174, + "step": 15575 + }, + { + "epoch": 0.2778154318125067, + "grad_norm": 0.3361474573612213, + "learning_rate": 4.5336276117839206e-05, + "loss": 0.2173, + "step": 15576 + }, + { + "epoch": 0.27783326793422036, + "grad_norm": 0.25557228922843933, + "learning_rate": 4.533537076164346e-05, + "loss": 0.1562, + "step": 15577 + }, + { + "epoch": 0.27785110405593405, + "grad_norm": 0.3303571939468384, + "learning_rate": 4.533446532662079e-05, + "loss": 0.191, + "step": 15578 + }, + { + "epoch": 0.2778689401776478, + "grad_norm": 0.30470383167266846, + "learning_rate": 4.533355981277472e-05, + "loss": 0.159, + "step": 15579 + }, + { + "epoch": 0.2778867762993615, + "grad_norm": 0.36051464080810547, + "learning_rate": 4.5332654220108736e-05, + "loss": 0.1502, + "step": 15580 + }, + { + "epoch": 0.27790461242107517, + "grad_norm": 0.209768146276474, + "learning_rate": 4.5331748548626374e-05, + "loss": 0.1705, + "step": 15581 + }, + { + "epoch": 0.27792244854278886, + "grad_norm": 0.3213937282562256, + "learning_rate": 4.5330842798331126e-05, + "loss": 0.2146, + "step": 15582 + }, + { + "epoch": 0.27794028466450255, + "grad_norm": 0.25211167335510254, + "learning_rate": 4.53299369692265e-05, + "loss": 0.1991, + "step": 15583 + }, + { + "epoch": 0.27795812078621623, + "grad_norm": 0.2216213494539261, + "learning_rate": 4.5329031061316035e-05, + "loss": 0.1583, + "step": 15584 + }, + { + "epoch": 0.2779759569079299, + "grad_norm": 0.24817034602165222, + "learning_rate": 4.532812507460321e-05, + "loss": 0.1894, + "step": 15585 + }, + { + "epoch": 0.2779937930296436, + "grad_norm": 0.24385593831539154, + "learning_rate": 4.532721900909156e-05, + "loss": 0.1846, + "step": 15586 + }, + { + "epoch": 0.27801162915135735, + "grad_norm": 0.24522389471530914, + "learning_rate": 4.532631286478458e-05, + "loss": 0.1614, + "step": 15587 + }, + { + "epoch": 0.27802946527307104, + "grad_norm": 0.3663196861743927, + "learning_rate": 4.5325406641685796e-05, + "loss": 0.2239, + "step": 15588 + }, + { + "epoch": 0.27804730139478473, + "grad_norm": 0.36047157645225525, + "learning_rate": 4.5324500339798715e-05, + "loss": 0.2091, + "step": 15589 + }, + { + "epoch": 0.2780651375164984, + "grad_norm": 0.20961324870586395, + "learning_rate": 4.5323593959126857e-05, + "loss": 0.1354, + "step": 15590 + }, + { + "epoch": 0.2780829736382121, + "grad_norm": 0.2699880599975586, + "learning_rate": 4.5322687499673724e-05, + "loss": 0.2045, + "step": 15591 + }, + { + "epoch": 0.2781008097599258, + "grad_norm": 0.29620838165283203, + "learning_rate": 4.532178096144283e-05, + "loss": 0.1992, + "step": 15592 + }, + { + "epoch": 0.2781186458816395, + "grad_norm": 0.2632938623428345, + "learning_rate": 4.5320874344437705e-05, + "loss": 0.1829, + "step": 15593 + }, + { + "epoch": 0.27813648200335317, + "grad_norm": 0.27483099699020386, + "learning_rate": 4.5319967648661845e-05, + "loss": 0.1805, + "step": 15594 + }, + { + "epoch": 0.2781543181250669, + "grad_norm": 0.21178576350212097, + "learning_rate": 4.5319060874118766e-05, + "loss": 0.1407, + "step": 15595 + }, + { + "epoch": 0.2781721542467806, + "grad_norm": 0.2852518558502197, + "learning_rate": 4.5318154020811996e-05, + "loss": 0.1534, + "step": 15596 + }, + { + "epoch": 0.2781899903684943, + "grad_norm": 0.2494693398475647, + "learning_rate": 4.531724708874504e-05, + "loss": 0.1846, + "step": 15597 + }, + { + "epoch": 0.278207826490208, + "grad_norm": 0.23659773170948029, + "learning_rate": 4.531634007792143e-05, + "loss": 0.1688, + "step": 15598 + }, + { + "epoch": 0.27822566261192166, + "grad_norm": 0.3187173008918762, + "learning_rate": 4.531543298834465e-05, + "loss": 0.2202, + "step": 15599 + }, + { + "epoch": 0.27824349873363535, + "grad_norm": 0.2896851599216461, + "learning_rate": 4.5314525820018244e-05, + "loss": 0.2044, + "step": 15600 + }, + { + "epoch": 0.27826133485534904, + "grad_norm": 0.2916419804096222, + "learning_rate": 4.531361857294572e-05, + "loss": 0.223, + "step": 15601 + }, + { + "epoch": 0.27827917097706273, + "grad_norm": 0.23943808674812317, + "learning_rate": 4.53127112471306e-05, + "loss": 0.1811, + "step": 15602 + }, + { + "epoch": 0.2782970070987764, + "grad_norm": 0.22779546678066254, + "learning_rate": 4.5311803842576385e-05, + "loss": 0.1674, + "step": 15603 + }, + { + "epoch": 0.27831484322049016, + "grad_norm": 0.320279061794281, + "learning_rate": 4.5310896359286605e-05, + "loss": 0.1746, + "step": 15604 + }, + { + "epoch": 0.27833267934220385, + "grad_norm": 0.2758757472038269, + "learning_rate": 4.530998879726478e-05, + "loss": 0.1649, + "step": 15605 + }, + { + "epoch": 0.27835051546391754, + "grad_norm": 0.2524808347225189, + "learning_rate": 4.530908115651442e-05, + "loss": 0.1967, + "step": 15606 + }, + { + "epoch": 0.2783683515856312, + "grad_norm": 0.2577757239341736, + "learning_rate": 4.530817343703905e-05, + "loss": 0.2039, + "step": 15607 + }, + { + "epoch": 0.2783861877073449, + "grad_norm": 0.29628923535346985, + "learning_rate": 4.530726563884218e-05, + "loss": 0.1545, + "step": 15608 + }, + { + "epoch": 0.2784040238290586, + "grad_norm": 0.33466726541519165, + "learning_rate": 4.5306357761927345e-05, + "loss": 0.1666, + "step": 15609 + }, + { + "epoch": 0.2784218599507723, + "grad_norm": 0.373579740524292, + "learning_rate": 4.530544980629804e-05, + "loss": 0.2333, + "step": 15610 + }, + { + "epoch": 0.278439696072486, + "grad_norm": 0.2565092444419861, + "learning_rate": 4.530454177195781e-05, + "loss": 0.1665, + "step": 15611 + }, + { + "epoch": 0.2784575321941997, + "grad_norm": 0.30497944355010986, + "learning_rate": 4.530363365891015e-05, + "loss": 0.2114, + "step": 15612 + }, + { + "epoch": 0.2784753683159134, + "grad_norm": 0.2598390579223633, + "learning_rate": 4.5302725467158604e-05, + "loss": 0.1372, + "step": 15613 + }, + { + "epoch": 0.2784932044376271, + "grad_norm": 0.2783115804195404, + "learning_rate": 4.530181719670667e-05, + "loss": 0.1664, + "step": 15614 + }, + { + "epoch": 0.2785110405593408, + "grad_norm": 0.4061029851436615, + "learning_rate": 4.53009088475579e-05, + "loss": 0.1523, + "step": 15615 + }, + { + "epoch": 0.27852887668105447, + "grad_norm": 0.2874487638473511, + "learning_rate": 4.530000041971578e-05, + "loss": 0.1768, + "step": 15616 + }, + { + "epoch": 0.27854671280276816, + "grad_norm": 0.27280646562576294, + "learning_rate": 4.529909191318385e-05, + "loss": 0.1952, + "step": 15617 + }, + { + "epoch": 0.27856454892448185, + "grad_norm": 0.28535526990890503, + "learning_rate": 4.529818332796564e-05, + "loss": 0.203, + "step": 15618 + }, + { + "epoch": 0.27858238504619554, + "grad_norm": 0.32303035259246826, + "learning_rate": 4.529727466406465e-05, + "loss": 0.2003, + "step": 15619 + }, + { + "epoch": 0.2786002211679092, + "grad_norm": 0.29018232226371765, + "learning_rate": 4.529636592148441e-05, + "loss": 0.1635, + "step": 15620 + }, + { + "epoch": 0.27861805728962297, + "grad_norm": 0.2523662745952606, + "learning_rate": 4.5295457100228456e-05, + "loss": 0.1109, + "step": 15621 + }, + { + "epoch": 0.27863589341133665, + "grad_norm": 0.2685641348361969, + "learning_rate": 4.529454820030029e-05, + "loss": 0.1804, + "step": 15622 + }, + { + "epoch": 0.27865372953305034, + "grad_norm": 0.28229889273643494, + "learning_rate": 4.529363922170346e-05, + "loss": 0.1954, + "step": 15623 + }, + { + "epoch": 0.27867156565476403, + "grad_norm": 0.23529836535453796, + "learning_rate": 4.5292730164441455e-05, + "loss": 0.1969, + "step": 15624 + }, + { + "epoch": 0.2786894017764777, + "grad_norm": 0.2972094416618347, + "learning_rate": 4.5291821028517834e-05, + "loss": 0.2335, + "step": 15625 + }, + { + "epoch": 0.2787072378981914, + "grad_norm": 0.3776022493839264, + "learning_rate": 4.52909118139361e-05, + "loss": 0.2171, + "step": 15626 + }, + { + "epoch": 0.2787250740199051, + "grad_norm": 0.39162972569465637, + "learning_rate": 4.529000252069978e-05, + "loss": 0.1781, + "step": 15627 + }, + { + "epoch": 0.2787429101416188, + "grad_norm": 0.21789129078388214, + "learning_rate": 4.5289093148812414e-05, + "loss": 0.1551, + "step": 15628 + }, + { + "epoch": 0.2787607462633325, + "grad_norm": 0.27025988698005676, + "learning_rate": 4.528818369827751e-05, + "loss": 0.2257, + "step": 15629 + }, + { + "epoch": 0.2787785823850462, + "grad_norm": 0.2933149039745331, + "learning_rate": 4.528727416909859e-05, + "loss": 0.1475, + "step": 15630 + }, + { + "epoch": 0.2787964185067599, + "grad_norm": 0.2323552668094635, + "learning_rate": 4.52863645612792e-05, + "loss": 0.1631, + "step": 15631 + }, + { + "epoch": 0.2788142546284736, + "grad_norm": 0.262197345495224, + "learning_rate": 4.528545487482285e-05, + "loss": 0.1252, + "step": 15632 + }, + { + "epoch": 0.2788320907501873, + "grad_norm": 0.27623292803764343, + "learning_rate": 4.528454510973307e-05, + "loss": 0.1817, + "step": 15633 + }, + { + "epoch": 0.27884992687190097, + "grad_norm": 0.2861792743206024, + "learning_rate": 4.528363526601339e-05, + "loss": 0.192, + "step": 15634 + }, + { + "epoch": 0.27886776299361465, + "grad_norm": 0.26510319113731384, + "learning_rate": 4.528272534366733e-05, + "loss": 0.1959, + "step": 15635 + }, + { + "epoch": 0.27888559911532834, + "grad_norm": 0.44476866722106934, + "learning_rate": 4.528181534269842e-05, + "loss": 0.1873, + "step": 15636 + }, + { + "epoch": 0.27890343523704203, + "grad_norm": 0.2552759647369385, + "learning_rate": 4.5280905263110194e-05, + "loss": 0.1552, + "step": 15637 + }, + { + "epoch": 0.2789212713587558, + "grad_norm": 1.0172224044799805, + "learning_rate": 4.527999510490617e-05, + "loss": 0.1539, + "step": 15638 + }, + { + "epoch": 0.27893910748046946, + "grad_norm": 0.28688690066337585, + "learning_rate": 4.527908486808988e-05, + "loss": 0.1587, + "step": 15639 + }, + { + "epoch": 0.27895694360218315, + "grad_norm": 0.31575825810432434, + "learning_rate": 4.527817455266485e-05, + "loss": 0.1909, + "step": 15640 + }, + { + "epoch": 0.27897477972389684, + "grad_norm": 0.232138529419899, + "learning_rate": 4.527726415863462e-05, + "loss": 0.1821, + "step": 15641 + }, + { + "epoch": 0.2789926158456105, + "grad_norm": 0.2726914882659912, + "learning_rate": 4.527635368600271e-05, + "loss": 0.2031, + "step": 15642 + }, + { + "epoch": 0.2790104519673242, + "grad_norm": 0.3221610188484192, + "learning_rate": 4.527544313477265e-05, + "loss": 0.2329, + "step": 15643 + }, + { + "epoch": 0.2790282880890379, + "grad_norm": 0.23013746738433838, + "learning_rate": 4.527453250494797e-05, + "loss": 0.188, + "step": 15644 + }, + { + "epoch": 0.2790461242107516, + "grad_norm": 0.34944698214530945, + "learning_rate": 4.5273621796532196e-05, + "loss": 0.1697, + "step": 15645 + }, + { + "epoch": 0.27906396033246533, + "grad_norm": 0.2999511659145355, + "learning_rate": 4.527271100952886e-05, + "loss": 0.2223, + "step": 15646 + }, + { + "epoch": 0.279081796454179, + "grad_norm": 0.278818815946579, + "learning_rate": 4.527180014394149e-05, + "loss": 0.2096, + "step": 15647 + }, + { + "epoch": 0.2790996325758927, + "grad_norm": 0.2949967384338379, + "learning_rate": 4.5270889199773626e-05, + "loss": 0.1548, + "step": 15648 + }, + { + "epoch": 0.2791174686976064, + "grad_norm": 0.2322419136762619, + "learning_rate": 4.52699781770288e-05, + "loss": 0.1427, + "step": 15649 + }, + { + "epoch": 0.2791353048193201, + "grad_norm": 0.30145666003227234, + "learning_rate": 4.526906707571053e-05, + "loss": 0.1613, + "step": 15650 + }, + { + "epoch": 0.2791531409410338, + "grad_norm": 0.33624911308288574, + "learning_rate": 4.5268155895822355e-05, + "loss": 0.1834, + "step": 15651 + }, + { + "epoch": 0.27917097706274746, + "grad_norm": 0.27684760093688965, + "learning_rate": 4.526724463736781e-05, + "loss": 0.1665, + "step": 15652 + }, + { + "epoch": 0.27918881318446115, + "grad_norm": 0.287435382604599, + "learning_rate": 4.526633330035043e-05, + "loss": 0.1928, + "step": 15653 + }, + { + "epoch": 0.2792066493061749, + "grad_norm": 0.38228148221969604, + "learning_rate": 4.526542188477373e-05, + "loss": 0.1746, + "step": 15654 + }, + { + "epoch": 0.2792244854278886, + "grad_norm": 0.274532288312912, + "learning_rate": 4.526451039064127e-05, + "loss": 0.1586, + "step": 15655 + }, + { + "epoch": 0.27924232154960227, + "grad_norm": 0.28259244561195374, + "learning_rate": 4.5263598817956555e-05, + "loss": 0.2192, + "step": 15656 + }, + { + "epoch": 0.27926015767131596, + "grad_norm": 0.27577927708625793, + "learning_rate": 4.526268716672314e-05, + "loss": 0.1721, + "step": 15657 + }, + { + "epoch": 0.27927799379302964, + "grad_norm": 0.2167995572090149, + "learning_rate": 4.5261775436944554e-05, + "loss": 0.2397, + "step": 15658 + }, + { + "epoch": 0.27929582991474333, + "grad_norm": 0.30169492959976196, + "learning_rate": 4.526086362862432e-05, + "loss": 0.259, + "step": 15659 + }, + { + "epoch": 0.279313666036457, + "grad_norm": 0.21073132753372192, + "learning_rate": 4.525995174176598e-05, + "loss": 0.2074, + "step": 15660 + }, + { + "epoch": 0.2793315021581707, + "grad_norm": 0.25644275546073914, + "learning_rate": 4.525903977637308e-05, + "loss": 0.1562, + "step": 15661 + }, + { + "epoch": 0.2793493382798844, + "grad_norm": 0.3562442660331726, + "learning_rate": 4.525812773244914e-05, + "loss": 0.2401, + "step": 15662 + }, + { + "epoch": 0.27936717440159814, + "grad_norm": 0.21228370070457458, + "learning_rate": 4.5257215609997694e-05, + "loss": 0.1583, + "step": 15663 + }, + { + "epoch": 0.2793850105233118, + "grad_norm": 0.21904005110263824, + "learning_rate": 4.525630340902229e-05, + "loss": 0.1509, + "step": 15664 + }, + { + "epoch": 0.2794028466450255, + "grad_norm": 0.3077412545681, + "learning_rate": 4.525539112952645e-05, + "loss": 0.2425, + "step": 15665 + }, + { + "epoch": 0.2794206827667392, + "grad_norm": 0.25142520666122437, + "learning_rate": 4.525447877151373e-05, + "loss": 0.2044, + "step": 15666 + }, + { + "epoch": 0.2794385188884529, + "grad_norm": 0.2700698673725128, + "learning_rate": 4.525356633498764e-05, + "loss": 0.2162, + "step": 15667 + }, + { + "epoch": 0.2794563550101666, + "grad_norm": 0.4805814027786255, + "learning_rate": 4.5252653819951745e-05, + "loss": 0.1724, + "step": 15668 + }, + { + "epoch": 0.27947419113188027, + "grad_norm": 0.35741961002349854, + "learning_rate": 4.525174122640956e-05, + "loss": 0.1872, + "step": 15669 + }, + { + "epoch": 0.27949202725359396, + "grad_norm": 0.2249939739704132, + "learning_rate": 4.525082855436464e-05, + "loss": 0.1577, + "step": 15670 + }, + { + "epoch": 0.2795098633753077, + "grad_norm": 0.27167457342147827, + "learning_rate": 4.524991580382051e-05, + "loss": 0.1447, + "step": 15671 + }, + { + "epoch": 0.2795276994970214, + "grad_norm": 0.24171313643455505, + "learning_rate": 4.524900297478071e-05, + "loss": 0.2023, + "step": 15672 + }, + { + "epoch": 0.2795455356187351, + "grad_norm": 0.3178898096084595, + "learning_rate": 4.524809006724878e-05, + "loss": 0.2133, + "step": 15673 + }, + { + "epoch": 0.27956337174044876, + "grad_norm": 0.27155107259750366, + "learning_rate": 4.5247177081228264e-05, + "loss": 0.1887, + "step": 15674 + }, + { + "epoch": 0.27958120786216245, + "grad_norm": 0.19774875044822693, + "learning_rate": 4.5246264016722696e-05, + "loss": 0.1397, + "step": 15675 + }, + { + "epoch": 0.27959904398387614, + "grad_norm": 0.19590790569782257, + "learning_rate": 4.524535087373561e-05, + "loss": 0.1796, + "step": 15676 + }, + { + "epoch": 0.2796168801055898, + "grad_norm": 0.31075024604797363, + "learning_rate": 4.524443765227055e-05, + "loss": 0.1954, + "step": 15677 + }, + { + "epoch": 0.2796347162273035, + "grad_norm": 0.28407710790634155, + "learning_rate": 4.5243524352331066e-05, + "loss": 0.1575, + "step": 15678 + }, + { + "epoch": 0.2796525523490172, + "grad_norm": 0.23158155381679535, + "learning_rate": 4.5242610973920685e-05, + "loss": 0.1668, + "step": 15679 + }, + { + "epoch": 0.27967038847073095, + "grad_norm": 0.3562926948070526, + "learning_rate": 4.524169751704296e-05, + "loss": 0.191, + "step": 15680 + }, + { + "epoch": 0.27968822459244463, + "grad_norm": 0.3110958933830261, + "learning_rate": 4.524078398170141e-05, + "loss": 0.1832, + "step": 15681 + }, + { + "epoch": 0.2797060607141583, + "grad_norm": 0.35577085614204407, + "learning_rate": 4.52398703678996e-05, + "loss": 0.1431, + "step": 15682 + }, + { + "epoch": 0.279723896835872, + "grad_norm": 0.2610926926136017, + "learning_rate": 4.523895667564106e-05, + "loss": 0.1898, + "step": 15683 + }, + { + "epoch": 0.2797417329575857, + "grad_norm": 0.24440820515155792, + "learning_rate": 4.5238042904929334e-05, + "loss": 0.1796, + "step": 15684 + }, + { + "epoch": 0.2797595690792994, + "grad_norm": 0.34617117047309875, + "learning_rate": 4.5237129055767965e-05, + "loss": 0.223, + "step": 15685 + }, + { + "epoch": 0.2797774052010131, + "grad_norm": 0.2710441052913666, + "learning_rate": 4.523621512816049e-05, + "loss": 0.1601, + "step": 15686 + }, + { + "epoch": 0.27979524132272676, + "grad_norm": 0.25729870796203613, + "learning_rate": 4.5235301122110465e-05, + "loss": 0.1394, + "step": 15687 + }, + { + "epoch": 0.2798130774444405, + "grad_norm": 0.299956351518631, + "learning_rate": 4.5234387037621415e-05, + "loss": 0.1697, + "step": 15688 + }, + { + "epoch": 0.2798309135661542, + "grad_norm": 0.24831891059875488, + "learning_rate": 4.523347287469689e-05, + "loss": 0.1732, + "step": 15689 + }, + { + "epoch": 0.2798487496878679, + "grad_norm": 0.33137819170951843, + "learning_rate": 4.5232558633340436e-05, + "loss": 0.2079, + "step": 15690 + }, + { + "epoch": 0.27986658580958157, + "grad_norm": 0.2328789383172989, + "learning_rate": 4.52316443135556e-05, + "loss": 0.2031, + "step": 15691 + }, + { + "epoch": 0.27988442193129526, + "grad_norm": 0.2618556320667267, + "learning_rate": 4.5230729915345924e-05, + "loss": 0.1432, + "step": 15692 + }, + { + "epoch": 0.27990225805300895, + "grad_norm": 0.257379949092865, + "learning_rate": 4.522981543871495e-05, + "loss": 0.1899, + "step": 15693 + }, + { + "epoch": 0.27992009417472263, + "grad_norm": 0.18314041197299957, + "learning_rate": 4.5228900883666224e-05, + "loss": 0.1298, + "step": 15694 + }, + { + "epoch": 0.2799379302964363, + "grad_norm": 0.3744262158870697, + "learning_rate": 4.52279862502033e-05, + "loss": 0.2006, + "step": 15695 + }, + { + "epoch": 0.27995576641815006, + "grad_norm": 0.2649403512477875, + "learning_rate": 4.52270715383297e-05, + "loss": 0.1696, + "step": 15696 + }, + { + "epoch": 0.27997360253986375, + "grad_norm": 0.2586050033569336, + "learning_rate": 4.5226156748049e-05, + "loss": 0.1774, + "step": 15697 + }, + { + "epoch": 0.27999143866157744, + "grad_norm": 0.31558698415756226, + "learning_rate": 4.5225241879364724e-05, + "loss": 0.2203, + "step": 15698 + }, + { + "epoch": 0.28000927478329113, + "grad_norm": 0.2562309503555298, + "learning_rate": 4.5224326932280414e-05, + "loss": 0.2075, + "step": 15699 + }, + { + "epoch": 0.2800271109050048, + "grad_norm": 0.27281156182289124, + "learning_rate": 4.522341190679964e-05, + "loss": 0.1466, + "step": 15700 + }, + { + "epoch": 0.2800449470267185, + "grad_norm": 0.24587777256965637, + "learning_rate": 4.522249680292593e-05, + "loss": 0.2128, + "step": 15701 + }, + { + "epoch": 0.2800627831484322, + "grad_norm": 0.2370014786720276, + "learning_rate": 4.5221581620662845e-05, + "loss": 0.1666, + "step": 15702 + }, + { + "epoch": 0.2800806192701459, + "grad_norm": 0.3908672034740448, + "learning_rate": 4.522066636001392e-05, + "loss": 0.1888, + "step": 15703 + }, + { + "epoch": 0.28009845539185957, + "grad_norm": 0.556275486946106, + "learning_rate": 4.52197510209827e-05, + "loss": 0.1988, + "step": 15704 + }, + { + "epoch": 0.2801162915135733, + "grad_norm": 0.17236942052841187, + "learning_rate": 4.521883560357276e-05, + "loss": 0.1236, + "step": 15705 + }, + { + "epoch": 0.280134127635287, + "grad_norm": 0.3039679229259491, + "learning_rate": 4.521792010778761e-05, + "loss": 0.1851, + "step": 15706 + }, + { + "epoch": 0.2801519637570007, + "grad_norm": 0.27140864729881287, + "learning_rate": 4.521700453363083e-05, + "loss": 0.1735, + "step": 15707 + }, + { + "epoch": 0.2801697998787144, + "grad_norm": 0.3555116355419159, + "learning_rate": 4.521608888110597e-05, + "loss": 0.2011, + "step": 15708 + }, + { + "epoch": 0.28018763600042806, + "grad_norm": 0.33275192975997925, + "learning_rate": 4.521517315021655e-05, + "loss": 0.1947, + "step": 15709 + }, + { + "epoch": 0.28020547212214175, + "grad_norm": 0.23747345805168152, + "learning_rate": 4.5214257340966134e-05, + "loss": 0.1444, + "step": 15710 + }, + { + "epoch": 0.28022330824385544, + "grad_norm": 0.2809229791164398, + "learning_rate": 4.521334145335828e-05, + "loss": 0.1698, + "step": 15711 + }, + { + "epoch": 0.28024114436556913, + "grad_norm": 0.2740756869316101, + "learning_rate": 4.521242548739654e-05, + "loss": 0.1943, + "step": 15712 + }, + { + "epoch": 0.28025898048728287, + "grad_norm": 0.2608456611633301, + "learning_rate": 4.5211509443084456e-05, + "loss": 0.1756, + "step": 15713 + }, + { + "epoch": 0.28027681660899656, + "grad_norm": 0.20253300666809082, + "learning_rate": 4.5210593320425576e-05, + "loss": 0.2273, + "step": 15714 + }, + { + "epoch": 0.28029465273071025, + "grad_norm": 0.3151959180831909, + "learning_rate": 4.520967711942345e-05, + "loss": 0.1583, + "step": 15715 + }, + { + "epoch": 0.28031248885242394, + "grad_norm": 0.2235242873430252, + "learning_rate": 4.520876084008164e-05, + "loss": 0.1596, + "step": 15716 + }, + { + "epoch": 0.2803303249741376, + "grad_norm": 0.2532644271850586, + "learning_rate": 4.52078444824037e-05, + "loss": 0.1838, + "step": 15717 + }, + { + "epoch": 0.2803481610958513, + "grad_norm": 0.27217957377433777, + "learning_rate": 4.520692804639317e-05, + "loss": 0.1935, + "step": 15718 + }, + { + "epoch": 0.280365997217565, + "grad_norm": 0.42854583263397217, + "learning_rate": 4.5206011532053606e-05, + "loss": 0.183, + "step": 15719 + }, + { + "epoch": 0.2803838333392787, + "grad_norm": 0.30840596556663513, + "learning_rate": 4.5205094939388563e-05, + "loss": 0.1908, + "step": 15720 + }, + { + "epoch": 0.2804016694609924, + "grad_norm": 0.20168592035770416, + "learning_rate": 4.5204178268401596e-05, + "loss": 0.1695, + "step": 15721 + }, + { + "epoch": 0.2804195055827061, + "grad_norm": 0.2682773470878601, + "learning_rate": 4.520326151909625e-05, + "loss": 0.172, + "step": 15722 + }, + { + "epoch": 0.2804373417044198, + "grad_norm": 0.3022426962852478, + "learning_rate": 4.52023446914761e-05, + "loss": 0.1383, + "step": 15723 + }, + { + "epoch": 0.2804551778261335, + "grad_norm": 0.3393450081348419, + "learning_rate": 4.520142778554467e-05, + "loss": 0.191, + "step": 15724 + }, + { + "epoch": 0.2804730139478472, + "grad_norm": 0.35501599311828613, + "learning_rate": 4.520051080130553e-05, + "loss": 0.2231, + "step": 15725 + }, + { + "epoch": 0.28049085006956087, + "grad_norm": 0.3100959062576294, + "learning_rate": 4.5199593738762236e-05, + "loss": 0.1522, + "step": 15726 + }, + { + "epoch": 0.28050868619127456, + "grad_norm": 0.37229984998703003, + "learning_rate": 4.5198676597918334e-05, + "loss": 0.1593, + "step": 15727 + }, + { + "epoch": 0.28052652231298825, + "grad_norm": 0.33646801114082336, + "learning_rate": 4.519775937877739e-05, + "loss": 0.2099, + "step": 15728 + }, + { + "epoch": 0.28054435843470193, + "grad_norm": 0.2767389416694641, + "learning_rate": 4.5196842081342955e-05, + "loss": 0.245, + "step": 15729 + }, + { + "epoch": 0.2805621945564157, + "grad_norm": 0.24052877724170685, + "learning_rate": 4.5195924705618585e-05, + "loss": 0.2056, + "step": 15730 + }, + { + "epoch": 0.28058003067812937, + "grad_norm": 0.31136706471443176, + "learning_rate": 4.519500725160783e-05, + "loss": 0.2343, + "step": 15731 + }, + { + "epoch": 0.28059786679984305, + "grad_norm": 0.258151650428772, + "learning_rate": 4.519408971931426e-05, + "loss": 0.2266, + "step": 15732 + }, + { + "epoch": 0.28061570292155674, + "grad_norm": 0.27437925338745117, + "learning_rate": 4.5193172108741415e-05, + "loss": 0.1958, + "step": 15733 + }, + { + "epoch": 0.28063353904327043, + "grad_norm": 0.2852654457092285, + "learning_rate": 4.5192254419892865e-05, + "loss": 0.1703, + "step": 15734 + }, + { + "epoch": 0.2806513751649841, + "grad_norm": 0.36328765749931335, + "learning_rate": 4.519133665277216e-05, + "loss": 0.1946, + "step": 15735 + }, + { + "epoch": 0.2806692112866978, + "grad_norm": 0.19724664092063904, + "learning_rate": 4.519041880738287e-05, + "loss": 0.1359, + "step": 15736 + }, + { + "epoch": 0.2806870474084115, + "grad_norm": 0.29698875546455383, + "learning_rate": 4.5189500883728534e-05, + "loss": 0.1542, + "step": 15737 + }, + { + "epoch": 0.2807048835301252, + "grad_norm": 0.38125431537628174, + "learning_rate": 4.518858288181272e-05, + "loss": 0.1667, + "step": 15738 + }, + { + "epoch": 0.2807227196518389, + "grad_norm": 0.24442516267299652, + "learning_rate": 4.5187664801638984e-05, + "loss": 0.1305, + "step": 15739 + }, + { + "epoch": 0.2807405557735526, + "grad_norm": 0.40529268980026245, + "learning_rate": 4.518674664321089e-05, + "loss": 0.2371, + "step": 15740 + }, + { + "epoch": 0.2807583918952663, + "grad_norm": 0.25345057249069214, + "learning_rate": 4.518582840653199e-05, + "loss": 0.1759, + "step": 15741 + }, + { + "epoch": 0.28077622801698, + "grad_norm": 0.19947724044322968, + "learning_rate": 4.518491009160585e-05, + "loss": 0.1532, + "step": 15742 + }, + { + "epoch": 0.2807940641386937, + "grad_norm": 0.2914084196090698, + "learning_rate": 4.5183991698436035e-05, + "loss": 0.2232, + "step": 15743 + }, + { + "epoch": 0.28081190026040737, + "grad_norm": 0.2936772108078003, + "learning_rate": 4.5183073227026084e-05, + "loss": 0.1872, + "step": 15744 + }, + { + "epoch": 0.28082973638212105, + "grad_norm": 0.3514951169490814, + "learning_rate": 4.5182154677379576e-05, + "loss": 0.2488, + "step": 15745 + }, + { + "epoch": 0.28084757250383474, + "grad_norm": 0.27922895550727844, + "learning_rate": 4.518123604950006e-05, + "loss": 0.1941, + "step": 15746 + }, + { + "epoch": 0.2808654086255485, + "grad_norm": 0.30157873034477234, + "learning_rate": 4.518031734339111e-05, + "loss": 0.1995, + "step": 15747 + }, + { + "epoch": 0.2808832447472622, + "grad_norm": 0.19730991125106812, + "learning_rate": 4.517939855905628e-05, + "loss": 0.1369, + "step": 15748 + }, + { + "epoch": 0.28090108086897586, + "grad_norm": 0.3510364890098572, + "learning_rate": 4.517847969649913e-05, + "loss": 0.2193, + "step": 15749 + }, + { + "epoch": 0.28091891699068955, + "grad_norm": 0.18678249418735504, + "learning_rate": 4.5177560755723226e-05, + "loss": 0.1495, + "step": 15750 + }, + { + "epoch": 0.28093675311240324, + "grad_norm": 0.3212592601776123, + "learning_rate": 4.5176641736732116e-05, + "loss": 0.1873, + "step": 15751 + }, + { + "epoch": 0.2809545892341169, + "grad_norm": 0.25958016514778137, + "learning_rate": 4.5175722639529386e-05, + "loss": 0.1817, + "step": 15752 + }, + { + "epoch": 0.2809724253558306, + "grad_norm": 0.23775915801525116, + "learning_rate": 4.517480346411858e-05, + "loss": 0.1857, + "step": 15753 + }, + { + "epoch": 0.2809902614775443, + "grad_norm": 0.3295690715312958, + "learning_rate": 4.517388421050327e-05, + "loss": 0.1632, + "step": 15754 + }, + { + "epoch": 0.28100809759925804, + "grad_norm": 0.36992278695106506, + "learning_rate": 4.517296487868702e-05, + "loss": 0.2102, + "step": 15755 + }, + { + "epoch": 0.28102593372097173, + "grad_norm": 0.23655936121940613, + "learning_rate": 4.517204546867338e-05, + "loss": 0.1871, + "step": 15756 + }, + { + "epoch": 0.2810437698426854, + "grad_norm": 0.24623289704322815, + "learning_rate": 4.517112598046593e-05, + "loss": 0.1502, + "step": 15757 + }, + { + "epoch": 0.2810616059643991, + "grad_norm": 0.3252638578414917, + "learning_rate": 4.517020641406824e-05, + "loss": 0.1408, + "step": 15758 + }, + { + "epoch": 0.2810794420861128, + "grad_norm": 0.2367999404668808, + "learning_rate": 4.516928676948385e-05, + "loss": 0.1159, + "step": 15759 + }, + { + "epoch": 0.2810972782078265, + "grad_norm": 0.253127783536911, + "learning_rate": 4.516836704671634e-05, + "loss": 0.1875, + "step": 15760 + }, + { + "epoch": 0.28111511432954017, + "grad_norm": 0.3281935751438141, + "learning_rate": 4.516744724576928e-05, + "loss": 0.1416, + "step": 15761 + }, + { + "epoch": 0.28113295045125386, + "grad_norm": 0.593634843826294, + "learning_rate": 4.516652736664623e-05, + "loss": 0.2005, + "step": 15762 + }, + { + "epoch": 0.28115078657296755, + "grad_norm": 0.38164636492729187, + "learning_rate": 4.516560740935074e-05, + "loss": 0.1912, + "step": 15763 + }, + { + "epoch": 0.2811686226946813, + "grad_norm": 0.4401412308216095, + "learning_rate": 4.5164687373886403e-05, + "loss": 0.1904, + "step": 15764 + }, + { + "epoch": 0.281186458816395, + "grad_norm": 0.2834833860397339, + "learning_rate": 4.5163767260256774e-05, + "loss": 0.1962, + "step": 15765 + }, + { + "epoch": 0.28120429493810867, + "grad_norm": 0.36803925037384033, + "learning_rate": 4.516284706846541e-05, + "loss": 0.2246, + "step": 15766 + }, + { + "epoch": 0.28122213105982236, + "grad_norm": 0.3717997670173645, + "learning_rate": 4.516192679851589e-05, + "loss": 0.1965, + "step": 15767 + }, + { + "epoch": 0.28123996718153604, + "grad_norm": 0.5259448885917664, + "learning_rate": 4.516100645041178e-05, + "loss": 0.1999, + "step": 15768 + }, + { + "epoch": 0.28125780330324973, + "grad_norm": 0.23925939202308655, + "learning_rate": 4.5160086024156644e-05, + "loss": 0.152, + "step": 15769 + }, + { + "epoch": 0.2812756394249634, + "grad_norm": 0.26688462495803833, + "learning_rate": 4.515916551975406e-05, + "loss": 0.2263, + "step": 15770 + }, + { + "epoch": 0.2812934755466771, + "grad_norm": 0.21828119456768036, + "learning_rate": 4.515824493720757e-05, + "loss": 0.1737, + "step": 15771 + }, + { + "epoch": 0.28131131166839085, + "grad_norm": 0.26910701394081116, + "learning_rate": 4.515732427652077e-05, + "loss": 0.162, + "step": 15772 + }, + { + "epoch": 0.28132914779010454, + "grad_norm": 0.3421975374221802, + "learning_rate": 4.515640353769722e-05, + "loss": 0.1402, + "step": 15773 + }, + { + "epoch": 0.2813469839118182, + "grad_norm": 0.23494797945022583, + "learning_rate": 4.515548272074049e-05, + "loss": 0.2232, + "step": 15774 + }, + { + "epoch": 0.2813648200335319, + "grad_norm": 0.24409538507461548, + "learning_rate": 4.5154561825654144e-05, + "loss": 0.165, + "step": 15775 + }, + { + "epoch": 0.2813826561552456, + "grad_norm": 0.23689110577106476, + "learning_rate": 4.515364085244176e-05, + "loss": 0.2181, + "step": 15776 + }, + { + "epoch": 0.2814004922769593, + "grad_norm": 0.3461141884326935, + "learning_rate": 4.51527198011069e-05, + "loss": 0.1654, + "step": 15777 + }, + { + "epoch": 0.281418328398673, + "grad_norm": 0.33549702167510986, + "learning_rate": 4.5151798671653134e-05, + "loss": 0.1857, + "step": 15778 + }, + { + "epoch": 0.28143616452038667, + "grad_norm": 0.22499185800552368, + "learning_rate": 4.515087746408404e-05, + "loss": 0.1889, + "step": 15779 + }, + { + "epoch": 0.28145400064210035, + "grad_norm": 0.25467947125434875, + "learning_rate": 4.514995617840318e-05, + "loss": 0.2109, + "step": 15780 + }, + { + "epoch": 0.2814718367638141, + "grad_norm": 0.2677212059497833, + "learning_rate": 4.514903481461414e-05, + "loss": 0.1932, + "step": 15781 + }, + { + "epoch": 0.2814896728855278, + "grad_norm": 0.23083485662937164, + "learning_rate": 4.514811337272048e-05, + "loss": 0.1351, + "step": 15782 + }, + { + "epoch": 0.2815075090072415, + "grad_norm": 0.24680112302303314, + "learning_rate": 4.514719185272577e-05, + "loss": 0.1678, + "step": 15783 + }, + { + "epoch": 0.28152534512895516, + "grad_norm": 0.3104582726955414, + "learning_rate": 4.5146270254633584e-05, + "loss": 0.1704, + "step": 15784 + }, + { + "epoch": 0.28154318125066885, + "grad_norm": 0.33675116300582886, + "learning_rate": 4.5145348578447495e-05, + "loss": 0.1836, + "step": 15785 + }, + { + "epoch": 0.28156101737238254, + "grad_norm": 0.1662835329771042, + "learning_rate": 4.514442682417108e-05, + "loss": 0.1127, + "step": 15786 + }, + { + "epoch": 0.2815788534940962, + "grad_norm": 0.36040014028549194, + "learning_rate": 4.51435049918079e-05, + "loss": 0.1752, + "step": 15787 + }, + { + "epoch": 0.2815966896158099, + "grad_norm": 0.2832331657409668, + "learning_rate": 4.514258308136156e-05, + "loss": 0.1729, + "step": 15788 + }, + { + "epoch": 0.28161452573752366, + "grad_norm": 0.4248417615890503, + "learning_rate": 4.51416610928356e-05, + "loss": 0.1724, + "step": 15789 + }, + { + "epoch": 0.28163236185923735, + "grad_norm": 0.27826637029647827, + "learning_rate": 4.514073902623359e-05, + "loss": 0.133, + "step": 15790 + }, + { + "epoch": 0.28165019798095103, + "grad_norm": 0.26783037185668945, + "learning_rate": 4.5139816881559137e-05, + "loss": 0.1979, + "step": 15791 + }, + { + "epoch": 0.2816680341026647, + "grad_norm": 0.25952044129371643, + "learning_rate": 4.513889465881579e-05, + "loss": 0.1656, + "step": 15792 + }, + { + "epoch": 0.2816858702243784, + "grad_norm": 0.2946554720401764, + "learning_rate": 4.513797235800713e-05, + "loss": 0.205, + "step": 15793 + }, + { + "epoch": 0.2817037063460921, + "grad_norm": 0.32601019740104675, + "learning_rate": 4.513704997913673e-05, + "loss": 0.2318, + "step": 15794 + }, + { + "epoch": 0.2817215424678058, + "grad_norm": 0.3385814130306244, + "learning_rate": 4.513612752220818e-05, + "loss": 0.1569, + "step": 15795 + }, + { + "epoch": 0.2817393785895195, + "grad_norm": 0.36846330761909485, + "learning_rate": 4.5135204987225044e-05, + "loss": 0.1862, + "step": 15796 + }, + { + "epoch": 0.2817572147112332, + "grad_norm": 0.4911366105079651, + "learning_rate": 4.5134282374190896e-05, + "loss": 0.1768, + "step": 15797 + }, + { + "epoch": 0.2817750508329469, + "grad_norm": 0.3188064992427826, + "learning_rate": 4.5133359683109316e-05, + "loss": 0.2016, + "step": 15798 + }, + { + "epoch": 0.2817928869546606, + "grad_norm": 0.209141805768013, + "learning_rate": 4.5132436913983875e-05, + "loss": 0.1693, + "step": 15799 + }, + { + "epoch": 0.2818107230763743, + "grad_norm": 0.24392643570899963, + "learning_rate": 4.513151406681817e-05, + "loss": 0.1903, + "step": 15800 + }, + { + "epoch": 0.28182855919808797, + "grad_norm": 0.3080965280532837, + "learning_rate": 4.513059114161575e-05, + "loss": 0.1791, + "step": 15801 + }, + { + "epoch": 0.28184639531980166, + "grad_norm": 0.5254833698272705, + "learning_rate": 4.512966813838021e-05, + "loss": 0.2863, + "step": 15802 + }, + { + "epoch": 0.28186423144151534, + "grad_norm": 0.2654857337474823, + "learning_rate": 4.512874505711512e-05, + "loss": 0.1607, + "step": 15803 + }, + { + "epoch": 0.28188206756322903, + "grad_norm": 0.33769118785858154, + "learning_rate": 4.512782189782406e-05, + "loss": 0.174, + "step": 15804 + }, + { + "epoch": 0.2818999036849427, + "grad_norm": 0.30186372995376587, + "learning_rate": 4.512689866051062e-05, + "loss": 0.1966, + "step": 15805 + }, + { + "epoch": 0.28191773980665646, + "grad_norm": 0.2846188247203827, + "learning_rate": 4.512597534517836e-05, + "loss": 0.1708, + "step": 15806 + }, + { + "epoch": 0.28193557592837015, + "grad_norm": 0.2984698414802551, + "learning_rate": 4.512505195183088e-05, + "loss": 0.1305, + "step": 15807 + }, + { + "epoch": 0.28195341205008384, + "grad_norm": 0.27422988414764404, + "learning_rate": 4.5124128480471735e-05, + "loss": 0.1965, + "step": 15808 + }, + { + "epoch": 0.28197124817179753, + "grad_norm": 0.24732251465320587, + "learning_rate": 4.5123204931104524e-05, + "loss": 0.1759, + "step": 15809 + }, + { + "epoch": 0.2819890842935112, + "grad_norm": 0.24280230700969696, + "learning_rate": 4.5122281303732816e-05, + "loss": 0.1907, + "step": 15810 + }, + { + "epoch": 0.2820069204152249, + "grad_norm": 0.2686242163181305, + "learning_rate": 4.5121357598360195e-05, + "loss": 0.2081, + "step": 15811 + }, + { + "epoch": 0.2820247565369386, + "grad_norm": 0.2145245224237442, + "learning_rate": 4.5120433814990246e-05, + "loss": 0.1621, + "step": 15812 + }, + { + "epoch": 0.2820425926586523, + "grad_norm": 0.300889790058136, + "learning_rate": 4.511950995362655e-05, + "loss": 0.1775, + "step": 15813 + }, + { + "epoch": 0.282060428780366, + "grad_norm": 0.18463751673698425, + "learning_rate": 4.511858601427268e-05, + "loss": 0.1686, + "step": 15814 + }, + { + "epoch": 0.2820782649020797, + "grad_norm": 0.28218504786491394, + "learning_rate": 4.511766199693222e-05, + "loss": 0.1936, + "step": 15815 + }, + { + "epoch": 0.2820961010237934, + "grad_norm": 0.27262285351753235, + "learning_rate": 4.511673790160875e-05, + "loss": 0.1374, + "step": 15816 + }, + { + "epoch": 0.2821139371455071, + "grad_norm": 0.29053056240081787, + "learning_rate": 4.5115813728305865e-05, + "loss": 0.138, + "step": 15817 + }, + { + "epoch": 0.2821317732672208, + "grad_norm": 0.3268433213233948, + "learning_rate": 4.511488947702714e-05, + "loss": 0.1758, + "step": 15818 + }, + { + "epoch": 0.28214960938893446, + "grad_norm": 0.3032713234424591, + "learning_rate": 4.511396514777615e-05, + "loss": 0.1281, + "step": 15819 + }, + { + "epoch": 0.28216744551064815, + "grad_norm": 0.41434139013290405, + "learning_rate": 4.511304074055648e-05, + "loss": 0.1791, + "step": 15820 + }, + { + "epoch": 0.28218528163236184, + "grad_norm": 0.31003856658935547, + "learning_rate": 4.511211625537172e-05, + "loss": 0.1785, + "step": 15821 + }, + { + "epoch": 0.2822031177540755, + "grad_norm": 0.34010785818099976, + "learning_rate": 4.511119169222545e-05, + "loss": 0.2364, + "step": 15822 + }, + { + "epoch": 0.28222095387578927, + "grad_norm": 0.2527099847793579, + "learning_rate": 4.511026705112126e-05, + "loss": 0.186, + "step": 15823 + }, + { + "epoch": 0.28223878999750296, + "grad_norm": 0.3292764723300934, + "learning_rate": 4.510934233206273e-05, + "loss": 0.1783, + "step": 15824 + }, + { + "epoch": 0.28225662611921665, + "grad_norm": 0.22024333477020264, + "learning_rate": 4.5108417535053436e-05, + "loss": 0.128, + "step": 15825 + }, + { + "epoch": 0.28227446224093033, + "grad_norm": 0.24847577512264252, + "learning_rate": 4.510749266009697e-05, + "loss": 0.1374, + "step": 15826 + }, + { + "epoch": 0.282292298362644, + "grad_norm": 0.26582664251327515, + "learning_rate": 4.510656770719693e-05, + "loss": 0.2296, + "step": 15827 + }, + { + "epoch": 0.2823101344843577, + "grad_norm": 0.28152212500572205, + "learning_rate": 4.510564267635688e-05, + "loss": 0.1467, + "step": 15828 + }, + { + "epoch": 0.2823279706060714, + "grad_norm": 0.31269505620002747, + "learning_rate": 4.5104717567580415e-05, + "loss": 0.1937, + "step": 15829 + }, + { + "epoch": 0.2823458067277851, + "grad_norm": 0.2444497048854828, + "learning_rate": 4.510379238087112e-05, + "loss": 0.1641, + "step": 15830 + }, + { + "epoch": 0.28236364284949883, + "grad_norm": 0.2732645869255066, + "learning_rate": 4.5102867116232586e-05, + "loss": 0.2163, + "step": 15831 + }, + { + "epoch": 0.2823814789712125, + "grad_norm": 0.20664900541305542, + "learning_rate": 4.5101941773668396e-05, + "loss": 0.1303, + "step": 15832 + }, + { + "epoch": 0.2823993150929262, + "grad_norm": 0.31389695405960083, + "learning_rate": 4.510101635318213e-05, + "loss": 0.1555, + "step": 15833 + }, + { + "epoch": 0.2824171512146399, + "grad_norm": 0.28889691829681396, + "learning_rate": 4.510009085477739e-05, + "loss": 0.2385, + "step": 15834 + }, + { + "epoch": 0.2824349873363536, + "grad_norm": 0.3446749448776245, + "learning_rate": 4.5099165278457746e-05, + "loss": 0.2041, + "step": 15835 + }, + { + "epoch": 0.28245282345806727, + "grad_norm": 0.3042996823787689, + "learning_rate": 4.50982396242268e-05, + "loss": 0.2148, + "step": 15836 + }, + { + "epoch": 0.28247065957978096, + "grad_norm": 0.2645481526851654, + "learning_rate": 4.5097313892088136e-05, + "loss": 0.1815, + "step": 15837 + }, + { + "epoch": 0.28248849570149465, + "grad_norm": 0.3444783389568329, + "learning_rate": 4.509638808204535e-05, + "loss": 0.2059, + "step": 15838 + }, + { + "epoch": 0.28250633182320833, + "grad_norm": 0.265800803899765, + "learning_rate": 4.509546219410201e-05, + "loss": 0.1775, + "step": 15839 + }, + { + "epoch": 0.2825241679449221, + "grad_norm": 0.3954220414161682, + "learning_rate": 4.509453622826172e-05, + "loss": 0.1609, + "step": 15840 + }, + { + "epoch": 0.28254200406663577, + "grad_norm": 0.27506983280181885, + "learning_rate": 4.509361018452807e-05, + "loss": 0.2039, + "step": 15841 + }, + { + "epoch": 0.28255984018834945, + "grad_norm": 0.25239405035972595, + "learning_rate": 4.509268406290465e-05, + "loss": 0.183, + "step": 15842 + }, + { + "epoch": 0.28257767631006314, + "grad_norm": 0.236806720495224, + "learning_rate": 4.5091757863395045e-05, + "loss": 0.1698, + "step": 15843 + }, + { + "epoch": 0.28259551243177683, + "grad_norm": 0.24222493171691895, + "learning_rate": 4.509083158600285e-05, + "loss": 0.1739, + "step": 15844 + }, + { + "epoch": 0.2826133485534905, + "grad_norm": 0.23812562227249146, + "learning_rate": 4.508990523073164e-05, + "loss": 0.163, + "step": 15845 + }, + { + "epoch": 0.2826311846752042, + "grad_norm": 0.28616097569465637, + "learning_rate": 4.508897879758502e-05, + "loss": 0.1929, + "step": 15846 + }, + { + "epoch": 0.2826490207969179, + "grad_norm": 0.24026015400886536, + "learning_rate": 4.5088052286566596e-05, + "loss": 0.1424, + "step": 15847 + }, + { + "epoch": 0.28266685691863164, + "grad_norm": 0.2622207701206207, + "learning_rate": 4.508712569767993e-05, + "loss": 0.1647, + "step": 15848 + }, + { + "epoch": 0.2826846930403453, + "grad_norm": 0.21435169875621796, + "learning_rate": 4.5086199030928635e-05, + "loss": 0.1697, + "step": 15849 + }, + { + "epoch": 0.282702529162059, + "grad_norm": 0.24638047814369202, + "learning_rate": 4.508527228631629e-05, + "loss": 0.181, + "step": 15850 + }, + { + "epoch": 0.2827203652837727, + "grad_norm": 0.26158127188682556, + "learning_rate": 4.508434546384649e-05, + "loss": 0.1694, + "step": 15851 + }, + { + "epoch": 0.2827382014054864, + "grad_norm": 0.2531074285507202, + "learning_rate": 4.508341856352283e-05, + "loss": 0.1967, + "step": 15852 + }, + { + "epoch": 0.2827560375272001, + "grad_norm": 0.31923919916152954, + "learning_rate": 4.5082491585348904e-05, + "loss": 0.2228, + "step": 15853 + }, + { + "epoch": 0.28277387364891376, + "grad_norm": 0.3040517568588257, + "learning_rate": 4.508156452932831e-05, + "loss": 0.1731, + "step": 15854 + }, + { + "epoch": 0.28279170977062745, + "grad_norm": 0.25326067209243774, + "learning_rate": 4.508063739546463e-05, + "loss": 0.2227, + "step": 15855 + }, + { + "epoch": 0.2828095458923412, + "grad_norm": 0.26069650053977966, + "learning_rate": 4.507971018376147e-05, + "loss": 0.1718, + "step": 15856 + }, + { + "epoch": 0.2828273820140549, + "grad_norm": 0.33046719431877136, + "learning_rate": 4.507878289422242e-05, + "loss": 0.2234, + "step": 15857 + }, + { + "epoch": 0.2828452181357686, + "grad_norm": 0.266009658575058, + "learning_rate": 4.507785552685106e-05, + "loss": 0.1793, + "step": 15858 + }, + { + "epoch": 0.28286305425748226, + "grad_norm": 0.20960992574691772, + "learning_rate": 4.507692808165101e-05, + "loss": 0.1672, + "step": 15859 + }, + { + "epoch": 0.28288089037919595, + "grad_norm": 0.29292190074920654, + "learning_rate": 4.507600055862584e-05, + "loss": 0.1763, + "step": 15860 + }, + { + "epoch": 0.28289872650090964, + "grad_norm": 0.24633224308490753, + "learning_rate": 4.5075072957779166e-05, + "loss": 0.1986, + "step": 15861 + }, + { + "epoch": 0.2829165626226233, + "grad_norm": 0.24920177459716797, + "learning_rate": 4.5074145279114574e-05, + "loss": 0.1443, + "step": 15862 + }, + { + "epoch": 0.282934398744337, + "grad_norm": 0.2822324335575104, + "learning_rate": 4.507321752263566e-05, + "loss": 0.1893, + "step": 15863 + }, + { + "epoch": 0.2829522348660507, + "grad_norm": 0.27641648054122925, + "learning_rate": 4.507228968834602e-05, + "loss": 0.1848, + "step": 15864 + }, + { + "epoch": 0.28297007098776444, + "grad_norm": 0.4224631190299988, + "learning_rate": 4.5071361776249253e-05, + "loss": 0.2127, + "step": 15865 + }, + { + "epoch": 0.28298790710947813, + "grad_norm": 0.2558792531490326, + "learning_rate": 4.5070433786348964e-05, + "loss": 0.1771, + "step": 15866 + }, + { + "epoch": 0.2830057432311918, + "grad_norm": 0.2531677782535553, + "learning_rate": 4.506950571864873e-05, + "loss": 0.1854, + "step": 15867 + }, + { + "epoch": 0.2830235793529055, + "grad_norm": 0.22042444348335266, + "learning_rate": 4.506857757315217e-05, + "loss": 0.1594, + "step": 15868 + }, + { + "epoch": 0.2830414154746192, + "grad_norm": 0.2552710473537445, + "learning_rate": 4.506764934986287e-05, + "loss": 0.1441, + "step": 15869 + }, + { + "epoch": 0.2830592515963329, + "grad_norm": 0.27962878346443176, + "learning_rate": 4.506672104878442e-05, + "loss": 0.1876, + "step": 15870 + }, + { + "epoch": 0.28307708771804657, + "grad_norm": 0.29624566435813904, + "learning_rate": 4.5065792669920434e-05, + "loss": 0.206, + "step": 15871 + }, + { + "epoch": 0.28309492383976026, + "grad_norm": 0.3222978413105011, + "learning_rate": 4.506486421327451e-05, + "loss": 0.1811, + "step": 15872 + }, + { + "epoch": 0.283112759961474, + "grad_norm": 0.2283131331205368, + "learning_rate": 4.506393567885024e-05, + "loss": 0.1801, + "step": 15873 + }, + { + "epoch": 0.2831305960831877, + "grad_norm": 0.2531171143054962, + "learning_rate": 4.506300706665122e-05, + "loss": 0.178, + "step": 15874 + }, + { + "epoch": 0.2831484322049014, + "grad_norm": 0.2347114235162735, + "learning_rate": 4.506207837668106e-05, + "loss": 0.1604, + "step": 15875 + }, + { + "epoch": 0.28316626832661507, + "grad_norm": 0.26539233326911926, + "learning_rate": 4.506114960894335e-05, + "loss": 0.1486, + "step": 15876 + }, + { + "epoch": 0.28318410444832876, + "grad_norm": 0.30372369289398193, + "learning_rate": 4.5060220763441706e-05, + "loss": 0.1856, + "step": 15877 + }, + { + "epoch": 0.28320194057004244, + "grad_norm": 0.2657814621925354, + "learning_rate": 4.5059291840179705e-05, + "loss": 0.1849, + "step": 15878 + }, + { + "epoch": 0.28321977669175613, + "grad_norm": 0.2818485200405121, + "learning_rate": 4.5058362839160974e-05, + "loss": 0.1619, + "step": 15879 + }, + { + "epoch": 0.2832376128134698, + "grad_norm": 0.22816427052021027, + "learning_rate": 4.505743376038909e-05, + "loss": 0.1468, + "step": 15880 + }, + { + "epoch": 0.2832554489351835, + "grad_norm": 0.43867549300193787, + "learning_rate": 4.505650460386767e-05, + "loss": 0.1784, + "step": 15881 + }, + { + "epoch": 0.28327328505689725, + "grad_norm": 0.2857999801635742, + "learning_rate": 4.5055575369600314e-05, + "loss": 0.1998, + "step": 15882 + }, + { + "epoch": 0.28329112117861094, + "grad_norm": 0.41786709427833557, + "learning_rate": 4.505464605759061e-05, + "loss": 0.2069, + "step": 15883 + }, + { + "epoch": 0.2833089573003246, + "grad_norm": 0.32303425669670105, + "learning_rate": 4.505371666784218e-05, + "loss": 0.2014, + "step": 15884 + }, + { + "epoch": 0.2833267934220383, + "grad_norm": 0.1937633454799652, + "learning_rate": 4.505278720035862e-05, + "loss": 0.1999, + "step": 15885 + }, + { + "epoch": 0.283344629543752, + "grad_norm": 0.36285242438316345, + "learning_rate": 4.5051857655143525e-05, + "loss": 0.1679, + "step": 15886 + }, + { + "epoch": 0.2833624656654657, + "grad_norm": 0.3296329975128174, + "learning_rate": 4.50509280322005e-05, + "loss": 0.2202, + "step": 15887 + }, + { + "epoch": 0.2833803017871794, + "grad_norm": 0.27836042642593384, + "learning_rate": 4.504999833153316e-05, + "loss": 0.1962, + "step": 15888 + }, + { + "epoch": 0.28339813790889307, + "grad_norm": 0.1825222373008728, + "learning_rate": 4.50490685531451e-05, + "loss": 0.16, + "step": 15889 + }, + { + "epoch": 0.2834159740306068, + "grad_norm": 0.3483729660511017, + "learning_rate": 4.504813869703992e-05, + "loss": 0.2131, + "step": 15890 + }, + { + "epoch": 0.2834338101523205, + "grad_norm": 0.2789473831653595, + "learning_rate": 4.504720876322124e-05, + "loss": 0.1977, + "step": 15891 + }, + { + "epoch": 0.2834516462740342, + "grad_norm": 0.31971925497055054, + "learning_rate": 4.504627875169265e-05, + "loss": 0.2226, + "step": 15892 + }, + { + "epoch": 0.2834694823957479, + "grad_norm": 0.35134172439575195, + "learning_rate": 4.504534866245776e-05, + "loss": 0.1759, + "step": 15893 + }, + { + "epoch": 0.28348731851746156, + "grad_norm": 0.31223055720329285, + "learning_rate": 4.504441849552018e-05, + "loss": 0.2221, + "step": 15894 + }, + { + "epoch": 0.28350515463917525, + "grad_norm": 0.24739018082618713, + "learning_rate": 4.50434882508835e-05, + "loss": 0.1831, + "step": 15895 + }, + { + "epoch": 0.28352299076088894, + "grad_norm": 0.3248162269592285, + "learning_rate": 4.504255792855134e-05, + "loss": 0.1543, + "step": 15896 + }, + { + "epoch": 0.2835408268826026, + "grad_norm": 0.18326567113399506, + "learning_rate": 4.504162752852731e-05, + "loss": 0.1481, + "step": 15897 + }, + { + "epoch": 0.28355866300431637, + "grad_norm": 0.24415653944015503, + "learning_rate": 4.5040697050815e-05, + "loss": 0.2105, + "step": 15898 + }, + { + "epoch": 0.28357649912603006, + "grad_norm": 0.27825552225112915, + "learning_rate": 4.503976649541803e-05, + "loss": 0.2015, + "step": 15899 + }, + { + "epoch": 0.28359433524774375, + "grad_norm": 0.3033321797847748, + "learning_rate": 4.503883586234001e-05, + "loss": 0.2465, + "step": 15900 + }, + { + "epoch": 0.28361217136945743, + "grad_norm": 0.29683420062065125, + "learning_rate": 4.503790515158453e-05, + "loss": 0.2302, + "step": 15901 + }, + { + "epoch": 0.2836300074911711, + "grad_norm": 0.2019442319869995, + "learning_rate": 4.503697436315522e-05, + "loss": 0.1817, + "step": 15902 + }, + { + "epoch": 0.2836478436128848, + "grad_norm": 0.23974061012268066, + "learning_rate": 4.503604349705567e-05, + "loss": 0.1731, + "step": 15903 + }, + { + "epoch": 0.2836656797345985, + "grad_norm": 0.3597039580345154, + "learning_rate": 4.5035112553289495e-05, + "loss": 0.1756, + "step": 15904 + }, + { + "epoch": 0.2836835158563122, + "grad_norm": 0.30218592286109924, + "learning_rate": 4.50341815318603e-05, + "loss": 0.1947, + "step": 15905 + }, + { + "epoch": 0.2837013519780259, + "grad_norm": 0.38055992126464844, + "learning_rate": 4.503325043277171e-05, + "loss": 0.1545, + "step": 15906 + }, + { + "epoch": 0.2837191880997396, + "grad_norm": 0.2936986982822418, + "learning_rate": 4.5032319256027314e-05, + "loss": 0.1945, + "step": 15907 + }, + { + "epoch": 0.2837370242214533, + "grad_norm": 0.2647310793399811, + "learning_rate": 4.5031388001630726e-05, + "loss": 0.1562, + "step": 15908 + }, + { + "epoch": 0.283754860343167, + "grad_norm": 0.23462165892124176, + "learning_rate": 4.5030456669585564e-05, + "loss": 0.1405, + "step": 15909 + }, + { + "epoch": 0.2837726964648807, + "grad_norm": 0.20205175876617432, + "learning_rate": 4.502952525989543e-05, + "loss": 0.1506, + "step": 15910 + }, + { + "epoch": 0.28379053258659437, + "grad_norm": 0.2899705171585083, + "learning_rate": 4.502859377256395e-05, + "loss": 0.1772, + "step": 15911 + }, + { + "epoch": 0.28380836870830806, + "grad_norm": 0.3019639849662781, + "learning_rate": 4.502766220759471e-05, + "loss": 0.1599, + "step": 15912 + }, + { + "epoch": 0.28382620483002174, + "grad_norm": 0.28742510080337524, + "learning_rate": 4.5026730564991334e-05, + "loss": 0.1881, + "step": 15913 + }, + { + "epoch": 0.28384404095173543, + "grad_norm": 0.3420644998550415, + "learning_rate": 4.5025798844757444e-05, + "loss": 0.1596, + "step": 15914 + }, + { + "epoch": 0.2838618770734492, + "grad_norm": 0.3835827708244324, + "learning_rate": 4.5024867046896636e-05, + "loss": 0.187, + "step": 15915 + }, + { + "epoch": 0.28387971319516286, + "grad_norm": 0.2941904067993164, + "learning_rate": 4.502393517141252e-05, + "loss": 0.1844, + "step": 15916 + }, + { + "epoch": 0.28389754931687655, + "grad_norm": 0.31132662296295166, + "learning_rate": 4.502300321830872e-05, + "loss": 0.1877, + "step": 15917 + }, + { + "epoch": 0.28391538543859024, + "grad_norm": 0.2771371006965637, + "learning_rate": 4.5022071187588854e-05, + "loss": 0.1941, + "step": 15918 + }, + { + "epoch": 0.28393322156030393, + "grad_norm": 0.24142910540103912, + "learning_rate": 4.5021139079256515e-05, + "loss": 0.1386, + "step": 15919 + }, + { + "epoch": 0.2839510576820176, + "grad_norm": 0.28513261675834656, + "learning_rate": 4.5020206893315325e-05, + "loss": 0.1107, + "step": 15920 + }, + { + "epoch": 0.2839688938037313, + "grad_norm": 0.23792928457260132, + "learning_rate": 4.50192746297689e-05, + "loss": 0.1243, + "step": 15921 + }, + { + "epoch": 0.283986729925445, + "grad_norm": 0.20691817998886108, + "learning_rate": 4.501834228862085e-05, + "loss": 0.1779, + "step": 15922 + }, + { + "epoch": 0.2840045660471587, + "grad_norm": 0.28103986382484436, + "learning_rate": 4.5017409869874795e-05, + "loss": 0.1761, + "step": 15923 + }, + { + "epoch": 0.2840224021688724, + "grad_norm": 0.2802456021308899, + "learning_rate": 4.501647737353434e-05, + "loss": 0.1595, + "step": 15924 + }, + { + "epoch": 0.2840402382905861, + "grad_norm": 0.27944374084472656, + "learning_rate": 4.501554479960312e-05, + "loss": 0.1916, + "step": 15925 + }, + { + "epoch": 0.2840580744122998, + "grad_norm": 0.2746027112007141, + "learning_rate": 4.501461214808472e-05, + "loss": 0.144, + "step": 15926 + }, + { + "epoch": 0.2840759105340135, + "grad_norm": 0.26997241377830505, + "learning_rate": 4.501367941898277e-05, + "loss": 0.1855, + "step": 15927 + }, + { + "epoch": 0.2840937466557272, + "grad_norm": 0.5294459462165833, + "learning_rate": 4.5012746612300896e-05, + "loss": 0.2166, + "step": 15928 + }, + { + "epoch": 0.28411158277744086, + "grad_norm": 0.2522684335708618, + "learning_rate": 4.50118137280427e-05, + "loss": 0.2079, + "step": 15929 + }, + { + "epoch": 0.28412941889915455, + "grad_norm": 0.2531207203865051, + "learning_rate": 4.501088076621179e-05, + "loss": 0.1759, + "step": 15930 + }, + { + "epoch": 0.28414725502086824, + "grad_norm": 0.2434559315443039, + "learning_rate": 4.5009947726811805e-05, + "loss": 0.1976, + "step": 15931 + }, + { + "epoch": 0.284165091142582, + "grad_norm": 0.24981777369976044, + "learning_rate": 4.5009014609846355e-05, + "loss": 0.1526, + "step": 15932 + }, + { + "epoch": 0.28418292726429567, + "grad_norm": 0.2622455656528473, + "learning_rate": 4.500808141531905e-05, + "loss": 0.1646, + "step": 15933 + }, + { + "epoch": 0.28420076338600936, + "grad_norm": 0.3121867775917053, + "learning_rate": 4.500714814323351e-05, + "loss": 0.1749, + "step": 15934 + }, + { + "epoch": 0.28421859950772305, + "grad_norm": 0.29981881380081177, + "learning_rate": 4.500621479359336e-05, + "loss": 0.2304, + "step": 15935 + }, + { + "epoch": 0.28423643562943673, + "grad_norm": 0.34250420331954956, + "learning_rate": 4.500528136640221e-05, + "loss": 0.2151, + "step": 15936 + }, + { + "epoch": 0.2842542717511504, + "grad_norm": 0.23781858384609222, + "learning_rate": 4.5004347861663673e-05, + "loss": 0.1872, + "step": 15937 + }, + { + "epoch": 0.2842721078728641, + "grad_norm": 0.2146347612142563, + "learning_rate": 4.500341427938137e-05, + "loss": 0.191, + "step": 15938 + }, + { + "epoch": 0.2842899439945778, + "grad_norm": 0.22418838739395142, + "learning_rate": 4.500248061955894e-05, + "loss": 0.1292, + "step": 15939 + }, + { + "epoch": 0.2843077801162915, + "grad_norm": 0.3149854838848114, + "learning_rate": 4.500154688219997e-05, + "loss": 0.1538, + "step": 15940 + }, + { + "epoch": 0.28432561623800523, + "grad_norm": 0.24618928134441376, + "learning_rate": 4.500061306730811e-05, + "loss": 0.1931, + "step": 15941 + }, + { + "epoch": 0.2843434523597189, + "grad_norm": 0.26659172773361206, + "learning_rate": 4.499967917488696e-05, + "loss": 0.1544, + "step": 15942 + }, + { + "epoch": 0.2843612884814326, + "grad_norm": 0.30760619044303894, + "learning_rate": 4.4998745204940146e-05, + "loss": 0.1881, + "step": 15943 + }, + { + "epoch": 0.2843791246031463, + "grad_norm": 0.3104797601699829, + "learning_rate": 4.499781115747129e-05, + "loss": 0.1781, + "step": 15944 + }, + { + "epoch": 0.28439696072486, + "grad_norm": 0.2967168092727661, + "learning_rate": 4.499687703248401e-05, + "loss": 0.1874, + "step": 15945 + }, + { + "epoch": 0.28441479684657367, + "grad_norm": 0.2629615366458893, + "learning_rate": 4.4995942829981926e-05, + "loss": 0.1304, + "step": 15946 + }, + { + "epoch": 0.28443263296828736, + "grad_norm": 0.20061103999614716, + "learning_rate": 4.499500854996867e-05, + "loss": 0.1746, + "step": 15947 + }, + { + "epoch": 0.28445046909000105, + "grad_norm": 0.3300549387931824, + "learning_rate": 4.4994074192447836e-05, + "loss": 0.1365, + "step": 15948 + }, + { + "epoch": 0.2844683052117148, + "grad_norm": 0.30984750390052795, + "learning_rate": 4.4993139757423077e-05, + "loss": 0.1467, + "step": 15949 + }, + { + "epoch": 0.2844861413334285, + "grad_norm": 0.29576873779296875, + "learning_rate": 4.4992205244898e-05, + "loss": 0.2013, + "step": 15950 + }, + { + "epoch": 0.28450397745514217, + "grad_norm": 0.2682577967643738, + "learning_rate": 4.4991270654876234e-05, + "loss": 0.1347, + "step": 15951 + }, + { + "epoch": 0.28452181357685585, + "grad_norm": 0.3534597158432007, + "learning_rate": 4.49903359873614e-05, + "loss": 0.1551, + "step": 15952 + }, + { + "epoch": 0.28453964969856954, + "grad_norm": 0.26593804359436035, + "learning_rate": 4.498940124235711e-05, + "loss": 0.1761, + "step": 15953 + }, + { + "epoch": 0.28455748582028323, + "grad_norm": 0.25501754879951477, + "learning_rate": 4.498846641986701e-05, + "loss": 0.1961, + "step": 15954 + }, + { + "epoch": 0.2845753219419969, + "grad_norm": 0.25656792521476746, + "learning_rate": 4.49875315198947e-05, + "loss": 0.1671, + "step": 15955 + }, + { + "epoch": 0.2845931580637106, + "grad_norm": 0.338867723941803, + "learning_rate": 4.498659654244381e-05, + "loss": 0.2031, + "step": 15956 + }, + { + "epoch": 0.28461099418542435, + "grad_norm": 0.2237289696931839, + "learning_rate": 4.498566148751798e-05, + "loss": 0.2255, + "step": 15957 + }, + { + "epoch": 0.28462883030713804, + "grad_norm": 0.31040847301483154, + "learning_rate": 4.4984726355120814e-05, + "loss": 0.1375, + "step": 15958 + }, + { + "epoch": 0.2846466664288517, + "grad_norm": 0.29024532437324524, + "learning_rate": 4.498379114525595e-05, + "loss": 0.1746, + "step": 15959 + }, + { + "epoch": 0.2846645025505654, + "grad_norm": 0.2775183916091919, + "learning_rate": 4.4982855857927014e-05, + "loss": 0.1702, + "step": 15960 + }, + { + "epoch": 0.2846823386722791, + "grad_norm": 0.27296754717826843, + "learning_rate": 4.498192049313762e-05, + "loss": 0.1466, + "step": 15961 + }, + { + "epoch": 0.2847001747939928, + "grad_norm": 0.23766563832759857, + "learning_rate": 4.498098505089141e-05, + "loss": 0.1397, + "step": 15962 + }, + { + "epoch": 0.2847180109157065, + "grad_norm": 0.2712920308113098, + "learning_rate": 4.4980049531191993e-05, + "loss": 0.2018, + "step": 15963 + }, + { + "epoch": 0.28473584703742016, + "grad_norm": 0.3951716125011444, + "learning_rate": 4.4979113934043004e-05, + "loss": 0.2187, + "step": 15964 + }, + { + "epoch": 0.28475368315913385, + "grad_norm": 0.19918771088123322, + "learning_rate": 4.4978178259448064e-05, + "loss": 0.1585, + "step": 15965 + }, + { + "epoch": 0.2847715192808476, + "grad_norm": 0.3238455653190613, + "learning_rate": 4.497724250741081e-05, + "loss": 0.2022, + "step": 15966 + }, + { + "epoch": 0.2847893554025613, + "grad_norm": 0.2711409330368042, + "learning_rate": 4.497630667793486e-05, + "loss": 0.2327, + "step": 15967 + }, + { + "epoch": 0.28480719152427497, + "grad_norm": 0.3973959982395172, + "learning_rate": 4.497537077102386e-05, + "loss": 0.1422, + "step": 15968 + }, + { + "epoch": 0.28482502764598866, + "grad_norm": 0.2645553946495056, + "learning_rate": 4.497443478668141e-05, + "loss": 0.1579, + "step": 15969 + }, + { + "epoch": 0.28484286376770235, + "grad_norm": 0.2623996138572693, + "learning_rate": 4.497349872491116e-05, + "loss": 0.1766, + "step": 15970 + }, + { + "epoch": 0.28486069988941604, + "grad_norm": 0.2854841947555542, + "learning_rate": 4.497256258571672e-05, + "loss": 0.2177, + "step": 15971 + }, + { + "epoch": 0.2848785360111297, + "grad_norm": 0.23708291351795197, + "learning_rate": 4.497162636910174e-05, + "loss": 0.1779, + "step": 15972 + }, + { + "epoch": 0.2848963721328434, + "grad_norm": 0.25569671392440796, + "learning_rate": 4.497069007506983e-05, + "loss": 0.1841, + "step": 15973 + }, + { + "epoch": 0.28491420825455716, + "grad_norm": 0.2253493219614029, + "learning_rate": 4.496975370362463e-05, + "loss": 0.1947, + "step": 15974 + }, + { + "epoch": 0.28493204437627084, + "grad_norm": 0.38804617524147034, + "learning_rate": 4.4968817254769766e-05, + "loss": 0.1749, + "step": 15975 + }, + { + "epoch": 0.28494988049798453, + "grad_norm": 0.3247669041156769, + "learning_rate": 4.496788072850887e-05, + "loss": 0.2042, + "step": 15976 + }, + { + "epoch": 0.2849677166196982, + "grad_norm": 0.6287062168121338, + "learning_rate": 4.496694412484558e-05, + "loss": 0.2862, + "step": 15977 + }, + { + "epoch": 0.2849855527414119, + "grad_norm": 0.3077385425567627, + "learning_rate": 4.496600744378351e-05, + "loss": 0.1751, + "step": 15978 + }, + { + "epoch": 0.2850033888631256, + "grad_norm": 0.2535353899002075, + "learning_rate": 4.49650706853263e-05, + "loss": 0.1449, + "step": 15979 + }, + { + "epoch": 0.2850212249848393, + "grad_norm": 0.24810250103473663, + "learning_rate": 4.496413384947758e-05, + "loss": 0.2034, + "step": 15980 + }, + { + "epoch": 0.28503906110655297, + "grad_norm": 0.2122296690940857, + "learning_rate": 4.496319693624098e-05, + "loss": 0.1753, + "step": 15981 + }, + { + "epoch": 0.28505689722826666, + "grad_norm": 0.29140132665634155, + "learning_rate": 4.496225994562013e-05, + "loss": 0.1273, + "step": 15982 + }, + { + "epoch": 0.2850747333499804, + "grad_norm": 0.1990610808134079, + "learning_rate": 4.4961322877618676e-05, + "loss": 0.1636, + "step": 15983 + }, + { + "epoch": 0.2850925694716941, + "grad_norm": 0.2398446500301361, + "learning_rate": 4.496038573224024e-05, + "loss": 0.1714, + "step": 15984 + }, + { + "epoch": 0.2851104055934078, + "grad_norm": 0.2520081400871277, + "learning_rate": 4.495944850948845e-05, + "loss": 0.1854, + "step": 15985 + }, + { + "epoch": 0.28512824171512147, + "grad_norm": 0.25864988565444946, + "learning_rate": 4.4958511209366944e-05, + "loss": 0.187, + "step": 15986 + }, + { + "epoch": 0.28514607783683515, + "grad_norm": 0.3334422707557678, + "learning_rate": 4.4957573831879356e-05, + "loss": 0.1846, + "step": 15987 + }, + { + "epoch": 0.28516391395854884, + "grad_norm": 0.24120378494262695, + "learning_rate": 4.4956636377029314e-05, + "loss": 0.1847, + "step": 15988 + }, + { + "epoch": 0.28518175008026253, + "grad_norm": 0.31107571721076965, + "learning_rate": 4.4955698844820465e-05, + "loss": 0.2415, + "step": 15989 + }, + { + "epoch": 0.2851995862019762, + "grad_norm": 0.4167276620864868, + "learning_rate": 4.4954761235256434e-05, + "loss": 0.1459, + "step": 15990 + }, + { + "epoch": 0.28521742232368996, + "grad_norm": 0.2610337436199188, + "learning_rate": 4.4953823548340845e-05, + "loss": 0.1661, + "step": 15991 + }, + { + "epoch": 0.28523525844540365, + "grad_norm": 0.4616617262363434, + "learning_rate": 4.495288578407736e-05, + "loss": 0.1934, + "step": 15992 + }, + { + "epoch": 0.28525309456711734, + "grad_norm": 0.26704883575439453, + "learning_rate": 4.495194794246959e-05, + "loss": 0.1524, + "step": 15993 + }, + { + "epoch": 0.285270930688831, + "grad_norm": 0.3242391347885132, + "learning_rate": 4.495101002352118e-05, + "loss": 0.1772, + "step": 15994 + }, + { + "epoch": 0.2852887668105447, + "grad_norm": 0.24882343411445618, + "learning_rate": 4.4950072027235753e-05, + "loss": 0.165, + "step": 15995 + }, + { + "epoch": 0.2853066029322584, + "grad_norm": 0.21131683886051178, + "learning_rate": 4.494913395361697e-05, + "loss": 0.1413, + "step": 15996 + }, + { + "epoch": 0.2853244390539721, + "grad_norm": 0.2591877281665802, + "learning_rate": 4.4948195802668456e-05, + "loss": 0.1668, + "step": 15997 + }, + { + "epoch": 0.2853422751756858, + "grad_norm": 0.31166186928749084, + "learning_rate": 4.4947257574393836e-05, + "loss": 0.2193, + "step": 15998 + }, + { + "epoch": 0.28536011129739947, + "grad_norm": 0.2110806405544281, + "learning_rate": 4.494631926879676e-05, + "loss": 0.1728, + "step": 15999 + }, + { + "epoch": 0.2853779474191132, + "grad_norm": 0.2495652586221695, + "learning_rate": 4.4945380885880863e-05, + "loss": 0.189, + "step": 16000 + }, + { + "epoch": 0.2853779474191132, + "eval_loss": 0.17411202192306519, + "eval_runtime": 107.1863, + "eval_samples_per_second": 9.553, + "eval_steps_per_second": 1.595, + "step": 16000 + }, + { + "epoch": 0.2853957835408269, + "grad_norm": 0.3134092092514038, + "learning_rate": 4.4944442425649775e-05, + "loss": 0.213, + "step": 16001 + }, + { + "epoch": 0.2854136196625406, + "grad_norm": 0.29065507650375366, + "learning_rate": 4.494350388810714e-05, + "loss": 0.1983, + "step": 16002 + }, + { + "epoch": 0.2854314557842543, + "grad_norm": 0.25940176844596863, + "learning_rate": 4.49425652732566e-05, + "loss": 0.1747, + "step": 16003 + }, + { + "epoch": 0.28544929190596796, + "grad_norm": 0.35320574045181274, + "learning_rate": 4.494162658110179e-05, + "loss": 0.1682, + "step": 16004 + }, + { + "epoch": 0.28546712802768165, + "grad_norm": 0.3480782210826874, + "learning_rate": 4.494068781164634e-05, + "loss": 0.2342, + "step": 16005 + }, + { + "epoch": 0.28548496414939534, + "grad_norm": 0.19702798128128052, + "learning_rate": 4.49397489648939e-05, + "loss": 0.1505, + "step": 16006 + }, + { + "epoch": 0.285502800271109, + "grad_norm": 0.3501552641391754, + "learning_rate": 4.4938810040848115e-05, + "loss": 0.2263, + "step": 16007 + }, + { + "epoch": 0.28552063639282277, + "grad_norm": 0.2384253889322281, + "learning_rate": 4.4937871039512606e-05, + "loss": 0.1625, + "step": 16008 + }, + { + "epoch": 0.28553847251453646, + "grad_norm": 0.31768542528152466, + "learning_rate": 4.4936931960891026e-05, + "loss": 0.1873, + "step": 16009 + }, + { + "epoch": 0.28555630863625014, + "grad_norm": 0.2659898102283478, + "learning_rate": 4.4935992804987014e-05, + "loss": 0.181, + "step": 16010 + }, + { + "epoch": 0.28557414475796383, + "grad_norm": 0.25258708000183105, + "learning_rate": 4.49350535718042e-05, + "loss": 0.1571, + "step": 16011 + }, + { + "epoch": 0.2855919808796775, + "grad_norm": 0.35919901728630066, + "learning_rate": 4.493411426134624e-05, + "loss": 0.1783, + "step": 16012 + }, + { + "epoch": 0.2856098170013912, + "grad_norm": 0.23456792533397675, + "learning_rate": 4.4933174873616766e-05, + "loss": 0.148, + "step": 16013 + }, + { + "epoch": 0.2856276531231049, + "grad_norm": 0.29800763726234436, + "learning_rate": 4.4932235408619426e-05, + "loss": 0.1768, + "step": 16014 + }, + { + "epoch": 0.2856454892448186, + "grad_norm": 0.30805203318595886, + "learning_rate": 4.493129586635785e-05, + "loss": 0.1925, + "step": 16015 + }, + { + "epoch": 0.28566332536653233, + "grad_norm": 0.2858664095401764, + "learning_rate": 4.493035624683569e-05, + "loss": 0.2074, + "step": 16016 + }, + { + "epoch": 0.285681161488246, + "grad_norm": 0.2952503263950348, + "learning_rate": 4.492941655005658e-05, + "loss": 0.1275, + "step": 16017 + }, + { + "epoch": 0.2856989976099597, + "grad_norm": 0.24600088596343994, + "learning_rate": 4.492847677602418e-05, + "loss": 0.1842, + "step": 16018 + }, + { + "epoch": 0.2857168337316734, + "grad_norm": 0.292516827583313, + "learning_rate": 4.492753692474211e-05, + "loss": 0.1338, + "step": 16019 + }, + { + "epoch": 0.2857346698533871, + "grad_norm": 0.29206234216690063, + "learning_rate": 4.492659699621403e-05, + "loss": 0.1858, + "step": 16020 + }, + { + "epoch": 0.28575250597510077, + "grad_norm": 0.23379239439964294, + "learning_rate": 4.4925656990443576e-05, + "loss": 0.166, + "step": 16021 + }, + { + "epoch": 0.28577034209681446, + "grad_norm": 0.3362060785293579, + "learning_rate": 4.4924716907434397e-05, + "loss": 0.1836, + "step": 16022 + }, + { + "epoch": 0.28578817821852814, + "grad_norm": 0.3493463099002838, + "learning_rate": 4.4923776747190124e-05, + "loss": 0.2229, + "step": 16023 + }, + { + "epoch": 0.28580601434024183, + "grad_norm": 0.25529617071151733, + "learning_rate": 4.4922836509714424e-05, + "loss": 0.1619, + "step": 16024 + }, + { + "epoch": 0.2858238504619556, + "grad_norm": 0.29880794882774353, + "learning_rate": 4.492189619501093e-05, + "loss": 0.2155, + "step": 16025 + }, + { + "epoch": 0.28584168658366926, + "grad_norm": 0.23340637981891632, + "learning_rate": 4.492095580308327e-05, + "loss": 0.1734, + "step": 16026 + }, + { + "epoch": 0.28585952270538295, + "grad_norm": 0.29430294036865234, + "learning_rate": 4.4920015333935114e-05, + "loss": 0.1968, + "step": 16027 + }, + { + "epoch": 0.28587735882709664, + "grad_norm": 0.249177947640419, + "learning_rate": 4.491907478757009e-05, + "loss": 0.1598, + "step": 16028 + }, + { + "epoch": 0.2858951949488103, + "grad_norm": 0.23493894934654236, + "learning_rate": 4.4918134163991866e-05, + "loss": 0.1939, + "step": 16029 + }, + { + "epoch": 0.285913031070524, + "grad_norm": 0.23006486892700195, + "learning_rate": 4.4917193463204065e-05, + "loss": 0.1708, + "step": 16030 + }, + { + "epoch": 0.2859308671922377, + "grad_norm": 0.39275267720222473, + "learning_rate": 4.491625268521035e-05, + "loss": 0.2042, + "step": 16031 + }, + { + "epoch": 0.2859487033139514, + "grad_norm": 0.25420838594436646, + "learning_rate": 4.491531183001435e-05, + "loss": 0.1773, + "step": 16032 + }, + { + "epoch": 0.28596653943566513, + "grad_norm": 0.2234090268611908, + "learning_rate": 4.491437089761973e-05, + "loss": 0.1402, + "step": 16033 + }, + { + "epoch": 0.2859843755573788, + "grad_norm": 0.3079547882080078, + "learning_rate": 4.491342988803013e-05, + "loss": 0.1741, + "step": 16034 + }, + { + "epoch": 0.2860022116790925, + "grad_norm": 0.31431812047958374, + "learning_rate": 4.49124888012492e-05, + "loss": 0.153, + "step": 16035 + }, + { + "epoch": 0.2860200478008062, + "grad_norm": 0.18872922658920288, + "learning_rate": 4.491154763728058e-05, + "loss": 0.1713, + "step": 16036 + }, + { + "epoch": 0.2860378839225199, + "grad_norm": 0.44715237617492676, + "learning_rate": 4.491060639612793e-05, + "loss": 0.2204, + "step": 16037 + }, + { + "epoch": 0.2860557200442336, + "grad_norm": 0.23845680058002472, + "learning_rate": 4.490966507779488e-05, + "loss": 0.1751, + "step": 16038 + }, + { + "epoch": 0.28607355616594726, + "grad_norm": 0.2513500154018402, + "learning_rate": 4.49087236822851e-05, + "loss": 0.1917, + "step": 16039 + }, + { + "epoch": 0.28609139228766095, + "grad_norm": 0.24431303143501282, + "learning_rate": 4.4907782209602234e-05, + "loss": 0.1629, + "step": 16040 + }, + { + "epoch": 0.28610922840937464, + "grad_norm": 0.27447381615638733, + "learning_rate": 4.490684065974993e-05, + "loss": 0.1129, + "step": 16041 + }, + { + "epoch": 0.2861270645310884, + "grad_norm": 0.2231118232011795, + "learning_rate": 4.490589903273184e-05, + "loss": 0.1583, + "step": 16042 + }, + { + "epoch": 0.28614490065280207, + "grad_norm": 0.36416152119636536, + "learning_rate": 4.490495732855159e-05, + "loss": 0.1911, + "step": 16043 + }, + { + "epoch": 0.28616273677451576, + "grad_norm": 0.25639608502388, + "learning_rate": 4.4904015547212866e-05, + "loss": 0.206, + "step": 16044 + }, + { + "epoch": 0.28618057289622945, + "grad_norm": 0.25020232796669006, + "learning_rate": 4.49030736887193e-05, + "loss": 0.148, + "step": 16045 + }, + { + "epoch": 0.28619840901794313, + "grad_norm": 0.29758134484291077, + "learning_rate": 4.490213175307455e-05, + "loss": 0.2019, + "step": 16046 + }, + { + "epoch": 0.2862162451396568, + "grad_norm": 0.20389901101589203, + "learning_rate": 4.490118974028226e-05, + "loss": 0.1575, + "step": 16047 + }, + { + "epoch": 0.2862340812613705, + "grad_norm": 0.22760283946990967, + "learning_rate": 4.490024765034608e-05, + "loss": 0.1788, + "step": 16048 + }, + { + "epoch": 0.2862519173830842, + "grad_norm": 0.24378296732902527, + "learning_rate": 4.4899305483269673e-05, + "loss": 0.1735, + "step": 16049 + }, + { + "epoch": 0.28626975350479794, + "grad_norm": 0.3251751959323883, + "learning_rate": 4.489836323905668e-05, + "loss": 0.1411, + "step": 16050 + }, + { + "epoch": 0.28628758962651163, + "grad_norm": 0.22391058504581451, + "learning_rate": 4.489742091771076e-05, + "loss": 0.1652, + "step": 16051 + }, + { + "epoch": 0.2863054257482253, + "grad_norm": 0.26425278186798096, + "learning_rate": 4.489647851923557e-05, + "loss": 0.216, + "step": 16052 + }, + { + "epoch": 0.286323261869939, + "grad_norm": 0.24013806879520416, + "learning_rate": 4.489553604363475e-05, + "loss": 0.1704, + "step": 16053 + }, + { + "epoch": 0.2863410979916527, + "grad_norm": 0.2567487061023712, + "learning_rate": 4.489459349091196e-05, + "loss": 0.2114, + "step": 16054 + }, + { + "epoch": 0.2863589341133664, + "grad_norm": 0.2368803471326828, + "learning_rate": 4.4893650861070855e-05, + "loss": 0.1473, + "step": 16055 + }, + { + "epoch": 0.28637677023508007, + "grad_norm": 0.235275000333786, + "learning_rate": 4.489270815411509e-05, + "loss": 0.1667, + "step": 16056 + }, + { + "epoch": 0.28639460635679376, + "grad_norm": 0.22388041019439697, + "learning_rate": 4.489176537004832e-05, + "loss": 0.1786, + "step": 16057 + }, + { + "epoch": 0.2864124424785075, + "grad_norm": 0.2587563991546631, + "learning_rate": 4.48908225088742e-05, + "loss": 0.1922, + "step": 16058 + }, + { + "epoch": 0.2864302786002212, + "grad_norm": 0.2618034780025482, + "learning_rate": 4.488987957059638e-05, + "loss": 0.1584, + "step": 16059 + }, + { + "epoch": 0.2864481147219349, + "grad_norm": 0.6321316957473755, + "learning_rate": 4.488893655521851e-05, + "loss": 0.1972, + "step": 16060 + }, + { + "epoch": 0.28646595084364856, + "grad_norm": 0.254079669713974, + "learning_rate": 4.488799346274426e-05, + "loss": 0.1788, + "step": 16061 + }, + { + "epoch": 0.28648378696536225, + "grad_norm": 0.2672015130519867, + "learning_rate": 4.488705029317727e-05, + "loss": 0.1884, + "step": 16062 + }, + { + "epoch": 0.28650162308707594, + "grad_norm": 0.2810853123664856, + "learning_rate": 4.488610704652121e-05, + "loss": 0.1715, + "step": 16063 + }, + { + "epoch": 0.28651945920878963, + "grad_norm": 0.31855452060699463, + "learning_rate": 4.488516372277973e-05, + "loss": 0.2235, + "step": 16064 + }, + { + "epoch": 0.2865372953305033, + "grad_norm": 0.2011541724205017, + "learning_rate": 4.4884220321956486e-05, + "loss": 0.1828, + "step": 16065 + }, + { + "epoch": 0.286555131452217, + "grad_norm": 0.23755288124084473, + "learning_rate": 4.4883276844055144e-05, + "loss": 0.1869, + "step": 16066 + }, + { + "epoch": 0.28657296757393075, + "grad_norm": 0.2659914791584015, + "learning_rate": 4.488233328907935e-05, + "loss": 0.2029, + "step": 16067 + }, + { + "epoch": 0.28659080369564444, + "grad_norm": 0.334911972284317, + "learning_rate": 4.4881389657032754e-05, + "loss": 0.175, + "step": 16068 + }, + { + "epoch": 0.2866086398173581, + "grad_norm": 0.22663113474845886, + "learning_rate": 4.488044594791904e-05, + "loss": 0.1851, + "step": 16069 + }, + { + "epoch": 0.2866264759390718, + "grad_norm": 0.1934032142162323, + "learning_rate": 4.487950216174184e-05, + "loss": 0.1067, + "step": 16070 + }, + { + "epoch": 0.2866443120607855, + "grad_norm": 0.34028154611587524, + "learning_rate": 4.4878558298504825e-05, + "loss": 0.1625, + "step": 16071 + }, + { + "epoch": 0.2866621481824992, + "grad_norm": 0.23633654415607452, + "learning_rate": 4.4877614358211653e-05, + "loss": 0.1582, + "step": 16072 + }, + { + "epoch": 0.2866799843042129, + "grad_norm": 0.2550041377544403, + "learning_rate": 4.487667034086599e-05, + "loss": 0.1841, + "step": 16073 + }, + { + "epoch": 0.28669782042592656, + "grad_norm": 0.2965671718120575, + "learning_rate": 4.4875726246471476e-05, + "loss": 0.1795, + "step": 16074 + }, + { + "epoch": 0.2867156565476403, + "grad_norm": 0.3106358051300049, + "learning_rate": 4.487478207503179e-05, + "loss": 0.1938, + "step": 16075 + }, + { + "epoch": 0.286733492669354, + "grad_norm": 0.2519557476043701, + "learning_rate": 4.487383782655058e-05, + "loss": 0.1653, + "step": 16076 + }, + { + "epoch": 0.2867513287910677, + "grad_norm": 0.2858811318874359, + "learning_rate": 4.487289350103151e-05, + "loss": 0.1887, + "step": 16077 + }, + { + "epoch": 0.28676916491278137, + "grad_norm": 0.20570501685142517, + "learning_rate": 4.4871949098478246e-05, + "loss": 0.1403, + "step": 16078 + }, + { + "epoch": 0.28678700103449506, + "grad_norm": 0.23417896032333374, + "learning_rate": 4.4871004618894444e-05, + "loss": 0.1596, + "step": 16079 + }, + { + "epoch": 0.28680483715620875, + "grad_norm": 0.2807213068008423, + "learning_rate": 4.4870060062283755e-05, + "loss": 0.1617, + "step": 16080 + }, + { + "epoch": 0.28682267327792244, + "grad_norm": 0.21266649663448334, + "learning_rate": 4.486911542864986e-05, + "loss": 0.1935, + "step": 16081 + }, + { + "epoch": 0.2868405093996361, + "grad_norm": 0.30692699551582336, + "learning_rate": 4.4868170717996405e-05, + "loss": 0.2132, + "step": 16082 + }, + { + "epoch": 0.2868583455213498, + "grad_norm": 0.2792120575904846, + "learning_rate": 4.486722593032706e-05, + "loss": 0.1397, + "step": 16083 + }, + { + "epoch": 0.28687618164306355, + "grad_norm": 0.2881179451942444, + "learning_rate": 4.486628106564549e-05, + "loss": 0.1825, + "step": 16084 + }, + { + "epoch": 0.28689401776477724, + "grad_norm": 0.2929091453552246, + "learning_rate": 4.486533612395535e-05, + "loss": 0.1632, + "step": 16085 + }, + { + "epoch": 0.28691185388649093, + "grad_norm": 0.2305385321378708, + "learning_rate": 4.486439110526031e-05, + "loss": 0.1824, + "step": 16086 + }, + { + "epoch": 0.2869296900082046, + "grad_norm": 0.29803115129470825, + "learning_rate": 4.486344600956402e-05, + "loss": 0.1523, + "step": 16087 + }, + { + "epoch": 0.2869475261299183, + "grad_norm": 0.25572335720062256, + "learning_rate": 4.486250083687016e-05, + "loss": 0.1771, + "step": 16088 + }, + { + "epoch": 0.286965362251632, + "grad_norm": 0.24149270355701447, + "learning_rate": 4.486155558718238e-05, + "loss": 0.1915, + "step": 16089 + }, + { + "epoch": 0.2869831983733457, + "grad_norm": 0.2909892797470093, + "learning_rate": 4.4860610260504356e-05, + "loss": 0.1773, + "step": 16090 + }, + { + "epoch": 0.28700103449505937, + "grad_norm": 0.24814195930957794, + "learning_rate": 4.485966485683975e-05, + "loss": 0.1479, + "step": 16091 + }, + { + "epoch": 0.2870188706167731, + "grad_norm": 0.3109689950942993, + "learning_rate": 4.485871937619222e-05, + "loss": 0.1627, + "step": 16092 + }, + { + "epoch": 0.2870367067384868, + "grad_norm": 0.25675228238105774, + "learning_rate": 4.4857773818565426e-05, + "loss": 0.1865, + "step": 16093 + }, + { + "epoch": 0.2870545428602005, + "grad_norm": 0.2775726914405823, + "learning_rate": 4.485682818396305e-05, + "loss": 0.216, + "step": 16094 + }, + { + "epoch": 0.2870723789819142, + "grad_norm": 0.22096911072731018, + "learning_rate": 4.485588247238875e-05, + "loss": 0.164, + "step": 16095 + }, + { + "epoch": 0.28709021510362787, + "grad_norm": 0.3239266872406006, + "learning_rate": 4.485493668384619e-05, + "loss": 0.1526, + "step": 16096 + }, + { + "epoch": 0.28710805122534155, + "grad_norm": 0.22390705347061157, + "learning_rate": 4.4853990818339036e-05, + "loss": 0.1629, + "step": 16097 + }, + { + "epoch": 0.28712588734705524, + "grad_norm": 0.21042995154857635, + "learning_rate": 4.4853044875870956e-05, + "loss": 0.1609, + "step": 16098 + }, + { + "epoch": 0.28714372346876893, + "grad_norm": 0.2583172917366028, + "learning_rate": 4.485209885644562e-05, + "loss": 0.2085, + "step": 16099 + }, + { + "epoch": 0.2871615595904826, + "grad_norm": 0.4621339440345764, + "learning_rate": 4.4851152760066696e-05, + "loss": 0.1751, + "step": 16100 + }, + { + "epoch": 0.28717939571219636, + "grad_norm": 0.27855831384658813, + "learning_rate": 4.485020658673784e-05, + "loss": 0.1377, + "step": 16101 + }, + { + "epoch": 0.28719723183391005, + "grad_norm": 0.31874874234199524, + "learning_rate": 4.484926033646273e-05, + "loss": 0.196, + "step": 16102 + }, + { + "epoch": 0.28721506795562374, + "grad_norm": 0.21516236662864685, + "learning_rate": 4.484831400924503e-05, + "loss": 0.1759, + "step": 16103 + }, + { + "epoch": 0.2872329040773374, + "grad_norm": 0.2575733959674835, + "learning_rate": 4.484736760508842e-05, + "loss": 0.1793, + "step": 16104 + }, + { + "epoch": 0.2872507401990511, + "grad_norm": 0.33717402815818787, + "learning_rate": 4.4846421123996546e-05, + "loss": 0.1958, + "step": 16105 + }, + { + "epoch": 0.2872685763207648, + "grad_norm": 0.1957182139158249, + "learning_rate": 4.484547456597309e-05, + "loss": 0.1361, + "step": 16106 + }, + { + "epoch": 0.2872864124424785, + "grad_norm": 0.3578245937824249, + "learning_rate": 4.484452793102173e-05, + "loss": 0.1373, + "step": 16107 + }, + { + "epoch": 0.2873042485641922, + "grad_norm": 0.28512781858444214, + "learning_rate": 4.484358121914611e-05, + "loss": 0.1987, + "step": 16108 + }, + { + "epoch": 0.2873220846859059, + "grad_norm": 0.21096031367778778, + "learning_rate": 4.4842634430349925e-05, + "loss": 0.1641, + "step": 16109 + }, + { + "epoch": 0.2873399208076196, + "grad_norm": 0.29081612825393677, + "learning_rate": 4.484168756463684e-05, + "loss": 0.1778, + "step": 16110 + }, + { + "epoch": 0.2873577569293333, + "grad_norm": 0.28361403942108154, + "learning_rate": 4.4840740622010515e-05, + "loss": 0.1702, + "step": 16111 + }, + { + "epoch": 0.287375593051047, + "grad_norm": 0.1937485784292221, + "learning_rate": 4.4839793602474625e-05, + "loss": 0.1301, + "step": 16112 + }, + { + "epoch": 0.2873934291727607, + "grad_norm": 0.2616284489631653, + "learning_rate": 4.483884650603284e-05, + "loss": 0.1587, + "step": 16113 + }, + { + "epoch": 0.28741126529447436, + "grad_norm": 0.2699330449104309, + "learning_rate": 4.4837899332688836e-05, + "loss": 0.1927, + "step": 16114 + }, + { + "epoch": 0.28742910141618805, + "grad_norm": 0.34607216715812683, + "learning_rate": 4.483695208244629e-05, + "loss": 0.1709, + "step": 16115 + }, + { + "epoch": 0.28744693753790174, + "grad_norm": 0.28219762444496155, + "learning_rate": 4.483600475530886e-05, + "loss": 0.1984, + "step": 16116 + }, + { + "epoch": 0.2874647736596155, + "grad_norm": 0.2999812662601471, + "learning_rate": 4.483505735128023e-05, + "loss": 0.2003, + "step": 16117 + }, + { + "epoch": 0.28748260978132917, + "grad_norm": 0.2562468647956848, + "learning_rate": 4.483410987036406e-05, + "loss": 0.138, + "step": 16118 + }, + { + "epoch": 0.28750044590304286, + "grad_norm": 0.2881334125995636, + "learning_rate": 4.4833162312564033e-05, + "loss": 0.2223, + "step": 16119 + }, + { + "epoch": 0.28751828202475654, + "grad_norm": 0.3034203052520752, + "learning_rate": 4.483221467788381e-05, + "loss": 0.2634, + "step": 16120 + }, + { + "epoch": 0.28753611814647023, + "grad_norm": 0.2534237802028656, + "learning_rate": 4.483126696632708e-05, + "loss": 0.156, + "step": 16121 + }, + { + "epoch": 0.2875539542681839, + "grad_norm": 0.30473223328590393, + "learning_rate": 4.4830319177897514e-05, + "loss": 0.1968, + "step": 16122 + }, + { + "epoch": 0.2875717903898976, + "grad_norm": 0.216000497341156, + "learning_rate": 4.482937131259878e-05, + "loss": 0.1824, + "step": 16123 + }, + { + "epoch": 0.2875896265116113, + "grad_norm": 0.27592262625694275, + "learning_rate": 4.482842337043455e-05, + "loss": 0.1582, + "step": 16124 + }, + { + "epoch": 0.287607462633325, + "grad_norm": 0.37166082859039307, + "learning_rate": 4.482747535140851e-05, + "loss": 0.1667, + "step": 16125 + }, + { + "epoch": 0.2876252987550387, + "grad_norm": 0.2891044616699219, + "learning_rate": 4.482652725552432e-05, + "loss": 0.1693, + "step": 16126 + }, + { + "epoch": 0.2876431348767524, + "grad_norm": 0.34204724431037903, + "learning_rate": 4.482557908278566e-05, + "loss": 0.1022, + "step": 16127 + }, + { + "epoch": 0.2876609709984661, + "grad_norm": 0.3074796497821808, + "learning_rate": 4.482463083319621e-05, + "loss": 0.1205, + "step": 16128 + }, + { + "epoch": 0.2876788071201798, + "grad_norm": 0.32623523473739624, + "learning_rate": 4.4823682506759646e-05, + "loss": 0.1947, + "step": 16129 + }, + { + "epoch": 0.2876966432418935, + "grad_norm": 0.3176324963569641, + "learning_rate": 4.4822734103479643e-05, + "loss": 0.1607, + "step": 16130 + }, + { + "epoch": 0.28771447936360717, + "grad_norm": 0.30377528071403503, + "learning_rate": 4.482178562335988e-05, + "loss": 0.166, + "step": 16131 + }, + { + "epoch": 0.28773231548532086, + "grad_norm": 0.22573193907737732, + "learning_rate": 4.482083706640402e-05, + "loss": 0.1679, + "step": 16132 + }, + { + "epoch": 0.28775015160703454, + "grad_norm": 0.30298227071762085, + "learning_rate": 4.481988843261575e-05, + "loss": 0.1517, + "step": 16133 + }, + { + "epoch": 0.2877679877287483, + "grad_norm": 0.32469886541366577, + "learning_rate": 4.4818939721998754e-05, + "loss": 0.1794, + "step": 16134 + }, + { + "epoch": 0.287785823850462, + "grad_norm": 0.2826855778694153, + "learning_rate": 4.4817990934556695e-05, + "loss": 0.1383, + "step": 16135 + }, + { + "epoch": 0.28780365997217566, + "grad_norm": 0.2807585895061493, + "learning_rate": 4.481704207029327e-05, + "loss": 0.1975, + "step": 16136 + }, + { + "epoch": 0.28782149609388935, + "grad_norm": 0.20302768051624298, + "learning_rate": 4.481609312921213e-05, + "loss": 0.1598, + "step": 16137 + }, + { + "epoch": 0.28783933221560304, + "grad_norm": 0.29405003786087036, + "learning_rate": 4.4815144111316986e-05, + "loss": 0.1407, + "step": 16138 + }, + { + "epoch": 0.2878571683373167, + "grad_norm": 0.25270095467567444, + "learning_rate": 4.481419501661149e-05, + "loss": 0.2086, + "step": 16139 + }, + { + "epoch": 0.2878750044590304, + "grad_norm": 0.26641079783439636, + "learning_rate": 4.481324584509933e-05, + "loss": 0.1359, + "step": 16140 + }, + { + "epoch": 0.2878928405807441, + "grad_norm": 0.2650429606437683, + "learning_rate": 4.4812296596784185e-05, + "loss": 0.1878, + "step": 16141 + }, + { + "epoch": 0.2879106767024578, + "grad_norm": 0.3187381327152252, + "learning_rate": 4.4811347271669735e-05, + "loss": 0.1534, + "step": 16142 + }, + { + "epoch": 0.28792851282417153, + "grad_norm": 0.25793227553367615, + "learning_rate": 4.481039786975967e-05, + "loss": 0.1866, + "step": 16143 + }, + { + "epoch": 0.2879463489458852, + "grad_norm": 0.29353317618370056, + "learning_rate": 4.4809448391057646e-05, + "loss": 0.1588, + "step": 16144 + }, + { + "epoch": 0.2879641850675989, + "grad_norm": 0.3036663234233856, + "learning_rate": 4.480849883556737e-05, + "loss": 0.1698, + "step": 16145 + }, + { + "epoch": 0.2879820211893126, + "grad_norm": 0.23433850705623627, + "learning_rate": 4.4807549203292496e-05, + "loss": 0.2282, + "step": 16146 + }, + { + "epoch": 0.2879998573110263, + "grad_norm": 0.21231277287006378, + "learning_rate": 4.4806599494236735e-05, + "loss": 0.171, + "step": 16147 + }, + { + "epoch": 0.28801769343274, + "grad_norm": 0.24991388618946075, + "learning_rate": 4.480564970840375e-05, + "loss": 0.178, + "step": 16148 + }, + { + "epoch": 0.28803552955445366, + "grad_norm": 0.28997746109962463, + "learning_rate": 4.4804699845797227e-05, + "loss": 0.1636, + "step": 16149 + }, + { + "epoch": 0.28805336567616735, + "grad_norm": 0.2965392470359802, + "learning_rate": 4.4803749906420846e-05, + "loss": 0.1616, + "step": 16150 + }, + { + "epoch": 0.2880712017978811, + "grad_norm": 0.21225705742835999, + "learning_rate": 4.480279989027828e-05, + "loss": 0.1572, + "step": 16151 + }, + { + "epoch": 0.2880890379195948, + "grad_norm": 0.32420283555984497, + "learning_rate": 4.480184979737323e-05, + "loss": 0.1416, + "step": 16152 + }, + { + "epoch": 0.28810687404130847, + "grad_norm": 0.30400407314300537, + "learning_rate": 4.4800899627709375e-05, + "loss": 0.191, + "step": 16153 + }, + { + "epoch": 0.28812471016302216, + "grad_norm": 0.26624125242233276, + "learning_rate": 4.479994938129039e-05, + "loss": 0.1799, + "step": 16154 + }, + { + "epoch": 0.28814254628473585, + "grad_norm": 0.21302437782287598, + "learning_rate": 4.479899905811996e-05, + "loss": 0.1694, + "step": 16155 + }, + { + "epoch": 0.28816038240644953, + "grad_norm": 0.2593947649002075, + "learning_rate": 4.4798048658201776e-05, + "loss": 0.1931, + "step": 16156 + }, + { + "epoch": 0.2881782185281632, + "grad_norm": 0.2870843708515167, + "learning_rate": 4.479709818153952e-05, + "loss": 0.2059, + "step": 16157 + }, + { + "epoch": 0.2881960546498769, + "grad_norm": 0.21844667196273804, + "learning_rate": 4.4796147628136864e-05, + "loss": 0.1499, + "step": 16158 + }, + { + "epoch": 0.28821389077159065, + "grad_norm": 0.27017688751220703, + "learning_rate": 4.479519699799751e-05, + "loss": 0.1787, + "step": 16159 + }, + { + "epoch": 0.28823172689330434, + "grad_norm": 0.27435439825057983, + "learning_rate": 4.4794246291125134e-05, + "loss": 0.1957, + "step": 16160 + }, + { + "epoch": 0.28824956301501803, + "grad_norm": 0.26444387435913086, + "learning_rate": 4.479329550752342e-05, + "loss": 0.1461, + "step": 16161 + }, + { + "epoch": 0.2882673991367317, + "grad_norm": 0.3331790864467621, + "learning_rate": 4.4792344647196064e-05, + "loss": 0.1208, + "step": 16162 + }, + { + "epoch": 0.2882852352584454, + "grad_norm": 0.48470979928970337, + "learning_rate": 4.479139371014673e-05, + "loss": 0.158, + "step": 16163 + }, + { + "epoch": 0.2883030713801591, + "grad_norm": 0.4501863420009613, + "learning_rate": 4.479044269637913e-05, + "loss": 0.1614, + "step": 16164 + }, + { + "epoch": 0.2883209075018728, + "grad_norm": 0.23002241551876068, + "learning_rate": 4.4789491605896935e-05, + "loss": 0.1716, + "step": 16165 + }, + { + "epoch": 0.28833874362358647, + "grad_norm": 0.3586452901363373, + "learning_rate": 4.478854043870384e-05, + "loss": 0.1334, + "step": 16166 + }, + { + "epoch": 0.28835657974530016, + "grad_norm": 0.3058027923107147, + "learning_rate": 4.478758919480352e-05, + "loss": 0.1636, + "step": 16167 + }, + { + "epoch": 0.2883744158670139, + "grad_norm": 0.28845784068107605, + "learning_rate": 4.4786637874199676e-05, + "loss": 0.1559, + "step": 16168 + }, + { + "epoch": 0.2883922519887276, + "grad_norm": 0.22414234280586243, + "learning_rate": 4.4785686476895984e-05, + "loss": 0.1958, + "step": 16169 + }, + { + "epoch": 0.2884100881104413, + "grad_norm": 0.2731773257255554, + "learning_rate": 4.478473500289614e-05, + "loss": 0.1979, + "step": 16170 + }, + { + "epoch": 0.28842792423215496, + "grad_norm": 0.2643968462944031, + "learning_rate": 4.478378345220383e-05, + "loss": 0.1899, + "step": 16171 + }, + { + "epoch": 0.28844576035386865, + "grad_norm": 0.3066263794898987, + "learning_rate": 4.478283182482274e-05, + "loss": 0.1757, + "step": 16172 + }, + { + "epoch": 0.28846359647558234, + "grad_norm": 0.23743121325969696, + "learning_rate": 4.4781880120756565e-05, + "loss": 0.22, + "step": 16173 + }, + { + "epoch": 0.28848143259729603, + "grad_norm": 0.2775743901729584, + "learning_rate": 4.4780928340008986e-05, + "loss": 0.2179, + "step": 16174 + }, + { + "epoch": 0.2884992687190097, + "grad_norm": 0.24497991800308228, + "learning_rate": 4.4779976482583695e-05, + "loss": 0.1511, + "step": 16175 + }, + { + "epoch": 0.28851710484072346, + "grad_norm": 0.2688525915145874, + "learning_rate": 4.477902454848439e-05, + "loss": 0.1542, + "step": 16176 + }, + { + "epoch": 0.28853494096243715, + "grad_norm": 0.2902068495750427, + "learning_rate": 4.477807253771476e-05, + "loss": 0.2002, + "step": 16177 + }, + { + "epoch": 0.28855277708415084, + "grad_norm": 0.40720242261886597, + "learning_rate": 4.4777120450278476e-05, + "loss": 0.2339, + "step": 16178 + }, + { + "epoch": 0.2885706132058645, + "grad_norm": 0.1990911215543747, + "learning_rate": 4.477616828617924e-05, + "loss": 0.1847, + "step": 16179 + }, + { + "epoch": 0.2885884493275782, + "grad_norm": 0.2967304587364197, + "learning_rate": 4.477521604542076e-05, + "loss": 0.1953, + "step": 16180 + }, + { + "epoch": 0.2886062854492919, + "grad_norm": 0.361038476228714, + "learning_rate": 4.4774263728006707e-05, + "loss": 0.1747, + "step": 16181 + }, + { + "epoch": 0.2886241215710056, + "grad_norm": 0.2786751687526703, + "learning_rate": 4.477331133394078e-05, + "loss": 0.154, + "step": 16182 + }, + { + "epoch": 0.2886419576927193, + "grad_norm": 0.2897767424583435, + "learning_rate": 4.477235886322666e-05, + "loss": 0.1081, + "step": 16183 + }, + { + "epoch": 0.28865979381443296, + "grad_norm": 0.2561451494693756, + "learning_rate": 4.477140631586806e-05, + "loss": 0.183, + "step": 16184 + }, + { + "epoch": 0.2886776299361467, + "grad_norm": 0.27485552430152893, + "learning_rate": 4.4770453691868653e-05, + "loss": 0.1809, + "step": 16185 + }, + { + "epoch": 0.2886954660578604, + "grad_norm": 0.2473611682653427, + "learning_rate": 4.476950099123213e-05, + "loss": 0.2197, + "step": 16186 + }, + { + "epoch": 0.2887133021795741, + "grad_norm": 0.2448200136423111, + "learning_rate": 4.476854821396221e-05, + "loss": 0.1991, + "step": 16187 + }, + { + "epoch": 0.28873113830128777, + "grad_norm": 0.28758755326271057, + "learning_rate": 4.476759536006256e-05, + "loss": 0.214, + "step": 16188 + }, + { + "epoch": 0.28874897442300146, + "grad_norm": 0.29136964678764343, + "learning_rate": 4.476664242953688e-05, + "loss": 0.2366, + "step": 16189 + }, + { + "epoch": 0.28876681054471515, + "grad_norm": 0.23921862244606018, + "learning_rate": 4.476568942238887e-05, + "loss": 0.2174, + "step": 16190 + }, + { + "epoch": 0.28878464666642883, + "grad_norm": 0.4133034348487854, + "learning_rate": 4.4764736338622224e-05, + "loss": 0.1984, + "step": 16191 + }, + { + "epoch": 0.2888024827881425, + "grad_norm": 0.30323541164398193, + "learning_rate": 4.4763783178240635e-05, + "loss": 0.2293, + "step": 16192 + }, + { + "epoch": 0.28882031890985627, + "grad_norm": 0.34238171577453613, + "learning_rate": 4.476282994124779e-05, + "loss": 0.169, + "step": 16193 + }, + { + "epoch": 0.28883815503156995, + "grad_norm": 0.2112024873495102, + "learning_rate": 4.476187662764739e-05, + "loss": 0.1441, + "step": 16194 + }, + { + "epoch": 0.28885599115328364, + "grad_norm": 0.3371756970882416, + "learning_rate": 4.476092323744314e-05, + "loss": 0.1603, + "step": 16195 + }, + { + "epoch": 0.28887382727499733, + "grad_norm": 0.27282217144966125, + "learning_rate": 4.475996977063872e-05, + "loss": 0.1527, + "step": 16196 + }, + { + "epoch": 0.288891663396711, + "grad_norm": 0.19841310381889343, + "learning_rate": 4.475901622723783e-05, + "loss": 0.1299, + "step": 16197 + }, + { + "epoch": 0.2889094995184247, + "grad_norm": 0.22017700970172882, + "learning_rate": 4.475806260724417e-05, + "loss": 0.1582, + "step": 16198 + }, + { + "epoch": 0.2889273356401384, + "grad_norm": 0.26420333981513977, + "learning_rate": 4.475710891066144e-05, + "loss": 0.1791, + "step": 16199 + }, + { + "epoch": 0.2889451717618521, + "grad_norm": 0.2915920913219452, + "learning_rate": 4.475615513749333e-05, + "loss": 0.1552, + "step": 16200 + }, + { + "epoch": 0.28896300788356577, + "grad_norm": 0.19410520792007446, + "learning_rate": 4.4755201287743534e-05, + "loss": 0.1395, + "step": 16201 + }, + { + "epoch": 0.2889808440052795, + "grad_norm": 0.23600637912750244, + "learning_rate": 4.475424736141576e-05, + "loss": 0.1798, + "step": 16202 + }, + { + "epoch": 0.2889986801269932, + "grad_norm": 0.28963592648506165, + "learning_rate": 4.475329335851369e-05, + "loss": 0.1606, + "step": 16203 + }, + { + "epoch": 0.2890165162487069, + "grad_norm": 0.3423754572868347, + "learning_rate": 4.475233927904105e-05, + "loss": 0.2047, + "step": 16204 + }, + { + "epoch": 0.2890343523704206, + "grad_norm": 0.24831396341323853, + "learning_rate": 4.475138512300151e-05, + "loss": 0.2355, + "step": 16205 + }, + { + "epoch": 0.28905218849213427, + "grad_norm": 0.22182853519916534, + "learning_rate": 4.475043089039878e-05, + "loss": 0.183, + "step": 16206 + }, + { + "epoch": 0.28907002461384795, + "grad_norm": 0.2502618730068207, + "learning_rate": 4.474947658123656e-05, + "loss": 0.183, + "step": 16207 + }, + { + "epoch": 0.28908786073556164, + "grad_norm": 0.23544780910015106, + "learning_rate": 4.474852219551854e-05, + "loss": 0.1449, + "step": 16208 + }, + { + "epoch": 0.28910569685727533, + "grad_norm": 0.20856021344661713, + "learning_rate": 4.474756773324844e-05, + "loss": 0.1731, + "step": 16209 + }, + { + "epoch": 0.2891235329789891, + "grad_norm": 0.3059997856616974, + "learning_rate": 4.474661319442994e-05, + "loss": 0.2296, + "step": 16210 + }, + { + "epoch": 0.28914136910070276, + "grad_norm": 0.4440188705921173, + "learning_rate": 4.474565857906675e-05, + "loss": 0.1758, + "step": 16211 + }, + { + "epoch": 0.28915920522241645, + "grad_norm": 0.3737524151802063, + "learning_rate": 4.474470388716256e-05, + "loss": 0.2154, + "step": 16212 + }, + { + "epoch": 0.28917704134413014, + "grad_norm": 0.4202517569065094, + "learning_rate": 4.4743749118721084e-05, + "loss": 0.189, + "step": 16213 + }, + { + "epoch": 0.2891948774658438, + "grad_norm": 0.3657699525356293, + "learning_rate": 4.474279427374602e-05, + "loss": 0.1644, + "step": 16214 + }, + { + "epoch": 0.2892127135875575, + "grad_norm": 0.40968450903892517, + "learning_rate": 4.4741839352241056e-05, + "loss": 0.1779, + "step": 16215 + }, + { + "epoch": 0.2892305497092712, + "grad_norm": 0.17167669534683228, + "learning_rate": 4.4740884354209914e-05, + "loss": 0.1364, + "step": 16216 + }, + { + "epoch": 0.2892483858309849, + "grad_norm": 0.3087173104286194, + "learning_rate": 4.473992927965628e-05, + "loss": 0.1835, + "step": 16217 + }, + { + "epoch": 0.28926622195269863, + "grad_norm": 0.22113212943077087, + "learning_rate": 4.4738974128583866e-05, + "loss": 0.1723, + "step": 16218 + }, + { + "epoch": 0.2892840580744123, + "grad_norm": 0.33859553933143616, + "learning_rate": 4.473801890099637e-05, + "loss": 0.185, + "step": 16219 + }, + { + "epoch": 0.289301894196126, + "grad_norm": 0.2641701102256775, + "learning_rate": 4.473706359689749e-05, + "loss": 0.1809, + "step": 16220 + }, + { + "epoch": 0.2893197303178397, + "grad_norm": 0.22278887033462524, + "learning_rate": 4.473610821629094e-05, + "loss": 0.1823, + "step": 16221 + }, + { + "epoch": 0.2893375664395534, + "grad_norm": 1.4023253917694092, + "learning_rate": 4.473515275918042e-05, + "loss": 0.7099, + "step": 16222 + }, + { + "epoch": 0.28935540256126707, + "grad_norm": 0.2187526375055313, + "learning_rate": 4.473419722556963e-05, + "loss": 0.1743, + "step": 16223 + }, + { + "epoch": 0.28937323868298076, + "grad_norm": 0.22437356412410736, + "learning_rate": 4.473324161546227e-05, + "loss": 0.1676, + "step": 16224 + }, + { + "epoch": 0.28939107480469445, + "grad_norm": 0.20635618269443512, + "learning_rate": 4.473228592886206e-05, + "loss": 0.1131, + "step": 16225 + }, + { + "epoch": 0.28940891092640814, + "grad_norm": 0.2504064738750458, + "learning_rate": 4.473133016577269e-05, + "loss": 0.1852, + "step": 16226 + }, + { + "epoch": 0.2894267470481219, + "grad_norm": 0.3178398311138153, + "learning_rate": 4.473037432619787e-05, + "loss": 0.2277, + "step": 16227 + }, + { + "epoch": 0.28944458316983557, + "grad_norm": 0.3087739050388336, + "learning_rate": 4.4729418410141296e-05, + "loss": 0.1435, + "step": 16228 + }, + { + "epoch": 0.28946241929154926, + "grad_norm": 0.2511499226093292, + "learning_rate": 4.4728462417606684e-05, + "loss": 0.2073, + "step": 16229 + }, + { + "epoch": 0.28948025541326294, + "grad_norm": 0.35206353664398193, + "learning_rate": 4.472750634859775e-05, + "loss": 0.2011, + "step": 16230 + }, + { + "epoch": 0.28949809153497663, + "grad_norm": 0.43800655007362366, + "learning_rate": 4.472655020311818e-05, + "loss": 0.1755, + "step": 16231 + }, + { + "epoch": 0.2895159276566903, + "grad_norm": 0.23357662558555603, + "learning_rate": 4.4725593981171685e-05, + "loss": 0.2205, + "step": 16232 + }, + { + "epoch": 0.289533763778404, + "grad_norm": 0.2701023817062378, + "learning_rate": 4.4724637682761976e-05, + "loss": 0.1527, + "step": 16233 + }, + { + "epoch": 0.2895515999001177, + "grad_norm": 0.2746738791465759, + "learning_rate": 4.4723681307892764e-05, + "loss": 0.1606, + "step": 16234 + }, + { + "epoch": 0.28956943602183144, + "grad_norm": 0.22271159291267395, + "learning_rate": 4.472272485656774e-05, + "loss": 0.1603, + "step": 16235 + }, + { + "epoch": 0.2895872721435451, + "grad_norm": 0.2857767939567566, + "learning_rate": 4.472176832879064e-05, + "loss": 0.1911, + "step": 16236 + }, + { + "epoch": 0.2896051082652588, + "grad_norm": 0.2315034419298172, + "learning_rate": 4.472081172456514e-05, + "loss": 0.1822, + "step": 16237 + }, + { + "epoch": 0.2896229443869725, + "grad_norm": 0.2705047130584717, + "learning_rate": 4.4719855043894964e-05, + "loss": 0.1686, + "step": 16238 + }, + { + "epoch": 0.2896407805086862, + "grad_norm": 0.36118659377098083, + "learning_rate": 4.4718898286783825e-05, + "loss": 0.1684, + "step": 16239 + }, + { + "epoch": 0.2896586166303999, + "grad_norm": 0.24807317554950714, + "learning_rate": 4.4717941453235424e-05, + "loss": 0.1541, + "step": 16240 + }, + { + "epoch": 0.28967645275211357, + "grad_norm": 0.30158498883247375, + "learning_rate": 4.471698454325346e-05, + "loss": 0.1844, + "step": 16241 + }, + { + "epoch": 0.28969428887382725, + "grad_norm": 0.4301516115665436, + "learning_rate": 4.4716027556841666e-05, + "loss": 0.2252, + "step": 16242 + }, + { + "epoch": 0.28971212499554094, + "grad_norm": 0.24146386981010437, + "learning_rate": 4.471507049400374e-05, + "loss": 0.1745, + "step": 16243 + }, + { + "epoch": 0.2897299611172547, + "grad_norm": 0.2901882827281952, + "learning_rate": 4.471411335474338e-05, + "loss": 0.1803, + "step": 16244 + }, + { + "epoch": 0.2897477972389684, + "grad_norm": 0.2132776528596878, + "learning_rate": 4.471315613906432e-05, + "loss": 0.1488, + "step": 16245 + }, + { + "epoch": 0.28976563336068206, + "grad_norm": 0.24833212792873383, + "learning_rate": 4.4712198846970256e-05, + "loss": 0.1674, + "step": 16246 + }, + { + "epoch": 0.28978346948239575, + "grad_norm": 0.2625966966152191, + "learning_rate": 4.47112414784649e-05, + "loss": 0.1672, + "step": 16247 + }, + { + "epoch": 0.28980130560410944, + "grad_norm": 0.27172932028770447, + "learning_rate": 4.4710284033551965e-05, + "loss": 0.1671, + "step": 16248 + }, + { + "epoch": 0.2898191417258231, + "grad_norm": 0.336216002702713, + "learning_rate": 4.470932651223516e-05, + "loss": 0.2142, + "step": 16249 + }, + { + "epoch": 0.2898369778475368, + "grad_norm": 0.28624820709228516, + "learning_rate": 4.4708368914518196e-05, + "loss": 0.1741, + "step": 16250 + }, + { + "epoch": 0.2898548139692505, + "grad_norm": 0.2580035328865051, + "learning_rate": 4.4707411240404784e-05, + "loss": 0.1656, + "step": 16251 + }, + { + "epoch": 0.28987265009096425, + "grad_norm": 0.2556898593902588, + "learning_rate": 4.470645348989864e-05, + "loss": 0.119, + "step": 16252 + }, + { + "epoch": 0.28989048621267793, + "grad_norm": 0.36033087968826294, + "learning_rate": 4.470549566300348e-05, + "loss": 0.2276, + "step": 16253 + }, + { + "epoch": 0.2899083223343916, + "grad_norm": 0.2873746454715729, + "learning_rate": 4.4704537759723014e-05, + "loss": 0.1493, + "step": 16254 + }, + { + "epoch": 0.2899261584561053, + "grad_norm": 0.3023276925086975, + "learning_rate": 4.470357978006096e-05, + "loss": 0.1978, + "step": 16255 + }, + { + "epoch": 0.289943994577819, + "grad_norm": 0.30281975865364075, + "learning_rate": 4.470262172402101e-05, + "loss": 0.2077, + "step": 16256 + }, + { + "epoch": 0.2899618306995327, + "grad_norm": 0.2511160969734192, + "learning_rate": 4.47016635916069e-05, + "loss": 0.1688, + "step": 16257 + }, + { + "epoch": 0.2899796668212464, + "grad_norm": 0.20737150311470032, + "learning_rate": 4.470070538282234e-05, + "loss": 0.1596, + "step": 16258 + }, + { + "epoch": 0.28999750294296006, + "grad_norm": 0.4527236819267273, + "learning_rate": 4.4699747097671034e-05, + "loss": 0.2811, + "step": 16259 + }, + { + "epoch": 0.2900153390646738, + "grad_norm": 0.23146802186965942, + "learning_rate": 4.4698788736156714e-05, + "loss": 0.194, + "step": 16260 + }, + { + "epoch": 0.2900331751863875, + "grad_norm": 0.18410581350326538, + "learning_rate": 4.469783029828308e-05, + "loss": 0.1449, + "step": 16261 + }, + { + "epoch": 0.2900510113081012, + "grad_norm": 0.21852976083755493, + "learning_rate": 4.469687178405385e-05, + "loss": 0.1694, + "step": 16262 + }, + { + "epoch": 0.29006884742981487, + "grad_norm": 0.35814619064331055, + "learning_rate": 4.469591319347275e-05, + "loss": 0.2019, + "step": 16263 + }, + { + "epoch": 0.29008668355152856, + "grad_norm": 0.2415800392627716, + "learning_rate": 4.469495452654348e-05, + "loss": 0.1998, + "step": 16264 + }, + { + "epoch": 0.29010451967324224, + "grad_norm": 0.38169074058532715, + "learning_rate": 4.4693995783269766e-05, + "loss": 0.1764, + "step": 16265 + }, + { + "epoch": 0.29012235579495593, + "grad_norm": 0.22764852643013, + "learning_rate": 4.469303696365532e-05, + "loss": 0.1802, + "step": 16266 + }, + { + "epoch": 0.2901401919166696, + "grad_norm": 0.3218633532524109, + "learning_rate": 4.469207806770387e-05, + "loss": 0.1703, + "step": 16267 + }, + { + "epoch": 0.2901580280383833, + "grad_norm": 0.24549725651741028, + "learning_rate": 4.469111909541911e-05, + "loss": 0.1816, + "step": 16268 + }, + { + "epoch": 0.29017586416009705, + "grad_norm": 0.31724241375923157, + "learning_rate": 4.469016004680478e-05, + "loss": 0.1985, + "step": 16269 + }, + { + "epoch": 0.29019370028181074, + "grad_norm": 0.2785017490386963, + "learning_rate": 4.468920092186459e-05, + "loss": 0.1515, + "step": 16270 + }, + { + "epoch": 0.29021153640352443, + "grad_norm": 0.2354467809200287, + "learning_rate": 4.468824172060225e-05, + "loss": 0.1558, + "step": 16271 + }, + { + "epoch": 0.2902293725252381, + "grad_norm": 0.2722052335739136, + "learning_rate": 4.468728244302149e-05, + "loss": 0.1722, + "step": 16272 + }, + { + "epoch": 0.2902472086469518, + "grad_norm": 0.3607449233531952, + "learning_rate": 4.468632308912602e-05, + "loss": 0.2074, + "step": 16273 + }, + { + "epoch": 0.2902650447686655, + "grad_norm": 0.325567364692688, + "learning_rate": 4.468536365891957e-05, + "loss": 0.1837, + "step": 16274 + }, + { + "epoch": 0.2902828808903792, + "grad_norm": 0.2977498173713684, + "learning_rate": 4.4684404152405845e-05, + "loss": 0.1841, + "step": 16275 + }, + { + "epoch": 0.29030071701209287, + "grad_norm": 0.3762569725513458, + "learning_rate": 4.468344456958857e-05, + "loss": 0.182, + "step": 16276 + }, + { + "epoch": 0.2903185531338066, + "grad_norm": 0.22992998361587524, + "learning_rate": 4.4682484910471474e-05, + "loss": 0.2141, + "step": 16277 + }, + { + "epoch": 0.2903363892555203, + "grad_norm": 0.2527478337287903, + "learning_rate": 4.468152517505826e-05, + "loss": 0.1895, + "step": 16278 + }, + { + "epoch": 0.290354225377234, + "grad_norm": 0.2643086612224579, + "learning_rate": 4.4680565363352656e-05, + "loss": 0.1533, + "step": 16279 + }, + { + "epoch": 0.2903720614989477, + "grad_norm": 0.32514500617980957, + "learning_rate": 4.4679605475358385e-05, + "loss": 0.2437, + "step": 16280 + }, + { + "epoch": 0.29038989762066136, + "grad_norm": 0.3089638352394104, + "learning_rate": 4.467864551107917e-05, + "loss": 0.1694, + "step": 16281 + }, + { + "epoch": 0.29040773374237505, + "grad_norm": 0.3204690217971802, + "learning_rate": 4.4677685470518725e-05, + "loss": 0.1727, + "step": 16282 + }, + { + "epoch": 0.29042556986408874, + "grad_norm": 0.29917922616004944, + "learning_rate": 4.4676725353680776e-05, + "loss": 0.2017, + "step": 16283 + }, + { + "epoch": 0.2904434059858024, + "grad_norm": 0.2321767657995224, + "learning_rate": 4.467576516056904e-05, + "loss": 0.164, + "step": 16284 + }, + { + "epoch": 0.2904612421075161, + "grad_norm": 0.22574694454669952, + "learning_rate": 4.467480489118725e-05, + "loss": 0.1909, + "step": 16285 + }, + { + "epoch": 0.29047907822922986, + "grad_norm": 0.8753074407577515, + "learning_rate": 4.467384454553911e-05, + "loss": 0.3279, + "step": 16286 + }, + { + "epoch": 0.29049691435094355, + "grad_norm": 0.22182875871658325, + "learning_rate": 4.467288412362836e-05, + "loss": 0.1634, + "step": 16287 + }, + { + "epoch": 0.29051475047265723, + "grad_norm": 0.45023661851882935, + "learning_rate": 4.4671923625458715e-05, + "loss": 0.2519, + "step": 16288 + }, + { + "epoch": 0.2905325865943709, + "grad_norm": 0.22804221510887146, + "learning_rate": 4.46709630510339e-05, + "loss": 0.1292, + "step": 16289 + }, + { + "epoch": 0.2905504227160846, + "grad_norm": 0.2859315574169159, + "learning_rate": 4.4670002400357634e-05, + "loss": 0.1802, + "step": 16290 + }, + { + "epoch": 0.2905682588377983, + "grad_norm": 0.2908354103565216, + "learning_rate": 4.4669041673433654e-05, + "loss": 0.1816, + "step": 16291 + }, + { + "epoch": 0.290586094959512, + "grad_norm": 0.1963014006614685, + "learning_rate": 4.466808087026567e-05, + "loss": 0.155, + "step": 16292 + }, + { + "epoch": 0.2906039310812257, + "grad_norm": 0.25148773193359375, + "learning_rate": 4.466711999085741e-05, + "loss": 0.1798, + "step": 16293 + }, + { + "epoch": 0.2906217672029394, + "grad_norm": 0.2758645713329315, + "learning_rate": 4.46661590352126e-05, + "loss": 0.1609, + "step": 16294 + }, + { + "epoch": 0.2906396033246531, + "grad_norm": 0.40784990787506104, + "learning_rate": 4.466519800333497e-05, + "loss": 0.1995, + "step": 16295 + }, + { + "epoch": 0.2906574394463668, + "grad_norm": 0.23751278221607208, + "learning_rate": 4.466423689522824e-05, + "loss": 0.2073, + "step": 16296 + }, + { + "epoch": 0.2906752755680805, + "grad_norm": 0.30521532893180847, + "learning_rate": 4.4663275710896126e-05, + "loss": 0.1759, + "step": 16297 + }, + { + "epoch": 0.29069311168979417, + "grad_norm": 0.2473895251750946, + "learning_rate": 4.4662314450342365e-05, + "loss": 0.1801, + "step": 16298 + }, + { + "epoch": 0.29071094781150786, + "grad_norm": 0.2924625277519226, + "learning_rate": 4.466135311357069e-05, + "loss": 0.1567, + "step": 16299 + }, + { + "epoch": 0.29072878393322155, + "grad_norm": 0.2705855369567871, + "learning_rate": 4.4660391700584826e-05, + "loss": 0.2148, + "step": 16300 + }, + { + "epoch": 0.29074662005493523, + "grad_norm": 0.24080908298492432, + "learning_rate": 4.465943021138848e-05, + "loss": 0.1723, + "step": 16301 + }, + { + "epoch": 0.2907644561766489, + "grad_norm": 0.22023464739322662, + "learning_rate": 4.465846864598539e-05, + "loss": 0.1538, + "step": 16302 + }, + { + "epoch": 0.29078229229836267, + "grad_norm": 0.19990479946136475, + "learning_rate": 4.46575070043793e-05, + "loss": 0.1498, + "step": 16303 + }, + { + "epoch": 0.29080012842007635, + "grad_norm": 0.30216798186302185, + "learning_rate": 4.465654528657392e-05, + "loss": 0.1869, + "step": 16304 + }, + { + "epoch": 0.29081796454179004, + "grad_norm": 0.26628923416137695, + "learning_rate": 4.465558349257297e-05, + "loss": 0.1812, + "step": 16305 + }, + { + "epoch": 0.29083580066350373, + "grad_norm": 0.3141874670982361, + "learning_rate": 4.46546216223802e-05, + "loss": 0.1824, + "step": 16306 + }, + { + "epoch": 0.2908536367852174, + "grad_norm": 0.31333398818969727, + "learning_rate": 4.4653659675999326e-05, + "loss": 0.1777, + "step": 16307 + }, + { + "epoch": 0.2908714729069311, + "grad_norm": 0.19809776544570923, + "learning_rate": 4.465269765343408e-05, + "loss": 0.1858, + "step": 16308 + }, + { + "epoch": 0.2908893090286448, + "grad_norm": 0.3457790017127991, + "learning_rate": 4.4651735554688186e-05, + "loss": 0.1896, + "step": 16309 + }, + { + "epoch": 0.2909071451503585, + "grad_norm": 0.3072323203086853, + "learning_rate": 4.4650773379765374e-05, + "loss": 0.2637, + "step": 16310 + }, + { + "epoch": 0.2909249812720722, + "grad_norm": 0.19440221786499023, + "learning_rate": 4.4649811128669384e-05, + "loss": 0.1386, + "step": 16311 + }, + { + "epoch": 0.2909428173937859, + "grad_norm": 0.2634267210960388, + "learning_rate": 4.464884880140394e-05, + "loss": 0.173, + "step": 16312 + }, + { + "epoch": 0.2909606535154996, + "grad_norm": 0.20517735183238983, + "learning_rate": 4.464788639797277e-05, + "loss": 0.1873, + "step": 16313 + }, + { + "epoch": 0.2909784896372133, + "grad_norm": 0.28856387734413147, + "learning_rate": 4.464692391837961e-05, + "loss": 0.1789, + "step": 16314 + }, + { + "epoch": 0.290996325758927, + "grad_norm": 0.41785359382629395, + "learning_rate": 4.464596136262818e-05, + "loss": 0.1681, + "step": 16315 + }, + { + "epoch": 0.29101416188064066, + "grad_norm": 0.28234413266181946, + "learning_rate": 4.464499873072222e-05, + "loss": 0.2008, + "step": 16316 + }, + { + "epoch": 0.29103199800235435, + "grad_norm": 0.21627850830554962, + "learning_rate": 4.4644036022665456e-05, + "loss": 0.1726, + "step": 16317 + }, + { + "epoch": 0.29104983412406804, + "grad_norm": 0.2704116106033325, + "learning_rate": 4.464307323846163e-05, + "loss": 0.1629, + "step": 16318 + }, + { + "epoch": 0.2910676702457818, + "grad_norm": 0.2909679114818573, + "learning_rate": 4.464211037811447e-05, + "loss": 0.1834, + "step": 16319 + }, + { + "epoch": 0.2910855063674955, + "grad_norm": 0.20703548192977905, + "learning_rate": 4.46411474416277e-05, + "loss": 0.1561, + "step": 16320 + }, + { + "epoch": 0.29110334248920916, + "grad_norm": 0.29566439986228943, + "learning_rate": 4.464018442900506e-05, + "loss": 0.2282, + "step": 16321 + }, + { + "epoch": 0.29112117861092285, + "grad_norm": 0.2513722777366638, + "learning_rate": 4.4639221340250284e-05, + "loss": 0.2127, + "step": 16322 + }, + { + "epoch": 0.29113901473263654, + "grad_norm": 0.27034732699394226, + "learning_rate": 4.46382581753671e-05, + "loss": 0.212, + "step": 16323 + }, + { + "epoch": 0.2911568508543502, + "grad_norm": 0.3016135096549988, + "learning_rate": 4.463729493435925e-05, + "loss": 0.1854, + "step": 16324 + }, + { + "epoch": 0.2911746869760639, + "grad_norm": 0.22934338450431824, + "learning_rate": 4.463633161723045e-05, + "loss": 0.1349, + "step": 16325 + }, + { + "epoch": 0.2911925230977776, + "grad_norm": 0.27083444595336914, + "learning_rate": 4.463536822398446e-05, + "loss": 0.1493, + "step": 16326 + }, + { + "epoch": 0.2912103592194913, + "grad_norm": 0.23818299174308777, + "learning_rate": 4.463440475462499e-05, + "loss": 0.2084, + "step": 16327 + }, + { + "epoch": 0.29122819534120503, + "grad_norm": 0.2544391453266144, + "learning_rate": 4.463344120915579e-05, + "loss": 0.1728, + "step": 16328 + }, + { + "epoch": 0.2912460314629187, + "grad_norm": 0.27769050002098083, + "learning_rate": 4.4632477587580596e-05, + "loss": 0.1524, + "step": 16329 + }, + { + "epoch": 0.2912638675846324, + "grad_norm": 0.2713675796985626, + "learning_rate": 4.463151388990313e-05, + "loss": 0.1907, + "step": 16330 + }, + { + "epoch": 0.2912817037063461, + "grad_norm": 0.25703898072242737, + "learning_rate": 4.463055011612715e-05, + "loss": 0.2383, + "step": 16331 + }, + { + "epoch": 0.2912995398280598, + "grad_norm": 0.32257533073425293, + "learning_rate": 4.462958626625636e-05, + "loss": 0.1434, + "step": 16332 + }, + { + "epoch": 0.29131737594977347, + "grad_norm": 0.18602409958839417, + "learning_rate": 4.4628622340294526e-05, + "loss": 0.1671, + "step": 16333 + }, + { + "epoch": 0.29133521207148716, + "grad_norm": 0.34978222846984863, + "learning_rate": 4.462765833824536e-05, + "loss": 0.2345, + "step": 16334 + }, + { + "epoch": 0.29135304819320085, + "grad_norm": 0.2618108093738556, + "learning_rate": 4.4626694260112625e-05, + "loss": 0.1806, + "step": 16335 + }, + { + "epoch": 0.2913708843149146, + "grad_norm": 0.23017603158950806, + "learning_rate": 4.4625730105900034e-05, + "loss": 0.1656, + "step": 16336 + }, + { + "epoch": 0.2913887204366283, + "grad_norm": 0.3865819275379181, + "learning_rate": 4.462476587561134e-05, + "loss": 0.2095, + "step": 16337 + }, + { + "epoch": 0.29140655655834197, + "grad_norm": 0.2807057499885559, + "learning_rate": 4.462380156925027e-05, + "loss": 0.156, + "step": 16338 + }, + { + "epoch": 0.29142439268005566, + "grad_norm": 0.3241732716560364, + "learning_rate": 4.4622837186820574e-05, + "loss": 0.1944, + "step": 16339 + }, + { + "epoch": 0.29144222880176934, + "grad_norm": 0.26081281900405884, + "learning_rate": 4.462187272832597e-05, + "loss": 0.1735, + "step": 16340 + }, + { + "epoch": 0.29146006492348303, + "grad_norm": 0.3468260169029236, + "learning_rate": 4.4620908193770225e-05, + "loss": 0.2009, + "step": 16341 + }, + { + "epoch": 0.2914779010451967, + "grad_norm": 0.25653132796287537, + "learning_rate": 4.461994358315706e-05, + "loss": 0.2034, + "step": 16342 + }, + { + "epoch": 0.2914957371669104, + "grad_norm": 0.2266485095024109, + "learning_rate": 4.461897889649021e-05, + "loss": 0.171, + "step": 16343 + }, + { + "epoch": 0.2915135732886241, + "grad_norm": 0.26951655745506287, + "learning_rate": 4.4618014133773435e-05, + "loss": 0.2089, + "step": 16344 + }, + { + "epoch": 0.29153140941033784, + "grad_norm": 0.4335484206676483, + "learning_rate": 4.4617049295010446e-05, + "loss": 0.1743, + "step": 16345 + }, + { + "epoch": 0.2915492455320515, + "grad_norm": 0.2608528137207031, + "learning_rate": 4.4616084380205013e-05, + "loss": 0.198, + "step": 16346 + }, + { + "epoch": 0.2915670816537652, + "grad_norm": 0.24536316096782684, + "learning_rate": 4.461511938936085e-05, + "loss": 0.2209, + "step": 16347 + }, + { + "epoch": 0.2915849177754789, + "grad_norm": 0.26365524530410767, + "learning_rate": 4.461415432248172e-05, + "loss": 0.207, + "step": 16348 + }, + { + "epoch": 0.2916027538971926, + "grad_norm": 0.16918693482875824, + "learning_rate": 4.461318917957135e-05, + "loss": 0.1601, + "step": 16349 + }, + { + "epoch": 0.2916205900189063, + "grad_norm": 0.25554358959198, + "learning_rate": 4.461222396063348e-05, + "loss": 0.185, + "step": 16350 + }, + { + "epoch": 0.29163842614061997, + "grad_norm": 0.25917184352874756, + "learning_rate": 4.461125866567185e-05, + "loss": 0.1603, + "step": 16351 + }, + { + "epoch": 0.29165626226233365, + "grad_norm": 0.34760043025016785, + "learning_rate": 4.461029329469022e-05, + "loss": 0.2043, + "step": 16352 + }, + { + "epoch": 0.2916740983840474, + "grad_norm": 0.32935231924057007, + "learning_rate": 4.460932784769232e-05, + "loss": 0.1553, + "step": 16353 + }, + { + "epoch": 0.2916919345057611, + "grad_norm": 0.31214600801467896, + "learning_rate": 4.4608362324681885e-05, + "loss": 0.2333, + "step": 16354 + }, + { + "epoch": 0.2917097706274748, + "grad_norm": 0.25096604228019714, + "learning_rate": 4.4607396725662665e-05, + "loss": 0.1409, + "step": 16355 + }, + { + "epoch": 0.29172760674918846, + "grad_norm": 0.27495333552360535, + "learning_rate": 4.4606431050638406e-05, + "loss": 0.1643, + "step": 16356 + }, + { + "epoch": 0.29174544287090215, + "grad_norm": 0.31411153078079224, + "learning_rate": 4.460546529961285e-05, + "loss": 0.2227, + "step": 16357 + }, + { + "epoch": 0.29176327899261584, + "grad_norm": 0.4533727765083313, + "learning_rate": 4.460449947258974e-05, + "loss": 0.2987, + "step": 16358 + }, + { + "epoch": 0.2917811151143295, + "grad_norm": 0.23517009615898132, + "learning_rate": 4.4603533569572815e-05, + "loss": 0.2008, + "step": 16359 + }, + { + "epoch": 0.2917989512360432, + "grad_norm": 0.22573870420455933, + "learning_rate": 4.4602567590565827e-05, + "loss": 0.1616, + "step": 16360 + }, + { + "epoch": 0.2918167873577569, + "grad_norm": 0.21691378951072693, + "learning_rate": 4.4601601535572515e-05, + "loss": 0.17, + "step": 16361 + }, + { + "epoch": 0.29183462347947065, + "grad_norm": 0.365568071603775, + "learning_rate": 4.460063540459663e-05, + "loss": 0.2, + "step": 16362 + }, + { + "epoch": 0.29185245960118433, + "grad_norm": 0.2749923765659332, + "learning_rate": 4.45996691976419e-05, + "loss": 0.1711, + "step": 16363 + }, + { + "epoch": 0.291870295722898, + "grad_norm": 0.3170697093009949, + "learning_rate": 4.459870291471209e-05, + "loss": 0.1839, + "step": 16364 + }, + { + "epoch": 0.2918881318446117, + "grad_norm": 0.23263536393642426, + "learning_rate": 4.459773655581094e-05, + "loss": 0.1599, + "step": 16365 + }, + { + "epoch": 0.2919059679663254, + "grad_norm": 0.3439905047416687, + "learning_rate": 4.459677012094219e-05, + "loss": 0.2006, + "step": 16366 + }, + { + "epoch": 0.2919238040880391, + "grad_norm": 0.2759602963924408, + "learning_rate": 4.459580361010959e-05, + "loss": 0.162, + "step": 16367 + }, + { + "epoch": 0.2919416402097528, + "grad_norm": 0.247183158993721, + "learning_rate": 4.4594837023316896e-05, + "loss": 0.1624, + "step": 16368 + }, + { + "epoch": 0.29195947633146646, + "grad_norm": 0.2692636549472809, + "learning_rate": 4.4593870360567836e-05, + "loss": 0.1234, + "step": 16369 + }, + { + "epoch": 0.2919773124531802, + "grad_norm": 0.37884974479675293, + "learning_rate": 4.4592903621866165e-05, + "loss": 0.1377, + "step": 16370 + }, + { + "epoch": 0.2919951485748939, + "grad_norm": 0.23933671414852142, + "learning_rate": 4.459193680721564e-05, + "loss": 0.133, + "step": 16371 + }, + { + "epoch": 0.2920129846966076, + "grad_norm": 0.3173424005508423, + "learning_rate": 4.459096991662e-05, + "loss": 0.1654, + "step": 16372 + }, + { + "epoch": 0.29203082081832127, + "grad_norm": 0.3288251459598541, + "learning_rate": 4.459000295008299e-05, + "loss": 0.1909, + "step": 16373 + }, + { + "epoch": 0.29204865694003496, + "grad_norm": 0.3331115245819092, + "learning_rate": 4.4589035907608365e-05, + "loss": 0.2064, + "step": 16374 + }, + { + "epoch": 0.29206649306174864, + "grad_norm": 0.2873155176639557, + "learning_rate": 4.4588068789199875e-05, + "loss": 0.2163, + "step": 16375 + }, + { + "epoch": 0.29208432918346233, + "grad_norm": 0.24663789570331573, + "learning_rate": 4.4587101594861266e-05, + "loss": 0.183, + "step": 16376 + }, + { + "epoch": 0.292102165305176, + "grad_norm": 0.35439860820770264, + "learning_rate": 4.4586134324596276e-05, + "loss": 0.2341, + "step": 16377 + }, + { + "epoch": 0.29212000142688976, + "grad_norm": 0.2999183237552643, + "learning_rate": 4.4585166978408674e-05, + "loss": 0.162, + "step": 16378 + }, + { + "epoch": 0.29213783754860345, + "grad_norm": 0.29912668466567993, + "learning_rate": 4.45841995563022e-05, + "loss": 0.217, + "step": 16379 + }, + { + "epoch": 0.29215567367031714, + "grad_norm": 0.23091265559196472, + "learning_rate": 4.4583232058280594e-05, + "loss": 0.1503, + "step": 16380 + }, + { + "epoch": 0.2921735097920308, + "grad_norm": 0.2601783275604248, + "learning_rate": 4.4582264484347625e-05, + "loss": 0.1588, + "step": 16381 + }, + { + "epoch": 0.2921913459137445, + "grad_norm": 0.4871737062931061, + "learning_rate": 4.458129683450703e-05, + "loss": 0.2216, + "step": 16382 + }, + { + "epoch": 0.2922091820354582, + "grad_norm": 0.2018498182296753, + "learning_rate": 4.458032910876258e-05, + "loss": 0.1197, + "step": 16383 + }, + { + "epoch": 0.2922270181571719, + "grad_norm": 0.28650692105293274, + "learning_rate": 4.4579361307117994e-05, + "loss": 0.197, + "step": 16384 + }, + { + "epoch": 0.2922448542788856, + "grad_norm": 0.3046768307685852, + "learning_rate": 4.457839342957705e-05, + "loss": 0.2482, + "step": 16385 + }, + { + "epoch": 0.29226269040059927, + "grad_norm": 0.19669987261295319, + "learning_rate": 4.4577425476143484e-05, + "loss": 0.1836, + "step": 16386 + }, + { + "epoch": 0.292280526522313, + "grad_norm": 0.18548311293125153, + "learning_rate": 4.4576457446821065e-05, + "loss": 0.1525, + "step": 16387 + }, + { + "epoch": 0.2922983626440267, + "grad_norm": 0.3578382134437561, + "learning_rate": 4.457548934161353e-05, + "loss": 0.1644, + "step": 16388 + }, + { + "epoch": 0.2923161987657404, + "grad_norm": 0.2788953185081482, + "learning_rate": 4.457452116052463e-05, + "loss": 0.1934, + "step": 16389 + }, + { + "epoch": 0.2923340348874541, + "grad_norm": 0.20626167953014374, + "learning_rate": 4.457355290355814e-05, + "loss": 0.1689, + "step": 16390 + }, + { + "epoch": 0.29235187100916776, + "grad_norm": 0.3277955949306488, + "learning_rate": 4.4572584570717786e-05, + "loss": 0.1789, + "step": 16391 + }, + { + "epoch": 0.29236970713088145, + "grad_norm": 0.24848464131355286, + "learning_rate": 4.457161616200733e-05, + "loss": 0.1735, + "step": 16392 + }, + { + "epoch": 0.29238754325259514, + "grad_norm": 0.3722819983959198, + "learning_rate": 4.457064767743055e-05, + "loss": 0.2024, + "step": 16393 + }, + { + "epoch": 0.2924053793743088, + "grad_norm": 0.2871563136577606, + "learning_rate": 4.456967911699117e-05, + "loss": 0.1575, + "step": 16394 + }, + { + "epoch": 0.29242321549602257, + "grad_norm": 0.2498369663953781, + "learning_rate": 4.456871048069295e-05, + "loss": 0.1658, + "step": 16395 + }, + { + "epoch": 0.29244105161773626, + "grad_norm": 0.3902013301849365, + "learning_rate": 4.456774176853965e-05, + "loss": 0.1969, + "step": 16396 + }, + { + "epoch": 0.29245888773944995, + "grad_norm": 0.2376020848751068, + "learning_rate": 4.4566772980535035e-05, + "loss": 0.1741, + "step": 16397 + }, + { + "epoch": 0.29247672386116363, + "grad_norm": 0.31966328620910645, + "learning_rate": 4.456580411668284e-05, + "loss": 0.1481, + "step": 16398 + }, + { + "epoch": 0.2924945599828773, + "grad_norm": 0.2637951076030731, + "learning_rate": 4.456483517698683e-05, + "loss": 0.1791, + "step": 16399 + }, + { + "epoch": 0.292512396104591, + "grad_norm": 0.24155883491039276, + "learning_rate": 4.4563866161450764e-05, + "loss": 0.154, + "step": 16400 + }, + { + "epoch": 0.2925302322263047, + "grad_norm": 0.2941102087497711, + "learning_rate": 4.456289707007839e-05, + "loss": 0.1922, + "step": 16401 + }, + { + "epoch": 0.2925480683480184, + "grad_norm": 0.3191867768764496, + "learning_rate": 4.456192790287348e-05, + "loss": 0.2534, + "step": 16402 + }, + { + "epoch": 0.2925659044697321, + "grad_norm": 0.23790162801742554, + "learning_rate": 4.456095865983978e-05, + "loss": 0.1942, + "step": 16403 + }, + { + "epoch": 0.2925837405914458, + "grad_norm": 0.26743048429489136, + "learning_rate": 4.4559989340981045e-05, + "loss": 0.1774, + "step": 16404 + }, + { + "epoch": 0.2926015767131595, + "grad_norm": 0.27811184525489807, + "learning_rate": 4.455901994630103e-05, + "loss": 0.22, + "step": 16405 + }, + { + "epoch": 0.2926194128348732, + "grad_norm": 0.3244760036468506, + "learning_rate": 4.4558050475803505e-05, + "loss": 0.16, + "step": 16406 + }, + { + "epoch": 0.2926372489565869, + "grad_norm": 0.2791687250137329, + "learning_rate": 4.455708092949222e-05, + "loss": 0.1528, + "step": 16407 + }, + { + "epoch": 0.29265508507830057, + "grad_norm": 0.27157244086265564, + "learning_rate": 4.455611130737093e-05, + "loss": 0.1666, + "step": 16408 + }, + { + "epoch": 0.29267292120001426, + "grad_norm": 0.31352323293685913, + "learning_rate": 4.4555141609443406e-05, + "loss": 0.2554, + "step": 16409 + }, + { + "epoch": 0.29269075732172795, + "grad_norm": 0.2395026534795761, + "learning_rate": 4.45541718357134e-05, + "loss": 0.1732, + "step": 16410 + }, + { + "epoch": 0.29270859344344163, + "grad_norm": 0.3108391761779785, + "learning_rate": 4.455320198618466e-05, + "loss": 0.1944, + "step": 16411 + }, + { + "epoch": 0.2927264295651554, + "grad_norm": 0.38609376549720764, + "learning_rate": 4.455223206086097e-05, + "loss": 0.1585, + "step": 16412 + }, + { + "epoch": 0.29274426568686907, + "grad_norm": 0.28550460934638977, + "learning_rate": 4.4551262059746056e-05, + "loss": 0.191, + "step": 16413 + }, + { + "epoch": 0.29276210180858275, + "grad_norm": 0.270965039730072, + "learning_rate": 4.455029198284371e-05, + "loss": 0.1811, + "step": 16414 + }, + { + "epoch": 0.29277993793029644, + "grad_norm": 0.3054213225841522, + "learning_rate": 4.4549321830157674e-05, + "loss": 0.1804, + "step": 16415 + }, + { + "epoch": 0.29279777405201013, + "grad_norm": 0.1913258284330368, + "learning_rate": 4.4548351601691726e-05, + "loss": 0.1709, + "step": 16416 + }, + { + "epoch": 0.2928156101737238, + "grad_norm": 0.2570454478263855, + "learning_rate": 4.45473812974496e-05, + "loss": 0.1335, + "step": 16417 + }, + { + "epoch": 0.2928334462954375, + "grad_norm": 0.2797536253929138, + "learning_rate": 4.454641091743509e-05, + "loss": 0.1657, + "step": 16418 + }, + { + "epoch": 0.2928512824171512, + "grad_norm": 0.20599474012851715, + "learning_rate": 4.454544046165192e-05, + "loss": 0.1553, + "step": 16419 + }, + { + "epoch": 0.29286911853886494, + "grad_norm": 0.2767390012741089, + "learning_rate": 4.454446993010389e-05, + "loss": 0.167, + "step": 16420 + }, + { + "epoch": 0.2928869546605786, + "grad_norm": 0.2473897486925125, + "learning_rate": 4.454349932279474e-05, + "loss": 0.2034, + "step": 16421 + }, + { + "epoch": 0.2929047907822923, + "grad_norm": 0.3787566125392914, + "learning_rate": 4.4542528639728226e-05, + "loss": 0.1621, + "step": 16422 + }, + { + "epoch": 0.292922626904006, + "grad_norm": 0.31824785470962524, + "learning_rate": 4.454155788090813e-05, + "loss": 0.1533, + "step": 16423 + }, + { + "epoch": 0.2929404630257197, + "grad_norm": 0.3889113962650299, + "learning_rate": 4.454058704633821e-05, + "loss": 0.1981, + "step": 16424 + }, + { + "epoch": 0.2929582991474334, + "grad_norm": 0.20811069011688232, + "learning_rate": 4.453961613602221e-05, + "loss": 0.145, + "step": 16425 + }, + { + "epoch": 0.29297613526914706, + "grad_norm": 0.26095858216285706, + "learning_rate": 4.453864514996392e-05, + "loss": 0.1521, + "step": 16426 + }, + { + "epoch": 0.29299397139086075, + "grad_norm": 0.2778921127319336, + "learning_rate": 4.453767408816709e-05, + "loss": 0.1668, + "step": 16427 + }, + { + "epoch": 0.29301180751257444, + "grad_norm": 0.2969571053981781, + "learning_rate": 4.4536702950635494e-05, + "loss": 0.2083, + "step": 16428 + }, + { + "epoch": 0.2930296436342882, + "grad_norm": 0.2110590934753418, + "learning_rate": 4.453573173737289e-05, + "loss": 0.188, + "step": 16429 + }, + { + "epoch": 0.29304747975600187, + "grad_norm": 0.2499048411846161, + "learning_rate": 4.4534760448383026e-05, + "loss": 0.2003, + "step": 16430 + }, + { + "epoch": 0.29306531587771556, + "grad_norm": 0.3023947775363922, + "learning_rate": 4.45337890836697e-05, + "loss": 0.1991, + "step": 16431 + }, + { + "epoch": 0.29308315199942925, + "grad_norm": 0.28928518295288086, + "learning_rate": 4.453281764323666e-05, + "loss": 0.1671, + "step": 16432 + }, + { + "epoch": 0.29310098812114294, + "grad_norm": 0.22792431712150574, + "learning_rate": 4.453184612708766e-05, + "loss": 0.151, + "step": 16433 + }, + { + "epoch": 0.2931188242428566, + "grad_norm": 0.1910247653722763, + "learning_rate": 4.453087453522649e-05, + "loss": 0.124, + "step": 16434 + }, + { + "epoch": 0.2931366603645703, + "grad_norm": 0.2786944508552551, + "learning_rate": 4.4529902867656906e-05, + "loss": 0.1801, + "step": 16435 + }, + { + "epoch": 0.293154496486284, + "grad_norm": 0.284684419631958, + "learning_rate": 4.4528931124382666e-05, + "loss": 0.1949, + "step": 16436 + }, + { + "epoch": 0.29317233260799774, + "grad_norm": 0.23884810507297516, + "learning_rate": 4.452795930540754e-05, + "loss": 0.1779, + "step": 16437 + }, + { + "epoch": 0.29319016872971143, + "grad_norm": 0.2093641757965088, + "learning_rate": 4.452698741073531e-05, + "loss": 0.1926, + "step": 16438 + }, + { + "epoch": 0.2932080048514251, + "grad_norm": 0.21617797017097473, + "learning_rate": 4.452601544036972e-05, + "loss": 0.1285, + "step": 16439 + }, + { + "epoch": 0.2932258409731388, + "grad_norm": 0.3104894459247589, + "learning_rate": 4.452504339431456e-05, + "loss": 0.1905, + "step": 16440 + }, + { + "epoch": 0.2932436770948525, + "grad_norm": 0.2498467117547989, + "learning_rate": 4.4524071272573586e-05, + "loss": 0.2091, + "step": 16441 + }, + { + "epoch": 0.2932615132165662, + "grad_norm": 0.23353685438632965, + "learning_rate": 4.4523099075150563e-05, + "loss": 0.1366, + "step": 16442 + }, + { + "epoch": 0.29327934933827987, + "grad_norm": 0.4609735906124115, + "learning_rate": 4.452212680204927e-05, + "loss": 0.2211, + "step": 16443 + }, + { + "epoch": 0.29329718545999356, + "grad_norm": 0.2026536762714386, + "learning_rate": 4.452115445327347e-05, + "loss": 0.1319, + "step": 16444 + }, + { + "epoch": 0.29331502158170725, + "grad_norm": 0.23939964175224304, + "learning_rate": 4.452018202882694e-05, + "loss": 0.1968, + "step": 16445 + }, + { + "epoch": 0.293332857703421, + "grad_norm": 0.458141028881073, + "learning_rate": 4.4519209528713436e-05, + "loss": 0.2106, + "step": 16446 + }, + { + "epoch": 0.2933506938251347, + "grad_norm": 0.21322348713874817, + "learning_rate": 4.451823695293673e-05, + "loss": 0.1871, + "step": 16447 + }, + { + "epoch": 0.29336852994684837, + "grad_norm": 0.22875843942165375, + "learning_rate": 4.451726430150061e-05, + "loss": 0.1857, + "step": 16448 + }, + { + "epoch": 0.29338636606856205, + "grad_norm": 0.23780286312103271, + "learning_rate": 4.4516291574408815e-05, + "loss": 0.1526, + "step": 16449 + }, + { + "epoch": 0.29340420219027574, + "grad_norm": 0.22688932716846466, + "learning_rate": 4.4515318771665134e-05, + "loss": 0.1538, + "step": 16450 + }, + { + "epoch": 0.29342203831198943, + "grad_norm": 0.25661998987197876, + "learning_rate": 4.451434589327335e-05, + "loss": 0.2017, + "step": 16451 + }, + { + "epoch": 0.2934398744337031, + "grad_norm": 0.28685420751571655, + "learning_rate": 4.4513372939237217e-05, + "loss": 0.1947, + "step": 16452 + }, + { + "epoch": 0.2934577105554168, + "grad_norm": 0.2017870396375656, + "learning_rate": 4.45123999095605e-05, + "loss": 0.1575, + "step": 16453 + }, + { + "epoch": 0.29347554667713055, + "grad_norm": 0.17266018688678741, + "learning_rate": 4.451142680424699e-05, + "loss": 0.1315, + "step": 16454 + }, + { + "epoch": 0.29349338279884424, + "grad_norm": 0.24347445368766785, + "learning_rate": 4.4510453623300455e-05, + "loss": 0.1705, + "step": 16455 + }, + { + "epoch": 0.2935112189205579, + "grad_norm": 0.34976738691329956, + "learning_rate": 4.450948036672466e-05, + "loss": 0.2077, + "step": 16456 + }, + { + "epoch": 0.2935290550422716, + "grad_norm": 0.25179556012153625, + "learning_rate": 4.450850703452338e-05, + "loss": 0.1719, + "step": 16457 + }, + { + "epoch": 0.2935468911639853, + "grad_norm": 0.26511818170547485, + "learning_rate": 4.450753362670039e-05, + "loss": 0.2081, + "step": 16458 + }, + { + "epoch": 0.293564727285699, + "grad_norm": 0.192202627658844, + "learning_rate": 4.450656014325946e-05, + "loss": 0.1345, + "step": 16459 + }, + { + "epoch": 0.2935825634074127, + "grad_norm": 0.19687429070472717, + "learning_rate": 4.450558658420436e-05, + "loss": 0.1406, + "step": 16460 + }, + { + "epoch": 0.29360039952912637, + "grad_norm": 0.22294986248016357, + "learning_rate": 4.450461294953888e-05, + "loss": 0.2098, + "step": 16461 + }, + { + "epoch": 0.29361823565084005, + "grad_norm": 0.2620992660522461, + "learning_rate": 4.450363923926678e-05, + "loss": 0.209, + "step": 16462 + }, + { + "epoch": 0.2936360717725538, + "grad_norm": 0.2383774369955063, + "learning_rate": 4.4502665453391835e-05, + "loss": 0.1484, + "step": 16463 + }, + { + "epoch": 0.2936539078942675, + "grad_norm": 0.16639412939548492, + "learning_rate": 4.450169159191783e-05, + "loss": 0.1278, + "step": 16464 + }, + { + "epoch": 0.2936717440159812, + "grad_norm": 0.32829856872558594, + "learning_rate": 4.450071765484852e-05, + "loss": 0.1683, + "step": 16465 + }, + { + "epoch": 0.29368958013769486, + "grad_norm": 0.3034323751926422, + "learning_rate": 4.449974364218771e-05, + "loss": 0.1302, + "step": 16466 + }, + { + "epoch": 0.29370741625940855, + "grad_norm": 0.3968747556209564, + "learning_rate": 4.449876955393914e-05, + "loss": 0.1726, + "step": 16467 + }, + { + "epoch": 0.29372525238112224, + "grad_norm": 0.25039324164390564, + "learning_rate": 4.449779539010661e-05, + "loss": 0.1478, + "step": 16468 + }, + { + "epoch": 0.2937430885028359, + "grad_norm": 0.2308914214372635, + "learning_rate": 4.44968211506939e-05, + "loss": 0.1033, + "step": 16469 + }, + { + "epoch": 0.2937609246245496, + "grad_norm": 0.2924995720386505, + "learning_rate": 4.449584683570477e-05, + "loss": 0.2233, + "step": 16470 + }, + { + "epoch": 0.29377876074626336, + "grad_norm": 0.2871107757091522, + "learning_rate": 4.449487244514301e-05, + "loss": 0.2046, + "step": 16471 + }, + { + "epoch": 0.29379659686797704, + "grad_norm": 0.2542024254798889, + "learning_rate": 4.4493897979012386e-05, + "loss": 0.1109, + "step": 16472 + }, + { + "epoch": 0.29381443298969073, + "grad_norm": 0.296866774559021, + "learning_rate": 4.449292343731668e-05, + "loss": 0.1886, + "step": 16473 + }, + { + "epoch": 0.2938322691114044, + "grad_norm": 0.32440072298049927, + "learning_rate": 4.449194882005967e-05, + "loss": 0.192, + "step": 16474 + }, + { + "epoch": 0.2938501052331181, + "grad_norm": 0.35222840309143066, + "learning_rate": 4.449097412724513e-05, + "loss": 0.2455, + "step": 16475 + }, + { + "epoch": 0.2938679413548318, + "grad_norm": 0.2419472187757492, + "learning_rate": 4.4489999358876855e-05, + "loss": 0.1846, + "step": 16476 + }, + { + "epoch": 0.2938857774765455, + "grad_norm": 0.29099488258361816, + "learning_rate": 4.44890245149586e-05, + "loss": 0.1922, + "step": 16477 + }, + { + "epoch": 0.2939036135982592, + "grad_norm": 0.2634369432926178, + "learning_rate": 4.448804959549416e-05, + "loss": 0.2269, + "step": 16478 + }, + { + "epoch": 0.2939214497199729, + "grad_norm": 0.2559466063976288, + "learning_rate": 4.44870746004873e-05, + "loss": 0.1766, + "step": 16479 + }, + { + "epoch": 0.2939392858416866, + "grad_norm": 0.253835529088974, + "learning_rate": 4.4486099529941825e-05, + "loss": 0.1702, + "step": 16480 + }, + { + "epoch": 0.2939571219634003, + "grad_norm": 0.256608784198761, + "learning_rate": 4.4485124383861485e-05, + "loss": 0.2, + "step": 16481 + }, + { + "epoch": 0.293974958085114, + "grad_norm": 0.24180968105793, + "learning_rate": 4.448414916225008e-05, + "loss": 0.1519, + "step": 16482 + }, + { + "epoch": 0.29399279420682767, + "grad_norm": 0.2222093939781189, + "learning_rate": 4.448317386511137e-05, + "loss": 0.1588, + "step": 16483 + }, + { + "epoch": 0.29401063032854136, + "grad_norm": 0.3812173902988434, + "learning_rate": 4.448219849244916e-05, + "loss": 0.223, + "step": 16484 + }, + { + "epoch": 0.29402846645025504, + "grad_norm": 0.2957015037536621, + "learning_rate": 4.4481223044267216e-05, + "loss": 0.1788, + "step": 16485 + }, + { + "epoch": 0.29404630257196873, + "grad_norm": 0.2898963391780853, + "learning_rate": 4.448024752056933e-05, + "loss": 0.1845, + "step": 16486 + }, + { + "epoch": 0.2940641386936824, + "grad_norm": 0.21822543442249298, + "learning_rate": 4.4479271921359275e-05, + "loss": 0.1415, + "step": 16487 + }, + { + "epoch": 0.29408197481539616, + "grad_norm": 0.26728910207748413, + "learning_rate": 4.447829624664083e-05, + "loss": 0.2164, + "step": 16488 + }, + { + "epoch": 0.29409981093710985, + "grad_norm": 0.2585594356060028, + "learning_rate": 4.447732049641778e-05, + "loss": 0.1235, + "step": 16489 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.26438191533088684, + "learning_rate": 4.44763446706939e-05, + "loss": 0.1874, + "step": 16490 + }, + { + "epoch": 0.2941354831805372, + "grad_norm": 0.2708672285079956, + "learning_rate": 4.4475368769473e-05, + "loss": 0.236, + "step": 16491 + }, + { + "epoch": 0.2941533193022509, + "grad_norm": 0.16733378171920776, + "learning_rate": 4.447439279275884e-05, + "loss": 0.1544, + "step": 16492 + }, + { + "epoch": 0.2941711554239646, + "grad_norm": 0.21120314300060272, + "learning_rate": 4.44734167405552e-05, + "loss": 0.191, + "step": 16493 + }, + { + "epoch": 0.2941889915456783, + "grad_norm": 0.3308258354663849, + "learning_rate": 4.4472440612865865e-05, + "loss": 0.2124, + "step": 16494 + }, + { + "epoch": 0.294206827667392, + "grad_norm": 0.264169305562973, + "learning_rate": 4.4471464409694635e-05, + "loss": 0.1778, + "step": 16495 + }, + { + "epoch": 0.2942246637891057, + "grad_norm": 0.22388221323490143, + "learning_rate": 4.447048813104528e-05, + "loss": 0.2072, + "step": 16496 + }, + { + "epoch": 0.2942424999108194, + "grad_norm": 0.2517063617706299, + "learning_rate": 4.446951177692159e-05, + "loss": 0.2238, + "step": 16497 + }, + { + "epoch": 0.2942603360325331, + "grad_norm": 0.2153603434562683, + "learning_rate": 4.446853534732735e-05, + "loss": 0.1738, + "step": 16498 + }, + { + "epoch": 0.2942781721542468, + "grad_norm": 0.241929829120636, + "learning_rate": 4.446755884226635e-05, + "loss": 0.1987, + "step": 16499 + }, + { + "epoch": 0.2942960082759605, + "grad_norm": 0.2276146560907364, + "learning_rate": 4.446658226174235e-05, + "loss": 0.1773, + "step": 16500 + }, + { + "epoch": 0.29431384439767416, + "grad_norm": 0.2956530451774597, + "learning_rate": 4.446560560575917e-05, + "loss": 0.1881, + "step": 16501 + }, + { + "epoch": 0.29433168051938785, + "grad_norm": 0.36412230134010315, + "learning_rate": 4.446462887432056e-05, + "loss": 0.1959, + "step": 16502 + }, + { + "epoch": 0.29434951664110154, + "grad_norm": 0.20981663465499878, + "learning_rate": 4.4463652067430336e-05, + "loss": 0.1561, + "step": 16503 + }, + { + "epoch": 0.2943673527628152, + "grad_norm": 0.2986816465854645, + "learning_rate": 4.446267518509228e-05, + "loss": 0.1741, + "step": 16504 + }, + { + "epoch": 0.29438518888452897, + "grad_norm": 0.2284894436597824, + "learning_rate": 4.4461698227310164e-05, + "loss": 0.1851, + "step": 16505 + }, + { + "epoch": 0.29440302500624266, + "grad_norm": 0.2480347603559494, + "learning_rate": 4.4460721194087785e-05, + "loss": 0.1842, + "step": 16506 + }, + { + "epoch": 0.29442086112795635, + "grad_norm": 0.3044646978378296, + "learning_rate": 4.4459744085428935e-05, + "loss": 0.1632, + "step": 16507 + }, + { + "epoch": 0.29443869724967003, + "grad_norm": 0.30180442333221436, + "learning_rate": 4.445876690133739e-05, + "loss": 0.2162, + "step": 16508 + }, + { + "epoch": 0.2944565333713837, + "grad_norm": 0.29936933517456055, + "learning_rate": 4.445778964181695e-05, + "loss": 0.1624, + "step": 16509 + }, + { + "epoch": 0.2944743694930974, + "grad_norm": 0.26864272356033325, + "learning_rate": 4.445681230687139e-05, + "loss": 0.1276, + "step": 16510 + }, + { + "epoch": 0.2944922056148111, + "grad_norm": 0.2878575026988983, + "learning_rate": 4.445583489650451e-05, + "loss": 0.1557, + "step": 16511 + }, + { + "epoch": 0.2945100417365248, + "grad_norm": 0.3502851128578186, + "learning_rate": 4.44548574107201e-05, + "loss": 0.1874, + "step": 16512 + }, + { + "epoch": 0.29452787785823853, + "grad_norm": 0.22545097768306732, + "learning_rate": 4.445387984952193e-05, + "loss": 0.1757, + "step": 16513 + }, + { + "epoch": 0.2945457139799522, + "grad_norm": 0.20361314713954926, + "learning_rate": 4.445290221291381e-05, + "loss": 0.1689, + "step": 16514 + }, + { + "epoch": 0.2945635501016659, + "grad_norm": 0.2162192314863205, + "learning_rate": 4.445192450089952e-05, + "loss": 0.183, + "step": 16515 + }, + { + "epoch": 0.2945813862233796, + "grad_norm": 0.2695448696613312, + "learning_rate": 4.445094671348285e-05, + "loss": 0.1736, + "step": 16516 + }, + { + "epoch": 0.2945992223450933, + "grad_norm": 0.32857251167297363, + "learning_rate": 4.4449968850667595e-05, + "loss": 0.1821, + "step": 16517 + }, + { + "epoch": 0.29461705846680697, + "grad_norm": 0.36193594336509705, + "learning_rate": 4.444899091245754e-05, + "loss": 0.2042, + "step": 16518 + }, + { + "epoch": 0.29463489458852066, + "grad_norm": 0.24590055644512177, + "learning_rate": 4.444801289885648e-05, + "loss": 0.1672, + "step": 16519 + }, + { + "epoch": 0.29465273071023435, + "grad_norm": 0.29725736379623413, + "learning_rate": 4.444703480986821e-05, + "loss": 0.2214, + "step": 16520 + }, + { + "epoch": 0.2946705668319481, + "grad_norm": 0.3494088053703308, + "learning_rate": 4.4446056645496515e-05, + "loss": 0.1908, + "step": 16521 + }, + { + "epoch": 0.2946884029536618, + "grad_norm": 0.22640679776668549, + "learning_rate": 4.4445078405745186e-05, + "loss": 0.1945, + "step": 16522 + }, + { + "epoch": 0.29470623907537546, + "grad_norm": 0.1627998650074005, + "learning_rate": 4.4444100090618014e-05, + "loss": 0.1245, + "step": 16523 + }, + { + "epoch": 0.29472407519708915, + "grad_norm": 0.29106754064559937, + "learning_rate": 4.4443121700118795e-05, + "loss": 0.1671, + "step": 16524 + }, + { + "epoch": 0.29474191131880284, + "grad_norm": 0.38282686471939087, + "learning_rate": 4.444214323425133e-05, + "loss": 0.2336, + "step": 16525 + }, + { + "epoch": 0.29475974744051653, + "grad_norm": 0.22827591001987457, + "learning_rate": 4.4441164693019385e-05, + "loss": 0.1762, + "step": 16526 + }, + { + "epoch": 0.2947775835622302, + "grad_norm": 0.3506973087787628, + "learning_rate": 4.444018607642679e-05, + "loss": 0.16, + "step": 16527 + }, + { + "epoch": 0.2947954196839439, + "grad_norm": 0.2809859812259674, + "learning_rate": 4.4439207384477313e-05, + "loss": 0.1977, + "step": 16528 + }, + { + "epoch": 0.2948132558056576, + "grad_norm": 0.21978464722633362, + "learning_rate": 4.443822861717475e-05, + "loss": 0.1747, + "step": 16529 + }, + { + "epoch": 0.29483109192737134, + "grad_norm": 0.27921798825263977, + "learning_rate": 4.44372497745229e-05, + "loss": 0.1817, + "step": 16530 + }, + { + "epoch": 0.294848928049085, + "grad_norm": 0.23094773292541504, + "learning_rate": 4.4436270856525555e-05, + "loss": 0.1646, + "step": 16531 + }, + { + "epoch": 0.2948667641707987, + "grad_norm": 0.31329360604286194, + "learning_rate": 4.443529186318651e-05, + "loss": 0.154, + "step": 16532 + }, + { + "epoch": 0.2948846002925124, + "grad_norm": 0.3997662663459778, + "learning_rate": 4.443431279450957e-05, + "loss": 0.2434, + "step": 16533 + }, + { + "epoch": 0.2949024364142261, + "grad_norm": 0.287011981010437, + "learning_rate": 4.443333365049851e-05, + "loss": 0.1735, + "step": 16534 + }, + { + "epoch": 0.2949202725359398, + "grad_norm": 0.24818311631679535, + "learning_rate": 4.443235443115715e-05, + "loss": 0.1732, + "step": 16535 + }, + { + "epoch": 0.29493810865765346, + "grad_norm": 0.256538450717926, + "learning_rate": 4.4431375136489264e-05, + "loss": 0.1892, + "step": 16536 + }, + { + "epoch": 0.29495594477936715, + "grad_norm": 0.31276530027389526, + "learning_rate": 4.4430395766498654e-05, + "loss": 0.1708, + "step": 16537 + }, + { + "epoch": 0.2949737809010809, + "grad_norm": 0.3159414529800415, + "learning_rate": 4.442941632118912e-05, + "loss": 0.1441, + "step": 16538 + }, + { + "epoch": 0.2949916170227946, + "grad_norm": 0.21607443690299988, + "learning_rate": 4.4428436800564464e-05, + "loss": 0.1666, + "step": 16539 + }, + { + "epoch": 0.29500945314450827, + "grad_norm": 0.287865549325943, + "learning_rate": 4.442745720462847e-05, + "loss": 0.138, + "step": 16540 + }, + { + "epoch": 0.29502728926622196, + "grad_norm": 0.2994736433029175, + "learning_rate": 4.4426477533384944e-05, + "loss": 0.2064, + "step": 16541 + }, + { + "epoch": 0.29504512538793565, + "grad_norm": 0.2550939917564392, + "learning_rate": 4.4425497786837685e-05, + "loss": 0.1722, + "step": 16542 + }, + { + "epoch": 0.29506296150964934, + "grad_norm": 0.26640585064888, + "learning_rate": 4.4424517964990486e-05, + "loss": 0.1566, + "step": 16543 + }, + { + "epoch": 0.295080797631363, + "grad_norm": 0.21472834050655365, + "learning_rate": 4.4423538067847146e-05, + "loss": 0.1894, + "step": 16544 + }, + { + "epoch": 0.2950986337530767, + "grad_norm": 0.2908587157726288, + "learning_rate": 4.442255809541146e-05, + "loss": 0.1816, + "step": 16545 + }, + { + "epoch": 0.2951164698747904, + "grad_norm": 0.3289947807788849, + "learning_rate": 4.442157804768723e-05, + "loss": 0.2153, + "step": 16546 + }, + { + "epoch": 0.29513430599650414, + "grad_norm": 0.24969275295734406, + "learning_rate": 4.442059792467827e-05, + "loss": 0.1676, + "step": 16547 + }, + { + "epoch": 0.29515214211821783, + "grad_norm": 0.23231945931911469, + "learning_rate": 4.441961772638834e-05, + "loss": 0.1153, + "step": 16548 + }, + { + "epoch": 0.2951699782399315, + "grad_norm": 0.2817572057247162, + "learning_rate": 4.441863745282128e-05, + "loss": 0.1612, + "step": 16549 + }, + { + "epoch": 0.2951878143616452, + "grad_norm": 0.29330912232398987, + "learning_rate": 4.441765710398087e-05, + "loss": 0.1628, + "step": 16550 + }, + { + "epoch": 0.2952056504833589, + "grad_norm": 0.26147744059562683, + "learning_rate": 4.441667667987092e-05, + "loss": 0.1777, + "step": 16551 + }, + { + "epoch": 0.2952234866050726, + "grad_norm": 0.2868327498435974, + "learning_rate": 4.4415696180495225e-05, + "loss": 0.1817, + "step": 16552 + }, + { + "epoch": 0.29524132272678627, + "grad_norm": 0.29684051871299744, + "learning_rate": 4.441471560585758e-05, + "loss": 0.2259, + "step": 16553 + }, + { + "epoch": 0.29525915884849996, + "grad_norm": 0.23250903189182281, + "learning_rate": 4.4413734955961795e-05, + "loss": 0.1579, + "step": 16554 + }, + { + "epoch": 0.2952769949702137, + "grad_norm": 0.29615867137908936, + "learning_rate": 4.441275423081166e-05, + "loss": 0.173, + "step": 16555 + }, + { + "epoch": 0.2952948310919274, + "grad_norm": 0.20359086990356445, + "learning_rate": 4.4411773430410997e-05, + "loss": 0.1345, + "step": 16556 + }, + { + "epoch": 0.2953126672136411, + "grad_norm": 0.19083966314792633, + "learning_rate": 4.441079255476359e-05, + "loss": 0.1509, + "step": 16557 + }, + { + "epoch": 0.29533050333535477, + "grad_norm": 0.36032634973526, + "learning_rate": 4.440981160387324e-05, + "loss": 0.1599, + "step": 16558 + }, + { + "epoch": 0.29534833945706845, + "grad_norm": 0.3039087951183319, + "learning_rate": 4.440883057774377e-05, + "loss": 0.1573, + "step": 16559 + }, + { + "epoch": 0.29536617557878214, + "grad_norm": 0.4068549573421478, + "learning_rate": 4.440784947637896e-05, + "loss": 0.206, + "step": 16560 + }, + { + "epoch": 0.29538401170049583, + "grad_norm": 0.21798011660575867, + "learning_rate": 4.440686829978262e-05, + "loss": 0.1665, + "step": 16561 + }, + { + "epoch": 0.2954018478222095, + "grad_norm": 0.19881121814250946, + "learning_rate": 4.4405887047958564e-05, + "loss": 0.1605, + "step": 16562 + }, + { + "epoch": 0.2954196839439232, + "grad_norm": 0.25778138637542725, + "learning_rate": 4.440490572091058e-05, + "loss": 0.1866, + "step": 16563 + }, + { + "epoch": 0.29543752006563695, + "grad_norm": 0.2487366944551468, + "learning_rate": 4.440392431864248e-05, + "loss": 0.1643, + "step": 16564 + }, + { + "epoch": 0.29545535618735064, + "grad_norm": 0.25507956743240356, + "learning_rate": 4.4402942841158065e-05, + "loss": 0.1417, + "step": 16565 + }, + { + "epoch": 0.2954731923090643, + "grad_norm": 0.23990201950073242, + "learning_rate": 4.4401961288461156e-05, + "loss": 0.1547, + "step": 16566 + }, + { + "epoch": 0.295491028430778, + "grad_norm": 0.2873901128768921, + "learning_rate": 4.4400979660555533e-05, + "loss": 0.1344, + "step": 16567 + }, + { + "epoch": 0.2955088645524917, + "grad_norm": 0.2868228554725647, + "learning_rate": 4.4399997957445014e-05, + "loss": 0.214, + "step": 16568 + }, + { + "epoch": 0.2955267006742054, + "grad_norm": 0.27751776576042175, + "learning_rate": 4.4399016179133404e-05, + "loss": 0.183, + "step": 16569 + }, + { + "epoch": 0.2955445367959191, + "grad_norm": 0.2701549828052521, + "learning_rate": 4.43980343256245e-05, + "loss": 0.2012, + "step": 16570 + }, + { + "epoch": 0.29556237291763277, + "grad_norm": 0.302306205034256, + "learning_rate": 4.439705239692212e-05, + "loss": 0.1199, + "step": 16571 + }, + { + "epoch": 0.2955802090393465, + "grad_norm": 0.2979692220687866, + "learning_rate": 4.439607039303006e-05, + "loss": 0.1567, + "step": 16572 + }, + { + "epoch": 0.2955980451610602, + "grad_norm": 0.34140437841415405, + "learning_rate": 4.439508831395214e-05, + "loss": 0.2476, + "step": 16573 + }, + { + "epoch": 0.2956158812827739, + "grad_norm": 0.35357666015625, + "learning_rate": 4.439410615969216e-05, + "loss": 0.1466, + "step": 16574 + }, + { + "epoch": 0.2956337174044876, + "grad_norm": 0.25594770908355713, + "learning_rate": 4.4393123930253924e-05, + "loss": 0.1402, + "step": 16575 + }, + { + "epoch": 0.29565155352620126, + "grad_norm": 0.2982681095600128, + "learning_rate": 4.439214162564124e-05, + "loss": 0.1896, + "step": 16576 + }, + { + "epoch": 0.29566938964791495, + "grad_norm": 0.281310111284256, + "learning_rate": 4.439115924585792e-05, + "loss": 0.1501, + "step": 16577 + }, + { + "epoch": 0.29568722576962864, + "grad_norm": 0.30204230546951294, + "learning_rate": 4.439017679090775e-05, + "loss": 0.171, + "step": 16578 + }, + { + "epoch": 0.2957050618913423, + "grad_norm": 0.20857369899749756, + "learning_rate": 4.438919426079458e-05, + "loss": 0.2046, + "step": 16579 + }, + { + "epoch": 0.29572289801305607, + "grad_norm": 0.17907004058361053, + "learning_rate": 4.438821165552219e-05, + "loss": 0.1425, + "step": 16580 + }, + { + "epoch": 0.29574073413476976, + "grad_norm": 0.20068205893039703, + "learning_rate": 4.43872289750944e-05, + "loss": 0.1806, + "step": 16581 + }, + { + "epoch": 0.29575857025648344, + "grad_norm": 0.281717985868454, + "learning_rate": 4.4386246219515e-05, + "loss": 0.1969, + "step": 16582 + }, + { + "epoch": 0.29577640637819713, + "grad_norm": 0.26462844014167786, + "learning_rate": 4.438526338878783e-05, + "loss": 0.1409, + "step": 16583 + }, + { + "epoch": 0.2957942424999108, + "grad_norm": 0.2483363151550293, + "learning_rate": 4.438428048291667e-05, + "loss": 0.2049, + "step": 16584 + }, + { + "epoch": 0.2958120786216245, + "grad_norm": 0.19238242506980896, + "learning_rate": 4.438329750190535e-05, + "loss": 0.1468, + "step": 16585 + }, + { + "epoch": 0.2958299147433382, + "grad_norm": 0.1948329210281372, + "learning_rate": 4.438231444575768e-05, + "loss": 0.1252, + "step": 16586 + }, + { + "epoch": 0.2958477508650519, + "grad_norm": 0.27889081835746765, + "learning_rate": 4.438133131447746e-05, + "loss": 0.1692, + "step": 16587 + }, + { + "epoch": 0.29586558698676557, + "grad_norm": 0.27356308698654175, + "learning_rate": 4.4380348108068506e-05, + "loss": 0.149, + "step": 16588 + }, + { + "epoch": 0.2958834231084793, + "grad_norm": 0.33214470744132996, + "learning_rate": 4.437936482653463e-05, + "loss": 0.1724, + "step": 16589 + }, + { + "epoch": 0.295901259230193, + "grad_norm": 0.2601211369037628, + "learning_rate": 4.437838146987964e-05, + "loss": 0.1972, + "step": 16590 + }, + { + "epoch": 0.2959190953519067, + "grad_norm": 0.19905738532543182, + "learning_rate": 4.437739803810735e-05, + "loss": 0.1441, + "step": 16591 + }, + { + "epoch": 0.2959369314736204, + "grad_norm": 0.38575562834739685, + "learning_rate": 4.4376414531221574e-05, + "loss": 0.2653, + "step": 16592 + }, + { + "epoch": 0.29595476759533407, + "grad_norm": 0.279782235622406, + "learning_rate": 4.4375430949226114e-05, + "loss": 0.1601, + "step": 16593 + }, + { + "epoch": 0.29597260371704776, + "grad_norm": 0.27133673429489136, + "learning_rate": 4.4374447292124806e-05, + "loss": 0.1995, + "step": 16594 + }, + { + "epoch": 0.29599043983876144, + "grad_norm": 0.26143351197242737, + "learning_rate": 4.437346355992144e-05, + "loss": 0.202, + "step": 16595 + }, + { + "epoch": 0.29600827596047513, + "grad_norm": 0.2220582515001297, + "learning_rate": 4.437247975261984e-05, + "loss": 0.1458, + "step": 16596 + }, + { + "epoch": 0.2960261120821889, + "grad_norm": 0.2788536846637726, + "learning_rate": 4.437149587022382e-05, + "loss": 0.2483, + "step": 16597 + }, + { + "epoch": 0.29604394820390256, + "grad_norm": 0.261518657207489, + "learning_rate": 4.4370511912737186e-05, + "loss": 0.1945, + "step": 16598 + }, + { + "epoch": 0.29606178432561625, + "grad_norm": 0.24918675422668457, + "learning_rate": 4.436952788016376e-05, + "loss": 0.194, + "step": 16599 + }, + { + "epoch": 0.29607962044732994, + "grad_norm": 0.2793594300746918, + "learning_rate": 4.4368543772507355e-05, + "loss": 0.1964, + "step": 16600 + }, + { + "epoch": 0.2960974565690436, + "grad_norm": 0.2500112056732178, + "learning_rate": 4.436755958977179e-05, + "loss": 0.1993, + "step": 16601 + }, + { + "epoch": 0.2961152926907573, + "grad_norm": 0.3161695897579193, + "learning_rate": 4.436657533196087e-05, + "loss": 0.1831, + "step": 16602 + }, + { + "epoch": 0.296133128812471, + "grad_norm": 0.23257575929164886, + "learning_rate": 4.436559099907841e-05, + "loss": 0.1338, + "step": 16603 + }, + { + "epoch": 0.2961509649341847, + "grad_norm": 0.28345972299575806, + "learning_rate": 4.4364606591128236e-05, + "loss": 0.2257, + "step": 16604 + }, + { + "epoch": 0.2961688010558984, + "grad_norm": 0.28399041295051575, + "learning_rate": 4.436362210811416e-05, + "loss": 0.1781, + "step": 16605 + }, + { + "epoch": 0.2961866371776121, + "grad_norm": 0.25728896260261536, + "learning_rate": 4.436263755003999e-05, + "loss": 0.143, + "step": 16606 + }, + { + "epoch": 0.2962044732993258, + "grad_norm": 0.27097266912460327, + "learning_rate": 4.4361652916909555e-05, + "loss": 0.1712, + "step": 16607 + }, + { + "epoch": 0.2962223094210395, + "grad_norm": 0.26441726088523865, + "learning_rate": 4.436066820872666e-05, + "loss": 0.2159, + "step": 16608 + }, + { + "epoch": 0.2962401455427532, + "grad_norm": 0.25861290097236633, + "learning_rate": 4.435968342549514e-05, + "loss": 0.1559, + "step": 16609 + }, + { + "epoch": 0.2962579816644669, + "grad_norm": 0.25006774067878723, + "learning_rate": 4.4358698567218785e-05, + "loss": 0.2064, + "step": 16610 + }, + { + "epoch": 0.29627581778618056, + "grad_norm": 0.19449922442436218, + "learning_rate": 4.435771363390143e-05, + "loss": 0.1565, + "step": 16611 + }, + { + "epoch": 0.29629365390789425, + "grad_norm": 0.22116151452064514, + "learning_rate": 4.43567286255469e-05, + "loss": 0.1754, + "step": 16612 + }, + { + "epoch": 0.29631149002960794, + "grad_norm": 0.33348163962364197, + "learning_rate": 4.4355743542159e-05, + "loss": 0.1679, + "step": 16613 + }, + { + "epoch": 0.2963293261513217, + "grad_norm": 0.2816435396671295, + "learning_rate": 4.435475838374156e-05, + "loss": 0.1403, + "step": 16614 + }, + { + "epoch": 0.29634716227303537, + "grad_norm": 0.345577210187912, + "learning_rate": 4.435377315029838e-05, + "loss": 0.1647, + "step": 16615 + }, + { + "epoch": 0.29636499839474906, + "grad_norm": 0.2473231703042984, + "learning_rate": 4.43527878418333e-05, + "loss": 0.1687, + "step": 16616 + }, + { + "epoch": 0.29638283451646275, + "grad_norm": 0.28243395686149597, + "learning_rate": 4.4351802458350124e-05, + "loss": 0.2179, + "step": 16617 + }, + { + "epoch": 0.29640067063817643, + "grad_norm": 0.3431648910045624, + "learning_rate": 4.435081699985268e-05, + "loss": 0.1982, + "step": 16618 + }, + { + "epoch": 0.2964185067598901, + "grad_norm": 0.3261888325214386, + "learning_rate": 4.434983146634478e-05, + "loss": 0.1631, + "step": 16619 + }, + { + "epoch": 0.2964363428816038, + "grad_norm": 0.31556057929992676, + "learning_rate": 4.4348845857830254e-05, + "loss": 0.1522, + "step": 16620 + }, + { + "epoch": 0.2964541790033175, + "grad_norm": 0.25531819462776184, + "learning_rate": 4.434786017431293e-05, + "loss": 0.125, + "step": 16621 + }, + { + "epoch": 0.29647201512503124, + "grad_norm": 0.2754364609718323, + "learning_rate": 4.4346874415796605e-05, + "loss": 0.154, + "step": 16622 + }, + { + "epoch": 0.29648985124674493, + "grad_norm": 0.26889702677726746, + "learning_rate": 4.4345888582285114e-05, + "loss": 0.1686, + "step": 16623 + }, + { + "epoch": 0.2965076873684586, + "grad_norm": 0.19343748688697815, + "learning_rate": 4.434490267378227e-05, + "loss": 0.1785, + "step": 16624 + }, + { + "epoch": 0.2965255234901723, + "grad_norm": 0.2791062295436859, + "learning_rate": 4.434391669029192e-05, + "loss": 0.1993, + "step": 16625 + }, + { + "epoch": 0.296543359611886, + "grad_norm": 0.27494120597839355, + "learning_rate": 4.4342930631817854e-05, + "loss": 0.1642, + "step": 16626 + }, + { + "epoch": 0.2965611957335997, + "grad_norm": 0.269661545753479, + "learning_rate": 4.4341944498363907e-05, + "loss": 0.1818, + "step": 16627 + }, + { + "epoch": 0.29657903185531337, + "grad_norm": 0.2954808473587036, + "learning_rate": 4.434095828993391e-05, + "loss": 0.159, + "step": 16628 + }, + { + "epoch": 0.29659686797702706, + "grad_norm": 0.25478166341781616, + "learning_rate": 4.433997200653168e-05, + "loss": 0.1598, + "step": 16629 + }, + { + "epoch": 0.29661470409874074, + "grad_norm": 0.42966318130493164, + "learning_rate": 4.433898564816103e-05, + "loss": 0.1727, + "step": 16630 + }, + { + "epoch": 0.2966325402204545, + "grad_norm": 0.2271733582019806, + "learning_rate": 4.4337999214825796e-05, + "loss": 0.1625, + "step": 16631 + }, + { + "epoch": 0.2966503763421682, + "grad_norm": 0.32869628071784973, + "learning_rate": 4.4337012706529804e-05, + "loss": 0.2204, + "step": 16632 + }, + { + "epoch": 0.29666821246388186, + "grad_norm": 0.38243845105171204, + "learning_rate": 4.4336026123276865e-05, + "loss": 0.1501, + "step": 16633 + }, + { + "epoch": 0.29668604858559555, + "grad_norm": 0.4442615211009979, + "learning_rate": 4.433503946507081e-05, + "loss": 0.1714, + "step": 16634 + }, + { + "epoch": 0.29670388470730924, + "grad_norm": 0.25172099471092224, + "learning_rate": 4.4334052731915466e-05, + "loss": 0.1538, + "step": 16635 + }, + { + "epoch": 0.29672172082902293, + "grad_norm": 0.28673067688941956, + "learning_rate": 4.4333065923814656e-05, + "loss": 0.199, + "step": 16636 + }, + { + "epoch": 0.2967395569507366, + "grad_norm": 0.2143595665693283, + "learning_rate": 4.43320790407722e-05, + "loss": 0.1428, + "step": 16637 + }, + { + "epoch": 0.2967573930724503, + "grad_norm": 0.29313308000564575, + "learning_rate": 4.433109208279194e-05, + "loss": 0.1875, + "step": 16638 + }, + { + "epoch": 0.29677522919416405, + "grad_norm": 0.28862738609313965, + "learning_rate": 4.433010504987768e-05, + "loss": 0.1761, + "step": 16639 + }, + { + "epoch": 0.29679306531587774, + "grad_norm": 0.29017379879951477, + "learning_rate": 4.432911794203326e-05, + "loss": 0.206, + "step": 16640 + }, + { + "epoch": 0.2968109014375914, + "grad_norm": 0.18667002022266388, + "learning_rate": 4.43281307592625e-05, + "loss": 0.136, + "step": 16641 + }, + { + "epoch": 0.2968287375593051, + "grad_norm": 0.3088679611682892, + "learning_rate": 4.4327143501569234e-05, + "loss": 0.1462, + "step": 16642 + }, + { + "epoch": 0.2968465736810188, + "grad_norm": 0.2738398313522339, + "learning_rate": 4.4326156168957285e-05, + "loss": 0.1648, + "step": 16643 + }, + { + "epoch": 0.2968644098027325, + "grad_norm": 0.394050657749176, + "learning_rate": 4.4325168761430476e-05, + "loss": 0.2519, + "step": 16644 + }, + { + "epoch": 0.2968822459244462, + "grad_norm": 0.26776477694511414, + "learning_rate": 4.4324181278992635e-05, + "loss": 0.1636, + "step": 16645 + }, + { + "epoch": 0.29690008204615986, + "grad_norm": 0.22168299555778503, + "learning_rate": 4.43231937216476e-05, + "loss": 0.1896, + "step": 16646 + }, + { + "epoch": 0.29691791816787355, + "grad_norm": 0.34589311480522156, + "learning_rate": 4.432220608939919e-05, + "loss": 0.2142, + "step": 16647 + }, + { + "epoch": 0.2969357542895873, + "grad_norm": 0.2140132635831833, + "learning_rate": 4.432121838225123e-05, + "loss": 0.1885, + "step": 16648 + }, + { + "epoch": 0.296953590411301, + "grad_norm": 0.27988100051879883, + "learning_rate": 4.4320230600207565e-05, + "loss": 0.1425, + "step": 16649 + }, + { + "epoch": 0.29697142653301467, + "grad_norm": 0.29920223355293274, + "learning_rate": 4.4319242743272e-05, + "loss": 0.2004, + "step": 16650 + }, + { + "epoch": 0.29698926265472836, + "grad_norm": 0.33628174662590027, + "learning_rate": 4.431825481144839e-05, + "loss": 0.1209, + "step": 16651 + }, + { + "epoch": 0.29700709877644205, + "grad_norm": 0.30754542350769043, + "learning_rate": 4.431726680474054e-05, + "loss": 0.1978, + "step": 16652 + }, + { + "epoch": 0.29702493489815573, + "grad_norm": 0.2443932145833969, + "learning_rate": 4.4316278723152306e-05, + "loss": 0.1013, + "step": 16653 + }, + { + "epoch": 0.2970427710198694, + "grad_norm": 0.21593210101127625, + "learning_rate": 4.4315290566687497e-05, + "loss": 0.1627, + "step": 16654 + }, + { + "epoch": 0.2970606071415831, + "grad_norm": 0.287570983171463, + "learning_rate": 4.431430233534995e-05, + "loss": 0.1532, + "step": 16655 + }, + { + "epoch": 0.29707844326329685, + "grad_norm": 0.24677424132823944, + "learning_rate": 4.4313314029143496e-05, + "loss": 0.1573, + "step": 16656 + }, + { + "epoch": 0.29709627938501054, + "grad_norm": 0.21025590598583221, + "learning_rate": 4.431232564807197e-05, + "loss": 0.1646, + "step": 16657 + }, + { + "epoch": 0.29711411550672423, + "grad_norm": 0.23474650084972382, + "learning_rate": 4.431133719213919e-05, + "loss": 0.1729, + "step": 16658 + }, + { + "epoch": 0.2971319516284379, + "grad_norm": 0.3554365038871765, + "learning_rate": 4.4310348661349007e-05, + "loss": 0.1624, + "step": 16659 + }, + { + "epoch": 0.2971497877501516, + "grad_norm": 0.3594236671924591, + "learning_rate": 4.430936005570524e-05, + "loss": 0.2111, + "step": 16660 + }, + { + "epoch": 0.2971676238718653, + "grad_norm": 0.3476164937019348, + "learning_rate": 4.430837137521172e-05, + "loss": 0.1833, + "step": 16661 + }, + { + "epoch": 0.297185459993579, + "grad_norm": 0.31929105520248413, + "learning_rate": 4.430738261987229e-05, + "loss": 0.2164, + "step": 16662 + }, + { + "epoch": 0.29720329611529267, + "grad_norm": 0.1930297166109085, + "learning_rate": 4.430639378969077e-05, + "loss": 0.1435, + "step": 16663 + }, + { + "epoch": 0.29722113223700636, + "grad_norm": 0.20953087508678436, + "learning_rate": 4.430540488467101e-05, + "loss": 0.1386, + "step": 16664 + }, + { + "epoch": 0.2972389683587201, + "grad_norm": 0.2832101285457611, + "learning_rate": 4.430441590481682e-05, + "loss": 0.1871, + "step": 16665 + }, + { + "epoch": 0.2972568044804338, + "grad_norm": 0.3205028474330902, + "learning_rate": 4.4303426850132056e-05, + "loss": 0.189, + "step": 16666 + }, + { + "epoch": 0.2972746406021475, + "grad_norm": 0.27176329493522644, + "learning_rate": 4.4302437720620536e-05, + "loss": 0.1738, + "step": 16667 + }, + { + "epoch": 0.29729247672386117, + "grad_norm": 0.3001042604446411, + "learning_rate": 4.4301448516286104e-05, + "loss": 0.2204, + "step": 16668 + }, + { + "epoch": 0.29731031284557485, + "grad_norm": 0.24722713232040405, + "learning_rate": 4.4300459237132594e-05, + "loss": 0.181, + "step": 16669 + }, + { + "epoch": 0.29732814896728854, + "grad_norm": 0.29141971468925476, + "learning_rate": 4.429946988316383e-05, + "loss": 0.1428, + "step": 16670 + }, + { + "epoch": 0.29734598508900223, + "grad_norm": 0.23951655626296997, + "learning_rate": 4.4298480454383664e-05, + "loss": 0.1623, + "step": 16671 + }, + { + "epoch": 0.2973638212107159, + "grad_norm": 0.2430630326271057, + "learning_rate": 4.429749095079591e-05, + "loss": 0.1554, + "step": 16672 + }, + { + "epoch": 0.29738165733242966, + "grad_norm": 0.2622222900390625, + "learning_rate": 4.4296501372404427e-05, + "loss": 0.164, + "step": 16673 + }, + { + "epoch": 0.29739949345414335, + "grad_norm": 0.23790578544139862, + "learning_rate": 4.429551171921303e-05, + "loss": 0.1582, + "step": 16674 + }, + { + "epoch": 0.29741732957585704, + "grad_norm": 0.3748508393764496, + "learning_rate": 4.429452199122557e-05, + "loss": 0.1759, + "step": 16675 + }, + { + "epoch": 0.2974351656975707, + "grad_norm": 0.2436283528804779, + "learning_rate": 4.4293532188445884e-05, + "loss": 0.2123, + "step": 16676 + }, + { + "epoch": 0.2974530018192844, + "grad_norm": 0.28571587800979614, + "learning_rate": 4.42925423108778e-05, + "loss": 0.1597, + "step": 16677 + }, + { + "epoch": 0.2974708379409981, + "grad_norm": 0.25470608472824097, + "learning_rate": 4.429155235852516e-05, + "loss": 0.1993, + "step": 16678 + }, + { + "epoch": 0.2974886740627118, + "grad_norm": 0.21140186488628387, + "learning_rate": 4.4290562331391797e-05, + "loss": 0.1526, + "step": 16679 + }, + { + "epoch": 0.2975065101844255, + "grad_norm": 0.2538129687309265, + "learning_rate": 4.4289572229481555e-05, + "loss": 0.1846, + "step": 16680 + }, + { + "epoch": 0.2975243463061392, + "grad_norm": 0.28790101408958435, + "learning_rate": 4.428858205279826e-05, + "loss": 0.1863, + "step": 16681 + }, + { + "epoch": 0.2975421824278529, + "grad_norm": 0.2624432444572449, + "learning_rate": 4.428759180134577e-05, + "loss": 0.1708, + "step": 16682 + }, + { + "epoch": 0.2975600185495666, + "grad_norm": 0.18219736218452454, + "learning_rate": 4.428660147512791e-05, + "loss": 0.1936, + "step": 16683 + }, + { + "epoch": 0.2975778546712803, + "grad_norm": 0.28530970215797424, + "learning_rate": 4.428561107414852e-05, + "loss": 0.152, + "step": 16684 + }, + { + "epoch": 0.29759569079299397, + "grad_norm": 0.2296566367149353, + "learning_rate": 4.428462059841143e-05, + "loss": 0.1694, + "step": 16685 + }, + { + "epoch": 0.29761352691470766, + "grad_norm": 0.2159237563610077, + "learning_rate": 4.42836300479205e-05, + "loss": 0.186, + "step": 16686 + }, + { + "epoch": 0.29763136303642135, + "grad_norm": 0.2759743630886078, + "learning_rate": 4.428263942267956e-05, + "loss": 0.1812, + "step": 16687 + }, + { + "epoch": 0.29764919915813504, + "grad_norm": 0.24413438141345978, + "learning_rate": 4.4281648722692445e-05, + "loss": 0.18, + "step": 16688 + }, + { + "epoch": 0.2976670352798487, + "grad_norm": 0.2134544998407364, + "learning_rate": 4.428065794796301e-05, + "loss": 0.1829, + "step": 16689 + }, + { + "epoch": 0.29768487140156247, + "grad_norm": 0.2694886326789856, + "learning_rate": 4.427966709849508e-05, + "loss": 0.2157, + "step": 16690 + }, + { + "epoch": 0.29770270752327616, + "grad_norm": 0.23693469166755676, + "learning_rate": 4.42786761742925e-05, + "loss": 0.1481, + "step": 16691 + }, + { + "epoch": 0.29772054364498984, + "grad_norm": 0.2122371643781662, + "learning_rate": 4.427768517535911e-05, + "loss": 0.1281, + "step": 16692 + }, + { + "epoch": 0.29773837976670353, + "grad_norm": 0.2773756682872772, + "learning_rate": 4.427669410169876e-05, + "loss": 0.1849, + "step": 16693 + }, + { + "epoch": 0.2977562158884172, + "grad_norm": 0.2172859162092209, + "learning_rate": 4.427570295331528e-05, + "loss": 0.1818, + "step": 16694 + }, + { + "epoch": 0.2977740520101309, + "grad_norm": 0.2926592230796814, + "learning_rate": 4.4274711730212516e-05, + "loss": 0.1552, + "step": 16695 + }, + { + "epoch": 0.2977918881318446, + "grad_norm": 0.22938333451747894, + "learning_rate": 4.427372043239432e-05, + "loss": 0.1658, + "step": 16696 + }, + { + "epoch": 0.2978097242535583, + "grad_norm": 0.22436851263046265, + "learning_rate": 4.427272905986452e-05, + "loss": 0.1596, + "step": 16697 + }, + { + "epoch": 0.297827560375272, + "grad_norm": 0.28358304500579834, + "learning_rate": 4.427173761262697e-05, + "loss": 0.1611, + "step": 16698 + }, + { + "epoch": 0.2978453964969857, + "grad_norm": 0.2514285445213318, + "learning_rate": 4.42707460906855e-05, + "loss": 0.1726, + "step": 16699 + }, + { + "epoch": 0.2978632326186994, + "grad_norm": 0.7242552638053894, + "learning_rate": 4.426975449404397e-05, + "loss": 0.1726, + "step": 16700 + }, + { + "epoch": 0.2978810687404131, + "grad_norm": 0.28280892968177795, + "learning_rate": 4.4268762822706223e-05, + "loss": 0.1546, + "step": 16701 + }, + { + "epoch": 0.2978989048621268, + "grad_norm": 0.2746674716472626, + "learning_rate": 4.426777107667608e-05, + "loss": 0.1929, + "step": 16702 + }, + { + "epoch": 0.29791674098384047, + "grad_norm": 0.24106605350971222, + "learning_rate": 4.4266779255957416e-05, + "loss": 0.2159, + "step": 16703 + }, + { + "epoch": 0.29793457710555415, + "grad_norm": 0.26024553179740906, + "learning_rate": 4.426578736055405e-05, + "loss": 0.1772, + "step": 16704 + }, + { + "epoch": 0.29795241322726784, + "grad_norm": 0.31141164898872375, + "learning_rate": 4.4264795390469845e-05, + "loss": 0.1689, + "step": 16705 + }, + { + "epoch": 0.29797024934898153, + "grad_norm": 0.4564592242240906, + "learning_rate": 4.426380334570864e-05, + "loss": 0.1468, + "step": 16706 + }, + { + "epoch": 0.2979880854706953, + "grad_norm": 0.2418922781944275, + "learning_rate": 4.426281122627427e-05, + "loss": 0.1907, + "step": 16707 + }, + { + "epoch": 0.29800592159240896, + "grad_norm": 0.24018177390098572, + "learning_rate": 4.4261819032170605e-05, + "loss": 0.2136, + "step": 16708 + }, + { + "epoch": 0.29802375771412265, + "grad_norm": 1.0854966640472412, + "learning_rate": 4.426082676340147e-05, + "loss": 0.1976, + "step": 16709 + }, + { + "epoch": 0.29804159383583634, + "grad_norm": 0.21242307126522064, + "learning_rate": 4.425983441997071e-05, + "loss": 0.1648, + "step": 16710 + }, + { + "epoch": 0.29805942995755, + "grad_norm": 0.28974631428718567, + "learning_rate": 4.425884200188219e-05, + "loss": 0.2355, + "step": 16711 + }, + { + "epoch": 0.2980772660792637, + "grad_norm": 0.31530699133872986, + "learning_rate": 4.4257849509139743e-05, + "loss": 0.1647, + "step": 16712 + }, + { + "epoch": 0.2980951022009774, + "grad_norm": 0.27969062328338623, + "learning_rate": 4.4256856941747215e-05, + "loss": 0.2199, + "step": 16713 + }, + { + "epoch": 0.2981129383226911, + "grad_norm": 0.5517001748085022, + "learning_rate": 4.4255864299708465e-05, + "loss": 0.174, + "step": 16714 + }, + { + "epoch": 0.29813077444440483, + "grad_norm": 0.2415282428264618, + "learning_rate": 4.4254871583027336e-05, + "loss": 0.1789, + "step": 16715 + }, + { + "epoch": 0.2981486105661185, + "grad_norm": 0.34333327412605286, + "learning_rate": 4.425387879170767e-05, + "loss": 0.2178, + "step": 16716 + }, + { + "epoch": 0.2981664466878322, + "grad_norm": 0.2230244278907776, + "learning_rate": 4.425288592575332e-05, + "loss": 0.1383, + "step": 16717 + }, + { + "epoch": 0.2981842828095459, + "grad_norm": 0.2376767247915268, + "learning_rate": 4.425189298516813e-05, + "loss": 0.1889, + "step": 16718 + }, + { + "epoch": 0.2982021189312596, + "grad_norm": 0.24361565709114075, + "learning_rate": 4.425089996995596e-05, + "loss": 0.1784, + "step": 16719 + }, + { + "epoch": 0.2982199550529733, + "grad_norm": 0.27075791358947754, + "learning_rate": 4.424990688012066e-05, + "loss": 0.1605, + "step": 16720 + }, + { + "epoch": 0.29823779117468696, + "grad_norm": 0.21255682408809662, + "learning_rate": 4.424891371566606e-05, + "loss": 0.184, + "step": 16721 + }, + { + "epoch": 0.29825562729640065, + "grad_norm": 0.2611226439476013, + "learning_rate": 4.4247920476596025e-05, + "loss": 0.1443, + "step": 16722 + }, + { + "epoch": 0.2982734634181144, + "grad_norm": 0.25044888257980347, + "learning_rate": 4.4246927162914406e-05, + "loss": 0.177, + "step": 16723 + }, + { + "epoch": 0.2982912995398281, + "grad_norm": 0.26633119583129883, + "learning_rate": 4.424593377462504e-05, + "loss": 0.1774, + "step": 16724 + }, + { + "epoch": 0.29830913566154177, + "grad_norm": 0.39802661538124084, + "learning_rate": 4.42449403117318e-05, + "loss": 0.2054, + "step": 16725 + }, + { + "epoch": 0.29832697178325546, + "grad_norm": 0.2094593495130539, + "learning_rate": 4.424394677423851e-05, + "loss": 0.1875, + "step": 16726 + }, + { + "epoch": 0.29834480790496914, + "grad_norm": 0.23581956326961517, + "learning_rate": 4.424295316214905e-05, + "loss": 0.1857, + "step": 16727 + }, + { + "epoch": 0.29836264402668283, + "grad_norm": 0.24323752522468567, + "learning_rate": 4.424195947546725e-05, + "loss": 0.1358, + "step": 16728 + }, + { + "epoch": 0.2983804801483965, + "grad_norm": 0.2332221269607544, + "learning_rate": 4.424096571419697e-05, + "loss": 0.1281, + "step": 16729 + }, + { + "epoch": 0.2983983162701102, + "grad_norm": 0.24572543799877167, + "learning_rate": 4.4239971878342054e-05, + "loss": 0.1474, + "step": 16730 + }, + { + "epoch": 0.2984161523918239, + "grad_norm": 0.34758806228637695, + "learning_rate": 4.423897796790637e-05, + "loss": 0.2457, + "step": 16731 + }, + { + "epoch": 0.29843398851353764, + "grad_norm": 0.255517840385437, + "learning_rate": 4.4237983982893765e-05, + "loss": 0.1393, + "step": 16732 + }, + { + "epoch": 0.29845182463525133, + "grad_norm": 0.251751571893692, + "learning_rate": 4.423698992330809e-05, + "loss": 0.1901, + "step": 16733 + }, + { + "epoch": 0.298469660756965, + "grad_norm": 0.2226800173521042, + "learning_rate": 4.4235995789153195e-05, + "loss": 0.1723, + "step": 16734 + }, + { + "epoch": 0.2984874968786787, + "grad_norm": 0.2699423134326935, + "learning_rate": 4.4235001580432934e-05, + "loss": 0.1955, + "step": 16735 + }, + { + "epoch": 0.2985053330003924, + "grad_norm": 0.3194892704486847, + "learning_rate": 4.423400729715116e-05, + "loss": 0.1612, + "step": 16736 + }, + { + "epoch": 0.2985231691221061, + "grad_norm": 0.22573094069957733, + "learning_rate": 4.423301293931173e-05, + "loss": 0.1321, + "step": 16737 + }, + { + "epoch": 0.29854100524381977, + "grad_norm": 0.2501831352710724, + "learning_rate": 4.423201850691851e-05, + "loss": 0.1387, + "step": 16738 + }, + { + "epoch": 0.29855884136553346, + "grad_norm": 0.2960270643234253, + "learning_rate": 4.423102399997534e-05, + "loss": 0.1522, + "step": 16739 + }, + { + "epoch": 0.2985766774872472, + "grad_norm": 0.29597237706184387, + "learning_rate": 4.4230029418486075e-05, + "loss": 0.1981, + "step": 16740 + }, + { + "epoch": 0.2985945136089609, + "grad_norm": 0.2796803116798401, + "learning_rate": 4.422903476245457e-05, + "loss": 0.1678, + "step": 16741 + }, + { + "epoch": 0.2986123497306746, + "grad_norm": 0.20084957778453827, + "learning_rate": 4.42280400318847e-05, + "loss": 0.1496, + "step": 16742 + }, + { + "epoch": 0.29863018585238826, + "grad_norm": 0.31416070461273193, + "learning_rate": 4.42270452267803e-05, + "loss": 0.1259, + "step": 16743 + }, + { + "epoch": 0.29864802197410195, + "grad_norm": 0.2169618010520935, + "learning_rate": 4.422605034714522e-05, + "loss": 0.1553, + "step": 16744 + }, + { + "epoch": 0.29866585809581564, + "grad_norm": 0.23236408829689026, + "learning_rate": 4.4225055392983336e-05, + "loss": 0.1581, + "step": 16745 + }, + { + "epoch": 0.2986836942175293, + "grad_norm": 0.2908601760864258, + "learning_rate": 4.4224060364298496e-05, + "loss": 0.141, + "step": 16746 + }, + { + "epoch": 0.298701530339243, + "grad_norm": 0.28868016600608826, + "learning_rate": 4.422306526109456e-05, + "loss": 0.1788, + "step": 16747 + }, + { + "epoch": 0.2987193664609567, + "grad_norm": 0.334247350692749, + "learning_rate": 4.422207008337539e-05, + "loss": 0.1952, + "step": 16748 + }, + { + "epoch": 0.29873720258267045, + "grad_norm": 0.32186469435691833, + "learning_rate": 4.422107483114482e-05, + "loss": 0.2393, + "step": 16749 + }, + { + "epoch": 0.29875503870438413, + "grad_norm": 0.27745431661605835, + "learning_rate": 4.422007950440674e-05, + "loss": 0.16, + "step": 16750 + }, + { + "epoch": 0.2987728748260978, + "grad_norm": 0.30679234862327576, + "learning_rate": 4.4219084103164996e-05, + "loss": 0.2074, + "step": 16751 + }, + { + "epoch": 0.2987907109478115, + "grad_norm": 0.25089892745018005, + "learning_rate": 4.4218088627423437e-05, + "loss": 0.2453, + "step": 16752 + }, + { + "epoch": 0.2988085470695252, + "grad_norm": 0.3112054169178009, + "learning_rate": 4.421709307718592e-05, + "loss": 0.2154, + "step": 16753 + }, + { + "epoch": 0.2988263831912389, + "grad_norm": 0.25213146209716797, + "learning_rate": 4.421609745245633e-05, + "loss": 0.1803, + "step": 16754 + }, + { + "epoch": 0.2988442193129526, + "grad_norm": 0.2951754033565521, + "learning_rate": 4.4215101753238494e-05, + "loss": 0.156, + "step": 16755 + }, + { + "epoch": 0.29886205543466626, + "grad_norm": 0.24440592527389526, + "learning_rate": 4.4214105979536305e-05, + "loss": 0.192, + "step": 16756 + }, + { + "epoch": 0.29887989155638, + "grad_norm": 0.25787267088890076, + "learning_rate": 4.4213110131353586e-05, + "loss": 0.1851, + "step": 16757 + }, + { + "epoch": 0.2988977276780937, + "grad_norm": 0.2564343512058258, + "learning_rate": 4.421211420869423e-05, + "loss": 0.1735, + "step": 16758 + }, + { + "epoch": 0.2989155637998074, + "grad_norm": 0.4714498519897461, + "learning_rate": 4.4211118211562074e-05, + "loss": 0.1955, + "step": 16759 + }, + { + "epoch": 0.29893339992152107, + "grad_norm": 0.24753375351428986, + "learning_rate": 4.421012213996099e-05, + "loss": 0.1915, + "step": 16760 + }, + { + "epoch": 0.29895123604323476, + "grad_norm": 0.4700700044631958, + "learning_rate": 4.4209125993894845e-05, + "loss": 0.2206, + "step": 16761 + }, + { + "epoch": 0.29896907216494845, + "grad_norm": 0.2219686061143875, + "learning_rate": 4.420812977336748e-05, + "loss": 0.1901, + "step": 16762 + }, + { + "epoch": 0.29898690828666213, + "grad_norm": 0.3039661943912506, + "learning_rate": 4.4207133478382785e-05, + "loss": 0.1666, + "step": 16763 + }, + { + "epoch": 0.2990047444083758, + "grad_norm": 0.24209414422512054, + "learning_rate": 4.42061371089446e-05, + "loss": 0.1197, + "step": 16764 + }, + { + "epoch": 0.2990225805300895, + "grad_norm": 0.33031749725341797, + "learning_rate": 4.4205140665056786e-05, + "loss": 0.145, + "step": 16765 + }, + { + "epoch": 0.29904041665180325, + "grad_norm": 0.18115060031414032, + "learning_rate": 4.420414414672322e-05, + "loss": 0.1396, + "step": 16766 + }, + { + "epoch": 0.29905825277351694, + "grad_norm": 0.21951572597026825, + "learning_rate": 4.420314755394776e-05, + "loss": 0.1743, + "step": 16767 + }, + { + "epoch": 0.29907608889523063, + "grad_norm": 0.33737912774086, + "learning_rate": 4.4202150886734274e-05, + "loss": 0.1397, + "step": 16768 + }, + { + "epoch": 0.2990939250169443, + "grad_norm": 0.23330223560333252, + "learning_rate": 4.420115414508661e-05, + "loss": 0.1646, + "step": 16769 + }, + { + "epoch": 0.299111761138658, + "grad_norm": 0.31918492913246155, + "learning_rate": 4.420015732900864e-05, + "loss": 0.1593, + "step": 16770 + }, + { + "epoch": 0.2991295972603717, + "grad_norm": 0.22612160444259644, + "learning_rate": 4.419916043850423e-05, + "loss": 0.1884, + "step": 16771 + }, + { + "epoch": 0.2991474333820854, + "grad_norm": 0.23343156278133392, + "learning_rate": 4.419816347357725e-05, + "loss": 0.1685, + "step": 16772 + }, + { + "epoch": 0.29916526950379907, + "grad_norm": 0.2565903067588806, + "learning_rate": 4.4197166434231554e-05, + "loss": 0.1493, + "step": 16773 + }, + { + "epoch": 0.2991831056255128, + "grad_norm": 0.2400580793619156, + "learning_rate": 4.4196169320471e-05, + "loss": 0.1677, + "step": 16774 + }, + { + "epoch": 0.2992009417472265, + "grad_norm": 0.32715415954589844, + "learning_rate": 4.4195172132299475e-05, + "loss": 0.163, + "step": 16775 + }, + { + "epoch": 0.2992187778689402, + "grad_norm": 0.35406193137168884, + "learning_rate": 4.4194174869720826e-05, + "loss": 0.1695, + "step": 16776 + }, + { + "epoch": 0.2992366139906539, + "grad_norm": 0.1691502332687378, + "learning_rate": 4.419317753273893e-05, + "loss": 0.1366, + "step": 16777 + }, + { + "epoch": 0.29925445011236756, + "grad_norm": 0.3945567011833191, + "learning_rate": 4.419218012135765e-05, + "loss": 0.1989, + "step": 16778 + }, + { + "epoch": 0.29927228623408125, + "grad_norm": 0.2872392535209656, + "learning_rate": 4.419118263558085e-05, + "loss": 0.1513, + "step": 16779 + }, + { + "epoch": 0.29929012235579494, + "grad_norm": 0.18428415060043335, + "learning_rate": 4.41901850754124e-05, + "loss": 0.1533, + "step": 16780 + }, + { + "epoch": 0.29930795847750863, + "grad_norm": 0.24294564127922058, + "learning_rate": 4.4189187440856165e-05, + "loss": 0.1587, + "step": 16781 + }, + { + "epoch": 0.2993257945992224, + "grad_norm": 0.25132066011428833, + "learning_rate": 4.418818973191601e-05, + "loss": 0.202, + "step": 16782 + }, + { + "epoch": 0.29934363072093606, + "grad_norm": 0.2090282142162323, + "learning_rate": 4.4187191948595794e-05, + "loss": 0.1752, + "step": 16783 + }, + { + "epoch": 0.29936146684264975, + "grad_norm": 0.3071610629558563, + "learning_rate": 4.41861940908994e-05, + "loss": 0.2051, + "step": 16784 + }, + { + "epoch": 0.29937930296436344, + "grad_norm": 0.2720433473587036, + "learning_rate": 4.41851961588307e-05, + "loss": 0.1328, + "step": 16785 + }, + { + "epoch": 0.2993971390860771, + "grad_norm": 0.3585069477558136, + "learning_rate": 4.4184198152393544e-05, + "loss": 0.1685, + "step": 16786 + }, + { + "epoch": 0.2994149752077908, + "grad_norm": 0.35692310333251953, + "learning_rate": 4.4183200071591815e-05, + "loss": 0.1749, + "step": 16787 + }, + { + "epoch": 0.2994328113295045, + "grad_norm": 0.2433602660894394, + "learning_rate": 4.4182201916429375e-05, + "loss": 0.1872, + "step": 16788 + }, + { + "epoch": 0.2994506474512182, + "grad_norm": 0.34218353033065796, + "learning_rate": 4.41812036869101e-05, + "loss": 0.1715, + "step": 16789 + }, + { + "epoch": 0.2994684835729319, + "grad_norm": 0.30541422963142395, + "learning_rate": 4.418020538303785e-05, + "loss": 0.2172, + "step": 16790 + }, + { + "epoch": 0.2994863196946456, + "grad_norm": 0.32555091381073, + "learning_rate": 4.41792070048165e-05, + "loss": 0.1927, + "step": 16791 + }, + { + "epoch": 0.2995041558163593, + "grad_norm": 0.3671061098575592, + "learning_rate": 4.4178208552249915e-05, + "loss": 0.2207, + "step": 16792 + }, + { + "epoch": 0.299521991938073, + "grad_norm": 0.21439440548419952, + "learning_rate": 4.4177210025341974e-05, + "loss": 0.1656, + "step": 16793 + }, + { + "epoch": 0.2995398280597867, + "grad_norm": 0.3094068765640259, + "learning_rate": 4.4176211424096545e-05, + "loss": 0.1635, + "step": 16794 + }, + { + "epoch": 0.29955766418150037, + "grad_norm": 0.4374065101146698, + "learning_rate": 4.417521274851749e-05, + "loss": 0.2412, + "step": 16795 + }, + { + "epoch": 0.29957550030321406, + "grad_norm": 0.1831519901752472, + "learning_rate": 4.41742139986087e-05, + "loss": 0.1686, + "step": 16796 + }, + { + "epoch": 0.29959333642492775, + "grad_norm": 0.319697767496109, + "learning_rate": 4.4173215174374025e-05, + "loss": 0.1702, + "step": 16797 + }, + { + "epoch": 0.29961117254664144, + "grad_norm": 0.2710077166557312, + "learning_rate": 4.417221627581735e-05, + "loss": 0.184, + "step": 16798 + }, + { + "epoch": 0.2996290086683552, + "grad_norm": 0.2965708374977112, + "learning_rate": 4.4171217302942534e-05, + "loss": 0.181, + "step": 16799 + }, + { + "epoch": 0.29964684479006887, + "grad_norm": 0.2833975851535797, + "learning_rate": 4.417021825575347e-05, + "loss": 0.1589, + "step": 16800 + }, + { + "epoch": 0.29966468091178255, + "grad_norm": 0.2365575134754181, + "learning_rate": 4.416921913425401e-05, + "loss": 0.1444, + "step": 16801 + }, + { + "epoch": 0.29968251703349624, + "grad_norm": 0.29016876220703125, + "learning_rate": 4.416821993844804e-05, + "loss": 0.2005, + "step": 16802 + }, + { + "epoch": 0.29970035315520993, + "grad_norm": 0.295404851436615, + "learning_rate": 4.416722066833943e-05, + "loss": 0.1074, + "step": 16803 + }, + { + "epoch": 0.2997181892769236, + "grad_norm": 0.24563342332839966, + "learning_rate": 4.4166221323932045e-05, + "loss": 0.1864, + "step": 16804 + }, + { + "epoch": 0.2997360253986373, + "grad_norm": 0.1913757175207138, + "learning_rate": 4.4165221905229775e-05, + "loss": 0.1764, + "step": 16805 + }, + { + "epoch": 0.299753861520351, + "grad_norm": 0.30059218406677246, + "learning_rate": 4.416422241223648e-05, + "loss": 0.1176, + "step": 16806 + }, + { + "epoch": 0.2997716976420647, + "grad_norm": 0.23344455659389496, + "learning_rate": 4.416322284495604e-05, + "loss": 0.177, + "step": 16807 + }, + { + "epoch": 0.2997895337637784, + "grad_norm": 0.31662899255752563, + "learning_rate": 4.416222320339234e-05, + "loss": 0.1532, + "step": 16808 + }, + { + "epoch": 0.2998073698854921, + "grad_norm": 0.2756274342536926, + "learning_rate": 4.416122348754923e-05, + "loss": 0.1861, + "step": 16809 + }, + { + "epoch": 0.2998252060072058, + "grad_norm": 0.25212037563323975, + "learning_rate": 4.416022369743061e-05, + "loss": 0.1799, + "step": 16810 + }, + { + "epoch": 0.2998430421289195, + "grad_norm": 0.21718133985996246, + "learning_rate": 4.415922383304034e-05, + "loss": 0.1476, + "step": 16811 + }, + { + "epoch": 0.2998608782506332, + "grad_norm": 0.3229748606681824, + "learning_rate": 4.41582238943823e-05, + "loss": 0.1899, + "step": 16812 + }, + { + "epoch": 0.29987871437234687, + "grad_norm": 0.26731520891189575, + "learning_rate": 4.415722388146037e-05, + "loss": 0.1704, + "step": 16813 + }, + { + "epoch": 0.29989655049406055, + "grad_norm": 0.27418631315231323, + "learning_rate": 4.4156223794278426e-05, + "loss": 0.1823, + "step": 16814 + }, + { + "epoch": 0.29991438661577424, + "grad_norm": 0.36438408493995667, + "learning_rate": 4.4155223632840334e-05, + "loss": 0.1565, + "step": 16815 + }, + { + "epoch": 0.299932222737488, + "grad_norm": 0.40100619196891785, + "learning_rate": 4.415422339714999e-05, + "loss": 0.2397, + "step": 16816 + }, + { + "epoch": 0.2999500588592017, + "grad_norm": 0.26747944951057434, + "learning_rate": 4.4153223087211257e-05, + "loss": 0.1773, + "step": 16817 + }, + { + "epoch": 0.29996789498091536, + "grad_norm": 0.21279700100421906, + "learning_rate": 4.415222270302801e-05, + "loss": 0.1777, + "step": 16818 + }, + { + "epoch": 0.29998573110262905, + "grad_norm": 0.2474087029695511, + "learning_rate": 4.415122224460414e-05, + "loss": 0.1784, + "step": 16819 + }, + { + "epoch": 0.30000356722434274, + "grad_norm": 0.33131587505340576, + "learning_rate": 4.415022171194351e-05, + "loss": 0.1669, + "step": 16820 + }, + { + "epoch": 0.3000214033460564, + "grad_norm": 0.3297050893306732, + "learning_rate": 4.414922110505001e-05, + "loss": 0.2364, + "step": 16821 + }, + { + "epoch": 0.3000392394677701, + "grad_norm": 0.24292252957820892, + "learning_rate": 4.414822042392752e-05, + "loss": 0.1897, + "step": 16822 + }, + { + "epoch": 0.3000570755894838, + "grad_norm": 0.22840245068073273, + "learning_rate": 4.414721966857991e-05, + "loss": 0.176, + "step": 16823 + }, + { + "epoch": 0.3000749117111975, + "grad_norm": 0.33364999294281006, + "learning_rate": 4.414621883901106e-05, + "loss": 0.163, + "step": 16824 + }, + { + "epoch": 0.30009274783291123, + "grad_norm": 0.2814142405986786, + "learning_rate": 4.414521793522486e-05, + "loss": 0.2098, + "step": 16825 + }, + { + "epoch": 0.3001105839546249, + "grad_norm": 0.293827623128891, + "learning_rate": 4.4144216957225185e-05, + "loss": 0.223, + "step": 16826 + }, + { + "epoch": 0.3001284200763386, + "grad_norm": 0.22876468300819397, + "learning_rate": 4.41432159050159e-05, + "loss": 0.2196, + "step": 16827 + }, + { + "epoch": 0.3001462561980523, + "grad_norm": 0.21677803993225098, + "learning_rate": 4.41422147786009e-05, + "loss": 0.1742, + "step": 16828 + }, + { + "epoch": 0.300164092319766, + "grad_norm": 0.3324127495288849, + "learning_rate": 4.414121357798408e-05, + "loss": 0.1657, + "step": 16829 + }, + { + "epoch": 0.3001819284414797, + "grad_norm": 0.2706266939640045, + "learning_rate": 4.4140212303169295e-05, + "loss": 0.1961, + "step": 16830 + }, + { + "epoch": 0.30019976456319336, + "grad_norm": 0.2568056583404541, + "learning_rate": 4.413921095416042e-05, + "loss": 0.1777, + "step": 16831 + }, + { + "epoch": 0.30021760068490705, + "grad_norm": 0.27072325348854065, + "learning_rate": 4.413820953096138e-05, + "loss": 0.1561, + "step": 16832 + }, + { + "epoch": 0.3002354368066208, + "grad_norm": 0.24612703919410706, + "learning_rate": 4.413720803357602e-05, + "loss": 0.1712, + "step": 16833 + }, + { + "epoch": 0.3002532729283345, + "grad_norm": 0.35841497778892517, + "learning_rate": 4.413620646200822e-05, + "loss": 0.1531, + "step": 16834 + }, + { + "epoch": 0.30027110905004817, + "grad_norm": 0.34456342458724976, + "learning_rate": 4.413520481626189e-05, + "loss": 0.1665, + "step": 16835 + }, + { + "epoch": 0.30028894517176186, + "grad_norm": 0.24517013132572174, + "learning_rate": 4.413420309634089e-05, + "loss": 0.2115, + "step": 16836 + }, + { + "epoch": 0.30030678129347554, + "grad_norm": 0.24718232452869415, + "learning_rate": 4.4133201302249113e-05, + "loss": 0.2094, + "step": 16837 + }, + { + "epoch": 0.30032461741518923, + "grad_norm": 0.2879747748374939, + "learning_rate": 4.413219943399044e-05, + "loss": 0.1718, + "step": 16838 + }, + { + "epoch": 0.3003424535369029, + "grad_norm": 0.4600638747215271, + "learning_rate": 4.413119749156875e-05, + "loss": 0.2177, + "step": 16839 + }, + { + "epoch": 0.3003602896586166, + "grad_norm": 0.29009267687797546, + "learning_rate": 4.4130195474987934e-05, + "loss": 0.1403, + "step": 16840 + }, + { + "epoch": 0.30037812578033035, + "grad_norm": 0.2331254929304123, + "learning_rate": 4.4129193384251874e-05, + "loss": 0.1742, + "step": 16841 + }, + { + "epoch": 0.30039596190204404, + "grad_norm": 0.33358296751976013, + "learning_rate": 4.412819121936445e-05, + "loss": 0.1584, + "step": 16842 + }, + { + "epoch": 0.3004137980237577, + "grad_norm": 0.3982353210449219, + "learning_rate": 4.412718898032955e-05, + "loss": 0.1226, + "step": 16843 + }, + { + "epoch": 0.3004316341454714, + "grad_norm": 0.2488405704498291, + "learning_rate": 4.412618666715106e-05, + "loss": 0.1862, + "step": 16844 + }, + { + "epoch": 0.3004494702671851, + "grad_norm": 0.288212388753891, + "learning_rate": 4.4125184279832864e-05, + "loss": 0.2595, + "step": 16845 + }, + { + "epoch": 0.3004673063888988, + "grad_norm": 0.2693442404270172, + "learning_rate": 4.412418181837885e-05, + "loss": 0.2192, + "step": 16846 + }, + { + "epoch": 0.3004851425106125, + "grad_norm": 0.24485181272029877, + "learning_rate": 4.41231792827929e-05, + "loss": 0.2134, + "step": 16847 + }, + { + "epoch": 0.30050297863232617, + "grad_norm": 0.33057427406311035, + "learning_rate": 4.41221766730789e-05, + "loss": 0.1735, + "step": 16848 + }, + { + "epoch": 0.30052081475403986, + "grad_norm": 0.24340255558490753, + "learning_rate": 4.412117398924074e-05, + "loss": 0.2448, + "step": 16849 + }, + { + "epoch": 0.3005386508757536, + "grad_norm": 0.19123055040836334, + "learning_rate": 4.412017123128231e-05, + "loss": 0.1649, + "step": 16850 + }, + { + "epoch": 0.3005564869974673, + "grad_norm": 0.278317928314209, + "learning_rate": 4.411916839920749e-05, + "loss": 0.1513, + "step": 16851 + }, + { + "epoch": 0.300574323119181, + "grad_norm": 0.24234922230243683, + "learning_rate": 4.411816549302017e-05, + "loss": 0.1623, + "step": 16852 + }, + { + "epoch": 0.30059215924089466, + "grad_norm": 0.26652705669403076, + "learning_rate": 4.4117162512724236e-05, + "loss": 0.1527, + "step": 16853 + }, + { + "epoch": 0.30060999536260835, + "grad_norm": 0.27909377217292786, + "learning_rate": 4.411615945832358e-05, + "loss": 0.1931, + "step": 16854 + }, + { + "epoch": 0.30062783148432204, + "grad_norm": 0.2569110691547394, + "learning_rate": 4.411515632982208e-05, + "loss": 0.1482, + "step": 16855 + }, + { + "epoch": 0.3006456676060357, + "grad_norm": 0.32553431391716003, + "learning_rate": 4.411415312722364e-05, + "loss": 0.194, + "step": 16856 + }, + { + "epoch": 0.3006635037277494, + "grad_norm": 0.4370788633823395, + "learning_rate": 4.411314985053214e-05, + "loss": 0.2551, + "step": 16857 + }, + { + "epoch": 0.30068133984946316, + "grad_norm": 0.23892571032047272, + "learning_rate": 4.4112146499751465e-05, + "loss": 0.2054, + "step": 16858 + }, + { + "epoch": 0.30069917597117685, + "grad_norm": 0.20712657272815704, + "learning_rate": 4.411114307488551e-05, + "loss": 0.1528, + "step": 16859 + }, + { + "epoch": 0.30071701209289053, + "grad_norm": 0.2771521806716919, + "learning_rate": 4.411013957593817e-05, + "loss": 0.1666, + "step": 16860 + }, + { + "epoch": 0.3007348482146042, + "grad_norm": 0.2017921805381775, + "learning_rate": 4.410913600291332e-05, + "loss": 0.1416, + "step": 16861 + }, + { + "epoch": 0.3007526843363179, + "grad_norm": 0.35346201062202454, + "learning_rate": 4.4108132355814864e-05, + "loss": 0.1382, + "step": 16862 + }, + { + "epoch": 0.3007705204580316, + "grad_norm": 0.22023862600326538, + "learning_rate": 4.410712863464668e-05, + "loss": 0.1864, + "step": 16863 + }, + { + "epoch": 0.3007883565797453, + "grad_norm": 0.22985489666461945, + "learning_rate": 4.410612483941268e-05, + "loss": 0.1269, + "step": 16864 + }, + { + "epoch": 0.300806192701459, + "grad_norm": 0.4157508611679077, + "learning_rate": 4.410512097011673e-05, + "loss": 0.2055, + "step": 16865 + }, + { + "epoch": 0.30082402882317266, + "grad_norm": 0.24915878474712372, + "learning_rate": 4.4104117026762734e-05, + "loss": 0.1615, + "step": 16866 + }, + { + "epoch": 0.3008418649448864, + "grad_norm": 0.2501599192619324, + "learning_rate": 4.410311300935459e-05, + "loss": 0.1782, + "step": 16867 + }, + { + "epoch": 0.3008597010666001, + "grad_norm": 0.26404523849487305, + "learning_rate": 4.4102108917896165e-05, + "loss": 0.1854, + "step": 16868 + }, + { + "epoch": 0.3008775371883138, + "grad_norm": 0.2470647692680359, + "learning_rate": 4.410110475239139e-05, + "loss": 0.1605, + "step": 16869 + }, + { + "epoch": 0.30089537331002747, + "grad_norm": 0.28237083554267883, + "learning_rate": 4.4100100512844116e-05, + "loss": 0.1612, + "step": 16870 + }, + { + "epoch": 0.30091320943174116, + "grad_norm": 0.32255420088768005, + "learning_rate": 4.409909619925827e-05, + "loss": 0.1972, + "step": 16871 + }, + { + "epoch": 0.30093104555345485, + "grad_norm": 0.25200021266937256, + "learning_rate": 4.409809181163772e-05, + "loss": 0.1604, + "step": 16872 + }, + { + "epoch": 0.30094888167516853, + "grad_norm": 0.2677861750125885, + "learning_rate": 4.4097087349986376e-05, + "loss": 0.1711, + "step": 16873 + }, + { + "epoch": 0.3009667177968822, + "grad_norm": 0.38924846053123474, + "learning_rate": 4.409608281430812e-05, + "loss": 0.1854, + "step": 16874 + }, + { + "epoch": 0.30098455391859597, + "grad_norm": 0.3989528715610504, + "learning_rate": 4.409507820460686e-05, + "loss": 0.1908, + "step": 16875 + }, + { + "epoch": 0.30100239004030965, + "grad_norm": 0.32209452986717224, + "learning_rate": 4.409407352088647e-05, + "loss": 0.1957, + "step": 16876 + }, + { + "epoch": 0.30102022616202334, + "grad_norm": 0.2875838577747345, + "learning_rate": 4.409306876315087e-05, + "loss": 0.1689, + "step": 16877 + }, + { + "epoch": 0.30103806228373703, + "grad_norm": 0.46301335096359253, + "learning_rate": 4.4092063931403924e-05, + "loss": 0.1897, + "step": 16878 + }, + { + "epoch": 0.3010558984054507, + "grad_norm": 0.30506759881973267, + "learning_rate": 4.4091059025649564e-05, + "loss": 0.1541, + "step": 16879 + }, + { + "epoch": 0.3010737345271644, + "grad_norm": 0.3367038369178772, + "learning_rate": 4.409005404589165e-05, + "loss": 0.2031, + "step": 16880 + }, + { + "epoch": 0.3010915706488781, + "grad_norm": 0.2865173816680908, + "learning_rate": 4.40890489921341e-05, + "loss": 0.1877, + "step": 16881 + }, + { + "epoch": 0.3011094067705918, + "grad_norm": 0.28864696621894836, + "learning_rate": 4.40880438643808e-05, + "loss": 0.1728, + "step": 16882 + }, + { + "epoch": 0.3011272428923055, + "grad_norm": 0.3322356939315796, + "learning_rate": 4.408703866263565e-05, + "loss": 0.2066, + "step": 16883 + }, + { + "epoch": 0.3011450790140192, + "grad_norm": 0.28695881366729736, + "learning_rate": 4.408603338690255e-05, + "loss": 0.191, + "step": 16884 + }, + { + "epoch": 0.3011629151357329, + "grad_norm": 0.2675721347332001, + "learning_rate": 4.408502803718538e-05, + "loss": 0.2096, + "step": 16885 + }, + { + "epoch": 0.3011807512574466, + "grad_norm": 0.2305581122636795, + "learning_rate": 4.408402261348806e-05, + "loss": 0.1986, + "step": 16886 + }, + { + "epoch": 0.3011985873791603, + "grad_norm": 0.29373911023139954, + "learning_rate": 4.4083017115814474e-05, + "loss": 0.2142, + "step": 16887 + }, + { + "epoch": 0.30121642350087396, + "grad_norm": 0.25480425357818604, + "learning_rate": 4.408201154416853e-05, + "loss": 0.1703, + "step": 16888 + }, + { + "epoch": 0.30123425962258765, + "grad_norm": 0.2277686595916748, + "learning_rate": 4.4081005898554106e-05, + "loss": 0.1317, + "step": 16889 + }, + { + "epoch": 0.30125209574430134, + "grad_norm": 0.2672290503978729, + "learning_rate": 4.4080000178975126e-05, + "loss": 0.1957, + "step": 16890 + }, + { + "epoch": 0.30126993186601503, + "grad_norm": 0.41452622413635254, + "learning_rate": 4.407899438543547e-05, + "loss": 0.1771, + "step": 16891 + }, + { + "epoch": 0.30128776798772877, + "grad_norm": 0.3050772249698639, + "learning_rate": 4.407798851793904e-05, + "loss": 0.2246, + "step": 16892 + }, + { + "epoch": 0.30130560410944246, + "grad_norm": 0.25255250930786133, + "learning_rate": 4.407698257648973e-05, + "loss": 0.1535, + "step": 16893 + }, + { + "epoch": 0.30132344023115615, + "grad_norm": 0.24606359004974365, + "learning_rate": 4.407597656109146e-05, + "loss": 0.1419, + "step": 16894 + }, + { + "epoch": 0.30134127635286984, + "grad_norm": 0.22171658277511597, + "learning_rate": 4.4074970471748114e-05, + "loss": 0.1746, + "step": 16895 + }, + { + "epoch": 0.3013591124745835, + "grad_norm": 0.36377984285354614, + "learning_rate": 4.407396430846358e-05, + "loss": 0.1746, + "step": 16896 + }, + { + "epoch": 0.3013769485962972, + "grad_norm": 0.40162044763565063, + "learning_rate": 4.407295807124179e-05, + "loss": 0.2032, + "step": 16897 + }, + { + "epoch": 0.3013947847180109, + "grad_norm": 0.24698717892169952, + "learning_rate": 4.4071951760086615e-05, + "loss": 0.1909, + "step": 16898 + }, + { + "epoch": 0.3014126208397246, + "grad_norm": 0.23228327929973602, + "learning_rate": 4.407094537500197e-05, + "loss": 0.1582, + "step": 16899 + }, + { + "epoch": 0.30143045696143833, + "grad_norm": 0.32421815395355225, + "learning_rate": 4.4069938915991756e-05, + "loss": 0.2042, + "step": 16900 + }, + { + "epoch": 0.301448293083152, + "grad_norm": 0.3217966854572296, + "learning_rate": 4.406893238305988e-05, + "loss": 0.1732, + "step": 16901 + }, + { + "epoch": 0.3014661292048657, + "grad_norm": 0.21149370074272156, + "learning_rate": 4.4067925776210226e-05, + "loss": 0.1763, + "step": 16902 + }, + { + "epoch": 0.3014839653265794, + "grad_norm": 0.31402158737182617, + "learning_rate": 4.406691909544671e-05, + "loss": 0.1626, + "step": 16903 + }, + { + "epoch": 0.3015018014482931, + "grad_norm": 0.2893681824207306, + "learning_rate": 4.406591234077323e-05, + "loss": 0.1634, + "step": 16904 + }, + { + "epoch": 0.30151963757000677, + "grad_norm": 0.24363847076892853, + "learning_rate": 4.406490551219368e-05, + "loss": 0.158, + "step": 16905 + }, + { + "epoch": 0.30153747369172046, + "grad_norm": 0.2350783348083496, + "learning_rate": 4.4063898609711986e-05, + "loss": 0.1762, + "step": 16906 + }, + { + "epoch": 0.30155530981343415, + "grad_norm": 0.2344968467950821, + "learning_rate": 4.406289163333203e-05, + "loss": 0.1593, + "step": 16907 + }, + { + "epoch": 0.30157314593514783, + "grad_norm": 0.2735636234283447, + "learning_rate": 4.406188458305771e-05, + "loss": 0.2033, + "step": 16908 + }, + { + "epoch": 0.3015909820568616, + "grad_norm": 0.18813778460025787, + "learning_rate": 4.4060877458892954e-05, + "loss": 0.1818, + "step": 16909 + }, + { + "epoch": 0.30160881817857527, + "grad_norm": 0.25619176030158997, + "learning_rate": 4.4059870260841654e-05, + "loss": 0.1856, + "step": 16910 + }, + { + "epoch": 0.30162665430028895, + "grad_norm": 0.22614216804504395, + "learning_rate": 4.4058862988907715e-05, + "loss": 0.2121, + "step": 16911 + }, + { + "epoch": 0.30164449042200264, + "grad_norm": 0.36545228958129883, + "learning_rate": 4.4057855643095034e-05, + "loss": 0.1449, + "step": 16912 + }, + { + "epoch": 0.30166232654371633, + "grad_norm": 0.3354528546333313, + "learning_rate": 4.405684822340753e-05, + "loss": 0.1603, + "step": 16913 + }, + { + "epoch": 0.30168016266543, + "grad_norm": 0.21166838705539703, + "learning_rate": 4.40558407298491e-05, + "loss": 0.1556, + "step": 16914 + }, + { + "epoch": 0.3016979987871437, + "grad_norm": 0.21349474787712097, + "learning_rate": 4.405483316242364e-05, + "loss": 0.1481, + "step": 16915 + }, + { + "epoch": 0.3017158349088574, + "grad_norm": 0.3126707673072815, + "learning_rate": 4.4053825521135066e-05, + "loss": 0.2052, + "step": 16916 + }, + { + "epoch": 0.30173367103057114, + "grad_norm": 0.28107452392578125, + "learning_rate": 4.405281780598729e-05, + "loss": 0.1964, + "step": 16917 + }, + { + "epoch": 0.3017515071522848, + "grad_norm": 0.21973201632499695, + "learning_rate": 4.405181001698421e-05, + "loss": 0.1209, + "step": 16918 + }, + { + "epoch": 0.3017693432739985, + "grad_norm": 0.2201208770275116, + "learning_rate": 4.4050802154129734e-05, + "loss": 0.2096, + "step": 16919 + }, + { + "epoch": 0.3017871793957122, + "grad_norm": 0.27609607577323914, + "learning_rate": 4.4049794217427764e-05, + "loss": 0.1694, + "step": 16920 + }, + { + "epoch": 0.3018050155174259, + "grad_norm": 0.2822144329547882, + "learning_rate": 4.404878620688222e-05, + "loss": 0.1164, + "step": 16921 + }, + { + "epoch": 0.3018228516391396, + "grad_norm": 0.2591968774795532, + "learning_rate": 4.4047778122497e-05, + "loss": 0.1594, + "step": 16922 + }, + { + "epoch": 0.30184068776085327, + "grad_norm": 0.3573529124259949, + "learning_rate": 4.404676996427601e-05, + "loss": 0.233, + "step": 16923 + }, + { + "epoch": 0.30185852388256695, + "grad_norm": 0.7974376678466797, + "learning_rate": 4.4045761732223165e-05, + "loss": 0.3663, + "step": 16924 + }, + { + "epoch": 0.30187636000428064, + "grad_norm": 0.26573729515075684, + "learning_rate": 4.404475342634236e-05, + "loss": 0.1647, + "step": 16925 + }, + { + "epoch": 0.3018941961259944, + "grad_norm": 0.24243800342082977, + "learning_rate": 4.404374504663752e-05, + "loss": 0.1596, + "step": 16926 + }, + { + "epoch": 0.3019120322477081, + "grad_norm": 0.2948571443557739, + "learning_rate": 4.4042736593112544e-05, + "loss": 0.2185, + "step": 16927 + }, + { + "epoch": 0.30192986836942176, + "grad_norm": 0.25806403160095215, + "learning_rate": 4.404172806577135e-05, + "loss": 0.1774, + "step": 16928 + }, + { + "epoch": 0.30194770449113545, + "grad_norm": 0.28467893600463867, + "learning_rate": 4.404071946461784e-05, + "loss": 0.1841, + "step": 16929 + }, + { + "epoch": 0.30196554061284914, + "grad_norm": 0.20621229708194733, + "learning_rate": 4.4039710789655916e-05, + "loss": 0.1508, + "step": 16930 + }, + { + "epoch": 0.3019833767345628, + "grad_norm": 0.2832823395729065, + "learning_rate": 4.403870204088951e-05, + "loss": 0.1672, + "step": 16931 + }, + { + "epoch": 0.3020012128562765, + "grad_norm": 0.244558185338974, + "learning_rate": 4.4037693218322506e-05, + "loss": 0.1845, + "step": 16932 + }, + { + "epoch": 0.3020190489779902, + "grad_norm": 0.2347700595855713, + "learning_rate": 4.403668432195883e-05, + "loss": 0.1458, + "step": 16933 + }, + { + "epoch": 0.30203688509970394, + "grad_norm": 0.2341766655445099, + "learning_rate": 4.4035675351802396e-05, + "loss": 0.154, + "step": 16934 + }, + { + "epoch": 0.30205472122141763, + "grad_norm": 0.2508016526699066, + "learning_rate": 4.403466630785711e-05, + "loss": 0.1407, + "step": 16935 + }, + { + "epoch": 0.3020725573431313, + "grad_norm": 0.25141480565071106, + "learning_rate": 4.403365719012688e-05, + "loss": 0.1419, + "step": 16936 + }, + { + "epoch": 0.302090393464845, + "grad_norm": 0.4405762851238251, + "learning_rate": 4.4032647998615623e-05, + "loss": 0.1858, + "step": 16937 + }, + { + "epoch": 0.3021082295865587, + "grad_norm": 0.27560028433799744, + "learning_rate": 4.403163873332725e-05, + "loss": 0.2225, + "step": 16938 + }, + { + "epoch": 0.3021260657082724, + "grad_norm": 0.33127138018608093, + "learning_rate": 4.4030629394265666e-05, + "loss": 0.1982, + "step": 16939 + }, + { + "epoch": 0.3021439018299861, + "grad_norm": 0.31129294633865356, + "learning_rate": 4.402961998143479e-05, + "loss": 0.1921, + "step": 16940 + }, + { + "epoch": 0.30216173795169976, + "grad_norm": 0.30967044830322266, + "learning_rate": 4.402861049483854e-05, + "loss": 0.1767, + "step": 16941 + }, + { + "epoch": 0.3021795740734135, + "grad_norm": 0.2037944197654724, + "learning_rate": 4.402760093448082e-05, + "loss": 0.1484, + "step": 16942 + }, + { + "epoch": 0.3021974101951272, + "grad_norm": 0.25828155875205994, + "learning_rate": 4.4026591300365545e-05, + "loss": 0.1489, + "step": 16943 + }, + { + "epoch": 0.3022152463168409, + "grad_norm": 0.2212415188550949, + "learning_rate": 4.4025581592496635e-05, + "loss": 0.1618, + "step": 16944 + }, + { + "epoch": 0.30223308243855457, + "grad_norm": 0.20023676753044128, + "learning_rate": 4.4024571810878e-05, + "loss": 0.1984, + "step": 16945 + }, + { + "epoch": 0.30225091856026826, + "grad_norm": 0.2713087499141693, + "learning_rate": 4.402356195551355e-05, + "loss": 0.172, + "step": 16946 + }, + { + "epoch": 0.30226875468198194, + "grad_norm": 0.33247488737106323, + "learning_rate": 4.4022552026407204e-05, + "loss": 0.2011, + "step": 16947 + }, + { + "epoch": 0.30228659080369563, + "grad_norm": 0.24287216365337372, + "learning_rate": 4.402154202356288e-05, + "loss": 0.133, + "step": 16948 + }, + { + "epoch": 0.3023044269254093, + "grad_norm": 0.20639853179454803, + "learning_rate": 4.4020531946984476e-05, + "loss": 0.1544, + "step": 16949 + }, + { + "epoch": 0.302322263047123, + "grad_norm": 0.2537766695022583, + "learning_rate": 4.4019521796675936e-05, + "loss": 0.1678, + "step": 16950 + }, + { + "epoch": 0.30234009916883675, + "grad_norm": 0.23143869638442993, + "learning_rate": 4.401851157264115e-05, + "loss": 0.1496, + "step": 16951 + }, + { + "epoch": 0.30235793529055044, + "grad_norm": 0.2683291435241699, + "learning_rate": 4.401750127488405e-05, + "loss": 0.1121, + "step": 16952 + }, + { + "epoch": 0.3023757714122641, + "grad_norm": 0.2808206081390381, + "learning_rate": 4.401649090340855e-05, + "loss": 0.1899, + "step": 16953 + }, + { + "epoch": 0.3023936075339778, + "grad_norm": 0.23837272822856903, + "learning_rate": 4.4015480458218564e-05, + "loss": 0.1872, + "step": 16954 + }, + { + "epoch": 0.3024114436556915, + "grad_norm": 0.27478933334350586, + "learning_rate": 4.4014469939318e-05, + "loss": 0.1489, + "step": 16955 + }, + { + "epoch": 0.3024292797774052, + "grad_norm": 0.2566738724708557, + "learning_rate": 4.401345934671078e-05, + "loss": 0.2003, + "step": 16956 + }, + { + "epoch": 0.3024471158991189, + "grad_norm": 0.28140363097190857, + "learning_rate": 4.4012448680400835e-05, + "loss": 0.1738, + "step": 16957 + }, + { + "epoch": 0.30246495202083257, + "grad_norm": 0.2634008824825287, + "learning_rate": 4.401143794039207e-05, + "loss": 0.2415, + "step": 16958 + }, + { + "epoch": 0.3024827881425463, + "grad_norm": 0.3733348846435547, + "learning_rate": 4.40104271266884e-05, + "loss": 0.2047, + "step": 16959 + }, + { + "epoch": 0.30250062426426, + "grad_norm": 0.27941736578941345, + "learning_rate": 4.4009416239293756e-05, + "loss": 0.1713, + "step": 16960 + }, + { + "epoch": 0.3025184603859737, + "grad_norm": 0.23974740505218506, + "learning_rate": 4.400840527821204e-05, + "loss": 0.2106, + "step": 16961 + }, + { + "epoch": 0.3025362965076874, + "grad_norm": 0.29771092534065247, + "learning_rate": 4.400739424344719e-05, + "loss": 0.1727, + "step": 16962 + }, + { + "epoch": 0.30255413262940106, + "grad_norm": 0.27438756823539734, + "learning_rate": 4.4006383135003106e-05, + "loss": 0.2312, + "step": 16963 + }, + { + "epoch": 0.30257196875111475, + "grad_norm": 0.1773771047592163, + "learning_rate": 4.4005371952883725e-05, + "loss": 0.1468, + "step": 16964 + }, + { + "epoch": 0.30258980487282844, + "grad_norm": 0.24084240198135376, + "learning_rate": 4.400436069709295e-05, + "loss": 0.1759, + "step": 16965 + }, + { + "epoch": 0.3026076409945421, + "grad_norm": 0.29347968101501465, + "learning_rate": 4.400334936763471e-05, + "loss": 0.1842, + "step": 16966 + }, + { + "epoch": 0.3026254771162558, + "grad_norm": 0.2621789872646332, + "learning_rate": 4.4002337964512926e-05, + "loss": 0.16, + "step": 16967 + }, + { + "epoch": 0.30264331323796956, + "grad_norm": 0.3632178008556366, + "learning_rate": 4.400132648773151e-05, + "loss": 0.2045, + "step": 16968 + }, + { + "epoch": 0.30266114935968325, + "grad_norm": 0.24869462847709656, + "learning_rate": 4.400031493729441e-05, + "loss": 0.1782, + "step": 16969 + }, + { + "epoch": 0.30267898548139693, + "grad_norm": 0.32511067390441895, + "learning_rate": 4.399930331320551e-05, + "loss": 0.2122, + "step": 16970 + }, + { + "epoch": 0.3026968216031106, + "grad_norm": 0.3275891840457916, + "learning_rate": 4.3998291615468746e-05, + "loss": 0.1884, + "step": 16971 + }, + { + "epoch": 0.3027146577248243, + "grad_norm": 0.2646700441837311, + "learning_rate": 4.399727984408805e-05, + "loss": 0.1741, + "step": 16972 + }, + { + "epoch": 0.302732493846538, + "grad_norm": 0.2906220555305481, + "learning_rate": 4.399626799906733e-05, + "loss": 0.1499, + "step": 16973 + }, + { + "epoch": 0.3027503299682517, + "grad_norm": 0.3187178373336792, + "learning_rate": 4.399525608041052e-05, + "loss": 0.2222, + "step": 16974 + }, + { + "epoch": 0.3027681660899654, + "grad_norm": 0.24243609607219696, + "learning_rate": 4.399424408812154e-05, + "loss": 0.1795, + "step": 16975 + }, + { + "epoch": 0.3027860022116791, + "grad_norm": 0.2514675557613373, + "learning_rate": 4.39932320222043e-05, + "loss": 0.176, + "step": 16976 + }, + { + "epoch": 0.3028038383333928, + "grad_norm": 0.31117239594459534, + "learning_rate": 4.399221988266273e-05, + "loss": 0.1634, + "step": 16977 + }, + { + "epoch": 0.3028216744551065, + "grad_norm": 0.2353062927722931, + "learning_rate": 4.399120766950077e-05, + "loss": 0.159, + "step": 16978 + }, + { + "epoch": 0.3028395105768202, + "grad_norm": 0.1824413239955902, + "learning_rate": 4.399019538272232e-05, + "loss": 0.1327, + "step": 16979 + }, + { + "epoch": 0.30285734669853387, + "grad_norm": 0.3388562500476837, + "learning_rate": 4.3989183022331315e-05, + "loss": 0.1105, + "step": 16980 + }, + { + "epoch": 0.30287518282024756, + "grad_norm": 0.1896107941865921, + "learning_rate": 4.398817058833168e-05, + "loss": 0.1198, + "step": 16981 + }, + { + "epoch": 0.30289301894196125, + "grad_norm": 0.23556606471538544, + "learning_rate": 4.398715808072734e-05, + "loss": 0.1921, + "step": 16982 + }, + { + "epoch": 0.30291085506367493, + "grad_norm": 0.29156801104545593, + "learning_rate": 4.3986145499522216e-05, + "loss": 0.197, + "step": 16983 + }, + { + "epoch": 0.3029286911853887, + "grad_norm": 0.41407445073127747, + "learning_rate": 4.398513284472023e-05, + "loss": 0.17, + "step": 16984 + }, + { + "epoch": 0.30294652730710236, + "grad_norm": 0.19947557151317596, + "learning_rate": 4.398412011632531e-05, + "loss": 0.1516, + "step": 16985 + }, + { + "epoch": 0.30296436342881605, + "grad_norm": 0.2209668606519699, + "learning_rate": 4.398310731434139e-05, + "loss": 0.156, + "step": 16986 + }, + { + "epoch": 0.30298219955052974, + "grad_norm": 0.3041382133960724, + "learning_rate": 4.398209443877239e-05, + "loss": 0.1352, + "step": 16987 + }, + { + "epoch": 0.30300003567224343, + "grad_norm": 0.2699025273323059, + "learning_rate": 4.398108148962223e-05, + "loss": 0.1499, + "step": 16988 + }, + { + "epoch": 0.3030178717939571, + "grad_norm": 0.21555422246456146, + "learning_rate": 4.398006846689484e-05, + "loss": 0.1698, + "step": 16989 + }, + { + "epoch": 0.3030357079156708, + "grad_norm": 0.4011364281177521, + "learning_rate": 4.397905537059416e-05, + "loss": 0.2007, + "step": 16990 + }, + { + "epoch": 0.3030535440373845, + "grad_norm": 0.2812120020389557, + "learning_rate": 4.39780422007241e-05, + "loss": 0.1468, + "step": 16991 + }, + { + "epoch": 0.3030713801590982, + "grad_norm": 0.2659045457839966, + "learning_rate": 4.397702895728859e-05, + "loss": 0.1718, + "step": 16992 + }, + { + "epoch": 0.3030892162808119, + "grad_norm": 0.2492925375699997, + "learning_rate": 4.3976015640291566e-05, + "loss": 0.1778, + "step": 16993 + }, + { + "epoch": 0.3031070524025256, + "grad_norm": 0.30119457840919495, + "learning_rate": 4.3975002249736955e-05, + "loss": 0.1842, + "step": 16994 + }, + { + "epoch": 0.3031248885242393, + "grad_norm": 0.37086451053619385, + "learning_rate": 4.397398878562867e-05, + "loss": 0.1545, + "step": 16995 + }, + { + "epoch": 0.303142724645953, + "grad_norm": 0.20378327369689941, + "learning_rate": 4.397297524797066e-05, + "loss": 0.1358, + "step": 16996 + }, + { + "epoch": 0.3031605607676667, + "grad_norm": 0.3548892140388489, + "learning_rate": 4.397196163676685e-05, + "loss": 0.22, + "step": 16997 + }, + { + "epoch": 0.30317839688938036, + "grad_norm": 0.21069128811359406, + "learning_rate": 4.3970947952021154e-05, + "loss": 0.1642, + "step": 16998 + }, + { + "epoch": 0.30319623301109405, + "grad_norm": 0.23091521859169006, + "learning_rate": 4.3969934193737516e-05, + "loss": 0.2219, + "step": 16999 + }, + { + "epoch": 0.30321406913280774, + "grad_norm": 0.2196367084980011, + "learning_rate": 4.3968920361919865e-05, + "loss": 0.1685, + "step": 17000 + }, + { + "epoch": 0.30321406913280774, + "eval_loss": 0.16822969913482666, + "eval_runtime": 106.0413, + "eval_samples_per_second": 9.657, + "eval_steps_per_second": 1.613, + "step": 17000 + }, + { + "epoch": 0.3032319052545215, + "grad_norm": 0.25229987502098083, + "learning_rate": 4.396790645657212e-05, + "loss": 0.1903, + "step": 17001 + }, + { + "epoch": 0.30324974137623517, + "grad_norm": 0.32935184240341187, + "learning_rate": 4.3966892477698216e-05, + "loss": 0.1428, + "step": 17002 + }, + { + "epoch": 0.30326757749794886, + "grad_norm": 0.2073621153831482, + "learning_rate": 4.3965878425302085e-05, + "loss": 0.1182, + "step": 17003 + }, + { + "epoch": 0.30328541361966255, + "grad_norm": 0.1906985491514206, + "learning_rate": 4.396486429938766e-05, + "loss": 0.2033, + "step": 17004 + }, + { + "epoch": 0.30330324974137624, + "grad_norm": 0.2603358030319214, + "learning_rate": 4.3963850099958884e-05, + "loss": 0.1933, + "step": 17005 + }, + { + "epoch": 0.3033210858630899, + "grad_norm": 0.23725542426109314, + "learning_rate": 4.396283582701967e-05, + "loss": 0.1724, + "step": 17006 + }, + { + "epoch": 0.3033389219848036, + "grad_norm": 0.23338156938552856, + "learning_rate": 4.396182148057394e-05, + "loss": 0.1604, + "step": 17007 + }, + { + "epoch": 0.3033567581065173, + "grad_norm": 0.29763856530189514, + "learning_rate": 4.396080706062565e-05, + "loss": 0.1257, + "step": 17008 + }, + { + "epoch": 0.303374594228231, + "grad_norm": 0.2634035050868988, + "learning_rate": 4.395979256717873e-05, + "loss": 0.1998, + "step": 17009 + }, + { + "epoch": 0.30339243034994473, + "grad_norm": 0.28416603803634644, + "learning_rate": 4.39587780002371e-05, + "loss": 0.1675, + "step": 17010 + }, + { + "epoch": 0.3034102664716584, + "grad_norm": 0.2536405920982361, + "learning_rate": 4.3957763359804695e-05, + "loss": 0.1796, + "step": 17011 + }, + { + "epoch": 0.3034281025933721, + "grad_norm": 0.2704066336154938, + "learning_rate": 4.3956748645885455e-05, + "loss": 0.1731, + "step": 17012 + }, + { + "epoch": 0.3034459387150858, + "grad_norm": 0.27453184127807617, + "learning_rate": 4.395573385848331e-05, + "loss": 0.2455, + "step": 17013 + }, + { + "epoch": 0.3034637748367995, + "grad_norm": 0.23233869671821594, + "learning_rate": 4.395471899760219e-05, + "loss": 0.1834, + "step": 17014 + }, + { + "epoch": 0.30348161095851317, + "grad_norm": 0.26142966747283936, + "learning_rate": 4.395370406324603e-05, + "loss": 0.1477, + "step": 17015 + }, + { + "epoch": 0.30349944708022686, + "grad_norm": 0.39710143208503723, + "learning_rate": 4.395268905541877e-05, + "loss": 0.1755, + "step": 17016 + }, + { + "epoch": 0.30351728320194055, + "grad_norm": 0.3166236877441406, + "learning_rate": 4.3951673974124346e-05, + "loss": 0.187, + "step": 17017 + }, + { + "epoch": 0.3035351193236543, + "grad_norm": 0.26329943537712097, + "learning_rate": 4.395065881936669e-05, + "loss": 0.1379, + "step": 17018 + }, + { + "epoch": 0.303552955445368, + "grad_norm": 0.2653287351131439, + "learning_rate": 4.394964359114972e-05, + "loss": 0.148, + "step": 17019 + }, + { + "epoch": 0.30357079156708167, + "grad_norm": 0.21244986355304718, + "learning_rate": 4.39486282894774e-05, + "loss": 0.135, + "step": 17020 + }, + { + "epoch": 0.30358862768879535, + "grad_norm": 0.31565529108047485, + "learning_rate": 4.3947612914353654e-05, + "loss": 0.1928, + "step": 17021 + }, + { + "epoch": 0.30360646381050904, + "grad_norm": 0.18823598325252533, + "learning_rate": 4.3946597465782403e-05, + "loss": 0.1371, + "step": 17022 + }, + { + "epoch": 0.30362429993222273, + "grad_norm": 0.22636570036411285, + "learning_rate": 4.39455819437676e-05, + "loss": 0.1774, + "step": 17023 + }, + { + "epoch": 0.3036421360539364, + "grad_norm": 0.2921881377696991, + "learning_rate": 4.394456634831319e-05, + "loss": 0.2319, + "step": 17024 + }, + { + "epoch": 0.3036599721756501, + "grad_norm": 0.24715429544448853, + "learning_rate": 4.3943550679423085e-05, + "loss": 0.1943, + "step": 17025 + }, + { + "epoch": 0.3036778082973638, + "grad_norm": 0.1877565234899521, + "learning_rate": 4.3942534937101235e-05, + "loss": 0.1729, + "step": 17026 + }, + { + "epoch": 0.30369564441907754, + "grad_norm": 0.3187621235847473, + "learning_rate": 4.394151912135158e-05, + "loss": 0.1537, + "step": 17027 + }, + { + "epoch": 0.3037134805407912, + "grad_norm": 0.2842850089073181, + "learning_rate": 4.394050323217806e-05, + "loss": 0.1646, + "step": 17028 + }, + { + "epoch": 0.3037313166625049, + "grad_norm": 0.2343442589044571, + "learning_rate": 4.39394872695846e-05, + "loss": 0.1509, + "step": 17029 + }, + { + "epoch": 0.3037491527842186, + "grad_norm": 0.2878730893135071, + "learning_rate": 4.393847123357515e-05, + "loss": 0.1865, + "step": 17030 + }, + { + "epoch": 0.3037669889059323, + "grad_norm": 0.22307166457176208, + "learning_rate": 4.3937455124153645e-05, + "loss": 0.1549, + "step": 17031 + }, + { + "epoch": 0.303784825027646, + "grad_norm": 0.2224445641040802, + "learning_rate": 4.3936438941324024e-05, + "loss": 0.1291, + "step": 17032 + }, + { + "epoch": 0.30380266114935967, + "grad_norm": 0.26214656233787537, + "learning_rate": 4.3935422685090215e-05, + "loss": 0.1418, + "step": 17033 + }, + { + "epoch": 0.30382049727107335, + "grad_norm": 0.26912835240364075, + "learning_rate": 4.3934406355456184e-05, + "loss": 0.1797, + "step": 17034 + }, + { + "epoch": 0.3038383333927871, + "grad_norm": 0.27315858006477356, + "learning_rate": 4.393338995242584e-05, + "loss": 0.2291, + "step": 17035 + }, + { + "epoch": 0.3038561695145008, + "grad_norm": 0.2199607938528061, + "learning_rate": 4.393237347600314e-05, + "loss": 0.2154, + "step": 17036 + }, + { + "epoch": 0.3038740056362145, + "grad_norm": 0.3718980550765991, + "learning_rate": 4.393135692619202e-05, + "loss": 0.2124, + "step": 17037 + }, + { + "epoch": 0.30389184175792816, + "grad_norm": 0.47460609674453735, + "learning_rate": 4.393034030299643e-05, + "loss": 0.2096, + "step": 17038 + }, + { + "epoch": 0.30390967787964185, + "grad_norm": 0.2550341486930847, + "learning_rate": 4.39293236064203e-05, + "loss": 0.1941, + "step": 17039 + }, + { + "epoch": 0.30392751400135554, + "grad_norm": 0.23543892800807953, + "learning_rate": 4.392830683646757e-05, + "loss": 0.1297, + "step": 17040 + }, + { + "epoch": 0.3039453501230692, + "grad_norm": 0.2201838344335556, + "learning_rate": 4.3927289993142185e-05, + "loss": 0.1795, + "step": 17041 + }, + { + "epoch": 0.3039631862447829, + "grad_norm": 0.3389892280101776, + "learning_rate": 4.392627307644809e-05, + "loss": 0.1682, + "step": 17042 + }, + { + "epoch": 0.30398102236649666, + "grad_norm": 0.25401630997657776, + "learning_rate": 4.392525608638922e-05, + "loss": 0.2027, + "step": 17043 + }, + { + "epoch": 0.30399885848821034, + "grad_norm": 0.3649926483631134, + "learning_rate": 4.392423902296953e-05, + "loss": 0.2452, + "step": 17044 + }, + { + "epoch": 0.30401669460992403, + "grad_norm": 0.25108101963996887, + "learning_rate": 4.3923221886192945e-05, + "loss": 0.1491, + "step": 17045 + }, + { + "epoch": 0.3040345307316377, + "grad_norm": 0.3020821809768677, + "learning_rate": 4.3922204676063415e-05, + "loss": 0.192, + "step": 17046 + }, + { + "epoch": 0.3040523668533514, + "grad_norm": 0.18951134383678436, + "learning_rate": 4.3921187392584884e-05, + "loss": 0.1634, + "step": 17047 + }, + { + "epoch": 0.3040702029750651, + "grad_norm": 0.2740190923213959, + "learning_rate": 4.39201700357613e-05, + "loss": 0.1755, + "step": 17048 + }, + { + "epoch": 0.3040880390967788, + "grad_norm": 0.29754915833473206, + "learning_rate": 4.39191526055966e-05, + "loss": 0.1942, + "step": 17049 + }, + { + "epoch": 0.30410587521849247, + "grad_norm": 0.39213114976882935, + "learning_rate": 4.3918135102094736e-05, + "loss": 0.2362, + "step": 17050 + }, + { + "epoch": 0.30412371134020616, + "grad_norm": 0.299716591835022, + "learning_rate": 4.391711752525964e-05, + "loss": 0.1955, + "step": 17051 + }, + { + "epoch": 0.3041415474619199, + "grad_norm": 0.26223134994506836, + "learning_rate": 4.391609987509526e-05, + "loss": 0.1917, + "step": 17052 + }, + { + "epoch": 0.3041593835836336, + "grad_norm": 0.3177604675292969, + "learning_rate": 4.391508215160555e-05, + "loss": 0.1793, + "step": 17053 + }, + { + "epoch": 0.3041772197053473, + "grad_norm": 0.21626578271389008, + "learning_rate": 4.391406435479444e-05, + "loss": 0.1666, + "step": 17054 + }, + { + "epoch": 0.30419505582706097, + "grad_norm": 0.30217376351356506, + "learning_rate": 4.391304648466589e-05, + "loss": 0.1758, + "step": 17055 + }, + { + "epoch": 0.30421289194877466, + "grad_norm": 0.204359769821167, + "learning_rate": 4.3912028541223844e-05, + "loss": 0.1664, + "step": 17056 + }, + { + "epoch": 0.30423072807048834, + "grad_norm": 0.18372325599193573, + "learning_rate": 4.391101052447224e-05, + "loss": 0.1368, + "step": 17057 + }, + { + "epoch": 0.30424856419220203, + "grad_norm": 0.22900182008743286, + "learning_rate": 4.390999243441502e-05, + "loss": 0.1176, + "step": 17058 + }, + { + "epoch": 0.3042664003139157, + "grad_norm": 0.24883610010147095, + "learning_rate": 4.3908974271056145e-05, + "loss": 0.2003, + "step": 17059 + }, + { + "epoch": 0.30428423643562946, + "grad_norm": 0.3380500376224518, + "learning_rate": 4.390795603439955e-05, + "loss": 0.1279, + "step": 17060 + }, + { + "epoch": 0.30430207255734315, + "grad_norm": 0.1848413646221161, + "learning_rate": 4.390693772444919e-05, + "loss": 0.1643, + "step": 17061 + }, + { + "epoch": 0.30431990867905684, + "grad_norm": 0.25698333978652954, + "learning_rate": 4.390591934120901e-05, + "loss": 0.166, + "step": 17062 + }, + { + "epoch": 0.3043377448007705, + "grad_norm": 0.36611783504486084, + "learning_rate": 4.3904900884682966e-05, + "loss": 0.1765, + "step": 17063 + }, + { + "epoch": 0.3043555809224842, + "grad_norm": 0.29192814230918884, + "learning_rate": 4.390388235487498e-05, + "loss": 0.1683, + "step": 17064 + }, + { + "epoch": 0.3043734170441979, + "grad_norm": 0.24549239873886108, + "learning_rate": 4.390286375178903e-05, + "loss": 0.1958, + "step": 17065 + }, + { + "epoch": 0.3043912531659116, + "grad_norm": 0.24323663115501404, + "learning_rate": 4.390184507542904e-05, + "loss": 0.121, + "step": 17066 + }, + { + "epoch": 0.3044090892876253, + "grad_norm": 0.24716830253601074, + "learning_rate": 4.3900826325798974e-05, + "loss": 0.1688, + "step": 17067 + }, + { + "epoch": 0.30442692540933897, + "grad_norm": 0.2796953320503235, + "learning_rate": 4.389980750290278e-05, + "loss": 0.1656, + "step": 17068 + }, + { + "epoch": 0.3044447615310527, + "grad_norm": 0.2372235506772995, + "learning_rate": 4.38987886067444e-05, + "loss": 0.1685, + "step": 17069 + }, + { + "epoch": 0.3044625976527664, + "grad_norm": 0.3148409128189087, + "learning_rate": 4.389776963732779e-05, + "loss": 0.2177, + "step": 17070 + }, + { + "epoch": 0.3044804337744801, + "grad_norm": 0.2756401002407074, + "learning_rate": 4.38967505946569e-05, + "loss": 0.1725, + "step": 17071 + }, + { + "epoch": 0.3044982698961938, + "grad_norm": 0.3646738529205322, + "learning_rate": 4.3895731478735675e-05, + "loss": 0.1571, + "step": 17072 + }, + { + "epoch": 0.30451610601790746, + "grad_norm": 0.28116896748542786, + "learning_rate": 4.389471228956807e-05, + "loss": 0.2026, + "step": 17073 + }, + { + "epoch": 0.30453394213962115, + "grad_norm": 0.3507188856601715, + "learning_rate": 4.3893693027158035e-05, + "loss": 0.1763, + "step": 17074 + }, + { + "epoch": 0.30455177826133484, + "grad_norm": 0.20710495114326477, + "learning_rate": 4.389267369150951e-05, + "loss": 0.175, + "step": 17075 + }, + { + "epoch": 0.3045696143830485, + "grad_norm": 0.29449790716171265, + "learning_rate": 4.3891654282626474e-05, + "loss": 0.2129, + "step": 17076 + }, + { + "epoch": 0.30458745050476227, + "grad_norm": 0.18216033279895782, + "learning_rate": 4.389063480051285e-05, + "loss": 0.1475, + "step": 17077 + }, + { + "epoch": 0.30460528662647596, + "grad_norm": 0.2358478307723999, + "learning_rate": 4.38896152451726e-05, + "loss": 0.1749, + "step": 17078 + }, + { + "epoch": 0.30462312274818965, + "grad_norm": 0.21464504301548004, + "learning_rate": 4.388859561660969e-05, + "loss": 0.1624, + "step": 17079 + }, + { + "epoch": 0.30464095886990333, + "grad_norm": 0.2703222930431366, + "learning_rate": 4.3887575914828036e-05, + "loss": 0.1453, + "step": 17080 + }, + { + "epoch": 0.304658794991617, + "grad_norm": 0.27065274119377136, + "learning_rate": 4.388655613983163e-05, + "loss": 0.1824, + "step": 17081 + }, + { + "epoch": 0.3046766311133307, + "grad_norm": 0.26811903715133667, + "learning_rate": 4.388553629162441e-05, + "loss": 0.1799, + "step": 17082 + }, + { + "epoch": 0.3046944672350444, + "grad_norm": 0.35675516724586487, + "learning_rate": 4.3884516370210325e-05, + "loss": 0.1683, + "step": 17083 + }, + { + "epoch": 0.3047123033567581, + "grad_norm": 0.2833549678325653, + "learning_rate": 4.388349637559334e-05, + "loss": 0.2409, + "step": 17084 + }, + { + "epoch": 0.30473013947847183, + "grad_norm": 0.3995152711868286, + "learning_rate": 4.388247630777739e-05, + "loss": 0.2324, + "step": 17085 + }, + { + "epoch": 0.3047479756001855, + "grad_norm": 0.2744006812572479, + "learning_rate": 4.388145616676644e-05, + "loss": 0.2162, + "step": 17086 + }, + { + "epoch": 0.3047658117218992, + "grad_norm": 0.2544262707233429, + "learning_rate": 4.388043595256445e-05, + "loss": 0.2443, + "step": 17087 + }, + { + "epoch": 0.3047836478436129, + "grad_norm": 0.30234959721565247, + "learning_rate": 4.387941566517536e-05, + "loss": 0.1803, + "step": 17088 + }, + { + "epoch": 0.3048014839653266, + "grad_norm": 0.27694976329803467, + "learning_rate": 4.387839530460315e-05, + "loss": 0.1879, + "step": 17089 + }, + { + "epoch": 0.30481932008704027, + "grad_norm": 0.3590095043182373, + "learning_rate": 4.387737487085175e-05, + "loss": 0.1256, + "step": 17090 + }, + { + "epoch": 0.30483715620875396, + "grad_norm": 0.2435324341058731, + "learning_rate": 4.3876354363925124e-05, + "loss": 0.174, + "step": 17091 + }, + { + "epoch": 0.30485499233046764, + "grad_norm": 0.21426516771316528, + "learning_rate": 4.387533378382723e-05, + "loss": 0.1253, + "step": 17092 + }, + { + "epoch": 0.30487282845218133, + "grad_norm": 0.2532746493816376, + "learning_rate": 4.3874313130562014e-05, + "loss": 0.2078, + "step": 17093 + }, + { + "epoch": 0.3048906645738951, + "grad_norm": 0.28635111451148987, + "learning_rate": 4.3873292404133457e-05, + "loss": 0.1921, + "step": 17094 + }, + { + "epoch": 0.30490850069560876, + "grad_norm": 0.2923727333545685, + "learning_rate": 4.387227160454549e-05, + "loss": 0.1847, + "step": 17095 + }, + { + "epoch": 0.30492633681732245, + "grad_norm": 0.21463806927204132, + "learning_rate": 4.387125073180208e-05, + "loss": 0.2156, + "step": 17096 + }, + { + "epoch": 0.30494417293903614, + "grad_norm": 0.22355765104293823, + "learning_rate": 4.387022978590719e-05, + "loss": 0.1733, + "step": 17097 + }, + { + "epoch": 0.30496200906074983, + "grad_norm": 0.28531867265701294, + "learning_rate": 4.386920876686478e-05, + "loss": 0.1947, + "step": 17098 + }, + { + "epoch": 0.3049798451824635, + "grad_norm": 0.2557225525379181, + "learning_rate": 4.3868187674678784e-05, + "loss": 0.1881, + "step": 17099 + }, + { + "epoch": 0.3049976813041772, + "grad_norm": 0.2729876637458801, + "learning_rate": 4.386716650935318e-05, + "loss": 0.1497, + "step": 17100 + }, + { + "epoch": 0.3050155174258909, + "grad_norm": 0.3143730163574219, + "learning_rate": 4.3866145270891924e-05, + "loss": 0.1619, + "step": 17101 + }, + { + "epoch": 0.30503335354760464, + "grad_norm": 0.30513739585876465, + "learning_rate": 4.386512395929897e-05, + "loss": 0.1733, + "step": 17102 + }, + { + "epoch": 0.3050511896693183, + "grad_norm": 0.2931326925754547, + "learning_rate": 4.386410257457828e-05, + "loss": 0.2307, + "step": 17103 + }, + { + "epoch": 0.305069025791032, + "grad_norm": 0.28533676266670227, + "learning_rate": 4.386308111673382e-05, + "loss": 0.1861, + "step": 17104 + }, + { + "epoch": 0.3050868619127457, + "grad_norm": 0.2460443526506424, + "learning_rate": 4.3862059585769534e-05, + "loss": 0.1598, + "step": 17105 + }, + { + "epoch": 0.3051046980344594, + "grad_norm": 0.3634937107563019, + "learning_rate": 4.386103798168939e-05, + "loss": 0.1424, + "step": 17106 + }, + { + "epoch": 0.3051225341561731, + "grad_norm": 0.28374966979026794, + "learning_rate": 4.3860016304497354e-05, + "loss": 0.1848, + "step": 17107 + }, + { + "epoch": 0.30514037027788676, + "grad_norm": 0.2175067961215973, + "learning_rate": 4.385899455419738e-05, + "loss": 0.1523, + "step": 17108 + }, + { + "epoch": 0.30515820639960045, + "grad_norm": 0.24766366183757782, + "learning_rate": 4.3857972730793426e-05, + "loss": 0.2065, + "step": 17109 + }, + { + "epoch": 0.30517604252131414, + "grad_norm": 0.2719930112361908, + "learning_rate": 4.385695083428946e-05, + "loss": 0.1626, + "step": 17110 + }, + { + "epoch": 0.3051938786430279, + "grad_norm": 0.27074936032295227, + "learning_rate": 4.3855928864689435e-05, + "loss": 0.1483, + "step": 17111 + }, + { + "epoch": 0.30521171476474157, + "grad_norm": 0.2806743383407593, + "learning_rate": 4.385490682199732e-05, + "loss": 0.1627, + "step": 17112 + }, + { + "epoch": 0.30522955088645526, + "grad_norm": 0.28043878078460693, + "learning_rate": 4.3853884706217074e-05, + "loss": 0.1478, + "step": 17113 + }, + { + "epoch": 0.30524738700816895, + "grad_norm": 0.2837179899215698, + "learning_rate": 4.385286251735266e-05, + "loss": 0.1718, + "step": 17114 + }, + { + "epoch": 0.30526522312988263, + "grad_norm": 0.2241058051586151, + "learning_rate": 4.385184025540804e-05, + "loss": 0.1681, + "step": 17115 + }, + { + "epoch": 0.3052830592515963, + "grad_norm": 0.24122250080108643, + "learning_rate": 4.385081792038717e-05, + "loss": 0.1488, + "step": 17116 + }, + { + "epoch": 0.30530089537331, + "grad_norm": 0.3232344686985016, + "learning_rate": 4.3849795512294025e-05, + "loss": 0.1572, + "step": 17117 + }, + { + "epoch": 0.3053187314950237, + "grad_norm": 0.24130931496620178, + "learning_rate": 4.384877303113256e-05, + "loss": 0.154, + "step": 17118 + }, + { + "epoch": 0.30533656761673744, + "grad_norm": 0.21955129504203796, + "learning_rate": 4.384775047690674e-05, + "loss": 0.1614, + "step": 17119 + }, + { + "epoch": 0.30535440373845113, + "grad_norm": 0.2242218255996704, + "learning_rate": 4.3846727849620527e-05, + "loss": 0.202, + "step": 17120 + }, + { + "epoch": 0.3053722398601648, + "grad_norm": 0.3091730773448944, + "learning_rate": 4.3845705149277895e-05, + "loss": 0.2094, + "step": 17121 + }, + { + "epoch": 0.3053900759818785, + "grad_norm": 0.22022242844104767, + "learning_rate": 4.38446823758828e-05, + "loss": 0.1751, + "step": 17122 + }, + { + "epoch": 0.3054079121035922, + "grad_norm": 0.24822382628917694, + "learning_rate": 4.3843659529439193e-05, + "loss": 0.1522, + "step": 17123 + }, + { + "epoch": 0.3054257482253059, + "grad_norm": 0.2767241597175598, + "learning_rate": 4.384263660995107e-05, + "loss": 0.2572, + "step": 17124 + }, + { + "epoch": 0.30544358434701957, + "grad_norm": 0.21422463655471802, + "learning_rate": 4.384161361742237e-05, + "loss": 0.1706, + "step": 17125 + }, + { + "epoch": 0.30546142046873326, + "grad_norm": 0.2786320447921753, + "learning_rate": 4.384059055185708e-05, + "loss": 0.1848, + "step": 17126 + }, + { + "epoch": 0.30547925659044695, + "grad_norm": 0.30014023184776306, + "learning_rate": 4.383956741325914e-05, + "loss": 0.2029, + "step": 17127 + }, + { + "epoch": 0.3054970927121607, + "grad_norm": 0.22989259660243988, + "learning_rate": 4.383854420163253e-05, + "loss": 0.1599, + "step": 17128 + }, + { + "epoch": 0.3055149288338744, + "grad_norm": 0.2684986889362335, + "learning_rate": 4.383752091698122e-05, + "loss": 0.1779, + "step": 17129 + }, + { + "epoch": 0.30553276495558807, + "grad_norm": 0.2732818126678467, + "learning_rate": 4.3836497559309175e-05, + "loss": 0.1662, + "step": 17130 + }, + { + "epoch": 0.30555060107730175, + "grad_norm": 0.3699675500392914, + "learning_rate": 4.383547412862036e-05, + "loss": 0.2001, + "step": 17131 + }, + { + "epoch": 0.30556843719901544, + "grad_norm": 0.22524546086788177, + "learning_rate": 4.3834450624918735e-05, + "loss": 0.2368, + "step": 17132 + }, + { + "epoch": 0.30558627332072913, + "grad_norm": 0.2979113459587097, + "learning_rate": 4.3833427048208284e-05, + "loss": 0.2163, + "step": 17133 + }, + { + "epoch": 0.3056041094424428, + "grad_norm": 0.3119056820869446, + "learning_rate": 4.383240339849296e-05, + "loss": 0.1366, + "step": 17134 + }, + { + "epoch": 0.3056219455641565, + "grad_norm": 0.34714776277542114, + "learning_rate": 4.383137967577673e-05, + "loss": 0.1378, + "step": 17135 + }, + { + "epoch": 0.30563978168587025, + "grad_norm": 0.33409959077835083, + "learning_rate": 4.3830355880063576e-05, + "loss": 0.2002, + "step": 17136 + }, + { + "epoch": 0.30565761780758394, + "grad_norm": 0.22078469395637512, + "learning_rate": 4.3829332011357456e-05, + "loss": 0.171, + "step": 17137 + }, + { + "epoch": 0.3056754539292976, + "grad_norm": 0.27064353227615356, + "learning_rate": 4.382830806966234e-05, + "loss": 0.1886, + "step": 17138 + }, + { + "epoch": 0.3056932900510113, + "grad_norm": 0.2435254603624344, + "learning_rate": 4.38272840549822e-05, + "loss": 0.2141, + "step": 17139 + }, + { + "epoch": 0.305711126172725, + "grad_norm": 0.23598533868789673, + "learning_rate": 4.3826259967321e-05, + "loss": 0.1696, + "step": 17140 + }, + { + "epoch": 0.3057289622944387, + "grad_norm": 0.23116660118103027, + "learning_rate": 4.382523580668273e-05, + "loss": 0.1612, + "step": 17141 + }, + { + "epoch": 0.3057467984161524, + "grad_norm": 0.32354721426963806, + "learning_rate": 4.3824211573071324e-05, + "loss": 0.2126, + "step": 17142 + }, + { + "epoch": 0.30576463453786606, + "grad_norm": 0.2540068030357361, + "learning_rate": 4.3823187266490784e-05, + "loss": 0.1455, + "step": 17143 + }, + { + "epoch": 0.3057824706595798, + "grad_norm": 0.2564283609390259, + "learning_rate": 4.382216288694507e-05, + "loss": 0.1593, + "step": 17144 + }, + { + "epoch": 0.3058003067812935, + "grad_norm": 0.2832253873348236, + "learning_rate": 4.382113843443815e-05, + "loss": 0.1416, + "step": 17145 + }, + { + "epoch": 0.3058181429030072, + "grad_norm": 0.2201293259859085, + "learning_rate": 4.382011390897399e-05, + "loss": 0.1622, + "step": 17146 + }, + { + "epoch": 0.30583597902472087, + "grad_norm": 0.290938138961792, + "learning_rate": 4.381908931055657e-05, + "loss": 0.1608, + "step": 17147 + }, + { + "epoch": 0.30585381514643456, + "grad_norm": 0.2617526352405548, + "learning_rate": 4.381806463918987e-05, + "loss": 0.1552, + "step": 17148 + }, + { + "epoch": 0.30587165126814825, + "grad_norm": 0.2418537139892578, + "learning_rate": 4.3817039894877845e-05, + "loss": 0.1533, + "step": 17149 + }, + { + "epoch": 0.30588948738986194, + "grad_norm": 0.3873763680458069, + "learning_rate": 4.3816015077624474e-05, + "loss": 0.2121, + "step": 17150 + }, + { + "epoch": 0.3059073235115756, + "grad_norm": 0.242770254611969, + "learning_rate": 4.3814990187433726e-05, + "loss": 0.2069, + "step": 17151 + }, + { + "epoch": 0.3059251596332893, + "grad_norm": 0.25278064608573914, + "learning_rate": 4.3813965224309586e-05, + "loss": 0.2042, + "step": 17152 + }, + { + "epoch": 0.30594299575500306, + "grad_norm": 0.3051237463951111, + "learning_rate": 4.381294018825601e-05, + "loss": 0.1854, + "step": 17153 + }, + { + "epoch": 0.30596083187671674, + "grad_norm": 0.40557852387428284, + "learning_rate": 4.3811915079276986e-05, + "loss": 0.1503, + "step": 17154 + }, + { + "epoch": 0.30597866799843043, + "grad_norm": 0.2661967873573303, + "learning_rate": 4.381088989737649e-05, + "loss": 0.1682, + "step": 17155 + }, + { + "epoch": 0.3059965041201441, + "grad_norm": 0.19283799827098846, + "learning_rate": 4.3809864642558466e-05, + "loss": 0.1478, + "step": 17156 + }, + { + "epoch": 0.3060143402418578, + "grad_norm": 0.30515480041503906, + "learning_rate": 4.380883931482692e-05, + "loss": 0.188, + "step": 17157 + }, + { + "epoch": 0.3060321763635715, + "grad_norm": 0.3168274164199829, + "learning_rate": 4.380781391418582e-05, + "loss": 0.1695, + "step": 17158 + }, + { + "epoch": 0.3060500124852852, + "grad_norm": 0.2728056311607361, + "learning_rate": 4.380678844063913e-05, + "loss": 0.1714, + "step": 17159 + }, + { + "epoch": 0.30606784860699887, + "grad_norm": 0.3205297291278839, + "learning_rate": 4.3805762894190845e-05, + "loss": 0.1787, + "step": 17160 + }, + { + "epoch": 0.3060856847287126, + "grad_norm": 0.20630665123462677, + "learning_rate": 4.380473727484492e-05, + "loss": 0.1685, + "step": 17161 + }, + { + "epoch": 0.3061035208504263, + "grad_norm": 0.26877710223197937, + "learning_rate": 4.380371158260533e-05, + "loss": 0.2076, + "step": 17162 + }, + { + "epoch": 0.30612135697214, + "grad_norm": 0.2504344582557678, + "learning_rate": 4.380268581747608e-05, + "loss": 0.2159, + "step": 17163 + }, + { + "epoch": 0.3061391930938537, + "grad_norm": 0.2371448129415512, + "learning_rate": 4.3801659979461106e-05, + "loss": 0.168, + "step": 17164 + }, + { + "epoch": 0.30615702921556737, + "grad_norm": 0.2899780571460724, + "learning_rate": 4.380063406856441e-05, + "loss": 0.1826, + "step": 17165 + }, + { + "epoch": 0.30617486533728105, + "grad_norm": 0.2562422752380371, + "learning_rate": 4.3799608084789965e-05, + "loss": 0.1702, + "step": 17166 + }, + { + "epoch": 0.30619270145899474, + "grad_norm": 0.19431371986865997, + "learning_rate": 4.379858202814174e-05, + "loss": 0.1566, + "step": 17167 + }, + { + "epoch": 0.30621053758070843, + "grad_norm": 0.32905110716819763, + "learning_rate": 4.379755589862373e-05, + "loss": 0.2069, + "step": 17168 + }, + { + "epoch": 0.3062283737024221, + "grad_norm": 0.233099102973938, + "learning_rate": 4.37965296962399e-05, + "loss": 0.1482, + "step": 17169 + }, + { + "epoch": 0.30624620982413586, + "grad_norm": 0.253312349319458, + "learning_rate": 4.379550342099422e-05, + "loss": 0.1815, + "step": 17170 + }, + { + "epoch": 0.30626404594584955, + "grad_norm": 0.34701377153396606, + "learning_rate": 4.3794477072890674e-05, + "loss": 0.1758, + "step": 17171 + }, + { + "epoch": 0.30628188206756324, + "grad_norm": 0.29600778222084045, + "learning_rate": 4.3793450651933257e-05, + "loss": 0.139, + "step": 17172 + }, + { + "epoch": 0.3062997181892769, + "grad_norm": 0.35207584500312805, + "learning_rate": 4.379242415812592e-05, + "loss": 0.1272, + "step": 17173 + }, + { + "epoch": 0.3063175543109906, + "grad_norm": 0.3003990948200226, + "learning_rate": 4.3791397591472666e-05, + "loss": 0.1526, + "step": 17174 + }, + { + "epoch": 0.3063353904327043, + "grad_norm": 0.29370614886283875, + "learning_rate": 4.379037095197746e-05, + "loss": 0.169, + "step": 17175 + }, + { + "epoch": 0.306353226554418, + "grad_norm": 0.2670826315879822, + "learning_rate": 4.3789344239644294e-05, + "loss": 0.1456, + "step": 17176 + }, + { + "epoch": 0.3063710626761317, + "grad_norm": 0.36985883116722107, + "learning_rate": 4.378831745447713e-05, + "loss": 0.1643, + "step": 17177 + }, + { + "epoch": 0.3063888987978454, + "grad_norm": 0.29399147629737854, + "learning_rate": 4.378729059647996e-05, + "loss": 0.1661, + "step": 17178 + }, + { + "epoch": 0.3064067349195591, + "grad_norm": 0.431037575006485, + "learning_rate": 4.378626366565677e-05, + "loss": 0.1877, + "step": 17179 + }, + { + "epoch": 0.3064245710412728, + "grad_norm": 0.23802313208580017, + "learning_rate": 4.378523666201152e-05, + "loss": 0.142, + "step": 17180 + }, + { + "epoch": 0.3064424071629865, + "grad_norm": 0.25053784251213074, + "learning_rate": 4.3784209585548216e-05, + "loss": 0.1811, + "step": 17181 + }, + { + "epoch": 0.3064602432847002, + "grad_norm": 0.2363741248846054, + "learning_rate": 4.378318243627083e-05, + "loss": 0.1885, + "step": 17182 + }, + { + "epoch": 0.30647807940641386, + "grad_norm": 0.2585192322731018, + "learning_rate": 4.378215521418333e-05, + "loss": 0.1299, + "step": 17183 + }, + { + "epoch": 0.30649591552812755, + "grad_norm": 0.30256012082099915, + "learning_rate": 4.378112791928972e-05, + "loss": 0.1838, + "step": 17184 + }, + { + "epoch": 0.30651375164984124, + "grad_norm": 0.27657467126846313, + "learning_rate": 4.378010055159396e-05, + "loss": 0.2155, + "step": 17185 + }, + { + "epoch": 0.306531587771555, + "grad_norm": 0.4010425806045532, + "learning_rate": 4.3779073111100055e-05, + "loss": 0.2099, + "step": 17186 + }, + { + "epoch": 0.30654942389326867, + "grad_norm": 0.2496582418680191, + "learning_rate": 4.3778045597811964e-05, + "loss": 0.1639, + "step": 17187 + }, + { + "epoch": 0.30656726001498236, + "grad_norm": 0.30885085463523865, + "learning_rate": 4.37770180117337e-05, + "loss": 0.1791, + "step": 17188 + }, + { + "epoch": 0.30658509613669604, + "grad_norm": 0.26914405822753906, + "learning_rate": 4.377599035286921e-05, + "loss": 0.1732, + "step": 17189 + }, + { + "epoch": 0.30660293225840973, + "grad_norm": 0.26386067271232605, + "learning_rate": 4.377496262122251e-05, + "loss": 0.1529, + "step": 17190 + }, + { + "epoch": 0.3066207683801234, + "grad_norm": 0.20294250547885895, + "learning_rate": 4.377393481679757e-05, + "loss": 0.1964, + "step": 17191 + }, + { + "epoch": 0.3066386045018371, + "grad_norm": 0.2744026184082031, + "learning_rate": 4.3772906939598367e-05, + "loss": 0.1844, + "step": 17192 + }, + { + "epoch": 0.3066564406235508, + "grad_norm": 0.23701460659503937, + "learning_rate": 4.37718789896289e-05, + "loss": 0.1898, + "step": 17193 + }, + { + "epoch": 0.3066742767452645, + "grad_norm": 0.2524645924568176, + "learning_rate": 4.377085096689314e-05, + "loss": 0.1885, + "step": 17194 + }, + { + "epoch": 0.30669211286697823, + "grad_norm": 0.2189350575208664, + "learning_rate": 4.376982287139508e-05, + "loss": 0.1358, + "step": 17195 + }, + { + "epoch": 0.3067099489886919, + "grad_norm": 0.32083654403686523, + "learning_rate": 4.37687947031387e-05, + "loss": 0.1954, + "step": 17196 + }, + { + "epoch": 0.3067277851104056, + "grad_norm": 0.21298474073410034, + "learning_rate": 4.3767766462128e-05, + "loss": 0.1522, + "step": 17197 + }, + { + "epoch": 0.3067456212321193, + "grad_norm": 0.2039504051208496, + "learning_rate": 4.376673814836695e-05, + "loss": 0.1351, + "step": 17198 + }, + { + "epoch": 0.306763457353833, + "grad_norm": 0.28641849756240845, + "learning_rate": 4.3765709761859534e-05, + "loss": 0.1752, + "step": 17199 + }, + { + "epoch": 0.30678129347554667, + "grad_norm": 0.3778807818889618, + "learning_rate": 4.376468130260976e-05, + "loss": 0.1751, + "step": 17200 + }, + { + "epoch": 0.30679912959726036, + "grad_norm": 0.22659938037395477, + "learning_rate": 4.376365277062159e-05, + "loss": 0.1534, + "step": 17201 + }, + { + "epoch": 0.30681696571897404, + "grad_norm": 0.25706738233566284, + "learning_rate": 4.376262416589902e-05, + "loss": 0.1812, + "step": 17202 + }, + { + "epoch": 0.3068348018406878, + "grad_norm": 0.3632875680923462, + "learning_rate": 4.376159548844604e-05, + "loss": 0.2162, + "step": 17203 + }, + { + "epoch": 0.3068526379624015, + "grad_norm": 0.2262653112411499, + "learning_rate": 4.3760566738266635e-05, + "loss": 0.1542, + "step": 17204 + }, + { + "epoch": 0.30687047408411516, + "grad_norm": 0.4392228424549103, + "learning_rate": 4.37595379153648e-05, + "loss": 0.1254, + "step": 17205 + }, + { + "epoch": 0.30688831020582885, + "grad_norm": 0.26066115498542786, + "learning_rate": 4.375850901974451e-05, + "loss": 0.1238, + "step": 17206 + }, + { + "epoch": 0.30690614632754254, + "grad_norm": 0.29774630069732666, + "learning_rate": 4.375748005140976e-05, + "loss": 0.1976, + "step": 17207 + }, + { + "epoch": 0.3069239824492562, + "grad_norm": 0.2691112458705902, + "learning_rate": 4.375645101036454e-05, + "loss": 0.1854, + "step": 17208 + }, + { + "epoch": 0.3069418185709699, + "grad_norm": 0.426113098859787, + "learning_rate": 4.375542189661284e-05, + "loss": 0.1784, + "step": 17209 + }, + { + "epoch": 0.3069596546926836, + "grad_norm": 0.2552269697189331, + "learning_rate": 4.3754392710158644e-05, + "loss": 0.1699, + "step": 17210 + }, + { + "epoch": 0.3069774908143973, + "grad_norm": 0.26327285170555115, + "learning_rate": 4.3753363451005944e-05, + "loss": 0.1784, + "step": 17211 + }, + { + "epoch": 0.30699532693611103, + "grad_norm": 0.2895050346851349, + "learning_rate": 4.3752334119158736e-05, + "loss": 0.1336, + "step": 17212 + }, + { + "epoch": 0.3070131630578247, + "grad_norm": 0.20796377956867218, + "learning_rate": 4.3751304714621e-05, + "loss": 0.1743, + "step": 17213 + }, + { + "epoch": 0.3070309991795384, + "grad_norm": 0.7539920806884766, + "learning_rate": 4.375027523739672e-05, + "loss": 0.1892, + "step": 17214 + }, + { + "epoch": 0.3070488353012521, + "grad_norm": 0.2287214696407318, + "learning_rate": 4.3749245687489915e-05, + "loss": 0.2039, + "step": 17215 + }, + { + "epoch": 0.3070666714229658, + "grad_norm": 0.21478188037872314, + "learning_rate": 4.374821606490454e-05, + "loss": 0.1654, + "step": 17216 + }, + { + "epoch": 0.3070845075446795, + "grad_norm": 0.23590540885925293, + "learning_rate": 4.3747186369644624e-05, + "loss": 0.166, + "step": 17217 + }, + { + "epoch": 0.30710234366639316, + "grad_norm": 0.2834855318069458, + "learning_rate": 4.374615660171413e-05, + "loss": 0.1892, + "step": 17218 + }, + { + "epoch": 0.30712017978810685, + "grad_norm": 0.3188692629337311, + "learning_rate": 4.3745126761117054e-05, + "loss": 0.1828, + "step": 17219 + }, + { + "epoch": 0.3071380159098206, + "grad_norm": 0.2887386381626129, + "learning_rate": 4.37440968478574e-05, + "loss": 0.1477, + "step": 17220 + }, + { + "epoch": 0.3071558520315343, + "grad_norm": 0.2905057370662689, + "learning_rate": 4.374306686193914e-05, + "loss": 0.1556, + "step": 17221 + }, + { + "epoch": 0.30717368815324797, + "grad_norm": 0.28902292251586914, + "learning_rate": 4.374203680336629e-05, + "loss": 0.1583, + "step": 17222 + }, + { + "epoch": 0.30719152427496166, + "grad_norm": 0.31475913524627686, + "learning_rate": 4.374100667214283e-05, + "loss": 0.193, + "step": 17223 + }, + { + "epoch": 0.30720936039667535, + "grad_norm": 0.3393841087818146, + "learning_rate": 4.373997646827276e-05, + "loss": 0.1628, + "step": 17224 + }, + { + "epoch": 0.30722719651838903, + "grad_norm": 0.3062064051628113, + "learning_rate": 4.3738946191760055e-05, + "loss": 0.1806, + "step": 17225 + }, + { + "epoch": 0.3072450326401027, + "grad_norm": 0.2175796627998352, + "learning_rate": 4.373791584260873e-05, + "loss": 0.1511, + "step": 17226 + }, + { + "epoch": 0.3072628687618164, + "grad_norm": 0.22644783556461334, + "learning_rate": 4.373688542082278e-05, + "loss": 0.1762, + "step": 17227 + }, + { + "epoch": 0.3072807048835301, + "grad_norm": 0.26100754737854004, + "learning_rate": 4.373585492640618e-05, + "loss": 0.2004, + "step": 17228 + }, + { + "epoch": 0.30729854100524384, + "grad_norm": 0.24516786634922028, + "learning_rate": 4.3734824359362936e-05, + "loss": 0.1916, + "step": 17229 + }, + { + "epoch": 0.30731637712695753, + "grad_norm": 0.23080916702747345, + "learning_rate": 4.3733793719697047e-05, + "loss": 0.1986, + "step": 17230 + }, + { + "epoch": 0.3073342132486712, + "grad_norm": 0.24642398953437805, + "learning_rate": 4.3732763007412495e-05, + "loss": 0.1859, + "step": 17231 + }, + { + "epoch": 0.3073520493703849, + "grad_norm": 0.3539651930332184, + "learning_rate": 4.3731732222513286e-05, + "loss": 0.1669, + "step": 17232 + }, + { + "epoch": 0.3073698854920986, + "grad_norm": 0.26970309019088745, + "learning_rate": 4.3730701365003425e-05, + "loss": 0.2197, + "step": 17233 + }, + { + "epoch": 0.3073877216138123, + "grad_norm": 0.28341996669769287, + "learning_rate": 4.372967043488688e-05, + "loss": 0.1459, + "step": 17234 + }, + { + "epoch": 0.30740555773552597, + "grad_norm": 0.26668596267700195, + "learning_rate": 4.3728639432167675e-05, + "loss": 0.1805, + "step": 17235 + }, + { + "epoch": 0.30742339385723966, + "grad_norm": 0.30018532276153564, + "learning_rate": 4.372760835684978e-05, + "loss": 0.179, + "step": 17236 + }, + { + "epoch": 0.3074412299789534, + "grad_norm": 0.2692541778087616, + "learning_rate": 4.372657720893722e-05, + "loss": 0.1888, + "step": 17237 + }, + { + "epoch": 0.3074590661006671, + "grad_norm": 0.24511288106441498, + "learning_rate": 4.3725545988433974e-05, + "loss": 0.2257, + "step": 17238 + }, + { + "epoch": 0.3074769022223808, + "grad_norm": 0.26978081464767456, + "learning_rate": 4.372451469534404e-05, + "loss": 0.1928, + "step": 17239 + }, + { + "epoch": 0.30749473834409446, + "grad_norm": 0.2563156187534332, + "learning_rate": 4.372348332967143e-05, + "loss": 0.182, + "step": 17240 + }, + { + "epoch": 0.30751257446580815, + "grad_norm": 0.33181315660476685, + "learning_rate": 4.372245189142012e-05, + "loss": 0.1829, + "step": 17241 + }, + { + "epoch": 0.30753041058752184, + "grad_norm": 0.21629559993743896, + "learning_rate": 4.3721420380594135e-05, + "loss": 0.1821, + "step": 17242 + }, + { + "epoch": 0.30754824670923553, + "grad_norm": 0.34097108244895935, + "learning_rate": 4.3720388797197455e-05, + "loss": 0.2506, + "step": 17243 + }, + { + "epoch": 0.3075660828309492, + "grad_norm": 0.24103644490242004, + "learning_rate": 4.371935714123407e-05, + "loss": 0.1105, + "step": 17244 + }, + { + "epoch": 0.30758391895266296, + "grad_norm": 0.3936353921890259, + "learning_rate": 4.3718325412708e-05, + "loss": 0.1796, + "step": 17245 + }, + { + "epoch": 0.30760175507437665, + "grad_norm": 0.2143954038619995, + "learning_rate": 4.3717293611623236e-05, + "loss": 0.1783, + "step": 17246 + }, + { + "epoch": 0.30761959119609034, + "grad_norm": 0.25006744265556335, + "learning_rate": 4.371626173798378e-05, + "loss": 0.159, + "step": 17247 + }, + { + "epoch": 0.307637427317804, + "grad_norm": 0.29980263113975525, + "learning_rate": 4.371522979179362e-05, + "loss": 0.1539, + "step": 17248 + }, + { + "epoch": 0.3076552634395177, + "grad_norm": 0.21040914952754974, + "learning_rate": 4.371419777305677e-05, + "loss": 0.2223, + "step": 17249 + }, + { + "epoch": 0.3076730995612314, + "grad_norm": 0.27245983481407166, + "learning_rate": 4.3713165681777224e-05, + "loss": 0.1717, + "step": 17250 + }, + { + "epoch": 0.3076909356829451, + "grad_norm": 0.24427540600299835, + "learning_rate": 4.371213351795899e-05, + "loss": 0.1641, + "step": 17251 + }, + { + "epoch": 0.3077087718046588, + "grad_norm": 0.2430589348077774, + "learning_rate": 4.371110128160606e-05, + "loss": 0.1879, + "step": 17252 + }, + { + "epoch": 0.30772660792637246, + "grad_norm": 0.2516360878944397, + "learning_rate": 4.371006897272244e-05, + "loss": 0.1809, + "step": 17253 + }, + { + "epoch": 0.3077444440480862, + "grad_norm": 0.3043888509273529, + "learning_rate": 4.3709036591312125e-05, + "loss": 0.18, + "step": 17254 + }, + { + "epoch": 0.3077622801697999, + "grad_norm": 0.28843557834625244, + "learning_rate": 4.370800413737912e-05, + "loss": 0.213, + "step": 17255 + }, + { + "epoch": 0.3077801162915136, + "grad_norm": 0.2079518437385559, + "learning_rate": 4.370697161092744e-05, + "loss": 0.1728, + "step": 17256 + }, + { + "epoch": 0.30779795241322727, + "grad_norm": 0.2885398864746094, + "learning_rate": 4.370593901196107e-05, + "loss": 0.1863, + "step": 17257 + }, + { + "epoch": 0.30781578853494096, + "grad_norm": 0.3651898503303528, + "learning_rate": 4.370490634048403e-05, + "loss": 0.1501, + "step": 17258 + }, + { + "epoch": 0.30783362465665465, + "grad_norm": 0.360970675945282, + "learning_rate": 4.37038735965003e-05, + "loss": 0.1497, + "step": 17259 + }, + { + "epoch": 0.30785146077836834, + "grad_norm": 0.22116002440452576, + "learning_rate": 4.37028407800139e-05, + "loss": 0.2013, + "step": 17260 + }, + { + "epoch": 0.307869296900082, + "grad_norm": 0.29022619128227234, + "learning_rate": 4.3701807891028836e-05, + "loss": 0.2063, + "step": 17261 + }, + { + "epoch": 0.30788713302179577, + "grad_norm": 0.22850602865219116, + "learning_rate": 4.370077492954909e-05, + "loss": 0.1719, + "step": 17262 + }, + { + "epoch": 0.30790496914350945, + "grad_norm": 0.2563099265098572, + "learning_rate": 4.369974189557869e-05, + "loss": 0.1956, + "step": 17263 + }, + { + "epoch": 0.30792280526522314, + "grad_norm": 0.30301570892333984, + "learning_rate": 4.3698708789121634e-05, + "loss": 0.2086, + "step": 17264 + }, + { + "epoch": 0.30794064138693683, + "grad_norm": 0.3367981016635895, + "learning_rate": 4.369767561018192e-05, + "loss": 0.2608, + "step": 17265 + }, + { + "epoch": 0.3079584775086505, + "grad_norm": 0.33726802468299866, + "learning_rate": 4.369664235876356e-05, + "loss": 0.1827, + "step": 17266 + }, + { + "epoch": 0.3079763136303642, + "grad_norm": 0.31587645411491394, + "learning_rate": 4.369560903487056e-05, + "loss": 0.2149, + "step": 17267 + }, + { + "epoch": 0.3079941497520779, + "grad_norm": 0.2821025848388672, + "learning_rate": 4.369457563850692e-05, + "loss": 0.1538, + "step": 17268 + }, + { + "epoch": 0.3080119858737916, + "grad_norm": 0.27898287773132324, + "learning_rate": 4.369354216967665e-05, + "loss": 0.214, + "step": 17269 + }, + { + "epoch": 0.30802982199550527, + "grad_norm": 0.24345724284648895, + "learning_rate": 4.369250862838374e-05, + "loss": 0.1739, + "step": 17270 + }, + { + "epoch": 0.308047658117219, + "grad_norm": 0.2954026758670807, + "learning_rate": 4.369147501463223e-05, + "loss": 0.1588, + "step": 17271 + }, + { + "epoch": 0.3080654942389327, + "grad_norm": 0.2502005994319916, + "learning_rate": 4.369044132842609e-05, + "loss": 0.1401, + "step": 17272 + }, + { + "epoch": 0.3080833303606464, + "grad_norm": 0.29373979568481445, + "learning_rate": 4.368940756976936e-05, + "loss": 0.1795, + "step": 17273 + }, + { + "epoch": 0.3081011664823601, + "grad_norm": 0.27883976697921753, + "learning_rate": 4.3688373738666025e-05, + "loss": 0.1644, + "step": 17274 + }, + { + "epoch": 0.30811900260407377, + "grad_norm": 0.23872879147529602, + "learning_rate": 4.368733983512009e-05, + "loss": 0.1419, + "step": 17275 + }, + { + "epoch": 0.30813683872578745, + "grad_norm": 0.32773467898368835, + "learning_rate": 4.368630585913558e-05, + "loss": 0.1985, + "step": 17276 + }, + { + "epoch": 0.30815467484750114, + "grad_norm": 0.252444863319397, + "learning_rate": 4.368527181071649e-05, + "loss": 0.1241, + "step": 17277 + }, + { + "epoch": 0.30817251096921483, + "grad_norm": 0.2558678090572357, + "learning_rate": 4.3684237689866837e-05, + "loss": 0.1577, + "step": 17278 + }, + { + "epoch": 0.3081903470909286, + "grad_norm": 0.2435806840658188, + "learning_rate": 4.3683203496590626e-05, + "loss": 0.1546, + "step": 17279 + }, + { + "epoch": 0.30820818321264226, + "grad_norm": 0.2824627161026001, + "learning_rate": 4.368216923089186e-05, + "loss": 0.1921, + "step": 17280 + }, + { + "epoch": 0.30822601933435595, + "grad_norm": 0.35193753242492676, + "learning_rate": 4.368113489277455e-05, + "loss": 0.2017, + "step": 17281 + }, + { + "epoch": 0.30824385545606964, + "grad_norm": 0.24292060732841492, + "learning_rate": 4.368010048224273e-05, + "loss": 0.2002, + "step": 17282 + }, + { + "epoch": 0.3082616915777833, + "grad_norm": 0.28214728832244873, + "learning_rate": 4.3679065999300365e-05, + "loss": 0.1605, + "step": 17283 + }, + { + "epoch": 0.308279527699497, + "grad_norm": 0.4588879942893982, + "learning_rate": 4.367803144395149e-05, + "loss": 0.2342, + "step": 17284 + }, + { + "epoch": 0.3082973638212107, + "grad_norm": 0.2713479697704315, + "learning_rate": 4.367699681620013e-05, + "loss": 0.2488, + "step": 17285 + }, + { + "epoch": 0.3083151999429244, + "grad_norm": 0.20883305370807648, + "learning_rate": 4.367596211605027e-05, + "loss": 0.197, + "step": 17286 + }, + { + "epoch": 0.3083330360646381, + "grad_norm": 0.22023865580558777, + "learning_rate": 4.3674927343505936e-05, + "loss": 0.1833, + "step": 17287 + }, + { + "epoch": 0.3083508721863518, + "grad_norm": 0.21465279161930084, + "learning_rate": 4.367389249857112e-05, + "loss": 0.1819, + "step": 17288 + }, + { + "epoch": 0.3083687083080655, + "grad_norm": 0.29342034459114075, + "learning_rate": 4.367285758124986e-05, + "loss": 0.1844, + "step": 17289 + }, + { + "epoch": 0.3083865444297792, + "grad_norm": 0.3028952181339264, + "learning_rate": 4.367182259154615e-05, + "loss": 0.2116, + "step": 17290 + }, + { + "epoch": 0.3084043805514929, + "grad_norm": 0.20493605732917786, + "learning_rate": 4.3670787529464005e-05, + "loss": 0.1623, + "step": 17291 + }, + { + "epoch": 0.3084222166732066, + "grad_norm": 0.22596469521522522, + "learning_rate": 4.366975239500745e-05, + "loss": 0.1513, + "step": 17292 + }, + { + "epoch": 0.30844005279492026, + "grad_norm": 0.2786181569099426, + "learning_rate": 4.3668717188180476e-05, + "loss": 0.1435, + "step": 17293 + }, + { + "epoch": 0.30845788891663395, + "grad_norm": 0.2698395252227783, + "learning_rate": 4.36676819089871e-05, + "loss": 0.1784, + "step": 17294 + }, + { + "epoch": 0.30847572503834764, + "grad_norm": 0.20768164098262787, + "learning_rate": 4.366664655743136e-05, + "loss": 0.1471, + "step": 17295 + }, + { + "epoch": 0.3084935611600614, + "grad_norm": 0.3134932518005371, + "learning_rate": 4.366561113351723e-05, + "loss": 0.1872, + "step": 17296 + }, + { + "epoch": 0.30851139728177507, + "grad_norm": 0.2926786243915558, + "learning_rate": 4.366457563724876e-05, + "loss": 0.1931, + "step": 17297 + }, + { + "epoch": 0.30852923340348876, + "grad_norm": 0.24032816290855408, + "learning_rate": 4.366354006862994e-05, + "loss": 0.1605, + "step": 17298 + }, + { + "epoch": 0.30854706952520244, + "grad_norm": 0.37049993872642517, + "learning_rate": 4.3662504427664796e-05, + "loss": 0.186, + "step": 17299 + }, + { + "epoch": 0.30856490564691613, + "grad_norm": 0.2395107001066208, + "learning_rate": 4.366146871435733e-05, + "loss": 0.1417, + "step": 17300 + }, + { + "epoch": 0.3085827417686298, + "grad_norm": 0.22518615424633026, + "learning_rate": 4.366043292871158e-05, + "loss": 0.1417, + "step": 17301 + }, + { + "epoch": 0.3086005778903435, + "grad_norm": 0.2506428062915802, + "learning_rate": 4.3659397070731536e-05, + "loss": 0.1312, + "step": 17302 + }, + { + "epoch": 0.3086184140120572, + "grad_norm": 0.2959437668323517, + "learning_rate": 4.365836114042123e-05, + "loss": 0.1882, + "step": 17303 + }, + { + "epoch": 0.30863625013377094, + "grad_norm": 0.269821435213089, + "learning_rate": 4.365732513778467e-05, + "loss": 0.1459, + "step": 17304 + }, + { + "epoch": 0.3086540862554846, + "grad_norm": 0.4256460964679718, + "learning_rate": 4.365628906282587e-05, + "loss": 0.2011, + "step": 17305 + }, + { + "epoch": 0.3086719223771983, + "grad_norm": 0.2815365493297577, + "learning_rate": 4.3655252915548864e-05, + "loss": 0.1517, + "step": 17306 + }, + { + "epoch": 0.308689758498912, + "grad_norm": 0.2836959660053253, + "learning_rate": 4.365421669595764e-05, + "loss": 0.1126, + "step": 17307 + }, + { + "epoch": 0.3087075946206257, + "grad_norm": 0.3010525405406952, + "learning_rate": 4.365318040405623e-05, + "loss": 0.1686, + "step": 17308 + }, + { + "epoch": 0.3087254307423394, + "grad_norm": 0.34020987153053284, + "learning_rate": 4.3652144039848654e-05, + "loss": 0.1472, + "step": 17309 + }, + { + "epoch": 0.30874326686405307, + "grad_norm": 0.3223326802253723, + "learning_rate": 4.3651107603338924e-05, + "loss": 0.2415, + "step": 17310 + }, + { + "epoch": 0.30876110298576676, + "grad_norm": 0.20760977268218994, + "learning_rate": 4.3650071094531064e-05, + "loss": 0.1644, + "step": 17311 + }, + { + "epoch": 0.30877893910748044, + "grad_norm": 0.22713865339756012, + "learning_rate": 4.364903451342908e-05, + "loss": 0.1622, + "step": 17312 + }, + { + "epoch": 0.3087967752291942, + "grad_norm": 0.31559303402900696, + "learning_rate": 4.3647997860037e-05, + "loss": 0.1898, + "step": 17313 + }, + { + "epoch": 0.3088146113509079, + "grad_norm": 0.25354933738708496, + "learning_rate": 4.3646961134358844e-05, + "loss": 0.1771, + "step": 17314 + }, + { + "epoch": 0.30883244747262156, + "grad_norm": 0.24059563875198364, + "learning_rate": 4.364592433639862e-05, + "loss": 0.1636, + "step": 17315 + }, + { + "epoch": 0.30885028359433525, + "grad_norm": 0.38591647148132324, + "learning_rate": 4.364488746616036e-05, + "loss": 0.2283, + "step": 17316 + }, + { + "epoch": 0.30886811971604894, + "grad_norm": 0.2507249712944031, + "learning_rate": 4.364385052364807e-05, + "loss": 0.1879, + "step": 17317 + }, + { + "epoch": 0.3088859558377626, + "grad_norm": 0.3062756061553955, + "learning_rate": 4.3642813508865774e-05, + "loss": 0.1718, + "step": 17318 + }, + { + "epoch": 0.3089037919594763, + "grad_norm": 0.31196069717407227, + "learning_rate": 4.3641776421817495e-05, + "loss": 0.2159, + "step": 17319 + }, + { + "epoch": 0.30892162808119, + "grad_norm": 0.3008164167404175, + "learning_rate": 4.364073926250726e-05, + "loss": 0.1425, + "step": 17320 + }, + { + "epoch": 0.30893946420290375, + "grad_norm": 0.26491355895996094, + "learning_rate": 4.3639702030939065e-05, + "loss": 0.1523, + "step": 17321 + }, + { + "epoch": 0.30895730032461743, + "grad_norm": 0.3596571683883667, + "learning_rate": 4.363866472711696e-05, + "loss": 0.1549, + "step": 17322 + }, + { + "epoch": 0.3089751364463311, + "grad_norm": 0.24343734979629517, + "learning_rate": 4.363762735104495e-05, + "loss": 0.1802, + "step": 17323 + }, + { + "epoch": 0.3089929725680448, + "grad_norm": 0.22927075624465942, + "learning_rate": 4.363658990272706e-05, + "loss": 0.1251, + "step": 17324 + }, + { + "epoch": 0.3090108086897585, + "grad_norm": 0.2397066354751587, + "learning_rate": 4.363555238216731e-05, + "loss": 0.1629, + "step": 17325 + }, + { + "epoch": 0.3090286448114722, + "grad_norm": 0.36761361360549927, + "learning_rate": 4.363451478936973e-05, + "loss": 0.1735, + "step": 17326 + }, + { + "epoch": 0.3090464809331859, + "grad_norm": 0.4123034179210663, + "learning_rate": 4.363347712433832e-05, + "loss": 0.187, + "step": 17327 + }, + { + "epoch": 0.30906431705489956, + "grad_norm": 0.31158921122550964, + "learning_rate": 4.363243938707713e-05, + "loss": 0.2051, + "step": 17328 + }, + { + "epoch": 0.30908215317661325, + "grad_norm": 0.26064300537109375, + "learning_rate": 4.363140157759016e-05, + "loss": 0.1735, + "step": 17329 + }, + { + "epoch": 0.309099989298327, + "grad_norm": 0.30597352981567383, + "learning_rate": 4.363036369588145e-05, + "loss": 0.1712, + "step": 17330 + }, + { + "epoch": 0.3091178254200407, + "grad_norm": 0.29409223794937134, + "learning_rate": 4.362932574195501e-05, + "loss": 0.238, + "step": 17331 + }, + { + "epoch": 0.30913566154175437, + "grad_norm": 0.333935022354126, + "learning_rate": 4.362828771581487e-05, + "loss": 0.1698, + "step": 17332 + }, + { + "epoch": 0.30915349766346806, + "grad_norm": 0.35833579301834106, + "learning_rate": 4.362724961746505e-05, + "loss": 0.1768, + "step": 17333 + }, + { + "epoch": 0.30917133378518175, + "grad_norm": 0.28956031799316406, + "learning_rate": 4.362621144690958e-05, + "loss": 0.1254, + "step": 17334 + }, + { + "epoch": 0.30918916990689543, + "grad_norm": 0.3301190733909607, + "learning_rate": 4.362517320415248e-05, + "loss": 0.1549, + "step": 17335 + }, + { + "epoch": 0.3092070060286091, + "grad_norm": 0.2762179672718048, + "learning_rate": 4.362413488919778e-05, + "loss": 0.1428, + "step": 17336 + }, + { + "epoch": 0.3092248421503228, + "grad_norm": 0.3325973451137543, + "learning_rate": 4.36230965020495e-05, + "loss": 0.1582, + "step": 17337 + }, + { + "epoch": 0.30924267827203655, + "grad_norm": 0.2459629476070404, + "learning_rate": 4.3622058042711666e-05, + "loss": 0.1622, + "step": 17338 + }, + { + "epoch": 0.30926051439375024, + "grad_norm": 0.3135835826396942, + "learning_rate": 4.36210195111883e-05, + "loss": 0.1808, + "step": 17339 + }, + { + "epoch": 0.30927835051546393, + "grad_norm": 0.3161329925060272, + "learning_rate": 4.361998090748342e-05, + "loss": 0.1572, + "step": 17340 + }, + { + "epoch": 0.3092961866371776, + "grad_norm": 0.20491014420986176, + "learning_rate": 4.3618942231601086e-05, + "loss": 0.1568, + "step": 17341 + }, + { + "epoch": 0.3093140227588913, + "grad_norm": 0.22547106444835663, + "learning_rate": 4.361790348354529e-05, + "loss": 0.16, + "step": 17342 + }, + { + "epoch": 0.309331858880605, + "grad_norm": 0.30976414680480957, + "learning_rate": 4.361686466332007e-05, + "loss": 0.2246, + "step": 17343 + }, + { + "epoch": 0.3093496950023187, + "grad_norm": 0.32153210043907166, + "learning_rate": 4.3615825770929454e-05, + "loss": 0.1568, + "step": 17344 + }, + { + "epoch": 0.30936753112403237, + "grad_norm": 0.25154581665992737, + "learning_rate": 4.361478680637746e-05, + "loss": 0.1949, + "step": 17345 + }, + { + "epoch": 0.3093853672457461, + "grad_norm": 0.26917651295661926, + "learning_rate": 4.361374776966813e-05, + "loss": 0.1809, + "step": 17346 + }, + { + "epoch": 0.3094032033674598, + "grad_norm": 0.27427390217781067, + "learning_rate": 4.361270866080548e-05, + "loss": 0.2064, + "step": 17347 + }, + { + "epoch": 0.3094210394891735, + "grad_norm": 0.25331488251686096, + "learning_rate": 4.361166947979355e-05, + "loss": 0.2109, + "step": 17348 + }, + { + "epoch": 0.3094388756108872, + "grad_norm": 0.42039036750793457, + "learning_rate": 4.361063022663635e-05, + "loss": 0.2599, + "step": 17349 + }, + { + "epoch": 0.30945671173260086, + "grad_norm": 0.2885431945323944, + "learning_rate": 4.3609590901337926e-05, + "loss": 0.0975, + "step": 17350 + }, + { + "epoch": 0.30947454785431455, + "grad_norm": 0.20504805445671082, + "learning_rate": 4.3608551503902306e-05, + "loss": 0.1643, + "step": 17351 + }, + { + "epoch": 0.30949238397602824, + "grad_norm": 0.26486822962760925, + "learning_rate": 4.36075120343335e-05, + "loss": 0.1607, + "step": 17352 + }, + { + "epoch": 0.30951022009774193, + "grad_norm": 0.2809804677963257, + "learning_rate": 4.360647249263556e-05, + "loss": 0.1803, + "step": 17353 + }, + { + "epoch": 0.3095280562194556, + "grad_norm": 0.2266402244567871, + "learning_rate": 4.36054328788125e-05, + "loss": 0.1397, + "step": 17354 + }, + { + "epoch": 0.30954589234116936, + "grad_norm": 0.24769458174705505, + "learning_rate": 4.3604393192868365e-05, + "loss": 0.1794, + "step": 17355 + }, + { + "epoch": 0.30956372846288305, + "grad_norm": 0.21029578149318695, + "learning_rate": 4.3603353434807174e-05, + "loss": 0.1573, + "step": 17356 + }, + { + "epoch": 0.30958156458459674, + "grad_norm": 0.22738248109817505, + "learning_rate": 4.360231360463295e-05, + "loss": 0.1322, + "step": 17357 + }, + { + "epoch": 0.3095994007063104, + "grad_norm": 0.2758842706680298, + "learning_rate": 4.3601273702349743e-05, + "loss": 0.1968, + "step": 17358 + }, + { + "epoch": 0.3096172368280241, + "grad_norm": 0.2754844129085541, + "learning_rate": 4.360023372796157e-05, + "loss": 0.1805, + "step": 17359 + }, + { + "epoch": 0.3096350729497378, + "grad_norm": 0.41801783442497253, + "learning_rate": 4.359919368147247e-05, + "loss": 0.1818, + "step": 17360 + }, + { + "epoch": 0.3096529090714515, + "grad_norm": 0.28461480140686035, + "learning_rate": 4.3598153562886465e-05, + "loss": 0.1524, + "step": 17361 + }, + { + "epoch": 0.3096707451931652, + "grad_norm": 0.16162221133708954, + "learning_rate": 4.359711337220761e-05, + "loss": 0.1228, + "step": 17362 + }, + { + "epoch": 0.3096885813148789, + "grad_norm": 0.2785652279853821, + "learning_rate": 4.35960731094399e-05, + "loss": 0.1512, + "step": 17363 + }, + { + "epoch": 0.3097064174365926, + "grad_norm": 0.19198106229305267, + "learning_rate": 4.35950327745874e-05, + "loss": 0.1469, + "step": 17364 + }, + { + "epoch": 0.3097242535583063, + "grad_norm": 0.3432587683200836, + "learning_rate": 4.359399236765412e-05, + "loss": 0.2441, + "step": 17365 + }, + { + "epoch": 0.30974208968002, + "grad_norm": 0.27107396721839905, + "learning_rate": 4.359295188864411e-05, + "loss": 0.1463, + "step": 17366 + }, + { + "epoch": 0.30975992580173367, + "grad_norm": 0.2346051186323166, + "learning_rate": 4.3591911337561395e-05, + "loss": 0.1431, + "step": 17367 + }, + { + "epoch": 0.30977776192344736, + "grad_norm": 0.29845526814460754, + "learning_rate": 4.359087071441002e-05, + "loss": 0.2149, + "step": 17368 + }, + { + "epoch": 0.30979559804516105, + "grad_norm": 0.27903053164482117, + "learning_rate": 4.3589830019194e-05, + "loss": 0.1745, + "step": 17369 + }, + { + "epoch": 0.30981343416687473, + "grad_norm": 0.27437329292297363, + "learning_rate": 4.358878925191737e-05, + "loss": 0.1697, + "step": 17370 + }, + { + "epoch": 0.3098312702885884, + "grad_norm": 0.22781068086624146, + "learning_rate": 4.3587748412584186e-05, + "loss": 0.1764, + "step": 17371 + }, + { + "epoch": 0.30984910641030217, + "grad_norm": 0.3300037682056427, + "learning_rate": 4.358670750119847e-05, + "loss": 0.1777, + "step": 17372 + }, + { + "epoch": 0.30986694253201585, + "grad_norm": 0.28571605682373047, + "learning_rate": 4.358566651776424e-05, + "loss": 0.1556, + "step": 17373 + }, + { + "epoch": 0.30988477865372954, + "grad_norm": 0.22507601976394653, + "learning_rate": 4.358462546228557e-05, + "loss": 0.172, + "step": 17374 + }, + { + "epoch": 0.30990261477544323, + "grad_norm": 0.29112255573272705, + "learning_rate": 4.358358433476646e-05, + "loss": 0.1888, + "step": 17375 + }, + { + "epoch": 0.3099204508971569, + "grad_norm": 0.3871815800666809, + "learning_rate": 4.358254313521095e-05, + "loss": 0.1649, + "step": 17376 + }, + { + "epoch": 0.3099382870188706, + "grad_norm": 0.3167188763618469, + "learning_rate": 4.3581501863623096e-05, + "loss": 0.202, + "step": 17377 + }, + { + "epoch": 0.3099561231405843, + "grad_norm": 0.19899222254753113, + "learning_rate": 4.358046052000693e-05, + "loss": 0.1685, + "step": 17378 + }, + { + "epoch": 0.309973959262298, + "grad_norm": 0.3238217830657959, + "learning_rate": 4.3579419104366463e-05, + "loss": 0.1283, + "step": 17379 + }, + { + "epoch": 0.3099917953840117, + "grad_norm": 0.24417483806610107, + "learning_rate": 4.357837761670576e-05, + "loss": 0.1574, + "step": 17380 + }, + { + "epoch": 0.3100096315057254, + "grad_norm": 0.20129568874835968, + "learning_rate": 4.357733605702885e-05, + "loss": 0.1268, + "step": 17381 + }, + { + "epoch": 0.3100274676274391, + "grad_norm": 0.25981321930885315, + "learning_rate": 4.357629442533977e-05, + "loss": 0.1876, + "step": 17382 + }, + { + "epoch": 0.3100453037491528, + "grad_norm": 0.2805124521255493, + "learning_rate": 4.357525272164255e-05, + "loss": 0.1063, + "step": 17383 + }, + { + "epoch": 0.3100631398708665, + "grad_norm": 0.33776137232780457, + "learning_rate": 4.357421094594124e-05, + "loss": 0.1321, + "step": 17384 + }, + { + "epoch": 0.31008097599258017, + "grad_norm": 0.20445755124092102, + "learning_rate": 4.357316909823988e-05, + "loss": 0.1703, + "step": 17385 + }, + { + "epoch": 0.31009881211429385, + "grad_norm": 0.21691961586475372, + "learning_rate": 4.3572127178542487e-05, + "loss": 0.1214, + "step": 17386 + }, + { + "epoch": 0.31011664823600754, + "grad_norm": 0.2529012858867645, + "learning_rate": 4.357108518685312e-05, + "loss": 0.1548, + "step": 17387 + }, + { + "epoch": 0.31013448435772123, + "grad_norm": 0.23972001671791077, + "learning_rate": 4.357004312317581e-05, + "loss": 0.1363, + "step": 17388 + }, + { + "epoch": 0.310152320479435, + "grad_norm": 0.3075212836265564, + "learning_rate": 4.356900098751461e-05, + "loss": 0.1396, + "step": 17389 + }, + { + "epoch": 0.31017015660114866, + "grad_norm": 0.27812814712524414, + "learning_rate": 4.356795877987354e-05, + "loss": 0.1708, + "step": 17390 + }, + { + "epoch": 0.31018799272286235, + "grad_norm": 0.2706345021724701, + "learning_rate": 4.356691650025665e-05, + "loss": 0.1799, + "step": 17391 + }, + { + "epoch": 0.31020582884457604, + "grad_norm": 0.3186160624027252, + "learning_rate": 4.356587414866798e-05, + "loss": 0.2028, + "step": 17392 + }, + { + "epoch": 0.3102236649662897, + "grad_norm": 0.27843767404556274, + "learning_rate": 4.3564831725111565e-05, + "loss": 0.1757, + "step": 17393 + }, + { + "epoch": 0.3102415010880034, + "grad_norm": 0.23253169655799866, + "learning_rate": 4.356378922959146e-05, + "loss": 0.1302, + "step": 17394 + }, + { + "epoch": 0.3102593372097171, + "grad_norm": 0.27413132786750793, + "learning_rate": 4.3562746662111684e-05, + "loss": 0.1621, + "step": 17395 + }, + { + "epoch": 0.3102771733314308, + "grad_norm": 0.3344685137271881, + "learning_rate": 4.3561704022676296e-05, + "loss": 0.173, + "step": 17396 + }, + { + "epoch": 0.31029500945314453, + "grad_norm": 0.2624615728855133, + "learning_rate": 4.356066131128933e-05, + "loss": 0.1745, + "step": 17397 + }, + { + "epoch": 0.3103128455748582, + "grad_norm": 0.2921142876148224, + "learning_rate": 4.3559618527954834e-05, + "loss": 0.1806, + "step": 17398 + }, + { + "epoch": 0.3103306816965719, + "grad_norm": 0.22078359127044678, + "learning_rate": 4.3558575672676844e-05, + "loss": 0.1887, + "step": 17399 + }, + { + "epoch": 0.3103485178182856, + "grad_norm": 0.2526251971721649, + "learning_rate": 4.3557532745459404e-05, + "loss": 0.1724, + "step": 17400 + }, + { + "epoch": 0.3103663539399993, + "grad_norm": 0.2232562154531479, + "learning_rate": 4.355648974630656e-05, + "loss": 0.1628, + "step": 17401 + }, + { + "epoch": 0.310384190061713, + "grad_norm": 0.3004521131515503, + "learning_rate": 4.355544667522235e-05, + "loss": 0.2032, + "step": 17402 + }, + { + "epoch": 0.31040202618342666, + "grad_norm": 0.2784850597381592, + "learning_rate": 4.355440353221082e-05, + "loss": 0.1416, + "step": 17403 + }, + { + "epoch": 0.31041986230514035, + "grad_norm": 0.39136961102485657, + "learning_rate": 4.355336031727602e-05, + "loss": 0.2049, + "step": 17404 + }, + { + "epoch": 0.3104376984268541, + "grad_norm": 0.24422284960746765, + "learning_rate": 4.355231703042198e-05, + "loss": 0.1931, + "step": 17405 + }, + { + "epoch": 0.3104555345485678, + "grad_norm": 0.3265067934989929, + "learning_rate": 4.355127367165275e-05, + "loss": 0.2096, + "step": 17406 + }, + { + "epoch": 0.31047337067028147, + "grad_norm": 0.2704797387123108, + "learning_rate": 4.355023024097238e-05, + "loss": 0.2306, + "step": 17407 + }, + { + "epoch": 0.31049120679199516, + "grad_norm": 0.6578344106674194, + "learning_rate": 4.3549186738384913e-05, + "loss": 0.1838, + "step": 17408 + }, + { + "epoch": 0.31050904291370884, + "grad_norm": 0.20448662340641022, + "learning_rate": 4.3548143163894385e-05, + "loss": 0.1397, + "step": 17409 + }, + { + "epoch": 0.31052687903542253, + "grad_norm": 0.3903447985649109, + "learning_rate": 4.3547099517504855e-05, + "loss": 0.1963, + "step": 17410 + }, + { + "epoch": 0.3105447151571362, + "grad_norm": 0.16988320648670197, + "learning_rate": 4.354605579922035e-05, + "loss": 0.1134, + "step": 17411 + }, + { + "epoch": 0.3105625512788499, + "grad_norm": 0.26739633083343506, + "learning_rate": 4.354501200904494e-05, + "loss": 0.1788, + "step": 17412 + }, + { + "epoch": 0.3105803874005636, + "grad_norm": 0.33215096592903137, + "learning_rate": 4.354396814698265e-05, + "loss": 0.218, + "step": 17413 + }, + { + "epoch": 0.31059822352227734, + "grad_norm": 0.2763465344905853, + "learning_rate": 4.354292421303754e-05, + "loss": 0.1544, + "step": 17414 + }, + { + "epoch": 0.310616059643991, + "grad_norm": 0.23078393936157227, + "learning_rate": 4.354188020721365e-05, + "loss": 0.0807, + "step": 17415 + }, + { + "epoch": 0.3106338957657047, + "grad_norm": 0.23900263011455536, + "learning_rate": 4.354083612951503e-05, + "loss": 0.1406, + "step": 17416 + }, + { + "epoch": 0.3106517318874184, + "grad_norm": 0.29303503036499023, + "learning_rate": 4.353979197994572e-05, + "loss": 0.136, + "step": 17417 + }, + { + "epoch": 0.3106695680091321, + "grad_norm": 0.4580296576023102, + "learning_rate": 4.353874775850977e-05, + "loss": 0.17, + "step": 17418 + }, + { + "epoch": 0.3106874041308458, + "grad_norm": 0.30153709650039673, + "learning_rate": 4.353770346521124e-05, + "loss": 0.1571, + "step": 17419 + }, + { + "epoch": 0.31070524025255947, + "grad_norm": 0.2793455719947815, + "learning_rate": 4.353665910005416e-05, + "loss": 0.1329, + "step": 17420 + }, + { + "epoch": 0.31072307637427315, + "grad_norm": 0.27351051568984985, + "learning_rate": 4.353561466304259e-05, + "loss": 0.1701, + "step": 17421 + }, + { + "epoch": 0.3107409124959869, + "grad_norm": 0.364711195230484, + "learning_rate": 4.3534570154180575e-05, + "loss": 0.1723, + "step": 17422 + }, + { + "epoch": 0.3107587486177006, + "grad_norm": 0.29340437054634094, + "learning_rate": 4.3533525573472165e-05, + "loss": 0.1945, + "step": 17423 + }, + { + "epoch": 0.3107765847394143, + "grad_norm": 0.2669852077960968, + "learning_rate": 4.3532480920921416e-05, + "loss": 0.1245, + "step": 17424 + }, + { + "epoch": 0.31079442086112796, + "grad_norm": 0.25489360094070435, + "learning_rate": 4.353143619653236e-05, + "loss": 0.1515, + "step": 17425 + }, + { + "epoch": 0.31081225698284165, + "grad_norm": 0.22249282896518707, + "learning_rate": 4.353039140030906e-05, + "loss": 0.1599, + "step": 17426 + }, + { + "epoch": 0.31083009310455534, + "grad_norm": 0.39028260111808777, + "learning_rate": 4.3529346532255564e-05, + "loss": 0.1624, + "step": 17427 + }, + { + "epoch": 0.310847929226269, + "grad_norm": 0.3274352252483368, + "learning_rate": 4.352830159237592e-05, + "loss": 0.1655, + "step": 17428 + }, + { + "epoch": 0.3108657653479827, + "grad_norm": 0.24809467792510986, + "learning_rate": 4.352725658067418e-05, + "loss": 0.1797, + "step": 17429 + }, + { + "epoch": 0.3108836014696964, + "grad_norm": 0.26507964730262756, + "learning_rate": 4.35262114971544e-05, + "loss": 0.1354, + "step": 17430 + }, + { + "epoch": 0.31090143759141015, + "grad_norm": 0.28164511919021606, + "learning_rate": 4.3525166341820615e-05, + "loss": 0.1574, + "step": 17431 + }, + { + "epoch": 0.31091927371312383, + "grad_norm": 0.22720032930374146, + "learning_rate": 4.3524121114676894e-05, + "loss": 0.1394, + "step": 17432 + }, + { + "epoch": 0.3109371098348375, + "grad_norm": 0.38103148341178894, + "learning_rate": 4.3523075815727275e-05, + "loss": 0.1851, + "step": 17433 + }, + { + "epoch": 0.3109549459565512, + "grad_norm": 0.2739613354206085, + "learning_rate": 4.3522030444975826e-05, + "loss": 0.1162, + "step": 17434 + }, + { + "epoch": 0.3109727820782649, + "grad_norm": 0.22264060378074646, + "learning_rate": 4.3520985002426585e-05, + "loss": 0.1542, + "step": 17435 + }, + { + "epoch": 0.3109906181999786, + "grad_norm": 0.300814688205719, + "learning_rate": 4.35199394880836e-05, + "loss": 0.2375, + "step": 17436 + }, + { + "epoch": 0.3110084543216923, + "grad_norm": 0.40395182371139526, + "learning_rate": 4.351889390195095e-05, + "loss": 0.1801, + "step": 17437 + }, + { + "epoch": 0.31102629044340596, + "grad_norm": 0.23144060373306274, + "learning_rate": 4.351784824403266e-05, + "loss": 0.1936, + "step": 17438 + }, + { + "epoch": 0.3110441265651197, + "grad_norm": 0.3169465959072113, + "learning_rate": 4.3516802514332794e-05, + "loss": 0.1873, + "step": 17439 + }, + { + "epoch": 0.3110619626868334, + "grad_norm": 0.25428691506385803, + "learning_rate": 4.351575671285541e-05, + "loss": 0.1489, + "step": 17440 + }, + { + "epoch": 0.3110797988085471, + "grad_norm": 0.31068018078804016, + "learning_rate": 4.3514710839604556e-05, + "loss": 0.1771, + "step": 17441 + }, + { + "epoch": 0.31109763493026077, + "grad_norm": 0.4185318648815155, + "learning_rate": 4.351366489458429e-05, + "loss": 0.282, + "step": 17442 + }, + { + "epoch": 0.31111547105197446, + "grad_norm": 0.31771883368492126, + "learning_rate": 4.351261887779866e-05, + "loss": 0.168, + "step": 17443 + }, + { + "epoch": 0.31113330717368815, + "grad_norm": 0.3612484037876129, + "learning_rate": 4.351157278925173e-05, + "loss": 0.2258, + "step": 17444 + }, + { + "epoch": 0.31115114329540183, + "grad_norm": 0.26935887336730957, + "learning_rate": 4.3510526628947544e-05, + "loss": 0.1536, + "step": 17445 + }, + { + "epoch": 0.3111689794171155, + "grad_norm": 0.21014304459095, + "learning_rate": 4.3509480396890175e-05, + "loss": 0.1443, + "step": 17446 + }, + { + "epoch": 0.31118681553882926, + "grad_norm": 0.33301955461502075, + "learning_rate": 4.350843409308366e-05, + "loss": 0.1771, + "step": 17447 + }, + { + "epoch": 0.31120465166054295, + "grad_norm": 0.26040083169937134, + "learning_rate": 4.350738771753206e-05, + "loss": 0.1623, + "step": 17448 + }, + { + "epoch": 0.31122248778225664, + "grad_norm": 0.23314322531223297, + "learning_rate": 4.350634127023944e-05, + "loss": 0.17, + "step": 17449 + }, + { + "epoch": 0.31124032390397033, + "grad_norm": 0.27478617429733276, + "learning_rate": 4.350529475120983e-05, + "loss": 0.1801, + "step": 17450 + }, + { + "epoch": 0.311258160025684, + "grad_norm": 0.22157971560955048, + "learning_rate": 4.3504248160447326e-05, + "loss": 0.1781, + "step": 17451 + }, + { + "epoch": 0.3112759961473977, + "grad_norm": 0.3089222311973572, + "learning_rate": 4.350320149795596e-05, + "loss": 0.1754, + "step": 17452 + }, + { + "epoch": 0.3112938322691114, + "grad_norm": 0.31412917375564575, + "learning_rate": 4.350215476373979e-05, + "loss": 0.1494, + "step": 17453 + }, + { + "epoch": 0.3113116683908251, + "grad_norm": 0.3011000156402588, + "learning_rate": 4.350110795780289e-05, + "loss": 0.239, + "step": 17454 + }, + { + "epoch": 0.31132950451253877, + "grad_norm": 0.2290801703929901, + "learning_rate": 4.350006108014929e-05, + "loss": 0.1547, + "step": 17455 + }, + { + "epoch": 0.3113473406342525, + "grad_norm": 0.20584842562675476, + "learning_rate": 4.349901413078307e-05, + "loss": 0.1444, + "step": 17456 + }, + { + "epoch": 0.3113651767559662, + "grad_norm": 0.2979958951473236, + "learning_rate": 4.349796710970828e-05, + "loss": 0.1964, + "step": 17457 + }, + { + "epoch": 0.3113830128776799, + "grad_norm": 0.23865488171577454, + "learning_rate": 4.3496920016928985e-05, + "loss": 0.19, + "step": 17458 + }, + { + "epoch": 0.3114008489993936, + "grad_norm": 0.2655833959579468, + "learning_rate": 4.3495872852449237e-05, + "loss": 0.1498, + "step": 17459 + }, + { + "epoch": 0.31141868512110726, + "grad_norm": 0.29966792464256287, + "learning_rate": 4.34948256162731e-05, + "loss": 0.2114, + "step": 17460 + }, + { + "epoch": 0.31143652124282095, + "grad_norm": 0.22978246212005615, + "learning_rate": 4.349377830840463e-05, + "loss": 0.1428, + "step": 17461 + }, + { + "epoch": 0.31145435736453464, + "grad_norm": 0.31427451968193054, + "learning_rate": 4.349273092884788e-05, + "loss": 0.164, + "step": 17462 + }, + { + "epoch": 0.3114721934862483, + "grad_norm": 0.2699473202228546, + "learning_rate": 4.349168347760692e-05, + "loss": 0.1706, + "step": 17463 + }, + { + "epoch": 0.31149002960796207, + "grad_norm": 0.22348356246948242, + "learning_rate": 4.349063595468582e-05, + "loss": 0.1734, + "step": 17464 + }, + { + "epoch": 0.31150786572967576, + "grad_norm": 0.31114187836647034, + "learning_rate": 4.348958836008862e-05, + "loss": 0.2079, + "step": 17465 + }, + { + "epoch": 0.31152570185138945, + "grad_norm": 0.26533836126327515, + "learning_rate": 4.348854069381939e-05, + "loss": 0.1868, + "step": 17466 + }, + { + "epoch": 0.31154353797310314, + "grad_norm": 0.25131261348724365, + "learning_rate": 4.348749295588219e-05, + "loss": 0.1315, + "step": 17467 + }, + { + "epoch": 0.3115613740948168, + "grad_norm": 0.3486991822719574, + "learning_rate": 4.348644514628108e-05, + "loss": 0.2017, + "step": 17468 + }, + { + "epoch": 0.3115792102165305, + "grad_norm": 0.2648819386959076, + "learning_rate": 4.348539726502012e-05, + "loss": 0.1758, + "step": 17469 + }, + { + "epoch": 0.3115970463382442, + "grad_norm": 0.2868684232234955, + "learning_rate": 4.348434931210339e-05, + "loss": 0.241, + "step": 17470 + }, + { + "epoch": 0.3116148824599579, + "grad_norm": 0.38698622584342957, + "learning_rate": 4.348330128753493e-05, + "loss": 0.2512, + "step": 17471 + }, + { + "epoch": 0.3116327185816716, + "grad_norm": 0.26002180576324463, + "learning_rate": 4.3482253191318803e-05, + "loss": 0.1401, + "step": 17472 + }, + { + "epoch": 0.3116505547033853, + "grad_norm": 0.2657237946987152, + "learning_rate": 4.3481205023459086e-05, + "loss": 0.137, + "step": 17473 + }, + { + "epoch": 0.311668390825099, + "grad_norm": 0.32502228021621704, + "learning_rate": 4.3480156783959835e-05, + "loss": 0.1943, + "step": 17474 + }, + { + "epoch": 0.3116862269468127, + "grad_norm": 0.20528516173362732, + "learning_rate": 4.347910847282511e-05, + "loss": 0.1575, + "step": 17475 + }, + { + "epoch": 0.3117040630685264, + "grad_norm": 0.2571054697036743, + "learning_rate": 4.3478060090058986e-05, + "loss": 0.141, + "step": 17476 + }, + { + "epoch": 0.31172189919024007, + "grad_norm": 0.3063930869102478, + "learning_rate": 4.347701163566551e-05, + "loss": 0.1138, + "step": 17477 + }, + { + "epoch": 0.31173973531195376, + "grad_norm": 0.25448077917099, + "learning_rate": 4.347596310964877e-05, + "loss": 0.1762, + "step": 17478 + }, + { + "epoch": 0.31175757143366745, + "grad_norm": 0.2913854122161865, + "learning_rate": 4.34749145120128e-05, + "loss": 0.1364, + "step": 17479 + }, + { + "epoch": 0.31177540755538113, + "grad_norm": 0.4470829963684082, + "learning_rate": 4.347386584276169e-05, + "loss": 0.143, + "step": 17480 + }, + { + "epoch": 0.3117932436770949, + "grad_norm": 0.27488455176353455, + "learning_rate": 4.347281710189948e-05, + "loss": 0.1593, + "step": 17481 + }, + { + "epoch": 0.31181107979880857, + "grad_norm": 0.2817961275577545, + "learning_rate": 4.347176828943026e-05, + "loss": 0.1863, + "step": 17482 + }, + { + "epoch": 0.31182891592052225, + "grad_norm": 0.23810303211212158, + "learning_rate": 4.3470719405358095e-05, + "loss": 0.1594, + "step": 17483 + }, + { + "epoch": 0.31184675204223594, + "grad_norm": 0.2999715805053711, + "learning_rate": 4.3469670449687026e-05, + "loss": 0.1703, + "step": 17484 + }, + { + "epoch": 0.31186458816394963, + "grad_norm": 0.23091301321983337, + "learning_rate": 4.3468621422421155e-05, + "loss": 0.165, + "step": 17485 + }, + { + "epoch": 0.3118824242856633, + "grad_norm": 0.2133583426475525, + "learning_rate": 4.346757232356451e-05, + "loss": 0.1373, + "step": 17486 + }, + { + "epoch": 0.311900260407377, + "grad_norm": 0.27956530451774597, + "learning_rate": 4.3466523153121186e-05, + "loss": 0.1996, + "step": 17487 + }, + { + "epoch": 0.3119180965290907, + "grad_norm": 0.22427357733249664, + "learning_rate": 4.3465473911095234e-05, + "loss": 0.1129, + "step": 17488 + }, + { + "epoch": 0.3119359326508044, + "grad_norm": 0.29926085472106934, + "learning_rate": 4.3464424597490735e-05, + "loss": 0.1381, + "step": 17489 + }, + { + "epoch": 0.3119537687725181, + "grad_norm": 0.32807281613349915, + "learning_rate": 4.346337521231174e-05, + "loss": 0.1662, + "step": 17490 + }, + { + "epoch": 0.3119716048942318, + "grad_norm": 0.2632836401462555, + "learning_rate": 4.346232575556233e-05, + "loss": 0.2041, + "step": 17491 + }, + { + "epoch": 0.3119894410159455, + "grad_norm": 0.2649456262588501, + "learning_rate": 4.346127622724657e-05, + "loss": 0.1755, + "step": 17492 + }, + { + "epoch": 0.3120072771376592, + "grad_norm": 0.45056968927383423, + "learning_rate": 4.346022662736853e-05, + "loss": 0.1996, + "step": 17493 + }, + { + "epoch": 0.3120251132593729, + "grad_norm": 0.33872950077056885, + "learning_rate": 4.3459176955932267e-05, + "loss": 0.2236, + "step": 17494 + }, + { + "epoch": 0.31204294938108657, + "grad_norm": 0.33895570039749146, + "learning_rate": 4.3458127212941864e-05, + "loss": 0.1254, + "step": 17495 + }, + { + "epoch": 0.31206078550280025, + "grad_norm": 0.41798847913742065, + "learning_rate": 4.345707739840138e-05, + "loss": 0.2177, + "step": 17496 + }, + { + "epoch": 0.31207862162451394, + "grad_norm": 0.22913534939289093, + "learning_rate": 4.3456027512314894e-05, + "loss": 0.195, + "step": 17497 + }, + { + "epoch": 0.3120964577462277, + "grad_norm": 0.28038740158081055, + "learning_rate": 4.345497755468647e-05, + "loss": 0.186, + "step": 17498 + }, + { + "epoch": 0.3121142938679414, + "grad_norm": 0.29573291540145874, + "learning_rate": 4.345392752552018e-05, + "loss": 0.212, + "step": 17499 + }, + { + "epoch": 0.31213212998965506, + "grad_norm": 0.2219371348619461, + "learning_rate": 4.3452877424820094e-05, + "loss": 0.198, + "step": 17500 + }, + { + "epoch": 0.31214996611136875, + "grad_norm": 0.21954099833965302, + "learning_rate": 4.345182725259027e-05, + "loss": 0.1816, + "step": 17501 + }, + { + "epoch": 0.31216780223308244, + "grad_norm": 0.23090429604053497, + "learning_rate": 4.345077700883481e-05, + "loss": 0.1688, + "step": 17502 + }, + { + "epoch": 0.3121856383547961, + "grad_norm": 0.258075475692749, + "learning_rate": 4.344972669355775e-05, + "loss": 0.1479, + "step": 17503 + }, + { + "epoch": 0.3122034744765098, + "grad_norm": 0.33138346672058105, + "learning_rate": 4.3448676306763184e-05, + "loss": 0.1712, + "step": 17504 + }, + { + "epoch": 0.3122213105982235, + "grad_norm": 0.22639347612857819, + "learning_rate": 4.344762584845518e-05, + "loss": 0.1812, + "step": 17505 + }, + { + "epoch": 0.31223914671993724, + "grad_norm": 0.2866867780685425, + "learning_rate": 4.344657531863779e-05, + "loss": 0.1932, + "step": 17506 + }, + { + "epoch": 0.31225698284165093, + "grad_norm": 0.2818053066730499, + "learning_rate": 4.3445524717315125e-05, + "loss": 0.1793, + "step": 17507 + }, + { + "epoch": 0.3122748189633646, + "grad_norm": 0.3275313377380371, + "learning_rate": 4.3444474044491215e-05, + "loss": 0.1701, + "step": 17508 + }, + { + "epoch": 0.3122926550850783, + "grad_norm": 0.32207781076431274, + "learning_rate": 4.3443423300170175e-05, + "loss": 0.2237, + "step": 17509 + }, + { + "epoch": 0.312310491206792, + "grad_norm": 0.3252350986003876, + "learning_rate": 4.3442372484356044e-05, + "loss": 0.2112, + "step": 17510 + }, + { + "epoch": 0.3123283273285057, + "grad_norm": 0.3287123143672943, + "learning_rate": 4.3441321597052895e-05, + "loss": 0.2224, + "step": 17511 + }, + { + "epoch": 0.31234616345021937, + "grad_norm": 0.33139023184776306, + "learning_rate": 4.3440270638264834e-05, + "loss": 0.1603, + "step": 17512 + }, + { + "epoch": 0.31236399957193306, + "grad_norm": 0.23307359218597412, + "learning_rate": 4.343921960799591e-05, + "loss": 0.1743, + "step": 17513 + }, + { + "epoch": 0.31238183569364675, + "grad_norm": 0.2689676880836487, + "learning_rate": 4.34381685062502e-05, + "loss": 0.1283, + "step": 17514 + }, + { + "epoch": 0.3123996718153605, + "grad_norm": 0.2605196237564087, + "learning_rate": 4.343711733303178e-05, + "loss": 0.1613, + "step": 17515 + }, + { + "epoch": 0.3124175079370742, + "grad_norm": 0.2356814295053482, + "learning_rate": 4.343606608834472e-05, + "loss": 0.1716, + "step": 17516 + }, + { + "epoch": 0.31243534405878787, + "grad_norm": 0.327284038066864, + "learning_rate": 4.3435014772193106e-05, + "loss": 0.1708, + "step": 17517 + }, + { + "epoch": 0.31245318018050156, + "grad_norm": 0.33501574397087097, + "learning_rate": 4.343396338458101e-05, + "loss": 0.1536, + "step": 17518 + }, + { + "epoch": 0.31247101630221524, + "grad_norm": 0.2738770544528961, + "learning_rate": 4.34329119255125e-05, + "loss": 0.1775, + "step": 17519 + }, + { + "epoch": 0.31248885242392893, + "grad_norm": 0.27891844511032104, + "learning_rate": 4.343186039499166e-05, + "loss": 0.1788, + "step": 17520 + }, + { + "epoch": 0.3125066885456426, + "grad_norm": 0.20665976405143738, + "learning_rate": 4.343080879302256e-05, + "loss": 0.1285, + "step": 17521 + }, + { + "epoch": 0.3125245246673563, + "grad_norm": 0.9507989287376404, + "learning_rate": 4.342975711960928e-05, + "loss": 0.1991, + "step": 17522 + }, + { + "epoch": 0.31254236078907005, + "grad_norm": 0.25839972496032715, + "learning_rate": 4.34287053747559e-05, + "loss": 0.1852, + "step": 17523 + }, + { + "epoch": 0.31256019691078374, + "grad_norm": 0.2607276141643524, + "learning_rate": 4.342765355846649e-05, + "loss": 0.16, + "step": 17524 + }, + { + "epoch": 0.3125780330324974, + "grad_norm": 0.3024839460849762, + "learning_rate": 4.342660167074513e-05, + "loss": 0.1396, + "step": 17525 + }, + { + "epoch": 0.3125958691542111, + "grad_norm": 0.29394084215164185, + "learning_rate": 4.3425549711595896e-05, + "loss": 0.1725, + "step": 17526 + }, + { + "epoch": 0.3126137052759248, + "grad_norm": 0.2145133912563324, + "learning_rate": 4.342449768102287e-05, + "loss": 0.1586, + "step": 17527 + }, + { + "epoch": 0.3126315413976385, + "grad_norm": 0.26951074600219727, + "learning_rate": 4.342344557903013e-05, + "loss": 0.1677, + "step": 17528 + }, + { + "epoch": 0.3126493775193522, + "grad_norm": 0.3360919654369354, + "learning_rate": 4.3422393405621744e-05, + "loss": 0.2025, + "step": 17529 + }, + { + "epoch": 0.31266721364106587, + "grad_norm": 0.20517614483833313, + "learning_rate": 4.34213411608018e-05, + "loss": 0.1693, + "step": 17530 + }, + { + "epoch": 0.31268504976277955, + "grad_norm": 0.22563259303569794, + "learning_rate": 4.342028884457438e-05, + "loss": 0.1703, + "step": 17531 + }, + { + "epoch": 0.3127028858844933, + "grad_norm": 0.20919504761695862, + "learning_rate": 4.3419236456943556e-05, + "loss": 0.151, + "step": 17532 + }, + { + "epoch": 0.312720722006207, + "grad_norm": 0.221780464053154, + "learning_rate": 4.3418183997913406e-05, + "loss": 0.1782, + "step": 17533 + }, + { + "epoch": 0.3127385581279207, + "grad_norm": 0.26816293597221375, + "learning_rate": 4.341713146748802e-05, + "loss": 0.175, + "step": 17534 + }, + { + "epoch": 0.31275639424963436, + "grad_norm": 0.25535818934440613, + "learning_rate": 4.341607886567147e-05, + "loss": 0.1361, + "step": 17535 + }, + { + "epoch": 0.31277423037134805, + "grad_norm": 0.32535871863365173, + "learning_rate": 4.3415026192467835e-05, + "loss": 0.2581, + "step": 17536 + }, + { + "epoch": 0.31279206649306174, + "grad_norm": 0.25587764382362366, + "learning_rate": 4.34139734478812e-05, + "loss": 0.175, + "step": 17537 + }, + { + "epoch": 0.3128099026147754, + "grad_norm": 0.2733164429664612, + "learning_rate": 4.341292063191564e-05, + "loss": 0.1765, + "step": 17538 + }, + { + "epoch": 0.3128277387364891, + "grad_norm": 0.25258323550224304, + "learning_rate": 4.3411867744575246e-05, + "loss": 0.1478, + "step": 17539 + }, + { + "epoch": 0.31284557485820286, + "grad_norm": 0.3264720141887665, + "learning_rate": 4.341081478586409e-05, + "loss": 0.1432, + "step": 17540 + }, + { + "epoch": 0.31286341097991655, + "grad_norm": 0.2438495010137558, + "learning_rate": 4.340976175578626e-05, + "loss": 0.2233, + "step": 17541 + }, + { + "epoch": 0.31288124710163023, + "grad_norm": 0.2688412368297577, + "learning_rate": 4.340870865434583e-05, + "loss": 0.1874, + "step": 17542 + }, + { + "epoch": 0.3128990832233439, + "grad_norm": 0.3897169530391693, + "learning_rate": 4.340765548154689e-05, + "loss": 0.2205, + "step": 17543 + }, + { + "epoch": 0.3129169193450576, + "grad_norm": 0.28889134526252747, + "learning_rate": 4.340660223739352e-05, + "loss": 0.1996, + "step": 17544 + }, + { + "epoch": 0.3129347554667713, + "grad_norm": 0.20531803369522095, + "learning_rate": 4.34055489218898e-05, + "loss": 0.1715, + "step": 17545 + }, + { + "epoch": 0.312952591588485, + "grad_norm": 0.2323165386915207, + "learning_rate": 4.3404495535039814e-05, + "loss": 0.1754, + "step": 17546 + }, + { + "epoch": 0.3129704277101987, + "grad_norm": 0.2058759480714798, + "learning_rate": 4.340344207684765e-05, + "loss": 0.1793, + "step": 17547 + }, + { + "epoch": 0.3129882638319124, + "grad_norm": 0.28017207980155945, + "learning_rate": 4.340238854731738e-05, + "loss": 0.1798, + "step": 17548 + }, + { + "epoch": 0.3130060999536261, + "grad_norm": 0.32554730772972107, + "learning_rate": 4.340133494645311e-05, + "loss": 0.1691, + "step": 17549 + }, + { + "epoch": 0.3130239360753398, + "grad_norm": 0.2488563358783722, + "learning_rate": 4.34002812742589e-05, + "loss": 0.1813, + "step": 17550 + }, + { + "epoch": 0.3130417721970535, + "grad_norm": 0.2750335931777954, + "learning_rate": 4.339922753073885e-05, + "loss": 0.1705, + "step": 17551 + }, + { + "epoch": 0.31305960831876717, + "grad_norm": 0.20450814068317413, + "learning_rate": 4.339817371589704e-05, + "loss": 0.1333, + "step": 17552 + }, + { + "epoch": 0.31307744444048086, + "grad_norm": 0.23762629926204681, + "learning_rate": 4.3397119829737555e-05, + "loss": 0.1686, + "step": 17553 + }, + { + "epoch": 0.31309528056219454, + "grad_norm": 0.18951566517353058, + "learning_rate": 4.339606587226447e-05, + "loss": 0.1309, + "step": 17554 + }, + { + "epoch": 0.31311311668390823, + "grad_norm": 0.3012663722038269, + "learning_rate": 4.3395011843481884e-05, + "loss": 0.168, + "step": 17555 + }, + { + "epoch": 0.3131309528056219, + "grad_norm": 0.2693444788455963, + "learning_rate": 4.3393957743393886e-05, + "loss": 0.1865, + "step": 17556 + }, + { + "epoch": 0.31314878892733566, + "grad_norm": 0.27569329738616943, + "learning_rate": 4.3392903572004545e-05, + "loss": 0.187, + "step": 17557 + }, + { + "epoch": 0.31316662504904935, + "grad_norm": 0.2794274389743805, + "learning_rate": 4.339184932931796e-05, + "loss": 0.2003, + "step": 17558 + }, + { + "epoch": 0.31318446117076304, + "grad_norm": 0.2437286674976349, + "learning_rate": 4.339079501533821e-05, + "loss": 0.1588, + "step": 17559 + }, + { + "epoch": 0.31320229729247673, + "grad_norm": 0.2621024250984192, + "learning_rate": 4.33897406300694e-05, + "loss": 0.2289, + "step": 17560 + }, + { + "epoch": 0.3132201334141904, + "grad_norm": 0.24582625925540924, + "learning_rate": 4.3388686173515596e-05, + "loss": 0.1654, + "step": 17561 + }, + { + "epoch": 0.3132379695359041, + "grad_norm": 0.23326288163661957, + "learning_rate": 4.338763164568089e-05, + "loss": 0.1669, + "step": 17562 + }, + { + "epoch": 0.3132558056576178, + "grad_norm": 0.1829545646905899, + "learning_rate": 4.3386577046569376e-05, + "loss": 0.1585, + "step": 17563 + }, + { + "epoch": 0.3132736417793315, + "grad_norm": 0.223091259598732, + "learning_rate": 4.338552237618514e-05, + "loss": 0.1699, + "step": 17564 + }, + { + "epoch": 0.3132914779010452, + "grad_norm": 0.2604628801345825, + "learning_rate": 4.338446763453226e-05, + "loss": 0.195, + "step": 17565 + }, + { + "epoch": 0.3133093140227589, + "grad_norm": 0.2905992269515991, + "learning_rate": 4.338341282161485e-05, + "loss": 0.1771, + "step": 17566 + }, + { + "epoch": 0.3133271501444726, + "grad_norm": 0.4702034890651703, + "learning_rate": 4.338235793743697e-05, + "loss": 0.1737, + "step": 17567 + }, + { + "epoch": 0.3133449862661863, + "grad_norm": 0.23991736769676208, + "learning_rate": 4.338130298200273e-05, + "loss": 0.1416, + "step": 17568 + }, + { + "epoch": 0.3133628223879, + "grad_norm": 1.385790228843689, + "learning_rate": 4.338024795531621e-05, + "loss": 0.1783, + "step": 17569 + }, + { + "epoch": 0.31338065850961366, + "grad_norm": 0.26064804196357727, + "learning_rate": 4.337919285738149e-05, + "loss": 0.1635, + "step": 17570 + }, + { + "epoch": 0.31339849463132735, + "grad_norm": 0.4337914288043976, + "learning_rate": 4.337813768820268e-05, + "loss": 0.2102, + "step": 17571 + }, + { + "epoch": 0.31341633075304104, + "grad_norm": 0.2873128652572632, + "learning_rate": 4.337708244778386e-05, + "loss": 0.1841, + "step": 17572 + }, + { + "epoch": 0.3134341668747547, + "grad_norm": 0.21467001736164093, + "learning_rate": 4.337602713612912e-05, + "loss": 0.1805, + "step": 17573 + }, + { + "epoch": 0.31345200299646847, + "grad_norm": 0.2837502658367157, + "learning_rate": 4.337497175324255e-05, + "loss": 0.128, + "step": 17574 + }, + { + "epoch": 0.31346983911818216, + "grad_norm": 0.2778398394584656, + "learning_rate": 4.337391629912825e-05, + "loss": 0.138, + "step": 17575 + }, + { + "epoch": 0.31348767523989585, + "grad_norm": 0.34074631333351135, + "learning_rate": 4.3372860773790296e-05, + "loss": 0.1775, + "step": 17576 + }, + { + "epoch": 0.31350551136160953, + "grad_norm": 0.2707691192626953, + "learning_rate": 4.3371805177232785e-05, + "loss": 0.1607, + "step": 17577 + }, + { + "epoch": 0.3135233474833232, + "grad_norm": 0.2902049422264099, + "learning_rate": 4.337074950945982e-05, + "loss": 0.1887, + "step": 17578 + }, + { + "epoch": 0.3135411836050369, + "grad_norm": 0.3427014648914337, + "learning_rate": 4.336969377047548e-05, + "loss": 0.2461, + "step": 17579 + }, + { + "epoch": 0.3135590197267506, + "grad_norm": 0.2717701494693756, + "learning_rate": 4.336863796028387e-05, + "loss": 0.1928, + "step": 17580 + }, + { + "epoch": 0.3135768558484643, + "grad_norm": 0.27019140124320984, + "learning_rate": 4.336758207888907e-05, + "loss": 0.1902, + "step": 17581 + }, + { + "epoch": 0.31359469197017803, + "grad_norm": 0.24754968285560608, + "learning_rate": 4.336652612629517e-05, + "loss": 0.1503, + "step": 17582 + }, + { + "epoch": 0.3136125280918917, + "grad_norm": 0.22130009531974792, + "learning_rate": 4.336547010250628e-05, + "loss": 0.1346, + "step": 17583 + }, + { + "epoch": 0.3136303642136054, + "grad_norm": 0.26831161975860596, + "learning_rate": 4.336441400752649e-05, + "loss": 0.1379, + "step": 17584 + }, + { + "epoch": 0.3136482003353191, + "grad_norm": 0.23630496859550476, + "learning_rate": 4.3363357841359874e-05, + "loss": 0.1802, + "step": 17585 + }, + { + "epoch": 0.3136660364570328, + "grad_norm": 0.32348525524139404, + "learning_rate": 4.3362301604010554e-05, + "loss": 0.1893, + "step": 17586 + }, + { + "epoch": 0.31368387257874647, + "grad_norm": 0.23855532705783844, + "learning_rate": 4.33612452954826e-05, + "loss": 0.206, + "step": 17587 + }, + { + "epoch": 0.31370170870046016, + "grad_norm": 0.3500916659832001, + "learning_rate": 4.3360188915780126e-05, + "loss": 0.2904, + "step": 17588 + }, + { + "epoch": 0.31371954482217385, + "grad_norm": 0.3429223597049713, + "learning_rate": 4.335913246490722e-05, + "loss": 0.2181, + "step": 17589 + }, + { + "epoch": 0.31373738094388753, + "grad_norm": 0.2651059031486511, + "learning_rate": 4.335807594286797e-05, + "loss": 0.1865, + "step": 17590 + }, + { + "epoch": 0.3137552170656013, + "grad_norm": 0.34736937284469604, + "learning_rate": 4.335701934966647e-05, + "loss": 0.2246, + "step": 17591 + }, + { + "epoch": 0.31377305318731497, + "grad_norm": 0.37737807631492615, + "learning_rate": 4.3355962685306825e-05, + "loss": 0.2229, + "step": 17592 + }, + { + "epoch": 0.31379088930902865, + "grad_norm": 0.20219923555850983, + "learning_rate": 4.335490594979314e-05, + "loss": 0.1033, + "step": 17593 + }, + { + "epoch": 0.31380872543074234, + "grad_norm": 0.2116667926311493, + "learning_rate": 4.335384914312949e-05, + "loss": 0.1664, + "step": 17594 + }, + { + "epoch": 0.31382656155245603, + "grad_norm": 0.2642044425010681, + "learning_rate": 4.3352792265319987e-05, + "loss": 0.1694, + "step": 17595 + }, + { + "epoch": 0.3138443976741697, + "grad_norm": 0.2847067713737488, + "learning_rate": 4.3351735316368726e-05, + "loss": 0.1503, + "step": 17596 + }, + { + "epoch": 0.3138622337958834, + "grad_norm": 0.4029041528701782, + "learning_rate": 4.335067829627979e-05, + "loss": 0.1862, + "step": 17597 + }, + { + "epoch": 0.3138800699175971, + "grad_norm": 0.2765880525112152, + "learning_rate": 4.334962120505729e-05, + "loss": 0.2553, + "step": 17598 + }, + { + "epoch": 0.31389790603931084, + "grad_norm": 0.23255957663059235, + "learning_rate": 4.334856404270532e-05, + "loss": 0.1656, + "step": 17599 + }, + { + "epoch": 0.3139157421610245, + "grad_norm": 0.19842900335788727, + "learning_rate": 4.3347506809227984e-05, + "loss": 0.1774, + "step": 17600 + }, + { + "epoch": 0.3139335782827382, + "grad_norm": 0.30295294523239136, + "learning_rate": 4.3346449504629375e-05, + "loss": 0.2032, + "step": 17601 + }, + { + "epoch": 0.3139514144044519, + "grad_norm": 0.2216023951768875, + "learning_rate": 4.334539212891359e-05, + "loss": 0.1693, + "step": 17602 + }, + { + "epoch": 0.3139692505261656, + "grad_norm": 0.3422231674194336, + "learning_rate": 4.3344334682084716e-05, + "loss": 0.1765, + "step": 17603 + }, + { + "epoch": 0.3139870866478793, + "grad_norm": 0.22290359437465668, + "learning_rate": 4.334327716414688e-05, + "loss": 0.1474, + "step": 17604 + }, + { + "epoch": 0.31400492276959296, + "grad_norm": 0.2919217348098755, + "learning_rate": 4.334221957510416e-05, + "loss": 0.185, + "step": 17605 + }, + { + "epoch": 0.31402275889130665, + "grad_norm": 0.5418250560760498, + "learning_rate": 4.334116191496066e-05, + "loss": 0.205, + "step": 17606 + }, + { + "epoch": 0.3140405950130204, + "grad_norm": 0.3279336988925934, + "learning_rate": 4.3340104183720484e-05, + "loss": 0.1088, + "step": 17607 + }, + { + "epoch": 0.3140584311347341, + "grad_norm": 0.28544193506240845, + "learning_rate": 4.333904638138773e-05, + "loss": 0.191, + "step": 17608 + }, + { + "epoch": 0.31407626725644777, + "grad_norm": 0.19915661215782166, + "learning_rate": 4.33379885079665e-05, + "loss": 0.129, + "step": 17609 + }, + { + "epoch": 0.31409410337816146, + "grad_norm": 0.2938247323036194, + "learning_rate": 4.333693056346089e-05, + "loss": 0.1913, + "step": 17610 + }, + { + "epoch": 0.31411193949987515, + "grad_norm": 0.23385438323020935, + "learning_rate": 4.3335872547875e-05, + "loss": 0.1343, + "step": 17611 + }, + { + "epoch": 0.31412977562158884, + "grad_norm": 0.29365524649620056, + "learning_rate": 4.333481446121294e-05, + "loss": 0.1626, + "step": 17612 + }, + { + "epoch": 0.3141476117433025, + "grad_norm": 0.25309011340141296, + "learning_rate": 4.3333756303478815e-05, + "loss": 0.1856, + "step": 17613 + }, + { + "epoch": 0.3141654478650162, + "grad_norm": 0.30498093366622925, + "learning_rate": 4.33326980746767e-05, + "loss": 0.1858, + "step": 17614 + }, + { + "epoch": 0.3141832839867299, + "grad_norm": 0.26543283462524414, + "learning_rate": 4.333163977481073e-05, + "loss": 0.1621, + "step": 17615 + }, + { + "epoch": 0.31420112010844364, + "grad_norm": 0.37338709831237793, + "learning_rate": 4.3330581403884984e-05, + "loss": 0.1444, + "step": 17616 + }, + { + "epoch": 0.31421895623015733, + "grad_norm": 0.27330338954925537, + "learning_rate": 4.332952296190358e-05, + "loss": 0.2034, + "step": 17617 + }, + { + "epoch": 0.314236792351871, + "grad_norm": 0.46846622228622437, + "learning_rate": 4.332846444887061e-05, + "loss": 0.1368, + "step": 17618 + }, + { + "epoch": 0.3142546284735847, + "grad_norm": 0.2901749610900879, + "learning_rate": 4.332740586479018e-05, + "loss": 0.1834, + "step": 17619 + }, + { + "epoch": 0.3142724645952984, + "grad_norm": 0.30629175901412964, + "learning_rate": 4.33263472096664e-05, + "loss": 0.16, + "step": 17620 + }, + { + "epoch": 0.3142903007170121, + "grad_norm": 0.24373897910118103, + "learning_rate": 4.332528848350337e-05, + "loss": 0.1846, + "step": 17621 + }, + { + "epoch": 0.31430813683872577, + "grad_norm": 0.22953234612941742, + "learning_rate": 4.3324229686305186e-05, + "loss": 0.1867, + "step": 17622 + }, + { + "epoch": 0.31432597296043946, + "grad_norm": 0.3272095024585724, + "learning_rate": 4.332317081807595e-05, + "loss": 0.1377, + "step": 17623 + }, + { + "epoch": 0.3143438090821532, + "grad_norm": 0.2385309487581253, + "learning_rate": 4.3322111878819797e-05, + "loss": 0.1565, + "step": 17624 + }, + { + "epoch": 0.3143616452038669, + "grad_norm": 0.20896194875240326, + "learning_rate": 4.33210528685408e-05, + "loss": 0.1624, + "step": 17625 + }, + { + "epoch": 0.3143794813255806, + "grad_norm": 0.3582569658756256, + "learning_rate": 4.3319993787243066e-05, + "loss": 0.2019, + "step": 17626 + }, + { + "epoch": 0.31439731744729427, + "grad_norm": 0.27730584144592285, + "learning_rate": 4.3318934634930716e-05, + "loss": 0.1937, + "step": 17627 + }, + { + "epoch": 0.31441515356900795, + "grad_norm": 0.2537687122821808, + "learning_rate": 4.3317875411607853e-05, + "loss": 0.18, + "step": 17628 + }, + { + "epoch": 0.31443298969072164, + "grad_norm": 0.26678311824798584, + "learning_rate": 4.331681611727857e-05, + "loss": 0.1561, + "step": 17629 + }, + { + "epoch": 0.31445082581243533, + "grad_norm": 0.2528023421764374, + "learning_rate": 4.331575675194698e-05, + "loss": 0.1749, + "step": 17630 + }, + { + "epoch": 0.314468661934149, + "grad_norm": 0.22930273413658142, + "learning_rate": 4.33146973156172e-05, + "loss": 0.1321, + "step": 17631 + }, + { + "epoch": 0.3144864980558627, + "grad_norm": 0.22550135850906372, + "learning_rate": 4.3313637808293326e-05, + "loss": 0.1569, + "step": 17632 + }, + { + "epoch": 0.31450433417757645, + "grad_norm": 0.3391514718532562, + "learning_rate": 4.331257822997946e-05, + "loss": 0.1693, + "step": 17633 + }, + { + "epoch": 0.31452217029929014, + "grad_norm": 0.3209320306777954, + "learning_rate": 4.331151858067972e-05, + "loss": 0.192, + "step": 17634 + }, + { + "epoch": 0.3145400064210038, + "grad_norm": 0.2813419997692108, + "learning_rate": 4.331045886039821e-05, + "loss": 0.1798, + "step": 17635 + }, + { + "epoch": 0.3145578425427175, + "grad_norm": 0.20678496360778809, + "learning_rate": 4.330939906913904e-05, + "loss": 0.1898, + "step": 17636 + }, + { + "epoch": 0.3145756786644312, + "grad_norm": 0.23105883598327637, + "learning_rate": 4.3308339206906303e-05, + "loss": 0.1355, + "step": 17637 + }, + { + "epoch": 0.3145935147861449, + "grad_norm": 0.3000940680503845, + "learning_rate": 4.330727927370413e-05, + "loss": 0.1435, + "step": 17638 + }, + { + "epoch": 0.3146113509078586, + "grad_norm": 0.29369547963142395, + "learning_rate": 4.330621926953662e-05, + "loss": 0.1724, + "step": 17639 + }, + { + "epoch": 0.31462918702957227, + "grad_norm": 0.4290458559989929, + "learning_rate": 4.330515919440787e-05, + "loss": 0.1768, + "step": 17640 + }, + { + "epoch": 0.314647023151286, + "grad_norm": 0.2907050848007202, + "learning_rate": 4.330409904832202e-05, + "loss": 0.185, + "step": 17641 + }, + { + "epoch": 0.3146648592729997, + "grad_norm": 0.29109328985214233, + "learning_rate": 4.330303883128315e-05, + "loss": 0.1892, + "step": 17642 + }, + { + "epoch": 0.3146826953947134, + "grad_norm": 0.19267438352108002, + "learning_rate": 4.3301978543295375e-05, + "loss": 0.1169, + "step": 17643 + }, + { + "epoch": 0.3147005315164271, + "grad_norm": 0.24215887486934662, + "learning_rate": 4.330091818436281e-05, + "loss": 0.1646, + "step": 17644 + }, + { + "epoch": 0.31471836763814076, + "grad_norm": 0.30494052171707153, + "learning_rate": 4.329985775448957e-05, + "loss": 0.1715, + "step": 17645 + }, + { + "epoch": 0.31473620375985445, + "grad_norm": 0.3105016052722931, + "learning_rate": 4.3298797253679766e-05, + "loss": 0.1915, + "step": 17646 + }, + { + "epoch": 0.31475403988156814, + "grad_norm": 0.27297598123550415, + "learning_rate": 4.3297736681937494e-05, + "loss": 0.1255, + "step": 17647 + }, + { + "epoch": 0.3147718760032818, + "grad_norm": 0.24972057342529297, + "learning_rate": 4.329667603926688e-05, + "loss": 0.1931, + "step": 17648 + }, + { + "epoch": 0.3147897121249955, + "grad_norm": 0.20854078233242035, + "learning_rate": 4.3295615325672026e-05, + "loss": 0.1909, + "step": 17649 + }, + { + "epoch": 0.31480754824670926, + "grad_norm": 0.27245384454727173, + "learning_rate": 4.329455454115705e-05, + "loss": 0.1602, + "step": 17650 + }, + { + "epoch": 0.31482538436842294, + "grad_norm": 0.38677680492401123, + "learning_rate": 4.329349368572606e-05, + "loss": 0.2028, + "step": 17651 + }, + { + "epoch": 0.31484322049013663, + "grad_norm": 0.28773415088653564, + "learning_rate": 4.329243275938317e-05, + "loss": 0.1821, + "step": 17652 + }, + { + "epoch": 0.3148610566118503, + "grad_norm": 0.2797043025493622, + "learning_rate": 4.32913717621325e-05, + "loss": 0.2245, + "step": 17653 + }, + { + "epoch": 0.314878892733564, + "grad_norm": 0.29197850823402405, + "learning_rate": 4.3290310693978155e-05, + "loss": 0.1761, + "step": 17654 + }, + { + "epoch": 0.3148967288552777, + "grad_norm": 0.29030150175094604, + "learning_rate": 4.3289249554924236e-05, + "loss": 0.1636, + "step": 17655 + }, + { + "epoch": 0.3149145649769914, + "grad_norm": 0.3648754954338074, + "learning_rate": 4.328818834497488e-05, + "loss": 0.2098, + "step": 17656 + }, + { + "epoch": 0.3149324010987051, + "grad_norm": 0.31052032113075256, + "learning_rate": 4.3287127064134185e-05, + "loss": 0.1907, + "step": 17657 + }, + { + "epoch": 0.3149502372204188, + "grad_norm": 0.31157252192497253, + "learning_rate": 4.328606571240627e-05, + "loss": 0.13, + "step": 17658 + }, + { + "epoch": 0.3149680733421325, + "grad_norm": 0.26587140560150146, + "learning_rate": 4.328500428979525e-05, + "loss": 0.1884, + "step": 17659 + }, + { + "epoch": 0.3149859094638462, + "grad_norm": 0.2714605927467346, + "learning_rate": 4.328394279630524e-05, + "loss": 0.2003, + "step": 17660 + }, + { + "epoch": 0.3150037455855599, + "grad_norm": 0.22059091925621033, + "learning_rate": 4.328288123194034e-05, + "loss": 0.1353, + "step": 17661 + }, + { + "epoch": 0.31502158170727357, + "grad_norm": 0.268187940120697, + "learning_rate": 4.3281819596704694e-05, + "loss": 0.1595, + "step": 17662 + }, + { + "epoch": 0.31503941782898726, + "grad_norm": 0.22585149109363556, + "learning_rate": 4.3280757890602394e-05, + "loss": 0.1785, + "step": 17663 + }, + { + "epoch": 0.31505725395070094, + "grad_norm": 0.36876484751701355, + "learning_rate": 4.327969611363756e-05, + "loss": 0.2152, + "step": 17664 + }, + { + "epoch": 0.31507509007241463, + "grad_norm": 0.2486571967601776, + "learning_rate": 4.327863426581431e-05, + "loss": 0.186, + "step": 17665 + }, + { + "epoch": 0.3150929261941284, + "grad_norm": 0.4804657995700836, + "learning_rate": 4.3277572347136766e-05, + "loss": 0.1796, + "step": 17666 + }, + { + "epoch": 0.31511076231584206, + "grad_norm": 0.3148822784423828, + "learning_rate": 4.3276510357609035e-05, + "loss": 0.155, + "step": 17667 + }, + { + "epoch": 0.31512859843755575, + "grad_norm": 0.2525266408920288, + "learning_rate": 4.3275448297235246e-05, + "loss": 0.133, + "step": 17668 + }, + { + "epoch": 0.31514643455926944, + "grad_norm": 0.2096313238143921, + "learning_rate": 4.3274386166019496e-05, + "loss": 0.1442, + "step": 17669 + }, + { + "epoch": 0.3151642706809831, + "grad_norm": 0.24418789148330688, + "learning_rate": 4.3273323963965914e-05, + "loss": 0.1286, + "step": 17670 + }, + { + "epoch": 0.3151821068026968, + "grad_norm": 0.32168519496917725, + "learning_rate": 4.327226169107862e-05, + "loss": 0.1855, + "step": 17671 + }, + { + "epoch": 0.3151999429244105, + "grad_norm": 0.2948327362537384, + "learning_rate": 4.327119934736173e-05, + "loss": 0.1876, + "step": 17672 + }, + { + "epoch": 0.3152177790461242, + "grad_norm": 0.28940072655677795, + "learning_rate": 4.327013693281936e-05, + "loss": 0.0912, + "step": 17673 + }, + { + "epoch": 0.3152356151678379, + "grad_norm": 0.2675217092037201, + "learning_rate": 4.326907444745563e-05, + "loss": 0.1323, + "step": 17674 + }, + { + "epoch": 0.3152534512895516, + "grad_norm": 0.4579310715198517, + "learning_rate": 4.3268011891274654e-05, + "loss": 0.2177, + "step": 17675 + }, + { + "epoch": 0.3152712874112653, + "grad_norm": 0.2734963297843933, + "learning_rate": 4.326694926428055e-05, + "loss": 0.1759, + "step": 17676 + }, + { + "epoch": 0.315289123532979, + "grad_norm": 0.26294007897377014, + "learning_rate": 4.326588656647745e-05, + "loss": 0.1719, + "step": 17677 + }, + { + "epoch": 0.3153069596546927, + "grad_norm": 0.29418545961380005, + "learning_rate": 4.3264823797869463e-05, + "loss": 0.1734, + "step": 17678 + }, + { + "epoch": 0.3153247957764064, + "grad_norm": 0.2187463343143463, + "learning_rate": 4.326376095846071e-05, + "loss": 0.1579, + "step": 17679 + }, + { + "epoch": 0.31534263189812006, + "grad_norm": 0.2335316389799118, + "learning_rate": 4.3262698048255314e-05, + "loss": 0.1878, + "step": 17680 + }, + { + "epoch": 0.31536046801983375, + "grad_norm": 0.33935675024986267, + "learning_rate": 4.3261635067257386e-05, + "loss": 0.1843, + "step": 17681 + }, + { + "epoch": 0.31537830414154744, + "grad_norm": 0.2613784372806549, + "learning_rate": 4.326057201547106e-05, + "loss": 0.1872, + "step": 17682 + }, + { + "epoch": 0.3153961402632612, + "grad_norm": 0.22758524119853973, + "learning_rate": 4.325950889290045e-05, + "loss": 0.1857, + "step": 17683 + }, + { + "epoch": 0.31541397638497487, + "grad_norm": 0.2417377084493637, + "learning_rate": 4.325844569954967e-05, + "loss": 0.2008, + "step": 17684 + }, + { + "epoch": 0.31543181250668856, + "grad_norm": 0.2967318594455719, + "learning_rate": 4.325738243542285e-05, + "loss": 0.1754, + "step": 17685 + }, + { + "epoch": 0.31544964862840225, + "grad_norm": 0.3004534840583801, + "learning_rate": 4.3256319100524115e-05, + "loss": 0.1342, + "step": 17686 + }, + { + "epoch": 0.31546748475011593, + "grad_norm": 0.25647005438804626, + "learning_rate": 4.325525569485758e-05, + "loss": 0.1879, + "step": 17687 + }, + { + "epoch": 0.3154853208718296, + "grad_norm": 0.22697731852531433, + "learning_rate": 4.325419221842736e-05, + "loss": 0.1732, + "step": 17688 + }, + { + "epoch": 0.3155031569935433, + "grad_norm": 0.2968234121799469, + "learning_rate": 4.32531286712376e-05, + "loss": 0.1347, + "step": 17689 + }, + { + "epoch": 0.315520993115257, + "grad_norm": 0.2540208697319031, + "learning_rate": 4.32520650532924e-05, + "loss": 0.1925, + "step": 17690 + }, + { + "epoch": 0.3155388292369707, + "grad_norm": 0.2718205451965332, + "learning_rate": 4.325100136459589e-05, + "loss": 0.1272, + "step": 17691 + }, + { + "epoch": 0.31555666535868443, + "grad_norm": 0.3270924687385559, + "learning_rate": 4.32499376051522e-05, + "loss": 0.1611, + "step": 17692 + }, + { + "epoch": 0.3155745014803981, + "grad_norm": 0.22599878907203674, + "learning_rate": 4.324887377496545e-05, + "loss": 0.1767, + "step": 17693 + }, + { + "epoch": 0.3155923376021118, + "grad_norm": 0.21597085893154144, + "learning_rate": 4.324780987403976e-05, + "loss": 0.1682, + "step": 17694 + }, + { + "epoch": 0.3156101737238255, + "grad_norm": 0.2529672384262085, + "learning_rate": 4.3246745902379256e-05, + "loss": 0.1325, + "step": 17695 + }, + { + "epoch": 0.3156280098455392, + "grad_norm": 0.2538890242576599, + "learning_rate": 4.3245681859988065e-05, + "loss": 0.1582, + "step": 17696 + }, + { + "epoch": 0.31564584596725287, + "grad_norm": 0.1813114881515503, + "learning_rate": 4.32446177468703e-05, + "loss": 0.121, + "step": 17697 + }, + { + "epoch": 0.31566368208896656, + "grad_norm": 0.22774241864681244, + "learning_rate": 4.3243553563030103e-05, + "loss": 0.1608, + "step": 17698 + }, + { + "epoch": 0.31568151821068025, + "grad_norm": 0.2458595484495163, + "learning_rate": 4.324248930847159e-05, + "loss": 0.1473, + "step": 17699 + }, + { + "epoch": 0.315699354332394, + "grad_norm": 0.4012024700641632, + "learning_rate": 4.324142498319889e-05, + "loss": 0.2038, + "step": 17700 + }, + { + "epoch": 0.3157171904541077, + "grad_norm": 0.20740777254104614, + "learning_rate": 4.3240360587216125e-05, + "loss": 0.1125, + "step": 17701 + }, + { + "epoch": 0.31573502657582136, + "grad_norm": 0.3383517861366272, + "learning_rate": 4.323929612052742e-05, + "loss": 0.1655, + "step": 17702 + }, + { + "epoch": 0.31575286269753505, + "grad_norm": 0.20333009958267212, + "learning_rate": 4.32382315831369e-05, + "loss": 0.1242, + "step": 17703 + }, + { + "epoch": 0.31577069881924874, + "grad_norm": 0.36358100175857544, + "learning_rate": 4.32371669750487e-05, + "loss": 0.1657, + "step": 17704 + }, + { + "epoch": 0.31578853494096243, + "grad_norm": 0.27666720747947693, + "learning_rate": 4.323610229626695e-05, + "loss": 0.2082, + "step": 17705 + }, + { + "epoch": 0.3158063710626761, + "grad_norm": 0.2461199015378952, + "learning_rate": 4.323503754679576e-05, + "loss": 0.1647, + "step": 17706 + }, + { + "epoch": 0.3158242071843898, + "grad_norm": 0.3557877540588379, + "learning_rate": 4.323397272663927e-05, + "loss": 0.2248, + "step": 17707 + }, + { + "epoch": 0.31584204330610355, + "grad_norm": 0.22189338505268097, + "learning_rate": 4.32329078358016e-05, + "loss": 0.1736, + "step": 17708 + }, + { + "epoch": 0.31585987942781724, + "grad_norm": 0.2568211555480957, + "learning_rate": 4.323184287428688e-05, + "loss": 0.2251, + "step": 17709 + }, + { + "epoch": 0.3158777155495309, + "grad_norm": 0.1955392062664032, + "learning_rate": 4.323077784209925e-05, + "loss": 0.1407, + "step": 17710 + }, + { + "epoch": 0.3158955516712446, + "grad_norm": 0.2459288388490677, + "learning_rate": 4.322971273924282e-05, + "loss": 0.1621, + "step": 17711 + }, + { + "epoch": 0.3159133877929583, + "grad_norm": 0.21687082946300507, + "learning_rate": 4.322864756572173e-05, + "loss": 0.1394, + "step": 17712 + }, + { + "epoch": 0.315931223914672, + "grad_norm": 0.34187471866607666, + "learning_rate": 4.32275823215401e-05, + "loss": 0.2061, + "step": 17713 + }, + { + "epoch": 0.3159490600363857, + "grad_norm": 0.25744685530662537, + "learning_rate": 4.3226517006702074e-05, + "loss": 0.1817, + "step": 17714 + }, + { + "epoch": 0.31596689615809936, + "grad_norm": 0.3716115951538086, + "learning_rate": 4.322545162121177e-05, + "loss": 0.1637, + "step": 17715 + }, + { + "epoch": 0.31598473227981305, + "grad_norm": 0.31630340218544006, + "learning_rate": 4.322438616507332e-05, + "loss": 0.2016, + "step": 17716 + }, + { + "epoch": 0.3160025684015268, + "grad_norm": 0.35728713870048523, + "learning_rate": 4.322332063829085e-05, + "loss": 0.1806, + "step": 17717 + }, + { + "epoch": 0.3160204045232405, + "grad_norm": 0.33885928988456726, + "learning_rate": 4.32222550408685e-05, + "loss": 0.2021, + "step": 17718 + }, + { + "epoch": 0.31603824064495417, + "grad_norm": 0.3298684060573578, + "learning_rate": 4.3221189372810387e-05, + "loss": 0.2422, + "step": 17719 + }, + { + "epoch": 0.31605607676666786, + "grad_norm": 0.20445676147937775, + "learning_rate": 4.322012363412067e-05, + "loss": 0.1507, + "step": 17720 + }, + { + "epoch": 0.31607391288838155, + "grad_norm": 0.2581339478492737, + "learning_rate": 4.3219057824803445e-05, + "loss": 0.2137, + "step": 17721 + }, + { + "epoch": 0.31609174901009524, + "grad_norm": 0.3025096356868744, + "learning_rate": 4.321799194486286e-05, + "loss": 0.193, + "step": 17722 + }, + { + "epoch": 0.3161095851318089, + "grad_norm": 0.27923035621643066, + "learning_rate": 4.321692599430305e-05, + "loss": 0.2062, + "step": 17723 + }, + { + "epoch": 0.3161274212535226, + "grad_norm": 0.22612594068050385, + "learning_rate": 4.321585997312815e-05, + "loss": 0.1573, + "step": 17724 + }, + { + "epoch": 0.31614525737523635, + "grad_norm": 0.32120323181152344, + "learning_rate": 4.3214793881342273e-05, + "loss": 0.1743, + "step": 17725 + }, + { + "epoch": 0.31616309349695004, + "grad_norm": 0.32446640729904175, + "learning_rate": 4.321372771894957e-05, + "loss": 0.1627, + "step": 17726 + }, + { + "epoch": 0.31618092961866373, + "grad_norm": 0.2871161103248596, + "learning_rate": 4.3212661485954166e-05, + "loss": 0.2228, + "step": 17727 + }, + { + "epoch": 0.3161987657403774, + "grad_norm": 0.29146233201026917, + "learning_rate": 4.3211595182360194e-05, + "loss": 0.1833, + "step": 17728 + }, + { + "epoch": 0.3162166018620911, + "grad_norm": 0.2407662272453308, + "learning_rate": 4.321052880817179e-05, + "loss": 0.1793, + "step": 17729 + }, + { + "epoch": 0.3162344379838048, + "grad_norm": 0.3406382203102112, + "learning_rate": 4.320946236339308e-05, + "loss": 0.2022, + "step": 17730 + }, + { + "epoch": 0.3162522741055185, + "grad_norm": 0.21744000911712646, + "learning_rate": 4.3208395848028215e-05, + "loss": 0.1589, + "step": 17731 + }, + { + "epoch": 0.31627011022723217, + "grad_norm": 0.3789297938346863, + "learning_rate": 4.320732926208132e-05, + "loss": 0.1698, + "step": 17732 + }, + { + "epoch": 0.31628794634894586, + "grad_norm": 0.31260377168655396, + "learning_rate": 4.320626260555652e-05, + "loss": 0.1602, + "step": 17733 + }, + { + "epoch": 0.3163057824706596, + "grad_norm": 0.25988101959228516, + "learning_rate": 4.320519587845796e-05, + "loss": 0.1633, + "step": 17734 + }, + { + "epoch": 0.3163236185923733, + "grad_norm": 0.26145365834236145, + "learning_rate": 4.320412908078978e-05, + "loss": 0.1523, + "step": 17735 + }, + { + "epoch": 0.316341454714087, + "grad_norm": 0.24997448921203613, + "learning_rate": 4.32030622125561e-05, + "loss": 0.1667, + "step": 17736 + }, + { + "epoch": 0.31635929083580067, + "grad_norm": 0.2432536482810974, + "learning_rate": 4.3201995273761066e-05, + "loss": 0.1629, + "step": 17737 + }, + { + "epoch": 0.31637712695751435, + "grad_norm": 0.2759473919868469, + "learning_rate": 4.3200928264408814e-05, + "loss": 0.187, + "step": 17738 + }, + { + "epoch": 0.31639496307922804, + "grad_norm": 0.29780685901641846, + "learning_rate": 4.3199861184503474e-05, + "loss": 0.1673, + "step": 17739 + }, + { + "epoch": 0.31641279920094173, + "grad_norm": 0.3092334270477295, + "learning_rate": 4.319879403404919e-05, + "loss": 0.2258, + "step": 17740 + }, + { + "epoch": 0.3164306353226554, + "grad_norm": 0.32397884130477905, + "learning_rate": 4.3197726813050086e-05, + "loss": 0.2177, + "step": 17741 + }, + { + "epoch": 0.31644847144436916, + "grad_norm": 0.2735349237918854, + "learning_rate": 4.319665952151032e-05, + "loss": 0.1838, + "step": 17742 + }, + { + "epoch": 0.31646630756608285, + "grad_norm": 0.20680665969848633, + "learning_rate": 4.3195592159434005e-05, + "loss": 0.1378, + "step": 17743 + }, + { + "epoch": 0.31648414368779654, + "grad_norm": 0.2287045568227768, + "learning_rate": 4.31945247268253e-05, + "loss": 0.1724, + "step": 17744 + }, + { + "epoch": 0.3165019798095102, + "grad_norm": 0.3623093366622925, + "learning_rate": 4.3193457223688325e-05, + "loss": 0.2417, + "step": 17745 + }, + { + "epoch": 0.3165198159312239, + "grad_norm": 0.2668391466140747, + "learning_rate": 4.319238965002723e-05, + "loss": 0.1979, + "step": 17746 + }, + { + "epoch": 0.3165376520529376, + "grad_norm": 0.28566113114356995, + "learning_rate": 4.319132200584615e-05, + "loss": 0.132, + "step": 17747 + }, + { + "epoch": 0.3165554881746513, + "grad_norm": 0.2487669438123703, + "learning_rate": 4.3190254291149225e-05, + "loss": 0.1735, + "step": 17748 + }, + { + "epoch": 0.316573324296365, + "grad_norm": 0.27945441007614136, + "learning_rate": 4.318918650594059e-05, + "loss": 0.1551, + "step": 17749 + }, + { + "epoch": 0.31659116041807867, + "grad_norm": 0.2320530265569687, + "learning_rate": 4.318811865022438e-05, + "loss": 0.1885, + "step": 17750 + }, + { + "epoch": 0.3166089965397924, + "grad_norm": 0.22671692073345184, + "learning_rate": 4.318705072400474e-05, + "loss": 0.1763, + "step": 17751 + }, + { + "epoch": 0.3166268326615061, + "grad_norm": 0.31538254022598267, + "learning_rate": 4.318598272728582e-05, + "loss": 0.2117, + "step": 17752 + }, + { + "epoch": 0.3166446687832198, + "grad_norm": 0.29472115635871887, + "learning_rate": 4.318491466007174e-05, + "loss": 0.1251, + "step": 17753 + }, + { + "epoch": 0.3166625049049335, + "grad_norm": 0.24860672652721405, + "learning_rate": 4.318384652236665e-05, + "loss": 0.2087, + "step": 17754 + }, + { + "epoch": 0.31668034102664716, + "grad_norm": 0.2869946360588074, + "learning_rate": 4.31827783141747e-05, + "loss": 0.2165, + "step": 17755 + }, + { + "epoch": 0.31669817714836085, + "grad_norm": 0.2744007706642151, + "learning_rate": 4.318171003550001e-05, + "loss": 0.2106, + "step": 17756 + }, + { + "epoch": 0.31671601327007454, + "grad_norm": 0.1817733347415924, + "learning_rate": 4.318064168634675e-05, + "loss": 0.1692, + "step": 17757 + }, + { + "epoch": 0.3167338493917882, + "grad_norm": 0.29223644733428955, + "learning_rate": 4.317957326671902e-05, + "loss": 0.1847, + "step": 17758 + }, + { + "epoch": 0.31675168551350197, + "grad_norm": 0.3327445387840271, + "learning_rate": 4.3178504776621e-05, + "loss": 0.1526, + "step": 17759 + }, + { + "epoch": 0.31676952163521566, + "grad_norm": 0.2552843391895294, + "learning_rate": 4.317743621605681e-05, + "loss": 0.1326, + "step": 17760 + }, + { + "epoch": 0.31678735775692934, + "grad_norm": 0.255779892206192, + "learning_rate": 4.3176367585030605e-05, + "loss": 0.2056, + "step": 17761 + }, + { + "epoch": 0.31680519387864303, + "grad_norm": 0.31038182973861694, + "learning_rate": 4.317529888354652e-05, + "loss": 0.1719, + "step": 17762 + }, + { + "epoch": 0.3168230300003567, + "grad_norm": 0.2724645435810089, + "learning_rate": 4.317423011160869e-05, + "loss": 0.2022, + "step": 17763 + }, + { + "epoch": 0.3168408661220704, + "grad_norm": 0.20053817331790924, + "learning_rate": 4.317316126922127e-05, + "loss": 0.158, + "step": 17764 + }, + { + "epoch": 0.3168587022437841, + "grad_norm": 0.30450016260147095, + "learning_rate": 4.317209235638841e-05, + "loss": 0.1726, + "step": 17765 + }, + { + "epoch": 0.3168765383654978, + "grad_norm": 0.2536017596721649, + "learning_rate": 4.317102337311424e-05, + "loss": 0.1745, + "step": 17766 + }, + { + "epoch": 0.3168943744872115, + "grad_norm": 0.2314152866601944, + "learning_rate": 4.3169954319402906e-05, + "loss": 0.1474, + "step": 17767 + }, + { + "epoch": 0.3169122106089252, + "grad_norm": 0.18448486924171448, + "learning_rate": 4.316888519525855e-05, + "loss": 0.1371, + "step": 17768 + }, + { + "epoch": 0.3169300467306389, + "grad_norm": 0.239515021443367, + "learning_rate": 4.3167816000685325e-05, + "loss": 0.114, + "step": 17769 + }, + { + "epoch": 0.3169478828523526, + "grad_norm": 0.28303033113479614, + "learning_rate": 4.316674673568736e-05, + "loss": 0.177, + "step": 17770 + }, + { + "epoch": 0.3169657189740663, + "grad_norm": 0.2584919035434723, + "learning_rate": 4.3165677400268824e-05, + "loss": 0.1718, + "step": 17771 + }, + { + "epoch": 0.31698355509577997, + "grad_norm": 0.25381705164909363, + "learning_rate": 4.316460799443383e-05, + "loss": 0.1954, + "step": 17772 + }, + { + "epoch": 0.31700139121749366, + "grad_norm": 0.28671392798423767, + "learning_rate": 4.3163538518186566e-05, + "loss": 0.1434, + "step": 17773 + }, + { + "epoch": 0.31701922733920734, + "grad_norm": 0.34216809272766113, + "learning_rate": 4.3162468971531135e-05, + "loss": 0.1491, + "step": 17774 + }, + { + "epoch": 0.31703706346092103, + "grad_norm": 0.2896101772785187, + "learning_rate": 4.31613993544717e-05, + "loss": 0.1783, + "step": 17775 + }, + { + "epoch": 0.3170548995826348, + "grad_norm": 0.34536120295524597, + "learning_rate": 4.3160329667012425e-05, + "loss": 0.2219, + "step": 17776 + }, + { + "epoch": 0.31707273570434846, + "grad_norm": 0.2488478720188141, + "learning_rate": 4.3159259909157427e-05, + "loss": 0.1626, + "step": 17777 + }, + { + "epoch": 0.31709057182606215, + "grad_norm": 0.2681307792663574, + "learning_rate": 4.3158190080910866e-05, + "loss": 0.1798, + "step": 17778 + }, + { + "epoch": 0.31710840794777584, + "grad_norm": 0.4173508882522583, + "learning_rate": 4.315712018227689e-05, + "loss": 0.1876, + "step": 17779 + }, + { + "epoch": 0.3171262440694895, + "grad_norm": 0.27446678280830383, + "learning_rate": 4.315605021325965e-05, + "loss": 0.2041, + "step": 17780 + }, + { + "epoch": 0.3171440801912032, + "grad_norm": 0.3628494143486023, + "learning_rate": 4.315498017386328e-05, + "loss": 0.1822, + "step": 17781 + }, + { + "epoch": 0.3171619163129169, + "grad_norm": 0.23815074563026428, + "learning_rate": 4.315391006409194e-05, + "loss": 0.1902, + "step": 17782 + }, + { + "epoch": 0.3171797524346306, + "grad_norm": 0.2864932119846344, + "learning_rate": 4.315283988394977e-05, + "loss": 0.1888, + "step": 17783 + }, + { + "epoch": 0.31719758855634433, + "grad_norm": 0.3273811638355255, + "learning_rate": 4.315176963344093e-05, + "loss": 0.2123, + "step": 17784 + }, + { + "epoch": 0.317215424678058, + "grad_norm": 0.2772671580314636, + "learning_rate": 4.315069931256957e-05, + "loss": 0.1828, + "step": 17785 + }, + { + "epoch": 0.3172332607997717, + "grad_norm": 0.26042822003364563, + "learning_rate": 4.3149628921339815e-05, + "loss": 0.1836, + "step": 17786 + }, + { + "epoch": 0.3172510969214854, + "grad_norm": 0.3124202787876129, + "learning_rate": 4.314855845975583e-05, + "loss": 0.1257, + "step": 17787 + }, + { + "epoch": 0.3172689330431991, + "grad_norm": 0.2609250545501709, + "learning_rate": 4.3147487927821775e-05, + "loss": 0.1878, + "step": 17788 + }, + { + "epoch": 0.3172867691649128, + "grad_norm": 0.4113426208496094, + "learning_rate": 4.3146417325541776e-05, + "loss": 0.1534, + "step": 17789 + }, + { + "epoch": 0.31730460528662646, + "grad_norm": 0.2329111248254776, + "learning_rate": 4.314534665292001e-05, + "loss": 0.1401, + "step": 17790 + }, + { + "epoch": 0.31732244140834015, + "grad_norm": 0.29550719261169434, + "learning_rate": 4.3144275909960595e-05, + "loss": 0.2085, + "step": 17791 + }, + { + "epoch": 0.31734027753005384, + "grad_norm": 0.2701817750930786, + "learning_rate": 4.3143205096667714e-05, + "loss": 0.1978, + "step": 17792 + }, + { + "epoch": 0.3173581136517676, + "grad_norm": 0.27645787596702576, + "learning_rate": 4.314213421304549e-05, + "loss": 0.1815, + "step": 17793 + }, + { + "epoch": 0.31737594977348127, + "grad_norm": 0.36288371682167053, + "learning_rate": 4.31410632590981e-05, + "loss": 0.1781, + "step": 17794 + }, + { + "epoch": 0.31739378589519496, + "grad_norm": 0.22071725130081177, + "learning_rate": 4.313999223482969e-05, + "loss": 0.1495, + "step": 17795 + }, + { + "epoch": 0.31741162201690865, + "grad_norm": 0.3010807931423187, + "learning_rate": 4.313892114024439e-05, + "loss": 0.1505, + "step": 17796 + }, + { + "epoch": 0.31742945813862233, + "grad_norm": 0.4616325795650482, + "learning_rate": 4.313784997534637e-05, + "loss": 0.1538, + "step": 17797 + }, + { + "epoch": 0.317447294260336, + "grad_norm": 0.36488229036331177, + "learning_rate": 4.3136778740139785e-05, + "loss": 0.181, + "step": 17798 + }, + { + "epoch": 0.3174651303820497, + "grad_norm": 0.27671733498573303, + "learning_rate": 4.313570743462877e-05, + "loss": 0.1997, + "step": 17799 + }, + { + "epoch": 0.3174829665037634, + "grad_norm": 0.3063640892505646, + "learning_rate": 4.3134636058817504e-05, + "loss": 0.1537, + "step": 17800 + }, + { + "epoch": 0.31750080262547714, + "grad_norm": 0.33603960275650024, + "learning_rate": 4.313356461271011e-05, + "loss": 0.1663, + "step": 17801 + }, + { + "epoch": 0.31751863874719083, + "grad_norm": 0.3382076025009155, + "learning_rate": 4.3132493096310765e-05, + "loss": 0.1612, + "step": 17802 + }, + { + "epoch": 0.3175364748689045, + "grad_norm": 0.23141314089298248, + "learning_rate": 4.3131421509623616e-05, + "loss": 0.1684, + "step": 17803 + }, + { + "epoch": 0.3175543109906182, + "grad_norm": 0.18326106667518616, + "learning_rate": 4.3130349852652804e-05, + "loss": 0.1437, + "step": 17804 + }, + { + "epoch": 0.3175721471123319, + "grad_norm": 0.2562151551246643, + "learning_rate": 4.31292781254025e-05, + "loss": 0.212, + "step": 17805 + }, + { + "epoch": 0.3175899832340456, + "grad_norm": 0.2702171206474304, + "learning_rate": 4.312820632787686e-05, + "loss": 0.1528, + "step": 17806 + }, + { + "epoch": 0.31760781935575927, + "grad_norm": 0.25395190715789795, + "learning_rate": 4.312713446008002e-05, + "loss": 0.1776, + "step": 17807 + }, + { + "epoch": 0.31762565547747296, + "grad_norm": 0.2341102808713913, + "learning_rate": 4.3126062522016156e-05, + "loss": 0.1867, + "step": 17808 + }, + { + "epoch": 0.3176434915991867, + "grad_norm": 0.2933019995689392, + "learning_rate": 4.31249905136894e-05, + "loss": 0.2105, + "step": 17809 + }, + { + "epoch": 0.3176613277209004, + "grad_norm": 0.2942465841770172, + "learning_rate": 4.312391843510393e-05, + "loss": 0.1832, + "step": 17810 + }, + { + "epoch": 0.3176791638426141, + "grad_norm": 0.24671952426433563, + "learning_rate": 4.31228462862639e-05, + "loss": 0.1907, + "step": 17811 + }, + { + "epoch": 0.31769699996432776, + "grad_norm": 0.2603171169757843, + "learning_rate": 4.3121774067173446e-05, + "loss": 0.1489, + "step": 17812 + }, + { + "epoch": 0.31771483608604145, + "grad_norm": 0.35887011885643005, + "learning_rate": 4.312070177783674e-05, + "loss": 0.1108, + "step": 17813 + }, + { + "epoch": 0.31773267220775514, + "grad_norm": 0.2506074607372284, + "learning_rate": 4.3119629418257936e-05, + "loss": 0.1893, + "step": 17814 + }, + { + "epoch": 0.31775050832946883, + "grad_norm": 0.32645896077156067, + "learning_rate": 4.3118556988441185e-05, + "loss": 0.1811, + "step": 17815 + }, + { + "epoch": 0.3177683444511825, + "grad_norm": 0.2704823613166809, + "learning_rate": 4.311748448839066e-05, + "loss": 0.1686, + "step": 17816 + }, + { + "epoch": 0.3177861805728962, + "grad_norm": 0.2628668546676636, + "learning_rate": 4.311641191811049e-05, + "loss": 0.1914, + "step": 17817 + }, + { + "epoch": 0.31780401669460995, + "grad_norm": 0.31883901357650757, + "learning_rate": 4.311533927760487e-05, + "loss": 0.161, + "step": 17818 + }, + { + "epoch": 0.31782185281632364, + "grad_norm": 0.21375906467437744, + "learning_rate": 4.311426656687793e-05, + "loss": 0.1419, + "step": 17819 + }, + { + "epoch": 0.3178396889380373, + "grad_norm": 0.30643463134765625, + "learning_rate": 4.311319378593383e-05, + "loss": 0.1577, + "step": 17820 + }, + { + "epoch": 0.317857525059751, + "grad_norm": 0.30004793405532837, + "learning_rate": 4.311212093477674e-05, + "loss": 0.1798, + "step": 17821 + }, + { + "epoch": 0.3178753611814647, + "grad_norm": 0.29790979623794556, + "learning_rate": 4.3111048013410814e-05, + "loss": 0.1829, + "step": 17822 + }, + { + "epoch": 0.3178931973031784, + "grad_norm": 0.24758180975914001, + "learning_rate": 4.310997502184021e-05, + "loss": 0.1608, + "step": 17823 + }, + { + "epoch": 0.3179110334248921, + "grad_norm": 0.294776052236557, + "learning_rate": 4.310890196006909e-05, + "loss": 0.1232, + "step": 17824 + }, + { + "epoch": 0.31792886954660576, + "grad_norm": 0.3303149938583374, + "learning_rate": 4.31078288281016e-05, + "loss": 0.2212, + "step": 17825 + }, + { + "epoch": 0.3179467056683195, + "grad_norm": 0.21274681389331818, + "learning_rate": 4.310675562594193e-05, + "loss": 0.1606, + "step": 17826 + }, + { + "epoch": 0.3179645417900332, + "grad_norm": 0.21112030744552612, + "learning_rate": 4.310568235359421e-05, + "loss": 0.1681, + "step": 17827 + }, + { + "epoch": 0.3179823779117469, + "grad_norm": 0.2439056932926178, + "learning_rate": 4.3104609011062615e-05, + "loss": 0.1296, + "step": 17828 + }, + { + "epoch": 0.31800021403346057, + "grad_norm": 0.2420627623796463, + "learning_rate": 4.31035355983513e-05, + "loss": 0.192, + "step": 17829 + }, + { + "epoch": 0.31801805015517426, + "grad_norm": 0.2784542441368103, + "learning_rate": 4.310246211546443e-05, + "loss": 0.1746, + "step": 17830 + }, + { + "epoch": 0.31803588627688795, + "grad_norm": 0.27748286724090576, + "learning_rate": 4.310138856240616e-05, + "loss": 0.1465, + "step": 17831 + }, + { + "epoch": 0.31805372239860163, + "grad_norm": 0.30073028802871704, + "learning_rate": 4.310031493918066e-05, + "loss": 0.152, + "step": 17832 + }, + { + "epoch": 0.3180715585203153, + "grad_norm": 0.3731083273887634, + "learning_rate": 4.309924124579209e-05, + "loss": 0.111, + "step": 17833 + }, + { + "epoch": 0.318089394642029, + "grad_norm": 0.6728917956352234, + "learning_rate": 4.3098167482244605e-05, + "loss": 0.2232, + "step": 17834 + }, + { + "epoch": 0.31810723076374275, + "grad_norm": 0.21494494378566742, + "learning_rate": 4.3097093648542376e-05, + "loss": 0.1661, + "step": 17835 + }, + { + "epoch": 0.31812506688545644, + "grad_norm": 0.20629215240478516, + "learning_rate": 4.3096019744689555e-05, + "loss": 0.1403, + "step": 17836 + }, + { + "epoch": 0.31814290300717013, + "grad_norm": 0.2372989058494568, + "learning_rate": 4.309494577069032e-05, + "loss": 0.1099, + "step": 17837 + }, + { + "epoch": 0.3181607391288838, + "grad_norm": 0.2816111445426941, + "learning_rate": 4.309387172654882e-05, + "loss": 0.1788, + "step": 17838 + }, + { + "epoch": 0.3181785752505975, + "grad_norm": 0.2464326173067093, + "learning_rate": 4.309279761226922e-05, + "loss": 0.1546, + "step": 17839 + }, + { + "epoch": 0.3181964113723112, + "grad_norm": 0.26137423515319824, + "learning_rate": 4.30917234278557e-05, + "loss": 0.1549, + "step": 17840 + }, + { + "epoch": 0.3182142474940249, + "grad_norm": 0.21578945219516754, + "learning_rate": 4.30906491733124e-05, + "loss": 0.1648, + "step": 17841 + }, + { + "epoch": 0.31823208361573857, + "grad_norm": 0.2710860073566437, + "learning_rate": 4.30895748486435e-05, + "loss": 0.2181, + "step": 17842 + }, + { + "epoch": 0.3182499197374523, + "grad_norm": 0.2509026527404785, + "learning_rate": 4.3088500453853154e-05, + "loss": 0.1794, + "step": 17843 + }, + { + "epoch": 0.318267755859166, + "grad_norm": 0.3039661645889282, + "learning_rate": 4.308742598894554e-05, + "loss": 0.2058, + "step": 17844 + }, + { + "epoch": 0.3182855919808797, + "grad_norm": 0.27470862865448, + "learning_rate": 4.3086351453924815e-05, + "loss": 0.1643, + "step": 17845 + }, + { + "epoch": 0.3183034281025934, + "grad_norm": 0.24679537117481232, + "learning_rate": 4.308527684879514e-05, + "loss": 0.1375, + "step": 17846 + }, + { + "epoch": 0.31832126422430707, + "grad_norm": 0.2020549774169922, + "learning_rate": 4.308420217356069e-05, + "loss": 0.1621, + "step": 17847 + }, + { + "epoch": 0.31833910034602075, + "grad_norm": 0.34716740250587463, + "learning_rate": 4.3083127428225626e-05, + "loss": 0.2077, + "step": 17848 + }, + { + "epoch": 0.31835693646773444, + "grad_norm": 0.29732653498649597, + "learning_rate": 4.308205261279411e-05, + "loss": 0.2329, + "step": 17849 + }, + { + "epoch": 0.31837477258944813, + "grad_norm": 0.22680331766605377, + "learning_rate": 4.308097772727032e-05, + "loss": 0.1742, + "step": 17850 + }, + { + "epoch": 0.3183926087111618, + "grad_norm": 0.22886672616004944, + "learning_rate": 4.307990277165841e-05, + "loss": 0.2042, + "step": 17851 + }, + { + "epoch": 0.31841044483287556, + "grad_norm": 0.23851677775382996, + "learning_rate": 4.3078827745962556e-05, + "loss": 0.1967, + "step": 17852 + }, + { + "epoch": 0.31842828095458925, + "grad_norm": 0.24372057616710663, + "learning_rate": 4.307775265018692e-05, + "loss": 0.1926, + "step": 17853 + }, + { + "epoch": 0.31844611707630294, + "grad_norm": 0.32449987530708313, + "learning_rate": 4.307667748433567e-05, + "loss": 0.1918, + "step": 17854 + }, + { + "epoch": 0.3184639531980166, + "grad_norm": 0.24032112956047058, + "learning_rate": 4.3075602248412975e-05, + "loss": 0.1259, + "step": 17855 + }, + { + "epoch": 0.3184817893197303, + "grad_norm": 0.2844131886959076, + "learning_rate": 4.3074526942423e-05, + "loss": 0.2125, + "step": 17856 + }, + { + "epoch": 0.318499625441444, + "grad_norm": 0.23106344044208527, + "learning_rate": 4.3073451566369915e-05, + "loss": 0.1889, + "step": 17857 + }, + { + "epoch": 0.3185174615631577, + "grad_norm": 0.26120078563690186, + "learning_rate": 4.3072376120257895e-05, + "loss": 0.1333, + "step": 17858 + }, + { + "epoch": 0.3185352976848714, + "grad_norm": 0.2818737328052521, + "learning_rate": 4.30713006040911e-05, + "loss": 0.1384, + "step": 17859 + }, + { + "epoch": 0.3185531338065851, + "grad_norm": 0.2503306567668915, + "learning_rate": 4.30702250178737e-05, + "loss": 0.1587, + "step": 17860 + }, + { + "epoch": 0.3185709699282988, + "grad_norm": 0.4025648534297943, + "learning_rate": 4.3069149361609876e-05, + "loss": 0.1353, + "step": 17861 + }, + { + "epoch": 0.3185888060500125, + "grad_norm": 0.3140362501144409, + "learning_rate": 4.3068073635303775e-05, + "loss": 0.2106, + "step": 17862 + }, + { + "epoch": 0.3186066421717262, + "grad_norm": 0.24204808473587036, + "learning_rate": 4.306699783895959e-05, + "loss": 0.1582, + "step": 17863 + }, + { + "epoch": 0.3186244782934399, + "grad_norm": 0.2421853244304657, + "learning_rate": 4.306592197258148e-05, + "loss": 0.1856, + "step": 17864 + }, + { + "epoch": 0.31864231441515356, + "grad_norm": 0.3057219088077545, + "learning_rate": 4.306484603617361e-05, + "loss": 0.2206, + "step": 17865 + }, + { + "epoch": 0.31866015053686725, + "grad_norm": 0.25261229276657104, + "learning_rate": 4.3063770029740164e-05, + "loss": 0.1655, + "step": 17866 + }, + { + "epoch": 0.31867798665858094, + "grad_norm": 0.2884041368961334, + "learning_rate": 4.306269395328531e-05, + "loss": 0.1751, + "step": 17867 + }, + { + "epoch": 0.3186958227802947, + "grad_norm": 0.26545092463493347, + "learning_rate": 4.30616178068132e-05, + "loss": 0.1926, + "step": 17868 + }, + { + "epoch": 0.31871365890200837, + "grad_norm": 0.23007084429264069, + "learning_rate": 4.306054159032803e-05, + "loss": 0.1805, + "step": 17869 + }, + { + "epoch": 0.31873149502372206, + "grad_norm": 0.3021833598613739, + "learning_rate": 4.3059465303833965e-05, + "loss": 0.1996, + "step": 17870 + }, + { + "epoch": 0.31874933114543574, + "grad_norm": 0.33298414945602417, + "learning_rate": 4.3058388947335175e-05, + "loss": 0.2013, + "step": 17871 + }, + { + "epoch": 0.31876716726714943, + "grad_norm": 0.23252364993095398, + "learning_rate": 4.3057312520835834e-05, + "loss": 0.144, + "step": 17872 + }, + { + "epoch": 0.3187850033888631, + "grad_norm": 0.180417999625206, + "learning_rate": 4.305623602434011e-05, + "loss": 0.1488, + "step": 17873 + }, + { + "epoch": 0.3188028395105768, + "grad_norm": 0.23891793191432953, + "learning_rate": 4.3055159457852176e-05, + "loss": 0.1902, + "step": 17874 + }, + { + "epoch": 0.3188206756322905, + "grad_norm": 0.24990332126617432, + "learning_rate": 4.305408282137621e-05, + "loss": 0.1906, + "step": 17875 + }, + { + "epoch": 0.3188385117540042, + "grad_norm": 0.3409096598625183, + "learning_rate": 4.305300611491638e-05, + "loss": 0.2136, + "step": 17876 + }, + { + "epoch": 0.3188563478757179, + "grad_norm": 0.2706303894519806, + "learning_rate": 4.305192933847687e-05, + "loss": 0.1568, + "step": 17877 + }, + { + "epoch": 0.3188741839974316, + "grad_norm": 0.2687775790691376, + "learning_rate": 4.305085249206184e-05, + "loss": 0.1418, + "step": 17878 + }, + { + "epoch": 0.3188920201191453, + "grad_norm": 0.43368518352508545, + "learning_rate": 4.3049775575675474e-05, + "loss": 0.1, + "step": 17879 + }, + { + "epoch": 0.318909856240859, + "grad_norm": 0.31225666403770447, + "learning_rate": 4.304869858932195e-05, + "loss": 0.1754, + "step": 17880 + }, + { + "epoch": 0.3189276923625727, + "grad_norm": 0.22142820060253143, + "learning_rate": 4.304762153300543e-05, + "loss": 0.1537, + "step": 17881 + }, + { + "epoch": 0.31894552848428637, + "grad_norm": 0.2720910608768463, + "learning_rate": 4.30465444067301e-05, + "loss": 0.1557, + "step": 17882 + }, + { + "epoch": 0.31896336460600005, + "grad_norm": 0.2982274293899536, + "learning_rate": 4.3045467210500125e-05, + "loss": 0.1774, + "step": 17883 + }, + { + "epoch": 0.31898120072771374, + "grad_norm": 0.22959260642528534, + "learning_rate": 4.304438994431969e-05, + "loss": 0.1626, + "step": 17884 + }, + { + "epoch": 0.3189990368494275, + "grad_norm": 0.28869178891181946, + "learning_rate": 4.304331260819297e-05, + "loss": 0.1382, + "step": 17885 + }, + { + "epoch": 0.3190168729711412, + "grad_norm": 0.2013123631477356, + "learning_rate": 4.304223520212413e-05, + "loss": 0.1295, + "step": 17886 + }, + { + "epoch": 0.31903470909285486, + "grad_norm": 0.26151391863822937, + "learning_rate": 4.304115772611736e-05, + "loss": 0.1606, + "step": 17887 + }, + { + "epoch": 0.31905254521456855, + "grad_norm": 0.2295144945383072, + "learning_rate": 4.304008018017683e-05, + "loss": 0.1762, + "step": 17888 + }, + { + "epoch": 0.31907038133628224, + "grad_norm": 0.23897890746593475, + "learning_rate": 4.303900256430672e-05, + "loss": 0.1516, + "step": 17889 + }, + { + "epoch": 0.3190882174579959, + "grad_norm": 0.2275541126728058, + "learning_rate": 4.30379248785112e-05, + "loss": 0.1847, + "step": 17890 + }, + { + "epoch": 0.3191060535797096, + "grad_norm": 0.3158656358718872, + "learning_rate": 4.303684712279446e-05, + "loss": 0.2082, + "step": 17891 + }, + { + "epoch": 0.3191238897014233, + "grad_norm": 0.29559525847435, + "learning_rate": 4.303576929716067e-05, + "loss": 0.1768, + "step": 17892 + }, + { + "epoch": 0.319141725823137, + "grad_norm": 0.2867787778377533, + "learning_rate": 4.3034691401614e-05, + "loss": 0.179, + "step": 17893 + }, + { + "epoch": 0.31915956194485073, + "grad_norm": 0.23168174922466278, + "learning_rate": 4.303361343615865e-05, + "loss": 0.1506, + "step": 17894 + }, + { + "epoch": 0.3191773980665644, + "grad_norm": 0.24208880960941315, + "learning_rate": 4.303253540079878e-05, + "loss": 0.2005, + "step": 17895 + }, + { + "epoch": 0.3191952341882781, + "grad_norm": 0.3277707099914551, + "learning_rate": 4.303145729553858e-05, + "loss": 0.1329, + "step": 17896 + }, + { + "epoch": 0.3192130703099918, + "grad_norm": 0.24475747346878052, + "learning_rate": 4.3030379120382216e-05, + "loss": 0.1708, + "step": 17897 + }, + { + "epoch": 0.3192309064317055, + "grad_norm": 0.3401337265968323, + "learning_rate": 4.3029300875333875e-05, + "loss": 0.161, + "step": 17898 + }, + { + "epoch": 0.3192487425534192, + "grad_norm": 0.2839389145374298, + "learning_rate": 4.302822256039774e-05, + "loss": 0.2711, + "step": 17899 + }, + { + "epoch": 0.31926657867513286, + "grad_norm": 0.29284125566482544, + "learning_rate": 4.3027144175577984e-05, + "loss": 0.1699, + "step": 17900 + }, + { + "epoch": 0.31928441479684655, + "grad_norm": 0.2563447952270508, + "learning_rate": 4.3026065720878796e-05, + "loss": 0.1682, + "step": 17901 + }, + { + "epoch": 0.3193022509185603, + "grad_norm": 0.24518202245235443, + "learning_rate": 4.3024987196304344e-05, + "loss": 0.1915, + "step": 17902 + }, + { + "epoch": 0.319320087040274, + "grad_norm": 0.22739307582378387, + "learning_rate": 4.302390860185883e-05, + "loss": 0.169, + "step": 17903 + }, + { + "epoch": 0.31933792316198767, + "grad_norm": 0.25473618507385254, + "learning_rate": 4.3022829937546404e-05, + "loss": 0.1501, + "step": 17904 + }, + { + "epoch": 0.31935575928370136, + "grad_norm": 0.23113799095153809, + "learning_rate": 4.302175120337128e-05, + "loss": 0.1184, + "step": 17905 + }, + { + "epoch": 0.31937359540541504, + "grad_norm": 0.23945507407188416, + "learning_rate": 4.3020672399337616e-05, + "loss": 0.1226, + "step": 17906 + }, + { + "epoch": 0.31939143152712873, + "grad_norm": 0.38307005167007446, + "learning_rate": 4.3019593525449596e-05, + "loss": 0.1922, + "step": 17907 + }, + { + "epoch": 0.3194092676488424, + "grad_norm": 0.43980878591537476, + "learning_rate": 4.301851458171141e-05, + "loss": 0.2004, + "step": 17908 + }, + { + "epoch": 0.3194271037705561, + "grad_norm": 0.3586626648902893, + "learning_rate": 4.3017435568127246e-05, + "loss": 0.134, + "step": 17909 + }, + { + "epoch": 0.31944493989226985, + "grad_norm": 0.2454012632369995, + "learning_rate": 4.301635648470127e-05, + "loss": 0.1572, + "step": 17910 + }, + { + "epoch": 0.31946277601398354, + "grad_norm": 0.26246950030326843, + "learning_rate": 4.3015277331437675e-05, + "loss": 0.1736, + "step": 17911 + }, + { + "epoch": 0.31948061213569723, + "grad_norm": 0.5175898671150208, + "learning_rate": 4.301419810834065e-05, + "loss": 0.262, + "step": 17912 + }, + { + "epoch": 0.3194984482574109, + "grad_norm": 0.17227597534656525, + "learning_rate": 4.3013118815414365e-05, + "loss": 0.1416, + "step": 17913 + }, + { + "epoch": 0.3195162843791246, + "grad_norm": 0.2439941018819809, + "learning_rate": 4.3012039452663014e-05, + "loss": 0.1202, + "step": 17914 + }, + { + "epoch": 0.3195341205008383, + "grad_norm": 0.3642473816871643, + "learning_rate": 4.301096002009077e-05, + "loss": 0.1716, + "step": 17915 + }, + { + "epoch": 0.319551956622552, + "grad_norm": 0.29081660509109497, + "learning_rate": 4.3009880517701836e-05, + "loss": 0.2104, + "step": 17916 + }, + { + "epoch": 0.31956979274426567, + "grad_norm": 0.26018771529197693, + "learning_rate": 4.300880094550037e-05, + "loss": 0.1824, + "step": 17917 + }, + { + "epoch": 0.31958762886597936, + "grad_norm": 0.26684725284576416, + "learning_rate": 4.3007721303490586e-05, + "loss": 0.1701, + "step": 17918 + }, + { + "epoch": 0.3196054649876931, + "grad_norm": 0.2873651087284088, + "learning_rate": 4.3006641591676645e-05, + "loss": 0.2203, + "step": 17919 + }, + { + "epoch": 0.3196233011094068, + "grad_norm": 0.2741551399230957, + "learning_rate": 4.3005561810062745e-05, + "loss": 0.1628, + "step": 17920 + }, + { + "epoch": 0.3196411372311205, + "grad_norm": 0.22419938445091248, + "learning_rate": 4.3004481958653065e-05, + "loss": 0.159, + "step": 17921 + }, + { + "epoch": 0.31965897335283416, + "grad_norm": 0.2226715385913849, + "learning_rate": 4.30034020374518e-05, + "loss": 0.1063, + "step": 17922 + }, + { + "epoch": 0.31967680947454785, + "grad_norm": 0.33590811491012573, + "learning_rate": 4.3002322046463125e-05, + "loss": 0.1508, + "step": 17923 + }, + { + "epoch": 0.31969464559626154, + "grad_norm": 0.23108139634132385, + "learning_rate": 4.3001241985691234e-05, + "loss": 0.1979, + "step": 17924 + }, + { + "epoch": 0.3197124817179752, + "grad_norm": 0.18934516608715057, + "learning_rate": 4.3000161855140315e-05, + "loss": 0.2001, + "step": 17925 + }, + { + "epoch": 0.3197303178396889, + "grad_norm": 0.255266934633255, + "learning_rate": 4.299908165481455e-05, + "loss": 0.1581, + "step": 17926 + }, + { + "epoch": 0.31974815396140266, + "grad_norm": 0.24443794786930084, + "learning_rate": 4.299800138471812e-05, + "loss": 0.1445, + "step": 17927 + }, + { + "epoch": 0.31976599008311635, + "grad_norm": 0.273963063955307, + "learning_rate": 4.299692104485523e-05, + "loss": 0.1677, + "step": 17928 + }, + { + "epoch": 0.31978382620483004, + "grad_norm": 0.272589772939682, + "learning_rate": 4.299584063523006e-05, + "loss": 0.1651, + "step": 17929 + }, + { + "epoch": 0.3198016623265437, + "grad_norm": 0.5103332996368408, + "learning_rate": 4.299476015584679e-05, + "loss": 0.2404, + "step": 17930 + }, + { + "epoch": 0.3198194984482574, + "grad_norm": 0.21750696003437042, + "learning_rate": 4.299367960670961e-05, + "loss": 0.184, + "step": 17931 + }, + { + "epoch": 0.3198373345699711, + "grad_norm": 0.23367401957511902, + "learning_rate": 4.2992598987822725e-05, + "loss": 0.1895, + "step": 17932 + }, + { + "epoch": 0.3198551706916848, + "grad_norm": 0.23731425404548645, + "learning_rate": 4.2991518299190305e-05, + "loss": 0.1918, + "step": 17933 + }, + { + "epoch": 0.3198730068133985, + "grad_norm": 0.2755047678947449, + "learning_rate": 4.2990437540816546e-05, + "loss": 0.2193, + "step": 17934 + }, + { + "epoch": 0.31989084293511216, + "grad_norm": 0.24228043854236603, + "learning_rate": 4.2989356712705636e-05, + "loss": 0.1282, + "step": 17935 + }, + { + "epoch": 0.3199086790568259, + "grad_norm": 0.22782939672470093, + "learning_rate": 4.298827581486177e-05, + "loss": 0.1886, + "step": 17936 + }, + { + "epoch": 0.3199265151785396, + "grad_norm": 0.24840177595615387, + "learning_rate": 4.298719484728913e-05, + "loss": 0.1684, + "step": 17937 + }, + { + "epoch": 0.3199443513002533, + "grad_norm": 0.3157387971878052, + "learning_rate": 4.298611380999191e-05, + "loss": 0.1706, + "step": 17938 + }, + { + "epoch": 0.31996218742196697, + "grad_norm": 0.2063383311033249, + "learning_rate": 4.2985032702974303e-05, + "loss": 0.1452, + "step": 17939 + }, + { + "epoch": 0.31998002354368066, + "grad_norm": 0.3949030935764313, + "learning_rate": 4.29839515262405e-05, + "loss": 0.2054, + "step": 17940 + }, + { + "epoch": 0.31999785966539435, + "grad_norm": 0.31042471528053284, + "learning_rate": 4.2982870279794684e-05, + "loss": 0.2162, + "step": 17941 + }, + { + "epoch": 0.32001569578710803, + "grad_norm": 0.292270302772522, + "learning_rate": 4.2981788963641055e-05, + "loss": 0.127, + "step": 17942 + }, + { + "epoch": 0.3200335319088217, + "grad_norm": 0.2829902768135071, + "learning_rate": 4.2980707577783805e-05, + "loss": 0.1843, + "step": 17943 + }, + { + "epoch": 0.32005136803053547, + "grad_norm": 0.3233446180820465, + "learning_rate": 4.297962612222712e-05, + "loss": 0.1795, + "step": 17944 + }, + { + "epoch": 0.32006920415224915, + "grad_norm": 0.24719838798046112, + "learning_rate": 4.297854459697519e-05, + "loss": 0.1604, + "step": 17945 + }, + { + "epoch": 0.32008704027396284, + "grad_norm": 0.2635244131088257, + "learning_rate": 4.2977463002032214e-05, + "loss": 0.1642, + "step": 17946 + }, + { + "epoch": 0.32010487639567653, + "grad_norm": 0.2845657467842102, + "learning_rate": 4.297638133740238e-05, + "loss": 0.2115, + "step": 17947 + }, + { + "epoch": 0.3201227125173902, + "grad_norm": 0.26517805457115173, + "learning_rate": 4.297529960308988e-05, + "loss": 0.2183, + "step": 17948 + }, + { + "epoch": 0.3201405486391039, + "grad_norm": 0.2887776494026184, + "learning_rate": 4.297421779909892e-05, + "loss": 0.1804, + "step": 17949 + }, + { + "epoch": 0.3201583847608176, + "grad_norm": 0.29620370268821716, + "learning_rate": 4.297313592543368e-05, + "loss": 0.1338, + "step": 17950 + }, + { + "epoch": 0.3201762208825313, + "grad_norm": 0.3269090950489044, + "learning_rate": 4.297205398209836e-05, + "loss": 0.1748, + "step": 17951 + }, + { + "epoch": 0.32019405700424497, + "grad_norm": 0.4068647623062134, + "learning_rate": 4.297097196909714e-05, + "loss": 0.1703, + "step": 17952 + }, + { + "epoch": 0.3202118931259587, + "grad_norm": 0.2973577678203583, + "learning_rate": 4.2969889886434236e-05, + "loss": 0.2525, + "step": 17953 + }, + { + "epoch": 0.3202297292476724, + "grad_norm": 0.27513816952705383, + "learning_rate": 4.296880773411383e-05, + "loss": 0.1818, + "step": 17954 + }, + { + "epoch": 0.3202475653693861, + "grad_norm": 0.30197274684906006, + "learning_rate": 4.296772551214012e-05, + "loss": 0.1695, + "step": 17955 + }, + { + "epoch": 0.3202654014910998, + "grad_norm": 0.2208343744277954, + "learning_rate": 4.29666432205173e-05, + "loss": 0.114, + "step": 17956 + }, + { + "epoch": 0.32028323761281347, + "grad_norm": 0.31481072306632996, + "learning_rate": 4.2965560859249566e-05, + "loss": 0.2217, + "step": 17957 + }, + { + "epoch": 0.32030107373452715, + "grad_norm": 0.2951738238334656, + "learning_rate": 4.2964478428341104e-05, + "loss": 0.2324, + "step": 17958 + }, + { + "epoch": 0.32031890985624084, + "grad_norm": 0.39909499883651733, + "learning_rate": 4.2963395927796125e-05, + "loss": 0.1922, + "step": 17959 + }, + { + "epoch": 0.32033674597795453, + "grad_norm": 0.2836631238460541, + "learning_rate": 4.2962313357618824e-05, + "loss": 0.1942, + "step": 17960 + }, + { + "epoch": 0.3203545820996683, + "grad_norm": 0.22878852486610413, + "learning_rate": 4.296123071781339e-05, + "loss": 0.1746, + "step": 17961 + }, + { + "epoch": 0.32037241822138196, + "grad_norm": 0.19215981662273407, + "learning_rate": 4.2960148008384014e-05, + "loss": 0.1215, + "step": 17962 + }, + { + "epoch": 0.32039025434309565, + "grad_norm": 0.3427749574184418, + "learning_rate": 4.2959065229334913e-05, + "loss": 0.189, + "step": 17963 + }, + { + "epoch": 0.32040809046480934, + "grad_norm": 0.31281399726867676, + "learning_rate": 4.295798238067026e-05, + "loss": 0.1841, + "step": 17964 + }, + { + "epoch": 0.320425926586523, + "grad_norm": 0.4562949240207672, + "learning_rate": 4.2956899462394275e-05, + "loss": 0.1742, + "step": 17965 + }, + { + "epoch": 0.3204437627082367, + "grad_norm": 0.33912232518196106, + "learning_rate": 4.295581647451115e-05, + "loss": 0.1455, + "step": 17966 + }, + { + "epoch": 0.3204615988299504, + "grad_norm": 0.2856380045413971, + "learning_rate": 4.2954733417025065e-05, + "loss": 0.1739, + "step": 17967 + }, + { + "epoch": 0.3204794349516641, + "grad_norm": 0.3533669114112854, + "learning_rate": 4.295365028994024e-05, + "loss": 0.1713, + "step": 17968 + }, + { + "epoch": 0.32049727107337783, + "grad_norm": 0.267322838306427, + "learning_rate": 4.2952567093260864e-05, + "loss": 0.1324, + "step": 17969 + }, + { + "epoch": 0.3205151071950915, + "grad_norm": 0.31280526518821716, + "learning_rate": 4.2951483826991135e-05, + "loss": 0.1922, + "step": 17970 + }, + { + "epoch": 0.3205329433168052, + "grad_norm": 0.251764178276062, + "learning_rate": 4.295040049113526e-05, + "loss": 0.2005, + "step": 17971 + }, + { + "epoch": 0.3205507794385189, + "grad_norm": 0.2617865800857544, + "learning_rate": 4.2949317085697426e-05, + "loss": 0.1818, + "step": 17972 + }, + { + "epoch": 0.3205686155602326, + "grad_norm": 0.35505223274230957, + "learning_rate": 4.294823361068184e-05, + "loss": 0.2046, + "step": 17973 + }, + { + "epoch": 0.32058645168194627, + "grad_norm": 0.4510609805583954, + "learning_rate": 4.29471500660927e-05, + "loss": 0.1637, + "step": 17974 + }, + { + "epoch": 0.32060428780365996, + "grad_norm": 0.2971402108669281, + "learning_rate": 4.294606645193422e-05, + "loss": 0.2013, + "step": 17975 + }, + { + "epoch": 0.32062212392537365, + "grad_norm": 0.32822614908218384, + "learning_rate": 4.2944982768210576e-05, + "loss": 0.1625, + "step": 17976 + }, + { + "epoch": 0.32063996004708734, + "grad_norm": 0.2518485486507416, + "learning_rate": 4.294389901492598e-05, + "loss": 0.1526, + "step": 17977 + }, + { + "epoch": 0.3206577961688011, + "grad_norm": 0.2990824282169342, + "learning_rate": 4.294281519208464e-05, + "loss": 0.1523, + "step": 17978 + }, + { + "epoch": 0.32067563229051477, + "grad_norm": 0.33606094121932983, + "learning_rate": 4.294173129969075e-05, + "loss": 0.1609, + "step": 17979 + }, + { + "epoch": 0.32069346841222846, + "grad_norm": 0.2032424807548523, + "learning_rate": 4.294064733774851e-05, + "loss": 0.1142, + "step": 17980 + }, + { + "epoch": 0.32071130453394214, + "grad_norm": 0.23036698997020721, + "learning_rate": 4.2939563306262126e-05, + "loss": 0.1897, + "step": 17981 + }, + { + "epoch": 0.32072914065565583, + "grad_norm": 0.2209492176771164, + "learning_rate": 4.2938479205235803e-05, + "loss": 0.1688, + "step": 17982 + }, + { + "epoch": 0.3207469767773695, + "grad_norm": 0.2883176803588867, + "learning_rate": 4.2937395034673734e-05, + "loss": 0.1577, + "step": 17983 + }, + { + "epoch": 0.3207648128990832, + "grad_norm": 0.19783201813697815, + "learning_rate": 4.2936310794580125e-05, + "loss": 0.145, + "step": 17984 + }, + { + "epoch": 0.3207826490207969, + "grad_norm": 0.2921576201915741, + "learning_rate": 4.293522648495918e-05, + "loss": 0.1858, + "step": 17985 + }, + { + "epoch": 0.32080048514251064, + "grad_norm": 0.3217620849609375, + "learning_rate": 4.293414210581511e-05, + "loss": 0.1595, + "step": 17986 + }, + { + "epoch": 0.3208183212642243, + "grad_norm": 0.2626230716705322, + "learning_rate": 4.29330576571521e-05, + "loss": 0.1856, + "step": 17987 + }, + { + "epoch": 0.320836157385938, + "grad_norm": 0.30646297335624695, + "learning_rate": 4.293197313897438e-05, + "loss": 0.1783, + "step": 17988 + }, + { + "epoch": 0.3208539935076517, + "grad_norm": 0.23388750851154327, + "learning_rate": 4.293088855128612e-05, + "loss": 0.164, + "step": 17989 + }, + { + "epoch": 0.3208718296293654, + "grad_norm": 0.26077958941459656, + "learning_rate": 4.2929803894091555e-05, + "loss": 0.1739, + "step": 17990 + }, + { + "epoch": 0.3208896657510791, + "grad_norm": 0.2520488202571869, + "learning_rate": 4.292871916739487e-05, + "loss": 0.1531, + "step": 17991 + }, + { + "epoch": 0.32090750187279277, + "grad_norm": 0.276787668466568, + "learning_rate": 4.292763437120029e-05, + "loss": 0.184, + "step": 17992 + }, + { + "epoch": 0.32092533799450645, + "grad_norm": 0.2858846187591553, + "learning_rate": 4.292654950551199e-05, + "loss": 0.1568, + "step": 17993 + }, + { + "epoch": 0.32094317411622014, + "grad_norm": 0.301724910736084, + "learning_rate": 4.292546457033421e-05, + "loss": 0.1696, + "step": 17994 + }, + { + "epoch": 0.3209610102379339, + "grad_norm": 0.24382881820201874, + "learning_rate": 4.292437956567113e-05, + "loss": 0.1827, + "step": 17995 + }, + { + "epoch": 0.3209788463596476, + "grad_norm": 0.3244045674800873, + "learning_rate": 4.292329449152696e-05, + "loss": 0.1548, + "step": 17996 + }, + { + "epoch": 0.32099668248136126, + "grad_norm": 0.24747657775878906, + "learning_rate": 4.2922209347905907e-05, + "loss": 0.1354, + "step": 17997 + }, + { + "epoch": 0.32101451860307495, + "grad_norm": 0.2584995627403259, + "learning_rate": 4.292112413481218e-05, + "loss": 0.2232, + "step": 17998 + }, + { + "epoch": 0.32103235472478864, + "grad_norm": 0.23093536496162415, + "learning_rate": 4.292003885225e-05, + "loss": 0.1428, + "step": 17999 + }, + { + "epoch": 0.3210501908465023, + "grad_norm": 0.23213060200214386, + "learning_rate": 4.291895350022356e-05, + "loss": 0.1582, + "step": 18000 + }, + { + "epoch": 0.3210501908465023, + "eval_loss": 0.16559597849845886, + "eval_runtime": 106.6244, + "eval_samples_per_second": 9.604, + "eval_steps_per_second": 1.604, + "step": 18000 + }, + { + "epoch": 0.321068026968216, + "grad_norm": 0.5585169196128845, + "learning_rate": 4.2917868078737056e-05, + "loss": 0.215, + "step": 18001 + }, + { + "epoch": 0.3210858630899297, + "grad_norm": 0.36891061067581177, + "learning_rate": 4.2916782587794705e-05, + "loss": 0.1554, + "step": 18002 + }, + { + "epoch": 0.32110369921164345, + "grad_norm": 0.24026136100292206, + "learning_rate": 4.291569702740073e-05, + "loss": 0.1681, + "step": 18003 + }, + { + "epoch": 0.32112153533335713, + "grad_norm": 0.31819137930870056, + "learning_rate": 4.291461139755931e-05, + "loss": 0.1581, + "step": 18004 + }, + { + "epoch": 0.3211393714550708, + "grad_norm": 0.3256298005580902, + "learning_rate": 4.291352569827467e-05, + "loss": 0.1932, + "step": 18005 + }, + { + "epoch": 0.3211572075767845, + "grad_norm": 0.31874772906303406, + "learning_rate": 4.291243992955103e-05, + "loss": 0.2097, + "step": 18006 + }, + { + "epoch": 0.3211750436984982, + "grad_norm": 0.2279292643070221, + "learning_rate": 4.291135409139258e-05, + "loss": 0.179, + "step": 18007 + }, + { + "epoch": 0.3211928798202119, + "grad_norm": 0.24464398622512817, + "learning_rate": 4.2910268183803535e-05, + "loss": 0.2034, + "step": 18008 + }, + { + "epoch": 0.3212107159419256, + "grad_norm": 0.26906678080558777, + "learning_rate": 4.29091822067881e-05, + "loss": 0.1958, + "step": 18009 + }, + { + "epoch": 0.32122855206363926, + "grad_norm": 0.2615545094013214, + "learning_rate": 4.29080961603505e-05, + "loss": 0.1901, + "step": 18010 + }, + { + "epoch": 0.321246388185353, + "grad_norm": 0.23821043968200684, + "learning_rate": 4.290701004449492e-05, + "loss": 0.1626, + "step": 18011 + }, + { + "epoch": 0.3212642243070667, + "grad_norm": 0.2942897379398346, + "learning_rate": 4.2905923859225595e-05, + "loss": 0.1616, + "step": 18012 + }, + { + "epoch": 0.3212820604287804, + "grad_norm": 0.21033979952335358, + "learning_rate": 4.2904837604546724e-05, + "loss": 0.1649, + "step": 18013 + }, + { + "epoch": 0.32129989655049407, + "grad_norm": 0.36140936613082886, + "learning_rate": 4.290375128046251e-05, + "loss": 0.1705, + "step": 18014 + }, + { + "epoch": 0.32131773267220776, + "grad_norm": 0.3626006245613098, + "learning_rate": 4.2902664886977185e-05, + "loss": 0.143, + "step": 18015 + }, + { + "epoch": 0.32133556879392144, + "grad_norm": 0.34682902693748474, + "learning_rate": 4.290157842409493e-05, + "loss": 0.1967, + "step": 18016 + }, + { + "epoch": 0.32135340491563513, + "grad_norm": 0.23780466616153717, + "learning_rate": 4.290049189181999e-05, + "loss": 0.1848, + "step": 18017 + }, + { + "epoch": 0.3213712410373488, + "grad_norm": 0.346086323261261, + "learning_rate": 4.2899405290156555e-05, + "loss": 0.1945, + "step": 18018 + }, + { + "epoch": 0.3213890771590625, + "grad_norm": 0.6469464898109436, + "learning_rate": 4.289831861910885e-05, + "loss": 0.146, + "step": 18019 + }, + { + "epoch": 0.32140691328077625, + "grad_norm": 0.26612937450408936, + "learning_rate": 4.2897231878681064e-05, + "loss": 0.1983, + "step": 18020 + }, + { + "epoch": 0.32142474940248994, + "grad_norm": 0.28209903836250305, + "learning_rate": 4.289614506887743e-05, + "loss": 0.2231, + "step": 18021 + }, + { + "epoch": 0.32144258552420363, + "grad_norm": 0.22776180505752563, + "learning_rate": 4.2895058189702163e-05, + "loss": 0.1589, + "step": 18022 + }, + { + "epoch": 0.3214604216459173, + "grad_norm": 0.23473331332206726, + "learning_rate": 4.289397124115947e-05, + "loss": 0.2098, + "step": 18023 + }, + { + "epoch": 0.321478257767631, + "grad_norm": 0.3704344928264618, + "learning_rate": 4.2892884223253565e-05, + "loss": 0.2588, + "step": 18024 + }, + { + "epoch": 0.3214960938893447, + "grad_norm": 0.22363069653511047, + "learning_rate": 4.289179713598865e-05, + "loss": 0.1614, + "step": 18025 + }, + { + "epoch": 0.3215139300110584, + "grad_norm": 0.30653658509254456, + "learning_rate": 4.289070997936897e-05, + "loss": 0.1998, + "step": 18026 + }, + { + "epoch": 0.32153176613277207, + "grad_norm": 0.2966502606868744, + "learning_rate": 4.2889622753398703e-05, + "loss": 0.2054, + "step": 18027 + }, + { + "epoch": 0.3215496022544858, + "grad_norm": 0.23826436698436737, + "learning_rate": 4.288853545808208e-05, + "loss": 0.1506, + "step": 18028 + }, + { + "epoch": 0.3215674383761995, + "grad_norm": 0.22722485661506653, + "learning_rate": 4.288744809342332e-05, + "loss": 0.1643, + "step": 18029 + }, + { + "epoch": 0.3215852744979132, + "grad_norm": 0.32047396898269653, + "learning_rate": 4.288636065942663e-05, + "loss": 0.143, + "step": 18030 + }, + { + "epoch": 0.3216031106196269, + "grad_norm": 0.28161951899528503, + "learning_rate": 4.2885273156096226e-05, + "loss": 0.1583, + "step": 18031 + }, + { + "epoch": 0.32162094674134056, + "grad_norm": 0.2806845009326935, + "learning_rate": 4.288418558343633e-05, + "loss": 0.1753, + "step": 18032 + }, + { + "epoch": 0.32163878286305425, + "grad_norm": 0.3080390691757202, + "learning_rate": 4.2883097941451155e-05, + "loss": 0.166, + "step": 18033 + }, + { + "epoch": 0.32165661898476794, + "grad_norm": 0.30956411361694336, + "learning_rate": 4.288201023014492e-05, + "loss": 0.2139, + "step": 18034 + }, + { + "epoch": 0.3216744551064816, + "grad_norm": 0.2906337380409241, + "learning_rate": 4.288092244952182e-05, + "loss": 0.1867, + "step": 18035 + }, + { + "epoch": 0.3216922912281953, + "grad_norm": 0.22089581191539764, + "learning_rate": 4.28798345995861e-05, + "loss": 0.1377, + "step": 18036 + }, + { + "epoch": 0.32171012734990906, + "grad_norm": 0.2225026786327362, + "learning_rate": 4.287874668034197e-05, + "loss": 0.133, + "step": 18037 + }, + { + "epoch": 0.32172796347162275, + "grad_norm": 0.27453580498695374, + "learning_rate": 4.287765869179364e-05, + "loss": 0.1601, + "step": 18038 + }, + { + "epoch": 0.32174579959333643, + "grad_norm": 0.25948649644851685, + "learning_rate": 4.287657063394532e-05, + "loss": 0.1507, + "step": 18039 + }, + { + "epoch": 0.3217636357150501, + "grad_norm": 0.2517499327659607, + "learning_rate": 4.287548250680124e-05, + "loss": 0.1689, + "step": 18040 + }, + { + "epoch": 0.3217814718367638, + "grad_norm": 0.23956286907196045, + "learning_rate": 4.2874394310365626e-05, + "loss": 0.176, + "step": 18041 + }, + { + "epoch": 0.3217993079584775, + "grad_norm": 0.22735492885112762, + "learning_rate": 4.2873306044642687e-05, + "loss": 0.1352, + "step": 18042 + }, + { + "epoch": 0.3218171440801912, + "grad_norm": 0.2797132134437561, + "learning_rate": 4.287221770963663e-05, + "loss": 0.1664, + "step": 18043 + }, + { + "epoch": 0.3218349802019049, + "grad_norm": 0.24348405003547668, + "learning_rate": 4.2871129305351694e-05, + "loss": 0.1413, + "step": 18044 + }, + { + "epoch": 0.3218528163236186, + "grad_norm": 0.21892757713794708, + "learning_rate": 4.287004083179208e-05, + "loss": 0.1761, + "step": 18045 + }, + { + "epoch": 0.3218706524453323, + "grad_norm": 0.31122085452079773, + "learning_rate": 4.286895228896202e-05, + "loss": 0.1875, + "step": 18046 + }, + { + "epoch": 0.321888488567046, + "grad_norm": 0.20841598510742188, + "learning_rate": 4.2867863676865724e-05, + "loss": 0.1378, + "step": 18047 + }, + { + "epoch": 0.3219063246887597, + "grad_norm": 0.33254650235176086, + "learning_rate": 4.286677499550743e-05, + "loss": 0.2204, + "step": 18048 + }, + { + "epoch": 0.32192416081047337, + "grad_norm": 0.17228543758392334, + "learning_rate": 4.2865686244891334e-05, + "loss": 0.0946, + "step": 18049 + }, + { + "epoch": 0.32194199693218706, + "grad_norm": 0.2636859714984894, + "learning_rate": 4.2864597425021666e-05, + "loss": 0.1819, + "step": 18050 + }, + { + "epoch": 0.32195983305390075, + "grad_norm": 0.2608453929424286, + "learning_rate": 4.286350853590266e-05, + "loss": 0.2011, + "step": 18051 + }, + { + "epoch": 0.32197766917561443, + "grad_norm": 0.27565479278564453, + "learning_rate": 4.2862419577538516e-05, + "loss": 0.1788, + "step": 18052 + }, + { + "epoch": 0.3219955052973281, + "grad_norm": 0.3326278328895569, + "learning_rate": 4.286133054993346e-05, + "loss": 0.1506, + "step": 18053 + }, + { + "epoch": 0.32201334141904187, + "grad_norm": 0.3037939965724945, + "learning_rate": 4.2860241453091726e-05, + "loss": 0.2235, + "step": 18054 + }, + { + "epoch": 0.32203117754075555, + "grad_norm": 0.191414475440979, + "learning_rate": 4.285915228701752e-05, + "loss": 0.1257, + "step": 18055 + }, + { + "epoch": 0.32204901366246924, + "grad_norm": 0.28974199295043945, + "learning_rate": 4.285806305171508e-05, + "loss": 0.1823, + "step": 18056 + }, + { + "epoch": 0.32206684978418293, + "grad_norm": 0.24226349592208862, + "learning_rate": 4.285697374718862e-05, + "loss": 0.1901, + "step": 18057 + }, + { + "epoch": 0.3220846859058966, + "grad_norm": 0.376336932182312, + "learning_rate": 4.285588437344236e-05, + "loss": 0.248, + "step": 18058 + }, + { + "epoch": 0.3221025220276103, + "grad_norm": 0.36374542117118835, + "learning_rate": 4.285479493048052e-05, + "loss": 0.1721, + "step": 18059 + }, + { + "epoch": 0.322120358149324, + "grad_norm": 0.25528842210769653, + "learning_rate": 4.285370541830733e-05, + "loss": 0.175, + "step": 18060 + }, + { + "epoch": 0.3221381942710377, + "grad_norm": 0.25853872299194336, + "learning_rate": 4.2852615836927015e-05, + "loss": 0.147, + "step": 18061 + }, + { + "epoch": 0.3221560303927514, + "grad_norm": 0.3118174374103546, + "learning_rate": 4.2851526186343785e-05, + "loss": 0.1511, + "step": 18062 + }, + { + "epoch": 0.3221738665144651, + "grad_norm": 0.2361362725496292, + "learning_rate": 4.2850436466561886e-05, + "loss": 0.1922, + "step": 18063 + }, + { + "epoch": 0.3221917026361788, + "grad_norm": 0.2701319754123688, + "learning_rate": 4.284934667758552e-05, + "loss": 0.165, + "step": 18064 + }, + { + "epoch": 0.3222095387578925, + "grad_norm": 0.2908462584018707, + "learning_rate": 4.284825681941893e-05, + "loss": 0.1274, + "step": 18065 + }, + { + "epoch": 0.3222273748796062, + "grad_norm": 0.23834800720214844, + "learning_rate": 4.284716689206633e-05, + "loss": 0.1738, + "step": 18066 + }, + { + "epoch": 0.32224521100131986, + "grad_norm": 0.27812689542770386, + "learning_rate": 4.284607689553194e-05, + "loss": 0.1772, + "step": 18067 + }, + { + "epoch": 0.32226304712303355, + "grad_norm": 0.5655857920646667, + "learning_rate": 4.284498682982e-05, + "loss": 0.1929, + "step": 18068 + }, + { + "epoch": 0.32228088324474724, + "grad_norm": 0.2239859402179718, + "learning_rate": 4.2843896694934725e-05, + "loss": 0.1858, + "step": 18069 + }, + { + "epoch": 0.322298719366461, + "grad_norm": 0.28623759746551514, + "learning_rate": 4.284280649088034e-05, + "loss": 0.1417, + "step": 18070 + }, + { + "epoch": 0.32231655548817467, + "grad_norm": 0.2485145926475525, + "learning_rate": 4.284171621766108e-05, + "loss": 0.1754, + "step": 18071 + }, + { + "epoch": 0.32233439160988836, + "grad_norm": 0.45249781012535095, + "learning_rate": 4.284062587528116e-05, + "loss": 0.1542, + "step": 18072 + }, + { + "epoch": 0.32235222773160205, + "grad_norm": 0.22429540753364563, + "learning_rate": 4.283953546374482e-05, + "loss": 0.1805, + "step": 18073 + }, + { + "epoch": 0.32237006385331574, + "grad_norm": 0.3011311888694763, + "learning_rate": 4.283844498305627e-05, + "loss": 0.1842, + "step": 18074 + }, + { + "epoch": 0.3223878999750294, + "grad_norm": 0.28535404801368713, + "learning_rate": 4.283735443321975e-05, + "loss": 0.1747, + "step": 18075 + }, + { + "epoch": 0.3224057360967431, + "grad_norm": 0.16767239570617676, + "learning_rate": 4.2836263814239485e-05, + "loss": 0.1412, + "step": 18076 + }, + { + "epoch": 0.3224235722184568, + "grad_norm": 0.2633972764015198, + "learning_rate": 4.28351731261197e-05, + "loss": 0.196, + "step": 18077 + }, + { + "epoch": 0.3224414083401705, + "grad_norm": 0.2889309525489807, + "learning_rate": 4.283408236886462e-05, + "loss": 0.1878, + "step": 18078 + }, + { + "epoch": 0.32245924446188423, + "grad_norm": 0.3738020956516266, + "learning_rate": 4.283299154247849e-05, + "loss": 0.2166, + "step": 18079 + }, + { + "epoch": 0.3224770805835979, + "grad_norm": 0.26379501819610596, + "learning_rate": 4.2831900646965506e-05, + "loss": 0.1685, + "step": 18080 + }, + { + "epoch": 0.3224949167053116, + "grad_norm": 0.27132490277290344, + "learning_rate": 4.2830809682329926e-05, + "loss": 0.1802, + "step": 18081 + }, + { + "epoch": 0.3225127528270253, + "grad_norm": 0.25412696599960327, + "learning_rate": 4.282971864857597e-05, + "loss": 0.1609, + "step": 18082 + }, + { + "epoch": 0.322530588948739, + "grad_norm": 0.30349597334861755, + "learning_rate": 4.282862754570787e-05, + "loss": 0.2038, + "step": 18083 + }, + { + "epoch": 0.32254842507045267, + "grad_norm": 0.24025368690490723, + "learning_rate": 4.282753637372984e-05, + "loss": 0.1752, + "step": 18084 + }, + { + "epoch": 0.32256626119216636, + "grad_norm": 0.30440738797187805, + "learning_rate": 4.282644513264613e-05, + "loss": 0.169, + "step": 18085 + }, + { + "epoch": 0.32258409731388005, + "grad_norm": 0.2684110105037689, + "learning_rate": 4.2825353822460965e-05, + "loss": 0.1651, + "step": 18086 + }, + { + "epoch": 0.3226019334355938, + "grad_norm": 0.25851064920425415, + "learning_rate": 4.282426244317857e-05, + "loss": 0.159, + "step": 18087 + }, + { + "epoch": 0.3226197695573075, + "grad_norm": 0.25092485547065735, + "learning_rate": 4.282317099480317e-05, + "loss": 0.1722, + "step": 18088 + }, + { + "epoch": 0.32263760567902117, + "grad_norm": 0.5327639579772949, + "learning_rate": 4.282207947733901e-05, + "loss": 0.2262, + "step": 18089 + }, + { + "epoch": 0.32265544180073485, + "grad_norm": 0.2172667533159256, + "learning_rate": 4.282098789079031e-05, + "loss": 0.1423, + "step": 18090 + }, + { + "epoch": 0.32267327792244854, + "grad_norm": 0.20758669078350067, + "learning_rate": 4.2819896235161305e-05, + "loss": 0.1493, + "step": 18091 + }, + { + "epoch": 0.32269111404416223, + "grad_norm": 0.252105176448822, + "learning_rate": 4.2818804510456235e-05, + "loss": 0.1884, + "step": 18092 + }, + { + "epoch": 0.3227089501658759, + "grad_norm": 0.2258213609457016, + "learning_rate": 4.2817712716679314e-05, + "loss": 0.1361, + "step": 18093 + }, + { + "epoch": 0.3227267862875896, + "grad_norm": 0.20108555257320404, + "learning_rate": 4.28166208538348e-05, + "loss": 0.1683, + "step": 18094 + }, + { + "epoch": 0.3227446224093033, + "grad_norm": 0.25176340341567993, + "learning_rate": 4.2815528921926896e-05, + "loss": 0.1391, + "step": 18095 + }, + { + "epoch": 0.32276245853101704, + "grad_norm": 0.19316431879997253, + "learning_rate": 4.2814436920959855e-05, + "loss": 0.1304, + "step": 18096 + }, + { + "epoch": 0.3227802946527307, + "grad_norm": 0.2648905813694, + "learning_rate": 4.28133448509379e-05, + "loss": 0.1787, + "step": 18097 + }, + { + "epoch": 0.3227981307744444, + "grad_norm": 0.2939186692237854, + "learning_rate": 4.2812252711865265e-05, + "loss": 0.1871, + "step": 18098 + }, + { + "epoch": 0.3228159668961581, + "grad_norm": 0.3293476700782776, + "learning_rate": 4.281116050374619e-05, + "loss": 0.1517, + "step": 18099 + }, + { + "epoch": 0.3228338030178718, + "grad_norm": 0.23596060276031494, + "learning_rate": 4.281006822658491e-05, + "loss": 0.1495, + "step": 18100 + }, + { + "epoch": 0.3228516391395855, + "grad_norm": 0.34427961707115173, + "learning_rate": 4.280897588038565e-05, + "loss": 0.1968, + "step": 18101 + }, + { + "epoch": 0.32286947526129917, + "grad_norm": 0.27826881408691406, + "learning_rate": 4.280788346515265e-05, + "loss": 0.1635, + "step": 18102 + }, + { + "epoch": 0.32288731138301285, + "grad_norm": 0.26027441024780273, + "learning_rate": 4.2806790980890144e-05, + "loss": 0.1905, + "step": 18103 + }, + { + "epoch": 0.3229051475047266, + "grad_norm": 0.36218979954719543, + "learning_rate": 4.280569842760236e-05, + "loss": 0.1578, + "step": 18104 + }, + { + "epoch": 0.3229229836264403, + "grad_norm": 0.29456502199172974, + "learning_rate": 4.280460580529354e-05, + "loss": 0.1556, + "step": 18105 + }, + { + "epoch": 0.322940819748154, + "grad_norm": 0.33097246289253235, + "learning_rate": 4.280351311396792e-05, + "loss": 0.1665, + "step": 18106 + }, + { + "epoch": 0.32295865586986766, + "grad_norm": 0.27826792001724243, + "learning_rate": 4.2802420353629733e-05, + "loss": 0.1446, + "step": 18107 + }, + { + "epoch": 0.32297649199158135, + "grad_norm": 0.3184787333011627, + "learning_rate": 4.280132752428322e-05, + "loss": 0.1545, + "step": 18108 + }, + { + "epoch": 0.32299432811329504, + "grad_norm": 0.3142584264278412, + "learning_rate": 4.280023462593261e-05, + "loss": 0.2329, + "step": 18109 + }, + { + "epoch": 0.3230121642350087, + "grad_norm": 0.24121132493019104, + "learning_rate": 4.2799141658582144e-05, + "loss": 0.1601, + "step": 18110 + }, + { + "epoch": 0.3230300003567224, + "grad_norm": 0.24124382436275482, + "learning_rate": 4.279804862223606e-05, + "loss": 0.1654, + "step": 18111 + }, + { + "epoch": 0.3230478364784361, + "grad_norm": 0.24795156717300415, + "learning_rate": 4.2796955516898584e-05, + "loss": 0.1704, + "step": 18112 + }, + { + "epoch": 0.32306567260014984, + "grad_norm": 0.20948849618434906, + "learning_rate": 4.279586234257397e-05, + "loss": 0.1289, + "step": 18113 + }, + { + "epoch": 0.32308350872186353, + "grad_norm": 0.2522103190422058, + "learning_rate": 4.279476909926644e-05, + "loss": 0.1711, + "step": 18114 + }, + { + "epoch": 0.3231013448435772, + "grad_norm": 0.2595446705818176, + "learning_rate": 4.2793675786980244e-05, + "loss": 0.1255, + "step": 18115 + }, + { + "epoch": 0.3231191809652909, + "grad_norm": 0.2632864713668823, + "learning_rate": 4.279258240571962e-05, + "loss": 0.1573, + "step": 18116 + }, + { + "epoch": 0.3231370170870046, + "grad_norm": 0.2851024866104126, + "learning_rate": 4.279148895548879e-05, + "loss": 0.1646, + "step": 18117 + }, + { + "epoch": 0.3231548532087183, + "grad_norm": 0.24163654446601868, + "learning_rate": 4.279039543629201e-05, + "loss": 0.1779, + "step": 18118 + }, + { + "epoch": 0.323172689330432, + "grad_norm": 0.28096872568130493, + "learning_rate": 4.278930184813351e-05, + "loss": 0.1786, + "step": 18119 + }, + { + "epoch": 0.32319052545214566, + "grad_norm": 0.3556716740131378, + "learning_rate": 4.278820819101753e-05, + "loss": 0.1745, + "step": 18120 + }, + { + "epoch": 0.3232083615738594, + "grad_norm": 0.3623379170894623, + "learning_rate": 4.278711446494832e-05, + "loss": 0.1723, + "step": 18121 + }, + { + "epoch": 0.3232261976955731, + "grad_norm": 0.20797869563102722, + "learning_rate": 4.27860206699301e-05, + "loss": 0.1804, + "step": 18122 + }, + { + "epoch": 0.3232440338172868, + "grad_norm": 0.35048994421958923, + "learning_rate": 4.278492680596713e-05, + "loss": 0.1948, + "step": 18123 + }, + { + "epoch": 0.32326186993900047, + "grad_norm": 0.5778950452804565, + "learning_rate": 4.2783832873063635e-05, + "loss": 0.2105, + "step": 18124 + }, + { + "epoch": 0.32327970606071416, + "grad_norm": 0.25398486852645874, + "learning_rate": 4.278273887122386e-05, + "loss": 0.2011, + "step": 18125 + }, + { + "epoch": 0.32329754218242784, + "grad_norm": 0.26170650124549866, + "learning_rate": 4.2781644800452055e-05, + "loss": 0.1197, + "step": 18126 + }, + { + "epoch": 0.32331537830414153, + "grad_norm": 0.26197847723960876, + "learning_rate": 4.278055066075245e-05, + "loss": 0.1496, + "step": 18127 + }, + { + "epoch": 0.3233332144258552, + "grad_norm": 0.32092928886413574, + "learning_rate": 4.2779456452129286e-05, + "loss": 0.1655, + "step": 18128 + }, + { + "epoch": 0.32335105054756896, + "grad_norm": 0.2509569227695465, + "learning_rate": 4.2778362174586805e-05, + "loss": 0.165, + "step": 18129 + }, + { + "epoch": 0.32336888666928265, + "grad_norm": 0.2260201871395111, + "learning_rate": 4.277726782812926e-05, + "loss": 0.1573, + "step": 18130 + }, + { + "epoch": 0.32338672279099634, + "grad_norm": 0.43924370408058167, + "learning_rate": 4.277617341276088e-05, + "loss": 0.2046, + "step": 18131 + }, + { + "epoch": 0.32340455891271, + "grad_norm": 0.18697215616703033, + "learning_rate": 4.2775078928485915e-05, + "loss": 0.1201, + "step": 18132 + }, + { + "epoch": 0.3234223950344237, + "grad_norm": 0.3008309304714203, + "learning_rate": 4.277398437530861e-05, + "loss": 0.2267, + "step": 18133 + }, + { + "epoch": 0.3234402311561374, + "grad_norm": 0.2293943166732788, + "learning_rate": 4.277288975323319e-05, + "loss": 0.1186, + "step": 18134 + }, + { + "epoch": 0.3234580672778511, + "grad_norm": 0.25892719626426697, + "learning_rate": 4.277179506226392e-05, + "loss": 0.1293, + "step": 18135 + }, + { + "epoch": 0.3234759033995648, + "grad_norm": 0.18936260044574738, + "learning_rate": 4.277070030240503e-05, + "loss": 0.1765, + "step": 18136 + }, + { + "epoch": 0.32349373952127847, + "grad_norm": 0.28534770011901855, + "learning_rate": 4.276960547366077e-05, + "loss": 0.1825, + "step": 18137 + }, + { + "epoch": 0.3235115756429922, + "grad_norm": 0.265889436006546, + "learning_rate": 4.2768510576035384e-05, + "loss": 0.1744, + "step": 18138 + }, + { + "epoch": 0.3235294117647059, + "grad_norm": 0.21549122035503387, + "learning_rate": 4.27674156095331e-05, + "loss": 0.1406, + "step": 18139 + }, + { + "epoch": 0.3235472478864196, + "grad_norm": 0.21736140549182892, + "learning_rate": 4.276632057415819e-05, + "loss": 0.1534, + "step": 18140 + }, + { + "epoch": 0.3235650840081333, + "grad_norm": 0.45892786979675293, + "learning_rate": 4.276522546991488e-05, + "loss": 0.1832, + "step": 18141 + }, + { + "epoch": 0.32358292012984696, + "grad_norm": 0.2051456868648529, + "learning_rate": 4.276413029680743e-05, + "loss": 0.135, + "step": 18142 + }, + { + "epoch": 0.32360075625156065, + "grad_norm": 0.39524784684181213, + "learning_rate": 4.2763035054840063e-05, + "loss": 0.1868, + "step": 18143 + }, + { + "epoch": 0.32361859237327434, + "grad_norm": 0.2699621021747589, + "learning_rate": 4.2761939744017046e-05, + "loss": 0.1755, + "step": 18144 + }, + { + "epoch": 0.323636428494988, + "grad_norm": 0.4233923554420471, + "learning_rate": 4.276084436434261e-05, + "loss": 0.1819, + "step": 18145 + }, + { + "epoch": 0.32365426461670177, + "grad_norm": 0.23033460974693298, + "learning_rate": 4.275974891582101e-05, + "loss": 0.1716, + "step": 18146 + }, + { + "epoch": 0.32367210073841546, + "grad_norm": 0.19312624633312225, + "learning_rate": 4.275865339845648e-05, + "loss": 0.1179, + "step": 18147 + }, + { + "epoch": 0.32368993686012915, + "grad_norm": 0.22600796818733215, + "learning_rate": 4.275755781225329e-05, + "loss": 0.1694, + "step": 18148 + }, + { + "epoch": 0.32370777298184283, + "grad_norm": 0.23917296528816223, + "learning_rate": 4.2756462157215663e-05, + "loss": 0.1751, + "step": 18149 + }, + { + "epoch": 0.3237256091035565, + "grad_norm": 0.29062923789024353, + "learning_rate": 4.275536643334786e-05, + "loss": 0.1288, + "step": 18150 + }, + { + "epoch": 0.3237434452252702, + "grad_norm": 0.25291189551353455, + "learning_rate": 4.2754270640654125e-05, + "loss": 0.1705, + "step": 18151 + }, + { + "epoch": 0.3237612813469839, + "grad_norm": 0.30322882533073425, + "learning_rate": 4.275317477913871e-05, + "loss": 0.1068, + "step": 18152 + }, + { + "epoch": 0.3237791174686976, + "grad_norm": 0.3245062828063965, + "learning_rate": 4.275207884880584e-05, + "loss": 0.1843, + "step": 18153 + }, + { + "epoch": 0.3237969535904113, + "grad_norm": 0.20447225868701935, + "learning_rate": 4.2750982849659795e-05, + "loss": 0.1439, + "step": 18154 + }, + { + "epoch": 0.323814789712125, + "grad_norm": 0.2562181055545807, + "learning_rate": 4.27498867817048e-05, + "loss": 0.1595, + "step": 18155 + }, + { + "epoch": 0.3238326258338387, + "grad_norm": 0.31041577458381653, + "learning_rate": 4.274879064494512e-05, + "loss": 0.1778, + "step": 18156 + }, + { + "epoch": 0.3238504619555524, + "grad_norm": 0.25057414174079895, + "learning_rate": 4.2747694439385e-05, + "loss": 0.1423, + "step": 18157 + }, + { + "epoch": 0.3238682980772661, + "grad_norm": 0.2723214328289032, + "learning_rate": 4.2746598165028686e-05, + "loss": 0.1843, + "step": 18158 + }, + { + "epoch": 0.32388613419897977, + "grad_norm": 0.29241931438446045, + "learning_rate": 4.274550182188042e-05, + "loss": 0.1871, + "step": 18159 + }, + { + "epoch": 0.32390397032069346, + "grad_norm": 0.22207072377204895, + "learning_rate": 4.274440540994447e-05, + "loss": 0.1774, + "step": 18160 + }, + { + "epoch": 0.32392180644240715, + "grad_norm": 0.29096317291259766, + "learning_rate": 4.274330892922507e-05, + "loss": 0.1869, + "step": 18161 + }, + { + "epoch": 0.32393964256412083, + "grad_norm": 0.2469264566898346, + "learning_rate": 4.2742212379726475e-05, + "loss": 0.1335, + "step": 18162 + }, + { + "epoch": 0.3239574786858346, + "grad_norm": 0.3547193109989166, + "learning_rate": 4.2741115761452944e-05, + "loss": 0.1721, + "step": 18163 + }, + { + "epoch": 0.32397531480754826, + "grad_norm": 0.2278842329978943, + "learning_rate": 4.274001907440871e-05, + "loss": 0.1411, + "step": 18164 + }, + { + "epoch": 0.32399315092926195, + "grad_norm": 0.2389397770166397, + "learning_rate": 4.273892231859804e-05, + "loss": 0.1863, + "step": 18165 + }, + { + "epoch": 0.32401098705097564, + "grad_norm": 0.3268055319786072, + "learning_rate": 4.273782549402519e-05, + "loss": 0.1431, + "step": 18166 + }, + { + "epoch": 0.32402882317268933, + "grad_norm": 0.24398286640644073, + "learning_rate": 4.2736728600694384e-05, + "loss": 0.1659, + "step": 18167 + }, + { + "epoch": 0.324046659294403, + "grad_norm": 0.2801712453365326, + "learning_rate": 4.27356316386099e-05, + "loss": 0.1773, + "step": 18168 + }, + { + "epoch": 0.3240644954161167, + "grad_norm": 0.22851204872131348, + "learning_rate": 4.273453460777599e-05, + "loss": 0.1555, + "step": 18169 + }, + { + "epoch": 0.3240823315378304, + "grad_norm": 0.25567877292633057, + "learning_rate": 4.2733437508196886e-05, + "loss": 0.255, + "step": 18170 + }, + { + "epoch": 0.32410016765954414, + "grad_norm": 0.19158430397510529, + "learning_rate": 4.2732340339876856e-05, + "loss": 0.142, + "step": 18171 + }, + { + "epoch": 0.3241180037812578, + "grad_norm": 0.27221187949180603, + "learning_rate": 4.2731243102820157e-05, + "loss": 0.1619, + "step": 18172 + }, + { + "epoch": 0.3241358399029715, + "grad_norm": 0.3070240318775177, + "learning_rate": 4.273014579703103e-05, + "loss": 0.1505, + "step": 18173 + }, + { + "epoch": 0.3241536760246852, + "grad_norm": 0.30097055435180664, + "learning_rate": 4.272904842251374e-05, + "loss": 0.1904, + "step": 18174 + }, + { + "epoch": 0.3241715121463989, + "grad_norm": 0.24105100333690643, + "learning_rate": 4.272795097927252e-05, + "loss": 0.1902, + "step": 18175 + }, + { + "epoch": 0.3241893482681126, + "grad_norm": 0.7637127041816711, + "learning_rate": 4.272685346731166e-05, + "loss": 0.2229, + "step": 18176 + }, + { + "epoch": 0.32420718438982626, + "grad_norm": 0.26520559191703796, + "learning_rate": 4.272575588663538e-05, + "loss": 0.1474, + "step": 18177 + }, + { + "epoch": 0.32422502051153995, + "grad_norm": 0.3035420775413513, + "learning_rate": 4.272465823724795e-05, + "loss": 0.1418, + "step": 18178 + }, + { + "epoch": 0.32424285663325364, + "grad_norm": 0.3116084635257721, + "learning_rate": 4.2723560519153625e-05, + "loss": 0.1664, + "step": 18179 + }, + { + "epoch": 0.3242606927549674, + "grad_norm": 0.2435222715139389, + "learning_rate": 4.272246273235665e-05, + "loss": 0.1146, + "step": 18180 + }, + { + "epoch": 0.32427852887668107, + "grad_norm": 0.2081175148487091, + "learning_rate": 4.2721364876861296e-05, + "loss": 0.1768, + "step": 18181 + }, + { + "epoch": 0.32429636499839476, + "grad_norm": 0.3875510096549988, + "learning_rate": 4.272026695267181e-05, + "loss": 0.257, + "step": 18182 + }, + { + "epoch": 0.32431420112010845, + "grad_norm": 0.3454514145851135, + "learning_rate": 4.2719168959792455e-05, + "loss": 0.1889, + "step": 18183 + }, + { + "epoch": 0.32433203724182214, + "grad_norm": 0.2673277258872986, + "learning_rate": 4.271807089822747e-05, + "loss": 0.203, + "step": 18184 + }, + { + "epoch": 0.3243498733635358, + "grad_norm": 0.2972363233566284, + "learning_rate": 4.2716972767981125e-05, + "loss": 0.2217, + "step": 18185 + }, + { + "epoch": 0.3243677094852495, + "grad_norm": 0.34821441769599915, + "learning_rate": 4.271587456905768e-05, + "loss": 0.1567, + "step": 18186 + }, + { + "epoch": 0.3243855456069632, + "grad_norm": 0.32106438279151917, + "learning_rate": 4.271477630146138e-05, + "loss": 0.2012, + "step": 18187 + }, + { + "epoch": 0.32440338172867694, + "grad_norm": 0.3968912661075592, + "learning_rate": 4.271367796519649e-05, + "loss": 0.1703, + "step": 18188 + }, + { + "epoch": 0.32442121785039063, + "grad_norm": 0.25592130422592163, + "learning_rate": 4.271257956026727e-05, + "loss": 0.1942, + "step": 18189 + }, + { + "epoch": 0.3244390539721043, + "grad_norm": 0.36070919036865234, + "learning_rate": 4.271148108667797e-05, + "loss": 0.1688, + "step": 18190 + }, + { + "epoch": 0.324456890093818, + "grad_norm": 0.20616500079631805, + "learning_rate": 4.271038254443286e-05, + "loss": 0.1653, + "step": 18191 + }, + { + "epoch": 0.3244747262155317, + "grad_norm": 0.2631708085536957, + "learning_rate": 4.270928393353618e-05, + "loss": 0.1728, + "step": 18192 + }, + { + "epoch": 0.3244925623372454, + "grad_norm": 0.24998736381530762, + "learning_rate": 4.2708185253992205e-05, + "loss": 0.1605, + "step": 18193 + }, + { + "epoch": 0.32451039845895907, + "grad_norm": 0.24360230565071106, + "learning_rate": 4.270708650580518e-05, + "loss": 0.1537, + "step": 18194 + }, + { + "epoch": 0.32452823458067276, + "grad_norm": 0.21705761551856995, + "learning_rate": 4.2705987688979376e-05, + "loss": 0.1402, + "step": 18195 + }, + { + "epoch": 0.32454607070238645, + "grad_norm": 0.2770020365715027, + "learning_rate": 4.270488880351905e-05, + "loss": 0.1531, + "step": 18196 + }, + { + "epoch": 0.3245639068241002, + "grad_norm": 0.2462465614080429, + "learning_rate": 4.270378984942846e-05, + "loss": 0.2068, + "step": 18197 + }, + { + "epoch": 0.3245817429458139, + "grad_norm": 0.22040067613124847, + "learning_rate": 4.270269082671187e-05, + "loss": 0.1933, + "step": 18198 + }, + { + "epoch": 0.32459957906752757, + "grad_norm": 0.2138727754354477, + "learning_rate": 4.270159173537353e-05, + "loss": 0.1719, + "step": 18199 + }, + { + "epoch": 0.32461741518924125, + "grad_norm": 0.2432202845811844, + "learning_rate": 4.2700492575417705e-05, + "loss": 0.1705, + "step": 18200 + }, + { + "epoch": 0.32463525131095494, + "grad_norm": 0.25168612599372864, + "learning_rate": 4.269939334684866e-05, + "loss": 0.1317, + "step": 18201 + }, + { + "epoch": 0.32465308743266863, + "grad_norm": 0.21846838295459747, + "learning_rate": 4.269829404967065e-05, + "loss": 0.1772, + "step": 18202 + }, + { + "epoch": 0.3246709235543823, + "grad_norm": 0.23773692548274994, + "learning_rate": 4.269719468388794e-05, + "loss": 0.1469, + "step": 18203 + }, + { + "epoch": 0.324688759676096, + "grad_norm": 0.2535833716392517, + "learning_rate": 4.2696095249504795e-05, + "loss": 0.1527, + "step": 18204 + }, + { + "epoch": 0.32470659579780975, + "grad_norm": 0.20184482634067535, + "learning_rate": 4.269499574652548e-05, + "loss": 0.1395, + "step": 18205 + }, + { + "epoch": 0.32472443191952344, + "grad_norm": 0.2776055634021759, + "learning_rate": 4.269389617495424e-05, + "loss": 0.1265, + "step": 18206 + }, + { + "epoch": 0.3247422680412371, + "grad_norm": 0.32144761085510254, + "learning_rate": 4.269279653479534e-05, + "loss": 0.1754, + "step": 18207 + }, + { + "epoch": 0.3247601041629508, + "grad_norm": 0.2513435184955597, + "learning_rate": 4.2691696826053065e-05, + "loss": 0.1671, + "step": 18208 + }, + { + "epoch": 0.3247779402846645, + "grad_norm": 0.3405788540840149, + "learning_rate": 4.269059704873165e-05, + "loss": 0.16, + "step": 18209 + }, + { + "epoch": 0.3247957764063782, + "grad_norm": 0.2149336189031601, + "learning_rate": 4.2689497202835385e-05, + "loss": 0.1479, + "step": 18210 + }, + { + "epoch": 0.3248136125280919, + "grad_norm": 0.29224058985710144, + "learning_rate": 4.2688397288368506e-05, + "loss": 0.1558, + "step": 18211 + }, + { + "epoch": 0.32483144864980557, + "grad_norm": 0.2925548553466797, + "learning_rate": 4.268729730533529e-05, + "loss": 0.2015, + "step": 18212 + }, + { + "epoch": 0.32484928477151925, + "grad_norm": 0.26221764087677, + "learning_rate": 4.2686197253740005e-05, + "loss": 0.1924, + "step": 18213 + }, + { + "epoch": 0.324867120893233, + "grad_norm": 0.24553368985652924, + "learning_rate": 4.2685097133586915e-05, + "loss": 0.1996, + "step": 18214 + }, + { + "epoch": 0.3248849570149467, + "grad_norm": 0.24499189853668213, + "learning_rate": 4.268399694488028e-05, + "loss": 0.1925, + "step": 18215 + }, + { + "epoch": 0.3249027931366604, + "grad_norm": 0.2467213124036789, + "learning_rate": 4.2682896687624355e-05, + "loss": 0.1573, + "step": 18216 + }, + { + "epoch": 0.32492062925837406, + "grad_norm": 0.4217036962509155, + "learning_rate": 4.268179636182342e-05, + "loss": 0.1552, + "step": 18217 + }, + { + "epoch": 0.32493846538008775, + "grad_norm": 0.28804704546928406, + "learning_rate": 4.268069596748174e-05, + "loss": 0.2181, + "step": 18218 + }, + { + "epoch": 0.32495630150180144, + "grad_norm": 0.268893837928772, + "learning_rate": 4.267959550460357e-05, + "loss": 0.1574, + "step": 18219 + }, + { + "epoch": 0.3249741376235151, + "grad_norm": 0.20434194803237915, + "learning_rate": 4.2678494973193184e-05, + "loss": 0.0877, + "step": 18220 + }, + { + "epoch": 0.3249919737452288, + "grad_norm": 0.22750738263130188, + "learning_rate": 4.267739437325484e-05, + "loss": 0.1663, + "step": 18221 + }, + { + "epoch": 0.32500980986694256, + "grad_norm": 0.35077619552612305, + "learning_rate": 4.2676293704792816e-05, + "loss": 0.1855, + "step": 18222 + }, + { + "epoch": 0.32502764598865624, + "grad_norm": 0.31882402300834656, + "learning_rate": 4.2675192967811374e-05, + "loss": 0.1793, + "step": 18223 + }, + { + "epoch": 0.32504548211036993, + "grad_norm": 0.21044772863388062, + "learning_rate": 4.267409216231477e-05, + "loss": 0.173, + "step": 18224 + }, + { + "epoch": 0.3250633182320836, + "grad_norm": 0.3163749873638153, + "learning_rate": 4.267299128830729e-05, + "loss": 0.1535, + "step": 18225 + }, + { + "epoch": 0.3250811543537973, + "grad_norm": 0.2738315463066101, + "learning_rate": 4.267189034579319e-05, + "loss": 0.1564, + "step": 18226 + }, + { + "epoch": 0.325098990475511, + "grad_norm": 0.29425275325775146, + "learning_rate": 4.2670789334776736e-05, + "loss": 0.1891, + "step": 18227 + }, + { + "epoch": 0.3251168265972247, + "grad_norm": 0.31764763593673706, + "learning_rate": 4.266968825526221e-05, + "loss": 0.2118, + "step": 18228 + }, + { + "epoch": 0.32513466271893837, + "grad_norm": 0.2998555600643158, + "learning_rate": 4.266858710725386e-05, + "loss": 0.1591, + "step": 18229 + }, + { + "epoch": 0.3251524988406521, + "grad_norm": 0.3281696140766144, + "learning_rate": 4.266748589075596e-05, + "loss": 0.1816, + "step": 18230 + }, + { + "epoch": 0.3251703349623658, + "grad_norm": 0.30124592781066895, + "learning_rate": 4.266638460577278e-05, + "loss": 0.1929, + "step": 18231 + }, + { + "epoch": 0.3251881710840795, + "grad_norm": 0.27557745575904846, + "learning_rate": 4.266528325230861e-05, + "loss": 0.099, + "step": 18232 + }, + { + "epoch": 0.3252060072057932, + "grad_norm": 0.2500791549682617, + "learning_rate": 4.266418183036768e-05, + "loss": 0.1781, + "step": 18233 + }, + { + "epoch": 0.32522384332750687, + "grad_norm": 0.3229110836982727, + "learning_rate": 4.2663080339954295e-05, + "loss": 0.1729, + "step": 18234 + }, + { + "epoch": 0.32524167944922056, + "grad_norm": 0.2780783772468567, + "learning_rate": 4.2661978781072695e-05, + "loss": 0.1751, + "step": 18235 + }, + { + "epoch": 0.32525951557093424, + "grad_norm": 0.3489958643913269, + "learning_rate": 4.2660877153727183e-05, + "loss": 0.2497, + "step": 18236 + }, + { + "epoch": 0.32527735169264793, + "grad_norm": 0.23013868927955627, + "learning_rate": 4.2659775457921996e-05, + "loss": 0.171, + "step": 18237 + }, + { + "epoch": 0.3252951878143616, + "grad_norm": 0.30657973885536194, + "learning_rate": 4.265867369366143e-05, + "loss": 0.1353, + "step": 18238 + }, + { + "epoch": 0.32531302393607536, + "grad_norm": 0.22024422883987427, + "learning_rate": 4.265757186094974e-05, + "loss": 0.157, + "step": 18239 + }, + { + "epoch": 0.32533086005778905, + "grad_norm": 0.39033645391464233, + "learning_rate": 4.26564699597912e-05, + "loss": 0.2621, + "step": 18240 + }, + { + "epoch": 0.32534869617950274, + "grad_norm": 0.20797429978847504, + "learning_rate": 4.2655367990190095e-05, + "loss": 0.1854, + "step": 18241 + }, + { + "epoch": 0.3253665323012164, + "grad_norm": 0.21181592345237732, + "learning_rate": 4.265426595215067e-05, + "loss": 0.1897, + "step": 18242 + }, + { + "epoch": 0.3253843684229301, + "grad_norm": 0.37151506543159485, + "learning_rate": 4.265316384567723e-05, + "loss": 0.2125, + "step": 18243 + }, + { + "epoch": 0.3254022045446438, + "grad_norm": 0.23360081017017365, + "learning_rate": 4.265206167077402e-05, + "loss": 0.1733, + "step": 18244 + }, + { + "epoch": 0.3254200406663575, + "grad_norm": 0.3226069211959839, + "learning_rate": 4.265095942744533e-05, + "loss": 0.1609, + "step": 18245 + }, + { + "epoch": 0.3254378767880712, + "grad_norm": 0.33413782715797424, + "learning_rate": 4.264985711569541e-05, + "loss": 0.2181, + "step": 18246 + }, + { + "epoch": 0.3254557129097849, + "grad_norm": 0.3707151710987091, + "learning_rate": 4.264875473552856e-05, + "loss": 0.2095, + "step": 18247 + }, + { + "epoch": 0.3254735490314986, + "grad_norm": 0.26449453830718994, + "learning_rate": 4.264765228694904e-05, + "loss": 0.2147, + "step": 18248 + }, + { + "epoch": 0.3254913851532123, + "grad_norm": 0.273408442735672, + "learning_rate": 4.264654976996112e-05, + "loss": 0.1858, + "step": 18249 + }, + { + "epoch": 0.325509221274926, + "grad_norm": 0.1965082734823227, + "learning_rate": 4.2645447184569074e-05, + "loss": 0.1376, + "step": 18250 + }, + { + "epoch": 0.3255270573966397, + "grad_norm": 0.27523037791252136, + "learning_rate": 4.264434453077719e-05, + "loss": 0.1785, + "step": 18251 + }, + { + "epoch": 0.32554489351835336, + "grad_norm": 0.19104528427124023, + "learning_rate": 4.264324180858973e-05, + "loss": 0.1452, + "step": 18252 + }, + { + "epoch": 0.32556272964006705, + "grad_norm": 0.1954038441181183, + "learning_rate": 4.264213901801097e-05, + "loss": 0.1323, + "step": 18253 + }, + { + "epoch": 0.32558056576178074, + "grad_norm": 0.34060055017471313, + "learning_rate": 4.264103615904519e-05, + "loss": 0.1926, + "step": 18254 + }, + { + "epoch": 0.3255984018834944, + "grad_norm": 0.24086469411849976, + "learning_rate": 4.263993323169665e-05, + "loss": 0.1734, + "step": 18255 + }, + { + "epoch": 0.32561623800520817, + "grad_norm": 0.2462422251701355, + "learning_rate": 4.263883023596965e-05, + "loss": 0.1457, + "step": 18256 + }, + { + "epoch": 0.32563407412692186, + "grad_norm": 0.2652982473373413, + "learning_rate": 4.2637727171868434e-05, + "loss": 0.1584, + "step": 18257 + }, + { + "epoch": 0.32565191024863555, + "grad_norm": 0.3294752240180969, + "learning_rate": 4.263662403939731e-05, + "loss": 0.1908, + "step": 18258 + }, + { + "epoch": 0.32566974637034923, + "grad_norm": 0.259242981672287, + "learning_rate": 4.2635520838560534e-05, + "loss": 0.137, + "step": 18259 + }, + { + "epoch": 0.3256875824920629, + "grad_norm": 0.3338519036769867, + "learning_rate": 4.2634417569362394e-05, + "loss": 0.1787, + "step": 18260 + }, + { + "epoch": 0.3257054186137766, + "grad_norm": 0.24972964823246002, + "learning_rate": 4.2633314231807157e-05, + "loss": 0.173, + "step": 18261 + }, + { + "epoch": 0.3257232547354903, + "grad_norm": 0.2628253996372223, + "learning_rate": 4.26322108258991e-05, + "loss": 0.173, + "step": 18262 + }, + { + "epoch": 0.325741090857204, + "grad_norm": 0.31130221486091614, + "learning_rate": 4.263110735164251e-05, + "loss": 0.1917, + "step": 18263 + }, + { + "epoch": 0.32575892697891773, + "grad_norm": 0.25815194845199585, + "learning_rate": 4.2630003809041654e-05, + "loss": 0.1476, + "step": 18264 + }, + { + "epoch": 0.3257767631006314, + "grad_norm": 0.2450726181268692, + "learning_rate": 4.2628900198100814e-05, + "loss": 0.1577, + "step": 18265 + }, + { + "epoch": 0.3257945992223451, + "grad_norm": 0.26551350951194763, + "learning_rate": 4.262779651882427e-05, + "loss": 0.1579, + "step": 18266 + }, + { + "epoch": 0.3258124353440588, + "grad_norm": 0.23024778068065643, + "learning_rate": 4.2626692771216296e-05, + "loss": 0.1336, + "step": 18267 + }, + { + "epoch": 0.3258302714657725, + "grad_norm": 0.32547813653945923, + "learning_rate": 4.262558895528117e-05, + "loss": 0.1991, + "step": 18268 + }, + { + "epoch": 0.32584810758748617, + "grad_norm": 0.20113670825958252, + "learning_rate": 4.262448507102318e-05, + "loss": 0.15, + "step": 18269 + }, + { + "epoch": 0.32586594370919986, + "grad_norm": 0.2653674781322479, + "learning_rate": 4.262338111844659e-05, + "loss": 0.218, + "step": 18270 + }, + { + "epoch": 0.32588377983091354, + "grad_norm": 0.20280010998249054, + "learning_rate": 4.26222770975557e-05, + "loss": 0.1307, + "step": 18271 + }, + { + "epoch": 0.3259016159526273, + "grad_norm": 0.2655740976333618, + "learning_rate": 4.262117300835477e-05, + "loss": 0.199, + "step": 18272 + }, + { + "epoch": 0.325919452074341, + "grad_norm": 0.25241681933403015, + "learning_rate": 4.262006885084809e-05, + "loss": 0.1918, + "step": 18273 + }, + { + "epoch": 0.32593728819605466, + "grad_norm": 0.28184425830841064, + "learning_rate": 4.261896462503994e-05, + "loss": 0.1618, + "step": 18274 + }, + { + "epoch": 0.32595512431776835, + "grad_norm": 0.2922350764274597, + "learning_rate": 4.261786033093459e-05, + "loss": 0.1428, + "step": 18275 + }, + { + "epoch": 0.32597296043948204, + "grad_norm": 0.2177983820438385, + "learning_rate": 4.261675596853633e-05, + "loss": 0.1313, + "step": 18276 + }, + { + "epoch": 0.32599079656119573, + "grad_norm": 0.26058638095855713, + "learning_rate": 4.261565153784945e-05, + "loss": 0.1764, + "step": 18277 + }, + { + "epoch": 0.3260086326829094, + "grad_norm": 0.21988406777381897, + "learning_rate": 4.261454703887821e-05, + "loss": 0.1052, + "step": 18278 + }, + { + "epoch": 0.3260264688046231, + "grad_norm": 0.2672010064125061, + "learning_rate": 4.26134424716269e-05, + "loss": 0.2196, + "step": 18279 + }, + { + "epoch": 0.3260443049263368, + "grad_norm": 0.2395225465297699, + "learning_rate": 4.261233783609981e-05, + "loss": 0.1485, + "step": 18280 + }, + { + "epoch": 0.32606214104805054, + "grad_norm": 0.2792045474052429, + "learning_rate": 4.2611233132301206e-05, + "loss": 0.1246, + "step": 18281 + }, + { + "epoch": 0.3260799771697642, + "grad_norm": 0.29213061928749084, + "learning_rate": 4.261012836023539e-05, + "loss": 0.1554, + "step": 18282 + }, + { + "epoch": 0.3260978132914779, + "grad_norm": 0.3063087463378906, + "learning_rate": 4.2609023519906635e-05, + "loss": 0.1446, + "step": 18283 + }, + { + "epoch": 0.3261156494131916, + "grad_norm": 0.2854937016963959, + "learning_rate": 4.260791861131922e-05, + "loss": 0.1295, + "step": 18284 + }, + { + "epoch": 0.3261334855349053, + "grad_norm": 0.22289389371871948, + "learning_rate": 4.2606813634477424e-05, + "loss": 0.1701, + "step": 18285 + }, + { + "epoch": 0.326151321656619, + "grad_norm": 0.42393597960472107, + "learning_rate": 4.260570858938554e-05, + "loss": 0.2563, + "step": 18286 + }, + { + "epoch": 0.32616915777833266, + "grad_norm": 0.26955538988113403, + "learning_rate": 4.2604603476047855e-05, + "loss": 0.216, + "step": 18287 + }, + { + "epoch": 0.32618699390004635, + "grad_norm": 0.3739461302757263, + "learning_rate": 4.260349829446864e-05, + "loss": 0.155, + "step": 18288 + }, + { + "epoch": 0.3262048300217601, + "grad_norm": 0.23768596351146698, + "learning_rate": 4.260239304465219e-05, + "loss": 0.1714, + "step": 18289 + }, + { + "epoch": 0.3262226661434738, + "grad_norm": 0.2925799489021301, + "learning_rate": 4.260128772660278e-05, + "loss": 0.208, + "step": 18290 + }, + { + "epoch": 0.32624050226518747, + "grad_norm": 0.35943129658699036, + "learning_rate": 4.260018234032471e-05, + "loss": 0.1596, + "step": 18291 + }, + { + "epoch": 0.32625833838690116, + "grad_norm": 0.5245606899261475, + "learning_rate": 4.259907688582224e-05, + "loss": 0.1387, + "step": 18292 + }, + { + "epoch": 0.32627617450861485, + "grad_norm": 0.22465407848358154, + "learning_rate": 4.2597971363099675e-05, + "loss": 0.1723, + "step": 18293 + }, + { + "epoch": 0.32629401063032853, + "grad_norm": 0.2072666734457016, + "learning_rate": 4.2596865772161296e-05, + "loss": 0.1574, + "step": 18294 + }, + { + "epoch": 0.3263118467520422, + "grad_norm": 0.26960331201553345, + "learning_rate": 4.2595760113011394e-05, + "loss": 0.1718, + "step": 18295 + }, + { + "epoch": 0.3263296828737559, + "grad_norm": 0.3110816776752472, + "learning_rate": 4.259465438565424e-05, + "loss": 0.1904, + "step": 18296 + }, + { + "epoch": 0.3263475189954696, + "grad_norm": 0.2772740423679352, + "learning_rate": 4.259354859009413e-05, + "loss": 0.1288, + "step": 18297 + }, + { + "epoch": 0.32636535511718334, + "grad_norm": 0.27265042066574097, + "learning_rate": 4.2592442726335344e-05, + "loss": 0.1606, + "step": 18298 + }, + { + "epoch": 0.32638319123889703, + "grad_norm": 0.19521182775497437, + "learning_rate": 4.2591336794382184e-05, + "loss": 0.1454, + "step": 18299 + }, + { + "epoch": 0.3264010273606107, + "grad_norm": 0.42005911469459534, + "learning_rate": 4.2590230794238915e-05, + "loss": 0.2162, + "step": 18300 + }, + { + "epoch": 0.3264188634823244, + "grad_norm": 0.28822359442710876, + "learning_rate": 4.258912472590985e-05, + "loss": 0.2034, + "step": 18301 + }, + { + "epoch": 0.3264366996040381, + "grad_norm": 0.21401211619377136, + "learning_rate": 4.258801858939926e-05, + "loss": 0.1937, + "step": 18302 + }, + { + "epoch": 0.3264545357257518, + "grad_norm": 0.26566237211227417, + "learning_rate": 4.258691238471143e-05, + "loss": 0.1961, + "step": 18303 + }, + { + "epoch": 0.32647237184746547, + "grad_norm": 0.22111135721206665, + "learning_rate": 4.258580611185066e-05, + "loss": 0.1534, + "step": 18304 + }, + { + "epoch": 0.32649020796917916, + "grad_norm": 0.2629345655441284, + "learning_rate": 4.2584699770821215e-05, + "loss": 0.1809, + "step": 18305 + }, + { + "epoch": 0.3265080440908929, + "grad_norm": 0.2634925842285156, + "learning_rate": 4.258359336162742e-05, + "loss": 0.1408, + "step": 18306 + }, + { + "epoch": 0.3265258802126066, + "grad_norm": 0.17384250462055206, + "learning_rate": 4.2582486884273526e-05, + "loss": 0.1372, + "step": 18307 + }, + { + "epoch": 0.3265437163343203, + "grad_norm": 0.21742819249629974, + "learning_rate": 4.258138033876385e-05, + "loss": 0.1656, + "step": 18308 + }, + { + "epoch": 0.32656155245603397, + "grad_norm": 0.2189481258392334, + "learning_rate": 4.258027372510267e-05, + "loss": 0.1468, + "step": 18309 + }, + { + "epoch": 0.32657938857774765, + "grad_norm": 0.23182353377342224, + "learning_rate": 4.257916704329428e-05, + "loss": 0.1578, + "step": 18310 + }, + { + "epoch": 0.32659722469946134, + "grad_norm": 0.1941414475440979, + "learning_rate": 4.257806029334296e-05, + "loss": 0.1141, + "step": 18311 + }, + { + "epoch": 0.32661506082117503, + "grad_norm": 0.20877231657505035, + "learning_rate": 4.257695347525301e-05, + "loss": 0.164, + "step": 18312 + }, + { + "epoch": 0.3266328969428887, + "grad_norm": 0.32865646481513977, + "learning_rate": 4.257584658902872e-05, + "loss": 0.1712, + "step": 18313 + }, + { + "epoch": 0.3266507330646024, + "grad_norm": 0.26722660660743713, + "learning_rate": 4.257473963467438e-05, + "loss": 0.1873, + "step": 18314 + }, + { + "epoch": 0.32666856918631615, + "grad_norm": 0.21177662909030914, + "learning_rate": 4.257363261219427e-05, + "loss": 0.1351, + "step": 18315 + }, + { + "epoch": 0.32668640530802984, + "grad_norm": 0.22980764508247375, + "learning_rate": 4.25725255215927e-05, + "loss": 0.1449, + "step": 18316 + }, + { + "epoch": 0.3267042414297435, + "grad_norm": 0.2627589702606201, + "learning_rate": 4.257141836287395e-05, + "loss": 0.1754, + "step": 18317 + }, + { + "epoch": 0.3267220775514572, + "grad_norm": 0.20913569629192352, + "learning_rate": 4.2570311136042305e-05, + "loss": 0.1482, + "step": 18318 + }, + { + "epoch": 0.3267399136731709, + "grad_norm": 0.24080607295036316, + "learning_rate": 4.256920384110208e-05, + "loss": 0.1406, + "step": 18319 + }, + { + "epoch": 0.3267577497948846, + "grad_norm": 0.24826756119728088, + "learning_rate": 4.256809647805754e-05, + "loss": 0.1497, + "step": 18320 + }, + { + "epoch": 0.3267755859165983, + "grad_norm": 0.19989757239818573, + "learning_rate": 4.2566989046913e-05, + "loss": 0.1378, + "step": 18321 + }, + { + "epoch": 0.32679342203831196, + "grad_norm": 0.20871931314468384, + "learning_rate": 4.256588154767273e-05, + "loss": 0.168, + "step": 18322 + }, + { + "epoch": 0.3268112581600257, + "grad_norm": 0.23451226949691772, + "learning_rate": 4.256477398034104e-05, + "loss": 0.1445, + "step": 18323 + }, + { + "epoch": 0.3268290942817394, + "grad_norm": 0.25865402817726135, + "learning_rate": 4.2563666344922225e-05, + "loss": 0.1688, + "step": 18324 + }, + { + "epoch": 0.3268469304034531, + "grad_norm": 0.2108352780342102, + "learning_rate": 4.2562558641420575e-05, + "loss": 0.17, + "step": 18325 + }, + { + "epoch": 0.32686476652516677, + "grad_norm": 0.28229469060897827, + "learning_rate": 4.256145086984038e-05, + "loss": 0.191, + "step": 18326 + }, + { + "epoch": 0.32688260264688046, + "grad_norm": 0.2260260432958603, + "learning_rate": 4.2560343030185934e-05, + "loss": 0.1174, + "step": 18327 + }, + { + "epoch": 0.32690043876859415, + "grad_norm": 0.23733361065387726, + "learning_rate": 4.255923512246153e-05, + "loss": 0.169, + "step": 18328 + }, + { + "epoch": 0.32691827489030784, + "grad_norm": 0.36725863814353943, + "learning_rate": 4.255812714667147e-05, + "loss": 0.1882, + "step": 18329 + }, + { + "epoch": 0.3269361110120215, + "grad_norm": 0.2420433759689331, + "learning_rate": 4.255701910282005e-05, + "loss": 0.1788, + "step": 18330 + }, + { + "epoch": 0.32695394713373527, + "grad_norm": 0.21113458275794983, + "learning_rate": 4.255591099091155e-05, + "loss": 0.1315, + "step": 18331 + }, + { + "epoch": 0.32697178325544896, + "grad_norm": 0.22553254663944244, + "learning_rate": 4.255480281095028e-05, + "loss": 0.1532, + "step": 18332 + }, + { + "epoch": 0.32698961937716264, + "grad_norm": 0.24502426385879517, + "learning_rate": 4.2553694562940525e-05, + "loss": 0.1809, + "step": 18333 + }, + { + "epoch": 0.32700745549887633, + "grad_norm": 0.35403308272361755, + "learning_rate": 4.2552586246886595e-05, + "loss": 0.1972, + "step": 18334 + }, + { + "epoch": 0.32702529162059, + "grad_norm": 0.2816767692565918, + "learning_rate": 4.255147786279277e-05, + "loss": 0.1925, + "step": 18335 + }, + { + "epoch": 0.3270431277423037, + "grad_norm": 0.25400564074516296, + "learning_rate": 4.2550369410663366e-05, + "loss": 0.1706, + "step": 18336 + }, + { + "epoch": 0.3270609638640174, + "grad_norm": 0.19140376150608063, + "learning_rate": 4.2549260890502664e-05, + "loss": 0.1436, + "step": 18337 + }, + { + "epoch": 0.3270787999857311, + "grad_norm": 0.30130836367607117, + "learning_rate": 4.254815230231496e-05, + "loss": 0.2071, + "step": 18338 + }, + { + "epoch": 0.32709663610744477, + "grad_norm": 0.2753887176513672, + "learning_rate": 4.254704364610456e-05, + "loss": 0.0837, + "step": 18339 + }, + { + "epoch": 0.3271144722291585, + "grad_norm": 0.252464234828949, + "learning_rate": 4.2545934921875764e-05, + "loss": 0.1421, + "step": 18340 + }, + { + "epoch": 0.3271323083508722, + "grad_norm": 0.37632113695144653, + "learning_rate": 4.2544826129632854e-05, + "loss": 0.2364, + "step": 18341 + }, + { + "epoch": 0.3271501444725859, + "grad_norm": 0.24468207359313965, + "learning_rate": 4.2543717269380144e-05, + "loss": 0.1391, + "step": 18342 + }, + { + "epoch": 0.3271679805942996, + "grad_norm": 0.2854864299297333, + "learning_rate": 4.254260834112192e-05, + "loss": 0.1829, + "step": 18343 + }, + { + "epoch": 0.32718581671601327, + "grad_norm": 0.23584985733032227, + "learning_rate": 4.25414993448625e-05, + "loss": 0.1593, + "step": 18344 + }, + { + "epoch": 0.32720365283772695, + "grad_norm": 0.3183591961860657, + "learning_rate": 4.254039028060616e-05, + "loss": 0.1819, + "step": 18345 + }, + { + "epoch": 0.32722148895944064, + "grad_norm": 0.22881759703159332, + "learning_rate": 4.253928114835721e-05, + "loss": 0.17, + "step": 18346 + }, + { + "epoch": 0.32723932508115433, + "grad_norm": 0.3099662959575653, + "learning_rate": 4.253817194811995e-05, + "loss": 0.1401, + "step": 18347 + }, + { + "epoch": 0.3272571612028681, + "grad_norm": 0.19419437646865845, + "learning_rate": 4.2537062679898675e-05, + "loss": 0.1621, + "step": 18348 + }, + { + "epoch": 0.32727499732458176, + "grad_norm": 0.2671164274215698, + "learning_rate": 4.25359533436977e-05, + "loss": 0.1673, + "step": 18349 + }, + { + "epoch": 0.32729283344629545, + "grad_norm": 0.31209197640419006, + "learning_rate": 4.25348439395213e-05, + "loss": 0.1858, + "step": 18350 + }, + { + "epoch": 0.32731066956800914, + "grad_norm": 0.21778671443462372, + "learning_rate": 4.2533734467373795e-05, + "loss": 0.1357, + "step": 18351 + }, + { + "epoch": 0.3273285056897228, + "grad_norm": 0.23879684507846832, + "learning_rate": 4.2532624927259475e-05, + "loss": 0.1764, + "step": 18352 + }, + { + "epoch": 0.3273463418114365, + "grad_norm": 0.30174243450164795, + "learning_rate": 4.253151531918265e-05, + "loss": 0.1675, + "step": 18353 + }, + { + "epoch": 0.3273641779331502, + "grad_norm": 0.24705903232097626, + "learning_rate": 4.2530405643147606e-05, + "loss": 0.1369, + "step": 18354 + }, + { + "epoch": 0.3273820140548639, + "grad_norm": 0.25176241993904114, + "learning_rate": 4.252929589915867e-05, + "loss": 0.1624, + "step": 18355 + }, + { + "epoch": 0.3273998501765776, + "grad_norm": 0.2652989327907562, + "learning_rate": 4.252818608722012e-05, + "loss": 0.1542, + "step": 18356 + }, + { + "epoch": 0.3274176862982913, + "grad_norm": 0.2860439419746399, + "learning_rate": 4.2527076207336267e-05, + "loss": 0.1492, + "step": 18357 + }, + { + "epoch": 0.327435522420005, + "grad_norm": 0.28419235348701477, + "learning_rate": 4.252596625951141e-05, + "loss": 0.1847, + "step": 18358 + }, + { + "epoch": 0.3274533585417187, + "grad_norm": 0.40242666006088257, + "learning_rate": 4.252485624374986e-05, + "loss": 0.1407, + "step": 18359 + }, + { + "epoch": 0.3274711946634324, + "grad_norm": 0.27525705099105835, + "learning_rate": 4.2523746160055915e-05, + "loss": 0.1656, + "step": 18360 + }, + { + "epoch": 0.3274890307851461, + "grad_norm": 0.287389874458313, + "learning_rate": 4.252263600843387e-05, + "loss": 0.225, + "step": 18361 + }, + { + "epoch": 0.32750686690685976, + "grad_norm": 0.30568593740463257, + "learning_rate": 4.252152578888804e-05, + "loss": 0.1625, + "step": 18362 + }, + { + "epoch": 0.32752470302857345, + "grad_norm": 0.32873180508613586, + "learning_rate": 4.252041550142273e-05, + "loss": 0.1651, + "step": 18363 + }, + { + "epoch": 0.32754253915028714, + "grad_norm": 0.2423868477344513, + "learning_rate": 4.2519305146042234e-05, + "loss": 0.189, + "step": 18364 + }, + { + "epoch": 0.3275603752720009, + "grad_norm": 0.2557133436203003, + "learning_rate": 4.251819472275086e-05, + "loss": 0.1258, + "step": 18365 + }, + { + "epoch": 0.32757821139371457, + "grad_norm": 0.32561326026916504, + "learning_rate": 4.2517084231552905e-05, + "loss": 0.1876, + "step": 18366 + }, + { + "epoch": 0.32759604751542826, + "grad_norm": 0.3135305941104889, + "learning_rate": 4.251597367245269e-05, + "loss": 0.1807, + "step": 18367 + }, + { + "epoch": 0.32761388363714194, + "grad_norm": 0.240975022315979, + "learning_rate": 4.251486304545451e-05, + "loss": 0.1449, + "step": 18368 + }, + { + "epoch": 0.32763171975885563, + "grad_norm": 0.22965987026691437, + "learning_rate": 4.251375235056267e-05, + "loss": 0.1196, + "step": 18369 + }, + { + "epoch": 0.3276495558805693, + "grad_norm": 0.26841914653778076, + "learning_rate": 4.251264158778148e-05, + "loss": 0.1807, + "step": 18370 + }, + { + "epoch": 0.327667392002283, + "grad_norm": 0.23701927065849304, + "learning_rate": 4.2511530757115246e-05, + "loss": 0.1543, + "step": 18371 + }, + { + "epoch": 0.3276852281239967, + "grad_norm": 0.25472572445869446, + "learning_rate": 4.251041985856826e-05, + "loss": 0.116, + "step": 18372 + }, + { + "epoch": 0.32770306424571044, + "grad_norm": 0.2580103576183319, + "learning_rate": 4.250930889214484e-05, + "loss": 0.1326, + "step": 18373 + }, + { + "epoch": 0.32772090036742413, + "grad_norm": 0.24860207736492157, + "learning_rate": 4.25081978578493e-05, + "loss": 0.197, + "step": 18374 + }, + { + "epoch": 0.3277387364891378, + "grad_norm": 0.3765740394592285, + "learning_rate": 4.250708675568593e-05, + "loss": 0.2144, + "step": 18375 + }, + { + "epoch": 0.3277565726108515, + "grad_norm": 0.28787291049957275, + "learning_rate": 4.2505975585659045e-05, + "loss": 0.1522, + "step": 18376 + }, + { + "epoch": 0.3277744087325652, + "grad_norm": 0.291194349527359, + "learning_rate": 4.250486434777296e-05, + "loss": 0.1529, + "step": 18377 + }, + { + "epoch": 0.3277922448542789, + "grad_norm": 0.239347442984581, + "learning_rate": 4.2503753042031966e-05, + "loss": 0.1521, + "step": 18378 + }, + { + "epoch": 0.32781008097599257, + "grad_norm": 0.20403681695461273, + "learning_rate": 4.250264166844039e-05, + "loss": 0.144, + "step": 18379 + }, + { + "epoch": 0.32782791709770626, + "grad_norm": 0.36052584648132324, + "learning_rate": 4.2501530227002514e-05, + "loss": 0.1433, + "step": 18380 + }, + { + "epoch": 0.32784575321941994, + "grad_norm": 0.33250558376312256, + "learning_rate": 4.250041871772268e-05, + "loss": 0.1756, + "step": 18381 + }, + { + "epoch": 0.3278635893411337, + "grad_norm": 0.2789793312549591, + "learning_rate": 4.249930714060517e-05, + "loss": 0.1847, + "step": 18382 + }, + { + "epoch": 0.3278814254628474, + "grad_norm": 0.24241597950458527, + "learning_rate": 4.24981954956543e-05, + "loss": 0.1454, + "step": 18383 + }, + { + "epoch": 0.32789926158456106, + "grad_norm": 0.29094740748405457, + "learning_rate": 4.249708378287438e-05, + "loss": 0.1645, + "step": 18384 + }, + { + "epoch": 0.32791709770627475, + "grad_norm": 0.31322869658470154, + "learning_rate": 4.249597200226972e-05, + "loss": 0.1764, + "step": 18385 + }, + { + "epoch": 0.32793493382798844, + "grad_norm": 0.3057456910610199, + "learning_rate": 4.249486015384463e-05, + "loss": 0.1757, + "step": 18386 + }, + { + "epoch": 0.3279527699497021, + "grad_norm": 0.24807824194431305, + "learning_rate": 4.249374823760343e-05, + "loss": 0.1491, + "step": 18387 + }, + { + "epoch": 0.3279706060714158, + "grad_norm": 0.3567791283130646, + "learning_rate": 4.249263625355041e-05, + "loss": 0.2344, + "step": 18388 + }, + { + "epoch": 0.3279884421931295, + "grad_norm": 0.2898887097835541, + "learning_rate": 4.249152420168988e-05, + "loss": 0.1698, + "step": 18389 + }, + { + "epoch": 0.32800627831484325, + "grad_norm": 0.2851754128932953, + "learning_rate": 4.249041208202618e-05, + "loss": 0.1549, + "step": 18390 + }, + { + "epoch": 0.32802411443655694, + "grad_norm": 0.3616372048854828, + "learning_rate": 4.248929989456359e-05, + "loss": 0.198, + "step": 18391 + }, + { + "epoch": 0.3280419505582706, + "grad_norm": 0.4312104284763336, + "learning_rate": 4.248818763930644e-05, + "loss": 0.1711, + "step": 18392 + }, + { + "epoch": 0.3280597866799843, + "grad_norm": 0.30319151282310486, + "learning_rate": 4.248707531625903e-05, + "loss": 0.157, + "step": 18393 + }, + { + "epoch": 0.328077622801698, + "grad_norm": 0.27680492401123047, + "learning_rate": 4.248596292542567e-05, + "loss": 0.1607, + "step": 18394 + }, + { + "epoch": 0.3280954589234117, + "grad_norm": 0.25055864453315735, + "learning_rate": 4.2484850466810686e-05, + "loss": 0.1369, + "step": 18395 + }, + { + "epoch": 0.3281132950451254, + "grad_norm": 0.2178165316581726, + "learning_rate": 4.2483737940418386e-05, + "loss": 0.18, + "step": 18396 + }, + { + "epoch": 0.32813113116683906, + "grad_norm": 0.24201984703540802, + "learning_rate": 4.2482625346253076e-05, + "loss": 0.1453, + "step": 18397 + }, + { + "epoch": 0.32814896728855275, + "grad_norm": 0.23836806416511536, + "learning_rate": 4.2481512684319066e-05, + "loss": 0.1803, + "step": 18398 + }, + { + "epoch": 0.3281668034102665, + "grad_norm": 0.3131429851055145, + "learning_rate": 4.248039995462068e-05, + "loss": 0.1721, + "step": 18399 + }, + { + "epoch": 0.3281846395319802, + "grad_norm": 0.21951258182525635, + "learning_rate": 4.247928715716223e-05, + "loss": 0.1619, + "step": 18400 + }, + { + "epoch": 0.32820247565369387, + "grad_norm": 0.29232263565063477, + "learning_rate": 4.2478174291948016e-05, + "loss": 0.182, + "step": 18401 + }, + { + "epoch": 0.32822031177540756, + "grad_norm": 0.3089362680912018, + "learning_rate": 4.2477061358982375e-05, + "loss": 0.2145, + "step": 18402 + }, + { + "epoch": 0.32823814789712125, + "grad_norm": 0.46059197187423706, + "learning_rate": 4.247594835826959e-05, + "loss": 0.1601, + "step": 18403 + }, + { + "epoch": 0.32825598401883493, + "grad_norm": 0.29953014850616455, + "learning_rate": 4.247483528981401e-05, + "loss": 0.1692, + "step": 18404 + }, + { + "epoch": 0.3282738201405486, + "grad_norm": 0.27145713567733765, + "learning_rate": 4.247372215361992e-05, + "loss": 0.1869, + "step": 18405 + }, + { + "epoch": 0.3282916562622623, + "grad_norm": 0.38528114557266235, + "learning_rate": 4.247260894969166e-05, + "loss": 0.1693, + "step": 18406 + }, + { + "epoch": 0.32830949238397605, + "grad_norm": 0.2525404095649719, + "learning_rate": 4.2471495678033524e-05, + "loss": 0.1349, + "step": 18407 + }, + { + "epoch": 0.32832732850568974, + "grad_norm": 0.27676740288734436, + "learning_rate": 4.247038233864984e-05, + "loss": 0.1418, + "step": 18408 + }, + { + "epoch": 0.32834516462740343, + "grad_norm": 0.2454938292503357, + "learning_rate": 4.246926893154492e-05, + "loss": 0.16, + "step": 18409 + }, + { + "epoch": 0.3283630007491171, + "grad_norm": 0.2871152460575104, + "learning_rate": 4.246815545672308e-05, + "loss": 0.177, + "step": 18410 + }, + { + "epoch": 0.3283808368708308, + "grad_norm": 0.29965710639953613, + "learning_rate": 4.246704191418863e-05, + "loss": 0.1702, + "step": 18411 + }, + { + "epoch": 0.3283986729925445, + "grad_norm": 0.225668266415596, + "learning_rate": 4.24659283039459e-05, + "loss": 0.1379, + "step": 18412 + }, + { + "epoch": 0.3284165091142582, + "grad_norm": 0.30417150259017944, + "learning_rate": 4.24648146259992e-05, + "loss": 0.1898, + "step": 18413 + }, + { + "epoch": 0.32843434523597187, + "grad_norm": 0.2466631531715393, + "learning_rate": 4.246370088035284e-05, + "loss": 0.1836, + "step": 18414 + }, + { + "epoch": 0.32845218135768556, + "grad_norm": 0.21825666725635529, + "learning_rate": 4.246258706701114e-05, + "loss": 0.1564, + "step": 18415 + }, + { + "epoch": 0.3284700174793993, + "grad_norm": 0.2429237961769104, + "learning_rate": 4.246147318597844e-05, + "loss": 0.176, + "step": 18416 + }, + { + "epoch": 0.328487853601113, + "grad_norm": 0.3928097188472748, + "learning_rate": 4.2460359237259016e-05, + "loss": 0.207, + "step": 18417 + }, + { + "epoch": 0.3285056897228267, + "grad_norm": 0.32049092650413513, + "learning_rate": 4.2459245220857225e-05, + "loss": 0.2578, + "step": 18418 + }, + { + "epoch": 0.32852352584454036, + "grad_norm": 0.2793530225753784, + "learning_rate": 4.245813113677736e-05, + "loss": 0.1993, + "step": 18419 + }, + { + "epoch": 0.32854136196625405, + "grad_norm": 0.2271171659231186, + "learning_rate": 4.2457016985023756e-05, + "loss": 0.1693, + "step": 18420 + }, + { + "epoch": 0.32855919808796774, + "grad_norm": 0.2803175151348114, + "learning_rate": 4.2455902765600724e-05, + "loss": 0.2041, + "step": 18421 + }, + { + "epoch": 0.32857703420968143, + "grad_norm": 0.29815348982810974, + "learning_rate": 4.245478847851258e-05, + "loss": 0.2603, + "step": 18422 + }, + { + "epoch": 0.3285948703313951, + "grad_norm": 0.27385538816452026, + "learning_rate": 4.2453674123763655e-05, + "loss": 0.1344, + "step": 18423 + }, + { + "epoch": 0.32861270645310886, + "grad_norm": 0.2498927265405655, + "learning_rate": 4.245255970135825e-05, + "loss": 0.1503, + "step": 18424 + }, + { + "epoch": 0.32863054257482255, + "grad_norm": 0.2583667039871216, + "learning_rate": 4.24514452113007e-05, + "loss": 0.1918, + "step": 18425 + }, + { + "epoch": 0.32864837869653624, + "grad_norm": 0.2544820308685303, + "learning_rate": 4.245033065359532e-05, + "loss": 0.1437, + "step": 18426 + }, + { + "epoch": 0.3286662148182499, + "grad_norm": 0.24424763023853302, + "learning_rate": 4.244921602824643e-05, + "loss": 0.1826, + "step": 18427 + }, + { + "epoch": 0.3286840509399636, + "grad_norm": 0.335256963968277, + "learning_rate": 4.244810133525836e-05, + "loss": 0.1749, + "step": 18428 + }, + { + "epoch": 0.3287018870616773, + "grad_norm": 0.2927291989326477, + "learning_rate": 4.2446986574635415e-05, + "loss": 0.1631, + "step": 18429 + }, + { + "epoch": 0.328719723183391, + "grad_norm": 0.37019386887550354, + "learning_rate": 4.2445871746381927e-05, + "loss": 0.1941, + "step": 18430 + }, + { + "epoch": 0.3287375593051047, + "grad_norm": 0.3443300724029541, + "learning_rate": 4.244475685050221e-05, + "loss": 0.1655, + "step": 18431 + }, + { + "epoch": 0.3287553954268184, + "grad_norm": 0.3643217384815216, + "learning_rate": 4.24436418870006e-05, + "loss": 0.1953, + "step": 18432 + }, + { + "epoch": 0.3287732315485321, + "grad_norm": 0.24501550197601318, + "learning_rate": 4.24425268558814e-05, + "loss": 0.1802, + "step": 18433 + }, + { + "epoch": 0.3287910676702458, + "grad_norm": 0.292643666267395, + "learning_rate": 4.244141175714894e-05, + "loss": 0.1916, + "step": 18434 + }, + { + "epoch": 0.3288089037919595, + "grad_norm": 0.2928709089756012, + "learning_rate": 4.244029659080755e-05, + "loss": 0.1525, + "step": 18435 + }, + { + "epoch": 0.32882673991367317, + "grad_norm": 0.26694509387016296, + "learning_rate": 4.243918135686155e-05, + "loss": 0.1618, + "step": 18436 + }, + { + "epoch": 0.32884457603538686, + "grad_norm": 0.239248588681221, + "learning_rate": 4.243806605531525e-05, + "loss": 0.1874, + "step": 18437 + }, + { + "epoch": 0.32886241215710055, + "grad_norm": 0.23508448898792267, + "learning_rate": 4.243695068617299e-05, + "loss": 0.1949, + "step": 18438 + }, + { + "epoch": 0.32888024827881424, + "grad_norm": 0.25928160548210144, + "learning_rate": 4.243583524943908e-05, + "loss": 0.1392, + "step": 18439 + }, + { + "epoch": 0.3288980844005279, + "grad_norm": 0.39715102314949036, + "learning_rate": 4.243471974511786e-05, + "loss": 0.1657, + "step": 18440 + }, + { + "epoch": 0.32891592052224167, + "grad_norm": 0.2791080176830292, + "learning_rate": 4.2433604173213634e-05, + "loss": 0.2072, + "step": 18441 + }, + { + "epoch": 0.32893375664395536, + "grad_norm": 0.20456264913082123, + "learning_rate": 4.243248853373075e-05, + "loss": 0.1172, + "step": 18442 + }, + { + "epoch": 0.32895159276566904, + "grad_norm": 0.24758672714233398, + "learning_rate": 4.243137282667351e-05, + "loss": 0.1514, + "step": 18443 + }, + { + "epoch": 0.32896942888738273, + "grad_norm": 0.3261067569255829, + "learning_rate": 4.243025705204625e-05, + "loss": 0.1031, + "step": 18444 + }, + { + "epoch": 0.3289872650090964, + "grad_norm": 0.2676059901714325, + "learning_rate": 4.2429141209853296e-05, + "loss": 0.1389, + "step": 18445 + }, + { + "epoch": 0.3290051011308101, + "grad_norm": 0.28574347496032715, + "learning_rate": 4.2428025300098965e-05, + "loss": 0.2363, + "step": 18446 + }, + { + "epoch": 0.3290229372525238, + "grad_norm": 0.3670462369918823, + "learning_rate": 4.242690932278759e-05, + "loss": 0.1963, + "step": 18447 + }, + { + "epoch": 0.3290407733742375, + "grad_norm": 0.26557764410972595, + "learning_rate": 4.24257932779235e-05, + "loss": 0.1884, + "step": 18448 + }, + { + "epoch": 0.3290586094959512, + "grad_norm": 0.17650456726551056, + "learning_rate": 4.2424677165511015e-05, + "loss": 0.143, + "step": 18449 + }, + { + "epoch": 0.3290764456176649, + "grad_norm": 0.3135792016983032, + "learning_rate": 4.242356098555446e-05, + "loss": 0.1892, + "step": 18450 + }, + { + "epoch": 0.3290942817393786, + "grad_norm": 0.2925012409687042, + "learning_rate": 4.242244473805816e-05, + "loss": 0.1663, + "step": 18451 + }, + { + "epoch": 0.3291121178610923, + "grad_norm": 0.3600432872772217, + "learning_rate": 4.2421328423026465e-05, + "loss": 0.2134, + "step": 18452 + }, + { + "epoch": 0.329129953982806, + "grad_norm": 0.23821434378623962, + "learning_rate": 4.242021204046367e-05, + "loss": 0.1479, + "step": 18453 + }, + { + "epoch": 0.32914779010451967, + "grad_norm": 0.2633552551269531, + "learning_rate": 4.241909559037411e-05, + "loss": 0.2214, + "step": 18454 + }, + { + "epoch": 0.32916562622623335, + "grad_norm": 0.3139183521270752, + "learning_rate": 4.241797907276214e-05, + "loss": 0.1884, + "step": 18455 + }, + { + "epoch": 0.32918346234794704, + "grad_norm": 0.2700309455394745, + "learning_rate": 4.241686248763205e-05, + "loss": 0.209, + "step": 18456 + }, + { + "epoch": 0.32920129846966073, + "grad_norm": 0.3206835091114044, + "learning_rate": 4.241574583498819e-05, + "loss": 0.1928, + "step": 18457 + }, + { + "epoch": 0.3292191345913745, + "grad_norm": 0.26301684975624084, + "learning_rate": 4.2414629114834884e-05, + "loss": 0.1378, + "step": 18458 + }, + { + "epoch": 0.32923697071308816, + "grad_norm": 0.2872345447540283, + "learning_rate": 4.241351232717647e-05, + "loss": 0.1627, + "step": 18459 + }, + { + "epoch": 0.32925480683480185, + "grad_norm": 0.40082958340644836, + "learning_rate": 4.241239547201725e-05, + "loss": 0.1309, + "step": 18460 + }, + { + "epoch": 0.32927264295651554, + "grad_norm": 0.32533612847328186, + "learning_rate": 4.241127854936158e-05, + "loss": 0.1704, + "step": 18461 + }, + { + "epoch": 0.3292904790782292, + "grad_norm": 0.29047784209251404, + "learning_rate": 4.241016155921378e-05, + "loss": 0.2423, + "step": 18462 + }, + { + "epoch": 0.3293083151999429, + "grad_norm": 0.28228312730789185, + "learning_rate": 4.240904450157818e-05, + "loss": 0.1639, + "step": 18463 + }, + { + "epoch": 0.3293261513216566, + "grad_norm": 0.323364794254303, + "learning_rate": 4.240792737645911e-05, + "loss": 0.1957, + "step": 18464 + }, + { + "epoch": 0.3293439874433703, + "grad_norm": 0.2613949775695801, + "learning_rate": 4.2406810183860904e-05, + "loss": 0.1732, + "step": 18465 + }, + { + "epoch": 0.32936182356508403, + "grad_norm": 0.2527284324169159, + "learning_rate": 4.2405692923787886e-05, + "loss": 0.1751, + "step": 18466 + }, + { + "epoch": 0.3293796596867977, + "grad_norm": 0.22423000633716583, + "learning_rate": 4.24045755962444e-05, + "loss": 0.1856, + "step": 18467 + }, + { + "epoch": 0.3293974958085114, + "grad_norm": 0.23415596783161163, + "learning_rate": 4.240345820123476e-05, + "loss": 0.1824, + "step": 18468 + }, + { + "epoch": 0.3294153319302251, + "grad_norm": 0.23365390300750732, + "learning_rate": 4.24023407387633e-05, + "loss": 0.1735, + "step": 18469 + }, + { + "epoch": 0.3294331680519388, + "grad_norm": 0.4614911675453186, + "learning_rate": 4.240122320883436e-05, + "loss": 0.1293, + "step": 18470 + }, + { + "epoch": 0.3294510041736525, + "grad_norm": 0.2389400750398636, + "learning_rate": 4.2400105611452276e-05, + "loss": 0.1366, + "step": 18471 + }, + { + "epoch": 0.32946884029536616, + "grad_norm": 0.26551803946495056, + "learning_rate": 4.239898794662137e-05, + "loss": 0.1606, + "step": 18472 + }, + { + "epoch": 0.32948667641707985, + "grad_norm": 0.21386370062828064, + "learning_rate": 4.239787021434597e-05, + "loss": 0.1431, + "step": 18473 + }, + { + "epoch": 0.3295045125387936, + "grad_norm": 0.3108559548854828, + "learning_rate": 4.239675241463042e-05, + "loss": 0.186, + "step": 18474 + }, + { + "epoch": 0.3295223486605073, + "grad_norm": 0.3738678991794586, + "learning_rate": 4.239563454747906e-05, + "loss": 0.1575, + "step": 18475 + }, + { + "epoch": 0.32954018478222097, + "grad_norm": 0.2810070514678955, + "learning_rate": 4.2394516612896194e-05, + "loss": 0.1711, + "step": 18476 + }, + { + "epoch": 0.32955802090393466, + "grad_norm": 0.30859261751174927, + "learning_rate": 4.239339861088618e-05, + "loss": 0.1779, + "step": 18477 + }, + { + "epoch": 0.32957585702564834, + "grad_norm": 0.4246044158935547, + "learning_rate": 4.239228054145335e-05, + "loss": 0.185, + "step": 18478 + }, + { + "epoch": 0.32959369314736203, + "grad_norm": 0.2715553939342499, + "learning_rate": 4.2391162404602036e-05, + "loss": 0.1603, + "step": 18479 + }, + { + "epoch": 0.3296115292690757, + "grad_norm": 0.34074312448501587, + "learning_rate": 4.239004420033656e-05, + "loss": 0.156, + "step": 18480 + }, + { + "epoch": 0.3296293653907894, + "grad_norm": 0.2593488395214081, + "learning_rate": 4.2388925928661274e-05, + "loss": 0.1727, + "step": 18481 + }, + { + "epoch": 0.3296472015125031, + "grad_norm": 0.4301450252532959, + "learning_rate": 4.2387807589580495e-05, + "loss": 0.2347, + "step": 18482 + }, + { + "epoch": 0.32966503763421684, + "grad_norm": 0.27101439237594604, + "learning_rate": 4.238668918309858e-05, + "loss": 0.1846, + "step": 18483 + }, + { + "epoch": 0.32968287375593053, + "grad_norm": 0.2908579111099243, + "learning_rate": 4.238557070921985e-05, + "loss": 0.1393, + "step": 18484 + }, + { + "epoch": 0.3297007098776442, + "grad_norm": 0.3351699113845825, + "learning_rate": 4.238445216794864e-05, + "loss": 0.1807, + "step": 18485 + }, + { + "epoch": 0.3297185459993579, + "grad_norm": 0.2559354901313782, + "learning_rate": 4.238333355928929e-05, + "loss": 0.1361, + "step": 18486 + }, + { + "epoch": 0.3297363821210716, + "grad_norm": 0.24197784066200256, + "learning_rate": 4.2382214883246134e-05, + "loss": 0.1406, + "step": 18487 + }, + { + "epoch": 0.3297542182427853, + "grad_norm": 0.2392469346523285, + "learning_rate": 4.238109613982352e-05, + "loss": 0.1364, + "step": 18488 + }, + { + "epoch": 0.32977205436449897, + "grad_norm": 0.4118051826953888, + "learning_rate": 4.2379977329025755e-05, + "loss": 0.1935, + "step": 18489 + }, + { + "epoch": 0.32978989048621266, + "grad_norm": 0.2652823030948639, + "learning_rate": 4.2378858450857207e-05, + "loss": 0.174, + "step": 18490 + }, + { + "epoch": 0.3298077266079264, + "grad_norm": 0.27608683705329895, + "learning_rate": 4.23777395053222e-05, + "loss": 0.1676, + "step": 18491 + }, + { + "epoch": 0.3298255627296401, + "grad_norm": 0.22772575914859772, + "learning_rate": 4.2376620492425075e-05, + "loss": 0.1457, + "step": 18492 + }, + { + "epoch": 0.3298433988513538, + "grad_norm": 0.3159489035606384, + "learning_rate": 4.237550141217016e-05, + "loss": 0.154, + "step": 18493 + }, + { + "epoch": 0.32986123497306746, + "grad_norm": 0.27029287815093994, + "learning_rate": 4.2374382264561806e-05, + "loss": 0.1786, + "step": 18494 + }, + { + "epoch": 0.32987907109478115, + "grad_norm": 0.21946412324905396, + "learning_rate": 4.237326304960434e-05, + "loss": 0.1937, + "step": 18495 + }, + { + "epoch": 0.32989690721649484, + "grad_norm": 0.23954299092292786, + "learning_rate": 4.2372143767302113e-05, + "loss": 0.1345, + "step": 18496 + }, + { + "epoch": 0.3299147433382085, + "grad_norm": 0.19765381515026093, + "learning_rate": 4.2371024417659455e-05, + "loss": 0.1778, + "step": 18497 + }, + { + "epoch": 0.3299325794599222, + "grad_norm": 0.2995823323726654, + "learning_rate": 4.23699050006807e-05, + "loss": 0.191, + "step": 18498 + }, + { + "epoch": 0.3299504155816359, + "grad_norm": 0.30685746669769287, + "learning_rate": 4.23687855163702e-05, + "loss": 0.1709, + "step": 18499 + }, + { + "epoch": 0.32996825170334965, + "grad_norm": 0.26829075813293457, + "learning_rate": 4.236766596473229e-05, + "loss": 0.2112, + "step": 18500 + }, + { + "epoch": 0.32998608782506333, + "grad_norm": 0.3012070059776306, + "learning_rate": 4.2366546345771305e-05, + "loss": 0.1842, + "step": 18501 + }, + { + "epoch": 0.330003923946777, + "grad_norm": 0.5140218734741211, + "learning_rate": 4.236542665949158e-05, + "loss": 0.243, + "step": 18502 + }, + { + "epoch": 0.3300217600684907, + "grad_norm": 0.27802202105522156, + "learning_rate": 4.2364306905897475e-05, + "loss": 0.1633, + "step": 18503 + }, + { + "epoch": 0.3300395961902044, + "grad_norm": 0.23743318021297455, + "learning_rate": 4.236318708499332e-05, + "loss": 0.1878, + "step": 18504 + }, + { + "epoch": 0.3300574323119181, + "grad_norm": 0.2713869512081146, + "learning_rate": 4.236206719678345e-05, + "loss": 0.166, + "step": 18505 + }, + { + "epoch": 0.3300752684336318, + "grad_norm": 0.29285669326782227, + "learning_rate": 4.236094724127221e-05, + "loss": 0.2136, + "step": 18506 + }, + { + "epoch": 0.33009310455534546, + "grad_norm": 0.3492244482040405, + "learning_rate": 4.235982721846394e-05, + "loss": 0.1721, + "step": 18507 + }, + { + "epoch": 0.3301109406770592, + "grad_norm": 0.23571038246154785, + "learning_rate": 4.235870712836299e-05, + "loss": 0.1601, + "step": 18508 + }, + { + "epoch": 0.3301287767987729, + "grad_norm": 0.1872876137495041, + "learning_rate": 4.235758697097369e-05, + "loss": 0.1336, + "step": 18509 + }, + { + "epoch": 0.3301466129204866, + "grad_norm": 0.2855657637119293, + "learning_rate": 4.2356466746300395e-05, + "loss": 0.1812, + "step": 18510 + }, + { + "epoch": 0.33016444904220027, + "grad_norm": 0.2859123647212982, + "learning_rate": 4.235534645434743e-05, + "loss": 0.1967, + "step": 18511 + }, + { + "epoch": 0.33018228516391396, + "grad_norm": 0.2978751063346863, + "learning_rate": 4.235422609511916e-05, + "loss": 0.1979, + "step": 18512 + }, + { + "epoch": 0.33020012128562765, + "grad_norm": 0.2722541391849518, + "learning_rate": 4.235310566861991e-05, + "loss": 0.1795, + "step": 18513 + }, + { + "epoch": 0.33021795740734133, + "grad_norm": 0.24375705420970917, + "learning_rate": 4.2351985174854024e-05, + "loss": 0.1601, + "step": 18514 + }, + { + "epoch": 0.330235793529055, + "grad_norm": 0.2665567696094513, + "learning_rate": 4.235086461382586e-05, + "loss": 0.1969, + "step": 18515 + }, + { + "epoch": 0.3302536296507687, + "grad_norm": 0.24968458712100983, + "learning_rate": 4.2349743985539744e-05, + "loss": 0.1772, + "step": 18516 + }, + { + "epoch": 0.33027146577248245, + "grad_norm": 0.2429434359073639, + "learning_rate": 4.234862329000003e-05, + "loss": 0.147, + "step": 18517 + }, + { + "epoch": 0.33028930189419614, + "grad_norm": 0.32692569494247437, + "learning_rate": 4.2347502527211066e-05, + "loss": 0.2007, + "step": 18518 + }, + { + "epoch": 0.33030713801590983, + "grad_norm": 0.20782728493213654, + "learning_rate": 4.2346381697177186e-05, + "loss": 0.1583, + "step": 18519 + }, + { + "epoch": 0.3303249741376235, + "grad_norm": 0.4226702153682709, + "learning_rate": 4.234526079990273e-05, + "loss": 0.2149, + "step": 18520 + }, + { + "epoch": 0.3303428102593372, + "grad_norm": 0.21528585255146027, + "learning_rate": 4.2344139835392065e-05, + "loss": 0.1671, + "step": 18521 + }, + { + "epoch": 0.3303606463810509, + "grad_norm": 0.3536587357521057, + "learning_rate": 4.234301880364952e-05, + "loss": 0.2172, + "step": 18522 + }, + { + "epoch": 0.3303784825027646, + "grad_norm": 0.2704784870147705, + "learning_rate": 4.2341897704679445e-05, + "loss": 0.1313, + "step": 18523 + }, + { + "epoch": 0.33039631862447827, + "grad_norm": 0.231796532869339, + "learning_rate": 4.234077653848618e-05, + "loss": 0.171, + "step": 18524 + }, + { + "epoch": 0.330414154746192, + "grad_norm": 0.22569707036018372, + "learning_rate": 4.2339655305074075e-05, + "loss": 0.2, + "step": 18525 + }, + { + "epoch": 0.3304319908679057, + "grad_norm": 0.3265334963798523, + "learning_rate": 4.2338534004447486e-05, + "loss": 0.1638, + "step": 18526 + }, + { + "epoch": 0.3304498269896194, + "grad_norm": 0.390085369348526, + "learning_rate": 4.233741263661075e-05, + "loss": 0.195, + "step": 18527 + }, + { + "epoch": 0.3304676631113331, + "grad_norm": 0.31128254532814026, + "learning_rate": 4.23362912015682e-05, + "loss": 0.1843, + "step": 18528 + }, + { + "epoch": 0.33048549923304676, + "grad_norm": 0.4441210627555847, + "learning_rate": 4.233516969932422e-05, + "loss": 0.2224, + "step": 18529 + }, + { + "epoch": 0.33050333535476045, + "grad_norm": 0.30675169825553894, + "learning_rate": 4.233404812988312e-05, + "loss": 0.1153, + "step": 18530 + }, + { + "epoch": 0.33052117147647414, + "grad_norm": 0.2166266143321991, + "learning_rate": 4.233292649324926e-05, + "loss": 0.1663, + "step": 18531 + }, + { + "epoch": 0.33053900759818783, + "grad_norm": 0.3506048619747162, + "learning_rate": 4.2331804789427e-05, + "loss": 0.2411, + "step": 18532 + }, + { + "epoch": 0.33055684371990157, + "grad_norm": 0.3138313591480255, + "learning_rate": 4.233068301842067e-05, + "loss": 0.2076, + "step": 18533 + }, + { + "epoch": 0.33057467984161526, + "grad_norm": 0.2720378339290619, + "learning_rate": 4.2329561180234634e-05, + "loss": 0.1463, + "step": 18534 + }, + { + "epoch": 0.33059251596332895, + "grad_norm": 0.3374897837638855, + "learning_rate": 4.232843927487323e-05, + "loss": 0.2054, + "step": 18535 + }, + { + "epoch": 0.33061035208504264, + "grad_norm": 0.24583639204502106, + "learning_rate": 4.2327317302340804e-05, + "loss": 0.163, + "step": 18536 + }, + { + "epoch": 0.3306281882067563, + "grad_norm": 0.35280197858810425, + "learning_rate": 4.232619526264172e-05, + "loss": 0.2484, + "step": 18537 + }, + { + "epoch": 0.33064602432847, + "grad_norm": 0.2738872468471527, + "learning_rate": 4.2325073155780315e-05, + "loss": 0.1718, + "step": 18538 + }, + { + "epoch": 0.3306638604501837, + "grad_norm": 0.35030362010002136, + "learning_rate": 4.2323950981760944e-05, + "loss": 0.1766, + "step": 18539 + }, + { + "epoch": 0.3306816965718974, + "grad_norm": 0.2816300392150879, + "learning_rate": 4.232282874058796e-05, + "loss": 0.2027, + "step": 18540 + }, + { + "epoch": 0.3306995326936111, + "grad_norm": 0.2774301767349243, + "learning_rate": 4.232170643226571e-05, + "loss": 0.1856, + "step": 18541 + }, + { + "epoch": 0.3307173688153248, + "grad_norm": 0.2686683237552643, + "learning_rate": 4.232058405679853e-05, + "loss": 0.2048, + "step": 18542 + }, + { + "epoch": 0.3307352049370385, + "grad_norm": 0.2277437299489975, + "learning_rate": 4.2319461614190793e-05, + "loss": 0.1058, + "step": 18543 + }, + { + "epoch": 0.3307530410587522, + "grad_norm": 0.21263213455677032, + "learning_rate": 4.2318339104446844e-05, + "loss": 0.1802, + "step": 18544 + }, + { + "epoch": 0.3307708771804659, + "grad_norm": 0.24947726726531982, + "learning_rate": 4.231721652757102e-05, + "loss": 0.1625, + "step": 18545 + }, + { + "epoch": 0.33078871330217957, + "grad_norm": 0.32679250836372375, + "learning_rate": 4.2316093883567695e-05, + "loss": 0.17, + "step": 18546 + }, + { + "epoch": 0.33080654942389326, + "grad_norm": 0.2497401237487793, + "learning_rate": 4.2314971172441195e-05, + "loss": 0.1582, + "step": 18547 + }, + { + "epoch": 0.33082438554560695, + "grad_norm": 0.35179224610328674, + "learning_rate": 4.23138483941959e-05, + "loss": 0.1829, + "step": 18548 + }, + { + "epoch": 0.33084222166732064, + "grad_norm": 0.2542244791984558, + "learning_rate": 4.2312725548836144e-05, + "loss": 0.1622, + "step": 18549 + }, + { + "epoch": 0.3308600577890344, + "grad_norm": 0.3894548714160919, + "learning_rate": 4.231160263636629e-05, + "loss": 0.1783, + "step": 18550 + }, + { + "epoch": 0.33087789391074807, + "grad_norm": 0.30261433124542236, + "learning_rate": 4.231047965679067e-05, + "loss": 0.1755, + "step": 18551 + }, + { + "epoch": 0.33089573003246175, + "grad_norm": 0.26256614923477173, + "learning_rate": 4.230935661011367e-05, + "loss": 0.1328, + "step": 18552 + }, + { + "epoch": 0.33091356615417544, + "grad_norm": 0.24681198596954346, + "learning_rate": 4.230823349633961e-05, + "loss": 0.1312, + "step": 18553 + }, + { + "epoch": 0.33093140227588913, + "grad_norm": 0.49699294567108154, + "learning_rate": 4.230711031547286e-05, + "loss": 0.21, + "step": 18554 + }, + { + "epoch": 0.3309492383976028, + "grad_norm": 0.4116224944591522, + "learning_rate": 4.230598706751779e-05, + "loss": 0.1647, + "step": 18555 + }, + { + "epoch": 0.3309670745193165, + "grad_norm": 0.3963545858860016, + "learning_rate": 4.230486375247872e-05, + "loss": 0.207, + "step": 18556 + }, + { + "epoch": 0.3309849106410302, + "grad_norm": 0.22137166559696198, + "learning_rate": 4.230374037036003e-05, + "loss": 0.1497, + "step": 18557 + }, + { + "epoch": 0.3310027467627439, + "grad_norm": 0.2381601482629776, + "learning_rate": 4.230261692116606e-05, + "loss": 0.1583, + "step": 18558 + }, + { + "epoch": 0.3310205828844576, + "grad_norm": 0.284512996673584, + "learning_rate": 4.230149340490117e-05, + "loss": 0.1561, + "step": 18559 + }, + { + "epoch": 0.3310384190061713, + "grad_norm": 0.22340558469295502, + "learning_rate": 4.230036982156972e-05, + "loss": 0.1429, + "step": 18560 + }, + { + "epoch": 0.331056255127885, + "grad_norm": 0.2640310525894165, + "learning_rate": 4.229924617117606e-05, + "loss": 0.1536, + "step": 18561 + }, + { + "epoch": 0.3310740912495987, + "grad_norm": 0.26734215021133423, + "learning_rate": 4.229812245372454e-05, + "loss": 0.161, + "step": 18562 + }, + { + "epoch": 0.3310919273713124, + "grad_norm": 0.3465871214866638, + "learning_rate": 4.2296998669219535e-05, + "loss": 0.202, + "step": 18563 + }, + { + "epoch": 0.33110976349302607, + "grad_norm": 0.2486545294523239, + "learning_rate": 4.2295874817665385e-05, + "loss": 0.1349, + "step": 18564 + }, + { + "epoch": 0.33112759961473975, + "grad_norm": 0.27449294924736023, + "learning_rate": 4.229475089906645e-05, + "loss": 0.1151, + "step": 18565 + }, + { + "epoch": 0.33114543573645344, + "grad_norm": 0.27632424235343933, + "learning_rate": 4.2293626913427085e-05, + "loss": 0.1692, + "step": 18566 + }, + { + "epoch": 0.3311632718581672, + "grad_norm": 0.31477048993110657, + "learning_rate": 4.229250286075165e-05, + "loss": 0.1354, + "step": 18567 + }, + { + "epoch": 0.3311811079798809, + "grad_norm": 0.23741310834884644, + "learning_rate": 4.22913787410445e-05, + "loss": 0.1868, + "step": 18568 + }, + { + "epoch": 0.33119894410159456, + "grad_norm": 0.2532624304294586, + "learning_rate": 4.2290254554309994e-05, + "loss": 0.1407, + "step": 18569 + }, + { + "epoch": 0.33121678022330825, + "grad_norm": 0.339575856924057, + "learning_rate": 4.2289130300552494e-05, + "loss": 0.27, + "step": 18570 + }, + { + "epoch": 0.33123461634502194, + "grad_norm": 0.3205280303955078, + "learning_rate": 4.2288005979776345e-05, + "loss": 0.1445, + "step": 18571 + }, + { + "epoch": 0.3312524524667356, + "grad_norm": 0.2817758321762085, + "learning_rate": 4.2286881591985924e-05, + "loss": 0.1586, + "step": 18572 + }, + { + "epoch": 0.3312702885884493, + "grad_norm": 0.3039351999759674, + "learning_rate": 4.2285757137185575e-05, + "loss": 0.1379, + "step": 18573 + }, + { + "epoch": 0.331288124710163, + "grad_norm": 0.29977691173553467, + "learning_rate": 4.228463261537966e-05, + "loss": 0.1962, + "step": 18574 + }, + { + "epoch": 0.3313059608318767, + "grad_norm": 0.34099602699279785, + "learning_rate": 4.228350802657254e-05, + "loss": 0.1707, + "step": 18575 + }, + { + "epoch": 0.33132379695359043, + "grad_norm": 0.3081951141357422, + "learning_rate": 4.228238337076857e-05, + "loss": 0.1797, + "step": 18576 + }, + { + "epoch": 0.3313416330753041, + "grad_norm": 0.29143157601356506, + "learning_rate": 4.228125864797211e-05, + "loss": 0.1891, + "step": 18577 + }, + { + "epoch": 0.3313594691970178, + "grad_norm": 0.22725003957748413, + "learning_rate": 4.228013385818753e-05, + "loss": 0.1511, + "step": 18578 + }, + { + "epoch": 0.3313773053187315, + "grad_norm": 0.310830682516098, + "learning_rate": 4.2279009001419184e-05, + "loss": 0.1945, + "step": 18579 + }, + { + "epoch": 0.3313951414404452, + "grad_norm": 0.2794807553291321, + "learning_rate": 4.2277884077671424e-05, + "loss": 0.1742, + "step": 18580 + }, + { + "epoch": 0.3314129775621589, + "grad_norm": 0.3090817332267761, + "learning_rate": 4.2276759086948626e-05, + "loss": 0.2105, + "step": 18581 + }, + { + "epoch": 0.33143081368387256, + "grad_norm": 0.28938645124435425, + "learning_rate": 4.227563402925514e-05, + "loss": 0.1995, + "step": 18582 + }, + { + "epoch": 0.33144864980558625, + "grad_norm": 0.4023700952529907, + "learning_rate": 4.227450890459532e-05, + "loss": 0.2434, + "step": 18583 + }, + { + "epoch": 0.3314664859273, + "grad_norm": 0.425690233707428, + "learning_rate": 4.2273383712973545e-05, + "loss": 0.1813, + "step": 18584 + }, + { + "epoch": 0.3314843220490137, + "grad_norm": 0.29323694109916687, + "learning_rate": 4.2272258454394176e-05, + "loss": 0.1479, + "step": 18585 + }, + { + "epoch": 0.33150215817072737, + "grad_norm": 0.30246469378471375, + "learning_rate": 4.2271133128861554e-05, + "loss": 0.1413, + "step": 18586 + }, + { + "epoch": 0.33151999429244106, + "grad_norm": 0.29485321044921875, + "learning_rate": 4.2270007736380066e-05, + "loss": 0.1248, + "step": 18587 + }, + { + "epoch": 0.33153783041415474, + "grad_norm": 0.3434136211872101, + "learning_rate": 4.226888227695406e-05, + "loss": 0.22, + "step": 18588 + }, + { + "epoch": 0.33155566653586843, + "grad_norm": 0.35784029960632324, + "learning_rate": 4.2267756750587894e-05, + "loss": 0.1284, + "step": 18589 + }, + { + "epoch": 0.3315735026575821, + "grad_norm": 0.2941616475582123, + "learning_rate": 4.2266631157285945e-05, + "loss": 0.1428, + "step": 18590 + }, + { + "epoch": 0.3315913387792958, + "grad_norm": 0.28665873408317566, + "learning_rate": 4.226550549705257e-05, + "loss": 0.1511, + "step": 18591 + }, + { + "epoch": 0.33160917490100955, + "grad_norm": 0.2732289433479309, + "learning_rate": 4.2264379769892136e-05, + "loss": 0.1591, + "step": 18592 + }, + { + "epoch": 0.33162701102272324, + "grad_norm": 0.30442890524864197, + "learning_rate": 4.2263253975808996e-05, + "loss": 0.2504, + "step": 18593 + }, + { + "epoch": 0.3316448471444369, + "grad_norm": 0.22205771505832672, + "learning_rate": 4.226212811480752e-05, + "loss": 0.1949, + "step": 18594 + }, + { + "epoch": 0.3316626832661506, + "grad_norm": 0.22176052629947662, + "learning_rate": 4.226100218689209e-05, + "loss": 0.1501, + "step": 18595 + }, + { + "epoch": 0.3316805193878643, + "grad_norm": 0.311583548784256, + "learning_rate": 4.225987619206704e-05, + "loss": 0.1464, + "step": 18596 + }, + { + "epoch": 0.331698355509578, + "grad_norm": 0.20730595290660858, + "learning_rate": 4.225875013033675e-05, + "loss": 0.1782, + "step": 18597 + }, + { + "epoch": 0.3317161916312917, + "grad_norm": 0.41936686635017395, + "learning_rate": 4.225762400170558e-05, + "loss": 0.2092, + "step": 18598 + }, + { + "epoch": 0.33173402775300537, + "grad_norm": 0.24077372252941132, + "learning_rate": 4.2256497806177895e-05, + "loss": 0.1574, + "step": 18599 + }, + { + "epoch": 0.33175186387471906, + "grad_norm": 0.22923429310321808, + "learning_rate": 4.2255371543758075e-05, + "loss": 0.1534, + "step": 18600 + }, + { + "epoch": 0.3317696999964328, + "grad_norm": 0.24850907921791077, + "learning_rate": 4.225424521445047e-05, + "loss": 0.1717, + "step": 18601 + }, + { + "epoch": 0.3317875361181465, + "grad_norm": 0.3402264714241028, + "learning_rate": 4.2253118818259454e-05, + "loss": 0.1619, + "step": 18602 + }, + { + "epoch": 0.3318053722398602, + "grad_norm": 0.37563788890838623, + "learning_rate": 4.225199235518939e-05, + "loss": 0.1812, + "step": 18603 + }, + { + "epoch": 0.33182320836157386, + "grad_norm": 0.25607120990753174, + "learning_rate": 4.225086582524465e-05, + "loss": 0.1485, + "step": 18604 + }, + { + "epoch": 0.33184104448328755, + "grad_norm": 0.3265029489994049, + "learning_rate": 4.224973922842958e-05, + "loss": 0.1834, + "step": 18605 + }, + { + "epoch": 0.33185888060500124, + "grad_norm": 0.2571837306022644, + "learning_rate": 4.224861256474858e-05, + "loss": 0.1837, + "step": 18606 + }, + { + "epoch": 0.3318767167267149, + "grad_norm": 0.256216436624527, + "learning_rate": 4.224748583420599e-05, + "loss": 0.1953, + "step": 18607 + }, + { + "epoch": 0.3318945528484286, + "grad_norm": 0.33437854051589966, + "learning_rate": 4.224635903680619e-05, + "loss": 0.1653, + "step": 18608 + }, + { + "epoch": 0.33191238897014236, + "grad_norm": 0.24014635384082794, + "learning_rate": 4.224523217255355e-05, + "loss": 0.1689, + "step": 18609 + }, + { + "epoch": 0.33193022509185605, + "grad_norm": 0.2903948724269867, + "learning_rate": 4.2244105241452425e-05, + "loss": 0.1395, + "step": 18610 + }, + { + "epoch": 0.33194806121356973, + "grad_norm": 0.2848571240901947, + "learning_rate": 4.2242978243507195e-05, + "loss": 0.1533, + "step": 18611 + }, + { + "epoch": 0.3319658973352834, + "grad_norm": 0.2606837749481201, + "learning_rate": 4.224185117872223e-05, + "loss": 0.1585, + "step": 18612 + }, + { + "epoch": 0.3319837334569971, + "grad_norm": 0.26199620962142944, + "learning_rate": 4.22407240471019e-05, + "loss": 0.1296, + "step": 18613 + }, + { + "epoch": 0.3320015695787108, + "grad_norm": 0.23331594467163086, + "learning_rate": 4.2239596848650553e-05, + "loss": 0.1614, + "step": 18614 + }, + { + "epoch": 0.3320194057004245, + "grad_norm": 0.30470937490463257, + "learning_rate": 4.2238469583372584e-05, + "loss": 0.189, + "step": 18615 + }, + { + "epoch": 0.3320372418221382, + "grad_norm": 0.2593975365161896, + "learning_rate": 4.223734225127235e-05, + "loss": 0.1602, + "step": 18616 + }, + { + "epoch": 0.33205507794385186, + "grad_norm": 0.28764283657073975, + "learning_rate": 4.223621485235423e-05, + "loss": 0.1441, + "step": 18617 + }, + { + "epoch": 0.3320729140655656, + "grad_norm": 0.2824632525444031, + "learning_rate": 4.223508738662259e-05, + "loss": 0.1612, + "step": 18618 + }, + { + "epoch": 0.3320907501872793, + "grad_norm": 0.5225083827972412, + "learning_rate": 4.223395985408178e-05, + "loss": 0.1899, + "step": 18619 + }, + { + "epoch": 0.332108586308993, + "grad_norm": 0.19410806894302368, + "learning_rate": 4.223283225473621e-05, + "loss": 0.1289, + "step": 18620 + }, + { + "epoch": 0.33212642243070667, + "grad_norm": 0.23426847159862518, + "learning_rate": 4.2231704588590214e-05, + "loss": 0.2083, + "step": 18621 + }, + { + "epoch": 0.33214425855242036, + "grad_norm": 0.2894057631492615, + "learning_rate": 4.223057685564819e-05, + "loss": 0.1419, + "step": 18622 + }, + { + "epoch": 0.33216209467413405, + "grad_norm": 0.22195616364479065, + "learning_rate": 4.2229449055914495e-05, + "loss": 0.1455, + "step": 18623 + }, + { + "epoch": 0.33217993079584773, + "grad_norm": 0.2332436442375183, + "learning_rate": 4.2228321189393505e-05, + "loss": 0.1823, + "step": 18624 + }, + { + "epoch": 0.3321977669175614, + "grad_norm": 0.34190788865089417, + "learning_rate": 4.222719325608959e-05, + "loss": 0.144, + "step": 18625 + }, + { + "epoch": 0.33221560303927516, + "grad_norm": 0.22976568341255188, + "learning_rate": 4.222606525600713e-05, + "loss": 0.1605, + "step": 18626 + }, + { + "epoch": 0.33223343916098885, + "grad_norm": 0.2539297044277191, + "learning_rate": 4.2224937189150484e-05, + "loss": 0.1671, + "step": 18627 + }, + { + "epoch": 0.33225127528270254, + "grad_norm": 0.3443310260772705, + "learning_rate": 4.222380905552404e-05, + "loss": 0.204, + "step": 18628 + }, + { + "epoch": 0.33226911140441623, + "grad_norm": 0.40971699357032776, + "learning_rate": 4.222268085513216e-05, + "loss": 0.1999, + "step": 18629 + }, + { + "epoch": 0.3322869475261299, + "grad_norm": 0.247114896774292, + "learning_rate": 4.222155258797922e-05, + "loss": 0.1647, + "step": 18630 + }, + { + "epoch": 0.3323047836478436, + "grad_norm": 0.2625199854373932, + "learning_rate": 4.222042425406959e-05, + "loss": 0.1446, + "step": 18631 + }, + { + "epoch": 0.3323226197695573, + "grad_norm": 0.35182222723960876, + "learning_rate": 4.2219295853407647e-05, + "loss": 0.1802, + "step": 18632 + }, + { + "epoch": 0.332340455891271, + "grad_norm": 0.26384562253952026, + "learning_rate": 4.221816738599778e-05, + "loss": 0.1807, + "step": 18633 + }, + { + "epoch": 0.3323582920129847, + "grad_norm": 0.3444817364215851, + "learning_rate": 4.2217038851844335e-05, + "loss": 0.1556, + "step": 18634 + }, + { + "epoch": 0.3323761281346984, + "grad_norm": 0.4344008266925812, + "learning_rate": 4.22159102509517e-05, + "loss": 0.2188, + "step": 18635 + }, + { + "epoch": 0.3323939642564121, + "grad_norm": 0.1910325288772583, + "learning_rate": 4.221478158332426e-05, + "loss": 0.1672, + "step": 18636 + }, + { + "epoch": 0.3324118003781258, + "grad_norm": 0.23896032571792603, + "learning_rate": 4.221365284896637e-05, + "loss": 0.1927, + "step": 18637 + }, + { + "epoch": 0.3324296364998395, + "grad_norm": 0.28261053562164307, + "learning_rate": 4.221252404788242e-05, + "loss": 0.1974, + "step": 18638 + }, + { + "epoch": 0.33244747262155316, + "grad_norm": 0.30259254574775696, + "learning_rate": 4.221139518007679e-05, + "loss": 0.183, + "step": 18639 + }, + { + "epoch": 0.33246530874326685, + "grad_norm": 0.246919646859169, + "learning_rate": 4.221026624555384e-05, + "loss": 0.1836, + "step": 18640 + }, + { + "epoch": 0.33248314486498054, + "grad_norm": 0.20554570853710175, + "learning_rate": 4.2209137244317956e-05, + "loss": 0.1848, + "step": 18641 + }, + { + "epoch": 0.33250098098669423, + "grad_norm": 0.2147049754858017, + "learning_rate": 4.220800817637351e-05, + "loss": 0.1928, + "step": 18642 + }, + { + "epoch": 0.33251881710840797, + "grad_norm": 0.2581635117530823, + "learning_rate": 4.220687904172489e-05, + "loss": 0.1625, + "step": 18643 + }, + { + "epoch": 0.33253665323012166, + "grad_norm": 0.26008719205856323, + "learning_rate": 4.220574984037645e-05, + "loss": 0.1725, + "step": 18644 + }, + { + "epoch": 0.33255448935183535, + "grad_norm": 0.25723814964294434, + "learning_rate": 4.220462057233259e-05, + "loss": 0.138, + "step": 18645 + }, + { + "epoch": 0.33257232547354904, + "grad_norm": 0.30129143595695496, + "learning_rate": 4.2203491237597674e-05, + "loss": 0.1515, + "step": 18646 + }, + { + "epoch": 0.3325901615952627, + "grad_norm": 0.21004438400268555, + "learning_rate": 4.2202361836176087e-05, + "loss": 0.1605, + "step": 18647 + }, + { + "epoch": 0.3326079977169764, + "grad_norm": 0.2537418603897095, + "learning_rate": 4.220123236807221e-05, + "loss": 0.2125, + "step": 18648 + }, + { + "epoch": 0.3326258338386901, + "grad_norm": 0.21132661402225494, + "learning_rate": 4.22001028332904e-05, + "loss": 0.1336, + "step": 18649 + }, + { + "epoch": 0.3326436699604038, + "grad_norm": 0.300246924161911, + "learning_rate": 4.219897323183506e-05, + "loss": 0.2098, + "step": 18650 + }, + { + "epoch": 0.33266150608211753, + "grad_norm": 0.3238065838813782, + "learning_rate": 4.219784356371056e-05, + "loss": 0.2127, + "step": 18651 + }, + { + "epoch": 0.3326793422038312, + "grad_norm": 0.23425902426242828, + "learning_rate": 4.219671382892127e-05, + "loss": 0.1537, + "step": 18652 + }, + { + "epoch": 0.3326971783255449, + "grad_norm": 0.34821856021881104, + "learning_rate": 4.219558402747159e-05, + "loss": 0.2092, + "step": 18653 + }, + { + "epoch": 0.3327150144472586, + "grad_norm": 0.19794493913650513, + "learning_rate": 4.219445415936588e-05, + "loss": 0.0962, + "step": 18654 + }, + { + "epoch": 0.3327328505689723, + "grad_norm": 0.3300984799861908, + "learning_rate": 4.219332422460853e-05, + "loss": 0.2468, + "step": 18655 + }, + { + "epoch": 0.33275068669068597, + "grad_norm": 0.26174670457839966, + "learning_rate": 4.219219422320392e-05, + "loss": 0.171, + "step": 18656 + }, + { + "epoch": 0.33276852281239966, + "grad_norm": 0.23382668197155, + "learning_rate": 4.219106415515642e-05, + "loss": 0.1685, + "step": 18657 + }, + { + "epoch": 0.33278635893411335, + "grad_norm": 0.32599666714668274, + "learning_rate": 4.2189934020470415e-05, + "loss": 0.1452, + "step": 18658 + }, + { + "epoch": 0.33280419505582703, + "grad_norm": 0.21382218599319458, + "learning_rate": 4.21888038191503e-05, + "loss": 0.1606, + "step": 18659 + }, + { + "epoch": 0.3328220311775408, + "grad_norm": 0.34457188844680786, + "learning_rate": 4.218767355120044e-05, + "loss": 0.1367, + "step": 18660 + }, + { + "epoch": 0.33283986729925447, + "grad_norm": 0.24509088695049286, + "learning_rate": 4.218654321662522e-05, + "loss": 0.1743, + "step": 18661 + }, + { + "epoch": 0.33285770342096815, + "grad_norm": 0.21297286450862885, + "learning_rate": 4.2185412815429013e-05, + "loss": 0.1497, + "step": 18662 + }, + { + "epoch": 0.33287553954268184, + "grad_norm": 0.2544233798980713, + "learning_rate": 4.218428234761622e-05, + "loss": 0.1676, + "step": 18663 + }, + { + "epoch": 0.33289337566439553, + "grad_norm": 0.2422240823507309, + "learning_rate": 4.2183151813191215e-05, + "loss": 0.0894, + "step": 18664 + }, + { + "epoch": 0.3329112117861092, + "grad_norm": 0.28995004296302795, + "learning_rate": 4.2182021212158376e-05, + "loss": 0.1642, + "step": 18665 + }, + { + "epoch": 0.3329290479078229, + "grad_norm": 0.2857154607772827, + "learning_rate": 4.218089054452209e-05, + "loss": 0.1771, + "step": 18666 + }, + { + "epoch": 0.3329468840295366, + "grad_norm": 0.21182294189929962, + "learning_rate": 4.2179759810286734e-05, + "loss": 0.139, + "step": 18667 + }, + { + "epoch": 0.33296472015125034, + "grad_norm": 0.2658856511116028, + "learning_rate": 4.217862900945669e-05, + "loss": 0.16, + "step": 18668 + }, + { + "epoch": 0.332982556272964, + "grad_norm": 0.3196842670440674, + "learning_rate": 4.217749814203636e-05, + "loss": 0.1521, + "step": 18669 + }, + { + "epoch": 0.3330003923946777, + "grad_norm": 0.3064384162425995, + "learning_rate": 4.217636720803011e-05, + "loss": 0.1316, + "step": 18670 + }, + { + "epoch": 0.3330182285163914, + "grad_norm": 0.4255395233631134, + "learning_rate": 4.217523620744233e-05, + "loss": 0.2148, + "step": 18671 + }, + { + "epoch": 0.3330360646381051, + "grad_norm": 0.25622081756591797, + "learning_rate": 4.21741051402774e-05, + "loss": 0.1632, + "step": 18672 + }, + { + "epoch": 0.3330539007598188, + "grad_norm": 0.2683780789375305, + "learning_rate": 4.21729740065397e-05, + "loss": 0.1317, + "step": 18673 + }, + { + "epoch": 0.33307173688153247, + "grad_norm": 0.33805572986602783, + "learning_rate": 4.217184280623363e-05, + "loss": 0.1889, + "step": 18674 + }, + { + "epoch": 0.33308957300324615, + "grad_norm": 0.33626481890678406, + "learning_rate": 4.217071153936356e-05, + "loss": 0.1635, + "step": 18675 + }, + { + "epoch": 0.33310740912495984, + "grad_norm": 0.19020210206508636, + "learning_rate": 4.216958020593389e-05, + "loss": 0.1613, + "step": 18676 + }, + { + "epoch": 0.3331252452466736, + "grad_norm": 0.35197606682777405, + "learning_rate": 4.216844880594899e-05, + "loss": 0.1529, + "step": 18677 + }, + { + "epoch": 0.3331430813683873, + "grad_norm": 0.23571282625198364, + "learning_rate": 4.2167317339413256e-05, + "loss": 0.1734, + "step": 18678 + }, + { + "epoch": 0.33316091749010096, + "grad_norm": 0.3509941101074219, + "learning_rate": 4.216618580633107e-05, + "loss": 0.1884, + "step": 18679 + }, + { + "epoch": 0.33317875361181465, + "grad_norm": 0.20872823894023895, + "learning_rate": 4.2165054206706825e-05, + "loss": 0.1596, + "step": 18680 + }, + { + "epoch": 0.33319658973352834, + "grad_norm": 0.273303359746933, + "learning_rate": 4.216392254054489e-05, + "loss": 0.1932, + "step": 18681 + }, + { + "epoch": 0.333214425855242, + "grad_norm": 0.2837095558643341, + "learning_rate": 4.216279080784966e-05, + "loss": 0.2065, + "step": 18682 + }, + { + "epoch": 0.3332322619769557, + "grad_norm": 0.32746848464012146, + "learning_rate": 4.2161659008625534e-05, + "loss": 0.1523, + "step": 18683 + }, + { + "epoch": 0.3332500980986694, + "grad_norm": 0.22128203511238098, + "learning_rate": 4.216052714287689e-05, + "loss": 0.1334, + "step": 18684 + }, + { + "epoch": 0.33326793422038314, + "grad_norm": 0.24784326553344727, + "learning_rate": 4.2159395210608116e-05, + "loss": 0.1961, + "step": 18685 + }, + { + "epoch": 0.33328577034209683, + "grad_norm": 0.1982993185520172, + "learning_rate": 4.21582632118236e-05, + "loss": 0.1117, + "step": 18686 + }, + { + "epoch": 0.3333036064638105, + "grad_norm": 0.33177274465560913, + "learning_rate": 4.215713114652773e-05, + "loss": 0.1903, + "step": 18687 + }, + { + "epoch": 0.3333214425855242, + "grad_norm": 0.26933786273002625, + "learning_rate": 4.215599901472489e-05, + "loss": 0.1512, + "step": 18688 + }, + { + "epoch": 0.3333392787072379, + "grad_norm": 0.2538033723831177, + "learning_rate": 4.215486681641947e-05, + "loss": 0.1624, + "step": 18689 + }, + { + "epoch": 0.3333571148289516, + "grad_norm": 0.21105821430683136, + "learning_rate": 4.2153734551615864e-05, + "loss": 0.1258, + "step": 18690 + }, + { + "epoch": 0.33337495095066527, + "grad_norm": 0.33613941073417664, + "learning_rate": 4.215260222031846e-05, + "loss": 0.1905, + "step": 18691 + }, + { + "epoch": 0.33339278707237896, + "grad_norm": 0.2952043414115906, + "learning_rate": 4.2151469822531645e-05, + "loss": 0.1837, + "step": 18692 + }, + { + "epoch": 0.3334106231940927, + "grad_norm": 0.243205726146698, + "learning_rate": 4.2150337358259805e-05, + "loss": 0.1457, + "step": 18693 + }, + { + "epoch": 0.3334284593158064, + "grad_norm": 0.2892273962497711, + "learning_rate": 4.214920482750734e-05, + "loss": 0.247, + "step": 18694 + }, + { + "epoch": 0.3334462954375201, + "grad_norm": 0.27271246910095215, + "learning_rate": 4.2148072230278626e-05, + "loss": 0.1384, + "step": 18695 + }, + { + "epoch": 0.33346413155923377, + "grad_norm": 0.2792697846889496, + "learning_rate": 4.214693956657807e-05, + "loss": 0.1509, + "step": 18696 + }, + { + "epoch": 0.33348196768094746, + "grad_norm": 0.3380482792854309, + "learning_rate": 4.214580683641005e-05, + "loss": 0.1487, + "step": 18697 + }, + { + "epoch": 0.33349980380266114, + "grad_norm": 0.22144699096679688, + "learning_rate": 4.214467403977896e-05, + "loss": 0.1456, + "step": 18698 + }, + { + "epoch": 0.33351763992437483, + "grad_norm": 0.1926298588514328, + "learning_rate": 4.2143541176689195e-05, + "loss": 0.1284, + "step": 18699 + }, + { + "epoch": 0.3335354760460885, + "grad_norm": 0.23470169305801392, + "learning_rate": 4.214240824714514e-05, + "loss": 0.1506, + "step": 18700 + }, + { + "epoch": 0.3335533121678022, + "grad_norm": 0.23451177775859833, + "learning_rate": 4.214127525115119e-05, + "loss": 0.1396, + "step": 18701 + }, + { + "epoch": 0.33357114828951595, + "grad_norm": 0.2369646281003952, + "learning_rate": 4.214014218871174e-05, + "loss": 0.1655, + "step": 18702 + }, + { + "epoch": 0.33358898441122964, + "grad_norm": 0.2560775876045227, + "learning_rate": 4.213900905983118e-05, + "loss": 0.1932, + "step": 18703 + }, + { + "epoch": 0.3336068205329433, + "grad_norm": 0.29328837990760803, + "learning_rate": 4.213787586451389e-05, + "loss": 0.1075, + "step": 18704 + }, + { + "epoch": 0.333624656654657, + "grad_norm": 0.21619559824466705, + "learning_rate": 4.2136742602764286e-05, + "loss": 0.1621, + "step": 18705 + }, + { + "epoch": 0.3336424927763707, + "grad_norm": 0.2599635422229767, + "learning_rate": 4.213560927458674e-05, + "loss": 0.1808, + "step": 18706 + }, + { + "epoch": 0.3336603288980844, + "grad_norm": 0.2735520303249359, + "learning_rate": 4.213447587998566e-05, + "loss": 0.1052, + "step": 18707 + }, + { + "epoch": 0.3336781650197981, + "grad_norm": 0.304398775100708, + "learning_rate": 4.213334241896544e-05, + "loss": 0.1698, + "step": 18708 + }, + { + "epoch": 0.33369600114151177, + "grad_norm": 0.27705931663513184, + "learning_rate": 4.213220889153045e-05, + "loss": 0.1733, + "step": 18709 + }, + { + "epoch": 0.3337138372632255, + "grad_norm": 0.23909097909927368, + "learning_rate": 4.2131075297685113e-05, + "loss": 0.1379, + "step": 18710 + }, + { + "epoch": 0.3337316733849392, + "grad_norm": 0.2765835225582123, + "learning_rate": 4.2129941637433814e-05, + "loss": 0.1892, + "step": 18711 + }, + { + "epoch": 0.3337495095066529, + "grad_norm": 0.22051171958446503, + "learning_rate": 4.212880791078093e-05, + "loss": 0.1512, + "step": 18712 + }, + { + "epoch": 0.3337673456283666, + "grad_norm": 0.2813849151134491, + "learning_rate": 4.212767411773089e-05, + "loss": 0.1595, + "step": 18713 + }, + { + "epoch": 0.33378518175008026, + "grad_norm": 0.33392274379730225, + "learning_rate": 4.212654025828805e-05, + "loss": 0.1147, + "step": 18714 + }, + { + "epoch": 0.33380301787179395, + "grad_norm": 0.2104877531528473, + "learning_rate": 4.212540633245683e-05, + "loss": 0.1391, + "step": 18715 + }, + { + "epoch": 0.33382085399350764, + "grad_norm": 0.3532252907752991, + "learning_rate": 4.2124272340241625e-05, + "loss": 0.1785, + "step": 18716 + }, + { + "epoch": 0.3338386901152213, + "grad_norm": 0.3038238286972046, + "learning_rate": 4.212313828164683e-05, + "loss": 0.137, + "step": 18717 + }, + { + "epoch": 0.333856526236935, + "grad_norm": 0.3160938322544098, + "learning_rate": 4.212200415667683e-05, + "loss": 0.1779, + "step": 18718 + }, + { + "epoch": 0.33387436235864876, + "grad_norm": 0.2350011020898819, + "learning_rate": 4.212086996533603e-05, + "loss": 0.131, + "step": 18719 + }, + { + "epoch": 0.33389219848036245, + "grad_norm": 0.18767918646335602, + "learning_rate": 4.211973570762882e-05, + "loss": 0.1438, + "step": 18720 + }, + { + "epoch": 0.33391003460207613, + "grad_norm": 0.29537296295166016, + "learning_rate": 4.211860138355961e-05, + "loss": 0.1782, + "step": 18721 + }, + { + "epoch": 0.3339278707237898, + "grad_norm": 0.2523314356803894, + "learning_rate": 4.211746699313278e-05, + "loss": 0.1556, + "step": 18722 + }, + { + "epoch": 0.3339457068455035, + "grad_norm": 0.251301646232605, + "learning_rate": 4.2116332536352744e-05, + "loss": 0.182, + "step": 18723 + }, + { + "epoch": 0.3339635429672172, + "grad_norm": 0.2746959924697876, + "learning_rate": 4.2115198013223886e-05, + "loss": 0.2163, + "step": 18724 + }, + { + "epoch": 0.3339813790889309, + "grad_norm": 0.37001949548721313, + "learning_rate": 4.211406342375061e-05, + "loss": 0.1818, + "step": 18725 + }, + { + "epoch": 0.3339992152106446, + "grad_norm": 0.27162232995033264, + "learning_rate": 4.2112928767937313e-05, + "loss": 0.1532, + "step": 18726 + }, + { + "epoch": 0.3340170513323583, + "grad_norm": 0.34194305539131165, + "learning_rate": 4.2111794045788395e-05, + "loss": 0.0979, + "step": 18727 + }, + { + "epoch": 0.334034887454072, + "grad_norm": 0.2687188684940338, + "learning_rate": 4.211065925730825e-05, + "loss": 0.1669, + "step": 18728 + }, + { + "epoch": 0.3340527235757857, + "grad_norm": 0.41771769523620605, + "learning_rate": 4.210952440250128e-05, + "loss": 0.1245, + "step": 18729 + }, + { + "epoch": 0.3340705596974994, + "grad_norm": 0.3138853907585144, + "learning_rate": 4.210838948137189e-05, + "loss": 0.1755, + "step": 18730 + }, + { + "epoch": 0.33408839581921307, + "grad_norm": 0.2310846447944641, + "learning_rate": 4.2107254493924464e-05, + "loss": 0.1689, + "step": 18731 + }, + { + "epoch": 0.33410623194092676, + "grad_norm": 0.313462495803833, + "learning_rate": 4.210611944016342e-05, + "loss": 0.1747, + "step": 18732 + }, + { + "epoch": 0.33412406806264044, + "grad_norm": 0.2913760840892792, + "learning_rate": 4.210498432009314e-05, + "loss": 0.1801, + "step": 18733 + }, + { + "epoch": 0.33414190418435413, + "grad_norm": 0.3188319504261017, + "learning_rate": 4.2103849133718044e-05, + "loss": 0.1551, + "step": 18734 + }, + { + "epoch": 0.3341597403060679, + "grad_norm": 0.22271277010440826, + "learning_rate": 4.210271388104251e-05, + "loss": 0.1068, + "step": 18735 + }, + { + "epoch": 0.33417757642778156, + "grad_norm": 0.19057299196720123, + "learning_rate": 4.210157856207096e-05, + "loss": 0.1556, + "step": 18736 + }, + { + "epoch": 0.33419541254949525, + "grad_norm": 0.29498863220214844, + "learning_rate": 4.210044317680778e-05, + "loss": 0.1463, + "step": 18737 + }, + { + "epoch": 0.33421324867120894, + "grad_norm": 0.20593951642513275, + "learning_rate": 4.2099307725257376e-05, + "loss": 0.124, + "step": 18738 + }, + { + "epoch": 0.33423108479292263, + "grad_norm": 0.2947141230106354, + "learning_rate": 4.2098172207424145e-05, + "loss": 0.0981, + "step": 18739 + }, + { + "epoch": 0.3342489209146363, + "grad_norm": 0.374240905046463, + "learning_rate": 4.20970366233125e-05, + "loss": 0.2417, + "step": 18740 + }, + { + "epoch": 0.33426675703635, + "grad_norm": 0.27606189250946045, + "learning_rate": 4.2095900972926835e-05, + "loss": 0.2424, + "step": 18741 + }, + { + "epoch": 0.3342845931580637, + "grad_norm": 0.23406417667865753, + "learning_rate": 4.209476525627155e-05, + "loss": 0.1597, + "step": 18742 + }, + { + "epoch": 0.3343024292797774, + "grad_norm": 0.29940590262413025, + "learning_rate": 4.2093629473351046e-05, + "loss": 0.2361, + "step": 18743 + }, + { + "epoch": 0.3343202654014911, + "grad_norm": 0.2661944329738617, + "learning_rate": 4.209249362416974e-05, + "loss": 0.1585, + "step": 18744 + }, + { + "epoch": 0.3343381015232048, + "grad_norm": 0.3111472427845001, + "learning_rate": 4.209135770873202e-05, + "loss": 0.1607, + "step": 18745 + }, + { + "epoch": 0.3343559376449185, + "grad_norm": 0.32731369137763977, + "learning_rate": 4.20902217270423e-05, + "loss": 0.1747, + "step": 18746 + }, + { + "epoch": 0.3343737737666322, + "grad_norm": 0.2322869598865509, + "learning_rate": 4.208908567910497e-05, + "loss": 0.1709, + "step": 18747 + }, + { + "epoch": 0.3343916098883459, + "grad_norm": 0.23245777189731598, + "learning_rate": 4.2087949564924445e-05, + "loss": 0.2006, + "step": 18748 + }, + { + "epoch": 0.33440944601005956, + "grad_norm": 0.33070069551467896, + "learning_rate": 4.2086813384505125e-05, + "loss": 0.193, + "step": 18749 + }, + { + "epoch": 0.33442728213177325, + "grad_norm": 0.3557249903678894, + "learning_rate": 4.2085677137851413e-05, + "loss": 0.1592, + "step": 18750 + }, + { + "epoch": 0.33444511825348694, + "grad_norm": 0.1902570277452469, + "learning_rate": 4.208454082496772e-05, + "loss": 0.146, + "step": 18751 + }, + { + "epoch": 0.3344629543752007, + "grad_norm": 0.27751973271369934, + "learning_rate": 4.208340444585844e-05, + "loss": 0.1939, + "step": 18752 + }, + { + "epoch": 0.33448079049691437, + "grad_norm": 0.20552270114421844, + "learning_rate": 4.2082268000527994e-05, + "loss": 0.1441, + "step": 18753 + }, + { + "epoch": 0.33449862661862806, + "grad_norm": 0.45457321405410767, + "learning_rate": 4.208113148898076e-05, + "loss": 0.1558, + "step": 18754 + }, + { + "epoch": 0.33451646274034175, + "grad_norm": 0.2982683777809143, + "learning_rate": 4.207999491122118e-05, + "loss": 0.1841, + "step": 18755 + }, + { + "epoch": 0.33453429886205543, + "grad_norm": 0.28458109498023987, + "learning_rate": 4.2078858267253626e-05, + "loss": 0.1539, + "step": 18756 + }, + { + "epoch": 0.3345521349837691, + "grad_norm": 0.2325965315103531, + "learning_rate": 4.207772155708253e-05, + "loss": 0.1295, + "step": 18757 + }, + { + "epoch": 0.3345699711054828, + "grad_norm": 0.21238230168819427, + "learning_rate": 4.207658478071228e-05, + "loss": 0.1261, + "step": 18758 + }, + { + "epoch": 0.3345878072271965, + "grad_norm": 0.33513128757476807, + "learning_rate": 4.207544793814728e-05, + "loss": 0.1901, + "step": 18759 + }, + { + "epoch": 0.3346056433489102, + "grad_norm": 0.31899237632751465, + "learning_rate": 4.2074311029391963e-05, + "loss": 0.1952, + "step": 18760 + }, + { + "epoch": 0.33462347947062393, + "grad_norm": 0.29127180576324463, + "learning_rate": 4.207317405445072e-05, + "loss": 0.1794, + "step": 18761 + }, + { + "epoch": 0.3346413155923376, + "grad_norm": 0.2133120447397232, + "learning_rate": 4.207203701332794e-05, + "loss": 0.1338, + "step": 18762 + }, + { + "epoch": 0.3346591517140513, + "grad_norm": 0.22214551270008087, + "learning_rate": 4.207089990602806e-05, + "loss": 0.1584, + "step": 18763 + }, + { + "epoch": 0.334676987835765, + "grad_norm": 0.25202473998069763, + "learning_rate": 4.206976273255547e-05, + "loss": 0.1978, + "step": 18764 + }, + { + "epoch": 0.3346948239574787, + "grad_norm": 0.252835750579834, + "learning_rate": 4.2068625492914595e-05, + "loss": 0.1542, + "step": 18765 + }, + { + "epoch": 0.33471266007919237, + "grad_norm": 0.24503786861896515, + "learning_rate": 4.206748818710982e-05, + "loss": 0.1294, + "step": 18766 + }, + { + "epoch": 0.33473049620090606, + "grad_norm": 0.27153870463371277, + "learning_rate": 4.206635081514557e-05, + "loss": 0.1203, + "step": 18767 + }, + { + "epoch": 0.33474833232261975, + "grad_norm": 0.23417600989341736, + "learning_rate": 4.2065213377026244e-05, + "loss": 0.1707, + "step": 18768 + }, + { + "epoch": 0.3347661684443335, + "grad_norm": 0.17935870587825775, + "learning_rate": 4.206407587275627e-05, + "loss": 0.115, + "step": 18769 + }, + { + "epoch": 0.3347840045660472, + "grad_norm": 0.36513659358024597, + "learning_rate": 4.206293830234004e-05, + "loss": 0.1214, + "step": 18770 + }, + { + "epoch": 0.33480184068776087, + "grad_norm": 0.30545157194137573, + "learning_rate": 4.206180066578196e-05, + "loss": 0.1475, + "step": 18771 + }, + { + "epoch": 0.33481967680947455, + "grad_norm": 0.2862299680709839, + "learning_rate": 4.2060662963086454e-05, + "loss": 0.1745, + "step": 18772 + }, + { + "epoch": 0.33483751293118824, + "grad_norm": 0.34034794569015503, + "learning_rate": 4.2059525194257934e-05, + "loss": 0.2029, + "step": 18773 + }, + { + "epoch": 0.33485534905290193, + "grad_norm": 0.3153921961784363, + "learning_rate": 4.2058387359300786e-05, + "loss": 0.1925, + "step": 18774 + }, + { + "epoch": 0.3348731851746156, + "grad_norm": 0.29298266768455505, + "learning_rate": 4.205724945821944e-05, + "loss": 0.1766, + "step": 18775 + }, + { + "epoch": 0.3348910212963293, + "grad_norm": 0.27643847465515137, + "learning_rate": 4.2056111491018314e-05, + "loss": 0.135, + "step": 18776 + }, + { + "epoch": 0.334908857418043, + "grad_norm": 0.21980465948581696, + "learning_rate": 4.2054973457701804e-05, + "loss": 0.1806, + "step": 18777 + }, + { + "epoch": 0.33492669353975674, + "grad_norm": 0.21354219317436218, + "learning_rate": 4.205383535827432e-05, + "loss": 0.1162, + "step": 18778 + }, + { + "epoch": 0.3349445296614704, + "grad_norm": 0.28409087657928467, + "learning_rate": 4.2052697192740284e-05, + "loss": 0.1902, + "step": 18779 + }, + { + "epoch": 0.3349623657831841, + "grad_norm": 0.3530551493167877, + "learning_rate": 4.205155896110411e-05, + "loss": 0.1857, + "step": 18780 + }, + { + "epoch": 0.3349802019048978, + "grad_norm": 0.25170814990997314, + "learning_rate": 4.205042066337019e-05, + "loss": 0.175, + "step": 18781 + }, + { + "epoch": 0.3349980380266115, + "grad_norm": 0.2856610417366028, + "learning_rate": 4.2049282299542964e-05, + "loss": 0.1688, + "step": 18782 + }, + { + "epoch": 0.3350158741483252, + "grad_norm": 0.2508857846260071, + "learning_rate": 4.204814386962682e-05, + "loss": 0.1817, + "step": 18783 + }, + { + "epoch": 0.33503371027003886, + "grad_norm": 0.28833040595054626, + "learning_rate": 4.204700537362619e-05, + "loss": 0.1799, + "step": 18784 + }, + { + "epoch": 0.33505154639175255, + "grad_norm": 0.2016073763370514, + "learning_rate": 4.204586681154548e-05, + "loss": 0.1324, + "step": 18785 + }, + { + "epoch": 0.3350693825134663, + "grad_norm": 0.22017642855644226, + "learning_rate": 4.20447281833891e-05, + "loss": 0.1274, + "step": 18786 + }, + { + "epoch": 0.33508721863518, + "grad_norm": 0.41027387976646423, + "learning_rate": 4.204358948916147e-05, + "loss": 0.1389, + "step": 18787 + }, + { + "epoch": 0.33510505475689367, + "grad_norm": 0.28757673501968384, + "learning_rate": 4.2042450728867e-05, + "loss": 0.1717, + "step": 18788 + }, + { + "epoch": 0.33512289087860736, + "grad_norm": 0.4009237587451935, + "learning_rate": 4.20413119025101e-05, + "loss": 0.2333, + "step": 18789 + }, + { + "epoch": 0.33514072700032105, + "grad_norm": 0.3056463301181793, + "learning_rate": 4.2040173010095187e-05, + "loss": 0.2015, + "step": 18790 + }, + { + "epoch": 0.33515856312203474, + "grad_norm": 0.27369338274002075, + "learning_rate": 4.203903405162669e-05, + "loss": 0.1947, + "step": 18791 + }, + { + "epoch": 0.3351763992437484, + "grad_norm": 0.3525892496109009, + "learning_rate": 4.2037895027109e-05, + "loss": 0.1485, + "step": 18792 + }, + { + "epoch": 0.3351942353654621, + "grad_norm": 0.2792639136314392, + "learning_rate": 4.203675593654654e-05, + "loss": 0.1653, + "step": 18793 + }, + { + "epoch": 0.33521207148717586, + "grad_norm": 0.24967066943645477, + "learning_rate": 4.203561677994374e-05, + "loss": 0.1525, + "step": 18794 + }, + { + "epoch": 0.33522990760888954, + "grad_norm": 0.2366359680891037, + "learning_rate": 4.2034477557305005e-05, + "loss": 0.1316, + "step": 18795 + }, + { + "epoch": 0.33524774373060323, + "grad_norm": 0.27418941259384155, + "learning_rate": 4.2033338268634744e-05, + "loss": 0.1704, + "step": 18796 + }, + { + "epoch": 0.3352655798523169, + "grad_norm": 0.22611431777477264, + "learning_rate": 4.203219891393739e-05, + "loss": 0.1486, + "step": 18797 + }, + { + "epoch": 0.3352834159740306, + "grad_norm": 0.23127515614032745, + "learning_rate": 4.203105949321735e-05, + "loss": 0.1639, + "step": 18798 + }, + { + "epoch": 0.3353012520957443, + "grad_norm": 0.33221226930618286, + "learning_rate": 4.202992000647904e-05, + "loss": 0.1964, + "step": 18799 + }, + { + "epoch": 0.335319088217458, + "grad_norm": 0.27608874440193176, + "learning_rate": 4.202878045372687e-05, + "loss": 0.1633, + "step": 18800 + }, + { + "epoch": 0.33533692433917167, + "grad_norm": 0.22302059829235077, + "learning_rate": 4.2027640834965276e-05, + "loss": 0.1322, + "step": 18801 + }, + { + "epoch": 0.33535476046088536, + "grad_norm": 0.28315451741218567, + "learning_rate": 4.202650115019866e-05, + "loss": 0.1399, + "step": 18802 + }, + { + "epoch": 0.3353725965825991, + "grad_norm": 0.269192636013031, + "learning_rate": 4.202536139943144e-05, + "loss": 0.183, + "step": 18803 + }, + { + "epoch": 0.3353904327043128, + "grad_norm": 0.27647823095321655, + "learning_rate": 4.2024221582668056e-05, + "loss": 0.1668, + "step": 18804 + }, + { + "epoch": 0.3354082688260265, + "grad_norm": 0.26751378178596497, + "learning_rate": 4.2023081699912895e-05, + "loss": 0.2002, + "step": 18805 + }, + { + "epoch": 0.33542610494774017, + "grad_norm": 0.23461028933525085, + "learning_rate": 4.202194175117039e-05, + "loss": 0.1267, + "step": 18806 + }, + { + "epoch": 0.33544394106945385, + "grad_norm": 0.22237063944339752, + "learning_rate": 4.202080173644496e-05, + "loss": 0.1425, + "step": 18807 + }, + { + "epoch": 0.33546177719116754, + "grad_norm": 0.21484938263893127, + "learning_rate": 4.2019661655741026e-05, + "loss": 0.1057, + "step": 18808 + }, + { + "epoch": 0.33547961331288123, + "grad_norm": 0.2188551276922226, + "learning_rate": 4.2018521509063e-05, + "loss": 0.1941, + "step": 18809 + }, + { + "epoch": 0.3354974494345949, + "grad_norm": 0.23399877548217773, + "learning_rate": 4.2017381296415314e-05, + "loss": 0.18, + "step": 18810 + }, + { + "epoch": 0.33551528555630866, + "grad_norm": 0.2748350203037262, + "learning_rate": 4.2016241017802374e-05, + "loss": 0.1398, + "step": 18811 + }, + { + "epoch": 0.33553312167802235, + "grad_norm": 0.29247432947158813, + "learning_rate": 4.2015100673228614e-05, + "loss": 0.1594, + "step": 18812 + }, + { + "epoch": 0.33555095779973604, + "grad_norm": 0.3833557367324829, + "learning_rate": 4.2013960262698444e-05, + "loss": 0.2128, + "step": 18813 + }, + { + "epoch": 0.3355687939214497, + "grad_norm": 0.4648134708404541, + "learning_rate": 4.2012819786216284e-05, + "loss": 0.1587, + "step": 18814 + }, + { + "epoch": 0.3355866300431634, + "grad_norm": 0.25184133648872375, + "learning_rate": 4.2011679243786564e-05, + "loss": 0.1823, + "step": 18815 + }, + { + "epoch": 0.3356044661648771, + "grad_norm": 0.23803114891052246, + "learning_rate": 4.2010538635413696e-05, + "loss": 0.1664, + "step": 18816 + }, + { + "epoch": 0.3356223022865908, + "grad_norm": 0.32847172021865845, + "learning_rate": 4.2009397961102105e-05, + "loss": 0.1394, + "step": 18817 + }, + { + "epoch": 0.3356401384083045, + "grad_norm": 0.33813828229904175, + "learning_rate": 4.200825722085621e-05, + "loss": 0.2061, + "step": 18818 + }, + { + "epoch": 0.33565797453001817, + "grad_norm": 0.21931812167167664, + "learning_rate": 4.200711641468044e-05, + "loss": 0.1318, + "step": 18819 + }, + { + "epoch": 0.3356758106517319, + "grad_norm": 0.287255197763443, + "learning_rate": 4.2005975542579215e-05, + "loss": 0.1652, + "step": 18820 + }, + { + "epoch": 0.3356936467734456, + "grad_norm": 0.28176724910736084, + "learning_rate": 4.200483460455695e-05, + "loss": 0.1197, + "step": 18821 + }, + { + "epoch": 0.3357114828951593, + "grad_norm": 0.22769640386104584, + "learning_rate": 4.200369360061808e-05, + "loss": 0.1906, + "step": 18822 + }, + { + "epoch": 0.335729319016873, + "grad_norm": 0.28410032391548157, + "learning_rate": 4.200255253076701e-05, + "loss": 0.1662, + "step": 18823 + }, + { + "epoch": 0.33574715513858666, + "grad_norm": 0.3849674165248871, + "learning_rate": 4.2001411395008175e-05, + "loss": 0.2569, + "step": 18824 + }, + { + "epoch": 0.33576499126030035, + "grad_norm": 0.3412947952747345, + "learning_rate": 4.2000270193346e-05, + "loss": 0.1621, + "step": 18825 + }, + { + "epoch": 0.33578282738201404, + "grad_norm": 0.24359185993671417, + "learning_rate": 4.199912892578491e-05, + "loss": 0.1939, + "step": 18826 + }, + { + "epoch": 0.3358006635037277, + "grad_norm": 0.21283644437789917, + "learning_rate": 4.1997987592329325e-05, + "loss": 0.1921, + "step": 18827 + }, + { + "epoch": 0.33581849962544147, + "grad_norm": 0.3339155316352844, + "learning_rate": 4.199684619298366e-05, + "loss": 0.1767, + "step": 18828 + }, + { + "epoch": 0.33583633574715516, + "grad_norm": 0.19576819241046906, + "learning_rate": 4.199570472775236e-05, + "loss": 0.1636, + "step": 18829 + }, + { + "epoch": 0.33585417186886884, + "grad_norm": 0.3733035624027252, + "learning_rate": 4.1994563196639835e-05, + "loss": 0.121, + "step": 18830 + }, + { + "epoch": 0.33587200799058253, + "grad_norm": 0.20671199262142181, + "learning_rate": 4.199342159965051e-05, + "loss": 0.1389, + "step": 18831 + }, + { + "epoch": 0.3358898441122962, + "grad_norm": 0.29223716259002686, + "learning_rate": 4.199227993678882e-05, + "loss": 0.1829, + "step": 18832 + }, + { + "epoch": 0.3359076802340099, + "grad_norm": 0.2457880973815918, + "learning_rate": 4.199113820805918e-05, + "loss": 0.1862, + "step": 18833 + }, + { + "epoch": 0.3359255163557236, + "grad_norm": 0.1931418925523758, + "learning_rate": 4.198999641346601e-05, + "loss": 0.1103, + "step": 18834 + }, + { + "epoch": 0.3359433524774373, + "grad_norm": 0.22421900928020477, + "learning_rate": 4.198885455301376e-05, + "loss": 0.1524, + "step": 18835 + }, + { + "epoch": 0.33596118859915103, + "grad_norm": 0.1974003165960312, + "learning_rate": 4.198771262670684e-05, + "loss": 0.1435, + "step": 18836 + }, + { + "epoch": 0.3359790247208647, + "grad_norm": 0.34305670857429504, + "learning_rate": 4.198657063454967e-05, + "loss": 0.1236, + "step": 18837 + }, + { + "epoch": 0.3359968608425784, + "grad_norm": 0.20368368923664093, + "learning_rate": 4.1985428576546694e-05, + "loss": 0.1999, + "step": 18838 + }, + { + "epoch": 0.3360146969642921, + "grad_norm": 0.2729019522666931, + "learning_rate": 4.198428645270233e-05, + "loss": 0.1967, + "step": 18839 + }, + { + "epoch": 0.3360325330860058, + "grad_norm": 0.28014278411865234, + "learning_rate": 4.1983144263021004e-05, + "loss": 0.1935, + "step": 18840 + }, + { + "epoch": 0.33605036920771947, + "grad_norm": 0.2641333043575287, + "learning_rate": 4.1982002007507135e-05, + "loss": 0.1514, + "step": 18841 + }, + { + "epoch": 0.33606820532943316, + "grad_norm": 0.40066924691200256, + "learning_rate": 4.198085968616517e-05, + "loss": 0.1941, + "step": 18842 + }, + { + "epoch": 0.33608604145114684, + "grad_norm": 0.3447968661785126, + "learning_rate": 4.1979717298999534e-05, + "loss": 0.1392, + "step": 18843 + }, + { + "epoch": 0.33610387757286053, + "grad_norm": 0.31529876589775085, + "learning_rate": 4.197857484601464e-05, + "loss": 0.1946, + "step": 18844 + }, + { + "epoch": 0.3361217136945743, + "grad_norm": 0.2976031005382538, + "learning_rate": 4.197743232721493e-05, + "loss": 0.1655, + "step": 18845 + }, + { + "epoch": 0.33613954981628796, + "grad_norm": 0.25934863090515137, + "learning_rate": 4.1976289742604827e-05, + "loss": 0.1776, + "step": 18846 + }, + { + "epoch": 0.33615738593800165, + "grad_norm": 0.2988622486591339, + "learning_rate": 4.1975147092188754e-05, + "loss": 0.199, + "step": 18847 + }, + { + "epoch": 0.33617522205971534, + "grad_norm": 0.24079221487045288, + "learning_rate": 4.197400437597115e-05, + "loss": 0.1401, + "step": 18848 + }, + { + "epoch": 0.336193058181429, + "grad_norm": 0.25547999143600464, + "learning_rate": 4.1972861593956456e-05, + "loss": 0.1551, + "step": 18849 + }, + { + "epoch": 0.3362108943031427, + "grad_norm": 0.20761536061763763, + "learning_rate": 4.197171874614908e-05, + "loss": 0.1549, + "step": 18850 + }, + { + "epoch": 0.3362287304248564, + "grad_norm": 0.29660919308662415, + "learning_rate": 4.197057583255346e-05, + "loss": 0.207, + "step": 18851 + }, + { + "epoch": 0.3362465665465701, + "grad_norm": 0.2257426679134369, + "learning_rate": 4.196943285317402e-05, + "loss": 0.1694, + "step": 18852 + }, + { + "epoch": 0.33626440266828383, + "grad_norm": 0.29529380798339844, + "learning_rate": 4.196828980801521e-05, + "loss": 0.1898, + "step": 18853 + }, + { + "epoch": 0.3362822387899975, + "grad_norm": 0.29268166422843933, + "learning_rate": 4.196714669708144e-05, + "loss": 0.168, + "step": 18854 + }, + { + "epoch": 0.3363000749117112, + "grad_norm": 0.26879534125328064, + "learning_rate": 4.196600352037715e-05, + "loss": 0.2051, + "step": 18855 + }, + { + "epoch": 0.3363179110334249, + "grad_norm": 0.23126782476902008, + "learning_rate": 4.196486027790677e-05, + "loss": 0.1575, + "step": 18856 + }, + { + "epoch": 0.3363357471551386, + "grad_norm": 0.2529090344905853, + "learning_rate": 4.1963716969674736e-05, + "loss": 0.1476, + "step": 18857 + }, + { + "epoch": 0.3363535832768523, + "grad_norm": 0.26247191429138184, + "learning_rate": 4.196257359568547e-05, + "loss": 0.1397, + "step": 18858 + }, + { + "epoch": 0.33637141939856596, + "grad_norm": 0.46113982796669006, + "learning_rate": 4.196143015594342e-05, + "loss": 0.1593, + "step": 18859 + }, + { + "epoch": 0.33638925552027965, + "grad_norm": 0.28498685359954834, + "learning_rate": 4.196028665045299e-05, + "loss": 0.2064, + "step": 18860 + }, + { + "epoch": 0.33640709164199334, + "grad_norm": 0.24048668146133423, + "learning_rate": 4.195914307921865e-05, + "loss": 0.1627, + "step": 18861 + }, + { + "epoch": 0.3364249277637071, + "grad_norm": 0.20519572496414185, + "learning_rate": 4.1957999442244803e-05, + "loss": 0.1794, + "step": 18862 + }, + { + "epoch": 0.33644276388542077, + "grad_norm": 0.2637174129486084, + "learning_rate": 4.19568557395359e-05, + "loss": 0.1464, + "step": 18863 + }, + { + "epoch": 0.33646060000713446, + "grad_norm": 0.2308375984430313, + "learning_rate": 4.1955711971096364e-05, + "loss": 0.1759, + "step": 18864 + }, + { + "epoch": 0.33647843612884815, + "grad_norm": 0.33197465538978577, + "learning_rate": 4.1954568136930634e-05, + "loss": 0.1432, + "step": 18865 + }, + { + "epoch": 0.33649627225056183, + "grad_norm": 0.26629436016082764, + "learning_rate": 4.1953424237043135e-05, + "loss": 0.1714, + "step": 18866 + }, + { + "epoch": 0.3365141083722755, + "grad_norm": 0.24002783000469208, + "learning_rate": 4.1952280271438315e-05, + "loss": 0.1623, + "step": 18867 + }, + { + "epoch": 0.3365319444939892, + "grad_norm": 0.32216477394104004, + "learning_rate": 4.1951136240120604e-05, + "loss": 0.1486, + "step": 18868 + }, + { + "epoch": 0.3365497806157029, + "grad_norm": 0.2146845906972885, + "learning_rate": 4.194999214309443e-05, + "loss": 0.18, + "step": 18869 + }, + { + "epoch": 0.33656761673741664, + "grad_norm": 0.2908448576927185, + "learning_rate": 4.194884798036423e-05, + "loss": 0.1619, + "step": 18870 + }, + { + "epoch": 0.33658545285913033, + "grad_norm": 0.19791792333126068, + "learning_rate": 4.1947703751934444e-05, + "loss": 0.1374, + "step": 18871 + }, + { + "epoch": 0.336603288980844, + "grad_norm": 0.33534741401672363, + "learning_rate": 4.19465594578095e-05, + "loss": 0.1615, + "step": 18872 + }, + { + "epoch": 0.3366211251025577, + "grad_norm": 0.5692492127418518, + "learning_rate": 4.1945415097993846e-05, + "loss": 0.1862, + "step": 18873 + }, + { + "epoch": 0.3366389612242714, + "grad_norm": 0.20844325423240662, + "learning_rate": 4.1944270672491904e-05, + "loss": 0.1615, + "step": 18874 + }, + { + "epoch": 0.3366567973459851, + "grad_norm": 0.2584037780761719, + "learning_rate": 4.194312618130812e-05, + "loss": 0.1539, + "step": 18875 + }, + { + "epoch": 0.33667463346769877, + "grad_norm": 0.26378270983695984, + "learning_rate": 4.1941981624446926e-05, + "loss": 0.1824, + "step": 18876 + }, + { + "epoch": 0.33669246958941246, + "grad_norm": 0.2567320168018341, + "learning_rate": 4.194083700191276e-05, + "loss": 0.2199, + "step": 18877 + }, + { + "epoch": 0.33671030571112615, + "grad_norm": 0.2289617657661438, + "learning_rate": 4.193969231371006e-05, + "loss": 0.1683, + "step": 18878 + }, + { + "epoch": 0.3367281418328399, + "grad_norm": 0.24370422959327698, + "learning_rate": 4.193854755984327e-05, + "loss": 0.1779, + "step": 18879 + }, + { + "epoch": 0.3367459779545536, + "grad_norm": 0.29774782061576843, + "learning_rate": 4.19374027403168e-05, + "loss": 0.1998, + "step": 18880 + }, + { + "epoch": 0.33676381407626726, + "grad_norm": 0.2203705757856369, + "learning_rate": 4.193625785513512e-05, + "loss": 0.1796, + "step": 18881 + }, + { + "epoch": 0.33678165019798095, + "grad_norm": 0.2693965435028076, + "learning_rate": 4.193511290430265e-05, + "loss": 0.1321, + "step": 18882 + }, + { + "epoch": 0.33679948631969464, + "grad_norm": 0.34647831320762634, + "learning_rate": 4.193396788782383e-05, + "loss": 0.2512, + "step": 18883 + }, + { + "epoch": 0.33681732244140833, + "grad_norm": 0.30219072103500366, + "learning_rate": 4.193282280570311e-05, + "loss": 0.1705, + "step": 18884 + }, + { + "epoch": 0.336835158563122, + "grad_norm": 0.3962326645851135, + "learning_rate": 4.1931677657944925e-05, + "loss": 0.1679, + "step": 18885 + }, + { + "epoch": 0.3368529946848357, + "grad_norm": 0.28498876094818115, + "learning_rate": 4.193053244455369e-05, + "loss": 0.1264, + "step": 18886 + }, + { + "epoch": 0.33687083080654945, + "grad_norm": 0.24789181351661682, + "learning_rate": 4.192938716553388e-05, + "loss": 0.1621, + "step": 18887 + }, + { + "epoch": 0.33688866692826314, + "grad_norm": 0.27573931217193604, + "learning_rate": 4.1928241820889914e-05, + "loss": 0.1509, + "step": 18888 + }, + { + "epoch": 0.3369065030499768, + "grad_norm": 0.27217361330986023, + "learning_rate": 4.1927096410626234e-05, + "loss": 0.2067, + "step": 18889 + }, + { + "epoch": 0.3369243391716905, + "grad_norm": 0.2628231644630432, + "learning_rate": 4.192595093474728e-05, + "loss": 0.1161, + "step": 18890 + }, + { + "epoch": 0.3369421752934042, + "grad_norm": 0.24457767605781555, + "learning_rate": 4.19248053932575e-05, + "loss": 0.1672, + "step": 18891 + }, + { + "epoch": 0.3369600114151179, + "grad_norm": 0.24799640476703644, + "learning_rate": 4.192365978616133e-05, + "loss": 0.1568, + "step": 18892 + }, + { + "epoch": 0.3369778475368316, + "grad_norm": 0.2729053497314453, + "learning_rate": 4.1922514113463196e-05, + "loss": 0.1928, + "step": 18893 + }, + { + "epoch": 0.33699568365854526, + "grad_norm": 0.2368975132703781, + "learning_rate": 4.192136837516757e-05, + "loss": 0.1425, + "step": 18894 + }, + { + "epoch": 0.337013519780259, + "grad_norm": 0.2980508804321289, + "learning_rate": 4.192022257127887e-05, + "loss": 0.1762, + "step": 18895 + }, + { + "epoch": 0.3370313559019727, + "grad_norm": 0.2621249556541443, + "learning_rate": 4.1919076701801536e-05, + "loss": 0.1236, + "step": 18896 + }, + { + "epoch": 0.3370491920236864, + "grad_norm": 0.36102551221847534, + "learning_rate": 4.191793076674002e-05, + "loss": 0.1448, + "step": 18897 + }, + { + "epoch": 0.33706702814540007, + "grad_norm": 0.23766344785690308, + "learning_rate": 4.191678476609876e-05, + "loss": 0.1656, + "step": 18898 + }, + { + "epoch": 0.33708486426711376, + "grad_norm": 0.2627926468849182, + "learning_rate": 4.191563869988221e-05, + "loss": 0.1361, + "step": 18899 + }, + { + "epoch": 0.33710270038882745, + "grad_norm": 0.3469749093055725, + "learning_rate": 4.191449256809479e-05, + "loss": 0.1371, + "step": 18900 + }, + { + "epoch": 0.33712053651054114, + "grad_norm": 0.4508765637874603, + "learning_rate": 4.191334637074096e-05, + "loss": 0.2091, + "step": 18901 + }, + { + "epoch": 0.3371383726322548, + "grad_norm": 0.33867523074150085, + "learning_rate": 4.191220010782515e-05, + "loss": 0.182, + "step": 18902 + }, + { + "epoch": 0.3371562087539685, + "grad_norm": 0.23206554353237152, + "learning_rate": 4.191105377935182e-05, + "loss": 0.1797, + "step": 18903 + }, + { + "epoch": 0.33717404487568226, + "grad_norm": 0.2141391634941101, + "learning_rate": 4.19099073853254e-05, + "loss": 0.1529, + "step": 18904 + }, + { + "epoch": 0.33719188099739594, + "grad_norm": 0.2958926856517792, + "learning_rate": 4.1908760925750346e-05, + "loss": 0.1403, + "step": 18905 + }, + { + "epoch": 0.33720971711910963, + "grad_norm": 0.30149391293525696, + "learning_rate": 4.190761440063109e-05, + "loss": 0.1474, + "step": 18906 + }, + { + "epoch": 0.3372275532408233, + "grad_norm": 0.2546897828578949, + "learning_rate": 4.190646780997208e-05, + "loss": 0.1548, + "step": 18907 + }, + { + "epoch": 0.337245389362537, + "grad_norm": 0.21468181908130646, + "learning_rate": 4.1905321153777765e-05, + "loss": 0.1744, + "step": 18908 + }, + { + "epoch": 0.3372632254842507, + "grad_norm": 0.1941651999950409, + "learning_rate": 4.190417443205258e-05, + "loss": 0.1395, + "step": 18909 + }, + { + "epoch": 0.3372810616059644, + "grad_norm": 0.2625371813774109, + "learning_rate": 4.1903027644800974e-05, + "loss": 0.1615, + "step": 18910 + }, + { + "epoch": 0.33729889772767807, + "grad_norm": 0.25621822476387024, + "learning_rate": 4.1901880792027405e-05, + "loss": 0.1241, + "step": 18911 + }, + { + "epoch": 0.3373167338493918, + "grad_norm": 0.30450040102005005, + "learning_rate": 4.1900733873736305e-05, + "loss": 0.1912, + "step": 18912 + }, + { + "epoch": 0.3373345699711055, + "grad_norm": 0.22386007010936737, + "learning_rate": 4.189958688993212e-05, + "loss": 0.1593, + "step": 18913 + }, + { + "epoch": 0.3373524060928192, + "grad_norm": 0.24250273406505585, + "learning_rate": 4.1898439840619294e-05, + "loss": 0.1317, + "step": 18914 + }, + { + "epoch": 0.3373702422145329, + "grad_norm": 0.27967870235443115, + "learning_rate": 4.189729272580229e-05, + "loss": 0.1873, + "step": 18915 + }, + { + "epoch": 0.33738807833624657, + "grad_norm": 0.2978518009185791, + "learning_rate": 4.189614554548554e-05, + "loss": 0.1945, + "step": 18916 + }, + { + "epoch": 0.33740591445796025, + "grad_norm": 0.28990477323532104, + "learning_rate": 4.189499829967349e-05, + "loss": 0.1886, + "step": 18917 + }, + { + "epoch": 0.33742375057967394, + "grad_norm": 0.27220380306243896, + "learning_rate": 4.18938509883706e-05, + "loss": 0.1688, + "step": 18918 + }, + { + "epoch": 0.33744158670138763, + "grad_norm": 0.369583398103714, + "learning_rate": 4.1892703611581296e-05, + "loss": 0.202, + "step": 18919 + }, + { + "epoch": 0.3374594228231013, + "grad_norm": 0.21575228869915009, + "learning_rate": 4.1891556169310045e-05, + "loss": 0.1455, + "step": 18920 + }, + { + "epoch": 0.33747725894481506, + "grad_norm": 0.25246548652648926, + "learning_rate": 4.189040866156129e-05, + "loss": 0.1414, + "step": 18921 + }, + { + "epoch": 0.33749509506652875, + "grad_norm": 0.2783922255039215, + "learning_rate": 4.188926108833948e-05, + "loss": 0.1981, + "step": 18922 + }, + { + "epoch": 0.33751293118824244, + "grad_norm": 0.300258070230484, + "learning_rate": 4.188811344964905e-05, + "loss": 0.179, + "step": 18923 + }, + { + "epoch": 0.3375307673099561, + "grad_norm": 0.2789432108402252, + "learning_rate": 4.1886965745494464e-05, + "loss": 0.1912, + "step": 18924 + }, + { + "epoch": 0.3375486034316698, + "grad_norm": 0.23929810523986816, + "learning_rate": 4.188581797588017e-05, + "loss": 0.1686, + "step": 18925 + }, + { + "epoch": 0.3375664395533835, + "grad_norm": 0.41566815972328186, + "learning_rate": 4.188467014081061e-05, + "loss": 0.1489, + "step": 18926 + }, + { + "epoch": 0.3375842756750972, + "grad_norm": 0.29327622056007385, + "learning_rate": 4.1883522240290243e-05, + "loss": 0.2405, + "step": 18927 + }, + { + "epoch": 0.3376021117968109, + "grad_norm": 0.260224848985672, + "learning_rate": 4.18823742743235e-05, + "loss": 0.1393, + "step": 18928 + }, + { + "epoch": 0.3376199479185246, + "grad_norm": 0.2737530767917633, + "learning_rate": 4.188122624291485e-05, + "loss": 0.1609, + "step": 18929 + }, + { + "epoch": 0.3376377840402383, + "grad_norm": 0.3170583248138428, + "learning_rate": 4.188007814606874e-05, + "loss": 0.1808, + "step": 18930 + }, + { + "epoch": 0.337655620161952, + "grad_norm": 0.3437529504299164, + "learning_rate": 4.187892998378962e-05, + "loss": 0.2453, + "step": 18931 + }, + { + "epoch": 0.3376734562836657, + "grad_norm": 0.20769667625427246, + "learning_rate": 4.1877781756081926e-05, + "loss": 0.1341, + "step": 18932 + }, + { + "epoch": 0.3376912924053794, + "grad_norm": 0.23483915627002716, + "learning_rate": 4.187663346295013e-05, + "loss": 0.1553, + "step": 18933 + }, + { + "epoch": 0.33770912852709306, + "grad_norm": 0.31459829211235046, + "learning_rate": 4.187548510439866e-05, + "loss": 0.1393, + "step": 18934 + }, + { + "epoch": 0.33772696464880675, + "grad_norm": 0.23074886202812195, + "learning_rate": 4.1874336680431994e-05, + "loss": 0.1554, + "step": 18935 + }, + { + "epoch": 0.33774480077052044, + "grad_norm": 0.33776792883872986, + "learning_rate": 4.187318819105457e-05, + "loss": 0.1477, + "step": 18936 + }, + { + "epoch": 0.3377626368922341, + "grad_norm": 0.2515294551849365, + "learning_rate": 4.1872039636270836e-05, + "loss": 0.1421, + "step": 18937 + }, + { + "epoch": 0.33778047301394787, + "grad_norm": 0.206778421998024, + "learning_rate": 4.187089101608526e-05, + "loss": 0.174, + "step": 18938 + }, + { + "epoch": 0.33779830913566156, + "grad_norm": 0.39762958884239197, + "learning_rate": 4.1869742330502266e-05, + "loss": 0.1512, + "step": 18939 + }, + { + "epoch": 0.33781614525737524, + "grad_norm": 0.2478768229484558, + "learning_rate": 4.186859357952634e-05, + "loss": 0.1413, + "step": 18940 + }, + { + "epoch": 0.33783398137908893, + "grad_norm": 0.29932278394699097, + "learning_rate": 4.1867444763161905e-05, + "loss": 0.2039, + "step": 18941 + }, + { + "epoch": 0.3378518175008026, + "grad_norm": 0.26540690660476685, + "learning_rate": 4.1866295881413434e-05, + "loss": 0.1806, + "step": 18942 + }, + { + "epoch": 0.3378696536225163, + "grad_norm": 0.20086121559143066, + "learning_rate": 4.186514693428538e-05, + "loss": 0.1507, + "step": 18943 + }, + { + "epoch": 0.33788748974423, + "grad_norm": 0.1826017051935196, + "learning_rate": 4.1863997921782185e-05, + "loss": 0.1588, + "step": 18944 + }, + { + "epoch": 0.3379053258659437, + "grad_norm": 0.22912786900997162, + "learning_rate": 4.186284884390831e-05, + "loss": 0.1602, + "step": 18945 + }, + { + "epoch": 0.3379231619876574, + "grad_norm": 0.20957501232624054, + "learning_rate": 4.186169970066821e-05, + "loss": 0.1517, + "step": 18946 + }, + { + "epoch": 0.3379409981093711, + "grad_norm": 0.2784116268157959, + "learning_rate": 4.186055049206634e-05, + "loss": 0.181, + "step": 18947 + }, + { + "epoch": 0.3379588342310848, + "grad_norm": 0.27196410298347473, + "learning_rate": 4.185940121810715e-05, + "loss": 0.1576, + "step": 18948 + }, + { + "epoch": 0.3379766703527985, + "grad_norm": 0.2507927417755127, + "learning_rate": 4.18582518787951e-05, + "loss": 0.2018, + "step": 18949 + }, + { + "epoch": 0.3379945064745122, + "grad_norm": 0.21486423909664154, + "learning_rate": 4.185710247413465e-05, + "loss": 0.1532, + "step": 18950 + }, + { + "epoch": 0.33801234259622587, + "grad_norm": 0.25737878680229187, + "learning_rate": 4.185595300413023e-05, + "loss": 0.1472, + "step": 18951 + }, + { + "epoch": 0.33803017871793956, + "grad_norm": 0.23167891800403595, + "learning_rate": 4.185480346878633e-05, + "loss": 0.1656, + "step": 18952 + }, + { + "epoch": 0.33804801483965324, + "grad_norm": 0.21269112825393677, + "learning_rate": 4.1853653868107385e-05, + "loss": 0.1753, + "step": 18953 + }, + { + "epoch": 0.338065850961367, + "grad_norm": 0.3019649386405945, + "learning_rate": 4.185250420209785e-05, + "loss": 0.1394, + "step": 18954 + }, + { + "epoch": 0.3380836870830807, + "grad_norm": 0.23444515466690063, + "learning_rate": 4.18513544707622e-05, + "loss": 0.1452, + "step": 18955 + }, + { + "epoch": 0.33810152320479436, + "grad_norm": 0.24718981981277466, + "learning_rate": 4.1850204674104875e-05, + "loss": 0.1191, + "step": 18956 + }, + { + "epoch": 0.33811935932650805, + "grad_norm": 0.2582503855228424, + "learning_rate": 4.184905481213034e-05, + "loss": 0.1952, + "step": 18957 + }, + { + "epoch": 0.33813719544822174, + "grad_norm": 0.2911236882209778, + "learning_rate": 4.184790488484304e-05, + "loss": 0.2184, + "step": 18958 + }, + { + "epoch": 0.3381550315699354, + "grad_norm": 0.28882530331611633, + "learning_rate": 4.184675489224745e-05, + "loss": 0.19, + "step": 18959 + }, + { + "epoch": 0.3381728676916491, + "grad_norm": 0.32744723558425903, + "learning_rate": 4.1845604834348015e-05, + "loss": 0.21, + "step": 18960 + }, + { + "epoch": 0.3381907038133628, + "grad_norm": 0.29121214151382446, + "learning_rate": 4.18444547111492e-05, + "loss": 0.1671, + "step": 18961 + }, + { + "epoch": 0.3382085399350765, + "grad_norm": 0.22942860424518585, + "learning_rate": 4.184330452265546e-05, + "loss": 0.1702, + "step": 18962 + }, + { + "epoch": 0.33822637605679023, + "grad_norm": 0.26729515194892883, + "learning_rate": 4.1842154268871254e-05, + "loss": 0.2024, + "step": 18963 + }, + { + "epoch": 0.3382442121785039, + "grad_norm": 0.2039777934551239, + "learning_rate": 4.184100394980104e-05, + "loss": 0.1405, + "step": 18964 + }, + { + "epoch": 0.3382620483002176, + "grad_norm": 0.26942625641822815, + "learning_rate": 4.1839853565449275e-05, + "loss": 0.178, + "step": 18965 + }, + { + "epoch": 0.3382798844219313, + "grad_norm": 0.3534669578075409, + "learning_rate": 4.183870311582043e-05, + "loss": 0.1928, + "step": 18966 + }, + { + "epoch": 0.338297720543645, + "grad_norm": 0.2680956721305847, + "learning_rate": 4.183755260091895e-05, + "loss": 0.1361, + "step": 18967 + }, + { + "epoch": 0.3383155566653587, + "grad_norm": 0.3817254900932312, + "learning_rate": 4.18364020207493e-05, + "loss": 0.2145, + "step": 18968 + }, + { + "epoch": 0.33833339278707236, + "grad_norm": 0.22404317557811737, + "learning_rate": 4.1835251375315944e-05, + "loss": 0.1644, + "step": 18969 + }, + { + "epoch": 0.33835122890878605, + "grad_norm": 0.2065775841474533, + "learning_rate": 4.183410066462333e-05, + "loss": 0.1455, + "step": 18970 + }, + { + "epoch": 0.3383690650304998, + "grad_norm": 0.22631201148033142, + "learning_rate": 4.183294988867594e-05, + "loss": 0.1675, + "step": 18971 + }, + { + "epoch": 0.3383869011522135, + "grad_norm": 0.24366283416748047, + "learning_rate": 4.1831799047478215e-05, + "loss": 0.1697, + "step": 18972 + }, + { + "epoch": 0.33840473727392717, + "grad_norm": 0.2905050814151764, + "learning_rate": 4.183064814103463e-05, + "loss": 0.1773, + "step": 18973 + }, + { + "epoch": 0.33842257339564086, + "grad_norm": 0.20870763063430786, + "learning_rate": 4.182949716934963e-05, + "loss": 0.1723, + "step": 18974 + }, + { + "epoch": 0.33844040951735455, + "grad_norm": 0.25424304604530334, + "learning_rate": 4.182834613242769e-05, + "loss": 0.1653, + "step": 18975 + }, + { + "epoch": 0.33845824563906823, + "grad_norm": 0.19270065426826477, + "learning_rate": 4.182719503027327e-05, + "loss": 0.1929, + "step": 18976 + }, + { + "epoch": 0.3384760817607819, + "grad_norm": 0.24269334971904755, + "learning_rate": 4.182604386289083e-05, + "loss": 0.1599, + "step": 18977 + }, + { + "epoch": 0.3384939178824956, + "grad_norm": 0.38319239020347595, + "learning_rate": 4.182489263028484e-05, + "loss": 0.2316, + "step": 18978 + }, + { + "epoch": 0.3385117540042093, + "grad_norm": 0.25000420212745667, + "learning_rate": 4.1823741332459744e-05, + "loss": 0.1317, + "step": 18979 + }, + { + "epoch": 0.33852959012592304, + "grad_norm": 0.23778657615184784, + "learning_rate": 4.182258996942001e-05, + "loss": 0.1782, + "step": 18980 + }, + { + "epoch": 0.33854742624763673, + "grad_norm": 0.2352931648492813, + "learning_rate": 4.182143854117012e-05, + "loss": 0.1787, + "step": 18981 + }, + { + "epoch": 0.3385652623693504, + "grad_norm": 0.25397995114326477, + "learning_rate": 4.1820287047714526e-05, + "loss": 0.1149, + "step": 18982 + }, + { + "epoch": 0.3385830984910641, + "grad_norm": 0.258090078830719, + "learning_rate": 4.1819135489057684e-05, + "loss": 0.1363, + "step": 18983 + }, + { + "epoch": 0.3386009346127778, + "grad_norm": 0.2313193827867508, + "learning_rate": 4.181798386520406e-05, + "loss": 0.2097, + "step": 18984 + }, + { + "epoch": 0.3386187707344915, + "grad_norm": 0.3382202386856079, + "learning_rate": 4.181683217615813e-05, + "loss": 0.2026, + "step": 18985 + }, + { + "epoch": 0.33863660685620517, + "grad_norm": 0.31504493951797485, + "learning_rate": 4.181568042192434e-05, + "loss": 0.2082, + "step": 18986 + }, + { + "epoch": 0.33865444297791886, + "grad_norm": 0.24363857507705688, + "learning_rate": 4.181452860250717e-05, + "loss": 0.1844, + "step": 18987 + }, + { + "epoch": 0.3386722790996326, + "grad_norm": 0.26484745740890503, + "learning_rate": 4.181337671791108e-05, + "loss": 0.1689, + "step": 18988 + }, + { + "epoch": 0.3386901152213463, + "grad_norm": 0.21356326341629028, + "learning_rate": 4.1812224768140534e-05, + "loss": 0.1451, + "step": 18989 + }, + { + "epoch": 0.33870795134306, + "grad_norm": 0.43112072348594666, + "learning_rate": 4.18110727532e-05, + "loss": 0.2201, + "step": 18990 + }, + { + "epoch": 0.33872578746477366, + "grad_norm": 0.22192348539829254, + "learning_rate": 4.180992067309394e-05, + "loss": 0.1517, + "step": 18991 + }, + { + "epoch": 0.33874362358648735, + "grad_norm": 0.2938961684703827, + "learning_rate": 4.1808768527826824e-05, + "loss": 0.1768, + "step": 18992 + }, + { + "epoch": 0.33876145970820104, + "grad_norm": 0.3255133032798767, + "learning_rate": 4.1807616317403106e-05, + "loss": 0.1877, + "step": 18993 + }, + { + "epoch": 0.33877929582991473, + "grad_norm": 0.2388078272342682, + "learning_rate": 4.1806464041827275e-05, + "loss": 0.1559, + "step": 18994 + }, + { + "epoch": 0.3387971319516284, + "grad_norm": 0.17218881845474243, + "learning_rate": 4.1805311701103774e-05, + "loss": 0.153, + "step": 18995 + }, + { + "epoch": 0.33881496807334216, + "grad_norm": 0.2763029634952545, + "learning_rate": 4.180415929523709e-05, + "loss": 0.1719, + "step": 18996 + }, + { + "epoch": 0.33883280419505585, + "grad_norm": 0.35350432991981506, + "learning_rate": 4.180300682423167e-05, + "loss": 0.1559, + "step": 18997 + }, + { + "epoch": 0.33885064031676954, + "grad_norm": 0.3385790288448334, + "learning_rate": 4.1801854288092004e-05, + "loss": 0.1068, + "step": 18998 + }, + { + "epoch": 0.3388684764384832, + "grad_norm": 0.2966400682926178, + "learning_rate": 4.1800701686822544e-05, + "loss": 0.1325, + "step": 18999 + }, + { + "epoch": 0.3388863125601969, + "grad_norm": 0.21844731271266937, + "learning_rate": 4.179954902042775e-05, + "loss": 0.1666, + "step": 19000 + }, + { + "epoch": 0.3388863125601969, + "eval_loss": 0.1632959097623825, + "eval_runtime": 107.0663, + "eval_samples_per_second": 9.564, + "eval_steps_per_second": 1.597, + "step": 19000 + }, + { + "epoch": 0.3389041486819106, + "grad_norm": 0.25229185819625854, + "learning_rate": 4.179839628891211e-05, + "loss": 0.1547, + "step": 19001 + }, + { + "epoch": 0.3389219848036243, + "grad_norm": 0.31327369809150696, + "learning_rate": 4.179724349228009e-05, + "loss": 0.1199, + "step": 19002 + }, + { + "epoch": 0.338939820925338, + "grad_norm": 0.32854384183883667, + "learning_rate": 4.179609063053615e-05, + "loss": 0.1869, + "step": 19003 + }, + { + "epoch": 0.33895765704705166, + "grad_norm": 0.3303373157978058, + "learning_rate": 4.1794937703684754e-05, + "loss": 0.1761, + "step": 19004 + }, + { + "epoch": 0.3389754931687654, + "grad_norm": 0.2542996406555176, + "learning_rate": 4.179378471173039e-05, + "loss": 0.1598, + "step": 19005 + }, + { + "epoch": 0.3389933292904791, + "grad_norm": 0.3003520965576172, + "learning_rate": 4.1792631654677506e-05, + "loss": 0.1924, + "step": 19006 + }, + { + "epoch": 0.3390111654121928, + "grad_norm": 0.27404484152793884, + "learning_rate": 4.1791478532530584e-05, + "loss": 0.1501, + "step": 19007 + }, + { + "epoch": 0.33902900153390647, + "grad_norm": 0.22072987258434296, + "learning_rate": 4.179032534529409e-05, + "loss": 0.1388, + "step": 19008 + }, + { + "epoch": 0.33904683765562016, + "grad_norm": 0.2703891396522522, + "learning_rate": 4.17891720929725e-05, + "loss": 0.1694, + "step": 19009 + }, + { + "epoch": 0.33906467377733385, + "grad_norm": 0.33079418540000916, + "learning_rate": 4.178801877557028e-05, + "loss": 0.1501, + "step": 19010 + }, + { + "epoch": 0.33908250989904754, + "grad_norm": 0.40734580159187317, + "learning_rate": 4.17868653930919e-05, + "loss": 0.2557, + "step": 19011 + }, + { + "epoch": 0.3391003460207612, + "grad_norm": 0.23386681079864502, + "learning_rate": 4.1785711945541834e-05, + "loss": 0.1536, + "step": 19012 + }, + { + "epoch": 0.33911818214247497, + "grad_norm": 0.24409335851669312, + "learning_rate": 4.178455843292455e-05, + "loss": 0.201, + "step": 19013 + }, + { + "epoch": 0.33913601826418865, + "grad_norm": 0.2245641052722931, + "learning_rate": 4.178340485524451e-05, + "loss": 0.1115, + "step": 19014 + }, + { + "epoch": 0.33915385438590234, + "grad_norm": 0.8578737378120422, + "learning_rate": 4.178225121250621e-05, + "loss": 0.1742, + "step": 19015 + }, + { + "epoch": 0.33917169050761603, + "grad_norm": 0.20405761897563934, + "learning_rate": 4.1781097504714106e-05, + "loss": 0.1538, + "step": 19016 + }, + { + "epoch": 0.3391895266293297, + "grad_norm": 0.27465230226516724, + "learning_rate": 4.177994373187266e-05, + "loss": 0.1398, + "step": 19017 + }, + { + "epoch": 0.3392073627510434, + "grad_norm": 0.24772217869758606, + "learning_rate": 4.177878989398637e-05, + "loss": 0.1208, + "step": 19018 + }, + { + "epoch": 0.3392251988727571, + "grad_norm": 0.24304911494255066, + "learning_rate": 4.1777635991059686e-05, + "loss": 0.1103, + "step": 19019 + }, + { + "epoch": 0.3392430349944708, + "grad_norm": 0.3815910518169403, + "learning_rate": 4.17764820230971e-05, + "loss": 0.1725, + "step": 19020 + }, + { + "epoch": 0.33926087111618447, + "grad_norm": 0.27894261479377747, + "learning_rate": 4.177532799010307e-05, + "loss": 0.1723, + "step": 19021 + }, + { + "epoch": 0.3392787072378982, + "grad_norm": 0.25985851883888245, + "learning_rate": 4.1774173892082066e-05, + "loss": 0.1507, + "step": 19022 + }, + { + "epoch": 0.3392965433596119, + "grad_norm": 0.2939731180667877, + "learning_rate": 4.177301972903858e-05, + "loss": 0.1664, + "step": 19023 + }, + { + "epoch": 0.3393143794813256, + "grad_norm": 0.22582286596298218, + "learning_rate": 4.1771865500977084e-05, + "loss": 0.2168, + "step": 19024 + }, + { + "epoch": 0.3393322156030393, + "grad_norm": 0.2266026735305786, + "learning_rate": 4.1770711207902034e-05, + "loss": 0.1583, + "step": 19025 + }, + { + "epoch": 0.33935005172475297, + "grad_norm": 0.33099600672721863, + "learning_rate": 4.176955684981792e-05, + "loss": 0.2193, + "step": 19026 + }, + { + "epoch": 0.33936788784646665, + "grad_norm": 0.22976182401180267, + "learning_rate": 4.1768402426729205e-05, + "loss": 0.1169, + "step": 19027 + }, + { + "epoch": 0.33938572396818034, + "grad_norm": 0.2766157388687134, + "learning_rate": 4.176724793864037e-05, + "loss": 0.2064, + "step": 19028 + }, + { + "epoch": 0.33940356008989403, + "grad_norm": 0.22305607795715332, + "learning_rate": 4.176609338555589e-05, + "loss": 0.1565, + "step": 19029 + }, + { + "epoch": 0.3394213962116078, + "grad_norm": 0.2595331370830536, + "learning_rate": 4.176493876748025e-05, + "loss": 0.1679, + "step": 19030 + }, + { + "epoch": 0.33943923233332146, + "grad_norm": 0.29117804765701294, + "learning_rate": 4.176378408441791e-05, + "loss": 0.1959, + "step": 19031 + }, + { + "epoch": 0.33945706845503515, + "grad_norm": 0.31189867854118347, + "learning_rate": 4.1762629336373356e-05, + "loss": 0.1683, + "step": 19032 + }, + { + "epoch": 0.33947490457674884, + "grad_norm": 0.293280690908432, + "learning_rate": 4.176147452335106e-05, + "loss": 0.1445, + "step": 19033 + }, + { + "epoch": 0.3394927406984625, + "grad_norm": 0.29017212986946106, + "learning_rate": 4.17603196453555e-05, + "loss": 0.1898, + "step": 19034 + }, + { + "epoch": 0.3395105768201762, + "grad_norm": 0.2960911989212036, + "learning_rate": 4.1759164702391155e-05, + "loss": 0.1466, + "step": 19035 + }, + { + "epoch": 0.3395284129418899, + "grad_norm": 0.28645211458206177, + "learning_rate": 4.17580096944625e-05, + "loss": 0.1503, + "step": 19036 + }, + { + "epoch": 0.3395462490636036, + "grad_norm": 0.2455846220254898, + "learning_rate": 4.175685462157401e-05, + "loss": 0.1521, + "step": 19037 + }, + { + "epoch": 0.3395640851853173, + "grad_norm": 0.285341739654541, + "learning_rate": 4.1755699483730157e-05, + "loss": 0.1525, + "step": 19038 + }, + { + "epoch": 0.339581921307031, + "grad_norm": 0.25104281306266785, + "learning_rate": 4.1754544280935426e-05, + "loss": 0.1301, + "step": 19039 + }, + { + "epoch": 0.3395997574287447, + "grad_norm": 0.3304446041584015, + "learning_rate": 4.17533890131943e-05, + "loss": 0.1836, + "step": 19040 + }, + { + "epoch": 0.3396175935504584, + "grad_norm": 0.34443068504333496, + "learning_rate": 4.175223368051126e-05, + "loss": 0.1941, + "step": 19041 + }, + { + "epoch": 0.3396354296721721, + "grad_norm": 0.3427804410457611, + "learning_rate": 4.1751078282890757e-05, + "loss": 0.1338, + "step": 19042 + }, + { + "epoch": 0.3396532657938858, + "grad_norm": 0.2696840763092041, + "learning_rate": 4.174992282033729e-05, + "loss": 0.1887, + "step": 19043 + }, + { + "epoch": 0.33967110191559946, + "grad_norm": 0.24931056797504425, + "learning_rate": 4.174876729285536e-05, + "loss": 0.126, + "step": 19044 + }, + { + "epoch": 0.33968893803731315, + "grad_norm": 0.28129658102989197, + "learning_rate": 4.1747611700449406e-05, + "loss": 0.1261, + "step": 19045 + }, + { + "epoch": 0.33970677415902684, + "grad_norm": 0.23587551712989807, + "learning_rate": 4.174645604312393e-05, + "loss": 0.1395, + "step": 19046 + }, + { + "epoch": 0.3397246102807406, + "grad_norm": 0.3077877461910248, + "learning_rate": 4.17453003208834e-05, + "loss": 0.1046, + "step": 19047 + }, + { + "epoch": 0.33974244640245427, + "grad_norm": 0.26231181621551514, + "learning_rate": 4.174414453373231e-05, + "loss": 0.1976, + "step": 19048 + }, + { + "epoch": 0.33976028252416796, + "grad_norm": 0.2857707142829895, + "learning_rate": 4.174298868167512e-05, + "loss": 0.1199, + "step": 19049 + }, + { + "epoch": 0.33977811864588164, + "grad_norm": 0.3958567678928375, + "learning_rate": 4.1741832764716335e-05, + "loss": 0.2751, + "step": 19050 + }, + { + "epoch": 0.33979595476759533, + "grad_norm": 0.2019142359495163, + "learning_rate": 4.174067678286042e-05, + "loss": 0.1544, + "step": 19051 + }, + { + "epoch": 0.339813790889309, + "grad_norm": 0.2342025339603424, + "learning_rate": 4.173952073611186e-05, + "loss": 0.1301, + "step": 19052 + }, + { + "epoch": 0.3398316270110227, + "grad_norm": 0.3273864686489105, + "learning_rate": 4.173836462447514e-05, + "loss": 0.1369, + "step": 19053 + }, + { + "epoch": 0.3398494631327364, + "grad_norm": 0.2864091992378235, + "learning_rate": 4.173720844795473e-05, + "loss": 0.1333, + "step": 19054 + }, + { + "epoch": 0.33986729925445014, + "grad_norm": 0.3018883466720581, + "learning_rate": 4.173605220655512e-05, + "loss": 0.132, + "step": 19055 + }, + { + "epoch": 0.3398851353761638, + "grad_norm": 0.2493281364440918, + "learning_rate": 4.17348959002808e-05, + "loss": 0.1469, + "step": 19056 + }, + { + "epoch": 0.3399029714978775, + "grad_norm": 0.3162676990032196, + "learning_rate": 4.1733739529136234e-05, + "loss": 0.2015, + "step": 19057 + }, + { + "epoch": 0.3399208076195912, + "grad_norm": 0.3037779629230499, + "learning_rate": 4.1732583093125914e-05, + "loss": 0.1714, + "step": 19058 + }, + { + "epoch": 0.3399386437413049, + "grad_norm": 0.34416380524635315, + "learning_rate": 4.173142659225433e-05, + "loss": 0.1774, + "step": 19059 + }, + { + "epoch": 0.3399564798630186, + "grad_norm": 0.2441510409116745, + "learning_rate": 4.1730270026525955e-05, + "loss": 0.165, + "step": 19060 + }, + { + "epoch": 0.33997431598473227, + "grad_norm": 0.3016694188117981, + "learning_rate": 4.1729113395945276e-05, + "loss": 0.1479, + "step": 19061 + }, + { + "epoch": 0.33999215210644596, + "grad_norm": 0.32537052035331726, + "learning_rate": 4.172795670051677e-05, + "loss": 0.185, + "step": 19062 + }, + { + "epoch": 0.34000998822815964, + "grad_norm": 0.18432371318340302, + "learning_rate": 4.1726799940244924e-05, + "loss": 0.122, + "step": 19063 + }, + { + "epoch": 0.3400278243498734, + "grad_norm": 0.2336127758026123, + "learning_rate": 4.172564311513423e-05, + "loss": 0.1561, + "step": 19064 + }, + { + "epoch": 0.3400456604715871, + "grad_norm": 0.23246034979820251, + "learning_rate": 4.172448622518917e-05, + "loss": 0.1427, + "step": 19065 + }, + { + "epoch": 0.34006349659330076, + "grad_norm": 0.3123519718647003, + "learning_rate": 4.172332927041422e-05, + "loss": 0.1826, + "step": 19066 + }, + { + "epoch": 0.34008133271501445, + "grad_norm": 0.30995211005210876, + "learning_rate": 4.172217225081387e-05, + "loss": 0.1823, + "step": 19067 + }, + { + "epoch": 0.34009916883672814, + "grad_norm": 0.3009769022464752, + "learning_rate": 4.1721015166392606e-05, + "loss": 0.1199, + "step": 19068 + }, + { + "epoch": 0.3401170049584418, + "grad_norm": 0.3721808195114136, + "learning_rate": 4.171985801715491e-05, + "loss": 0.169, + "step": 19069 + }, + { + "epoch": 0.3401348410801555, + "grad_norm": 0.26624995470046997, + "learning_rate": 4.171870080310527e-05, + "loss": 0.1986, + "step": 19070 + }, + { + "epoch": 0.3401526772018692, + "grad_norm": 0.24667924642562866, + "learning_rate": 4.171754352424817e-05, + "loss": 0.1645, + "step": 19071 + }, + { + "epoch": 0.34017051332358295, + "grad_norm": 0.2952944040298462, + "learning_rate": 4.17163861805881e-05, + "loss": 0.1941, + "step": 19072 + }, + { + "epoch": 0.34018834944529663, + "grad_norm": 0.2621082067489624, + "learning_rate": 4.1715228772129546e-05, + "loss": 0.1345, + "step": 19073 + }, + { + "epoch": 0.3402061855670103, + "grad_norm": 0.3187590539455414, + "learning_rate": 4.1714071298876987e-05, + "loss": 0.1682, + "step": 19074 + }, + { + "epoch": 0.340224021688724, + "grad_norm": 0.1794048696756363, + "learning_rate": 4.171291376083492e-05, + "loss": 0.138, + "step": 19075 + }, + { + "epoch": 0.3402418578104377, + "grad_norm": 0.24293185770511627, + "learning_rate": 4.171175615800782e-05, + "loss": 0.2048, + "step": 19076 + }, + { + "epoch": 0.3402596939321514, + "grad_norm": 0.28766682744026184, + "learning_rate": 4.1710598490400175e-05, + "loss": 0.143, + "step": 19077 + }, + { + "epoch": 0.3402775300538651, + "grad_norm": 0.2681846618652344, + "learning_rate": 4.170944075801649e-05, + "loss": 0.1567, + "step": 19078 + }, + { + "epoch": 0.34029536617557876, + "grad_norm": 0.27746152877807617, + "learning_rate": 4.1708282960861245e-05, + "loss": 0.1827, + "step": 19079 + }, + { + "epoch": 0.34031320229729245, + "grad_norm": 0.30340448021888733, + "learning_rate": 4.170712509893892e-05, + "loss": 0.1978, + "step": 19080 + }, + { + "epoch": 0.3403310384190062, + "grad_norm": 0.24910910427570343, + "learning_rate": 4.170596717225401e-05, + "loss": 0.1615, + "step": 19081 + }, + { + "epoch": 0.3403488745407199, + "grad_norm": 0.32467159628868103, + "learning_rate": 4.1704809180810986e-05, + "loss": 0.1925, + "step": 19082 + }, + { + "epoch": 0.34036671066243357, + "grad_norm": 0.28001829981803894, + "learning_rate": 4.1703651124614374e-05, + "loss": 0.1825, + "step": 19083 + }, + { + "epoch": 0.34038454678414726, + "grad_norm": 0.28363335132598877, + "learning_rate": 4.1702493003668625e-05, + "loss": 0.0969, + "step": 19084 + }, + { + "epoch": 0.34040238290586095, + "grad_norm": 0.2827285826206207, + "learning_rate": 4.170133481797825e-05, + "loss": 0.1875, + "step": 19085 + }, + { + "epoch": 0.34042021902757463, + "grad_norm": 0.2151196002960205, + "learning_rate": 4.170017656754773e-05, + "loss": 0.1284, + "step": 19086 + }, + { + "epoch": 0.3404380551492883, + "grad_norm": 0.3551812171936035, + "learning_rate": 4.169901825238156e-05, + "loss": 0.1745, + "step": 19087 + }, + { + "epoch": 0.340455891271002, + "grad_norm": 0.19007790088653564, + "learning_rate": 4.169785987248423e-05, + "loss": 0.1384, + "step": 19088 + }, + { + "epoch": 0.34047372739271575, + "grad_norm": 0.2538352906703949, + "learning_rate": 4.169670142786023e-05, + "loss": 0.1586, + "step": 19089 + }, + { + "epoch": 0.34049156351442944, + "grad_norm": 0.28654947876930237, + "learning_rate": 4.169554291851404e-05, + "loss": 0.1656, + "step": 19090 + }, + { + "epoch": 0.34050939963614313, + "grad_norm": 0.185479074716568, + "learning_rate": 4.1694384344450164e-05, + "loss": 0.1138, + "step": 19091 + }, + { + "epoch": 0.3405272357578568, + "grad_norm": 0.25111615657806396, + "learning_rate": 4.169322570567309e-05, + "loss": 0.1756, + "step": 19092 + }, + { + "epoch": 0.3405450718795705, + "grad_norm": 0.26057636737823486, + "learning_rate": 4.16920670021873e-05, + "loss": 0.2197, + "step": 19093 + }, + { + "epoch": 0.3405629080012842, + "grad_norm": 0.3298758566379547, + "learning_rate": 4.16909082339973e-05, + "loss": 0.159, + "step": 19094 + }, + { + "epoch": 0.3405807441229979, + "grad_norm": 0.31102806329727173, + "learning_rate": 4.168974940110757e-05, + "loss": 0.2075, + "step": 19095 + }, + { + "epoch": 0.34059858024471157, + "grad_norm": 0.20528429746627808, + "learning_rate": 4.168859050352261e-05, + "loss": 0.1812, + "step": 19096 + }, + { + "epoch": 0.3406164163664253, + "grad_norm": 0.2151535302400589, + "learning_rate": 4.168743154124691e-05, + "loss": 0.1337, + "step": 19097 + }, + { + "epoch": 0.340634252488139, + "grad_norm": 0.22697141766548157, + "learning_rate": 4.168627251428496e-05, + "loss": 0.1541, + "step": 19098 + }, + { + "epoch": 0.3406520886098527, + "grad_norm": 0.29048141837120056, + "learning_rate": 4.168511342264125e-05, + "loss": 0.1924, + "step": 19099 + }, + { + "epoch": 0.3406699247315664, + "grad_norm": 0.3127744793891907, + "learning_rate": 4.168395426632029e-05, + "loss": 0.1863, + "step": 19100 + }, + { + "epoch": 0.34068776085328006, + "grad_norm": 0.24152883887290955, + "learning_rate": 4.168279504532655e-05, + "loss": 0.1631, + "step": 19101 + }, + { + "epoch": 0.34070559697499375, + "grad_norm": 0.2404649704694748, + "learning_rate": 4.168163575966454e-05, + "loss": 0.1684, + "step": 19102 + }, + { + "epoch": 0.34072343309670744, + "grad_norm": 0.31002750992774963, + "learning_rate": 4.1680476409338744e-05, + "loss": 0.1714, + "step": 19103 + }, + { + "epoch": 0.34074126921842113, + "grad_norm": 0.2534800171852112, + "learning_rate": 4.1679316994353664e-05, + "loss": 0.177, + "step": 19104 + }, + { + "epoch": 0.3407591053401348, + "grad_norm": 0.275722473859787, + "learning_rate": 4.1678157514713786e-05, + "loss": 0.1377, + "step": 19105 + }, + { + "epoch": 0.34077694146184856, + "grad_norm": 0.21818099915981293, + "learning_rate": 4.167699797042362e-05, + "loss": 0.1374, + "step": 19106 + }, + { + "epoch": 0.34079477758356225, + "grad_norm": 0.24218273162841797, + "learning_rate": 4.1675838361487637e-05, + "loss": 0.1574, + "step": 19107 + }, + { + "epoch": 0.34081261370527594, + "grad_norm": 0.21971909701824188, + "learning_rate": 4.167467868791034e-05, + "loss": 0.1436, + "step": 19108 + }, + { + "epoch": 0.3408304498269896, + "grad_norm": 0.3221576511859894, + "learning_rate": 4.167351894969624e-05, + "loss": 0.1952, + "step": 19109 + }, + { + "epoch": 0.3408482859487033, + "grad_norm": 0.34070682525634766, + "learning_rate": 4.167235914684982e-05, + "loss": 0.2105, + "step": 19110 + }, + { + "epoch": 0.340866122070417, + "grad_norm": 0.21998755633831024, + "learning_rate": 4.167119927937558e-05, + "loss": 0.1448, + "step": 19111 + }, + { + "epoch": 0.3408839581921307, + "grad_norm": 0.34537848830223083, + "learning_rate": 4.167003934727801e-05, + "loss": 0.2052, + "step": 19112 + }, + { + "epoch": 0.3409017943138444, + "grad_norm": 0.33918866515159607, + "learning_rate": 4.166887935056162e-05, + "loss": 0.1506, + "step": 19113 + }, + { + "epoch": 0.3409196304355581, + "grad_norm": 0.23078452050685883, + "learning_rate": 4.166771928923088e-05, + "loss": 0.1933, + "step": 19114 + }, + { + "epoch": 0.3409374665572718, + "grad_norm": 0.31279078125953674, + "learning_rate": 4.1666559163290307e-05, + "loss": 0.1446, + "step": 19115 + }, + { + "epoch": 0.3409553026789855, + "grad_norm": 0.2269740104675293, + "learning_rate": 4.16653989727444e-05, + "loss": 0.1582, + "step": 19116 + }, + { + "epoch": 0.3409731388006992, + "grad_norm": 0.3424963355064392, + "learning_rate": 4.166423871759765e-05, + "loss": 0.1955, + "step": 19117 + }, + { + "epoch": 0.34099097492241287, + "grad_norm": 0.3185013234615326, + "learning_rate": 4.166307839785456e-05, + "loss": 0.1999, + "step": 19118 + }, + { + "epoch": 0.34100881104412656, + "grad_norm": 0.2266213446855545, + "learning_rate": 4.1661918013519606e-05, + "loss": 0.1444, + "step": 19119 + }, + { + "epoch": 0.34102664716584025, + "grad_norm": 0.2339271605014801, + "learning_rate": 4.166075756459732e-05, + "loss": 0.1623, + "step": 19120 + }, + { + "epoch": 0.34104448328755393, + "grad_norm": 0.33711886405944824, + "learning_rate": 4.165959705109217e-05, + "loss": 0.182, + "step": 19121 + }, + { + "epoch": 0.3410623194092676, + "grad_norm": 0.1971094012260437, + "learning_rate": 4.1658436473008676e-05, + "loss": 0.1492, + "step": 19122 + }, + { + "epoch": 0.34108015553098137, + "grad_norm": 0.21723458170890808, + "learning_rate": 4.165727583035133e-05, + "loss": 0.1673, + "step": 19123 + }, + { + "epoch": 0.34109799165269505, + "grad_norm": 0.37158021330833435, + "learning_rate": 4.165611512312463e-05, + "loss": 0.1622, + "step": 19124 + }, + { + "epoch": 0.34111582777440874, + "grad_norm": 0.2687208950519562, + "learning_rate": 4.1654954351333065e-05, + "loss": 0.1682, + "step": 19125 + }, + { + "epoch": 0.34113366389612243, + "grad_norm": 0.27835360169410706, + "learning_rate": 4.1653793514981156e-05, + "loss": 0.1718, + "step": 19126 + }, + { + "epoch": 0.3411515000178361, + "grad_norm": 0.2776113748550415, + "learning_rate": 4.1652632614073383e-05, + "loss": 0.2274, + "step": 19127 + }, + { + "epoch": 0.3411693361395498, + "grad_norm": 0.23038287460803986, + "learning_rate": 4.165147164861426e-05, + "loss": 0.17, + "step": 19128 + }, + { + "epoch": 0.3411871722612635, + "grad_norm": 0.2833652198314667, + "learning_rate": 4.165031061860827e-05, + "loss": 0.1785, + "step": 19129 + }, + { + "epoch": 0.3412050083829772, + "grad_norm": 0.2975352108478546, + "learning_rate": 4.1649149524059936e-05, + "loss": 0.1668, + "step": 19130 + }, + { + "epoch": 0.3412228445046909, + "grad_norm": 0.3174719214439392, + "learning_rate": 4.1647988364973756e-05, + "loss": 0.1964, + "step": 19131 + }, + { + "epoch": 0.3412406806264046, + "grad_norm": 0.211166113615036, + "learning_rate": 4.16468271413542e-05, + "loss": 0.1154, + "step": 19132 + }, + { + "epoch": 0.3412585167481183, + "grad_norm": 0.2222771942615509, + "learning_rate": 4.1645665853205816e-05, + "loss": 0.1886, + "step": 19133 + }, + { + "epoch": 0.341276352869832, + "grad_norm": 0.21896757185459137, + "learning_rate": 4.164450450053307e-05, + "loss": 0.1477, + "step": 19134 + }, + { + "epoch": 0.3412941889915457, + "grad_norm": 0.2650519013404846, + "learning_rate": 4.164334308334048e-05, + "loss": 0.1594, + "step": 19135 + }, + { + "epoch": 0.34131202511325937, + "grad_norm": 0.2698211073875427, + "learning_rate": 4.1642181601632534e-05, + "loss": 0.1415, + "step": 19136 + }, + { + "epoch": 0.34132986123497305, + "grad_norm": 0.28691112995147705, + "learning_rate": 4.164102005541376e-05, + "loss": 0.1493, + "step": 19137 + }, + { + "epoch": 0.34134769735668674, + "grad_norm": 0.37525513768196106, + "learning_rate": 4.163985844468863e-05, + "loss": 0.2048, + "step": 19138 + }, + { + "epoch": 0.34136553347840043, + "grad_norm": 0.18402189016342163, + "learning_rate": 4.163869676946167e-05, + "loss": 0.1517, + "step": 19139 + }, + { + "epoch": 0.3413833696001142, + "grad_norm": 0.2589471638202667, + "learning_rate": 4.1637535029737367e-05, + "loss": 0.2165, + "step": 19140 + }, + { + "epoch": 0.34140120572182786, + "grad_norm": 0.3775065839290619, + "learning_rate": 4.1636373225520245e-05, + "loss": 0.2236, + "step": 19141 + }, + { + "epoch": 0.34141904184354155, + "grad_norm": 0.32576048374176025, + "learning_rate": 4.163521135681478e-05, + "loss": 0.2056, + "step": 19142 + }, + { + "epoch": 0.34143687796525524, + "grad_norm": 0.5183864235877991, + "learning_rate": 4.16340494236255e-05, + "loss": 0.2, + "step": 19143 + }, + { + "epoch": 0.3414547140869689, + "grad_norm": 0.23035433888435364, + "learning_rate": 4.1632887425956894e-05, + "loss": 0.1526, + "step": 19144 + }, + { + "epoch": 0.3414725502086826, + "grad_norm": 0.34884533286094666, + "learning_rate": 4.163172536381347e-05, + "loss": 0.1531, + "step": 19145 + }, + { + "epoch": 0.3414903863303963, + "grad_norm": 0.24414996802806854, + "learning_rate": 4.163056323719974e-05, + "loss": 0.1502, + "step": 19146 + }, + { + "epoch": 0.34150822245211, + "grad_norm": 0.2550983130931854, + "learning_rate": 4.16294010461202e-05, + "loss": 0.189, + "step": 19147 + }, + { + "epoch": 0.34152605857382373, + "grad_norm": 0.22872644662857056, + "learning_rate": 4.162823879057935e-05, + "loss": 0.177, + "step": 19148 + }, + { + "epoch": 0.3415438946955374, + "grad_norm": 0.2678053379058838, + "learning_rate": 4.162707647058172e-05, + "loss": 0.1767, + "step": 19149 + }, + { + "epoch": 0.3415617308172511, + "grad_norm": 0.26402002573013306, + "learning_rate": 4.162591408613179e-05, + "loss": 0.1402, + "step": 19150 + }, + { + "epoch": 0.3415795669389648, + "grad_norm": 0.34151071310043335, + "learning_rate": 4.1624751637234075e-05, + "loss": 0.1846, + "step": 19151 + }, + { + "epoch": 0.3415974030606785, + "grad_norm": 0.3045713007450104, + "learning_rate": 4.162358912389308e-05, + "loss": 0.178, + "step": 19152 + }, + { + "epoch": 0.34161523918239217, + "grad_norm": 0.3080938756465912, + "learning_rate": 4.1622426546113306e-05, + "loss": 0.1307, + "step": 19153 + }, + { + "epoch": 0.34163307530410586, + "grad_norm": 0.34643495082855225, + "learning_rate": 4.1621263903899276e-05, + "loss": 0.1692, + "step": 19154 + }, + { + "epoch": 0.34165091142581955, + "grad_norm": 0.2636805772781372, + "learning_rate": 4.162010119725548e-05, + "loss": 0.1668, + "step": 19155 + }, + { + "epoch": 0.3416687475475333, + "grad_norm": 0.40771761536598206, + "learning_rate": 4.161893842618643e-05, + "loss": 0.203, + "step": 19156 + }, + { + "epoch": 0.341686583669247, + "grad_norm": 0.346594899892807, + "learning_rate": 4.1617775590696645e-05, + "loss": 0.2061, + "step": 19157 + }, + { + "epoch": 0.34170441979096067, + "grad_norm": 0.26792019605636597, + "learning_rate": 4.1616612690790615e-05, + "loss": 0.1563, + "step": 19158 + }, + { + "epoch": 0.34172225591267436, + "grad_norm": 0.2884537875652313, + "learning_rate": 4.161544972647285e-05, + "loss": 0.1703, + "step": 19159 + }, + { + "epoch": 0.34174009203438804, + "grad_norm": 0.2372352033853531, + "learning_rate": 4.161428669774787e-05, + "loss": 0.1776, + "step": 19160 + }, + { + "epoch": 0.34175792815610173, + "grad_norm": 0.26953575015068054, + "learning_rate": 4.1613123604620174e-05, + "loss": 0.1499, + "step": 19161 + }, + { + "epoch": 0.3417757642778154, + "grad_norm": 0.3464611768722534, + "learning_rate": 4.161196044709428e-05, + "loss": 0.1914, + "step": 19162 + }, + { + "epoch": 0.3417936003995291, + "grad_norm": 0.31093481183052063, + "learning_rate": 4.161079722517468e-05, + "loss": 0.14, + "step": 19163 + }, + { + "epoch": 0.3418114365212428, + "grad_norm": 0.24490484595298767, + "learning_rate": 4.160963393886589e-05, + "loss": 0.2012, + "step": 19164 + }, + { + "epoch": 0.34182927264295654, + "grad_norm": 0.36950263381004333, + "learning_rate": 4.160847058817243e-05, + "loss": 0.2236, + "step": 19165 + }, + { + "epoch": 0.3418471087646702, + "grad_norm": 0.39886242151260376, + "learning_rate": 4.1607307173098796e-05, + "loss": 0.1388, + "step": 19166 + }, + { + "epoch": 0.3418649448863839, + "grad_norm": 0.25907090306282043, + "learning_rate": 4.1606143693649516e-05, + "loss": 0.1912, + "step": 19167 + }, + { + "epoch": 0.3418827810080976, + "grad_norm": 0.2528982162475586, + "learning_rate": 4.1604980149829074e-05, + "loss": 0.2227, + "step": 19168 + }, + { + "epoch": 0.3419006171298113, + "grad_norm": 0.18718492984771729, + "learning_rate": 4.1603816541642e-05, + "loss": 0.124, + "step": 19169 + }, + { + "epoch": 0.341918453251525, + "grad_norm": 0.2484995424747467, + "learning_rate": 4.16026528690928e-05, + "loss": 0.2193, + "step": 19170 + }, + { + "epoch": 0.34193628937323867, + "grad_norm": 0.36052754521369934, + "learning_rate": 4.160148913218598e-05, + "loss": 0.2217, + "step": 19171 + }, + { + "epoch": 0.34195412549495235, + "grad_norm": 0.2591293156147003, + "learning_rate": 4.1600325330926046e-05, + "loss": 0.2093, + "step": 19172 + }, + { + "epoch": 0.3419719616166661, + "grad_norm": 0.23855255544185638, + "learning_rate": 4.159916146531753e-05, + "loss": 0.1235, + "step": 19173 + }, + { + "epoch": 0.3419897977383798, + "grad_norm": 0.2903277277946472, + "learning_rate": 4.159799753536493e-05, + "loss": 0.1701, + "step": 19174 + }, + { + "epoch": 0.3420076338600935, + "grad_norm": 0.23712298274040222, + "learning_rate": 4.159683354107275e-05, + "loss": 0.1645, + "step": 19175 + }, + { + "epoch": 0.34202546998180716, + "grad_norm": 0.27789393067359924, + "learning_rate": 4.159566948244552e-05, + "loss": 0.1375, + "step": 19176 + }, + { + "epoch": 0.34204330610352085, + "grad_norm": 0.3419671356678009, + "learning_rate": 4.159450535948773e-05, + "loss": 0.1699, + "step": 19177 + }, + { + "epoch": 0.34206114222523454, + "grad_norm": 0.25967729091644287, + "learning_rate": 4.1593341172203925e-05, + "loss": 0.1893, + "step": 19178 + }, + { + "epoch": 0.3420789783469482, + "grad_norm": 0.2332470715045929, + "learning_rate": 4.159217692059858e-05, + "loss": 0.1734, + "step": 19179 + }, + { + "epoch": 0.3420968144686619, + "grad_norm": 0.26110801100730896, + "learning_rate": 4.1591012604676236e-05, + "loss": 0.1873, + "step": 19180 + }, + { + "epoch": 0.3421146505903756, + "grad_norm": 0.26965662837028503, + "learning_rate": 4.1589848224441394e-05, + "loss": 0.1876, + "step": 19181 + }, + { + "epoch": 0.34213248671208935, + "grad_norm": 0.29505789279937744, + "learning_rate": 4.158868377989858e-05, + "loss": 0.1499, + "step": 19182 + }, + { + "epoch": 0.34215032283380303, + "grad_norm": 0.2164822220802307, + "learning_rate": 4.158751927105229e-05, + "loss": 0.1464, + "step": 19183 + }, + { + "epoch": 0.3421681589555167, + "grad_norm": 0.25503554940223694, + "learning_rate": 4.1586354697907043e-05, + "loss": 0.1345, + "step": 19184 + }, + { + "epoch": 0.3421859950772304, + "grad_norm": 0.21500547230243683, + "learning_rate": 4.158519006046736e-05, + "loss": 0.2017, + "step": 19185 + }, + { + "epoch": 0.3422038311989441, + "grad_norm": 0.26423928141593933, + "learning_rate": 4.1584025358737754e-05, + "loss": 0.1562, + "step": 19186 + }, + { + "epoch": 0.3422216673206578, + "grad_norm": 0.25571408867836, + "learning_rate": 4.158286059272273e-05, + "loss": 0.1515, + "step": 19187 + }, + { + "epoch": 0.3422395034423715, + "grad_norm": 0.2532646358013153, + "learning_rate": 4.158169576242682e-05, + "loss": 0.1189, + "step": 19188 + }, + { + "epoch": 0.34225733956408516, + "grad_norm": 0.2708805799484253, + "learning_rate": 4.158053086785453e-05, + "loss": 0.211, + "step": 19189 + }, + { + "epoch": 0.3422751756857989, + "grad_norm": 0.4668770134449005, + "learning_rate": 4.157936590901036e-05, + "loss": 0.1475, + "step": 19190 + }, + { + "epoch": 0.3422930118075126, + "grad_norm": 0.23428738117218018, + "learning_rate": 4.157820088589886e-05, + "loss": 0.1508, + "step": 19191 + }, + { + "epoch": 0.3423108479292263, + "grad_norm": 0.26442089676856995, + "learning_rate": 4.157703579852452e-05, + "loss": 0.1223, + "step": 19192 + }, + { + "epoch": 0.34232868405093997, + "grad_norm": 0.35886138677597046, + "learning_rate": 4.157587064689187e-05, + "loss": 0.1032, + "step": 19193 + }, + { + "epoch": 0.34234652017265366, + "grad_norm": 0.26686227321624756, + "learning_rate": 4.157470543100541e-05, + "loss": 0.1678, + "step": 19194 + }, + { + "epoch": 0.34236435629436734, + "grad_norm": 0.21827208995819092, + "learning_rate": 4.1573540150869674e-05, + "loss": 0.1502, + "step": 19195 + }, + { + "epoch": 0.34238219241608103, + "grad_norm": 0.22195547819137573, + "learning_rate": 4.1572374806489164e-05, + "loss": 0.1194, + "step": 19196 + }, + { + "epoch": 0.3424000285377947, + "grad_norm": 0.2899116575717926, + "learning_rate": 4.1571209397868415e-05, + "loss": 0.1814, + "step": 19197 + }, + { + "epoch": 0.34241786465950846, + "grad_norm": 0.31989017128944397, + "learning_rate": 4.157004392501193e-05, + "loss": 0.1658, + "step": 19198 + }, + { + "epoch": 0.34243570078122215, + "grad_norm": 0.284021258354187, + "learning_rate": 4.156887838792423e-05, + "loss": 0.2061, + "step": 19199 + }, + { + "epoch": 0.34245353690293584, + "grad_norm": 0.3226366639137268, + "learning_rate": 4.156771278660984e-05, + "loss": 0.1928, + "step": 19200 + }, + { + "epoch": 0.34247137302464953, + "grad_norm": 0.3172580897808075, + "learning_rate": 4.1566547121073274e-05, + "loss": 0.1593, + "step": 19201 + }, + { + "epoch": 0.3424892091463632, + "grad_norm": 0.2661484181880951, + "learning_rate": 4.1565381391319045e-05, + "loss": 0.1857, + "step": 19202 + }, + { + "epoch": 0.3425070452680769, + "grad_norm": 0.42309898138046265, + "learning_rate": 4.1564215597351675e-05, + "loss": 0.1459, + "step": 19203 + }, + { + "epoch": 0.3425248813897906, + "grad_norm": 0.24071981012821198, + "learning_rate": 4.1563049739175684e-05, + "loss": 0.1326, + "step": 19204 + }, + { + "epoch": 0.3425427175115043, + "grad_norm": 0.399253249168396, + "learning_rate": 4.156188381679559e-05, + "loss": 0.1559, + "step": 19205 + }, + { + "epoch": 0.34256055363321797, + "grad_norm": 0.28449612855911255, + "learning_rate": 4.1560717830215924e-05, + "loss": 0.1387, + "step": 19206 + }, + { + "epoch": 0.3425783897549317, + "grad_norm": 0.2965076267719269, + "learning_rate": 4.155955177944119e-05, + "loss": 0.147, + "step": 19207 + }, + { + "epoch": 0.3425962258766454, + "grad_norm": 0.23407036066055298, + "learning_rate": 4.155838566447591e-05, + "loss": 0.153, + "step": 19208 + }, + { + "epoch": 0.3426140619983591, + "grad_norm": 0.2760251760482788, + "learning_rate": 4.155721948532462e-05, + "loss": 0.2395, + "step": 19209 + }, + { + "epoch": 0.3426318981200728, + "grad_norm": 0.21404099464416504, + "learning_rate": 4.155605324199181e-05, + "loss": 0.1712, + "step": 19210 + }, + { + "epoch": 0.34264973424178646, + "grad_norm": 0.292790949344635, + "learning_rate": 4.155488693448203e-05, + "loss": 0.1192, + "step": 19211 + }, + { + "epoch": 0.34266757036350015, + "grad_norm": 0.34389957785606384, + "learning_rate": 4.155372056279979e-05, + "loss": 0.1546, + "step": 19212 + }, + { + "epoch": 0.34268540648521384, + "grad_norm": 0.3921763598918915, + "learning_rate": 4.155255412694962e-05, + "loss": 0.1958, + "step": 19213 + }, + { + "epoch": 0.3427032426069275, + "grad_norm": 0.44704189896583557, + "learning_rate": 4.155138762693602e-05, + "loss": 0.1292, + "step": 19214 + }, + { + "epoch": 0.34272107872864127, + "grad_norm": 0.3125969469547272, + "learning_rate": 4.155022106276353e-05, + "loss": 0.1621, + "step": 19215 + }, + { + "epoch": 0.34273891485035496, + "grad_norm": 0.23105350136756897, + "learning_rate": 4.1549054434436665e-05, + "loss": 0.1834, + "step": 19216 + }, + { + "epoch": 0.34275675097206865, + "grad_norm": 0.24655385315418243, + "learning_rate": 4.1547887741959954e-05, + "loss": 0.1686, + "step": 19217 + }, + { + "epoch": 0.34277458709378233, + "grad_norm": 0.3027952015399933, + "learning_rate": 4.1546720985337904e-05, + "loss": 0.1472, + "step": 19218 + }, + { + "epoch": 0.342792423215496, + "grad_norm": 0.3027576208114624, + "learning_rate": 4.154555416457506e-05, + "loss": 0.1678, + "step": 19219 + }, + { + "epoch": 0.3428102593372097, + "grad_norm": 0.28220006823539734, + "learning_rate": 4.154438727967592e-05, + "loss": 0.1136, + "step": 19220 + }, + { + "epoch": 0.3428280954589234, + "grad_norm": 0.3291475176811218, + "learning_rate": 4.154322033064504e-05, + "loss": 0.178, + "step": 19221 + }, + { + "epoch": 0.3428459315806371, + "grad_norm": 0.2588566541671753, + "learning_rate": 4.154205331748691e-05, + "loss": 0.1692, + "step": 19222 + }, + { + "epoch": 0.3428637677023508, + "grad_norm": 0.24905306100845337, + "learning_rate": 4.1540886240206064e-05, + "loss": 0.2073, + "step": 19223 + }, + { + "epoch": 0.3428816038240645, + "grad_norm": 0.30145522952079773, + "learning_rate": 4.153971909880705e-05, + "loss": 0.2173, + "step": 19224 + }, + { + "epoch": 0.3428994399457782, + "grad_norm": 0.24704104661941528, + "learning_rate": 4.153855189329435e-05, + "loss": 0.1644, + "step": 19225 + }, + { + "epoch": 0.3429172760674919, + "grad_norm": 0.2351163923740387, + "learning_rate": 4.1537384623672523e-05, + "loss": 0.1173, + "step": 19226 + }, + { + "epoch": 0.3429351121892056, + "grad_norm": 0.2364279180765152, + "learning_rate": 4.1536217289946077e-05, + "loss": 0.1555, + "step": 19227 + }, + { + "epoch": 0.34295294831091927, + "grad_norm": 0.3030035197734833, + "learning_rate": 4.153504989211955e-05, + "loss": 0.2037, + "step": 19228 + }, + { + "epoch": 0.34297078443263296, + "grad_norm": 0.22794194519519806, + "learning_rate": 4.1533882430197446e-05, + "loss": 0.2207, + "step": 19229 + }, + { + "epoch": 0.34298862055434665, + "grad_norm": 0.31897976994514465, + "learning_rate": 4.153271490418431e-05, + "loss": 0.2311, + "step": 19230 + }, + { + "epoch": 0.34300645667606033, + "grad_norm": 0.3582265079021454, + "learning_rate": 4.1531547314084664e-05, + "loss": 0.1762, + "step": 19231 + }, + { + "epoch": 0.3430242927977741, + "grad_norm": 0.2571417987346649, + "learning_rate": 4.153037965990302e-05, + "loss": 0.1894, + "step": 19232 + }, + { + "epoch": 0.34304212891948777, + "grad_norm": 0.2909238040447235, + "learning_rate": 4.152921194164392e-05, + "loss": 0.1604, + "step": 19233 + }, + { + "epoch": 0.34305996504120145, + "grad_norm": 0.28462904691696167, + "learning_rate": 4.152804415931189e-05, + "loss": 0.1806, + "step": 19234 + }, + { + "epoch": 0.34307780116291514, + "grad_norm": 0.2746331989765167, + "learning_rate": 4.152687631291145e-05, + "loss": 0.2101, + "step": 19235 + }, + { + "epoch": 0.34309563728462883, + "grad_norm": 0.3891318440437317, + "learning_rate": 4.152570840244713e-05, + "loss": 0.2383, + "step": 19236 + }, + { + "epoch": 0.3431134734063425, + "grad_norm": 0.32095879316329956, + "learning_rate": 4.152454042792345e-05, + "loss": 0.1465, + "step": 19237 + }, + { + "epoch": 0.3431313095280562, + "grad_norm": 0.2463955283164978, + "learning_rate": 4.152337238934495e-05, + "loss": 0.162, + "step": 19238 + }, + { + "epoch": 0.3431491456497699, + "grad_norm": 0.3604165315628052, + "learning_rate": 4.152220428671615e-05, + "loss": 0.2, + "step": 19239 + }, + { + "epoch": 0.3431669817714836, + "grad_norm": 0.29314178228378296, + "learning_rate": 4.152103612004158e-05, + "loss": 0.1881, + "step": 19240 + }, + { + "epoch": 0.3431848178931973, + "grad_norm": 0.30398428440093994, + "learning_rate": 4.151986788932577e-05, + "loss": 0.1742, + "step": 19241 + }, + { + "epoch": 0.343202654014911, + "grad_norm": 0.28699666261672974, + "learning_rate": 4.151869959457324e-05, + "loss": 0.17, + "step": 19242 + }, + { + "epoch": 0.3432204901366247, + "grad_norm": 0.21540312469005585, + "learning_rate": 4.151753123578853e-05, + "loss": 0.1988, + "step": 19243 + }, + { + "epoch": 0.3432383262583384, + "grad_norm": 0.24821065366268158, + "learning_rate": 4.151636281297616e-05, + "loss": 0.1586, + "step": 19244 + }, + { + "epoch": 0.3432561623800521, + "grad_norm": 0.32822316884994507, + "learning_rate": 4.1515194326140674e-05, + "loss": 0.2074, + "step": 19245 + }, + { + "epoch": 0.34327399850176576, + "grad_norm": 0.2227536141872406, + "learning_rate": 4.151402577528658e-05, + "loss": 0.1384, + "step": 19246 + }, + { + "epoch": 0.34329183462347945, + "grad_norm": 0.2184383124113083, + "learning_rate": 4.151285716041842e-05, + "loss": 0.1654, + "step": 19247 + }, + { + "epoch": 0.34330967074519314, + "grad_norm": 0.3004356324672699, + "learning_rate": 4.151168848154072e-05, + "loss": 0.1441, + "step": 19248 + }, + { + "epoch": 0.3433275068669069, + "grad_norm": 0.2525661587715149, + "learning_rate": 4.151051973865802e-05, + "loss": 0.1954, + "step": 19249 + }, + { + "epoch": 0.34334534298862057, + "grad_norm": 0.2357158660888672, + "learning_rate": 4.1509350931774835e-05, + "loss": 0.152, + "step": 19250 + }, + { + "epoch": 0.34336317911033426, + "grad_norm": 0.2263195663690567, + "learning_rate": 4.1508182060895713e-05, + "loss": 0.1229, + "step": 19251 + }, + { + "epoch": 0.34338101523204795, + "grad_norm": 0.2520339787006378, + "learning_rate": 4.150701312602518e-05, + "loss": 0.1859, + "step": 19252 + }, + { + "epoch": 0.34339885135376164, + "grad_norm": 0.2920095920562744, + "learning_rate": 4.1505844127167745e-05, + "loss": 0.1861, + "step": 19253 + }, + { + "epoch": 0.3434166874754753, + "grad_norm": 0.28727173805236816, + "learning_rate": 4.1504675064327965e-05, + "loss": 0.172, + "step": 19254 + }, + { + "epoch": 0.343434523597189, + "grad_norm": 0.28142687678337097, + "learning_rate": 4.150350593751036e-05, + "loss": 0.157, + "step": 19255 + }, + { + "epoch": 0.3434523597189027, + "grad_norm": 0.23413558304309845, + "learning_rate": 4.1502336746719475e-05, + "loss": 0.1602, + "step": 19256 + }, + { + "epoch": 0.34347019584061644, + "grad_norm": 0.48061373829841614, + "learning_rate": 4.1501167491959824e-05, + "loss": 0.1552, + "step": 19257 + }, + { + "epoch": 0.34348803196233013, + "grad_norm": 0.22398672997951508, + "learning_rate": 4.149999817323595e-05, + "loss": 0.1409, + "step": 19258 + }, + { + "epoch": 0.3435058680840438, + "grad_norm": 0.230852872133255, + "learning_rate": 4.149882879055239e-05, + "loss": 0.1632, + "step": 19259 + }, + { + "epoch": 0.3435237042057575, + "grad_norm": 0.2472366839647293, + "learning_rate": 4.149765934391366e-05, + "loss": 0.1625, + "step": 19260 + }, + { + "epoch": 0.3435415403274712, + "grad_norm": 0.23507563769817352, + "learning_rate": 4.149648983332432e-05, + "loss": 0.1448, + "step": 19261 + }, + { + "epoch": 0.3435593764491849, + "grad_norm": 0.3527578115463257, + "learning_rate": 4.149532025878888e-05, + "loss": 0.1565, + "step": 19262 + }, + { + "epoch": 0.34357721257089857, + "grad_norm": 0.3142579197883606, + "learning_rate": 4.149415062031188e-05, + "loss": 0.1599, + "step": 19263 + }, + { + "epoch": 0.34359504869261226, + "grad_norm": 0.2729453146457672, + "learning_rate": 4.149298091789785e-05, + "loss": 0.1629, + "step": 19264 + }, + { + "epoch": 0.34361288481432595, + "grad_norm": 0.29182925820350647, + "learning_rate": 4.149181115155134e-05, + "loss": 0.1501, + "step": 19265 + }, + { + "epoch": 0.3436307209360397, + "grad_norm": 0.2653812766075134, + "learning_rate": 4.149064132127687e-05, + "loss": 0.1825, + "step": 19266 + }, + { + "epoch": 0.3436485570577534, + "grad_norm": 0.22801761329174042, + "learning_rate": 4.1489471427078976e-05, + "loss": 0.121, + "step": 19267 + }, + { + "epoch": 0.34366639317946707, + "grad_norm": 0.340135782957077, + "learning_rate": 4.14883014689622e-05, + "loss": 0.1662, + "step": 19268 + }, + { + "epoch": 0.34368422930118075, + "grad_norm": 0.20019252598285675, + "learning_rate": 4.148713144693107e-05, + "loss": 0.1477, + "step": 19269 + }, + { + "epoch": 0.34370206542289444, + "grad_norm": 0.40863582491874695, + "learning_rate": 4.148596136099012e-05, + "loss": 0.1712, + "step": 19270 + }, + { + "epoch": 0.34371990154460813, + "grad_norm": 0.24625706672668457, + "learning_rate": 4.1484791211143896e-05, + "loss": 0.1362, + "step": 19271 + }, + { + "epoch": 0.3437377376663218, + "grad_norm": 0.3434460163116455, + "learning_rate": 4.148362099739693e-05, + "loss": 0.1471, + "step": 19272 + }, + { + "epoch": 0.3437555737880355, + "grad_norm": 0.19441038370132446, + "learning_rate": 4.148245071975375e-05, + "loss": 0.145, + "step": 19273 + }, + { + "epoch": 0.34377340990974925, + "grad_norm": 0.2680801749229431, + "learning_rate": 4.1481280378218904e-05, + "loss": 0.1824, + "step": 19274 + }, + { + "epoch": 0.34379124603146294, + "grad_norm": 0.32224753499031067, + "learning_rate": 4.148010997279691e-05, + "loss": 0.2753, + "step": 19275 + }, + { + "epoch": 0.3438090821531766, + "grad_norm": 0.21381701529026031, + "learning_rate": 4.147893950349233e-05, + "loss": 0.1671, + "step": 19276 + }, + { + "epoch": 0.3438269182748903, + "grad_norm": 0.21210013329982758, + "learning_rate": 4.1477768970309686e-05, + "loss": 0.1478, + "step": 19277 + }, + { + "epoch": 0.343844754396604, + "grad_norm": 0.23080043494701385, + "learning_rate": 4.1476598373253515e-05, + "loss": 0.1416, + "step": 19278 + }, + { + "epoch": 0.3438625905183177, + "grad_norm": 0.2671366333961487, + "learning_rate": 4.1475427712328356e-05, + "loss": 0.1137, + "step": 19279 + }, + { + "epoch": 0.3438804266400314, + "grad_norm": 0.3107810318470001, + "learning_rate": 4.1474256987538756e-05, + "loss": 0.1924, + "step": 19280 + }, + { + "epoch": 0.34389826276174507, + "grad_norm": 0.28378915786743164, + "learning_rate": 4.147308619888924e-05, + "loss": 0.1628, + "step": 19281 + }, + { + "epoch": 0.34391609888345875, + "grad_norm": 0.2703215479850769, + "learning_rate": 4.147191534638436e-05, + "loss": 0.2018, + "step": 19282 + }, + { + "epoch": 0.3439339350051725, + "grad_norm": 0.16677279770374298, + "learning_rate": 4.147074443002864e-05, + "loss": 0.138, + "step": 19283 + }, + { + "epoch": 0.3439517711268862, + "grad_norm": 0.26388025283813477, + "learning_rate": 4.1469573449826624e-05, + "loss": 0.185, + "step": 19284 + }, + { + "epoch": 0.3439696072485999, + "grad_norm": 0.4443589448928833, + "learning_rate": 4.146840240578286e-05, + "loss": 0.206, + "step": 19285 + }, + { + "epoch": 0.34398744337031356, + "grad_norm": 0.22026492655277252, + "learning_rate": 4.1467231297901874e-05, + "loss": 0.2038, + "step": 19286 + }, + { + "epoch": 0.34400527949202725, + "grad_norm": 0.20402655005455017, + "learning_rate": 4.1466060126188214e-05, + "loss": 0.1777, + "step": 19287 + }, + { + "epoch": 0.34402311561374094, + "grad_norm": 0.2890566885471344, + "learning_rate": 4.146488889064642e-05, + "loss": 0.1513, + "step": 19288 + }, + { + "epoch": 0.3440409517354546, + "grad_norm": 0.3009870648384094, + "learning_rate": 4.146371759128103e-05, + "loss": 0.1686, + "step": 19289 + }, + { + "epoch": 0.3440587878571683, + "grad_norm": 0.324984610080719, + "learning_rate": 4.1462546228096585e-05, + "loss": 0.1814, + "step": 19290 + }, + { + "epoch": 0.34407662397888206, + "grad_norm": 0.26800447702407837, + "learning_rate": 4.146137480109762e-05, + "loss": 0.1972, + "step": 19291 + }, + { + "epoch": 0.34409446010059574, + "grad_norm": 0.3157986104488373, + "learning_rate": 4.146020331028868e-05, + "loss": 0.1999, + "step": 19292 + }, + { + "epoch": 0.34411229622230943, + "grad_norm": 0.25662344694137573, + "learning_rate": 4.1459031755674314e-05, + "loss": 0.1446, + "step": 19293 + }, + { + "epoch": 0.3441301323440231, + "grad_norm": 0.2624521851539612, + "learning_rate": 4.145786013725905e-05, + "loss": 0.1176, + "step": 19294 + }, + { + "epoch": 0.3441479684657368, + "grad_norm": 0.21069125831127167, + "learning_rate": 4.145668845504744e-05, + "loss": 0.1703, + "step": 19295 + }, + { + "epoch": 0.3441658045874505, + "grad_norm": 0.3734355866909027, + "learning_rate": 4.1455516709044016e-05, + "loss": 0.1622, + "step": 19296 + }, + { + "epoch": 0.3441836407091642, + "grad_norm": 0.27586856484413147, + "learning_rate": 4.145434489925333e-05, + "loss": 0.1849, + "step": 19297 + }, + { + "epoch": 0.3442014768308779, + "grad_norm": 0.2133214771747589, + "learning_rate": 4.1453173025679916e-05, + "loss": 0.1763, + "step": 19298 + }, + { + "epoch": 0.3442193129525916, + "grad_norm": 0.2775794565677643, + "learning_rate": 4.145200108832833e-05, + "loss": 0.1528, + "step": 19299 + }, + { + "epoch": 0.3442371490743053, + "grad_norm": 0.26951631903648376, + "learning_rate": 4.145082908720309e-05, + "loss": 0.1893, + "step": 19300 + }, + { + "epoch": 0.344254985196019, + "grad_norm": 0.24548238515853882, + "learning_rate": 4.144965702230877e-05, + "loss": 0.1619, + "step": 19301 + }, + { + "epoch": 0.3442728213177327, + "grad_norm": 0.2351071536540985, + "learning_rate": 4.144848489364989e-05, + "loss": 0.1674, + "step": 19302 + }, + { + "epoch": 0.34429065743944637, + "grad_norm": 0.2926071882247925, + "learning_rate": 4.1447312701231e-05, + "loss": 0.1681, + "step": 19303 + }, + { + "epoch": 0.34430849356116006, + "grad_norm": 0.24202954769134521, + "learning_rate": 4.1446140445056656e-05, + "loss": 0.1327, + "step": 19304 + }, + { + "epoch": 0.34432632968287374, + "grad_norm": 0.32067757844924927, + "learning_rate": 4.144496812513138e-05, + "loss": 0.131, + "step": 19305 + }, + { + "epoch": 0.34434416580458743, + "grad_norm": 0.2718522548675537, + "learning_rate": 4.1443795741459734e-05, + "loss": 0.1747, + "step": 19306 + }, + { + "epoch": 0.3443620019263011, + "grad_norm": 0.27440202236175537, + "learning_rate": 4.1442623294046246e-05, + "loss": 0.1702, + "step": 19307 + }, + { + "epoch": 0.34437983804801486, + "grad_norm": 0.2720183730125427, + "learning_rate": 4.1441450782895487e-05, + "loss": 0.1594, + "step": 19308 + }, + { + "epoch": 0.34439767416972855, + "grad_norm": 0.2942847013473511, + "learning_rate": 4.1440278208011976e-05, + "loss": 0.1446, + "step": 19309 + }, + { + "epoch": 0.34441551029144224, + "grad_norm": 0.2485940158367157, + "learning_rate": 4.143910556940027e-05, + "loss": 0.1635, + "step": 19310 + }, + { + "epoch": 0.3444333464131559, + "grad_norm": 0.20638635754585266, + "learning_rate": 4.143793286706491e-05, + "loss": 0.0965, + "step": 19311 + }, + { + "epoch": 0.3444511825348696, + "grad_norm": 0.29417091608047485, + "learning_rate": 4.143676010101045e-05, + "loss": 0.2289, + "step": 19312 + }, + { + "epoch": 0.3444690186565833, + "grad_norm": 0.25009727478027344, + "learning_rate": 4.143558727124142e-05, + "loss": 0.1516, + "step": 19313 + }, + { + "epoch": 0.344486854778297, + "grad_norm": 0.2975854277610779, + "learning_rate": 4.1434414377762386e-05, + "loss": 0.1704, + "step": 19314 + }, + { + "epoch": 0.3445046909000107, + "grad_norm": 0.3443279266357422, + "learning_rate": 4.1433241420577886e-05, + "loss": 0.2154, + "step": 19315 + }, + { + "epoch": 0.3445225270217244, + "grad_norm": 0.33312395215034485, + "learning_rate": 4.143206839969247e-05, + "loss": 0.2016, + "step": 19316 + }, + { + "epoch": 0.3445403631434381, + "grad_norm": 0.2558199167251587, + "learning_rate": 4.143089531511067e-05, + "loss": 0.1445, + "step": 19317 + }, + { + "epoch": 0.3445581992651518, + "grad_norm": 0.2809750735759735, + "learning_rate": 4.142972216683705e-05, + "loss": 0.1312, + "step": 19318 + }, + { + "epoch": 0.3445760353868655, + "grad_norm": 0.5312844514846802, + "learning_rate": 4.1428548954876155e-05, + "loss": 0.1531, + "step": 19319 + }, + { + "epoch": 0.3445938715085792, + "grad_norm": 0.2636854350566864, + "learning_rate": 4.142737567923253e-05, + "loss": 0.1529, + "step": 19320 + }, + { + "epoch": 0.34461170763029286, + "grad_norm": 0.2276621013879776, + "learning_rate": 4.142620233991072e-05, + "loss": 0.1445, + "step": 19321 + }, + { + "epoch": 0.34462954375200655, + "grad_norm": 0.3009655177593231, + "learning_rate": 4.142502893691528e-05, + "loss": 0.1354, + "step": 19322 + }, + { + "epoch": 0.34464737987372024, + "grad_norm": 0.22071021795272827, + "learning_rate": 4.1423855470250756e-05, + "loss": 0.122, + "step": 19323 + }, + { + "epoch": 0.3446652159954339, + "grad_norm": 0.24714696407318115, + "learning_rate": 4.142268193992169e-05, + "loss": 0.1485, + "step": 19324 + }, + { + "epoch": 0.34468305211714767, + "grad_norm": 0.27075421810150146, + "learning_rate": 4.142150834593264e-05, + "loss": 0.1582, + "step": 19325 + }, + { + "epoch": 0.34470088823886136, + "grad_norm": 0.28965651988983154, + "learning_rate": 4.142033468828815e-05, + "loss": 0.1921, + "step": 19326 + }, + { + "epoch": 0.34471872436057505, + "grad_norm": 0.36095792055130005, + "learning_rate": 4.141916096699277e-05, + "loss": 0.1264, + "step": 19327 + }, + { + "epoch": 0.34473656048228873, + "grad_norm": 0.29419201612472534, + "learning_rate": 4.1417987182051057e-05, + "loss": 0.1657, + "step": 19328 + }, + { + "epoch": 0.3447543966040024, + "grad_norm": 0.20871809124946594, + "learning_rate": 4.141681333346755e-05, + "loss": 0.1623, + "step": 19329 + }, + { + "epoch": 0.3447722327257161, + "grad_norm": 0.27883103489875793, + "learning_rate": 4.141563942124681e-05, + "loss": 0.1552, + "step": 19330 + }, + { + "epoch": 0.3447900688474298, + "grad_norm": 0.561913251876831, + "learning_rate": 4.141446544539337e-05, + "loss": 0.1955, + "step": 19331 + }, + { + "epoch": 0.3448079049691435, + "grad_norm": 0.29841604828834534, + "learning_rate": 4.14132914059118e-05, + "loss": 0.1586, + "step": 19332 + }, + { + "epoch": 0.34482574109085723, + "grad_norm": 0.22488825023174286, + "learning_rate": 4.1412117302806644e-05, + "loss": 0.105, + "step": 19333 + }, + { + "epoch": 0.3448435772125709, + "grad_norm": 0.3224492371082306, + "learning_rate": 4.141094313608246e-05, + "loss": 0.1544, + "step": 19334 + }, + { + "epoch": 0.3448614133342846, + "grad_norm": 0.21744197607040405, + "learning_rate": 4.140976890574378e-05, + "loss": 0.1161, + "step": 19335 + }, + { + "epoch": 0.3448792494559983, + "grad_norm": 0.2855767011642456, + "learning_rate": 4.140859461179517e-05, + "loss": 0.1487, + "step": 19336 + }, + { + "epoch": 0.344897085577712, + "grad_norm": 0.2090531885623932, + "learning_rate": 4.140742025424118e-05, + "loss": 0.1845, + "step": 19337 + }, + { + "epoch": 0.34491492169942567, + "grad_norm": 0.2449500858783722, + "learning_rate": 4.140624583308637e-05, + "loss": 0.1654, + "step": 19338 + }, + { + "epoch": 0.34493275782113936, + "grad_norm": 0.1828378140926361, + "learning_rate": 4.140507134833528e-05, + "loss": 0.1117, + "step": 19339 + }, + { + "epoch": 0.34495059394285305, + "grad_norm": 0.19028717279434204, + "learning_rate": 4.1403896799992465e-05, + "loss": 0.1424, + "step": 19340 + }, + { + "epoch": 0.34496843006456673, + "grad_norm": 0.29808419942855835, + "learning_rate": 4.140272218806248e-05, + "loss": 0.1853, + "step": 19341 + }, + { + "epoch": 0.3449862661862805, + "grad_norm": 0.3018692135810852, + "learning_rate": 4.1401547512549884e-05, + "loss": 0.1226, + "step": 19342 + }, + { + "epoch": 0.34500410230799416, + "grad_norm": 0.32953277230262756, + "learning_rate": 4.140037277345922e-05, + "loss": 0.2063, + "step": 19343 + }, + { + "epoch": 0.34502193842970785, + "grad_norm": 0.2793973684310913, + "learning_rate": 4.1399197970795045e-05, + "loss": 0.219, + "step": 19344 + }, + { + "epoch": 0.34503977455142154, + "grad_norm": 0.2436210960149765, + "learning_rate": 4.139802310456192e-05, + "loss": 0.1495, + "step": 19345 + }, + { + "epoch": 0.34505761067313523, + "grad_norm": 0.22010815143585205, + "learning_rate": 4.139684817476439e-05, + "loss": 0.1974, + "step": 19346 + }, + { + "epoch": 0.3450754467948489, + "grad_norm": 0.26123496890068054, + "learning_rate": 4.1395673181407014e-05, + "loss": 0.1422, + "step": 19347 + }, + { + "epoch": 0.3450932829165626, + "grad_norm": 0.2590912878513336, + "learning_rate": 4.139449812449435e-05, + "loss": 0.2137, + "step": 19348 + }, + { + "epoch": 0.3451111190382763, + "grad_norm": 0.2490239143371582, + "learning_rate": 4.139332300403094e-05, + "loss": 0.129, + "step": 19349 + }, + { + "epoch": 0.34512895515999004, + "grad_norm": 0.2790435254573822, + "learning_rate": 4.139214782002135e-05, + "loss": 0.1489, + "step": 19350 + }, + { + "epoch": 0.3451467912817037, + "grad_norm": 0.25684571266174316, + "learning_rate": 4.1390972572470145e-05, + "loss": 0.1373, + "step": 19351 + }, + { + "epoch": 0.3451646274034174, + "grad_norm": 0.20819757878780365, + "learning_rate": 4.138979726138186e-05, + "loss": 0.1703, + "step": 19352 + }, + { + "epoch": 0.3451824635251311, + "grad_norm": 0.3392389714717865, + "learning_rate": 4.138862188676106e-05, + "loss": 0.1651, + "step": 19353 + }, + { + "epoch": 0.3452002996468448, + "grad_norm": 0.4044768214225769, + "learning_rate": 4.13874464486123e-05, + "loss": 0.1924, + "step": 19354 + }, + { + "epoch": 0.3452181357685585, + "grad_norm": 0.28278350830078125, + "learning_rate": 4.138627094694014e-05, + "loss": 0.1944, + "step": 19355 + }, + { + "epoch": 0.34523597189027216, + "grad_norm": 0.2475716918706894, + "learning_rate": 4.1385095381749134e-05, + "loss": 0.183, + "step": 19356 + }, + { + "epoch": 0.34525380801198585, + "grad_norm": 0.2546561062335968, + "learning_rate": 4.1383919753043834e-05, + "loss": 0.1761, + "step": 19357 + }, + { + "epoch": 0.3452716441336996, + "grad_norm": 0.33256638050079346, + "learning_rate": 4.138274406082881e-05, + "loss": 0.2232, + "step": 19358 + }, + { + "epoch": 0.3452894802554133, + "grad_norm": 0.24753150343894958, + "learning_rate": 4.138156830510861e-05, + "loss": 0.1647, + "step": 19359 + }, + { + "epoch": 0.34530731637712697, + "grad_norm": 0.2755405604839325, + "learning_rate": 4.1380392485887785e-05, + "loss": 0.1229, + "step": 19360 + }, + { + "epoch": 0.34532515249884066, + "grad_norm": 0.2919138967990875, + "learning_rate": 4.137921660317091e-05, + "loss": 0.1804, + "step": 19361 + }, + { + "epoch": 0.34534298862055435, + "grad_norm": 0.25106585025787354, + "learning_rate": 4.1378040656962534e-05, + "loss": 0.1517, + "step": 19362 + }, + { + "epoch": 0.34536082474226804, + "grad_norm": 0.22420300543308258, + "learning_rate": 4.137686464726721e-05, + "loss": 0.1721, + "step": 19363 + }, + { + "epoch": 0.3453786608639817, + "grad_norm": 0.3089165985584259, + "learning_rate": 4.1375688574089514e-05, + "loss": 0.1381, + "step": 19364 + }, + { + "epoch": 0.3453964969856954, + "grad_norm": 0.27446988224983215, + "learning_rate": 4.137451243743398e-05, + "loss": 0.1994, + "step": 19365 + }, + { + "epoch": 0.3454143331074091, + "grad_norm": 0.231646329164505, + "learning_rate": 4.137333623730518e-05, + "loss": 0.1655, + "step": 19366 + }, + { + "epoch": 0.34543216922912284, + "grad_norm": 0.2537112534046173, + "learning_rate": 4.137215997370768e-05, + "loss": 0.1078, + "step": 19367 + }, + { + "epoch": 0.34545000535083653, + "grad_norm": 0.25047567486763, + "learning_rate": 4.1370983646646034e-05, + "loss": 0.1943, + "step": 19368 + }, + { + "epoch": 0.3454678414725502, + "grad_norm": 0.29900601506233215, + "learning_rate": 4.13698072561248e-05, + "loss": 0.1746, + "step": 19369 + }, + { + "epoch": 0.3454856775942639, + "grad_norm": 0.3534618020057678, + "learning_rate": 4.1368630802148534e-05, + "loss": 0.1446, + "step": 19370 + }, + { + "epoch": 0.3455035137159776, + "grad_norm": 0.32002121210098267, + "learning_rate": 4.1367454284721805e-05, + "loss": 0.199, + "step": 19371 + }, + { + "epoch": 0.3455213498376913, + "grad_norm": 0.3234451115131378, + "learning_rate": 4.136627770384917e-05, + "loss": 0.177, + "step": 19372 + }, + { + "epoch": 0.34553918595940497, + "grad_norm": 0.22080664336681366, + "learning_rate": 4.136510105953519e-05, + "loss": 0.2138, + "step": 19373 + }, + { + "epoch": 0.34555702208111866, + "grad_norm": 0.330020934343338, + "learning_rate": 4.136392435178443e-05, + "loss": 0.1813, + "step": 19374 + }, + { + "epoch": 0.3455748582028324, + "grad_norm": 0.23628132045269012, + "learning_rate": 4.136274758060144e-05, + "loss": 0.113, + "step": 19375 + }, + { + "epoch": 0.3455926943245461, + "grad_norm": 0.2844630479812622, + "learning_rate": 4.13615707459908e-05, + "loss": 0.1711, + "step": 19376 + }, + { + "epoch": 0.3456105304462598, + "grad_norm": 0.2452024221420288, + "learning_rate": 4.1360393847957045e-05, + "loss": 0.1421, + "step": 19377 + }, + { + "epoch": 0.34562836656797347, + "grad_norm": 0.25904181599617004, + "learning_rate": 4.1359216886504756e-05, + "loss": 0.2128, + "step": 19378 + }, + { + "epoch": 0.34564620268968715, + "grad_norm": 0.2607460021972656, + "learning_rate": 4.1358039861638505e-05, + "loss": 0.1727, + "step": 19379 + }, + { + "epoch": 0.34566403881140084, + "grad_norm": 0.22307580709457397, + "learning_rate": 4.135686277336284e-05, + "loss": 0.1376, + "step": 19380 + }, + { + "epoch": 0.34568187493311453, + "grad_norm": 0.292441189289093, + "learning_rate": 4.135568562168231e-05, + "loss": 0.1604, + "step": 19381 + }, + { + "epoch": 0.3456997110548282, + "grad_norm": 0.2761858105659485, + "learning_rate": 4.1354508406601504e-05, + "loss": 0.1722, + "step": 19382 + }, + { + "epoch": 0.3457175471765419, + "grad_norm": 0.26929807662963867, + "learning_rate": 4.135333112812497e-05, + "loss": 0.1403, + "step": 19383 + }, + { + "epoch": 0.34573538329825565, + "grad_norm": 0.2326447069644928, + "learning_rate": 4.135215378625729e-05, + "loss": 0.1542, + "step": 19384 + }, + { + "epoch": 0.34575321941996934, + "grad_norm": 0.3189031183719635, + "learning_rate": 4.1350976381002994e-05, + "loss": 0.1687, + "step": 19385 + }, + { + "epoch": 0.345771055541683, + "grad_norm": 0.2811721861362457, + "learning_rate": 4.1349798912366675e-05, + "loss": 0.1506, + "step": 19386 + }, + { + "epoch": 0.3457888916633967, + "grad_norm": 0.25603535771369934, + "learning_rate": 4.134862138035289e-05, + "loss": 0.2115, + "step": 19387 + }, + { + "epoch": 0.3458067277851104, + "grad_norm": 0.3122118413448334, + "learning_rate": 4.13474437849662e-05, + "loss": 0.1875, + "step": 19388 + }, + { + "epoch": 0.3458245639068241, + "grad_norm": 0.218451589345932, + "learning_rate": 4.1346266126211177e-05, + "loss": 0.1754, + "step": 19389 + }, + { + "epoch": 0.3458424000285378, + "grad_norm": 0.2716617286205292, + "learning_rate": 4.1345088404092374e-05, + "loss": 0.1164, + "step": 19390 + }, + { + "epoch": 0.34586023615025147, + "grad_norm": 0.28143441677093506, + "learning_rate": 4.134391061861436e-05, + "loss": 0.1815, + "step": 19391 + }, + { + "epoch": 0.3458780722719652, + "grad_norm": 0.39164450764656067, + "learning_rate": 4.1342732769781714e-05, + "loss": 0.1882, + "step": 19392 + }, + { + "epoch": 0.3458959083936789, + "grad_norm": 0.34961459040641785, + "learning_rate": 4.1341554857598984e-05, + "loss": 0.1431, + "step": 19393 + }, + { + "epoch": 0.3459137445153926, + "grad_norm": 0.3550502359867096, + "learning_rate": 4.1340376882070745e-05, + "loss": 0.1518, + "step": 19394 + }, + { + "epoch": 0.3459315806371063, + "grad_norm": 0.23281922936439514, + "learning_rate": 4.1339198843201567e-05, + "loss": 0.1566, + "step": 19395 + }, + { + "epoch": 0.34594941675881996, + "grad_norm": 0.3236084580421448, + "learning_rate": 4.1338020740996e-05, + "loss": 0.1545, + "step": 19396 + }, + { + "epoch": 0.34596725288053365, + "grad_norm": 0.3396601676940918, + "learning_rate": 4.133684257545863e-05, + "loss": 0.1691, + "step": 19397 + }, + { + "epoch": 0.34598508900224734, + "grad_norm": 0.2830646336078644, + "learning_rate": 4.133566434659401e-05, + "loss": 0.1795, + "step": 19398 + }, + { + "epoch": 0.346002925123961, + "grad_norm": 0.3032616376876831, + "learning_rate": 4.133448605440672e-05, + "loss": 0.1938, + "step": 19399 + }, + { + "epoch": 0.3460207612456747, + "grad_norm": 0.25963565707206726, + "learning_rate": 4.133330769890131e-05, + "loss": 0.1843, + "step": 19400 + }, + { + "epoch": 0.34603859736738846, + "grad_norm": 0.22302964329719543, + "learning_rate": 4.1332129280082364e-05, + "loss": 0.1482, + "step": 19401 + }, + { + "epoch": 0.34605643348910214, + "grad_norm": 0.1829168200492859, + "learning_rate": 4.133095079795444e-05, + "loss": 0.1483, + "step": 19402 + }, + { + "epoch": 0.34607426961081583, + "grad_norm": 0.34463924169540405, + "learning_rate": 4.132977225252211e-05, + "loss": 0.164, + "step": 19403 + }, + { + "epoch": 0.3460921057325295, + "grad_norm": 0.2819700837135315, + "learning_rate": 4.1328593643789957e-05, + "loss": 0.0853, + "step": 19404 + }, + { + "epoch": 0.3461099418542432, + "grad_norm": 0.2864101529121399, + "learning_rate": 4.1327414971762525e-05, + "loss": 0.1892, + "step": 19405 + }, + { + "epoch": 0.3461277779759569, + "grad_norm": 0.24050670862197876, + "learning_rate": 4.132623623644439e-05, + "loss": 0.1432, + "step": 19406 + }, + { + "epoch": 0.3461456140976706, + "grad_norm": 0.27650803327560425, + "learning_rate": 4.1325057437840123e-05, + "loss": 0.2029, + "step": 19407 + }, + { + "epoch": 0.34616345021938427, + "grad_norm": 0.23626144230365753, + "learning_rate": 4.132387857595431e-05, + "loss": 0.1899, + "step": 19408 + }, + { + "epoch": 0.346181286341098, + "grad_norm": 0.22747544944286346, + "learning_rate": 4.132269965079149e-05, + "loss": 0.1796, + "step": 19409 + }, + { + "epoch": 0.3461991224628117, + "grad_norm": 0.30286508798599243, + "learning_rate": 4.132152066235625e-05, + "loss": 0.1795, + "step": 19410 + }, + { + "epoch": 0.3462169585845254, + "grad_norm": 0.22571417689323425, + "learning_rate": 4.132034161065316e-05, + "loss": 0.1735, + "step": 19411 + }, + { + "epoch": 0.3462347947062391, + "grad_norm": 0.18470734357833862, + "learning_rate": 4.1319162495686796e-05, + "loss": 0.1364, + "step": 19412 + }, + { + "epoch": 0.34625263082795277, + "grad_norm": 0.26024067401885986, + "learning_rate": 4.1317983317461714e-05, + "loss": 0.1666, + "step": 19413 + }, + { + "epoch": 0.34627046694966646, + "grad_norm": 0.23578520119190216, + "learning_rate": 4.1316804075982496e-05, + "loss": 0.1763, + "step": 19414 + }, + { + "epoch": 0.34628830307138014, + "grad_norm": 0.37316691875457764, + "learning_rate": 4.131562477125371e-05, + "loss": 0.1796, + "step": 19415 + }, + { + "epoch": 0.34630613919309383, + "grad_norm": 0.3864397406578064, + "learning_rate": 4.131444540327993e-05, + "loss": 0.1544, + "step": 19416 + }, + { + "epoch": 0.3463239753148076, + "grad_norm": 0.34760424494743347, + "learning_rate": 4.131326597206572e-05, + "loss": 0.2304, + "step": 19417 + }, + { + "epoch": 0.34634181143652126, + "grad_norm": 0.18980322778224945, + "learning_rate": 4.131208647761565e-05, + "loss": 0.1571, + "step": 19418 + }, + { + "epoch": 0.34635964755823495, + "grad_norm": 0.24487678706645966, + "learning_rate": 4.131090691993431e-05, + "loss": 0.1522, + "step": 19419 + }, + { + "epoch": 0.34637748367994864, + "grad_norm": 0.2938240170478821, + "learning_rate": 4.1309727299026255e-05, + "loss": 0.167, + "step": 19420 + }, + { + "epoch": 0.3463953198016623, + "grad_norm": 0.375690221786499, + "learning_rate": 4.1308547614896075e-05, + "loss": 0.1868, + "step": 19421 + }, + { + "epoch": 0.346413155923376, + "grad_norm": 0.25675103068351746, + "learning_rate": 4.1307367867548316e-05, + "loss": 0.2172, + "step": 19422 + }, + { + "epoch": 0.3464309920450897, + "grad_norm": 0.26187554001808167, + "learning_rate": 4.130618805698758e-05, + "loss": 0.1654, + "step": 19423 + }, + { + "epoch": 0.3464488281668034, + "grad_norm": 0.21872815489768982, + "learning_rate": 4.130500818321842e-05, + "loss": 0.1764, + "step": 19424 + }, + { + "epoch": 0.3464666642885171, + "grad_norm": 0.26111993193626404, + "learning_rate": 4.130382824624541e-05, + "loss": 0.1372, + "step": 19425 + }, + { + "epoch": 0.3464845004102308, + "grad_norm": 0.2385575920343399, + "learning_rate": 4.1302648246073145e-05, + "loss": 0.1715, + "step": 19426 + }, + { + "epoch": 0.3465023365319445, + "grad_norm": 0.2510746717453003, + "learning_rate": 4.1301468182706176e-05, + "loss": 0.1295, + "step": 19427 + }, + { + "epoch": 0.3465201726536582, + "grad_norm": 0.32107454538345337, + "learning_rate": 4.1300288056149084e-05, + "loss": 0.1804, + "step": 19428 + }, + { + "epoch": 0.3465380087753719, + "grad_norm": 0.2929087281227112, + "learning_rate": 4.129910786640645e-05, + "loss": 0.1589, + "step": 19429 + }, + { + "epoch": 0.3465558448970856, + "grad_norm": 0.28323599696159363, + "learning_rate": 4.129792761348285e-05, + "loss": 0.1565, + "step": 19430 + }, + { + "epoch": 0.34657368101879926, + "grad_norm": 0.19271722435951233, + "learning_rate": 4.129674729738284e-05, + "loss": 0.1382, + "step": 19431 + }, + { + "epoch": 0.34659151714051295, + "grad_norm": 0.22486954927444458, + "learning_rate": 4.129556691811102e-05, + "loss": 0.1819, + "step": 19432 + }, + { + "epoch": 0.34660935326222664, + "grad_norm": 0.3742815852165222, + "learning_rate": 4.1294386475671955e-05, + "loss": 0.1853, + "step": 19433 + }, + { + "epoch": 0.3466271893839404, + "grad_norm": 0.19420146942138672, + "learning_rate": 4.1293205970070216e-05, + "loss": 0.1717, + "step": 19434 + }, + { + "epoch": 0.34664502550565407, + "grad_norm": 0.2316415160894394, + "learning_rate": 4.129202540131037e-05, + "loss": 0.1695, + "step": 19435 + }, + { + "epoch": 0.34666286162736776, + "grad_norm": 0.20413222908973694, + "learning_rate": 4.129084476939703e-05, + "loss": 0.1538, + "step": 19436 + }, + { + "epoch": 0.34668069774908145, + "grad_norm": 0.36453738808631897, + "learning_rate": 4.128966407433473e-05, + "loss": 0.2106, + "step": 19437 + }, + { + "epoch": 0.34669853387079513, + "grad_norm": 0.33726999163627625, + "learning_rate": 4.128848331612808e-05, + "loss": 0.2279, + "step": 19438 + }, + { + "epoch": 0.3467163699925088, + "grad_norm": 0.2563611567020416, + "learning_rate": 4.128730249478163e-05, + "loss": 0.114, + "step": 19439 + }, + { + "epoch": 0.3467342061142225, + "grad_norm": 0.2862267792224884, + "learning_rate": 4.1286121610299986e-05, + "loss": 0.1915, + "step": 19440 + }, + { + "epoch": 0.3467520422359362, + "grad_norm": 0.2832556962966919, + "learning_rate": 4.1284940662687696e-05, + "loss": 0.2229, + "step": 19441 + }, + { + "epoch": 0.3467698783576499, + "grad_norm": 0.31398025155067444, + "learning_rate": 4.128375965194936e-05, + "loss": 0.1493, + "step": 19442 + }, + { + "epoch": 0.34678771447936363, + "grad_norm": 0.25471949577331543, + "learning_rate": 4.1282578578089534e-05, + "loss": 0.2328, + "step": 19443 + }, + { + "epoch": 0.3468055506010773, + "grad_norm": 0.2671929597854614, + "learning_rate": 4.1281397441112823e-05, + "loss": 0.1614, + "step": 19444 + }, + { + "epoch": 0.346823386722791, + "grad_norm": 0.2736206352710724, + "learning_rate": 4.128021624102378e-05, + "loss": 0.1663, + "step": 19445 + }, + { + "epoch": 0.3468412228445047, + "grad_norm": 0.4274255335330963, + "learning_rate": 4.127903497782701e-05, + "loss": 0.1802, + "step": 19446 + }, + { + "epoch": 0.3468590589662184, + "grad_norm": 0.29369959235191345, + "learning_rate": 4.1277853651527075e-05, + "loss": 0.1822, + "step": 19447 + }, + { + "epoch": 0.34687689508793207, + "grad_norm": 0.24593660235404968, + "learning_rate": 4.127667226212855e-05, + "loss": 0.1353, + "step": 19448 + }, + { + "epoch": 0.34689473120964576, + "grad_norm": 0.3930591344833374, + "learning_rate": 4.127549080963603e-05, + "loss": 0.1963, + "step": 19449 + }, + { + "epoch": 0.34691256733135944, + "grad_norm": 0.3083154857158661, + "learning_rate": 4.127430929405408e-05, + "loss": 0.2248, + "step": 19450 + }, + { + "epoch": 0.3469304034530732, + "grad_norm": 0.25477007031440735, + "learning_rate": 4.127312771538729e-05, + "loss": 0.1788, + "step": 19451 + }, + { + "epoch": 0.3469482395747869, + "grad_norm": 0.21664482355117798, + "learning_rate": 4.127194607364023e-05, + "loss": 0.1457, + "step": 19452 + }, + { + "epoch": 0.34696607569650056, + "grad_norm": 0.2607729136943817, + "learning_rate": 4.127076436881749e-05, + "loss": 0.172, + "step": 19453 + }, + { + "epoch": 0.34698391181821425, + "grad_norm": 0.2806100845336914, + "learning_rate": 4.126958260092365e-05, + "loss": 0.2115, + "step": 19454 + }, + { + "epoch": 0.34700174793992794, + "grad_norm": 0.31121960282325745, + "learning_rate": 4.126840076996329e-05, + "loss": 0.1938, + "step": 19455 + }, + { + "epoch": 0.34701958406164163, + "grad_norm": 0.3361174166202545, + "learning_rate": 4.126721887594098e-05, + "loss": 0.0998, + "step": 19456 + }, + { + "epoch": 0.3470374201833553, + "grad_norm": 0.2513003945350647, + "learning_rate": 4.126603691886132e-05, + "loss": 0.136, + "step": 19457 + }, + { + "epoch": 0.347055256305069, + "grad_norm": 0.26615363359451294, + "learning_rate": 4.126485489872888e-05, + "loss": 0.1571, + "step": 19458 + }, + { + "epoch": 0.34707309242678275, + "grad_norm": 0.26913365721702576, + "learning_rate": 4.126367281554825e-05, + "loss": 0.1893, + "step": 19459 + }, + { + "epoch": 0.34709092854849644, + "grad_norm": 0.3098093867301941, + "learning_rate": 4.1262490669324005e-05, + "loss": 0.199, + "step": 19460 + }, + { + "epoch": 0.3471087646702101, + "grad_norm": 0.2577618658542633, + "learning_rate": 4.126130846006072e-05, + "loss": 0.1995, + "step": 19461 + }, + { + "epoch": 0.3471266007919238, + "grad_norm": 0.24977301061153412, + "learning_rate": 4.126012618776299e-05, + "loss": 0.1648, + "step": 19462 + }, + { + "epoch": 0.3471444369136375, + "grad_norm": 0.21640323102474213, + "learning_rate": 4.12589438524354e-05, + "loss": 0.1713, + "step": 19463 + }, + { + "epoch": 0.3471622730353512, + "grad_norm": 0.281476765871048, + "learning_rate": 4.1257761454082535e-05, + "loss": 0.1961, + "step": 19464 + }, + { + "epoch": 0.3471801091570649, + "grad_norm": 0.21089307963848114, + "learning_rate": 4.1256578992708954e-05, + "loss": 0.1637, + "step": 19465 + }, + { + "epoch": 0.34719794527877856, + "grad_norm": 0.28609150648117065, + "learning_rate": 4.1255396468319265e-05, + "loss": 0.1598, + "step": 19466 + }, + { + "epoch": 0.34721578140049225, + "grad_norm": 0.2323131412267685, + "learning_rate": 4.1254213880918044e-05, + "loss": 0.1672, + "step": 19467 + }, + { + "epoch": 0.347233617522206, + "grad_norm": 0.2568100392818451, + "learning_rate": 4.1253031230509884e-05, + "loss": 0.1372, + "step": 19468 + }, + { + "epoch": 0.3472514536439197, + "grad_norm": 0.2932114005088806, + "learning_rate": 4.125184851709936e-05, + "loss": 0.1747, + "step": 19469 + }, + { + "epoch": 0.34726928976563337, + "grad_norm": 0.30081841349601746, + "learning_rate": 4.125066574069105e-05, + "loss": 0.2114, + "step": 19470 + }, + { + "epoch": 0.34728712588734706, + "grad_norm": 0.3816368877887726, + "learning_rate": 4.124948290128955e-05, + "loss": 0.1471, + "step": 19471 + }, + { + "epoch": 0.34730496200906075, + "grad_norm": 0.30220168828964233, + "learning_rate": 4.124829999889944e-05, + "loss": 0.1502, + "step": 19472 + }, + { + "epoch": 0.34732279813077443, + "grad_norm": 0.21175815165042877, + "learning_rate": 4.124711703352531e-05, + "loss": 0.1739, + "step": 19473 + }, + { + "epoch": 0.3473406342524881, + "grad_norm": 0.3254470229148865, + "learning_rate": 4.124593400517174e-05, + "loss": 0.1609, + "step": 19474 + }, + { + "epoch": 0.3473584703742018, + "grad_norm": 0.25704342126846313, + "learning_rate": 4.124475091384332e-05, + "loss": 0.1735, + "step": 19475 + }, + { + "epoch": 0.34737630649591555, + "grad_norm": 0.2239055186510086, + "learning_rate": 4.124356775954464e-05, + "loss": 0.1936, + "step": 19476 + }, + { + "epoch": 0.34739414261762924, + "grad_norm": 0.29571130871772766, + "learning_rate": 4.124238454228028e-05, + "loss": 0.1138, + "step": 19477 + }, + { + "epoch": 0.34741197873934293, + "grad_norm": 0.36035099625587463, + "learning_rate": 4.124120126205482e-05, + "loss": 0.1758, + "step": 19478 + }, + { + "epoch": 0.3474298148610566, + "grad_norm": 0.25826746225357056, + "learning_rate": 4.124001791887286e-05, + "loss": 0.1607, + "step": 19479 + }, + { + "epoch": 0.3474476509827703, + "grad_norm": 0.3327743709087372, + "learning_rate": 4.1238834512738976e-05, + "loss": 0.1577, + "step": 19480 + }, + { + "epoch": 0.347465487104484, + "grad_norm": 0.1871565878391266, + "learning_rate": 4.1237651043657765e-05, + "loss": 0.1428, + "step": 19481 + }, + { + "epoch": 0.3474833232261977, + "grad_norm": 0.2558317482471466, + "learning_rate": 4.123646751163381e-05, + "loss": 0.1567, + "step": 19482 + }, + { + "epoch": 0.34750115934791137, + "grad_norm": 0.2345658540725708, + "learning_rate": 4.1235283916671695e-05, + "loss": 0.175, + "step": 19483 + }, + { + "epoch": 0.34751899546962506, + "grad_norm": 0.29519787430763245, + "learning_rate": 4.123410025877602e-05, + "loss": 0.1409, + "step": 19484 + }, + { + "epoch": 0.3475368315913388, + "grad_norm": 0.25815102458000183, + "learning_rate": 4.1232916537951363e-05, + "loss": 0.1219, + "step": 19485 + }, + { + "epoch": 0.3475546677130525, + "grad_norm": 0.39939600229263306, + "learning_rate": 4.123173275420231e-05, + "loss": 0.1523, + "step": 19486 + }, + { + "epoch": 0.3475725038347662, + "grad_norm": 0.2604312002658844, + "learning_rate": 4.1230548907533464e-05, + "loss": 0.1718, + "step": 19487 + }, + { + "epoch": 0.34759033995647987, + "grad_norm": 0.301343709230423, + "learning_rate": 4.1229364997949394e-05, + "loss": 0.186, + "step": 19488 + }, + { + "epoch": 0.34760817607819355, + "grad_norm": 0.29630666971206665, + "learning_rate": 4.12281810254547e-05, + "loss": 0.2056, + "step": 19489 + }, + { + "epoch": 0.34762601219990724, + "grad_norm": 0.31824612617492676, + "learning_rate": 4.122699699005398e-05, + "loss": 0.1576, + "step": 19490 + }, + { + "epoch": 0.34764384832162093, + "grad_norm": 0.3133047819137573, + "learning_rate": 4.1225812891751815e-05, + "loss": 0.1398, + "step": 19491 + }, + { + "epoch": 0.3476616844433346, + "grad_norm": 0.31197676062583923, + "learning_rate": 4.1224628730552786e-05, + "loss": 0.1357, + "step": 19492 + }, + { + "epoch": 0.34767952056504836, + "grad_norm": 0.35021182894706726, + "learning_rate": 4.122344450646149e-05, + "loss": 0.2013, + "step": 19493 + }, + { + "epoch": 0.34769735668676205, + "grad_norm": 0.24721401929855347, + "learning_rate": 4.122226021948253e-05, + "loss": 0.1196, + "step": 19494 + }, + { + "epoch": 0.34771519280847574, + "grad_norm": 0.38173720240592957, + "learning_rate": 4.1221075869620486e-05, + "loss": 0.1861, + "step": 19495 + }, + { + "epoch": 0.3477330289301894, + "grad_norm": 0.2375587821006775, + "learning_rate": 4.1219891456879946e-05, + "loss": 0.1624, + "step": 19496 + }, + { + "epoch": 0.3477508650519031, + "grad_norm": 0.4578511714935303, + "learning_rate": 4.1218706981265506e-05, + "loss": 0.2162, + "step": 19497 + }, + { + "epoch": 0.3477687011736168, + "grad_norm": 0.2093442678451538, + "learning_rate": 4.121752244278175e-05, + "loss": 0.1595, + "step": 19498 + }, + { + "epoch": 0.3477865372953305, + "grad_norm": 0.22799724340438843, + "learning_rate": 4.121633784143328e-05, + "loss": 0.1565, + "step": 19499 + }, + { + "epoch": 0.3478043734170442, + "grad_norm": 0.18609677255153656, + "learning_rate": 4.1215153177224686e-05, + "loss": 0.1679, + "step": 19500 + }, + { + "epoch": 0.34782220953875786, + "grad_norm": 0.3047395646572113, + "learning_rate": 4.1213968450160554e-05, + "loss": 0.1433, + "step": 19501 + }, + { + "epoch": 0.3478400456604716, + "grad_norm": 0.2760085165500641, + "learning_rate": 4.1212783660245485e-05, + "loss": 0.1337, + "step": 19502 + }, + { + "epoch": 0.3478578817821853, + "grad_norm": 0.2929195463657379, + "learning_rate": 4.121159880748406e-05, + "loss": 0.2277, + "step": 19503 + }, + { + "epoch": 0.347875717903899, + "grad_norm": 0.41020041704177856, + "learning_rate": 4.1210413891880885e-05, + "loss": 0.1974, + "step": 19504 + }, + { + "epoch": 0.3478935540256127, + "grad_norm": 0.25318020582199097, + "learning_rate": 4.120922891344054e-05, + "loss": 0.1945, + "step": 19505 + }, + { + "epoch": 0.34791139014732636, + "grad_norm": 0.22687697410583496, + "learning_rate": 4.1208043872167634e-05, + "loss": 0.1589, + "step": 19506 + }, + { + "epoch": 0.34792922626904005, + "grad_norm": 0.34781762957572937, + "learning_rate": 4.120685876806675e-05, + "loss": 0.1606, + "step": 19507 + }, + { + "epoch": 0.34794706239075374, + "grad_norm": 0.3028854727745056, + "learning_rate": 4.120567360114248e-05, + "loss": 0.1311, + "step": 19508 + }, + { + "epoch": 0.3479648985124674, + "grad_norm": 0.2773214876651764, + "learning_rate": 4.1204488371399426e-05, + "loss": 0.1712, + "step": 19509 + }, + { + "epoch": 0.34798273463418117, + "grad_norm": 0.2004251629114151, + "learning_rate": 4.120330307884217e-05, + "loss": 0.17, + "step": 19510 + }, + { + "epoch": 0.34800057075589486, + "grad_norm": 0.29450973868370056, + "learning_rate": 4.120211772347532e-05, + "loss": 0.164, + "step": 19511 + }, + { + "epoch": 0.34801840687760854, + "grad_norm": 0.3179113566875458, + "learning_rate": 4.120093230530347e-05, + "loss": 0.2042, + "step": 19512 + }, + { + "epoch": 0.34803624299932223, + "grad_norm": 0.19407083094120026, + "learning_rate": 4.119974682433121e-05, + "loss": 0.103, + "step": 19513 + }, + { + "epoch": 0.3480540791210359, + "grad_norm": 0.2156829982995987, + "learning_rate": 4.119856128056313e-05, + "loss": 0.1394, + "step": 19514 + }, + { + "epoch": 0.3480719152427496, + "grad_norm": 0.2385161817073822, + "learning_rate": 4.119737567400383e-05, + "loss": 0.1878, + "step": 19515 + }, + { + "epoch": 0.3480897513644633, + "grad_norm": 0.2861107885837555, + "learning_rate": 4.119619000465791e-05, + "loss": 0.2155, + "step": 19516 + }, + { + "epoch": 0.348107587486177, + "grad_norm": 0.2676670253276825, + "learning_rate": 4.119500427252997e-05, + "loss": 0.1794, + "step": 19517 + }, + { + "epoch": 0.3481254236078907, + "grad_norm": 0.2797057628631592, + "learning_rate": 4.1193818477624594e-05, + "loss": 0.1848, + "step": 19518 + }, + { + "epoch": 0.3481432597296044, + "grad_norm": 0.2694793939590454, + "learning_rate": 4.119263261994638e-05, + "loss": 0.1199, + "step": 19519 + }, + { + "epoch": 0.3481610958513181, + "grad_norm": 0.25778859853744507, + "learning_rate": 4.119144669949994e-05, + "loss": 0.179, + "step": 19520 + }, + { + "epoch": 0.3481789319730318, + "grad_norm": 0.34032678604125977, + "learning_rate": 4.119026071628985e-05, + "loss": 0.1824, + "step": 19521 + }, + { + "epoch": 0.3481967680947455, + "grad_norm": 0.23092837631702423, + "learning_rate": 4.118907467032073e-05, + "loss": 0.1531, + "step": 19522 + }, + { + "epoch": 0.34821460421645917, + "grad_norm": 0.38058170676231384, + "learning_rate": 4.1187888561597155e-05, + "loss": 0.2384, + "step": 19523 + }, + { + "epoch": 0.34823244033817286, + "grad_norm": 0.34737053513526917, + "learning_rate": 4.118670239012373e-05, + "loss": 0.1765, + "step": 19524 + }, + { + "epoch": 0.34825027645988654, + "grad_norm": 0.2629907727241516, + "learning_rate": 4.118551615590507e-05, + "loss": 0.1841, + "step": 19525 + }, + { + "epoch": 0.34826811258160023, + "grad_norm": 0.22977577149868011, + "learning_rate": 4.1184329858945745e-05, + "loss": 0.1456, + "step": 19526 + }, + { + "epoch": 0.348285948703314, + "grad_norm": 0.37088480591773987, + "learning_rate": 4.1183143499250374e-05, + "loss": 0.1409, + "step": 19527 + }, + { + "epoch": 0.34830378482502766, + "grad_norm": 0.23603902757167816, + "learning_rate": 4.118195707682355e-05, + "loss": 0.1746, + "step": 19528 + }, + { + "epoch": 0.34832162094674135, + "grad_norm": 0.23476672172546387, + "learning_rate": 4.118077059166987e-05, + "loss": 0.1571, + "step": 19529 + }, + { + "epoch": 0.34833945706845504, + "grad_norm": 0.5951573848724365, + "learning_rate": 4.1179584043793925e-05, + "loss": 0.1862, + "step": 19530 + }, + { + "epoch": 0.3483572931901687, + "grad_norm": 0.3055584132671356, + "learning_rate": 4.117839743320034e-05, + "loss": 0.2119, + "step": 19531 + }, + { + "epoch": 0.3483751293118824, + "grad_norm": 0.23719768226146698, + "learning_rate": 4.117721075989369e-05, + "loss": 0.1363, + "step": 19532 + }, + { + "epoch": 0.3483929654335961, + "grad_norm": 0.32017260789871216, + "learning_rate": 4.117602402387859e-05, + "loss": 0.1906, + "step": 19533 + }, + { + "epoch": 0.3484108015553098, + "grad_norm": 0.2891557514667511, + "learning_rate": 4.1174837225159625e-05, + "loss": 0.2109, + "step": 19534 + }, + { + "epoch": 0.34842863767702353, + "grad_norm": 0.265910804271698, + "learning_rate": 4.117365036374141e-05, + "loss": 0.1904, + "step": 19535 + }, + { + "epoch": 0.3484464737987372, + "grad_norm": 0.19597332179546356, + "learning_rate": 4.1172463439628536e-05, + "loss": 0.1369, + "step": 19536 + }, + { + "epoch": 0.3484643099204509, + "grad_norm": 0.2930004894733429, + "learning_rate": 4.1171276452825614e-05, + "loss": 0.1263, + "step": 19537 + }, + { + "epoch": 0.3484821460421646, + "grad_norm": 0.32766494154930115, + "learning_rate": 4.117008940333724e-05, + "loss": 0.107, + "step": 19538 + }, + { + "epoch": 0.3484999821638783, + "grad_norm": 0.2567526698112488, + "learning_rate": 4.1168902291168006e-05, + "loss": 0.1937, + "step": 19539 + }, + { + "epoch": 0.348517818285592, + "grad_norm": 0.21148563921451569, + "learning_rate": 4.116771511632252e-05, + "loss": 0.1662, + "step": 19540 + }, + { + "epoch": 0.34853565440730566, + "grad_norm": 0.22094793617725372, + "learning_rate": 4.11665278788054e-05, + "loss": 0.1636, + "step": 19541 + }, + { + "epoch": 0.34855349052901935, + "grad_norm": 0.21725133061408997, + "learning_rate": 4.116534057862122e-05, + "loss": 0.1517, + "step": 19542 + }, + { + "epoch": 0.34857132665073304, + "grad_norm": 0.29790034890174866, + "learning_rate": 4.11641532157746e-05, + "loss": 0.2164, + "step": 19543 + }, + { + "epoch": 0.3485891627724468, + "grad_norm": 0.429353266954422, + "learning_rate": 4.1162965790270155e-05, + "loss": 0.1336, + "step": 19544 + }, + { + "epoch": 0.34860699889416047, + "grad_norm": 0.2449774593114853, + "learning_rate": 4.116177830211245e-05, + "loss": 0.1674, + "step": 19545 + }, + { + "epoch": 0.34862483501587416, + "grad_norm": 0.37135180830955505, + "learning_rate": 4.116059075130613e-05, + "loss": 0.2445, + "step": 19546 + }, + { + "epoch": 0.34864267113758785, + "grad_norm": 0.2782810628414154, + "learning_rate": 4.115940313785576e-05, + "loss": 0.1724, + "step": 19547 + }, + { + "epoch": 0.34866050725930153, + "grad_norm": 0.3206256926059723, + "learning_rate": 4.115821546176598e-05, + "loss": 0.1592, + "step": 19548 + }, + { + "epoch": 0.3486783433810152, + "grad_norm": 0.3780229389667511, + "learning_rate": 4.115702772304136e-05, + "loss": 0.185, + "step": 19549 + }, + { + "epoch": 0.3486961795027289, + "grad_norm": 0.24184750020503998, + "learning_rate": 4.1155839921686525e-05, + "loss": 0.1649, + "step": 19550 + }, + { + "epoch": 0.3487140156244426, + "grad_norm": 0.2922285199165344, + "learning_rate": 4.115465205770608e-05, + "loss": 0.1324, + "step": 19551 + }, + { + "epoch": 0.34873185174615634, + "grad_norm": 0.2572081685066223, + "learning_rate": 4.115346413110461e-05, + "loss": 0.1214, + "step": 19552 + }, + { + "epoch": 0.34874968786787003, + "grad_norm": 0.2920433282852173, + "learning_rate": 4.115227614188675e-05, + "loss": 0.1615, + "step": 19553 + }, + { + "epoch": 0.3487675239895837, + "grad_norm": 0.2628594636917114, + "learning_rate": 4.1151088090057085e-05, + "loss": 0.1218, + "step": 19554 + }, + { + "epoch": 0.3487853601112974, + "grad_norm": 0.2535925507545471, + "learning_rate": 4.114989997562022e-05, + "loss": 0.1527, + "step": 19555 + }, + { + "epoch": 0.3488031962330111, + "grad_norm": 0.29852116107940674, + "learning_rate": 4.114871179858076e-05, + "loss": 0.1039, + "step": 19556 + }, + { + "epoch": 0.3488210323547248, + "grad_norm": 0.2597878873348236, + "learning_rate": 4.114752355894333e-05, + "loss": 0.186, + "step": 19557 + }, + { + "epoch": 0.34883886847643847, + "grad_norm": 0.41820937395095825, + "learning_rate": 4.114633525671251e-05, + "loss": 0.1951, + "step": 19558 + }, + { + "epoch": 0.34885670459815216, + "grad_norm": 0.20156671106815338, + "learning_rate": 4.114514689189292e-05, + "loss": 0.1423, + "step": 19559 + }, + { + "epoch": 0.3488745407198659, + "grad_norm": 0.3613940477371216, + "learning_rate": 4.114395846448916e-05, + "loss": 0.1918, + "step": 19560 + }, + { + "epoch": 0.3488923768415796, + "grad_norm": 0.23352019488811493, + "learning_rate": 4.114276997450586e-05, + "loss": 0.1447, + "step": 19561 + }, + { + "epoch": 0.3489102129632933, + "grad_norm": 0.2274901419878006, + "learning_rate": 4.1141581421947586e-05, + "loss": 0.1693, + "step": 19562 + }, + { + "epoch": 0.34892804908500696, + "grad_norm": 0.3260687291622162, + "learning_rate": 4.1140392806818975e-05, + "loss": 0.1784, + "step": 19563 + }, + { + "epoch": 0.34894588520672065, + "grad_norm": 0.2123492807149887, + "learning_rate": 4.113920412912463e-05, + "loss": 0.1327, + "step": 19564 + }, + { + "epoch": 0.34896372132843434, + "grad_norm": 0.3055228590965271, + "learning_rate": 4.113801538886915e-05, + "loss": 0.1529, + "step": 19565 + }, + { + "epoch": 0.348981557450148, + "grad_norm": 0.35890069603919983, + "learning_rate": 4.1136826586057156e-05, + "loss": 0.1649, + "step": 19566 + }, + { + "epoch": 0.3489993935718617, + "grad_norm": 0.3369129002094269, + "learning_rate": 4.113563772069324e-05, + "loss": 0.1677, + "step": 19567 + }, + { + "epoch": 0.3490172296935754, + "grad_norm": 0.34517356753349304, + "learning_rate": 4.1134448792782034e-05, + "loss": 0.232, + "step": 19568 + }, + { + "epoch": 0.34903506581528915, + "grad_norm": 0.22877280414104462, + "learning_rate": 4.1133259802328116e-05, + "loss": 0.1532, + "step": 19569 + }, + { + "epoch": 0.34905290193700284, + "grad_norm": 0.27136847376823425, + "learning_rate": 4.1132070749336116e-05, + "loss": 0.1756, + "step": 19570 + }, + { + "epoch": 0.3490707380587165, + "grad_norm": 0.2570529580116272, + "learning_rate": 4.1130881633810635e-05, + "loss": 0.2028, + "step": 19571 + }, + { + "epoch": 0.3490885741804302, + "grad_norm": 0.311628520488739, + "learning_rate": 4.1129692455756294e-05, + "loss": 0.1573, + "step": 19572 + }, + { + "epoch": 0.3491064103021439, + "grad_norm": 0.3021625578403473, + "learning_rate": 4.112850321517768e-05, + "loss": 0.1662, + "step": 19573 + }, + { + "epoch": 0.3491242464238576, + "grad_norm": 0.31669682264328003, + "learning_rate": 4.112731391207943e-05, + "loss": 0.2294, + "step": 19574 + }, + { + "epoch": 0.3491420825455713, + "grad_norm": 0.28785020112991333, + "learning_rate": 4.112612454646613e-05, + "loss": 0.1785, + "step": 19575 + }, + { + "epoch": 0.34915991866728496, + "grad_norm": 0.3462860584259033, + "learning_rate": 4.1124935118342414e-05, + "loss": 0.2149, + "step": 19576 + }, + { + "epoch": 0.3491777547889987, + "grad_norm": 0.26719382405281067, + "learning_rate": 4.112374562771287e-05, + "loss": 0.1384, + "step": 19577 + }, + { + "epoch": 0.3491955909107124, + "grad_norm": 0.26933640241622925, + "learning_rate": 4.112255607458212e-05, + "loss": 0.1593, + "step": 19578 + }, + { + "epoch": 0.3492134270324261, + "grad_norm": 0.2914995849132538, + "learning_rate": 4.112136645895478e-05, + "loss": 0.1535, + "step": 19579 + }, + { + "epoch": 0.34923126315413977, + "grad_norm": 0.2068871706724167, + "learning_rate": 4.112017678083545e-05, + "loss": 0.1312, + "step": 19580 + }, + { + "epoch": 0.34924909927585346, + "grad_norm": 0.2618831396102905, + "learning_rate": 4.1118987040228754e-05, + "loss": 0.1925, + "step": 19581 + }, + { + "epoch": 0.34926693539756715, + "grad_norm": 0.3109544813632965, + "learning_rate": 4.111779723713929e-05, + "loss": 0.1459, + "step": 19582 + }, + { + "epoch": 0.34928477151928083, + "grad_norm": 0.2558040916919708, + "learning_rate": 4.111660737157168e-05, + "loss": 0.2371, + "step": 19583 + }, + { + "epoch": 0.3493026076409945, + "grad_norm": 0.2118171751499176, + "learning_rate": 4.111541744353054e-05, + "loss": 0.1606, + "step": 19584 + }, + { + "epoch": 0.3493204437627082, + "grad_norm": 0.2972525954246521, + "learning_rate": 4.111422745302046e-05, + "loss": 0.185, + "step": 19585 + }, + { + "epoch": 0.34933827988442195, + "grad_norm": 0.27432727813720703, + "learning_rate": 4.111303740004608e-05, + "loss": 0.1481, + "step": 19586 + }, + { + "epoch": 0.34935611600613564, + "grad_norm": 0.22611775994300842, + "learning_rate": 4.1111847284612e-05, + "loss": 0.1752, + "step": 19587 + }, + { + "epoch": 0.34937395212784933, + "grad_norm": 0.26797839999198914, + "learning_rate": 4.1110657106722826e-05, + "loss": 0.1658, + "step": 19588 + }, + { + "epoch": 0.349391788249563, + "grad_norm": 0.4832557439804077, + "learning_rate": 4.1109466866383195e-05, + "loss": 0.2024, + "step": 19589 + }, + { + "epoch": 0.3494096243712767, + "grad_norm": 0.2917376756668091, + "learning_rate": 4.11082765635977e-05, + "loss": 0.1771, + "step": 19590 + }, + { + "epoch": 0.3494274604929904, + "grad_norm": 0.22100360691547394, + "learning_rate": 4.1107086198370955e-05, + "loss": 0.1692, + "step": 19591 + }, + { + "epoch": 0.3494452966147041, + "grad_norm": 0.31009167432785034, + "learning_rate": 4.1105895770707596e-05, + "loss": 0.1777, + "step": 19592 + }, + { + "epoch": 0.34946313273641777, + "grad_norm": 0.2743508219718933, + "learning_rate": 4.110470528061221e-05, + "loss": 0.1872, + "step": 19593 + }, + { + "epoch": 0.3494809688581315, + "grad_norm": 0.2346644550561905, + "learning_rate": 4.110351472808943e-05, + "loss": 0.1463, + "step": 19594 + }, + { + "epoch": 0.3494988049798452, + "grad_norm": 0.3018113374710083, + "learning_rate": 4.110232411314386e-05, + "loss": 0.1966, + "step": 19595 + }, + { + "epoch": 0.3495166411015589, + "grad_norm": 0.22118301689624786, + "learning_rate": 4.1101133435780124e-05, + "loss": 0.1073, + "step": 19596 + }, + { + "epoch": 0.3495344772232726, + "grad_norm": 0.39103150367736816, + "learning_rate": 4.109994269600284e-05, + "loss": 0.1411, + "step": 19597 + }, + { + "epoch": 0.34955231334498627, + "grad_norm": 0.3187457323074341, + "learning_rate": 4.10987518938166e-05, + "loss": 0.1952, + "step": 19598 + }, + { + "epoch": 0.34957014946669995, + "grad_norm": 0.2902323305606842, + "learning_rate": 4.109756102922605e-05, + "loss": 0.1908, + "step": 19599 + }, + { + "epoch": 0.34958798558841364, + "grad_norm": 0.31453147530555725, + "learning_rate": 4.109637010223579e-05, + "loss": 0.1307, + "step": 19600 + }, + { + "epoch": 0.34960582171012733, + "grad_norm": 0.30840393900871277, + "learning_rate": 4.109517911285043e-05, + "loss": 0.2156, + "step": 19601 + }, + { + "epoch": 0.349623657831841, + "grad_norm": 0.49145349860191345, + "learning_rate": 4.1093988061074617e-05, + "loss": 0.14, + "step": 19602 + }, + { + "epoch": 0.34964149395355476, + "grad_norm": 0.19740381836891174, + "learning_rate": 4.1092796946912934e-05, + "loss": 0.1343, + "step": 19603 + }, + { + "epoch": 0.34965933007526845, + "grad_norm": 0.28548941016197205, + "learning_rate": 4.109160577037002e-05, + "loss": 0.1606, + "step": 19604 + }, + { + "epoch": 0.34967716619698214, + "grad_norm": 0.24940462410449982, + "learning_rate": 4.109041453145048e-05, + "loss": 0.1771, + "step": 19605 + }, + { + "epoch": 0.3496950023186958, + "grad_norm": 0.31209197640419006, + "learning_rate": 4.108922323015893e-05, + "loss": 0.1913, + "step": 19606 + }, + { + "epoch": 0.3497128384404095, + "grad_norm": 0.36832955479621887, + "learning_rate": 4.1088031866499996e-05, + "loss": 0.1897, + "step": 19607 + }, + { + "epoch": 0.3497306745621232, + "grad_norm": 0.34475061297416687, + "learning_rate": 4.1086840440478305e-05, + "loss": 0.1799, + "step": 19608 + }, + { + "epoch": 0.3497485106838369, + "grad_norm": 0.24442026019096375, + "learning_rate": 4.1085648952098456e-05, + "loss": 0.1307, + "step": 19609 + }, + { + "epoch": 0.3497663468055506, + "grad_norm": 0.21195891499519348, + "learning_rate": 4.108445740136507e-05, + "loss": 0.1738, + "step": 19610 + }, + { + "epoch": 0.3497841829272643, + "grad_norm": 0.21677544713020325, + "learning_rate": 4.108326578828278e-05, + "loss": 0.1351, + "step": 19611 + }, + { + "epoch": 0.349802019048978, + "grad_norm": 0.28859928250312805, + "learning_rate": 4.1082074112856196e-05, + "loss": 0.1642, + "step": 19612 + }, + { + "epoch": 0.3498198551706917, + "grad_norm": 0.22792096436023712, + "learning_rate": 4.108088237508993e-05, + "loss": 0.1147, + "step": 19613 + }, + { + "epoch": 0.3498376912924054, + "grad_norm": 0.2346087098121643, + "learning_rate": 4.107969057498862e-05, + "loss": 0.1976, + "step": 19614 + }, + { + "epoch": 0.34985552741411907, + "grad_norm": 0.20971044898033142, + "learning_rate": 4.1078498712556876e-05, + "loss": 0.1455, + "step": 19615 + }, + { + "epoch": 0.34987336353583276, + "grad_norm": 0.2765769958496094, + "learning_rate": 4.107730678779932e-05, + "loss": 0.2243, + "step": 19616 + }, + { + "epoch": 0.34989119965754645, + "grad_norm": 0.26900607347488403, + "learning_rate": 4.1076114800720556e-05, + "loss": 0.1952, + "step": 19617 + }, + { + "epoch": 0.34990903577926014, + "grad_norm": 0.21946214139461517, + "learning_rate": 4.1074922751325226e-05, + "loss": 0.1205, + "step": 19618 + }, + { + "epoch": 0.3499268719009739, + "grad_norm": 0.24247460067272186, + "learning_rate": 4.1073730639617945e-05, + "loss": 0.1774, + "step": 19619 + }, + { + "epoch": 0.34994470802268757, + "grad_norm": 0.28337591886520386, + "learning_rate": 4.107253846560333e-05, + "loss": 0.1297, + "step": 19620 + }, + { + "epoch": 0.34996254414440126, + "grad_norm": 0.24794021248817444, + "learning_rate": 4.1071346229286005e-05, + "loss": 0.1442, + "step": 19621 + }, + { + "epoch": 0.34998038026611494, + "grad_norm": 0.2338310033082962, + "learning_rate": 4.1070153930670597e-05, + "loss": 0.173, + "step": 19622 + }, + { + "epoch": 0.34999821638782863, + "grad_norm": 0.23294194042682648, + "learning_rate": 4.106896156976171e-05, + "loss": 0.1742, + "step": 19623 + }, + { + "epoch": 0.3500160525095423, + "grad_norm": 0.3466554582118988, + "learning_rate": 4.106776914656399e-05, + "loss": 0.1234, + "step": 19624 + }, + { + "epoch": 0.350033888631256, + "grad_norm": 0.2669873833656311, + "learning_rate": 4.106657666108204e-05, + "loss": 0.1773, + "step": 19625 + }, + { + "epoch": 0.3500517247529697, + "grad_norm": 0.34078970551490784, + "learning_rate": 4.106538411332049e-05, + "loss": 0.184, + "step": 19626 + }, + { + "epoch": 0.3500695608746834, + "grad_norm": 0.2385529726743698, + "learning_rate": 4.1064191503283966e-05, + "loss": 0.1375, + "step": 19627 + }, + { + "epoch": 0.3500873969963971, + "grad_norm": 0.2472548633813858, + "learning_rate": 4.106299883097708e-05, + "loss": 0.15, + "step": 19628 + }, + { + "epoch": 0.3501052331181108, + "grad_norm": 0.21401570737361908, + "learning_rate": 4.1061806096404476e-05, + "loss": 0.1167, + "step": 19629 + }, + { + "epoch": 0.3501230692398245, + "grad_norm": 0.2543730139732361, + "learning_rate": 4.106061329957075e-05, + "loss": 0.154, + "step": 19630 + }, + { + "epoch": 0.3501409053615382, + "grad_norm": 0.22819173336029053, + "learning_rate": 4.105942044048055e-05, + "loss": 0.1897, + "step": 19631 + }, + { + "epoch": 0.3501587414832519, + "grad_norm": 0.2190040498971939, + "learning_rate": 4.1058227519138484e-05, + "loss": 0.1729, + "step": 19632 + }, + { + "epoch": 0.35017657760496557, + "grad_norm": 0.37843620777130127, + "learning_rate": 4.1057034535549174e-05, + "loss": 0.2404, + "step": 19633 + }, + { + "epoch": 0.35019441372667925, + "grad_norm": 0.2770199477672577, + "learning_rate": 4.105584148971726e-05, + "loss": 0.0757, + "step": 19634 + }, + { + "epoch": 0.35021224984839294, + "grad_norm": 0.17997154593467712, + "learning_rate": 4.105464838164737e-05, + "loss": 0.125, + "step": 19635 + }, + { + "epoch": 0.3502300859701067, + "grad_norm": 0.22524304687976837, + "learning_rate": 4.1053455211344105e-05, + "loss": 0.1405, + "step": 19636 + }, + { + "epoch": 0.3502479220918204, + "grad_norm": 0.26091763377189636, + "learning_rate": 4.1052261978812104e-05, + "loss": 0.1441, + "step": 19637 + }, + { + "epoch": 0.35026575821353406, + "grad_norm": 0.3988263010978699, + "learning_rate": 4.105106868405599e-05, + "loss": 0.1399, + "step": 19638 + }, + { + "epoch": 0.35028359433524775, + "grad_norm": 0.30785071849823, + "learning_rate": 4.10498753270804e-05, + "loss": 0.1576, + "step": 19639 + }, + { + "epoch": 0.35030143045696144, + "grad_norm": 0.23383331298828125, + "learning_rate": 4.1048681907889934e-05, + "loss": 0.1434, + "step": 19640 + }, + { + "epoch": 0.3503192665786751, + "grad_norm": 0.22138716280460358, + "learning_rate": 4.1047488426489244e-05, + "loss": 0.1385, + "step": 19641 + }, + { + "epoch": 0.3503371027003888, + "grad_norm": 0.21598877012729645, + "learning_rate": 4.104629488288294e-05, + "loss": 0.2074, + "step": 19642 + }, + { + "epoch": 0.3503549388221025, + "grad_norm": 0.25582030415534973, + "learning_rate": 4.1045101277075665e-05, + "loss": 0.1636, + "step": 19643 + }, + { + "epoch": 0.3503727749438162, + "grad_norm": 0.24900773167610168, + "learning_rate": 4.104390760907203e-05, + "loss": 0.1566, + "step": 19644 + }, + { + "epoch": 0.35039061106552993, + "grad_norm": 0.3139323890209198, + "learning_rate": 4.104271387887667e-05, + "loss": 0.1398, + "step": 19645 + }, + { + "epoch": 0.3504084471872436, + "grad_norm": 0.2185821235179901, + "learning_rate": 4.104152008649421e-05, + "loss": 0.1529, + "step": 19646 + }, + { + "epoch": 0.3504262833089573, + "grad_norm": 0.20959828794002533, + "learning_rate": 4.1040326231929275e-05, + "loss": 0.1191, + "step": 19647 + }, + { + "epoch": 0.350444119430671, + "grad_norm": 0.24508103728294373, + "learning_rate": 4.1039132315186495e-05, + "loss": 0.1931, + "step": 19648 + }, + { + "epoch": 0.3504619555523847, + "grad_norm": 0.2361128032207489, + "learning_rate": 4.10379383362705e-05, + "loss": 0.1878, + "step": 19649 + }, + { + "epoch": 0.3504797916740984, + "grad_norm": 0.2508060038089752, + "learning_rate": 4.1036744295185913e-05, + "loss": 0.1323, + "step": 19650 + }, + { + "epoch": 0.35049762779581206, + "grad_norm": 0.2504958510398865, + "learning_rate": 4.103555019193737e-05, + "loss": 0.1508, + "step": 19651 + }, + { + "epoch": 0.35051546391752575, + "grad_norm": 0.3027316629886627, + "learning_rate": 4.10343560265295e-05, + "loss": 0.1755, + "step": 19652 + }, + { + "epoch": 0.3505333000392395, + "grad_norm": 0.21214164793491364, + "learning_rate": 4.103316179896691e-05, + "loss": 0.1456, + "step": 19653 + }, + { + "epoch": 0.3505511361609532, + "grad_norm": 0.3208407461643219, + "learning_rate": 4.1031967509254266e-05, + "loss": 0.1793, + "step": 19654 + }, + { + "epoch": 0.35056897228266687, + "grad_norm": 0.19025740027427673, + "learning_rate": 4.103077315739618e-05, + "loss": 0.1516, + "step": 19655 + }, + { + "epoch": 0.35058680840438056, + "grad_norm": 0.22745627164840698, + "learning_rate": 4.102957874339727e-05, + "loss": 0.1727, + "step": 19656 + }, + { + "epoch": 0.35060464452609424, + "grad_norm": 0.3025979697704315, + "learning_rate": 4.1028384267262176e-05, + "loss": 0.1766, + "step": 19657 + }, + { + "epoch": 0.35062248064780793, + "grad_norm": 0.24098078906536102, + "learning_rate": 4.102718972899553e-05, + "loss": 0.1768, + "step": 19658 + }, + { + "epoch": 0.3506403167695216, + "grad_norm": 0.29610326886177063, + "learning_rate": 4.1025995128601966e-05, + "loss": 0.1375, + "step": 19659 + }, + { + "epoch": 0.3506581528912353, + "grad_norm": 0.2935595214366913, + "learning_rate": 4.102480046608611e-05, + "loss": 0.1333, + "step": 19660 + }, + { + "epoch": 0.35067598901294905, + "grad_norm": 0.23132145404815674, + "learning_rate": 4.1023605741452586e-05, + "loss": 0.1525, + "step": 19661 + }, + { + "epoch": 0.35069382513466274, + "grad_norm": 0.26081085205078125, + "learning_rate": 4.102241095470604e-05, + "loss": 0.1617, + "step": 19662 + }, + { + "epoch": 0.35071166125637643, + "grad_norm": 0.38519376516342163, + "learning_rate": 4.102121610585109e-05, + "loss": 0.1971, + "step": 19663 + }, + { + "epoch": 0.3507294973780901, + "grad_norm": 0.286912202835083, + "learning_rate": 4.102002119489237e-05, + "loss": 0.154, + "step": 19664 + }, + { + "epoch": 0.3507473334998038, + "grad_norm": 0.2111155390739441, + "learning_rate": 4.101882622183452e-05, + "loss": 0.1563, + "step": 19665 + }, + { + "epoch": 0.3507651696215175, + "grad_norm": 0.32270610332489014, + "learning_rate": 4.101763118668216e-05, + "loss": 0.248, + "step": 19666 + }, + { + "epoch": 0.3507830057432312, + "grad_norm": 0.24495652318000793, + "learning_rate": 4.101643608943994e-05, + "loss": 0.1455, + "step": 19667 + }, + { + "epoch": 0.35080084186494487, + "grad_norm": 0.41668838262557983, + "learning_rate": 4.101524093011247e-05, + "loss": 0.1601, + "step": 19668 + }, + { + "epoch": 0.35081867798665856, + "grad_norm": 0.26712530851364136, + "learning_rate": 4.1014045708704404e-05, + "loss": 0.1853, + "step": 19669 + }, + { + "epoch": 0.3508365141083723, + "grad_norm": 0.23040862381458282, + "learning_rate": 4.101285042522036e-05, + "loss": 0.1684, + "step": 19670 + }, + { + "epoch": 0.350854350230086, + "grad_norm": 0.263711154460907, + "learning_rate": 4.1011655079664976e-05, + "loss": 0.1524, + "step": 19671 + }, + { + "epoch": 0.3508721863517997, + "grad_norm": 0.22974510490894318, + "learning_rate": 4.10104596720429e-05, + "loss": 0.1093, + "step": 19672 + }, + { + "epoch": 0.35089002247351336, + "grad_norm": 0.2576884329319, + "learning_rate": 4.1009264202358735e-05, + "loss": 0.1515, + "step": 19673 + }, + { + "epoch": 0.35090785859522705, + "grad_norm": 0.28994515538215637, + "learning_rate": 4.100806867061714e-05, + "loss": 0.065, + "step": 19674 + }, + { + "epoch": 0.35092569471694074, + "grad_norm": 0.25028344988822937, + "learning_rate": 4.100687307682274e-05, + "loss": 0.1496, + "step": 19675 + }, + { + "epoch": 0.3509435308386544, + "grad_norm": 0.29133617877960205, + "learning_rate": 4.100567742098018e-05, + "loss": 0.1464, + "step": 19676 + }, + { + "epoch": 0.3509613669603681, + "grad_norm": 0.27885901927948, + "learning_rate": 4.1004481703094075e-05, + "loss": 0.1661, + "step": 19677 + }, + { + "epoch": 0.35097920308208186, + "grad_norm": 0.2527541220188141, + "learning_rate": 4.100328592316908e-05, + "loss": 0.1534, + "step": 19678 + }, + { + "epoch": 0.35099703920379555, + "grad_norm": 0.29238662123680115, + "learning_rate": 4.100209008120981e-05, + "loss": 0.1773, + "step": 19679 + }, + { + "epoch": 0.35101487532550923, + "grad_norm": 0.2656225562095642, + "learning_rate": 4.100089417722092e-05, + "loss": 0.1159, + "step": 19680 + }, + { + "epoch": 0.3510327114472229, + "grad_norm": 0.2631196677684784, + "learning_rate": 4.0999698211207036e-05, + "loss": 0.1862, + "step": 19681 + }, + { + "epoch": 0.3510505475689366, + "grad_norm": 0.2563972771167755, + "learning_rate": 4.09985021831728e-05, + "loss": 0.1464, + "step": 19682 + }, + { + "epoch": 0.3510683836906503, + "grad_norm": 0.2973809540271759, + "learning_rate": 4.099730609312284e-05, + "loss": 0.173, + "step": 19683 + }, + { + "epoch": 0.351086219812364, + "grad_norm": 0.37659335136413574, + "learning_rate": 4.099610994106179e-05, + "loss": 0.1368, + "step": 19684 + }, + { + "epoch": 0.3511040559340777, + "grad_norm": 0.25841158628463745, + "learning_rate": 4.09949137269943e-05, + "loss": 0.1539, + "step": 19685 + }, + { + "epoch": 0.35112189205579136, + "grad_norm": 0.24447759985923767, + "learning_rate": 4.0993717450925e-05, + "loss": 0.1729, + "step": 19686 + }, + { + "epoch": 0.3511397281775051, + "grad_norm": 0.5070403814315796, + "learning_rate": 4.099252111285853e-05, + "loss": 0.2365, + "step": 19687 + }, + { + "epoch": 0.3511575642992188, + "grad_norm": 0.29022204875946045, + "learning_rate": 4.099132471279952e-05, + "loss": 0.1988, + "step": 19688 + }, + { + "epoch": 0.3511754004209325, + "grad_norm": 0.2991228997707367, + "learning_rate": 4.0990128250752614e-05, + "loss": 0.1468, + "step": 19689 + }, + { + "epoch": 0.35119323654264617, + "grad_norm": 0.3976593315601349, + "learning_rate": 4.098893172672245e-05, + "loss": 0.1826, + "step": 19690 + }, + { + "epoch": 0.35121107266435986, + "grad_norm": 0.20655082166194916, + "learning_rate": 4.0987735140713656e-05, + "loss": 0.1809, + "step": 19691 + }, + { + "epoch": 0.35122890878607355, + "grad_norm": 0.30448174476623535, + "learning_rate": 4.098653849273088e-05, + "loss": 0.1702, + "step": 19692 + }, + { + "epoch": 0.35124674490778723, + "grad_norm": 0.33953171968460083, + "learning_rate": 4.0985341782778764e-05, + "loss": 0.1701, + "step": 19693 + }, + { + "epoch": 0.3512645810295009, + "grad_norm": 0.19479066133499146, + "learning_rate": 4.098414501086194e-05, + "loss": 0.1711, + "step": 19694 + }, + { + "epoch": 0.35128241715121467, + "grad_norm": 0.22470815479755402, + "learning_rate": 4.0982948176985056e-05, + "loss": 0.1716, + "step": 19695 + }, + { + "epoch": 0.35130025327292835, + "grad_norm": 0.41108715534210205, + "learning_rate": 4.098175128115273e-05, + "loss": 0.1797, + "step": 19696 + }, + { + "epoch": 0.35131808939464204, + "grad_norm": 0.2554748058319092, + "learning_rate": 4.098055432336963e-05, + "loss": 0.1931, + "step": 19697 + }, + { + "epoch": 0.35133592551635573, + "grad_norm": 0.2931663393974304, + "learning_rate": 4.097935730364038e-05, + "loss": 0.1631, + "step": 19698 + }, + { + "epoch": 0.3513537616380694, + "grad_norm": 0.2234746664762497, + "learning_rate": 4.0978160221969616e-05, + "loss": 0.1494, + "step": 19699 + }, + { + "epoch": 0.3513715977597831, + "grad_norm": 0.19326429069042206, + "learning_rate": 4.097696307836199e-05, + "loss": 0.1759, + "step": 19700 + }, + { + "epoch": 0.3513894338814968, + "grad_norm": 0.2622203528881073, + "learning_rate": 4.097576587282214e-05, + "loss": 0.1594, + "step": 19701 + }, + { + "epoch": 0.3514072700032105, + "grad_norm": 0.2425583451986313, + "learning_rate": 4.097456860535469e-05, + "loss": 0.1997, + "step": 19702 + }, + { + "epoch": 0.35142510612492417, + "grad_norm": 0.25813931226730347, + "learning_rate": 4.09733712759643e-05, + "loss": 0.2243, + "step": 19703 + }, + { + "epoch": 0.3514429422466379, + "grad_norm": 0.19891194999217987, + "learning_rate": 4.097217388465561e-05, + "loss": 0.139, + "step": 19704 + }, + { + "epoch": 0.3514607783683516, + "grad_norm": 0.1712683141231537, + "learning_rate": 4.097097643143325e-05, + "loss": 0.1432, + "step": 19705 + }, + { + "epoch": 0.3514786144900653, + "grad_norm": 0.23824673891067505, + "learning_rate": 4.096977891630188e-05, + "loss": 0.1327, + "step": 19706 + }, + { + "epoch": 0.351496450611779, + "grad_norm": 0.23117029666900635, + "learning_rate": 4.0968581339266125e-05, + "loss": 0.1118, + "step": 19707 + }, + { + "epoch": 0.35151428673349266, + "grad_norm": 0.24423429369926453, + "learning_rate": 4.096738370033064e-05, + "loss": 0.1837, + "step": 19708 + }, + { + "epoch": 0.35153212285520635, + "grad_norm": 0.2343129962682724, + "learning_rate": 4.096618599950005e-05, + "loss": 0.1345, + "step": 19709 + }, + { + "epoch": 0.35154995897692004, + "grad_norm": 0.36951953172683716, + "learning_rate": 4.096498823677901e-05, + "loss": 0.1884, + "step": 19710 + }, + { + "epoch": 0.35156779509863373, + "grad_norm": 0.2731028199195862, + "learning_rate": 4.096379041217217e-05, + "loss": 0.1782, + "step": 19711 + }, + { + "epoch": 0.35158563122034747, + "grad_norm": 0.3639281094074249, + "learning_rate": 4.096259252568416e-05, + "loss": 0.2356, + "step": 19712 + }, + { + "epoch": 0.35160346734206116, + "grad_norm": 0.23560945689678192, + "learning_rate": 4.0961394577319626e-05, + "loss": 0.1997, + "step": 19713 + }, + { + "epoch": 0.35162130346377485, + "grad_norm": 0.25245076417922974, + "learning_rate": 4.096019656708322e-05, + "loss": 0.1422, + "step": 19714 + }, + { + "epoch": 0.35163913958548854, + "grad_norm": 0.3055262565612793, + "learning_rate": 4.095899849497957e-05, + "loss": 0.1948, + "step": 19715 + }, + { + "epoch": 0.3516569757072022, + "grad_norm": 0.28720512986183167, + "learning_rate": 4.095780036101333e-05, + "loss": 0.1164, + "step": 19716 + }, + { + "epoch": 0.3516748118289159, + "grad_norm": 0.32368525862693787, + "learning_rate": 4.095660216518916e-05, + "loss": 0.2153, + "step": 19717 + }, + { + "epoch": 0.3516926479506296, + "grad_norm": 0.2521657645702362, + "learning_rate": 4.0955403907511675e-05, + "loss": 0.1607, + "step": 19718 + }, + { + "epoch": 0.3517104840723433, + "grad_norm": 0.19980616867542267, + "learning_rate": 4.095420558798554e-05, + "loss": 0.1521, + "step": 19719 + }, + { + "epoch": 0.35172832019405703, + "grad_norm": 0.2124643474817276, + "learning_rate": 4.095300720661538e-05, + "loss": 0.1571, + "step": 19720 + }, + { + "epoch": 0.3517461563157707, + "grad_norm": 0.33116739988327026, + "learning_rate": 4.095180876340588e-05, + "loss": 0.1195, + "step": 19721 + }, + { + "epoch": 0.3517639924374844, + "grad_norm": 0.26021748781204224, + "learning_rate": 4.095061025836163e-05, + "loss": 0.1726, + "step": 19722 + }, + { + "epoch": 0.3517818285591981, + "grad_norm": 0.29108476638793945, + "learning_rate": 4.094941169148732e-05, + "loss": 0.1629, + "step": 19723 + }, + { + "epoch": 0.3517996646809118, + "grad_norm": 0.2202785611152649, + "learning_rate": 4.094821306278759e-05, + "loss": 0.1679, + "step": 19724 + }, + { + "epoch": 0.35181750080262547, + "grad_norm": 0.22796787321567535, + "learning_rate": 4.094701437226707e-05, + "loss": 0.1349, + "step": 19725 + }, + { + "epoch": 0.35183533692433916, + "grad_norm": 0.26325109601020813, + "learning_rate": 4.094581561993041e-05, + "loss": 0.1563, + "step": 19726 + }, + { + "epoch": 0.35185317304605285, + "grad_norm": 0.2852124273777008, + "learning_rate": 4.094461680578226e-05, + "loss": 0.2054, + "step": 19727 + }, + { + "epoch": 0.35187100916776654, + "grad_norm": 0.2270687222480774, + "learning_rate": 4.094341792982728e-05, + "loss": 0.1512, + "step": 19728 + }, + { + "epoch": 0.3518888452894803, + "grad_norm": 0.2668840289115906, + "learning_rate": 4.094221899207009e-05, + "loss": 0.1547, + "step": 19729 + }, + { + "epoch": 0.35190668141119397, + "grad_norm": 0.27421703934669495, + "learning_rate": 4.094101999251536e-05, + "loss": 0.1644, + "step": 19730 + }, + { + "epoch": 0.35192451753290765, + "grad_norm": 0.2846114933490753, + "learning_rate": 4.093982093116773e-05, + "loss": 0.1211, + "step": 19731 + }, + { + "epoch": 0.35194235365462134, + "grad_norm": 0.3455972969532013, + "learning_rate": 4.093862180803185e-05, + "loss": 0.1644, + "step": 19732 + }, + { + "epoch": 0.35196018977633503, + "grad_norm": 0.2865823209285736, + "learning_rate": 4.0937422623112367e-05, + "loss": 0.167, + "step": 19733 + }, + { + "epoch": 0.3519780258980487, + "grad_norm": 0.25498536229133606, + "learning_rate": 4.0936223376413926e-05, + "loss": 0.1926, + "step": 19734 + }, + { + "epoch": 0.3519958620197624, + "grad_norm": 0.25609761476516724, + "learning_rate": 4.093502406794118e-05, + "loss": 0.1512, + "step": 19735 + }, + { + "epoch": 0.3520136981414761, + "grad_norm": 0.23590657114982605, + "learning_rate": 4.0933824697698786e-05, + "loss": 0.1502, + "step": 19736 + }, + { + "epoch": 0.35203153426318984, + "grad_norm": 0.25410187244415283, + "learning_rate": 4.093262526569138e-05, + "loss": 0.1249, + "step": 19737 + }, + { + "epoch": 0.3520493703849035, + "grad_norm": 0.2516137659549713, + "learning_rate": 4.0931425771923616e-05, + "loss": 0.156, + "step": 19738 + }, + { + "epoch": 0.3520672065066172, + "grad_norm": 0.3644959330558777, + "learning_rate": 4.0930226216400134e-05, + "loss": 0.2473, + "step": 19739 + }, + { + "epoch": 0.3520850426283309, + "grad_norm": 0.28251513838768005, + "learning_rate": 4.09290265991256e-05, + "loss": 0.152, + "step": 19740 + }, + { + "epoch": 0.3521028787500446, + "grad_norm": 0.30688774585723877, + "learning_rate": 4.092782692010466e-05, + "loss": 0.237, + "step": 19741 + }, + { + "epoch": 0.3521207148717583, + "grad_norm": 0.2338908314704895, + "learning_rate": 4.0926627179341957e-05, + "loss": 0.1657, + "step": 19742 + }, + { + "epoch": 0.35213855099347197, + "grad_norm": 0.2764683663845062, + "learning_rate": 4.0925427376842146e-05, + "loss": 0.206, + "step": 19743 + }, + { + "epoch": 0.35215638711518565, + "grad_norm": 0.3336687982082367, + "learning_rate": 4.092422751260988e-05, + "loss": 0.1607, + "step": 19744 + }, + { + "epoch": 0.35217422323689934, + "grad_norm": 0.22442187368869781, + "learning_rate": 4.092302758664981e-05, + "loss": 0.19, + "step": 19745 + }, + { + "epoch": 0.3521920593586131, + "grad_norm": 0.2398339807987213, + "learning_rate": 4.092182759896658e-05, + "loss": 0.1636, + "step": 19746 + }, + { + "epoch": 0.3522098954803268, + "grad_norm": 0.21279588341712952, + "learning_rate": 4.092062754956485e-05, + "loss": 0.1641, + "step": 19747 + }, + { + "epoch": 0.35222773160204046, + "grad_norm": 0.19786347448825836, + "learning_rate": 4.0919427438449265e-05, + "loss": 0.1436, + "step": 19748 + }, + { + "epoch": 0.35224556772375415, + "grad_norm": 0.3153798580169678, + "learning_rate": 4.091822726562449e-05, + "loss": 0.13, + "step": 19749 + }, + { + "epoch": 0.35226340384546784, + "grad_norm": 0.2867707908153534, + "learning_rate": 4.091702703109516e-05, + "loss": 0.1635, + "step": 19750 + }, + { + "epoch": 0.3522812399671815, + "grad_norm": 0.3336258828639984, + "learning_rate": 4.0915826734865934e-05, + "loss": 0.1836, + "step": 19751 + }, + { + "epoch": 0.3522990760888952, + "grad_norm": 0.31477659940719604, + "learning_rate": 4.091462637694147e-05, + "loss": 0.1516, + "step": 19752 + }, + { + "epoch": 0.3523169122106089, + "grad_norm": 0.2180374413728714, + "learning_rate": 4.0913425957326424e-05, + "loss": 0.1147, + "step": 19753 + }, + { + "epoch": 0.35233474833232264, + "grad_norm": 0.2784854471683502, + "learning_rate": 4.0912225476025435e-05, + "loss": 0.1858, + "step": 19754 + }, + { + "epoch": 0.35235258445403633, + "grad_norm": 0.34138286113739014, + "learning_rate": 4.091102493304316e-05, + "loss": 0.1917, + "step": 19755 + }, + { + "epoch": 0.35237042057575, + "grad_norm": 0.3120967745780945, + "learning_rate": 4.0909824328384263e-05, + "loss": 0.16, + "step": 19756 + }, + { + "epoch": 0.3523882566974637, + "grad_norm": 0.2316724956035614, + "learning_rate": 4.090862366205339e-05, + "loss": 0.1259, + "step": 19757 + }, + { + "epoch": 0.3524060928191774, + "grad_norm": 0.27696239948272705, + "learning_rate": 4.090742293405521e-05, + "loss": 0.1774, + "step": 19758 + }, + { + "epoch": 0.3524239289408911, + "grad_norm": 0.36350706219673157, + "learning_rate": 4.090622214439435e-05, + "loss": 0.1314, + "step": 19759 + }, + { + "epoch": 0.3524417650626048, + "grad_norm": 0.2812734842300415, + "learning_rate": 4.090502129307547e-05, + "loss": 0.1733, + "step": 19760 + }, + { + "epoch": 0.35245960118431846, + "grad_norm": 0.23775143921375275, + "learning_rate": 4.090382038010325e-05, + "loss": 0.1698, + "step": 19761 + }, + { + "epoch": 0.35247743730603215, + "grad_norm": 0.26190581917762756, + "learning_rate": 4.090261940548233e-05, + "loss": 0.1379, + "step": 19762 + }, + { + "epoch": 0.3524952734277459, + "grad_norm": 0.39719924330711365, + "learning_rate": 4.090141836921736e-05, + "loss": 0.1644, + "step": 19763 + }, + { + "epoch": 0.3525131095494596, + "grad_norm": 0.3248538374900818, + "learning_rate": 4.090021727131299e-05, + "loss": 0.1423, + "step": 19764 + }, + { + "epoch": 0.35253094567117327, + "grad_norm": 0.2315378040075302, + "learning_rate": 4.0899016111773905e-05, + "loss": 0.1686, + "step": 19765 + }, + { + "epoch": 0.35254878179288696, + "grad_norm": 0.2608569264411926, + "learning_rate": 4.0897814890604734e-05, + "loss": 0.1671, + "step": 19766 + }, + { + "epoch": 0.35256661791460064, + "grad_norm": 0.3050379455089569, + "learning_rate": 4.089661360781014e-05, + "loss": 0.1599, + "step": 19767 + }, + { + "epoch": 0.35258445403631433, + "grad_norm": 0.3049139082431793, + "learning_rate": 4.089541226339478e-05, + "loss": 0.1385, + "step": 19768 + }, + { + "epoch": 0.352602290158028, + "grad_norm": 0.223703995347023, + "learning_rate": 4.089421085736332e-05, + "loss": 0.1795, + "step": 19769 + }, + { + "epoch": 0.3526201262797417, + "grad_norm": 0.2346150428056717, + "learning_rate": 4.08930093897204e-05, + "loss": 0.1319, + "step": 19770 + }, + { + "epoch": 0.35263796240145545, + "grad_norm": 0.670360267162323, + "learning_rate": 4.089180786047069e-05, + "loss": 0.2171, + "step": 19771 + }, + { + "epoch": 0.35265579852316914, + "grad_norm": 0.24883748590946198, + "learning_rate": 4.0890606269618835e-05, + "loss": 0.1175, + "step": 19772 + }, + { + "epoch": 0.3526736346448828, + "grad_norm": 0.2740766704082489, + "learning_rate": 4.0889404617169516e-05, + "loss": 0.1714, + "step": 19773 + }, + { + "epoch": 0.3526914707665965, + "grad_norm": 0.39677295088768005, + "learning_rate": 4.088820290312737e-05, + "loss": 0.147, + "step": 19774 + }, + { + "epoch": 0.3527093068883102, + "grad_norm": 0.24014437198638916, + "learning_rate": 4.088700112749706e-05, + "loss": 0.1636, + "step": 19775 + }, + { + "epoch": 0.3527271430100239, + "grad_norm": 0.14541755616664886, + "learning_rate": 4.088579929028326e-05, + "loss": 0.0994, + "step": 19776 + }, + { + "epoch": 0.3527449791317376, + "grad_norm": 0.1968134194612503, + "learning_rate": 4.0884597391490595e-05, + "loss": 0.1088, + "step": 19777 + }, + { + "epoch": 0.35276281525345127, + "grad_norm": 0.32403284311294556, + "learning_rate": 4.088339543112375e-05, + "loss": 0.1326, + "step": 19778 + }, + { + "epoch": 0.352780651375165, + "grad_norm": 0.2569282054901123, + "learning_rate": 4.088219340918739e-05, + "loss": 0.1512, + "step": 19779 + }, + { + "epoch": 0.3527984874968787, + "grad_norm": 0.20336899161338806, + "learning_rate": 4.088099132568616e-05, + "loss": 0.1673, + "step": 19780 + }, + { + "epoch": 0.3528163236185924, + "grad_norm": 0.2858489155769348, + "learning_rate": 4.0879789180624705e-05, + "loss": 0.1655, + "step": 19781 + }, + { + "epoch": 0.3528341597403061, + "grad_norm": 0.29759639501571655, + "learning_rate": 4.087858697400771e-05, + "loss": 0.236, + "step": 19782 + }, + { + "epoch": 0.35285199586201976, + "grad_norm": 0.20409435033798218, + "learning_rate": 4.087738470583984e-05, + "loss": 0.1323, + "step": 19783 + }, + { + "epoch": 0.35286983198373345, + "grad_norm": 0.26103392243385315, + "learning_rate": 4.087618237612574e-05, + "loss": 0.174, + "step": 19784 + }, + { + "epoch": 0.35288766810544714, + "grad_norm": 0.22956660389900208, + "learning_rate": 4.087497998487006e-05, + "loss": 0.1262, + "step": 19785 + }, + { + "epoch": 0.3529055042271608, + "grad_norm": 0.22196048498153687, + "learning_rate": 4.087377753207749e-05, + "loss": 0.1761, + "step": 19786 + }, + { + "epoch": 0.3529233403488745, + "grad_norm": 0.26203569769859314, + "learning_rate": 4.087257501775267e-05, + "loss": 0.1635, + "step": 19787 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.305367112159729, + "learning_rate": 4.087137244190026e-05, + "loss": 0.2214, + "step": 19788 + }, + { + "epoch": 0.35295901259230195, + "grad_norm": 0.2782062590122223, + "learning_rate": 4.087016980452494e-05, + "loss": 0.1567, + "step": 19789 + }, + { + "epoch": 0.35297684871401563, + "grad_norm": 0.45773640275001526, + "learning_rate": 4.086896710563135e-05, + "loss": 0.1742, + "step": 19790 + }, + { + "epoch": 0.3529946848357293, + "grad_norm": 0.253086656332016, + "learning_rate": 4.086776434522417e-05, + "loss": 0.146, + "step": 19791 + }, + { + "epoch": 0.353012520957443, + "grad_norm": 0.2581816017627716, + "learning_rate": 4.086656152330805e-05, + "loss": 0.1804, + "step": 19792 + }, + { + "epoch": 0.3530303570791567, + "grad_norm": 0.22439709305763245, + "learning_rate": 4.086535863988766e-05, + "loss": 0.1503, + "step": 19793 + }, + { + "epoch": 0.3530481932008704, + "grad_norm": 0.2890036404132843, + "learning_rate": 4.086415569496767e-05, + "loss": 0.1983, + "step": 19794 + }, + { + "epoch": 0.3530660293225841, + "grad_norm": 0.3186407685279846, + "learning_rate": 4.086295268855271e-05, + "loss": 0.1844, + "step": 19795 + }, + { + "epoch": 0.3530838654442978, + "grad_norm": 0.26146066188812256, + "learning_rate": 4.0861749620647484e-05, + "loss": 0.166, + "step": 19796 + }, + { + "epoch": 0.3531017015660115, + "grad_norm": 0.21023817360401154, + "learning_rate": 4.086054649125664e-05, + "loss": 0.0948, + "step": 19797 + }, + { + "epoch": 0.3531195376877252, + "grad_norm": 0.3218429684638977, + "learning_rate": 4.085934330038483e-05, + "loss": 0.1614, + "step": 19798 + }, + { + "epoch": 0.3531373738094389, + "grad_norm": 0.2928532063961029, + "learning_rate": 4.085814004803673e-05, + "loss": 0.1228, + "step": 19799 + }, + { + "epoch": 0.35315520993115257, + "grad_norm": 0.2698879539966583, + "learning_rate": 4.0856936734217005e-05, + "loss": 0.1644, + "step": 19800 + }, + { + "epoch": 0.35317304605286626, + "grad_norm": 0.43529677391052246, + "learning_rate": 4.085573335893031e-05, + "loss": 0.1473, + "step": 19801 + }, + { + "epoch": 0.35319088217457995, + "grad_norm": 0.2933369278907776, + "learning_rate": 4.085452992218132e-05, + "loss": 0.1496, + "step": 19802 + }, + { + "epoch": 0.35320871829629363, + "grad_norm": 0.24520447850227356, + "learning_rate": 4.08533264239747e-05, + "loss": 0.1719, + "step": 19803 + }, + { + "epoch": 0.3532265544180073, + "grad_norm": 0.2467024177312851, + "learning_rate": 4.085212286431511e-05, + "loss": 0.1838, + "step": 19804 + }, + { + "epoch": 0.35324439053972106, + "grad_norm": 0.2627450227737427, + "learning_rate": 4.0850919243207206e-05, + "loss": 0.1657, + "step": 19805 + }, + { + "epoch": 0.35326222666143475, + "grad_norm": 0.19614753127098083, + "learning_rate": 4.084971556065568e-05, + "loss": 0.1135, + "step": 19806 + }, + { + "epoch": 0.35328006278314844, + "grad_norm": 0.22114554047584534, + "learning_rate": 4.0848511816665166e-05, + "loss": 0.1076, + "step": 19807 + }, + { + "epoch": 0.35329789890486213, + "grad_norm": 0.2613510489463806, + "learning_rate": 4.0847308011240364e-05, + "loss": 0.1874, + "step": 19808 + }, + { + "epoch": 0.3533157350265758, + "grad_norm": 0.2803811728954315, + "learning_rate": 4.084610414438591e-05, + "loss": 0.2278, + "step": 19809 + }, + { + "epoch": 0.3533335711482895, + "grad_norm": 0.22982949018478394, + "learning_rate": 4.084490021610649e-05, + "loss": 0.1377, + "step": 19810 + }, + { + "epoch": 0.3533514072700032, + "grad_norm": 0.30077412724494934, + "learning_rate": 4.0843696226406756e-05, + "loss": 0.1477, + "step": 19811 + }, + { + "epoch": 0.3533692433917169, + "grad_norm": 0.2277221977710724, + "learning_rate": 4.0842492175291394e-05, + "loss": 0.1323, + "step": 19812 + }, + { + "epoch": 0.3533870795134306, + "grad_norm": 0.21202826499938965, + "learning_rate": 4.084128806276506e-05, + "loss": 0.1613, + "step": 19813 + }, + { + "epoch": 0.3534049156351443, + "grad_norm": 0.27329447865486145, + "learning_rate": 4.084008388883241e-05, + "loss": 0.2075, + "step": 19814 + }, + { + "epoch": 0.353422751756858, + "grad_norm": 0.4323183298110962, + "learning_rate": 4.083887965349813e-05, + "loss": 0.1704, + "step": 19815 + }, + { + "epoch": 0.3534405878785717, + "grad_norm": 0.2226802259683609, + "learning_rate": 4.083767535676688e-05, + "loss": 0.1366, + "step": 19816 + }, + { + "epoch": 0.3534584240002854, + "grad_norm": 0.2623763978481293, + "learning_rate": 4.083647099864334e-05, + "loss": 0.1678, + "step": 19817 + }, + { + "epoch": 0.35347626012199906, + "grad_norm": 0.2511468231678009, + "learning_rate": 4.083526657913216e-05, + "loss": 0.1646, + "step": 19818 + }, + { + "epoch": 0.35349409624371275, + "grad_norm": 0.2920827269554138, + "learning_rate": 4.083406209823802e-05, + "loss": 0.1865, + "step": 19819 + }, + { + "epoch": 0.35351193236542644, + "grad_norm": 0.26181209087371826, + "learning_rate": 4.083285755596559e-05, + "loss": 0.1774, + "step": 19820 + }, + { + "epoch": 0.3535297684871402, + "grad_norm": 0.2334909439086914, + "learning_rate": 4.083165295231953e-05, + "loss": 0.1844, + "step": 19821 + }, + { + "epoch": 0.35354760460885387, + "grad_norm": 0.24480712413787842, + "learning_rate": 4.0830448287304515e-05, + "loss": 0.1888, + "step": 19822 + }, + { + "epoch": 0.35356544073056756, + "grad_norm": 0.2666739523410797, + "learning_rate": 4.082924356092521e-05, + "loss": 0.1614, + "step": 19823 + }, + { + "epoch": 0.35358327685228125, + "grad_norm": 0.27341511845588684, + "learning_rate": 4.08280387731863e-05, + "loss": 0.1661, + "step": 19824 + }, + { + "epoch": 0.35360111297399494, + "grad_norm": 0.3112272620201111, + "learning_rate": 4.082683392409244e-05, + "loss": 0.2085, + "step": 19825 + }, + { + "epoch": 0.3536189490957086, + "grad_norm": 0.24370110034942627, + "learning_rate": 4.0825629013648306e-05, + "loss": 0.1475, + "step": 19826 + }, + { + "epoch": 0.3536367852174223, + "grad_norm": 0.2231035977602005, + "learning_rate": 4.0824424041858566e-05, + "loss": 0.1643, + "step": 19827 + }, + { + "epoch": 0.353654621339136, + "grad_norm": 0.23225586116313934, + "learning_rate": 4.08232190087279e-05, + "loss": 0.1475, + "step": 19828 + }, + { + "epoch": 0.3536724574608497, + "grad_norm": 0.31278154253959656, + "learning_rate": 4.082201391426096e-05, + "loss": 0.1531, + "step": 19829 + }, + { + "epoch": 0.35369029358256343, + "grad_norm": 0.2686065137386322, + "learning_rate": 4.0820808758462435e-05, + "loss": 0.1829, + "step": 19830 + }, + { + "epoch": 0.3537081297042771, + "grad_norm": 0.425155907869339, + "learning_rate": 4.0819603541336994e-05, + "loss": 0.2167, + "step": 19831 + }, + { + "epoch": 0.3537259658259908, + "grad_norm": 0.3046644628047943, + "learning_rate": 4.081839826288931e-05, + "loss": 0.1805, + "step": 19832 + }, + { + "epoch": 0.3537438019477045, + "grad_norm": 0.24657128751277924, + "learning_rate": 4.0817192923124035e-05, + "loss": 0.1847, + "step": 19833 + }, + { + "epoch": 0.3537616380694182, + "grad_norm": 0.23382999002933502, + "learning_rate": 4.0815987522045875e-05, + "loss": 0.145, + "step": 19834 + }, + { + "epoch": 0.35377947419113187, + "grad_norm": 0.21233895421028137, + "learning_rate": 4.0814782059659476e-05, + "loss": 0.1615, + "step": 19835 + }, + { + "epoch": 0.35379731031284556, + "grad_norm": 0.25242379307746887, + "learning_rate": 4.0813576535969516e-05, + "loss": 0.1538, + "step": 19836 + }, + { + "epoch": 0.35381514643455925, + "grad_norm": 0.4201928973197937, + "learning_rate": 4.0812370950980675e-05, + "loss": 0.1676, + "step": 19837 + }, + { + "epoch": 0.353832982556273, + "grad_norm": 0.2942999005317688, + "learning_rate": 4.081116530469762e-05, + "loss": 0.1524, + "step": 19838 + }, + { + "epoch": 0.3538508186779867, + "grad_norm": 0.30660712718963623, + "learning_rate": 4.080995959712503e-05, + "loss": 0.1477, + "step": 19839 + }, + { + "epoch": 0.35386865479970037, + "grad_norm": 0.3902914822101593, + "learning_rate": 4.0808753828267575e-05, + "loss": 0.1491, + "step": 19840 + }, + { + "epoch": 0.35388649092141405, + "grad_norm": 0.32468390464782715, + "learning_rate": 4.080754799812994e-05, + "loss": 0.1522, + "step": 19841 + }, + { + "epoch": 0.35390432704312774, + "grad_norm": 0.2665756642818451, + "learning_rate": 4.080634210671678e-05, + "loss": 0.1614, + "step": 19842 + }, + { + "epoch": 0.35392216316484143, + "grad_norm": 0.43207842111587524, + "learning_rate": 4.080513615403278e-05, + "loss": 0.2466, + "step": 19843 + }, + { + "epoch": 0.3539399992865551, + "grad_norm": 0.2925707697868347, + "learning_rate": 4.08039301400826e-05, + "loss": 0.2039, + "step": 19844 + }, + { + "epoch": 0.3539578354082688, + "grad_norm": 0.3350176513195038, + "learning_rate": 4.0802724064870954e-05, + "loss": 0.1928, + "step": 19845 + }, + { + "epoch": 0.3539756715299825, + "grad_norm": 0.23238037526607513, + "learning_rate": 4.0801517928402475e-05, + "loss": 0.1533, + "step": 19846 + }, + { + "epoch": 0.35399350765169624, + "grad_norm": 0.2267867624759674, + "learning_rate": 4.080031173068186e-05, + "loss": 0.1494, + "step": 19847 + }, + { + "epoch": 0.3540113437734099, + "grad_norm": 0.4750889539718628, + "learning_rate": 4.0799105471713774e-05, + "loss": 0.1536, + "step": 19848 + }, + { + "epoch": 0.3540291798951236, + "grad_norm": 0.24716679751873016, + "learning_rate": 4.07978991515029e-05, + "loss": 0.1779, + "step": 19849 + }, + { + "epoch": 0.3540470160168373, + "grad_norm": 0.28831833600997925, + "learning_rate": 4.0796692770053915e-05, + "loss": 0.1563, + "step": 19850 + }, + { + "epoch": 0.354064852138551, + "grad_norm": 0.21190913021564484, + "learning_rate": 4.079548632737149e-05, + "loss": 0.1566, + "step": 19851 + }, + { + "epoch": 0.3540826882602647, + "grad_norm": 0.21971747279167175, + "learning_rate": 4.0794279823460304e-05, + "loss": 0.1815, + "step": 19852 + }, + { + "epoch": 0.35410052438197837, + "grad_norm": 0.256743460893631, + "learning_rate": 4.0793073258325046e-05, + "loss": 0.1283, + "step": 19853 + }, + { + "epoch": 0.35411836050369205, + "grad_norm": 0.25874578952789307, + "learning_rate": 4.0791866631970366e-05, + "loss": 0.1951, + "step": 19854 + }, + { + "epoch": 0.3541361966254058, + "grad_norm": 0.3698365390300751, + "learning_rate": 4.079065994440097e-05, + "loss": 0.1975, + "step": 19855 + }, + { + "epoch": 0.3541540327471195, + "grad_norm": 0.19486546516418457, + "learning_rate": 4.078945319562151e-05, + "loss": 0.1528, + "step": 19856 + }, + { + "epoch": 0.3541718688688332, + "grad_norm": 0.2912094295024872, + "learning_rate": 4.078824638563668e-05, + "loss": 0.1658, + "step": 19857 + }, + { + "epoch": 0.35418970499054686, + "grad_norm": 0.19902034103870392, + "learning_rate": 4.0787039514451154e-05, + "loss": 0.126, + "step": 19858 + }, + { + "epoch": 0.35420754111226055, + "grad_norm": 0.29859817028045654, + "learning_rate": 4.078583258206961e-05, + "loss": 0.1919, + "step": 19859 + }, + { + "epoch": 0.35422537723397424, + "grad_norm": 0.24333952367305756, + "learning_rate": 4.078462558849673e-05, + "loss": 0.1798, + "step": 19860 + }, + { + "epoch": 0.3542432133556879, + "grad_norm": 0.25449657440185547, + "learning_rate": 4.078341853373718e-05, + "loss": 0.1561, + "step": 19861 + }, + { + "epoch": 0.3542610494774016, + "grad_norm": 0.2693297266960144, + "learning_rate": 4.078221141779566e-05, + "loss": 0.1913, + "step": 19862 + }, + { + "epoch": 0.3542788855991153, + "grad_norm": 0.20248349010944366, + "learning_rate": 4.0781004240676835e-05, + "loss": 0.1201, + "step": 19863 + }, + { + "epoch": 0.35429672172082904, + "grad_norm": 0.25756850838661194, + "learning_rate": 4.0779797002385384e-05, + "loss": 0.1666, + "step": 19864 + }, + { + "epoch": 0.35431455784254273, + "grad_norm": 0.22621749341487885, + "learning_rate": 4.077858970292599e-05, + "loss": 0.1278, + "step": 19865 + }, + { + "epoch": 0.3543323939642564, + "grad_norm": 0.26753324270248413, + "learning_rate": 4.077738234230334e-05, + "loss": 0.1357, + "step": 19866 + }, + { + "epoch": 0.3543502300859701, + "grad_norm": 0.30189305543899536, + "learning_rate": 4.0776174920522095e-05, + "loss": 0.1357, + "step": 19867 + }, + { + "epoch": 0.3543680662076838, + "grad_norm": 0.28078579902648926, + "learning_rate": 4.0774967437586956e-05, + "loss": 0.1533, + "step": 19868 + }, + { + "epoch": 0.3543859023293975, + "grad_norm": 0.24791203439235687, + "learning_rate": 4.0773759893502585e-05, + "loss": 0.1516, + "step": 19869 + }, + { + "epoch": 0.35440373845111117, + "grad_norm": 0.24133937060832977, + "learning_rate": 4.077255228827368e-05, + "loss": 0.2027, + "step": 19870 + }, + { + "epoch": 0.35442157457282486, + "grad_norm": 0.41438114643096924, + "learning_rate": 4.077134462190491e-05, + "loss": 0.2251, + "step": 19871 + }, + { + "epoch": 0.3544394106945386, + "grad_norm": 0.21450918912887573, + "learning_rate": 4.077013689440097e-05, + "loss": 0.1986, + "step": 19872 + }, + { + "epoch": 0.3544572468162523, + "grad_norm": 0.2632606625556946, + "learning_rate": 4.0768929105766525e-05, + "loss": 0.1716, + "step": 19873 + }, + { + "epoch": 0.354475082937966, + "grad_norm": 0.2613445222377777, + "learning_rate": 4.0767721256006266e-05, + "loss": 0.1583, + "step": 19874 + }, + { + "epoch": 0.35449291905967967, + "grad_norm": 0.3841835558414459, + "learning_rate": 4.076651334512487e-05, + "loss": 0.1103, + "step": 19875 + }, + { + "epoch": 0.35451075518139336, + "grad_norm": 0.2413518875837326, + "learning_rate": 4.076530537312703e-05, + "loss": 0.1895, + "step": 19876 + }, + { + "epoch": 0.35452859130310704, + "grad_norm": 0.26866576075553894, + "learning_rate": 4.076409734001741e-05, + "loss": 0.1197, + "step": 19877 + }, + { + "epoch": 0.35454642742482073, + "grad_norm": 0.25959742069244385, + "learning_rate": 4.0762889245800713e-05, + "loss": 0.1977, + "step": 19878 + }, + { + "epoch": 0.3545642635465344, + "grad_norm": 0.2680872082710266, + "learning_rate": 4.0761681090481606e-05, + "loss": 0.1678, + "step": 19879 + }, + { + "epoch": 0.35458209966824816, + "grad_norm": 0.2539249658584595, + "learning_rate": 4.076047287406479e-05, + "loss": 0.1935, + "step": 19880 + }, + { + "epoch": 0.35459993578996185, + "grad_norm": 0.2112245410680771, + "learning_rate": 4.075926459655493e-05, + "loss": 0.1666, + "step": 19881 + }, + { + "epoch": 0.35461777191167554, + "grad_norm": 0.18986941874027252, + "learning_rate": 4.075805625795672e-05, + "loss": 0.1631, + "step": 19882 + }, + { + "epoch": 0.3546356080333892, + "grad_norm": 0.20954051613807678, + "learning_rate": 4.075684785827484e-05, + "loss": 0.151, + "step": 19883 + }, + { + "epoch": 0.3546534441551029, + "grad_norm": 0.25232070684432983, + "learning_rate": 4.075563939751398e-05, + "loss": 0.1764, + "step": 19884 + }, + { + "epoch": 0.3546712802768166, + "grad_norm": 0.40389907360076904, + "learning_rate": 4.0754430875678815e-05, + "loss": 0.2092, + "step": 19885 + }, + { + "epoch": 0.3546891163985303, + "grad_norm": 0.1998995691537857, + "learning_rate": 4.075322229277403e-05, + "loss": 0.159, + "step": 19886 + }, + { + "epoch": 0.354706952520244, + "grad_norm": 0.27081090211868286, + "learning_rate": 4.075201364880432e-05, + "loss": 0.1611, + "step": 19887 + }, + { + "epoch": 0.35472478864195767, + "grad_norm": 0.24346719682216644, + "learning_rate": 4.075080494377437e-05, + "loss": 0.1462, + "step": 19888 + }, + { + "epoch": 0.3547426247636714, + "grad_norm": 0.3675137758255005, + "learning_rate": 4.0749596177688846e-05, + "loss": 0.178, + "step": 19889 + }, + { + "epoch": 0.3547604608853851, + "grad_norm": 0.17535705864429474, + "learning_rate": 4.074838735055246e-05, + "loss": 0.145, + "step": 19890 + }, + { + "epoch": 0.3547782970070988, + "grad_norm": 0.20113764703273773, + "learning_rate": 4.074717846236988e-05, + "loss": 0.1344, + "step": 19891 + }, + { + "epoch": 0.3547961331288125, + "grad_norm": 0.25401291251182556, + "learning_rate": 4.07459695131458e-05, + "loss": 0.1752, + "step": 19892 + }, + { + "epoch": 0.35481396925052616, + "grad_norm": 0.2587297856807709, + "learning_rate": 4.0744760502884905e-05, + "loss": 0.1545, + "step": 19893 + }, + { + "epoch": 0.35483180537223985, + "grad_norm": 0.2889137268066406, + "learning_rate": 4.0743551431591876e-05, + "loss": 0.1113, + "step": 19894 + }, + { + "epoch": 0.35484964149395354, + "grad_norm": 0.2278619110584259, + "learning_rate": 4.074234229927141e-05, + "loss": 0.1583, + "step": 19895 + }, + { + "epoch": 0.3548674776156672, + "grad_norm": 0.27764952182769775, + "learning_rate": 4.074113310592818e-05, + "loss": 0.1794, + "step": 19896 + }, + { + "epoch": 0.35488531373738097, + "grad_norm": 0.273314893245697, + "learning_rate": 4.0739923851566887e-05, + "loss": 0.1686, + "step": 19897 + }, + { + "epoch": 0.35490314985909466, + "grad_norm": 0.2454201579093933, + "learning_rate": 4.07387145361922e-05, + "loss": 0.1772, + "step": 19898 + }, + { + "epoch": 0.35492098598080835, + "grad_norm": 0.2265036702156067, + "learning_rate": 4.073750515980883e-05, + "loss": 0.1566, + "step": 19899 + }, + { + "epoch": 0.35493882210252203, + "grad_norm": 0.3226830065250397, + "learning_rate": 4.0736295722421456e-05, + "loss": 0.196, + "step": 19900 + }, + { + "epoch": 0.3549566582242357, + "grad_norm": 0.331459641456604, + "learning_rate": 4.073508622403477e-05, + "loss": 0.1774, + "step": 19901 + }, + { + "epoch": 0.3549744943459494, + "grad_norm": 0.3452409505844116, + "learning_rate": 4.073387666465344e-05, + "loss": 0.1581, + "step": 19902 + }, + { + "epoch": 0.3549923304676631, + "grad_norm": 0.2745140492916107, + "learning_rate": 4.073266704428218e-05, + "loss": 0.1711, + "step": 19903 + }, + { + "epoch": 0.3550101665893768, + "grad_norm": 0.25147125124931335, + "learning_rate": 4.073145736292566e-05, + "loss": 0.2093, + "step": 19904 + }, + { + "epoch": 0.3550280027110905, + "grad_norm": 0.22041663527488708, + "learning_rate": 4.073024762058859e-05, + "loss": 0.1345, + "step": 19905 + }, + { + "epoch": 0.3550458388328042, + "grad_norm": 0.23245128989219666, + "learning_rate": 4.072903781727564e-05, + "loss": 0.1737, + "step": 19906 + }, + { + "epoch": 0.3550636749545179, + "grad_norm": 0.3767424523830414, + "learning_rate": 4.072782795299151e-05, + "loss": 0.1729, + "step": 19907 + }, + { + "epoch": 0.3550815110762316, + "grad_norm": 0.26995816826820374, + "learning_rate": 4.0726618027740885e-05, + "loss": 0.1259, + "step": 19908 + }, + { + "epoch": 0.3550993471979453, + "grad_norm": 0.28562813997268677, + "learning_rate": 4.072540804152846e-05, + "loss": 0.1351, + "step": 19909 + }, + { + "epoch": 0.35511718331965897, + "grad_norm": 0.2921011447906494, + "learning_rate": 4.0724197994358916e-05, + "loss": 0.1426, + "step": 19910 + }, + { + "epoch": 0.35513501944137266, + "grad_norm": 0.2537221610546112, + "learning_rate": 4.072298788623695e-05, + "loss": 0.1158, + "step": 19911 + }, + { + "epoch": 0.35515285556308634, + "grad_norm": 0.3903716206550598, + "learning_rate": 4.0721777717167256e-05, + "loss": 0.2055, + "step": 19912 + }, + { + "epoch": 0.35517069168480003, + "grad_norm": 0.2782233655452728, + "learning_rate": 4.0720567487154514e-05, + "loss": 0.1463, + "step": 19913 + }, + { + "epoch": 0.3551885278065138, + "grad_norm": 0.24708178639411926, + "learning_rate": 4.0719357196203436e-05, + "loss": 0.1811, + "step": 19914 + }, + { + "epoch": 0.35520636392822746, + "grad_norm": 0.2346726506948471, + "learning_rate": 4.0718146844318686e-05, + "loss": 0.1469, + "step": 19915 + }, + { + "epoch": 0.35522420004994115, + "grad_norm": 0.28966253995895386, + "learning_rate": 4.071693643150498e-05, + "loss": 0.1861, + "step": 19916 + }, + { + "epoch": 0.35524203617165484, + "grad_norm": 0.32770591974258423, + "learning_rate": 4.071572595776699e-05, + "loss": 0.175, + "step": 19917 + }, + { + "epoch": 0.35525987229336853, + "grad_norm": 0.21523083746433258, + "learning_rate": 4.0714515423109436e-05, + "loss": 0.175, + "step": 19918 + }, + { + "epoch": 0.3552777084150822, + "grad_norm": 0.2968734800815582, + "learning_rate": 4.071330482753698e-05, + "loss": 0.1277, + "step": 19919 + }, + { + "epoch": 0.3552955445367959, + "grad_norm": 0.3430662751197815, + "learning_rate": 4.071209417105433e-05, + "loss": 0.1323, + "step": 19920 + }, + { + "epoch": 0.3553133806585096, + "grad_norm": 0.28144368529319763, + "learning_rate": 4.071088345366617e-05, + "loss": 0.1258, + "step": 19921 + }, + { + "epoch": 0.35533121678022334, + "grad_norm": 0.25554487109184265, + "learning_rate": 4.0709672675377205e-05, + "loss": 0.1633, + "step": 19922 + }, + { + "epoch": 0.355349052901937, + "grad_norm": 0.21577578783035278, + "learning_rate": 4.070846183619212e-05, + "loss": 0.1432, + "step": 19923 + }, + { + "epoch": 0.3553668890236507, + "grad_norm": 0.30410903692245483, + "learning_rate": 4.070725093611562e-05, + "loss": 0.171, + "step": 19924 + }, + { + "epoch": 0.3553847251453644, + "grad_norm": 0.25397956371307373, + "learning_rate": 4.0706039975152386e-05, + "loss": 0.1538, + "step": 19925 + }, + { + "epoch": 0.3554025612670781, + "grad_norm": 0.35547491908073425, + "learning_rate": 4.070482895330711e-05, + "loss": 0.2167, + "step": 19926 + }, + { + "epoch": 0.3554203973887918, + "grad_norm": 0.2257125973701477, + "learning_rate": 4.0703617870584496e-05, + "loss": 0.1785, + "step": 19927 + }, + { + "epoch": 0.35543823351050546, + "grad_norm": 0.22685030102729797, + "learning_rate": 4.070240672698924e-05, + "loss": 0.1841, + "step": 19928 + }, + { + "epoch": 0.35545606963221915, + "grad_norm": 0.2880244255065918, + "learning_rate": 4.070119552252603e-05, + "loss": 0.1503, + "step": 19929 + }, + { + "epoch": 0.35547390575393284, + "grad_norm": 0.29200857877731323, + "learning_rate": 4.069998425719955e-05, + "loss": 0.076, + "step": 19930 + }, + { + "epoch": 0.3554917418756466, + "grad_norm": 0.27487778663635254, + "learning_rate": 4.069877293101453e-05, + "loss": 0.195, + "step": 19931 + }, + { + "epoch": 0.35550957799736027, + "grad_norm": 0.21253696084022522, + "learning_rate": 4.0697561543975626e-05, + "loss": 0.1327, + "step": 19932 + }, + { + "epoch": 0.35552741411907396, + "grad_norm": 0.2654271125793457, + "learning_rate": 4.069635009608757e-05, + "loss": 0.1566, + "step": 19933 + }, + { + "epoch": 0.35554525024078765, + "grad_norm": 0.30429890751838684, + "learning_rate": 4.069513858735502e-05, + "loss": 0.2393, + "step": 19934 + }, + { + "epoch": 0.35556308636250133, + "grad_norm": 0.21206673979759216, + "learning_rate": 4.06939270177827e-05, + "loss": 0.1621, + "step": 19935 + }, + { + "epoch": 0.355580922484215, + "grad_norm": 0.3072971701622009, + "learning_rate": 4.0692715387375304e-05, + "loss": 0.1706, + "step": 19936 + }, + { + "epoch": 0.3555987586059287, + "grad_norm": 0.19217704236507416, + "learning_rate": 4.0691503696137514e-05, + "loss": 0.1435, + "step": 19937 + }, + { + "epoch": 0.3556165947276424, + "grad_norm": 0.27029523253440857, + "learning_rate": 4.0690291944074044e-05, + "loss": 0.1603, + "step": 19938 + }, + { + "epoch": 0.35563443084935614, + "grad_norm": 0.2937868535518646, + "learning_rate": 4.0689080131189576e-05, + "loss": 0.1266, + "step": 19939 + }, + { + "epoch": 0.35565226697106983, + "grad_norm": 0.20645561814308167, + "learning_rate": 4.068786825748882e-05, + "loss": 0.1669, + "step": 19940 + }, + { + "epoch": 0.3556701030927835, + "grad_norm": 0.1976676732301712, + "learning_rate": 4.0686656322976466e-05, + "loss": 0.1657, + "step": 19941 + }, + { + "epoch": 0.3556879392144972, + "grad_norm": 0.2827216386795044, + "learning_rate": 4.0685444327657215e-05, + "loss": 0.1525, + "step": 19942 + }, + { + "epoch": 0.3557057753362109, + "grad_norm": 0.2267533242702484, + "learning_rate": 4.068423227153576e-05, + "loss": 0.1571, + "step": 19943 + }, + { + "epoch": 0.3557236114579246, + "grad_norm": 0.3056357800960541, + "learning_rate": 4.0683020154616816e-05, + "loss": 0.1344, + "step": 19944 + }, + { + "epoch": 0.35574144757963827, + "grad_norm": 0.2581568658351898, + "learning_rate": 4.068180797690506e-05, + "loss": 0.1303, + "step": 19945 + }, + { + "epoch": 0.35575928370135196, + "grad_norm": 0.23742994666099548, + "learning_rate": 4.068059573840519e-05, + "loss": 0.1454, + "step": 19946 + }, + { + "epoch": 0.35577711982306565, + "grad_norm": 0.34085613489151, + "learning_rate": 4.067938343912193e-05, + "loss": 0.1552, + "step": 19947 + }, + { + "epoch": 0.3557949559447794, + "grad_norm": 0.23910628259181976, + "learning_rate": 4.0678171079059955e-05, + "loss": 0.156, + "step": 19948 + }, + { + "epoch": 0.3558127920664931, + "grad_norm": 0.2141461819410324, + "learning_rate": 4.0676958658223986e-05, + "loss": 0.1299, + "step": 19949 + }, + { + "epoch": 0.35583062818820677, + "grad_norm": 0.2811725437641144, + "learning_rate": 4.06757461766187e-05, + "loss": 0.2097, + "step": 19950 + }, + { + "epoch": 0.35584846430992045, + "grad_norm": 0.25468626618385315, + "learning_rate": 4.067453363424881e-05, + "loss": 0.2093, + "step": 19951 + }, + { + "epoch": 0.35586630043163414, + "grad_norm": 0.2904917895793915, + "learning_rate": 4.0673321031119015e-05, + "loss": 0.1651, + "step": 19952 + }, + { + "epoch": 0.35588413655334783, + "grad_norm": 0.22564221918582916, + "learning_rate": 4.0672108367234016e-05, + "loss": 0.1017, + "step": 19953 + }, + { + "epoch": 0.3559019726750615, + "grad_norm": 0.20709817111492157, + "learning_rate": 4.0670895642598506e-05, + "loss": 0.1493, + "step": 19954 + }, + { + "epoch": 0.3559198087967752, + "grad_norm": 0.22773197293281555, + "learning_rate": 4.0669682857217196e-05, + "loss": 0.1825, + "step": 19955 + }, + { + "epoch": 0.35593764491848895, + "grad_norm": 0.2977898418903351, + "learning_rate": 4.0668470011094786e-05, + "loss": 0.1459, + "step": 19956 + }, + { + "epoch": 0.35595548104020264, + "grad_norm": 0.32465144991874695, + "learning_rate": 4.066725710423597e-05, + "loss": 0.1972, + "step": 19957 + }, + { + "epoch": 0.3559733171619163, + "grad_norm": 0.32574278116226196, + "learning_rate": 4.0666044136645456e-05, + "loss": 0.1948, + "step": 19958 + }, + { + "epoch": 0.35599115328363, + "grad_norm": 0.2731695771217346, + "learning_rate": 4.066483110832794e-05, + "loss": 0.1711, + "step": 19959 + }, + { + "epoch": 0.3560089894053437, + "grad_norm": 0.24134424328804016, + "learning_rate": 4.066361801928814e-05, + "loss": 0.1473, + "step": 19960 + }, + { + "epoch": 0.3560268255270574, + "grad_norm": 0.19726938009262085, + "learning_rate": 4.0662404869530735e-05, + "loss": 0.1283, + "step": 19961 + }, + { + "epoch": 0.3560446616487711, + "grad_norm": 0.2203519195318222, + "learning_rate": 4.066119165906044e-05, + "loss": 0.142, + "step": 19962 + }, + { + "epoch": 0.35606249777048476, + "grad_norm": 0.2862675189971924, + "learning_rate": 4.065997838788196e-05, + "loss": 0.1402, + "step": 19963 + }, + { + "epoch": 0.35608033389219845, + "grad_norm": 0.21290041506290436, + "learning_rate": 4.065876505599999e-05, + "loss": 0.1184, + "step": 19964 + }, + { + "epoch": 0.3560981700139122, + "grad_norm": 0.2929311692714691, + "learning_rate": 4.0657551663419245e-05, + "loss": 0.1221, + "step": 19965 + }, + { + "epoch": 0.3561160061356259, + "grad_norm": 0.24848604202270508, + "learning_rate": 4.065633821014442e-05, + "loss": 0.1784, + "step": 19966 + }, + { + "epoch": 0.3561338422573396, + "grad_norm": 0.31184107065200806, + "learning_rate": 4.065512469618022e-05, + "loss": 0.2016, + "step": 19967 + }, + { + "epoch": 0.35615167837905326, + "grad_norm": 0.26397597789764404, + "learning_rate": 4.065391112153135e-05, + "loss": 0.1797, + "step": 19968 + }, + { + "epoch": 0.35616951450076695, + "grad_norm": 0.28391069173812866, + "learning_rate": 4.065269748620251e-05, + "loss": 0.1687, + "step": 19969 + }, + { + "epoch": 0.35618735062248064, + "grad_norm": 0.22017905116081238, + "learning_rate": 4.065148379019842e-05, + "loss": 0.1272, + "step": 19970 + }, + { + "epoch": 0.3562051867441943, + "grad_norm": 0.26468491554260254, + "learning_rate": 4.0650270033523766e-05, + "loss": 0.1723, + "step": 19971 + }, + { + "epoch": 0.356223022865908, + "grad_norm": 0.30175405740737915, + "learning_rate": 4.064905621618325e-05, + "loss": 0.1419, + "step": 19972 + }, + { + "epoch": 0.35624085898762176, + "grad_norm": 0.3725954294204712, + "learning_rate": 4.06478423381816e-05, + "loss": 0.1907, + "step": 19973 + }, + { + "epoch": 0.35625869510933544, + "grad_norm": 0.2569026052951813, + "learning_rate": 4.06466283995235e-05, + "loss": 0.1772, + "step": 19974 + }, + { + "epoch": 0.35627653123104913, + "grad_norm": 0.33000338077545166, + "learning_rate": 4.064541440021367e-05, + "loss": 0.122, + "step": 19975 + }, + { + "epoch": 0.3562943673527628, + "grad_norm": 0.3174501955509186, + "learning_rate": 4.064420034025681e-05, + "loss": 0.1406, + "step": 19976 + }, + { + "epoch": 0.3563122034744765, + "grad_norm": 0.2697739601135254, + "learning_rate": 4.0642986219657624e-05, + "loss": 0.1761, + "step": 19977 + }, + { + "epoch": 0.3563300395961902, + "grad_norm": 0.21084244549274445, + "learning_rate": 4.064177203842082e-05, + "loss": 0.1671, + "step": 19978 + }, + { + "epoch": 0.3563478757179039, + "grad_norm": 0.2385108321905136, + "learning_rate": 4.0640557796551106e-05, + "loss": 0.0984, + "step": 19979 + }, + { + "epoch": 0.35636571183961757, + "grad_norm": 0.31695103645324707, + "learning_rate": 4.063934349405318e-05, + "loss": 0.1611, + "step": 19980 + }, + { + "epoch": 0.3563835479613313, + "grad_norm": 0.25559332966804504, + "learning_rate": 4.063812913093177e-05, + "loss": 0.1667, + "step": 19981 + }, + { + "epoch": 0.356401384083045, + "grad_norm": 0.3464248776435852, + "learning_rate": 4.0636914707191564e-05, + "loss": 0.2051, + "step": 19982 + }, + { + "epoch": 0.3564192202047587, + "grad_norm": 0.25592154264450073, + "learning_rate": 4.063570022283728e-05, + "loss": 0.1724, + "step": 19983 + }, + { + "epoch": 0.3564370563264724, + "grad_norm": 0.27940690517425537, + "learning_rate": 4.063448567787362e-05, + "loss": 0.164, + "step": 19984 + }, + { + "epoch": 0.35645489244818607, + "grad_norm": 0.2590286135673523, + "learning_rate": 4.063327107230529e-05, + "loss": 0.1617, + "step": 19985 + }, + { + "epoch": 0.35647272856989975, + "grad_norm": 0.2647278308868408, + "learning_rate": 4.063205640613701e-05, + "loss": 0.1499, + "step": 19986 + }, + { + "epoch": 0.35649056469161344, + "grad_norm": 0.2159040868282318, + "learning_rate": 4.0630841679373464e-05, + "loss": 0.1409, + "step": 19987 + }, + { + "epoch": 0.35650840081332713, + "grad_norm": 0.2959003150463104, + "learning_rate": 4.062962689201939e-05, + "loss": 0.1497, + "step": 19988 + }, + { + "epoch": 0.3565262369350408, + "grad_norm": 0.23684212565422058, + "learning_rate": 4.062841204407948e-05, + "loss": 0.1457, + "step": 19989 + }, + { + "epoch": 0.35654407305675456, + "grad_norm": 0.25725317001342773, + "learning_rate": 4.062719713555845e-05, + "loss": 0.1547, + "step": 19990 + }, + { + "epoch": 0.35656190917846825, + "grad_norm": 0.22904419898986816, + "learning_rate": 4.0625982166461e-05, + "loss": 0.1794, + "step": 19991 + }, + { + "epoch": 0.35657974530018194, + "grad_norm": 0.5962663888931274, + "learning_rate": 4.062476713679185e-05, + "loss": 0.1599, + "step": 19992 + }, + { + "epoch": 0.3565975814218956, + "grad_norm": 0.2700563073158264, + "learning_rate": 4.0623552046555706e-05, + "loss": 0.1769, + "step": 19993 + }, + { + "epoch": 0.3566154175436093, + "grad_norm": 0.25697433948516846, + "learning_rate": 4.062233689575728e-05, + "loss": 0.1534, + "step": 19994 + }, + { + "epoch": 0.356633253665323, + "grad_norm": 0.29047948122024536, + "learning_rate": 4.062112168440128e-05, + "loss": 0.1517, + "step": 19995 + }, + { + "epoch": 0.3566510897870367, + "grad_norm": 0.3100607693195343, + "learning_rate": 4.061990641249241e-05, + "loss": 0.1813, + "step": 19996 + }, + { + "epoch": 0.3566689259087504, + "grad_norm": 0.2511794865131378, + "learning_rate": 4.0618691080035405e-05, + "loss": 0.1656, + "step": 19997 + }, + { + "epoch": 0.3566867620304641, + "grad_norm": 0.3291493356227875, + "learning_rate": 4.061747568703494e-05, + "loss": 0.1274, + "step": 19998 + }, + { + "epoch": 0.3567045981521778, + "grad_norm": 0.3066295087337494, + "learning_rate": 4.0616260233495755e-05, + "loss": 0.167, + "step": 19999 + }, + { + "epoch": 0.3567224342738915, + "grad_norm": 0.22795794904232025, + "learning_rate": 4.0615044719422545e-05, + "loss": 0.1347, + "step": 20000 + }, + { + "epoch": 0.3567224342738915, + "eval_loss": 0.15922844409942627, + "eval_runtime": 106.7988, + "eval_samples_per_second": 9.588, + "eval_steps_per_second": 1.601, + "step": 20000 + }, + { + "epoch": 0.3567402703956052, + "grad_norm": 0.3071390688419342, + "learning_rate": 4.0613829144820035e-05, + "loss": 0.1581, + "step": 20001 + }, + { + "epoch": 0.3567581065173189, + "grad_norm": 0.2813947796821594, + "learning_rate": 4.061261350969293e-05, + "loss": 0.1343, + "step": 20002 + }, + { + "epoch": 0.35677594263903256, + "grad_norm": 0.36832886934280396, + "learning_rate": 4.0611397814045934e-05, + "loss": 0.1832, + "step": 20003 + }, + { + "epoch": 0.35679377876074625, + "grad_norm": 0.27840691804885864, + "learning_rate": 4.061018205788378e-05, + "loss": 0.1589, + "step": 20004 + }, + { + "epoch": 0.35681161488245994, + "grad_norm": 0.31088972091674805, + "learning_rate": 4.060896624121117e-05, + "loss": 0.2003, + "step": 20005 + }, + { + "epoch": 0.3568294510041736, + "grad_norm": 0.2268286794424057, + "learning_rate": 4.06077503640328e-05, + "loss": 0.1719, + "step": 20006 + }, + { + "epoch": 0.35684728712588737, + "grad_norm": 0.2342464178800583, + "learning_rate": 4.0606534426353415e-05, + "loss": 0.1701, + "step": 20007 + }, + { + "epoch": 0.35686512324760106, + "grad_norm": 0.3153845965862274, + "learning_rate": 4.0605318428177694e-05, + "loss": 0.1449, + "step": 20008 + }, + { + "epoch": 0.35688295936931475, + "grad_norm": 0.16588109731674194, + "learning_rate": 4.060410236951039e-05, + "loss": 0.1508, + "step": 20009 + }, + { + "epoch": 0.35690079549102843, + "grad_norm": 0.23996523022651672, + "learning_rate": 4.0602886250356185e-05, + "loss": 0.0992, + "step": 20010 + }, + { + "epoch": 0.3569186316127421, + "grad_norm": 0.2849825620651245, + "learning_rate": 4.0601670070719796e-05, + "loss": 0.1603, + "step": 20011 + }, + { + "epoch": 0.3569364677344558, + "grad_norm": 0.2879987955093384, + "learning_rate": 4.0600453830605966e-05, + "loss": 0.1537, + "step": 20012 + }, + { + "epoch": 0.3569543038561695, + "grad_norm": 0.24345509707927704, + "learning_rate": 4.059923753001937e-05, + "loss": 0.1621, + "step": 20013 + }, + { + "epoch": 0.3569721399778832, + "grad_norm": 0.29284825921058655, + "learning_rate": 4.059802116896475e-05, + "loss": 0.1751, + "step": 20014 + }, + { + "epoch": 0.35698997609959693, + "grad_norm": 0.4531479775905609, + "learning_rate": 4.059680474744681e-05, + "loss": 0.1114, + "step": 20015 + }, + { + "epoch": 0.3570078122213106, + "grad_norm": 0.24714669585227966, + "learning_rate": 4.059558826547027e-05, + "loss": 0.1474, + "step": 20016 + }, + { + "epoch": 0.3570256483430243, + "grad_norm": 0.2343725562095642, + "learning_rate": 4.059437172303984e-05, + "loss": 0.1725, + "step": 20017 + }, + { + "epoch": 0.357043484464738, + "grad_norm": 0.36711928248405457, + "learning_rate": 4.059315512016024e-05, + "loss": 0.1752, + "step": 20018 + }, + { + "epoch": 0.3570613205864517, + "grad_norm": 0.2591914236545563, + "learning_rate": 4.0591938456836186e-05, + "loss": 0.167, + "step": 20019 + }, + { + "epoch": 0.35707915670816537, + "grad_norm": 0.2744218707084656, + "learning_rate": 4.05907217330724e-05, + "loss": 0.1487, + "step": 20020 + }, + { + "epoch": 0.35709699282987906, + "grad_norm": 0.26663652062416077, + "learning_rate": 4.058950494887358e-05, + "loss": 0.1807, + "step": 20021 + }, + { + "epoch": 0.35711482895159274, + "grad_norm": 0.2801266312599182, + "learning_rate": 4.058828810424446e-05, + "loss": 0.1885, + "step": 20022 + }, + { + "epoch": 0.3571326650733065, + "grad_norm": 0.31054872274398804, + "learning_rate": 4.0587071199189756e-05, + "loss": 0.1532, + "step": 20023 + }, + { + "epoch": 0.3571505011950202, + "grad_norm": 0.26875048875808716, + "learning_rate": 4.058585423371417e-05, + "loss": 0.1761, + "step": 20024 + }, + { + "epoch": 0.35716833731673386, + "grad_norm": 0.23426620662212372, + "learning_rate": 4.058463720782243e-05, + "loss": 0.1207, + "step": 20025 + }, + { + "epoch": 0.35718617343844755, + "grad_norm": 0.2521340847015381, + "learning_rate": 4.058342012151926e-05, + "loss": 0.1967, + "step": 20026 + }, + { + "epoch": 0.35720400956016124, + "grad_norm": 0.30015817284584045, + "learning_rate": 4.058220297480937e-05, + "loss": 0.1402, + "step": 20027 + }, + { + "epoch": 0.3572218456818749, + "grad_norm": 0.3039630949497223, + "learning_rate": 4.058098576769748e-05, + "loss": 0.1853, + "step": 20028 + }, + { + "epoch": 0.3572396818035886, + "grad_norm": 0.21330486238002777, + "learning_rate": 4.057976850018831e-05, + "loss": 0.1423, + "step": 20029 + }, + { + "epoch": 0.3572575179253023, + "grad_norm": 0.3427489101886749, + "learning_rate": 4.057855117228657e-05, + "loss": 0.2264, + "step": 20030 + }, + { + "epoch": 0.357275354047016, + "grad_norm": 0.25500714778900146, + "learning_rate": 4.0577333783996985e-05, + "loss": 0.1883, + "step": 20031 + }, + { + "epoch": 0.35729319016872974, + "grad_norm": 0.243867427110672, + "learning_rate": 4.0576116335324274e-05, + "loss": 0.1644, + "step": 20032 + }, + { + "epoch": 0.3573110262904434, + "grad_norm": 0.2899976670742035, + "learning_rate": 4.0574898826273164e-05, + "loss": 0.1629, + "step": 20033 + }, + { + "epoch": 0.3573288624121571, + "grad_norm": 0.2780800759792328, + "learning_rate": 4.0573681256848364e-05, + "loss": 0.1271, + "step": 20034 + }, + { + "epoch": 0.3573466985338708, + "grad_norm": 0.3251967430114746, + "learning_rate": 4.057246362705459e-05, + "loss": 0.1794, + "step": 20035 + }, + { + "epoch": 0.3573645346555845, + "grad_norm": 0.2088649868965149, + "learning_rate": 4.0571245936896575e-05, + "loss": 0.1659, + "step": 20036 + }, + { + "epoch": 0.3573823707772982, + "grad_norm": 0.21452626585960388, + "learning_rate": 4.0570028186379025e-05, + "loss": 0.1518, + "step": 20037 + }, + { + "epoch": 0.35740020689901186, + "grad_norm": 0.29758983850479126, + "learning_rate": 4.056881037550668e-05, + "loss": 0.1687, + "step": 20038 + }, + { + "epoch": 0.35741804302072555, + "grad_norm": 0.2507551312446594, + "learning_rate": 4.0567592504284236e-05, + "loss": 0.1633, + "step": 20039 + }, + { + "epoch": 0.3574358791424393, + "grad_norm": 0.246376171708107, + "learning_rate": 4.0566374572716435e-05, + "loss": 0.1494, + "step": 20040 + }, + { + "epoch": 0.357453715264153, + "grad_norm": 0.25536832213401794, + "learning_rate": 4.056515658080799e-05, + "loss": 0.163, + "step": 20041 + }, + { + "epoch": 0.35747155138586667, + "grad_norm": 0.257061243057251, + "learning_rate": 4.056393852856362e-05, + "loss": 0.1494, + "step": 20042 + }, + { + "epoch": 0.35748938750758036, + "grad_norm": 0.26048627495765686, + "learning_rate": 4.056272041598804e-05, + "loss": 0.2195, + "step": 20043 + }, + { + "epoch": 0.35750722362929405, + "grad_norm": 0.3398483097553253, + "learning_rate": 4.0561502243085994e-05, + "loss": 0.1723, + "step": 20044 + }, + { + "epoch": 0.35752505975100773, + "grad_norm": 0.28536954522132874, + "learning_rate": 4.056028400986218e-05, + "loss": 0.1698, + "step": 20045 + }, + { + "epoch": 0.3575428958727214, + "grad_norm": 0.2261572778224945, + "learning_rate": 4.0559065716321344e-05, + "loss": 0.1427, + "step": 20046 + }, + { + "epoch": 0.3575607319944351, + "grad_norm": 0.2272304743528366, + "learning_rate": 4.055784736246818e-05, + "loss": 0.1619, + "step": 20047 + }, + { + "epoch": 0.3575785681161488, + "grad_norm": 0.3480839729309082, + "learning_rate": 4.055662894830744e-05, + "loss": 0.1973, + "step": 20048 + }, + { + "epoch": 0.35759640423786254, + "grad_norm": 0.3085460364818573, + "learning_rate": 4.0555410473843826e-05, + "loss": 0.1777, + "step": 20049 + }, + { + "epoch": 0.35761424035957623, + "grad_norm": 0.23200970888137817, + "learning_rate": 4.0554191939082065e-05, + "loss": 0.1804, + "step": 20050 + }, + { + "epoch": 0.3576320764812899, + "grad_norm": 0.24992962181568146, + "learning_rate": 4.055297334402689e-05, + "loss": 0.1669, + "step": 20051 + }, + { + "epoch": 0.3576499126030036, + "grad_norm": 0.27971726655960083, + "learning_rate": 4.055175468868301e-05, + "loss": 0.1912, + "step": 20052 + }, + { + "epoch": 0.3576677487247173, + "grad_norm": 0.25915294885635376, + "learning_rate": 4.055053597305517e-05, + "loss": 0.2176, + "step": 20053 + }, + { + "epoch": 0.357685584846431, + "grad_norm": 0.24156317114830017, + "learning_rate": 4.054931719714807e-05, + "loss": 0.128, + "step": 20054 + }, + { + "epoch": 0.35770342096814467, + "grad_norm": 0.23737764358520508, + "learning_rate": 4.054809836096646e-05, + "loss": 0.1269, + "step": 20055 + }, + { + "epoch": 0.35772125708985836, + "grad_norm": 0.2549412250518799, + "learning_rate": 4.054687946451503e-05, + "loss": 0.1995, + "step": 20056 + }, + { + "epoch": 0.3577390932115721, + "grad_norm": 0.21873600780963898, + "learning_rate": 4.054566050779855e-05, + "loss": 0.1353, + "step": 20057 + }, + { + "epoch": 0.3577569293332858, + "grad_norm": 0.32668742537498474, + "learning_rate": 4.05444414908217e-05, + "loss": 0.2179, + "step": 20058 + }, + { + "epoch": 0.3577747654549995, + "grad_norm": 0.2762688994407654, + "learning_rate": 4.054322241358923e-05, + "loss": 0.1623, + "step": 20059 + }, + { + "epoch": 0.35779260157671317, + "grad_norm": 0.3811163306236267, + "learning_rate": 4.054200327610587e-05, + "loss": 0.2306, + "step": 20060 + }, + { + "epoch": 0.35781043769842685, + "grad_norm": 0.22776132822036743, + "learning_rate": 4.054078407837633e-05, + "loss": 0.0969, + "step": 20061 + }, + { + "epoch": 0.35782827382014054, + "grad_norm": 0.2461792230606079, + "learning_rate": 4.0539564820405344e-05, + "loss": 0.2207, + "step": 20062 + }, + { + "epoch": 0.35784610994185423, + "grad_norm": 0.2780037224292755, + "learning_rate": 4.0538345502197636e-05, + "loss": 0.1675, + "step": 20063 + }, + { + "epoch": 0.3578639460635679, + "grad_norm": 0.2432805299758911, + "learning_rate": 4.0537126123757944e-05, + "loss": 0.1589, + "step": 20064 + }, + { + "epoch": 0.3578817821852816, + "grad_norm": 0.27001407742500305, + "learning_rate": 4.053590668509098e-05, + "loss": 0.1658, + "step": 20065 + }, + { + "epoch": 0.35789961830699535, + "grad_norm": 0.3425554931163788, + "learning_rate": 4.053468718620147e-05, + "loss": 0.1818, + "step": 20066 + }, + { + "epoch": 0.35791745442870904, + "grad_norm": 0.2458711713552475, + "learning_rate": 4.053346762709415e-05, + "loss": 0.1598, + "step": 20067 + }, + { + "epoch": 0.3579352905504227, + "grad_norm": 0.2911379635334015, + "learning_rate": 4.0532248007773746e-05, + "loss": 0.1709, + "step": 20068 + }, + { + "epoch": 0.3579531266721364, + "grad_norm": 0.28506800532341003, + "learning_rate": 4.0531028328244985e-05, + "loss": 0.1565, + "step": 20069 + }, + { + "epoch": 0.3579709627938501, + "grad_norm": 0.2757311761379242, + "learning_rate": 4.052980858851259e-05, + "loss": 0.2468, + "step": 20070 + }, + { + "epoch": 0.3579887989155638, + "grad_norm": 0.19196200370788574, + "learning_rate": 4.0528588788581295e-05, + "loss": 0.0889, + "step": 20071 + }, + { + "epoch": 0.3580066350372775, + "grad_norm": 0.31583845615386963, + "learning_rate": 4.0527368928455826e-05, + "loss": 0.2032, + "step": 20072 + }, + { + "epoch": 0.35802447115899116, + "grad_norm": 0.23932869732379913, + "learning_rate": 4.0526149008140914e-05, + "loss": 0.138, + "step": 20073 + }, + { + "epoch": 0.3580423072807049, + "grad_norm": 0.20993123948574066, + "learning_rate": 4.052492902764129e-05, + "loss": 0.1554, + "step": 20074 + }, + { + "epoch": 0.3580601434024186, + "grad_norm": 0.1977933794260025, + "learning_rate": 4.052370898696167e-05, + "loss": 0.1144, + "step": 20075 + }, + { + "epoch": 0.3580779795241323, + "grad_norm": 0.2511729300022125, + "learning_rate": 4.052248888610679e-05, + "loss": 0.185, + "step": 20076 + }, + { + "epoch": 0.35809581564584597, + "grad_norm": 0.28162476420402527, + "learning_rate": 4.05212687250814e-05, + "loss": 0.1705, + "step": 20077 + }, + { + "epoch": 0.35811365176755966, + "grad_norm": 0.23591631650924683, + "learning_rate": 4.052004850389019e-05, + "loss": 0.1693, + "step": 20078 + }, + { + "epoch": 0.35813148788927335, + "grad_norm": 0.28343600034713745, + "learning_rate": 4.0518828222537916e-05, + "loss": 0.1794, + "step": 20079 + }, + { + "epoch": 0.35814932401098704, + "grad_norm": 0.2719506323337555, + "learning_rate": 4.051760788102931e-05, + "loss": 0.2279, + "step": 20080 + }, + { + "epoch": 0.3581671601327007, + "grad_norm": 0.23200997710227966, + "learning_rate": 4.05163874793691e-05, + "loss": 0.1785, + "step": 20081 + }, + { + "epoch": 0.35818499625441447, + "grad_norm": 0.22298255562782288, + "learning_rate": 4.0515167017562006e-05, + "loss": 0.1567, + "step": 20082 + }, + { + "epoch": 0.35820283237612816, + "grad_norm": 0.31438174843788147, + "learning_rate": 4.051394649561277e-05, + "loss": 0.1597, + "step": 20083 + }, + { + "epoch": 0.35822066849784184, + "grad_norm": 0.18996065855026245, + "learning_rate": 4.0512725913526115e-05, + "loss": 0.1371, + "step": 20084 + }, + { + "epoch": 0.35823850461955553, + "grad_norm": 0.2507956326007843, + "learning_rate": 4.051150527130678e-05, + "loss": 0.1644, + "step": 20085 + }, + { + "epoch": 0.3582563407412692, + "grad_norm": 0.25392869114875793, + "learning_rate": 4.051028456895949e-05, + "loss": 0.186, + "step": 20086 + }, + { + "epoch": 0.3582741768629829, + "grad_norm": 0.22865475714206696, + "learning_rate": 4.050906380648898e-05, + "loss": 0.1422, + "step": 20087 + }, + { + "epoch": 0.3582920129846966, + "grad_norm": 0.3003014624118805, + "learning_rate": 4.050784298389998e-05, + "loss": 0.1733, + "step": 20088 + }, + { + "epoch": 0.3583098491064103, + "grad_norm": 0.3805437982082367, + "learning_rate": 4.050662210119723e-05, + "loss": 0.2735, + "step": 20089 + }, + { + "epoch": 0.35832768522812397, + "grad_norm": 0.27107667922973633, + "learning_rate": 4.050540115838546e-05, + "loss": 0.2042, + "step": 20090 + }, + { + "epoch": 0.3583455213498377, + "grad_norm": 0.3716125786304474, + "learning_rate": 4.050418015546939e-05, + "loss": 0.1865, + "step": 20091 + }, + { + "epoch": 0.3583633574715514, + "grad_norm": 0.3723642826080322, + "learning_rate": 4.050295909245377e-05, + "loss": 0.1819, + "step": 20092 + }, + { + "epoch": 0.3583811935932651, + "grad_norm": 0.29444295167922974, + "learning_rate": 4.0501737969343326e-05, + "loss": 0.1629, + "step": 20093 + }, + { + "epoch": 0.3583990297149788, + "grad_norm": 0.27687492966651917, + "learning_rate": 4.0500516786142784e-05, + "loss": 0.176, + "step": 20094 + }, + { + "epoch": 0.35841686583669247, + "grad_norm": 0.2261350452899933, + "learning_rate": 4.0499295542856884e-05, + "loss": 0.1514, + "step": 20095 + }, + { + "epoch": 0.35843470195840615, + "grad_norm": 0.29730236530303955, + "learning_rate": 4.0498074239490367e-05, + "loss": 0.0942, + "step": 20096 + }, + { + "epoch": 0.35845253808011984, + "grad_norm": 0.2816241681575775, + "learning_rate": 4.049685287604796e-05, + "loss": 0.1718, + "step": 20097 + }, + { + "epoch": 0.35847037420183353, + "grad_norm": 0.2102278470993042, + "learning_rate": 4.04956314525344e-05, + "loss": 0.1874, + "step": 20098 + }, + { + "epoch": 0.3584882103235473, + "grad_norm": 0.3193916380405426, + "learning_rate": 4.0494409968954424e-05, + "loss": 0.218, + "step": 20099 + }, + { + "epoch": 0.35850604644526096, + "grad_norm": 0.2492050975561142, + "learning_rate": 4.0493188425312754e-05, + "loss": 0.1714, + "step": 20100 + }, + { + "epoch": 0.35852388256697465, + "grad_norm": 0.23118208348751068, + "learning_rate": 4.0491966821614144e-05, + "loss": 0.1564, + "step": 20101 + }, + { + "epoch": 0.35854171868868834, + "grad_norm": 0.28668248653411865, + "learning_rate": 4.049074515786332e-05, + "loss": 0.1549, + "step": 20102 + }, + { + "epoch": 0.358559554810402, + "grad_norm": 0.5270244479179382, + "learning_rate": 4.048952343406501e-05, + "loss": 0.2129, + "step": 20103 + }, + { + "epoch": 0.3585773909321157, + "grad_norm": 0.2226092368364334, + "learning_rate": 4.048830165022396e-05, + "loss": 0.1598, + "step": 20104 + }, + { + "epoch": 0.3585952270538294, + "grad_norm": 0.28536170721054077, + "learning_rate": 4.048707980634491e-05, + "loss": 0.1779, + "step": 20105 + }, + { + "epoch": 0.3586130631755431, + "grad_norm": 0.1809590607881546, + "learning_rate": 4.0485857902432575e-05, + "loss": 0.1054, + "step": 20106 + }, + { + "epoch": 0.3586308992972568, + "grad_norm": 0.2945133447647095, + "learning_rate": 4.048463593849172e-05, + "loss": 0.1682, + "step": 20107 + }, + { + "epoch": 0.3586487354189705, + "grad_norm": 0.4864062964916229, + "learning_rate": 4.0483413914527055e-05, + "loss": 0.1762, + "step": 20108 + }, + { + "epoch": 0.3586665715406842, + "grad_norm": 0.31564784049987793, + "learning_rate": 4.048219183054335e-05, + "loss": 0.1481, + "step": 20109 + }, + { + "epoch": 0.3586844076623979, + "grad_norm": 0.3170863389968872, + "learning_rate": 4.04809696865453e-05, + "loss": 0.1562, + "step": 20110 + }, + { + "epoch": 0.3587022437841116, + "grad_norm": 0.275523841381073, + "learning_rate": 4.0479747482537675e-05, + "loss": 0.1543, + "step": 20111 + }, + { + "epoch": 0.3587200799058253, + "grad_norm": 0.20725567638874054, + "learning_rate": 4.047852521852521e-05, + "loss": 0.1092, + "step": 20112 + }, + { + "epoch": 0.35873791602753896, + "grad_norm": 0.21188224852085114, + "learning_rate": 4.0477302894512625e-05, + "loss": 0.1754, + "step": 20113 + }, + { + "epoch": 0.35875575214925265, + "grad_norm": 0.25122159719467163, + "learning_rate": 4.0476080510504666e-05, + "loss": 0.228, + "step": 20114 + }, + { + "epoch": 0.35877358827096634, + "grad_norm": 0.28943324089050293, + "learning_rate": 4.047485806650608e-05, + "loss": 0.1529, + "step": 20115 + }, + { + "epoch": 0.3587914243926801, + "grad_norm": 0.2326575517654419, + "learning_rate": 4.0473635562521594e-05, + "loss": 0.1525, + "step": 20116 + }, + { + "epoch": 0.35880926051439377, + "grad_norm": 0.2267775684595108, + "learning_rate": 4.0472412998555956e-05, + "loss": 0.1714, + "step": 20117 + }, + { + "epoch": 0.35882709663610746, + "grad_norm": 0.2923399806022644, + "learning_rate": 4.04711903746139e-05, + "loss": 0.2147, + "step": 20118 + }, + { + "epoch": 0.35884493275782114, + "grad_norm": 0.22854118049144745, + "learning_rate": 4.046996769070017e-05, + "loss": 0.1105, + "step": 20119 + }, + { + "epoch": 0.35886276887953483, + "grad_norm": 0.3726526200771332, + "learning_rate": 4.0468744946819495e-05, + "loss": 0.1624, + "step": 20120 + }, + { + "epoch": 0.3588806050012485, + "grad_norm": 0.4243506193161011, + "learning_rate": 4.0467522142976626e-05, + "loss": 0.1917, + "step": 20121 + }, + { + "epoch": 0.3588984411229622, + "grad_norm": 0.19965283572673798, + "learning_rate": 4.04662992791763e-05, + "loss": 0.1407, + "step": 20122 + }, + { + "epoch": 0.3589162772446759, + "grad_norm": 0.23447652161121368, + "learning_rate": 4.046507635542325e-05, + "loss": 0.1902, + "step": 20123 + }, + { + "epoch": 0.35893411336638964, + "grad_norm": 0.26635512709617615, + "learning_rate": 4.0463853371722234e-05, + "loss": 0.1594, + "step": 20124 + }, + { + "epoch": 0.35895194948810333, + "grad_norm": 0.23608747124671936, + "learning_rate": 4.046263032807797e-05, + "loss": 0.1385, + "step": 20125 + }, + { + "epoch": 0.358969785609817, + "grad_norm": 0.19783037900924683, + "learning_rate": 4.046140722449522e-05, + "loss": 0.1389, + "step": 20126 + }, + { + "epoch": 0.3589876217315307, + "grad_norm": 0.3190438449382782, + "learning_rate": 4.04601840609787e-05, + "loss": 0.1859, + "step": 20127 + }, + { + "epoch": 0.3590054578532444, + "grad_norm": 0.22001811861991882, + "learning_rate": 4.0458960837533185e-05, + "loss": 0.1702, + "step": 20128 + }, + { + "epoch": 0.3590232939749581, + "grad_norm": 0.2458259016275406, + "learning_rate": 4.045773755416339e-05, + "loss": 0.177, + "step": 20129 + }, + { + "epoch": 0.35904113009667177, + "grad_norm": 0.3692992031574249, + "learning_rate": 4.045651421087406e-05, + "loss": 0.1903, + "step": 20130 + }, + { + "epoch": 0.35905896621838546, + "grad_norm": 0.20430196821689606, + "learning_rate": 4.0455290807669955e-05, + "loss": 0.1625, + "step": 20131 + }, + { + "epoch": 0.35907680234009914, + "grad_norm": 0.2893421947956085, + "learning_rate": 4.045406734455579e-05, + "loss": 0.1033, + "step": 20132 + }, + { + "epoch": 0.3590946384618129, + "grad_norm": 0.2364894151687622, + "learning_rate": 4.045284382153633e-05, + "loss": 0.1377, + "step": 20133 + }, + { + "epoch": 0.3591124745835266, + "grad_norm": 0.27595511078834534, + "learning_rate": 4.0451620238616315e-05, + "loss": 0.1541, + "step": 20134 + }, + { + "epoch": 0.35913031070524026, + "grad_norm": 0.26502010226249695, + "learning_rate": 4.045039659580048e-05, + "loss": 0.115, + "step": 20135 + }, + { + "epoch": 0.35914814682695395, + "grad_norm": 0.29014939069747925, + "learning_rate": 4.0449172893093565e-05, + "loss": 0.1608, + "step": 20136 + }, + { + "epoch": 0.35916598294866764, + "grad_norm": 0.26655539870262146, + "learning_rate": 4.044794913050033e-05, + "loss": 0.1356, + "step": 20137 + }, + { + "epoch": 0.3591838190703813, + "grad_norm": 0.24565160274505615, + "learning_rate": 4.04467253080255e-05, + "loss": 0.1849, + "step": 20138 + }, + { + "epoch": 0.359201655192095, + "grad_norm": 0.3990464210510254, + "learning_rate": 4.044550142567383e-05, + "loss": 0.1777, + "step": 20139 + }, + { + "epoch": 0.3592194913138087, + "grad_norm": 0.2843189537525177, + "learning_rate": 4.0444277483450064e-05, + "loss": 0.1697, + "step": 20140 + }, + { + "epoch": 0.35923732743552245, + "grad_norm": 0.3496202826499939, + "learning_rate": 4.044305348135894e-05, + "loss": 0.2031, + "step": 20141 + }, + { + "epoch": 0.35925516355723613, + "grad_norm": 0.22710567712783813, + "learning_rate": 4.0441829419405215e-05, + "loss": 0.1681, + "step": 20142 + }, + { + "epoch": 0.3592729996789498, + "grad_norm": 0.37579599022865295, + "learning_rate": 4.0440605297593616e-05, + "loss": 0.1999, + "step": 20143 + }, + { + "epoch": 0.3592908358006635, + "grad_norm": 0.2775934338569641, + "learning_rate": 4.0439381115928906e-05, + "loss": 0.1689, + "step": 20144 + }, + { + "epoch": 0.3593086719223772, + "grad_norm": 0.21812483668327332, + "learning_rate": 4.0438156874415816e-05, + "loss": 0.171, + "step": 20145 + }, + { + "epoch": 0.3593265080440909, + "grad_norm": 0.25060713291168213, + "learning_rate": 4.0436932573059104e-05, + "loss": 0.1715, + "step": 20146 + }, + { + "epoch": 0.3593443441658046, + "grad_norm": 0.23388399183750153, + "learning_rate": 4.04357082118635e-05, + "loss": 0.1793, + "step": 20147 + }, + { + "epoch": 0.35936218028751826, + "grad_norm": 0.2578822374343872, + "learning_rate": 4.043448379083377e-05, + "loss": 0.1115, + "step": 20148 + }, + { + "epoch": 0.35938001640923195, + "grad_norm": 0.30807995796203613, + "learning_rate": 4.043325930997464e-05, + "loss": 0.147, + "step": 20149 + }, + { + "epoch": 0.3593978525309457, + "grad_norm": 0.4385431706905365, + "learning_rate": 4.0432034769290876e-05, + "loss": 0.1824, + "step": 20150 + }, + { + "epoch": 0.3594156886526594, + "grad_norm": 0.26442015171051025, + "learning_rate": 4.043081016878721e-05, + "loss": 0.1565, + "step": 20151 + }, + { + "epoch": 0.35943352477437307, + "grad_norm": 0.38670381903648376, + "learning_rate": 4.042958550846839e-05, + "loss": 0.2768, + "step": 20152 + }, + { + "epoch": 0.35945136089608676, + "grad_norm": 0.28464677929878235, + "learning_rate": 4.042836078833917e-05, + "loss": 0.1648, + "step": 20153 + }, + { + "epoch": 0.35946919701780045, + "grad_norm": 0.28415244817733765, + "learning_rate": 4.042713600840431e-05, + "loss": 0.1377, + "step": 20154 + }, + { + "epoch": 0.35948703313951413, + "grad_norm": 0.24842941761016846, + "learning_rate": 4.042591116866853e-05, + "loss": 0.1763, + "step": 20155 + }, + { + "epoch": 0.3595048692612278, + "grad_norm": 0.30126526951789856, + "learning_rate": 4.042468626913659e-05, + "loss": 0.2284, + "step": 20156 + }, + { + "epoch": 0.3595227053829415, + "grad_norm": 0.22162488102912903, + "learning_rate": 4.042346130981324e-05, + "loss": 0.1644, + "step": 20157 + }, + { + "epoch": 0.35954054150465525, + "grad_norm": 0.1988789290189743, + "learning_rate": 4.042223629070322e-05, + "loss": 0.1553, + "step": 20158 + }, + { + "epoch": 0.35955837762636894, + "grad_norm": 0.2553611993789673, + "learning_rate": 4.042101121181129e-05, + "loss": 0.1982, + "step": 20159 + }, + { + "epoch": 0.35957621374808263, + "grad_norm": 0.2910846471786499, + "learning_rate": 4.0419786073142193e-05, + "loss": 0.1703, + "step": 20160 + }, + { + "epoch": 0.3595940498697963, + "grad_norm": 0.22156549990177155, + "learning_rate": 4.0418560874700686e-05, + "loss": 0.1665, + "step": 20161 + }, + { + "epoch": 0.35961188599151, + "grad_norm": 0.2697817087173462, + "learning_rate": 4.04173356164915e-05, + "loss": 0.1346, + "step": 20162 + }, + { + "epoch": 0.3596297221132237, + "grad_norm": 0.23115389049053192, + "learning_rate": 4.041611029851941e-05, + "loss": 0.1593, + "step": 20163 + }, + { + "epoch": 0.3596475582349374, + "grad_norm": 0.3282219171524048, + "learning_rate": 4.041488492078914e-05, + "loss": 0.2124, + "step": 20164 + }, + { + "epoch": 0.35966539435665107, + "grad_norm": 0.2314584106206894, + "learning_rate": 4.041365948330546e-05, + "loss": 0.171, + "step": 20165 + }, + { + "epoch": 0.35968323047836476, + "grad_norm": 0.270245760679245, + "learning_rate": 4.041243398607311e-05, + "loss": 0.1752, + "step": 20166 + }, + { + "epoch": 0.3597010666000785, + "grad_norm": 0.22855812311172485, + "learning_rate": 4.041120842909685e-05, + "loss": 0.1658, + "step": 20167 + }, + { + "epoch": 0.3597189027217922, + "grad_norm": 0.2193540781736374, + "learning_rate": 4.040998281238141e-05, + "loss": 0.1729, + "step": 20168 + }, + { + "epoch": 0.3597367388435059, + "grad_norm": 0.21676138043403625, + "learning_rate": 4.0408757135931564e-05, + "loss": 0.1599, + "step": 20169 + }, + { + "epoch": 0.35975457496521956, + "grad_norm": 0.30260127782821655, + "learning_rate": 4.040753139975205e-05, + "loss": 0.165, + "step": 20170 + }, + { + "epoch": 0.35977241108693325, + "grad_norm": 0.22339145839214325, + "learning_rate": 4.040630560384761e-05, + "loss": 0.1815, + "step": 20171 + }, + { + "epoch": 0.35979024720864694, + "grad_norm": 0.3318864405155182, + "learning_rate": 4.040507974822303e-05, + "loss": 0.1661, + "step": 20172 + }, + { + "epoch": 0.35980808333036063, + "grad_norm": 0.21138843894004822, + "learning_rate": 4.0403853832883024e-05, + "loss": 0.1666, + "step": 20173 + }, + { + "epoch": 0.3598259194520743, + "grad_norm": 0.24807460606098175, + "learning_rate": 4.040262785783237e-05, + "loss": 0.2165, + "step": 20174 + }, + { + "epoch": 0.35984375557378806, + "grad_norm": 0.2565484046936035, + "learning_rate": 4.0401401823075805e-05, + "loss": 0.181, + "step": 20175 + }, + { + "epoch": 0.35986159169550175, + "grad_norm": 0.2567596435546875, + "learning_rate": 4.040017572861809e-05, + "loss": 0.1857, + "step": 20176 + }, + { + "epoch": 0.35987942781721544, + "grad_norm": 0.23779326677322388, + "learning_rate": 4.039894957446398e-05, + "loss": 0.1601, + "step": 20177 + }, + { + "epoch": 0.3598972639389291, + "grad_norm": 0.2920511066913605, + "learning_rate": 4.039772336061821e-05, + "loss": 0.1678, + "step": 20178 + }, + { + "epoch": 0.3599151000606428, + "grad_norm": 0.27880293130874634, + "learning_rate": 4.039649708708555e-05, + "loss": 0.2438, + "step": 20179 + }, + { + "epoch": 0.3599329361823565, + "grad_norm": 0.3981066346168518, + "learning_rate": 4.039527075387075e-05, + "loss": 0.1534, + "step": 20180 + }, + { + "epoch": 0.3599507723040702, + "grad_norm": 0.33346423506736755, + "learning_rate": 4.039404436097857e-05, + "loss": 0.137, + "step": 20181 + }, + { + "epoch": 0.3599686084257839, + "grad_norm": 0.2764245569705963, + "learning_rate": 4.039281790841375e-05, + "loss": 0.1224, + "step": 20182 + }, + { + "epoch": 0.3599864445474976, + "grad_norm": 0.17735832929611206, + "learning_rate": 4.039159139618106e-05, + "loss": 0.1388, + "step": 20183 + }, + { + "epoch": 0.3600042806692113, + "grad_norm": 0.4991561472415924, + "learning_rate": 4.0390364824285234e-05, + "loss": 0.1564, + "step": 20184 + }, + { + "epoch": 0.360022116790925, + "grad_norm": 0.3999602198600769, + "learning_rate": 4.0389138192731044e-05, + "loss": 0.1669, + "step": 20185 + }, + { + "epoch": 0.3600399529126387, + "grad_norm": 0.20516230165958405, + "learning_rate": 4.038791150152324e-05, + "loss": 0.1716, + "step": 20186 + }, + { + "epoch": 0.36005778903435237, + "grad_norm": 0.2424459308385849, + "learning_rate": 4.038668475066657e-05, + "loss": 0.1364, + "step": 20187 + }, + { + "epoch": 0.36007562515606606, + "grad_norm": 0.3136383295059204, + "learning_rate": 4.03854579401658e-05, + "loss": 0.1081, + "step": 20188 + }, + { + "epoch": 0.36009346127777975, + "grad_norm": 0.2521072328090668, + "learning_rate": 4.038423107002569e-05, + "loss": 0.1619, + "step": 20189 + }, + { + "epoch": 0.36011129739949344, + "grad_norm": 0.28016197681427, + "learning_rate": 4.038300414025098e-05, + "loss": 0.2076, + "step": 20190 + }, + { + "epoch": 0.3601291335212071, + "grad_norm": 0.31766772270202637, + "learning_rate": 4.038177715084642e-05, + "loss": 0.1259, + "step": 20191 + }, + { + "epoch": 0.36014696964292087, + "grad_norm": 0.33110103011131287, + "learning_rate": 4.03805501018168e-05, + "loss": 0.1397, + "step": 20192 + }, + { + "epoch": 0.36016480576463455, + "grad_norm": 0.2855415642261505, + "learning_rate": 4.037932299316685e-05, + "loss": 0.1588, + "step": 20193 + }, + { + "epoch": 0.36018264188634824, + "grad_norm": 0.20734168589115143, + "learning_rate": 4.0378095824901317e-05, + "loss": 0.1383, + "step": 20194 + }, + { + "epoch": 0.36020047800806193, + "grad_norm": 0.22390116751194, + "learning_rate": 4.037686859702499e-05, + "loss": 0.1511, + "step": 20195 + }, + { + "epoch": 0.3602183141297756, + "grad_norm": 0.30205607414245605, + "learning_rate": 4.03756413095426e-05, + "loss": 0.1713, + "step": 20196 + }, + { + "epoch": 0.3602361502514893, + "grad_norm": 0.30078405141830444, + "learning_rate": 4.037441396245892e-05, + "loss": 0.2272, + "step": 20197 + }, + { + "epoch": 0.360253986373203, + "grad_norm": 0.28949621319770813, + "learning_rate": 4.03731865557787e-05, + "loss": 0.1742, + "step": 20198 + }, + { + "epoch": 0.3602718224949167, + "grad_norm": 0.2577759325504303, + "learning_rate": 4.03719590895067e-05, + "loss": 0.1611, + "step": 20199 + }, + { + "epoch": 0.3602896586166304, + "grad_norm": 0.26126524806022644, + "learning_rate": 4.037073156364767e-05, + "loss": 0.1569, + "step": 20200 + }, + { + "epoch": 0.3603074947383441, + "grad_norm": 0.23403382301330566, + "learning_rate": 4.036950397820638e-05, + "loss": 0.1798, + "step": 20201 + }, + { + "epoch": 0.3603253308600578, + "grad_norm": 0.2359326332807541, + "learning_rate": 4.0368276333187585e-05, + "loss": 0.1322, + "step": 20202 + }, + { + "epoch": 0.3603431669817715, + "grad_norm": 0.236772358417511, + "learning_rate": 4.036704862859604e-05, + "loss": 0.1693, + "step": 20203 + }, + { + "epoch": 0.3603610031034852, + "grad_norm": 0.2861442565917969, + "learning_rate": 4.036582086443651e-05, + "loss": 0.1268, + "step": 20204 + }, + { + "epoch": 0.36037883922519887, + "grad_norm": 0.3536750376224518, + "learning_rate": 4.036459304071375e-05, + "loss": 0.2235, + "step": 20205 + }, + { + "epoch": 0.36039667534691255, + "grad_norm": 0.2823757827281952, + "learning_rate": 4.036336515743252e-05, + "loss": 0.1731, + "step": 20206 + }, + { + "epoch": 0.36041451146862624, + "grad_norm": 0.2587491273880005, + "learning_rate": 4.0362137214597585e-05, + "loss": 0.1733, + "step": 20207 + }, + { + "epoch": 0.36043234759033993, + "grad_norm": 0.2883797287940979, + "learning_rate": 4.0360909212213696e-05, + "loss": 0.1425, + "step": 20208 + }, + { + "epoch": 0.3604501837120537, + "grad_norm": 0.256644070148468, + "learning_rate": 4.035968115028562e-05, + "loss": 0.1473, + "step": 20209 + }, + { + "epoch": 0.36046801983376736, + "grad_norm": 0.3776950538158417, + "learning_rate": 4.035845302881811e-05, + "loss": 0.1503, + "step": 20210 + }, + { + "epoch": 0.36048585595548105, + "grad_norm": 0.3086390495300293, + "learning_rate": 4.035722484781593e-05, + "loss": 0.1937, + "step": 20211 + }, + { + "epoch": 0.36050369207719474, + "grad_norm": 0.2689470052719116, + "learning_rate": 4.035599660728385e-05, + "loss": 0.1792, + "step": 20212 + }, + { + "epoch": 0.3605215281989084, + "grad_norm": 0.25602486729621887, + "learning_rate": 4.0354768307226623e-05, + "loss": 0.1327, + "step": 20213 + }, + { + "epoch": 0.3605393643206221, + "grad_norm": 0.22623009979724884, + "learning_rate": 4.035353994764901e-05, + "loss": 0.1178, + "step": 20214 + }, + { + "epoch": 0.3605572004423358, + "grad_norm": 0.302060604095459, + "learning_rate": 4.035231152855576e-05, + "loss": 0.1669, + "step": 20215 + }, + { + "epoch": 0.3605750365640495, + "grad_norm": 0.22856928408145905, + "learning_rate": 4.035108304995167e-05, + "loss": 0.126, + "step": 20216 + }, + { + "epoch": 0.36059287268576323, + "grad_norm": 0.23531246185302734, + "learning_rate": 4.034985451184147e-05, + "loss": 0.1498, + "step": 20217 + }, + { + "epoch": 0.3606107088074769, + "grad_norm": 0.30605563521385193, + "learning_rate": 4.0348625914229925e-05, + "loss": 0.2622, + "step": 20218 + }, + { + "epoch": 0.3606285449291906, + "grad_norm": 0.3773329555988312, + "learning_rate": 4.034739725712181e-05, + "loss": 0.2051, + "step": 20219 + }, + { + "epoch": 0.3606463810509043, + "grad_norm": 0.27986735105514526, + "learning_rate": 4.034616854052189e-05, + "loss": 0.1941, + "step": 20220 + }, + { + "epoch": 0.360664217172618, + "grad_norm": 0.2917799949645996, + "learning_rate": 4.034493976443491e-05, + "loss": 0.2118, + "step": 20221 + }, + { + "epoch": 0.3606820532943317, + "grad_norm": 0.25020676851272583, + "learning_rate": 4.034371092886565e-05, + "loss": 0.1457, + "step": 20222 + }, + { + "epoch": 0.36069988941604536, + "grad_norm": 0.29000723361968994, + "learning_rate": 4.034248203381886e-05, + "loss": 0.1552, + "step": 20223 + }, + { + "epoch": 0.36071772553775905, + "grad_norm": 0.22609686851501465, + "learning_rate": 4.034125307929932e-05, + "loss": 0.1588, + "step": 20224 + }, + { + "epoch": 0.36073556165947274, + "grad_norm": 0.24163925647735596, + "learning_rate": 4.034002406531178e-05, + "loss": 0.1616, + "step": 20225 + }, + { + "epoch": 0.3607533977811865, + "grad_norm": 0.7998350858688354, + "learning_rate": 4.0338794991861e-05, + "loss": 0.2186, + "step": 20226 + }, + { + "epoch": 0.36077123390290017, + "grad_norm": 0.22135712206363678, + "learning_rate": 4.033756585895177e-05, + "loss": 0.1455, + "step": 20227 + }, + { + "epoch": 0.36078907002461386, + "grad_norm": 0.30494847893714905, + "learning_rate": 4.033633666658883e-05, + "loss": 0.1097, + "step": 20228 + }, + { + "epoch": 0.36080690614632754, + "grad_norm": 0.27180221676826477, + "learning_rate": 4.033510741477694e-05, + "loss": 0.1366, + "step": 20229 + }, + { + "epoch": 0.36082474226804123, + "grad_norm": 0.19971726834774017, + "learning_rate": 4.033387810352088e-05, + "loss": 0.126, + "step": 20230 + }, + { + "epoch": 0.3608425783897549, + "grad_norm": 0.24496595561504364, + "learning_rate": 4.033264873282542e-05, + "loss": 0.173, + "step": 20231 + }, + { + "epoch": 0.3608604145114686, + "grad_norm": 0.26250410079956055, + "learning_rate": 4.033141930269532e-05, + "loss": 0.1656, + "step": 20232 + }, + { + "epoch": 0.3608782506331823, + "grad_norm": 0.26627466082572937, + "learning_rate": 4.0330189813135345e-05, + "loss": 0.1947, + "step": 20233 + }, + { + "epoch": 0.36089608675489604, + "grad_norm": 0.30126819014549255, + "learning_rate": 4.032896026415025e-05, + "loss": 0.1833, + "step": 20234 + }, + { + "epoch": 0.3609139228766097, + "grad_norm": 0.22844769060611725, + "learning_rate": 4.032773065574482e-05, + "loss": 0.1791, + "step": 20235 + }, + { + "epoch": 0.3609317589983234, + "grad_norm": 0.2675713002681732, + "learning_rate": 4.03265009879238e-05, + "loss": 0.2061, + "step": 20236 + }, + { + "epoch": 0.3609495951200371, + "grad_norm": 0.2921369671821594, + "learning_rate": 4.032527126069198e-05, + "loss": 0.206, + "step": 20237 + }, + { + "epoch": 0.3609674312417508, + "grad_norm": 0.24295756220817566, + "learning_rate": 4.0324041474054106e-05, + "loss": 0.1495, + "step": 20238 + }, + { + "epoch": 0.3609852673634645, + "grad_norm": 0.27312779426574707, + "learning_rate": 4.032281162801497e-05, + "loss": 0.1853, + "step": 20239 + }, + { + "epoch": 0.36100310348517817, + "grad_norm": 0.3118610680103302, + "learning_rate": 4.03215817225793e-05, + "loss": 0.1941, + "step": 20240 + }, + { + "epoch": 0.36102093960689186, + "grad_norm": 0.32590848207473755, + "learning_rate": 4.032035175775191e-05, + "loss": 0.1419, + "step": 20241 + }, + { + "epoch": 0.3610387757286056, + "grad_norm": 0.29744282364845276, + "learning_rate": 4.0319121733537535e-05, + "loss": 0.1357, + "step": 20242 + }, + { + "epoch": 0.3610566118503193, + "grad_norm": 0.3176305890083313, + "learning_rate": 4.0317891649940955e-05, + "loss": 0.208, + "step": 20243 + }, + { + "epoch": 0.361074447972033, + "grad_norm": 0.1861177235841751, + "learning_rate": 4.031666150696693e-05, + "loss": 0.1158, + "step": 20244 + }, + { + "epoch": 0.36109228409374666, + "grad_norm": 0.25487369298934937, + "learning_rate": 4.031543130462024e-05, + "loss": 0.218, + "step": 20245 + }, + { + "epoch": 0.36111012021546035, + "grad_norm": 0.27307257056236267, + "learning_rate": 4.031420104290565e-05, + "loss": 0.1661, + "step": 20246 + }, + { + "epoch": 0.36112795633717404, + "grad_norm": 0.24589209258556366, + "learning_rate": 4.031297072182793e-05, + "loss": 0.1597, + "step": 20247 + }, + { + "epoch": 0.3611457924588877, + "grad_norm": 0.31624308228492737, + "learning_rate": 4.0311740341391844e-05, + "loss": 0.1449, + "step": 20248 + }, + { + "epoch": 0.3611636285806014, + "grad_norm": 0.40471410751342773, + "learning_rate": 4.0310509901602155e-05, + "loss": 0.147, + "step": 20249 + }, + { + "epoch": 0.3611814647023151, + "grad_norm": 0.24861374497413635, + "learning_rate": 4.030927940246365e-05, + "loss": 0.1714, + "step": 20250 + }, + { + "epoch": 0.36119930082402885, + "grad_norm": 0.31324025988578796, + "learning_rate": 4.030804884398109e-05, + "loss": 0.1214, + "step": 20251 + }, + { + "epoch": 0.36121713694574253, + "grad_norm": 0.36588868498802185, + "learning_rate": 4.030681822615925e-05, + "loss": 0.2327, + "step": 20252 + }, + { + "epoch": 0.3612349730674562, + "grad_norm": 0.2740863263607025, + "learning_rate": 4.030558754900289e-05, + "loss": 0.1548, + "step": 20253 + }, + { + "epoch": 0.3612528091891699, + "grad_norm": 0.2316063642501831, + "learning_rate": 4.030435681251679e-05, + "loss": 0.2011, + "step": 20254 + }, + { + "epoch": 0.3612706453108836, + "grad_norm": 0.2927284836769104, + "learning_rate": 4.030312601670571e-05, + "loss": 0.1778, + "step": 20255 + }, + { + "epoch": 0.3612884814325973, + "grad_norm": 0.3773501515388489, + "learning_rate": 4.030189516157443e-05, + "loss": 0.1714, + "step": 20256 + }, + { + "epoch": 0.361306317554311, + "grad_norm": 0.26292529702186584, + "learning_rate": 4.030066424712772e-05, + "loss": 0.1644, + "step": 20257 + }, + { + "epoch": 0.36132415367602466, + "grad_norm": 0.27771130204200745, + "learning_rate": 4.0299433273370356e-05, + "loss": 0.1351, + "step": 20258 + }, + { + "epoch": 0.3613419897977384, + "grad_norm": 0.2810702323913574, + "learning_rate": 4.0298202240307095e-05, + "loss": 0.1806, + "step": 20259 + }, + { + "epoch": 0.3613598259194521, + "grad_norm": 0.24073146283626556, + "learning_rate": 4.0296971147942725e-05, + "loss": 0.1518, + "step": 20260 + }, + { + "epoch": 0.3613776620411658, + "grad_norm": 0.2514442205429077, + "learning_rate": 4.029573999628201e-05, + "loss": 0.1801, + "step": 20261 + }, + { + "epoch": 0.36139549816287947, + "grad_norm": 0.2692815959453583, + "learning_rate": 4.029450878532973e-05, + "loss": 0.1748, + "step": 20262 + }, + { + "epoch": 0.36141333428459316, + "grad_norm": 0.29654932022094727, + "learning_rate": 4.029327751509064e-05, + "loss": 0.2166, + "step": 20263 + }, + { + "epoch": 0.36143117040630685, + "grad_norm": 0.319212406873703, + "learning_rate": 4.029204618556953e-05, + "loss": 0.1679, + "step": 20264 + }, + { + "epoch": 0.36144900652802053, + "grad_norm": 0.30208736658096313, + "learning_rate": 4.029081479677117e-05, + "loss": 0.1377, + "step": 20265 + }, + { + "epoch": 0.3614668426497342, + "grad_norm": 0.3648304045200348, + "learning_rate": 4.0289583348700325e-05, + "loss": 0.1505, + "step": 20266 + }, + { + "epoch": 0.3614846787714479, + "grad_norm": 0.25528931617736816, + "learning_rate": 4.0288351841361775e-05, + "loss": 0.1526, + "step": 20267 + }, + { + "epoch": 0.36150251489316165, + "grad_norm": 0.20291338860988617, + "learning_rate": 4.0287120274760294e-05, + "loss": 0.1671, + "step": 20268 + }, + { + "epoch": 0.36152035101487534, + "grad_norm": 0.22874824702739716, + "learning_rate": 4.028588864890066e-05, + "loss": 0.0885, + "step": 20269 + }, + { + "epoch": 0.36153818713658903, + "grad_norm": 0.24226036667823792, + "learning_rate": 4.0284656963787634e-05, + "loss": 0.1432, + "step": 20270 + }, + { + "epoch": 0.3615560232583027, + "grad_norm": 0.28844985365867615, + "learning_rate": 4.0283425219425995e-05, + "loss": 0.1908, + "step": 20271 + }, + { + "epoch": 0.3615738593800164, + "grad_norm": 0.3035147190093994, + "learning_rate": 4.028219341582053e-05, + "loss": 0.1807, + "step": 20272 + }, + { + "epoch": 0.3615916955017301, + "grad_norm": 0.3126792013645172, + "learning_rate": 4.0280961552976e-05, + "loss": 0.1739, + "step": 20273 + }, + { + "epoch": 0.3616095316234438, + "grad_norm": 0.3192787766456604, + "learning_rate": 4.0279729630897196e-05, + "loss": 0.1718, + "step": 20274 + }, + { + "epoch": 0.36162736774515747, + "grad_norm": 0.27367380261421204, + "learning_rate": 4.027849764958887e-05, + "loss": 0.0961, + "step": 20275 + }, + { + "epoch": 0.3616452038668712, + "grad_norm": 0.3001756966114044, + "learning_rate": 4.0277265609055814e-05, + "loss": 0.1957, + "step": 20276 + }, + { + "epoch": 0.3616630399885849, + "grad_norm": 0.31588807702064514, + "learning_rate": 4.02760335093028e-05, + "loss": 0.1361, + "step": 20277 + }, + { + "epoch": 0.3616808761102986, + "grad_norm": 0.3375159502029419, + "learning_rate": 4.027480135033461e-05, + "loss": 0.1704, + "step": 20278 + }, + { + "epoch": 0.3616987122320123, + "grad_norm": 0.3191787004470825, + "learning_rate": 4.0273569132156e-05, + "loss": 0.2551, + "step": 20279 + }, + { + "epoch": 0.36171654835372596, + "grad_norm": 0.2709929645061493, + "learning_rate": 4.0272336854771775e-05, + "loss": 0.1642, + "step": 20280 + }, + { + "epoch": 0.36173438447543965, + "grad_norm": 0.35836878418922424, + "learning_rate": 4.027110451818669e-05, + "loss": 0.1281, + "step": 20281 + }, + { + "epoch": 0.36175222059715334, + "grad_norm": 0.2236219197511673, + "learning_rate": 4.0269872122405526e-05, + "loss": 0.1545, + "step": 20282 + }, + { + "epoch": 0.36177005671886703, + "grad_norm": 0.22378231585025787, + "learning_rate": 4.026863966743307e-05, + "loss": 0.163, + "step": 20283 + }, + { + "epoch": 0.36178789284058077, + "grad_norm": 0.23444709181785583, + "learning_rate": 4.0267407153274094e-05, + "loss": 0.1854, + "step": 20284 + }, + { + "epoch": 0.36180572896229446, + "grad_norm": 0.43053171038627625, + "learning_rate": 4.026617457993337e-05, + "loss": 0.1926, + "step": 20285 + }, + { + "epoch": 0.36182356508400815, + "grad_norm": 0.3429013192653656, + "learning_rate": 4.026494194741568e-05, + "loss": 0.1752, + "step": 20286 + }, + { + "epoch": 0.36184140120572184, + "grad_norm": 0.24252557754516602, + "learning_rate": 4.026370925572581e-05, + "loss": 0.2251, + "step": 20287 + }, + { + "epoch": 0.3618592373274355, + "grad_norm": 0.23021471500396729, + "learning_rate": 4.026247650486853e-05, + "loss": 0.1434, + "step": 20288 + }, + { + "epoch": 0.3618770734491492, + "grad_norm": 0.281147837638855, + "learning_rate": 4.0261243694848616e-05, + "loss": 0.173, + "step": 20289 + }, + { + "epoch": 0.3618949095708629, + "grad_norm": 0.29727640748023987, + "learning_rate": 4.026001082567085e-05, + "loss": 0.1578, + "step": 20290 + }, + { + "epoch": 0.3619127456925766, + "grad_norm": 0.253035306930542, + "learning_rate": 4.025877789734001e-05, + "loss": 0.1645, + "step": 20291 + }, + { + "epoch": 0.3619305818142903, + "grad_norm": 0.282619446516037, + "learning_rate": 4.0257544909860877e-05, + "loss": 0.1522, + "step": 20292 + }, + { + "epoch": 0.361948417936004, + "grad_norm": 0.26037371158599854, + "learning_rate": 4.025631186323824e-05, + "loss": 0.1163, + "step": 20293 + }, + { + "epoch": 0.3619662540577177, + "grad_norm": 0.24035592377185822, + "learning_rate": 4.025507875747685e-05, + "loss": 0.175, + "step": 20294 + }, + { + "epoch": 0.3619840901794314, + "grad_norm": 0.24814194440841675, + "learning_rate": 4.025384559258152e-05, + "loss": 0.1837, + "step": 20295 + }, + { + "epoch": 0.3620019263011451, + "grad_norm": 0.3079022765159607, + "learning_rate": 4.025261236855701e-05, + "loss": 0.1216, + "step": 20296 + }, + { + "epoch": 0.36201976242285877, + "grad_norm": 0.24591481685638428, + "learning_rate": 4.0251379085408116e-05, + "loss": 0.1487, + "step": 20297 + }, + { + "epoch": 0.36203759854457246, + "grad_norm": 0.35132238268852234, + "learning_rate": 4.02501457431396e-05, + "loss": 0.1553, + "step": 20298 + }, + { + "epoch": 0.36205543466628615, + "grad_norm": 0.24435901641845703, + "learning_rate": 4.024891234175625e-05, + "loss": 0.127, + "step": 20299 + }, + { + "epoch": 0.36207327078799983, + "grad_norm": 0.24215549230575562, + "learning_rate": 4.0247678881262854e-05, + "loss": 0.1524, + "step": 20300 + }, + { + "epoch": 0.3620911069097136, + "grad_norm": 0.29649487137794495, + "learning_rate": 4.024644536166419e-05, + "loss": 0.1656, + "step": 20301 + }, + { + "epoch": 0.36210894303142727, + "grad_norm": 0.30124008655548096, + "learning_rate": 4.024521178296503e-05, + "loss": 0.1796, + "step": 20302 + }, + { + "epoch": 0.36212677915314095, + "grad_norm": 0.307167649269104, + "learning_rate": 4.024397814517017e-05, + "loss": 0.155, + "step": 20303 + }, + { + "epoch": 0.36214461527485464, + "grad_norm": 0.2808796167373657, + "learning_rate": 4.024274444828439e-05, + "loss": 0.1285, + "step": 20304 + }, + { + "epoch": 0.36216245139656833, + "grad_norm": 0.43239811062812805, + "learning_rate": 4.024151069231246e-05, + "loss": 0.1693, + "step": 20305 + }, + { + "epoch": 0.362180287518282, + "grad_norm": 0.2988102436065674, + "learning_rate": 4.024027687725917e-05, + "loss": 0.2092, + "step": 20306 + }, + { + "epoch": 0.3621981236399957, + "grad_norm": 0.2658236622810364, + "learning_rate": 4.02390430031293e-05, + "loss": 0.1795, + "step": 20307 + }, + { + "epoch": 0.3622159597617094, + "grad_norm": 0.2768888771533966, + "learning_rate": 4.0237809069927646e-05, + "loss": 0.1641, + "step": 20308 + }, + { + "epoch": 0.3622337958834231, + "grad_norm": 0.36782771348953247, + "learning_rate": 4.0236575077658974e-05, + "loss": 0.2209, + "step": 20309 + }, + { + "epoch": 0.3622516320051368, + "grad_norm": 0.2993672788143158, + "learning_rate": 4.023534102632808e-05, + "loss": 0.2264, + "step": 20310 + }, + { + "epoch": 0.3622694681268505, + "grad_norm": 0.2562003433704376, + "learning_rate": 4.023410691593973e-05, + "loss": 0.1386, + "step": 20311 + }, + { + "epoch": 0.3622873042485642, + "grad_norm": 0.23674288392066956, + "learning_rate": 4.023287274649873e-05, + "loss": 0.1853, + "step": 20312 + }, + { + "epoch": 0.3623051403702779, + "grad_norm": 0.3505714237689972, + "learning_rate": 4.0231638518009857e-05, + "loss": 0.2693, + "step": 20313 + }, + { + "epoch": 0.3623229764919916, + "grad_norm": 0.3001098930835724, + "learning_rate": 4.0230404230477886e-05, + "loss": 0.1457, + "step": 20314 + }, + { + "epoch": 0.36234081261370527, + "grad_norm": 0.21215714514255524, + "learning_rate": 4.022916988390761e-05, + "loss": 0.1778, + "step": 20315 + }, + { + "epoch": 0.36235864873541895, + "grad_norm": 0.3627565801143646, + "learning_rate": 4.0227935478303815e-05, + "loss": 0.1737, + "step": 20316 + }, + { + "epoch": 0.36237648485713264, + "grad_norm": 0.25186824798583984, + "learning_rate": 4.0226701013671276e-05, + "loss": 0.2377, + "step": 20317 + }, + { + "epoch": 0.3623943209788464, + "grad_norm": 0.2834342420101166, + "learning_rate": 4.0225466490014784e-05, + "loss": 0.1475, + "step": 20318 + }, + { + "epoch": 0.3624121571005601, + "grad_norm": 0.35382670164108276, + "learning_rate": 4.022423190733913e-05, + "loss": 0.1641, + "step": 20319 + }, + { + "epoch": 0.36242999322227376, + "grad_norm": 0.2001914381980896, + "learning_rate": 4.022299726564909e-05, + "loss": 0.137, + "step": 20320 + }, + { + "epoch": 0.36244782934398745, + "grad_norm": 0.36918172240257263, + "learning_rate": 4.022176256494946e-05, + "loss": 0.1722, + "step": 20321 + }, + { + "epoch": 0.36246566546570114, + "grad_norm": 0.22326025366783142, + "learning_rate": 4.0220527805245023e-05, + "loss": 0.1783, + "step": 20322 + }, + { + "epoch": 0.3624835015874148, + "grad_norm": 0.24765141308307648, + "learning_rate": 4.0219292986540555e-05, + "loss": 0.1892, + "step": 20323 + }, + { + "epoch": 0.3625013377091285, + "grad_norm": 0.3298339247703552, + "learning_rate": 4.021805810884085e-05, + "loss": 0.1635, + "step": 20324 + }, + { + "epoch": 0.3625191738308422, + "grad_norm": 0.19307364523410797, + "learning_rate": 4.0216823172150706e-05, + "loss": 0.1191, + "step": 20325 + }, + { + "epoch": 0.3625370099525559, + "grad_norm": 0.29228341579437256, + "learning_rate": 4.021558817647489e-05, + "loss": 0.2044, + "step": 20326 + }, + { + "epoch": 0.36255484607426963, + "grad_norm": 0.2923954725265503, + "learning_rate": 4.02143531218182e-05, + "loss": 0.121, + "step": 20327 + }, + { + "epoch": 0.3625726821959833, + "grad_norm": 0.2576627731323242, + "learning_rate": 4.0213118008185434e-05, + "loss": 0.1339, + "step": 20328 + }, + { + "epoch": 0.362590518317697, + "grad_norm": 0.33647146821022034, + "learning_rate": 4.021188283558136e-05, + "loss": 0.1723, + "step": 20329 + }, + { + "epoch": 0.3626083544394107, + "grad_norm": 0.2549893856048584, + "learning_rate": 4.021064760401078e-05, + "loss": 0.1728, + "step": 20330 + }, + { + "epoch": 0.3626261905611244, + "grad_norm": 0.2981374263763428, + "learning_rate": 4.020941231347846e-05, + "loss": 0.1769, + "step": 20331 + }, + { + "epoch": 0.36264402668283807, + "grad_norm": 0.3461923599243164, + "learning_rate": 4.020817696398922e-05, + "loss": 0.1447, + "step": 20332 + }, + { + "epoch": 0.36266186280455176, + "grad_norm": 0.395556777715683, + "learning_rate": 4.020694155554783e-05, + "loss": 0.0792, + "step": 20333 + }, + { + "epoch": 0.36267969892626545, + "grad_norm": 0.3223714232444763, + "learning_rate": 4.020570608815908e-05, + "loss": 0.1297, + "step": 20334 + }, + { + "epoch": 0.3626975350479792, + "grad_norm": 0.30979812145233154, + "learning_rate": 4.0204470561827754e-05, + "loss": 0.1487, + "step": 20335 + }, + { + "epoch": 0.3627153711696929, + "grad_norm": 0.2697942554950714, + "learning_rate": 4.020323497655866e-05, + "loss": 0.142, + "step": 20336 + }, + { + "epoch": 0.36273320729140657, + "grad_norm": 0.2825428545475006, + "learning_rate": 4.020199933235657e-05, + "loss": 0.1931, + "step": 20337 + }, + { + "epoch": 0.36275104341312026, + "grad_norm": 0.3282732665538788, + "learning_rate": 4.020076362922629e-05, + "loss": 0.1224, + "step": 20338 + }, + { + "epoch": 0.36276887953483394, + "grad_norm": 0.21660448610782623, + "learning_rate": 4.019952786717259e-05, + "loss": 0.1391, + "step": 20339 + }, + { + "epoch": 0.36278671565654763, + "grad_norm": 0.25640738010406494, + "learning_rate": 4.019829204620027e-05, + "loss": 0.1953, + "step": 20340 + }, + { + "epoch": 0.3628045517782613, + "grad_norm": 0.2200232595205307, + "learning_rate": 4.019705616631413e-05, + "loss": 0.1011, + "step": 20341 + }, + { + "epoch": 0.362822387899975, + "grad_norm": 0.3159371614456177, + "learning_rate": 4.0195820227518945e-05, + "loss": 0.1801, + "step": 20342 + }, + { + "epoch": 0.36284022402168875, + "grad_norm": 0.21758824586868286, + "learning_rate": 4.019458422981951e-05, + "loss": 0.1646, + "step": 20343 + }, + { + "epoch": 0.36285806014340244, + "grad_norm": 0.3435939848423004, + "learning_rate": 4.019334817322062e-05, + "loss": 0.1404, + "step": 20344 + }, + { + "epoch": 0.3628758962651161, + "grad_norm": 0.35938042402267456, + "learning_rate": 4.019211205772707e-05, + "loss": 0.1781, + "step": 20345 + }, + { + "epoch": 0.3628937323868298, + "grad_norm": 0.3149055540561676, + "learning_rate": 4.019087588334364e-05, + "loss": 0.1981, + "step": 20346 + }, + { + "epoch": 0.3629115685085435, + "grad_norm": 0.31614789366722107, + "learning_rate": 4.0189639650075126e-05, + "loss": 0.2255, + "step": 20347 + }, + { + "epoch": 0.3629294046302572, + "grad_norm": 0.20064190030097961, + "learning_rate": 4.018840335792633e-05, + "loss": 0.1418, + "step": 20348 + }, + { + "epoch": 0.3629472407519709, + "grad_norm": 0.23846852779388428, + "learning_rate": 4.0187167006902035e-05, + "loss": 0.1162, + "step": 20349 + }, + { + "epoch": 0.36296507687368457, + "grad_norm": 0.3052425682544708, + "learning_rate": 4.018593059700703e-05, + "loss": 0.1585, + "step": 20350 + }, + { + "epoch": 0.36298291299539825, + "grad_norm": 0.24612314999103546, + "learning_rate": 4.018469412824611e-05, + "loss": 0.1711, + "step": 20351 + }, + { + "epoch": 0.363000749117112, + "grad_norm": 0.36478111147880554, + "learning_rate": 4.0183457600624085e-05, + "loss": 0.1523, + "step": 20352 + }, + { + "epoch": 0.3630185852388257, + "grad_norm": 0.33922725915908813, + "learning_rate": 4.018222101414573e-05, + "loss": 0.1839, + "step": 20353 + }, + { + "epoch": 0.3630364213605394, + "grad_norm": 0.22540529072284698, + "learning_rate": 4.0180984368815835e-05, + "loss": 0.1638, + "step": 20354 + }, + { + "epoch": 0.36305425748225306, + "grad_norm": 0.35284557938575745, + "learning_rate": 4.017974766463921e-05, + "loss": 0.2088, + "step": 20355 + }, + { + "epoch": 0.36307209360396675, + "grad_norm": 0.25336530804634094, + "learning_rate": 4.017851090162064e-05, + "loss": 0.1721, + "step": 20356 + }, + { + "epoch": 0.36308992972568044, + "grad_norm": 0.24890448153018951, + "learning_rate": 4.0177274079764904e-05, + "loss": 0.1728, + "step": 20357 + }, + { + "epoch": 0.3631077658473941, + "grad_norm": 0.26287776231765747, + "learning_rate": 4.017603719907683e-05, + "loss": 0.2118, + "step": 20358 + }, + { + "epoch": 0.3631256019691078, + "grad_norm": 0.31241151690483093, + "learning_rate": 4.0174800259561185e-05, + "loss": 0.1209, + "step": 20359 + }, + { + "epoch": 0.36314343809082156, + "grad_norm": 0.35665857791900635, + "learning_rate": 4.017356326122277e-05, + "loss": 0.1798, + "step": 20360 + }, + { + "epoch": 0.36316127421253525, + "grad_norm": 0.24914267659187317, + "learning_rate": 4.017232620406639e-05, + "loss": 0.1464, + "step": 20361 + }, + { + "epoch": 0.36317911033424893, + "grad_norm": 0.2625211179256439, + "learning_rate": 4.017108908809683e-05, + "loss": 0.1495, + "step": 20362 + }, + { + "epoch": 0.3631969464559626, + "grad_norm": 0.3509328067302704, + "learning_rate": 4.016985191331889e-05, + "loss": 0.1097, + "step": 20363 + }, + { + "epoch": 0.3632147825776763, + "grad_norm": 0.357641339302063, + "learning_rate": 4.0168614679737366e-05, + "loss": 0.1696, + "step": 20364 + }, + { + "epoch": 0.36323261869939, + "grad_norm": 0.2892305850982666, + "learning_rate": 4.016737738735705e-05, + "loss": 0.1297, + "step": 20365 + }, + { + "epoch": 0.3632504548211037, + "grad_norm": 0.31118863821029663, + "learning_rate": 4.0166140036182745e-05, + "loss": 0.1358, + "step": 20366 + }, + { + "epoch": 0.3632682909428174, + "grad_norm": 0.29650670289993286, + "learning_rate": 4.0164902626219235e-05, + "loss": 0.2323, + "step": 20367 + }, + { + "epoch": 0.36328612706453106, + "grad_norm": 0.3526393473148346, + "learning_rate": 4.0163665157471333e-05, + "loss": 0.1311, + "step": 20368 + }, + { + "epoch": 0.3633039631862448, + "grad_norm": 0.3056434094905853, + "learning_rate": 4.0162427629943825e-05, + "loss": 0.1389, + "step": 20369 + }, + { + "epoch": 0.3633217993079585, + "grad_norm": 0.26603442430496216, + "learning_rate": 4.0161190043641506e-05, + "loss": 0.1858, + "step": 20370 + }, + { + "epoch": 0.3633396354296722, + "grad_norm": 0.2744593322277069, + "learning_rate": 4.0159952398569175e-05, + "loss": 0.1727, + "step": 20371 + }, + { + "epoch": 0.36335747155138587, + "grad_norm": 0.2747959792613983, + "learning_rate": 4.0158714694731636e-05, + "loss": 0.173, + "step": 20372 + }, + { + "epoch": 0.36337530767309956, + "grad_norm": 0.27869388461112976, + "learning_rate": 4.0157476932133694e-05, + "loss": 0.1481, + "step": 20373 + }, + { + "epoch": 0.36339314379481324, + "grad_norm": 0.30759885907173157, + "learning_rate": 4.0156239110780126e-05, + "loss": 0.1388, + "step": 20374 + }, + { + "epoch": 0.36341097991652693, + "grad_norm": 0.3121108412742615, + "learning_rate": 4.0155001230675735e-05, + "loss": 0.1392, + "step": 20375 + }, + { + "epoch": 0.3634288160382406, + "grad_norm": 0.38811028003692627, + "learning_rate": 4.0153763291825334e-05, + "loss": 0.1844, + "step": 20376 + }, + { + "epoch": 0.36344665215995436, + "grad_norm": 0.341267853975296, + "learning_rate": 4.0152525294233714e-05, + "loss": 0.1326, + "step": 20377 + }, + { + "epoch": 0.36346448828166805, + "grad_norm": 0.29311496019363403, + "learning_rate": 4.015128723790567e-05, + "loss": 0.1649, + "step": 20378 + }, + { + "epoch": 0.36348232440338174, + "grad_norm": 0.22935625910758972, + "learning_rate": 4.0150049122846e-05, + "loss": 0.1547, + "step": 20379 + }, + { + "epoch": 0.36350016052509543, + "grad_norm": 0.2277718335390091, + "learning_rate": 4.0148810949059514e-05, + "loss": 0.1768, + "step": 20380 + }, + { + "epoch": 0.3635179966468091, + "grad_norm": 0.2998082935810089, + "learning_rate": 4.0147572716550996e-05, + "loss": 0.1387, + "step": 20381 + }, + { + "epoch": 0.3635358327685228, + "grad_norm": 0.23907089233398438, + "learning_rate": 4.014633442532526e-05, + "loss": 0.1668, + "step": 20382 + }, + { + "epoch": 0.3635536688902365, + "grad_norm": 0.38662102818489075, + "learning_rate": 4.01450960753871e-05, + "loss": 0.238, + "step": 20383 + }, + { + "epoch": 0.3635715050119502, + "grad_norm": 0.19294650852680206, + "learning_rate": 4.0143857666741316e-05, + "loss": 0.1221, + "step": 20384 + }, + { + "epoch": 0.3635893411336639, + "grad_norm": 0.2246742993593216, + "learning_rate": 4.0142619199392704e-05, + "loss": 0.1269, + "step": 20385 + }, + { + "epoch": 0.3636071772553776, + "grad_norm": 0.2402191460132599, + "learning_rate": 4.014138067334608e-05, + "loss": 0.2034, + "step": 20386 + }, + { + "epoch": 0.3636250133770913, + "grad_norm": 0.195674329996109, + "learning_rate": 4.0140142088606226e-05, + "loss": 0.1176, + "step": 20387 + }, + { + "epoch": 0.363642849498805, + "grad_norm": 0.288394570350647, + "learning_rate": 4.0138903445177957e-05, + "loss": 0.1603, + "step": 20388 + }, + { + "epoch": 0.3636606856205187, + "grad_norm": 0.2395041286945343, + "learning_rate": 4.013766474306606e-05, + "loss": 0.1783, + "step": 20389 + }, + { + "epoch": 0.36367852174223236, + "grad_norm": 0.32888275384902954, + "learning_rate": 4.013642598227536e-05, + "loss": 0.1456, + "step": 20390 + }, + { + "epoch": 0.36369635786394605, + "grad_norm": 0.2046915739774704, + "learning_rate": 4.013518716281064e-05, + "loss": 0.1478, + "step": 20391 + }, + { + "epoch": 0.36371419398565974, + "grad_norm": 0.30634281039237976, + "learning_rate": 4.0133948284676705e-05, + "loss": 0.1485, + "step": 20392 + }, + { + "epoch": 0.3637320301073734, + "grad_norm": 0.2923586070537567, + "learning_rate": 4.0132709347878363e-05, + "loss": 0.1428, + "step": 20393 + }, + { + "epoch": 0.36374986622908717, + "grad_norm": 0.34686318039894104, + "learning_rate": 4.013147035242041e-05, + "loss": 0.1343, + "step": 20394 + }, + { + "epoch": 0.36376770235080086, + "grad_norm": 0.26981616020202637, + "learning_rate": 4.013023129830765e-05, + "loss": 0.1499, + "step": 20395 + }, + { + "epoch": 0.36378553847251455, + "grad_norm": 0.24971304833889008, + "learning_rate": 4.012899218554489e-05, + "loss": 0.1444, + "step": 20396 + }, + { + "epoch": 0.36380337459422823, + "grad_norm": 0.27691808342933655, + "learning_rate": 4.012775301413693e-05, + "loss": 0.1365, + "step": 20397 + }, + { + "epoch": 0.3638212107159419, + "grad_norm": 0.20154893398284912, + "learning_rate": 4.012651378408857e-05, + "loss": 0.1261, + "step": 20398 + }, + { + "epoch": 0.3638390468376556, + "grad_norm": 0.2605063319206238, + "learning_rate": 4.012527449540463e-05, + "loss": 0.1802, + "step": 20399 + }, + { + "epoch": 0.3638568829593693, + "grad_norm": 0.21625597774982452, + "learning_rate": 4.012403514808989e-05, + "loss": 0.1858, + "step": 20400 + }, + { + "epoch": 0.363874719081083, + "grad_norm": 0.21393819153308868, + "learning_rate": 4.0122795742149175e-05, + "loss": 0.1864, + "step": 20401 + }, + { + "epoch": 0.36389255520279673, + "grad_norm": 0.3559608459472656, + "learning_rate": 4.012155627758727e-05, + "loss": 0.1713, + "step": 20402 + }, + { + "epoch": 0.3639103913245104, + "grad_norm": 0.21698346734046936, + "learning_rate": 4.0120316754409e-05, + "loss": 0.1382, + "step": 20403 + }, + { + "epoch": 0.3639282274462241, + "grad_norm": 0.304470032453537, + "learning_rate": 4.011907717261916e-05, + "loss": 0.1355, + "step": 20404 + }, + { + "epoch": 0.3639460635679378, + "grad_norm": 0.24674120545387268, + "learning_rate": 4.0117837532222546e-05, + "loss": 0.2016, + "step": 20405 + }, + { + "epoch": 0.3639638996896515, + "grad_norm": 0.24448737502098083, + "learning_rate": 4.011659783322398e-05, + "loss": 0.1609, + "step": 20406 + }, + { + "epoch": 0.36398173581136517, + "grad_norm": 0.3851648271083832, + "learning_rate": 4.011535807562825e-05, + "loss": 0.155, + "step": 20407 + }, + { + "epoch": 0.36399957193307886, + "grad_norm": 0.23602813482284546, + "learning_rate": 4.011411825944018e-05, + "loss": 0.1311, + "step": 20408 + }, + { + "epoch": 0.36401740805479255, + "grad_norm": 0.2662386894226074, + "learning_rate": 4.011287838466456e-05, + "loss": 0.1647, + "step": 20409 + }, + { + "epoch": 0.36403524417650623, + "grad_norm": 0.3345221281051636, + "learning_rate": 4.011163845130622e-05, + "loss": 0.1744, + "step": 20410 + }, + { + "epoch": 0.36405308029822, + "grad_norm": 0.4055943787097931, + "learning_rate": 4.011039845936994e-05, + "loss": 0.1456, + "step": 20411 + }, + { + "epoch": 0.36407091641993367, + "grad_norm": 0.3128933906555176, + "learning_rate": 4.010915840886054e-05, + "loss": 0.1529, + "step": 20412 + }, + { + "epoch": 0.36408875254164735, + "grad_norm": 0.2648877203464508, + "learning_rate": 4.010791829978281e-05, + "loss": 0.1633, + "step": 20413 + }, + { + "epoch": 0.36410658866336104, + "grad_norm": 0.2558538019657135, + "learning_rate": 4.0106678132141585e-05, + "loss": 0.1817, + "step": 20414 + }, + { + "epoch": 0.36412442478507473, + "grad_norm": 0.30200332403182983, + "learning_rate": 4.010543790594165e-05, + "loss": 0.1424, + "step": 20415 + }, + { + "epoch": 0.3641422609067884, + "grad_norm": 0.3777584731578827, + "learning_rate": 4.010419762118782e-05, + "loss": 0.1664, + "step": 20416 + }, + { + "epoch": 0.3641600970285021, + "grad_norm": 0.27539873123168945, + "learning_rate": 4.0102957277884914e-05, + "loss": 0.1584, + "step": 20417 + }, + { + "epoch": 0.3641779331502158, + "grad_norm": 0.24165771901607513, + "learning_rate": 4.010171687603772e-05, + "loss": 0.144, + "step": 20418 + }, + { + "epoch": 0.36419576927192954, + "grad_norm": 0.24389661848545074, + "learning_rate": 4.0100476415651055e-05, + "loss": 0.174, + "step": 20419 + }, + { + "epoch": 0.3642136053936432, + "grad_norm": 0.27097684144973755, + "learning_rate": 4.0099235896729725e-05, + "loss": 0.1907, + "step": 20420 + }, + { + "epoch": 0.3642314415153569, + "grad_norm": 0.24715308845043182, + "learning_rate": 4.0097995319278554e-05, + "loss": 0.1253, + "step": 20421 + }, + { + "epoch": 0.3642492776370706, + "grad_norm": 0.2176176756620407, + "learning_rate": 4.009675468330233e-05, + "loss": 0.1783, + "step": 20422 + }, + { + "epoch": 0.3642671137587843, + "grad_norm": 0.29457953572273254, + "learning_rate": 4.0095513988805864e-05, + "loss": 0.2018, + "step": 20423 + }, + { + "epoch": 0.364284949880498, + "grad_norm": 0.26480454206466675, + "learning_rate": 4.009427323579398e-05, + "loss": 0.131, + "step": 20424 + }, + { + "epoch": 0.36430278600221166, + "grad_norm": 0.3660164773464203, + "learning_rate": 4.009303242427148e-05, + "loss": 0.165, + "step": 20425 + }, + { + "epoch": 0.36432062212392535, + "grad_norm": 0.1991516649723053, + "learning_rate": 4.009179155424317e-05, + "loss": 0.1527, + "step": 20426 + }, + { + "epoch": 0.36433845824563904, + "grad_norm": 0.221453458070755, + "learning_rate": 4.009055062571387e-05, + "loss": 0.1286, + "step": 20427 + }, + { + "epoch": 0.3643562943673528, + "grad_norm": 0.23807577788829803, + "learning_rate": 4.0089309638688376e-05, + "loss": 0.1592, + "step": 20428 + }, + { + "epoch": 0.3643741304890665, + "grad_norm": 0.3022037744522095, + "learning_rate": 4.0088068593171514e-05, + "loss": 0.1593, + "step": 20429 + }, + { + "epoch": 0.36439196661078016, + "grad_norm": 0.33922526240348816, + "learning_rate": 4.008682748916809e-05, + "loss": 0.1779, + "step": 20430 + }, + { + "epoch": 0.36440980273249385, + "grad_norm": 0.1978517770767212, + "learning_rate": 4.00855863266829e-05, + "loss": 0.1254, + "step": 20431 + }, + { + "epoch": 0.36442763885420754, + "grad_norm": 0.24193094670772552, + "learning_rate": 4.008434510572077e-05, + "loss": 0.1537, + "step": 20432 + }, + { + "epoch": 0.3644454749759212, + "grad_norm": 0.22125987708568573, + "learning_rate": 4.0083103826286506e-05, + "loss": 0.1665, + "step": 20433 + }, + { + "epoch": 0.3644633110976349, + "grad_norm": 0.2807343304157257, + "learning_rate": 4.008186248838493e-05, + "loss": 0.1668, + "step": 20434 + }, + { + "epoch": 0.3644811472193486, + "grad_norm": 0.23143427073955536, + "learning_rate": 4.008062109202084e-05, + "loss": 0.152, + "step": 20435 + }, + { + "epoch": 0.36449898334106234, + "grad_norm": 0.23137961328029633, + "learning_rate": 4.007937963719906e-05, + "loss": 0.1703, + "step": 20436 + }, + { + "epoch": 0.36451681946277603, + "grad_norm": 0.3320882022380829, + "learning_rate": 4.0078138123924385e-05, + "loss": 0.1682, + "step": 20437 + }, + { + "epoch": 0.3645346555844897, + "grad_norm": 0.2713082730770111, + "learning_rate": 4.007689655220165e-05, + "loss": 0.1856, + "step": 20438 + }, + { + "epoch": 0.3645524917062034, + "grad_norm": 0.31148722767829895, + "learning_rate": 4.007565492203565e-05, + "loss": 0.2232, + "step": 20439 + }, + { + "epoch": 0.3645703278279171, + "grad_norm": 0.2416645735502243, + "learning_rate": 4.007441323343121e-05, + "loss": 0.1566, + "step": 20440 + }, + { + "epoch": 0.3645881639496308, + "grad_norm": 0.26091521978378296, + "learning_rate": 4.007317148639313e-05, + "loss": 0.1528, + "step": 20441 + }, + { + "epoch": 0.36460600007134447, + "grad_norm": 0.5213764309883118, + "learning_rate": 4.007192968092623e-05, + "loss": 0.1449, + "step": 20442 + }, + { + "epoch": 0.36462383619305816, + "grad_norm": 0.31789395213127136, + "learning_rate": 4.0070687817035337e-05, + "loss": 0.1753, + "step": 20443 + }, + { + "epoch": 0.3646416723147719, + "grad_norm": 0.5056061148643494, + "learning_rate": 4.006944589472524e-05, + "loss": 0.1754, + "step": 20444 + }, + { + "epoch": 0.3646595084364856, + "grad_norm": 0.2900460660457611, + "learning_rate": 4.006820391400078e-05, + "loss": 0.1443, + "step": 20445 + }, + { + "epoch": 0.3646773445581993, + "grad_norm": 0.34964457154273987, + "learning_rate": 4.006696187486675e-05, + "loss": 0.2074, + "step": 20446 + }, + { + "epoch": 0.36469518067991297, + "grad_norm": 0.32497259974479675, + "learning_rate": 4.006571977732797e-05, + "loss": 0.173, + "step": 20447 + }, + { + "epoch": 0.36471301680162665, + "grad_norm": 0.3151305615901947, + "learning_rate": 4.006447762138926e-05, + "loss": 0.1569, + "step": 20448 + }, + { + "epoch": 0.36473085292334034, + "grad_norm": 0.23915310204029083, + "learning_rate": 4.0063235407055434e-05, + "loss": 0.184, + "step": 20449 + }, + { + "epoch": 0.36474868904505403, + "grad_norm": 0.29487887024879456, + "learning_rate": 4.00619931343313e-05, + "loss": 0.147, + "step": 20450 + }, + { + "epoch": 0.3647665251667677, + "grad_norm": 0.18383778631687164, + "learning_rate": 4.006075080322168e-05, + "loss": 0.1426, + "step": 20451 + }, + { + "epoch": 0.3647843612884814, + "grad_norm": 0.4039187431335449, + "learning_rate": 4.0059508413731387e-05, + "loss": 0.1412, + "step": 20452 + }, + { + "epoch": 0.36480219741019515, + "grad_norm": 0.3977547287940979, + "learning_rate": 4.005826596586523e-05, + "loss": 0.2561, + "step": 20453 + }, + { + "epoch": 0.36482003353190884, + "grad_norm": 0.2797873914241791, + "learning_rate": 4.005702345962804e-05, + "loss": 0.1562, + "step": 20454 + }, + { + "epoch": 0.3648378696536225, + "grad_norm": 0.28851714730262756, + "learning_rate": 4.005578089502463e-05, + "loss": 0.125, + "step": 20455 + }, + { + "epoch": 0.3648557057753362, + "grad_norm": 0.27626490592956543, + "learning_rate": 4.005453827205981e-05, + "loss": 0.1477, + "step": 20456 + }, + { + "epoch": 0.3648735418970499, + "grad_norm": 0.22205011546611786, + "learning_rate": 4.005329559073841e-05, + "loss": 0.1648, + "step": 20457 + }, + { + "epoch": 0.3648913780187636, + "grad_norm": 0.1904202550649643, + "learning_rate": 4.005205285106522e-05, + "loss": 0.1459, + "step": 20458 + }, + { + "epoch": 0.3649092141404773, + "grad_norm": 0.2292073518037796, + "learning_rate": 4.0050810053045086e-05, + "loss": 0.1682, + "step": 20459 + }, + { + "epoch": 0.36492705026219097, + "grad_norm": 0.29050785303115845, + "learning_rate": 4.004956719668281e-05, + "loss": 0.1577, + "step": 20460 + }, + { + "epoch": 0.3649448863839047, + "grad_norm": 0.22272981703281403, + "learning_rate": 4.004832428198321e-05, + "loss": 0.1855, + "step": 20461 + }, + { + "epoch": 0.3649627225056184, + "grad_norm": 0.2937490940093994, + "learning_rate": 4.004708130895111e-05, + "loss": 0.1733, + "step": 20462 + }, + { + "epoch": 0.3649805586273321, + "grad_norm": 0.3048675060272217, + "learning_rate": 4.004583827759133e-05, + "loss": 0.1445, + "step": 20463 + }, + { + "epoch": 0.3649983947490458, + "grad_norm": 0.24152550101280212, + "learning_rate": 4.004459518790868e-05, + "loss": 0.1408, + "step": 20464 + }, + { + "epoch": 0.36501623087075946, + "grad_norm": 0.3479171693325043, + "learning_rate": 4.004335203990798e-05, + "loss": 0.1519, + "step": 20465 + }, + { + "epoch": 0.36503406699247315, + "grad_norm": 0.26340603828430176, + "learning_rate": 4.004210883359406e-05, + "loss": 0.2036, + "step": 20466 + }, + { + "epoch": 0.36505190311418684, + "grad_norm": 0.2923718988895416, + "learning_rate": 4.0040865568971725e-05, + "loss": 0.1447, + "step": 20467 + }, + { + "epoch": 0.3650697392359005, + "grad_norm": 0.32460954785346985, + "learning_rate": 4.003962224604581e-05, + "loss": 0.1481, + "step": 20468 + }, + { + "epoch": 0.3650875753576142, + "grad_norm": 0.22123803198337555, + "learning_rate": 4.0038378864821106e-05, + "loss": 0.1284, + "step": 20469 + }, + { + "epoch": 0.36510541147932796, + "grad_norm": 0.3040921986103058, + "learning_rate": 4.0037135425302465e-05, + "loss": 0.18, + "step": 20470 + }, + { + "epoch": 0.36512324760104165, + "grad_norm": 0.2259899377822876, + "learning_rate": 4.003589192749469e-05, + "loss": 0.0989, + "step": 20471 + }, + { + "epoch": 0.36514108372275533, + "grad_norm": 0.4131251871585846, + "learning_rate": 4.0034648371402605e-05, + "loss": 0.1749, + "step": 20472 + }, + { + "epoch": 0.365158919844469, + "grad_norm": 0.19247211515903473, + "learning_rate": 4.0033404757031034e-05, + "loss": 0.1363, + "step": 20473 + }, + { + "epoch": 0.3651767559661827, + "grad_norm": 0.21143482625484467, + "learning_rate": 4.003216108438478e-05, + "loss": 0.1368, + "step": 20474 + }, + { + "epoch": 0.3651945920878964, + "grad_norm": 0.24007849395275116, + "learning_rate": 4.003091735346869e-05, + "loss": 0.1372, + "step": 20475 + }, + { + "epoch": 0.3652124282096101, + "grad_norm": 0.2866557538509369, + "learning_rate": 4.0029673564287576e-05, + "loss": 0.116, + "step": 20476 + }, + { + "epoch": 0.3652302643313238, + "grad_norm": 0.3185955286026001, + "learning_rate": 4.002842971684625e-05, + "loss": 0.1878, + "step": 20477 + }, + { + "epoch": 0.3652481004530375, + "grad_norm": 0.29474368691444397, + "learning_rate": 4.0027185811149536e-05, + "loss": 0.2068, + "step": 20478 + }, + { + "epoch": 0.3652659365747512, + "grad_norm": 0.2426108866930008, + "learning_rate": 4.0025941847202264e-05, + "loss": 0.1836, + "step": 20479 + }, + { + "epoch": 0.3652837726964649, + "grad_norm": 0.28309541940689087, + "learning_rate": 4.002469782500925e-05, + "loss": 0.2011, + "step": 20480 + }, + { + "epoch": 0.3653016088181786, + "grad_norm": 0.24353143572807312, + "learning_rate": 4.0023453744575326e-05, + "loss": 0.1866, + "step": 20481 + }, + { + "epoch": 0.36531944493989227, + "grad_norm": 0.318086177110672, + "learning_rate": 4.002220960590529e-05, + "loss": 0.1218, + "step": 20482 + }, + { + "epoch": 0.36533728106160596, + "grad_norm": 0.2994476854801178, + "learning_rate": 4.002096540900399e-05, + "loss": 0.2045, + "step": 20483 + }, + { + "epoch": 0.36535511718331964, + "grad_norm": 0.25481754541397095, + "learning_rate": 4.0019721153876244e-05, + "loss": 0.1588, + "step": 20484 + }, + { + "epoch": 0.36537295330503333, + "grad_norm": 0.335957407951355, + "learning_rate": 4.001847684052687e-05, + "loss": 0.1467, + "step": 20485 + }, + { + "epoch": 0.3653907894267471, + "grad_norm": 0.29596617817878723, + "learning_rate": 4.0017232468960694e-05, + "loss": 0.188, + "step": 20486 + }, + { + "epoch": 0.36540862554846076, + "grad_norm": 0.3041633367538452, + "learning_rate": 4.001598803918253e-05, + "loss": 0.1831, + "step": 20487 + }, + { + "epoch": 0.36542646167017445, + "grad_norm": 0.3252646327018738, + "learning_rate": 4.001474355119722e-05, + "loss": 0.1647, + "step": 20488 + }, + { + "epoch": 0.36544429779188814, + "grad_norm": 0.253260999917984, + "learning_rate": 4.0013499005009566e-05, + "loss": 0.1099, + "step": 20489 + }, + { + "epoch": 0.3654621339136018, + "grad_norm": 0.29536011815071106, + "learning_rate": 4.0012254400624416e-05, + "loss": 0.1472, + "step": 20490 + }, + { + "epoch": 0.3654799700353155, + "grad_norm": 0.24268674850463867, + "learning_rate": 4.001100973804657e-05, + "loss": 0.1572, + "step": 20491 + }, + { + "epoch": 0.3654978061570292, + "grad_norm": 0.25012004375457764, + "learning_rate": 4.000976501728088e-05, + "loss": 0.159, + "step": 20492 + }, + { + "epoch": 0.3655156422787429, + "grad_norm": 0.30402180552482605, + "learning_rate": 4.000852023833215e-05, + "loss": 0.1634, + "step": 20493 + }, + { + "epoch": 0.3655334784004566, + "grad_norm": 0.27871835231781006, + "learning_rate": 4.0007275401205216e-05, + "loss": 0.1479, + "step": 20494 + }, + { + "epoch": 0.3655513145221703, + "grad_norm": 0.25464093685150146, + "learning_rate": 4.000603050590489e-05, + "loss": 0.2031, + "step": 20495 + }, + { + "epoch": 0.365569150643884, + "grad_norm": 0.1704392433166504, + "learning_rate": 4.0004785552436005e-05, + "loss": 0.1398, + "step": 20496 + }, + { + "epoch": 0.3655869867655977, + "grad_norm": 0.389527827501297, + "learning_rate": 4.0003540540803405e-05, + "loss": 0.2127, + "step": 20497 + }, + { + "epoch": 0.3656048228873114, + "grad_norm": 0.23409676551818848, + "learning_rate": 4.000229547101189e-05, + "loss": 0.1856, + "step": 20498 + }, + { + "epoch": 0.3656226590090251, + "grad_norm": 0.24304834008216858, + "learning_rate": 4.0001050343066296e-05, + "loss": 0.1626, + "step": 20499 + }, + { + "epoch": 0.36564049513073876, + "grad_norm": 0.31907039880752563, + "learning_rate": 3.999980515697145e-05, + "loss": 0.1489, + "step": 20500 + }, + { + "epoch": 0.36565833125245245, + "grad_norm": 0.2660103142261505, + "learning_rate": 3.999855991273218e-05, + "loss": 0.1725, + "step": 20501 + }, + { + "epoch": 0.36567616737416614, + "grad_norm": 0.3440670669078827, + "learning_rate": 3.99973146103533e-05, + "loss": 0.1603, + "step": 20502 + }, + { + "epoch": 0.3656940034958799, + "grad_norm": 0.2671903371810913, + "learning_rate": 3.999606924983966e-05, + "loss": 0.1788, + "step": 20503 + }, + { + "epoch": 0.36571183961759357, + "grad_norm": 0.2599971294403076, + "learning_rate": 3.9994823831196075e-05, + "loss": 0.1896, + "step": 20504 + }, + { + "epoch": 0.36572967573930726, + "grad_norm": 0.24406467378139496, + "learning_rate": 3.999357835442737e-05, + "loss": 0.1166, + "step": 20505 + }, + { + "epoch": 0.36574751186102095, + "grad_norm": 0.2240985482931137, + "learning_rate": 3.999233281953839e-05, + "loss": 0.1741, + "step": 20506 + }, + { + "epoch": 0.36576534798273463, + "grad_norm": 0.2051028162240982, + "learning_rate": 3.9991087226533936e-05, + "loss": 0.1579, + "step": 20507 + }, + { + "epoch": 0.3657831841044483, + "grad_norm": 0.29228609800338745, + "learning_rate": 3.998984157541885e-05, + "loss": 0.1165, + "step": 20508 + }, + { + "epoch": 0.365801020226162, + "grad_norm": 0.24455542862415314, + "learning_rate": 3.998859586619797e-05, + "loss": 0.1324, + "step": 20509 + }, + { + "epoch": 0.3658188563478757, + "grad_norm": 0.4160729944705963, + "learning_rate": 3.998735009887611e-05, + "loss": 0.2102, + "step": 20510 + }, + { + "epoch": 0.3658366924695894, + "grad_norm": 0.2766497731208801, + "learning_rate": 3.99861042734581e-05, + "loss": 0.173, + "step": 20511 + }, + { + "epoch": 0.36585452859130313, + "grad_norm": 0.24490101635456085, + "learning_rate": 3.9984858389948784e-05, + "loss": 0.1597, + "step": 20512 + }, + { + "epoch": 0.3658723647130168, + "grad_norm": 0.19108742475509644, + "learning_rate": 3.998361244835298e-05, + "loss": 0.126, + "step": 20513 + }, + { + "epoch": 0.3658902008347305, + "grad_norm": 0.25275447964668274, + "learning_rate": 3.998236644867551e-05, + "loss": 0.1161, + "step": 20514 + }, + { + "epoch": 0.3659080369564442, + "grad_norm": 0.2700955271720886, + "learning_rate": 3.998112039092122e-05, + "loss": 0.1323, + "step": 20515 + }, + { + "epoch": 0.3659258730781579, + "grad_norm": 0.33192551136016846, + "learning_rate": 3.997987427509493e-05, + "loss": 0.1688, + "step": 20516 + }, + { + "epoch": 0.36594370919987157, + "grad_norm": 0.24112530052661896, + "learning_rate": 3.997862810120148e-05, + "loss": 0.1891, + "step": 20517 + }, + { + "epoch": 0.36596154532158526, + "grad_norm": 0.3015174865722656, + "learning_rate": 3.9977381869245684e-05, + "loss": 0.2004, + "step": 20518 + }, + { + "epoch": 0.36597938144329895, + "grad_norm": 0.2796415686607361, + "learning_rate": 3.997613557923239e-05, + "loss": 0.1413, + "step": 20519 + }, + { + "epoch": 0.3659972175650127, + "grad_norm": 0.2002744823694229, + "learning_rate": 3.997488923116641e-05, + "loss": 0.1409, + "step": 20520 + }, + { + "epoch": 0.3660150536867264, + "grad_norm": 0.1873876005411148, + "learning_rate": 3.99736428250526e-05, + "loss": 0.1158, + "step": 20521 + }, + { + "epoch": 0.36603288980844007, + "grad_norm": 0.23740717768669128, + "learning_rate": 3.997239636089578e-05, + "loss": 0.1765, + "step": 20522 + }, + { + "epoch": 0.36605072593015375, + "grad_norm": 0.42529770731925964, + "learning_rate": 3.9971149838700774e-05, + "loss": 0.1524, + "step": 20523 + }, + { + "epoch": 0.36606856205186744, + "grad_norm": 0.2501377463340759, + "learning_rate": 3.9969903258472415e-05, + "loss": 0.1734, + "step": 20524 + }, + { + "epoch": 0.36608639817358113, + "grad_norm": 0.24219156801700592, + "learning_rate": 3.996865662021556e-05, + "loss": 0.187, + "step": 20525 + }, + { + "epoch": 0.3661042342952948, + "grad_norm": 0.3499244749546051, + "learning_rate": 3.9967409923935e-05, + "loss": 0.1378, + "step": 20526 + }, + { + "epoch": 0.3661220704170085, + "grad_norm": 0.31709209084510803, + "learning_rate": 3.99661631696356e-05, + "loss": 0.2221, + "step": 20527 + }, + { + "epoch": 0.3661399065387222, + "grad_norm": 0.2493421882390976, + "learning_rate": 3.996491635732218e-05, + "loss": 0.208, + "step": 20528 + }, + { + "epoch": 0.36615774266043594, + "grad_norm": 0.40529438853263855, + "learning_rate": 3.996366948699958e-05, + "loss": 0.2433, + "step": 20529 + }, + { + "epoch": 0.3661755787821496, + "grad_norm": 0.2923068404197693, + "learning_rate": 3.9962422558672624e-05, + "loss": 0.1478, + "step": 20530 + }, + { + "epoch": 0.3661934149038633, + "grad_norm": 0.23993302881717682, + "learning_rate": 3.996117557234616e-05, + "loss": 0.194, + "step": 20531 + }, + { + "epoch": 0.366211251025577, + "grad_norm": 0.24556699395179749, + "learning_rate": 3.995992852802499e-05, + "loss": 0.1589, + "step": 20532 + }, + { + "epoch": 0.3662290871472907, + "grad_norm": 0.37080827355384827, + "learning_rate": 3.995868142571399e-05, + "loss": 0.1466, + "step": 20533 + }, + { + "epoch": 0.3662469232690044, + "grad_norm": 0.24904875457286835, + "learning_rate": 3.995743426541797e-05, + "loss": 0.1968, + "step": 20534 + }, + { + "epoch": 0.36626475939071806, + "grad_norm": 0.31126147508621216, + "learning_rate": 3.995618704714177e-05, + "loss": 0.1999, + "step": 20535 + }, + { + "epoch": 0.36628259551243175, + "grad_norm": 0.2658511996269226, + "learning_rate": 3.995493977089022e-05, + "loss": 0.1666, + "step": 20536 + }, + { + "epoch": 0.3663004316341455, + "grad_norm": 0.21540863811969757, + "learning_rate": 3.995369243666815e-05, + "loss": 0.1638, + "step": 20537 + }, + { + "epoch": 0.3663182677558592, + "grad_norm": 0.20054097473621368, + "learning_rate": 3.9952445044480414e-05, + "loss": 0.1718, + "step": 20538 + }, + { + "epoch": 0.36633610387757287, + "grad_norm": 0.21968497335910797, + "learning_rate": 3.995119759433184e-05, + "loss": 0.1107, + "step": 20539 + }, + { + "epoch": 0.36635393999928656, + "grad_norm": 0.2569504976272583, + "learning_rate": 3.994995008622725e-05, + "loss": 0.1751, + "step": 20540 + }, + { + "epoch": 0.36637177612100025, + "grad_norm": 0.27129343152046204, + "learning_rate": 3.9948702520171496e-05, + "loss": 0.1411, + "step": 20541 + }, + { + "epoch": 0.36638961224271394, + "grad_norm": 0.2291068583726883, + "learning_rate": 3.99474548961694e-05, + "loss": 0.1844, + "step": 20542 + }, + { + "epoch": 0.3664074483644276, + "grad_norm": 0.2738098204135895, + "learning_rate": 3.994620721422582e-05, + "loss": 0.1345, + "step": 20543 + }, + { + "epoch": 0.3664252844861413, + "grad_norm": 0.31332671642303467, + "learning_rate": 3.9944959474345565e-05, + "loss": 0.1685, + "step": 20544 + }, + { + "epoch": 0.36644312060785506, + "grad_norm": 0.4531061351299286, + "learning_rate": 3.994371167653349e-05, + "loss": 0.169, + "step": 20545 + }, + { + "epoch": 0.36646095672956874, + "grad_norm": 0.2348158061504364, + "learning_rate": 3.9942463820794426e-05, + "loss": 0.1287, + "step": 20546 + }, + { + "epoch": 0.36647879285128243, + "grad_norm": 0.23562182486057281, + "learning_rate": 3.994121590713322e-05, + "loss": 0.1225, + "step": 20547 + }, + { + "epoch": 0.3664966289729961, + "grad_norm": 0.2010362297296524, + "learning_rate": 3.993996793555469e-05, + "loss": 0.1835, + "step": 20548 + }, + { + "epoch": 0.3665144650947098, + "grad_norm": 0.3551895022392273, + "learning_rate": 3.993871990606369e-05, + "loss": 0.1431, + "step": 20549 + }, + { + "epoch": 0.3665323012164235, + "grad_norm": 0.22388547658920288, + "learning_rate": 3.993747181866505e-05, + "loss": 0.1433, + "step": 20550 + }, + { + "epoch": 0.3665501373381372, + "grad_norm": 0.1926109939813614, + "learning_rate": 3.9936223673363616e-05, + "loss": 0.1271, + "step": 20551 + }, + { + "epoch": 0.36656797345985087, + "grad_norm": 0.24497747421264648, + "learning_rate": 3.993497547016421e-05, + "loss": 0.1274, + "step": 20552 + }, + { + "epoch": 0.36658580958156456, + "grad_norm": 0.2648605704307556, + "learning_rate": 3.9933727209071686e-05, + "loss": 0.1663, + "step": 20553 + }, + { + "epoch": 0.3666036457032783, + "grad_norm": 0.1955610066652298, + "learning_rate": 3.9932478890090875e-05, + "loss": 0.1843, + "step": 20554 + }, + { + "epoch": 0.366621481824992, + "grad_norm": 0.32756561040878296, + "learning_rate": 3.9931230513226624e-05, + "loss": 0.1518, + "step": 20555 + }, + { + "epoch": 0.3666393179467057, + "grad_norm": 0.2855414152145386, + "learning_rate": 3.992998207848376e-05, + "loss": 0.1761, + "step": 20556 + }, + { + "epoch": 0.36665715406841937, + "grad_norm": 0.2493400275707245, + "learning_rate": 3.992873358586713e-05, + "loss": 0.166, + "step": 20557 + }, + { + "epoch": 0.36667499019013305, + "grad_norm": 0.3972211480140686, + "learning_rate": 3.9927485035381575e-05, + "loss": 0.1928, + "step": 20558 + }, + { + "epoch": 0.36669282631184674, + "grad_norm": 0.24886314570903778, + "learning_rate": 3.992623642703193e-05, + "loss": 0.1554, + "step": 20559 + }, + { + "epoch": 0.36671066243356043, + "grad_norm": 0.3397632837295532, + "learning_rate": 3.992498776082304e-05, + "loss": 0.213, + "step": 20560 + }, + { + "epoch": 0.3667284985552741, + "grad_norm": 0.18415671586990356, + "learning_rate": 3.9923739036759745e-05, + "loss": 0.1577, + "step": 20561 + }, + { + "epoch": 0.36674633467698786, + "grad_norm": 0.24915896356105804, + "learning_rate": 3.992249025484688e-05, + "loss": 0.1678, + "step": 20562 + }, + { + "epoch": 0.36676417079870155, + "grad_norm": 0.31253454089164734, + "learning_rate": 3.992124141508928e-05, + "loss": 0.133, + "step": 20563 + }, + { + "epoch": 0.36678200692041524, + "grad_norm": 0.32473352551460266, + "learning_rate": 3.9919992517491806e-05, + "loss": 0.1443, + "step": 20564 + }, + { + "epoch": 0.3667998430421289, + "grad_norm": 0.2487039566040039, + "learning_rate": 3.991874356205928e-05, + "loss": 0.1191, + "step": 20565 + }, + { + "epoch": 0.3668176791638426, + "grad_norm": 0.2990691661834717, + "learning_rate": 3.991749454879655e-05, + "loss": 0.1594, + "step": 20566 + }, + { + "epoch": 0.3668355152855563, + "grad_norm": 0.25821274518966675, + "learning_rate": 3.991624547770847e-05, + "loss": 0.1767, + "step": 20567 + }, + { + "epoch": 0.36685335140727, + "grad_norm": 0.28561002016067505, + "learning_rate": 3.991499634879987e-05, + "loss": 0.1738, + "step": 20568 + }, + { + "epoch": 0.3668711875289837, + "grad_norm": 0.32387417554855347, + "learning_rate": 3.991374716207558e-05, + "loss": 0.2243, + "step": 20569 + }, + { + "epoch": 0.36688902365069737, + "grad_norm": 0.2864395081996918, + "learning_rate": 3.991249791754046e-05, + "loss": 0.1338, + "step": 20570 + }, + { + "epoch": 0.3669068597724111, + "grad_norm": 0.2525719106197357, + "learning_rate": 3.991124861519935e-05, + "loss": 0.1545, + "step": 20571 + }, + { + "epoch": 0.3669246958941248, + "grad_norm": 0.2808087468147278, + "learning_rate": 3.990999925505709e-05, + "loss": 0.1805, + "step": 20572 + }, + { + "epoch": 0.3669425320158385, + "grad_norm": 0.24211646616458893, + "learning_rate": 3.990874983711852e-05, + "loss": 0.1378, + "step": 20573 + }, + { + "epoch": 0.3669603681375522, + "grad_norm": 0.24511969089508057, + "learning_rate": 3.9907500361388494e-05, + "loss": 0.1819, + "step": 20574 + }, + { + "epoch": 0.36697820425926586, + "grad_norm": 0.2839509844779968, + "learning_rate": 3.990625082787185e-05, + "loss": 0.128, + "step": 20575 + }, + { + "epoch": 0.36699604038097955, + "grad_norm": 0.26853278279304504, + "learning_rate": 3.9905001236573417e-05, + "loss": 0.1554, + "step": 20576 + }, + { + "epoch": 0.36701387650269324, + "grad_norm": 0.3065044581890106, + "learning_rate": 3.9903751587498056e-05, + "loss": 0.1486, + "step": 20577 + }, + { + "epoch": 0.3670317126244069, + "grad_norm": 0.29642218351364136, + "learning_rate": 3.99025018806506e-05, + "loss": 0.1197, + "step": 20578 + }, + { + "epoch": 0.36704954874612067, + "grad_norm": 0.3315908908843994, + "learning_rate": 3.9901252116035917e-05, + "loss": 0.2342, + "step": 20579 + }, + { + "epoch": 0.36706738486783436, + "grad_norm": 0.2164284884929657, + "learning_rate": 3.9900002293658814e-05, + "loss": 0.1592, + "step": 20580 + }, + { + "epoch": 0.36708522098954804, + "grad_norm": 0.23315350711345673, + "learning_rate": 3.989875241352417e-05, + "loss": 0.1002, + "step": 20581 + }, + { + "epoch": 0.36710305711126173, + "grad_norm": 0.2701401114463806, + "learning_rate": 3.9897502475636804e-05, + "loss": 0.1475, + "step": 20582 + }, + { + "epoch": 0.3671208932329754, + "grad_norm": 0.28790125250816345, + "learning_rate": 3.9896252480001586e-05, + "loss": 0.1451, + "step": 20583 + }, + { + "epoch": 0.3671387293546891, + "grad_norm": 0.25865453481674194, + "learning_rate": 3.989500242662334e-05, + "loss": 0.1527, + "step": 20584 + }, + { + "epoch": 0.3671565654764028, + "grad_norm": 0.23018702864646912, + "learning_rate": 3.989375231550693e-05, + "loss": 0.1458, + "step": 20585 + }, + { + "epoch": 0.3671744015981165, + "grad_norm": 0.25528964400291443, + "learning_rate": 3.989250214665717e-05, + "loss": 0.16, + "step": 20586 + }, + { + "epoch": 0.36719223771983023, + "grad_norm": 0.2600228190422058, + "learning_rate": 3.989125192007895e-05, + "loss": 0.1698, + "step": 20587 + }, + { + "epoch": 0.3672100738415439, + "grad_norm": 0.2692631185054779, + "learning_rate": 3.9890001635777084e-05, + "loss": 0.2139, + "step": 20588 + }, + { + "epoch": 0.3672279099632576, + "grad_norm": 0.2748851478099823, + "learning_rate": 3.988875129375643e-05, + "loss": 0.1304, + "step": 20589 + }, + { + "epoch": 0.3672457460849713, + "grad_norm": 0.2953891158103943, + "learning_rate": 3.9887500894021836e-05, + "loss": 0.1869, + "step": 20590 + }, + { + "epoch": 0.367263582206685, + "grad_norm": 0.33523455262184143, + "learning_rate": 3.988625043657814e-05, + "loss": 0.2501, + "step": 20591 + }, + { + "epoch": 0.36728141832839867, + "grad_norm": 0.24251173436641693, + "learning_rate": 3.988499992143021e-05, + "loss": 0.161, + "step": 20592 + }, + { + "epoch": 0.36729925445011236, + "grad_norm": 0.19889268279075623, + "learning_rate": 3.9883749348582863e-05, + "loss": 0.1592, + "step": 20593 + }, + { + "epoch": 0.36731709057182604, + "grad_norm": 0.22585001587867737, + "learning_rate": 3.9882498718040974e-05, + "loss": 0.1711, + "step": 20594 + }, + { + "epoch": 0.36733492669353973, + "grad_norm": 0.26180481910705566, + "learning_rate": 3.988124802980938e-05, + "loss": 0.1688, + "step": 20595 + }, + { + "epoch": 0.3673527628152535, + "grad_norm": 0.2743278443813324, + "learning_rate": 3.987999728389292e-05, + "loss": 0.1156, + "step": 20596 + }, + { + "epoch": 0.36737059893696716, + "grad_norm": 0.2452520728111267, + "learning_rate": 3.987874648029646e-05, + "loss": 0.1178, + "step": 20597 + }, + { + "epoch": 0.36738843505868085, + "grad_norm": 0.24680772423744202, + "learning_rate": 3.987749561902483e-05, + "loss": 0.1188, + "step": 20598 + }, + { + "epoch": 0.36740627118039454, + "grad_norm": 0.27082571387290955, + "learning_rate": 3.98762447000829e-05, + "loss": 0.1597, + "step": 20599 + }, + { + "epoch": 0.3674241073021082, + "grad_norm": 0.2516172528266907, + "learning_rate": 3.9874993723475493e-05, + "loss": 0.1784, + "step": 20600 + }, + { + "epoch": 0.3674419434238219, + "grad_norm": 0.2623244822025299, + "learning_rate": 3.9873742689207486e-05, + "loss": 0.1522, + "step": 20601 + }, + { + "epoch": 0.3674597795455356, + "grad_norm": 0.44623157382011414, + "learning_rate": 3.9872491597283714e-05, + "loss": 0.1731, + "step": 20602 + }, + { + "epoch": 0.3674776156672493, + "grad_norm": 0.2736106514930725, + "learning_rate": 3.9871240447709024e-05, + "loss": 0.1539, + "step": 20603 + }, + { + "epoch": 0.36749545178896303, + "grad_norm": 0.2611555755138397, + "learning_rate": 3.9869989240488266e-05, + "loss": 0.1309, + "step": 20604 + }, + { + "epoch": 0.3675132879106767, + "grad_norm": 0.19687646627426147, + "learning_rate": 3.9868737975626306e-05, + "loss": 0.1252, + "step": 20605 + }, + { + "epoch": 0.3675311240323904, + "grad_norm": 0.22891318798065186, + "learning_rate": 3.986748665312796e-05, + "loss": 0.1447, + "step": 20606 + }, + { + "epoch": 0.3675489601541041, + "grad_norm": 0.3005000948905945, + "learning_rate": 3.986623527299812e-05, + "loss": 0.132, + "step": 20607 + }, + { + "epoch": 0.3675667962758178, + "grad_norm": 0.20634649693965912, + "learning_rate": 3.986498383524161e-05, + "loss": 0.1379, + "step": 20608 + }, + { + "epoch": 0.3675846323975315, + "grad_norm": 0.26238688826560974, + "learning_rate": 3.986373233986329e-05, + "loss": 0.1763, + "step": 20609 + }, + { + "epoch": 0.36760246851924516, + "grad_norm": 0.23865243792533875, + "learning_rate": 3.9862480786868006e-05, + "loss": 0.1511, + "step": 20610 + }, + { + "epoch": 0.36762030464095885, + "grad_norm": 0.37497448921203613, + "learning_rate": 3.9861229176260614e-05, + "loss": 0.1733, + "step": 20611 + }, + { + "epoch": 0.36763814076267254, + "grad_norm": 0.23728413879871368, + "learning_rate": 3.9859977508045976e-05, + "loss": 0.1468, + "step": 20612 + }, + { + "epoch": 0.3676559768843863, + "grad_norm": 0.28528738021850586, + "learning_rate": 3.985872578222892e-05, + "loss": 0.1984, + "step": 20613 + }, + { + "epoch": 0.36767381300609997, + "grad_norm": 0.26065874099731445, + "learning_rate": 3.985747399881432e-05, + "loss": 0.1678, + "step": 20614 + }, + { + "epoch": 0.36769164912781366, + "grad_norm": 0.2710501253604889, + "learning_rate": 3.985622215780701e-05, + "loss": 0.2159, + "step": 20615 + }, + { + "epoch": 0.36770948524952735, + "grad_norm": 0.32494550943374634, + "learning_rate": 3.9854970259211863e-05, + "loss": 0.2324, + "step": 20616 + }, + { + "epoch": 0.36772732137124103, + "grad_norm": 0.20609810948371887, + "learning_rate": 3.985371830303371e-05, + "loss": 0.1306, + "step": 20617 + }, + { + "epoch": 0.3677451574929547, + "grad_norm": 0.23852016031742096, + "learning_rate": 3.985246628927742e-05, + "loss": 0.1819, + "step": 20618 + }, + { + "epoch": 0.3677629936146684, + "grad_norm": 0.25980716943740845, + "learning_rate": 3.985121421794783e-05, + "loss": 0.1576, + "step": 20619 + }, + { + "epoch": 0.3677808297363821, + "grad_norm": 0.35172727704048157, + "learning_rate": 3.9849962089049816e-05, + "loss": 0.1409, + "step": 20620 + }, + { + "epoch": 0.36779866585809584, + "grad_norm": 0.2904670834541321, + "learning_rate": 3.984870990258822e-05, + "loss": 0.1581, + "step": 20621 + }, + { + "epoch": 0.36781650197980953, + "grad_norm": 0.24317657947540283, + "learning_rate": 3.9847457658567896e-05, + "loss": 0.1544, + "step": 20622 + }, + { + "epoch": 0.3678343381015232, + "grad_norm": 0.3140999376773834, + "learning_rate": 3.9846205356993696e-05, + "loss": 0.1589, + "step": 20623 + }, + { + "epoch": 0.3678521742232369, + "grad_norm": 0.27958840131759644, + "learning_rate": 3.984495299787047e-05, + "loss": 0.1742, + "step": 20624 + }, + { + "epoch": 0.3678700103449506, + "grad_norm": 0.2750065326690674, + "learning_rate": 3.984370058120308e-05, + "loss": 0.1475, + "step": 20625 + }, + { + "epoch": 0.3678878464666643, + "grad_norm": 0.39659854769706726, + "learning_rate": 3.984244810699639e-05, + "loss": 0.1407, + "step": 20626 + }, + { + "epoch": 0.36790568258837797, + "grad_norm": 0.2191317230463028, + "learning_rate": 3.9841195575255244e-05, + "loss": 0.1262, + "step": 20627 + }, + { + "epoch": 0.36792351871009166, + "grad_norm": 0.2404310405254364, + "learning_rate": 3.9839942985984494e-05, + "loss": 0.2014, + "step": 20628 + }, + { + "epoch": 0.36794135483180535, + "grad_norm": 0.2965210974216461, + "learning_rate": 3.983869033918899e-05, + "loss": 0.1392, + "step": 20629 + }, + { + "epoch": 0.3679591909535191, + "grad_norm": 0.25626567006111145, + "learning_rate": 3.983743763487361e-05, + "loss": 0.1662, + "step": 20630 + }, + { + "epoch": 0.3679770270752328, + "grad_norm": 0.2228141874074936, + "learning_rate": 3.9836184873043194e-05, + "loss": 0.1389, + "step": 20631 + }, + { + "epoch": 0.36799486319694646, + "grad_norm": 0.36724740266799927, + "learning_rate": 3.983493205370259e-05, + "loss": 0.1317, + "step": 20632 + }, + { + "epoch": 0.36801269931866015, + "grad_norm": 0.36563339829444885, + "learning_rate": 3.983367917685668e-05, + "loss": 0.1758, + "step": 20633 + }, + { + "epoch": 0.36803053544037384, + "grad_norm": 0.2705327868461609, + "learning_rate": 3.98324262425103e-05, + "loss": 0.1926, + "step": 20634 + }, + { + "epoch": 0.36804837156208753, + "grad_norm": 0.3557082712650299, + "learning_rate": 3.983117325066832e-05, + "loss": 0.1927, + "step": 20635 + }, + { + "epoch": 0.3680662076838012, + "grad_norm": 0.30475980043411255, + "learning_rate": 3.982992020133558e-05, + "loss": 0.1535, + "step": 20636 + }, + { + "epoch": 0.3680840438055149, + "grad_norm": 0.29667025804519653, + "learning_rate": 3.982866709451695e-05, + "loss": 0.1458, + "step": 20637 + }, + { + "epoch": 0.36810187992722865, + "grad_norm": 0.2111435830593109, + "learning_rate": 3.982741393021728e-05, + "loss": 0.199, + "step": 20638 + }, + { + "epoch": 0.36811971604894234, + "grad_norm": 0.21722495555877686, + "learning_rate": 3.982616070844144e-05, + "loss": 0.11, + "step": 20639 + }, + { + "epoch": 0.368137552170656, + "grad_norm": 0.22200269997119904, + "learning_rate": 3.982490742919428e-05, + "loss": 0.1786, + "step": 20640 + }, + { + "epoch": 0.3681553882923697, + "grad_norm": 0.3229037821292877, + "learning_rate": 3.982365409248066e-05, + "loss": 0.1806, + "step": 20641 + }, + { + "epoch": 0.3681732244140834, + "grad_norm": 0.17968812584877014, + "learning_rate": 3.9822400698305434e-05, + "loss": 0.1348, + "step": 20642 + }, + { + "epoch": 0.3681910605357971, + "grad_norm": 0.27382978796958923, + "learning_rate": 3.982114724667346e-05, + "loss": 0.1353, + "step": 20643 + }, + { + "epoch": 0.3682088966575108, + "grad_norm": 0.2185668796300888, + "learning_rate": 3.981989373758961e-05, + "loss": 0.1485, + "step": 20644 + }, + { + "epoch": 0.36822673277922446, + "grad_norm": 0.2252895087003708, + "learning_rate": 3.981864017105872e-05, + "loss": 0.1391, + "step": 20645 + }, + { + "epoch": 0.3682445689009382, + "grad_norm": 0.2759244441986084, + "learning_rate": 3.981738654708567e-05, + "loss": 0.1306, + "step": 20646 + }, + { + "epoch": 0.3682624050226519, + "grad_norm": 0.3097649812698364, + "learning_rate": 3.9816132865675316e-05, + "loss": 0.1925, + "step": 20647 + }, + { + "epoch": 0.3682802411443656, + "grad_norm": 0.24489419162273407, + "learning_rate": 3.9814879126832504e-05, + "loss": 0.1596, + "step": 20648 + }, + { + "epoch": 0.36829807726607927, + "grad_norm": 0.4304359555244446, + "learning_rate": 3.981362533056211e-05, + "loss": 0.1945, + "step": 20649 + }, + { + "epoch": 0.36831591338779296, + "grad_norm": 0.2890470623970032, + "learning_rate": 3.9812371476868984e-05, + "loss": 0.1539, + "step": 20650 + }, + { + "epoch": 0.36833374950950665, + "grad_norm": 0.2140313982963562, + "learning_rate": 3.9811117565757994e-05, + "loss": 0.1501, + "step": 20651 + }, + { + "epoch": 0.36835158563122034, + "grad_norm": 0.24240589141845703, + "learning_rate": 3.9809863597234e-05, + "loss": 0.1745, + "step": 20652 + }, + { + "epoch": 0.368369421752934, + "grad_norm": 0.2301190048456192, + "learning_rate": 3.9808609571301844e-05, + "loss": 0.1891, + "step": 20653 + }, + { + "epoch": 0.3683872578746477, + "grad_norm": 0.2814874053001404, + "learning_rate": 3.9807355487966416e-05, + "loss": 0.1543, + "step": 20654 + }, + { + "epoch": 0.36840509399636145, + "grad_norm": 0.18487438559532166, + "learning_rate": 3.980610134723256e-05, + "loss": 0.1735, + "step": 20655 + }, + { + "epoch": 0.36842293011807514, + "grad_norm": 0.3395955264568329, + "learning_rate": 3.9804847149105145e-05, + "loss": 0.1939, + "step": 20656 + }, + { + "epoch": 0.36844076623978883, + "grad_norm": 0.3092682957649231, + "learning_rate": 3.9803592893589027e-05, + "loss": 0.2114, + "step": 20657 + }, + { + "epoch": 0.3684586023615025, + "grad_norm": 0.25286003947257996, + "learning_rate": 3.980233858068907e-05, + "loss": 0.1414, + "step": 20658 + }, + { + "epoch": 0.3684764384832162, + "grad_norm": 0.20957231521606445, + "learning_rate": 3.980108421041013e-05, + "loss": 0.1625, + "step": 20659 + }, + { + "epoch": 0.3684942746049299, + "grad_norm": 0.35376012325286865, + "learning_rate": 3.979982978275708e-05, + "loss": 0.2041, + "step": 20660 + }, + { + "epoch": 0.3685121107266436, + "grad_norm": 0.1887214183807373, + "learning_rate": 3.9798575297734785e-05, + "loss": 0.1323, + "step": 20661 + }, + { + "epoch": 0.36852994684835727, + "grad_norm": 0.2860632538795471, + "learning_rate": 3.9797320755348096e-05, + "loss": 0.1418, + "step": 20662 + }, + { + "epoch": 0.368547782970071, + "grad_norm": 0.20776663720607758, + "learning_rate": 3.9796066155601874e-05, + "loss": 0.1347, + "step": 20663 + }, + { + "epoch": 0.3685656190917847, + "grad_norm": 0.2547638714313507, + "learning_rate": 3.9794811498501e-05, + "loss": 0.1543, + "step": 20664 + }, + { + "epoch": 0.3685834552134984, + "grad_norm": 0.3296222984790802, + "learning_rate": 3.9793556784050326e-05, + "loss": 0.1845, + "step": 20665 + }, + { + "epoch": 0.3686012913352121, + "grad_norm": 0.3723026514053345, + "learning_rate": 3.9792302012254704e-05, + "loss": 0.185, + "step": 20666 + }, + { + "epoch": 0.36861912745692577, + "grad_norm": 0.26877814531326294, + "learning_rate": 3.9791047183119024e-05, + "loss": 0.1515, + "step": 20667 + }, + { + "epoch": 0.36863696357863945, + "grad_norm": 0.29769232869148254, + "learning_rate": 3.978979229664813e-05, + "loss": 0.22, + "step": 20668 + }, + { + "epoch": 0.36865479970035314, + "grad_norm": 0.21737362444400787, + "learning_rate": 3.978853735284689e-05, + "loss": 0.1518, + "step": 20669 + }, + { + "epoch": 0.36867263582206683, + "grad_norm": 0.24438200891017914, + "learning_rate": 3.978728235172018e-05, + "loss": 0.1187, + "step": 20670 + }, + { + "epoch": 0.3686904719437805, + "grad_norm": 0.3249605596065521, + "learning_rate": 3.978602729327284e-05, + "loss": 0.1731, + "step": 20671 + }, + { + "epoch": 0.36870830806549426, + "grad_norm": 0.31561630964279175, + "learning_rate": 3.978477217750977e-05, + "loss": 0.1521, + "step": 20672 + }, + { + "epoch": 0.36872614418720795, + "grad_norm": 0.20443210005760193, + "learning_rate": 3.9783517004435806e-05, + "loss": 0.1878, + "step": 20673 + }, + { + "epoch": 0.36874398030892164, + "grad_norm": 0.25708481669425964, + "learning_rate": 3.978226177405583e-05, + "loss": 0.1637, + "step": 20674 + }, + { + "epoch": 0.3687618164306353, + "grad_norm": 0.21317251026630402, + "learning_rate": 3.9781006486374694e-05, + "loss": 0.1262, + "step": 20675 + }, + { + "epoch": 0.368779652552349, + "grad_norm": 0.3076186180114746, + "learning_rate": 3.9779751141397284e-05, + "loss": 0.117, + "step": 20676 + }, + { + "epoch": 0.3687974886740627, + "grad_norm": 0.23719710111618042, + "learning_rate": 3.977849573912844e-05, + "loss": 0.1509, + "step": 20677 + }, + { + "epoch": 0.3688153247957764, + "grad_norm": 0.3630293607711792, + "learning_rate": 3.977724027957305e-05, + "loss": 0.1669, + "step": 20678 + }, + { + "epoch": 0.3688331609174901, + "grad_norm": 0.263110488653183, + "learning_rate": 3.9775984762735974e-05, + "loss": 0.2011, + "step": 20679 + }, + { + "epoch": 0.3688509970392038, + "grad_norm": 0.19805766642093658, + "learning_rate": 3.977472918862207e-05, + "loss": 0.1589, + "step": 20680 + }, + { + "epoch": 0.3688688331609175, + "grad_norm": 0.18016962707042694, + "learning_rate": 3.977347355723622e-05, + "loss": 0.1199, + "step": 20681 + }, + { + "epoch": 0.3688866692826312, + "grad_norm": 0.31155461072921753, + "learning_rate": 3.9772217868583287e-05, + "loss": 0.1555, + "step": 20682 + }, + { + "epoch": 0.3689045054043449, + "grad_norm": 0.24009375274181366, + "learning_rate": 3.977096212266812e-05, + "loss": 0.1479, + "step": 20683 + }, + { + "epoch": 0.3689223415260586, + "grad_norm": 0.23271121084690094, + "learning_rate": 3.976970631949561e-05, + "loss": 0.1589, + "step": 20684 + }, + { + "epoch": 0.36894017764777226, + "grad_norm": 0.3241308033466339, + "learning_rate": 3.976845045907063e-05, + "loss": 0.213, + "step": 20685 + }, + { + "epoch": 0.36895801376948595, + "grad_norm": 0.2038877308368683, + "learning_rate": 3.976719454139802e-05, + "loss": 0.149, + "step": 20686 + }, + { + "epoch": 0.36897584989119964, + "grad_norm": 0.33551889657974243, + "learning_rate": 3.976593856648266e-05, + "loss": 0.1687, + "step": 20687 + }, + { + "epoch": 0.3689936860129133, + "grad_norm": 0.2658495306968689, + "learning_rate": 3.976468253432944e-05, + "loss": 0.1767, + "step": 20688 + }, + { + "epoch": 0.36901152213462707, + "grad_norm": 0.20284298062324524, + "learning_rate": 3.97634264449432e-05, + "loss": 0.1393, + "step": 20689 + }, + { + "epoch": 0.36902935825634076, + "grad_norm": 0.2537386417388916, + "learning_rate": 3.9762170298328814e-05, + "loss": 0.1714, + "step": 20690 + }, + { + "epoch": 0.36904719437805444, + "grad_norm": 0.22543781995773315, + "learning_rate": 3.9760914094491166e-05, + "loss": 0.1279, + "step": 20691 + }, + { + "epoch": 0.36906503049976813, + "grad_norm": 0.2585589587688446, + "learning_rate": 3.9759657833435115e-05, + "loss": 0.166, + "step": 20692 + }, + { + "epoch": 0.3690828666214818, + "grad_norm": 0.2211478054523468, + "learning_rate": 3.975840151516553e-05, + "loss": 0.146, + "step": 20693 + }, + { + "epoch": 0.3691007027431955, + "grad_norm": 0.3498081862926483, + "learning_rate": 3.975714513968729e-05, + "loss": 0.1767, + "step": 20694 + }, + { + "epoch": 0.3691185388649092, + "grad_norm": 0.30740275979042053, + "learning_rate": 3.975588870700525e-05, + "loss": 0.1838, + "step": 20695 + }, + { + "epoch": 0.3691363749866229, + "grad_norm": 0.2446300983428955, + "learning_rate": 3.97546322171243e-05, + "loss": 0.1529, + "step": 20696 + }, + { + "epoch": 0.3691542111083366, + "grad_norm": 0.22753500938415527, + "learning_rate": 3.9753375670049285e-05, + "loss": 0.1874, + "step": 20697 + }, + { + "epoch": 0.3691720472300503, + "grad_norm": 0.19337725639343262, + "learning_rate": 3.97521190657851e-05, + "loss": 0.1147, + "step": 20698 + }, + { + "epoch": 0.369189883351764, + "grad_norm": 0.29787009954452515, + "learning_rate": 3.97508624043366e-05, + "loss": 0.1063, + "step": 20699 + }, + { + "epoch": 0.3692077194734777, + "grad_norm": 0.294334352016449, + "learning_rate": 3.974960568570867e-05, + "loss": 0.1619, + "step": 20700 + }, + { + "epoch": 0.3692255555951914, + "grad_norm": 0.21719473600387573, + "learning_rate": 3.974834890990616e-05, + "loss": 0.1594, + "step": 20701 + }, + { + "epoch": 0.36924339171690507, + "grad_norm": 0.27734988927841187, + "learning_rate": 3.974709207693397e-05, + "loss": 0.1595, + "step": 20702 + }, + { + "epoch": 0.36926122783861876, + "grad_norm": 0.31374648213386536, + "learning_rate": 3.974583518679695e-05, + "loss": 0.163, + "step": 20703 + }, + { + "epoch": 0.36927906396033244, + "grad_norm": 0.18755824863910675, + "learning_rate": 3.974457823949999e-05, + "loss": 0.139, + "step": 20704 + }, + { + "epoch": 0.3692969000820462, + "grad_norm": 0.28139257431030273, + "learning_rate": 3.974332123504794e-05, + "loss": 0.1229, + "step": 20705 + }, + { + "epoch": 0.3693147362037599, + "grad_norm": 0.28746309876441956, + "learning_rate": 3.974206417344569e-05, + "loss": 0.1515, + "step": 20706 + }, + { + "epoch": 0.36933257232547356, + "grad_norm": 0.21319624781608582, + "learning_rate": 3.974080705469812e-05, + "loss": 0.1646, + "step": 20707 + }, + { + "epoch": 0.36935040844718725, + "grad_norm": 0.2256128191947937, + "learning_rate": 3.973954987881007e-05, + "loss": 0.1308, + "step": 20708 + }, + { + "epoch": 0.36936824456890094, + "grad_norm": 0.38180941343307495, + "learning_rate": 3.973829264578645e-05, + "loss": 0.1748, + "step": 20709 + }, + { + "epoch": 0.3693860806906146, + "grad_norm": 0.23181700706481934, + "learning_rate": 3.97370353556321e-05, + "loss": 0.2155, + "step": 20710 + }, + { + "epoch": 0.3694039168123283, + "grad_norm": 0.331217497587204, + "learning_rate": 3.973577800835192e-05, + "loss": 0.2849, + "step": 20711 + }, + { + "epoch": 0.369421752934042, + "grad_norm": 0.28522709012031555, + "learning_rate": 3.973452060395077e-05, + "loss": 0.1678, + "step": 20712 + }, + { + "epoch": 0.3694395890557557, + "grad_norm": 0.21633446216583252, + "learning_rate": 3.9733263142433544e-05, + "loss": 0.158, + "step": 20713 + }, + { + "epoch": 0.36945742517746943, + "grad_norm": 0.21896664798259735, + "learning_rate": 3.973200562380509e-05, + "loss": 0.115, + "step": 20714 + }, + { + "epoch": 0.3694752612991831, + "grad_norm": 0.22210277616977692, + "learning_rate": 3.973074804807029e-05, + "loss": 0.2155, + "step": 20715 + }, + { + "epoch": 0.3694930974208968, + "grad_norm": 0.25365108251571655, + "learning_rate": 3.972949041523403e-05, + "loss": 0.108, + "step": 20716 + }, + { + "epoch": 0.3695109335426105, + "grad_norm": 0.24365845322608948, + "learning_rate": 3.972823272530118e-05, + "loss": 0.1724, + "step": 20717 + }, + { + "epoch": 0.3695287696643242, + "grad_norm": 0.24406962096691132, + "learning_rate": 3.9726974978276606e-05, + "loss": 0.1744, + "step": 20718 + }, + { + "epoch": 0.3695466057860379, + "grad_norm": 0.29617372155189514, + "learning_rate": 3.972571717416519e-05, + "loss": 0.1755, + "step": 20719 + }, + { + "epoch": 0.36956444190775156, + "grad_norm": 0.24914050102233887, + "learning_rate": 3.972445931297182e-05, + "loss": 0.1625, + "step": 20720 + }, + { + "epoch": 0.36958227802946525, + "grad_norm": 0.298544317483902, + "learning_rate": 3.972320139470135e-05, + "loss": 0.1517, + "step": 20721 + }, + { + "epoch": 0.369600114151179, + "grad_norm": 0.25870659947395325, + "learning_rate": 3.972194341935867e-05, + "loss": 0.1658, + "step": 20722 + }, + { + "epoch": 0.3696179502728927, + "grad_norm": 0.26940783858299255, + "learning_rate": 3.9720685386948645e-05, + "loss": 0.1725, + "step": 20723 + }, + { + "epoch": 0.36963578639460637, + "grad_norm": 0.1911354660987854, + "learning_rate": 3.971942729747617e-05, + "loss": 0.1491, + "step": 20724 + }, + { + "epoch": 0.36965362251632006, + "grad_norm": 0.21466852724552155, + "learning_rate": 3.971816915094609e-05, + "loss": 0.1404, + "step": 20725 + }, + { + "epoch": 0.36967145863803375, + "grad_norm": 0.3736649751663208, + "learning_rate": 3.971691094736333e-05, + "loss": 0.2264, + "step": 20726 + }, + { + "epoch": 0.36968929475974743, + "grad_norm": 0.29381558299064636, + "learning_rate": 3.9715652686732726e-05, + "loss": 0.1273, + "step": 20727 + }, + { + "epoch": 0.3697071308814611, + "grad_norm": 0.2900749444961548, + "learning_rate": 3.971439436905917e-05, + "loss": 0.151, + "step": 20728 + }, + { + "epoch": 0.3697249670031748, + "grad_norm": 0.2699805200099945, + "learning_rate": 3.971313599434754e-05, + "loss": 0.1438, + "step": 20729 + }, + { + "epoch": 0.3697428031248885, + "grad_norm": 0.4069530665874481, + "learning_rate": 3.971187756260272e-05, + "loss": 0.1943, + "step": 20730 + }, + { + "epoch": 0.36976063924660224, + "grad_norm": 0.3974965214729309, + "learning_rate": 3.971061907382957e-05, + "loss": 0.1448, + "step": 20731 + }, + { + "epoch": 0.36977847536831593, + "grad_norm": 0.21953637897968292, + "learning_rate": 3.970936052803298e-05, + "loss": 0.1718, + "step": 20732 + }, + { + "epoch": 0.3697963114900296, + "grad_norm": 0.2323625534772873, + "learning_rate": 3.970810192521784e-05, + "loss": 0.0813, + "step": 20733 + }, + { + "epoch": 0.3698141476117433, + "grad_norm": 0.37191709876060486, + "learning_rate": 3.9706843265389004e-05, + "loss": 0.1205, + "step": 20734 + }, + { + "epoch": 0.369831983733457, + "grad_norm": 0.2520902156829834, + "learning_rate": 3.9705584548551375e-05, + "loss": 0.0957, + "step": 20735 + }, + { + "epoch": 0.3698498198551707, + "grad_norm": 0.2635739743709564, + "learning_rate": 3.970432577470981e-05, + "loss": 0.1837, + "step": 20736 + }, + { + "epoch": 0.36986765597688437, + "grad_norm": 0.2611275017261505, + "learning_rate": 3.970306694386921e-05, + "loss": 0.1733, + "step": 20737 + }, + { + "epoch": 0.36988549209859806, + "grad_norm": 0.2665553092956543, + "learning_rate": 3.9701808056034436e-05, + "loss": 0.1567, + "step": 20738 + }, + { + "epoch": 0.3699033282203118, + "grad_norm": 0.2911405861377716, + "learning_rate": 3.970054911121038e-05, + "loss": 0.162, + "step": 20739 + }, + { + "epoch": 0.3699211643420255, + "grad_norm": 0.25105607509613037, + "learning_rate": 3.969929010940192e-05, + "loss": 0.1511, + "step": 20740 + }, + { + "epoch": 0.3699390004637392, + "grad_norm": 0.3187588155269623, + "learning_rate": 3.9698031050613935e-05, + "loss": 0.1879, + "step": 20741 + }, + { + "epoch": 0.36995683658545286, + "grad_norm": 0.3255598247051239, + "learning_rate": 3.96967719348513e-05, + "loss": 0.1986, + "step": 20742 + }, + { + "epoch": 0.36997467270716655, + "grad_norm": 0.18469995260238647, + "learning_rate": 3.96955127621189e-05, + "loss": 0.1256, + "step": 20743 + }, + { + "epoch": 0.36999250882888024, + "grad_norm": 0.21111544966697693, + "learning_rate": 3.969425353242162e-05, + "loss": 0.1369, + "step": 20744 + }, + { + "epoch": 0.37001034495059393, + "grad_norm": 0.21260325610637665, + "learning_rate": 3.969299424576435e-05, + "loss": 0.1478, + "step": 20745 + }, + { + "epoch": 0.3700281810723076, + "grad_norm": 0.29275450110435486, + "learning_rate": 3.969173490215195e-05, + "loss": 0.1898, + "step": 20746 + }, + { + "epoch": 0.37004601719402136, + "grad_norm": 0.33503565192222595, + "learning_rate": 3.9690475501589297e-05, + "loss": 0.1788, + "step": 20747 + }, + { + "epoch": 0.37006385331573505, + "grad_norm": 0.18387584388256073, + "learning_rate": 3.96892160440813e-05, + "loss": 0.1343, + "step": 20748 + }, + { + "epoch": 0.37008168943744874, + "grad_norm": 0.3641549050807953, + "learning_rate": 3.968795652963283e-05, + "loss": 0.1881, + "step": 20749 + }, + { + "epoch": 0.3700995255591624, + "grad_norm": 0.2309250831604004, + "learning_rate": 3.968669695824877e-05, + "loss": 0.1303, + "step": 20750 + }, + { + "epoch": 0.3701173616808761, + "grad_norm": 0.28715309500694275, + "learning_rate": 3.968543732993399e-05, + "loss": 0.1468, + "step": 20751 + }, + { + "epoch": 0.3701351978025898, + "grad_norm": 0.2431654930114746, + "learning_rate": 3.968417764469339e-05, + "loss": 0.1678, + "step": 20752 + }, + { + "epoch": 0.3701530339243035, + "grad_norm": 0.2502996027469635, + "learning_rate": 3.968291790253183e-05, + "loss": 0.1938, + "step": 20753 + }, + { + "epoch": 0.3701708700460172, + "grad_norm": 0.3158586323261261, + "learning_rate": 3.9681658103454234e-05, + "loss": 0.1553, + "step": 20754 + }, + { + "epoch": 0.37018870616773086, + "grad_norm": 0.28493940830230713, + "learning_rate": 3.968039824746545e-05, + "loss": 0.1345, + "step": 20755 + }, + { + "epoch": 0.3702065422894446, + "grad_norm": 0.33351123332977295, + "learning_rate": 3.9679138334570365e-05, + "loss": 0.1955, + "step": 20756 + }, + { + "epoch": 0.3702243784111583, + "grad_norm": 0.21014797687530518, + "learning_rate": 3.967787836477387e-05, + "loss": 0.1493, + "step": 20757 + }, + { + "epoch": 0.370242214532872, + "grad_norm": 0.20715588331222534, + "learning_rate": 3.967661833808086e-05, + "loss": 0.1505, + "step": 20758 + }, + { + "epoch": 0.37026005065458567, + "grad_norm": 0.22673384845256805, + "learning_rate": 3.967535825449621e-05, + "loss": 0.1657, + "step": 20759 + }, + { + "epoch": 0.37027788677629936, + "grad_norm": 0.24830789864063263, + "learning_rate": 3.9674098114024785e-05, + "loss": 0.1399, + "step": 20760 + }, + { + "epoch": 0.37029572289801305, + "grad_norm": 0.33110448718070984, + "learning_rate": 3.96728379166715e-05, + "loss": 0.1089, + "step": 20761 + }, + { + "epoch": 0.37031355901972673, + "grad_norm": 0.281533420085907, + "learning_rate": 3.967157766244123e-05, + "loss": 0.207, + "step": 20762 + }, + { + "epoch": 0.3703313951414404, + "grad_norm": 0.290824294090271, + "learning_rate": 3.967031735133885e-05, + "loss": 0.1489, + "step": 20763 + }, + { + "epoch": 0.37034923126315417, + "grad_norm": 0.22915130853652954, + "learning_rate": 3.966905698336925e-05, + "loss": 0.1382, + "step": 20764 + }, + { + "epoch": 0.37036706738486785, + "grad_norm": 0.26040738821029663, + "learning_rate": 3.966779655853733e-05, + "loss": 0.1796, + "step": 20765 + }, + { + "epoch": 0.37038490350658154, + "grad_norm": 0.30748674273490906, + "learning_rate": 3.9666536076847954e-05, + "loss": 0.1534, + "step": 20766 + }, + { + "epoch": 0.37040273962829523, + "grad_norm": 0.2495889663696289, + "learning_rate": 3.9665275538306026e-05, + "loss": 0.1643, + "step": 20767 + }, + { + "epoch": 0.3704205757500089, + "grad_norm": 0.2611350417137146, + "learning_rate": 3.9664014942916416e-05, + "loss": 0.1957, + "step": 20768 + }, + { + "epoch": 0.3704384118717226, + "grad_norm": 0.524882435798645, + "learning_rate": 3.966275429068403e-05, + "loss": 0.1667, + "step": 20769 + }, + { + "epoch": 0.3704562479934363, + "grad_norm": 0.2503334879875183, + "learning_rate": 3.966149358161374e-05, + "loss": 0.1336, + "step": 20770 + }, + { + "epoch": 0.37047408411515, + "grad_norm": 0.2533748149871826, + "learning_rate": 3.966023281571043e-05, + "loss": 0.1312, + "step": 20771 + }, + { + "epoch": 0.37049192023686367, + "grad_norm": 0.21128694713115692, + "learning_rate": 3.965897199297901e-05, + "loss": 0.0763, + "step": 20772 + }, + { + "epoch": 0.3705097563585774, + "grad_norm": 0.39581942558288574, + "learning_rate": 3.9657711113424334e-05, + "loss": 0.171, + "step": 20773 + }, + { + "epoch": 0.3705275924802911, + "grad_norm": 0.3122757077217102, + "learning_rate": 3.965645017705132e-05, + "loss": 0.1512, + "step": 20774 + }, + { + "epoch": 0.3705454286020048, + "grad_norm": 0.2682683765888214, + "learning_rate": 3.9655189183864835e-05, + "loss": 0.1763, + "step": 20775 + }, + { + "epoch": 0.3705632647237185, + "grad_norm": 0.2530685067176819, + "learning_rate": 3.9653928133869776e-05, + "loss": 0.1638, + "step": 20776 + }, + { + "epoch": 0.37058110084543217, + "grad_norm": 0.24746932089328766, + "learning_rate": 3.9652667027071034e-05, + "loss": 0.2045, + "step": 20777 + }, + { + "epoch": 0.37059893696714585, + "grad_norm": 0.2642756402492523, + "learning_rate": 3.965140586347349e-05, + "loss": 0.1711, + "step": 20778 + }, + { + "epoch": 0.37061677308885954, + "grad_norm": 0.2608848512172699, + "learning_rate": 3.965014464308204e-05, + "loss": 0.1502, + "step": 20779 + }, + { + "epoch": 0.37063460921057323, + "grad_norm": 0.30534178018569946, + "learning_rate": 3.964888336590157e-05, + "loss": 0.1828, + "step": 20780 + }, + { + "epoch": 0.370652445332287, + "grad_norm": 0.26097339391708374, + "learning_rate": 3.964762203193696e-05, + "loss": 0.1483, + "step": 20781 + }, + { + "epoch": 0.37067028145400066, + "grad_norm": 0.19460429251194, + "learning_rate": 3.964636064119312e-05, + "loss": 0.1464, + "step": 20782 + }, + { + "epoch": 0.37068811757571435, + "grad_norm": 0.2558959424495697, + "learning_rate": 3.964509919367492e-05, + "loss": 0.1403, + "step": 20783 + }, + { + "epoch": 0.37070595369742804, + "grad_norm": 0.2617916762828827, + "learning_rate": 3.964383768938725e-05, + "loss": 0.1565, + "step": 20784 + }, + { + "epoch": 0.3707237898191417, + "grad_norm": 0.2780449688434601, + "learning_rate": 3.9642576128335026e-05, + "loss": 0.1655, + "step": 20785 + }, + { + "epoch": 0.3707416259408554, + "grad_norm": 0.2817031741142273, + "learning_rate": 3.964131451052311e-05, + "loss": 0.1484, + "step": 20786 + }, + { + "epoch": 0.3707594620625691, + "grad_norm": 0.22971881926059723, + "learning_rate": 3.96400528359564e-05, + "loss": 0.1066, + "step": 20787 + }, + { + "epoch": 0.3707772981842828, + "grad_norm": 0.35492902994155884, + "learning_rate": 3.963879110463978e-05, + "loss": 0.1828, + "step": 20788 + }, + { + "epoch": 0.3707951343059965, + "grad_norm": 0.3692830502986908, + "learning_rate": 3.963752931657817e-05, + "loss": 0.1475, + "step": 20789 + }, + { + "epoch": 0.3708129704277102, + "grad_norm": 0.30633455514907837, + "learning_rate": 3.963626747177642e-05, + "loss": 0.1903, + "step": 20790 + }, + { + "epoch": 0.3708308065494239, + "grad_norm": 0.3038899898529053, + "learning_rate": 3.9635005570239456e-05, + "loss": 0.1499, + "step": 20791 + }, + { + "epoch": 0.3708486426711376, + "grad_norm": 0.26397743821144104, + "learning_rate": 3.9633743611972154e-05, + "loss": 0.1774, + "step": 20792 + }, + { + "epoch": 0.3708664787928513, + "grad_norm": 0.3259041905403137, + "learning_rate": 3.963248159697941e-05, + "loss": 0.1567, + "step": 20793 + }, + { + "epoch": 0.37088431491456497, + "grad_norm": 0.25907421112060547, + "learning_rate": 3.96312195252661e-05, + "loss": 0.2146, + "step": 20794 + }, + { + "epoch": 0.37090215103627866, + "grad_norm": 0.2867237627506256, + "learning_rate": 3.9629957396837146e-05, + "loss": 0.1442, + "step": 20795 + }, + { + "epoch": 0.37091998715799235, + "grad_norm": 0.25982075929641724, + "learning_rate": 3.9628695211697415e-05, + "loss": 0.1878, + "step": 20796 + }, + { + "epoch": 0.37093782327970604, + "grad_norm": 0.20911921560764313, + "learning_rate": 3.962743296985181e-05, + "loss": 0.1732, + "step": 20797 + }, + { + "epoch": 0.3709556594014198, + "grad_norm": 0.2497350573539734, + "learning_rate": 3.9626170671305223e-05, + "loss": 0.1633, + "step": 20798 + }, + { + "epoch": 0.37097349552313347, + "grad_norm": 0.24562138319015503, + "learning_rate": 3.962490831606255e-05, + "loss": 0.1661, + "step": 20799 + }, + { + "epoch": 0.37099133164484716, + "grad_norm": 0.21879325807094574, + "learning_rate": 3.962364590412868e-05, + "loss": 0.1458, + "step": 20800 + }, + { + "epoch": 0.37100916776656084, + "grad_norm": 0.20751112699508667, + "learning_rate": 3.9622383435508504e-05, + "loss": 0.1191, + "step": 20801 + }, + { + "epoch": 0.37102700388827453, + "grad_norm": 0.27567774057388306, + "learning_rate": 3.962112091020692e-05, + "loss": 0.1698, + "step": 20802 + }, + { + "epoch": 0.3710448400099882, + "grad_norm": 0.20791807770729065, + "learning_rate": 3.961985832822882e-05, + "loss": 0.1398, + "step": 20803 + }, + { + "epoch": 0.3710626761317019, + "grad_norm": 0.232282817363739, + "learning_rate": 3.961859568957911e-05, + "loss": 0.1501, + "step": 20804 + }, + { + "epoch": 0.3710805122534156, + "grad_norm": 0.257133424282074, + "learning_rate": 3.9617332994262654e-05, + "loss": 0.1858, + "step": 20805 + }, + { + "epoch": 0.37109834837512934, + "grad_norm": 0.23405201733112335, + "learning_rate": 3.961607024228439e-05, + "loss": 0.152, + "step": 20806 + }, + { + "epoch": 0.371116184496843, + "grad_norm": 0.31592363119125366, + "learning_rate": 3.961480743364917e-05, + "loss": 0.188, + "step": 20807 + }, + { + "epoch": 0.3711340206185567, + "grad_norm": 0.23744623363018036, + "learning_rate": 3.9613544568361916e-05, + "loss": 0.1608, + "step": 20808 + }, + { + "epoch": 0.3711518567402704, + "grad_norm": 0.2736688256263733, + "learning_rate": 3.961228164642752e-05, + "loss": 0.1619, + "step": 20809 + }, + { + "epoch": 0.3711696928619841, + "grad_norm": 0.3179933726787567, + "learning_rate": 3.9611018667850866e-05, + "loss": 0.2267, + "step": 20810 + }, + { + "epoch": 0.3711875289836978, + "grad_norm": 0.22657260298728943, + "learning_rate": 3.960975563263687e-05, + "loss": 0.1358, + "step": 20811 + }, + { + "epoch": 0.37120536510541147, + "grad_norm": 0.20992985367774963, + "learning_rate": 3.96084925407904e-05, + "loss": 0.1721, + "step": 20812 + }, + { + "epoch": 0.37122320122712515, + "grad_norm": 0.33215299248695374, + "learning_rate": 3.9607229392316376e-05, + "loss": 0.2161, + "step": 20813 + }, + { + "epoch": 0.37124103734883884, + "grad_norm": 0.31987807154655457, + "learning_rate": 3.960596618721968e-05, + "loss": 0.1111, + "step": 20814 + }, + { + "epoch": 0.3712588734705526, + "grad_norm": 0.24876877665519714, + "learning_rate": 3.960470292550522e-05, + "loss": 0.1653, + "step": 20815 + }, + { + "epoch": 0.3712767095922663, + "grad_norm": 0.23401497304439545, + "learning_rate": 3.960343960717788e-05, + "loss": 0.1292, + "step": 20816 + }, + { + "epoch": 0.37129454571397996, + "grad_norm": 0.4843595027923584, + "learning_rate": 3.960217623224257e-05, + "loss": 0.2191, + "step": 20817 + }, + { + "epoch": 0.37131238183569365, + "grad_norm": 0.24649792909622192, + "learning_rate": 3.9600912800704184e-05, + "loss": 0.1831, + "step": 20818 + }, + { + "epoch": 0.37133021795740734, + "grad_norm": 0.24797876179218292, + "learning_rate": 3.959964931256761e-05, + "loss": 0.1444, + "step": 20819 + }, + { + "epoch": 0.371348054079121, + "grad_norm": 0.20644067227840424, + "learning_rate": 3.959838576783776e-05, + "loss": 0.1449, + "step": 20820 + }, + { + "epoch": 0.3713658902008347, + "grad_norm": 0.2838148772716522, + "learning_rate": 3.9597122166519516e-05, + "loss": 0.159, + "step": 20821 + }, + { + "epoch": 0.3713837263225484, + "grad_norm": 0.30081966519355774, + "learning_rate": 3.9595858508617796e-05, + "loss": 0.1271, + "step": 20822 + }, + { + "epoch": 0.37140156244426215, + "grad_norm": 0.21972356736660004, + "learning_rate": 3.959459479413748e-05, + "loss": 0.1321, + "step": 20823 + }, + { + "epoch": 0.37141939856597583, + "grad_norm": 0.2762352526187897, + "learning_rate": 3.959333102308348e-05, + "loss": 0.1696, + "step": 20824 + }, + { + "epoch": 0.3714372346876895, + "grad_norm": 0.2654752731323242, + "learning_rate": 3.959206719546068e-05, + "loss": 0.1403, + "step": 20825 + }, + { + "epoch": 0.3714550708094032, + "grad_norm": 0.29895031452178955, + "learning_rate": 3.959080331127399e-05, + "loss": 0.1792, + "step": 20826 + }, + { + "epoch": 0.3714729069311169, + "grad_norm": 0.2670992314815521, + "learning_rate": 3.9589539370528306e-05, + "loss": 0.2022, + "step": 20827 + }, + { + "epoch": 0.3714907430528306, + "grad_norm": 0.2724984884262085, + "learning_rate": 3.9588275373228534e-05, + "loss": 0.1986, + "step": 20828 + }, + { + "epoch": 0.3715085791745443, + "grad_norm": 0.35580363869667053, + "learning_rate": 3.958701131937956e-05, + "loss": 0.1148, + "step": 20829 + }, + { + "epoch": 0.37152641529625796, + "grad_norm": 0.22566132247447968, + "learning_rate": 3.9585747208986296e-05, + "loss": 0.153, + "step": 20830 + }, + { + "epoch": 0.37154425141797165, + "grad_norm": 0.25295549631118774, + "learning_rate": 3.9584483042053634e-05, + "loss": 0.1817, + "step": 20831 + }, + { + "epoch": 0.3715620875396854, + "grad_norm": 0.27537551522254944, + "learning_rate": 3.9583218818586484e-05, + "loss": 0.1565, + "step": 20832 + }, + { + "epoch": 0.3715799236613991, + "grad_norm": 0.2592831254005432, + "learning_rate": 3.958195453858974e-05, + "loss": 0.1326, + "step": 20833 + }, + { + "epoch": 0.37159775978311277, + "grad_norm": 0.4563406705856323, + "learning_rate": 3.9580690202068307e-05, + "loss": 0.2275, + "step": 20834 + }, + { + "epoch": 0.37161559590482646, + "grad_norm": 0.2982614040374756, + "learning_rate": 3.957942580902708e-05, + "loss": 0.1852, + "step": 20835 + }, + { + "epoch": 0.37163343202654014, + "grad_norm": 0.18832939863204956, + "learning_rate": 3.957816135947096e-05, + "loss": 0.1435, + "step": 20836 + }, + { + "epoch": 0.37165126814825383, + "grad_norm": 0.27177080512046814, + "learning_rate": 3.957689685340486e-05, + "loss": 0.1669, + "step": 20837 + }, + { + "epoch": 0.3716691042699675, + "grad_norm": 0.27617475390434265, + "learning_rate": 3.957563229083366e-05, + "loss": 0.1031, + "step": 20838 + }, + { + "epoch": 0.3716869403916812, + "grad_norm": 0.33756181597709656, + "learning_rate": 3.957436767176228e-05, + "loss": 0.1378, + "step": 20839 + }, + { + "epoch": 0.37170477651339495, + "grad_norm": 0.24287867546081543, + "learning_rate": 3.957310299619562e-05, + "loss": 0.1658, + "step": 20840 + }, + { + "epoch": 0.37172261263510864, + "grad_norm": 0.3450038731098175, + "learning_rate": 3.9571838264138575e-05, + "loss": 0.1396, + "step": 20841 + }, + { + "epoch": 0.37174044875682233, + "grad_norm": 0.27262115478515625, + "learning_rate": 3.9570573475596054e-05, + "loss": 0.1925, + "step": 20842 + }, + { + "epoch": 0.371758284878536, + "grad_norm": 0.2437928318977356, + "learning_rate": 3.956930863057297e-05, + "loss": 0.1875, + "step": 20843 + }, + { + "epoch": 0.3717761210002497, + "grad_norm": 0.28718969225883484, + "learning_rate": 3.9568043729074186e-05, + "loss": 0.1645, + "step": 20844 + }, + { + "epoch": 0.3717939571219634, + "grad_norm": 0.2277885228395462, + "learning_rate": 3.956677877110466e-05, + "loss": 0.1749, + "step": 20845 + }, + { + "epoch": 0.3718117932436771, + "grad_norm": 0.25676479935646057, + "learning_rate": 3.956551375666925e-05, + "loss": 0.1767, + "step": 20846 + }, + { + "epoch": 0.37182962936539077, + "grad_norm": 0.27865269780158997, + "learning_rate": 3.9564248685772885e-05, + "loss": 0.1236, + "step": 20847 + }, + { + "epoch": 0.3718474654871045, + "grad_norm": 0.2554360032081604, + "learning_rate": 3.9562983558420464e-05, + "loss": 0.1695, + "step": 20848 + }, + { + "epoch": 0.3718653016088182, + "grad_norm": 0.18411928415298462, + "learning_rate": 3.956171837461689e-05, + "loss": 0.1228, + "step": 20849 + }, + { + "epoch": 0.3718831377305319, + "grad_norm": 0.2741954028606415, + "learning_rate": 3.956045313436706e-05, + "loss": 0.1619, + "step": 20850 + }, + { + "epoch": 0.3719009738522456, + "grad_norm": 0.319092720746994, + "learning_rate": 3.955918783767589e-05, + "loss": 0.1531, + "step": 20851 + }, + { + "epoch": 0.37191880997395926, + "grad_norm": 0.3173488676548004, + "learning_rate": 3.9557922484548284e-05, + "loss": 0.1107, + "step": 20852 + }, + { + "epoch": 0.37193664609567295, + "grad_norm": 0.23448435962200165, + "learning_rate": 3.955665707498913e-05, + "loss": 0.1432, + "step": 20853 + }, + { + "epoch": 0.37195448221738664, + "grad_norm": 0.2821740508079529, + "learning_rate": 3.955539160900335e-05, + "loss": 0.1561, + "step": 20854 + }, + { + "epoch": 0.3719723183391003, + "grad_norm": 0.2684013843536377, + "learning_rate": 3.955412608659584e-05, + "loss": 0.1376, + "step": 20855 + }, + { + "epoch": 0.371990154460814, + "grad_norm": 0.361016184091568, + "learning_rate": 3.955286050777152e-05, + "loss": 0.1574, + "step": 20856 + }, + { + "epoch": 0.37200799058252776, + "grad_norm": 0.425301194190979, + "learning_rate": 3.955159487253528e-05, + "loss": 0.1455, + "step": 20857 + }, + { + "epoch": 0.37202582670424145, + "grad_norm": 0.24172918498516083, + "learning_rate": 3.9550329180892044e-05, + "loss": 0.1629, + "step": 20858 + }, + { + "epoch": 0.37204366282595513, + "grad_norm": 0.27436745166778564, + "learning_rate": 3.9549063432846687e-05, + "loss": 0.1513, + "step": 20859 + }, + { + "epoch": 0.3720614989476688, + "grad_norm": 0.22665497660636902, + "learning_rate": 3.954779762840415e-05, + "loss": 0.1297, + "step": 20860 + }, + { + "epoch": 0.3720793350693825, + "grad_norm": 0.2823479473590851, + "learning_rate": 3.954653176756932e-05, + "loss": 0.1084, + "step": 20861 + }, + { + "epoch": 0.3720971711910962, + "grad_norm": 0.2251901477575302, + "learning_rate": 3.9545265850347116e-05, + "loss": 0.1879, + "step": 20862 + }, + { + "epoch": 0.3721150073128099, + "grad_norm": 0.234126478433609, + "learning_rate": 3.954399987674242e-05, + "loss": 0.1461, + "step": 20863 + }, + { + "epoch": 0.3721328434345236, + "grad_norm": 0.23911389708518982, + "learning_rate": 3.954273384676017e-05, + "loss": 0.1852, + "step": 20864 + }, + { + "epoch": 0.3721506795562373, + "grad_norm": 0.2928299605846405, + "learning_rate": 3.954146776040526e-05, + "loss": 0.1937, + "step": 20865 + }, + { + "epoch": 0.372168515677951, + "grad_norm": 0.2781590521335602, + "learning_rate": 3.9540201617682596e-05, + "loss": 0.157, + "step": 20866 + }, + { + "epoch": 0.3721863517996647, + "grad_norm": 0.21413928270339966, + "learning_rate": 3.953893541859709e-05, + "loss": 0.1479, + "step": 20867 + }, + { + "epoch": 0.3722041879213784, + "grad_norm": 0.19536499679088593, + "learning_rate": 3.9537669163153644e-05, + "loss": 0.1567, + "step": 20868 + }, + { + "epoch": 0.37222202404309207, + "grad_norm": 0.20709502696990967, + "learning_rate": 3.953640285135718e-05, + "loss": 0.1458, + "step": 20869 + }, + { + "epoch": 0.37223986016480576, + "grad_norm": 0.2385265976190567, + "learning_rate": 3.953513648321259e-05, + "loss": 0.0996, + "step": 20870 + }, + { + "epoch": 0.37225769628651945, + "grad_norm": 0.258060097694397, + "learning_rate": 3.9533870058724797e-05, + "loss": 0.1818, + "step": 20871 + }, + { + "epoch": 0.37227553240823313, + "grad_norm": 0.3334944546222687, + "learning_rate": 3.9532603577898694e-05, + "loss": 0.1721, + "step": 20872 + }, + { + "epoch": 0.3722933685299468, + "grad_norm": 0.2653723955154419, + "learning_rate": 3.9531337040739215e-05, + "loss": 0.146, + "step": 20873 + }, + { + "epoch": 0.37231120465166057, + "grad_norm": 0.25973907113075256, + "learning_rate": 3.9530070447251246e-05, + "loss": 0.15, + "step": 20874 + }, + { + "epoch": 0.37232904077337425, + "grad_norm": 0.25220730900764465, + "learning_rate": 3.9528803797439705e-05, + "loss": 0.1698, + "step": 20875 + }, + { + "epoch": 0.37234687689508794, + "grad_norm": 0.25818932056427, + "learning_rate": 3.95275370913095e-05, + "loss": 0.1699, + "step": 20876 + }, + { + "epoch": 0.37236471301680163, + "grad_norm": 0.310071736574173, + "learning_rate": 3.952627032886555e-05, + "loss": 0.2, + "step": 20877 + }, + { + "epoch": 0.3723825491385153, + "grad_norm": 0.31427833437919617, + "learning_rate": 3.952500351011276e-05, + "loss": 0.1645, + "step": 20878 + }, + { + "epoch": 0.372400385260229, + "grad_norm": 0.2230541855096817, + "learning_rate": 3.952373663505603e-05, + "loss": 0.1492, + "step": 20879 + }, + { + "epoch": 0.3724182213819427, + "grad_norm": 0.3833317756652832, + "learning_rate": 3.952246970370029e-05, + "loss": 0.1131, + "step": 20880 + }, + { + "epoch": 0.3724360575036564, + "grad_norm": 0.20893841981887817, + "learning_rate": 3.952120271605043e-05, + "loss": 0.1477, + "step": 20881 + }, + { + "epoch": 0.3724538936253701, + "grad_norm": 0.37659087777137756, + "learning_rate": 3.9519935672111384e-05, + "loss": 0.1925, + "step": 20882 + }, + { + "epoch": 0.3724717297470838, + "grad_norm": 0.21617098152637482, + "learning_rate": 3.9518668571888054e-05, + "loss": 0.1567, + "step": 20883 + }, + { + "epoch": 0.3724895658687975, + "grad_norm": 0.4688357710838318, + "learning_rate": 3.9517401415385335e-05, + "loss": 0.2293, + "step": 20884 + }, + { + "epoch": 0.3725074019905112, + "grad_norm": 0.3353196084499359, + "learning_rate": 3.9516134202608166e-05, + "loss": 0.1377, + "step": 20885 + }, + { + "epoch": 0.3725252381122249, + "grad_norm": 0.1893564760684967, + "learning_rate": 3.9514866933561446e-05, + "loss": 0.1269, + "step": 20886 + }, + { + "epoch": 0.37254307423393856, + "grad_norm": 0.25817370414733887, + "learning_rate": 3.951359960825008e-05, + "loss": 0.1893, + "step": 20887 + }, + { + "epoch": 0.37256091035565225, + "grad_norm": 0.2507270574569702, + "learning_rate": 3.9512332226679005e-05, + "loss": 0.1531, + "step": 20888 + }, + { + "epoch": 0.37257874647736594, + "grad_norm": 0.2700137794017792, + "learning_rate": 3.9511064788853104e-05, + "loss": 0.1798, + "step": 20889 + }, + { + "epoch": 0.37259658259907963, + "grad_norm": 0.2808361351490021, + "learning_rate": 3.950979729477731e-05, + "loss": 0.1587, + "step": 20890 + }, + { + "epoch": 0.37261441872079337, + "grad_norm": 0.27956733107566833, + "learning_rate": 3.9508529744456535e-05, + "loss": 0.1664, + "step": 20891 + }, + { + "epoch": 0.37263225484250706, + "grad_norm": 0.2957374155521393, + "learning_rate": 3.950726213789568e-05, + "loss": 0.1688, + "step": 20892 + }, + { + "epoch": 0.37265009096422075, + "grad_norm": 0.2731435000896454, + "learning_rate": 3.950599447509967e-05, + "loss": 0.146, + "step": 20893 + }, + { + "epoch": 0.37266792708593444, + "grad_norm": 0.38462555408477783, + "learning_rate": 3.9504726756073405e-05, + "loss": 0.1402, + "step": 20894 + }, + { + "epoch": 0.3726857632076481, + "grad_norm": 0.3309250473976135, + "learning_rate": 3.9503458980821826e-05, + "loss": 0.1335, + "step": 20895 + }, + { + "epoch": 0.3727035993293618, + "grad_norm": 0.17635788023471832, + "learning_rate": 3.950219114934982e-05, + "loss": 0.1415, + "step": 20896 + }, + { + "epoch": 0.3727214354510755, + "grad_norm": 0.27848848700523376, + "learning_rate": 3.950092326166232e-05, + "loss": 0.1813, + "step": 20897 + }, + { + "epoch": 0.3727392715727892, + "grad_norm": 0.2748837172985077, + "learning_rate": 3.949965531776422e-05, + "loss": 0.1233, + "step": 20898 + }, + { + "epoch": 0.37275710769450293, + "grad_norm": 0.24764619767665863, + "learning_rate": 3.9498387317660454e-05, + "loss": 0.179, + "step": 20899 + }, + { + "epoch": 0.3727749438162166, + "grad_norm": 0.2562500536441803, + "learning_rate": 3.949711926135593e-05, + "loss": 0.1662, + "step": 20900 + }, + { + "epoch": 0.3727927799379303, + "grad_norm": 0.2847629487514496, + "learning_rate": 3.949585114885558e-05, + "loss": 0.1769, + "step": 20901 + }, + { + "epoch": 0.372810616059644, + "grad_norm": 0.23247461020946503, + "learning_rate": 3.9494582980164284e-05, + "loss": 0.1536, + "step": 20902 + }, + { + "epoch": 0.3728284521813577, + "grad_norm": 0.39158889651298523, + "learning_rate": 3.949331475528699e-05, + "loss": 0.2121, + "step": 20903 + }, + { + "epoch": 0.37284628830307137, + "grad_norm": 0.2710724174976349, + "learning_rate": 3.9492046474228594e-05, + "loss": 0.1508, + "step": 20904 + }, + { + "epoch": 0.37286412442478506, + "grad_norm": 0.29780441522598267, + "learning_rate": 3.9490778136994025e-05, + "loss": 0.1785, + "step": 20905 + }, + { + "epoch": 0.37288196054649875, + "grad_norm": 0.2175154834985733, + "learning_rate": 3.9489509743588195e-05, + "loss": 0.1534, + "step": 20906 + }, + { + "epoch": 0.3728997966682125, + "grad_norm": 0.24034057557582855, + "learning_rate": 3.9488241294016017e-05, + "loss": 0.1109, + "step": 20907 + }, + { + "epoch": 0.3729176327899262, + "grad_norm": 0.28218695521354675, + "learning_rate": 3.948697278828242e-05, + "loss": 0.1463, + "step": 20908 + }, + { + "epoch": 0.37293546891163987, + "grad_norm": 0.32997918128967285, + "learning_rate": 3.948570422639231e-05, + "loss": 0.1766, + "step": 20909 + }, + { + "epoch": 0.37295330503335355, + "grad_norm": 0.20871379971504211, + "learning_rate": 3.94844356083506e-05, + "loss": 0.1579, + "step": 20910 + }, + { + "epoch": 0.37297114115506724, + "grad_norm": 0.20707568526268005, + "learning_rate": 3.948316693416222e-05, + "loss": 0.1615, + "step": 20911 + }, + { + "epoch": 0.37298897727678093, + "grad_norm": 0.23493592441082, + "learning_rate": 3.948189820383208e-05, + "loss": 0.1697, + "step": 20912 + }, + { + "epoch": 0.3730068133984946, + "grad_norm": 0.28584012389183044, + "learning_rate": 3.94806294173651e-05, + "loss": 0.189, + "step": 20913 + }, + { + "epoch": 0.3730246495202083, + "grad_norm": 0.21202589571475983, + "learning_rate": 3.9479360574766204e-05, + "loss": 0.1257, + "step": 20914 + }, + { + "epoch": 0.373042485641922, + "grad_norm": 0.23167893290519714, + "learning_rate": 3.9478091676040305e-05, + "loss": 0.1819, + "step": 20915 + }, + { + "epoch": 0.37306032176363574, + "grad_norm": 0.41091087460517883, + "learning_rate": 3.947682272119232e-05, + "loss": 0.199, + "step": 20916 + }, + { + "epoch": 0.3730781578853494, + "grad_norm": 0.2537975609302521, + "learning_rate": 3.9475553710227175e-05, + "loss": 0.1587, + "step": 20917 + }, + { + "epoch": 0.3730959940070631, + "grad_norm": 0.3241865038871765, + "learning_rate": 3.947428464314977e-05, + "loss": 0.1349, + "step": 20918 + }, + { + "epoch": 0.3731138301287768, + "grad_norm": 0.24923311173915863, + "learning_rate": 3.947301551996505e-05, + "loss": 0.1769, + "step": 20919 + }, + { + "epoch": 0.3731316662504905, + "grad_norm": 0.31199532747268677, + "learning_rate": 3.9471746340677915e-05, + "loss": 0.196, + "step": 20920 + }, + { + "epoch": 0.3731495023722042, + "grad_norm": 0.19318658113479614, + "learning_rate": 3.947047710529331e-05, + "loss": 0.1514, + "step": 20921 + }, + { + "epoch": 0.37316733849391787, + "grad_norm": 0.2515678107738495, + "learning_rate": 3.946920781381612e-05, + "loss": 0.1819, + "step": 20922 + }, + { + "epoch": 0.37318517461563155, + "grad_norm": 0.24265235662460327, + "learning_rate": 3.946793846625129e-05, + "loss": 0.1908, + "step": 20923 + }, + { + "epoch": 0.3732030107373453, + "grad_norm": 0.26168200373649597, + "learning_rate": 3.946666906260373e-05, + "loss": 0.1075, + "step": 20924 + }, + { + "epoch": 0.373220846859059, + "grad_norm": 0.2690441310405731, + "learning_rate": 3.946539960287837e-05, + "loss": 0.1712, + "step": 20925 + }, + { + "epoch": 0.3732386829807727, + "grad_norm": 0.27775177359580994, + "learning_rate": 3.946413008708012e-05, + "loss": 0.1554, + "step": 20926 + }, + { + "epoch": 0.37325651910248636, + "grad_norm": 0.26072096824645996, + "learning_rate": 3.946286051521391e-05, + "loss": 0.19, + "step": 20927 + }, + { + "epoch": 0.37327435522420005, + "grad_norm": 0.19600936770439148, + "learning_rate": 3.946159088728465e-05, + "loss": 0.1571, + "step": 20928 + }, + { + "epoch": 0.37329219134591374, + "grad_norm": 0.20395301282405853, + "learning_rate": 3.946032120329728e-05, + "loss": 0.1812, + "step": 20929 + }, + { + "epoch": 0.3733100274676274, + "grad_norm": 0.22938822209835052, + "learning_rate": 3.945905146325671e-05, + "loss": 0.1466, + "step": 20930 + }, + { + "epoch": 0.3733278635893411, + "grad_norm": 0.25904223322868347, + "learning_rate": 3.9457781667167854e-05, + "loss": 0.1873, + "step": 20931 + }, + { + "epoch": 0.3733456997110548, + "grad_norm": 0.2521626055240631, + "learning_rate": 3.945651181503565e-05, + "loss": 0.1341, + "step": 20932 + }, + { + "epoch": 0.37336353583276854, + "grad_norm": 0.2962108552455902, + "learning_rate": 3.9455241906865003e-05, + "loss": 0.1872, + "step": 20933 + }, + { + "epoch": 0.37338137195448223, + "grad_norm": 0.2521093487739563, + "learning_rate": 3.945397194266086e-05, + "loss": 0.1723, + "step": 20934 + }, + { + "epoch": 0.3733992080761959, + "grad_norm": 0.27413707971572876, + "learning_rate": 3.9452701922428114e-05, + "loss": 0.1729, + "step": 20935 + }, + { + "epoch": 0.3734170441979096, + "grad_norm": 0.34369009733200073, + "learning_rate": 3.945143184617171e-05, + "loss": 0.1894, + "step": 20936 + }, + { + "epoch": 0.3734348803196233, + "grad_norm": 0.2235700935125351, + "learning_rate": 3.945016171389656e-05, + "loss": 0.1803, + "step": 20937 + }, + { + "epoch": 0.373452716441337, + "grad_norm": 0.21069109439849854, + "learning_rate": 3.944889152560761e-05, + "loss": 0.1379, + "step": 20938 + }, + { + "epoch": 0.3734705525630507, + "grad_norm": 0.2789163589477539, + "learning_rate": 3.944762128130975e-05, + "loss": 0.1669, + "step": 20939 + }, + { + "epoch": 0.37348838868476436, + "grad_norm": 0.27874982357025146, + "learning_rate": 3.9446350981007924e-05, + "loss": 0.1284, + "step": 20940 + }, + { + "epoch": 0.3735062248064781, + "grad_norm": 0.24538666009902954, + "learning_rate": 3.944508062470705e-05, + "loss": 0.1501, + "step": 20941 + }, + { + "epoch": 0.3735240609281918, + "grad_norm": 0.18700912594795227, + "learning_rate": 3.9443810212412055e-05, + "loss": 0.1489, + "step": 20942 + }, + { + "epoch": 0.3735418970499055, + "grad_norm": 0.30852341651916504, + "learning_rate": 3.9442539744127864e-05, + "loss": 0.1101, + "step": 20943 + }, + { + "epoch": 0.37355973317161917, + "grad_norm": 0.3122432231903076, + "learning_rate": 3.94412692198594e-05, + "loss": 0.1717, + "step": 20944 + }, + { + "epoch": 0.37357756929333286, + "grad_norm": 0.2327127903699875, + "learning_rate": 3.94399986396116e-05, + "loss": 0.1706, + "step": 20945 + }, + { + "epoch": 0.37359540541504654, + "grad_norm": 0.23632730543613434, + "learning_rate": 3.943872800338936e-05, + "loss": 0.1533, + "step": 20946 + }, + { + "epoch": 0.37361324153676023, + "grad_norm": 0.23022763431072235, + "learning_rate": 3.943745731119763e-05, + "loss": 0.1395, + "step": 20947 + }, + { + "epoch": 0.3736310776584739, + "grad_norm": 0.2568363547325134, + "learning_rate": 3.943618656304133e-05, + "loss": 0.1735, + "step": 20948 + }, + { + "epoch": 0.37364891378018766, + "grad_norm": 0.24698111414909363, + "learning_rate": 3.9434915758925385e-05, + "loss": 0.1097, + "step": 20949 + }, + { + "epoch": 0.37366674990190135, + "grad_norm": 0.3155350685119629, + "learning_rate": 3.9433644898854716e-05, + "loss": 0.1764, + "step": 20950 + }, + { + "epoch": 0.37368458602361504, + "grad_norm": 0.20479997992515564, + "learning_rate": 3.943237398283426e-05, + "loss": 0.1036, + "step": 20951 + }, + { + "epoch": 0.3737024221453287, + "grad_norm": 0.30280378460884094, + "learning_rate": 3.943110301086893e-05, + "loss": 0.2106, + "step": 20952 + }, + { + "epoch": 0.3737202582670424, + "grad_norm": 0.26288577914237976, + "learning_rate": 3.9429831982963674e-05, + "loss": 0.1708, + "step": 20953 + }, + { + "epoch": 0.3737380943887561, + "grad_norm": 0.32733508944511414, + "learning_rate": 3.942856089912339e-05, + "loss": 0.1926, + "step": 20954 + }, + { + "epoch": 0.3737559305104698, + "grad_norm": 0.2616784870624542, + "learning_rate": 3.9427289759353034e-05, + "loss": 0.18, + "step": 20955 + }, + { + "epoch": 0.3737737666321835, + "grad_norm": 0.18744464218616486, + "learning_rate": 3.942601856365752e-05, + "loss": 0.131, + "step": 20956 + }, + { + "epoch": 0.37379160275389717, + "grad_norm": 0.2464095652103424, + "learning_rate": 3.9424747312041765e-05, + "loss": 0.1691, + "step": 20957 + }, + { + "epoch": 0.3738094388756109, + "grad_norm": 0.1791401207447052, + "learning_rate": 3.942347600451071e-05, + "loss": 0.1239, + "step": 20958 + }, + { + "epoch": 0.3738272749973246, + "grad_norm": 0.2502337694168091, + "learning_rate": 3.9422204641069284e-05, + "loss": 0.1442, + "step": 20959 + }, + { + "epoch": 0.3738451111190383, + "grad_norm": 0.31434884667396545, + "learning_rate": 3.942093322172241e-05, + "loss": 0.1529, + "step": 20960 + }, + { + "epoch": 0.373862947240752, + "grad_norm": 0.3276646137237549, + "learning_rate": 3.941966174647501e-05, + "loss": 0.2384, + "step": 20961 + }, + { + "epoch": 0.37388078336246566, + "grad_norm": 0.19896353781223297, + "learning_rate": 3.941839021533203e-05, + "loss": 0.1157, + "step": 20962 + }, + { + "epoch": 0.37389861948417935, + "grad_norm": 0.300504595041275, + "learning_rate": 3.9417118628298386e-05, + "loss": 0.1526, + "step": 20963 + }, + { + "epoch": 0.37391645560589304, + "grad_norm": 0.3498363196849823, + "learning_rate": 3.941584698537901e-05, + "loss": 0.1688, + "step": 20964 + }, + { + "epoch": 0.3739342917276067, + "grad_norm": 0.6301300525665283, + "learning_rate": 3.941457528657884e-05, + "loss": 0.1944, + "step": 20965 + }, + { + "epoch": 0.37395212784932047, + "grad_norm": 0.3406152129173279, + "learning_rate": 3.941330353190279e-05, + "loss": 0.1562, + "step": 20966 + }, + { + "epoch": 0.37396996397103416, + "grad_norm": 0.31587889790534973, + "learning_rate": 3.94120317213558e-05, + "loss": 0.1423, + "step": 20967 + }, + { + "epoch": 0.37398780009274785, + "grad_norm": 0.3007887005805969, + "learning_rate": 3.94107598549428e-05, + "loss": 0.1608, + "step": 20968 + }, + { + "epoch": 0.37400563621446153, + "grad_norm": 0.21104386448860168, + "learning_rate": 3.940948793266871e-05, + "loss": 0.1422, + "step": 20969 + }, + { + "epoch": 0.3740234723361752, + "grad_norm": 0.2530542016029358, + "learning_rate": 3.940821595453847e-05, + "loss": 0.1245, + "step": 20970 + }, + { + "epoch": 0.3740413084578889, + "grad_norm": 0.22693689167499542, + "learning_rate": 3.9406943920557014e-05, + "loss": 0.1272, + "step": 20971 + }, + { + "epoch": 0.3740591445796026, + "grad_norm": 0.23822426795959473, + "learning_rate": 3.9405671830729266e-05, + "loss": 0.129, + "step": 20972 + }, + { + "epoch": 0.3740769807013163, + "grad_norm": 0.2835020124912262, + "learning_rate": 3.940439968506016e-05, + "loss": 0.1759, + "step": 20973 + }, + { + "epoch": 0.37409481682303, + "grad_norm": 0.35984939336776733, + "learning_rate": 3.940312748355461e-05, + "loss": 0.2107, + "step": 20974 + }, + { + "epoch": 0.3741126529447437, + "grad_norm": 0.27591952681541443, + "learning_rate": 3.9401855226217574e-05, + "loss": 0.1729, + "step": 20975 + }, + { + "epoch": 0.3741304890664574, + "grad_norm": 0.2780141234397888, + "learning_rate": 3.940058291305398e-05, + "loss": 0.1758, + "step": 20976 + }, + { + "epoch": 0.3741483251881711, + "grad_norm": 0.33819320797920227, + "learning_rate": 3.9399310544068745e-05, + "loss": 0.1705, + "step": 20977 + }, + { + "epoch": 0.3741661613098848, + "grad_norm": 0.23138880729675293, + "learning_rate": 3.939803811926681e-05, + "loss": 0.2254, + "step": 20978 + }, + { + "epoch": 0.37418399743159847, + "grad_norm": 0.19835540652275085, + "learning_rate": 3.939676563865311e-05, + "loss": 0.1103, + "step": 20979 + }, + { + "epoch": 0.37420183355331216, + "grad_norm": 0.4091818332672119, + "learning_rate": 3.9395493102232574e-05, + "loss": 0.1671, + "step": 20980 + }, + { + "epoch": 0.37421966967502585, + "grad_norm": 0.25975480675697327, + "learning_rate": 3.939422051001013e-05, + "loss": 0.1202, + "step": 20981 + }, + { + "epoch": 0.37423750579673953, + "grad_norm": 0.33079445362091064, + "learning_rate": 3.939294786199072e-05, + "loss": 0.1929, + "step": 20982 + }, + { + "epoch": 0.3742553419184533, + "grad_norm": 0.21822389960289001, + "learning_rate": 3.9391675158179265e-05, + "loss": 0.1468, + "step": 20983 + }, + { + "epoch": 0.37427317804016697, + "grad_norm": 0.2974565625190735, + "learning_rate": 3.939040239858072e-05, + "loss": 0.1736, + "step": 20984 + }, + { + "epoch": 0.37429101416188065, + "grad_norm": 0.2653729319572449, + "learning_rate": 3.938912958319999e-05, + "loss": 0.1455, + "step": 20985 + }, + { + "epoch": 0.37430885028359434, + "grad_norm": 0.20173591375350952, + "learning_rate": 3.9387856712042034e-05, + "loss": 0.1437, + "step": 20986 + }, + { + "epoch": 0.37432668640530803, + "grad_norm": 0.3011341989040375, + "learning_rate": 3.938658378511177e-05, + "loss": 0.1838, + "step": 20987 + }, + { + "epoch": 0.3743445225270217, + "grad_norm": 0.3004617989063263, + "learning_rate": 3.938531080241414e-05, + "loss": 0.1894, + "step": 20988 + }, + { + "epoch": 0.3743623586487354, + "grad_norm": 0.31858810782432556, + "learning_rate": 3.9384037763954074e-05, + "loss": 0.1248, + "step": 20989 + }, + { + "epoch": 0.3743801947704491, + "grad_norm": 0.3206600248813629, + "learning_rate": 3.938276466973652e-05, + "loss": 0.1528, + "step": 20990 + }, + { + "epoch": 0.3743980308921628, + "grad_norm": 0.24963733553886414, + "learning_rate": 3.938149151976639e-05, + "loss": 0.1703, + "step": 20991 + }, + { + "epoch": 0.3744158670138765, + "grad_norm": 0.34398558735847473, + "learning_rate": 3.938021831404864e-05, + "loss": 0.1437, + "step": 20992 + }, + { + "epoch": 0.3744337031355902, + "grad_norm": 0.29185688495635986, + "learning_rate": 3.937894505258819e-05, + "loss": 0.1519, + "step": 20993 + }, + { + "epoch": 0.3744515392573039, + "grad_norm": 0.36593085527420044, + "learning_rate": 3.9377671735389995e-05, + "loss": 0.1943, + "step": 20994 + }, + { + "epoch": 0.3744693753790176, + "grad_norm": 0.23105423152446747, + "learning_rate": 3.937639836245896e-05, + "loss": 0.1575, + "step": 20995 + }, + { + "epoch": 0.3744872115007313, + "grad_norm": 0.2864378094673157, + "learning_rate": 3.9375124933800056e-05, + "loss": 0.1532, + "step": 20996 + }, + { + "epoch": 0.37450504762244496, + "grad_norm": 0.2871434688568115, + "learning_rate": 3.93738514494182e-05, + "loss": 0.1639, + "step": 20997 + }, + { + "epoch": 0.37452288374415865, + "grad_norm": 0.29691845178604126, + "learning_rate": 3.937257790931832e-05, + "loss": 0.1348, + "step": 20998 + }, + { + "epoch": 0.37454071986587234, + "grad_norm": 0.2640552818775177, + "learning_rate": 3.937130431350538e-05, + "loss": 0.1645, + "step": 20999 + }, + { + "epoch": 0.3745585559875861, + "grad_norm": 0.20489969849586487, + "learning_rate": 3.9370030661984295e-05, + "loss": 0.1862, + "step": 21000 + }, + { + "epoch": 0.3745585559875861, + "eval_loss": 0.15552359819412231, + "eval_runtime": 107.8589, + "eval_samples_per_second": 9.494, + "eval_steps_per_second": 1.585, + "step": 21000 + }, + { + "epoch": 0.37457639210929977, + "grad_norm": 0.24851778149604797, + "learning_rate": 3.936875695476e-05, + "loss": 0.205, + "step": 21001 + }, + { + "epoch": 0.37459422823101346, + "grad_norm": 0.2541540563106537, + "learning_rate": 3.9367483191837444e-05, + "loss": 0.1742, + "step": 21002 + }, + { + "epoch": 0.37461206435272715, + "grad_norm": 0.2621119022369385, + "learning_rate": 3.936620937322156e-05, + "loss": 0.1776, + "step": 21003 + }, + { + "epoch": 0.37462990047444084, + "grad_norm": 0.20699170231819153, + "learning_rate": 3.9364935498917296e-05, + "loss": 0.1541, + "step": 21004 + }, + { + "epoch": 0.3746477365961545, + "grad_norm": 0.26266545057296753, + "learning_rate": 3.936366156892958e-05, + "loss": 0.1916, + "step": 21005 + }, + { + "epoch": 0.3746655727178682, + "grad_norm": 0.28171104192733765, + "learning_rate": 3.9362387583263336e-05, + "loss": 0.2057, + "step": 21006 + }, + { + "epoch": 0.3746834088395819, + "grad_norm": 0.24466729164123535, + "learning_rate": 3.936111354192352e-05, + "loss": 0.1487, + "step": 21007 + }, + { + "epoch": 0.37470124496129564, + "grad_norm": 0.2627304196357727, + "learning_rate": 3.935983944491508e-05, + "loss": 0.1871, + "step": 21008 + }, + { + "epoch": 0.37471908108300933, + "grad_norm": 0.22136624157428741, + "learning_rate": 3.935856529224293e-05, + "loss": 0.1501, + "step": 21009 + }, + { + "epoch": 0.374736917204723, + "grad_norm": 0.2881280183792114, + "learning_rate": 3.9357291083912036e-05, + "loss": 0.1421, + "step": 21010 + }, + { + "epoch": 0.3747547533264367, + "grad_norm": 0.25228843092918396, + "learning_rate": 3.935601681992731e-05, + "loss": 0.1649, + "step": 21011 + }, + { + "epoch": 0.3747725894481504, + "grad_norm": 0.2941727340221405, + "learning_rate": 3.9354742500293715e-05, + "loss": 0.1832, + "step": 21012 + }, + { + "epoch": 0.3747904255698641, + "grad_norm": 0.2715993821620941, + "learning_rate": 3.935346812501617e-05, + "loss": 0.133, + "step": 21013 + }, + { + "epoch": 0.37480826169157777, + "grad_norm": 0.2922652065753937, + "learning_rate": 3.9352193694099624e-05, + "loss": 0.1326, + "step": 21014 + }, + { + "epoch": 0.37482609781329146, + "grad_norm": 0.2693033516407013, + "learning_rate": 3.935091920754903e-05, + "loss": 0.1809, + "step": 21015 + }, + { + "epoch": 0.37484393393500515, + "grad_norm": 0.2567836046218872, + "learning_rate": 3.9349644665369304e-05, + "loss": 0.1387, + "step": 21016 + }, + { + "epoch": 0.3748617700567189, + "grad_norm": 0.212194561958313, + "learning_rate": 3.93483700675654e-05, + "loss": 0.154, + "step": 21017 + }, + { + "epoch": 0.3748796061784326, + "grad_norm": 0.282661110162735, + "learning_rate": 3.934709541414227e-05, + "loss": 0.2028, + "step": 21018 + }, + { + "epoch": 0.37489744230014627, + "grad_norm": 0.284035325050354, + "learning_rate": 3.934582070510483e-05, + "loss": 0.1786, + "step": 21019 + }, + { + "epoch": 0.37491527842185995, + "grad_norm": 0.25869375467300415, + "learning_rate": 3.9344545940458044e-05, + "loss": 0.1232, + "step": 21020 + }, + { + "epoch": 0.37493311454357364, + "grad_norm": 0.26101529598236084, + "learning_rate": 3.934327112020684e-05, + "loss": 0.1454, + "step": 21021 + }, + { + "epoch": 0.37495095066528733, + "grad_norm": 0.4360544979572296, + "learning_rate": 3.9341996244356164e-05, + "loss": 0.1473, + "step": 21022 + }, + { + "epoch": 0.374968786787001, + "grad_norm": 0.2781641185283661, + "learning_rate": 3.934072131291096e-05, + "loss": 0.1365, + "step": 21023 + }, + { + "epoch": 0.3749866229087147, + "grad_norm": 0.32837679982185364, + "learning_rate": 3.933944632587615e-05, + "loss": 0.1905, + "step": 21024 + }, + { + "epoch": 0.37500445903042845, + "grad_norm": 0.24864588677883148, + "learning_rate": 3.933817128325671e-05, + "loss": 0.1508, + "step": 21025 + }, + { + "epoch": 0.37502229515214214, + "grad_norm": 0.19751842319965363, + "learning_rate": 3.933689618505756e-05, + "loss": 0.1384, + "step": 21026 + }, + { + "epoch": 0.3750401312738558, + "grad_norm": 0.27384525537490845, + "learning_rate": 3.933562103128365e-05, + "loss": 0.1231, + "step": 21027 + }, + { + "epoch": 0.3750579673955695, + "grad_norm": 0.4188830554485321, + "learning_rate": 3.9334345821939925e-05, + "loss": 0.1691, + "step": 21028 + }, + { + "epoch": 0.3750758035172832, + "grad_norm": 0.20186211168766022, + "learning_rate": 3.933307055703132e-05, + "loss": 0.1015, + "step": 21029 + }, + { + "epoch": 0.3750936396389969, + "grad_norm": 0.25390926003456116, + "learning_rate": 3.9331795236562785e-05, + "loss": 0.1405, + "step": 21030 + }, + { + "epoch": 0.3751114757607106, + "grad_norm": 0.25701770186424255, + "learning_rate": 3.933051986053926e-05, + "loss": 0.1939, + "step": 21031 + }, + { + "epoch": 0.37512931188242427, + "grad_norm": 0.30005180835723877, + "learning_rate": 3.9329244428965684e-05, + "loss": 0.1419, + "step": 21032 + }, + { + "epoch": 0.37514714800413795, + "grad_norm": 0.2700938582420349, + "learning_rate": 3.932796894184702e-05, + "loss": 0.2098, + "step": 21033 + }, + { + "epoch": 0.3751649841258517, + "grad_norm": 0.2825222909450531, + "learning_rate": 3.9326693399188195e-05, + "loss": 0.1904, + "step": 21034 + }, + { + "epoch": 0.3751828202475654, + "grad_norm": 0.28326770663261414, + "learning_rate": 3.932541780099416e-05, + "loss": 0.198, + "step": 21035 + }, + { + "epoch": 0.3752006563692791, + "grad_norm": 0.29205089807510376, + "learning_rate": 3.932414214726985e-05, + "loss": 0.1544, + "step": 21036 + }, + { + "epoch": 0.37521849249099276, + "grad_norm": 0.31696927547454834, + "learning_rate": 3.932286643802022e-05, + "loss": 0.2181, + "step": 21037 + }, + { + "epoch": 0.37523632861270645, + "grad_norm": 0.24711734056472778, + "learning_rate": 3.932159067325022e-05, + "loss": 0.1392, + "step": 21038 + }, + { + "epoch": 0.37525416473442014, + "grad_norm": 0.3203314244747162, + "learning_rate": 3.932031485296478e-05, + "loss": 0.1857, + "step": 21039 + }, + { + "epoch": 0.3752720008561338, + "grad_norm": 0.30566754937171936, + "learning_rate": 3.9319038977168865e-05, + "loss": 0.2117, + "step": 21040 + }, + { + "epoch": 0.3752898369778475, + "grad_norm": 0.29654553532600403, + "learning_rate": 3.9317763045867393e-05, + "loss": 0.1825, + "step": 21041 + }, + { + "epoch": 0.37530767309956126, + "grad_norm": 0.2398095726966858, + "learning_rate": 3.9316487059065335e-05, + "loss": 0.1418, + "step": 21042 + }, + { + "epoch": 0.37532550922127494, + "grad_norm": 0.35327228903770447, + "learning_rate": 3.931521101676763e-05, + "loss": 0.14, + "step": 21043 + }, + { + "epoch": 0.37534334534298863, + "grad_norm": 0.2769358456134796, + "learning_rate": 3.9313934918979224e-05, + "loss": 0.1257, + "step": 21044 + }, + { + "epoch": 0.3753611814647023, + "grad_norm": 0.29926323890686035, + "learning_rate": 3.931265876570506e-05, + "loss": 0.143, + "step": 21045 + }, + { + "epoch": 0.375379017586416, + "grad_norm": 0.33838826417922974, + "learning_rate": 3.9311382556950084e-05, + "loss": 0.2138, + "step": 21046 + }, + { + "epoch": 0.3753968537081297, + "grad_norm": 0.2684604525566101, + "learning_rate": 3.931010629271924e-05, + "loss": 0.1789, + "step": 21047 + }, + { + "epoch": 0.3754146898298434, + "grad_norm": 0.22139166295528412, + "learning_rate": 3.9308829973017495e-05, + "loss": 0.1723, + "step": 21048 + }, + { + "epoch": 0.3754325259515571, + "grad_norm": 0.28836438059806824, + "learning_rate": 3.930755359784978e-05, + "loss": 0.0951, + "step": 21049 + }, + { + "epoch": 0.37545036207327076, + "grad_norm": 0.38170522451400757, + "learning_rate": 3.930627716722104e-05, + "loss": 0.1896, + "step": 21050 + }, + { + "epoch": 0.3754681981949845, + "grad_norm": 0.3336580693721771, + "learning_rate": 3.9305000681136236e-05, + "loss": 0.1902, + "step": 21051 + }, + { + "epoch": 0.3754860343166982, + "grad_norm": 0.39005589485168457, + "learning_rate": 3.93037241396003e-05, + "loss": 0.1897, + "step": 21052 + }, + { + "epoch": 0.3755038704384119, + "grad_norm": 0.2585568428039551, + "learning_rate": 3.93024475426182e-05, + "loss": 0.1839, + "step": 21053 + }, + { + "epoch": 0.37552170656012557, + "grad_norm": 0.22890785336494446, + "learning_rate": 3.930117089019486e-05, + "loss": 0.1314, + "step": 21054 + }, + { + "epoch": 0.37553954268183926, + "grad_norm": 0.20664522051811218, + "learning_rate": 3.929989418233525e-05, + "loss": 0.16, + "step": 21055 + }, + { + "epoch": 0.37555737880355294, + "grad_norm": 0.41688215732574463, + "learning_rate": 3.929861741904431e-05, + "loss": 0.183, + "step": 21056 + }, + { + "epoch": 0.37557521492526663, + "grad_norm": 0.208701953291893, + "learning_rate": 3.9297340600326995e-05, + "loss": 0.1426, + "step": 21057 + }, + { + "epoch": 0.3755930510469803, + "grad_norm": 0.2092231810092926, + "learning_rate": 3.9296063726188244e-05, + "loss": 0.1363, + "step": 21058 + }, + { + "epoch": 0.37561088716869406, + "grad_norm": 0.3435227870941162, + "learning_rate": 3.9294786796633007e-05, + "loss": 0.2322, + "step": 21059 + }, + { + "epoch": 0.37562872329040775, + "grad_norm": 0.20513859391212463, + "learning_rate": 3.929350981166625e-05, + "loss": 0.1475, + "step": 21060 + }, + { + "epoch": 0.37564655941212144, + "grad_norm": 0.22249682247638702, + "learning_rate": 3.929223277129291e-05, + "loss": 0.14, + "step": 21061 + }, + { + "epoch": 0.3756643955338351, + "grad_norm": 0.267733633518219, + "learning_rate": 3.9290955675517934e-05, + "loss": 0.1979, + "step": 21062 + }, + { + "epoch": 0.3756822316555488, + "grad_norm": 0.32979053258895874, + "learning_rate": 3.9289678524346284e-05, + "loss": 0.1409, + "step": 21063 + }, + { + "epoch": 0.3757000677772625, + "grad_norm": 0.3158261477947235, + "learning_rate": 3.92884013177829e-05, + "loss": 0.1691, + "step": 21064 + }, + { + "epoch": 0.3757179038989762, + "grad_norm": 0.26205506920814514, + "learning_rate": 3.928712405583274e-05, + "loss": 0.1657, + "step": 21065 + }, + { + "epoch": 0.3757357400206899, + "grad_norm": 0.27283820509910583, + "learning_rate": 3.9285846738500754e-05, + "loss": 0.2025, + "step": 21066 + }, + { + "epoch": 0.3757535761424036, + "grad_norm": 0.2515011429786682, + "learning_rate": 3.9284569365791885e-05, + "loss": 0.1875, + "step": 21067 + }, + { + "epoch": 0.3757714122641173, + "grad_norm": 0.2335003763437271, + "learning_rate": 3.9283291937711096e-05, + "loss": 0.1596, + "step": 21068 + }, + { + "epoch": 0.375789248385831, + "grad_norm": 0.24934139847755432, + "learning_rate": 3.9282014454263335e-05, + "loss": 0.2232, + "step": 21069 + }, + { + "epoch": 0.3758070845075447, + "grad_norm": 0.3563438653945923, + "learning_rate": 3.9280736915453555e-05, + "loss": 0.2303, + "step": 21070 + }, + { + "epoch": 0.3758249206292584, + "grad_norm": 0.23537486791610718, + "learning_rate": 3.92794593212867e-05, + "loss": 0.1473, + "step": 21071 + }, + { + "epoch": 0.37584275675097206, + "grad_norm": 0.23724907636642456, + "learning_rate": 3.927818167176773e-05, + "loss": 0.1377, + "step": 21072 + }, + { + "epoch": 0.37586059287268575, + "grad_norm": 0.22681060433387756, + "learning_rate": 3.92769039669016e-05, + "loss": 0.1606, + "step": 21073 + }, + { + "epoch": 0.37587842899439944, + "grad_norm": 0.18544234335422516, + "learning_rate": 3.927562620669326e-05, + "loss": 0.1211, + "step": 21074 + }, + { + "epoch": 0.3758962651161131, + "grad_norm": 0.2739073932170868, + "learning_rate": 3.927434839114766e-05, + "loss": 0.1589, + "step": 21075 + }, + { + "epoch": 0.37591410123782687, + "grad_norm": 0.31435784697532654, + "learning_rate": 3.927307052026975e-05, + "loss": 0.1195, + "step": 21076 + }, + { + "epoch": 0.37593193735954056, + "grad_norm": 0.28558459877967834, + "learning_rate": 3.9271792594064495e-05, + "loss": 0.2036, + "step": 21077 + }, + { + "epoch": 0.37594977348125425, + "grad_norm": 0.18557113409042358, + "learning_rate": 3.9270514612536844e-05, + "loss": 0.1515, + "step": 21078 + }, + { + "epoch": 0.37596760960296793, + "grad_norm": 0.2325652688741684, + "learning_rate": 3.926923657569175e-05, + "loss": 0.1551, + "step": 21079 + }, + { + "epoch": 0.3759854457246816, + "grad_norm": 0.36040163040161133, + "learning_rate": 3.926795848353416e-05, + "loss": 0.2233, + "step": 21080 + }, + { + "epoch": 0.3760032818463953, + "grad_norm": 0.21082210540771484, + "learning_rate": 3.9266680336069036e-05, + "loss": 0.1633, + "step": 21081 + }, + { + "epoch": 0.376021117968109, + "grad_norm": 0.2819143533706665, + "learning_rate": 3.926540213330133e-05, + "loss": 0.2116, + "step": 21082 + }, + { + "epoch": 0.3760389540898227, + "grad_norm": 0.2111992985010147, + "learning_rate": 3.9264123875236006e-05, + "loss": 0.1362, + "step": 21083 + }, + { + "epoch": 0.37605679021153643, + "grad_norm": 0.28136852383613586, + "learning_rate": 3.926284556187801e-05, + "loss": 0.1426, + "step": 21084 + }, + { + "epoch": 0.3760746263332501, + "grad_norm": 0.31279659271240234, + "learning_rate": 3.92615671932323e-05, + "loss": 0.1968, + "step": 21085 + }, + { + "epoch": 0.3760924624549638, + "grad_norm": 0.24259164929389954, + "learning_rate": 3.926028876930382e-05, + "loss": 0.1567, + "step": 21086 + }, + { + "epoch": 0.3761102985766775, + "grad_norm": 0.2827032506465912, + "learning_rate": 3.925901029009754e-05, + "loss": 0.1736, + "step": 21087 + }, + { + "epoch": 0.3761281346983912, + "grad_norm": 0.2551811635494232, + "learning_rate": 3.9257731755618414e-05, + "loss": 0.1454, + "step": 21088 + }, + { + "epoch": 0.37614597082010487, + "grad_norm": 0.42513418197631836, + "learning_rate": 3.9256453165871397e-05, + "loss": 0.1492, + "step": 21089 + }, + { + "epoch": 0.37616380694181856, + "grad_norm": 0.18518340587615967, + "learning_rate": 3.9255174520861436e-05, + "loss": 0.107, + "step": 21090 + }, + { + "epoch": 0.37618164306353225, + "grad_norm": 0.22969557344913483, + "learning_rate": 3.92538958205935e-05, + "loss": 0.1651, + "step": 21091 + }, + { + "epoch": 0.37619947918524593, + "grad_norm": 0.19186931848526, + "learning_rate": 3.925261706507254e-05, + "loss": 0.1357, + "step": 21092 + }, + { + "epoch": 0.3762173153069597, + "grad_norm": 0.3463195860385895, + "learning_rate": 3.925133825430351e-05, + "loss": 0.1844, + "step": 21093 + }, + { + "epoch": 0.37623515142867336, + "grad_norm": 0.22685328125953674, + "learning_rate": 3.9250059388291375e-05, + "loss": 0.1736, + "step": 21094 + }, + { + "epoch": 0.37625298755038705, + "grad_norm": 0.239205002784729, + "learning_rate": 3.9248780467041094e-05, + "loss": 0.1552, + "step": 21095 + }, + { + "epoch": 0.37627082367210074, + "grad_norm": 0.24057349562644958, + "learning_rate": 3.92475014905576e-05, + "loss": 0.1665, + "step": 21096 + }, + { + "epoch": 0.37628865979381443, + "grad_norm": 0.27467283606529236, + "learning_rate": 3.924622245884588e-05, + "loss": 0.153, + "step": 21097 + }, + { + "epoch": 0.3763064959155281, + "grad_norm": 0.23760554194450378, + "learning_rate": 3.9244943371910895e-05, + "loss": 0.1503, + "step": 21098 + }, + { + "epoch": 0.3763243320372418, + "grad_norm": 0.28076374530792236, + "learning_rate": 3.924366422975757e-05, + "loss": 0.1867, + "step": 21099 + }, + { + "epoch": 0.3763421681589555, + "grad_norm": 0.24499619007110596, + "learning_rate": 3.924238503239089e-05, + "loss": 0.2002, + "step": 21100 + }, + { + "epoch": 0.37636000428066924, + "grad_norm": 0.30110907554626465, + "learning_rate": 3.924110577981581e-05, + "loss": 0.2248, + "step": 21101 + }, + { + "epoch": 0.3763778404023829, + "grad_norm": 0.30522486567497253, + "learning_rate": 3.923982647203728e-05, + "loss": 0.1622, + "step": 21102 + }, + { + "epoch": 0.3763956765240966, + "grad_norm": 0.40530920028686523, + "learning_rate": 3.9238547109060265e-05, + "loss": 0.2179, + "step": 21103 + }, + { + "epoch": 0.3764135126458103, + "grad_norm": 0.2657020688056946, + "learning_rate": 3.9237267690889716e-05, + "loss": 0.1418, + "step": 21104 + }, + { + "epoch": 0.376431348767524, + "grad_norm": 0.26725947856903076, + "learning_rate": 3.923598821753061e-05, + "loss": 0.1743, + "step": 21105 + }, + { + "epoch": 0.3764491848892377, + "grad_norm": 0.3668763041496277, + "learning_rate": 3.9234708688987896e-05, + "loss": 0.2352, + "step": 21106 + }, + { + "epoch": 0.37646702101095136, + "grad_norm": 0.2298911064863205, + "learning_rate": 3.923342910526653e-05, + "loss": 0.168, + "step": 21107 + }, + { + "epoch": 0.37648485713266505, + "grad_norm": 0.38377413153648376, + "learning_rate": 3.923214946637148e-05, + "loss": 0.2558, + "step": 21108 + }, + { + "epoch": 0.3765026932543788, + "grad_norm": 0.2673284113407135, + "learning_rate": 3.9230869772307713e-05, + "loss": 0.2117, + "step": 21109 + }, + { + "epoch": 0.3765205293760925, + "grad_norm": 0.21532359719276428, + "learning_rate": 3.9229590023080164e-05, + "loss": 0.1594, + "step": 21110 + }, + { + "epoch": 0.37653836549780617, + "grad_norm": 0.2449541538953781, + "learning_rate": 3.9228310218693816e-05, + "loss": 0.1225, + "step": 21111 + }, + { + "epoch": 0.37655620161951986, + "grad_norm": 0.37000924348831177, + "learning_rate": 3.9227030359153616e-05, + "loss": 0.1485, + "step": 21112 + }, + { + "epoch": 0.37657403774123355, + "grad_norm": 0.31963300704956055, + "learning_rate": 3.922575044446454e-05, + "loss": 0.1402, + "step": 21113 + }, + { + "epoch": 0.37659187386294724, + "grad_norm": 0.28932279348373413, + "learning_rate": 3.9224470474631546e-05, + "loss": 0.2088, + "step": 21114 + }, + { + "epoch": 0.3766097099846609, + "grad_norm": 0.30094069242477417, + "learning_rate": 3.922319044965958e-05, + "loss": 0.1553, + "step": 21115 + }, + { + "epoch": 0.3766275461063746, + "grad_norm": 0.2677748203277588, + "learning_rate": 3.922191036955363e-05, + "loss": 0.191, + "step": 21116 + }, + { + "epoch": 0.3766453822280883, + "grad_norm": 0.24286675453186035, + "learning_rate": 3.922063023431863e-05, + "loss": 0.1299, + "step": 21117 + }, + { + "epoch": 0.37666321834980204, + "grad_norm": 0.3031335771083832, + "learning_rate": 3.9219350043959556e-05, + "loss": 0.1389, + "step": 21118 + }, + { + "epoch": 0.37668105447151573, + "grad_norm": 0.24761976301670074, + "learning_rate": 3.921806979848137e-05, + "loss": 0.1323, + "step": 21119 + }, + { + "epoch": 0.3766988905932294, + "grad_norm": 0.28467342257499695, + "learning_rate": 3.9216789497889046e-05, + "loss": 0.1877, + "step": 21120 + }, + { + "epoch": 0.3767167267149431, + "grad_norm": 0.2957324683666229, + "learning_rate": 3.921550914218752e-05, + "loss": 0.2204, + "step": 21121 + }, + { + "epoch": 0.3767345628366568, + "grad_norm": 0.3219175636768341, + "learning_rate": 3.9214228731381784e-05, + "loss": 0.1894, + "step": 21122 + }, + { + "epoch": 0.3767523989583705, + "grad_norm": 0.30537447333335876, + "learning_rate": 3.9212948265476785e-05, + "loss": 0.1536, + "step": 21123 + }, + { + "epoch": 0.37677023508008417, + "grad_norm": 0.37514179944992065, + "learning_rate": 3.921166774447749e-05, + "loss": 0.1865, + "step": 21124 + }, + { + "epoch": 0.37678807120179786, + "grad_norm": 0.3233698904514313, + "learning_rate": 3.921038716838886e-05, + "loss": 0.1629, + "step": 21125 + }, + { + "epoch": 0.3768059073235116, + "grad_norm": 0.23803956806659698, + "learning_rate": 3.9209106537215854e-05, + "loss": 0.168, + "step": 21126 + }, + { + "epoch": 0.3768237434452253, + "grad_norm": 0.27758491039276123, + "learning_rate": 3.9207825850963454e-05, + "loss": 0.1622, + "step": 21127 + }, + { + "epoch": 0.376841579566939, + "grad_norm": 0.3723032772541046, + "learning_rate": 3.920654510963661e-05, + "loss": 0.1071, + "step": 21128 + }, + { + "epoch": 0.37685941568865267, + "grad_norm": 0.2727159857749939, + "learning_rate": 3.9205264313240296e-05, + "loss": 0.1745, + "step": 21129 + }, + { + "epoch": 0.37687725181036635, + "grad_norm": 0.31325629353523254, + "learning_rate": 3.9203983461779465e-05, + "loss": 0.1851, + "step": 21130 + }, + { + "epoch": 0.37689508793208004, + "grad_norm": 0.25675976276397705, + "learning_rate": 3.920270255525909e-05, + "loss": 0.1714, + "step": 21131 + }, + { + "epoch": 0.37691292405379373, + "grad_norm": 0.3355540931224823, + "learning_rate": 3.920142159368413e-05, + "loss": 0.1971, + "step": 21132 + }, + { + "epoch": 0.3769307601755074, + "grad_norm": 0.29059579968452454, + "learning_rate": 3.9200140577059566e-05, + "loss": 0.1527, + "step": 21133 + }, + { + "epoch": 0.3769485962972211, + "grad_norm": 0.28188809752464294, + "learning_rate": 3.919885950539034e-05, + "loss": 0.0655, + "step": 21134 + }, + { + "epoch": 0.37696643241893485, + "grad_norm": 0.2044854313135147, + "learning_rate": 3.919757837868143e-05, + "loss": 0.1503, + "step": 21135 + }, + { + "epoch": 0.37698426854064854, + "grad_norm": 0.3224719762802124, + "learning_rate": 3.919629719693781e-05, + "loss": 0.135, + "step": 21136 + }, + { + "epoch": 0.3770021046623622, + "grad_norm": 0.15747763216495514, + "learning_rate": 3.919501596016444e-05, + "loss": 0.111, + "step": 21137 + }, + { + "epoch": 0.3770199407840759, + "grad_norm": 0.1874404102563858, + "learning_rate": 3.919373466836628e-05, + "loss": 0.158, + "step": 21138 + }, + { + "epoch": 0.3770377769057896, + "grad_norm": 0.3001151978969574, + "learning_rate": 3.919245332154831e-05, + "loss": 0.2188, + "step": 21139 + }, + { + "epoch": 0.3770556130275033, + "grad_norm": 0.31146538257598877, + "learning_rate": 3.919117191971548e-05, + "loss": 0.139, + "step": 21140 + }, + { + "epoch": 0.377073449149217, + "grad_norm": 0.19606070220470428, + "learning_rate": 3.918989046287277e-05, + "loss": 0.1604, + "step": 21141 + }, + { + "epoch": 0.37709128527093067, + "grad_norm": 0.2943406105041504, + "learning_rate": 3.918860895102514e-05, + "loss": 0.1831, + "step": 21142 + }, + { + "epoch": 0.3771091213926444, + "grad_norm": 0.254599004983902, + "learning_rate": 3.9187327384177564e-05, + "loss": 0.148, + "step": 21143 + }, + { + "epoch": 0.3771269575143581, + "grad_norm": 0.2885769307613373, + "learning_rate": 3.918604576233501e-05, + "loss": 0.1687, + "step": 21144 + }, + { + "epoch": 0.3771447936360718, + "grad_norm": 0.24128156900405884, + "learning_rate": 3.918476408550243e-05, + "loss": 0.1578, + "step": 21145 + }, + { + "epoch": 0.3771626297577855, + "grad_norm": 0.3192015588283539, + "learning_rate": 3.918348235368482e-05, + "loss": 0.1563, + "step": 21146 + }, + { + "epoch": 0.37718046587949916, + "grad_norm": 0.2293146252632141, + "learning_rate": 3.9182200566887126e-05, + "loss": 0.14, + "step": 21147 + }, + { + "epoch": 0.37719830200121285, + "grad_norm": 0.2671649158000946, + "learning_rate": 3.918091872511433e-05, + "loss": 0.1686, + "step": 21148 + }, + { + "epoch": 0.37721613812292654, + "grad_norm": 0.3107260465621948, + "learning_rate": 3.9179636828371394e-05, + "loss": 0.136, + "step": 21149 + }, + { + "epoch": 0.3772339742446402, + "grad_norm": 0.22389011085033417, + "learning_rate": 3.917835487666328e-05, + "loss": 0.1297, + "step": 21150 + }, + { + "epoch": 0.3772518103663539, + "grad_norm": 0.30652227997779846, + "learning_rate": 3.917707286999497e-05, + "loss": 0.1561, + "step": 21151 + }, + { + "epoch": 0.37726964648806766, + "grad_norm": 0.22887754440307617, + "learning_rate": 3.917579080837144e-05, + "loss": 0.1541, + "step": 21152 + }, + { + "epoch": 0.37728748260978134, + "grad_norm": 0.3071889877319336, + "learning_rate": 3.917450869179764e-05, + "loss": 0.1735, + "step": 21153 + }, + { + "epoch": 0.37730531873149503, + "grad_norm": 0.2959241569042206, + "learning_rate": 3.917322652027854e-05, + "loss": 0.1717, + "step": 21154 + }, + { + "epoch": 0.3773231548532087, + "grad_norm": 0.22360247373580933, + "learning_rate": 3.917194429381913e-05, + "loss": 0.1345, + "step": 21155 + }, + { + "epoch": 0.3773409909749224, + "grad_norm": 0.24642226099967957, + "learning_rate": 3.9170662012424364e-05, + "loss": 0.1548, + "step": 21156 + }, + { + "epoch": 0.3773588270966361, + "grad_norm": 0.2418583184480667, + "learning_rate": 3.916937967609922e-05, + "loss": 0.1735, + "step": 21157 + }, + { + "epoch": 0.3773766632183498, + "grad_norm": 0.3053596317768097, + "learning_rate": 3.916809728484866e-05, + "loss": 0.1867, + "step": 21158 + }, + { + "epoch": 0.37739449934006347, + "grad_norm": 0.25629428029060364, + "learning_rate": 3.9166814838677676e-05, + "loss": 0.1911, + "step": 21159 + }, + { + "epoch": 0.3774123354617772, + "grad_norm": 0.26770350337028503, + "learning_rate": 3.916553233759121e-05, + "loss": 0.151, + "step": 21160 + }, + { + "epoch": 0.3774301715834909, + "grad_norm": 0.22661587595939636, + "learning_rate": 3.916424978159425e-05, + "loss": 0.1535, + "step": 21161 + }, + { + "epoch": 0.3774480077052046, + "grad_norm": 0.3032943606376648, + "learning_rate": 3.9162967170691776e-05, + "loss": 0.2057, + "step": 21162 + }, + { + "epoch": 0.3774658438269183, + "grad_norm": 0.26406124234199524, + "learning_rate": 3.916168450488874e-05, + "loss": 0.1682, + "step": 21163 + }, + { + "epoch": 0.37748367994863197, + "grad_norm": 0.27670973539352417, + "learning_rate": 3.9160401784190124e-05, + "loss": 0.1538, + "step": 21164 + }, + { + "epoch": 0.37750151607034566, + "grad_norm": 0.24491195380687714, + "learning_rate": 3.915911900860091e-05, + "loss": 0.1317, + "step": 21165 + }, + { + "epoch": 0.37751935219205934, + "grad_norm": 0.22537662088871002, + "learning_rate": 3.915783617812605e-05, + "loss": 0.1394, + "step": 21166 + }, + { + "epoch": 0.37753718831377303, + "grad_norm": 0.38055962324142456, + "learning_rate": 3.915655329277052e-05, + "loss": 0.1363, + "step": 21167 + }, + { + "epoch": 0.3775550244354868, + "grad_norm": 0.27480241656303406, + "learning_rate": 3.915527035253932e-05, + "loss": 0.1405, + "step": 21168 + }, + { + "epoch": 0.37757286055720046, + "grad_norm": 0.17853978276252747, + "learning_rate": 3.9153987357437396e-05, + "loss": 0.1156, + "step": 21169 + }, + { + "epoch": 0.37759069667891415, + "grad_norm": 0.2834928333759308, + "learning_rate": 3.915270430746972e-05, + "loss": 0.1674, + "step": 21170 + }, + { + "epoch": 0.37760853280062784, + "grad_norm": 0.32998737692832947, + "learning_rate": 3.915142120264128e-05, + "loss": 0.1654, + "step": 21171 + }, + { + "epoch": 0.3776263689223415, + "grad_norm": 0.32447749376296997, + "learning_rate": 3.915013804295704e-05, + "loss": 0.1997, + "step": 21172 + }, + { + "epoch": 0.3776442050440552, + "grad_norm": 0.2537434995174408, + "learning_rate": 3.9148854828421975e-05, + "loss": 0.1497, + "step": 21173 + }, + { + "epoch": 0.3776620411657689, + "grad_norm": 0.23773936927318573, + "learning_rate": 3.914757155904107e-05, + "loss": 0.1195, + "step": 21174 + }, + { + "epoch": 0.3776798772874826, + "grad_norm": 0.26881924271583557, + "learning_rate": 3.914628823481929e-05, + "loss": 0.1611, + "step": 21175 + }, + { + "epoch": 0.3776977134091963, + "grad_norm": 0.2859360873699188, + "learning_rate": 3.9145004855761605e-05, + "loss": 0.2267, + "step": 21176 + }, + { + "epoch": 0.37771554953091, + "grad_norm": 0.24947260320186615, + "learning_rate": 3.9143721421873006e-05, + "loss": 0.1907, + "step": 21177 + }, + { + "epoch": 0.3777333856526237, + "grad_norm": 0.20801378786563873, + "learning_rate": 3.914243793315845e-05, + "loss": 0.1439, + "step": 21178 + }, + { + "epoch": 0.3777512217743374, + "grad_norm": 0.2590799331665039, + "learning_rate": 3.914115438962292e-05, + "loss": 0.19, + "step": 21179 + }, + { + "epoch": 0.3777690578960511, + "grad_norm": 0.2983662486076355, + "learning_rate": 3.913987079127139e-05, + "loss": 0.1356, + "step": 21180 + }, + { + "epoch": 0.3777868940177648, + "grad_norm": 0.25143465399742126, + "learning_rate": 3.913858713810885e-05, + "loss": 0.1909, + "step": 21181 + }, + { + "epoch": 0.37780473013947846, + "grad_norm": 0.2417428195476532, + "learning_rate": 3.913730343014025e-05, + "loss": 0.1447, + "step": 21182 + }, + { + "epoch": 0.37782256626119215, + "grad_norm": 0.21844208240509033, + "learning_rate": 3.9136019667370576e-05, + "loss": 0.1193, + "step": 21183 + }, + { + "epoch": 0.37784040238290584, + "grad_norm": 0.28542360663414, + "learning_rate": 3.913473584980482e-05, + "loss": 0.1791, + "step": 21184 + }, + { + "epoch": 0.3778582385046196, + "grad_norm": 0.2789161205291748, + "learning_rate": 3.9133451977447933e-05, + "loss": 0.1743, + "step": 21185 + }, + { + "epoch": 0.37787607462633327, + "grad_norm": 0.25786641240119934, + "learning_rate": 3.9132168050304904e-05, + "loss": 0.1184, + "step": 21186 + }, + { + "epoch": 0.37789391074804696, + "grad_norm": 0.3577745854854584, + "learning_rate": 3.9130884068380724e-05, + "loss": 0.1474, + "step": 21187 + }, + { + "epoch": 0.37791174686976065, + "grad_norm": 0.2462194263935089, + "learning_rate": 3.9129600031680346e-05, + "loss": 0.1736, + "step": 21188 + }, + { + "epoch": 0.37792958299147433, + "grad_norm": 0.2648969888687134, + "learning_rate": 3.912831594020877e-05, + "loss": 0.1365, + "step": 21189 + }, + { + "epoch": 0.377947419113188, + "grad_norm": 0.22943319380283356, + "learning_rate": 3.9127031793970946e-05, + "loss": 0.1215, + "step": 21190 + }, + { + "epoch": 0.3779652552349017, + "grad_norm": 0.45379889011383057, + "learning_rate": 3.912574759297188e-05, + "loss": 0.157, + "step": 21191 + }, + { + "epoch": 0.3779830913566154, + "grad_norm": 0.2934167683124542, + "learning_rate": 3.9124463337216535e-05, + "loss": 0.1353, + "step": 21192 + }, + { + "epoch": 0.3780009274783291, + "grad_norm": 0.2566176950931549, + "learning_rate": 3.912317902670989e-05, + "loss": 0.1437, + "step": 21193 + }, + { + "epoch": 0.37801876360004283, + "grad_norm": 0.35906800627708435, + "learning_rate": 3.912189466145692e-05, + "loss": 0.2763, + "step": 21194 + }, + { + "epoch": 0.3780365997217565, + "grad_norm": 0.2270839810371399, + "learning_rate": 3.9120610241462605e-05, + "loss": 0.1636, + "step": 21195 + }, + { + "epoch": 0.3780544358434702, + "grad_norm": 0.23638534545898438, + "learning_rate": 3.9119325766731945e-05, + "loss": 0.1472, + "step": 21196 + }, + { + "epoch": 0.3780722719651839, + "grad_norm": 0.2532123327255249, + "learning_rate": 3.9118041237269886e-05, + "loss": 0.1549, + "step": 21197 + }, + { + "epoch": 0.3780901080868976, + "grad_norm": 0.21310847997665405, + "learning_rate": 3.9116756653081434e-05, + "loss": 0.1577, + "step": 21198 + }, + { + "epoch": 0.37810794420861127, + "grad_norm": 0.33060526847839355, + "learning_rate": 3.911547201417155e-05, + "loss": 0.1887, + "step": 21199 + }, + { + "epoch": 0.37812578033032496, + "grad_norm": 0.3737643361091614, + "learning_rate": 3.911418732054522e-05, + "loss": 0.1514, + "step": 21200 + }, + { + "epoch": 0.37814361645203864, + "grad_norm": 0.2087615728378296, + "learning_rate": 3.911290257220743e-05, + "loss": 0.1471, + "step": 21201 + }, + { + "epoch": 0.3781614525737524, + "grad_norm": 0.38428646326065063, + "learning_rate": 3.9111617769163155e-05, + "loss": 0.133, + "step": 21202 + }, + { + "epoch": 0.3781792886954661, + "grad_norm": 0.3405749499797821, + "learning_rate": 3.911033291141738e-05, + "loss": 0.126, + "step": 21203 + }, + { + "epoch": 0.37819712481717976, + "grad_norm": 0.19987717270851135, + "learning_rate": 3.9109047998975076e-05, + "loss": 0.1207, + "step": 21204 + }, + { + "epoch": 0.37821496093889345, + "grad_norm": 0.2265322059392929, + "learning_rate": 3.9107763031841226e-05, + "loss": 0.1377, + "step": 21205 + }, + { + "epoch": 0.37823279706060714, + "grad_norm": 0.4929906725883484, + "learning_rate": 3.910647801002082e-05, + "loss": 0.1679, + "step": 21206 + }, + { + "epoch": 0.37825063318232083, + "grad_norm": 0.25322386622428894, + "learning_rate": 3.9105192933518824e-05, + "loss": 0.1363, + "step": 21207 + }, + { + "epoch": 0.3782684693040345, + "grad_norm": 0.19747935235500336, + "learning_rate": 3.910390780234023e-05, + "loss": 0.1324, + "step": 21208 + }, + { + "epoch": 0.3782863054257482, + "grad_norm": 0.22482000291347504, + "learning_rate": 3.910262261649003e-05, + "loss": 0.142, + "step": 21209 + }, + { + "epoch": 0.37830414154746195, + "grad_norm": 0.2056960016489029, + "learning_rate": 3.910133737597318e-05, + "loss": 0.1479, + "step": 21210 + }, + { + "epoch": 0.37832197766917564, + "grad_norm": 0.2445351928472519, + "learning_rate": 3.910005208079468e-05, + "loss": 0.1177, + "step": 21211 + }, + { + "epoch": 0.3783398137908893, + "grad_norm": 0.3166850805282593, + "learning_rate": 3.9098766730959516e-05, + "loss": 0.1354, + "step": 21212 + }, + { + "epoch": 0.378357649912603, + "grad_norm": 0.3053915798664093, + "learning_rate": 3.909748132647265e-05, + "loss": 0.1398, + "step": 21213 + }, + { + "epoch": 0.3783754860343167, + "grad_norm": 0.2880917191505432, + "learning_rate": 3.9096195867339085e-05, + "loss": 0.1758, + "step": 21214 + }, + { + "epoch": 0.3783933221560304, + "grad_norm": 0.2977958917617798, + "learning_rate": 3.9094910353563795e-05, + "loss": 0.1254, + "step": 21215 + }, + { + "epoch": 0.3784111582777441, + "grad_norm": 0.24911907315254211, + "learning_rate": 3.909362478515176e-05, + "loss": 0.1166, + "step": 21216 + }, + { + "epoch": 0.37842899439945776, + "grad_norm": 0.29075637459754944, + "learning_rate": 3.9092339162107976e-05, + "loss": 0.1511, + "step": 21217 + }, + { + "epoch": 0.37844683052117145, + "grad_norm": 0.2191636562347412, + "learning_rate": 3.9091053484437415e-05, + "loss": 0.1333, + "step": 21218 + }, + { + "epoch": 0.3784646666428852, + "grad_norm": 0.25263407826423645, + "learning_rate": 3.908976775214506e-05, + "loss": 0.1088, + "step": 21219 + }, + { + "epoch": 0.3784825027645989, + "grad_norm": 0.18928050994873047, + "learning_rate": 3.90884819652359e-05, + "loss": 0.1152, + "step": 21220 + }, + { + "epoch": 0.37850033888631257, + "grad_norm": 0.3013874888420105, + "learning_rate": 3.908719612371492e-05, + "loss": 0.154, + "step": 21221 + }, + { + "epoch": 0.37851817500802626, + "grad_norm": 0.21188804507255554, + "learning_rate": 3.90859102275871e-05, + "loss": 0.1385, + "step": 21222 + }, + { + "epoch": 0.37853601112973995, + "grad_norm": 0.3053354322910309, + "learning_rate": 3.908462427685743e-05, + "loss": 0.1564, + "step": 21223 + }, + { + "epoch": 0.37855384725145363, + "grad_norm": 0.2779475748538971, + "learning_rate": 3.908333827153089e-05, + "loss": 0.1343, + "step": 21224 + }, + { + "epoch": 0.3785716833731673, + "grad_norm": 0.37374675273895264, + "learning_rate": 3.9082052211612464e-05, + "loss": 0.1169, + "step": 21225 + }, + { + "epoch": 0.378589519494881, + "grad_norm": 0.6503673195838928, + "learning_rate": 3.9080766097107144e-05, + "loss": 0.3509, + "step": 21226 + }, + { + "epoch": 0.37860735561659475, + "grad_norm": 0.19449803233146667, + "learning_rate": 3.907947992801991e-05, + "loss": 0.1154, + "step": 21227 + }, + { + "epoch": 0.37862519173830844, + "grad_norm": 0.2848609983921051, + "learning_rate": 3.9078193704355745e-05, + "loss": 0.1827, + "step": 21228 + }, + { + "epoch": 0.37864302786002213, + "grad_norm": 0.2472040057182312, + "learning_rate": 3.907690742611964e-05, + "loss": 0.1787, + "step": 21229 + }, + { + "epoch": 0.3786608639817358, + "grad_norm": 0.26020362973213196, + "learning_rate": 3.907562109331658e-05, + "loss": 0.1735, + "step": 21230 + }, + { + "epoch": 0.3786787001034495, + "grad_norm": 0.3421153426170349, + "learning_rate": 3.907433470595156e-05, + "loss": 0.1924, + "step": 21231 + }, + { + "epoch": 0.3786965362251632, + "grad_norm": 0.24508266150951385, + "learning_rate": 3.907304826402955e-05, + "loss": 0.1411, + "step": 21232 + }, + { + "epoch": 0.3787143723468769, + "grad_norm": 0.2621370553970337, + "learning_rate": 3.907176176755555e-05, + "loss": 0.1592, + "step": 21233 + }, + { + "epoch": 0.37873220846859057, + "grad_norm": 0.278695672750473, + "learning_rate": 3.907047521653453e-05, + "loss": 0.1405, + "step": 21234 + }, + { + "epoch": 0.37875004459030426, + "grad_norm": 0.40978237986564636, + "learning_rate": 3.9069188610971495e-05, + "loss": 0.1605, + "step": 21235 + }, + { + "epoch": 0.378767880712018, + "grad_norm": 0.3006298243999481, + "learning_rate": 3.906790195087142e-05, + "loss": 0.197, + "step": 21236 + }, + { + "epoch": 0.3787857168337317, + "grad_norm": 0.3026232421398163, + "learning_rate": 3.906661523623931e-05, + "loss": 0.1473, + "step": 21237 + }, + { + "epoch": 0.3788035529554454, + "grad_norm": 0.360795795917511, + "learning_rate": 3.9065328467080134e-05, + "loss": 0.1819, + "step": 21238 + }, + { + "epoch": 0.37882138907715907, + "grad_norm": 0.29126667976379395, + "learning_rate": 3.9064041643398884e-05, + "loss": 0.1964, + "step": 21239 + }, + { + "epoch": 0.37883922519887275, + "grad_norm": 0.26633015275001526, + "learning_rate": 3.906275476520055e-05, + "loss": 0.1807, + "step": 21240 + }, + { + "epoch": 0.37885706132058644, + "grad_norm": 0.20632028579711914, + "learning_rate": 3.9061467832490125e-05, + "loss": 0.1247, + "step": 21241 + }, + { + "epoch": 0.37887489744230013, + "grad_norm": 0.31353750824928284, + "learning_rate": 3.90601808452726e-05, + "loss": 0.1945, + "step": 21242 + }, + { + "epoch": 0.3788927335640138, + "grad_norm": 0.3836959898471832, + "learning_rate": 3.905889380355295e-05, + "loss": 0.1782, + "step": 21243 + }, + { + "epoch": 0.37891056968572756, + "grad_norm": 0.32514551281929016, + "learning_rate": 3.9057606707336174e-05, + "loss": 0.1609, + "step": 21244 + }, + { + "epoch": 0.37892840580744125, + "grad_norm": 0.17793434858322144, + "learning_rate": 3.905631955662726e-05, + "loss": 0.1334, + "step": 21245 + }, + { + "epoch": 0.37894624192915494, + "grad_norm": 0.22976845502853394, + "learning_rate": 3.90550323514312e-05, + "loss": 0.168, + "step": 21246 + }, + { + "epoch": 0.3789640780508686, + "grad_norm": 0.2741779088973999, + "learning_rate": 3.905374509175297e-05, + "loss": 0.1788, + "step": 21247 + }, + { + "epoch": 0.3789819141725823, + "grad_norm": 0.3179311752319336, + "learning_rate": 3.905245777759757e-05, + "loss": 0.1563, + "step": 21248 + }, + { + "epoch": 0.378999750294296, + "grad_norm": 0.1751677691936493, + "learning_rate": 3.905117040896999e-05, + "loss": 0.1265, + "step": 21249 + }, + { + "epoch": 0.3790175864160097, + "grad_norm": 0.27600058913230896, + "learning_rate": 3.904988298587524e-05, + "loss": 0.1717, + "step": 21250 + }, + { + "epoch": 0.3790354225377234, + "grad_norm": 0.2622409760951996, + "learning_rate": 3.904859550831827e-05, + "loss": 0.1359, + "step": 21251 + }, + { + "epoch": 0.37905325865943706, + "grad_norm": 0.2521669268608093, + "learning_rate": 3.90473079763041e-05, + "loss": 0.1846, + "step": 21252 + }, + { + "epoch": 0.3790710947811508, + "grad_norm": 0.2424364686012268, + "learning_rate": 3.904602038983771e-05, + "loss": 0.1084, + "step": 21253 + }, + { + "epoch": 0.3790889309028645, + "grad_norm": 0.2561042904853821, + "learning_rate": 3.904473274892409e-05, + "loss": 0.153, + "step": 21254 + }, + { + "epoch": 0.3791067670245782, + "grad_norm": 0.3898017704486847, + "learning_rate": 3.904344505356824e-05, + "loss": 0.1666, + "step": 21255 + }, + { + "epoch": 0.37912460314629187, + "grad_norm": 0.3199654221534729, + "learning_rate": 3.904215730377515e-05, + "loss": 0.1522, + "step": 21256 + }, + { + "epoch": 0.37914243926800556, + "grad_norm": 0.2556706666946411, + "learning_rate": 3.9040869499549806e-05, + "loss": 0.1583, + "step": 21257 + }, + { + "epoch": 0.37916027538971925, + "grad_norm": 0.2571445107460022, + "learning_rate": 3.9039581640897206e-05, + "loss": 0.1755, + "step": 21258 + }, + { + "epoch": 0.37917811151143294, + "grad_norm": 0.28556835651397705, + "learning_rate": 3.9038293727822326e-05, + "loss": 0.1232, + "step": 21259 + }, + { + "epoch": 0.3791959476331466, + "grad_norm": 0.28444764018058777, + "learning_rate": 3.903700576033018e-05, + "loss": 0.1542, + "step": 21260 + }, + { + "epoch": 0.37921378375486037, + "grad_norm": 0.35084104537963867, + "learning_rate": 3.903571773842575e-05, + "loss": 0.2259, + "step": 21261 + }, + { + "epoch": 0.37923161987657406, + "grad_norm": 0.2079666405916214, + "learning_rate": 3.9034429662114026e-05, + "loss": 0.0906, + "step": 21262 + }, + { + "epoch": 0.37924945599828774, + "grad_norm": 0.2521560490131378, + "learning_rate": 3.903314153140001e-05, + "loss": 0.1741, + "step": 21263 + }, + { + "epoch": 0.37926729212000143, + "grad_norm": 0.2655588984489441, + "learning_rate": 3.903185334628869e-05, + "loss": 0.1366, + "step": 21264 + }, + { + "epoch": 0.3792851282417151, + "grad_norm": 0.263223797082901, + "learning_rate": 3.903056510678506e-05, + "loss": 0.2033, + "step": 21265 + }, + { + "epoch": 0.3793029643634288, + "grad_norm": 0.2996566593647003, + "learning_rate": 3.902927681289411e-05, + "loss": 0.199, + "step": 21266 + }, + { + "epoch": 0.3793208004851425, + "grad_norm": 0.34763407707214355, + "learning_rate": 3.902798846462085e-05, + "loss": 0.2062, + "step": 21267 + }, + { + "epoch": 0.3793386366068562, + "grad_norm": 0.2197709083557129, + "learning_rate": 3.902670006197024e-05, + "loss": 0.1169, + "step": 21268 + }, + { + "epoch": 0.3793564727285699, + "grad_norm": 0.23021401464939117, + "learning_rate": 3.902541160494732e-05, + "loss": 0.1165, + "step": 21269 + }, + { + "epoch": 0.3793743088502836, + "grad_norm": 0.36089032888412476, + "learning_rate": 3.902412309355704e-05, + "loss": 0.1859, + "step": 21270 + }, + { + "epoch": 0.3793921449719973, + "grad_norm": 0.17536687850952148, + "learning_rate": 3.9022834527804425e-05, + "loss": 0.1293, + "step": 21271 + }, + { + "epoch": 0.379409981093711, + "grad_norm": 0.22288085520267487, + "learning_rate": 3.902154590769446e-05, + "loss": 0.2072, + "step": 21272 + }, + { + "epoch": 0.3794278172154247, + "grad_norm": 0.32936733961105347, + "learning_rate": 3.902025723323214e-05, + "loss": 0.1509, + "step": 21273 + }, + { + "epoch": 0.37944565333713837, + "grad_norm": 0.22451190650463104, + "learning_rate": 3.901896850442246e-05, + "loss": 0.1763, + "step": 21274 + }, + { + "epoch": 0.37946348945885205, + "grad_norm": 0.309291809797287, + "learning_rate": 3.9017679721270415e-05, + "loss": 0.1725, + "step": 21275 + }, + { + "epoch": 0.37948132558056574, + "grad_norm": 0.3289521038532257, + "learning_rate": 3.9016390883781e-05, + "loss": 0.1937, + "step": 21276 + }, + { + "epoch": 0.37949916170227943, + "grad_norm": 0.26594680547714233, + "learning_rate": 3.9015101991959215e-05, + "loss": 0.1087, + "step": 21277 + }, + { + "epoch": 0.3795169978239932, + "grad_norm": 0.35846877098083496, + "learning_rate": 3.9013813045810054e-05, + "loss": 0.1338, + "step": 21278 + }, + { + "epoch": 0.37953483394570686, + "grad_norm": 0.23709867894649506, + "learning_rate": 3.901252404533851e-05, + "loss": 0.1589, + "step": 21279 + }, + { + "epoch": 0.37955267006742055, + "grad_norm": 0.37287458777427673, + "learning_rate": 3.901123499054959e-05, + "loss": 0.1838, + "step": 21280 + }, + { + "epoch": 0.37957050618913424, + "grad_norm": 0.21567387878894806, + "learning_rate": 3.900994588144828e-05, + "loss": 0.1388, + "step": 21281 + }, + { + "epoch": 0.3795883423108479, + "grad_norm": 0.2750032842159271, + "learning_rate": 3.9008656718039585e-05, + "loss": 0.1665, + "step": 21282 + }, + { + "epoch": 0.3796061784325616, + "grad_norm": 0.23288658261299133, + "learning_rate": 3.900736750032849e-05, + "loss": 0.14, + "step": 21283 + }, + { + "epoch": 0.3796240145542753, + "grad_norm": 0.21842624247074127, + "learning_rate": 3.900607822832001e-05, + "loss": 0.1446, + "step": 21284 + }, + { + "epoch": 0.379641850675989, + "grad_norm": 0.25891900062561035, + "learning_rate": 3.900478890201913e-05, + "loss": 0.1166, + "step": 21285 + }, + { + "epoch": 0.37965968679770273, + "grad_norm": 0.28322353959083557, + "learning_rate": 3.9003499521430844e-05, + "loss": 0.1617, + "step": 21286 + }, + { + "epoch": 0.3796775229194164, + "grad_norm": 0.24988368153572083, + "learning_rate": 3.9002210086560165e-05, + "loss": 0.1246, + "step": 21287 + }, + { + "epoch": 0.3796953590411301, + "grad_norm": 0.2221831977367401, + "learning_rate": 3.9000920597412076e-05, + "loss": 0.139, + "step": 21288 + }, + { + "epoch": 0.3797131951628438, + "grad_norm": 0.24445988237857819, + "learning_rate": 3.899963105399159e-05, + "loss": 0.1336, + "step": 21289 + }, + { + "epoch": 0.3797310312845575, + "grad_norm": 0.2632139325141907, + "learning_rate": 3.89983414563037e-05, + "loss": 0.1645, + "step": 21290 + }, + { + "epoch": 0.3797488674062712, + "grad_norm": 0.3205166757106781, + "learning_rate": 3.8997051804353395e-05, + "loss": 0.0936, + "step": 21291 + }, + { + "epoch": 0.37976670352798486, + "grad_norm": 0.29358381032943726, + "learning_rate": 3.899576209814569e-05, + "loss": 0.1812, + "step": 21292 + }, + { + "epoch": 0.37978453964969855, + "grad_norm": 0.2555774748325348, + "learning_rate": 3.899447233768557e-05, + "loss": 0.1811, + "step": 21293 + }, + { + "epoch": 0.37980237577141224, + "grad_norm": 0.32090839743614197, + "learning_rate": 3.899318252297805e-05, + "loss": 0.2228, + "step": 21294 + }, + { + "epoch": 0.379820211893126, + "grad_norm": 0.2669735848903656, + "learning_rate": 3.8991892654028115e-05, + "loss": 0.1243, + "step": 21295 + }, + { + "epoch": 0.37983804801483967, + "grad_norm": 0.2927315831184387, + "learning_rate": 3.8990602730840774e-05, + "loss": 0.155, + "step": 21296 + }, + { + "epoch": 0.37985588413655336, + "grad_norm": 0.2219819873571396, + "learning_rate": 3.898931275342104e-05, + "loss": 0.17, + "step": 21297 + }, + { + "epoch": 0.37987372025826704, + "grad_norm": 0.23766762018203735, + "learning_rate": 3.898802272177388e-05, + "loss": 0.1575, + "step": 21298 + }, + { + "epoch": 0.37989155637998073, + "grad_norm": 0.24942241609096527, + "learning_rate": 3.898673263590431e-05, + "loss": 0.1357, + "step": 21299 + }, + { + "epoch": 0.3799093925016944, + "grad_norm": 0.30427879095077515, + "learning_rate": 3.8985442495817345e-05, + "loss": 0.1702, + "step": 21300 + }, + { + "epoch": 0.3799272286234081, + "grad_norm": 0.2935236692428589, + "learning_rate": 3.898415230151796e-05, + "loss": 0.1042, + "step": 21301 + }, + { + "epoch": 0.3799450647451218, + "grad_norm": 0.2680317163467407, + "learning_rate": 3.898286205301118e-05, + "loss": 0.1427, + "step": 21302 + }, + { + "epoch": 0.37996290086683554, + "grad_norm": 0.3771883249282837, + "learning_rate": 3.8981571750302e-05, + "loss": 0.1681, + "step": 21303 + }, + { + "epoch": 0.37998073698854923, + "grad_norm": 0.20892265439033508, + "learning_rate": 3.898028139339542e-05, + "loss": 0.1501, + "step": 21304 + }, + { + "epoch": 0.3799985731102629, + "grad_norm": 0.23447923362255096, + "learning_rate": 3.897899098229643e-05, + "loss": 0.1736, + "step": 21305 + }, + { + "epoch": 0.3800164092319766, + "grad_norm": 0.28292062878608704, + "learning_rate": 3.897770051701005e-05, + "loss": 0.1891, + "step": 21306 + }, + { + "epoch": 0.3800342453536903, + "grad_norm": 0.20241063833236694, + "learning_rate": 3.8976409997541276e-05, + "loss": 0.1402, + "step": 21307 + }, + { + "epoch": 0.380052081475404, + "grad_norm": 0.24780890345573425, + "learning_rate": 3.897511942389511e-05, + "loss": 0.151, + "step": 21308 + }, + { + "epoch": 0.38006991759711767, + "grad_norm": 0.2711176574230194, + "learning_rate": 3.897382879607655e-05, + "loss": 0.1622, + "step": 21309 + }, + { + "epoch": 0.38008775371883136, + "grad_norm": 0.2536943554878235, + "learning_rate": 3.8972538114090605e-05, + "loss": 0.1352, + "step": 21310 + }, + { + "epoch": 0.3801055898405451, + "grad_norm": 0.2562722861766815, + "learning_rate": 3.897124737794228e-05, + "loss": 0.181, + "step": 21311 + }, + { + "epoch": 0.3801234259622588, + "grad_norm": 0.3270852565765381, + "learning_rate": 3.896995658763657e-05, + "loss": 0.1412, + "step": 21312 + }, + { + "epoch": 0.3801412620839725, + "grad_norm": 0.22758400440216064, + "learning_rate": 3.8968665743178484e-05, + "loss": 0.1631, + "step": 21313 + }, + { + "epoch": 0.38015909820568616, + "grad_norm": 0.23497611284255981, + "learning_rate": 3.8967374844573026e-05, + "loss": 0.2003, + "step": 21314 + }, + { + "epoch": 0.38017693432739985, + "grad_norm": 0.3024590313434601, + "learning_rate": 3.89660838918252e-05, + "loss": 0.1772, + "step": 21315 + }, + { + "epoch": 0.38019477044911354, + "grad_norm": 0.29801487922668457, + "learning_rate": 3.8964792884940004e-05, + "loss": 0.1364, + "step": 21316 + }, + { + "epoch": 0.3802126065708272, + "grad_norm": 0.3958665430545807, + "learning_rate": 3.8963501823922456e-05, + "loss": 0.2566, + "step": 21317 + }, + { + "epoch": 0.3802304426925409, + "grad_norm": 0.24767561256885529, + "learning_rate": 3.896221070877754e-05, + "loss": 0.1649, + "step": 21318 + }, + { + "epoch": 0.3802482788142546, + "grad_norm": 0.2352449893951416, + "learning_rate": 3.8960919539510284e-05, + "loss": 0.1362, + "step": 21319 + }, + { + "epoch": 0.38026611493596835, + "grad_norm": 0.3384075164794922, + "learning_rate": 3.8959628316125675e-05, + "loss": 0.1882, + "step": 21320 + }, + { + "epoch": 0.38028395105768203, + "grad_norm": 0.43732476234436035, + "learning_rate": 3.895833703862873e-05, + "loss": 0.1852, + "step": 21321 + }, + { + "epoch": 0.3803017871793957, + "grad_norm": 0.2435319423675537, + "learning_rate": 3.895704570702444e-05, + "loss": 0.1486, + "step": 21322 + }, + { + "epoch": 0.3803196233011094, + "grad_norm": 0.23220938444137573, + "learning_rate": 3.8955754321317833e-05, + "loss": 0.1757, + "step": 21323 + }, + { + "epoch": 0.3803374594228231, + "grad_norm": 0.3116278350353241, + "learning_rate": 3.89544628815139e-05, + "loss": 0.1468, + "step": 21324 + }, + { + "epoch": 0.3803552955445368, + "grad_norm": 0.2871105670928955, + "learning_rate": 3.8953171387617644e-05, + "loss": 0.1331, + "step": 21325 + }, + { + "epoch": 0.3803731316662505, + "grad_norm": 0.3474808931350708, + "learning_rate": 3.895187983963408e-05, + "loss": 0.1122, + "step": 21326 + }, + { + "epoch": 0.38039096778796416, + "grad_norm": 0.2743847370147705, + "learning_rate": 3.895058823756821e-05, + "loss": 0.134, + "step": 21327 + }, + { + "epoch": 0.3804088039096779, + "grad_norm": 0.23974527418613434, + "learning_rate": 3.8949296581425044e-05, + "loss": 0.1466, + "step": 21328 + }, + { + "epoch": 0.3804266400313916, + "grad_norm": 0.2747625708580017, + "learning_rate": 3.8948004871209576e-05, + "loss": 0.2196, + "step": 21329 + }, + { + "epoch": 0.3804444761531053, + "grad_norm": 0.1899239718914032, + "learning_rate": 3.894671310692684e-05, + "loss": 0.0982, + "step": 21330 + }, + { + "epoch": 0.38046231227481897, + "grad_norm": 0.3362436294555664, + "learning_rate": 3.8945421288581807e-05, + "loss": 0.1282, + "step": 21331 + }, + { + "epoch": 0.38048014839653266, + "grad_norm": 0.3614468276500702, + "learning_rate": 3.894412941617952e-05, + "loss": 0.176, + "step": 21332 + }, + { + "epoch": 0.38049798451824635, + "grad_norm": 0.2632768452167511, + "learning_rate": 3.894283748972496e-05, + "loss": 0.1586, + "step": 21333 + }, + { + "epoch": 0.38051582063996003, + "grad_norm": 0.26188868284225464, + "learning_rate": 3.894154550922315e-05, + "loss": 0.1101, + "step": 21334 + }, + { + "epoch": 0.3805336567616737, + "grad_norm": 0.30740755796432495, + "learning_rate": 3.89402534746791e-05, + "loss": 0.2337, + "step": 21335 + }, + { + "epoch": 0.3805514928833874, + "grad_norm": 0.3170974552631378, + "learning_rate": 3.893896138609782e-05, + "loss": 0.1768, + "step": 21336 + }, + { + "epoch": 0.38056932900510115, + "grad_norm": 0.3016303777694702, + "learning_rate": 3.8937669243484296e-05, + "loss": 0.173, + "step": 21337 + }, + { + "epoch": 0.38058716512681484, + "grad_norm": 0.2653926908969879, + "learning_rate": 3.893637704684356e-05, + "loss": 0.1472, + "step": 21338 + }, + { + "epoch": 0.38060500124852853, + "grad_norm": 0.19086246192455292, + "learning_rate": 3.893508479618061e-05, + "loss": 0.1832, + "step": 21339 + }, + { + "epoch": 0.3806228373702422, + "grad_norm": 0.2814791798591614, + "learning_rate": 3.893379249150045e-05, + "loss": 0.1481, + "step": 21340 + }, + { + "epoch": 0.3806406734919559, + "grad_norm": 0.24880683422088623, + "learning_rate": 3.893250013280811e-05, + "loss": 0.1455, + "step": 21341 + }, + { + "epoch": 0.3806585096136696, + "grad_norm": 0.2275916486978531, + "learning_rate": 3.893120772010859e-05, + "loss": 0.1565, + "step": 21342 + }, + { + "epoch": 0.3806763457353833, + "grad_norm": 0.1871194988489151, + "learning_rate": 3.892991525340689e-05, + "loss": 0.1073, + "step": 21343 + }, + { + "epoch": 0.38069418185709697, + "grad_norm": 0.24523408710956573, + "learning_rate": 3.892862273270803e-05, + "loss": 0.1133, + "step": 21344 + }, + { + "epoch": 0.3807120179788107, + "grad_norm": 0.1998000293970108, + "learning_rate": 3.8927330158017016e-05, + "loss": 0.1473, + "step": 21345 + }, + { + "epoch": 0.3807298541005244, + "grad_norm": 0.23183801770210266, + "learning_rate": 3.8926037529338855e-05, + "loss": 0.1417, + "step": 21346 + }, + { + "epoch": 0.3807476902222381, + "grad_norm": 0.3325054347515106, + "learning_rate": 3.8924744846678566e-05, + "loss": 0.1371, + "step": 21347 + }, + { + "epoch": 0.3807655263439518, + "grad_norm": 0.23030337691307068, + "learning_rate": 3.892345211004116e-05, + "loss": 0.13, + "step": 21348 + }, + { + "epoch": 0.38078336246566546, + "grad_norm": 0.28740453720092773, + "learning_rate": 3.892215931943164e-05, + "loss": 0.1826, + "step": 21349 + }, + { + "epoch": 0.38080119858737915, + "grad_norm": 0.2430138736963272, + "learning_rate": 3.892086647485503e-05, + "loss": 0.1241, + "step": 21350 + }, + { + "epoch": 0.38081903470909284, + "grad_norm": 0.22843456268310547, + "learning_rate": 3.8919573576316323e-05, + "loss": 0.1501, + "step": 21351 + }, + { + "epoch": 0.38083687083080653, + "grad_norm": 0.3002528250217438, + "learning_rate": 3.891828062382055e-05, + "loss": 0.2014, + "step": 21352 + }, + { + "epoch": 0.3808547069525202, + "grad_norm": 0.2832220494747162, + "learning_rate": 3.891698761737271e-05, + "loss": 0.2018, + "step": 21353 + }, + { + "epoch": 0.38087254307423396, + "grad_norm": 0.22214335203170776, + "learning_rate": 3.8915694556977825e-05, + "loss": 0.1408, + "step": 21354 + }, + { + "epoch": 0.38089037919594765, + "grad_norm": 0.21637408435344696, + "learning_rate": 3.891440144264089e-05, + "loss": 0.1366, + "step": 21355 + }, + { + "epoch": 0.38090821531766134, + "grad_norm": 0.2187330573797226, + "learning_rate": 3.8913108274366935e-05, + "loss": 0.1432, + "step": 21356 + }, + { + "epoch": 0.380926051439375, + "grad_norm": 0.42244794964790344, + "learning_rate": 3.891181505216096e-05, + "loss": 0.1338, + "step": 21357 + }, + { + "epoch": 0.3809438875610887, + "grad_norm": 0.22175978124141693, + "learning_rate": 3.8910521776027995e-05, + "loss": 0.1336, + "step": 21358 + }, + { + "epoch": 0.3809617236828024, + "grad_norm": 0.3732893466949463, + "learning_rate": 3.8909228445973045e-05, + "loss": 0.1501, + "step": 21359 + }, + { + "epoch": 0.3809795598045161, + "grad_norm": 0.24530957639217377, + "learning_rate": 3.8907935062001114e-05, + "loss": 0.15, + "step": 21360 + }, + { + "epoch": 0.3809973959262298, + "grad_norm": 0.26988792419433594, + "learning_rate": 3.890664162411722e-05, + "loss": 0.1167, + "step": 21361 + }, + { + "epoch": 0.3810152320479435, + "grad_norm": 0.2606840431690216, + "learning_rate": 3.8905348132326394e-05, + "loss": 0.1496, + "step": 21362 + }, + { + "epoch": 0.3810330681696572, + "grad_norm": 0.3573759198188782, + "learning_rate": 3.8904054586633627e-05, + "loss": 0.1985, + "step": 21363 + }, + { + "epoch": 0.3810509042913709, + "grad_norm": 0.25282537937164307, + "learning_rate": 3.890276098704394e-05, + "loss": 0.1786, + "step": 21364 + }, + { + "epoch": 0.3810687404130846, + "grad_norm": 0.24276134371757507, + "learning_rate": 3.890146733356235e-05, + "loss": 0.1014, + "step": 21365 + }, + { + "epoch": 0.38108657653479827, + "grad_norm": 0.2656461298465729, + "learning_rate": 3.890017362619387e-05, + "loss": 0.167, + "step": 21366 + }, + { + "epoch": 0.38110441265651196, + "grad_norm": 0.2616214454174042, + "learning_rate": 3.8898879864943524e-05, + "loss": 0.1631, + "step": 21367 + }, + { + "epoch": 0.38112224877822565, + "grad_norm": 0.253395140171051, + "learning_rate": 3.889758604981631e-05, + "loss": 0.1479, + "step": 21368 + }, + { + "epoch": 0.38114008489993934, + "grad_norm": 0.26738929748535156, + "learning_rate": 3.889629218081726e-05, + "loss": 0.1797, + "step": 21369 + }, + { + "epoch": 0.3811579210216531, + "grad_norm": 0.2278267741203308, + "learning_rate": 3.8894998257951376e-05, + "loss": 0.1458, + "step": 21370 + }, + { + "epoch": 0.38117575714336677, + "grad_norm": 0.2141774445772171, + "learning_rate": 3.889370428122369e-05, + "loss": 0.1265, + "step": 21371 + }, + { + "epoch": 0.38119359326508045, + "grad_norm": 0.23725609481334686, + "learning_rate": 3.889241025063919e-05, + "loss": 0.1922, + "step": 21372 + }, + { + "epoch": 0.38121142938679414, + "grad_norm": 0.2640169858932495, + "learning_rate": 3.889111616620292e-05, + "loss": 0.1573, + "step": 21373 + }, + { + "epoch": 0.38122926550850783, + "grad_norm": 0.29436805844306946, + "learning_rate": 3.888982202791989e-05, + "loss": 0.1488, + "step": 21374 + }, + { + "epoch": 0.3812471016302215, + "grad_norm": 0.35168540477752686, + "learning_rate": 3.888852783579511e-05, + "loss": 0.1973, + "step": 21375 + }, + { + "epoch": 0.3812649377519352, + "grad_norm": 0.2316911369562149, + "learning_rate": 3.8887233589833595e-05, + "loss": 0.1076, + "step": 21376 + }, + { + "epoch": 0.3812827738736489, + "grad_norm": 0.2865270972251892, + "learning_rate": 3.8885939290040364e-05, + "loss": 0.1776, + "step": 21377 + }, + { + "epoch": 0.3813006099953626, + "grad_norm": 0.22280453145503998, + "learning_rate": 3.888464493642045e-05, + "loss": 0.1348, + "step": 21378 + }, + { + "epoch": 0.3813184461170763, + "grad_norm": 0.3431093990802765, + "learning_rate": 3.8883350528978836e-05, + "loss": 0.2177, + "step": 21379 + }, + { + "epoch": 0.38133628223879, + "grad_norm": 0.24846325814723969, + "learning_rate": 3.8882056067720573e-05, + "loss": 0.1498, + "step": 21380 + }, + { + "epoch": 0.3813541183605037, + "grad_norm": 0.29790768027305603, + "learning_rate": 3.888076155265066e-05, + "loss": 0.1658, + "step": 21381 + }, + { + "epoch": 0.3813719544822174, + "grad_norm": 0.3084394931793213, + "learning_rate": 3.8879466983774124e-05, + "loss": 0.1668, + "step": 21382 + }, + { + "epoch": 0.3813897906039311, + "grad_norm": 0.3174588978290558, + "learning_rate": 3.887817236109598e-05, + "loss": 0.1738, + "step": 21383 + }, + { + "epoch": 0.38140762672564477, + "grad_norm": 0.24115942418575287, + "learning_rate": 3.887687768462125e-05, + "loss": 0.1534, + "step": 21384 + }, + { + "epoch": 0.38142546284735845, + "grad_norm": 0.2526243031024933, + "learning_rate": 3.887558295435495e-05, + "loss": 0.127, + "step": 21385 + }, + { + "epoch": 0.38144329896907214, + "grad_norm": 0.3192991316318512, + "learning_rate": 3.8874288170302095e-05, + "loss": 0.1741, + "step": 21386 + }, + { + "epoch": 0.3814611350907859, + "grad_norm": 0.27901574969291687, + "learning_rate": 3.88729933324677e-05, + "loss": 0.1616, + "step": 21387 + }, + { + "epoch": 0.3814789712124996, + "grad_norm": 0.2452971637248993, + "learning_rate": 3.88716984408568e-05, + "loss": 0.1622, + "step": 21388 + }, + { + "epoch": 0.38149680733421326, + "grad_norm": 0.422721803188324, + "learning_rate": 3.8870403495474404e-05, + "loss": 0.2254, + "step": 21389 + }, + { + "epoch": 0.38151464345592695, + "grad_norm": 0.31347596645355225, + "learning_rate": 3.8869108496325534e-05, + "loss": 0.2171, + "step": 21390 + }, + { + "epoch": 0.38153247957764064, + "grad_norm": 0.23665069043636322, + "learning_rate": 3.886781344341521e-05, + "loss": 0.1526, + "step": 21391 + }, + { + "epoch": 0.3815503156993543, + "grad_norm": 0.3179226219654083, + "learning_rate": 3.8866518336748445e-05, + "loss": 0.1996, + "step": 21392 + }, + { + "epoch": 0.381568151821068, + "grad_norm": 0.2677600383758545, + "learning_rate": 3.8865223176330275e-05, + "loss": 0.161, + "step": 21393 + }, + { + "epoch": 0.3815859879427817, + "grad_norm": 0.253722220659256, + "learning_rate": 3.8863927962165704e-05, + "loss": 0.1984, + "step": 21394 + }, + { + "epoch": 0.3816038240644954, + "grad_norm": 0.2372373342514038, + "learning_rate": 3.886263269425976e-05, + "loss": 0.1393, + "step": 21395 + }, + { + "epoch": 0.38162166018620913, + "grad_norm": 0.24138881266117096, + "learning_rate": 3.8861337372617466e-05, + "loss": 0.181, + "step": 21396 + }, + { + "epoch": 0.3816394963079228, + "grad_norm": 0.24450714886188507, + "learning_rate": 3.886004199724385e-05, + "loss": 0.1359, + "step": 21397 + }, + { + "epoch": 0.3816573324296365, + "grad_norm": 0.20878930389881134, + "learning_rate": 3.8858746568143914e-05, + "loss": 0.1351, + "step": 21398 + }, + { + "epoch": 0.3816751685513502, + "grad_norm": 0.31570184230804443, + "learning_rate": 3.8857451085322684e-05, + "loss": 0.2223, + "step": 21399 + }, + { + "epoch": 0.3816930046730639, + "grad_norm": 0.3362729549407959, + "learning_rate": 3.885615554878519e-05, + "loss": 0.2215, + "step": 21400 + }, + { + "epoch": 0.3817108407947776, + "grad_norm": 0.23103202879428864, + "learning_rate": 3.885485995853646e-05, + "loss": 0.1711, + "step": 21401 + }, + { + "epoch": 0.38172867691649126, + "grad_norm": 0.20432880520820618, + "learning_rate": 3.88535643145815e-05, + "loss": 0.1724, + "step": 21402 + }, + { + "epoch": 0.38174651303820495, + "grad_norm": 0.23820751905441284, + "learning_rate": 3.885226861692534e-05, + "loss": 0.1335, + "step": 21403 + }, + { + "epoch": 0.3817643491599187, + "grad_norm": 0.2773251235485077, + "learning_rate": 3.885097286557301e-05, + "loss": 0.1557, + "step": 21404 + }, + { + "epoch": 0.3817821852816324, + "grad_norm": 0.28145670890808105, + "learning_rate": 3.884967706052952e-05, + "loss": 0.1891, + "step": 21405 + }, + { + "epoch": 0.38180002140334607, + "grad_norm": 0.2892676591873169, + "learning_rate": 3.88483812017999e-05, + "loss": 0.1466, + "step": 21406 + }, + { + "epoch": 0.38181785752505976, + "grad_norm": 0.20915253460407257, + "learning_rate": 3.884708528938916e-05, + "loss": 0.1048, + "step": 21407 + }, + { + "epoch": 0.38183569364677344, + "grad_norm": 0.27298206090927124, + "learning_rate": 3.884578932330235e-05, + "loss": 0.1788, + "step": 21408 + }, + { + "epoch": 0.38185352976848713, + "grad_norm": 0.21241484582424164, + "learning_rate": 3.884449330354447e-05, + "loss": 0.1282, + "step": 21409 + }, + { + "epoch": 0.3818713658902008, + "grad_norm": 0.23862679302692413, + "learning_rate": 3.8843197230120555e-05, + "loss": 0.1351, + "step": 21410 + }, + { + "epoch": 0.3818892020119145, + "grad_norm": 0.33557847142219543, + "learning_rate": 3.884190110303563e-05, + "loss": 0.1403, + "step": 21411 + }, + { + "epoch": 0.38190703813362825, + "grad_norm": 0.26551353931427, + "learning_rate": 3.884060492229471e-05, + "loss": 0.1894, + "step": 21412 + }, + { + "epoch": 0.38192487425534194, + "grad_norm": 0.39905598759651184, + "learning_rate": 3.883930868790282e-05, + "loss": 0.2243, + "step": 21413 + }, + { + "epoch": 0.3819427103770556, + "grad_norm": 0.3208553194999695, + "learning_rate": 3.8838012399865006e-05, + "loss": 0.1497, + "step": 21414 + }, + { + "epoch": 0.3819605464987693, + "grad_norm": 0.2847328782081604, + "learning_rate": 3.883671605818626e-05, + "loss": 0.1947, + "step": 21415 + }, + { + "epoch": 0.381978382620483, + "grad_norm": 0.3341546654701233, + "learning_rate": 3.883541966287163e-05, + "loss": 0.1793, + "step": 21416 + }, + { + "epoch": 0.3819962187421967, + "grad_norm": 0.2877810597419739, + "learning_rate": 3.883412321392614e-05, + "loss": 0.1468, + "step": 21417 + }, + { + "epoch": 0.3820140548639104, + "grad_norm": 0.26836660504341125, + "learning_rate": 3.88328267113548e-05, + "loss": 0.1618, + "step": 21418 + }, + { + "epoch": 0.38203189098562407, + "grad_norm": 0.287304550409317, + "learning_rate": 3.883153015516266e-05, + "loss": 0.1516, + "step": 21419 + }, + { + "epoch": 0.38204972710733776, + "grad_norm": 0.23285901546478271, + "learning_rate": 3.883023354535472e-05, + "loss": 0.1471, + "step": 21420 + }, + { + "epoch": 0.3820675632290515, + "grad_norm": 0.2514244019985199, + "learning_rate": 3.882893688193602e-05, + "loss": 0.1369, + "step": 21421 + }, + { + "epoch": 0.3820853993507652, + "grad_norm": 0.2865428328514099, + "learning_rate": 3.8827640164911586e-05, + "loss": 0.2377, + "step": 21422 + }, + { + "epoch": 0.3821032354724789, + "grad_norm": 0.2535489499568939, + "learning_rate": 3.882634339428643e-05, + "loss": 0.1507, + "step": 21423 + }, + { + "epoch": 0.38212107159419256, + "grad_norm": 0.26763004064559937, + "learning_rate": 3.882504657006561e-05, + "loss": 0.1703, + "step": 21424 + }, + { + "epoch": 0.38213890771590625, + "grad_norm": 0.3215639889240265, + "learning_rate": 3.882374969225413e-05, + "loss": 0.1775, + "step": 21425 + }, + { + "epoch": 0.38215674383761994, + "grad_norm": 0.23614788055419922, + "learning_rate": 3.882245276085702e-05, + "loss": 0.1262, + "step": 21426 + }, + { + "epoch": 0.3821745799593336, + "grad_norm": 0.2856839895248413, + "learning_rate": 3.882115577587931e-05, + "loss": 0.142, + "step": 21427 + }, + { + "epoch": 0.3821924160810473, + "grad_norm": 0.228141650557518, + "learning_rate": 3.881985873732603e-05, + "loss": 0.1919, + "step": 21428 + }, + { + "epoch": 0.38221025220276106, + "grad_norm": 0.198617085814476, + "learning_rate": 3.881856164520219e-05, + "loss": 0.123, + "step": 21429 + }, + { + "epoch": 0.38222808832447475, + "grad_norm": 0.3066501319408417, + "learning_rate": 3.8817264499512846e-05, + "loss": 0.1867, + "step": 21430 + }, + { + "epoch": 0.38224592444618843, + "grad_norm": 0.24149169027805328, + "learning_rate": 3.881596730026301e-05, + "loss": 0.1022, + "step": 21431 + }, + { + "epoch": 0.3822637605679021, + "grad_norm": 0.22939139604568481, + "learning_rate": 3.8814670047457715e-05, + "loss": 0.1746, + "step": 21432 + }, + { + "epoch": 0.3822815966896158, + "grad_norm": 0.26302871108055115, + "learning_rate": 3.881337274110197e-05, + "loss": 0.1833, + "step": 21433 + }, + { + "epoch": 0.3822994328113295, + "grad_norm": 0.2806277275085449, + "learning_rate": 3.881207538120084e-05, + "loss": 0.2147, + "step": 21434 + }, + { + "epoch": 0.3823172689330432, + "grad_norm": 0.24292179942131042, + "learning_rate": 3.881077796775933e-05, + "loss": 0.1512, + "step": 21435 + }, + { + "epoch": 0.3823351050547569, + "grad_norm": 0.4341593384742737, + "learning_rate": 3.8809480500782474e-05, + "loss": 0.1599, + "step": 21436 + }, + { + "epoch": 0.38235294117647056, + "grad_norm": 0.24315306544303894, + "learning_rate": 3.88081829802753e-05, + "loss": 0.0894, + "step": 21437 + }, + { + "epoch": 0.3823707772981843, + "grad_norm": 0.1908676028251648, + "learning_rate": 3.8806885406242844e-05, + "loss": 0.1528, + "step": 21438 + }, + { + "epoch": 0.382388613419898, + "grad_norm": 0.30401498079299927, + "learning_rate": 3.880558777869013e-05, + "loss": 0.1994, + "step": 21439 + }, + { + "epoch": 0.3824064495416117, + "grad_norm": 0.2861475646495819, + "learning_rate": 3.880429009762219e-05, + "loss": 0.1818, + "step": 21440 + }, + { + "epoch": 0.38242428566332537, + "grad_norm": 0.25684410333633423, + "learning_rate": 3.880299236304405e-05, + "loss": 0.0947, + "step": 21441 + }, + { + "epoch": 0.38244212178503906, + "grad_norm": 0.33284851908683777, + "learning_rate": 3.880169457496075e-05, + "loss": 0.1738, + "step": 21442 + }, + { + "epoch": 0.38245995790675275, + "grad_norm": 0.3143170177936554, + "learning_rate": 3.880039673337731e-05, + "loss": 0.2008, + "step": 21443 + }, + { + "epoch": 0.38247779402846643, + "grad_norm": 0.29654011130332947, + "learning_rate": 3.879909883829877e-05, + "loss": 0.1546, + "step": 21444 + }, + { + "epoch": 0.3824956301501801, + "grad_norm": 0.1805470585823059, + "learning_rate": 3.879780088973016e-05, + "loss": 0.14, + "step": 21445 + }, + { + "epoch": 0.38251346627189386, + "grad_norm": 0.2544291317462921, + "learning_rate": 3.87965028876765e-05, + "loss": 0.1599, + "step": 21446 + }, + { + "epoch": 0.38253130239360755, + "grad_norm": 0.24353724718093872, + "learning_rate": 3.879520483214283e-05, + "loss": 0.1858, + "step": 21447 + }, + { + "epoch": 0.38254913851532124, + "grad_norm": 0.23016461730003357, + "learning_rate": 3.879390672313418e-05, + "loss": 0.1382, + "step": 21448 + }, + { + "epoch": 0.38256697463703493, + "grad_norm": 0.26850953698158264, + "learning_rate": 3.8792608560655594e-05, + "loss": 0.1737, + "step": 21449 + }, + { + "epoch": 0.3825848107587486, + "grad_norm": 0.3238796889781952, + "learning_rate": 3.879131034471208e-05, + "loss": 0.1482, + "step": 21450 + }, + { + "epoch": 0.3826026468804623, + "grad_norm": 0.3209700584411621, + "learning_rate": 3.879001207530869e-05, + "loss": 0.1852, + "step": 21451 + }, + { + "epoch": 0.382620483002176, + "grad_norm": 0.25144535303115845, + "learning_rate": 3.878871375245045e-05, + "loss": 0.1725, + "step": 21452 + }, + { + "epoch": 0.3826383191238897, + "grad_norm": 0.21510393917560577, + "learning_rate": 3.87874153761424e-05, + "loss": 0.1507, + "step": 21453 + }, + { + "epoch": 0.38265615524560337, + "grad_norm": 0.258568674325943, + "learning_rate": 3.878611694638955e-05, + "loss": 0.1819, + "step": 21454 + }, + { + "epoch": 0.3826739913673171, + "grad_norm": 0.20808269083499908, + "learning_rate": 3.8784818463196956e-05, + "loss": 0.1074, + "step": 21455 + }, + { + "epoch": 0.3826918274890308, + "grad_norm": 0.2588508427143097, + "learning_rate": 3.878351992656966e-05, + "loss": 0.1693, + "step": 21456 + }, + { + "epoch": 0.3827096636107445, + "grad_norm": 0.22035464644432068, + "learning_rate": 3.878222133651266e-05, + "loss": 0.1918, + "step": 21457 + }, + { + "epoch": 0.3827274997324582, + "grad_norm": 0.2975381016731262, + "learning_rate": 3.878092269303102e-05, + "loss": 0.1943, + "step": 21458 + }, + { + "epoch": 0.38274533585417186, + "grad_norm": 0.26722148060798645, + "learning_rate": 3.8779623996129753e-05, + "loss": 0.1313, + "step": 21459 + }, + { + "epoch": 0.38276317197588555, + "grad_norm": 0.2953987121582031, + "learning_rate": 3.877832524581392e-05, + "loss": 0.1735, + "step": 21460 + }, + { + "epoch": 0.38278100809759924, + "grad_norm": 0.3316297233104706, + "learning_rate": 3.877702644208853e-05, + "loss": 0.2455, + "step": 21461 + }, + { + "epoch": 0.38279884421931293, + "grad_norm": 0.32206907868385315, + "learning_rate": 3.8775727584958625e-05, + "loss": 0.1248, + "step": 21462 + }, + { + "epoch": 0.38281668034102667, + "grad_norm": 0.25685063004493713, + "learning_rate": 3.8774428674429245e-05, + "loss": 0.1143, + "step": 21463 + }, + { + "epoch": 0.38283451646274036, + "grad_norm": 0.2138339728116989, + "learning_rate": 3.877312971050542e-05, + "loss": 0.1517, + "step": 21464 + }, + { + "epoch": 0.38285235258445405, + "grad_norm": 0.2990202009677887, + "learning_rate": 3.877183069319219e-05, + "loss": 0.1569, + "step": 21465 + }, + { + "epoch": 0.38287018870616774, + "grad_norm": 0.23563086986541748, + "learning_rate": 3.8770531622494585e-05, + "loss": 0.1687, + "step": 21466 + }, + { + "epoch": 0.3828880248278814, + "grad_norm": 0.2641627788543701, + "learning_rate": 3.8769232498417655e-05, + "loss": 0.1356, + "step": 21467 + }, + { + "epoch": 0.3829058609495951, + "grad_norm": 0.30582690238952637, + "learning_rate": 3.876793332096641e-05, + "loss": 0.1541, + "step": 21468 + }, + { + "epoch": 0.3829236970713088, + "grad_norm": 0.3575577437877655, + "learning_rate": 3.8766634090145904e-05, + "loss": 0.1483, + "step": 21469 + }, + { + "epoch": 0.3829415331930225, + "grad_norm": 0.44699031114578247, + "learning_rate": 3.876533480596117e-05, + "loss": 0.1528, + "step": 21470 + }, + { + "epoch": 0.38295936931473623, + "grad_norm": 0.36194881796836853, + "learning_rate": 3.876403546841725e-05, + "loss": 0.1608, + "step": 21471 + }, + { + "epoch": 0.3829772054364499, + "grad_norm": 0.24675357341766357, + "learning_rate": 3.876273607751916e-05, + "loss": 0.1608, + "step": 21472 + }, + { + "epoch": 0.3829950415581636, + "grad_norm": 0.3369715213775635, + "learning_rate": 3.876143663327196e-05, + "loss": 0.217, + "step": 21473 + }, + { + "epoch": 0.3830128776798773, + "grad_norm": 0.355873167514801, + "learning_rate": 3.876013713568068e-05, + "loss": 0.2866, + "step": 21474 + }, + { + "epoch": 0.383030713801591, + "grad_norm": 0.21274901926517487, + "learning_rate": 3.8758837584750354e-05, + "loss": 0.1613, + "step": 21475 + }, + { + "epoch": 0.38304854992330467, + "grad_norm": 0.19465862214565277, + "learning_rate": 3.875753798048603e-05, + "loss": 0.1117, + "step": 21476 + }, + { + "epoch": 0.38306638604501836, + "grad_norm": 0.1701757162809372, + "learning_rate": 3.8756238322892724e-05, + "loss": 0.1219, + "step": 21477 + }, + { + "epoch": 0.38308422216673205, + "grad_norm": 0.3683616816997528, + "learning_rate": 3.875493861197549e-05, + "loss": 0.1381, + "step": 21478 + }, + { + "epoch": 0.38310205828844573, + "grad_norm": 0.2715027332305908, + "learning_rate": 3.875363884773936e-05, + "loss": 0.1118, + "step": 21479 + }, + { + "epoch": 0.3831198944101595, + "grad_norm": 0.20807315409183502, + "learning_rate": 3.8752339030189384e-05, + "loss": 0.1491, + "step": 21480 + }, + { + "epoch": 0.38313773053187317, + "grad_norm": 0.23150336742401123, + "learning_rate": 3.875103915933059e-05, + "loss": 0.1436, + "step": 21481 + }, + { + "epoch": 0.38315556665358685, + "grad_norm": 0.2969799041748047, + "learning_rate": 3.874973923516802e-05, + "loss": 0.2224, + "step": 21482 + }, + { + "epoch": 0.38317340277530054, + "grad_norm": 0.3532843589782715, + "learning_rate": 3.87484392577067e-05, + "loss": 0.168, + "step": 21483 + }, + { + "epoch": 0.38319123889701423, + "grad_norm": 0.29249686002731323, + "learning_rate": 3.87471392269517e-05, + "loss": 0.1467, + "step": 21484 + }, + { + "epoch": 0.3832090750187279, + "grad_norm": 0.2062867432832718, + "learning_rate": 3.874583914290802e-05, + "loss": 0.1426, + "step": 21485 + }, + { + "epoch": 0.3832269111404416, + "grad_norm": 0.23950551450252533, + "learning_rate": 3.8744539005580736e-05, + "loss": 0.1638, + "step": 21486 + }, + { + "epoch": 0.3832447472621553, + "grad_norm": 0.292850524187088, + "learning_rate": 3.874323881497487e-05, + "loss": 0.1975, + "step": 21487 + }, + { + "epoch": 0.38326258338386904, + "grad_norm": 0.26524338126182556, + "learning_rate": 3.874193857109545e-05, + "loss": 0.1329, + "step": 21488 + }, + { + "epoch": 0.3832804195055827, + "grad_norm": 0.2546485364437103, + "learning_rate": 3.8740638273947535e-05, + "loss": 0.1324, + "step": 21489 + }, + { + "epoch": 0.3832982556272964, + "grad_norm": 0.28564849495887756, + "learning_rate": 3.873933792353617e-05, + "loss": 0.1533, + "step": 21490 + }, + { + "epoch": 0.3833160917490101, + "grad_norm": 0.3343300521373749, + "learning_rate": 3.873803751986638e-05, + "loss": 0.1144, + "step": 21491 + }, + { + "epoch": 0.3833339278707238, + "grad_norm": 0.19869981706142426, + "learning_rate": 3.873673706294321e-05, + "loss": 0.1297, + "step": 21492 + }, + { + "epoch": 0.3833517639924375, + "grad_norm": 0.24130672216415405, + "learning_rate": 3.87354365527717e-05, + "loss": 0.1467, + "step": 21493 + }, + { + "epoch": 0.38336960011415117, + "grad_norm": 0.28504040837287903, + "learning_rate": 3.87341359893569e-05, + "loss": 0.1615, + "step": 21494 + }, + { + "epoch": 0.38338743623586485, + "grad_norm": 0.2641395926475525, + "learning_rate": 3.873283537270385e-05, + "loss": 0.1396, + "step": 21495 + }, + { + "epoch": 0.38340527235757854, + "grad_norm": 0.2860143482685089, + "learning_rate": 3.873153470281757e-05, + "loss": 0.12, + "step": 21496 + }, + { + "epoch": 0.3834231084792923, + "grad_norm": 0.2566232681274414, + "learning_rate": 3.8730233979703136e-05, + "loss": 0.1532, + "step": 21497 + }, + { + "epoch": 0.383440944601006, + "grad_norm": 0.23894478380680084, + "learning_rate": 3.872893320336556e-05, + "loss": 0.1692, + "step": 21498 + }, + { + "epoch": 0.38345878072271966, + "grad_norm": 0.2790544033050537, + "learning_rate": 3.872763237380991e-05, + "loss": 0.156, + "step": 21499 + }, + { + "epoch": 0.38347661684443335, + "grad_norm": 0.31465715169906616, + "learning_rate": 3.87263314910412e-05, + "loss": 0.1715, + "step": 21500 + }, + { + "epoch": 0.38349445296614704, + "grad_norm": 0.25573471188545227, + "learning_rate": 3.87250305550645e-05, + "loss": 0.149, + "step": 21501 + }, + { + "epoch": 0.3835122890878607, + "grad_norm": 0.38926461338996887, + "learning_rate": 3.872372956588484e-05, + "loss": 0.1401, + "step": 21502 + }, + { + "epoch": 0.3835301252095744, + "grad_norm": 0.23719745874404907, + "learning_rate": 3.872242852350726e-05, + "loss": 0.1439, + "step": 21503 + }, + { + "epoch": 0.3835479613312881, + "grad_norm": 0.2531653940677643, + "learning_rate": 3.872112742793681e-05, + "loss": 0.1583, + "step": 21504 + }, + { + "epoch": 0.38356579745300184, + "grad_norm": 0.22133518755435944, + "learning_rate": 3.871982627917853e-05, + "loss": 0.1297, + "step": 21505 + }, + { + "epoch": 0.38358363357471553, + "grad_norm": 0.2650386393070221, + "learning_rate": 3.8718525077237465e-05, + "loss": 0.1733, + "step": 21506 + }, + { + "epoch": 0.3836014696964292, + "grad_norm": 0.30449217557907104, + "learning_rate": 3.871722382211866e-05, + "loss": 0.11, + "step": 21507 + }, + { + "epoch": 0.3836193058181429, + "grad_norm": 0.20377209782600403, + "learning_rate": 3.871592251382716e-05, + "loss": 0.1272, + "step": 21508 + }, + { + "epoch": 0.3836371419398566, + "grad_norm": 0.2618069052696228, + "learning_rate": 3.8714621152367994e-05, + "loss": 0.1201, + "step": 21509 + }, + { + "epoch": 0.3836549780615703, + "grad_norm": 0.3051649034023285, + "learning_rate": 3.8713319737746235e-05, + "loss": 0.1256, + "step": 21510 + }, + { + "epoch": 0.383672814183284, + "grad_norm": 0.2645023465156555, + "learning_rate": 3.87120182699669e-05, + "loss": 0.1302, + "step": 21511 + }, + { + "epoch": 0.38369065030499766, + "grad_norm": 0.24106624722480774, + "learning_rate": 3.8710716749035056e-05, + "loss": 0.1158, + "step": 21512 + }, + { + "epoch": 0.38370848642671135, + "grad_norm": 0.368408739566803, + "learning_rate": 3.870941517495573e-05, + "loss": 0.1925, + "step": 21513 + }, + { + "epoch": 0.3837263225484251, + "grad_norm": 0.2175321727991104, + "learning_rate": 3.870811354773398e-05, + "loss": 0.1249, + "step": 21514 + }, + { + "epoch": 0.3837441586701388, + "grad_norm": 0.2704111337661743, + "learning_rate": 3.870681186737485e-05, + "loss": 0.177, + "step": 21515 + }, + { + "epoch": 0.38376199479185247, + "grad_norm": 0.27574092149734497, + "learning_rate": 3.870551013388338e-05, + "loss": 0.1898, + "step": 21516 + }, + { + "epoch": 0.38377983091356616, + "grad_norm": 0.29014134407043457, + "learning_rate": 3.870420834726462e-05, + "loss": 0.1653, + "step": 21517 + }, + { + "epoch": 0.38379766703527984, + "grad_norm": 0.34705883264541626, + "learning_rate": 3.870290650752362e-05, + "loss": 0.212, + "step": 21518 + }, + { + "epoch": 0.38381550315699353, + "grad_norm": 0.2782190442085266, + "learning_rate": 3.870160461466541e-05, + "loss": 0.1575, + "step": 21519 + }, + { + "epoch": 0.3838333392787072, + "grad_norm": 0.25739315152168274, + "learning_rate": 3.870030266869505e-05, + "loss": 0.1315, + "step": 21520 + }, + { + "epoch": 0.3838511754004209, + "grad_norm": 0.24090169370174408, + "learning_rate": 3.86990006696176e-05, + "loss": 0.1468, + "step": 21521 + }, + { + "epoch": 0.38386901152213465, + "grad_norm": 0.23150403797626495, + "learning_rate": 3.8697698617438075e-05, + "loss": 0.1095, + "step": 21522 + }, + { + "epoch": 0.38388684764384834, + "grad_norm": 0.36557114124298096, + "learning_rate": 3.869639651216155e-05, + "loss": 0.1641, + "step": 21523 + }, + { + "epoch": 0.383904683765562, + "grad_norm": 0.30151206254959106, + "learning_rate": 3.869509435379305e-05, + "loss": 0.1717, + "step": 21524 + }, + { + "epoch": 0.3839225198872757, + "grad_norm": 0.27847880125045776, + "learning_rate": 3.869379214233765e-05, + "loss": 0.1725, + "step": 21525 + }, + { + "epoch": 0.3839403560089894, + "grad_norm": 0.23101796209812164, + "learning_rate": 3.869248987780036e-05, + "loss": 0.1308, + "step": 21526 + }, + { + "epoch": 0.3839581921307031, + "grad_norm": 0.32623910903930664, + "learning_rate": 3.869118756018627e-05, + "loss": 0.1522, + "step": 21527 + }, + { + "epoch": 0.3839760282524168, + "grad_norm": 0.2518160045146942, + "learning_rate": 3.8689885189500396e-05, + "loss": 0.185, + "step": 21528 + }, + { + "epoch": 0.38399386437413047, + "grad_norm": 0.30759429931640625, + "learning_rate": 3.868858276574781e-05, + "loss": 0.1314, + "step": 21529 + }, + { + "epoch": 0.3840117004958442, + "grad_norm": 0.2791420519351959, + "learning_rate": 3.868728028893354e-05, + "loss": 0.183, + "step": 21530 + }, + { + "epoch": 0.3840295366175579, + "grad_norm": 0.29207777976989746, + "learning_rate": 3.868597775906265e-05, + "loss": 0.1437, + "step": 21531 + }, + { + "epoch": 0.3840473727392716, + "grad_norm": 0.25608715415000916, + "learning_rate": 3.868467517614018e-05, + "loss": 0.1787, + "step": 21532 + }, + { + "epoch": 0.3840652088609853, + "grad_norm": 0.2718445956707001, + "learning_rate": 3.868337254017118e-05, + "loss": 0.1427, + "step": 21533 + }, + { + "epoch": 0.38408304498269896, + "grad_norm": 0.2942187190055847, + "learning_rate": 3.868206985116071e-05, + "loss": 0.1639, + "step": 21534 + }, + { + "epoch": 0.38410088110441265, + "grad_norm": 0.2436235398054123, + "learning_rate": 3.86807671091138e-05, + "loss": 0.1575, + "step": 21535 + }, + { + "epoch": 0.38411871722612634, + "grad_norm": 0.29770779609680176, + "learning_rate": 3.867946431403552e-05, + "loss": 0.1282, + "step": 21536 + }, + { + "epoch": 0.38413655334784, + "grad_norm": 0.2541572153568268, + "learning_rate": 3.867816146593091e-05, + "loss": 0.1549, + "step": 21537 + }, + { + "epoch": 0.3841543894695537, + "grad_norm": 0.22383952140808105, + "learning_rate": 3.8676858564805026e-05, + "loss": 0.0999, + "step": 21538 + }, + { + "epoch": 0.38417222559126746, + "grad_norm": 0.23770824074745178, + "learning_rate": 3.8675555610662904e-05, + "loss": 0.1405, + "step": 21539 + }, + { + "epoch": 0.38419006171298115, + "grad_norm": 0.2964611053466797, + "learning_rate": 3.867425260350961e-05, + "loss": 0.1803, + "step": 21540 + }, + { + "epoch": 0.38420789783469483, + "grad_norm": 0.26499688625335693, + "learning_rate": 3.867294954335019e-05, + "loss": 0.1859, + "step": 21541 + }, + { + "epoch": 0.3842257339564085, + "grad_norm": 0.3101528584957123, + "learning_rate": 3.86716464301897e-05, + "loss": 0.2197, + "step": 21542 + }, + { + "epoch": 0.3842435700781222, + "grad_norm": 0.445478618144989, + "learning_rate": 3.867034326403318e-05, + "loss": 0.1536, + "step": 21543 + }, + { + "epoch": 0.3842614061998359, + "grad_norm": 0.2813752293586731, + "learning_rate": 3.8669040044885693e-05, + "loss": 0.1539, + "step": 21544 + }, + { + "epoch": 0.3842792423215496, + "grad_norm": 0.31148186326026917, + "learning_rate": 3.8667736772752285e-05, + "loss": 0.167, + "step": 21545 + }, + { + "epoch": 0.3842970784432633, + "grad_norm": 0.31324324011802673, + "learning_rate": 3.8666433447638e-05, + "loss": 0.1531, + "step": 21546 + }, + { + "epoch": 0.384314914564977, + "grad_norm": 0.27325940132141113, + "learning_rate": 3.866513006954791e-05, + "loss": 0.169, + "step": 21547 + }, + { + "epoch": 0.3843327506866907, + "grad_norm": 0.29829758405685425, + "learning_rate": 3.866382663848706e-05, + "loss": 0.2055, + "step": 21548 + }, + { + "epoch": 0.3843505868084044, + "grad_norm": 0.20920826494693756, + "learning_rate": 3.8662523154460484e-05, + "loss": 0.1349, + "step": 21549 + }, + { + "epoch": 0.3843684229301181, + "grad_norm": 0.24438165128231049, + "learning_rate": 3.8661219617473256e-05, + "loss": 0.1426, + "step": 21550 + }, + { + "epoch": 0.38438625905183177, + "grad_norm": 0.24366679787635803, + "learning_rate": 3.865991602753042e-05, + "loss": 0.1926, + "step": 21551 + }, + { + "epoch": 0.38440409517354546, + "grad_norm": 0.3072656989097595, + "learning_rate": 3.8658612384637034e-05, + "loss": 0.2118, + "step": 21552 + }, + { + "epoch": 0.38442193129525914, + "grad_norm": 0.22273799777030945, + "learning_rate": 3.865730868879815e-05, + "loss": 0.1756, + "step": 21553 + }, + { + "epoch": 0.38443976741697283, + "grad_norm": 0.2541927993297577, + "learning_rate": 3.8656004940018816e-05, + "loss": 0.1543, + "step": 21554 + }, + { + "epoch": 0.3844576035386865, + "grad_norm": 0.20942744612693787, + "learning_rate": 3.865470113830409e-05, + "loss": 0.1138, + "step": 21555 + }, + { + "epoch": 0.38447543966040026, + "grad_norm": 0.29884278774261475, + "learning_rate": 3.865339728365903e-05, + "loss": 0.1976, + "step": 21556 + }, + { + "epoch": 0.38449327578211395, + "grad_norm": 0.24784955382347107, + "learning_rate": 3.865209337608869e-05, + "loss": 0.1919, + "step": 21557 + }, + { + "epoch": 0.38451111190382764, + "grad_norm": 0.26318109035491943, + "learning_rate": 3.865078941559811e-05, + "loss": 0.186, + "step": 21558 + }, + { + "epoch": 0.38452894802554133, + "grad_norm": 0.309365451335907, + "learning_rate": 3.864948540219237e-05, + "loss": 0.2005, + "step": 21559 + }, + { + "epoch": 0.384546784147255, + "grad_norm": 0.21352458000183105, + "learning_rate": 3.86481813358765e-05, + "loss": 0.1083, + "step": 21560 + }, + { + "epoch": 0.3845646202689687, + "grad_norm": 0.2547975778579712, + "learning_rate": 3.8646877216655566e-05, + "loss": 0.1931, + "step": 21561 + }, + { + "epoch": 0.3845824563906824, + "grad_norm": 0.2607114911079407, + "learning_rate": 3.864557304453462e-05, + "loss": 0.1353, + "step": 21562 + }, + { + "epoch": 0.3846002925123961, + "grad_norm": 0.30372127890586853, + "learning_rate": 3.8644268819518726e-05, + "loss": 0.1907, + "step": 21563 + }, + { + "epoch": 0.3846181286341098, + "grad_norm": 0.261281818151474, + "learning_rate": 3.864296454161292e-05, + "loss": 0.1633, + "step": 21564 + }, + { + "epoch": 0.3846359647558235, + "grad_norm": 0.2646227180957794, + "learning_rate": 3.864166021082229e-05, + "loss": 0.1713, + "step": 21565 + }, + { + "epoch": 0.3846538008775372, + "grad_norm": 0.2874150276184082, + "learning_rate": 3.8640355827151865e-05, + "loss": 0.2352, + "step": 21566 + }, + { + "epoch": 0.3846716369992509, + "grad_norm": 0.30703097581863403, + "learning_rate": 3.863905139060671e-05, + "loss": 0.1595, + "step": 21567 + }, + { + "epoch": 0.3846894731209646, + "grad_norm": 0.31142300367355347, + "learning_rate": 3.8637746901191885e-05, + "loss": 0.1671, + "step": 21568 + }, + { + "epoch": 0.38470730924267826, + "grad_norm": 0.41892385482788086, + "learning_rate": 3.8636442358912434e-05, + "loss": 0.2343, + "step": 21569 + }, + { + "epoch": 0.38472514536439195, + "grad_norm": 0.2691972851753235, + "learning_rate": 3.863513776377343e-05, + "loss": 0.1467, + "step": 21570 + }, + { + "epoch": 0.38474298148610564, + "grad_norm": 0.22149133682250977, + "learning_rate": 3.863383311577992e-05, + "loss": 0.1427, + "step": 21571 + }, + { + "epoch": 0.3847608176078194, + "grad_norm": 0.35529500246047974, + "learning_rate": 3.863252841493696e-05, + "loss": 0.0988, + "step": 21572 + }, + { + "epoch": 0.38477865372953307, + "grad_norm": 0.2516630291938782, + "learning_rate": 3.863122366124961e-05, + "loss": 0.1523, + "step": 21573 + }, + { + "epoch": 0.38479648985124676, + "grad_norm": 0.3124215602874756, + "learning_rate": 3.862991885472294e-05, + "loss": 0.1271, + "step": 21574 + }, + { + "epoch": 0.38481432597296045, + "grad_norm": 0.33533337712287903, + "learning_rate": 3.8628613995361996e-05, + "loss": 0.1131, + "step": 21575 + }, + { + "epoch": 0.38483216209467414, + "grad_norm": 0.2423637956380844, + "learning_rate": 3.8627309083171825e-05, + "loss": 0.1941, + "step": 21576 + }, + { + "epoch": 0.3848499982163878, + "grad_norm": 0.24964472651481628, + "learning_rate": 3.862600411815751e-05, + "loss": 0.2078, + "step": 21577 + }, + { + "epoch": 0.3848678343381015, + "grad_norm": 0.3183000981807709, + "learning_rate": 3.862469910032409e-05, + "loss": 0.1426, + "step": 21578 + }, + { + "epoch": 0.3848856704598152, + "grad_norm": 0.2533881962299347, + "learning_rate": 3.862339402967663e-05, + "loss": 0.1065, + "step": 21579 + }, + { + "epoch": 0.3849035065815289, + "grad_norm": 0.3077002763748169, + "learning_rate": 3.8622088906220185e-05, + "loss": 0.1535, + "step": 21580 + }, + { + "epoch": 0.38492134270324263, + "grad_norm": 0.24204425513744354, + "learning_rate": 3.862078372995983e-05, + "loss": 0.132, + "step": 21581 + }, + { + "epoch": 0.3849391788249563, + "grad_norm": 0.3703136146068573, + "learning_rate": 3.861947850090061e-05, + "loss": 0.1863, + "step": 21582 + }, + { + "epoch": 0.38495701494667, + "grad_norm": 0.20650333166122437, + "learning_rate": 3.861817321904758e-05, + "loss": 0.1334, + "step": 21583 + }, + { + "epoch": 0.3849748510683837, + "grad_norm": 0.22031338512897491, + "learning_rate": 3.8616867884405805e-05, + "loss": 0.1385, + "step": 21584 + }, + { + "epoch": 0.3849926871900974, + "grad_norm": 0.20216822624206543, + "learning_rate": 3.861556249698036e-05, + "loss": 0.1254, + "step": 21585 + }, + { + "epoch": 0.38501052331181107, + "grad_norm": 0.25917112827301025, + "learning_rate": 3.861425705677629e-05, + "loss": 0.1412, + "step": 21586 + }, + { + "epoch": 0.38502835943352476, + "grad_norm": 0.21913515031337738, + "learning_rate": 3.861295156379865e-05, + "loss": 0.1074, + "step": 21587 + }, + { + "epoch": 0.38504619555523845, + "grad_norm": 0.28667446970939636, + "learning_rate": 3.861164601805251e-05, + "loss": 0.1526, + "step": 21588 + }, + { + "epoch": 0.3850640316769522, + "grad_norm": 0.31146547198295593, + "learning_rate": 3.861034041954292e-05, + "loss": 0.1507, + "step": 21589 + }, + { + "epoch": 0.3850818677986659, + "grad_norm": 0.311732679605484, + "learning_rate": 3.8609034768274965e-05, + "loss": 0.1117, + "step": 21590 + }, + { + "epoch": 0.38509970392037957, + "grad_norm": 0.3943082094192505, + "learning_rate": 3.860772906425368e-05, + "loss": 0.2095, + "step": 21591 + }, + { + "epoch": 0.38511754004209325, + "grad_norm": 0.3782370388507843, + "learning_rate": 3.8606423307484154e-05, + "loss": 0.1876, + "step": 21592 + }, + { + "epoch": 0.38513537616380694, + "grad_norm": 0.29106682538986206, + "learning_rate": 3.860511749797141e-05, + "loss": 0.1268, + "step": 21593 + }, + { + "epoch": 0.38515321228552063, + "grad_norm": 0.2340225726366043, + "learning_rate": 3.860381163572055e-05, + "loss": 0.1491, + "step": 21594 + }, + { + "epoch": 0.3851710484072343, + "grad_norm": 0.32674309611320496, + "learning_rate": 3.86025057207366e-05, + "loss": 0.1392, + "step": 21595 + }, + { + "epoch": 0.385188884528948, + "grad_norm": 0.32957813143730164, + "learning_rate": 3.860119975302465e-05, + "loss": 0.186, + "step": 21596 + }, + { + "epoch": 0.3852067206506617, + "grad_norm": 0.2759378254413605, + "learning_rate": 3.8599893732589754e-05, + "loss": 0.1881, + "step": 21597 + }, + { + "epoch": 0.38522455677237544, + "grad_norm": 0.28699544072151184, + "learning_rate": 3.859858765943697e-05, + "loss": 0.1252, + "step": 21598 + }, + { + "epoch": 0.3852423928940891, + "grad_norm": 0.2482403814792633, + "learning_rate": 3.859728153357136e-05, + "loss": 0.1755, + "step": 21599 + }, + { + "epoch": 0.3852602290158028, + "grad_norm": 0.2203858345746994, + "learning_rate": 3.859597535499799e-05, + "loss": 0.172, + "step": 21600 + }, + { + "epoch": 0.3852780651375165, + "grad_norm": 0.22645880281925201, + "learning_rate": 3.8594669123721935e-05, + "loss": 0.0836, + "step": 21601 + }, + { + "epoch": 0.3852959012592302, + "grad_norm": 0.24230335652828217, + "learning_rate": 3.859336283974824e-05, + "loss": 0.1522, + "step": 21602 + }, + { + "epoch": 0.3853137373809439, + "grad_norm": 0.4262154698371887, + "learning_rate": 3.859205650308198e-05, + "loss": 0.2049, + "step": 21603 + }, + { + "epoch": 0.38533157350265757, + "grad_norm": 0.34571588039398193, + "learning_rate": 3.85907501137282e-05, + "loss": 0.1275, + "step": 21604 + }, + { + "epoch": 0.38534940962437125, + "grad_norm": 0.27967000007629395, + "learning_rate": 3.8589443671691995e-05, + "loss": 0.1223, + "step": 21605 + }, + { + "epoch": 0.385367245746085, + "grad_norm": 0.25240615010261536, + "learning_rate": 3.85881371769784e-05, + "loss": 0.1564, + "step": 21606 + }, + { + "epoch": 0.3853850818677987, + "grad_norm": 0.2631133496761322, + "learning_rate": 3.85868306295925e-05, + "loss": 0.1977, + "step": 21607 + }, + { + "epoch": 0.3854029179895124, + "grad_norm": 0.3707141876220703, + "learning_rate": 3.858552402953934e-05, + "loss": 0.1582, + "step": 21608 + }, + { + "epoch": 0.38542075411122606, + "grad_norm": 0.24326975643634796, + "learning_rate": 3.858421737682401e-05, + "loss": 0.1087, + "step": 21609 + }, + { + "epoch": 0.38543859023293975, + "grad_norm": 0.33008456230163574, + "learning_rate": 3.8582910671451556e-05, + "loss": 0.104, + "step": 21610 + }, + { + "epoch": 0.38545642635465344, + "grad_norm": 0.2847519814968109, + "learning_rate": 3.8581603913427054e-05, + "loss": 0.1589, + "step": 21611 + }, + { + "epoch": 0.3854742624763671, + "grad_norm": 0.36371302604675293, + "learning_rate": 3.858029710275556e-05, + "loss": 0.1704, + "step": 21612 + }, + { + "epoch": 0.3854920985980808, + "grad_norm": 0.40692663192749023, + "learning_rate": 3.857899023944215e-05, + "loss": 0.2057, + "step": 21613 + }, + { + "epoch": 0.3855099347197945, + "grad_norm": 0.34015026688575745, + "learning_rate": 3.857768332349187e-05, + "loss": 0.1696, + "step": 21614 + }, + { + "epoch": 0.38552777084150824, + "grad_norm": 0.24905699491500854, + "learning_rate": 3.857637635490981e-05, + "loss": 0.1762, + "step": 21615 + }, + { + "epoch": 0.38554560696322193, + "grad_norm": 0.23990298807621002, + "learning_rate": 3.857506933370102e-05, + "loss": 0.1239, + "step": 21616 + }, + { + "epoch": 0.3855634430849356, + "grad_norm": 0.27261462807655334, + "learning_rate": 3.857376225987058e-05, + "loss": 0.1556, + "step": 21617 + }, + { + "epoch": 0.3855812792066493, + "grad_norm": 0.2500361204147339, + "learning_rate": 3.8572455133423546e-05, + "loss": 0.155, + "step": 21618 + }, + { + "epoch": 0.385599115328363, + "grad_norm": 0.38261857628822327, + "learning_rate": 3.857114795436498e-05, + "loss": 0.2161, + "step": 21619 + }, + { + "epoch": 0.3856169514500767, + "grad_norm": 0.24765770137310028, + "learning_rate": 3.856984072269997e-05, + "loss": 0.1387, + "step": 21620 + }, + { + "epoch": 0.38563478757179037, + "grad_norm": 0.29271161556243896, + "learning_rate": 3.856853343843356e-05, + "loss": 0.1553, + "step": 21621 + }, + { + "epoch": 0.38565262369350406, + "grad_norm": 0.2441394329071045, + "learning_rate": 3.856722610157084e-05, + "loss": 0.1897, + "step": 21622 + }, + { + "epoch": 0.3856704598152178, + "grad_norm": 0.23362718522548676, + "learning_rate": 3.856591871211686e-05, + "loss": 0.0922, + "step": 21623 + }, + { + "epoch": 0.3856882959369315, + "grad_norm": 0.40063345432281494, + "learning_rate": 3.856461127007669e-05, + "loss": 0.1127, + "step": 21624 + }, + { + "epoch": 0.3857061320586452, + "grad_norm": 0.36613729596138, + "learning_rate": 3.856330377545541e-05, + "loss": 0.1274, + "step": 21625 + }, + { + "epoch": 0.38572396818035887, + "grad_norm": 0.2747527062892914, + "learning_rate": 3.8561996228258076e-05, + "loss": 0.1613, + "step": 21626 + }, + { + "epoch": 0.38574180430207256, + "grad_norm": 0.20412592589855194, + "learning_rate": 3.856068862848976e-05, + "loss": 0.1519, + "step": 21627 + }, + { + "epoch": 0.38575964042378624, + "grad_norm": 0.23202116787433624, + "learning_rate": 3.8559380976155525e-05, + "loss": 0.1891, + "step": 21628 + }, + { + "epoch": 0.38577747654549993, + "grad_norm": 0.22255411744117737, + "learning_rate": 3.855807327126045e-05, + "loss": 0.1417, + "step": 21629 + }, + { + "epoch": 0.3857953126672136, + "grad_norm": 0.3256995379924774, + "learning_rate": 3.8556765513809604e-05, + "loss": 0.1934, + "step": 21630 + }, + { + "epoch": 0.38581314878892736, + "grad_norm": 0.2698051929473877, + "learning_rate": 3.8555457703808054e-05, + "loss": 0.1605, + "step": 21631 + }, + { + "epoch": 0.38583098491064105, + "grad_norm": 0.21564166247844696, + "learning_rate": 3.8554149841260856e-05, + "loss": 0.1331, + "step": 21632 + }, + { + "epoch": 0.38584882103235474, + "grad_norm": 0.28024598956108093, + "learning_rate": 3.8552841926173106e-05, + "loss": 0.2173, + "step": 21633 + }, + { + "epoch": 0.3858666571540684, + "grad_norm": 0.5236746072769165, + "learning_rate": 3.855153395854985e-05, + "loss": 0.2136, + "step": 21634 + }, + { + "epoch": 0.3858844932757821, + "grad_norm": 0.27546462416648865, + "learning_rate": 3.8550225938396175e-05, + "loss": 0.1847, + "step": 21635 + }, + { + "epoch": 0.3859023293974958, + "grad_norm": 0.23911882936954498, + "learning_rate": 3.854891786571714e-05, + "loss": 0.1428, + "step": 21636 + }, + { + "epoch": 0.3859201655192095, + "grad_norm": 0.24938642978668213, + "learning_rate": 3.8547609740517824e-05, + "loss": 0.1288, + "step": 21637 + }, + { + "epoch": 0.3859380016409232, + "grad_norm": 0.34109604358673096, + "learning_rate": 3.8546301562803286e-05, + "loss": 0.1251, + "step": 21638 + }, + { + "epoch": 0.38595583776263687, + "grad_norm": 0.3316737115383148, + "learning_rate": 3.85449933325786e-05, + "loss": 0.17, + "step": 21639 + }, + { + "epoch": 0.3859736738843506, + "grad_norm": 0.31739625334739685, + "learning_rate": 3.854368504984885e-05, + "loss": 0.1609, + "step": 21640 + }, + { + "epoch": 0.3859915100060643, + "grad_norm": 0.2375878244638443, + "learning_rate": 3.85423767146191e-05, + "loss": 0.1234, + "step": 21641 + }, + { + "epoch": 0.386009346127778, + "grad_norm": 0.277337908744812, + "learning_rate": 3.8541068326894424e-05, + "loss": 0.158, + "step": 21642 + }, + { + "epoch": 0.3860271822494917, + "grad_norm": 0.3270178735256195, + "learning_rate": 3.8539759886679884e-05, + "loss": 0.173, + "step": 21643 + }, + { + "epoch": 0.38604501837120536, + "grad_norm": 0.3947116434574127, + "learning_rate": 3.853845139398056e-05, + "loss": 0.1369, + "step": 21644 + }, + { + "epoch": 0.38606285449291905, + "grad_norm": 0.19846835732460022, + "learning_rate": 3.8537142848801514e-05, + "loss": 0.1601, + "step": 21645 + }, + { + "epoch": 0.38608069061463274, + "grad_norm": 0.260405570268631, + "learning_rate": 3.853583425114784e-05, + "loss": 0.1925, + "step": 21646 + }, + { + "epoch": 0.3860985267363464, + "grad_norm": 0.29649707674980164, + "learning_rate": 3.853452560102459e-05, + "loss": 0.1478, + "step": 21647 + }, + { + "epoch": 0.38611636285806017, + "grad_norm": 0.3264923095703125, + "learning_rate": 3.8533216898436845e-05, + "loss": 0.1844, + "step": 21648 + }, + { + "epoch": 0.38613419897977386, + "grad_norm": 0.259240984916687, + "learning_rate": 3.853190814338968e-05, + "loss": 0.1611, + "step": 21649 + }, + { + "epoch": 0.38615203510148755, + "grad_norm": 0.25992581248283386, + "learning_rate": 3.853059933588816e-05, + "loss": 0.1476, + "step": 21650 + }, + { + "epoch": 0.38616987122320123, + "grad_norm": 0.25727441906929016, + "learning_rate": 3.8529290475937374e-05, + "loss": 0.1611, + "step": 21651 + }, + { + "epoch": 0.3861877073449149, + "grad_norm": 0.22839511930942535, + "learning_rate": 3.852798156354237e-05, + "loss": 0.1678, + "step": 21652 + }, + { + "epoch": 0.3862055434666286, + "grad_norm": 0.18959090113639832, + "learning_rate": 3.852667259870825e-05, + "loss": 0.1034, + "step": 21653 + }, + { + "epoch": 0.3862233795883423, + "grad_norm": 0.30014801025390625, + "learning_rate": 3.852536358144007e-05, + "loss": 0.1277, + "step": 21654 + }, + { + "epoch": 0.386241215710056, + "grad_norm": 0.19943666458129883, + "learning_rate": 3.852405451174291e-05, + "loss": 0.1307, + "step": 21655 + }, + { + "epoch": 0.3862590518317697, + "grad_norm": 0.28613272309303284, + "learning_rate": 3.852274538962184e-05, + "loss": 0.1565, + "step": 21656 + }, + { + "epoch": 0.3862768879534834, + "grad_norm": 0.2960610091686249, + "learning_rate": 3.8521436215081945e-05, + "loss": 0.1585, + "step": 21657 + }, + { + "epoch": 0.3862947240751971, + "grad_norm": 0.2918239235877991, + "learning_rate": 3.852012698812829e-05, + "loss": 0.2052, + "step": 21658 + }, + { + "epoch": 0.3863125601969108, + "grad_norm": 0.38167905807495117, + "learning_rate": 3.851881770876595e-05, + "loss": 0.2402, + "step": 21659 + }, + { + "epoch": 0.3863303963186245, + "grad_norm": 0.21273306012153625, + "learning_rate": 3.8517508377000006e-05, + "loss": 0.166, + "step": 21660 + }, + { + "epoch": 0.38634823244033817, + "grad_norm": 0.22714021801948547, + "learning_rate": 3.851619899283553e-05, + "loss": 0.141, + "step": 21661 + }, + { + "epoch": 0.38636606856205186, + "grad_norm": 0.232809379696846, + "learning_rate": 3.85148895562776e-05, + "loss": 0.1739, + "step": 21662 + }, + { + "epoch": 0.38638390468376554, + "grad_norm": 0.24009500443935394, + "learning_rate": 3.851358006733129e-05, + "loss": 0.1675, + "step": 21663 + }, + { + "epoch": 0.38640174080547923, + "grad_norm": 0.28202730417251587, + "learning_rate": 3.851227052600167e-05, + "loss": 0.1919, + "step": 21664 + }, + { + "epoch": 0.386419576927193, + "grad_norm": 0.2808999717235565, + "learning_rate": 3.8510960932293835e-05, + "loss": 0.2019, + "step": 21665 + }, + { + "epoch": 0.38643741304890666, + "grad_norm": 0.18273112177848816, + "learning_rate": 3.850965128621284e-05, + "loss": 0.1042, + "step": 21666 + }, + { + "epoch": 0.38645524917062035, + "grad_norm": 0.24341915547847748, + "learning_rate": 3.850834158776377e-05, + "loss": 0.135, + "step": 21667 + }, + { + "epoch": 0.38647308529233404, + "grad_norm": 0.30897343158721924, + "learning_rate": 3.8507031836951704e-05, + "loss": 0.189, + "step": 21668 + }, + { + "epoch": 0.38649092141404773, + "grad_norm": 0.23671004176139832, + "learning_rate": 3.850572203378172e-05, + "loss": 0.1754, + "step": 21669 + }, + { + "epoch": 0.3865087575357614, + "grad_norm": 0.3675598204135895, + "learning_rate": 3.8504412178258886e-05, + "loss": 0.1918, + "step": 21670 + }, + { + "epoch": 0.3865265936574751, + "grad_norm": 0.18170593678951263, + "learning_rate": 3.850310227038829e-05, + "loss": 0.133, + "step": 21671 + }, + { + "epoch": 0.3865444297791888, + "grad_norm": 0.21059879660606384, + "learning_rate": 3.850179231017501e-05, + "loss": 0.1692, + "step": 21672 + }, + { + "epoch": 0.38656226590090254, + "grad_norm": 0.23930495977401733, + "learning_rate": 3.850048229762412e-05, + "loss": 0.1628, + "step": 21673 + }, + { + "epoch": 0.3865801020226162, + "grad_norm": 0.3416757583618164, + "learning_rate": 3.849917223274069e-05, + "loss": 0.1497, + "step": 21674 + }, + { + "epoch": 0.3865979381443299, + "grad_norm": 0.2754180431365967, + "learning_rate": 3.849786211552981e-05, + "loss": 0.1016, + "step": 21675 + }, + { + "epoch": 0.3866157742660436, + "grad_norm": 0.24283252656459808, + "learning_rate": 3.8496551945996556e-05, + "loss": 0.1746, + "step": 21676 + }, + { + "epoch": 0.3866336103877573, + "grad_norm": 0.3330623209476471, + "learning_rate": 3.8495241724146006e-05, + "loss": 0.1303, + "step": 21677 + }, + { + "epoch": 0.386651446509471, + "grad_norm": 0.33630359172821045, + "learning_rate": 3.849393144998324e-05, + "loss": 0.1668, + "step": 21678 + }, + { + "epoch": 0.38666928263118466, + "grad_norm": 0.32251837849617004, + "learning_rate": 3.849262112351332e-05, + "loss": 0.1187, + "step": 21679 + }, + { + "epoch": 0.38668711875289835, + "grad_norm": 0.32943660020828247, + "learning_rate": 3.849131074474135e-05, + "loss": 0.1358, + "step": 21680 + }, + { + "epoch": 0.38670495487461204, + "grad_norm": 0.24475125968456268, + "learning_rate": 3.84900003136724e-05, + "loss": 0.1217, + "step": 21681 + }, + { + "epoch": 0.3867227909963258, + "grad_norm": 0.36403796076774597, + "learning_rate": 3.8488689830311554e-05, + "loss": 0.1751, + "step": 21682 + }, + { + "epoch": 0.38674062711803947, + "grad_norm": 0.26338905096054077, + "learning_rate": 3.8487379294663886e-05, + "loss": 0.2033, + "step": 21683 + }, + { + "epoch": 0.38675846323975316, + "grad_norm": 0.22861157357692719, + "learning_rate": 3.8486068706734465e-05, + "loss": 0.1232, + "step": 21684 + }, + { + "epoch": 0.38677629936146685, + "grad_norm": 0.25246092677116394, + "learning_rate": 3.84847580665284e-05, + "loss": 0.1377, + "step": 21685 + }, + { + "epoch": 0.38679413548318053, + "grad_norm": 0.23795294761657715, + "learning_rate": 3.8483447374050746e-05, + "loss": 0.1212, + "step": 21686 + }, + { + "epoch": 0.3868119716048942, + "grad_norm": 0.2064659744501114, + "learning_rate": 3.84821366293066e-05, + "loss": 0.1455, + "step": 21687 + }, + { + "epoch": 0.3868298077266079, + "grad_norm": 0.2659085690975189, + "learning_rate": 3.8480825832301026e-05, + "loss": 0.1492, + "step": 21688 + }, + { + "epoch": 0.3868476438483216, + "grad_norm": 0.22061419486999512, + "learning_rate": 3.8479514983039125e-05, + "loss": 0.1379, + "step": 21689 + }, + { + "epoch": 0.38686547997003534, + "grad_norm": 0.2448718398809433, + "learning_rate": 3.847820408152596e-05, + "loss": 0.1889, + "step": 21690 + }, + { + "epoch": 0.38688331609174903, + "grad_norm": 0.5124644637107849, + "learning_rate": 3.847689312776663e-05, + "loss": 0.1861, + "step": 21691 + }, + { + "epoch": 0.3869011522134627, + "grad_norm": 0.27622994780540466, + "learning_rate": 3.84755821217662e-05, + "loss": 0.1889, + "step": 21692 + }, + { + "epoch": 0.3869189883351764, + "grad_norm": 0.31315430998802185, + "learning_rate": 3.847427106352976e-05, + "loss": 0.1445, + "step": 21693 + }, + { + "epoch": 0.3869368244568901, + "grad_norm": 0.21457968652248383, + "learning_rate": 3.8472959953062394e-05, + "loss": 0.1384, + "step": 21694 + }, + { + "epoch": 0.3869546605786038, + "grad_norm": 0.23343250155448914, + "learning_rate": 3.847164879036918e-05, + "loss": 0.1887, + "step": 21695 + }, + { + "epoch": 0.38697249670031747, + "grad_norm": 0.33423900604248047, + "learning_rate": 3.847033757545521e-05, + "loss": 0.1818, + "step": 21696 + }, + { + "epoch": 0.38699033282203116, + "grad_norm": 0.35718485713005066, + "learning_rate": 3.846902630832555e-05, + "loss": 0.1553, + "step": 21697 + }, + { + "epoch": 0.38700816894374485, + "grad_norm": 0.26283907890319824, + "learning_rate": 3.846771498898529e-05, + "loss": 0.1542, + "step": 21698 + }, + { + "epoch": 0.3870260050654586, + "grad_norm": 0.3402228057384491, + "learning_rate": 3.846640361743952e-05, + "loss": 0.1389, + "step": 21699 + }, + { + "epoch": 0.3870438411871723, + "grad_norm": 0.26243266463279724, + "learning_rate": 3.846509219369332e-05, + "loss": 0.1499, + "step": 21700 + }, + { + "epoch": 0.38706167730888597, + "grad_norm": 0.18809480965137482, + "learning_rate": 3.846378071775176e-05, + "loss": 0.1384, + "step": 21701 + }, + { + "epoch": 0.38707951343059965, + "grad_norm": 0.29910749197006226, + "learning_rate": 3.8462469189619955e-05, + "loss": 0.1778, + "step": 21702 + }, + { + "epoch": 0.38709734955231334, + "grad_norm": 0.26675617694854736, + "learning_rate": 3.846115760930296e-05, + "loss": 0.1842, + "step": 21703 + }, + { + "epoch": 0.38711518567402703, + "grad_norm": 0.19826537370681763, + "learning_rate": 3.8459845976805866e-05, + "loss": 0.12, + "step": 21704 + }, + { + "epoch": 0.3871330217957407, + "grad_norm": 0.23616188764572144, + "learning_rate": 3.845853429213377e-05, + "loss": 0.1357, + "step": 21705 + }, + { + "epoch": 0.3871508579174544, + "grad_norm": 0.33836686611175537, + "learning_rate": 3.845722255529173e-05, + "loss": 0.1604, + "step": 21706 + }, + { + "epoch": 0.38716869403916815, + "grad_norm": 0.21005497872829437, + "learning_rate": 3.845591076628486e-05, + "loss": 0.1221, + "step": 21707 + }, + { + "epoch": 0.38718653016088184, + "grad_norm": 0.4515397548675537, + "learning_rate": 3.845459892511822e-05, + "loss": 0.1368, + "step": 21708 + }, + { + "epoch": 0.3872043662825955, + "grad_norm": 0.320442795753479, + "learning_rate": 3.845328703179692e-05, + "loss": 0.1526, + "step": 21709 + }, + { + "epoch": 0.3872222024043092, + "grad_norm": 0.2485407143831253, + "learning_rate": 3.845197508632603e-05, + "loss": 0.1396, + "step": 21710 + }, + { + "epoch": 0.3872400385260229, + "grad_norm": 0.2517538368701935, + "learning_rate": 3.845066308871065e-05, + "loss": 0.213, + "step": 21711 + }, + { + "epoch": 0.3872578746477366, + "grad_norm": 0.21894724667072296, + "learning_rate": 3.8449351038955836e-05, + "loss": 0.0998, + "step": 21712 + }, + { + "epoch": 0.3872757107694503, + "grad_norm": 0.1558244526386261, + "learning_rate": 3.84480389370667e-05, + "loss": 0.1074, + "step": 21713 + }, + { + "epoch": 0.38729354689116396, + "grad_norm": 0.2754209637641907, + "learning_rate": 3.844672678304831e-05, + "loss": 0.142, + "step": 21714 + }, + { + "epoch": 0.38731138301287765, + "grad_norm": 0.24693749845027924, + "learning_rate": 3.844541457690578e-05, + "loss": 0.1736, + "step": 21715 + }, + { + "epoch": 0.3873292191345914, + "grad_norm": 0.2691107392311096, + "learning_rate": 3.8444102318644165e-05, + "loss": 0.1575, + "step": 21716 + }, + { + "epoch": 0.3873470552563051, + "grad_norm": 0.25201287865638733, + "learning_rate": 3.8442790008268576e-05, + "loss": 0.1509, + "step": 21717 + }, + { + "epoch": 0.38736489137801877, + "grad_norm": 0.28409329056739807, + "learning_rate": 3.8441477645784084e-05, + "loss": 0.1729, + "step": 21718 + }, + { + "epoch": 0.38738272749973246, + "grad_norm": 0.23766516149044037, + "learning_rate": 3.844016523119578e-05, + "loss": 0.1601, + "step": 21719 + }, + { + "epoch": 0.38740056362144615, + "grad_norm": 0.25733527541160583, + "learning_rate": 3.843885276450876e-05, + "loss": 0.1327, + "step": 21720 + }, + { + "epoch": 0.38741839974315984, + "grad_norm": 0.3048568069934845, + "learning_rate": 3.8437540245728095e-05, + "loss": 0.1091, + "step": 21721 + }, + { + "epoch": 0.3874362358648735, + "grad_norm": 0.2690945863723755, + "learning_rate": 3.8436227674858895e-05, + "loss": 0.1946, + "step": 21722 + }, + { + "epoch": 0.3874540719865872, + "grad_norm": 0.23019066452980042, + "learning_rate": 3.843491505190623e-05, + "loss": 0.1736, + "step": 21723 + }, + { + "epoch": 0.38747190810830096, + "grad_norm": 0.18285711109638214, + "learning_rate": 3.84336023768752e-05, + "loss": 0.169, + "step": 21724 + }, + { + "epoch": 0.38748974423001464, + "grad_norm": 0.28343191742897034, + "learning_rate": 3.843228964977088e-05, + "loss": 0.1573, + "step": 21725 + }, + { + "epoch": 0.38750758035172833, + "grad_norm": 0.25772616267204285, + "learning_rate": 3.8430976870598366e-05, + "loss": 0.2121, + "step": 21726 + }, + { + "epoch": 0.387525416473442, + "grad_norm": 0.24936121702194214, + "learning_rate": 3.842966403936274e-05, + "loss": 0.1319, + "step": 21727 + }, + { + "epoch": 0.3875432525951557, + "grad_norm": 0.281258225440979, + "learning_rate": 3.842835115606911e-05, + "loss": 0.1184, + "step": 21728 + }, + { + "epoch": 0.3875610887168694, + "grad_norm": 0.38405823707580566, + "learning_rate": 3.842703822072255e-05, + "loss": 0.1969, + "step": 21729 + }, + { + "epoch": 0.3875789248385831, + "grad_norm": 0.3724100589752197, + "learning_rate": 3.8425725233328157e-05, + "loss": 0.1761, + "step": 21730 + }, + { + "epoch": 0.38759676096029677, + "grad_norm": 0.2624076306819916, + "learning_rate": 3.8424412193891016e-05, + "loss": 0.1827, + "step": 21731 + }, + { + "epoch": 0.3876145970820105, + "grad_norm": 0.19694197177886963, + "learning_rate": 3.84230991024162e-05, + "loss": 0.1873, + "step": 21732 + }, + { + "epoch": 0.3876324332037242, + "grad_norm": 0.21732911467552185, + "learning_rate": 3.8421785958908826e-05, + "loss": 0.1376, + "step": 21733 + }, + { + "epoch": 0.3876502693254379, + "grad_norm": 0.320049911737442, + "learning_rate": 3.8420472763373976e-05, + "loss": 0.1449, + "step": 21734 + }, + { + "epoch": 0.3876681054471516, + "grad_norm": 0.38718122243881226, + "learning_rate": 3.841915951581674e-05, + "loss": 0.1507, + "step": 21735 + }, + { + "epoch": 0.38768594156886527, + "grad_norm": 0.3303813934326172, + "learning_rate": 3.84178462162422e-05, + "loss": 0.1582, + "step": 21736 + }, + { + "epoch": 0.38770377769057895, + "grad_norm": 0.3199482560157776, + "learning_rate": 3.841653286465546e-05, + "loss": 0.1461, + "step": 21737 + }, + { + "epoch": 0.38772161381229264, + "grad_norm": 0.2577473819255829, + "learning_rate": 3.8415219461061605e-05, + "loss": 0.1477, + "step": 21738 + }, + { + "epoch": 0.38773944993400633, + "grad_norm": 0.22839266061782837, + "learning_rate": 3.8413906005465725e-05, + "loss": 0.1219, + "step": 21739 + }, + { + "epoch": 0.38775728605572, + "grad_norm": 0.3764893710613251, + "learning_rate": 3.8412592497872905e-05, + "loss": 0.174, + "step": 21740 + }, + { + "epoch": 0.38777512217743376, + "grad_norm": 0.23945572972297668, + "learning_rate": 3.8411278938288254e-05, + "loss": 0.1756, + "step": 21741 + }, + { + "epoch": 0.38779295829914745, + "grad_norm": 0.26251915097236633, + "learning_rate": 3.840996532671685e-05, + "loss": 0.1487, + "step": 21742 + }, + { + "epoch": 0.38781079442086114, + "grad_norm": 0.2576858103275299, + "learning_rate": 3.840865166316379e-05, + "loss": 0.1665, + "step": 21743 + }, + { + "epoch": 0.3878286305425748, + "grad_norm": 0.3027690052986145, + "learning_rate": 3.840733794763416e-05, + "loss": 0.1587, + "step": 21744 + }, + { + "epoch": 0.3878464666642885, + "grad_norm": 0.21462975442409515, + "learning_rate": 3.840602418013306e-05, + "loss": 0.156, + "step": 21745 + }, + { + "epoch": 0.3878643027860022, + "grad_norm": 0.19485387206077576, + "learning_rate": 3.840471036066559e-05, + "loss": 0.1318, + "step": 21746 + }, + { + "epoch": 0.3878821389077159, + "grad_norm": 0.2827766239643097, + "learning_rate": 3.8403396489236806e-05, + "loss": 0.1589, + "step": 21747 + }, + { + "epoch": 0.3878999750294296, + "grad_norm": 0.4065455496311188, + "learning_rate": 3.840208256585185e-05, + "loss": 0.166, + "step": 21748 + }, + { + "epoch": 0.3879178111511433, + "grad_norm": 0.23038853704929352, + "learning_rate": 3.840076859051578e-05, + "loss": 0.1877, + "step": 21749 + }, + { + "epoch": 0.387935647272857, + "grad_norm": 0.27545079588890076, + "learning_rate": 3.8399454563233716e-05, + "loss": 0.1404, + "step": 21750 + }, + { + "epoch": 0.3879534833945707, + "grad_norm": 0.2805190980434418, + "learning_rate": 3.839814048401074e-05, + "loss": 0.13, + "step": 21751 + }, + { + "epoch": 0.3879713195162844, + "grad_norm": 0.329283744096756, + "learning_rate": 3.839682635285193e-05, + "loss": 0.1753, + "step": 21752 + }, + { + "epoch": 0.3879891556379981, + "grad_norm": 0.29797378182411194, + "learning_rate": 3.8395512169762406e-05, + "loss": 0.1533, + "step": 21753 + }, + { + "epoch": 0.38800699175971176, + "grad_norm": 0.2928731441497803, + "learning_rate": 3.839419793474723e-05, + "loss": 0.2148, + "step": 21754 + }, + { + "epoch": 0.38802482788142545, + "grad_norm": 0.2856592833995819, + "learning_rate": 3.839288364781154e-05, + "loss": 0.1616, + "step": 21755 + }, + { + "epoch": 0.38804266400313914, + "grad_norm": 0.26209142804145813, + "learning_rate": 3.839156930896039e-05, + "loss": 0.168, + "step": 21756 + }, + { + "epoch": 0.3880605001248528, + "grad_norm": 0.2429530918598175, + "learning_rate": 3.839025491819891e-05, + "loss": 0.2252, + "step": 21757 + }, + { + "epoch": 0.38807833624656657, + "grad_norm": 0.46106863021850586, + "learning_rate": 3.838894047553217e-05, + "loss": 0.1843, + "step": 21758 + }, + { + "epoch": 0.38809617236828026, + "grad_norm": 0.2597132623195648, + "learning_rate": 3.838762598096527e-05, + "loss": 0.1671, + "step": 21759 + }, + { + "epoch": 0.38811400848999394, + "grad_norm": 0.2598365247249603, + "learning_rate": 3.838631143450331e-05, + "loss": 0.1248, + "step": 21760 + }, + { + "epoch": 0.38813184461170763, + "grad_norm": 0.23313425481319427, + "learning_rate": 3.8384996836151374e-05, + "loss": 0.1692, + "step": 21761 + }, + { + "epoch": 0.3881496807334213, + "grad_norm": 0.2174071967601776, + "learning_rate": 3.838368218591457e-05, + "loss": 0.1385, + "step": 21762 + }, + { + "epoch": 0.388167516855135, + "grad_norm": 0.3024972975254059, + "learning_rate": 3.8382367483797996e-05, + "loss": 0.2051, + "step": 21763 + }, + { + "epoch": 0.3881853529768487, + "grad_norm": 0.286173552274704, + "learning_rate": 3.8381052729806745e-05, + "loss": 0.168, + "step": 21764 + }, + { + "epoch": 0.3882031890985624, + "grad_norm": 0.4113507866859436, + "learning_rate": 3.837973792394591e-05, + "loss": 0.1419, + "step": 21765 + }, + { + "epoch": 0.38822102522027613, + "grad_norm": 0.2892780303955078, + "learning_rate": 3.837842306622059e-05, + "loss": 0.1706, + "step": 21766 + }, + { + "epoch": 0.3882388613419898, + "grad_norm": 0.27912452816963196, + "learning_rate": 3.837710815663589e-05, + "loss": 0.2151, + "step": 21767 + }, + { + "epoch": 0.3882566974637035, + "grad_norm": 0.3342384696006775, + "learning_rate": 3.8375793195196886e-05, + "loss": 0.1705, + "step": 21768 + }, + { + "epoch": 0.3882745335854172, + "grad_norm": 0.2256297618150711, + "learning_rate": 3.8374478181908684e-05, + "loss": 0.1586, + "step": 21769 + }, + { + "epoch": 0.3882923697071309, + "grad_norm": 0.1997583657503128, + "learning_rate": 3.837316311677639e-05, + "loss": 0.171, + "step": 21770 + }, + { + "epoch": 0.38831020582884457, + "grad_norm": 0.2296144664287567, + "learning_rate": 3.8371847999805096e-05, + "loss": 0.159, + "step": 21771 + }, + { + "epoch": 0.38832804195055826, + "grad_norm": 0.30388113856315613, + "learning_rate": 3.837053283099992e-05, + "loss": 0.1758, + "step": 21772 + }, + { + "epoch": 0.38834587807227194, + "grad_norm": 0.2751940190792084, + "learning_rate": 3.8369217610365916e-05, + "loss": 0.1995, + "step": 21773 + }, + { + "epoch": 0.3883637141939857, + "grad_norm": 0.27325740456581116, + "learning_rate": 3.836790233790821e-05, + "loss": 0.1748, + "step": 21774 + }, + { + "epoch": 0.3883815503156994, + "grad_norm": 0.2639085054397583, + "learning_rate": 3.836658701363191e-05, + "loss": 0.189, + "step": 21775 + }, + { + "epoch": 0.38839938643741306, + "grad_norm": 0.2682328224182129, + "learning_rate": 3.836527163754209e-05, + "loss": 0.1438, + "step": 21776 + }, + { + "epoch": 0.38841722255912675, + "grad_norm": 0.2557844817638397, + "learning_rate": 3.836395620964387e-05, + "loss": 0.1246, + "step": 21777 + }, + { + "epoch": 0.38843505868084044, + "grad_norm": 0.3033906817436218, + "learning_rate": 3.836264072994233e-05, + "loss": 0.1671, + "step": 21778 + }, + { + "epoch": 0.3884528948025541, + "grad_norm": 0.25656500458717346, + "learning_rate": 3.836132519844259e-05, + "loss": 0.1269, + "step": 21779 + }, + { + "epoch": 0.3884707309242678, + "grad_norm": 0.22285985946655273, + "learning_rate": 3.836000961514974e-05, + "loss": 0.126, + "step": 21780 + }, + { + "epoch": 0.3884885670459815, + "grad_norm": 0.3256186842918396, + "learning_rate": 3.835869398006887e-05, + "loss": 0.201, + "step": 21781 + }, + { + "epoch": 0.3885064031676952, + "grad_norm": 0.21302203834056854, + "learning_rate": 3.835737829320508e-05, + "loss": 0.116, + "step": 21782 + }, + { + "epoch": 0.38852423928940893, + "grad_norm": 0.24312090873718262, + "learning_rate": 3.83560625545635e-05, + "loss": 0.1744, + "step": 21783 + }, + { + "epoch": 0.3885420754111226, + "grad_norm": 0.31743359565734863, + "learning_rate": 3.8354746764149194e-05, + "loss": 0.1298, + "step": 21784 + }, + { + "epoch": 0.3885599115328363, + "grad_norm": 0.23558802902698517, + "learning_rate": 3.835343092196728e-05, + "loss": 0.1825, + "step": 21785 + }, + { + "epoch": 0.38857774765455, + "grad_norm": 0.21505975723266602, + "learning_rate": 3.835211502802285e-05, + "loss": 0.1673, + "step": 21786 + }, + { + "epoch": 0.3885955837762637, + "grad_norm": 0.3972247540950775, + "learning_rate": 3.835079908232102e-05, + "loss": 0.1849, + "step": 21787 + }, + { + "epoch": 0.3886134198979774, + "grad_norm": 0.26634714007377625, + "learning_rate": 3.834948308486688e-05, + "loss": 0.1688, + "step": 21788 + }, + { + "epoch": 0.38863125601969106, + "grad_norm": 0.2278829962015152, + "learning_rate": 3.8348167035665525e-05, + "loss": 0.1511, + "step": 21789 + }, + { + "epoch": 0.38864909214140475, + "grad_norm": 0.32002341747283936, + "learning_rate": 3.834685093472207e-05, + "loss": 0.1361, + "step": 21790 + }, + { + "epoch": 0.3886669282631185, + "grad_norm": 0.2517262399196625, + "learning_rate": 3.8345534782041614e-05, + "loss": 0.148, + "step": 21791 + }, + { + "epoch": 0.3886847643848322, + "grad_norm": 0.24844186007976532, + "learning_rate": 3.834421857762925e-05, + "loss": 0.168, + "step": 21792 + }, + { + "epoch": 0.38870260050654587, + "grad_norm": 0.3359774947166443, + "learning_rate": 3.8342902321490095e-05, + "loss": 0.2133, + "step": 21793 + }, + { + "epoch": 0.38872043662825956, + "grad_norm": 0.24227365851402283, + "learning_rate": 3.834158601362923e-05, + "loss": 0.1768, + "step": 21794 + }, + { + "epoch": 0.38873827274997325, + "grad_norm": 0.23671619594097137, + "learning_rate": 3.8340269654051775e-05, + "loss": 0.1648, + "step": 21795 + }, + { + "epoch": 0.38875610887168693, + "grad_norm": 0.3522777855396271, + "learning_rate": 3.8338953242762826e-05, + "loss": 0.2152, + "step": 21796 + }, + { + "epoch": 0.3887739449934006, + "grad_norm": 0.21119537949562073, + "learning_rate": 3.833763677976748e-05, + "loss": 0.1568, + "step": 21797 + }, + { + "epoch": 0.3887917811151143, + "grad_norm": 0.28719526529312134, + "learning_rate": 3.8336320265070865e-05, + "loss": 0.1569, + "step": 21798 + }, + { + "epoch": 0.388809617236828, + "grad_norm": 0.20979127287864685, + "learning_rate": 3.8335003698678053e-05, + "loss": 0.1177, + "step": 21799 + }, + { + "epoch": 0.38882745335854174, + "grad_norm": 0.2139885425567627, + "learning_rate": 3.8333687080594175e-05, + "loss": 0.1463, + "step": 21800 + }, + { + "epoch": 0.38884528948025543, + "grad_norm": 0.42440083622932434, + "learning_rate": 3.8332370410824305e-05, + "loss": 0.1749, + "step": 21801 + }, + { + "epoch": 0.3888631256019691, + "grad_norm": 0.2534075081348419, + "learning_rate": 3.833105368937356e-05, + "loss": 0.1405, + "step": 21802 + }, + { + "epoch": 0.3888809617236828, + "grad_norm": 0.3661023676395416, + "learning_rate": 3.832973691624706e-05, + "loss": 0.1899, + "step": 21803 + }, + { + "epoch": 0.3888987978453965, + "grad_norm": 0.3053882420063019, + "learning_rate": 3.832842009144989e-05, + "loss": 0.1715, + "step": 21804 + }, + { + "epoch": 0.3889166339671102, + "grad_norm": 0.32161349058151245, + "learning_rate": 3.8327103214987156e-05, + "loss": 0.1542, + "step": 21805 + }, + { + "epoch": 0.38893447008882387, + "grad_norm": 0.23700089752674103, + "learning_rate": 3.832578628686397e-05, + "loss": 0.1774, + "step": 21806 + }, + { + "epoch": 0.38895230621053756, + "grad_norm": 0.3484228849411011, + "learning_rate": 3.832446930708544e-05, + "loss": 0.1781, + "step": 21807 + }, + { + "epoch": 0.3889701423322513, + "grad_norm": 0.3152025640010834, + "learning_rate": 3.832315227565666e-05, + "loss": 0.1742, + "step": 21808 + }, + { + "epoch": 0.388987978453965, + "grad_norm": 0.18587124347686768, + "learning_rate": 3.832183519258274e-05, + "loss": 0.1241, + "step": 21809 + }, + { + "epoch": 0.3890058145756787, + "grad_norm": 0.2782955765724182, + "learning_rate": 3.832051805786878e-05, + "loss": 0.1613, + "step": 21810 + }, + { + "epoch": 0.38902365069739236, + "grad_norm": 0.30954328179359436, + "learning_rate": 3.83192008715199e-05, + "loss": 0.2096, + "step": 21811 + }, + { + "epoch": 0.38904148681910605, + "grad_norm": 0.2265925109386444, + "learning_rate": 3.8317883633541195e-05, + "loss": 0.1692, + "step": 21812 + }, + { + "epoch": 0.38905932294081974, + "grad_norm": 0.22565065324306488, + "learning_rate": 3.8316566343937774e-05, + "loss": 0.1624, + "step": 21813 + }, + { + "epoch": 0.38907715906253343, + "grad_norm": 0.35718339681625366, + "learning_rate": 3.8315249002714737e-05, + "loss": 0.1792, + "step": 21814 + }, + { + "epoch": 0.3890949951842471, + "grad_norm": 0.2791139781475067, + "learning_rate": 3.8313931609877204e-05, + "loss": 0.1936, + "step": 21815 + }, + { + "epoch": 0.3891128313059608, + "grad_norm": 0.2908702790737152, + "learning_rate": 3.8312614165430266e-05, + "loss": 0.1762, + "step": 21816 + }, + { + "epoch": 0.38913066742767455, + "grad_norm": 0.3196234703063965, + "learning_rate": 3.831129666937904e-05, + "loss": 0.1217, + "step": 21817 + }, + { + "epoch": 0.38914850354938824, + "grad_norm": 0.22735252976417542, + "learning_rate": 3.830997912172863e-05, + "loss": 0.1548, + "step": 21818 + }, + { + "epoch": 0.3891663396711019, + "grad_norm": 0.29175421595573425, + "learning_rate": 3.830866152248414e-05, + "loss": 0.1526, + "step": 21819 + }, + { + "epoch": 0.3891841757928156, + "grad_norm": 0.330176442861557, + "learning_rate": 3.830734387165069e-05, + "loss": 0.2183, + "step": 21820 + }, + { + "epoch": 0.3892020119145293, + "grad_norm": 0.27319836616516113, + "learning_rate": 3.8306026169233375e-05, + "loss": 0.1433, + "step": 21821 + }, + { + "epoch": 0.389219848036243, + "grad_norm": 0.3326437175273895, + "learning_rate": 3.830470841523731e-05, + "loss": 0.2051, + "step": 21822 + }, + { + "epoch": 0.3892376841579567, + "grad_norm": 0.2857581079006195, + "learning_rate": 3.83033906096676e-05, + "loss": 0.1825, + "step": 21823 + }, + { + "epoch": 0.38925552027967036, + "grad_norm": 0.22990186512470245, + "learning_rate": 3.830207275252934e-05, + "loss": 0.1832, + "step": 21824 + }, + { + "epoch": 0.3892733564013841, + "grad_norm": 0.2547556161880493, + "learning_rate": 3.830075484382767e-05, + "loss": 0.2202, + "step": 21825 + }, + { + "epoch": 0.3892911925230978, + "grad_norm": 0.2691248059272766, + "learning_rate": 3.829943688356767e-05, + "loss": 0.1813, + "step": 21826 + }, + { + "epoch": 0.3893090286448115, + "grad_norm": 0.2959471046924591, + "learning_rate": 3.8298118871754465e-05, + "loss": 0.1736, + "step": 21827 + }, + { + "epoch": 0.38932686476652517, + "grad_norm": 0.29780328273773193, + "learning_rate": 3.829680080839315e-05, + "loss": 0.1457, + "step": 21828 + }, + { + "epoch": 0.38934470088823886, + "grad_norm": 0.26026979088783264, + "learning_rate": 3.829548269348885e-05, + "loss": 0.1616, + "step": 21829 + }, + { + "epoch": 0.38936253700995255, + "grad_norm": 0.2784067690372467, + "learning_rate": 3.829416452704666e-05, + "loss": 0.214, + "step": 21830 + }, + { + "epoch": 0.38938037313166624, + "grad_norm": 0.23501050472259521, + "learning_rate": 3.82928463090717e-05, + "loss": 0.1798, + "step": 21831 + }, + { + "epoch": 0.3893982092533799, + "grad_norm": 0.2049226462841034, + "learning_rate": 3.829152803956908e-05, + "loss": 0.1342, + "step": 21832 + }, + { + "epoch": 0.38941604537509367, + "grad_norm": 0.3493288457393646, + "learning_rate": 3.8290209718543896e-05, + "loss": 0.1866, + "step": 21833 + }, + { + "epoch": 0.38943388149680735, + "grad_norm": 0.3064088225364685, + "learning_rate": 3.828889134600128e-05, + "loss": 0.1812, + "step": 21834 + }, + { + "epoch": 0.38945171761852104, + "grad_norm": 0.3190014660358429, + "learning_rate": 3.828757292194633e-05, + "loss": 0.1919, + "step": 21835 + }, + { + "epoch": 0.38946955374023473, + "grad_norm": 0.23864136636257172, + "learning_rate": 3.828625444638415e-05, + "loss": 0.1391, + "step": 21836 + }, + { + "epoch": 0.3894873898619484, + "grad_norm": 0.34682396054267883, + "learning_rate": 3.828493591931986e-05, + "loss": 0.1363, + "step": 21837 + }, + { + "epoch": 0.3895052259836621, + "grad_norm": 0.21826475858688354, + "learning_rate": 3.8283617340758584e-05, + "loss": 0.1496, + "step": 21838 + }, + { + "epoch": 0.3895230621053758, + "grad_norm": 0.2667967677116394, + "learning_rate": 3.828229871070541e-05, + "loss": 0.1047, + "step": 21839 + }, + { + "epoch": 0.3895408982270895, + "grad_norm": 0.30081990361213684, + "learning_rate": 3.828098002916545e-05, + "loss": 0.1719, + "step": 21840 + }, + { + "epoch": 0.38955873434880317, + "grad_norm": 0.26464831829071045, + "learning_rate": 3.827966129614384e-05, + "loss": 0.1065, + "step": 21841 + }, + { + "epoch": 0.3895765704705169, + "grad_norm": 0.2265709638595581, + "learning_rate": 3.827834251164567e-05, + "loss": 0.1594, + "step": 21842 + }, + { + "epoch": 0.3895944065922306, + "grad_norm": 0.2632579803466797, + "learning_rate": 3.827702367567606e-05, + "loss": 0.1339, + "step": 21843 + }, + { + "epoch": 0.3896122427139443, + "grad_norm": 0.2761947810649872, + "learning_rate": 3.827570478824011e-05, + "loss": 0.1955, + "step": 21844 + }, + { + "epoch": 0.389630078835658, + "grad_norm": 0.27302491664886475, + "learning_rate": 3.827438584934295e-05, + "loss": 0.1887, + "step": 21845 + }, + { + "epoch": 0.38964791495737167, + "grad_norm": 0.26912567019462585, + "learning_rate": 3.827306685898968e-05, + "loss": 0.1366, + "step": 21846 + }, + { + "epoch": 0.38966575107908535, + "grad_norm": 0.2460840344429016, + "learning_rate": 3.8271747817185434e-05, + "loss": 0.0945, + "step": 21847 + }, + { + "epoch": 0.38968358720079904, + "grad_norm": 0.28369140625, + "learning_rate": 3.82704287239353e-05, + "loss": 0.1759, + "step": 21848 + }, + { + "epoch": 0.38970142332251273, + "grad_norm": 0.2833975553512573, + "learning_rate": 3.82691095792444e-05, + "loss": 0.1409, + "step": 21849 + }, + { + "epoch": 0.3897192594442265, + "grad_norm": 0.24005232751369476, + "learning_rate": 3.826779038311785e-05, + "loss": 0.1578, + "step": 21850 + }, + { + "epoch": 0.38973709556594016, + "grad_norm": 0.35843420028686523, + "learning_rate": 3.8266471135560756e-05, + "loss": 0.149, + "step": 21851 + }, + { + "epoch": 0.38975493168765385, + "grad_norm": 0.2971751093864441, + "learning_rate": 3.8265151836578237e-05, + "loss": 0.1273, + "step": 21852 + }, + { + "epoch": 0.38977276780936754, + "grad_norm": 0.3116188049316406, + "learning_rate": 3.8263832486175416e-05, + "loss": 0.22, + "step": 21853 + }, + { + "epoch": 0.3897906039310812, + "grad_norm": 0.28640657663345337, + "learning_rate": 3.82625130843574e-05, + "loss": 0.15, + "step": 21854 + }, + { + "epoch": 0.3898084400527949, + "grad_norm": 0.2245357185602188, + "learning_rate": 3.82611936311293e-05, + "loss": 0.1454, + "step": 21855 + }, + { + "epoch": 0.3898262761745086, + "grad_norm": 0.23760348558425903, + "learning_rate": 3.825987412649623e-05, + "loss": 0.1515, + "step": 21856 + }, + { + "epoch": 0.3898441122962223, + "grad_norm": 0.21389377117156982, + "learning_rate": 3.8258554570463314e-05, + "loss": 0.1222, + "step": 21857 + }, + { + "epoch": 0.389861948417936, + "grad_norm": 0.30927756428718567, + "learning_rate": 3.825723496303565e-05, + "loss": 0.1904, + "step": 21858 + }, + { + "epoch": 0.3898797845396497, + "grad_norm": 0.31562522053718567, + "learning_rate": 3.825591530421837e-05, + "loss": 0.1818, + "step": 21859 + }, + { + "epoch": 0.3898976206613634, + "grad_norm": 0.20763133466243744, + "learning_rate": 3.8254595594016594e-05, + "loss": 0.1643, + "step": 21860 + }, + { + "epoch": 0.3899154567830771, + "grad_norm": 0.27545467019081116, + "learning_rate": 3.825327583243541e-05, + "loss": 0.1748, + "step": 21861 + }, + { + "epoch": 0.3899332929047908, + "grad_norm": 0.2428862750530243, + "learning_rate": 3.825195601947997e-05, + "loss": 0.1759, + "step": 21862 + }, + { + "epoch": 0.3899511290265045, + "grad_norm": 0.2666897475719452, + "learning_rate": 3.825063615515536e-05, + "loss": 0.1404, + "step": 21863 + }, + { + "epoch": 0.38996896514821816, + "grad_norm": 0.27640974521636963, + "learning_rate": 3.8249316239466714e-05, + "loss": 0.2102, + "step": 21864 + }, + { + "epoch": 0.38998680126993185, + "grad_norm": 0.3013707995414734, + "learning_rate": 3.824799627241913e-05, + "loss": 0.1749, + "step": 21865 + }, + { + "epoch": 0.39000463739164554, + "grad_norm": 0.21664096415042877, + "learning_rate": 3.824667625401774e-05, + "loss": 0.1292, + "step": 21866 + }, + { + "epoch": 0.3900224735133593, + "grad_norm": 0.48859432339668274, + "learning_rate": 3.8245356184267665e-05, + "loss": 0.214, + "step": 21867 + }, + { + "epoch": 0.39004030963507297, + "grad_norm": 0.24816346168518066, + "learning_rate": 3.8244036063174015e-05, + "loss": 0.1271, + "step": 21868 + }, + { + "epoch": 0.39005814575678666, + "grad_norm": 0.2757461369037628, + "learning_rate": 3.824271589074191e-05, + "loss": 0.1222, + "step": 21869 + }, + { + "epoch": 0.39007598187850034, + "grad_norm": 0.30328255891799927, + "learning_rate": 3.824139566697645e-05, + "loss": 0.131, + "step": 21870 + }, + { + "epoch": 0.39009381800021403, + "grad_norm": 0.22096651792526245, + "learning_rate": 3.824007539188278e-05, + "loss": 0.1842, + "step": 21871 + }, + { + "epoch": 0.3901116541219277, + "grad_norm": 0.3081246018409729, + "learning_rate": 3.8238755065466e-05, + "loss": 0.1937, + "step": 21872 + }, + { + "epoch": 0.3901294902436414, + "grad_norm": 0.363783061504364, + "learning_rate": 3.823743468773123e-05, + "loss": 0.1664, + "step": 21873 + }, + { + "epoch": 0.3901473263653551, + "grad_norm": 0.3148186206817627, + "learning_rate": 3.8236114258683594e-05, + "loss": 0.1795, + "step": 21874 + }, + { + "epoch": 0.3901651624870688, + "grad_norm": 0.21325372159481049, + "learning_rate": 3.8234793778328204e-05, + "loss": 0.121, + "step": 21875 + }, + { + "epoch": 0.3901829986087825, + "grad_norm": 0.2629788815975189, + "learning_rate": 3.8233473246670196e-05, + "loss": 0.1563, + "step": 21876 + }, + { + "epoch": 0.3902008347304962, + "grad_norm": 0.29258981347084045, + "learning_rate": 3.823215266371466e-05, + "loss": 0.1693, + "step": 21877 + }, + { + "epoch": 0.3902186708522099, + "grad_norm": 0.2947407066822052, + "learning_rate": 3.823083202946673e-05, + "loss": 0.1726, + "step": 21878 + }, + { + "epoch": 0.3902365069739236, + "grad_norm": 0.25252288579940796, + "learning_rate": 3.822951134393152e-05, + "loss": 0.1148, + "step": 21879 + }, + { + "epoch": 0.3902543430956373, + "grad_norm": 0.28613191843032837, + "learning_rate": 3.822819060711418e-05, + "loss": 0.1277, + "step": 21880 + }, + { + "epoch": 0.39027217921735097, + "grad_norm": 0.32311660051345825, + "learning_rate": 3.8226869819019786e-05, + "loss": 0.2133, + "step": 21881 + }, + { + "epoch": 0.39029001533906466, + "grad_norm": 0.2721821963787079, + "learning_rate": 3.822554897965348e-05, + "loss": 0.1831, + "step": 21882 + }, + { + "epoch": 0.39030785146077834, + "grad_norm": 0.35092875361442566, + "learning_rate": 3.822422808902037e-05, + "loss": 0.2193, + "step": 21883 + }, + { + "epoch": 0.3903256875824921, + "grad_norm": 0.2275260090827942, + "learning_rate": 3.82229071471256e-05, + "loss": 0.1633, + "step": 21884 + }, + { + "epoch": 0.3903435237042058, + "grad_norm": 0.2982957065105438, + "learning_rate": 3.822158615397426e-05, + "loss": 0.1646, + "step": 21885 + }, + { + "epoch": 0.39036135982591946, + "grad_norm": 0.2884788513183594, + "learning_rate": 3.822026510957149e-05, + "loss": 0.1416, + "step": 21886 + }, + { + "epoch": 0.39037919594763315, + "grad_norm": 0.38194820284843445, + "learning_rate": 3.821894401392241e-05, + "loss": 0.1224, + "step": 21887 + }, + { + "epoch": 0.39039703206934684, + "grad_norm": 0.2699371576309204, + "learning_rate": 3.821762286703213e-05, + "loss": 0.1196, + "step": 21888 + }, + { + "epoch": 0.3904148681910605, + "grad_norm": 0.20406337082386017, + "learning_rate": 3.821630166890579e-05, + "loss": 0.1091, + "step": 21889 + }, + { + "epoch": 0.3904327043127742, + "grad_norm": 0.30036500096321106, + "learning_rate": 3.8214980419548495e-05, + "loss": 0.1745, + "step": 21890 + }, + { + "epoch": 0.3904505404344879, + "grad_norm": 0.2711452543735504, + "learning_rate": 3.8213659118965375e-05, + "loss": 0.1545, + "step": 21891 + }, + { + "epoch": 0.39046837655620165, + "grad_norm": 0.23234771192073822, + "learning_rate": 3.8212337767161536e-05, + "loss": 0.1368, + "step": 21892 + }, + { + "epoch": 0.39048621267791533, + "grad_norm": 0.22849218547344208, + "learning_rate": 3.821101636414212e-05, + "loss": 0.1523, + "step": 21893 + }, + { + "epoch": 0.390504048799629, + "grad_norm": 0.3615040183067322, + "learning_rate": 3.820969490991224e-05, + "loss": 0.1711, + "step": 21894 + }, + { + "epoch": 0.3905218849213427, + "grad_norm": 0.24644432961940765, + "learning_rate": 3.820837340447703e-05, + "loss": 0.1279, + "step": 21895 + }, + { + "epoch": 0.3905397210430564, + "grad_norm": 0.25518178939819336, + "learning_rate": 3.820705184784159e-05, + "loss": 0.1138, + "step": 21896 + }, + { + "epoch": 0.3905575571647701, + "grad_norm": 0.25514426827430725, + "learning_rate": 3.820573024001106e-05, + "loss": 0.1536, + "step": 21897 + }, + { + "epoch": 0.3905753932864838, + "grad_norm": 0.2216884195804596, + "learning_rate": 3.8204408580990556e-05, + "loss": 0.1854, + "step": 21898 + }, + { + "epoch": 0.39059322940819746, + "grad_norm": 0.2821498215198517, + "learning_rate": 3.820308687078521e-05, + "loss": 0.1779, + "step": 21899 + }, + { + "epoch": 0.39061106552991115, + "grad_norm": 0.3352297246456146, + "learning_rate": 3.8201765109400134e-05, + "loss": 0.1788, + "step": 21900 + }, + { + "epoch": 0.3906289016516249, + "grad_norm": 0.39840319752693176, + "learning_rate": 3.820044329684046e-05, + "loss": 0.206, + "step": 21901 + }, + { + "epoch": 0.3906467377733386, + "grad_norm": 0.29875972867012024, + "learning_rate": 3.8199121433111306e-05, + "loss": 0.1897, + "step": 21902 + }, + { + "epoch": 0.39066457389505227, + "grad_norm": 0.26019251346588135, + "learning_rate": 3.81977995182178e-05, + "loss": 0.1171, + "step": 21903 + }, + { + "epoch": 0.39068241001676596, + "grad_norm": 0.3058810830116272, + "learning_rate": 3.819647755216507e-05, + "loss": 0.138, + "step": 21904 + }, + { + "epoch": 0.39070024613847965, + "grad_norm": 0.3068974018096924, + "learning_rate": 3.819515553495822e-05, + "loss": 0.1239, + "step": 21905 + }, + { + "epoch": 0.39071808226019333, + "grad_norm": 0.35325440764427185, + "learning_rate": 3.81938334666024e-05, + "loss": 0.1634, + "step": 21906 + }, + { + "epoch": 0.390735918381907, + "grad_norm": 0.2938697040081024, + "learning_rate": 3.8192511347102725e-05, + "loss": 0.176, + "step": 21907 + }, + { + "epoch": 0.3907537545036207, + "grad_norm": 0.29065683484077454, + "learning_rate": 3.8191189176464316e-05, + "loss": 0.1425, + "step": 21908 + }, + { + "epoch": 0.39077159062533445, + "grad_norm": 0.4246031939983368, + "learning_rate": 3.81898669546923e-05, + "loss": 0.1649, + "step": 21909 + }, + { + "epoch": 0.39078942674704814, + "grad_norm": 0.22863398492336273, + "learning_rate": 3.818854468179181e-05, + "loss": 0.1612, + "step": 21910 + }, + { + "epoch": 0.39080726286876183, + "grad_norm": 0.27201688289642334, + "learning_rate": 3.818722235776796e-05, + "loss": 0.1632, + "step": 21911 + }, + { + "epoch": 0.3908250989904755, + "grad_norm": 0.19823288917541504, + "learning_rate": 3.818589998262589e-05, + "loss": 0.152, + "step": 21912 + }, + { + "epoch": 0.3908429351121892, + "grad_norm": 0.35975563526153564, + "learning_rate": 3.81845775563707e-05, + "loss": 0.1282, + "step": 21913 + }, + { + "epoch": 0.3908607712339029, + "grad_norm": 0.22304262220859528, + "learning_rate": 3.8183255079007555e-05, + "loss": 0.1151, + "step": 21914 + }, + { + "epoch": 0.3908786073556166, + "grad_norm": 0.3620227575302124, + "learning_rate": 3.818193255054155e-05, + "loss": 0.1705, + "step": 21915 + }, + { + "epoch": 0.39089644347733027, + "grad_norm": 0.2189723700284958, + "learning_rate": 3.818060997097782e-05, + "loss": 0.1804, + "step": 21916 + }, + { + "epoch": 0.39091427959904396, + "grad_norm": 0.2931801974773407, + "learning_rate": 3.8179287340321494e-05, + "loss": 0.1589, + "step": 21917 + }, + { + "epoch": 0.3909321157207577, + "grad_norm": 0.2061789631843567, + "learning_rate": 3.8177964658577706e-05, + "loss": 0.1486, + "step": 21918 + }, + { + "epoch": 0.3909499518424714, + "grad_norm": 0.41693368554115295, + "learning_rate": 3.817664192575156e-05, + "loss": 0.1699, + "step": 21919 + }, + { + "epoch": 0.3909677879641851, + "grad_norm": 0.25839945673942566, + "learning_rate": 3.817531914184821e-05, + "loss": 0.131, + "step": 21920 + }, + { + "epoch": 0.39098562408589876, + "grad_norm": 0.28907933831214905, + "learning_rate": 3.817399630687277e-05, + "loss": 0.1826, + "step": 21921 + }, + { + "epoch": 0.39100346020761245, + "grad_norm": 0.2138252705335617, + "learning_rate": 3.817267342083037e-05, + "loss": 0.1296, + "step": 21922 + }, + { + "epoch": 0.39102129632932614, + "grad_norm": 0.35851821303367615, + "learning_rate": 3.8171350483726145e-05, + "loss": 0.1647, + "step": 21923 + }, + { + "epoch": 0.39103913245103983, + "grad_norm": 0.26591426134109497, + "learning_rate": 3.81700274955652e-05, + "loss": 0.1395, + "step": 21924 + }, + { + "epoch": 0.3910569685727535, + "grad_norm": 0.37865716218948364, + "learning_rate": 3.816870445635269e-05, + "loss": 0.2024, + "step": 21925 + }, + { + "epoch": 0.39107480469446726, + "grad_norm": 0.25254198908805847, + "learning_rate": 3.8167381366093736e-05, + "loss": 0.1739, + "step": 21926 + }, + { + "epoch": 0.39109264081618095, + "grad_norm": 0.29420819878578186, + "learning_rate": 3.816605822479346e-05, + "loss": 0.1693, + "step": 21927 + }, + { + "epoch": 0.39111047693789464, + "grad_norm": 0.23101702332496643, + "learning_rate": 3.8164735032457e-05, + "loss": 0.1448, + "step": 21928 + }, + { + "epoch": 0.3911283130596083, + "grad_norm": 0.2782047688961029, + "learning_rate": 3.816341178908947e-05, + "loss": 0.1411, + "step": 21929 + }, + { + "epoch": 0.391146149181322, + "grad_norm": 0.27685534954071045, + "learning_rate": 3.816208849469601e-05, + "loss": 0.1306, + "step": 21930 + }, + { + "epoch": 0.3911639853030357, + "grad_norm": 0.3027855455875397, + "learning_rate": 3.816076514928176e-05, + "loss": 0.1756, + "step": 21931 + }, + { + "epoch": 0.3911818214247494, + "grad_norm": 0.34002652764320374, + "learning_rate": 3.815944175285183e-05, + "loss": 0.1714, + "step": 21932 + }, + { + "epoch": 0.3911996575464631, + "grad_norm": 0.24387463927268982, + "learning_rate": 3.815811830541136e-05, + "loss": 0.1842, + "step": 21933 + }, + { + "epoch": 0.3912174936681768, + "grad_norm": 0.28653308749198914, + "learning_rate": 3.815679480696548e-05, + "loss": 0.1682, + "step": 21934 + }, + { + "epoch": 0.3912353297898905, + "grad_norm": 0.18379320204257965, + "learning_rate": 3.815547125751931e-05, + "loss": 0.1411, + "step": 21935 + }, + { + "epoch": 0.3912531659116042, + "grad_norm": 0.29890039563179016, + "learning_rate": 3.815414765707801e-05, + "loss": 0.1422, + "step": 21936 + }, + { + "epoch": 0.3912710020333179, + "grad_norm": 0.2728569507598877, + "learning_rate": 3.815282400564668e-05, + "loss": 0.1703, + "step": 21937 + }, + { + "epoch": 0.39128883815503157, + "grad_norm": 0.2689141631126404, + "learning_rate": 3.815150030323046e-05, + "loss": 0.1852, + "step": 21938 + }, + { + "epoch": 0.39130667427674526, + "grad_norm": 0.24566493928432465, + "learning_rate": 3.8150176549834484e-05, + "loss": 0.1185, + "step": 21939 + }, + { + "epoch": 0.39132451039845895, + "grad_norm": 0.2373753786087036, + "learning_rate": 3.8148852745463883e-05, + "loss": 0.1134, + "step": 21940 + }, + { + "epoch": 0.39134234652017263, + "grad_norm": 0.27469125390052795, + "learning_rate": 3.814752889012378e-05, + "loss": 0.1914, + "step": 21941 + }, + { + "epoch": 0.3913601826418863, + "grad_norm": 0.26927450299263, + "learning_rate": 3.814620498381932e-05, + "loss": 0.1366, + "step": 21942 + }, + { + "epoch": 0.39137801876360007, + "grad_norm": 0.26489993929862976, + "learning_rate": 3.814488102655563e-05, + "loss": 0.1604, + "step": 21943 + }, + { + "epoch": 0.39139585488531375, + "grad_norm": 0.23299403488636017, + "learning_rate": 3.814355701833784e-05, + "loss": 0.1635, + "step": 21944 + }, + { + "epoch": 0.39141369100702744, + "grad_norm": 0.22611603140830994, + "learning_rate": 3.814223295917107e-05, + "loss": 0.1619, + "step": 21945 + }, + { + "epoch": 0.39143152712874113, + "grad_norm": 0.2590923309326172, + "learning_rate": 3.814090884906049e-05, + "loss": 0.1148, + "step": 21946 + }, + { + "epoch": 0.3914493632504548, + "grad_norm": 0.2542882561683655, + "learning_rate": 3.81395846880112e-05, + "loss": 0.1659, + "step": 21947 + }, + { + "epoch": 0.3914671993721685, + "grad_norm": 0.3152708411216736, + "learning_rate": 3.813826047602833e-05, + "loss": 0.1694, + "step": 21948 + }, + { + "epoch": 0.3914850354938822, + "grad_norm": 0.26109474897384644, + "learning_rate": 3.8136936213117036e-05, + "loss": 0.1467, + "step": 21949 + }, + { + "epoch": 0.3915028716155959, + "grad_norm": 0.26293495297431946, + "learning_rate": 3.813561189928243e-05, + "loss": 0.1767, + "step": 21950 + }, + { + "epoch": 0.3915207077373096, + "grad_norm": 0.3232541084289551, + "learning_rate": 3.8134287534529665e-05, + "loss": 0.1565, + "step": 21951 + }, + { + "epoch": 0.3915385438590233, + "grad_norm": 0.28313371539115906, + "learning_rate": 3.8132963118863864e-05, + "loss": 0.197, + "step": 21952 + }, + { + "epoch": 0.391556379980737, + "grad_norm": 0.3670051693916321, + "learning_rate": 3.8131638652290156e-05, + "loss": 0.1344, + "step": 21953 + }, + { + "epoch": 0.3915742161024507, + "grad_norm": 0.20290377736091614, + "learning_rate": 3.813031413481369e-05, + "loss": 0.1458, + "step": 21954 + }, + { + "epoch": 0.3915920522241644, + "grad_norm": 0.31401917338371277, + "learning_rate": 3.8128989566439586e-05, + "loss": 0.2025, + "step": 21955 + }, + { + "epoch": 0.39160988834587807, + "grad_norm": 0.31459882855415344, + "learning_rate": 3.812766494717298e-05, + "loss": 0.1763, + "step": 21956 + }, + { + "epoch": 0.39162772446759175, + "grad_norm": 0.3275716006755829, + "learning_rate": 3.812634027701901e-05, + "loss": 0.1369, + "step": 21957 + }, + { + "epoch": 0.39164556058930544, + "grad_norm": 0.23377977311611176, + "learning_rate": 3.8125015555982824e-05, + "loss": 0.179, + "step": 21958 + }, + { + "epoch": 0.39166339671101913, + "grad_norm": 0.18560905754566193, + "learning_rate": 3.8123690784069534e-05, + "loss": 0.1205, + "step": 21959 + }, + { + "epoch": 0.3916812328327329, + "grad_norm": 0.2274252325296402, + "learning_rate": 3.812236596128429e-05, + "loss": 0.1436, + "step": 21960 + }, + { + "epoch": 0.39169906895444656, + "grad_norm": 0.21028538048267365, + "learning_rate": 3.812104108763223e-05, + "loss": 0.1562, + "step": 21961 + }, + { + "epoch": 0.39171690507616025, + "grad_norm": 0.24540400505065918, + "learning_rate": 3.811971616311847e-05, + "loss": 0.1006, + "step": 21962 + }, + { + "epoch": 0.39173474119787394, + "grad_norm": 0.25537940859794617, + "learning_rate": 3.811839118774816e-05, + "loss": 0.1304, + "step": 21963 + }, + { + "epoch": 0.3917525773195876, + "grad_norm": 0.20558589696884155, + "learning_rate": 3.811706616152644e-05, + "loss": 0.1664, + "step": 21964 + }, + { + "epoch": 0.3917704134413013, + "grad_norm": 0.3093239367008209, + "learning_rate": 3.811574108445843e-05, + "loss": 0.1098, + "step": 21965 + }, + { + "epoch": 0.391788249563015, + "grad_norm": 0.3342975974082947, + "learning_rate": 3.8114415956549296e-05, + "loss": 0.1729, + "step": 21966 + }, + { + "epoch": 0.3918060856847287, + "grad_norm": 0.2315700501203537, + "learning_rate": 3.811309077780415e-05, + "loss": 0.1863, + "step": 21967 + }, + { + "epoch": 0.39182392180644243, + "grad_norm": 0.2535211741924286, + "learning_rate": 3.811176554822813e-05, + "loss": 0.1636, + "step": 21968 + }, + { + "epoch": 0.3918417579281561, + "grad_norm": 0.2755582928657532, + "learning_rate": 3.811044026782637e-05, + "loss": 0.192, + "step": 21969 + }, + { + "epoch": 0.3918595940498698, + "grad_norm": 0.2801467478275299, + "learning_rate": 3.810911493660403e-05, + "loss": 0.1459, + "step": 21970 + }, + { + "epoch": 0.3918774301715835, + "grad_norm": 0.2801015079021454, + "learning_rate": 3.810778955456623e-05, + "loss": 0.1518, + "step": 21971 + }, + { + "epoch": 0.3918952662932972, + "grad_norm": 0.29074493050575256, + "learning_rate": 3.8106464121718106e-05, + "loss": 0.1525, + "step": 21972 + }, + { + "epoch": 0.39191310241501087, + "grad_norm": 0.25924018025398254, + "learning_rate": 3.810513863806481e-05, + "loss": 0.1272, + "step": 21973 + }, + { + "epoch": 0.39193093853672456, + "grad_norm": 0.2612329423427582, + "learning_rate": 3.810381310361146e-05, + "loss": 0.1723, + "step": 21974 + }, + { + "epoch": 0.39194877465843825, + "grad_norm": 0.335252046585083, + "learning_rate": 3.8102487518363206e-05, + "loss": 0.1788, + "step": 21975 + }, + { + "epoch": 0.39196661078015194, + "grad_norm": 0.20958372950553894, + "learning_rate": 3.8101161882325185e-05, + "loss": 0.1775, + "step": 21976 + }, + { + "epoch": 0.3919844469018657, + "grad_norm": 0.23732659220695496, + "learning_rate": 3.809983619550254e-05, + "loss": 0.1528, + "step": 21977 + }, + { + "epoch": 0.39200228302357937, + "grad_norm": 0.38146233558654785, + "learning_rate": 3.8098510457900396e-05, + "loss": 0.1397, + "step": 21978 + }, + { + "epoch": 0.39202011914529306, + "grad_norm": 0.27583402395248413, + "learning_rate": 3.809718466952391e-05, + "loss": 0.1749, + "step": 21979 + }, + { + "epoch": 0.39203795526700674, + "grad_norm": 0.37987610697746277, + "learning_rate": 3.809585883037822e-05, + "loss": 0.1471, + "step": 21980 + }, + { + "epoch": 0.39205579138872043, + "grad_norm": 0.21410000324249268, + "learning_rate": 3.809453294046844e-05, + "loss": 0.1316, + "step": 21981 + }, + { + "epoch": 0.3920736275104341, + "grad_norm": 0.3638959527015686, + "learning_rate": 3.8093206999799737e-05, + "loss": 0.1574, + "step": 21982 + }, + { + "epoch": 0.3920914636321478, + "grad_norm": 0.29879966378211975, + "learning_rate": 3.809188100837724e-05, + "loss": 0.1504, + "step": 21983 + }, + { + "epoch": 0.3921092997538615, + "grad_norm": 0.2999878227710724, + "learning_rate": 3.809055496620609e-05, + "loss": 0.1055, + "step": 21984 + }, + { + "epoch": 0.39212713587557524, + "grad_norm": 0.2939455807209015, + "learning_rate": 3.808922887329143e-05, + "loss": 0.1417, + "step": 21985 + }, + { + "epoch": 0.3921449719972889, + "grad_norm": 0.2802325487136841, + "learning_rate": 3.808790272963839e-05, + "loss": 0.1563, + "step": 21986 + }, + { + "epoch": 0.3921628081190026, + "grad_norm": 0.2000342160463333, + "learning_rate": 3.808657653525213e-05, + "loss": 0.1626, + "step": 21987 + }, + { + "epoch": 0.3921806442407163, + "grad_norm": 0.22181005775928497, + "learning_rate": 3.8085250290137776e-05, + "loss": 0.1525, + "step": 21988 + }, + { + "epoch": 0.39219848036243, + "grad_norm": 0.29537370800971985, + "learning_rate": 3.8083923994300466e-05, + "loss": 0.2184, + "step": 21989 + }, + { + "epoch": 0.3922163164841437, + "grad_norm": 0.2631196677684784, + "learning_rate": 3.808259764774536e-05, + "loss": 0.1687, + "step": 21990 + }, + { + "epoch": 0.39223415260585737, + "grad_norm": 0.25447559356689453, + "learning_rate": 3.808127125047757e-05, + "loss": 0.134, + "step": 21991 + }, + { + "epoch": 0.39225198872757105, + "grad_norm": 0.2882976233959198, + "learning_rate": 3.8079944802502265e-05, + "loss": 0.157, + "step": 21992 + }, + { + "epoch": 0.3922698248492848, + "grad_norm": 0.3343904912471771, + "learning_rate": 3.807861830382457e-05, + "loss": 0.1216, + "step": 21993 + }, + { + "epoch": 0.3922876609709985, + "grad_norm": 0.31246238946914673, + "learning_rate": 3.807729175444965e-05, + "loss": 0.1805, + "step": 21994 + }, + { + "epoch": 0.3923054970927122, + "grad_norm": 0.23825860023498535, + "learning_rate": 3.807596515438262e-05, + "loss": 0.1401, + "step": 21995 + }, + { + "epoch": 0.39232333321442586, + "grad_norm": 0.2660251259803772, + "learning_rate": 3.8074638503628626e-05, + "loss": 0.1385, + "step": 21996 + }, + { + "epoch": 0.39234116933613955, + "grad_norm": 0.2345874309539795, + "learning_rate": 3.807331180219282e-05, + "loss": 0.1474, + "step": 21997 + }, + { + "epoch": 0.39235900545785324, + "grad_norm": 0.25704678893089294, + "learning_rate": 3.8071985050080345e-05, + "loss": 0.1405, + "step": 21998 + }, + { + "epoch": 0.3923768415795669, + "grad_norm": 0.313232421875, + "learning_rate": 3.8070658247296344e-05, + "loss": 0.1126, + "step": 21999 + }, + { + "epoch": 0.3923946777012806, + "grad_norm": 0.27740153670310974, + "learning_rate": 3.8069331393845956e-05, + "loss": 0.1568, + "step": 22000 + }, + { + "epoch": 0.3923946777012806, + "eval_loss": 0.15289585292339325, + "eval_runtime": 107.1293, + "eval_samples_per_second": 9.559, + "eval_steps_per_second": 1.596, + "step": 22000 + }, + { + "epoch": 0.3924125138229943, + "grad_norm": 0.2480067014694214, + "learning_rate": 3.8068004489734324e-05, + "loss": 0.161, + "step": 22001 + }, + { + "epoch": 0.39243034994470805, + "grad_norm": 0.31506189703941345, + "learning_rate": 3.8066677534966585e-05, + "loss": 0.2246, + "step": 22002 + }, + { + "epoch": 0.39244818606642173, + "grad_norm": 0.19158689677715302, + "learning_rate": 3.80653505295479e-05, + "loss": 0.1396, + "step": 22003 + }, + { + "epoch": 0.3924660221881354, + "grad_norm": 0.3690859079360962, + "learning_rate": 3.8064023473483404e-05, + "loss": 0.1614, + "step": 22004 + }, + { + "epoch": 0.3924838583098491, + "grad_norm": 0.21442316472530365, + "learning_rate": 3.8062696366778236e-05, + "loss": 0.1873, + "step": 22005 + }, + { + "epoch": 0.3925016944315628, + "grad_norm": 0.2317836433649063, + "learning_rate": 3.806136920943755e-05, + "loss": 0.164, + "step": 22006 + }, + { + "epoch": 0.3925195305532765, + "grad_norm": 0.200971320271492, + "learning_rate": 3.8060042001466485e-05, + "loss": 0.1537, + "step": 22007 + }, + { + "epoch": 0.3925373666749902, + "grad_norm": 0.2796572148799896, + "learning_rate": 3.805871474287018e-05, + "loss": 0.0724, + "step": 22008 + }, + { + "epoch": 0.39255520279670386, + "grad_norm": 0.247846782207489, + "learning_rate": 3.8057387433653795e-05, + "loss": 0.1694, + "step": 22009 + }, + { + "epoch": 0.3925730389184176, + "grad_norm": 0.33010581135749817, + "learning_rate": 3.8056060073822466e-05, + "loss": 0.1605, + "step": 22010 + }, + { + "epoch": 0.3925908750401313, + "grad_norm": 0.26217830181121826, + "learning_rate": 3.805473266338133e-05, + "loss": 0.1904, + "step": 22011 + }, + { + "epoch": 0.392608711161845, + "grad_norm": 0.303500235080719, + "learning_rate": 3.805340520233555e-05, + "loss": 0.174, + "step": 22012 + }, + { + "epoch": 0.39262654728355867, + "grad_norm": 0.27895060181617737, + "learning_rate": 3.805207769069026e-05, + "loss": 0.1717, + "step": 22013 + }, + { + "epoch": 0.39264438340527236, + "grad_norm": 0.2857508659362793, + "learning_rate": 3.805075012845061e-05, + "loss": 0.1142, + "step": 22014 + }, + { + "epoch": 0.39266221952698604, + "grad_norm": 0.33048808574676514, + "learning_rate": 3.804942251562174e-05, + "loss": 0.1298, + "step": 22015 + }, + { + "epoch": 0.39268005564869973, + "grad_norm": 0.20053231716156006, + "learning_rate": 3.804809485220881e-05, + "loss": 0.1246, + "step": 22016 + }, + { + "epoch": 0.3926978917704134, + "grad_norm": 0.2536463737487793, + "learning_rate": 3.804676713821695e-05, + "loss": 0.1166, + "step": 22017 + }, + { + "epoch": 0.3927157278921271, + "grad_norm": 0.24449673295021057, + "learning_rate": 3.804543937365131e-05, + "loss": 0.1449, + "step": 22018 + }, + { + "epoch": 0.39273356401384085, + "grad_norm": 0.25147151947021484, + "learning_rate": 3.804411155851706e-05, + "loss": 0.1417, + "step": 22019 + }, + { + "epoch": 0.39275140013555454, + "grad_norm": 0.22726039588451385, + "learning_rate": 3.804278369281931e-05, + "loss": 0.128, + "step": 22020 + }, + { + "epoch": 0.39276923625726823, + "grad_norm": 0.23316025733947754, + "learning_rate": 3.804145577656324e-05, + "loss": 0.1225, + "step": 22021 + }, + { + "epoch": 0.3927870723789819, + "grad_norm": 0.2329714596271515, + "learning_rate": 3.8040127809753966e-05, + "loss": 0.1555, + "step": 22022 + }, + { + "epoch": 0.3928049085006956, + "grad_norm": 0.26174619793891907, + "learning_rate": 3.803879979239667e-05, + "loss": 0.1559, + "step": 22023 + }, + { + "epoch": 0.3928227446224093, + "grad_norm": 0.31746765971183777, + "learning_rate": 3.803747172449647e-05, + "loss": 0.2248, + "step": 22024 + }, + { + "epoch": 0.392840580744123, + "grad_norm": 0.2919473648071289, + "learning_rate": 3.803614360605853e-05, + "loss": 0.195, + "step": 22025 + }, + { + "epoch": 0.39285841686583667, + "grad_norm": 0.2725481688976288, + "learning_rate": 3.803481543708799e-05, + "loss": 0.1591, + "step": 22026 + }, + { + "epoch": 0.3928762529875504, + "grad_norm": 0.2553013563156128, + "learning_rate": 3.803348721759002e-05, + "loss": 0.1602, + "step": 22027 + }, + { + "epoch": 0.3928940891092641, + "grad_norm": 0.32993438839912415, + "learning_rate": 3.803215894756973e-05, + "loss": 0.1637, + "step": 22028 + }, + { + "epoch": 0.3929119252309778, + "grad_norm": 0.2857172191143036, + "learning_rate": 3.803083062703231e-05, + "loss": 0.1502, + "step": 22029 + }, + { + "epoch": 0.3929297613526915, + "grad_norm": 0.2713981568813324, + "learning_rate": 3.8029502255982875e-05, + "loss": 0.16, + "step": 22030 + }, + { + "epoch": 0.39294759747440516, + "grad_norm": 0.24887074530124664, + "learning_rate": 3.80281738344266e-05, + "loss": 0.2073, + "step": 22031 + }, + { + "epoch": 0.39296543359611885, + "grad_norm": 0.34048402309417725, + "learning_rate": 3.802684536236862e-05, + "loss": 0.1989, + "step": 22032 + }, + { + "epoch": 0.39298326971783254, + "grad_norm": 0.22031594812870026, + "learning_rate": 3.802551683981408e-05, + "loss": 0.1838, + "step": 22033 + }, + { + "epoch": 0.3930011058395462, + "grad_norm": 0.2398686408996582, + "learning_rate": 3.802418826676815e-05, + "loss": 0.1797, + "step": 22034 + }, + { + "epoch": 0.39301894196125997, + "grad_norm": 0.32442519068717957, + "learning_rate": 3.8022859643235966e-05, + "loss": 0.1654, + "step": 22035 + }, + { + "epoch": 0.39303677808297366, + "grad_norm": 0.23995022475719452, + "learning_rate": 3.802153096922267e-05, + "loss": 0.1665, + "step": 22036 + }, + { + "epoch": 0.39305461420468735, + "grad_norm": 0.21244728565216064, + "learning_rate": 3.802020224473343e-05, + "loss": 0.1599, + "step": 22037 + }, + { + "epoch": 0.39307245032640103, + "grad_norm": 0.3076680302619934, + "learning_rate": 3.8018873469773386e-05, + "loss": 0.1303, + "step": 22038 + }, + { + "epoch": 0.3930902864481147, + "grad_norm": 0.2760683596134186, + "learning_rate": 3.8017544644347694e-05, + "loss": 0.1435, + "step": 22039 + }, + { + "epoch": 0.3931081225698284, + "grad_norm": 0.28017204999923706, + "learning_rate": 3.8016215768461505e-05, + "loss": 0.2137, + "step": 22040 + }, + { + "epoch": 0.3931259586915421, + "grad_norm": 0.32485297322273254, + "learning_rate": 3.801488684211997e-05, + "loss": 0.1693, + "step": 22041 + }, + { + "epoch": 0.3931437948132558, + "grad_norm": 0.22792267799377441, + "learning_rate": 3.801355786532823e-05, + "loss": 0.1126, + "step": 22042 + }, + { + "epoch": 0.3931616309349695, + "grad_norm": 0.3081381916999817, + "learning_rate": 3.801222883809145e-05, + "loss": 0.1653, + "step": 22043 + }, + { + "epoch": 0.3931794670566832, + "grad_norm": 0.28138798475265503, + "learning_rate": 3.801089976041478e-05, + "loss": 0.1563, + "step": 22044 + }, + { + "epoch": 0.3931973031783969, + "grad_norm": 0.38139185309410095, + "learning_rate": 3.800957063230336e-05, + "loss": 0.1471, + "step": 22045 + }, + { + "epoch": 0.3932151393001106, + "grad_norm": 0.19801881909370422, + "learning_rate": 3.800824145376236e-05, + "loss": 0.1639, + "step": 22046 + }, + { + "epoch": 0.3932329754218243, + "grad_norm": 0.23384051024913788, + "learning_rate": 3.800691222479692e-05, + "loss": 0.1333, + "step": 22047 + }, + { + "epoch": 0.39325081154353797, + "grad_norm": 0.28682368993759155, + "learning_rate": 3.800558294541219e-05, + "loss": 0.1937, + "step": 22048 + }, + { + "epoch": 0.39326864766525166, + "grad_norm": 0.2321326583623886, + "learning_rate": 3.800425361561334e-05, + "loss": 0.1601, + "step": 22049 + }, + { + "epoch": 0.39328648378696535, + "grad_norm": 0.26319414377212524, + "learning_rate": 3.80029242354055e-05, + "loss": 0.1716, + "step": 22050 + }, + { + "epoch": 0.39330431990867903, + "grad_norm": 0.3451308608055115, + "learning_rate": 3.8001594804793836e-05, + "loss": 0.2004, + "step": 22051 + }, + { + "epoch": 0.3933221560303928, + "grad_norm": 0.34930896759033203, + "learning_rate": 3.800026532378351e-05, + "loss": 0.1627, + "step": 22052 + }, + { + "epoch": 0.39333999215210647, + "grad_norm": 0.17884886264801025, + "learning_rate": 3.799893579237965e-05, + "loss": 0.1142, + "step": 22053 + }, + { + "epoch": 0.39335782827382015, + "grad_norm": 0.27442896366119385, + "learning_rate": 3.7997606210587434e-05, + "loss": 0.169, + "step": 22054 + }, + { + "epoch": 0.39337566439553384, + "grad_norm": 0.2611648440361023, + "learning_rate": 3.7996276578412015e-05, + "loss": 0.1393, + "step": 22055 + }, + { + "epoch": 0.39339350051724753, + "grad_norm": 0.26651236414909363, + "learning_rate": 3.799494689585853e-05, + "loss": 0.1643, + "step": 22056 + }, + { + "epoch": 0.3934113366389612, + "grad_norm": 0.270926296710968, + "learning_rate": 3.799361716293214e-05, + "loss": 0.1672, + "step": 22057 + }, + { + "epoch": 0.3934291727606749, + "grad_norm": 0.24135589599609375, + "learning_rate": 3.7992287379638e-05, + "loss": 0.1981, + "step": 22058 + }, + { + "epoch": 0.3934470088823886, + "grad_norm": 0.29821598529815674, + "learning_rate": 3.799095754598128e-05, + "loss": 0.2353, + "step": 22059 + }, + { + "epoch": 0.3934648450041023, + "grad_norm": 0.24282781779766083, + "learning_rate": 3.798962766196712e-05, + "loss": 0.1557, + "step": 22060 + }, + { + "epoch": 0.393482681125816, + "grad_norm": 0.22573618590831757, + "learning_rate": 3.798829772760067e-05, + "loss": 0.1527, + "step": 22061 + }, + { + "epoch": 0.3935005172475297, + "grad_norm": 0.1954156756401062, + "learning_rate": 3.798696774288709e-05, + "loss": 0.1307, + "step": 22062 + }, + { + "epoch": 0.3935183533692434, + "grad_norm": 0.3251855671405792, + "learning_rate": 3.798563770783153e-05, + "loss": 0.1541, + "step": 22063 + }, + { + "epoch": 0.3935361894909571, + "grad_norm": 0.2626895010471344, + "learning_rate": 3.7984307622439174e-05, + "loss": 0.1764, + "step": 22064 + }, + { + "epoch": 0.3935540256126708, + "grad_norm": 0.22068209946155548, + "learning_rate": 3.798297748671514e-05, + "loss": 0.1235, + "step": 22065 + }, + { + "epoch": 0.39357186173438446, + "grad_norm": 0.2717522978782654, + "learning_rate": 3.798164730066461e-05, + "loss": 0.111, + "step": 22066 + }, + { + "epoch": 0.39358969785609815, + "grad_norm": 0.1987258791923523, + "learning_rate": 3.798031706429273e-05, + "loss": 0.1106, + "step": 22067 + }, + { + "epoch": 0.39360753397781184, + "grad_norm": 0.23701933026313782, + "learning_rate": 3.797898677760465e-05, + "loss": 0.1199, + "step": 22068 + }, + { + "epoch": 0.3936253700995256, + "grad_norm": 0.32771843671798706, + "learning_rate": 3.7977656440605547e-05, + "loss": 0.1536, + "step": 22069 + }, + { + "epoch": 0.3936432062212393, + "grad_norm": 0.2665507197380066, + "learning_rate": 3.797632605330056e-05, + "loss": 0.153, + "step": 22070 + }, + { + "epoch": 0.39366104234295296, + "grad_norm": 0.2856513559818268, + "learning_rate": 3.797499561569485e-05, + "loss": 0.141, + "step": 22071 + }, + { + "epoch": 0.39367887846466665, + "grad_norm": 0.2678152322769165, + "learning_rate": 3.797366512779358e-05, + "loss": 0.1661, + "step": 22072 + }, + { + "epoch": 0.39369671458638034, + "grad_norm": 0.26610809564590454, + "learning_rate": 3.7972334589601896e-05, + "loss": 0.1872, + "step": 22073 + }, + { + "epoch": 0.393714550708094, + "grad_norm": 0.22589686512947083, + "learning_rate": 3.7971004001124965e-05, + "loss": 0.1947, + "step": 22074 + }, + { + "epoch": 0.3937323868298077, + "grad_norm": 0.23755650222301483, + "learning_rate": 3.796967336236794e-05, + "loss": 0.1542, + "step": 22075 + }, + { + "epoch": 0.3937502229515214, + "grad_norm": 0.25805798172950745, + "learning_rate": 3.796834267333599e-05, + "loss": 0.1975, + "step": 22076 + }, + { + "epoch": 0.3937680590732351, + "grad_norm": 0.21485163271427155, + "learning_rate": 3.796701193403426e-05, + "loss": 0.149, + "step": 22077 + }, + { + "epoch": 0.39378589519494883, + "grad_norm": 0.27399903535842896, + "learning_rate": 3.7965681144467916e-05, + "loss": 0.1433, + "step": 22078 + }, + { + "epoch": 0.3938037313166625, + "grad_norm": 0.3664385974407196, + "learning_rate": 3.79643503046421e-05, + "loss": 0.2024, + "step": 22079 + }, + { + "epoch": 0.3938215674383762, + "grad_norm": 0.33587291836738586, + "learning_rate": 3.7963019414562e-05, + "loss": 0.1486, + "step": 22080 + }, + { + "epoch": 0.3938394035600899, + "grad_norm": 0.5331902503967285, + "learning_rate": 3.7961688474232754e-05, + "loss": 0.2395, + "step": 22081 + }, + { + "epoch": 0.3938572396818036, + "grad_norm": 0.2669908404350281, + "learning_rate": 3.7960357483659525e-05, + "loss": 0.1929, + "step": 22082 + }, + { + "epoch": 0.39387507580351727, + "grad_norm": 0.26761671900749207, + "learning_rate": 3.795902644284748e-05, + "loss": 0.1697, + "step": 22083 + }, + { + "epoch": 0.39389291192523096, + "grad_norm": 0.27993664145469666, + "learning_rate": 3.795769535180176e-05, + "loss": 0.1688, + "step": 22084 + }, + { + "epoch": 0.39391074804694465, + "grad_norm": 0.25549808144569397, + "learning_rate": 3.795636421052755e-05, + "loss": 0.1345, + "step": 22085 + }, + { + "epoch": 0.3939285841686584, + "grad_norm": 0.26063039898872375, + "learning_rate": 3.795503301902999e-05, + "loss": 0.1487, + "step": 22086 + }, + { + "epoch": 0.3939464202903721, + "grad_norm": 0.21613682806491852, + "learning_rate": 3.795370177731425e-05, + "loss": 0.1573, + "step": 22087 + }, + { + "epoch": 0.39396425641208577, + "grad_norm": 0.24348105490207672, + "learning_rate": 3.795237048538549e-05, + "loss": 0.1602, + "step": 22088 + }, + { + "epoch": 0.39398209253379946, + "grad_norm": 0.25034475326538086, + "learning_rate": 3.795103914324887e-05, + "loss": 0.1727, + "step": 22089 + }, + { + "epoch": 0.39399992865551314, + "grad_norm": 0.22405272722244263, + "learning_rate": 3.794970775090955e-05, + "loss": 0.15, + "step": 22090 + }, + { + "epoch": 0.39401776477722683, + "grad_norm": 0.1801377832889557, + "learning_rate": 3.794837630837268e-05, + "loss": 0.1121, + "step": 22091 + }, + { + "epoch": 0.3940356008989405, + "grad_norm": 0.36839425563812256, + "learning_rate": 3.794704481564344e-05, + "loss": 0.2228, + "step": 22092 + }, + { + "epoch": 0.3940534370206542, + "grad_norm": 0.2218928039073944, + "learning_rate": 3.794571327272698e-05, + "loss": 0.1595, + "step": 22093 + }, + { + "epoch": 0.39407127314236795, + "grad_norm": 0.2231508046388626, + "learning_rate": 3.794438167962846e-05, + "loss": 0.1874, + "step": 22094 + }, + { + "epoch": 0.39408910926408164, + "grad_norm": 0.2118954062461853, + "learning_rate": 3.794305003635305e-05, + "loss": 0.1369, + "step": 22095 + }, + { + "epoch": 0.3941069453857953, + "grad_norm": 0.2247815579175949, + "learning_rate": 3.7941718342905905e-05, + "loss": 0.1593, + "step": 22096 + }, + { + "epoch": 0.394124781507509, + "grad_norm": 0.3307328224182129, + "learning_rate": 3.79403865992922e-05, + "loss": 0.1784, + "step": 22097 + }, + { + "epoch": 0.3941426176292227, + "grad_norm": 0.2795007526874542, + "learning_rate": 3.793905480551708e-05, + "loss": 0.1423, + "step": 22098 + }, + { + "epoch": 0.3941604537509364, + "grad_norm": 0.22355566918849945, + "learning_rate": 3.793772296158571e-05, + "loss": 0.1173, + "step": 22099 + }, + { + "epoch": 0.3941782898726501, + "grad_norm": 0.3083060383796692, + "learning_rate": 3.793639106750326e-05, + "loss": 0.1703, + "step": 22100 + }, + { + "epoch": 0.39419612599436377, + "grad_norm": 0.2806645631790161, + "learning_rate": 3.7935059123274895e-05, + "loss": 0.1689, + "step": 22101 + }, + { + "epoch": 0.39421396211607745, + "grad_norm": 0.34339267015457153, + "learning_rate": 3.793372712890576e-05, + "loss": 0.133, + "step": 22102 + }, + { + "epoch": 0.3942317982377912, + "grad_norm": 0.1788356453180313, + "learning_rate": 3.793239508440105e-05, + "loss": 0.1014, + "step": 22103 + }, + { + "epoch": 0.3942496343595049, + "grad_norm": 0.25159740447998047, + "learning_rate": 3.7931062989765896e-05, + "loss": 0.1134, + "step": 22104 + }, + { + "epoch": 0.3942674704812186, + "grad_norm": 0.2123439460992813, + "learning_rate": 3.792973084500548e-05, + "loss": 0.1578, + "step": 22105 + }, + { + "epoch": 0.39428530660293226, + "grad_norm": 0.3405452370643616, + "learning_rate": 3.792839865012496e-05, + "loss": 0.2322, + "step": 22106 + }, + { + "epoch": 0.39430314272464595, + "grad_norm": 0.19305455684661865, + "learning_rate": 3.7927066405129515e-05, + "loss": 0.1152, + "step": 22107 + }, + { + "epoch": 0.39432097884635964, + "grad_norm": 0.3004809319972992, + "learning_rate": 3.792573411002428e-05, + "loss": 0.1843, + "step": 22108 + }, + { + "epoch": 0.3943388149680733, + "grad_norm": 0.29001888632774353, + "learning_rate": 3.7924401764814436e-05, + "loss": 0.1585, + "step": 22109 + }, + { + "epoch": 0.394356651089787, + "grad_norm": 0.24814005196094513, + "learning_rate": 3.792306936950515e-05, + "loss": 0.1451, + "step": 22110 + }, + { + "epoch": 0.39437448721150076, + "grad_norm": 0.2911388576030731, + "learning_rate": 3.792173692410159e-05, + "loss": 0.1994, + "step": 22111 + }, + { + "epoch": 0.39439232333321445, + "grad_norm": 1.0766066312789917, + "learning_rate": 3.7920404428608905e-05, + "loss": 0.2033, + "step": 22112 + }, + { + "epoch": 0.39441015945492813, + "grad_norm": 0.21739526093006134, + "learning_rate": 3.7919071883032276e-05, + "loss": 0.2058, + "step": 22113 + }, + { + "epoch": 0.3944279955766418, + "grad_norm": 0.23893605172634125, + "learning_rate": 3.791773928737685e-05, + "loss": 0.1465, + "step": 22114 + }, + { + "epoch": 0.3944458316983555, + "grad_norm": 0.27785786986351013, + "learning_rate": 3.791640664164782e-05, + "loss": 0.1663, + "step": 22115 + }, + { + "epoch": 0.3944636678200692, + "grad_norm": 0.19211722910404205, + "learning_rate": 3.791507394585033e-05, + "loss": 0.0984, + "step": 22116 + }, + { + "epoch": 0.3944815039417829, + "grad_norm": 0.2102123647928238, + "learning_rate": 3.7913741199989556e-05, + "loss": 0.1285, + "step": 22117 + }, + { + "epoch": 0.3944993400634966, + "grad_norm": 0.3376919627189636, + "learning_rate": 3.791240840407066e-05, + "loss": 0.1439, + "step": 22118 + }, + { + "epoch": 0.39451717618521026, + "grad_norm": 0.2521054446697235, + "learning_rate": 3.791107555809881e-05, + "loss": 0.1594, + "step": 22119 + }, + { + "epoch": 0.394535012306924, + "grad_norm": 0.26240217685699463, + "learning_rate": 3.7909742662079165e-05, + "loss": 0.1593, + "step": 22120 + }, + { + "epoch": 0.3945528484286377, + "grad_norm": 0.3223896324634552, + "learning_rate": 3.79084097160169e-05, + "loss": 0.1718, + "step": 22121 + }, + { + "epoch": 0.3945706845503514, + "grad_norm": 0.24559205770492554, + "learning_rate": 3.790707671991719e-05, + "loss": 0.1484, + "step": 22122 + }, + { + "epoch": 0.39458852067206507, + "grad_norm": 0.2500097155570984, + "learning_rate": 3.790574367378518e-05, + "loss": 0.1589, + "step": 22123 + }, + { + "epoch": 0.39460635679377876, + "grad_norm": 0.2428676187992096, + "learning_rate": 3.790441057762606e-05, + "loss": 0.1567, + "step": 22124 + }, + { + "epoch": 0.39462419291549244, + "grad_norm": 0.3735401928424835, + "learning_rate": 3.790307743144499e-05, + "loss": 0.1904, + "step": 22125 + }, + { + "epoch": 0.39464202903720613, + "grad_norm": 0.2904157340526581, + "learning_rate": 3.790174423524713e-05, + "loss": 0.1492, + "step": 22126 + }, + { + "epoch": 0.3946598651589198, + "grad_norm": 0.21411994099617004, + "learning_rate": 3.790041098903765e-05, + "loss": 0.1638, + "step": 22127 + }, + { + "epoch": 0.39467770128063356, + "grad_norm": 0.29817232489585876, + "learning_rate": 3.7899077692821724e-05, + "loss": 0.1892, + "step": 22128 + }, + { + "epoch": 0.39469553740234725, + "grad_norm": 0.2873034179210663, + "learning_rate": 3.7897744346604515e-05, + "loss": 0.1507, + "step": 22129 + }, + { + "epoch": 0.39471337352406094, + "grad_norm": 0.29415106773376465, + "learning_rate": 3.789641095039119e-05, + "loss": 0.1887, + "step": 22130 + }, + { + "epoch": 0.39473120964577463, + "grad_norm": 0.22156280279159546, + "learning_rate": 3.7895077504186936e-05, + "loss": 0.1575, + "step": 22131 + }, + { + "epoch": 0.3947490457674883, + "grad_norm": 0.3194652795791626, + "learning_rate": 3.78937440079969e-05, + "loss": 0.2154, + "step": 22132 + }, + { + "epoch": 0.394766881889202, + "grad_norm": 0.3019934296607971, + "learning_rate": 3.789241046182626e-05, + "loss": 0.1236, + "step": 22133 + }, + { + "epoch": 0.3947847180109157, + "grad_norm": 0.2365180402994156, + "learning_rate": 3.789107686568018e-05, + "loss": 0.1471, + "step": 22134 + }, + { + "epoch": 0.3948025541326294, + "grad_norm": 0.25868576765060425, + "learning_rate": 3.7889743219563844e-05, + "loss": 0.147, + "step": 22135 + }, + { + "epoch": 0.3948203902543431, + "grad_norm": 0.22758358716964722, + "learning_rate": 3.78884095234824e-05, + "loss": 0.1384, + "step": 22136 + }, + { + "epoch": 0.3948382263760568, + "grad_norm": 0.2360512614250183, + "learning_rate": 3.788707577744103e-05, + "loss": 0.1284, + "step": 22137 + }, + { + "epoch": 0.3948560624977705, + "grad_norm": 0.19539006054401398, + "learning_rate": 3.7885741981444914e-05, + "loss": 0.1389, + "step": 22138 + }, + { + "epoch": 0.3948738986194842, + "grad_norm": 0.34173741936683655, + "learning_rate": 3.78844081354992e-05, + "loss": 0.1732, + "step": 22139 + }, + { + "epoch": 0.3948917347411979, + "grad_norm": 0.1790994256734848, + "learning_rate": 3.788307423960907e-05, + "loss": 0.1578, + "step": 22140 + }, + { + "epoch": 0.39490957086291156, + "grad_norm": 0.19829240441322327, + "learning_rate": 3.78817402937797e-05, + "loss": 0.1373, + "step": 22141 + }, + { + "epoch": 0.39492740698462525, + "grad_norm": 0.21088539063930511, + "learning_rate": 3.788040629801626e-05, + "loss": 0.159, + "step": 22142 + }, + { + "epoch": 0.39494524310633894, + "grad_norm": 0.26099902391433716, + "learning_rate": 3.78790722523239e-05, + "loss": 0.0941, + "step": 22143 + }, + { + "epoch": 0.3949630792280526, + "grad_norm": 0.28984883427619934, + "learning_rate": 3.7877738156707826e-05, + "loss": 0.1461, + "step": 22144 + }, + { + "epoch": 0.39498091534976637, + "grad_norm": 0.2620595097541809, + "learning_rate": 3.7876404011173184e-05, + "loss": 0.2001, + "step": 22145 + }, + { + "epoch": 0.39499875147148006, + "grad_norm": 0.25255337357521057, + "learning_rate": 3.7875069815725154e-05, + "loss": 0.1438, + "step": 22146 + }, + { + "epoch": 0.39501658759319375, + "grad_norm": 0.21990849077701569, + "learning_rate": 3.78737355703689e-05, + "loss": 0.1665, + "step": 22147 + }, + { + "epoch": 0.39503442371490743, + "grad_norm": 0.24035438895225525, + "learning_rate": 3.787240127510961e-05, + "loss": 0.1826, + "step": 22148 + }, + { + "epoch": 0.3950522598366211, + "grad_norm": 0.3047240674495697, + "learning_rate": 3.7871066929952436e-05, + "loss": 0.1259, + "step": 22149 + }, + { + "epoch": 0.3950700959583348, + "grad_norm": 0.3384449779987335, + "learning_rate": 3.786973253490257e-05, + "loss": 0.1539, + "step": 22150 + }, + { + "epoch": 0.3950879320800485, + "grad_norm": 0.3169119954109192, + "learning_rate": 3.786839808996517e-05, + "loss": 0.1561, + "step": 22151 + }, + { + "epoch": 0.3951057682017622, + "grad_norm": 0.3256834149360657, + "learning_rate": 3.7867063595145414e-05, + "loss": 0.1343, + "step": 22152 + }, + { + "epoch": 0.39512360432347593, + "grad_norm": 0.28872379660606384, + "learning_rate": 3.786572905044848e-05, + "loss": 0.1954, + "step": 22153 + }, + { + "epoch": 0.3951414404451896, + "grad_norm": 0.3238251507282257, + "learning_rate": 3.7864394455879536e-05, + "loss": 0.1151, + "step": 22154 + }, + { + "epoch": 0.3951592765669033, + "grad_norm": 0.27694183588027954, + "learning_rate": 3.7863059811443755e-05, + "loss": 0.1521, + "step": 22155 + }, + { + "epoch": 0.395177112688617, + "grad_norm": 0.24887990951538086, + "learning_rate": 3.7861725117146316e-05, + "loss": 0.1931, + "step": 22156 + }, + { + "epoch": 0.3951949488103307, + "grad_norm": 0.21174845099449158, + "learning_rate": 3.7860390372992375e-05, + "loss": 0.1691, + "step": 22157 + }, + { + "epoch": 0.39521278493204437, + "grad_norm": 0.30390486121177673, + "learning_rate": 3.785905557898712e-05, + "loss": 0.1876, + "step": 22158 + }, + { + "epoch": 0.39523062105375806, + "grad_norm": 0.2500879168510437, + "learning_rate": 3.785772073513574e-05, + "loss": 0.1433, + "step": 22159 + }, + { + "epoch": 0.39524845717547175, + "grad_norm": 0.33439457416534424, + "learning_rate": 3.785638584144339e-05, + "loss": 0.1475, + "step": 22160 + }, + { + "epoch": 0.39526629329718543, + "grad_norm": 0.32102084159851074, + "learning_rate": 3.785505089791524e-05, + "loss": 0.1603, + "step": 22161 + }, + { + "epoch": 0.3952841294188992, + "grad_norm": 0.26970216631889343, + "learning_rate": 3.7853715904556473e-05, + "loss": 0.1781, + "step": 22162 + }, + { + "epoch": 0.39530196554061287, + "grad_norm": 0.3270612359046936, + "learning_rate": 3.785238086137227e-05, + "loss": 0.1474, + "step": 22163 + }, + { + "epoch": 0.39531980166232655, + "grad_norm": 0.2136305421590805, + "learning_rate": 3.7851045768367795e-05, + "loss": 0.1329, + "step": 22164 + }, + { + "epoch": 0.39533763778404024, + "grad_norm": 0.2370368391275406, + "learning_rate": 3.784971062554823e-05, + "loss": 0.1869, + "step": 22165 + }, + { + "epoch": 0.39535547390575393, + "grad_norm": 0.2910391390323639, + "learning_rate": 3.784837543291875e-05, + "loss": 0.1436, + "step": 22166 + }, + { + "epoch": 0.3953733100274676, + "grad_norm": 0.37344807386398315, + "learning_rate": 3.784704019048452e-05, + "loss": 0.1599, + "step": 22167 + }, + { + "epoch": 0.3953911461491813, + "grad_norm": 0.21428127586841583, + "learning_rate": 3.784570489825073e-05, + "loss": 0.117, + "step": 22168 + }, + { + "epoch": 0.395408982270895, + "grad_norm": 0.22344574332237244, + "learning_rate": 3.784436955622256e-05, + "loss": 0.1478, + "step": 22169 + }, + { + "epoch": 0.39542681839260874, + "grad_norm": 0.22434456646442413, + "learning_rate": 3.784303416440517e-05, + "loss": 0.1709, + "step": 22170 + }, + { + "epoch": 0.3954446545143224, + "grad_norm": 0.3279266953468323, + "learning_rate": 3.7841698722803736e-05, + "loss": 0.2596, + "step": 22171 + }, + { + "epoch": 0.3954624906360361, + "grad_norm": 0.23844437301158905, + "learning_rate": 3.784036323142345e-05, + "loss": 0.1516, + "step": 22172 + }, + { + "epoch": 0.3954803267577498, + "grad_norm": 0.18370580673217773, + "learning_rate": 3.783902769026948e-05, + "loss": 0.1142, + "step": 22173 + }, + { + "epoch": 0.3954981628794635, + "grad_norm": 0.37334781885147095, + "learning_rate": 3.7837692099347014e-05, + "loss": 0.2122, + "step": 22174 + }, + { + "epoch": 0.3955159990011772, + "grad_norm": 0.32715171575546265, + "learning_rate": 3.78363564586612e-05, + "loss": 0.1851, + "step": 22175 + }, + { + "epoch": 0.39553383512289086, + "grad_norm": 0.2590390145778656, + "learning_rate": 3.783502076821726e-05, + "loss": 0.1726, + "step": 22176 + }, + { + "epoch": 0.39555167124460455, + "grad_norm": 0.36881256103515625, + "learning_rate": 3.7833685028020327e-05, + "loss": 0.2004, + "step": 22177 + }, + { + "epoch": 0.39556950736631824, + "grad_norm": 0.38517460227012634, + "learning_rate": 3.78323492380756e-05, + "loss": 0.2125, + "step": 22178 + }, + { + "epoch": 0.395587343488032, + "grad_norm": 0.18701235949993134, + "learning_rate": 3.783101339838826e-05, + "loss": 0.1699, + "step": 22179 + }, + { + "epoch": 0.39560517960974567, + "grad_norm": 0.43048009276390076, + "learning_rate": 3.782967750896348e-05, + "loss": 0.1691, + "step": 22180 + }, + { + "epoch": 0.39562301573145936, + "grad_norm": 0.26353105902671814, + "learning_rate": 3.782834156980643e-05, + "loss": 0.1758, + "step": 22181 + }, + { + "epoch": 0.39564085185317305, + "grad_norm": 0.3236018419265747, + "learning_rate": 3.7827005580922316e-05, + "loss": 0.1492, + "step": 22182 + }, + { + "epoch": 0.39565868797488674, + "grad_norm": 0.22929665446281433, + "learning_rate": 3.782566954231629e-05, + "loss": 0.1317, + "step": 22183 + }, + { + "epoch": 0.3956765240966004, + "grad_norm": 0.23387475311756134, + "learning_rate": 3.782433345399353e-05, + "loss": 0.1433, + "step": 22184 + }, + { + "epoch": 0.3956943602183141, + "grad_norm": 0.3089311420917511, + "learning_rate": 3.782299731595923e-05, + "loss": 0.1411, + "step": 22185 + }, + { + "epoch": 0.3957121963400278, + "grad_norm": 0.4106653332710266, + "learning_rate": 3.782166112821855e-05, + "loss": 0.1611, + "step": 22186 + }, + { + "epoch": 0.39573003246174154, + "grad_norm": 0.242791548371315, + "learning_rate": 3.782032489077671e-05, + "loss": 0.1436, + "step": 22187 + }, + { + "epoch": 0.39574786858345523, + "grad_norm": 0.2590719759464264, + "learning_rate": 3.781898860363885e-05, + "loss": 0.1918, + "step": 22188 + }, + { + "epoch": 0.3957657047051689, + "grad_norm": 0.25568678975105286, + "learning_rate": 3.781765226681016e-05, + "loss": 0.1872, + "step": 22189 + }, + { + "epoch": 0.3957835408268826, + "grad_norm": 0.28192266821861267, + "learning_rate": 3.781631588029583e-05, + "loss": 0.1826, + "step": 22190 + }, + { + "epoch": 0.3958013769485963, + "grad_norm": 0.30621835589408875, + "learning_rate": 3.781497944410102e-05, + "loss": 0.142, + "step": 22191 + }, + { + "epoch": 0.39581921307031, + "grad_norm": 0.389258474111557, + "learning_rate": 3.781364295823093e-05, + "loss": 0.189, + "step": 22192 + }, + { + "epoch": 0.39583704919202367, + "grad_norm": 0.21447554230690002, + "learning_rate": 3.781230642269073e-05, + "loss": 0.1682, + "step": 22193 + }, + { + "epoch": 0.39585488531373736, + "grad_norm": 0.29244545102119446, + "learning_rate": 3.781096983748562e-05, + "loss": 0.1636, + "step": 22194 + }, + { + "epoch": 0.3958727214354511, + "grad_norm": 0.30486592650413513, + "learning_rate": 3.780963320262075e-05, + "loss": 0.1675, + "step": 22195 + }, + { + "epoch": 0.3958905575571648, + "grad_norm": 0.24616453051567078, + "learning_rate": 3.7808296518101334e-05, + "loss": 0.1936, + "step": 22196 + }, + { + "epoch": 0.3959083936788785, + "grad_norm": 0.2412756085395813, + "learning_rate": 3.780695978393253e-05, + "loss": 0.1565, + "step": 22197 + }, + { + "epoch": 0.39592622980059217, + "grad_norm": 0.25053858757019043, + "learning_rate": 3.780562300011952e-05, + "loss": 0.1249, + "step": 22198 + }, + { + "epoch": 0.39594406592230585, + "grad_norm": 0.26132482290267944, + "learning_rate": 3.780428616666749e-05, + "loss": 0.2577, + "step": 22199 + }, + { + "epoch": 0.39596190204401954, + "grad_norm": 0.23156170547008514, + "learning_rate": 3.7802949283581634e-05, + "loss": 0.1324, + "step": 22200 + }, + { + "epoch": 0.39597973816573323, + "grad_norm": 0.309425413608551, + "learning_rate": 3.780161235086712e-05, + "loss": 0.1115, + "step": 22201 + }, + { + "epoch": 0.3959975742874469, + "grad_norm": 0.2567881643772125, + "learning_rate": 3.780027536852914e-05, + "loss": 0.1254, + "step": 22202 + }, + { + "epoch": 0.3960154104091606, + "grad_norm": 0.5447714924812317, + "learning_rate": 3.7798938336572864e-05, + "loss": 0.1988, + "step": 22203 + }, + { + "epoch": 0.39603324653087435, + "grad_norm": 0.26598888635635376, + "learning_rate": 3.779760125500349e-05, + "loss": 0.1727, + "step": 22204 + }, + { + "epoch": 0.39605108265258804, + "grad_norm": 0.34892353415489197, + "learning_rate": 3.7796264123826185e-05, + "loss": 0.1434, + "step": 22205 + }, + { + "epoch": 0.3960689187743017, + "grad_norm": 0.27633213996887207, + "learning_rate": 3.779492694304614e-05, + "loss": 0.1531, + "step": 22206 + }, + { + "epoch": 0.3960867548960154, + "grad_norm": 0.23184406757354736, + "learning_rate": 3.7793589712668545e-05, + "loss": 0.1598, + "step": 22207 + }, + { + "epoch": 0.3961045910177291, + "grad_norm": 0.20744064450263977, + "learning_rate": 3.779225243269858e-05, + "loss": 0.1175, + "step": 22208 + }, + { + "epoch": 0.3961224271394428, + "grad_norm": 0.4271492063999176, + "learning_rate": 3.7790915103141425e-05, + "loss": 0.1515, + "step": 22209 + }, + { + "epoch": 0.3961402632611565, + "grad_norm": 0.27226418256759644, + "learning_rate": 3.778957772400226e-05, + "loss": 0.1531, + "step": 22210 + }, + { + "epoch": 0.39615809938287017, + "grad_norm": 0.24618516862392426, + "learning_rate": 3.778824029528628e-05, + "loss": 0.1661, + "step": 22211 + }, + { + "epoch": 0.3961759355045839, + "grad_norm": 0.26164987683296204, + "learning_rate": 3.778690281699866e-05, + "loss": 0.1401, + "step": 22212 + }, + { + "epoch": 0.3961937716262976, + "grad_norm": 0.22632980346679688, + "learning_rate": 3.778556528914459e-05, + "loss": 0.1127, + "step": 22213 + }, + { + "epoch": 0.3962116077480113, + "grad_norm": 0.30319634079933167, + "learning_rate": 3.778422771172925e-05, + "loss": 0.1264, + "step": 22214 + }, + { + "epoch": 0.396229443869725, + "grad_norm": 0.2946068346500397, + "learning_rate": 3.778289008475783e-05, + "loss": 0.1342, + "step": 22215 + }, + { + "epoch": 0.39624727999143866, + "grad_norm": 0.25682687759399414, + "learning_rate": 3.778155240823551e-05, + "loss": 0.1679, + "step": 22216 + }, + { + "epoch": 0.39626511611315235, + "grad_norm": 0.29321160912513733, + "learning_rate": 3.7780214682167484e-05, + "loss": 0.1814, + "step": 22217 + }, + { + "epoch": 0.39628295223486604, + "grad_norm": 0.3286098539829254, + "learning_rate": 3.7778876906558926e-05, + "loss": 0.136, + "step": 22218 + }, + { + "epoch": 0.3963007883565797, + "grad_norm": 0.1981721818447113, + "learning_rate": 3.777753908141503e-05, + "loss": 0.1721, + "step": 22219 + }, + { + "epoch": 0.3963186244782934, + "grad_norm": 0.16597972810268402, + "learning_rate": 3.777620120674098e-05, + "loss": 0.1545, + "step": 22220 + }, + { + "epoch": 0.39633646060000716, + "grad_norm": 0.3351482152938843, + "learning_rate": 3.777486328254196e-05, + "loss": 0.1784, + "step": 22221 + }, + { + "epoch": 0.39635429672172084, + "grad_norm": 0.37906208634376526, + "learning_rate": 3.777352530882316e-05, + "loss": 0.1893, + "step": 22222 + }, + { + "epoch": 0.39637213284343453, + "grad_norm": 0.2731074094772339, + "learning_rate": 3.7772187285589764e-05, + "loss": 0.1375, + "step": 22223 + }, + { + "epoch": 0.3963899689651482, + "grad_norm": 0.2367071658372879, + "learning_rate": 3.777084921284696e-05, + "loss": 0.1333, + "step": 22224 + }, + { + "epoch": 0.3964078050868619, + "grad_norm": 0.2556086778640747, + "learning_rate": 3.7769511090599926e-05, + "loss": 0.1646, + "step": 22225 + }, + { + "epoch": 0.3964256412085756, + "grad_norm": 0.2598172426223755, + "learning_rate": 3.776817291885386e-05, + "loss": 0.1692, + "step": 22226 + }, + { + "epoch": 0.3964434773302893, + "grad_norm": 0.2346215397119522, + "learning_rate": 3.776683469761394e-05, + "loss": 0.1794, + "step": 22227 + }, + { + "epoch": 0.396461313452003, + "grad_norm": 0.39767736196517944, + "learning_rate": 3.776549642688537e-05, + "loss": 0.2489, + "step": 22228 + }, + { + "epoch": 0.3964791495737167, + "grad_norm": 0.2696288824081421, + "learning_rate": 3.776415810667333e-05, + "loss": 0.1633, + "step": 22229 + }, + { + "epoch": 0.3964969856954304, + "grad_norm": 0.3055456280708313, + "learning_rate": 3.7762819736982994e-05, + "loss": 0.1834, + "step": 22230 + }, + { + "epoch": 0.3965148218171441, + "grad_norm": 0.30127641558647156, + "learning_rate": 3.776148131781957e-05, + "loss": 0.1647, + "step": 22231 + }, + { + "epoch": 0.3965326579388578, + "grad_norm": 0.3112037777900696, + "learning_rate": 3.776014284918823e-05, + "loss": 0.2043, + "step": 22232 + }, + { + "epoch": 0.39655049406057147, + "grad_norm": 0.2905730903148651, + "learning_rate": 3.775880433109417e-05, + "loss": 0.1893, + "step": 22233 + }, + { + "epoch": 0.39656833018228516, + "grad_norm": 0.23809996247291565, + "learning_rate": 3.775746576354257e-05, + "loss": 0.1828, + "step": 22234 + }, + { + "epoch": 0.39658616630399884, + "grad_norm": 0.3379803001880646, + "learning_rate": 3.775612714653864e-05, + "loss": 0.1538, + "step": 22235 + }, + { + "epoch": 0.39660400242571253, + "grad_norm": 0.25696924328804016, + "learning_rate": 3.775478848008754e-05, + "loss": 0.13, + "step": 22236 + }, + { + "epoch": 0.3966218385474263, + "grad_norm": 0.2714236378669739, + "learning_rate": 3.775344976419449e-05, + "loss": 0.1624, + "step": 22237 + }, + { + "epoch": 0.39663967466913996, + "grad_norm": 0.26460570096969604, + "learning_rate": 3.775211099886466e-05, + "loss": 0.134, + "step": 22238 + }, + { + "epoch": 0.39665751079085365, + "grad_norm": 0.30905723571777344, + "learning_rate": 3.775077218410324e-05, + "loss": 0.1707, + "step": 22239 + }, + { + "epoch": 0.39667534691256734, + "grad_norm": 0.34891965985298157, + "learning_rate": 3.7749433319915425e-05, + "loss": 0.1807, + "step": 22240 + }, + { + "epoch": 0.396693183034281, + "grad_norm": 0.216149240732193, + "learning_rate": 3.77480944063064e-05, + "loss": 0.1588, + "step": 22241 + }, + { + "epoch": 0.3967110191559947, + "grad_norm": 0.26740992069244385, + "learning_rate": 3.7746755443281363e-05, + "loss": 0.1853, + "step": 22242 + }, + { + "epoch": 0.3967288552777084, + "grad_norm": 0.23010098934173584, + "learning_rate": 3.7745416430845494e-05, + "loss": 0.1816, + "step": 22243 + }, + { + "epoch": 0.3967466913994221, + "grad_norm": 0.22635534405708313, + "learning_rate": 3.7744077369003995e-05, + "loss": 0.1484, + "step": 22244 + }, + { + "epoch": 0.3967645275211358, + "grad_norm": 0.31512561440467834, + "learning_rate": 3.7742738257762044e-05, + "loss": 0.1606, + "step": 22245 + }, + { + "epoch": 0.3967823636428495, + "grad_norm": 0.35845622420310974, + "learning_rate": 3.774139909712484e-05, + "loss": 0.1722, + "step": 22246 + }, + { + "epoch": 0.3968001997645632, + "grad_norm": 0.2786567807197571, + "learning_rate": 3.774005988709757e-05, + "loss": 0.1694, + "step": 22247 + }, + { + "epoch": 0.3968180358862769, + "grad_norm": 0.22597715258598328, + "learning_rate": 3.773872062768543e-05, + "loss": 0.1437, + "step": 22248 + }, + { + "epoch": 0.3968358720079906, + "grad_norm": 0.3453173041343689, + "learning_rate": 3.773738131889362e-05, + "loss": 0.2041, + "step": 22249 + }, + { + "epoch": 0.3968537081297043, + "grad_norm": 0.3514271080493927, + "learning_rate": 3.773604196072731e-05, + "loss": 0.186, + "step": 22250 + }, + { + "epoch": 0.39687154425141796, + "grad_norm": 0.2693471908569336, + "learning_rate": 3.77347025531917e-05, + "loss": 0.1178, + "step": 22251 + }, + { + "epoch": 0.39688938037313165, + "grad_norm": 0.26904910802841187, + "learning_rate": 3.7733363096291985e-05, + "loss": 0.118, + "step": 22252 + }, + { + "epoch": 0.39690721649484534, + "grad_norm": 0.23082782328128815, + "learning_rate": 3.773202359003336e-05, + "loss": 0.1706, + "step": 22253 + }, + { + "epoch": 0.3969250526165591, + "grad_norm": 0.34383949637413025, + "learning_rate": 3.773068403442102e-05, + "loss": 0.1695, + "step": 22254 + }, + { + "epoch": 0.39694288873827277, + "grad_norm": 0.21252372860908508, + "learning_rate": 3.7729344429460136e-05, + "loss": 0.1475, + "step": 22255 + }, + { + "epoch": 0.39696072485998646, + "grad_norm": 0.2579759359359741, + "learning_rate": 3.7728004775155926e-05, + "loss": 0.1334, + "step": 22256 + }, + { + "epoch": 0.39697856098170015, + "grad_norm": 0.22919298708438873, + "learning_rate": 3.772666507151358e-05, + "loss": 0.1211, + "step": 22257 + }, + { + "epoch": 0.39699639710341383, + "grad_norm": 0.2603635787963867, + "learning_rate": 3.772532531853827e-05, + "loss": 0.2079, + "step": 22258 + }, + { + "epoch": 0.3970142332251275, + "grad_norm": 0.32817214727401733, + "learning_rate": 3.772398551623521e-05, + "loss": 0.1618, + "step": 22259 + }, + { + "epoch": 0.3970320693468412, + "grad_norm": 0.1930309534072876, + "learning_rate": 3.772264566460959e-05, + "loss": 0.1442, + "step": 22260 + }, + { + "epoch": 0.3970499054685549, + "grad_norm": 0.27403295040130615, + "learning_rate": 3.7721305763666604e-05, + "loss": 0.1389, + "step": 22261 + }, + { + "epoch": 0.3970677415902686, + "grad_norm": 0.31754204630851746, + "learning_rate": 3.7719965813411434e-05, + "loss": 0.2156, + "step": 22262 + }, + { + "epoch": 0.39708557771198233, + "grad_norm": 0.2536564767360687, + "learning_rate": 3.7718625813849297e-05, + "loss": 0.2135, + "step": 22263 + }, + { + "epoch": 0.397103413833696, + "grad_norm": 0.2569412589073181, + "learning_rate": 3.7717285764985356e-05, + "loss": 0.2338, + "step": 22264 + }, + { + "epoch": 0.3971212499554097, + "grad_norm": 0.2939034700393677, + "learning_rate": 3.771594566682484e-05, + "loss": 0.2066, + "step": 22265 + }, + { + "epoch": 0.3971390860771234, + "grad_norm": 0.19327019155025482, + "learning_rate": 3.7714605519372916e-05, + "loss": 0.1411, + "step": 22266 + }, + { + "epoch": 0.3971569221988371, + "grad_norm": 0.33009007573127747, + "learning_rate": 3.7713265322634797e-05, + "loss": 0.127, + "step": 22267 + }, + { + "epoch": 0.39717475832055077, + "grad_norm": 0.180852010846138, + "learning_rate": 3.771192507661565e-05, + "loss": 0.1446, + "step": 22268 + }, + { + "epoch": 0.39719259444226446, + "grad_norm": 0.2605664134025574, + "learning_rate": 3.7710584781320715e-05, + "loss": 0.1308, + "step": 22269 + }, + { + "epoch": 0.39721043056397815, + "grad_norm": 0.5936512351036072, + "learning_rate": 3.770924443675515e-05, + "loss": 0.3512, + "step": 22270 + }, + { + "epoch": 0.3972282666856919, + "grad_norm": 0.32129839062690735, + "learning_rate": 3.770790404292417e-05, + "loss": 0.1741, + "step": 22271 + }, + { + "epoch": 0.3972461028074056, + "grad_norm": 0.2187911570072174, + "learning_rate": 3.770656359983297e-05, + "loss": 0.1535, + "step": 22272 + }, + { + "epoch": 0.39726393892911926, + "grad_norm": 0.28206637501716614, + "learning_rate": 3.770522310748673e-05, + "loss": 0.1475, + "step": 22273 + }, + { + "epoch": 0.39728177505083295, + "grad_norm": 0.23699229955673218, + "learning_rate": 3.770388256589066e-05, + "loss": 0.1526, + "step": 22274 + }, + { + "epoch": 0.39729961117254664, + "grad_norm": 0.2758311629295349, + "learning_rate": 3.770254197504995e-05, + "loss": 0.1657, + "step": 22275 + }, + { + "epoch": 0.39731744729426033, + "grad_norm": 0.36731085181236267, + "learning_rate": 3.7701201334969804e-05, + "loss": 0.175, + "step": 22276 + }, + { + "epoch": 0.397335283415974, + "grad_norm": 0.2826375961303711, + "learning_rate": 3.7699860645655416e-05, + "loss": 0.1211, + "step": 22277 + }, + { + "epoch": 0.3973531195376877, + "grad_norm": 0.28067928552627563, + "learning_rate": 3.769851990711198e-05, + "loss": 0.11, + "step": 22278 + }, + { + "epoch": 0.3973709556594014, + "grad_norm": 0.26242756843566895, + "learning_rate": 3.76971791193447e-05, + "loss": 0.1722, + "step": 22279 + }, + { + "epoch": 0.39738879178111514, + "grad_norm": 0.26454052329063416, + "learning_rate": 3.769583828235876e-05, + "loss": 0.1722, + "step": 22280 + }, + { + "epoch": 0.3974066279028288, + "grad_norm": 0.2344706952571869, + "learning_rate": 3.7694497396159364e-05, + "loss": 0.1143, + "step": 22281 + }, + { + "epoch": 0.3974244640245425, + "grad_norm": 0.3165126144886017, + "learning_rate": 3.769315646075172e-05, + "loss": 0.1744, + "step": 22282 + }, + { + "epoch": 0.3974423001462562, + "grad_norm": 0.23090432584285736, + "learning_rate": 3.769181547614102e-05, + "loss": 0.1397, + "step": 22283 + }, + { + "epoch": 0.3974601362679699, + "grad_norm": 0.2685081660747528, + "learning_rate": 3.769047444233245e-05, + "loss": 0.1917, + "step": 22284 + }, + { + "epoch": 0.3974779723896836, + "grad_norm": 0.3602457642555237, + "learning_rate": 3.768913335933123e-05, + "loss": 0.152, + "step": 22285 + }, + { + "epoch": 0.39749580851139726, + "grad_norm": 0.2178330272436142, + "learning_rate": 3.768779222714254e-05, + "loss": 0.1505, + "step": 22286 + }, + { + "epoch": 0.39751364463311095, + "grad_norm": 0.2397204488515854, + "learning_rate": 3.768645104577158e-05, + "loss": 0.1106, + "step": 22287 + }, + { + "epoch": 0.3975314807548247, + "grad_norm": 0.21508941054344177, + "learning_rate": 3.7685109815223554e-05, + "loss": 0.1281, + "step": 22288 + }, + { + "epoch": 0.3975493168765384, + "grad_norm": 0.246878981590271, + "learning_rate": 3.768376853550367e-05, + "loss": 0.1827, + "step": 22289 + }, + { + "epoch": 0.39756715299825207, + "grad_norm": 0.2514778673648834, + "learning_rate": 3.768242720661711e-05, + "loss": 0.1654, + "step": 22290 + }, + { + "epoch": 0.39758498911996576, + "grad_norm": 0.27585023641586304, + "learning_rate": 3.7681085828569086e-05, + "loss": 0.117, + "step": 22291 + }, + { + "epoch": 0.39760282524167945, + "grad_norm": 0.2424265444278717, + "learning_rate": 3.767974440136479e-05, + "loss": 0.174, + "step": 22292 + }, + { + "epoch": 0.39762066136339314, + "grad_norm": 0.366639107465744, + "learning_rate": 3.7678402925009425e-05, + "loss": 0.1621, + "step": 22293 + }, + { + "epoch": 0.3976384974851068, + "grad_norm": 0.2671668827533722, + "learning_rate": 3.76770613995082e-05, + "loss": 0.1739, + "step": 22294 + }, + { + "epoch": 0.3976563336068205, + "grad_norm": 0.21506819128990173, + "learning_rate": 3.767571982486629e-05, + "loss": 0.1608, + "step": 22295 + }, + { + "epoch": 0.39767416972853425, + "grad_norm": 0.2388947308063507, + "learning_rate": 3.7674378201088935e-05, + "loss": 0.1498, + "step": 22296 + }, + { + "epoch": 0.39769200585024794, + "grad_norm": 0.23440612852573395, + "learning_rate": 3.7673036528181294e-05, + "loss": 0.1819, + "step": 22297 + }, + { + "epoch": 0.39770984197196163, + "grad_norm": 0.3379155993461609, + "learning_rate": 3.7671694806148596e-05, + "loss": 0.1383, + "step": 22298 + }, + { + "epoch": 0.3977276780936753, + "grad_norm": 0.26498284935951233, + "learning_rate": 3.767035303499602e-05, + "loss": 0.1522, + "step": 22299 + }, + { + "epoch": 0.397745514215389, + "grad_norm": 0.3507102131843567, + "learning_rate": 3.76690112147288e-05, + "loss": 0.1182, + "step": 22300 + }, + { + "epoch": 0.3977633503371027, + "grad_norm": 0.2222672402858734, + "learning_rate": 3.76676693453521e-05, + "loss": 0.0872, + "step": 22301 + }, + { + "epoch": 0.3977811864588164, + "grad_norm": 0.2475961446762085, + "learning_rate": 3.766632742687114e-05, + "loss": 0.1414, + "step": 22302 + }, + { + "epoch": 0.39779902258053007, + "grad_norm": 0.3509806990623474, + "learning_rate": 3.7664985459291125e-05, + "loss": 0.1876, + "step": 22303 + }, + { + "epoch": 0.39781685870224376, + "grad_norm": 0.27201980352401733, + "learning_rate": 3.766364344261725e-05, + "loss": 0.138, + "step": 22304 + }, + { + "epoch": 0.3978346948239575, + "grad_norm": 0.25919872522354126, + "learning_rate": 3.766230137685471e-05, + "loss": 0.1495, + "step": 22305 + }, + { + "epoch": 0.3978525309456712, + "grad_norm": 0.3091748356819153, + "learning_rate": 3.766095926200873e-05, + "loss": 0.1728, + "step": 22306 + }, + { + "epoch": 0.3978703670673849, + "grad_norm": 0.22791191935539246, + "learning_rate": 3.76596170980845e-05, + "loss": 0.1409, + "step": 22307 + }, + { + "epoch": 0.39788820318909857, + "grad_norm": 0.2801245152950287, + "learning_rate": 3.765827488508722e-05, + "loss": 0.1685, + "step": 22308 + }, + { + "epoch": 0.39790603931081225, + "grad_norm": 0.29067695140838623, + "learning_rate": 3.7656932623022084e-05, + "loss": 0.165, + "step": 22309 + }, + { + "epoch": 0.39792387543252594, + "grad_norm": 0.28457409143447876, + "learning_rate": 3.7655590311894314e-05, + "loss": 0.1358, + "step": 22310 + }, + { + "epoch": 0.39794171155423963, + "grad_norm": 0.24204015731811523, + "learning_rate": 3.76542479517091e-05, + "loss": 0.0948, + "step": 22311 + }, + { + "epoch": 0.3979595476759533, + "grad_norm": 0.27102288603782654, + "learning_rate": 3.765290554247165e-05, + "loss": 0.1943, + "step": 22312 + }, + { + "epoch": 0.39797738379766706, + "grad_norm": 0.2584947943687439, + "learning_rate": 3.765156308418718e-05, + "loss": 0.191, + "step": 22313 + }, + { + "epoch": 0.39799521991938075, + "grad_norm": 0.21742399036884308, + "learning_rate": 3.7650220576860874e-05, + "loss": 0.1574, + "step": 22314 + }, + { + "epoch": 0.39801305604109444, + "grad_norm": 0.4372897148132324, + "learning_rate": 3.7648878020497944e-05, + "loss": 0.1629, + "step": 22315 + }, + { + "epoch": 0.3980308921628081, + "grad_norm": 0.23204554617404938, + "learning_rate": 3.764753541510359e-05, + "loss": 0.1393, + "step": 22316 + }, + { + "epoch": 0.3980487282845218, + "grad_norm": 0.23232850432395935, + "learning_rate": 3.764619276068302e-05, + "loss": 0.162, + "step": 22317 + }, + { + "epoch": 0.3980665644062355, + "grad_norm": 0.38425225019454956, + "learning_rate": 3.764485005724144e-05, + "loss": 0.1872, + "step": 22318 + }, + { + "epoch": 0.3980844005279492, + "grad_norm": 0.2631813585758209, + "learning_rate": 3.764350730478406e-05, + "loss": 0.1142, + "step": 22319 + }, + { + "epoch": 0.3981022366496629, + "grad_norm": 0.24448172748088837, + "learning_rate": 3.7642164503316076e-05, + "loss": 0.1155, + "step": 22320 + }, + { + "epoch": 0.39812007277137657, + "grad_norm": 0.30166247487068176, + "learning_rate": 3.7640821652842693e-05, + "loss": 0.174, + "step": 22321 + }, + { + "epoch": 0.3981379088930903, + "grad_norm": 0.2300938218832016, + "learning_rate": 3.7639478753369126e-05, + "loss": 0.1878, + "step": 22322 + }, + { + "epoch": 0.398155745014804, + "grad_norm": 0.32995182275772095, + "learning_rate": 3.763813580490056e-05, + "loss": 0.1745, + "step": 22323 + }, + { + "epoch": 0.3981735811365177, + "grad_norm": 0.22452934086322784, + "learning_rate": 3.763679280744223e-05, + "loss": 0.1085, + "step": 22324 + }, + { + "epoch": 0.3981914172582314, + "grad_norm": 0.4042935073375702, + "learning_rate": 3.763544976099932e-05, + "loss": 0.256, + "step": 22325 + }, + { + "epoch": 0.39820925337994506, + "grad_norm": 0.20228131115436554, + "learning_rate": 3.763410666557704e-05, + "loss": 0.0926, + "step": 22326 + }, + { + "epoch": 0.39822708950165875, + "grad_norm": 0.25248879194259644, + "learning_rate": 3.76327635211806e-05, + "loss": 0.1934, + "step": 22327 + }, + { + "epoch": 0.39824492562337244, + "grad_norm": 0.20059946179389954, + "learning_rate": 3.763142032781522e-05, + "loss": 0.1151, + "step": 22328 + }, + { + "epoch": 0.3982627617450861, + "grad_norm": 0.33941495418548584, + "learning_rate": 3.763007708548607e-05, + "loss": 0.1386, + "step": 22329 + }, + { + "epoch": 0.39828059786679987, + "grad_norm": 0.20216259360313416, + "learning_rate": 3.762873379419839e-05, + "loss": 0.1175, + "step": 22330 + }, + { + "epoch": 0.39829843398851356, + "grad_norm": 0.42369261384010315, + "learning_rate": 3.7627390453957365e-05, + "loss": 0.1556, + "step": 22331 + }, + { + "epoch": 0.39831627011022724, + "grad_norm": 0.2654978334903717, + "learning_rate": 3.762604706476823e-05, + "loss": 0.145, + "step": 22332 + }, + { + "epoch": 0.39833410623194093, + "grad_norm": 0.23754015564918518, + "learning_rate": 3.762470362663616e-05, + "loss": 0.1397, + "step": 22333 + }, + { + "epoch": 0.3983519423536546, + "grad_norm": 0.43318185210227966, + "learning_rate": 3.762336013956639e-05, + "loss": 0.2091, + "step": 22334 + }, + { + "epoch": 0.3983697784753683, + "grad_norm": 0.26307061314582825, + "learning_rate": 3.762201660356411e-05, + "loss": 0.1639, + "step": 22335 + }, + { + "epoch": 0.398387614597082, + "grad_norm": 0.34982284903526306, + "learning_rate": 3.7620673018634545e-05, + "loss": 0.1432, + "step": 22336 + }, + { + "epoch": 0.3984054507187957, + "grad_norm": 0.2314211130142212, + "learning_rate": 3.761932938478288e-05, + "loss": 0.1615, + "step": 22337 + }, + { + "epoch": 0.39842328684050937, + "grad_norm": 0.3286464512348175, + "learning_rate": 3.761798570201434e-05, + "loss": 0.1348, + "step": 22338 + }, + { + "epoch": 0.3984411229622231, + "grad_norm": 0.3589918613433838, + "learning_rate": 3.7616641970334124e-05, + "loss": 0.1657, + "step": 22339 + }, + { + "epoch": 0.3984589590839368, + "grad_norm": 0.4123172461986542, + "learning_rate": 3.761529818974745e-05, + "loss": 0.1621, + "step": 22340 + }, + { + "epoch": 0.3984767952056505, + "grad_norm": 0.29044750332832336, + "learning_rate": 3.761395436025953e-05, + "loss": 0.1466, + "step": 22341 + }, + { + "epoch": 0.3984946313273642, + "grad_norm": 0.24817374348640442, + "learning_rate": 3.761261048187555e-05, + "loss": 0.2051, + "step": 22342 + }, + { + "epoch": 0.39851246744907787, + "grad_norm": 0.18409378826618195, + "learning_rate": 3.7611266554600755e-05, + "loss": 0.1436, + "step": 22343 + }, + { + "epoch": 0.39853030357079156, + "grad_norm": 0.35922834277153015, + "learning_rate": 3.7609922578440325e-05, + "loss": 0.187, + "step": 22344 + }, + { + "epoch": 0.39854813969250524, + "grad_norm": 0.28755804896354675, + "learning_rate": 3.760857855339947e-05, + "loss": 0.2006, + "step": 22345 + }, + { + "epoch": 0.39856597581421893, + "grad_norm": 0.2868260443210602, + "learning_rate": 3.7607234479483425e-05, + "loss": 0.1871, + "step": 22346 + }, + { + "epoch": 0.3985838119359327, + "grad_norm": 0.21651361882686615, + "learning_rate": 3.760589035669738e-05, + "loss": 0.114, + "step": 22347 + }, + { + "epoch": 0.39860164805764636, + "grad_norm": 0.2022954225540161, + "learning_rate": 3.760454618504656e-05, + "loss": 0.1369, + "step": 22348 + }, + { + "epoch": 0.39861948417936005, + "grad_norm": 0.3257977366447449, + "learning_rate": 3.760320196453615e-05, + "loss": 0.193, + "step": 22349 + }, + { + "epoch": 0.39863732030107374, + "grad_norm": 0.31494295597076416, + "learning_rate": 3.7601857695171384e-05, + "loss": 0.149, + "step": 22350 + }, + { + "epoch": 0.3986551564227874, + "grad_norm": 0.3476201295852661, + "learning_rate": 3.760051337695746e-05, + "loss": 0.2422, + "step": 22351 + }, + { + "epoch": 0.3986729925445011, + "grad_norm": 0.20647187530994415, + "learning_rate": 3.759916900989959e-05, + "loss": 0.1268, + "step": 22352 + }, + { + "epoch": 0.3986908286662148, + "grad_norm": 0.2499152272939682, + "learning_rate": 3.7597824594003e-05, + "loss": 0.1741, + "step": 22353 + }, + { + "epoch": 0.3987086647879285, + "grad_norm": 0.24941423535346985, + "learning_rate": 3.7596480129272885e-05, + "loss": 0.1428, + "step": 22354 + }, + { + "epoch": 0.39872650090964223, + "grad_norm": 0.3463935852050781, + "learning_rate": 3.759513561571447e-05, + "loss": 0.2242, + "step": 22355 + }, + { + "epoch": 0.3987443370313559, + "grad_norm": 0.2725744843482971, + "learning_rate": 3.759379105333295e-05, + "loss": 0.2093, + "step": 22356 + }, + { + "epoch": 0.3987621731530696, + "grad_norm": 0.23158814013004303, + "learning_rate": 3.759244644213355e-05, + "loss": 0.1716, + "step": 22357 + }, + { + "epoch": 0.3987800092747833, + "grad_norm": 0.22862771153450012, + "learning_rate": 3.7591101782121475e-05, + "loss": 0.1302, + "step": 22358 + }, + { + "epoch": 0.398797845396497, + "grad_norm": 0.29475483298301697, + "learning_rate": 3.758975707330194e-05, + "loss": 0.1675, + "step": 22359 + }, + { + "epoch": 0.3988156815182107, + "grad_norm": 0.22144821286201477, + "learning_rate": 3.758841231568017e-05, + "loss": 0.1693, + "step": 22360 + }, + { + "epoch": 0.39883351763992436, + "grad_norm": 0.334911972284317, + "learning_rate": 3.7587067509261356e-05, + "loss": 0.2412, + "step": 22361 + }, + { + "epoch": 0.39885135376163805, + "grad_norm": 0.21591593325138092, + "learning_rate": 3.758572265405072e-05, + "loss": 0.1336, + "step": 22362 + }, + { + "epoch": 0.39886918988335174, + "grad_norm": 0.24001966416835785, + "learning_rate": 3.758437775005348e-05, + "loss": 0.1418, + "step": 22363 + }, + { + "epoch": 0.3988870260050655, + "grad_norm": 0.24681399762630463, + "learning_rate": 3.758303279727484e-05, + "loss": 0.1795, + "step": 22364 + }, + { + "epoch": 0.39890486212677917, + "grad_norm": 0.24292513728141785, + "learning_rate": 3.758168779572002e-05, + "loss": 0.1782, + "step": 22365 + }, + { + "epoch": 0.39892269824849286, + "grad_norm": 0.24503962695598602, + "learning_rate": 3.7580342745394237e-05, + "loss": 0.1608, + "step": 22366 + }, + { + "epoch": 0.39894053437020655, + "grad_norm": 0.35116299986839294, + "learning_rate": 3.75789976463027e-05, + "loss": 0.1883, + "step": 22367 + }, + { + "epoch": 0.39895837049192023, + "grad_norm": 0.32512491941452026, + "learning_rate": 3.757765249845062e-05, + "loss": 0.1835, + "step": 22368 + }, + { + "epoch": 0.3989762066136339, + "grad_norm": 0.21189890801906586, + "learning_rate": 3.7576307301843213e-05, + "loss": 0.132, + "step": 22369 + }, + { + "epoch": 0.3989940427353476, + "grad_norm": 0.22695215046405792, + "learning_rate": 3.757496205648571e-05, + "loss": 0.1457, + "step": 22370 + }, + { + "epoch": 0.3990118788570613, + "grad_norm": 0.4169876277446747, + "learning_rate": 3.757361676238329e-05, + "loss": 0.1461, + "step": 22371 + }, + { + "epoch": 0.39902971497877504, + "grad_norm": 0.2240995466709137, + "learning_rate": 3.757227141954119e-05, + "loss": 0.1418, + "step": 22372 + }, + { + "epoch": 0.39904755110048873, + "grad_norm": 0.25536563992500305, + "learning_rate": 3.7570926027964645e-05, + "loss": 0.1612, + "step": 22373 + }, + { + "epoch": 0.3990653872222024, + "grad_norm": 0.2658531665802002, + "learning_rate": 3.756958058765884e-05, + "loss": 0.1654, + "step": 22374 + }, + { + "epoch": 0.3990832233439161, + "grad_norm": 0.2901322841644287, + "learning_rate": 3.756823509862899e-05, + "loss": 0.1316, + "step": 22375 + }, + { + "epoch": 0.3991010594656298, + "grad_norm": 0.4107397496700287, + "learning_rate": 3.7566889560880326e-05, + "loss": 0.1926, + "step": 22376 + }, + { + "epoch": 0.3991188955873435, + "grad_norm": 0.35893312096595764, + "learning_rate": 3.756554397441805e-05, + "loss": 0.152, + "step": 22377 + }, + { + "epoch": 0.39913673170905717, + "grad_norm": 0.17993880808353424, + "learning_rate": 3.756419833924739e-05, + "loss": 0.1697, + "step": 22378 + }, + { + "epoch": 0.39915456783077086, + "grad_norm": 0.3351760506629944, + "learning_rate": 3.756285265537356e-05, + "loss": 0.1804, + "step": 22379 + }, + { + "epoch": 0.39917240395248454, + "grad_norm": 0.25972780585289, + "learning_rate": 3.756150692280178e-05, + "loss": 0.1709, + "step": 22380 + }, + { + "epoch": 0.3991902400741983, + "grad_norm": 0.2831493318080902, + "learning_rate": 3.7560161141537254e-05, + "loss": 0.1665, + "step": 22381 + }, + { + "epoch": 0.399208076195912, + "grad_norm": 0.24255450069904327, + "learning_rate": 3.755881531158521e-05, + "loss": 0.1601, + "step": 22382 + }, + { + "epoch": 0.39922591231762566, + "grad_norm": 0.2736910283565521, + "learning_rate": 3.755746943295085e-05, + "loss": 0.1347, + "step": 22383 + }, + { + "epoch": 0.39924374843933935, + "grad_norm": 0.29703789949417114, + "learning_rate": 3.755612350563941e-05, + "loss": 0.1865, + "step": 22384 + }, + { + "epoch": 0.39926158456105304, + "grad_norm": 0.4246012568473816, + "learning_rate": 3.755477752965609e-05, + "loss": 0.1653, + "step": 22385 + }, + { + "epoch": 0.39927942068276673, + "grad_norm": 0.26222506165504456, + "learning_rate": 3.755343150500612e-05, + "loss": 0.1294, + "step": 22386 + }, + { + "epoch": 0.3992972568044804, + "grad_norm": 0.33491799235343933, + "learning_rate": 3.755208543169472e-05, + "loss": 0.1318, + "step": 22387 + }, + { + "epoch": 0.3993150929261941, + "grad_norm": 0.21842913329601288, + "learning_rate": 3.7550739309727104e-05, + "loss": 0.14, + "step": 22388 + }, + { + "epoch": 0.39933292904790785, + "grad_norm": 0.2426721751689911, + "learning_rate": 3.754939313910848e-05, + "loss": 0.1433, + "step": 22389 + }, + { + "epoch": 0.39935076516962154, + "grad_norm": 0.23122094571590424, + "learning_rate": 3.754804691984407e-05, + "loss": 0.1739, + "step": 22390 + }, + { + "epoch": 0.3993686012913352, + "grad_norm": 0.26716575026512146, + "learning_rate": 3.7546700651939105e-05, + "loss": 0.1494, + "step": 22391 + }, + { + "epoch": 0.3993864374130489, + "grad_norm": 0.23491953313350677, + "learning_rate": 3.7545354335398785e-05, + "loss": 0.2122, + "step": 22392 + }, + { + "epoch": 0.3994042735347626, + "grad_norm": 0.2516017258167267, + "learning_rate": 3.7544007970228344e-05, + "loss": 0.1327, + "step": 22393 + }, + { + "epoch": 0.3994221096564763, + "grad_norm": 0.29688936471939087, + "learning_rate": 3.754266155643299e-05, + "loss": 0.1667, + "step": 22394 + }, + { + "epoch": 0.39943994577819, + "grad_norm": 0.2807506024837494, + "learning_rate": 3.754131509401796e-05, + "loss": 0.1744, + "step": 22395 + }, + { + "epoch": 0.39945778189990366, + "grad_norm": 0.30966708064079285, + "learning_rate": 3.7539968582988446e-05, + "loss": 0.1635, + "step": 22396 + }, + { + "epoch": 0.3994756180216174, + "grad_norm": 0.2621917426586151, + "learning_rate": 3.75386220233497e-05, + "loss": 0.1816, + "step": 22397 + }, + { + "epoch": 0.3994934541433311, + "grad_norm": 0.2556757628917694, + "learning_rate": 3.753727541510691e-05, + "loss": 0.1406, + "step": 22398 + }, + { + "epoch": 0.3995112902650448, + "grad_norm": 0.3581705689430237, + "learning_rate": 3.7535928758265315e-05, + "loss": 0.2429, + "step": 22399 + }, + { + "epoch": 0.39952912638675847, + "grad_norm": 0.24466313421726227, + "learning_rate": 3.753458205283013e-05, + "loss": 0.1612, + "step": 22400 + }, + { + "epoch": 0.39954696250847216, + "grad_norm": 0.23017403483390808, + "learning_rate": 3.753323529880658e-05, + "loss": 0.1384, + "step": 22401 + }, + { + "epoch": 0.39956479863018585, + "grad_norm": 0.2083551585674286, + "learning_rate": 3.7531888496199876e-05, + "loss": 0.1505, + "step": 22402 + }, + { + "epoch": 0.39958263475189953, + "grad_norm": 0.2748313248157501, + "learning_rate": 3.753054164501524e-05, + "loss": 0.146, + "step": 22403 + }, + { + "epoch": 0.3996004708736132, + "grad_norm": 0.2720942795276642, + "learning_rate": 3.7529194745257903e-05, + "loss": 0.1649, + "step": 22404 + }, + { + "epoch": 0.3996183069953269, + "grad_norm": 0.28043875098228455, + "learning_rate": 3.752784779693308e-05, + "loss": 0.1312, + "step": 22405 + }, + { + "epoch": 0.39963614311704065, + "grad_norm": 0.3267343044281006, + "learning_rate": 3.752650080004599e-05, + "loss": 0.202, + "step": 22406 + }, + { + "epoch": 0.39965397923875434, + "grad_norm": 0.1945066750049591, + "learning_rate": 3.7525153754601855e-05, + "loss": 0.1299, + "step": 22407 + }, + { + "epoch": 0.39967181536046803, + "grad_norm": 0.2010776400566101, + "learning_rate": 3.75238066606059e-05, + "loss": 0.1421, + "step": 22408 + }, + { + "epoch": 0.3996896514821817, + "grad_norm": 0.275698184967041, + "learning_rate": 3.752245951806335e-05, + "loss": 0.1972, + "step": 22409 + }, + { + "epoch": 0.3997074876038954, + "grad_norm": 0.18570183217525482, + "learning_rate": 3.752111232697941e-05, + "loss": 0.1568, + "step": 22410 + }, + { + "epoch": 0.3997253237256091, + "grad_norm": 0.24819031357765198, + "learning_rate": 3.751976508735932e-05, + "loss": 0.1236, + "step": 22411 + }, + { + "epoch": 0.3997431598473228, + "grad_norm": 0.28810280561447144, + "learning_rate": 3.7518417799208305e-05, + "loss": 0.1431, + "step": 22412 + }, + { + "epoch": 0.39976099596903647, + "grad_norm": 0.26540902256965637, + "learning_rate": 3.751707046253157e-05, + "loss": 0.1358, + "step": 22413 + }, + { + "epoch": 0.3997788320907502, + "grad_norm": 0.21171307563781738, + "learning_rate": 3.751572307733434e-05, + "loss": 0.0911, + "step": 22414 + }, + { + "epoch": 0.3997966682124639, + "grad_norm": 0.2565472722053528, + "learning_rate": 3.751437564362186e-05, + "loss": 0.1399, + "step": 22415 + }, + { + "epoch": 0.3998145043341776, + "grad_norm": 0.34243470430374146, + "learning_rate": 3.7513028161399324e-05, + "loss": 0.1419, + "step": 22416 + }, + { + "epoch": 0.3998323404558913, + "grad_norm": 0.2572193741798401, + "learning_rate": 3.7511680630671975e-05, + "loss": 0.1157, + "step": 22417 + }, + { + "epoch": 0.39985017657760497, + "grad_norm": 0.2543811798095703, + "learning_rate": 3.751033305144503e-05, + "loss": 0.1481, + "step": 22418 + }, + { + "epoch": 0.39986801269931865, + "grad_norm": 0.3897438645362854, + "learning_rate": 3.750898542372372e-05, + "loss": 0.211, + "step": 22419 + }, + { + "epoch": 0.39988584882103234, + "grad_norm": 0.19551517069339752, + "learning_rate": 3.7507637747513245e-05, + "loss": 0.1355, + "step": 22420 + }, + { + "epoch": 0.39990368494274603, + "grad_norm": 0.264609158039093, + "learning_rate": 3.750629002281886e-05, + "loss": 0.1883, + "step": 22421 + }, + { + "epoch": 0.3999215210644597, + "grad_norm": 0.2650822699069977, + "learning_rate": 3.750494224964577e-05, + "loss": 0.1666, + "step": 22422 + }, + { + "epoch": 0.39993935718617346, + "grad_norm": 0.2808986008167267, + "learning_rate": 3.7503594427999204e-05, + "loss": 0.1012, + "step": 22423 + }, + { + "epoch": 0.39995719330788715, + "grad_norm": 0.26200565695762634, + "learning_rate": 3.7502246557884394e-05, + "loss": 0.142, + "step": 22424 + }, + { + "epoch": 0.39997502942960084, + "grad_norm": 0.2342466562986374, + "learning_rate": 3.750089863930655e-05, + "loss": 0.1828, + "step": 22425 + }, + { + "epoch": 0.3999928655513145, + "grad_norm": 0.24660594761371613, + "learning_rate": 3.749955067227092e-05, + "loss": 0.159, + "step": 22426 + }, + { + "epoch": 0.4000107016730282, + "grad_norm": 0.28356727957725525, + "learning_rate": 3.749820265678269e-05, + "loss": 0.1524, + "step": 22427 + }, + { + "epoch": 0.4000285377947419, + "grad_norm": 0.21599942445755005, + "learning_rate": 3.749685459284713e-05, + "loss": 0.1481, + "step": 22428 + }, + { + "epoch": 0.4000463739164556, + "grad_norm": 0.2328146994113922, + "learning_rate": 3.7495506480469434e-05, + "loss": 0.1099, + "step": 22429 + }, + { + "epoch": 0.4000642100381693, + "grad_norm": 0.25440514087677, + "learning_rate": 3.749415831965485e-05, + "loss": 0.1719, + "step": 22430 + }, + { + "epoch": 0.400082046159883, + "grad_norm": 0.27376434206962585, + "learning_rate": 3.7492810110408585e-05, + "loss": 0.173, + "step": 22431 + }, + { + "epoch": 0.4000998822815967, + "grad_norm": 0.2598424255847931, + "learning_rate": 3.7491461852735876e-05, + "loss": 0.1774, + "step": 22432 + }, + { + "epoch": 0.4001177184033104, + "grad_norm": 0.25894680619239807, + "learning_rate": 3.749011354664194e-05, + "loss": 0.1671, + "step": 22433 + }, + { + "epoch": 0.4001355545250241, + "grad_norm": 0.343810111284256, + "learning_rate": 3.748876519213201e-05, + "loss": 0.2451, + "step": 22434 + }, + { + "epoch": 0.40015339064673777, + "grad_norm": 0.24313119053840637, + "learning_rate": 3.748741678921132e-05, + "loss": 0.1408, + "step": 22435 + }, + { + "epoch": 0.40017122676845146, + "grad_norm": 0.25919029116630554, + "learning_rate": 3.7486068337885094e-05, + "loss": 0.1193, + "step": 22436 + }, + { + "epoch": 0.40018906289016515, + "grad_norm": 0.27186962962150574, + "learning_rate": 3.7484719838158545e-05, + "loss": 0.1393, + "step": 22437 + }, + { + "epoch": 0.40020689901187884, + "grad_norm": 0.2500014007091522, + "learning_rate": 3.748337129003692e-05, + "loss": 0.1567, + "step": 22438 + }, + { + "epoch": 0.4002247351335925, + "grad_norm": 0.27956870198249817, + "learning_rate": 3.748202269352543e-05, + "loss": 0.1355, + "step": 22439 + }, + { + "epoch": 0.40024257125530627, + "grad_norm": 0.2722923755645752, + "learning_rate": 3.7480674048629304e-05, + "loss": 0.1397, + "step": 22440 + }, + { + "epoch": 0.40026040737701996, + "grad_norm": 0.2864592373371124, + "learning_rate": 3.747932535535378e-05, + "loss": 0.1884, + "step": 22441 + }, + { + "epoch": 0.40027824349873364, + "grad_norm": 0.23527361452579498, + "learning_rate": 3.747797661370407e-05, + "loss": 0.1418, + "step": 22442 + }, + { + "epoch": 0.40029607962044733, + "grad_norm": 0.2685128450393677, + "learning_rate": 3.7476627823685426e-05, + "loss": 0.1688, + "step": 22443 + }, + { + "epoch": 0.400313915742161, + "grad_norm": 0.300820916891098, + "learning_rate": 3.7475278985303056e-05, + "loss": 0.1574, + "step": 22444 + }, + { + "epoch": 0.4003317518638747, + "grad_norm": 0.23595267534255981, + "learning_rate": 3.7473930098562206e-05, + "loss": 0.1591, + "step": 22445 + }, + { + "epoch": 0.4003495879855884, + "grad_norm": 0.295297771692276, + "learning_rate": 3.747258116346809e-05, + "loss": 0.2227, + "step": 22446 + }, + { + "epoch": 0.4003674241073021, + "grad_norm": 0.4549243748188019, + "learning_rate": 3.747123218002594e-05, + "loss": 0.1571, + "step": 22447 + }, + { + "epoch": 0.4003852602290158, + "grad_norm": 0.26432284712791443, + "learning_rate": 3.746988314824098e-05, + "loss": 0.1795, + "step": 22448 + }, + { + "epoch": 0.4004030963507295, + "grad_norm": 0.26903557777404785, + "learning_rate": 3.746853406811845e-05, + "loss": 0.1442, + "step": 22449 + }, + { + "epoch": 0.4004209324724432, + "grad_norm": 0.30301082134246826, + "learning_rate": 3.746718493966358e-05, + "loss": 0.1428, + "step": 22450 + }, + { + "epoch": 0.4004387685941569, + "grad_norm": 0.29643484950065613, + "learning_rate": 3.74658357628816e-05, + "loss": 0.1769, + "step": 22451 + }, + { + "epoch": 0.4004566047158706, + "grad_norm": 0.2608742117881775, + "learning_rate": 3.7464486537777724e-05, + "loss": 0.1171, + "step": 22452 + }, + { + "epoch": 0.40047444083758427, + "grad_norm": 0.24533872306346893, + "learning_rate": 3.746313726435719e-05, + "loss": 0.1313, + "step": 22453 + }, + { + "epoch": 0.40049227695929795, + "grad_norm": 0.21685953438282013, + "learning_rate": 3.746178794262524e-05, + "loss": 0.1431, + "step": 22454 + }, + { + "epoch": 0.40051011308101164, + "grad_norm": 0.5362133979797363, + "learning_rate": 3.746043857258709e-05, + "loss": 0.1813, + "step": 22455 + }, + { + "epoch": 0.4005279492027254, + "grad_norm": 0.23753641545772552, + "learning_rate": 3.7459089154247985e-05, + "loss": 0.1307, + "step": 22456 + }, + { + "epoch": 0.4005457853244391, + "grad_norm": 0.34868675470352173, + "learning_rate": 3.745773968761314e-05, + "loss": 0.1958, + "step": 22457 + }, + { + "epoch": 0.40056362144615276, + "grad_norm": 0.2925041615962982, + "learning_rate": 3.74563901726878e-05, + "loss": 0.1693, + "step": 22458 + }, + { + "epoch": 0.40058145756786645, + "grad_norm": 0.206575408577919, + "learning_rate": 3.745504060947718e-05, + "loss": 0.1662, + "step": 22459 + }, + { + "epoch": 0.40059929368958014, + "grad_norm": 0.26016613841056824, + "learning_rate": 3.7453690997986534e-05, + "loss": 0.1226, + "step": 22460 + }, + { + "epoch": 0.4006171298112938, + "grad_norm": 0.2690242826938629, + "learning_rate": 3.745234133822107e-05, + "loss": 0.1679, + "step": 22461 + }, + { + "epoch": 0.4006349659330075, + "grad_norm": 0.3547515869140625, + "learning_rate": 3.745099163018603e-05, + "loss": 0.1546, + "step": 22462 + }, + { + "epoch": 0.4006528020547212, + "grad_norm": 0.19431684911251068, + "learning_rate": 3.744964187388664e-05, + "loss": 0.1331, + "step": 22463 + }, + { + "epoch": 0.4006706381764349, + "grad_norm": 0.2520959973335266, + "learning_rate": 3.744829206932815e-05, + "loss": 0.1287, + "step": 22464 + }, + { + "epoch": 0.40068847429814863, + "grad_norm": 0.22487646341323853, + "learning_rate": 3.744694221651578e-05, + "loss": 0.1466, + "step": 22465 + }, + { + "epoch": 0.4007063104198623, + "grad_norm": 0.24782566726207733, + "learning_rate": 3.7445592315454766e-05, + "loss": 0.1773, + "step": 22466 + }, + { + "epoch": 0.400724146541576, + "grad_norm": 0.2625008225440979, + "learning_rate": 3.744424236615033e-05, + "loss": 0.1482, + "step": 22467 + }, + { + "epoch": 0.4007419826632897, + "grad_norm": 0.26926976442337036, + "learning_rate": 3.744289236860771e-05, + "loss": 0.1434, + "step": 22468 + }, + { + "epoch": 0.4007598187850034, + "grad_norm": 0.2567867040634155, + "learning_rate": 3.7441542322832146e-05, + "loss": 0.1418, + "step": 22469 + }, + { + "epoch": 0.4007776549067171, + "grad_norm": 0.3140268921852112, + "learning_rate": 3.7440192228828864e-05, + "loss": 0.1816, + "step": 22470 + }, + { + "epoch": 0.40079549102843076, + "grad_norm": 0.20431353151798248, + "learning_rate": 3.74388420866031e-05, + "loss": 0.1524, + "step": 22471 + }, + { + "epoch": 0.40081332715014445, + "grad_norm": 0.22864453494548798, + "learning_rate": 3.743749189616009e-05, + "loss": 0.1151, + "step": 22472 + }, + { + "epoch": 0.4008311632718582, + "grad_norm": 0.2789619565010071, + "learning_rate": 3.7436141657505074e-05, + "loss": 0.1537, + "step": 22473 + }, + { + "epoch": 0.4008489993935719, + "grad_norm": 0.2794075012207031, + "learning_rate": 3.7434791370643266e-05, + "loss": 0.1838, + "step": 22474 + }, + { + "epoch": 0.40086683551528557, + "grad_norm": 0.3180589973926544, + "learning_rate": 3.743344103557992e-05, + "loss": 0.1656, + "step": 22475 + }, + { + "epoch": 0.40088467163699926, + "grad_norm": 0.24583542346954346, + "learning_rate": 3.743209065232025e-05, + "loss": 0.1523, + "step": 22476 + }, + { + "epoch": 0.40090250775871294, + "grad_norm": 0.3447679877281189, + "learning_rate": 3.743074022086951e-05, + "loss": 0.1374, + "step": 22477 + }, + { + "epoch": 0.40092034388042663, + "grad_norm": 0.38320985436439514, + "learning_rate": 3.742938974123293e-05, + "loss": 0.2163, + "step": 22478 + }, + { + "epoch": 0.4009381800021403, + "grad_norm": 0.30444759130477905, + "learning_rate": 3.742803921341574e-05, + "loss": 0.1371, + "step": 22479 + }, + { + "epoch": 0.400956016123854, + "grad_norm": 0.21661725640296936, + "learning_rate": 3.742668863742318e-05, + "loss": 0.116, + "step": 22480 + }, + { + "epoch": 0.4009738522455677, + "grad_norm": 0.24259746074676514, + "learning_rate": 3.742533801326048e-05, + "loss": 0.1726, + "step": 22481 + }, + { + "epoch": 0.40099168836728144, + "grad_norm": 0.3091650605201721, + "learning_rate": 3.742398734093287e-05, + "loss": 0.1417, + "step": 22482 + }, + { + "epoch": 0.40100952448899513, + "grad_norm": 0.2781609892845154, + "learning_rate": 3.7422636620445605e-05, + "loss": 0.1696, + "step": 22483 + }, + { + "epoch": 0.4010273606107088, + "grad_norm": 0.3832854628562927, + "learning_rate": 3.742128585180391e-05, + "loss": 0.109, + "step": 22484 + }, + { + "epoch": 0.4010451967324225, + "grad_norm": 0.23148325085639954, + "learning_rate": 3.741993503501302e-05, + "loss": 0.1636, + "step": 22485 + }, + { + "epoch": 0.4010630328541362, + "grad_norm": 0.28823745250701904, + "learning_rate": 3.7418584170078176e-05, + "loss": 0.1409, + "step": 22486 + }, + { + "epoch": 0.4010808689758499, + "grad_norm": 0.29489466547966003, + "learning_rate": 3.74172332570046e-05, + "loss": 0.1864, + "step": 22487 + }, + { + "epoch": 0.40109870509756357, + "grad_norm": 0.28868696093559265, + "learning_rate": 3.7415882295797545e-05, + "loss": 0.1668, + "step": 22488 + }, + { + "epoch": 0.40111654121927726, + "grad_norm": 0.25830361247062683, + "learning_rate": 3.741453128646224e-05, + "loss": 0.2186, + "step": 22489 + }, + { + "epoch": 0.401134377340991, + "grad_norm": 0.374521404504776, + "learning_rate": 3.741318022900392e-05, + "loss": 0.118, + "step": 22490 + }, + { + "epoch": 0.4011522134627047, + "grad_norm": 0.3246745467185974, + "learning_rate": 3.741182912342783e-05, + "loss": 0.1641, + "step": 22491 + }, + { + "epoch": 0.4011700495844184, + "grad_norm": 0.23623211681842804, + "learning_rate": 3.741047796973921e-05, + "loss": 0.1634, + "step": 22492 + }, + { + "epoch": 0.40118788570613206, + "grad_norm": 0.2502305507659912, + "learning_rate": 3.7409126767943284e-05, + "loss": 0.1851, + "step": 22493 + }, + { + "epoch": 0.40120572182784575, + "grad_norm": 0.30829817056655884, + "learning_rate": 3.74077755180453e-05, + "loss": 0.182, + "step": 22494 + }, + { + "epoch": 0.40122355794955944, + "grad_norm": 0.22984784841537476, + "learning_rate": 3.7406424220050487e-05, + "loss": 0.12, + "step": 22495 + }, + { + "epoch": 0.4012413940712731, + "grad_norm": 0.23265908658504486, + "learning_rate": 3.74050728739641e-05, + "loss": 0.1443, + "step": 22496 + }, + { + "epoch": 0.4012592301929868, + "grad_norm": 0.23170119524002075, + "learning_rate": 3.740372147979136e-05, + "loss": 0.1551, + "step": 22497 + }, + { + "epoch": 0.40127706631470056, + "grad_norm": 0.26299262046813965, + "learning_rate": 3.7402370037537496e-05, + "loss": 0.1849, + "step": 22498 + }, + { + "epoch": 0.40129490243641425, + "grad_norm": 0.24848175048828125, + "learning_rate": 3.7401018547207786e-05, + "loss": 0.1973, + "step": 22499 + }, + { + "epoch": 0.40131273855812793, + "grad_norm": 0.2625092566013336, + "learning_rate": 3.7399667008807425e-05, + "loss": 0.1472, + "step": 22500 + }, + { + "epoch": 0.4013305746798416, + "grad_norm": 0.2969052195549011, + "learning_rate": 3.739831542234169e-05, + "loss": 0.1521, + "step": 22501 + }, + { + "epoch": 0.4013484108015553, + "grad_norm": 0.23838455975055695, + "learning_rate": 3.739696378781579e-05, + "loss": 0.1396, + "step": 22502 + }, + { + "epoch": 0.401366246923269, + "grad_norm": 0.23480050265789032, + "learning_rate": 3.7395612105234985e-05, + "loss": 0.1125, + "step": 22503 + }, + { + "epoch": 0.4013840830449827, + "grad_norm": 0.2930973768234253, + "learning_rate": 3.7394260374604494e-05, + "loss": 0.1123, + "step": 22504 + }, + { + "epoch": 0.4014019191666964, + "grad_norm": 0.2859051823616028, + "learning_rate": 3.739290859592958e-05, + "loss": 0.1428, + "step": 22505 + }, + { + "epoch": 0.40141975528841006, + "grad_norm": 0.2250789999961853, + "learning_rate": 3.739155676921547e-05, + "loss": 0.1484, + "step": 22506 + }, + { + "epoch": 0.4014375914101238, + "grad_norm": 0.25904330611228943, + "learning_rate": 3.73902048944674e-05, + "loss": 0.1769, + "step": 22507 + }, + { + "epoch": 0.4014554275318375, + "grad_norm": 0.21639803051948547, + "learning_rate": 3.738885297169062e-05, + "loss": 0.153, + "step": 22508 + }, + { + "epoch": 0.4014732636535512, + "grad_norm": 0.3896249532699585, + "learning_rate": 3.738750100089037e-05, + "loss": 0.1749, + "step": 22509 + }, + { + "epoch": 0.40149109977526487, + "grad_norm": 0.23915569484233856, + "learning_rate": 3.738614898207188e-05, + "loss": 0.1728, + "step": 22510 + }, + { + "epoch": 0.40150893589697856, + "grad_norm": 0.27284306287765503, + "learning_rate": 3.738479691524041e-05, + "loss": 0.2205, + "step": 22511 + }, + { + "epoch": 0.40152677201869225, + "grad_norm": 0.22742249071598053, + "learning_rate": 3.738344480040118e-05, + "loss": 0.1464, + "step": 22512 + }, + { + "epoch": 0.40154460814040593, + "grad_norm": 0.28184935450553894, + "learning_rate": 3.7382092637559443e-05, + "loss": 0.0968, + "step": 22513 + }, + { + "epoch": 0.4015624442621196, + "grad_norm": 0.3212919235229492, + "learning_rate": 3.738074042672044e-05, + "loss": 0.2036, + "step": 22514 + }, + { + "epoch": 0.40158028038383337, + "grad_norm": 0.3239867091178894, + "learning_rate": 3.7379388167889415e-05, + "loss": 0.1872, + "step": 22515 + }, + { + "epoch": 0.40159811650554705, + "grad_norm": 0.280558705329895, + "learning_rate": 3.7378035861071606e-05, + "loss": 0.1648, + "step": 22516 + }, + { + "epoch": 0.40161595262726074, + "grad_norm": 0.3539159595966339, + "learning_rate": 3.737668350627225e-05, + "loss": 0.1791, + "step": 22517 + }, + { + "epoch": 0.40163378874897443, + "grad_norm": 0.3493812382221222, + "learning_rate": 3.737533110349658e-05, + "loss": 0.1909, + "step": 22518 + }, + { + "epoch": 0.4016516248706881, + "grad_norm": 0.2634727358818054, + "learning_rate": 3.737397865274987e-05, + "loss": 0.1573, + "step": 22519 + }, + { + "epoch": 0.4016694609924018, + "grad_norm": 0.23298679292201996, + "learning_rate": 3.7372626154037346e-05, + "loss": 0.1116, + "step": 22520 + }, + { + "epoch": 0.4016872971141155, + "grad_norm": 0.2536587417125702, + "learning_rate": 3.737127360736424e-05, + "loss": 0.1411, + "step": 22521 + }, + { + "epoch": 0.4017051332358292, + "grad_norm": 0.30334731936454773, + "learning_rate": 3.736992101273581e-05, + "loss": 0.1472, + "step": 22522 + }, + { + "epoch": 0.40172296935754287, + "grad_norm": 0.44926324486732483, + "learning_rate": 3.73685683701573e-05, + "loss": 0.1944, + "step": 22523 + }, + { + "epoch": 0.4017408054792566, + "grad_norm": 0.3534368872642517, + "learning_rate": 3.736721567963394e-05, + "loss": 0.1675, + "step": 22524 + }, + { + "epoch": 0.4017586416009703, + "grad_norm": 0.2229832410812378, + "learning_rate": 3.736586294117097e-05, + "loss": 0.1322, + "step": 22525 + }, + { + "epoch": 0.401776477722684, + "grad_norm": 0.3615153133869171, + "learning_rate": 3.736451015477366e-05, + "loss": 0.1822, + "step": 22526 + }, + { + "epoch": 0.4017943138443977, + "grad_norm": 0.3857594430446625, + "learning_rate": 3.736315732044724e-05, + "loss": 0.2246, + "step": 22527 + }, + { + "epoch": 0.40181214996611136, + "grad_norm": 0.18655601143836975, + "learning_rate": 3.736180443819694e-05, + "loss": 0.1073, + "step": 22528 + }, + { + "epoch": 0.40182998608782505, + "grad_norm": 0.21997660398483276, + "learning_rate": 3.736045150802803e-05, + "loss": 0.1579, + "step": 22529 + }, + { + "epoch": 0.40184782220953874, + "grad_norm": 0.3747881352901459, + "learning_rate": 3.7359098529945724e-05, + "loss": 0.1611, + "step": 22530 + }, + { + "epoch": 0.40186565833125243, + "grad_norm": 0.28655287623405457, + "learning_rate": 3.735774550395529e-05, + "loss": 0.1782, + "step": 22531 + }, + { + "epoch": 0.4018834944529662, + "grad_norm": 0.2734461724758148, + "learning_rate": 3.735639243006197e-05, + "loss": 0.139, + "step": 22532 + }, + { + "epoch": 0.40190133057467986, + "grad_norm": 0.2696603536605835, + "learning_rate": 3.7355039308271e-05, + "loss": 0.1562, + "step": 22533 + }, + { + "epoch": 0.40191916669639355, + "grad_norm": 0.2373989075422287, + "learning_rate": 3.735368613858764e-05, + "loss": 0.1511, + "step": 22534 + }, + { + "epoch": 0.40193700281810724, + "grad_norm": 0.28286632895469666, + "learning_rate": 3.735233292101712e-05, + "loss": 0.1372, + "step": 22535 + }, + { + "epoch": 0.4019548389398209, + "grad_norm": 0.35591399669647217, + "learning_rate": 3.735097965556469e-05, + "loss": 0.193, + "step": 22536 + }, + { + "epoch": 0.4019726750615346, + "grad_norm": 0.21864116191864014, + "learning_rate": 3.7349626342235595e-05, + "loss": 0.1847, + "step": 22537 + }, + { + "epoch": 0.4019905111832483, + "grad_norm": 0.30557364225387573, + "learning_rate": 3.7348272981035084e-05, + "loss": 0.2072, + "step": 22538 + }, + { + "epoch": 0.402008347304962, + "grad_norm": 0.22314032912254333, + "learning_rate": 3.7346919571968395e-05, + "loss": 0.1918, + "step": 22539 + }, + { + "epoch": 0.4020261834266757, + "grad_norm": 0.3513238728046417, + "learning_rate": 3.73455661150408e-05, + "loss": 0.1352, + "step": 22540 + }, + { + "epoch": 0.4020440195483894, + "grad_norm": 0.32878682017326355, + "learning_rate": 3.734421261025751e-05, + "loss": 0.1154, + "step": 22541 + }, + { + "epoch": 0.4020618556701031, + "grad_norm": 0.22667662799358368, + "learning_rate": 3.73428590576238e-05, + "loss": 0.1244, + "step": 22542 + }, + { + "epoch": 0.4020796917918168, + "grad_norm": 0.2991669774055481, + "learning_rate": 3.7341505457144896e-05, + "loss": 0.1466, + "step": 22543 + }, + { + "epoch": 0.4020975279135305, + "grad_norm": 0.2183166742324829, + "learning_rate": 3.734015180882606e-05, + "loss": 0.1495, + "step": 22544 + }, + { + "epoch": 0.40211536403524417, + "grad_norm": 0.21264046430587769, + "learning_rate": 3.7338798112672536e-05, + "loss": 0.1551, + "step": 22545 + }, + { + "epoch": 0.40213320015695786, + "grad_norm": 0.23337967693805695, + "learning_rate": 3.7337444368689555e-05, + "loss": 0.1816, + "step": 22546 + }, + { + "epoch": 0.40215103627867155, + "grad_norm": 0.2925909757614136, + "learning_rate": 3.7336090576882396e-05, + "loss": 0.1779, + "step": 22547 + }, + { + "epoch": 0.40216887240038524, + "grad_norm": 0.33091145753860474, + "learning_rate": 3.7334736737256277e-05, + "loss": 0.149, + "step": 22548 + }, + { + "epoch": 0.402186708522099, + "grad_norm": 0.3367851972579956, + "learning_rate": 3.733338284981647e-05, + "loss": 0.1423, + "step": 22549 + }, + { + "epoch": 0.40220454464381267, + "grad_norm": 0.32828691601753235, + "learning_rate": 3.7332028914568196e-05, + "loss": 0.1703, + "step": 22550 + }, + { + "epoch": 0.40222238076552635, + "grad_norm": 0.23279611766338348, + "learning_rate": 3.733067493151673e-05, + "loss": 0.1536, + "step": 22551 + }, + { + "epoch": 0.40224021688724004, + "grad_norm": 0.22553859651088715, + "learning_rate": 3.73293209006673e-05, + "loss": 0.1781, + "step": 22552 + }, + { + "epoch": 0.40225805300895373, + "grad_norm": 0.2534477412700653, + "learning_rate": 3.7327966822025166e-05, + "loss": 0.1919, + "step": 22553 + }, + { + "epoch": 0.4022758891306674, + "grad_norm": 0.22570395469665527, + "learning_rate": 3.7326612695595574e-05, + "loss": 0.0963, + "step": 22554 + }, + { + "epoch": 0.4022937252523811, + "grad_norm": 0.21601274609565735, + "learning_rate": 3.7325258521383784e-05, + "loss": 0.1699, + "step": 22555 + }, + { + "epoch": 0.4023115613740948, + "grad_norm": 0.1645299345254898, + "learning_rate": 3.732390429939503e-05, + "loss": 0.1288, + "step": 22556 + }, + { + "epoch": 0.40232939749580854, + "grad_norm": 0.22468598186969757, + "learning_rate": 3.732255002963456e-05, + "loss": 0.1558, + "step": 22557 + }, + { + "epoch": 0.4023472336175222, + "grad_norm": 0.22805725038051605, + "learning_rate": 3.7321195712107643e-05, + "loss": 0.1496, + "step": 22558 + }, + { + "epoch": 0.4023650697392359, + "grad_norm": 0.376115620136261, + "learning_rate": 3.73198413468195e-05, + "loss": 0.2402, + "step": 22559 + }, + { + "epoch": 0.4023829058609496, + "grad_norm": 0.26651424169540405, + "learning_rate": 3.73184869337754e-05, + "loss": 0.1837, + "step": 22560 + }, + { + "epoch": 0.4024007419826633, + "grad_norm": 0.3118203282356262, + "learning_rate": 3.73171324729806e-05, + "loss": 0.1609, + "step": 22561 + }, + { + "epoch": 0.402418578104377, + "grad_norm": 0.33430716395378113, + "learning_rate": 3.7315777964440336e-05, + "loss": 0.1453, + "step": 22562 + }, + { + "epoch": 0.40243641422609067, + "grad_norm": 0.2300124615430832, + "learning_rate": 3.731442340815986e-05, + "loss": 0.1758, + "step": 22563 + }, + { + "epoch": 0.40245425034780435, + "grad_norm": 0.235808864235878, + "learning_rate": 3.731306880414442e-05, + "loss": 0.1344, + "step": 22564 + }, + { + "epoch": 0.40247208646951804, + "grad_norm": 0.3269568979740143, + "learning_rate": 3.731171415239929e-05, + "loss": 0.1482, + "step": 22565 + }, + { + "epoch": 0.4024899225912318, + "grad_norm": 0.2756299078464508, + "learning_rate": 3.731035945292969e-05, + "loss": 0.1876, + "step": 22566 + }, + { + "epoch": 0.4025077587129455, + "grad_norm": 0.26264750957489014, + "learning_rate": 3.730900470574088e-05, + "loss": 0.2085, + "step": 22567 + }, + { + "epoch": 0.40252559483465916, + "grad_norm": 0.2224518060684204, + "learning_rate": 3.7307649910838126e-05, + "loss": 0.1287, + "step": 22568 + }, + { + "epoch": 0.40254343095637285, + "grad_norm": 0.2809959650039673, + "learning_rate": 3.730629506822667e-05, + "loss": 0.1602, + "step": 22569 + }, + { + "epoch": 0.40256126707808654, + "grad_norm": 0.24181436002254486, + "learning_rate": 3.730494017791176e-05, + "loss": 0.1537, + "step": 22570 + }, + { + "epoch": 0.4025791031998002, + "grad_norm": 0.21231873333454132, + "learning_rate": 3.730358523989866e-05, + "loss": 0.1964, + "step": 22571 + }, + { + "epoch": 0.4025969393215139, + "grad_norm": 0.21915362775325775, + "learning_rate": 3.730223025419261e-05, + "loss": 0.1508, + "step": 22572 + }, + { + "epoch": 0.4026147754432276, + "grad_norm": 0.3757163882255554, + "learning_rate": 3.7300875220798855e-05, + "loss": 0.1826, + "step": 22573 + }, + { + "epoch": 0.40263261156494135, + "grad_norm": 0.254212349653244, + "learning_rate": 3.729952013972267e-05, + "loss": 0.1804, + "step": 22574 + }, + { + "epoch": 0.40265044768665503, + "grad_norm": 0.3039003908634186, + "learning_rate": 3.72981650109693e-05, + "loss": 0.1597, + "step": 22575 + }, + { + "epoch": 0.4026682838083687, + "grad_norm": 0.22406205534934998, + "learning_rate": 3.729680983454399e-05, + "loss": 0.1297, + "step": 22576 + }, + { + "epoch": 0.4026861199300824, + "grad_norm": 0.2974556088447571, + "learning_rate": 3.7295454610452e-05, + "loss": 0.184, + "step": 22577 + }, + { + "epoch": 0.4027039560517961, + "grad_norm": 0.25405627489089966, + "learning_rate": 3.7294099338698575e-05, + "loss": 0.1275, + "step": 22578 + }, + { + "epoch": 0.4027217921735098, + "grad_norm": 0.27596405148506165, + "learning_rate": 3.729274401928898e-05, + "loss": 0.1329, + "step": 22579 + }, + { + "epoch": 0.4027396282952235, + "grad_norm": 0.508490264415741, + "learning_rate": 3.729138865222846e-05, + "loss": 0.1714, + "step": 22580 + }, + { + "epoch": 0.40275746441693716, + "grad_norm": 0.28058138489723206, + "learning_rate": 3.7290033237522276e-05, + "loss": 0.1695, + "step": 22581 + }, + { + "epoch": 0.40277530053865085, + "grad_norm": 0.21250146627426147, + "learning_rate": 3.728867777517567e-05, + "loss": 0.1697, + "step": 22582 + }, + { + "epoch": 0.4027931366603646, + "grad_norm": 0.20889438688755035, + "learning_rate": 3.728732226519392e-05, + "loss": 0.1658, + "step": 22583 + }, + { + "epoch": 0.4028109727820783, + "grad_norm": 0.2138649970293045, + "learning_rate": 3.7285966707582256e-05, + "loss": 0.154, + "step": 22584 + }, + { + "epoch": 0.40282880890379197, + "grad_norm": 0.23668667674064636, + "learning_rate": 3.728461110234594e-05, + "loss": 0.1757, + "step": 22585 + }, + { + "epoch": 0.40284664502550566, + "grad_norm": 0.2773319184780121, + "learning_rate": 3.728325544949024e-05, + "loss": 0.1433, + "step": 22586 + }, + { + "epoch": 0.40286448114721934, + "grad_norm": 0.22297628223896027, + "learning_rate": 3.728189974902038e-05, + "loss": 0.186, + "step": 22587 + }, + { + "epoch": 0.40288231726893303, + "grad_norm": 0.22563689947128296, + "learning_rate": 3.728054400094165e-05, + "loss": 0.1151, + "step": 22588 + }, + { + "epoch": 0.4029001533906467, + "grad_norm": 0.25474026799201965, + "learning_rate": 3.727918820525928e-05, + "loss": 0.1793, + "step": 22589 + }, + { + "epoch": 0.4029179895123604, + "grad_norm": 0.412511944770813, + "learning_rate": 3.7277832361978546e-05, + "loss": 0.1667, + "step": 22590 + }, + { + "epoch": 0.40293582563407415, + "grad_norm": 0.20552769303321838, + "learning_rate": 3.7276476471104684e-05, + "loss": 0.107, + "step": 22591 + }, + { + "epoch": 0.40295366175578784, + "grad_norm": 0.22179926931858063, + "learning_rate": 3.727512053264297e-05, + "loss": 0.1073, + "step": 22592 + }, + { + "epoch": 0.4029714978775015, + "grad_norm": 0.3287063539028168, + "learning_rate": 3.727376454659863e-05, + "loss": 0.1518, + "step": 22593 + }, + { + "epoch": 0.4029893339992152, + "grad_norm": 0.2026943862438202, + "learning_rate": 3.727240851297695e-05, + "loss": 0.1408, + "step": 22594 + }, + { + "epoch": 0.4030071701209289, + "grad_norm": 0.28636741638183594, + "learning_rate": 3.7271052431783175e-05, + "loss": 0.1427, + "step": 22595 + }, + { + "epoch": 0.4030250062426426, + "grad_norm": 0.24123352766036987, + "learning_rate": 3.726969630302257e-05, + "loss": 0.1148, + "step": 22596 + }, + { + "epoch": 0.4030428423643563, + "grad_norm": 0.24426031112670898, + "learning_rate": 3.726834012670038e-05, + "loss": 0.1018, + "step": 22597 + }, + { + "epoch": 0.40306067848606997, + "grad_norm": 0.27445563673973083, + "learning_rate": 3.726698390282186e-05, + "loss": 0.1468, + "step": 22598 + }, + { + "epoch": 0.4030785146077837, + "grad_norm": 0.23138689994812012, + "learning_rate": 3.726562763139229e-05, + "loss": 0.1022, + "step": 22599 + }, + { + "epoch": 0.4030963507294974, + "grad_norm": 0.2385842204093933, + "learning_rate": 3.726427131241689e-05, + "loss": 0.163, + "step": 22600 + }, + { + "epoch": 0.4031141868512111, + "grad_norm": 0.24416375160217285, + "learning_rate": 3.726291494590095e-05, + "loss": 0.1906, + "step": 22601 + }, + { + "epoch": 0.4031320229729248, + "grad_norm": 0.33560794591903687, + "learning_rate": 3.7261558531849705e-05, + "loss": 0.161, + "step": 22602 + }, + { + "epoch": 0.40314985909463846, + "grad_norm": 0.2666366398334503, + "learning_rate": 3.7260202070268435e-05, + "loss": 0.1432, + "step": 22603 + }, + { + "epoch": 0.40316769521635215, + "grad_norm": 0.24450832605361938, + "learning_rate": 3.725884556116238e-05, + "loss": 0.1532, + "step": 22604 + }, + { + "epoch": 0.40318553133806584, + "grad_norm": 0.23879143595695496, + "learning_rate": 3.7257489004536814e-05, + "loss": 0.1187, + "step": 22605 + }, + { + "epoch": 0.4032033674597795, + "grad_norm": 0.3145284056663513, + "learning_rate": 3.7256132400396985e-05, + "loss": 0.2259, + "step": 22606 + }, + { + "epoch": 0.4032212035814932, + "grad_norm": 0.25536713004112244, + "learning_rate": 3.7254775748748156e-05, + "loss": 0.0876, + "step": 22607 + }, + { + "epoch": 0.40323903970320696, + "grad_norm": 0.3537450432777405, + "learning_rate": 3.725341904959557e-05, + "loss": 0.19, + "step": 22608 + }, + { + "epoch": 0.40325687582492065, + "grad_norm": 0.261639803647995, + "learning_rate": 3.7252062302944516e-05, + "loss": 0.15, + "step": 22609 + }, + { + "epoch": 0.40327471194663433, + "grad_norm": 0.36763739585876465, + "learning_rate": 3.725070550880023e-05, + "loss": 0.1491, + "step": 22610 + }, + { + "epoch": 0.403292548068348, + "grad_norm": 0.2335500717163086, + "learning_rate": 3.724934866716798e-05, + "loss": 0.1406, + "step": 22611 + }, + { + "epoch": 0.4033103841900617, + "grad_norm": 0.28220289945602417, + "learning_rate": 3.7247991778053025e-05, + "loss": 0.1744, + "step": 22612 + }, + { + "epoch": 0.4033282203117754, + "grad_norm": 0.2794845700263977, + "learning_rate": 3.724663484146061e-05, + "loss": 0.1517, + "step": 22613 + }, + { + "epoch": 0.4033460564334891, + "grad_norm": 0.20992477238178253, + "learning_rate": 3.724527785739603e-05, + "loss": 0.1519, + "step": 22614 + }, + { + "epoch": 0.4033638925552028, + "grad_norm": 0.30859726667404175, + "learning_rate": 3.724392082586451e-05, + "loss": 0.1206, + "step": 22615 + }, + { + "epoch": 0.4033817286769165, + "grad_norm": 0.24820156395435333, + "learning_rate": 3.724256374687133e-05, + "loss": 0.1687, + "step": 22616 + }, + { + "epoch": 0.4033995647986302, + "grad_norm": 0.26261571049690247, + "learning_rate": 3.724120662042174e-05, + "loss": 0.1665, + "step": 22617 + }, + { + "epoch": 0.4034174009203439, + "grad_norm": 0.22059425711631775, + "learning_rate": 3.7239849446521004e-05, + "loss": 0.1407, + "step": 22618 + }, + { + "epoch": 0.4034352370420576, + "grad_norm": 0.3970620632171631, + "learning_rate": 3.723849222517438e-05, + "loss": 0.2214, + "step": 22619 + }, + { + "epoch": 0.40345307316377127, + "grad_norm": 0.21623550355434418, + "learning_rate": 3.7237134956387146e-05, + "loss": 0.16, + "step": 22620 + }, + { + "epoch": 0.40347090928548496, + "grad_norm": 0.2906252145767212, + "learning_rate": 3.723577764016454e-05, + "loss": 0.1559, + "step": 22621 + }, + { + "epoch": 0.40348874540719865, + "grad_norm": 0.22615738213062286, + "learning_rate": 3.723442027651184e-05, + "loss": 0.1496, + "step": 22622 + }, + { + "epoch": 0.40350658152891233, + "grad_norm": 0.2039412260055542, + "learning_rate": 3.72330628654343e-05, + "loss": 0.1219, + "step": 22623 + }, + { + "epoch": 0.403524417650626, + "grad_norm": 0.2280758023262024, + "learning_rate": 3.723170540693718e-05, + "loss": 0.1588, + "step": 22624 + }, + { + "epoch": 0.40354225377233977, + "grad_norm": 0.2778186798095703, + "learning_rate": 3.723034790102575e-05, + "loss": 0.2097, + "step": 22625 + }, + { + "epoch": 0.40356008989405345, + "grad_norm": 0.17122596502304077, + "learning_rate": 3.722899034770526e-05, + "loss": 0.1232, + "step": 22626 + }, + { + "epoch": 0.40357792601576714, + "grad_norm": 0.21911244094371796, + "learning_rate": 3.722763274698099e-05, + "loss": 0.1322, + "step": 22627 + }, + { + "epoch": 0.40359576213748083, + "grad_norm": 0.20474018156528473, + "learning_rate": 3.722627509885819e-05, + "loss": 0.1141, + "step": 22628 + }, + { + "epoch": 0.4036135982591945, + "grad_norm": 0.275081604719162, + "learning_rate": 3.722491740334211e-05, + "loss": 0.1423, + "step": 22629 + }, + { + "epoch": 0.4036314343809082, + "grad_norm": 0.19890952110290527, + "learning_rate": 3.722355966043804e-05, + "loss": 0.1377, + "step": 22630 + }, + { + "epoch": 0.4036492705026219, + "grad_norm": 0.23106753826141357, + "learning_rate": 3.7222201870151235e-05, + "loss": 0.1582, + "step": 22631 + }, + { + "epoch": 0.4036671066243356, + "grad_norm": 0.2641909122467041, + "learning_rate": 3.722084403248695e-05, + "loss": 0.1225, + "step": 22632 + }, + { + "epoch": 0.4036849427460493, + "grad_norm": 0.2284470647573471, + "learning_rate": 3.721948614745045e-05, + "loss": 0.1446, + "step": 22633 + }, + { + "epoch": 0.403702778867763, + "grad_norm": 0.2533705234527588, + "learning_rate": 3.7218128215047e-05, + "loss": 0.1513, + "step": 22634 + }, + { + "epoch": 0.4037206149894767, + "grad_norm": 0.23808880150318146, + "learning_rate": 3.721677023528187e-05, + "loss": 0.1489, + "step": 22635 + }, + { + "epoch": 0.4037384511111904, + "grad_norm": 0.30811694264411926, + "learning_rate": 3.7215412208160314e-05, + "loss": 0.1707, + "step": 22636 + }, + { + "epoch": 0.4037562872329041, + "grad_norm": 0.26869192719459534, + "learning_rate": 3.7214054133687604e-05, + "loss": 0.178, + "step": 22637 + }, + { + "epoch": 0.40377412335461776, + "grad_norm": 0.24078595638275146, + "learning_rate": 3.721269601186901e-05, + "loss": 0.1204, + "step": 22638 + }, + { + "epoch": 0.40379195947633145, + "grad_norm": 0.193939208984375, + "learning_rate": 3.7211337842709774e-05, + "loss": 0.1013, + "step": 22639 + }, + { + "epoch": 0.40380979559804514, + "grad_norm": 0.2960437834262848, + "learning_rate": 3.7209979626215185e-05, + "loss": 0.1445, + "step": 22640 + }, + { + "epoch": 0.40382763171975883, + "grad_norm": 0.3177621364593506, + "learning_rate": 3.7208621362390496e-05, + "loss": 0.1796, + "step": 22641 + }, + { + "epoch": 0.40384546784147257, + "grad_norm": 0.3184565007686615, + "learning_rate": 3.7207263051240964e-05, + "loss": 0.1858, + "step": 22642 + }, + { + "epoch": 0.40386330396318626, + "grad_norm": 0.23536665737628937, + "learning_rate": 3.720590469277187e-05, + "loss": 0.1544, + "step": 22643 + }, + { + "epoch": 0.40388114008489995, + "grad_norm": 0.23937270045280457, + "learning_rate": 3.7204546286988476e-05, + "loss": 0.0994, + "step": 22644 + }, + { + "epoch": 0.40389897620661364, + "grad_norm": 0.3052597641944885, + "learning_rate": 3.720318783389605e-05, + "loss": 0.1803, + "step": 22645 + }, + { + "epoch": 0.4039168123283273, + "grad_norm": 0.21923767030239105, + "learning_rate": 3.720182933349984e-05, + "loss": 0.1392, + "step": 22646 + }, + { + "epoch": 0.403934648450041, + "grad_norm": 0.23469729721546173, + "learning_rate": 3.720047078580514e-05, + "loss": 0.1406, + "step": 22647 + }, + { + "epoch": 0.4039524845717547, + "grad_norm": 0.3268749713897705, + "learning_rate": 3.719911219081719e-05, + "loss": 0.1485, + "step": 22648 + }, + { + "epoch": 0.4039703206934684, + "grad_norm": 0.22655069828033447, + "learning_rate": 3.719775354854127e-05, + "loss": 0.154, + "step": 22649 + }, + { + "epoch": 0.40398815681518213, + "grad_norm": 0.3410642445087433, + "learning_rate": 3.719639485898265e-05, + "loss": 0.1521, + "step": 22650 + }, + { + "epoch": 0.4040059929368958, + "grad_norm": 0.3202931582927704, + "learning_rate": 3.719503612214659e-05, + "loss": 0.1568, + "step": 22651 + }, + { + "epoch": 0.4040238290586095, + "grad_norm": 0.5891566872596741, + "learning_rate": 3.7193677338038354e-05, + "loss": 0.2045, + "step": 22652 + }, + { + "epoch": 0.4040416651803232, + "grad_norm": 0.2292635589838028, + "learning_rate": 3.7192318506663215e-05, + "loss": 0.1256, + "step": 22653 + }, + { + "epoch": 0.4040595013020369, + "grad_norm": 0.27007508277893066, + "learning_rate": 3.719095962802643e-05, + "loss": 0.1797, + "step": 22654 + }, + { + "epoch": 0.40407733742375057, + "grad_norm": 0.23769807815551758, + "learning_rate": 3.7189600702133285e-05, + "loss": 0.1794, + "step": 22655 + }, + { + "epoch": 0.40409517354546426, + "grad_norm": 0.2937954366207123, + "learning_rate": 3.7188241728989036e-05, + "loss": 0.1988, + "step": 22656 + }, + { + "epoch": 0.40411300966717795, + "grad_norm": 0.3008861839771271, + "learning_rate": 3.718688270859895e-05, + "loss": 0.1036, + "step": 22657 + }, + { + "epoch": 0.4041308457888917, + "grad_norm": 0.22435985505580902, + "learning_rate": 3.71855236409683e-05, + "loss": 0.156, + "step": 22658 + }, + { + "epoch": 0.4041486819106054, + "grad_norm": 0.2193506509065628, + "learning_rate": 3.718416452610235e-05, + "loss": 0.1555, + "step": 22659 + }, + { + "epoch": 0.40416651803231907, + "grad_norm": 0.2615002989768982, + "learning_rate": 3.718280536400637e-05, + "loss": 0.1678, + "step": 22660 + }, + { + "epoch": 0.40418435415403275, + "grad_norm": 0.30031007528305054, + "learning_rate": 3.7181446154685626e-05, + "loss": 0.1244, + "step": 22661 + }, + { + "epoch": 0.40420219027574644, + "grad_norm": 0.274593323469162, + "learning_rate": 3.71800868981454e-05, + "loss": 0.1272, + "step": 22662 + }, + { + "epoch": 0.40422002639746013, + "grad_norm": 0.2646281123161316, + "learning_rate": 3.717872759439094e-05, + "loss": 0.1586, + "step": 22663 + }, + { + "epoch": 0.4042378625191738, + "grad_norm": 0.289614737033844, + "learning_rate": 3.7177368243427525e-05, + "loss": 0.2013, + "step": 22664 + }, + { + "epoch": 0.4042556986408875, + "grad_norm": 0.2174532264471054, + "learning_rate": 3.7176008845260425e-05, + "loss": 0.1637, + "step": 22665 + }, + { + "epoch": 0.4042735347626012, + "grad_norm": 0.28268003463745117, + "learning_rate": 3.7174649399894916e-05, + "loss": 0.1544, + "step": 22666 + }, + { + "epoch": 0.40429137088431494, + "grad_norm": 0.25396421551704407, + "learning_rate": 3.7173289907336254e-05, + "loss": 0.138, + "step": 22667 + }, + { + "epoch": 0.4043092070060286, + "grad_norm": 0.37442055344581604, + "learning_rate": 3.7171930367589725e-05, + "loss": 0.2028, + "step": 22668 + }, + { + "epoch": 0.4043270431277423, + "grad_norm": 0.3185790181159973, + "learning_rate": 3.717057078066058e-05, + "loss": 0.1605, + "step": 22669 + }, + { + "epoch": 0.404344879249456, + "grad_norm": 0.21763397753238678, + "learning_rate": 3.71692111465541e-05, + "loss": 0.1617, + "step": 22670 + }, + { + "epoch": 0.4043627153711697, + "grad_norm": 0.19118084013462067, + "learning_rate": 3.716785146527556e-05, + "loss": 0.1259, + "step": 22671 + }, + { + "epoch": 0.4043805514928834, + "grad_norm": 0.27597668766975403, + "learning_rate": 3.716649173683022e-05, + "loss": 0.2024, + "step": 22672 + }, + { + "epoch": 0.40439838761459707, + "grad_norm": 0.4114522635936737, + "learning_rate": 3.716513196122336e-05, + "loss": 0.203, + "step": 22673 + }, + { + "epoch": 0.40441622373631075, + "grad_norm": 0.19684277474880219, + "learning_rate": 3.716377213846024e-05, + "loss": 0.168, + "step": 22674 + }, + { + "epoch": 0.4044340598580245, + "grad_norm": 0.19954712688922882, + "learning_rate": 3.7162412268546146e-05, + "loss": 0.1436, + "step": 22675 + }, + { + "epoch": 0.4044518959797382, + "grad_norm": 0.2597973048686981, + "learning_rate": 3.7161052351486345e-05, + "loss": 0.1382, + "step": 22676 + }, + { + "epoch": 0.4044697321014519, + "grad_norm": 0.2797156572341919, + "learning_rate": 3.715969238728609e-05, + "loss": 0.1903, + "step": 22677 + }, + { + "epoch": 0.40448756822316556, + "grad_norm": 0.26486700773239136, + "learning_rate": 3.7158332375950675e-05, + "loss": 0.1179, + "step": 22678 + }, + { + "epoch": 0.40450540434487925, + "grad_norm": 0.26257798075675964, + "learning_rate": 3.715697231748537e-05, + "loss": 0.1942, + "step": 22679 + }, + { + "epoch": 0.40452324046659294, + "grad_norm": 0.36095571517944336, + "learning_rate": 3.7155612211895436e-05, + "loss": 0.173, + "step": 22680 + }, + { + "epoch": 0.4045410765883066, + "grad_norm": 0.2301870882511139, + "learning_rate": 3.715425205918615e-05, + "loss": 0.1472, + "step": 22681 + }, + { + "epoch": 0.4045589127100203, + "grad_norm": 0.2968779504299164, + "learning_rate": 3.715289185936278e-05, + "loss": 0.1955, + "step": 22682 + }, + { + "epoch": 0.404576748831734, + "grad_norm": 0.24576152861118317, + "learning_rate": 3.7151531612430614e-05, + "loss": 0.1133, + "step": 22683 + }, + { + "epoch": 0.40459458495344774, + "grad_norm": 0.23309847712516785, + "learning_rate": 3.715017131839491e-05, + "loss": 0.1597, + "step": 22684 + }, + { + "epoch": 0.40461242107516143, + "grad_norm": 0.25851136445999146, + "learning_rate": 3.714881097726094e-05, + "loss": 0.1135, + "step": 22685 + }, + { + "epoch": 0.4046302571968751, + "grad_norm": 0.2883867025375366, + "learning_rate": 3.7147450589033996e-05, + "loss": 0.1262, + "step": 22686 + }, + { + "epoch": 0.4046480933185888, + "grad_norm": 0.18903659284114838, + "learning_rate": 3.714609015371933e-05, + "loss": 0.1369, + "step": 22687 + }, + { + "epoch": 0.4046659294403025, + "grad_norm": 0.3064503073692322, + "learning_rate": 3.714472967132222e-05, + "loss": 0.2039, + "step": 22688 + }, + { + "epoch": 0.4046837655620162, + "grad_norm": 0.3582994341850281, + "learning_rate": 3.714336914184795e-05, + "loss": 0.1629, + "step": 22689 + }, + { + "epoch": 0.4047016016837299, + "grad_norm": 0.2574089765548706, + "learning_rate": 3.714200856530178e-05, + "loss": 0.189, + "step": 22690 + }, + { + "epoch": 0.40471943780544356, + "grad_norm": 0.24698716402053833, + "learning_rate": 3.714064794168899e-05, + "loss": 0.2003, + "step": 22691 + }, + { + "epoch": 0.4047372739271573, + "grad_norm": 0.2423950731754303, + "learning_rate": 3.713928727101487e-05, + "loss": 0.1076, + "step": 22692 + }, + { + "epoch": 0.404755110048871, + "grad_norm": 0.25401636958122253, + "learning_rate": 3.7137926553284666e-05, + "loss": 0.1938, + "step": 22693 + }, + { + "epoch": 0.4047729461705847, + "grad_norm": 0.2981540262699127, + "learning_rate": 3.713656578850367e-05, + "loss": 0.1992, + "step": 22694 + }, + { + "epoch": 0.40479078229229837, + "grad_norm": 0.4166143238544464, + "learning_rate": 3.7135204976677155e-05, + "loss": 0.1668, + "step": 22695 + }, + { + "epoch": 0.40480861841401206, + "grad_norm": 0.3178115785121918, + "learning_rate": 3.71338441178104e-05, + "loss": 0.2333, + "step": 22696 + }, + { + "epoch": 0.40482645453572574, + "grad_norm": 0.33400195837020874, + "learning_rate": 3.713248321190866e-05, + "loss": 0.1333, + "step": 22697 + }, + { + "epoch": 0.40484429065743943, + "grad_norm": 0.2740047574043274, + "learning_rate": 3.713112225897723e-05, + "loss": 0.1735, + "step": 22698 + }, + { + "epoch": 0.4048621267791531, + "grad_norm": 0.45984068512916565, + "learning_rate": 3.712976125902138e-05, + "loss": 0.1703, + "step": 22699 + }, + { + "epoch": 0.40487996290086686, + "grad_norm": 0.2941805124282837, + "learning_rate": 3.7128400212046386e-05, + "loss": 0.1409, + "step": 22700 + }, + { + "epoch": 0.40489779902258055, + "grad_norm": 0.23208025097846985, + "learning_rate": 3.7127039118057527e-05, + "loss": 0.124, + "step": 22701 + }, + { + "epoch": 0.40491563514429424, + "grad_norm": 0.3174727261066437, + "learning_rate": 3.7125677977060073e-05, + "loss": 0.1275, + "step": 22702 + }, + { + "epoch": 0.4049334712660079, + "grad_norm": 0.22294315695762634, + "learning_rate": 3.712431678905931e-05, + "loss": 0.1823, + "step": 22703 + }, + { + "epoch": 0.4049513073877216, + "grad_norm": 0.2801859974861145, + "learning_rate": 3.71229555540605e-05, + "loss": 0.1846, + "step": 22704 + }, + { + "epoch": 0.4049691435094353, + "grad_norm": 0.28641200065612793, + "learning_rate": 3.7121594272068925e-05, + "loss": 0.2046, + "step": 22705 + }, + { + "epoch": 0.404986979631149, + "grad_norm": 0.2667654752731323, + "learning_rate": 3.712023294308987e-05, + "loss": 0.1561, + "step": 22706 + }, + { + "epoch": 0.4050048157528627, + "grad_norm": 0.2095882147550583, + "learning_rate": 3.711887156712861e-05, + "loss": 0.1581, + "step": 22707 + }, + { + "epoch": 0.40502265187457637, + "grad_norm": 0.2592107951641083, + "learning_rate": 3.7117510144190404e-05, + "loss": 0.2215, + "step": 22708 + }, + { + "epoch": 0.4050404879962901, + "grad_norm": 0.23236940801143646, + "learning_rate": 3.711614867428056e-05, + "loss": 0.1473, + "step": 22709 + }, + { + "epoch": 0.4050583241180038, + "grad_norm": 0.33154410123825073, + "learning_rate": 3.7114787157404326e-05, + "loss": 0.1784, + "step": 22710 + }, + { + "epoch": 0.4050761602397175, + "grad_norm": 0.3170108199119568, + "learning_rate": 3.7113425593566996e-05, + "loss": 0.1955, + "step": 22711 + }, + { + "epoch": 0.4050939963614312, + "grad_norm": 0.2649974822998047, + "learning_rate": 3.711206398277384e-05, + "loss": 0.2132, + "step": 22712 + }, + { + "epoch": 0.40511183248314486, + "grad_norm": 0.2959611117839813, + "learning_rate": 3.7110702325030146e-05, + "loss": 0.1804, + "step": 22713 + }, + { + "epoch": 0.40512966860485855, + "grad_norm": 0.23980401456356049, + "learning_rate": 3.7109340620341184e-05, + "loss": 0.1217, + "step": 22714 + }, + { + "epoch": 0.40514750472657224, + "grad_norm": 0.28079959750175476, + "learning_rate": 3.7107978868712245e-05, + "loss": 0.1303, + "step": 22715 + }, + { + "epoch": 0.4051653408482859, + "grad_norm": 0.2960323393344879, + "learning_rate": 3.710661707014859e-05, + "loss": 0.1852, + "step": 22716 + }, + { + "epoch": 0.40518317696999967, + "grad_norm": 0.3154200315475464, + "learning_rate": 3.710525522465551e-05, + "loss": 0.1264, + "step": 22717 + }, + { + "epoch": 0.40520101309171336, + "grad_norm": 0.339114785194397, + "learning_rate": 3.710389333223827e-05, + "loss": 0.1971, + "step": 22718 + }, + { + "epoch": 0.40521884921342705, + "grad_norm": 0.19106505811214447, + "learning_rate": 3.7102531392902166e-05, + "loss": 0.1039, + "step": 22719 + }, + { + "epoch": 0.40523668533514073, + "grad_norm": 0.22526821494102478, + "learning_rate": 3.710116940665247e-05, + "loss": 0.1519, + "step": 22720 + }, + { + "epoch": 0.4052545214568544, + "grad_norm": 0.31064149737358093, + "learning_rate": 3.709980737349447e-05, + "loss": 0.1642, + "step": 22721 + }, + { + "epoch": 0.4052723575785681, + "grad_norm": 0.267220139503479, + "learning_rate": 3.709844529343342e-05, + "loss": 0.1386, + "step": 22722 + }, + { + "epoch": 0.4052901937002818, + "grad_norm": 0.308649480342865, + "learning_rate": 3.709708316647463e-05, + "loss": 0.1125, + "step": 22723 + }, + { + "epoch": 0.4053080298219955, + "grad_norm": 0.31479600071907043, + "learning_rate": 3.709572099262337e-05, + "loss": 0.2352, + "step": 22724 + }, + { + "epoch": 0.4053258659437092, + "grad_norm": 0.22942212224006653, + "learning_rate": 3.709435877188491e-05, + "loss": 0.1374, + "step": 22725 + }, + { + "epoch": 0.4053437020654229, + "grad_norm": 0.2632072865962982, + "learning_rate": 3.709299650426453e-05, + "loss": 0.1767, + "step": 22726 + }, + { + "epoch": 0.4053615381871366, + "grad_norm": 0.2541314959526062, + "learning_rate": 3.7091634189767536e-05, + "loss": 0.1598, + "step": 22727 + }, + { + "epoch": 0.4053793743088503, + "grad_norm": 0.26471981406211853, + "learning_rate": 3.709027182839918e-05, + "loss": 0.1741, + "step": 22728 + }, + { + "epoch": 0.405397210430564, + "grad_norm": 0.41050460934638977, + "learning_rate": 3.7088909420164765e-05, + "loss": 0.1866, + "step": 22729 + }, + { + "epoch": 0.40541504655227767, + "grad_norm": 0.2278328686952591, + "learning_rate": 3.7087546965069556e-05, + "loss": 0.1522, + "step": 22730 + }, + { + "epoch": 0.40543288267399136, + "grad_norm": 0.33635637164115906, + "learning_rate": 3.7086184463118835e-05, + "loss": 0.1241, + "step": 22731 + }, + { + "epoch": 0.40545071879570505, + "grad_norm": 0.24367345869541168, + "learning_rate": 3.708482191431789e-05, + "loss": 0.1341, + "step": 22732 + }, + { + "epoch": 0.40546855491741873, + "grad_norm": 0.25286567211151123, + "learning_rate": 3.7083459318672e-05, + "loss": 0.1526, + "step": 22733 + }, + { + "epoch": 0.4054863910391325, + "grad_norm": 0.38565927743911743, + "learning_rate": 3.7082096676186454e-05, + "loss": 0.1896, + "step": 22734 + }, + { + "epoch": 0.40550422716084616, + "grad_norm": 0.2287045568227768, + "learning_rate": 3.7080733986866536e-05, + "loss": 0.164, + "step": 22735 + }, + { + "epoch": 0.40552206328255985, + "grad_norm": 0.28129565715789795, + "learning_rate": 3.707937125071751e-05, + "loss": 0.1483, + "step": 22736 + }, + { + "epoch": 0.40553989940427354, + "grad_norm": 0.32759547233581543, + "learning_rate": 3.7078008467744667e-05, + "loss": 0.1556, + "step": 22737 + }, + { + "epoch": 0.40555773552598723, + "grad_norm": 0.2574281692504883, + "learning_rate": 3.707664563795329e-05, + "loss": 0.1828, + "step": 22738 + }, + { + "epoch": 0.4055755716477009, + "grad_norm": 0.31409069895744324, + "learning_rate": 3.707528276134867e-05, + "loss": 0.1447, + "step": 22739 + }, + { + "epoch": 0.4055934077694146, + "grad_norm": 0.22808100283145905, + "learning_rate": 3.707391983793608e-05, + "loss": 0.1805, + "step": 22740 + }, + { + "epoch": 0.4056112438911283, + "grad_norm": 0.270231157541275, + "learning_rate": 3.7072556867720807e-05, + "loss": 0.0756, + "step": 22741 + }, + { + "epoch": 0.405629080012842, + "grad_norm": 0.2477722018957138, + "learning_rate": 3.707119385070814e-05, + "loss": 0.1491, + "step": 22742 + }, + { + "epoch": 0.4056469161345557, + "grad_norm": 0.32718339562416077, + "learning_rate": 3.706983078690335e-05, + "loss": 0.1341, + "step": 22743 + }, + { + "epoch": 0.4056647522562694, + "grad_norm": 0.284368634223938, + "learning_rate": 3.706846767631173e-05, + "loss": 0.1365, + "step": 22744 + }, + { + "epoch": 0.4056825883779831, + "grad_norm": 0.3289968967437744, + "learning_rate": 3.706710451893856e-05, + "loss": 0.1584, + "step": 22745 + }, + { + "epoch": 0.4057004244996968, + "grad_norm": 0.21433857083320618, + "learning_rate": 3.706574131478912e-05, + "loss": 0.1298, + "step": 22746 + }, + { + "epoch": 0.4057182606214105, + "grad_norm": 0.2692579925060272, + "learning_rate": 3.706437806386871e-05, + "loss": 0.1765, + "step": 22747 + }, + { + "epoch": 0.40573609674312416, + "grad_norm": 0.30741509795188904, + "learning_rate": 3.7063014766182594e-05, + "loss": 0.1785, + "step": 22748 + }, + { + "epoch": 0.40575393286483785, + "grad_norm": 0.2657647728919983, + "learning_rate": 3.706165142173607e-05, + "loss": 0.1793, + "step": 22749 + }, + { + "epoch": 0.40577176898655154, + "grad_norm": 0.24137602746486664, + "learning_rate": 3.7060288030534415e-05, + "loss": 0.1785, + "step": 22750 + }, + { + "epoch": 0.4057896051082653, + "grad_norm": 0.3676193654537201, + "learning_rate": 3.705892459258292e-05, + "loss": 0.1525, + "step": 22751 + }, + { + "epoch": 0.40580744122997897, + "grad_norm": 0.27030256390571594, + "learning_rate": 3.7057561107886874e-05, + "loss": 0.0998, + "step": 22752 + }, + { + "epoch": 0.40582527735169266, + "grad_norm": 0.23731249570846558, + "learning_rate": 3.705619757645155e-05, + "loss": 0.152, + "step": 22753 + }, + { + "epoch": 0.40584311347340635, + "grad_norm": 0.24279236793518066, + "learning_rate": 3.705483399828225e-05, + "loss": 0.166, + "step": 22754 + }, + { + "epoch": 0.40586094959512004, + "grad_norm": 0.30513203144073486, + "learning_rate": 3.7053470373384244e-05, + "loss": 0.1613, + "step": 22755 + }, + { + "epoch": 0.4058787857168337, + "grad_norm": 0.29539796710014343, + "learning_rate": 3.705210670176282e-05, + "loss": 0.2068, + "step": 22756 + }, + { + "epoch": 0.4058966218385474, + "grad_norm": 0.41609761118888855, + "learning_rate": 3.705074298342327e-05, + "loss": 0.1339, + "step": 22757 + }, + { + "epoch": 0.4059144579602611, + "grad_norm": 0.2749961018562317, + "learning_rate": 3.7049379218370875e-05, + "loss": 0.1435, + "step": 22758 + }, + { + "epoch": 0.40593229408197484, + "grad_norm": 0.3201879858970642, + "learning_rate": 3.704801540661093e-05, + "loss": 0.1655, + "step": 22759 + }, + { + "epoch": 0.40595013020368853, + "grad_norm": 0.3555706739425659, + "learning_rate": 3.704665154814872e-05, + "loss": 0.152, + "step": 22760 + }, + { + "epoch": 0.4059679663254022, + "grad_norm": 0.2815662622451782, + "learning_rate": 3.7045287642989514e-05, + "loss": 0.1365, + "step": 22761 + }, + { + "epoch": 0.4059858024471159, + "grad_norm": 0.21779082715511322, + "learning_rate": 3.7043923691138616e-05, + "loss": 0.1426, + "step": 22762 + }, + { + "epoch": 0.4060036385688296, + "grad_norm": 0.3736729919910431, + "learning_rate": 3.704255969260132e-05, + "loss": 0.1295, + "step": 22763 + }, + { + "epoch": 0.4060214746905433, + "grad_norm": 0.2549952268600464, + "learning_rate": 3.704119564738289e-05, + "loss": 0.1632, + "step": 22764 + }, + { + "epoch": 0.40603931081225697, + "grad_norm": 0.24689269065856934, + "learning_rate": 3.703983155548864e-05, + "loss": 0.1586, + "step": 22765 + }, + { + "epoch": 0.40605714693397066, + "grad_norm": 0.26894739270210266, + "learning_rate": 3.703846741692384e-05, + "loss": 0.1448, + "step": 22766 + }, + { + "epoch": 0.40607498305568435, + "grad_norm": 0.3928578495979309, + "learning_rate": 3.7037103231693774e-05, + "loss": 0.1562, + "step": 22767 + }, + { + "epoch": 0.4060928191773981, + "grad_norm": 0.2933739721775055, + "learning_rate": 3.703573899980375e-05, + "loss": 0.1183, + "step": 22768 + }, + { + "epoch": 0.4061106552991118, + "grad_norm": 0.3074215352535248, + "learning_rate": 3.703437472125903e-05, + "loss": 0.2131, + "step": 22769 + }, + { + "epoch": 0.40612849142082547, + "grad_norm": 0.3377894163131714, + "learning_rate": 3.703301039606494e-05, + "loss": 0.1517, + "step": 22770 + }, + { + "epoch": 0.40614632754253915, + "grad_norm": 0.2866095006465912, + "learning_rate": 3.703164602422673e-05, + "loss": 0.1279, + "step": 22771 + }, + { + "epoch": 0.40616416366425284, + "grad_norm": 0.21049438416957855, + "learning_rate": 3.703028160574971e-05, + "loss": 0.1233, + "step": 22772 + }, + { + "epoch": 0.40618199978596653, + "grad_norm": 0.291842520236969, + "learning_rate": 3.7028917140639155e-05, + "loss": 0.1708, + "step": 22773 + }, + { + "epoch": 0.4061998359076802, + "grad_norm": 0.25131165981292725, + "learning_rate": 3.702755262890037e-05, + "loss": 0.1282, + "step": 22774 + }, + { + "epoch": 0.4062176720293939, + "grad_norm": 0.3336566984653473, + "learning_rate": 3.702618807053863e-05, + "loss": 0.2221, + "step": 22775 + }, + { + "epoch": 0.40623550815110765, + "grad_norm": 0.28126558661460876, + "learning_rate": 3.702482346555924e-05, + "loss": 0.1793, + "step": 22776 + }, + { + "epoch": 0.40625334427282134, + "grad_norm": 0.2520708441734314, + "learning_rate": 3.702345881396748e-05, + "loss": 0.1435, + "step": 22777 + }, + { + "epoch": 0.406271180394535, + "grad_norm": 0.25496798753738403, + "learning_rate": 3.702209411576864e-05, + "loss": 0.1467, + "step": 22778 + }, + { + "epoch": 0.4062890165162487, + "grad_norm": 0.2044905573129654, + "learning_rate": 3.7020729370968e-05, + "loss": 0.1474, + "step": 22779 + }, + { + "epoch": 0.4063068526379624, + "grad_norm": 0.2566578984260559, + "learning_rate": 3.701936457957088e-05, + "loss": 0.1363, + "step": 22780 + }, + { + "epoch": 0.4063246887596761, + "grad_norm": 0.32850053906440735, + "learning_rate": 3.701799974158254e-05, + "loss": 0.1741, + "step": 22781 + }, + { + "epoch": 0.4063425248813898, + "grad_norm": 0.2941726744174957, + "learning_rate": 3.701663485700828e-05, + "loss": 0.1643, + "step": 22782 + }, + { + "epoch": 0.40636036100310347, + "grad_norm": 0.24518142640590668, + "learning_rate": 3.7015269925853395e-05, + "loss": 0.158, + "step": 22783 + }, + { + "epoch": 0.40637819712481715, + "grad_norm": 0.29638490080833435, + "learning_rate": 3.701390494812317e-05, + "loss": 0.1633, + "step": 22784 + }, + { + "epoch": 0.4063960332465309, + "grad_norm": 0.3184903860092163, + "learning_rate": 3.701253992382291e-05, + "loss": 0.1989, + "step": 22785 + }, + { + "epoch": 0.4064138693682446, + "grad_norm": 0.3627992868423462, + "learning_rate": 3.701117485295789e-05, + "loss": 0.199, + "step": 22786 + }, + { + "epoch": 0.4064317054899583, + "grad_norm": 0.22520382702350616, + "learning_rate": 3.700980973553342e-05, + "loss": 0.1781, + "step": 22787 + }, + { + "epoch": 0.40644954161167196, + "grad_norm": 0.229277566075325, + "learning_rate": 3.700844457155476e-05, + "loss": 0.1458, + "step": 22788 + }, + { + "epoch": 0.40646737773338565, + "grad_norm": 0.3314931094646454, + "learning_rate": 3.700707936102723e-05, + "loss": 0.2539, + "step": 22789 + }, + { + "epoch": 0.40648521385509934, + "grad_norm": 0.26296812295913696, + "learning_rate": 3.700571410395611e-05, + "loss": 0.1387, + "step": 22790 + }, + { + "epoch": 0.406503049976813, + "grad_norm": 0.22750885784626007, + "learning_rate": 3.7004348800346706e-05, + "loss": 0.1406, + "step": 22791 + }, + { + "epoch": 0.4065208860985267, + "grad_norm": 0.23684147000312805, + "learning_rate": 3.7002983450204284e-05, + "loss": 0.1776, + "step": 22792 + }, + { + "epoch": 0.40653872222024046, + "grad_norm": 0.3230094015598297, + "learning_rate": 3.7001618053534174e-05, + "loss": 0.2182, + "step": 22793 + }, + { + "epoch": 0.40655655834195414, + "grad_norm": 0.22826208174228668, + "learning_rate": 3.7000252610341626e-05, + "loss": 0.157, + "step": 22794 + }, + { + "epoch": 0.40657439446366783, + "grad_norm": 0.33501726388931274, + "learning_rate": 3.6998887120631966e-05, + "loss": 0.2059, + "step": 22795 + }, + { + "epoch": 0.4065922305853815, + "grad_norm": 0.3803720474243164, + "learning_rate": 3.6997521584410474e-05, + "loss": 0.1252, + "step": 22796 + }, + { + "epoch": 0.4066100667070952, + "grad_norm": 0.2851491868495941, + "learning_rate": 3.6996156001682434e-05, + "loss": 0.1252, + "step": 22797 + }, + { + "epoch": 0.4066279028288089, + "grad_norm": 0.2280944436788559, + "learning_rate": 3.6994790372453167e-05, + "loss": 0.079, + "step": 22798 + }, + { + "epoch": 0.4066457389505226, + "grad_norm": 0.29659906029701233, + "learning_rate": 3.6993424696727936e-05, + "loss": 0.1364, + "step": 22799 + }, + { + "epoch": 0.40666357507223627, + "grad_norm": 0.332767128944397, + "learning_rate": 3.6992058974512056e-05, + "loss": 0.1704, + "step": 22800 + }, + { + "epoch": 0.40668141119394996, + "grad_norm": 0.20667962729930878, + "learning_rate": 3.699069320581081e-05, + "loss": 0.1619, + "step": 22801 + }, + { + "epoch": 0.4066992473156637, + "grad_norm": 0.22946684062480927, + "learning_rate": 3.6989327390629493e-05, + "loss": 0.1786, + "step": 22802 + }, + { + "epoch": 0.4067170834373774, + "grad_norm": 0.5766844153404236, + "learning_rate": 3.69879615289734e-05, + "loss": 0.1444, + "step": 22803 + }, + { + "epoch": 0.4067349195590911, + "grad_norm": 0.34237435460090637, + "learning_rate": 3.6986595620847844e-05, + "loss": 0.2386, + "step": 22804 + }, + { + "epoch": 0.40675275568080477, + "grad_norm": 0.31693196296691895, + "learning_rate": 3.6985229666258084e-05, + "loss": 0.1838, + "step": 22805 + }, + { + "epoch": 0.40677059180251846, + "grad_norm": 0.2959299683570862, + "learning_rate": 3.698386366520945e-05, + "loss": 0.1283, + "step": 22806 + }, + { + "epoch": 0.40678842792423214, + "grad_norm": 0.25631704926490784, + "learning_rate": 3.698249761770721e-05, + "loss": 0.1265, + "step": 22807 + }, + { + "epoch": 0.40680626404594583, + "grad_norm": 0.2766932547092438, + "learning_rate": 3.698113152375668e-05, + "loss": 0.1467, + "step": 22808 + }, + { + "epoch": 0.4068241001676595, + "grad_norm": 0.20347727835178375, + "learning_rate": 3.697976538336313e-05, + "loss": 0.1529, + "step": 22809 + }, + { + "epoch": 0.40684193628937326, + "grad_norm": 0.25040000677108765, + "learning_rate": 3.697839919653189e-05, + "loss": 0.1548, + "step": 22810 + }, + { + "epoch": 0.40685977241108695, + "grad_norm": 0.29970845580101013, + "learning_rate": 3.697703296326823e-05, + "loss": 0.1867, + "step": 22811 + }, + { + "epoch": 0.40687760853280064, + "grad_norm": 0.27633410692214966, + "learning_rate": 3.6975666683577455e-05, + "loss": 0.1319, + "step": 22812 + }, + { + "epoch": 0.4068954446545143, + "grad_norm": 0.2531062066555023, + "learning_rate": 3.697430035746486e-05, + "loss": 0.159, + "step": 22813 + }, + { + "epoch": 0.406913280776228, + "grad_norm": 0.25076374411582947, + "learning_rate": 3.697293398493573e-05, + "loss": 0.112, + "step": 22814 + }, + { + "epoch": 0.4069311168979417, + "grad_norm": 0.3027176260948181, + "learning_rate": 3.697156756599539e-05, + "loss": 0.1585, + "step": 22815 + }, + { + "epoch": 0.4069489530196554, + "grad_norm": 0.24157872796058655, + "learning_rate": 3.6970201100649113e-05, + "loss": 0.1431, + "step": 22816 + }, + { + "epoch": 0.4069667891413691, + "grad_norm": 0.2958551347255707, + "learning_rate": 3.696883458890219e-05, + "loss": 0.1662, + "step": 22817 + }, + { + "epoch": 0.4069846252630828, + "grad_norm": 0.22539092600345612, + "learning_rate": 3.696746803075994e-05, + "loss": 0.1386, + "step": 22818 + }, + { + "epoch": 0.4070024613847965, + "grad_norm": 0.30554768443107605, + "learning_rate": 3.696610142622766e-05, + "loss": 0.1777, + "step": 22819 + }, + { + "epoch": 0.4070202975065102, + "grad_norm": 0.3243509531021118, + "learning_rate": 3.696473477531063e-05, + "loss": 0.1447, + "step": 22820 + }, + { + "epoch": 0.4070381336282239, + "grad_norm": 0.3667590618133545, + "learning_rate": 3.696336807801415e-05, + "loss": 0.1316, + "step": 22821 + }, + { + "epoch": 0.4070559697499376, + "grad_norm": 0.2549302577972412, + "learning_rate": 3.696200133434353e-05, + "loss": 0.1701, + "step": 22822 + }, + { + "epoch": 0.40707380587165126, + "grad_norm": 0.30883529782295227, + "learning_rate": 3.696063454430405e-05, + "loss": 0.2499, + "step": 22823 + }, + { + "epoch": 0.40709164199336495, + "grad_norm": 0.21412034332752228, + "learning_rate": 3.6959267707901037e-05, + "loss": 0.1952, + "step": 22824 + }, + { + "epoch": 0.40710947811507864, + "grad_norm": 0.34255868196487427, + "learning_rate": 3.6957900825139755e-05, + "loss": 0.1154, + "step": 22825 + }, + { + "epoch": 0.4071273142367923, + "grad_norm": 0.2662195563316345, + "learning_rate": 3.695653389602554e-05, + "loss": 0.1909, + "step": 22826 + }, + { + "epoch": 0.40714515035850607, + "grad_norm": 0.24291059374809265, + "learning_rate": 3.695516692056365e-05, + "loss": 0.1453, + "step": 22827 + }, + { + "epoch": 0.40716298648021976, + "grad_norm": 0.34842705726623535, + "learning_rate": 3.695379989875941e-05, + "loss": 0.114, + "step": 22828 + }, + { + "epoch": 0.40718082260193345, + "grad_norm": 0.2803455591201782, + "learning_rate": 3.6952432830618116e-05, + "loss": 0.175, + "step": 22829 + }, + { + "epoch": 0.40719865872364713, + "grad_norm": 0.21294955909252167, + "learning_rate": 3.695106571614506e-05, + "loss": 0.1777, + "step": 22830 + }, + { + "epoch": 0.4072164948453608, + "grad_norm": 0.2907697558403015, + "learning_rate": 3.694969855534555e-05, + "loss": 0.1726, + "step": 22831 + }, + { + "epoch": 0.4072343309670745, + "grad_norm": 0.19814598560333252, + "learning_rate": 3.694833134822487e-05, + "loss": 0.1285, + "step": 22832 + }, + { + "epoch": 0.4072521670887882, + "grad_norm": 0.3283410966396332, + "learning_rate": 3.694696409478835e-05, + "loss": 0.1104, + "step": 22833 + }, + { + "epoch": 0.4072700032105019, + "grad_norm": 0.2641567289829254, + "learning_rate": 3.694559679504126e-05, + "loss": 0.1668, + "step": 22834 + }, + { + "epoch": 0.40728783933221563, + "grad_norm": 0.2554190754890442, + "learning_rate": 3.6944229448988906e-05, + "loss": 0.1376, + "step": 22835 + }, + { + "epoch": 0.4073056754539293, + "grad_norm": 0.2924638092517853, + "learning_rate": 3.69428620566366e-05, + "loss": 0.1946, + "step": 22836 + }, + { + "epoch": 0.407323511575643, + "grad_norm": 0.3419770300388336, + "learning_rate": 3.6941494617989637e-05, + "loss": 0.1287, + "step": 22837 + }, + { + "epoch": 0.4073413476973567, + "grad_norm": 0.25817233324050903, + "learning_rate": 3.694012713305331e-05, + "loss": 0.1644, + "step": 22838 + }, + { + "epoch": 0.4073591838190704, + "grad_norm": 0.247476264834404, + "learning_rate": 3.6938759601832936e-05, + "loss": 0.1568, + "step": 22839 + }, + { + "epoch": 0.40737701994078407, + "grad_norm": 0.23786133527755737, + "learning_rate": 3.6937392024333794e-05, + "loss": 0.1393, + "step": 22840 + }, + { + "epoch": 0.40739485606249776, + "grad_norm": 0.2678735554218292, + "learning_rate": 3.693602440056121e-05, + "loss": 0.1358, + "step": 22841 + }, + { + "epoch": 0.40741269218421144, + "grad_norm": 0.2773587703704834, + "learning_rate": 3.693465673052046e-05, + "loss": 0.154, + "step": 22842 + }, + { + "epoch": 0.40743052830592513, + "grad_norm": 0.24629567563533783, + "learning_rate": 3.693328901421687e-05, + "loss": 0.1371, + "step": 22843 + }, + { + "epoch": 0.4074483644276389, + "grad_norm": 0.3306807577610016, + "learning_rate": 3.693192125165572e-05, + "loss": 0.2294, + "step": 22844 + }, + { + "epoch": 0.40746620054935256, + "grad_norm": 0.23849107325077057, + "learning_rate": 3.693055344284233e-05, + "loss": 0.1161, + "step": 22845 + }, + { + "epoch": 0.40748403667106625, + "grad_norm": 0.24857784807682037, + "learning_rate": 3.6929185587782e-05, + "loss": 0.1906, + "step": 22846 + }, + { + "epoch": 0.40750187279277994, + "grad_norm": 0.26840338110923767, + "learning_rate": 3.692781768648001e-05, + "loss": 0.1723, + "step": 22847 + }, + { + "epoch": 0.40751970891449363, + "grad_norm": 0.2885158360004425, + "learning_rate": 3.692644973894169e-05, + "loss": 0.1466, + "step": 22848 + }, + { + "epoch": 0.4075375450362073, + "grad_norm": 0.21937188506126404, + "learning_rate": 3.692508174517233e-05, + "loss": 0.1374, + "step": 22849 + }, + { + "epoch": 0.407555381157921, + "grad_norm": 0.25022321939468384, + "learning_rate": 3.6923713705177225e-05, + "loss": 0.1894, + "step": 22850 + }, + { + "epoch": 0.4075732172796347, + "grad_norm": 0.3140817880630493, + "learning_rate": 3.6922345618961696e-05, + "loss": 0.1306, + "step": 22851 + }, + { + "epoch": 0.40759105340134844, + "grad_norm": 0.26757583022117615, + "learning_rate": 3.692097748653104e-05, + "loss": 0.1612, + "step": 22852 + }, + { + "epoch": 0.4076088895230621, + "grad_norm": 0.2838507890701294, + "learning_rate": 3.691960930789055e-05, + "loss": 0.1888, + "step": 22853 + }, + { + "epoch": 0.4076267256447758, + "grad_norm": 0.3510291278362274, + "learning_rate": 3.691824108304554e-05, + "loss": 0.2161, + "step": 22854 + }, + { + "epoch": 0.4076445617664895, + "grad_norm": 0.2649875581264496, + "learning_rate": 3.691687281200132e-05, + "loss": 0.1486, + "step": 22855 + }, + { + "epoch": 0.4076623978882032, + "grad_norm": 0.28027260303497314, + "learning_rate": 3.691550449476318e-05, + "loss": 0.1737, + "step": 22856 + }, + { + "epoch": 0.4076802340099169, + "grad_norm": 0.2549683749675751, + "learning_rate": 3.691413613133643e-05, + "loss": 0.142, + "step": 22857 + }, + { + "epoch": 0.40769807013163056, + "grad_norm": 0.24638888239860535, + "learning_rate": 3.691276772172636e-05, + "loss": 0.1233, + "step": 22858 + }, + { + "epoch": 0.40771590625334425, + "grad_norm": 0.27769047021865845, + "learning_rate": 3.6911399265938304e-05, + "loss": 0.1757, + "step": 22859 + }, + { + "epoch": 0.407733742375058, + "grad_norm": 0.3439522087574005, + "learning_rate": 3.6910030763977544e-05, + "loss": 0.1086, + "step": 22860 + }, + { + "epoch": 0.4077515784967717, + "grad_norm": 0.27945300936698914, + "learning_rate": 3.6908662215849396e-05, + "loss": 0.1605, + "step": 22861 + }, + { + "epoch": 0.40776941461848537, + "grad_norm": 0.32378441095352173, + "learning_rate": 3.690729362155915e-05, + "loss": 0.1786, + "step": 22862 + }, + { + "epoch": 0.40778725074019906, + "grad_norm": 0.3368411362171173, + "learning_rate": 3.690592498111213e-05, + "loss": 0.176, + "step": 22863 + }, + { + "epoch": 0.40780508686191275, + "grad_norm": 0.32447549700737, + "learning_rate": 3.690455629451363e-05, + "loss": 0.2008, + "step": 22864 + }, + { + "epoch": 0.40782292298362643, + "grad_norm": 0.203013613820076, + "learning_rate": 3.690318756176896e-05, + "loss": 0.1404, + "step": 22865 + }, + { + "epoch": 0.4078407591053401, + "grad_norm": 0.21278530359268188, + "learning_rate": 3.690181878288342e-05, + "loss": 0.1353, + "step": 22866 + }, + { + "epoch": 0.4078585952270538, + "grad_norm": 0.2698383927345276, + "learning_rate": 3.690044995786232e-05, + "loss": 0.1536, + "step": 22867 + }, + { + "epoch": 0.4078764313487675, + "grad_norm": 0.2509118914604187, + "learning_rate": 3.689908108671096e-05, + "loss": 0.1366, + "step": 22868 + }, + { + "epoch": 0.40789426747048124, + "grad_norm": 0.2102491706609726, + "learning_rate": 3.689771216943467e-05, + "loss": 0.125, + "step": 22869 + }, + { + "epoch": 0.40791210359219493, + "grad_norm": 0.20959524810314178, + "learning_rate": 3.6896343206038723e-05, + "loss": 0.1241, + "step": 22870 + }, + { + "epoch": 0.4079299397139086, + "grad_norm": 0.3055448830127716, + "learning_rate": 3.689497419652844e-05, + "loss": 0.1598, + "step": 22871 + }, + { + "epoch": 0.4079477758356223, + "grad_norm": 0.21058553457260132, + "learning_rate": 3.689360514090914e-05, + "loss": 0.1333, + "step": 22872 + }, + { + "epoch": 0.407965611957336, + "grad_norm": 0.2746939957141876, + "learning_rate": 3.689223603918611e-05, + "loss": 0.1359, + "step": 22873 + }, + { + "epoch": 0.4079834480790497, + "grad_norm": 0.30057674646377563, + "learning_rate": 3.689086689136467e-05, + "loss": 0.1467, + "step": 22874 + }, + { + "epoch": 0.40800128420076337, + "grad_norm": 0.22446024417877197, + "learning_rate": 3.688949769745012e-05, + "loss": 0.1608, + "step": 22875 + }, + { + "epoch": 0.40801912032247706, + "grad_norm": 0.2605692446231842, + "learning_rate": 3.688812845744777e-05, + "loss": 0.1865, + "step": 22876 + }, + { + "epoch": 0.4080369564441908, + "grad_norm": 0.27201569080352783, + "learning_rate": 3.688675917136293e-05, + "loss": 0.1935, + "step": 22877 + }, + { + "epoch": 0.4080547925659045, + "grad_norm": 0.3429532051086426, + "learning_rate": 3.688538983920091e-05, + "loss": 0.1557, + "step": 22878 + }, + { + "epoch": 0.4080726286876182, + "grad_norm": 0.2994801998138428, + "learning_rate": 3.688402046096701e-05, + "loss": 0.1154, + "step": 22879 + }, + { + "epoch": 0.40809046480933187, + "grad_norm": 0.19963397085666656, + "learning_rate": 3.6882651036666536e-05, + "loss": 0.1519, + "step": 22880 + }, + { + "epoch": 0.40810830093104555, + "grad_norm": 0.20837277173995972, + "learning_rate": 3.6881281566304805e-05, + "loss": 0.1615, + "step": 22881 + }, + { + "epoch": 0.40812613705275924, + "grad_norm": 0.2550000846385956, + "learning_rate": 3.6879912049887136e-05, + "loss": 0.1547, + "step": 22882 + }, + { + "epoch": 0.40814397317447293, + "grad_norm": 0.25186896324157715, + "learning_rate": 3.687854248741881e-05, + "loss": 0.1807, + "step": 22883 + }, + { + "epoch": 0.4081618092961866, + "grad_norm": 0.2719725966453552, + "learning_rate": 3.6877172878905154e-05, + "loss": 0.1886, + "step": 22884 + }, + { + "epoch": 0.4081796454179003, + "grad_norm": 0.18179181218147278, + "learning_rate": 3.6875803224351474e-05, + "loss": 0.1265, + "step": 22885 + }, + { + "epoch": 0.40819748153961405, + "grad_norm": 0.2512252628803253, + "learning_rate": 3.687443352376308e-05, + "loss": 0.1527, + "step": 22886 + }, + { + "epoch": 0.40821531766132774, + "grad_norm": 0.2676926255226135, + "learning_rate": 3.687306377714528e-05, + "loss": 0.1621, + "step": 22887 + }, + { + "epoch": 0.4082331537830414, + "grad_norm": 0.2575880289077759, + "learning_rate": 3.687169398450339e-05, + "loss": 0.1568, + "step": 22888 + }, + { + "epoch": 0.4082509899047551, + "grad_norm": 0.24720615148544312, + "learning_rate": 3.6870324145842706e-05, + "loss": 0.1283, + "step": 22889 + }, + { + "epoch": 0.4082688260264688, + "grad_norm": 0.2541947364807129, + "learning_rate": 3.6868954261168545e-05, + "loss": 0.1459, + "step": 22890 + }, + { + "epoch": 0.4082866621481825, + "grad_norm": 0.3343372046947479, + "learning_rate": 3.6867584330486224e-05, + "loss": 0.1461, + "step": 22891 + }, + { + "epoch": 0.4083044982698962, + "grad_norm": 0.20460598170757294, + "learning_rate": 3.6866214353801035e-05, + "loss": 0.1391, + "step": 22892 + }, + { + "epoch": 0.40832233439160986, + "grad_norm": 0.2759687304496765, + "learning_rate": 3.686484433111831e-05, + "loss": 0.1548, + "step": 22893 + }, + { + "epoch": 0.4083401705133236, + "grad_norm": 0.2569333612918854, + "learning_rate": 3.6863474262443346e-05, + "loss": 0.1292, + "step": 22894 + }, + { + "epoch": 0.4083580066350373, + "grad_norm": 0.3087644577026367, + "learning_rate": 3.686210414778147e-05, + "loss": 0.1503, + "step": 22895 + }, + { + "epoch": 0.408375842756751, + "grad_norm": 0.3194817006587982, + "learning_rate": 3.686073398713797e-05, + "loss": 0.108, + "step": 22896 + }, + { + "epoch": 0.40839367887846467, + "grad_norm": 0.27774205803871155, + "learning_rate": 3.6859363780518174e-05, + "loss": 0.1881, + "step": 22897 + }, + { + "epoch": 0.40841151500017836, + "grad_norm": 0.31939876079559326, + "learning_rate": 3.685799352792738e-05, + "loss": 0.1594, + "step": 22898 + }, + { + "epoch": 0.40842935112189205, + "grad_norm": 0.22911010682582855, + "learning_rate": 3.685662322937091e-05, + "loss": 0.1313, + "step": 22899 + }, + { + "epoch": 0.40844718724360574, + "grad_norm": 0.3676689565181732, + "learning_rate": 3.685525288485409e-05, + "loss": 0.2567, + "step": 22900 + }, + { + "epoch": 0.4084650233653194, + "grad_norm": 0.2931276559829712, + "learning_rate": 3.68538824943822e-05, + "loss": 0.1375, + "step": 22901 + }, + { + "epoch": 0.4084828594870331, + "grad_norm": 0.29382312297821045, + "learning_rate": 3.685251205796057e-05, + "loss": 0.1802, + "step": 22902 + }, + { + "epoch": 0.40850069560874686, + "grad_norm": 0.2815816104412079, + "learning_rate": 3.68511415755945e-05, + "loss": 0.1887, + "step": 22903 + }, + { + "epoch": 0.40851853173046054, + "grad_norm": 0.3132527470588684, + "learning_rate": 3.684977104728933e-05, + "loss": 0.136, + "step": 22904 + }, + { + "epoch": 0.40853636785217423, + "grad_norm": 0.2781032919883728, + "learning_rate": 3.6848400473050335e-05, + "loss": 0.1566, + "step": 22905 + }, + { + "epoch": 0.4085542039738879, + "grad_norm": 0.28257012367248535, + "learning_rate": 3.6847029852882856e-05, + "loss": 0.1563, + "step": 22906 + }, + { + "epoch": 0.4085720400956016, + "grad_norm": 0.26520785689353943, + "learning_rate": 3.68456591867922e-05, + "loss": 0.1533, + "step": 22907 + }, + { + "epoch": 0.4085898762173153, + "grad_norm": 0.25135770440101624, + "learning_rate": 3.684428847478368e-05, + "loss": 0.1309, + "step": 22908 + }, + { + "epoch": 0.408607712339029, + "grad_norm": 0.2853555381298065, + "learning_rate": 3.6842917716862603e-05, + "loss": 0.1605, + "step": 22909 + }, + { + "epoch": 0.40862554846074267, + "grad_norm": 0.2451864629983902, + "learning_rate": 3.6841546913034285e-05, + "loss": 0.1475, + "step": 22910 + }, + { + "epoch": 0.4086433845824564, + "grad_norm": 0.21090559661388397, + "learning_rate": 3.6840176063304045e-05, + "loss": 0.1332, + "step": 22911 + }, + { + "epoch": 0.4086612207041701, + "grad_norm": 0.24229754507541656, + "learning_rate": 3.683880516767719e-05, + "loss": 0.162, + "step": 22912 + }, + { + "epoch": 0.4086790568258838, + "grad_norm": 0.2401350885629654, + "learning_rate": 3.683743422615904e-05, + "loss": 0.1528, + "step": 22913 + }, + { + "epoch": 0.4086968929475975, + "grad_norm": 0.3371230959892273, + "learning_rate": 3.68360632387549e-05, + "loss": 0.0936, + "step": 22914 + }, + { + "epoch": 0.40871472906931117, + "grad_norm": 0.26828733086586, + "learning_rate": 3.68346922054701e-05, + "loss": 0.1132, + "step": 22915 + }, + { + "epoch": 0.40873256519102485, + "grad_norm": 0.22380150854587555, + "learning_rate": 3.683332112630994e-05, + "loss": 0.1305, + "step": 22916 + }, + { + "epoch": 0.40875040131273854, + "grad_norm": 0.246516615152359, + "learning_rate": 3.6831950001279744e-05, + "loss": 0.1678, + "step": 22917 + }, + { + "epoch": 0.40876823743445223, + "grad_norm": 0.30184081196784973, + "learning_rate": 3.683057883038482e-05, + "loss": 0.1148, + "step": 22918 + }, + { + "epoch": 0.408786073556166, + "grad_norm": 0.313192754983902, + "learning_rate": 3.6829207613630487e-05, + "loss": 0.2113, + "step": 22919 + }, + { + "epoch": 0.40880390967787966, + "grad_norm": 0.18925026059150696, + "learning_rate": 3.682783635102206e-05, + "loss": 0.1216, + "step": 22920 + }, + { + "epoch": 0.40882174579959335, + "grad_norm": 0.2137240469455719, + "learning_rate": 3.682646504256485e-05, + "loss": 0.1111, + "step": 22921 + }, + { + "epoch": 0.40883958192130704, + "grad_norm": 0.24453464150428772, + "learning_rate": 3.682509368826418e-05, + "loss": 0.1571, + "step": 22922 + }, + { + "epoch": 0.4088574180430207, + "grad_norm": 0.24433757364749908, + "learning_rate": 3.682372228812536e-05, + "loss": 0.1246, + "step": 22923 + }, + { + "epoch": 0.4088752541647344, + "grad_norm": 0.31137651205062866, + "learning_rate": 3.682235084215371e-05, + "loss": 0.19, + "step": 22924 + }, + { + "epoch": 0.4088930902864481, + "grad_norm": 0.2959708571434021, + "learning_rate": 3.682097935035456e-05, + "loss": 0.1315, + "step": 22925 + }, + { + "epoch": 0.4089109264081618, + "grad_norm": 0.2607278823852539, + "learning_rate": 3.681960781273319e-05, + "loss": 0.1382, + "step": 22926 + }, + { + "epoch": 0.4089287625298755, + "grad_norm": 0.2750498950481415, + "learning_rate": 3.681823622929495e-05, + "loss": 0.1502, + "step": 22927 + }, + { + "epoch": 0.4089465986515892, + "grad_norm": 0.23453126847743988, + "learning_rate": 3.681686460004514e-05, + "loss": 0.1275, + "step": 22928 + }, + { + "epoch": 0.4089644347733029, + "grad_norm": 0.2705915868282318, + "learning_rate": 3.6815492924989074e-05, + "loss": 0.1703, + "step": 22929 + }, + { + "epoch": 0.4089822708950166, + "grad_norm": 0.24737876653671265, + "learning_rate": 3.681412120413209e-05, + "loss": 0.1157, + "step": 22930 + }, + { + "epoch": 0.4090001070167303, + "grad_norm": 0.22455322742462158, + "learning_rate": 3.681274943747948e-05, + "loss": 0.1527, + "step": 22931 + }, + { + "epoch": 0.409017943138444, + "grad_norm": 0.27392327785491943, + "learning_rate": 3.681137762503658e-05, + "loss": 0.2148, + "step": 22932 + }, + { + "epoch": 0.40903577926015766, + "grad_norm": 0.215500608086586, + "learning_rate": 3.68100057668087e-05, + "loss": 0.1086, + "step": 22933 + }, + { + "epoch": 0.40905361538187135, + "grad_norm": 0.2355826497077942, + "learning_rate": 3.680863386280116e-05, + "loss": 0.1362, + "step": 22934 + }, + { + "epoch": 0.40907145150358504, + "grad_norm": 0.2417983114719391, + "learning_rate": 3.680726191301927e-05, + "loss": 0.131, + "step": 22935 + }, + { + "epoch": 0.4090892876252988, + "grad_norm": 0.4128175675868988, + "learning_rate": 3.680588991746836e-05, + "loss": 0.1872, + "step": 22936 + }, + { + "epoch": 0.40910712374701247, + "grad_norm": 0.2485012263059616, + "learning_rate": 3.6804517876153746e-05, + "loss": 0.1843, + "step": 22937 + }, + { + "epoch": 0.40912495986872616, + "grad_norm": 0.3083796799182892, + "learning_rate": 3.680314578908074e-05, + "loss": 0.2065, + "step": 22938 + }, + { + "epoch": 0.40914279599043984, + "grad_norm": 0.2656121850013733, + "learning_rate": 3.680177365625467e-05, + "loss": 0.1171, + "step": 22939 + }, + { + "epoch": 0.40916063211215353, + "grad_norm": 0.2742651104927063, + "learning_rate": 3.680040147768083e-05, + "loss": 0.1892, + "step": 22940 + }, + { + "epoch": 0.4091784682338672, + "grad_norm": 0.2656703591346741, + "learning_rate": 3.6799029253364574e-05, + "loss": 0.1681, + "step": 22941 + }, + { + "epoch": 0.4091963043555809, + "grad_norm": 0.2989206314086914, + "learning_rate": 3.67976569833112e-05, + "loss": 0.1628, + "step": 22942 + }, + { + "epoch": 0.4092141404772946, + "grad_norm": 0.3289611339569092, + "learning_rate": 3.679628466752604e-05, + "loss": 0.1893, + "step": 22943 + }, + { + "epoch": 0.4092319765990083, + "grad_norm": 0.2592921555042267, + "learning_rate": 3.67949123060144e-05, + "loss": 0.1531, + "step": 22944 + }, + { + "epoch": 0.40924981272072203, + "grad_norm": 0.2873404920101166, + "learning_rate": 3.679353989878162e-05, + "loss": 0.1485, + "step": 22945 + }, + { + "epoch": 0.4092676488424357, + "grad_norm": 0.3295323848724365, + "learning_rate": 3.6792167445832986e-05, + "loss": 0.1779, + "step": 22946 + }, + { + "epoch": 0.4092854849641494, + "grad_norm": 0.2260526716709137, + "learning_rate": 3.679079494717385e-05, + "loss": 0.1482, + "step": 22947 + }, + { + "epoch": 0.4093033210858631, + "grad_norm": 0.31729191541671753, + "learning_rate": 3.678942240280951e-05, + "loss": 0.1624, + "step": 22948 + }, + { + "epoch": 0.4093211572075768, + "grad_norm": 0.2952575981616974, + "learning_rate": 3.6788049812745306e-05, + "loss": 0.1089, + "step": 22949 + }, + { + "epoch": 0.40933899332929047, + "grad_norm": 0.3142538070678711, + "learning_rate": 3.678667717698655e-05, + "loss": 0.1406, + "step": 22950 + }, + { + "epoch": 0.40935682945100416, + "grad_norm": 0.2574765086174011, + "learning_rate": 3.678530449553855e-05, + "loss": 0.1495, + "step": 22951 + }, + { + "epoch": 0.40937466557271784, + "grad_norm": 0.4022500514984131, + "learning_rate": 3.678393176840666e-05, + "loss": 0.1521, + "step": 22952 + }, + { + "epoch": 0.4093925016944316, + "grad_norm": 0.36074599623680115, + "learning_rate": 3.6782558995596164e-05, + "loss": 0.125, + "step": 22953 + }, + { + "epoch": 0.4094103378161453, + "grad_norm": 0.2353619486093521, + "learning_rate": 3.67811861771124e-05, + "loss": 0.1912, + "step": 22954 + }, + { + "epoch": 0.40942817393785896, + "grad_norm": 0.2922440469264984, + "learning_rate": 3.6779813312960695e-05, + "loss": 0.135, + "step": 22955 + }, + { + "epoch": 0.40944601005957265, + "grad_norm": 0.2769284248352051, + "learning_rate": 3.6778440403146364e-05, + "loss": 0.161, + "step": 22956 + }, + { + "epoch": 0.40946384618128634, + "grad_norm": 0.32437753677368164, + "learning_rate": 3.677706744767473e-05, + "loss": 0.179, + "step": 22957 + }, + { + "epoch": 0.409481682303, + "grad_norm": 0.2687344551086426, + "learning_rate": 3.677569444655112e-05, + "loss": 0.1093, + "step": 22958 + }, + { + "epoch": 0.4094995184247137, + "grad_norm": 0.21496063470840454, + "learning_rate": 3.6774321399780844e-05, + "loss": 0.0968, + "step": 22959 + }, + { + "epoch": 0.4095173545464274, + "grad_norm": 0.3070719838142395, + "learning_rate": 3.677294830736923e-05, + "loss": 0.1865, + "step": 22960 + }, + { + "epoch": 0.40953519066814115, + "grad_norm": 0.22498443722724915, + "learning_rate": 3.6771575169321605e-05, + "loss": 0.1586, + "step": 22961 + }, + { + "epoch": 0.40955302678985483, + "grad_norm": 0.28707748651504517, + "learning_rate": 3.677020198564329e-05, + "loss": 0.1418, + "step": 22962 + }, + { + "epoch": 0.4095708629115685, + "grad_norm": 0.2691110670566559, + "learning_rate": 3.67688287563396e-05, + "loss": 0.1701, + "step": 22963 + }, + { + "epoch": 0.4095886990332822, + "grad_norm": 0.3171987235546112, + "learning_rate": 3.676745548141587e-05, + "loss": 0.1601, + "step": 22964 + }, + { + "epoch": 0.4096065351549959, + "grad_norm": 0.2266453355550766, + "learning_rate": 3.676608216087743e-05, + "loss": 0.1482, + "step": 22965 + }, + { + "epoch": 0.4096243712767096, + "grad_norm": 0.2855393886566162, + "learning_rate": 3.676470879472958e-05, + "loss": 0.2021, + "step": 22966 + }, + { + "epoch": 0.4096422073984233, + "grad_norm": 0.25436756014823914, + "learning_rate": 3.6763335382977656e-05, + "loss": 0.1806, + "step": 22967 + }, + { + "epoch": 0.40966004352013696, + "grad_norm": 0.2488749623298645, + "learning_rate": 3.676196192562698e-05, + "loss": 0.1786, + "step": 22968 + }, + { + "epoch": 0.40967787964185065, + "grad_norm": 0.41944360733032227, + "learning_rate": 3.6760588422682873e-05, + "loss": 0.1389, + "step": 22969 + }, + { + "epoch": 0.4096957157635644, + "grad_norm": 0.35119929909706116, + "learning_rate": 3.675921487415067e-05, + "loss": 0.1583, + "step": 22970 + }, + { + "epoch": 0.4097135518852781, + "grad_norm": 0.2227325141429901, + "learning_rate": 3.675784128003569e-05, + "loss": 0.1045, + "step": 22971 + }, + { + "epoch": 0.40973138800699177, + "grad_norm": 0.18357279896736145, + "learning_rate": 3.675646764034325e-05, + "loss": 0.1106, + "step": 22972 + }, + { + "epoch": 0.40974922412870546, + "grad_norm": 0.2682918906211853, + "learning_rate": 3.675509395507869e-05, + "loss": 0.1603, + "step": 22973 + }, + { + "epoch": 0.40976706025041915, + "grad_norm": 0.27669504284858704, + "learning_rate": 3.675372022424732e-05, + "loss": 0.1426, + "step": 22974 + }, + { + "epoch": 0.40978489637213283, + "grad_norm": 0.4094705879688263, + "learning_rate": 3.675234644785447e-05, + "loss": 0.1884, + "step": 22975 + }, + { + "epoch": 0.4098027324938465, + "grad_norm": 0.26464247703552246, + "learning_rate": 3.675097262590546e-05, + "loss": 0.1872, + "step": 22976 + }, + { + "epoch": 0.4098205686155602, + "grad_norm": 0.3582855463027954, + "learning_rate": 3.674959875840563e-05, + "loss": 0.171, + "step": 22977 + }, + { + "epoch": 0.40983840473727395, + "grad_norm": 0.23134541511535645, + "learning_rate": 3.67482248453603e-05, + "loss": 0.1355, + "step": 22978 + }, + { + "epoch": 0.40985624085898764, + "grad_norm": 0.2304838001728058, + "learning_rate": 3.6746850886774787e-05, + "loss": 0.1363, + "step": 22979 + }, + { + "epoch": 0.40987407698070133, + "grad_norm": 0.3144586682319641, + "learning_rate": 3.674547688265443e-05, + "loss": 0.1209, + "step": 22980 + }, + { + "epoch": 0.409891913102415, + "grad_norm": 0.3483356833457947, + "learning_rate": 3.6744102833004535e-05, + "loss": 0.209, + "step": 22981 + }, + { + "epoch": 0.4099097492241287, + "grad_norm": 0.22373300790786743, + "learning_rate": 3.6742728737830444e-05, + "loss": 0.1677, + "step": 22982 + }, + { + "epoch": 0.4099275853458424, + "grad_norm": 0.4133744239807129, + "learning_rate": 3.674135459713749e-05, + "loss": 0.1615, + "step": 22983 + }, + { + "epoch": 0.4099454214675561, + "grad_norm": 0.23916204273700714, + "learning_rate": 3.673998041093098e-05, + "loss": 0.177, + "step": 22984 + }, + { + "epoch": 0.40996325758926977, + "grad_norm": 0.2945150136947632, + "learning_rate": 3.673860617921626e-05, + "loss": 0.1786, + "step": 22985 + }, + { + "epoch": 0.40998109371098346, + "grad_norm": 0.38855665922164917, + "learning_rate": 3.673723190199864e-05, + "loss": 0.1564, + "step": 22986 + }, + { + "epoch": 0.4099989298326972, + "grad_norm": 0.2788127064704895, + "learning_rate": 3.673585757928346e-05, + "loss": 0.1819, + "step": 22987 + }, + { + "epoch": 0.4100167659544109, + "grad_norm": 0.29421499371528625, + "learning_rate": 3.673448321107604e-05, + "loss": 0.1944, + "step": 22988 + }, + { + "epoch": 0.4100346020761246, + "grad_norm": 0.2578313946723938, + "learning_rate": 3.6733108797381717e-05, + "loss": 0.1665, + "step": 22989 + }, + { + "epoch": 0.41005243819783826, + "grad_norm": 0.3013758361339569, + "learning_rate": 3.67317343382058e-05, + "loss": 0.204, + "step": 22990 + }, + { + "epoch": 0.41007027431955195, + "grad_norm": 0.26153460144996643, + "learning_rate": 3.673035983355364e-05, + "loss": 0.148, + "step": 22991 + }, + { + "epoch": 0.41008811044126564, + "grad_norm": 0.24209193885326385, + "learning_rate": 3.672898528343055e-05, + "loss": 0.1643, + "step": 22992 + }, + { + "epoch": 0.41010594656297933, + "grad_norm": 0.26765844225883484, + "learning_rate": 3.6727610687841864e-05, + "loss": 0.1333, + "step": 22993 + }, + { + "epoch": 0.410123782684693, + "grad_norm": 0.2752847671508789, + "learning_rate": 3.6726236046792905e-05, + "loss": 0.1427, + "step": 22994 + }, + { + "epoch": 0.41014161880640676, + "grad_norm": 0.38417741656303406, + "learning_rate": 3.6724861360289006e-05, + "loss": 0.184, + "step": 22995 + }, + { + "epoch": 0.41015945492812045, + "grad_norm": 0.2892318069934845, + "learning_rate": 3.672348662833549e-05, + "loss": 0.1804, + "step": 22996 + }, + { + "epoch": 0.41017729104983414, + "grad_norm": 0.3207838833332062, + "learning_rate": 3.67221118509377e-05, + "loss": 0.1366, + "step": 22997 + }, + { + "epoch": 0.4101951271715478, + "grad_norm": 0.3486321270465851, + "learning_rate": 3.6720737028100945e-05, + "loss": 0.1211, + "step": 22998 + }, + { + "epoch": 0.4102129632932615, + "grad_norm": 0.24128076434135437, + "learning_rate": 3.671936215983057e-05, + "loss": 0.1517, + "step": 22999 + }, + { + "epoch": 0.4102307994149752, + "grad_norm": 0.28812161087989807, + "learning_rate": 3.67179872461319e-05, + "loss": 0.1395, + "step": 23000 + }, + { + "epoch": 0.4102307994149752, + "eval_loss": 0.15161660313606262, + "eval_runtime": 106.9424, + "eval_samples_per_second": 9.575, + "eval_steps_per_second": 1.599, + "step": 23000 + }, + { + "epoch": 0.4102486355366889, + "grad_norm": 0.32620468735694885, + "learning_rate": 3.671661228701026e-05, + "loss": 0.1579, + "step": 23001 + }, + { + "epoch": 0.4102664716584026, + "grad_norm": 0.2858385145664215, + "learning_rate": 3.671523728247099e-05, + "loss": 0.1558, + "step": 23002 + }, + { + "epoch": 0.41028430778011626, + "grad_norm": 0.3071691393852234, + "learning_rate": 3.671386223251941e-05, + "loss": 0.1659, + "step": 23003 + }, + { + "epoch": 0.41030214390183, + "grad_norm": 0.2689734995365143, + "learning_rate": 3.671248713716087e-05, + "loss": 0.1537, + "step": 23004 + }, + { + "epoch": 0.4103199800235437, + "grad_norm": 0.3195585310459137, + "learning_rate": 3.671111199640066e-05, + "loss": 0.1728, + "step": 23005 + }, + { + "epoch": 0.4103378161452574, + "grad_norm": 0.30270084738731384, + "learning_rate": 3.6709736810244155e-05, + "loss": 0.2003, + "step": 23006 + }, + { + "epoch": 0.41035565226697107, + "grad_norm": 0.2977372109889984, + "learning_rate": 3.6708361578696646e-05, + "loss": 0.1528, + "step": 23007 + }, + { + "epoch": 0.41037348838868476, + "grad_norm": 0.3400401771068573, + "learning_rate": 3.6706986301763505e-05, + "loss": 0.1747, + "step": 23008 + }, + { + "epoch": 0.41039132451039845, + "grad_norm": 0.22327066957950592, + "learning_rate": 3.6705610979450025e-05, + "loss": 0.133, + "step": 23009 + }, + { + "epoch": 0.41040916063211214, + "grad_norm": 0.26148056983947754, + "learning_rate": 3.670423561176156e-05, + "loss": 0.2047, + "step": 23010 + }, + { + "epoch": 0.4104269967538258, + "grad_norm": 0.28364816308021545, + "learning_rate": 3.6702860198703434e-05, + "loss": 0.1444, + "step": 23011 + }, + { + "epoch": 0.41044483287553957, + "grad_norm": 0.3011294901371002, + "learning_rate": 3.670148474028098e-05, + "loss": 0.1727, + "step": 23012 + }, + { + "epoch": 0.41046266899725325, + "grad_norm": 0.3662911057472229, + "learning_rate": 3.670010923649953e-05, + "loss": 0.1682, + "step": 23013 + }, + { + "epoch": 0.41048050511896694, + "grad_norm": 0.2082054167985916, + "learning_rate": 3.6698733687364416e-05, + "loss": 0.174, + "step": 23014 + }, + { + "epoch": 0.41049834124068063, + "grad_norm": 0.21559834480285645, + "learning_rate": 3.6697358092880975e-05, + "loss": 0.1026, + "step": 23015 + }, + { + "epoch": 0.4105161773623943, + "grad_norm": 0.2405346781015396, + "learning_rate": 3.6695982453054525e-05, + "loss": 0.1623, + "step": 23016 + }, + { + "epoch": 0.410534013484108, + "grad_norm": 0.27425476908683777, + "learning_rate": 3.669460676789041e-05, + "loss": 0.1723, + "step": 23017 + }, + { + "epoch": 0.4105518496058217, + "grad_norm": 0.40821516513824463, + "learning_rate": 3.669323103739396e-05, + "loss": 0.1977, + "step": 23018 + }, + { + "epoch": 0.4105696857275354, + "grad_norm": 0.22954507172107697, + "learning_rate": 3.6691855261570514e-05, + "loss": 0.0947, + "step": 23019 + }, + { + "epoch": 0.4105875218492491, + "grad_norm": 0.16959792375564575, + "learning_rate": 3.669047944042539e-05, + "loss": 0.1385, + "step": 23020 + }, + { + "epoch": 0.4106053579709628, + "grad_norm": 0.232644721865654, + "learning_rate": 3.6689103573963934e-05, + "loss": 0.1362, + "step": 23021 + }, + { + "epoch": 0.4106231940926765, + "grad_norm": 0.22902779281139374, + "learning_rate": 3.668772766219147e-05, + "loss": 0.1593, + "step": 23022 + }, + { + "epoch": 0.4106410302143902, + "grad_norm": 0.2516598403453827, + "learning_rate": 3.668635170511335e-05, + "loss": 0.1457, + "step": 23023 + }, + { + "epoch": 0.4106588663361039, + "grad_norm": 0.25940564274787903, + "learning_rate": 3.668497570273488e-05, + "loss": 0.1651, + "step": 23024 + }, + { + "epoch": 0.41067670245781757, + "grad_norm": 0.2819032669067383, + "learning_rate": 3.668359965506142e-05, + "loss": 0.0931, + "step": 23025 + }, + { + "epoch": 0.41069453857953125, + "grad_norm": 0.3397025465965271, + "learning_rate": 3.6682223562098276e-05, + "loss": 0.1775, + "step": 23026 + }, + { + "epoch": 0.41071237470124494, + "grad_norm": 0.3517206609249115, + "learning_rate": 3.668084742385082e-05, + "loss": 0.1667, + "step": 23027 + }, + { + "epoch": 0.41073021082295863, + "grad_norm": 0.2156389206647873, + "learning_rate": 3.667947124032435e-05, + "loss": 0.1696, + "step": 23028 + }, + { + "epoch": 0.4107480469446724, + "grad_norm": 0.22551634907722473, + "learning_rate": 3.667809501152422e-05, + "loss": 0.1408, + "step": 23029 + }, + { + "epoch": 0.41076588306638606, + "grad_norm": 0.18486660718917847, + "learning_rate": 3.6676718737455754e-05, + "loss": 0.1278, + "step": 23030 + }, + { + "epoch": 0.41078371918809975, + "grad_norm": 0.21407462656497955, + "learning_rate": 3.66753424181243e-05, + "loss": 0.1371, + "step": 23031 + }, + { + "epoch": 0.41080155530981344, + "grad_norm": 0.22909492254257202, + "learning_rate": 3.6673966053535195e-05, + "loss": 0.109, + "step": 23032 + }, + { + "epoch": 0.4108193914315271, + "grad_norm": 0.22414657473564148, + "learning_rate": 3.6672589643693745e-05, + "loss": 0.1692, + "step": 23033 + }, + { + "epoch": 0.4108372275532408, + "grad_norm": 0.2653462588787079, + "learning_rate": 3.667121318860533e-05, + "loss": 0.1522, + "step": 23034 + }, + { + "epoch": 0.4108550636749545, + "grad_norm": 0.21764078736305237, + "learning_rate": 3.666983668827524e-05, + "loss": 0.1827, + "step": 23035 + }, + { + "epoch": 0.4108728997966682, + "grad_norm": 0.23941759765148163, + "learning_rate": 3.6668460142708836e-05, + "loss": 0.152, + "step": 23036 + }, + { + "epoch": 0.41089073591838193, + "grad_norm": 0.23800028860569, + "learning_rate": 3.6667083551911456e-05, + "loss": 0.1532, + "step": 23037 + }, + { + "epoch": 0.4109085720400956, + "grad_norm": 0.27145278453826904, + "learning_rate": 3.666570691588843e-05, + "loss": 0.2035, + "step": 23038 + }, + { + "epoch": 0.4109264081618093, + "grad_norm": 0.26834502816200256, + "learning_rate": 3.66643302346451e-05, + "loss": 0.1562, + "step": 23039 + }, + { + "epoch": 0.410944244283523, + "grad_norm": 0.19137997925281525, + "learning_rate": 3.6662953508186784e-05, + "loss": 0.1506, + "step": 23040 + }, + { + "epoch": 0.4109620804052367, + "grad_norm": 0.2708612084388733, + "learning_rate": 3.666157673651884e-05, + "loss": 0.1828, + "step": 23041 + }, + { + "epoch": 0.4109799165269504, + "grad_norm": 0.23833531141281128, + "learning_rate": 3.66601999196466e-05, + "loss": 0.171, + "step": 23042 + }, + { + "epoch": 0.41099775264866406, + "grad_norm": 0.26716694235801697, + "learning_rate": 3.6658823057575396e-05, + "loss": 0.1659, + "step": 23043 + }, + { + "epoch": 0.41101558877037775, + "grad_norm": 0.36645907163619995, + "learning_rate": 3.665744615031056e-05, + "loss": 0.1967, + "step": 23044 + }, + { + "epoch": 0.41103342489209144, + "grad_norm": 0.27841904759407043, + "learning_rate": 3.665606919785745e-05, + "loss": 0.1551, + "step": 23045 + }, + { + "epoch": 0.4110512610138052, + "grad_norm": 0.30871209502220154, + "learning_rate": 3.6654692200221375e-05, + "loss": 0.1321, + "step": 23046 + }, + { + "epoch": 0.41106909713551887, + "grad_norm": 0.24763981997966766, + "learning_rate": 3.6653315157407695e-05, + "loss": 0.1544, + "step": 23047 + }, + { + "epoch": 0.41108693325723256, + "grad_norm": 0.30146706104278564, + "learning_rate": 3.665193806942174e-05, + "loss": 0.1308, + "step": 23048 + }, + { + "epoch": 0.41110476937894624, + "grad_norm": 0.21637827157974243, + "learning_rate": 3.6650560936268846e-05, + "loss": 0.1042, + "step": 23049 + }, + { + "epoch": 0.41112260550065993, + "grad_norm": 0.31351467967033386, + "learning_rate": 3.6649183757954356e-05, + "loss": 0.1394, + "step": 23050 + }, + { + "epoch": 0.4111404416223736, + "grad_norm": 0.4078234136104584, + "learning_rate": 3.6647806534483605e-05, + "loss": 0.178, + "step": 23051 + }, + { + "epoch": 0.4111582777440873, + "grad_norm": 0.2947288751602173, + "learning_rate": 3.664642926586194e-05, + "loss": 0.0667, + "step": 23052 + }, + { + "epoch": 0.411176113865801, + "grad_norm": 0.23695625364780426, + "learning_rate": 3.664505195209468e-05, + "loss": 0.1649, + "step": 23053 + }, + { + "epoch": 0.41119394998751474, + "grad_norm": 0.4286949932575226, + "learning_rate": 3.664367459318718e-05, + "loss": 0.2361, + "step": 23054 + }, + { + "epoch": 0.4112117861092284, + "grad_norm": 0.22863931953907013, + "learning_rate": 3.664229718914478e-05, + "loss": 0.1325, + "step": 23055 + }, + { + "epoch": 0.4112296222309421, + "grad_norm": 0.25124645233154297, + "learning_rate": 3.6640919739972816e-05, + "loss": 0.1665, + "step": 23056 + }, + { + "epoch": 0.4112474583526558, + "grad_norm": 0.3131903111934662, + "learning_rate": 3.663954224567663e-05, + "loss": 0.1398, + "step": 23057 + }, + { + "epoch": 0.4112652944743695, + "grad_norm": 0.20030651986598969, + "learning_rate": 3.6638164706261546e-05, + "loss": 0.1409, + "step": 23058 + }, + { + "epoch": 0.4112831305960832, + "grad_norm": 0.20581752061843872, + "learning_rate": 3.663678712173292e-05, + "loss": 0.1607, + "step": 23059 + }, + { + "epoch": 0.41130096671779687, + "grad_norm": 0.29224833846092224, + "learning_rate": 3.663540949209609e-05, + "loss": 0.1544, + "step": 23060 + }, + { + "epoch": 0.41131880283951056, + "grad_norm": 0.27331843972206116, + "learning_rate": 3.6634031817356396e-05, + "loss": 0.1381, + "step": 23061 + }, + { + "epoch": 0.4113366389612243, + "grad_norm": 0.2171776443719864, + "learning_rate": 3.6632654097519174e-05, + "loss": 0.1022, + "step": 23062 + }, + { + "epoch": 0.411354475082938, + "grad_norm": 0.32310447096824646, + "learning_rate": 3.663127633258977e-05, + "loss": 0.1817, + "step": 23063 + }, + { + "epoch": 0.4113723112046517, + "grad_norm": 0.37830615043640137, + "learning_rate": 3.662989852257352e-05, + "loss": 0.2586, + "step": 23064 + }, + { + "epoch": 0.41139014732636536, + "grad_norm": 0.21193359792232513, + "learning_rate": 3.6628520667475764e-05, + "loss": 0.1735, + "step": 23065 + }, + { + "epoch": 0.41140798344807905, + "grad_norm": 0.2421891689300537, + "learning_rate": 3.6627142767301846e-05, + "loss": 0.1211, + "step": 23066 + }, + { + "epoch": 0.41142581956979274, + "grad_norm": 0.22349536418914795, + "learning_rate": 3.662576482205711e-05, + "loss": 0.1645, + "step": 23067 + }, + { + "epoch": 0.4114436556915064, + "grad_norm": 0.28665441274642944, + "learning_rate": 3.662438683174689e-05, + "loss": 0.1604, + "step": 23068 + }, + { + "epoch": 0.4114614918132201, + "grad_norm": 0.28855791687965393, + "learning_rate": 3.6623008796376535e-05, + "loss": 0.1947, + "step": 23069 + }, + { + "epoch": 0.4114793279349338, + "grad_norm": 0.22060483694076538, + "learning_rate": 3.6621630715951384e-05, + "loss": 0.1095, + "step": 23070 + }, + { + "epoch": 0.41149716405664755, + "grad_norm": 0.2542547881603241, + "learning_rate": 3.662025259047678e-05, + "loss": 0.1516, + "step": 23071 + }, + { + "epoch": 0.41151500017836123, + "grad_norm": 0.23093442618846893, + "learning_rate": 3.661887441995805e-05, + "loss": 0.1256, + "step": 23072 + }, + { + "epoch": 0.4115328363000749, + "grad_norm": 0.2967522144317627, + "learning_rate": 3.6617496204400565e-05, + "loss": 0.1652, + "step": 23073 + }, + { + "epoch": 0.4115506724217886, + "grad_norm": 0.3545559048652649, + "learning_rate": 3.661611794380965e-05, + "loss": 0.1559, + "step": 23074 + }, + { + "epoch": 0.4115685085435023, + "grad_norm": 0.3534194529056549, + "learning_rate": 3.6614739638190645e-05, + "loss": 0.1678, + "step": 23075 + }, + { + "epoch": 0.411586344665216, + "grad_norm": 0.26916420459747314, + "learning_rate": 3.66133612875489e-05, + "loss": 0.1224, + "step": 23076 + }, + { + "epoch": 0.4116041807869297, + "grad_norm": 0.27614539861679077, + "learning_rate": 3.661198289188975e-05, + "loss": 0.19, + "step": 23077 + }, + { + "epoch": 0.41162201690864336, + "grad_norm": 0.284241646528244, + "learning_rate": 3.661060445121855e-05, + "loss": 0.2069, + "step": 23078 + }, + { + "epoch": 0.4116398530303571, + "grad_norm": 0.30562517046928406, + "learning_rate": 3.660922596554064e-05, + "loss": 0.093, + "step": 23079 + }, + { + "epoch": 0.4116576891520708, + "grad_norm": 0.22723637521266937, + "learning_rate": 3.660784743486135e-05, + "loss": 0.1213, + "step": 23080 + }, + { + "epoch": 0.4116755252737845, + "grad_norm": 0.3314789831638336, + "learning_rate": 3.6606468859186035e-05, + "loss": 0.1536, + "step": 23081 + }, + { + "epoch": 0.41169336139549817, + "grad_norm": 0.20812579989433289, + "learning_rate": 3.6605090238520045e-05, + "loss": 0.1718, + "step": 23082 + }, + { + "epoch": 0.41171119751721186, + "grad_norm": 0.3055664598941803, + "learning_rate": 3.660371157286872e-05, + "loss": 0.1303, + "step": 23083 + }, + { + "epoch": 0.41172903363892555, + "grad_norm": 0.24226170778274536, + "learning_rate": 3.66023328622374e-05, + "loss": 0.1091, + "step": 23084 + }, + { + "epoch": 0.41174686976063923, + "grad_norm": 0.38672134280204773, + "learning_rate": 3.6600954106631424e-05, + "loss": 0.1637, + "step": 23085 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 0.20913872122764587, + "learning_rate": 3.6599575306056144e-05, + "loss": 0.121, + "step": 23086 + }, + { + "epoch": 0.4117825420040666, + "grad_norm": 0.3569769561290741, + "learning_rate": 3.65981964605169e-05, + "loss": 0.1184, + "step": 23087 + }, + { + "epoch": 0.41180037812578035, + "grad_norm": 0.23577100038528442, + "learning_rate": 3.659681757001905e-05, + "loss": 0.1646, + "step": 23088 + }, + { + "epoch": 0.41181821424749404, + "grad_norm": 0.27256104350090027, + "learning_rate": 3.659543863456792e-05, + "loss": 0.1639, + "step": 23089 + }, + { + "epoch": 0.41183605036920773, + "grad_norm": 0.38175252079963684, + "learning_rate": 3.6594059654168875e-05, + "loss": 0.1361, + "step": 23090 + }, + { + "epoch": 0.4118538864909214, + "grad_norm": 0.2671360969543457, + "learning_rate": 3.659268062882725e-05, + "loss": 0.1818, + "step": 23091 + }, + { + "epoch": 0.4118717226126351, + "grad_norm": 0.3260008990764618, + "learning_rate": 3.6591301558548385e-05, + "loss": 0.1362, + "step": 23092 + }, + { + "epoch": 0.4118895587343488, + "grad_norm": 0.244276225566864, + "learning_rate": 3.6589922443337634e-05, + "loss": 0.1144, + "step": 23093 + }, + { + "epoch": 0.4119073948560625, + "grad_norm": 0.2617519795894623, + "learning_rate": 3.658854328320034e-05, + "loss": 0.1569, + "step": 23094 + }, + { + "epoch": 0.41192523097777617, + "grad_norm": 0.3298129439353943, + "learning_rate": 3.658716407814184e-05, + "loss": 0.1951, + "step": 23095 + }, + { + "epoch": 0.4119430670994899, + "grad_norm": 0.2517102062702179, + "learning_rate": 3.6585784828167505e-05, + "loss": 0.2047, + "step": 23096 + }, + { + "epoch": 0.4119609032212036, + "grad_norm": 0.27042925357818604, + "learning_rate": 3.658440553328267e-05, + "loss": 0.1797, + "step": 23097 + }, + { + "epoch": 0.4119787393429173, + "grad_norm": 0.2952825129032135, + "learning_rate": 3.6583026193492666e-05, + "loss": 0.1819, + "step": 23098 + }, + { + "epoch": 0.411996575464631, + "grad_norm": 0.2329859882593155, + "learning_rate": 3.6581646808802855e-05, + "loss": 0.1427, + "step": 23099 + }, + { + "epoch": 0.41201441158634466, + "grad_norm": 0.23942644894123077, + "learning_rate": 3.658026737921858e-05, + "loss": 0.147, + "step": 23100 + }, + { + "epoch": 0.41203224770805835, + "grad_norm": 0.4394679367542267, + "learning_rate": 3.6578887904745186e-05, + "loss": 0.1458, + "step": 23101 + }, + { + "epoch": 0.41205008382977204, + "grad_norm": 0.23011663556098938, + "learning_rate": 3.6577508385388026e-05, + "loss": 0.1706, + "step": 23102 + }, + { + "epoch": 0.41206791995148573, + "grad_norm": 0.3207777738571167, + "learning_rate": 3.657612882115245e-05, + "loss": 0.2025, + "step": 23103 + }, + { + "epoch": 0.4120857560731994, + "grad_norm": 0.2791122496128082, + "learning_rate": 3.657474921204379e-05, + "loss": 0.1487, + "step": 23104 + }, + { + "epoch": 0.41210359219491316, + "grad_norm": 0.17093636095523834, + "learning_rate": 3.657336955806741e-05, + "loss": 0.1508, + "step": 23105 + }, + { + "epoch": 0.41212142831662685, + "grad_norm": 0.1648503988981247, + "learning_rate": 3.6571989859228654e-05, + "loss": 0.1081, + "step": 23106 + }, + { + "epoch": 0.41213926443834054, + "grad_norm": 0.23249873518943787, + "learning_rate": 3.657061011553287e-05, + "loss": 0.1283, + "step": 23107 + }, + { + "epoch": 0.4121571005600542, + "grad_norm": 0.2866901755332947, + "learning_rate": 3.65692303269854e-05, + "loss": 0.1557, + "step": 23108 + }, + { + "epoch": 0.4121749366817679, + "grad_norm": 0.2189938873052597, + "learning_rate": 3.65678504935916e-05, + "loss": 0.1571, + "step": 23109 + }, + { + "epoch": 0.4121927728034816, + "grad_norm": 0.23248472809791565, + "learning_rate": 3.656647061535682e-05, + "loss": 0.1425, + "step": 23110 + }, + { + "epoch": 0.4122106089251953, + "grad_norm": 0.2784644067287445, + "learning_rate": 3.65650906922864e-05, + "loss": 0.1874, + "step": 23111 + }, + { + "epoch": 0.412228445046909, + "grad_norm": 0.23064307868480682, + "learning_rate": 3.656371072438569e-05, + "loss": 0.1495, + "step": 23112 + }, + { + "epoch": 0.4122462811686227, + "grad_norm": 0.30065059661865234, + "learning_rate": 3.656233071166005e-05, + "loss": 0.1217, + "step": 23113 + }, + { + "epoch": 0.4122641172903364, + "grad_norm": 0.22854925692081451, + "learning_rate": 3.656095065411482e-05, + "loss": 0.1876, + "step": 23114 + }, + { + "epoch": 0.4122819534120501, + "grad_norm": 0.2647433578968048, + "learning_rate": 3.655957055175535e-05, + "loss": 0.1379, + "step": 23115 + }, + { + "epoch": 0.4122997895337638, + "grad_norm": 0.2703079283237457, + "learning_rate": 3.655819040458699e-05, + "loss": 0.1605, + "step": 23116 + }, + { + "epoch": 0.41231762565547747, + "grad_norm": 0.29445329308509827, + "learning_rate": 3.65568102126151e-05, + "loss": 0.1338, + "step": 23117 + }, + { + "epoch": 0.41233546177719116, + "grad_norm": 0.2516380846500397, + "learning_rate": 3.655542997584502e-05, + "loss": 0.1534, + "step": 23118 + }, + { + "epoch": 0.41235329789890485, + "grad_norm": 0.32217973470687866, + "learning_rate": 3.65540496942821e-05, + "loss": 0.1623, + "step": 23119 + }, + { + "epoch": 0.41237113402061853, + "grad_norm": 0.25049564242362976, + "learning_rate": 3.65526693679317e-05, + "loss": 0.1355, + "step": 23120 + }, + { + "epoch": 0.4123889701423323, + "grad_norm": 0.3135298788547516, + "learning_rate": 3.655128899679915e-05, + "loss": 0.1535, + "step": 23121 + }, + { + "epoch": 0.41240680626404597, + "grad_norm": 0.2812309265136719, + "learning_rate": 3.654990858088982e-05, + "loss": 0.1177, + "step": 23122 + }, + { + "epoch": 0.41242464238575965, + "grad_norm": 0.29339414834976196, + "learning_rate": 3.654852812020906e-05, + "loss": 0.197, + "step": 23123 + }, + { + "epoch": 0.41244247850747334, + "grad_norm": 0.17289677262306213, + "learning_rate": 3.654714761476221e-05, + "loss": 0.129, + "step": 23124 + }, + { + "epoch": 0.41246031462918703, + "grad_norm": 0.2023874968290329, + "learning_rate": 3.654576706455464e-05, + "loss": 0.1381, + "step": 23125 + }, + { + "epoch": 0.4124781507509007, + "grad_norm": 0.22508110105991364, + "learning_rate": 3.6544386469591675e-05, + "loss": 0.1326, + "step": 23126 + }, + { + "epoch": 0.4124959868726144, + "grad_norm": 0.33962857723236084, + "learning_rate": 3.6543005829878686e-05, + "loss": 0.1858, + "step": 23127 + }, + { + "epoch": 0.4125138229943281, + "grad_norm": 0.2923021912574768, + "learning_rate": 3.654162514542101e-05, + "loss": 0.1453, + "step": 23128 + }, + { + "epoch": 0.4125316591160418, + "grad_norm": 0.36241307854652405, + "learning_rate": 3.6540244416224015e-05, + "loss": 0.2042, + "step": 23129 + }, + { + "epoch": 0.4125494952377555, + "grad_norm": 0.23861025273799896, + "learning_rate": 3.653886364229305e-05, + "loss": 0.1368, + "step": 23130 + }, + { + "epoch": 0.4125673313594692, + "grad_norm": 0.26289698481559753, + "learning_rate": 3.653748282363347e-05, + "loss": 0.1866, + "step": 23131 + }, + { + "epoch": 0.4125851674811829, + "grad_norm": 0.3463841676712036, + "learning_rate": 3.653610196025061e-05, + "loss": 0.1816, + "step": 23132 + }, + { + "epoch": 0.4126030036028966, + "grad_norm": 0.23657816648483276, + "learning_rate": 3.653472105214984e-05, + "loss": 0.1577, + "step": 23133 + }, + { + "epoch": 0.4126208397246103, + "grad_norm": 0.3227595388889313, + "learning_rate": 3.653334009933651e-05, + "loss": 0.0959, + "step": 23134 + }, + { + "epoch": 0.41263867584632397, + "grad_norm": 0.3086320161819458, + "learning_rate": 3.653195910181596e-05, + "loss": 0.1929, + "step": 23135 + }, + { + "epoch": 0.41265651196803765, + "grad_norm": 0.443154513835907, + "learning_rate": 3.6530578059593564e-05, + "loss": 0.1811, + "step": 23136 + }, + { + "epoch": 0.41267434808975134, + "grad_norm": 0.24609248340129852, + "learning_rate": 3.652919697267466e-05, + "loss": 0.2163, + "step": 23137 + }, + { + "epoch": 0.4126921842114651, + "grad_norm": 0.20582136511802673, + "learning_rate": 3.6527815841064605e-05, + "loss": 0.1665, + "step": 23138 + }, + { + "epoch": 0.4127100203331788, + "grad_norm": 0.2355787456035614, + "learning_rate": 3.652643466476876e-05, + "loss": 0.1226, + "step": 23139 + }, + { + "epoch": 0.41272785645489246, + "grad_norm": 0.2802969515323639, + "learning_rate": 3.652505344379247e-05, + "loss": 0.1003, + "step": 23140 + }, + { + "epoch": 0.41274569257660615, + "grad_norm": 0.24644528329372406, + "learning_rate": 3.6523672178141086e-05, + "loss": 0.1944, + "step": 23141 + }, + { + "epoch": 0.41276352869831984, + "grad_norm": 0.2273740917444229, + "learning_rate": 3.6522290867819976e-05, + "loss": 0.1357, + "step": 23142 + }, + { + "epoch": 0.4127813648200335, + "grad_norm": 0.3127565383911133, + "learning_rate": 3.652090951283448e-05, + "loss": 0.1942, + "step": 23143 + }, + { + "epoch": 0.4127992009417472, + "grad_norm": 0.21294841170310974, + "learning_rate": 3.651952811318997e-05, + "loss": 0.12, + "step": 23144 + }, + { + "epoch": 0.4128170370634609, + "grad_norm": 0.22707761824131012, + "learning_rate": 3.6518146668891786e-05, + "loss": 0.1521, + "step": 23145 + }, + { + "epoch": 0.4128348731851746, + "grad_norm": 0.20713070034980774, + "learning_rate": 3.651676517994529e-05, + "loss": 0.1221, + "step": 23146 + }, + { + "epoch": 0.41285270930688833, + "grad_norm": 0.49638211727142334, + "learning_rate": 3.6515383646355825e-05, + "loss": 0.1586, + "step": 23147 + }, + { + "epoch": 0.412870545428602, + "grad_norm": 0.1768767386674881, + "learning_rate": 3.6514002068128766e-05, + "loss": 0.1191, + "step": 23148 + }, + { + "epoch": 0.4128883815503157, + "grad_norm": 0.33315321803092957, + "learning_rate": 3.6512620445269453e-05, + "loss": 0.1519, + "step": 23149 + }, + { + "epoch": 0.4129062176720294, + "grad_norm": 0.24592268466949463, + "learning_rate": 3.651123877778325e-05, + "loss": 0.1523, + "step": 23150 + }, + { + "epoch": 0.4129240537937431, + "grad_norm": 0.28065225481987, + "learning_rate": 3.650985706567551e-05, + "loss": 0.1856, + "step": 23151 + }, + { + "epoch": 0.4129418899154568, + "grad_norm": 0.2765066921710968, + "learning_rate": 3.650847530895158e-05, + "loss": 0.141, + "step": 23152 + }, + { + "epoch": 0.41295972603717046, + "grad_norm": 0.23578819632530212, + "learning_rate": 3.650709350761683e-05, + "loss": 0.1254, + "step": 23153 + }, + { + "epoch": 0.41297756215888415, + "grad_norm": 0.2761731743812561, + "learning_rate": 3.6505711661676614e-05, + "loss": 0.1498, + "step": 23154 + }, + { + "epoch": 0.4129953982805979, + "grad_norm": 0.3300350606441498, + "learning_rate": 3.650432977113629e-05, + "loss": 0.1429, + "step": 23155 + }, + { + "epoch": 0.4130132344023116, + "grad_norm": 0.32916247844696045, + "learning_rate": 3.65029478360012e-05, + "loss": 0.1331, + "step": 23156 + }, + { + "epoch": 0.41303107052402527, + "grad_norm": 0.19779275357723236, + "learning_rate": 3.6501565856276706e-05, + "loss": 0.1359, + "step": 23157 + }, + { + "epoch": 0.41304890664573896, + "grad_norm": 0.22703024744987488, + "learning_rate": 3.650018383196818e-05, + "loss": 0.1427, + "step": 23158 + }, + { + "epoch": 0.41306674276745264, + "grad_norm": 0.22725622355937958, + "learning_rate": 3.649880176308098e-05, + "loss": 0.1467, + "step": 23159 + }, + { + "epoch": 0.41308457888916633, + "grad_norm": 0.2867787182331085, + "learning_rate": 3.649741964962043e-05, + "loss": 0.1324, + "step": 23160 + }, + { + "epoch": 0.41310241501088, + "grad_norm": 0.3186010718345642, + "learning_rate": 3.649603749159193e-05, + "loss": 0.1978, + "step": 23161 + }, + { + "epoch": 0.4131202511325937, + "grad_norm": 0.31810036301612854, + "learning_rate": 3.649465528900081e-05, + "loss": 0.1778, + "step": 23162 + }, + { + "epoch": 0.4131380872543074, + "grad_norm": 0.3615397810935974, + "learning_rate": 3.6493273041852424e-05, + "loss": 0.2253, + "step": 23163 + }, + { + "epoch": 0.41315592337602114, + "grad_norm": 0.2043425738811493, + "learning_rate": 3.6491890750152166e-05, + "loss": 0.1582, + "step": 23164 + }, + { + "epoch": 0.4131737594977348, + "grad_norm": 0.27091988921165466, + "learning_rate": 3.6490508413905354e-05, + "loss": 0.1778, + "step": 23165 + }, + { + "epoch": 0.4131915956194485, + "grad_norm": 0.30199170112609863, + "learning_rate": 3.648912603311737e-05, + "loss": 0.1688, + "step": 23166 + }, + { + "epoch": 0.4132094317411622, + "grad_norm": 0.3116376996040344, + "learning_rate": 3.648774360779356e-05, + "loss": 0.1677, + "step": 23167 + }, + { + "epoch": 0.4132272678628759, + "grad_norm": 0.24620358645915985, + "learning_rate": 3.648636113793929e-05, + "loss": 0.1552, + "step": 23168 + }, + { + "epoch": 0.4132451039845896, + "grad_norm": 0.2617967426776886, + "learning_rate": 3.648497862355992e-05, + "loss": 0.2027, + "step": 23169 + }, + { + "epoch": 0.41326294010630327, + "grad_norm": 0.2816195487976074, + "learning_rate": 3.6483596064660794e-05, + "loss": 0.1446, + "step": 23170 + }, + { + "epoch": 0.41328077622801696, + "grad_norm": 0.25955280661582947, + "learning_rate": 3.6482213461247295e-05, + "loss": 0.1755, + "step": 23171 + }, + { + "epoch": 0.4132986123497307, + "grad_norm": 0.25968292355537415, + "learning_rate": 3.648083081332478e-05, + "loss": 0.1245, + "step": 23172 + }, + { + "epoch": 0.4133164484714444, + "grad_norm": 0.22797012329101562, + "learning_rate": 3.647944812089857e-05, + "loss": 0.1159, + "step": 23173 + }, + { + "epoch": 0.4133342845931581, + "grad_norm": 0.35979729890823364, + "learning_rate": 3.647806538397408e-05, + "loss": 0.1626, + "step": 23174 + }, + { + "epoch": 0.41335212071487176, + "grad_norm": 0.22722698748111725, + "learning_rate": 3.647668260255665e-05, + "loss": 0.1227, + "step": 23175 + }, + { + "epoch": 0.41336995683658545, + "grad_norm": 0.3491485118865967, + "learning_rate": 3.6475299776651614e-05, + "loss": 0.1771, + "step": 23176 + }, + { + "epoch": 0.41338779295829914, + "grad_norm": 0.3642544150352478, + "learning_rate": 3.647391690626435e-05, + "loss": 0.2034, + "step": 23177 + }, + { + "epoch": 0.4134056290800128, + "grad_norm": 0.2418254017829895, + "learning_rate": 3.647253399140023e-05, + "loss": 0.0914, + "step": 23178 + }, + { + "epoch": 0.4134234652017265, + "grad_norm": 0.25657105445861816, + "learning_rate": 3.647115103206461e-05, + "loss": 0.1414, + "step": 23179 + }, + { + "epoch": 0.41344130132344026, + "grad_norm": 0.2985260486602783, + "learning_rate": 3.646976802826284e-05, + "loss": 0.1974, + "step": 23180 + }, + { + "epoch": 0.41345913744515395, + "grad_norm": 0.22879236936569214, + "learning_rate": 3.646838498000029e-05, + "loss": 0.1617, + "step": 23181 + }, + { + "epoch": 0.41347697356686763, + "grad_norm": 0.2395196557044983, + "learning_rate": 3.646700188728232e-05, + "loss": 0.1317, + "step": 23182 + }, + { + "epoch": 0.4134948096885813, + "grad_norm": 0.32290178537368774, + "learning_rate": 3.6465618750114293e-05, + "loss": 0.1669, + "step": 23183 + }, + { + "epoch": 0.413512645810295, + "grad_norm": 0.29915207624435425, + "learning_rate": 3.6464235568501556e-05, + "loss": 0.2162, + "step": 23184 + }, + { + "epoch": 0.4135304819320087, + "grad_norm": 0.3112390339374542, + "learning_rate": 3.646285234244949e-05, + "loss": 0.1712, + "step": 23185 + }, + { + "epoch": 0.4135483180537224, + "grad_norm": 0.2557258605957031, + "learning_rate": 3.646146907196345e-05, + "loss": 0.1545, + "step": 23186 + }, + { + "epoch": 0.4135661541754361, + "grad_norm": 0.24216027557849884, + "learning_rate": 3.64600857570488e-05, + "loss": 0.1339, + "step": 23187 + }, + { + "epoch": 0.41358399029714976, + "grad_norm": 0.268718957901001, + "learning_rate": 3.64587023977109e-05, + "loss": 0.1914, + "step": 23188 + }, + { + "epoch": 0.4136018264188635, + "grad_norm": 0.2952747642993927, + "learning_rate": 3.6457318993955105e-05, + "loss": 0.1851, + "step": 23189 + }, + { + "epoch": 0.4136196625405772, + "grad_norm": 0.279373437166214, + "learning_rate": 3.6455935545786784e-05, + "loss": 0.1109, + "step": 23190 + }, + { + "epoch": 0.4136374986622909, + "grad_norm": 0.2655762732028961, + "learning_rate": 3.64545520532113e-05, + "loss": 0.121, + "step": 23191 + }, + { + "epoch": 0.41365533478400457, + "grad_norm": 0.24142512679100037, + "learning_rate": 3.6453168516234026e-05, + "loss": 0.1701, + "step": 23192 + }, + { + "epoch": 0.41367317090571826, + "grad_norm": 0.18149729073047638, + "learning_rate": 3.64517849348603e-05, + "loss": 0.1396, + "step": 23193 + }, + { + "epoch": 0.41369100702743195, + "grad_norm": 0.24246717989444733, + "learning_rate": 3.645040130909552e-05, + "loss": 0.129, + "step": 23194 + }, + { + "epoch": 0.41370884314914563, + "grad_norm": 0.28347063064575195, + "learning_rate": 3.644901763894501e-05, + "loss": 0.1856, + "step": 23195 + }, + { + "epoch": 0.4137266792708593, + "grad_norm": 0.3277598023414612, + "learning_rate": 3.644763392441417e-05, + "loss": 0.2089, + "step": 23196 + }, + { + "epoch": 0.41374451539257306, + "grad_norm": 0.22673563659191132, + "learning_rate": 3.6446250165508334e-05, + "loss": 0.1482, + "step": 23197 + }, + { + "epoch": 0.41376235151428675, + "grad_norm": 0.33759182691574097, + "learning_rate": 3.6444866362232875e-05, + "loss": 0.0903, + "step": 23198 + }, + { + "epoch": 0.41378018763600044, + "grad_norm": 0.26132363080978394, + "learning_rate": 3.644348251459317e-05, + "loss": 0.137, + "step": 23199 + }, + { + "epoch": 0.41379802375771413, + "grad_norm": 0.3460666835308075, + "learning_rate": 3.6442098622594576e-05, + "loss": 0.1874, + "step": 23200 + }, + { + "epoch": 0.4138158598794278, + "grad_norm": 0.2770192325115204, + "learning_rate": 3.644071468624246e-05, + "loss": 0.1858, + "step": 23201 + }, + { + "epoch": 0.4138336960011415, + "grad_norm": 0.2089637666940689, + "learning_rate": 3.6439330705542176e-05, + "loss": 0.1377, + "step": 23202 + }, + { + "epoch": 0.4138515321228552, + "grad_norm": 0.23270075023174286, + "learning_rate": 3.643794668049909e-05, + "loss": 0.1567, + "step": 23203 + }, + { + "epoch": 0.4138693682445689, + "grad_norm": 0.3162238597869873, + "learning_rate": 3.643656261111858e-05, + "loss": 0.1178, + "step": 23204 + }, + { + "epoch": 0.41388720436628257, + "grad_norm": 0.3702440559864044, + "learning_rate": 3.6435178497405996e-05, + "loss": 0.2463, + "step": 23205 + }, + { + "epoch": 0.4139050404879963, + "grad_norm": 0.22339370846748352, + "learning_rate": 3.643379433936671e-05, + "loss": 0.1383, + "step": 23206 + }, + { + "epoch": 0.41392287660971, + "grad_norm": 0.20407943427562714, + "learning_rate": 3.64324101370061e-05, + "loss": 0.1209, + "step": 23207 + }, + { + "epoch": 0.4139407127314237, + "grad_norm": 0.23986142873764038, + "learning_rate": 3.643102589032951e-05, + "loss": 0.1452, + "step": 23208 + }, + { + "epoch": 0.4139585488531374, + "grad_norm": 0.2907426059246063, + "learning_rate": 3.6429641599342326e-05, + "loss": 0.1201, + "step": 23209 + }, + { + "epoch": 0.41397638497485106, + "grad_norm": 0.2096967250108719, + "learning_rate": 3.642825726404989e-05, + "loss": 0.0943, + "step": 23210 + }, + { + "epoch": 0.41399422109656475, + "grad_norm": 0.3130171298980713, + "learning_rate": 3.6426872884457585e-05, + "loss": 0.1909, + "step": 23211 + }, + { + "epoch": 0.41401205721827844, + "grad_norm": 0.2926204800605774, + "learning_rate": 3.642548846057077e-05, + "loss": 0.2103, + "step": 23212 + }, + { + "epoch": 0.4140298933399921, + "grad_norm": 0.2604474425315857, + "learning_rate": 3.642410399239482e-05, + "loss": 0.1646, + "step": 23213 + }, + { + "epoch": 0.41404772946170587, + "grad_norm": 0.28436794877052307, + "learning_rate": 3.642271947993511e-05, + "loss": 0.1107, + "step": 23214 + }, + { + "epoch": 0.41406556558341956, + "grad_norm": 0.3702876567840576, + "learning_rate": 3.642133492319698e-05, + "loss": 0.1483, + "step": 23215 + }, + { + "epoch": 0.41408340170513325, + "grad_norm": 0.27169784903526306, + "learning_rate": 3.641995032218582e-05, + "loss": 0.2042, + "step": 23216 + }, + { + "epoch": 0.41410123782684694, + "grad_norm": 0.28211212158203125, + "learning_rate": 3.641856567690698e-05, + "loss": 0.1465, + "step": 23217 + }, + { + "epoch": 0.4141190739485606, + "grad_norm": 0.24151967465877533, + "learning_rate": 3.6417180987365835e-05, + "loss": 0.1849, + "step": 23218 + }, + { + "epoch": 0.4141369100702743, + "grad_norm": 0.28115060925483704, + "learning_rate": 3.641579625356775e-05, + "loss": 0.165, + "step": 23219 + }, + { + "epoch": 0.414154746191988, + "grad_norm": 0.25493156909942627, + "learning_rate": 3.641441147551811e-05, + "loss": 0.1739, + "step": 23220 + }, + { + "epoch": 0.4141725823137017, + "grad_norm": 0.45502808690071106, + "learning_rate": 3.641302665322226e-05, + "loss": 0.1765, + "step": 23221 + }, + { + "epoch": 0.41419041843541543, + "grad_norm": 0.3129476308822632, + "learning_rate": 3.641164178668557e-05, + "loss": 0.0975, + "step": 23222 + }, + { + "epoch": 0.4142082545571291, + "grad_norm": 0.3463571071624756, + "learning_rate": 3.641025687591343e-05, + "loss": 0.1273, + "step": 23223 + }, + { + "epoch": 0.4142260906788428, + "grad_norm": 0.2650723159313202, + "learning_rate": 3.640887192091118e-05, + "loss": 0.1523, + "step": 23224 + }, + { + "epoch": 0.4142439268005565, + "grad_norm": 0.25148633122444153, + "learning_rate": 3.6407486921684206e-05, + "loss": 0.1904, + "step": 23225 + }, + { + "epoch": 0.4142617629222702, + "grad_norm": 0.3528432548046112, + "learning_rate": 3.640610187823788e-05, + "loss": 0.1326, + "step": 23226 + }, + { + "epoch": 0.41427959904398387, + "grad_norm": 0.3362433910369873, + "learning_rate": 3.6404716790577555e-05, + "loss": 0.1647, + "step": 23227 + }, + { + "epoch": 0.41429743516569756, + "grad_norm": 0.25073257088661194, + "learning_rate": 3.640333165870861e-05, + "loss": 0.124, + "step": 23228 + }, + { + "epoch": 0.41431527128741125, + "grad_norm": 0.29544728994369507, + "learning_rate": 3.640194648263642e-05, + "loss": 0.0908, + "step": 23229 + }, + { + "epoch": 0.41433310740912493, + "grad_norm": 0.33751586079597473, + "learning_rate": 3.640056126236634e-05, + "loss": 0.1477, + "step": 23230 + }, + { + "epoch": 0.4143509435308387, + "grad_norm": 0.3116064667701721, + "learning_rate": 3.639917599790375e-05, + "loss": 0.1474, + "step": 23231 + }, + { + "epoch": 0.41436877965255237, + "grad_norm": 0.31631600856781006, + "learning_rate": 3.639779068925401e-05, + "loss": 0.1889, + "step": 23232 + }, + { + "epoch": 0.41438661577426605, + "grad_norm": 0.2231280654668808, + "learning_rate": 3.63964053364225e-05, + "loss": 0.1669, + "step": 23233 + }, + { + "epoch": 0.41440445189597974, + "grad_norm": 0.19502250850200653, + "learning_rate": 3.639501993941459e-05, + "loss": 0.165, + "step": 23234 + }, + { + "epoch": 0.41442228801769343, + "grad_norm": 0.3572506010532379, + "learning_rate": 3.6393634498235645e-05, + "loss": 0.2092, + "step": 23235 + }, + { + "epoch": 0.4144401241394071, + "grad_norm": 0.338662713766098, + "learning_rate": 3.6392249012891036e-05, + "loss": 0.155, + "step": 23236 + }, + { + "epoch": 0.4144579602611208, + "grad_norm": 0.2813863158226013, + "learning_rate": 3.639086348338614e-05, + "loss": 0.1587, + "step": 23237 + }, + { + "epoch": 0.4144757963828345, + "grad_norm": 0.2780759036540985, + "learning_rate": 3.638947790972632e-05, + "loss": 0.1493, + "step": 23238 + }, + { + "epoch": 0.41449363250454824, + "grad_norm": 0.3177599310874939, + "learning_rate": 3.6388092291916945e-05, + "loss": 0.0855, + "step": 23239 + }, + { + "epoch": 0.4145114686262619, + "grad_norm": 0.3616902232170105, + "learning_rate": 3.63867066299634e-05, + "loss": 0.1613, + "step": 23240 + }, + { + "epoch": 0.4145293047479756, + "grad_norm": 0.27790239453315735, + "learning_rate": 3.638532092387104e-05, + "loss": 0.2083, + "step": 23241 + }, + { + "epoch": 0.4145471408696893, + "grad_norm": 0.23762886226177216, + "learning_rate": 3.638393517364525e-05, + "loss": 0.1081, + "step": 23242 + }, + { + "epoch": 0.414564976991403, + "grad_norm": 0.24658772349357605, + "learning_rate": 3.638254937929139e-05, + "loss": 0.1537, + "step": 23243 + }, + { + "epoch": 0.4145828131131167, + "grad_norm": 0.27946385741233826, + "learning_rate": 3.6381163540814845e-05, + "loss": 0.1309, + "step": 23244 + }, + { + "epoch": 0.41460064923483037, + "grad_norm": 0.29800471663475037, + "learning_rate": 3.637977765822097e-05, + "loss": 0.1609, + "step": 23245 + }, + { + "epoch": 0.41461848535654405, + "grad_norm": 0.2592264413833618, + "learning_rate": 3.637839173151515e-05, + "loss": 0.1504, + "step": 23246 + }, + { + "epoch": 0.41463632147825774, + "grad_norm": 0.24479345977306366, + "learning_rate": 3.6377005760702754e-05, + "loss": 0.1419, + "step": 23247 + }, + { + "epoch": 0.4146541575999715, + "grad_norm": 0.257204532623291, + "learning_rate": 3.6375619745789155e-05, + "loss": 0.1599, + "step": 23248 + }, + { + "epoch": 0.4146719937216852, + "grad_norm": 0.26670849323272705, + "learning_rate": 3.637423368677972e-05, + "loss": 0.1637, + "step": 23249 + }, + { + "epoch": 0.41468982984339886, + "grad_norm": 0.23996742069721222, + "learning_rate": 3.637284758367983e-05, + "loss": 0.1514, + "step": 23250 + }, + { + "epoch": 0.41470766596511255, + "grad_norm": 0.27667713165283203, + "learning_rate": 3.637146143649486e-05, + "loss": 0.1002, + "step": 23251 + }, + { + "epoch": 0.41472550208682624, + "grad_norm": 0.23895084857940674, + "learning_rate": 3.637007524523017e-05, + "loss": 0.1472, + "step": 23252 + }, + { + "epoch": 0.4147433382085399, + "grad_norm": 0.2592090368270874, + "learning_rate": 3.636868900989114e-05, + "loss": 0.105, + "step": 23253 + }, + { + "epoch": 0.4147611743302536, + "grad_norm": 0.2676096260547638, + "learning_rate": 3.636730273048315e-05, + "loss": 0.174, + "step": 23254 + }, + { + "epoch": 0.4147790104519673, + "grad_norm": 0.20837606489658356, + "learning_rate": 3.636591640701157e-05, + "loss": 0.1605, + "step": 23255 + }, + { + "epoch": 0.41479684657368104, + "grad_norm": 0.2332581877708435, + "learning_rate": 3.636453003948177e-05, + "loss": 0.1287, + "step": 23256 + }, + { + "epoch": 0.41481468269539473, + "grad_norm": 0.4502076804637909, + "learning_rate": 3.636314362789913e-05, + "loss": 0.1805, + "step": 23257 + }, + { + "epoch": 0.4148325188171084, + "grad_norm": 0.24375800788402557, + "learning_rate": 3.636175717226901e-05, + "loss": 0.1454, + "step": 23258 + }, + { + "epoch": 0.4148503549388221, + "grad_norm": 0.33940210938453674, + "learning_rate": 3.636037067259681e-05, + "loss": 0.1664, + "step": 23259 + }, + { + "epoch": 0.4148681910605358, + "grad_norm": 0.2320115566253662, + "learning_rate": 3.635898412888787e-05, + "loss": 0.1501, + "step": 23260 + }, + { + "epoch": 0.4148860271822495, + "grad_norm": 0.2881716191768646, + "learning_rate": 3.6357597541147596e-05, + "loss": 0.1331, + "step": 23261 + }, + { + "epoch": 0.41490386330396317, + "grad_norm": 0.2525869309902191, + "learning_rate": 3.635621090938135e-05, + "loss": 0.1101, + "step": 23262 + }, + { + "epoch": 0.41492169942567686, + "grad_norm": 0.2740098237991333, + "learning_rate": 3.6354824233594514e-05, + "loss": 0.1243, + "step": 23263 + }, + { + "epoch": 0.41493953554739055, + "grad_norm": 0.255045622587204, + "learning_rate": 3.635343751379245e-05, + "loss": 0.1738, + "step": 23264 + }, + { + "epoch": 0.4149573716691043, + "grad_norm": 0.22921952605247498, + "learning_rate": 3.6352050749980546e-05, + "loss": 0.1843, + "step": 23265 + }, + { + "epoch": 0.414975207790818, + "grad_norm": 0.24881695210933685, + "learning_rate": 3.635066394216416e-05, + "loss": 0.1641, + "step": 23266 + }, + { + "epoch": 0.41499304391253167, + "grad_norm": 0.32158026099205017, + "learning_rate": 3.634927709034869e-05, + "loss": 0.1828, + "step": 23267 + }, + { + "epoch": 0.41501088003424536, + "grad_norm": 0.2720593214035034, + "learning_rate": 3.6347890194539504e-05, + "loss": 0.173, + "step": 23268 + }, + { + "epoch": 0.41502871615595904, + "grad_norm": 0.23455075919628143, + "learning_rate": 3.634650325474198e-05, + "loss": 0.155, + "step": 23269 + }, + { + "epoch": 0.41504655227767273, + "grad_norm": 0.3205150365829468, + "learning_rate": 3.6345116270961485e-05, + "loss": 0.1891, + "step": 23270 + }, + { + "epoch": 0.4150643883993864, + "grad_norm": 0.27757516503334045, + "learning_rate": 3.6343729243203395e-05, + "loss": 0.1526, + "step": 23271 + }, + { + "epoch": 0.4150822245211001, + "grad_norm": 0.307907372713089, + "learning_rate": 3.634234217147311e-05, + "loss": 0.1554, + "step": 23272 + }, + { + "epoch": 0.41510006064281385, + "grad_norm": 0.32490867376327515, + "learning_rate": 3.6340955055775974e-05, + "loss": 0.1409, + "step": 23273 + }, + { + "epoch": 0.41511789676452754, + "grad_norm": 0.31072351336479187, + "learning_rate": 3.633956789611738e-05, + "loss": 0.201, + "step": 23274 + }, + { + "epoch": 0.4151357328862412, + "grad_norm": 0.1969567835330963, + "learning_rate": 3.63381806925027e-05, + "loss": 0.1672, + "step": 23275 + }, + { + "epoch": 0.4151535690079549, + "grad_norm": 0.24947373569011688, + "learning_rate": 3.633679344493732e-05, + "loss": 0.1627, + "step": 23276 + }, + { + "epoch": 0.4151714051296686, + "grad_norm": 0.3956585228443146, + "learning_rate": 3.6335406153426616e-05, + "loss": 0.152, + "step": 23277 + }, + { + "epoch": 0.4151892412513823, + "grad_norm": 0.2326977699995041, + "learning_rate": 3.633401881797597e-05, + "loss": 0.1603, + "step": 23278 + }, + { + "epoch": 0.415207077373096, + "grad_norm": 0.25640445947647095, + "learning_rate": 3.6332631438590736e-05, + "loss": 0.1278, + "step": 23279 + }, + { + "epoch": 0.41522491349480967, + "grad_norm": 0.27067849040031433, + "learning_rate": 3.633124401527632e-05, + "loss": 0.1044, + "step": 23280 + }, + { + "epoch": 0.4152427496165234, + "grad_norm": 0.26866787672042847, + "learning_rate": 3.632985654803808e-05, + "loss": 0.2111, + "step": 23281 + }, + { + "epoch": 0.4152605857382371, + "grad_norm": 0.28165364265441895, + "learning_rate": 3.6328469036881405e-05, + "loss": 0.1863, + "step": 23282 + }, + { + "epoch": 0.4152784218599508, + "grad_norm": 0.2928689420223236, + "learning_rate": 3.632708148181168e-05, + "loss": 0.1302, + "step": 23283 + }, + { + "epoch": 0.4152962579816645, + "grad_norm": 0.2067536860704422, + "learning_rate": 3.632569388283427e-05, + "loss": 0.1619, + "step": 23284 + }, + { + "epoch": 0.41531409410337816, + "grad_norm": 0.4143860936164856, + "learning_rate": 3.632430623995456e-05, + "loss": 0.19, + "step": 23285 + }, + { + "epoch": 0.41533193022509185, + "grad_norm": 0.1750083714723587, + "learning_rate": 3.632291855317792e-05, + "loss": 0.1237, + "step": 23286 + }, + { + "epoch": 0.41534976634680554, + "grad_norm": 0.37058934569358826, + "learning_rate": 3.632153082250975e-05, + "loss": 0.1655, + "step": 23287 + }, + { + "epoch": 0.4153676024685192, + "grad_norm": 0.27337387204170227, + "learning_rate": 3.6320143047955396e-05, + "loss": 0.1627, + "step": 23288 + }, + { + "epoch": 0.4153854385902329, + "grad_norm": 0.21912992000579834, + "learning_rate": 3.631875522952027e-05, + "loss": 0.1835, + "step": 23289 + }, + { + "epoch": 0.41540327471194666, + "grad_norm": 0.32558396458625793, + "learning_rate": 3.6317367367209744e-05, + "loss": 0.1802, + "step": 23290 + }, + { + "epoch": 0.41542111083366035, + "grad_norm": 0.24679528176784515, + "learning_rate": 3.631597946102919e-05, + "loss": 0.162, + "step": 23291 + }, + { + "epoch": 0.41543894695537403, + "grad_norm": 0.24788397550582886, + "learning_rate": 3.6314591510983984e-05, + "loss": 0.1647, + "step": 23292 + }, + { + "epoch": 0.4154567830770877, + "grad_norm": 0.29844745993614197, + "learning_rate": 3.631320351707953e-05, + "loss": 0.2673, + "step": 23293 + }, + { + "epoch": 0.4154746191988014, + "grad_norm": 0.28938814997673035, + "learning_rate": 3.6311815479321174e-05, + "loss": 0.1621, + "step": 23294 + }, + { + "epoch": 0.4154924553205151, + "grad_norm": 0.24037320911884308, + "learning_rate": 3.6310427397714316e-05, + "loss": 0.1757, + "step": 23295 + }, + { + "epoch": 0.4155102914422288, + "grad_norm": 0.21771900355815887, + "learning_rate": 3.630903927226434e-05, + "loss": 0.146, + "step": 23296 + }, + { + "epoch": 0.4155281275639425, + "grad_norm": 0.24293573200702667, + "learning_rate": 3.6307651102976625e-05, + "loss": 0.1259, + "step": 23297 + }, + { + "epoch": 0.4155459636856562, + "grad_norm": 0.25553926825523376, + "learning_rate": 3.630626288985655e-05, + "loss": 0.1635, + "step": 23298 + }, + { + "epoch": 0.4155637998073699, + "grad_norm": 0.3220132291316986, + "learning_rate": 3.630487463290949e-05, + "loss": 0.151, + "step": 23299 + }, + { + "epoch": 0.4155816359290836, + "grad_norm": 0.2131488025188446, + "learning_rate": 3.630348633214083e-05, + "loss": 0.1354, + "step": 23300 + }, + { + "epoch": 0.4155994720507973, + "grad_norm": 0.23164942860603333, + "learning_rate": 3.6302097987555955e-05, + "loss": 0.1536, + "step": 23301 + }, + { + "epoch": 0.41561730817251097, + "grad_norm": 0.2612023651599884, + "learning_rate": 3.630070959916024e-05, + "loss": 0.1465, + "step": 23302 + }, + { + "epoch": 0.41563514429422466, + "grad_norm": 0.30307537317276, + "learning_rate": 3.6299321166959075e-05, + "loss": 0.1627, + "step": 23303 + }, + { + "epoch": 0.41565298041593834, + "grad_norm": 0.23780179023742676, + "learning_rate": 3.629793269095785e-05, + "loss": 0.1186, + "step": 23304 + }, + { + "epoch": 0.41567081653765203, + "grad_norm": 0.3202025592327118, + "learning_rate": 3.6296544171161914e-05, + "loss": 0.1933, + "step": 23305 + }, + { + "epoch": 0.4156886526593657, + "grad_norm": 0.18145042657852173, + "learning_rate": 3.629515560757669e-05, + "loss": 0.1366, + "step": 23306 + }, + { + "epoch": 0.41570648878107946, + "grad_norm": 0.2425667941570282, + "learning_rate": 3.6293767000207534e-05, + "loss": 0.1874, + "step": 23307 + }, + { + "epoch": 0.41572432490279315, + "grad_norm": 0.29195621609687805, + "learning_rate": 3.6292378349059836e-05, + "loss": 0.1598, + "step": 23308 + }, + { + "epoch": 0.41574216102450684, + "grad_norm": 0.34642493724823, + "learning_rate": 3.629098965413897e-05, + "loss": 0.1953, + "step": 23309 + }, + { + "epoch": 0.41575999714622053, + "grad_norm": 0.3469013571739197, + "learning_rate": 3.628960091545034e-05, + "loss": 0.169, + "step": 23310 + }, + { + "epoch": 0.4157778332679342, + "grad_norm": 0.2694319784641266, + "learning_rate": 3.628821213299932e-05, + "loss": 0.1811, + "step": 23311 + }, + { + "epoch": 0.4157956693896479, + "grad_norm": 0.3065076172351837, + "learning_rate": 3.6286823306791284e-05, + "loss": 0.2084, + "step": 23312 + }, + { + "epoch": 0.4158135055113616, + "grad_norm": 0.3911265730857849, + "learning_rate": 3.628543443683163e-05, + "loss": 0.1043, + "step": 23313 + }, + { + "epoch": 0.4158313416330753, + "grad_norm": 0.3049227297306061, + "learning_rate": 3.628404552312573e-05, + "loss": 0.1705, + "step": 23314 + }, + { + "epoch": 0.415849177754789, + "grad_norm": 0.2835361957550049, + "learning_rate": 3.628265656567897e-05, + "loss": 0.1157, + "step": 23315 + }, + { + "epoch": 0.4158670138765027, + "grad_norm": 0.20075726509094238, + "learning_rate": 3.628126756449673e-05, + "loss": 0.1496, + "step": 23316 + }, + { + "epoch": 0.4158848499982164, + "grad_norm": 0.2600798010826111, + "learning_rate": 3.627987851958441e-05, + "loss": 0.1416, + "step": 23317 + }, + { + "epoch": 0.4159026861199301, + "grad_norm": 0.3323518931865692, + "learning_rate": 3.6278489430947383e-05, + "loss": 0.1661, + "step": 23318 + }, + { + "epoch": 0.4159205222416438, + "grad_norm": 0.23960067331790924, + "learning_rate": 3.627710029859104e-05, + "loss": 0.1484, + "step": 23319 + }, + { + "epoch": 0.41593835836335746, + "grad_norm": 0.2297036498785019, + "learning_rate": 3.6275711122520753e-05, + "loss": 0.1504, + "step": 23320 + }, + { + "epoch": 0.41595619448507115, + "grad_norm": 0.33420923352241516, + "learning_rate": 3.627432190274192e-05, + "loss": 0.1116, + "step": 23321 + }, + { + "epoch": 0.41597403060678484, + "grad_norm": 0.20574922859668732, + "learning_rate": 3.6272932639259916e-05, + "loss": 0.105, + "step": 23322 + }, + { + "epoch": 0.4159918667284986, + "grad_norm": 0.26272812485694885, + "learning_rate": 3.627154333208014e-05, + "loss": 0.1394, + "step": 23323 + }, + { + "epoch": 0.41600970285021227, + "grad_norm": 0.31237339973449707, + "learning_rate": 3.627015398120797e-05, + "loss": 0.1336, + "step": 23324 + }, + { + "epoch": 0.41602753897192596, + "grad_norm": 0.3028266131877899, + "learning_rate": 3.6268764586648774e-05, + "loss": 0.1923, + "step": 23325 + }, + { + "epoch": 0.41604537509363965, + "grad_norm": 0.2907858192920685, + "learning_rate": 3.6267375148407975e-05, + "loss": 0.1719, + "step": 23326 + }, + { + "epoch": 0.41606321121535333, + "grad_norm": 0.24408207833766937, + "learning_rate": 3.626598566649092e-05, + "loss": 0.1542, + "step": 23327 + }, + { + "epoch": 0.416081047337067, + "grad_norm": 0.1730697900056839, + "learning_rate": 3.626459614090303e-05, + "loss": 0.1309, + "step": 23328 + }, + { + "epoch": 0.4160988834587807, + "grad_norm": 0.22531509399414062, + "learning_rate": 3.626320657164966e-05, + "loss": 0.0978, + "step": 23329 + }, + { + "epoch": 0.4161167195804944, + "grad_norm": 0.3676927387714386, + "learning_rate": 3.626181695873622e-05, + "loss": 0.1725, + "step": 23330 + }, + { + "epoch": 0.4161345557022081, + "grad_norm": 0.23085030913352966, + "learning_rate": 3.6260427302168084e-05, + "loss": 0.1621, + "step": 23331 + }, + { + "epoch": 0.41615239182392183, + "grad_norm": 0.18741777539253235, + "learning_rate": 3.6259037601950646e-05, + "loss": 0.1296, + "step": 23332 + }, + { + "epoch": 0.4161702279456355, + "grad_norm": 0.2550046145915985, + "learning_rate": 3.625764785808929e-05, + "loss": 0.1748, + "step": 23333 + }, + { + "epoch": 0.4161880640673492, + "grad_norm": 0.20631776750087738, + "learning_rate": 3.62562580705894e-05, + "loss": 0.1421, + "step": 23334 + }, + { + "epoch": 0.4162059001890629, + "grad_norm": 0.2726269066333771, + "learning_rate": 3.6254868239456367e-05, + "loss": 0.1342, + "step": 23335 + }, + { + "epoch": 0.4162237363107766, + "grad_norm": 0.21395710110664368, + "learning_rate": 3.625347836469557e-05, + "loss": 0.1643, + "step": 23336 + }, + { + "epoch": 0.41624157243249027, + "grad_norm": 0.4118204712867737, + "learning_rate": 3.625208844631241e-05, + "loss": 0.1633, + "step": 23337 + }, + { + "epoch": 0.41625940855420396, + "grad_norm": 0.35637524724006653, + "learning_rate": 3.625069848431227e-05, + "loss": 0.1537, + "step": 23338 + }, + { + "epoch": 0.41627724467591765, + "grad_norm": 0.25940996408462524, + "learning_rate": 3.624930847870054e-05, + "loss": 0.1503, + "step": 23339 + }, + { + "epoch": 0.4162950807976314, + "grad_norm": 0.21032650768756866, + "learning_rate": 3.62479184294826e-05, + "loss": 0.1193, + "step": 23340 + }, + { + "epoch": 0.4163129169193451, + "grad_norm": 0.27171435952186584, + "learning_rate": 3.6246528336663846e-05, + "loss": 0.1905, + "step": 23341 + }, + { + "epoch": 0.41633075304105877, + "grad_norm": 0.20779871940612793, + "learning_rate": 3.624513820024966e-05, + "loss": 0.1689, + "step": 23342 + }, + { + "epoch": 0.41634858916277245, + "grad_norm": 0.26859521865844727, + "learning_rate": 3.624374802024544e-05, + "loss": 0.1591, + "step": 23343 + }, + { + "epoch": 0.41636642528448614, + "grad_norm": 0.36573532223701477, + "learning_rate": 3.6242357796656563e-05, + "loss": 0.19, + "step": 23344 + }, + { + "epoch": 0.41638426140619983, + "grad_norm": 0.24918632209300995, + "learning_rate": 3.624096752948843e-05, + "loss": 0.0949, + "step": 23345 + }, + { + "epoch": 0.4164020975279135, + "grad_norm": 0.3105931878089905, + "learning_rate": 3.623957721874642e-05, + "loss": 0.1422, + "step": 23346 + }, + { + "epoch": 0.4164199336496272, + "grad_norm": 0.2787606716156006, + "learning_rate": 3.6238186864435934e-05, + "loss": 0.1414, + "step": 23347 + }, + { + "epoch": 0.4164377697713409, + "grad_norm": 0.24953365325927734, + "learning_rate": 3.623679646656235e-05, + "loss": 0.163, + "step": 23348 + }, + { + "epoch": 0.41645560589305464, + "grad_norm": 0.27132412791252136, + "learning_rate": 3.623540602513106e-05, + "loss": 0.1515, + "step": 23349 + }, + { + "epoch": 0.4164734420147683, + "grad_norm": 0.2446313053369522, + "learning_rate": 3.623401554014745e-05, + "loss": 0.1056, + "step": 23350 + }, + { + "epoch": 0.416491278136482, + "grad_norm": 0.26457521319389343, + "learning_rate": 3.623262501161692e-05, + "loss": 0.141, + "step": 23351 + }, + { + "epoch": 0.4165091142581957, + "grad_norm": 0.2918124198913574, + "learning_rate": 3.623123443954486e-05, + "loss": 0.2161, + "step": 23352 + }, + { + "epoch": 0.4165269503799094, + "grad_norm": 0.2150900661945343, + "learning_rate": 3.622984382393665e-05, + "loss": 0.1156, + "step": 23353 + }, + { + "epoch": 0.4165447865016231, + "grad_norm": 0.24866187572479248, + "learning_rate": 3.622845316479769e-05, + "loss": 0.1425, + "step": 23354 + }, + { + "epoch": 0.41656262262333676, + "grad_norm": 0.3092412054538727, + "learning_rate": 3.622706246213337e-05, + "loss": 0.1814, + "step": 23355 + }, + { + "epoch": 0.41658045874505045, + "grad_norm": 0.28487467765808105, + "learning_rate": 3.622567171594908e-05, + "loss": 0.2087, + "step": 23356 + }, + { + "epoch": 0.4165982948667642, + "grad_norm": 0.2582945227622986, + "learning_rate": 3.62242809262502e-05, + "loss": 0.1168, + "step": 23357 + }, + { + "epoch": 0.4166161309884779, + "grad_norm": 0.28591442108154297, + "learning_rate": 3.622289009304214e-05, + "loss": 0.1497, + "step": 23358 + }, + { + "epoch": 0.41663396711019157, + "grad_norm": 0.298454612493515, + "learning_rate": 3.622149921633027e-05, + "loss": 0.1519, + "step": 23359 + }, + { + "epoch": 0.41665180323190526, + "grad_norm": 0.2814613878726959, + "learning_rate": 3.6220108296120005e-05, + "loss": 0.1306, + "step": 23360 + }, + { + "epoch": 0.41666963935361895, + "grad_norm": 0.29728448390960693, + "learning_rate": 3.6218717332416724e-05, + "loss": 0.2144, + "step": 23361 + }, + { + "epoch": 0.41668747547533264, + "grad_norm": 0.48023849725723267, + "learning_rate": 3.6217326325225816e-05, + "loss": 0.1878, + "step": 23362 + }, + { + "epoch": 0.4167053115970463, + "grad_norm": 0.21834638714790344, + "learning_rate": 3.6215935274552674e-05, + "loss": 0.1552, + "step": 23363 + }, + { + "epoch": 0.41672314771876, + "grad_norm": 0.371028333902359, + "learning_rate": 3.62145441804027e-05, + "loss": 0.2564, + "step": 23364 + }, + { + "epoch": 0.4167409838404737, + "grad_norm": 0.3147681951522827, + "learning_rate": 3.621315304278127e-05, + "loss": 0.1312, + "step": 23365 + }, + { + "epoch": 0.41675881996218744, + "grad_norm": 0.3369613587856293, + "learning_rate": 3.621176186169379e-05, + "loss": 0.1762, + "step": 23366 + }, + { + "epoch": 0.41677665608390113, + "grad_norm": 0.2935454845428467, + "learning_rate": 3.621037063714565e-05, + "loss": 0.2089, + "step": 23367 + }, + { + "epoch": 0.4167944922056148, + "grad_norm": 0.28261691331863403, + "learning_rate": 3.6208979369142245e-05, + "loss": 0.1785, + "step": 23368 + }, + { + "epoch": 0.4168123283273285, + "grad_norm": 0.3761289119720459, + "learning_rate": 3.620758805768896e-05, + "loss": 0.1494, + "step": 23369 + }, + { + "epoch": 0.4168301644490422, + "grad_norm": 0.16350127756595612, + "learning_rate": 3.6206196702791186e-05, + "loss": 0.1181, + "step": 23370 + }, + { + "epoch": 0.4168480005707559, + "grad_norm": 0.22421710193157196, + "learning_rate": 3.6204805304454334e-05, + "loss": 0.1521, + "step": 23371 + }, + { + "epoch": 0.41686583669246957, + "grad_norm": 0.20705775916576385, + "learning_rate": 3.620341386268379e-05, + "loss": 0.1168, + "step": 23372 + }, + { + "epoch": 0.41688367281418326, + "grad_norm": 0.2572283446788788, + "learning_rate": 3.620202237748493e-05, + "loss": 0.1374, + "step": 23373 + }, + { + "epoch": 0.416901508935897, + "grad_norm": 0.3022323548793793, + "learning_rate": 3.620063084886318e-05, + "loss": 0.1563, + "step": 23374 + }, + { + "epoch": 0.4169193450576107, + "grad_norm": 0.19538500905036926, + "learning_rate": 3.61992392768239e-05, + "loss": 0.124, + "step": 23375 + }, + { + "epoch": 0.4169371811793244, + "grad_norm": 0.26758912205696106, + "learning_rate": 3.619784766137251e-05, + "loss": 0.2, + "step": 23376 + }, + { + "epoch": 0.41695501730103807, + "grad_norm": 0.2323193997144699, + "learning_rate": 3.619645600251439e-05, + "loss": 0.1597, + "step": 23377 + }, + { + "epoch": 0.41697285342275175, + "grad_norm": 0.24129992723464966, + "learning_rate": 3.619506430025494e-05, + "loss": 0.1635, + "step": 23378 + }, + { + "epoch": 0.41699068954446544, + "grad_norm": 0.2884138524532318, + "learning_rate": 3.619367255459955e-05, + "loss": 0.1376, + "step": 23379 + }, + { + "epoch": 0.41700852566617913, + "grad_norm": 0.2532219886779785, + "learning_rate": 3.6192280765553624e-05, + "loss": 0.1283, + "step": 23380 + }, + { + "epoch": 0.4170263617878928, + "grad_norm": 0.2978946566581726, + "learning_rate": 3.619088893312255e-05, + "loss": 0.1756, + "step": 23381 + }, + { + "epoch": 0.41704419790960656, + "grad_norm": 0.22412151098251343, + "learning_rate": 3.6189497057311735e-05, + "loss": 0.1388, + "step": 23382 + }, + { + "epoch": 0.41706203403132025, + "grad_norm": 0.31035101413726807, + "learning_rate": 3.618810513812655e-05, + "loss": 0.2271, + "step": 23383 + }, + { + "epoch": 0.41707987015303394, + "grad_norm": 0.25164157152175903, + "learning_rate": 3.618671317557242e-05, + "loss": 0.167, + "step": 23384 + }, + { + "epoch": 0.4170977062747476, + "grad_norm": 0.23634856939315796, + "learning_rate": 3.6185321169654714e-05, + "loss": 0.1385, + "step": 23385 + }, + { + "epoch": 0.4171155423964613, + "grad_norm": 0.2758852243423462, + "learning_rate": 3.618392912037884e-05, + "loss": 0.174, + "step": 23386 + }, + { + "epoch": 0.417133378518175, + "grad_norm": 0.3032567799091339, + "learning_rate": 3.6182537027750205e-05, + "loss": 0.1714, + "step": 23387 + }, + { + "epoch": 0.4171512146398887, + "grad_norm": 0.25116029381752014, + "learning_rate": 3.618114489177418e-05, + "loss": 0.1335, + "step": 23388 + }, + { + "epoch": 0.4171690507616024, + "grad_norm": 0.2649308741092682, + "learning_rate": 3.617975271245619e-05, + "loss": 0.143, + "step": 23389 + }, + { + "epoch": 0.41718688688331607, + "grad_norm": 0.35784125328063965, + "learning_rate": 3.61783604898016e-05, + "loss": 0.2568, + "step": 23390 + }, + { + "epoch": 0.4172047230050298, + "grad_norm": 0.23808801174163818, + "learning_rate": 3.617696822381584e-05, + "loss": 0.1657, + "step": 23391 + }, + { + "epoch": 0.4172225591267435, + "grad_norm": 0.24368254840373993, + "learning_rate": 3.617557591450428e-05, + "loss": 0.1325, + "step": 23392 + }, + { + "epoch": 0.4172403952484572, + "grad_norm": 0.32355621457099915, + "learning_rate": 3.617418356187233e-05, + "loss": 0.22, + "step": 23393 + }, + { + "epoch": 0.4172582313701709, + "grad_norm": 0.2619912028312683, + "learning_rate": 3.617279116592539e-05, + "loss": 0.1348, + "step": 23394 + }, + { + "epoch": 0.41727606749188456, + "grad_norm": 0.2434076964855194, + "learning_rate": 3.617139872666885e-05, + "loss": 0.1283, + "step": 23395 + }, + { + "epoch": 0.41729390361359825, + "grad_norm": 0.28391337394714355, + "learning_rate": 3.617000624410811e-05, + "loss": 0.1334, + "step": 23396 + }, + { + "epoch": 0.41731173973531194, + "grad_norm": 0.25604772567749023, + "learning_rate": 3.6168613718248574e-05, + "loss": 0.1102, + "step": 23397 + }, + { + "epoch": 0.4173295758570256, + "grad_norm": 0.3729875981807709, + "learning_rate": 3.616722114909562e-05, + "loss": 0.1808, + "step": 23398 + }, + { + "epoch": 0.41734741197873937, + "grad_norm": 0.28842636942863464, + "learning_rate": 3.6165828536654666e-05, + "loss": 0.136, + "step": 23399 + }, + { + "epoch": 0.41736524810045306, + "grad_norm": 0.2618440091609955, + "learning_rate": 3.6164435880931116e-05, + "loss": 0.1738, + "step": 23400 + }, + { + "epoch": 0.41738308422216674, + "grad_norm": 0.30283886194229126, + "learning_rate": 3.616304318193034e-05, + "loss": 0.1184, + "step": 23401 + }, + { + "epoch": 0.41740092034388043, + "grad_norm": 0.24745404720306396, + "learning_rate": 3.616165043965776e-05, + "loss": 0.1694, + "step": 23402 + }, + { + "epoch": 0.4174187564655941, + "grad_norm": 0.18638542294502258, + "learning_rate": 3.616025765411876e-05, + "loss": 0.1493, + "step": 23403 + }, + { + "epoch": 0.4174365925873078, + "grad_norm": 0.3083762228488922, + "learning_rate": 3.615886482531876e-05, + "loss": 0.1582, + "step": 23404 + }, + { + "epoch": 0.4174544287090215, + "grad_norm": 0.25041407346725464, + "learning_rate": 3.615747195326314e-05, + "loss": 0.1361, + "step": 23405 + }, + { + "epoch": 0.4174722648307352, + "grad_norm": 0.2558700144290924, + "learning_rate": 3.61560790379573e-05, + "loss": 0.1673, + "step": 23406 + }, + { + "epoch": 0.4174901009524489, + "grad_norm": 0.22794656455516815, + "learning_rate": 3.6154686079406645e-05, + "loss": 0.1656, + "step": 23407 + }, + { + "epoch": 0.4175079370741626, + "grad_norm": 0.367709219455719, + "learning_rate": 3.6153293077616576e-05, + "loss": 0.1676, + "step": 23408 + }, + { + "epoch": 0.4175257731958763, + "grad_norm": 0.4466867446899414, + "learning_rate": 3.6151900032592495e-05, + "loss": 0.1031, + "step": 23409 + }, + { + "epoch": 0.41754360931759, + "grad_norm": 0.2913598120212555, + "learning_rate": 3.61505069443398e-05, + "loss": 0.2015, + "step": 23410 + }, + { + "epoch": 0.4175614454393037, + "grad_norm": 0.23446781933307648, + "learning_rate": 3.614911381286389e-05, + "loss": 0.1509, + "step": 23411 + }, + { + "epoch": 0.41757928156101737, + "grad_norm": 0.233299121260643, + "learning_rate": 3.6147720638170155e-05, + "loss": 0.1509, + "step": 23412 + }, + { + "epoch": 0.41759711768273106, + "grad_norm": 0.26786768436431885, + "learning_rate": 3.6146327420264006e-05, + "loss": 0.1379, + "step": 23413 + }, + { + "epoch": 0.41761495380444474, + "grad_norm": 0.25657570362091064, + "learning_rate": 3.6144934159150836e-05, + "loss": 0.1488, + "step": 23414 + }, + { + "epoch": 0.41763278992615843, + "grad_norm": 0.196097269654274, + "learning_rate": 3.6143540854836065e-05, + "loss": 0.1324, + "step": 23415 + }, + { + "epoch": 0.4176506260478722, + "grad_norm": 0.29617664217948914, + "learning_rate": 3.6142147507325074e-05, + "loss": 0.1524, + "step": 23416 + }, + { + "epoch": 0.41766846216958586, + "grad_norm": 0.281398743391037, + "learning_rate": 3.614075411662327e-05, + "loss": 0.1505, + "step": 23417 + }, + { + "epoch": 0.41768629829129955, + "grad_norm": 0.2473234385251999, + "learning_rate": 3.613936068273606e-05, + "loss": 0.11, + "step": 23418 + }, + { + "epoch": 0.41770413441301324, + "grad_norm": 0.33206626772880554, + "learning_rate": 3.613796720566884e-05, + "loss": 0.1342, + "step": 23419 + }, + { + "epoch": 0.4177219705347269, + "grad_norm": 0.2577970027923584, + "learning_rate": 3.6136573685427e-05, + "loss": 0.1637, + "step": 23420 + }, + { + "epoch": 0.4177398066564406, + "grad_norm": 0.2510599195957184, + "learning_rate": 3.613518012201597e-05, + "loss": 0.1521, + "step": 23421 + }, + { + "epoch": 0.4177576427781543, + "grad_norm": 0.30506131052970886, + "learning_rate": 3.613378651544113e-05, + "loss": 0.2153, + "step": 23422 + }, + { + "epoch": 0.417775478899868, + "grad_norm": 0.29952675104141235, + "learning_rate": 3.613239286570789e-05, + "loss": 0.1853, + "step": 23423 + }, + { + "epoch": 0.41779331502158173, + "grad_norm": 0.32651522755622864, + "learning_rate": 3.613099917282165e-05, + "loss": 0.1619, + "step": 23424 + }, + { + "epoch": 0.4178111511432954, + "grad_norm": 0.4050840437412262, + "learning_rate": 3.612960543678781e-05, + "loss": 0.1205, + "step": 23425 + }, + { + "epoch": 0.4178289872650091, + "grad_norm": 0.27712371945381165, + "learning_rate": 3.612821165761177e-05, + "loss": 0.146, + "step": 23426 + }, + { + "epoch": 0.4178468233867228, + "grad_norm": 0.23165202140808105, + "learning_rate": 3.612681783529894e-05, + "loss": 0.1785, + "step": 23427 + }, + { + "epoch": 0.4178646595084365, + "grad_norm": 0.4011196792125702, + "learning_rate": 3.612542396985473e-05, + "loss": 0.1583, + "step": 23428 + }, + { + "epoch": 0.4178824956301502, + "grad_norm": 0.24495315551757812, + "learning_rate": 3.612403006128453e-05, + "loss": 0.1607, + "step": 23429 + }, + { + "epoch": 0.41790033175186386, + "grad_norm": 0.2975074350833893, + "learning_rate": 3.612263610959375e-05, + "loss": 0.1624, + "step": 23430 + }, + { + "epoch": 0.41791816787357755, + "grad_norm": 0.3701333701610565, + "learning_rate": 3.612124211478778e-05, + "loss": 0.1314, + "step": 23431 + }, + { + "epoch": 0.41793600399529124, + "grad_norm": 0.24394141137599945, + "learning_rate": 3.6119848076872045e-05, + "loss": 0.1496, + "step": 23432 + }, + { + "epoch": 0.417953840117005, + "grad_norm": 0.280828058719635, + "learning_rate": 3.6118453995851935e-05, + "loss": 0.1753, + "step": 23433 + }, + { + "epoch": 0.41797167623871867, + "grad_norm": 0.2302258461713791, + "learning_rate": 3.6117059871732856e-05, + "loss": 0.1523, + "step": 23434 + }, + { + "epoch": 0.41798951236043236, + "grad_norm": 0.23049025237560272, + "learning_rate": 3.611566570452021e-05, + "loss": 0.1209, + "step": 23435 + }, + { + "epoch": 0.41800734848214605, + "grad_norm": 0.279909610748291, + "learning_rate": 3.611427149421941e-05, + "loss": 0.2142, + "step": 23436 + }, + { + "epoch": 0.41802518460385973, + "grad_norm": 0.27000412344932556, + "learning_rate": 3.611287724083586e-05, + "loss": 0.1513, + "step": 23437 + }, + { + "epoch": 0.4180430207255734, + "grad_norm": 0.3522058129310608, + "learning_rate": 3.6111482944374955e-05, + "loss": 0.1016, + "step": 23438 + }, + { + "epoch": 0.4180608568472871, + "grad_norm": 0.28997260332107544, + "learning_rate": 3.61100886048421e-05, + "loss": 0.1559, + "step": 23439 + }, + { + "epoch": 0.4180786929690008, + "grad_norm": 0.2866147756576538, + "learning_rate": 3.610869422224271e-05, + "loss": 0.1297, + "step": 23440 + }, + { + "epoch": 0.41809652909071454, + "grad_norm": 0.24868327379226685, + "learning_rate": 3.610729979658218e-05, + "loss": 0.1465, + "step": 23441 + }, + { + "epoch": 0.41811436521242823, + "grad_norm": 0.2247244119644165, + "learning_rate": 3.610590532786592e-05, + "loss": 0.1161, + "step": 23442 + }, + { + "epoch": 0.4181322013341419, + "grad_norm": 0.3591804802417755, + "learning_rate": 3.610451081609934e-05, + "loss": 0.225, + "step": 23443 + }, + { + "epoch": 0.4181500374558556, + "grad_norm": 0.4007553160190582, + "learning_rate": 3.610311626128783e-05, + "loss": 0.216, + "step": 23444 + }, + { + "epoch": 0.4181678735775693, + "grad_norm": 0.22225411236286163, + "learning_rate": 3.610172166343682e-05, + "loss": 0.1122, + "step": 23445 + }, + { + "epoch": 0.418185709699283, + "grad_norm": 0.1966107189655304, + "learning_rate": 3.610032702255169e-05, + "loss": 0.1285, + "step": 23446 + }, + { + "epoch": 0.41820354582099667, + "grad_norm": 0.4930487275123596, + "learning_rate": 3.609893233863786e-05, + "loss": 0.1185, + "step": 23447 + }, + { + "epoch": 0.41822138194271036, + "grad_norm": 0.22558413445949554, + "learning_rate": 3.609753761170074e-05, + "loss": 0.1351, + "step": 23448 + }, + { + "epoch": 0.41823921806442405, + "grad_norm": 0.22826021909713745, + "learning_rate": 3.609614284174574e-05, + "loss": 0.1203, + "step": 23449 + }, + { + "epoch": 0.4182570541861378, + "grad_norm": 0.21137797832489014, + "learning_rate": 3.609474802877824e-05, + "loss": 0.1116, + "step": 23450 + }, + { + "epoch": 0.4182748903078515, + "grad_norm": 0.17108070850372314, + "learning_rate": 3.609335317280367e-05, + "loss": 0.1431, + "step": 23451 + }, + { + "epoch": 0.41829272642956516, + "grad_norm": 0.2823784649372101, + "learning_rate": 3.609195827382744e-05, + "loss": 0.1902, + "step": 23452 + }, + { + "epoch": 0.41831056255127885, + "grad_norm": 0.3045092821121216, + "learning_rate": 3.609056333185494e-05, + "loss": 0.1488, + "step": 23453 + }, + { + "epoch": 0.41832839867299254, + "grad_norm": 0.24341613054275513, + "learning_rate": 3.608916834689159e-05, + "loss": 0.1433, + "step": 23454 + }, + { + "epoch": 0.41834623479470623, + "grad_norm": 0.30423226952552795, + "learning_rate": 3.6087773318942785e-05, + "loss": 0.201, + "step": 23455 + }, + { + "epoch": 0.4183640709164199, + "grad_norm": 0.38588663935661316, + "learning_rate": 3.608637824801395e-05, + "loss": 0.1285, + "step": 23456 + }, + { + "epoch": 0.4183819070381336, + "grad_norm": 0.37726718187332153, + "learning_rate": 3.608498313411049e-05, + "loss": 0.141, + "step": 23457 + }, + { + "epoch": 0.41839974315984735, + "grad_norm": 0.35581764578819275, + "learning_rate": 3.60835879772378e-05, + "loss": 0.1408, + "step": 23458 + }, + { + "epoch": 0.41841757928156104, + "grad_norm": 0.23475539684295654, + "learning_rate": 3.608219277740129e-05, + "loss": 0.1477, + "step": 23459 + }, + { + "epoch": 0.4184354154032747, + "grad_norm": 0.3895374834537506, + "learning_rate": 3.608079753460638e-05, + "loss": 0.2008, + "step": 23460 + }, + { + "epoch": 0.4184532515249884, + "grad_norm": 0.24002403020858765, + "learning_rate": 3.607940224885846e-05, + "loss": 0.1717, + "step": 23461 + }, + { + "epoch": 0.4184710876467021, + "grad_norm": 0.2423631250858307, + "learning_rate": 3.6078006920162965e-05, + "loss": 0.1195, + "step": 23462 + }, + { + "epoch": 0.4184889237684158, + "grad_norm": 0.2357056587934494, + "learning_rate": 3.6076611548525285e-05, + "loss": 0.094, + "step": 23463 + }, + { + "epoch": 0.4185067598901295, + "grad_norm": 0.2701158821582794, + "learning_rate": 3.607521613395083e-05, + "loss": 0.1552, + "step": 23464 + }, + { + "epoch": 0.41852459601184316, + "grad_norm": 0.26690196990966797, + "learning_rate": 3.607382067644501e-05, + "loss": 0.127, + "step": 23465 + }, + { + "epoch": 0.41854243213355685, + "grad_norm": 0.33754757046699524, + "learning_rate": 3.6072425176013235e-05, + "loss": 0.1489, + "step": 23466 + }, + { + "epoch": 0.4185602682552706, + "grad_norm": 0.24351945519447327, + "learning_rate": 3.607102963266092e-05, + "loss": 0.1569, + "step": 23467 + }, + { + "epoch": 0.4185781043769843, + "grad_norm": 0.3732454776763916, + "learning_rate": 3.6069634046393476e-05, + "loss": 0.1813, + "step": 23468 + }, + { + "epoch": 0.41859594049869797, + "grad_norm": 0.21357886493206024, + "learning_rate": 3.6068238417216295e-05, + "loss": 0.1636, + "step": 23469 + }, + { + "epoch": 0.41861377662041166, + "grad_norm": 0.39409980177879333, + "learning_rate": 3.60668427451348e-05, + "loss": 0.143, + "step": 23470 + }, + { + "epoch": 0.41863161274212535, + "grad_norm": 0.34588822722435, + "learning_rate": 3.606544703015442e-05, + "loss": 0.1199, + "step": 23471 + }, + { + "epoch": 0.41864944886383904, + "grad_norm": 0.27032163739204407, + "learning_rate": 3.606405127228052e-05, + "loss": 0.1499, + "step": 23472 + }, + { + "epoch": 0.4186672849855527, + "grad_norm": 0.44959843158721924, + "learning_rate": 3.6062655471518556e-05, + "loss": 0.2312, + "step": 23473 + }, + { + "epoch": 0.4186851211072664, + "grad_norm": 0.2795168459415436, + "learning_rate": 3.60612596278739e-05, + "loss": 0.1339, + "step": 23474 + }, + { + "epoch": 0.41870295722898015, + "grad_norm": 0.334330290555954, + "learning_rate": 3.605986374135199e-05, + "loss": 0.1939, + "step": 23475 + }, + { + "epoch": 0.41872079335069384, + "grad_norm": 0.2417813390493393, + "learning_rate": 3.605846781195823e-05, + "loss": 0.1389, + "step": 23476 + }, + { + "epoch": 0.41873862947240753, + "grad_norm": 0.23533926904201508, + "learning_rate": 3.6057071839698026e-05, + "loss": 0.1479, + "step": 23477 + }, + { + "epoch": 0.4187564655941212, + "grad_norm": 0.2055702954530716, + "learning_rate": 3.6055675824576795e-05, + "loss": 0.1068, + "step": 23478 + }, + { + "epoch": 0.4187743017158349, + "grad_norm": 0.21435564756393433, + "learning_rate": 3.6054279766599946e-05, + "loss": 0.1435, + "step": 23479 + }, + { + "epoch": 0.4187921378375486, + "grad_norm": 0.2539478540420532, + "learning_rate": 3.60528836657729e-05, + "loss": 0.117, + "step": 23480 + }, + { + "epoch": 0.4188099739592623, + "grad_norm": 0.326172798871994, + "learning_rate": 3.605148752210104e-05, + "loss": 0.1344, + "step": 23481 + }, + { + "epoch": 0.41882781008097597, + "grad_norm": 0.29429468512535095, + "learning_rate": 3.60500913355898e-05, + "loss": 0.1833, + "step": 23482 + }, + { + "epoch": 0.4188456462026897, + "grad_norm": 0.2629934549331665, + "learning_rate": 3.60486951062446e-05, + "loss": 0.1215, + "step": 23483 + }, + { + "epoch": 0.4188634823244034, + "grad_norm": 0.22771377861499786, + "learning_rate": 3.604729883407084e-05, + "loss": 0.1554, + "step": 23484 + }, + { + "epoch": 0.4188813184461171, + "grad_norm": 0.26041877269744873, + "learning_rate": 3.6045902519073925e-05, + "loss": 0.1431, + "step": 23485 + }, + { + "epoch": 0.4188991545678308, + "grad_norm": 0.2677501440048218, + "learning_rate": 3.604450616125929e-05, + "loss": 0.1812, + "step": 23486 + }, + { + "epoch": 0.41891699068954447, + "grad_norm": 0.2599920630455017, + "learning_rate": 3.6043109760632326e-05, + "loss": 0.1287, + "step": 23487 + }, + { + "epoch": 0.41893482681125815, + "grad_norm": 0.2790832817554474, + "learning_rate": 3.604171331719846e-05, + "loss": 0.1486, + "step": 23488 + }, + { + "epoch": 0.41895266293297184, + "grad_norm": 0.23356810212135315, + "learning_rate": 3.60403168309631e-05, + "loss": 0.1565, + "step": 23489 + }, + { + "epoch": 0.41897049905468553, + "grad_norm": 0.2652481496334076, + "learning_rate": 3.603892030193164e-05, + "loss": 0.0968, + "step": 23490 + }, + { + "epoch": 0.4189883351763992, + "grad_norm": 0.22331734001636505, + "learning_rate": 3.6037523730109534e-05, + "loss": 0.1675, + "step": 23491 + }, + { + "epoch": 0.41900617129811296, + "grad_norm": 0.237565815448761, + "learning_rate": 3.6036127115502164e-05, + "loss": 0.1264, + "step": 23492 + }, + { + "epoch": 0.41902400741982665, + "grad_norm": 0.1968141794204712, + "learning_rate": 3.603473045811496e-05, + "loss": 0.0989, + "step": 23493 + }, + { + "epoch": 0.41904184354154034, + "grad_norm": 0.2734472453594208, + "learning_rate": 3.603333375795333e-05, + "loss": 0.2273, + "step": 23494 + }, + { + "epoch": 0.419059679663254, + "grad_norm": 0.23806653916835785, + "learning_rate": 3.6031937015022676e-05, + "loss": 0.1582, + "step": 23495 + }, + { + "epoch": 0.4190775157849677, + "grad_norm": 0.244029238820076, + "learning_rate": 3.6030540229328434e-05, + "loss": 0.1248, + "step": 23496 + }, + { + "epoch": 0.4190953519066814, + "grad_norm": 0.2946047782897949, + "learning_rate": 3.6029143400876e-05, + "loss": 0.0984, + "step": 23497 + }, + { + "epoch": 0.4191131880283951, + "grad_norm": 0.23965643346309662, + "learning_rate": 3.60277465296708e-05, + "loss": 0.1635, + "step": 23498 + }, + { + "epoch": 0.4191310241501088, + "grad_norm": 0.21011687815189362, + "learning_rate": 3.602634961571825e-05, + "loss": 0.1697, + "step": 23499 + }, + { + "epoch": 0.4191488602718225, + "grad_norm": 0.37227901816368103, + "learning_rate": 3.6024952659023756e-05, + "loss": 0.162, + "step": 23500 + }, + { + "epoch": 0.4191666963935362, + "grad_norm": 0.22852960228919983, + "learning_rate": 3.6023555659592744e-05, + "loss": 0.1501, + "step": 23501 + }, + { + "epoch": 0.4191845325152499, + "grad_norm": 0.23039503395557404, + "learning_rate": 3.602215861743062e-05, + "loss": 0.1374, + "step": 23502 + }, + { + "epoch": 0.4192023686369636, + "grad_norm": 0.20760031044483185, + "learning_rate": 3.602076153254279e-05, + "loss": 0.1252, + "step": 23503 + }, + { + "epoch": 0.4192202047586773, + "grad_norm": 0.22538182139396667, + "learning_rate": 3.60193644049347e-05, + "loss": 0.1374, + "step": 23504 + }, + { + "epoch": 0.41923804088039096, + "grad_norm": 0.2839062213897705, + "learning_rate": 3.601796723461174e-05, + "loss": 0.1305, + "step": 23505 + }, + { + "epoch": 0.41925587700210465, + "grad_norm": 0.37018483877182007, + "learning_rate": 3.601657002157934e-05, + "loss": 0.1736, + "step": 23506 + }, + { + "epoch": 0.41927371312381834, + "grad_norm": 0.22349640727043152, + "learning_rate": 3.60151727658429e-05, + "loss": 0.1743, + "step": 23507 + }, + { + "epoch": 0.419291549245532, + "grad_norm": 0.28234776854515076, + "learning_rate": 3.601377546740785e-05, + "loss": 0.1752, + "step": 23508 + }, + { + "epoch": 0.41930938536724577, + "grad_norm": 0.26952725648880005, + "learning_rate": 3.60123781262796e-05, + "loss": 0.1789, + "step": 23509 + }, + { + "epoch": 0.41932722148895946, + "grad_norm": 0.2512272298336029, + "learning_rate": 3.601098074246357e-05, + "loss": 0.103, + "step": 23510 + }, + { + "epoch": 0.41934505761067314, + "grad_norm": 0.2429272085428238, + "learning_rate": 3.600958331596517e-05, + "loss": 0.184, + "step": 23511 + }, + { + "epoch": 0.41936289373238683, + "grad_norm": 0.2635399103164673, + "learning_rate": 3.600818584678983e-05, + "loss": 0.1704, + "step": 23512 + }, + { + "epoch": 0.4193807298541005, + "grad_norm": 0.2201332449913025, + "learning_rate": 3.600678833494296e-05, + "loss": 0.1292, + "step": 23513 + }, + { + "epoch": 0.4193985659758142, + "grad_norm": 0.27403369545936584, + "learning_rate": 3.600539078042998e-05, + "loss": 0.1675, + "step": 23514 + }, + { + "epoch": 0.4194164020975279, + "grad_norm": 0.34280261397361755, + "learning_rate": 3.6003993183256293e-05, + "loss": 0.1432, + "step": 23515 + }, + { + "epoch": 0.4194342382192416, + "grad_norm": 0.3418848514556885, + "learning_rate": 3.6002595543427336e-05, + "loss": 0.185, + "step": 23516 + }, + { + "epoch": 0.4194520743409553, + "grad_norm": 0.287748783826828, + "learning_rate": 3.6001197860948515e-05, + "loss": 0.1434, + "step": 23517 + }, + { + "epoch": 0.419469910462669, + "grad_norm": 0.26222291588783264, + "learning_rate": 3.599980013582525e-05, + "loss": 0.1789, + "step": 23518 + }, + { + "epoch": 0.4194877465843827, + "grad_norm": 0.20043300092220306, + "learning_rate": 3.5998402368062964e-05, + "loss": 0.1676, + "step": 23519 + }, + { + "epoch": 0.4195055827060964, + "grad_norm": 0.38191646337509155, + "learning_rate": 3.599700455766707e-05, + "loss": 0.1887, + "step": 23520 + }, + { + "epoch": 0.4195234188278101, + "grad_norm": 0.3139938414096832, + "learning_rate": 3.599560670464299e-05, + "loss": 0.1453, + "step": 23521 + }, + { + "epoch": 0.41954125494952377, + "grad_norm": 0.31105825304985046, + "learning_rate": 3.599420880899614e-05, + "loss": 0.1366, + "step": 23522 + }, + { + "epoch": 0.41955909107123746, + "grad_norm": 0.19444000720977783, + "learning_rate": 3.599281087073194e-05, + "loss": 0.1115, + "step": 23523 + }, + { + "epoch": 0.41957692719295114, + "grad_norm": 0.2868615984916687, + "learning_rate": 3.5991412889855804e-05, + "loss": 0.2068, + "step": 23524 + }, + { + "epoch": 0.4195947633146649, + "grad_norm": 0.24259425699710846, + "learning_rate": 3.599001486637315e-05, + "loss": 0.1991, + "step": 23525 + }, + { + "epoch": 0.4196125994363786, + "grad_norm": 0.19913876056671143, + "learning_rate": 3.598861680028942e-05, + "loss": 0.1354, + "step": 23526 + }, + { + "epoch": 0.41963043555809226, + "grad_norm": 0.24840469658374786, + "learning_rate": 3.598721869161001e-05, + "loss": 0.1858, + "step": 23527 + }, + { + "epoch": 0.41964827167980595, + "grad_norm": 0.3437330424785614, + "learning_rate": 3.598582054034034e-05, + "loss": 0.1271, + "step": 23528 + }, + { + "epoch": 0.41966610780151964, + "grad_norm": 0.25164932012557983, + "learning_rate": 3.5984422346485835e-05, + "loss": 0.14, + "step": 23529 + }, + { + "epoch": 0.4196839439232333, + "grad_norm": 0.2910948395729065, + "learning_rate": 3.5983024110051924e-05, + "loss": 0.1735, + "step": 23530 + }, + { + "epoch": 0.419701780044947, + "grad_norm": 0.22296029329299927, + "learning_rate": 3.598162583104401e-05, + "loss": 0.1106, + "step": 23531 + }, + { + "epoch": 0.4197196161666607, + "grad_norm": 0.29615527391433716, + "learning_rate": 3.598022750946752e-05, + "loss": 0.1809, + "step": 23532 + }, + { + "epoch": 0.4197374522883744, + "grad_norm": 0.29448026418685913, + "learning_rate": 3.597882914532788e-05, + "loss": 0.1659, + "step": 23533 + }, + { + "epoch": 0.41975528841008813, + "grad_norm": 0.26123934984207153, + "learning_rate": 3.5977430738630505e-05, + "loss": 0.1791, + "step": 23534 + }, + { + "epoch": 0.4197731245318018, + "grad_norm": 0.3136507570743561, + "learning_rate": 3.597603228938082e-05, + "loss": 0.1743, + "step": 23535 + }, + { + "epoch": 0.4197909606535155, + "grad_norm": 0.24377232789993286, + "learning_rate": 3.597463379758424e-05, + "loss": 0.1107, + "step": 23536 + }, + { + "epoch": 0.4198087967752292, + "grad_norm": 0.2058449238538742, + "learning_rate": 3.5973235263246184e-05, + "loss": 0.1008, + "step": 23537 + }, + { + "epoch": 0.4198266328969429, + "grad_norm": 0.4670116901397705, + "learning_rate": 3.597183668637209e-05, + "loss": 0.1907, + "step": 23538 + }, + { + "epoch": 0.4198444690186566, + "grad_norm": 0.26883623003959656, + "learning_rate": 3.597043806696735e-05, + "loss": 0.1781, + "step": 23539 + }, + { + "epoch": 0.41986230514037026, + "grad_norm": 0.27269601821899414, + "learning_rate": 3.596903940503742e-05, + "loss": 0.1546, + "step": 23540 + }, + { + "epoch": 0.41988014126208395, + "grad_norm": 0.2307538241147995, + "learning_rate": 3.5967640700587693e-05, + "loss": 0.1669, + "step": 23541 + }, + { + "epoch": 0.4198979773837977, + "grad_norm": 0.26575759053230286, + "learning_rate": 3.5966241953623614e-05, + "loss": 0.0946, + "step": 23542 + }, + { + "epoch": 0.4199158135055114, + "grad_norm": 0.2895721197128296, + "learning_rate": 3.596484316415058e-05, + "loss": 0.179, + "step": 23543 + }, + { + "epoch": 0.41993364962722507, + "grad_norm": 0.27788469195365906, + "learning_rate": 3.596344433217404e-05, + "loss": 0.2152, + "step": 23544 + }, + { + "epoch": 0.41995148574893876, + "grad_norm": 0.42665326595306396, + "learning_rate": 3.596204545769939e-05, + "loss": 0.2037, + "step": 23545 + }, + { + "epoch": 0.41996932187065245, + "grad_norm": 0.21723249554634094, + "learning_rate": 3.596064654073207e-05, + "loss": 0.0994, + "step": 23546 + }, + { + "epoch": 0.41998715799236613, + "grad_norm": 0.21885307133197784, + "learning_rate": 3.5959247581277513e-05, + "loss": 0.1357, + "step": 23547 + }, + { + "epoch": 0.4200049941140798, + "grad_norm": 0.30911707878112793, + "learning_rate": 3.595784857934111e-05, + "loss": 0.1402, + "step": 23548 + }, + { + "epoch": 0.4200228302357935, + "grad_norm": 0.23402488231658936, + "learning_rate": 3.5956449534928304e-05, + "loss": 0.1537, + "step": 23549 + }, + { + "epoch": 0.4200406663575072, + "grad_norm": 0.18036618828773499, + "learning_rate": 3.595505044804452e-05, + "loss": 0.1235, + "step": 23550 + }, + { + "epoch": 0.42005850247922094, + "grad_norm": 0.488210529088974, + "learning_rate": 3.595365131869518e-05, + "loss": 0.1756, + "step": 23551 + }, + { + "epoch": 0.42007633860093463, + "grad_norm": 0.29397720098495483, + "learning_rate": 3.595225214688569e-05, + "loss": 0.1681, + "step": 23552 + }, + { + "epoch": 0.4200941747226483, + "grad_norm": 0.453151136636734, + "learning_rate": 3.59508529326215e-05, + "loss": 0.1668, + "step": 23553 + }, + { + "epoch": 0.420112010844362, + "grad_norm": 0.3504691421985626, + "learning_rate": 3.5949453675908016e-05, + "loss": 0.2063, + "step": 23554 + }, + { + "epoch": 0.4201298469660757, + "grad_norm": 0.3381533920764923, + "learning_rate": 3.594805437675067e-05, + "loss": 0.178, + "step": 23555 + }, + { + "epoch": 0.4201476830877894, + "grad_norm": 0.2532208561897278, + "learning_rate": 3.5946655035154886e-05, + "loss": 0.1979, + "step": 23556 + }, + { + "epoch": 0.42016551920950307, + "grad_norm": 0.308187335729599, + "learning_rate": 3.5945255651126085e-05, + "loss": 0.1998, + "step": 23557 + }, + { + "epoch": 0.42018335533121676, + "grad_norm": 0.2154446244239807, + "learning_rate": 3.5943856224669695e-05, + "loss": 0.0835, + "step": 23558 + }, + { + "epoch": 0.4202011914529305, + "grad_norm": 0.3717007040977478, + "learning_rate": 3.5942456755791124e-05, + "loss": 0.2221, + "step": 23559 + }, + { + "epoch": 0.4202190275746442, + "grad_norm": 0.2461913526058197, + "learning_rate": 3.5941057244495826e-05, + "loss": 0.1213, + "step": 23560 + }, + { + "epoch": 0.4202368636963579, + "grad_norm": 0.27049311995506287, + "learning_rate": 3.593965769078921e-05, + "loss": 0.1733, + "step": 23561 + }, + { + "epoch": 0.42025469981807156, + "grad_norm": 0.22452659904956818, + "learning_rate": 3.59382580946767e-05, + "loss": 0.1562, + "step": 23562 + }, + { + "epoch": 0.42027253593978525, + "grad_norm": 0.314168781042099, + "learning_rate": 3.593685845616372e-05, + "loss": 0.1276, + "step": 23563 + }, + { + "epoch": 0.42029037206149894, + "grad_norm": 0.26814785599708557, + "learning_rate": 3.593545877525571e-05, + "loss": 0.1752, + "step": 23564 + }, + { + "epoch": 0.42030820818321263, + "grad_norm": 0.24847298860549927, + "learning_rate": 3.593405905195807e-05, + "loss": 0.1444, + "step": 23565 + }, + { + "epoch": 0.4203260443049263, + "grad_norm": 0.33460527658462524, + "learning_rate": 3.593265928627625e-05, + "loss": 0.1459, + "step": 23566 + }, + { + "epoch": 0.42034388042664, + "grad_norm": 0.2722640931606293, + "learning_rate": 3.593125947821566e-05, + "loss": 0.1548, + "step": 23567 + }, + { + "epoch": 0.42036171654835375, + "grad_norm": 0.26249417662620544, + "learning_rate": 3.592985962778174e-05, + "loss": 0.1591, + "step": 23568 + }, + { + "epoch": 0.42037955267006744, + "grad_norm": 0.24926874041557312, + "learning_rate": 3.5928459734979915e-05, + "loss": 0.1344, + "step": 23569 + }, + { + "epoch": 0.4203973887917811, + "grad_norm": 0.26802873611450195, + "learning_rate": 3.5927059799815595e-05, + "loss": 0.1956, + "step": 23570 + }, + { + "epoch": 0.4204152249134948, + "grad_norm": 0.5149135589599609, + "learning_rate": 3.592565982229422e-05, + "loss": 0.1341, + "step": 23571 + }, + { + "epoch": 0.4204330610352085, + "grad_norm": 0.20482110977172852, + "learning_rate": 3.592425980242121e-05, + "loss": 0.1312, + "step": 23572 + }, + { + "epoch": 0.4204508971569222, + "grad_norm": 0.28916677832603455, + "learning_rate": 3.5922859740202e-05, + "loss": 0.136, + "step": 23573 + }, + { + "epoch": 0.4204687332786359, + "grad_norm": 0.2726137936115265, + "learning_rate": 3.592145963564201e-05, + "loss": 0.1812, + "step": 23574 + }, + { + "epoch": 0.42048656940034956, + "grad_norm": 0.21922151744365692, + "learning_rate": 3.5920059488746674e-05, + "loss": 0.1384, + "step": 23575 + }, + { + "epoch": 0.4205044055220633, + "grad_norm": 0.26206812262535095, + "learning_rate": 3.5918659299521414e-05, + "loss": 0.1503, + "step": 23576 + }, + { + "epoch": 0.420522241643777, + "grad_norm": 0.24848762154579163, + "learning_rate": 3.591725906797166e-05, + "loss": 0.1702, + "step": 23577 + }, + { + "epoch": 0.4205400777654907, + "grad_norm": 0.25790759921073914, + "learning_rate": 3.591585879410284e-05, + "loss": 0.1202, + "step": 23578 + }, + { + "epoch": 0.42055791388720437, + "grad_norm": 0.2853497564792633, + "learning_rate": 3.591445847792038e-05, + "loss": 0.1392, + "step": 23579 + }, + { + "epoch": 0.42057575000891806, + "grad_norm": 0.29755979776382446, + "learning_rate": 3.5913058119429706e-05, + "loss": 0.1564, + "step": 23580 + }, + { + "epoch": 0.42059358613063175, + "grad_norm": 0.25221002101898193, + "learning_rate": 3.591165771863625e-05, + "loss": 0.1818, + "step": 23581 + }, + { + "epoch": 0.42061142225234543, + "grad_norm": 0.2708946168422699, + "learning_rate": 3.5910257275545445e-05, + "loss": 0.1515, + "step": 23582 + }, + { + "epoch": 0.4206292583740591, + "grad_norm": 0.24300266802310944, + "learning_rate": 3.590885679016271e-05, + "loss": 0.1553, + "step": 23583 + }, + { + "epoch": 0.42064709449577287, + "grad_norm": 0.2931252121925354, + "learning_rate": 3.5907456262493485e-05, + "loss": 0.1711, + "step": 23584 + }, + { + "epoch": 0.42066493061748655, + "grad_norm": 0.3231823742389679, + "learning_rate": 3.5906055692543186e-05, + "loss": 0.2082, + "step": 23585 + }, + { + "epoch": 0.42068276673920024, + "grad_norm": 0.25302112102508545, + "learning_rate": 3.590465508031725e-05, + "loss": 0.1334, + "step": 23586 + }, + { + "epoch": 0.42070060286091393, + "grad_norm": 0.25858011841773987, + "learning_rate": 3.59032544258211e-05, + "loss": 0.1754, + "step": 23587 + }, + { + "epoch": 0.4207184389826276, + "grad_norm": 0.30044376850128174, + "learning_rate": 3.590185372906018e-05, + "loss": 0.1398, + "step": 23588 + }, + { + "epoch": 0.4207362751043413, + "grad_norm": 0.2664319574832916, + "learning_rate": 3.59004529900399e-05, + "loss": 0.1316, + "step": 23589 + }, + { + "epoch": 0.420754111226055, + "grad_norm": 0.23234841227531433, + "learning_rate": 3.58990522087657e-05, + "loss": 0.1166, + "step": 23590 + }, + { + "epoch": 0.4207719473477687, + "grad_norm": 0.3134883642196655, + "learning_rate": 3.589765138524301e-05, + "loss": 0.2265, + "step": 23591 + }, + { + "epoch": 0.42078978346948237, + "grad_norm": 0.21505266427993774, + "learning_rate": 3.589625051947727e-05, + "loss": 0.1607, + "step": 23592 + }, + { + "epoch": 0.4208076195911961, + "grad_norm": 0.238050639629364, + "learning_rate": 3.589484961147389e-05, + "loss": 0.1752, + "step": 23593 + }, + { + "epoch": 0.4208254557129098, + "grad_norm": 0.2429279386997223, + "learning_rate": 3.5893448661238305e-05, + "loss": 0.1541, + "step": 23594 + }, + { + "epoch": 0.4208432918346235, + "grad_norm": 0.3417379856109619, + "learning_rate": 3.5892047668775964e-05, + "loss": 0.1447, + "step": 23595 + }, + { + "epoch": 0.4208611279563372, + "grad_norm": 0.31697213649749756, + "learning_rate": 3.589064663409227e-05, + "loss": 0.1602, + "step": 23596 + }, + { + "epoch": 0.42087896407805087, + "grad_norm": 0.2814241051673889, + "learning_rate": 3.588924555719268e-05, + "loss": 0.1461, + "step": 23597 + }, + { + "epoch": 0.42089680019976455, + "grad_norm": 0.4176954925060272, + "learning_rate": 3.588784443808261e-05, + "loss": 0.191, + "step": 23598 + }, + { + "epoch": 0.42091463632147824, + "grad_norm": 0.23985715210437775, + "learning_rate": 3.588644327676749e-05, + "loss": 0.1549, + "step": 23599 + }, + { + "epoch": 0.42093247244319193, + "grad_norm": 0.32432979345321655, + "learning_rate": 3.588504207325276e-05, + "loss": 0.1343, + "step": 23600 + }, + { + "epoch": 0.4209503085649057, + "grad_norm": 0.22184689342975616, + "learning_rate": 3.588364082754384e-05, + "loss": 0.1881, + "step": 23601 + }, + { + "epoch": 0.42096814468661936, + "grad_norm": 0.24987797439098358, + "learning_rate": 3.588223953964618e-05, + "loss": 0.1482, + "step": 23602 + }, + { + "epoch": 0.42098598080833305, + "grad_norm": 0.3044281601905823, + "learning_rate": 3.5880838209565195e-05, + "loss": 0.1386, + "step": 23603 + }, + { + "epoch": 0.42100381693004674, + "grad_norm": 0.2577959895133972, + "learning_rate": 3.5879436837306325e-05, + "loss": 0.1497, + "step": 23604 + }, + { + "epoch": 0.4210216530517604, + "grad_norm": 0.2630673050880432, + "learning_rate": 3.5878035422875e-05, + "loss": 0.1295, + "step": 23605 + }, + { + "epoch": 0.4210394891734741, + "grad_norm": 0.22514985501766205, + "learning_rate": 3.5876633966276645e-05, + "loss": 0.1306, + "step": 23606 + }, + { + "epoch": 0.4210573252951878, + "grad_norm": 0.32045015692710876, + "learning_rate": 3.5875232467516704e-05, + "loss": 0.1343, + "step": 23607 + }, + { + "epoch": 0.4210751614169015, + "grad_norm": 0.234904944896698, + "learning_rate": 3.587383092660062e-05, + "loss": 0.1422, + "step": 23608 + }, + { + "epoch": 0.4210929975386152, + "grad_norm": 0.20119354128837585, + "learning_rate": 3.5872429343533793e-05, + "loss": 0.1428, + "step": 23609 + }, + { + "epoch": 0.4211108336603289, + "grad_norm": 0.23028001189231873, + "learning_rate": 3.587102771832168e-05, + "loss": 0.1737, + "step": 23610 + }, + { + "epoch": 0.4211286697820426, + "grad_norm": 0.18881520628929138, + "learning_rate": 3.586962605096971e-05, + "loss": 0.1355, + "step": 23611 + }, + { + "epoch": 0.4211465059037563, + "grad_norm": 0.2693285048007965, + "learning_rate": 3.586822434148332e-05, + "loss": 0.1229, + "step": 23612 + }, + { + "epoch": 0.42116434202547, + "grad_norm": 0.29025503993034363, + "learning_rate": 3.586682258986793e-05, + "loss": 0.166, + "step": 23613 + }, + { + "epoch": 0.4211821781471837, + "grad_norm": 0.271465539932251, + "learning_rate": 3.586542079612899e-05, + "loss": 0.126, + "step": 23614 + }, + { + "epoch": 0.42120001426889736, + "grad_norm": 0.2229209691286087, + "learning_rate": 3.586401896027192e-05, + "loss": 0.1378, + "step": 23615 + }, + { + "epoch": 0.42121785039061105, + "grad_norm": 0.24449051916599274, + "learning_rate": 3.5862617082302164e-05, + "loss": 0.1703, + "step": 23616 + }, + { + "epoch": 0.42123568651232474, + "grad_norm": 0.25472691655158997, + "learning_rate": 3.586121516222515e-05, + "loss": 0.1469, + "step": 23617 + }, + { + "epoch": 0.4212535226340385, + "grad_norm": 0.20706047117710114, + "learning_rate": 3.585981320004632e-05, + "loss": 0.1583, + "step": 23618 + }, + { + "epoch": 0.42127135875575217, + "grad_norm": 0.34037670493125916, + "learning_rate": 3.58584111957711e-05, + "loss": 0.1485, + "step": 23619 + }, + { + "epoch": 0.42128919487746586, + "grad_norm": 0.32346442341804504, + "learning_rate": 3.5857009149404927e-05, + "loss": 0.1611, + "step": 23620 + }, + { + "epoch": 0.42130703099917954, + "grad_norm": 0.25114670395851135, + "learning_rate": 3.585560706095323e-05, + "loss": 0.1636, + "step": 23621 + }, + { + "epoch": 0.42132486712089323, + "grad_norm": 0.2218189686536789, + "learning_rate": 3.585420493042146e-05, + "loss": 0.1573, + "step": 23622 + }, + { + "epoch": 0.4213427032426069, + "grad_norm": 0.2421419471502304, + "learning_rate": 3.5852802757815044e-05, + "loss": 0.1961, + "step": 23623 + }, + { + "epoch": 0.4213605393643206, + "grad_norm": 0.3725896179676056, + "learning_rate": 3.585140054313941e-05, + "loss": 0.1619, + "step": 23624 + }, + { + "epoch": 0.4213783754860343, + "grad_norm": 0.24416519701480865, + "learning_rate": 3.5849998286400005e-05, + "loss": 0.174, + "step": 23625 + }, + { + "epoch": 0.421396211607748, + "grad_norm": 0.26666975021362305, + "learning_rate": 3.584859598760225e-05, + "loss": 0.156, + "step": 23626 + }, + { + "epoch": 0.4214140477294617, + "grad_norm": 0.1893339902162552, + "learning_rate": 3.58471936467516e-05, + "loss": 0.1152, + "step": 23627 + }, + { + "epoch": 0.4214318838511754, + "grad_norm": 0.29018595814704895, + "learning_rate": 3.584579126385347e-05, + "loss": 0.1275, + "step": 23628 + }, + { + "epoch": 0.4214497199728891, + "grad_norm": 0.30612584948539734, + "learning_rate": 3.5844388838913316e-05, + "loss": 0.1721, + "step": 23629 + }, + { + "epoch": 0.4214675560946028, + "grad_norm": 0.2823601961135864, + "learning_rate": 3.584298637193656e-05, + "loss": 0.1406, + "step": 23630 + }, + { + "epoch": 0.4214853922163165, + "grad_norm": 0.2962504029273987, + "learning_rate": 3.584158386292865e-05, + "loss": 0.1744, + "step": 23631 + }, + { + "epoch": 0.42150322833803017, + "grad_norm": 0.24475839734077454, + "learning_rate": 3.584018131189502e-05, + "loss": 0.1654, + "step": 23632 + }, + { + "epoch": 0.42152106445974385, + "grad_norm": 0.45434901118278503, + "learning_rate": 3.583877871884109e-05, + "loss": 0.191, + "step": 23633 + }, + { + "epoch": 0.42153890058145754, + "grad_norm": 0.2057182639837265, + "learning_rate": 3.5837376083772315e-05, + "loss": 0.1663, + "step": 23634 + }, + { + "epoch": 0.4215567367031713, + "grad_norm": 0.2028048038482666, + "learning_rate": 3.583597340669413e-05, + "loss": 0.1277, + "step": 23635 + }, + { + "epoch": 0.421574572824885, + "grad_norm": 0.45485812425613403, + "learning_rate": 3.583457068761197e-05, + "loss": 0.1454, + "step": 23636 + }, + { + "epoch": 0.42159240894659866, + "grad_norm": 0.253262996673584, + "learning_rate": 3.583316792653127e-05, + "loss": 0.1633, + "step": 23637 + }, + { + "epoch": 0.42161024506831235, + "grad_norm": 0.2653898000717163, + "learning_rate": 3.5831765123457474e-05, + "loss": 0.1603, + "step": 23638 + }, + { + "epoch": 0.42162808119002604, + "grad_norm": 0.326353520154953, + "learning_rate": 3.5830362278396004e-05, + "loss": 0.1441, + "step": 23639 + }, + { + "epoch": 0.4216459173117397, + "grad_norm": 0.34983694553375244, + "learning_rate": 3.582895939135232e-05, + "loss": 0.1871, + "step": 23640 + }, + { + "epoch": 0.4216637534334534, + "grad_norm": 0.3122880458831787, + "learning_rate": 3.582755646233185e-05, + "loss": 0.1882, + "step": 23641 + }, + { + "epoch": 0.4216815895551671, + "grad_norm": 0.2588161528110504, + "learning_rate": 3.582615349134002e-05, + "loss": 0.1258, + "step": 23642 + }, + { + "epoch": 0.42169942567688085, + "grad_norm": 0.2501724660396576, + "learning_rate": 3.582475047838229e-05, + "loss": 0.117, + "step": 23643 + }, + { + "epoch": 0.42171726179859453, + "grad_norm": 0.19072869420051575, + "learning_rate": 3.582334742346408e-05, + "loss": 0.1248, + "step": 23644 + }, + { + "epoch": 0.4217350979203082, + "grad_norm": 0.29543107748031616, + "learning_rate": 3.5821944326590836e-05, + "loss": 0.1902, + "step": 23645 + }, + { + "epoch": 0.4217529340420219, + "grad_norm": 0.30165714025497437, + "learning_rate": 3.5820541187768006e-05, + "loss": 0.1254, + "step": 23646 + }, + { + "epoch": 0.4217707701637356, + "grad_norm": 0.29045385122299194, + "learning_rate": 3.581913800700103e-05, + "loss": 0.0816, + "step": 23647 + }, + { + "epoch": 0.4217886062854493, + "grad_norm": 0.24130268394947052, + "learning_rate": 3.581773478429532e-05, + "loss": 0.1064, + "step": 23648 + }, + { + "epoch": 0.421806442407163, + "grad_norm": 0.24183712899684906, + "learning_rate": 3.581633151965634e-05, + "loss": 0.2029, + "step": 23649 + }, + { + "epoch": 0.42182427852887666, + "grad_norm": 0.32043445110321045, + "learning_rate": 3.581492821308953e-05, + "loss": 0.1578, + "step": 23650 + }, + { + "epoch": 0.42184211465059035, + "grad_norm": 0.22469887137413025, + "learning_rate": 3.581352486460031e-05, + "loss": 0.1023, + "step": 23651 + }, + { + "epoch": 0.4218599507723041, + "grad_norm": 0.23213882744312286, + "learning_rate": 3.581212147419414e-05, + "loss": 0.1526, + "step": 23652 + }, + { + "epoch": 0.4218777868940178, + "grad_norm": 0.3670051097869873, + "learning_rate": 3.581071804187646e-05, + "loss": 0.1764, + "step": 23653 + }, + { + "epoch": 0.42189562301573147, + "grad_norm": 0.262602835893631, + "learning_rate": 3.580931456765269e-05, + "loss": 0.1891, + "step": 23654 + }, + { + "epoch": 0.42191345913744516, + "grad_norm": 0.22695784270763397, + "learning_rate": 3.580791105152829e-05, + "loss": 0.1318, + "step": 23655 + }, + { + "epoch": 0.42193129525915885, + "grad_norm": 0.2450912594795227, + "learning_rate": 3.580650749350869e-05, + "loss": 0.1625, + "step": 23656 + }, + { + "epoch": 0.42194913138087253, + "grad_norm": 0.21248646080493927, + "learning_rate": 3.580510389359934e-05, + "loss": 0.1585, + "step": 23657 + }, + { + "epoch": 0.4219669675025862, + "grad_norm": 0.41088610887527466, + "learning_rate": 3.5803700251805674e-05, + "loss": 0.1674, + "step": 23658 + }, + { + "epoch": 0.4219848036242999, + "grad_norm": 0.2377018928527832, + "learning_rate": 3.580229656813313e-05, + "loss": 0.1573, + "step": 23659 + }, + { + "epoch": 0.42200263974601365, + "grad_norm": 0.26952099800109863, + "learning_rate": 3.580089284258716e-05, + "loss": 0.1353, + "step": 23660 + }, + { + "epoch": 0.42202047586772734, + "grad_norm": 0.28703954815864563, + "learning_rate": 3.579948907517319e-05, + "loss": 0.144, + "step": 23661 + }, + { + "epoch": 0.42203831198944103, + "grad_norm": 0.22947105765342712, + "learning_rate": 3.579808526589668e-05, + "loss": 0.1239, + "step": 23662 + }, + { + "epoch": 0.4220561481111547, + "grad_norm": 0.2878875434398651, + "learning_rate": 3.579668141476305e-05, + "loss": 0.2091, + "step": 23663 + }, + { + "epoch": 0.4220739842328684, + "grad_norm": 0.19706681370735168, + "learning_rate": 3.579527752177777e-05, + "loss": 0.1599, + "step": 23664 + }, + { + "epoch": 0.4220918203545821, + "grad_norm": 0.2394402027130127, + "learning_rate": 3.579387358694625e-05, + "loss": 0.1409, + "step": 23665 + }, + { + "epoch": 0.4221096564762958, + "grad_norm": 0.2105996161699295, + "learning_rate": 3.579246961027396e-05, + "loss": 0.2027, + "step": 23666 + }, + { + "epoch": 0.42212749259800947, + "grad_norm": 0.18609221279621124, + "learning_rate": 3.579106559176632e-05, + "loss": 0.131, + "step": 23667 + }, + { + "epoch": 0.42214532871972316, + "grad_norm": 0.24411694705486298, + "learning_rate": 3.578966153142879e-05, + "loss": 0.1485, + "step": 23668 + }, + { + "epoch": 0.4221631648414369, + "grad_norm": 0.2051008641719818, + "learning_rate": 3.5788257429266804e-05, + "loss": 0.1254, + "step": 23669 + }, + { + "epoch": 0.4221810009631506, + "grad_norm": 0.27412158250808716, + "learning_rate": 3.5786853285285805e-05, + "loss": 0.1747, + "step": 23670 + }, + { + "epoch": 0.4221988370848643, + "grad_norm": 0.2188235968351364, + "learning_rate": 3.578544909949123e-05, + "loss": 0.1357, + "step": 23671 + }, + { + "epoch": 0.42221667320657796, + "grad_norm": 0.226033017039299, + "learning_rate": 3.578404487188854e-05, + "loss": 0.1606, + "step": 23672 + }, + { + "epoch": 0.42223450932829165, + "grad_norm": 0.2508758306503296, + "learning_rate": 3.5782640602483166e-05, + "loss": 0.1499, + "step": 23673 + }, + { + "epoch": 0.42225234545000534, + "grad_norm": 0.31331875920295715, + "learning_rate": 3.578123629128055e-05, + "loss": 0.1627, + "step": 23674 + }, + { + "epoch": 0.422270181571719, + "grad_norm": 0.3454189598560333, + "learning_rate": 3.577983193828615e-05, + "loss": 0.1701, + "step": 23675 + }, + { + "epoch": 0.4222880176934327, + "grad_norm": 0.28553205728530884, + "learning_rate": 3.5778427543505375e-05, + "loss": 0.1489, + "step": 23676 + }, + { + "epoch": 0.42230585381514646, + "grad_norm": 0.22232919931411743, + "learning_rate": 3.5777023106943706e-05, + "loss": 0.1596, + "step": 23677 + }, + { + "epoch": 0.42232368993686015, + "grad_norm": 0.2846347987651825, + "learning_rate": 3.577561862860657e-05, + "loss": 0.1272, + "step": 23678 + }, + { + "epoch": 0.42234152605857384, + "grad_norm": 0.3221072554588318, + "learning_rate": 3.577421410849942e-05, + "loss": 0.1931, + "step": 23679 + }, + { + "epoch": 0.4223593621802875, + "grad_norm": 0.21035178005695343, + "learning_rate": 3.577280954662769e-05, + "loss": 0.167, + "step": 23680 + }, + { + "epoch": 0.4223771983020012, + "grad_norm": 0.2143920660018921, + "learning_rate": 3.5771404942996825e-05, + "loss": 0.126, + "step": 23681 + }, + { + "epoch": 0.4223950344237149, + "grad_norm": 0.5835981369018555, + "learning_rate": 3.577000029761228e-05, + "loss": 0.151, + "step": 23682 + }, + { + "epoch": 0.4224128705454286, + "grad_norm": 0.38990381360054016, + "learning_rate": 3.5768595610479496e-05, + "loss": 0.1893, + "step": 23683 + }, + { + "epoch": 0.4224307066671423, + "grad_norm": 0.28467774391174316, + "learning_rate": 3.5767190881603904e-05, + "loss": 0.1568, + "step": 23684 + }, + { + "epoch": 0.422448542788856, + "grad_norm": 0.27227291464805603, + "learning_rate": 3.576578611099097e-05, + "loss": 0.127, + "step": 23685 + }, + { + "epoch": 0.4224663789105697, + "grad_norm": 0.2288699597120285, + "learning_rate": 3.576438129864613e-05, + "loss": 0.1356, + "step": 23686 + }, + { + "epoch": 0.4224842150322834, + "grad_norm": 0.2638051211833954, + "learning_rate": 3.5762976444574835e-05, + "loss": 0.1392, + "step": 23687 + }, + { + "epoch": 0.4225020511539971, + "grad_norm": 0.30850347876548767, + "learning_rate": 3.576157154878253e-05, + "loss": 0.1803, + "step": 23688 + }, + { + "epoch": 0.42251988727571077, + "grad_norm": 0.2594379782676697, + "learning_rate": 3.5760166611274646e-05, + "loss": 0.1249, + "step": 23689 + }, + { + "epoch": 0.42253772339742446, + "grad_norm": 0.27456754446029663, + "learning_rate": 3.575876163205664e-05, + "loss": 0.2294, + "step": 23690 + }, + { + "epoch": 0.42255555951913815, + "grad_norm": 0.2205992043018341, + "learning_rate": 3.575735661113396e-05, + "loss": 0.1522, + "step": 23691 + }, + { + "epoch": 0.42257339564085183, + "grad_norm": 0.4007713496685028, + "learning_rate": 3.575595154851205e-05, + "loss": 0.1421, + "step": 23692 + }, + { + "epoch": 0.4225912317625655, + "grad_norm": 0.2664162516593933, + "learning_rate": 3.575454644419636e-05, + "loss": 0.1366, + "step": 23693 + }, + { + "epoch": 0.42260906788427927, + "grad_norm": 0.2566847503185272, + "learning_rate": 3.575314129819233e-05, + "loss": 0.1533, + "step": 23694 + }, + { + "epoch": 0.42262690400599295, + "grad_norm": 0.30679431557655334, + "learning_rate": 3.575173611050541e-05, + "loss": 0.1864, + "step": 23695 + }, + { + "epoch": 0.42264474012770664, + "grad_norm": 0.21846045553684235, + "learning_rate": 3.575033088114105e-05, + "loss": 0.143, + "step": 23696 + }, + { + "epoch": 0.42266257624942033, + "grad_norm": 0.23655980825424194, + "learning_rate": 3.5748925610104694e-05, + "loss": 0.1118, + "step": 23697 + }, + { + "epoch": 0.422680412371134, + "grad_norm": 0.21267226338386536, + "learning_rate": 3.574752029740179e-05, + "loss": 0.1122, + "step": 23698 + }, + { + "epoch": 0.4226982484928477, + "grad_norm": 0.28920501470565796, + "learning_rate": 3.574611494303778e-05, + "loss": 0.148, + "step": 23699 + }, + { + "epoch": 0.4227160846145614, + "grad_norm": 0.20284847915172577, + "learning_rate": 3.574470954701812e-05, + "loss": 0.1458, + "step": 23700 + }, + { + "epoch": 0.4227339207362751, + "grad_norm": 0.3121538758277893, + "learning_rate": 3.5743304109348265e-05, + "loss": 0.1543, + "step": 23701 + }, + { + "epoch": 0.4227517568579888, + "grad_norm": 0.2441626340150833, + "learning_rate": 3.5741898630033635e-05, + "loss": 0.1919, + "step": 23702 + }, + { + "epoch": 0.4227695929797025, + "grad_norm": 0.3428318202495575, + "learning_rate": 3.574049310907971e-05, + "loss": 0.1634, + "step": 23703 + }, + { + "epoch": 0.4227874291014162, + "grad_norm": 0.23114174604415894, + "learning_rate": 3.573908754649192e-05, + "loss": 0.1234, + "step": 23704 + }, + { + "epoch": 0.4228052652231299, + "grad_norm": 0.2092934101819992, + "learning_rate": 3.5737681942275713e-05, + "loss": 0.1422, + "step": 23705 + }, + { + "epoch": 0.4228231013448436, + "grad_norm": 0.35993555188179016, + "learning_rate": 3.573627629643655e-05, + "loss": 0.1261, + "step": 23706 + }, + { + "epoch": 0.42284093746655727, + "grad_norm": 0.24289120733737946, + "learning_rate": 3.573487060897987e-05, + "loss": 0.1643, + "step": 23707 + }, + { + "epoch": 0.42285877358827095, + "grad_norm": 0.28490716218948364, + "learning_rate": 3.573346487991111e-05, + "loss": 0.1943, + "step": 23708 + }, + { + "epoch": 0.42287660970998464, + "grad_norm": 0.25983208417892456, + "learning_rate": 3.573205910923575e-05, + "loss": 0.1496, + "step": 23709 + }, + { + "epoch": 0.42289444583169833, + "grad_norm": 0.4670659899711609, + "learning_rate": 3.573065329695921e-05, + "loss": 0.1323, + "step": 23710 + }, + { + "epoch": 0.4229122819534121, + "grad_norm": 0.24420194327831268, + "learning_rate": 3.572924744308696e-05, + "loss": 0.1622, + "step": 23711 + }, + { + "epoch": 0.42293011807512576, + "grad_norm": 0.314409464597702, + "learning_rate": 3.572784154762443e-05, + "loss": 0.1883, + "step": 23712 + }, + { + "epoch": 0.42294795419683945, + "grad_norm": 0.27840864658355713, + "learning_rate": 3.572643561057709e-05, + "loss": 0.1836, + "step": 23713 + }, + { + "epoch": 0.42296579031855314, + "grad_norm": 0.31840789318084717, + "learning_rate": 3.572502963195039e-05, + "loss": 0.1551, + "step": 23714 + }, + { + "epoch": 0.4229836264402668, + "grad_norm": 0.3444361984729767, + "learning_rate": 3.5723623611749754e-05, + "loss": 0.1482, + "step": 23715 + }, + { + "epoch": 0.4230014625619805, + "grad_norm": 0.39516517519950867, + "learning_rate": 3.572221754998066e-05, + "loss": 0.2096, + "step": 23716 + }, + { + "epoch": 0.4230192986836942, + "grad_norm": 0.20691877603530884, + "learning_rate": 3.5720811446648546e-05, + "loss": 0.1584, + "step": 23717 + }, + { + "epoch": 0.4230371348054079, + "grad_norm": 0.18600843846797943, + "learning_rate": 3.571940530175886e-05, + "loss": 0.1406, + "step": 23718 + }, + { + "epoch": 0.42305497092712163, + "grad_norm": 0.26819100975990295, + "learning_rate": 3.5717999115317054e-05, + "loss": 0.1171, + "step": 23719 + }, + { + "epoch": 0.4230728070488353, + "grad_norm": 0.27075424790382385, + "learning_rate": 3.571659288732859e-05, + "loss": 0.1539, + "step": 23720 + }, + { + "epoch": 0.423090643170549, + "grad_norm": 0.32275882363319397, + "learning_rate": 3.5715186617798904e-05, + "loss": 0.2049, + "step": 23721 + }, + { + "epoch": 0.4231084792922627, + "grad_norm": 0.26822638511657715, + "learning_rate": 3.5713780306733455e-05, + "loss": 0.1772, + "step": 23722 + }, + { + "epoch": 0.4231263154139764, + "grad_norm": 0.2827689051628113, + "learning_rate": 3.571237395413769e-05, + "loss": 0.2033, + "step": 23723 + }, + { + "epoch": 0.42314415153569007, + "grad_norm": 0.2942645847797394, + "learning_rate": 3.5710967560017074e-05, + "loss": 0.1807, + "step": 23724 + }, + { + "epoch": 0.42316198765740376, + "grad_norm": 0.21800316870212555, + "learning_rate": 3.570956112437704e-05, + "loss": 0.0937, + "step": 23725 + }, + { + "epoch": 0.42317982377911745, + "grad_norm": 0.24841605126857758, + "learning_rate": 3.5708154647223044e-05, + "loss": 0.1602, + "step": 23726 + }, + { + "epoch": 0.42319765990083114, + "grad_norm": 0.4378014802932739, + "learning_rate": 3.570674812856056e-05, + "loss": 0.1805, + "step": 23727 + }, + { + "epoch": 0.4232154960225449, + "grad_norm": 0.3208266794681549, + "learning_rate": 3.5705341568395e-05, + "loss": 0.1482, + "step": 23728 + }, + { + "epoch": 0.42323333214425857, + "grad_norm": 0.31566476821899414, + "learning_rate": 3.570393496673186e-05, + "loss": 0.1053, + "step": 23729 + }, + { + "epoch": 0.42325116826597226, + "grad_norm": 0.42010727524757385, + "learning_rate": 3.5702528323576556e-05, + "loss": 0.1665, + "step": 23730 + }, + { + "epoch": 0.42326900438768594, + "grad_norm": 0.2503792345523834, + "learning_rate": 3.570112163893456e-05, + "loss": 0.1813, + "step": 23731 + }, + { + "epoch": 0.42328684050939963, + "grad_norm": 0.22296959161758423, + "learning_rate": 3.569971491281132e-05, + "loss": 0.1226, + "step": 23732 + }, + { + "epoch": 0.4233046766311133, + "grad_norm": 0.22855298221111298, + "learning_rate": 3.569830814521229e-05, + "loss": 0.1499, + "step": 23733 + }, + { + "epoch": 0.423322512752827, + "grad_norm": 0.23642213642597198, + "learning_rate": 3.569690133614292e-05, + "loss": 0.1453, + "step": 23734 + }, + { + "epoch": 0.4233403488745407, + "grad_norm": 0.3246593177318573, + "learning_rate": 3.569549448560867e-05, + "loss": 0.1409, + "step": 23735 + }, + { + "epoch": 0.42335818499625444, + "grad_norm": 0.24724628031253815, + "learning_rate": 3.5694087593614986e-05, + "loss": 0.1924, + "step": 23736 + }, + { + "epoch": 0.4233760211179681, + "grad_norm": 0.3473714590072632, + "learning_rate": 3.5692680660167325e-05, + "loss": 0.157, + "step": 23737 + }, + { + "epoch": 0.4233938572396818, + "grad_norm": 0.31693729758262634, + "learning_rate": 3.569127368527114e-05, + "loss": 0.1904, + "step": 23738 + }, + { + "epoch": 0.4234116933613955, + "grad_norm": 0.28705716133117676, + "learning_rate": 3.568986666893189e-05, + "loss": 0.1326, + "step": 23739 + }, + { + "epoch": 0.4234295294831092, + "grad_norm": 0.1985987275838852, + "learning_rate": 3.5688459611155024e-05, + "loss": 0.1121, + "step": 23740 + }, + { + "epoch": 0.4234473656048229, + "grad_norm": 0.26479002833366394, + "learning_rate": 3.568705251194599e-05, + "loss": 0.1916, + "step": 23741 + }, + { + "epoch": 0.42346520172653657, + "grad_norm": 0.35913845896720886, + "learning_rate": 3.568564537131026e-05, + "loss": 0.1757, + "step": 23742 + }, + { + "epoch": 0.42348303784825025, + "grad_norm": 0.27993494272232056, + "learning_rate": 3.568423818925327e-05, + "loss": 0.1926, + "step": 23743 + }, + { + "epoch": 0.423500873969964, + "grad_norm": 0.2530037760734558, + "learning_rate": 3.568283096578049e-05, + "loss": 0.1525, + "step": 23744 + }, + { + "epoch": 0.4235187100916777, + "grad_norm": 0.319872111082077, + "learning_rate": 3.568142370089735e-05, + "loss": 0.1669, + "step": 23745 + }, + { + "epoch": 0.4235365462133914, + "grad_norm": 0.258839875459671, + "learning_rate": 3.568001639460934e-05, + "loss": 0.1584, + "step": 23746 + }, + { + "epoch": 0.42355438233510506, + "grad_norm": 0.31688395142555237, + "learning_rate": 3.567860904692189e-05, + "loss": 0.1569, + "step": 23747 + }, + { + "epoch": 0.42357221845681875, + "grad_norm": 0.3236488997936249, + "learning_rate": 3.567720165784046e-05, + "loss": 0.1374, + "step": 23748 + }, + { + "epoch": 0.42359005457853244, + "grad_norm": 0.31253373622894287, + "learning_rate": 3.5675794227370516e-05, + "loss": 0.1728, + "step": 23749 + }, + { + "epoch": 0.4236078907002461, + "grad_norm": 0.3201337158679962, + "learning_rate": 3.567438675551751e-05, + "loss": 0.2051, + "step": 23750 + }, + { + "epoch": 0.4236257268219598, + "grad_norm": 0.2224111407995224, + "learning_rate": 3.567297924228689e-05, + "loss": 0.1139, + "step": 23751 + }, + { + "epoch": 0.4236435629436735, + "grad_norm": 0.24392402172088623, + "learning_rate": 3.5671571687684115e-05, + "loss": 0.1709, + "step": 23752 + }, + { + "epoch": 0.42366139906538725, + "grad_norm": 0.2275601178407669, + "learning_rate": 3.5670164091714645e-05, + "loss": 0.1803, + "step": 23753 + }, + { + "epoch": 0.42367923518710093, + "grad_norm": 0.21494191884994507, + "learning_rate": 3.5668756454383926e-05, + "loss": 0.0959, + "step": 23754 + }, + { + "epoch": 0.4236970713088146, + "grad_norm": 0.34057918190956116, + "learning_rate": 3.5667348775697426e-05, + "loss": 0.2145, + "step": 23755 + }, + { + "epoch": 0.4237149074305283, + "grad_norm": 0.2655284106731415, + "learning_rate": 3.5665941055660594e-05, + "loss": 0.1364, + "step": 23756 + }, + { + "epoch": 0.423732743552242, + "grad_norm": 0.2598637342453003, + "learning_rate": 3.5664533294278905e-05, + "loss": 0.1486, + "step": 23757 + }, + { + "epoch": 0.4237505796739557, + "grad_norm": 0.19027076661586761, + "learning_rate": 3.566312549155778e-05, + "loss": 0.1248, + "step": 23758 + }, + { + "epoch": 0.4237684157956694, + "grad_norm": 0.19882620871067047, + "learning_rate": 3.566171764750271e-05, + "loss": 0.1455, + "step": 23759 + }, + { + "epoch": 0.42378625191738306, + "grad_norm": 0.2997380793094635, + "learning_rate": 3.566030976211914e-05, + "loss": 0.1742, + "step": 23760 + }, + { + "epoch": 0.4238040880390968, + "grad_norm": 0.22737203538417816, + "learning_rate": 3.565890183541253e-05, + "loss": 0.1456, + "step": 23761 + }, + { + "epoch": 0.4238219241608105, + "grad_norm": 0.2445087879896164, + "learning_rate": 3.5657493867388324e-05, + "loss": 0.1699, + "step": 23762 + }, + { + "epoch": 0.4238397602825242, + "grad_norm": 0.35739731788635254, + "learning_rate": 3.5656085858052004e-05, + "loss": 0.1144, + "step": 23763 + }, + { + "epoch": 0.42385759640423787, + "grad_norm": 0.25468719005584717, + "learning_rate": 3.565467780740901e-05, + "loss": 0.1143, + "step": 23764 + }, + { + "epoch": 0.42387543252595156, + "grad_norm": 0.201274573802948, + "learning_rate": 3.5653269715464805e-05, + "loss": 0.1523, + "step": 23765 + }, + { + "epoch": 0.42389326864766524, + "grad_norm": 0.25827720761299133, + "learning_rate": 3.5651861582224844e-05, + "loss": 0.1229, + "step": 23766 + }, + { + "epoch": 0.42391110476937893, + "grad_norm": 0.3260040879249573, + "learning_rate": 3.565045340769458e-05, + "loss": 0.1454, + "step": 23767 + }, + { + "epoch": 0.4239289408910926, + "grad_norm": 0.25070345401763916, + "learning_rate": 3.56490451918795e-05, + "loss": 0.2157, + "step": 23768 + }, + { + "epoch": 0.4239467770128063, + "grad_norm": 0.29251378774642944, + "learning_rate": 3.564763693478503e-05, + "loss": 0.2199, + "step": 23769 + }, + { + "epoch": 0.42396461313452005, + "grad_norm": 0.2800651788711548, + "learning_rate": 3.564622863641665e-05, + "loss": 0.1551, + "step": 23770 + }, + { + "epoch": 0.42398244925623374, + "grad_norm": 0.1974204033613205, + "learning_rate": 3.56448202967798e-05, + "loss": 0.1439, + "step": 23771 + }, + { + "epoch": 0.42400028537794743, + "grad_norm": 0.27609798312187195, + "learning_rate": 3.5643411915879956e-05, + "loss": 0.1334, + "step": 23772 + }, + { + "epoch": 0.4240181214996611, + "grad_norm": 0.32134515047073364, + "learning_rate": 3.564200349372257e-05, + "loss": 0.1648, + "step": 23773 + }, + { + "epoch": 0.4240359576213748, + "grad_norm": 0.23624029755592346, + "learning_rate": 3.5640595030313105e-05, + "loss": 0.1753, + "step": 23774 + }, + { + "epoch": 0.4240537937430885, + "grad_norm": 0.4308352768421173, + "learning_rate": 3.5639186525657017e-05, + "loss": 0.2437, + "step": 23775 + }, + { + "epoch": 0.4240716298648022, + "grad_norm": 0.2978517711162567, + "learning_rate": 3.563777797975977e-05, + "loss": 0.2309, + "step": 23776 + }, + { + "epoch": 0.42408946598651587, + "grad_norm": 0.28222936391830444, + "learning_rate": 3.5636369392626813e-05, + "loss": 0.1818, + "step": 23777 + }, + { + "epoch": 0.4241073021082296, + "grad_norm": 0.26422178745269775, + "learning_rate": 3.563496076426362e-05, + "loss": 0.1565, + "step": 23778 + }, + { + "epoch": 0.4241251382299433, + "grad_norm": 0.22211819887161255, + "learning_rate": 3.563355209467566e-05, + "loss": 0.127, + "step": 23779 + }, + { + "epoch": 0.424142974351657, + "grad_norm": 0.27805814146995544, + "learning_rate": 3.563214338386836e-05, + "loss": 0.1514, + "step": 23780 + }, + { + "epoch": 0.4241608104733707, + "grad_norm": 0.17706941068172455, + "learning_rate": 3.56307346318472e-05, + "loss": 0.1408, + "step": 23781 + }, + { + "epoch": 0.42417864659508436, + "grad_norm": 0.2802286744117737, + "learning_rate": 3.5629325838617644e-05, + "loss": 0.1678, + "step": 23782 + }, + { + "epoch": 0.42419648271679805, + "grad_norm": 0.2613270580768585, + "learning_rate": 3.562791700418516e-05, + "loss": 0.1435, + "step": 23783 + }, + { + "epoch": 0.42421431883851174, + "grad_norm": 0.21706587076187134, + "learning_rate": 3.5626508128555184e-05, + "loss": 0.1868, + "step": 23784 + }, + { + "epoch": 0.4242321549602254, + "grad_norm": 0.25618359446525574, + "learning_rate": 3.56250992117332e-05, + "loss": 0.1549, + "step": 23785 + }, + { + "epoch": 0.42424999108193917, + "grad_norm": 0.34990188479423523, + "learning_rate": 3.562369025372466e-05, + "loss": 0.2101, + "step": 23786 + }, + { + "epoch": 0.42426782720365286, + "grad_norm": 0.26322513818740845, + "learning_rate": 3.562228125453503e-05, + "loss": 0.1763, + "step": 23787 + }, + { + "epoch": 0.42428566332536655, + "grad_norm": 0.23602735996246338, + "learning_rate": 3.5620872214169767e-05, + "loss": 0.1426, + "step": 23788 + }, + { + "epoch": 0.42430349944708023, + "grad_norm": 0.25292080640792847, + "learning_rate": 3.5619463132634333e-05, + "loss": 0.1676, + "step": 23789 + }, + { + "epoch": 0.4243213355687939, + "grad_norm": 0.3345840871334076, + "learning_rate": 3.561805400993419e-05, + "loss": 0.2286, + "step": 23790 + }, + { + "epoch": 0.4243391716905076, + "grad_norm": 0.2635435163974762, + "learning_rate": 3.561664484607481e-05, + "loss": 0.1482, + "step": 23791 + }, + { + "epoch": 0.4243570078122213, + "grad_norm": 0.24531997740268707, + "learning_rate": 3.561523564106165e-05, + "loss": 0.1939, + "step": 23792 + }, + { + "epoch": 0.424374843933935, + "grad_norm": 0.27716177701950073, + "learning_rate": 3.561382639490016e-05, + "loss": 0.146, + "step": 23793 + }, + { + "epoch": 0.4243926800556487, + "grad_norm": 0.26818716526031494, + "learning_rate": 3.561241710759582e-05, + "loss": 0.1625, + "step": 23794 + }, + { + "epoch": 0.4244105161773624, + "grad_norm": 0.31768810749053955, + "learning_rate": 3.561100777915408e-05, + "loss": 0.1805, + "step": 23795 + }, + { + "epoch": 0.4244283522990761, + "grad_norm": 0.28001144528388977, + "learning_rate": 3.560959840958042e-05, + "loss": 0.1585, + "step": 23796 + }, + { + "epoch": 0.4244461884207898, + "grad_norm": 0.24928061664104462, + "learning_rate": 3.5608188998880276e-05, + "loss": 0.1407, + "step": 23797 + }, + { + "epoch": 0.4244640245425035, + "grad_norm": 0.26309409737586975, + "learning_rate": 3.5606779547059145e-05, + "loss": 0.1488, + "step": 23798 + }, + { + "epoch": 0.42448186066421717, + "grad_norm": 0.21805571019649506, + "learning_rate": 3.560537005412246e-05, + "loss": 0.1552, + "step": 23799 + }, + { + "epoch": 0.42449969678593086, + "grad_norm": 0.3703272342681885, + "learning_rate": 3.5603960520075706e-05, + "loss": 0.1727, + "step": 23800 + }, + { + "epoch": 0.42451753290764455, + "grad_norm": 0.216144397854805, + "learning_rate": 3.560255094492433e-05, + "loss": 0.1724, + "step": 23801 + }, + { + "epoch": 0.42453536902935823, + "grad_norm": 0.27654367685317993, + "learning_rate": 3.56011413286738e-05, + "loss": 0.1085, + "step": 23802 + }, + { + "epoch": 0.424553205151072, + "grad_norm": 0.33142679929733276, + "learning_rate": 3.55997316713296e-05, + "loss": 0.204, + "step": 23803 + }, + { + "epoch": 0.42457104127278567, + "grad_norm": 0.3179510831832886, + "learning_rate": 3.5598321972897176e-05, + "loss": 0.1499, + "step": 23804 + }, + { + "epoch": 0.42458887739449935, + "grad_norm": 0.23971857130527496, + "learning_rate": 3.5596912233381996e-05, + "loss": 0.1581, + "step": 23805 + }, + { + "epoch": 0.42460671351621304, + "grad_norm": 0.2246101051568985, + "learning_rate": 3.559550245278951e-05, + "loss": 0.1426, + "step": 23806 + }, + { + "epoch": 0.42462454963792673, + "grad_norm": 0.21489155292510986, + "learning_rate": 3.5594092631125215e-05, + "loss": 0.1379, + "step": 23807 + }, + { + "epoch": 0.4246423857596404, + "grad_norm": 0.24619744718074799, + "learning_rate": 3.559268276839455e-05, + "loss": 0.107, + "step": 23808 + }, + { + "epoch": 0.4246602218813541, + "grad_norm": 0.2678712010383606, + "learning_rate": 3.559127286460299e-05, + "loss": 0.1912, + "step": 23809 + }, + { + "epoch": 0.4246780580030678, + "grad_norm": 0.2584265172481537, + "learning_rate": 3.558986291975599e-05, + "loss": 0.1306, + "step": 23810 + }, + { + "epoch": 0.4246958941247815, + "grad_norm": 0.28185492753982544, + "learning_rate": 3.558845293385903e-05, + "loss": 0.1847, + "step": 23811 + }, + { + "epoch": 0.4247137302464952, + "grad_norm": 0.23856894671916962, + "learning_rate": 3.5587042906917565e-05, + "loss": 0.1548, + "step": 23812 + }, + { + "epoch": 0.4247315663682089, + "grad_norm": 0.3829350173473358, + "learning_rate": 3.5585632838937075e-05, + "loss": 0.1763, + "step": 23813 + }, + { + "epoch": 0.4247494024899226, + "grad_norm": 0.2294577807188034, + "learning_rate": 3.558422272992301e-05, + "loss": 0.1632, + "step": 23814 + }, + { + "epoch": 0.4247672386116363, + "grad_norm": 0.3405967652797699, + "learning_rate": 3.558281257988084e-05, + "loss": 0.1153, + "step": 23815 + }, + { + "epoch": 0.42478507473335, + "grad_norm": 0.30611565709114075, + "learning_rate": 3.558140238881603e-05, + "loss": 0.1688, + "step": 23816 + }, + { + "epoch": 0.42480291085506366, + "grad_norm": 0.22104357182979584, + "learning_rate": 3.557999215673406e-05, + "loss": 0.1259, + "step": 23817 + }, + { + "epoch": 0.42482074697677735, + "grad_norm": 0.2542276680469513, + "learning_rate": 3.557858188364038e-05, + "loss": 0.1378, + "step": 23818 + }, + { + "epoch": 0.42483858309849104, + "grad_norm": 0.28939351439476013, + "learning_rate": 3.557717156954047e-05, + "loss": 0.2234, + "step": 23819 + }, + { + "epoch": 0.4248564192202048, + "grad_norm": 0.2534879148006439, + "learning_rate": 3.5575761214439786e-05, + "loss": 0.1917, + "step": 23820 + }, + { + "epoch": 0.42487425534191847, + "grad_norm": 0.23282602429389954, + "learning_rate": 3.557435081834379e-05, + "loss": 0.1524, + "step": 23821 + }, + { + "epoch": 0.42489209146363216, + "grad_norm": 0.27611321210861206, + "learning_rate": 3.557294038125797e-05, + "loss": 0.1334, + "step": 23822 + }, + { + "epoch": 0.42490992758534585, + "grad_norm": 0.26024171710014343, + "learning_rate": 3.557152990318777e-05, + "loss": 0.1659, + "step": 23823 + }, + { + "epoch": 0.42492776370705954, + "grad_norm": 0.39405298233032227, + "learning_rate": 3.5570119384138676e-05, + "loss": 0.166, + "step": 23824 + }, + { + "epoch": 0.4249455998287732, + "grad_norm": 0.24254606664180756, + "learning_rate": 3.556870882411615e-05, + "loss": 0.1236, + "step": 23825 + }, + { + "epoch": 0.4249634359504869, + "grad_norm": 0.23225122690200806, + "learning_rate": 3.556729822312566e-05, + "loss": 0.1492, + "step": 23826 + }, + { + "epoch": 0.4249812720722006, + "grad_norm": 0.18533039093017578, + "learning_rate": 3.5565887581172665e-05, + "loss": 0.1842, + "step": 23827 + }, + { + "epoch": 0.4249991081939143, + "grad_norm": 0.30577361583709717, + "learning_rate": 3.556447689826264e-05, + "loss": 0.1483, + "step": 23828 + }, + { + "epoch": 0.42501694431562803, + "grad_norm": 0.2784716486930847, + "learning_rate": 3.5563066174401054e-05, + "loss": 0.1702, + "step": 23829 + }, + { + "epoch": 0.4250347804373417, + "grad_norm": 0.21199016273021698, + "learning_rate": 3.556165540959338e-05, + "loss": 0.1059, + "step": 23830 + }, + { + "epoch": 0.4250526165590554, + "grad_norm": 0.2019483745098114, + "learning_rate": 3.5560244603845085e-05, + "loss": 0.1309, + "step": 23831 + }, + { + "epoch": 0.4250704526807691, + "grad_norm": 0.23276233673095703, + "learning_rate": 3.5558833757161626e-05, + "loss": 0.1326, + "step": 23832 + }, + { + "epoch": 0.4250882888024828, + "grad_norm": 0.26270052790641785, + "learning_rate": 3.5557422869548485e-05, + "loss": 0.1346, + "step": 23833 + }, + { + "epoch": 0.42510612492419647, + "grad_norm": 0.35761696100234985, + "learning_rate": 3.5556011941011124e-05, + "loss": 0.1777, + "step": 23834 + }, + { + "epoch": 0.42512396104591016, + "grad_norm": 0.18861781060695648, + "learning_rate": 3.555460097155502e-05, + "loss": 0.1515, + "step": 23835 + }, + { + "epoch": 0.42514179716762385, + "grad_norm": 0.2612263560295105, + "learning_rate": 3.5553189961185626e-05, + "loss": 0.2049, + "step": 23836 + }, + { + "epoch": 0.4251596332893376, + "grad_norm": 0.28499284386634827, + "learning_rate": 3.555177890990843e-05, + "loss": 0.1972, + "step": 23837 + }, + { + "epoch": 0.4251774694110513, + "grad_norm": 0.232050359249115, + "learning_rate": 3.5550367817728895e-05, + "loss": 0.1659, + "step": 23838 + }, + { + "epoch": 0.42519530553276497, + "grad_norm": 0.307167649269104, + "learning_rate": 3.554895668465249e-05, + "loss": 0.1846, + "step": 23839 + }, + { + "epoch": 0.42521314165447865, + "grad_norm": 0.2804708480834961, + "learning_rate": 3.554754551068469e-05, + "loss": 0.1547, + "step": 23840 + }, + { + "epoch": 0.42523097777619234, + "grad_norm": 0.3466965854167938, + "learning_rate": 3.5546134295830954e-05, + "loss": 0.2108, + "step": 23841 + }, + { + "epoch": 0.42524881389790603, + "grad_norm": 0.2884812653064728, + "learning_rate": 3.554472304009676e-05, + "loss": 0.1606, + "step": 23842 + }, + { + "epoch": 0.4252666500196197, + "grad_norm": 0.2581908404827118, + "learning_rate": 3.554331174348757e-05, + "loss": 0.1594, + "step": 23843 + }, + { + "epoch": 0.4252844861413334, + "grad_norm": 0.19995135068893433, + "learning_rate": 3.554190040600888e-05, + "loss": 0.1312, + "step": 23844 + }, + { + "epoch": 0.42530232226304715, + "grad_norm": 0.19691519439220428, + "learning_rate": 3.554048902766613e-05, + "loss": 0.1324, + "step": 23845 + }, + { + "epoch": 0.42532015838476084, + "grad_norm": 0.2621724009513855, + "learning_rate": 3.5539077608464814e-05, + "loss": 0.1209, + "step": 23846 + }, + { + "epoch": 0.4253379945064745, + "grad_norm": 0.16922634840011597, + "learning_rate": 3.553766614841038e-05, + "loss": 0.1398, + "step": 23847 + }, + { + "epoch": 0.4253558306281882, + "grad_norm": 0.22233954071998596, + "learning_rate": 3.553625464750832e-05, + "loss": 0.186, + "step": 23848 + }, + { + "epoch": 0.4253736667499019, + "grad_norm": 0.2795570194721222, + "learning_rate": 3.55348431057641e-05, + "loss": 0.1762, + "step": 23849 + }, + { + "epoch": 0.4253915028716156, + "grad_norm": 0.2672940790653229, + "learning_rate": 3.553343152318318e-05, + "loss": 0.1766, + "step": 23850 + }, + { + "epoch": 0.4254093389933293, + "grad_norm": 0.2887563407421112, + "learning_rate": 3.5532019899771045e-05, + "loss": 0.2031, + "step": 23851 + }, + { + "epoch": 0.42542717511504297, + "grad_norm": 0.32539504766464233, + "learning_rate": 3.553060823553317e-05, + "loss": 0.1544, + "step": 23852 + }, + { + "epoch": 0.42544501123675665, + "grad_norm": 0.25815871357917786, + "learning_rate": 3.552919653047502e-05, + "loss": 0.1472, + "step": 23853 + }, + { + "epoch": 0.4254628473584704, + "grad_norm": 0.27862316370010376, + "learning_rate": 3.5527784784602064e-05, + "loss": 0.1714, + "step": 23854 + }, + { + "epoch": 0.4254806834801841, + "grad_norm": 0.24769659340381622, + "learning_rate": 3.5526372997919774e-05, + "loss": 0.0831, + "step": 23855 + }, + { + "epoch": 0.4254985196018978, + "grad_norm": 0.2353099286556244, + "learning_rate": 3.552496117043364e-05, + "loss": 0.1333, + "step": 23856 + }, + { + "epoch": 0.42551635572361146, + "grad_norm": 0.26901599764823914, + "learning_rate": 3.552354930214911e-05, + "loss": 0.1541, + "step": 23857 + }, + { + "epoch": 0.42553419184532515, + "grad_norm": 0.31995871663093567, + "learning_rate": 3.552213739307166e-05, + "loss": 0.15, + "step": 23858 + }, + { + "epoch": 0.42555202796703884, + "grad_norm": 0.23480314016342163, + "learning_rate": 3.552072544320678e-05, + "loss": 0.2033, + "step": 23859 + }, + { + "epoch": 0.4255698640887525, + "grad_norm": 0.2328900545835495, + "learning_rate": 3.551931345255994e-05, + "loss": 0.1772, + "step": 23860 + }, + { + "epoch": 0.4255877002104662, + "grad_norm": 0.2869192063808441, + "learning_rate": 3.55179014211366e-05, + "loss": 0.1827, + "step": 23861 + }, + { + "epoch": 0.42560553633217996, + "grad_norm": 0.2263847291469574, + "learning_rate": 3.551648934894225e-05, + "loss": 0.1407, + "step": 23862 + }, + { + "epoch": 0.42562337245389364, + "grad_norm": 0.26627570390701294, + "learning_rate": 3.5515077235982354e-05, + "loss": 0.1714, + "step": 23863 + }, + { + "epoch": 0.42564120857560733, + "grad_norm": 0.26254335045814514, + "learning_rate": 3.551366508226237e-05, + "loss": 0.1407, + "step": 23864 + }, + { + "epoch": 0.425659044697321, + "grad_norm": 0.33721083402633667, + "learning_rate": 3.5512252887787806e-05, + "loss": 0.1665, + "step": 23865 + }, + { + "epoch": 0.4256768808190347, + "grad_norm": 0.27196943759918213, + "learning_rate": 3.551084065256411e-05, + "loss": 0.1295, + "step": 23866 + }, + { + "epoch": 0.4256947169407484, + "grad_norm": 0.34141629934310913, + "learning_rate": 3.550942837659678e-05, + "loss": 0.1878, + "step": 23867 + }, + { + "epoch": 0.4257125530624621, + "grad_norm": 0.2678629755973816, + "learning_rate": 3.550801605989126e-05, + "loss": 0.1388, + "step": 23868 + }, + { + "epoch": 0.4257303891841758, + "grad_norm": 0.2903549075126648, + "learning_rate": 3.550660370245305e-05, + "loss": 0.1197, + "step": 23869 + }, + { + "epoch": 0.42574822530588946, + "grad_norm": 0.32868221402168274, + "learning_rate": 3.55051913042876e-05, + "loss": 0.1806, + "step": 23870 + }, + { + "epoch": 0.4257660614276032, + "grad_norm": 0.2503345310688019, + "learning_rate": 3.550377886540042e-05, + "loss": 0.208, + "step": 23871 + }, + { + "epoch": 0.4257838975493169, + "grad_norm": 0.2820666432380676, + "learning_rate": 3.550236638579695e-05, + "loss": 0.1704, + "step": 23872 + }, + { + "epoch": 0.4258017336710306, + "grad_norm": 0.3565189838409424, + "learning_rate": 3.550095386548269e-05, + "loss": 0.2019, + "step": 23873 + }, + { + "epoch": 0.42581956979274427, + "grad_norm": 0.25917696952819824, + "learning_rate": 3.54995413044631e-05, + "loss": 0.1702, + "step": 23874 + }, + { + "epoch": 0.42583740591445796, + "grad_norm": 0.36733704805374146, + "learning_rate": 3.5498128702743664e-05, + "loss": 0.1639, + "step": 23875 + }, + { + "epoch": 0.42585524203617164, + "grad_norm": 0.26840436458587646, + "learning_rate": 3.549671606032986e-05, + "loss": 0.0889, + "step": 23876 + }, + { + "epoch": 0.42587307815788533, + "grad_norm": 0.3518798351287842, + "learning_rate": 3.5495303377227153e-05, + "loss": 0.229, + "step": 23877 + }, + { + "epoch": 0.425890914279599, + "grad_norm": 0.2991696298122406, + "learning_rate": 3.549389065344103e-05, + "loss": 0.1593, + "step": 23878 + }, + { + "epoch": 0.42590875040131276, + "grad_norm": 0.2914685606956482, + "learning_rate": 3.549247788897695e-05, + "loss": 0.1763, + "step": 23879 + }, + { + "epoch": 0.42592658652302645, + "grad_norm": 0.26813337206840515, + "learning_rate": 3.549106508384041e-05, + "loss": 0.1285, + "step": 23880 + }, + { + "epoch": 0.42594442264474014, + "grad_norm": 0.2833791971206665, + "learning_rate": 3.548965223803688e-05, + "loss": 0.1261, + "step": 23881 + }, + { + "epoch": 0.4259622587664538, + "grad_norm": 0.28477099537849426, + "learning_rate": 3.5488239351571836e-05, + "loss": 0.1153, + "step": 23882 + }, + { + "epoch": 0.4259800948881675, + "grad_norm": 0.28000229597091675, + "learning_rate": 3.5486826424450756e-05, + "loss": 0.1662, + "step": 23883 + }, + { + "epoch": 0.4259979310098812, + "grad_norm": 0.26784002780914307, + "learning_rate": 3.548541345667911e-05, + "loss": 0.136, + "step": 23884 + }, + { + "epoch": 0.4260157671315949, + "grad_norm": 0.24080026149749756, + "learning_rate": 3.548400044826238e-05, + "loss": 0.1299, + "step": 23885 + }, + { + "epoch": 0.4260336032533086, + "grad_norm": 0.40504252910614014, + "learning_rate": 3.5482587399206034e-05, + "loss": 0.1567, + "step": 23886 + }, + { + "epoch": 0.4260514393750223, + "grad_norm": 0.27305126190185547, + "learning_rate": 3.5481174309515574e-05, + "loss": 0.1484, + "step": 23887 + }, + { + "epoch": 0.426069275496736, + "grad_norm": 0.27621057629585266, + "learning_rate": 3.547976117919646e-05, + "loss": 0.1138, + "step": 23888 + }, + { + "epoch": 0.4260871116184497, + "grad_norm": 0.2447374016046524, + "learning_rate": 3.547834800825417e-05, + "loss": 0.1557, + "step": 23889 + }, + { + "epoch": 0.4261049477401634, + "grad_norm": 0.20814082026481628, + "learning_rate": 3.547693479669418e-05, + "loss": 0.1216, + "step": 23890 + }, + { + "epoch": 0.4261227838618771, + "grad_norm": 0.27645888924598694, + "learning_rate": 3.5475521544521974e-05, + "loss": 0.1565, + "step": 23891 + }, + { + "epoch": 0.42614061998359076, + "grad_norm": 0.3014417290687561, + "learning_rate": 3.547410825174302e-05, + "loss": 0.1959, + "step": 23892 + }, + { + "epoch": 0.42615845610530445, + "grad_norm": 0.22696557641029358, + "learning_rate": 3.547269491836282e-05, + "loss": 0.1418, + "step": 23893 + }, + { + "epoch": 0.42617629222701814, + "grad_norm": 0.39561939239501953, + "learning_rate": 3.547128154438683e-05, + "loss": 0.155, + "step": 23894 + }, + { + "epoch": 0.4261941283487318, + "grad_norm": 0.25103700160980225, + "learning_rate": 3.5469868129820535e-05, + "loss": 0.1269, + "step": 23895 + }, + { + "epoch": 0.42621196447044557, + "grad_norm": 0.3278326988220215, + "learning_rate": 3.546845467466942e-05, + "loss": 0.1266, + "step": 23896 + }, + { + "epoch": 0.42622980059215926, + "grad_norm": 0.33121931552886963, + "learning_rate": 3.546704117893896e-05, + "loss": 0.174, + "step": 23897 + }, + { + "epoch": 0.42624763671387295, + "grad_norm": 0.2715900242328644, + "learning_rate": 3.546562764263462e-05, + "loss": 0.178, + "step": 23898 + }, + { + "epoch": 0.42626547283558663, + "grad_norm": 0.23536370694637299, + "learning_rate": 3.54642140657619e-05, + "loss": 0.158, + "step": 23899 + }, + { + "epoch": 0.4262833089573003, + "grad_norm": 0.36287975311279297, + "learning_rate": 3.546280044832628e-05, + "loss": 0.2215, + "step": 23900 + }, + { + "epoch": 0.426301145079014, + "grad_norm": 0.3486074209213257, + "learning_rate": 3.5461386790333227e-05, + "loss": 0.1994, + "step": 23901 + }, + { + "epoch": 0.4263189812007277, + "grad_norm": 0.23576629161834717, + "learning_rate": 3.5459973091788226e-05, + "loss": 0.1818, + "step": 23902 + }, + { + "epoch": 0.4263368173224414, + "grad_norm": 0.2224901169538498, + "learning_rate": 3.5458559352696754e-05, + "loss": 0.1352, + "step": 23903 + }, + { + "epoch": 0.42635465344415513, + "grad_norm": 0.2519136369228363, + "learning_rate": 3.54571455730643e-05, + "loss": 0.1902, + "step": 23904 + }, + { + "epoch": 0.4263724895658688, + "grad_norm": 0.16021160781383514, + "learning_rate": 3.5455731752896326e-05, + "loss": 0.121, + "step": 23905 + }, + { + "epoch": 0.4263903256875825, + "grad_norm": 0.2509852647781372, + "learning_rate": 3.545431789219833e-05, + "loss": 0.1311, + "step": 23906 + }, + { + "epoch": 0.4264081618092962, + "grad_norm": 0.29924800992012024, + "learning_rate": 3.5452903990975784e-05, + "loss": 0.1853, + "step": 23907 + }, + { + "epoch": 0.4264259979310099, + "grad_norm": 0.27524664998054504, + "learning_rate": 3.545149004923418e-05, + "loss": 0.1587, + "step": 23908 + }, + { + "epoch": 0.42644383405272357, + "grad_norm": 0.29763686656951904, + "learning_rate": 3.5450076066978984e-05, + "loss": 0.2084, + "step": 23909 + }, + { + "epoch": 0.42646167017443726, + "grad_norm": 0.3354107141494751, + "learning_rate": 3.544866204421568e-05, + "loss": 0.1435, + "step": 23910 + }, + { + "epoch": 0.42647950629615095, + "grad_norm": 0.2531072497367859, + "learning_rate": 3.5447247980949775e-05, + "loss": 0.1308, + "step": 23911 + }, + { + "epoch": 0.42649734241786463, + "grad_norm": 0.25510644912719727, + "learning_rate": 3.5445833877186706e-05, + "loss": 0.0815, + "step": 23912 + }, + { + "epoch": 0.4265151785395784, + "grad_norm": 0.22560018301010132, + "learning_rate": 3.5444419732931986e-05, + "loss": 0.1315, + "step": 23913 + }, + { + "epoch": 0.42653301466129206, + "grad_norm": 0.25691989064216614, + "learning_rate": 3.5443005548191077e-05, + "loss": 0.1649, + "step": 23914 + }, + { + "epoch": 0.42655085078300575, + "grad_norm": 0.3879161477088928, + "learning_rate": 3.544159132296949e-05, + "loss": 0.1394, + "step": 23915 + }, + { + "epoch": 0.42656868690471944, + "grad_norm": 0.2623904347419739, + "learning_rate": 3.544017705727267e-05, + "loss": 0.1504, + "step": 23916 + }, + { + "epoch": 0.42658652302643313, + "grad_norm": 0.25704699754714966, + "learning_rate": 3.5438762751106134e-05, + "loss": 0.1804, + "step": 23917 + }, + { + "epoch": 0.4266043591481468, + "grad_norm": 0.28251948952674866, + "learning_rate": 3.5437348404475334e-05, + "loss": 0.1167, + "step": 23918 + }, + { + "epoch": 0.4266221952698605, + "grad_norm": 0.2453586906194687, + "learning_rate": 3.5435934017385775e-05, + "loss": 0.1734, + "step": 23919 + }, + { + "epoch": 0.4266400313915742, + "grad_norm": 0.2514716386795044, + "learning_rate": 3.543451958984293e-05, + "loss": 0.1921, + "step": 23920 + }, + { + "epoch": 0.42665786751328794, + "grad_norm": 0.2496187537908554, + "learning_rate": 3.543310512185228e-05, + "loss": 0.162, + "step": 23921 + }, + { + "epoch": 0.4266757036350016, + "grad_norm": 0.2603379786014557, + "learning_rate": 3.5431690613419317e-05, + "loss": 0.1137, + "step": 23922 + }, + { + "epoch": 0.4266935397567153, + "grad_norm": 0.22919818758964539, + "learning_rate": 3.5430276064549514e-05, + "loss": 0.136, + "step": 23923 + }, + { + "epoch": 0.426711375878429, + "grad_norm": 0.3271125555038452, + "learning_rate": 3.542886147524836e-05, + "loss": 0.1512, + "step": 23924 + }, + { + "epoch": 0.4267292120001427, + "grad_norm": 0.1695825755596161, + "learning_rate": 3.542744684552134e-05, + "loss": 0.1263, + "step": 23925 + }, + { + "epoch": 0.4267470481218564, + "grad_norm": 0.23370809853076935, + "learning_rate": 3.5426032175373927e-05, + "loss": 0.1441, + "step": 23926 + }, + { + "epoch": 0.42676488424357006, + "grad_norm": 0.2657979130744934, + "learning_rate": 3.542461746481161e-05, + "loss": 0.1568, + "step": 23927 + }, + { + "epoch": 0.42678272036528375, + "grad_norm": 0.2109915167093277, + "learning_rate": 3.5423202713839885e-05, + "loss": 0.1372, + "step": 23928 + }, + { + "epoch": 0.42680055648699744, + "grad_norm": 0.2852080166339874, + "learning_rate": 3.5421787922464224e-05, + "loss": 0.1678, + "step": 23929 + }, + { + "epoch": 0.4268183926087112, + "grad_norm": 0.2549762725830078, + "learning_rate": 3.542037309069012e-05, + "loss": 0.1861, + "step": 23930 + }, + { + "epoch": 0.42683622873042487, + "grad_norm": 0.2283683717250824, + "learning_rate": 3.5418958218523034e-05, + "loss": 0.1529, + "step": 23931 + }, + { + "epoch": 0.42685406485213856, + "grad_norm": 0.22913269698619843, + "learning_rate": 3.541754330596848e-05, + "loss": 0.1258, + "step": 23932 + }, + { + "epoch": 0.42687190097385225, + "grad_norm": 0.29181382060050964, + "learning_rate": 3.5416128353031926e-05, + "loss": 0.1635, + "step": 23933 + }, + { + "epoch": 0.42688973709556594, + "grad_norm": 0.21410875022411346, + "learning_rate": 3.541471335971886e-05, + "loss": 0.1247, + "step": 23934 + }, + { + "epoch": 0.4269075732172796, + "grad_norm": 0.34712743759155273, + "learning_rate": 3.541329832603477e-05, + "loss": 0.1638, + "step": 23935 + }, + { + "epoch": 0.4269254093389933, + "grad_norm": 0.21683192253112793, + "learning_rate": 3.541188325198513e-05, + "loss": 0.121, + "step": 23936 + }, + { + "epoch": 0.426943245460707, + "grad_norm": 0.6718734502792358, + "learning_rate": 3.5410468137575445e-05, + "loss": 0.2966, + "step": 23937 + }, + { + "epoch": 0.42696108158242074, + "grad_norm": 0.37805864214897156, + "learning_rate": 3.540905298281119e-05, + "loss": 0.1739, + "step": 23938 + }, + { + "epoch": 0.42697891770413443, + "grad_norm": 0.28198936581611633, + "learning_rate": 3.540763778769785e-05, + "loss": 0.1785, + "step": 23939 + }, + { + "epoch": 0.4269967538258481, + "grad_norm": 0.24662794172763824, + "learning_rate": 3.540622255224091e-05, + "loss": 0.1213, + "step": 23940 + }, + { + "epoch": 0.4270145899475618, + "grad_norm": 0.3209708034992218, + "learning_rate": 3.540480727644585e-05, + "loss": 0.195, + "step": 23941 + }, + { + "epoch": 0.4270324260692755, + "grad_norm": 0.34436291456222534, + "learning_rate": 3.5403391960318165e-05, + "loss": 0.1701, + "step": 23942 + }, + { + "epoch": 0.4270502621909892, + "grad_norm": 0.2335614413022995, + "learning_rate": 3.540197660386335e-05, + "loss": 0.1497, + "step": 23943 + }, + { + "epoch": 0.42706809831270287, + "grad_norm": 0.2021789401769638, + "learning_rate": 3.540056120708687e-05, + "loss": 0.113, + "step": 23944 + }, + { + "epoch": 0.42708593443441656, + "grad_norm": 0.3811666667461395, + "learning_rate": 3.539914576999424e-05, + "loss": 0.1941, + "step": 23945 + }, + { + "epoch": 0.4271037705561303, + "grad_norm": 0.24851712584495544, + "learning_rate": 3.5397730292590906e-05, + "loss": 0.1857, + "step": 23946 + }, + { + "epoch": 0.427121606677844, + "grad_norm": 0.21563822031021118, + "learning_rate": 3.5396314774882386e-05, + "loss": 0.1654, + "step": 23947 + }, + { + "epoch": 0.4271394427995577, + "grad_norm": 0.2698385715484619, + "learning_rate": 3.5394899216874166e-05, + "loss": 0.191, + "step": 23948 + }, + { + "epoch": 0.42715727892127137, + "grad_norm": 0.25924378633499146, + "learning_rate": 3.539348361857172e-05, + "loss": 0.1626, + "step": 23949 + }, + { + "epoch": 0.42717511504298505, + "grad_norm": 0.2653866410255432, + "learning_rate": 3.539206797998055e-05, + "loss": 0.1656, + "step": 23950 + }, + { + "epoch": 0.42719295116469874, + "grad_norm": 0.2630806565284729, + "learning_rate": 3.5390652301106134e-05, + "loss": 0.1417, + "step": 23951 + }, + { + "epoch": 0.42721078728641243, + "grad_norm": 0.3655237555503845, + "learning_rate": 3.5389236581953954e-05, + "loss": 0.1564, + "step": 23952 + }, + { + "epoch": 0.4272286234081261, + "grad_norm": 0.3011694550514221, + "learning_rate": 3.5387820822529505e-05, + "loss": 0.1807, + "step": 23953 + }, + { + "epoch": 0.4272464595298398, + "grad_norm": 0.2681520879268646, + "learning_rate": 3.5386405022838276e-05, + "loss": 0.1604, + "step": 23954 + }, + { + "epoch": 0.42726429565155355, + "grad_norm": 0.2561852037906647, + "learning_rate": 3.538498918288575e-05, + "loss": 0.1412, + "step": 23955 + }, + { + "epoch": 0.42728213177326724, + "grad_norm": 0.2399996519088745, + "learning_rate": 3.538357330267742e-05, + "loss": 0.1828, + "step": 23956 + }, + { + "epoch": 0.4272999678949809, + "grad_norm": 0.270550400018692, + "learning_rate": 3.5382157382218776e-05, + "loss": 0.1548, + "step": 23957 + }, + { + "epoch": 0.4273178040166946, + "grad_norm": 0.22000697255134583, + "learning_rate": 3.538074142151531e-05, + "loss": 0.1536, + "step": 23958 + }, + { + "epoch": 0.4273356401384083, + "grad_norm": 0.27395185828208923, + "learning_rate": 3.53793254205725e-05, + "loss": 0.1144, + "step": 23959 + }, + { + "epoch": 0.427353476260122, + "grad_norm": 0.2699791193008423, + "learning_rate": 3.537790937939584e-05, + "loss": 0.1941, + "step": 23960 + }, + { + "epoch": 0.4273713123818357, + "grad_norm": 0.3365248143672943, + "learning_rate": 3.537649329799082e-05, + "loss": 0.0994, + "step": 23961 + }, + { + "epoch": 0.42738914850354937, + "grad_norm": 0.3106212317943573, + "learning_rate": 3.537507717636292e-05, + "loss": 0.1784, + "step": 23962 + }, + { + "epoch": 0.4274069846252631, + "grad_norm": 0.260199636220932, + "learning_rate": 3.537366101451765e-05, + "loss": 0.0817, + "step": 23963 + }, + { + "epoch": 0.4274248207469768, + "grad_norm": 0.26688748598098755, + "learning_rate": 3.537224481246048e-05, + "loss": 0.1499, + "step": 23964 + }, + { + "epoch": 0.4274426568686905, + "grad_norm": 0.26480117440223694, + "learning_rate": 3.5370828570196905e-05, + "loss": 0.1654, + "step": 23965 + }, + { + "epoch": 0.4274604929904042, + "grad_norm": 0.2640722095966339, + "learning_rate": 3.5369412287732417e-05, + "loss": 0.149, + "step": 23966 + }, + { + "epoch": 0.42747832911211786, + "grad_norm": 0.39676433801651, + "learning_rate": 3.5367995965072515e-05, + "loss": 0.0998, + "step": 23967 + }, + { + "epoch": 0.42749616523383155, + "grad_norm": 0.19692380726337433, + "learning_rate": 3.536657960222267e-05, + "loss": 0.151, + "step": 23968 + }, + { + "epoch": 0.42751400135554524, + "grad_norm": 0.21552708745002747, + "learning_rate": 3.536516319918838e-05, + "loss": 0.1726, + "step": 23969 + }, + { + "epoch": 0.4275318374772589, + "grad_norm": 0.2576480507850647, + "learning_rate": 3.5363746755975144e-05, + "loss": 0.1567, + "step": 23970 + }, + { + "epoch": 0.4275496735989726, + "grad_norm": 0.277039498090744, + "learning_rate": 3.536233027258844e-05, + "loss": 0.1687, + "step": 23971 + }, + { + "epoch": 0.42756750972068636, + "grad_norm": 0.23763813078403473, + "learning_rate": 3.536091374903377e-05, + "loss": 0.1668, + "step": 23972 + }, + { + "epoch": 0.42758534584240004, + "grad_norm": 0.18927772343158722, + "learning_rate": 3.535949718531662e-05, + "loss": 0.1324, + "step": 23973 + }, + { + "epoch": 0.42760318196411373, + "grad_norm": 0.27033889293670654, + "learning_rate": 3.5358080581442475e-05, + "loss": 0.1378, + "step": 23974 + }, + { + "epoch": 0.4276210180858274, + "grad_norm": 0.35147562623023987, + "learning_rate": 3.5356663937416837e-05, + "loss": 0.2035, + "step": 23975 + }, + { + "epoch": 0.4276388542075411, + "grad_norm": 0.19672538340091705, + "learning_rate": 3.535524725324519e-05, + "loss": 0.1554, + "step": 23976 + }, + { + "epoch": 0.4276566903292548, + "grad_norm": 0.16885758936405182, + "learning_rate": 3.5353830528933026e-05, + "loss": 0.1282, + "step": 23977 + }, + { + "epoch": 0.4276745264509685, + "grad_norm": 0.258370041847229, + "learning_rate": 3.5352413764485845e-05, + "loss": 0.1606, + "step": 23978 + }, + { + "epoch": 0.42769236257268217, + "grad_norm": 0.21713489294052124, + "learning_rate": 3.535099695990913e-05, + "loss": 0.1149, + "step": 23979 + }, + { + "epoch": 0.4277101986943959, + "grad_norm": 0.5765313506126404, + "learning_rate": 3.534958011520838e-05, + "loss": 0.1654, + "step": 23980 + }, + { + "epoch": 0.4277280348161096, + "grad_norm": 0.25585129857063293, + "learning_rate": 3.534816323038907e-05, + "loss": 0.1636, + "step": 23981 + }, + { + "epoch": 0.4277458709378233, + "grad_norm": 0.281139075756073, + "learning_rate": 3.534674630545672e-05, + "loss": 0.1273, + "step": 23982 + }, + { + "epoch": 0.427763707059537, + "grad_norm": 0.32164880633354187, + "learning_rate": 3.5345329340416796e-05, + "loss": 0.1298, + "step": 23983 + }, + { + "epoch": 0.42778154318125067, + "grad_norm": 0.26561206579208374, + "learning_rate": 3.5343912335274816e-05, + "loss": 0.1763, + "step": 23984 + }, + { + "epoch": 0.42779937930296436, + "grad_norm": 0.2745407521724701, + "learning_rate": 3.534249529003625e-05, + "loss": 0.1631, + "step": 23985 + }, + { + "epoch": 0.42781721542467804, + "grad_norm": 0.24976280331611633, + "learning_rate": 3.53410782047066e-05, + "loss": 0.1884, + "step": 23986 + }, + { + "epoch": 0.42783505154639173, + "grad_norm": 0.27511921525001526, + "learning_rate": 3.533966107929136e-05, + "loss": 0.1445, + "step": 23987 + }, + { + "epoch": 0.4278528876681055, + "grad_norm": 0.3072541654109955, + "learning_rate": 3.533824391379602e-05, + "loss": 0.1458, + "step": 23988 + }, + { + "epoch": 0.42787072378981916, + "grad_norm": 0.3907562792301178, + "learning_rate": 3.533682670822608e-05, + "loss": 0.2003, + "step": 23989 + }, + { + "epoch": 0.42788855991153285, + "grad_norm": 0.30391618609428406, + "learning_rate": 3.5335409462587026e-05, + "loss": 0.1211, + "step": 23990 + }, + { + "epoch": 0.42790639603324654, + "grad_norm": 0.24945279955863953, + "learning_rate": 3.5333992176884354e-05, + "loss": 0.1486, + "step": 23991 + }, + { + "epoch": 0.4279242321549602, + "grad_norm": 0.2895711362361908, + "learning_rate": 3.533257485112357e-05, + "loss": 0.1407, + "step": 23992 + }, + { + "epoch": 0.4279420682766739, + "grad_norm": 0.2654116749763489, + "learning_rate": 3.533115748531015e-05, + "loss": 0.1268, + "step": 23993 + }, + { + "epoch": 0.4279599043983876, + "grad_norm": 0.23434217274188995, + "learning_rate": 3.5329740079449594e-05, + "loss": 0.1472, + "step": 23994 + }, + { + "epoch": 0.4279777405201013, + "grad_norm": 0.26843225955963135, + "learning_rate": 3.532832263354739e-05, + "loss": 0.1597, + "step": 23995 + }, + { + "epoch": 0.427995576641815, + "grad_norm": 0.28946638107299805, + "learning_rate": 3.5326905147609046e-05, + "loss": 0.1113, + "step": 23996 + }, + { + "epoch": 0.4280134127635287, + "grad_norm": 0.3010145425796509, + "learning_rate": 3.532548762164006e-05, + "loss": 0.1529, + "step": 23997 + }, + { + "epoch": 0.4280312488852424, + "grad_norm": 0.325039803981781, + "learning_rate": 3.5324070055645905e-05, + "loss": 0.1555, + "step": 23998 + }, + { + "epoch": 0.4280490850069561, + "grad_norm": 0.300077348947525, + "learning_rate": 3.53226524496321e-05, + "loss": 0.1509, + "step": 23999 + }, + { + "epoch": 0.4280669211286698, + "grad_norm": 0.2601543664932251, + "learning_rate": 3.532123480360412e-05, + "loss": 0.1475, + "step": 24000 + }, + { + "epoch": 0.4280669211286698, + "eval_loss": 0.14888684451580048, + "eval_runtime": 106.9842, + "eval_samples_per_second": 9.572, + "eval_steps_per_second": 1.598, + "step": 24000 + }, + { + "epoch": 0.4280847572503835, + "grad_norm": 0.24454689025878906, + "learning_rate": 3.5319817117567475e-05, + "loss": 0.1194, + "step": 24001 + }, + { + "epoch": 0.42810259337209716, + "grad_norm": 0.3253958821296692, + "learning_rate": 3.531839939152765e-05, + "loss": 0.1408, + "step": 24002 + }, + { + "epoch": 0.42812042949381085, + "grad_norm": 0.289745569229126, + "learning_rate": 3.531698162549015e-05, + "loss": 0.2044, + "step": 24003 + }, + { + "epoch": 0.42813826561552454, + "grad_norm": 0.3855065703392029, + "learning_rate": 3.531556381946046e-05, + "loss": 0.147, + "step": 24004 + }, + { + "epoch": 0.4281561017372383, + "grad_norm": 0.25362133979797363, + "learning_rate": 3.531414597344409e-05, + "loss": 0.165, + "step": 24005 + }, + { + "epoch": 0.42817393785895197, + "grad_norm": 0.3345668315887451, + "learning_rate": 3.5312728087446524e-05, + "loss": 0.1857, + "step": 24006 + }, + { + "epoch": 0.42819177398066566, + "grad_norm": 0.44475558400154114, + "learning_rate": 3.5311310161473254e-05, + "loss": 0.2183, + "step": 24007 + }, + { + "epoch": 0.42820961010237935, + "grad_norm": 0.2567703425884247, + "learning_rate": 3.5309892195529794e-05, + "loss": 0.1221, + "step": 24008 + }, + { + "epoch": 0.42822744622409303, + "grad_norm": 0.21394670009613037, + "learning_rate": 3.5308474189621625e-05, + "loss": 0.1506, + "step": 24009 + }, + { + "epoch": 0.4282452823458067, + "grad_norm": 0.2452140897512436, + "learning_rate": 3.530705614375425e-05, + "loss": 0.1426, + "step": 24010 + }, + { + "epoch": 0.4282631184675204, + "grad_norm": 0.29380446672439575, + "learning_rate": 3.5305638057933164e-05, + "loss": 0.1625, + "step": 24011 + }, + { + "epoch": 0.4282809545892341, + "grad_norm": 0.17671631276607513, + "learning_rate": 3.530421993216387e-05, + "loss": 0.1598, + "step": 24012 + }, + { + "epoch": 0.4282987907109478, + "grad_norm": 0.2578020989894867, + "learning_rate": 3.530280176645186e-05, + "loss": 0.155, + "step": 24013 + }, + { + "epoch": 0.42831662683266153, + "grad_norm": 0.2121681421995163, + "learning_rate": 3.530138356080264e-05, + "loss": 0.1123, + "step": 24014 + }, + { + "epoch": 0.4283344629543752, + "grad_norm": 0.3155827224254608, + "learning_rate": 3.5299965315221694e-05, + "loss": 0.2084, + "step": 24015 + }, + { + "epoch": 0.4283522990760889, + "grad_norm": 0.23339667916297913, + "learning_rate": 3.5298547029714515e-05, + "loss": 0.1449, + "step": 24016 + }, + { + "epoch": 0.4283701351978026, + "grad_norm": 0.3760417401790619, + "learning_rate": 3.529712870428662e-05, + "loss": 0.1686, + "step": 24017 + }, + { + "epoch": 0.4283879713195163, + "grad_norm": 0.27320724725723267, + "learning_rate": 3.5295710338943495e-05, + "loss": 0.1347, + "step": 24018 + }, + { + "epoch": 0.42840580744122997, + "grad_norm": 0.2863955795764923, + "learning_rate": 3.5294291933690646e-05, + "loss": 0.111, + "step": 24019 + }, + { + "epoch": 0.42842364356294366, + "grad_norm": 0.2796202600002289, + "learning_rate": 3.529287348853355e-05, + "loss": 0.193, + "step": 24020 + }, + { + "epoch": 0.42844147968465734, + "grad_norm": 0.22371521592140198, + "learning_rate": 3.5291455003477744e-05, + "loss": 0.163, + "step": 24021 + }, + { + "epoch": 0.4284593158063711, + "grad_norm": 0.3729160726070404, + "learning_rate": 3.529003647852869e-05, + "loss": 0.0918, + "step": 24022 + }, + { + "epoch": 0.4284771519280848, + "grad_norm": 0.1812201291322708, + "learning_rate": 3.52886179136919e-05, + "loss": 0.1463, + "step": 24023 + }, + { + "epoch": 0.42849498804979846, + "grad_norm": 0.2121412456035614, + "learning_rate": 3.528719930897287e-05, + "loss": 0.1115, + "step": 24024 + }, + { + "epoch": 0.42851282417151215, + "grad_norm": 0.2475673407316208, + "learning_rate": 3.528578066437711e-05, + "loss": 0.1651, + "step": 24025 + }, + { + "epoch": 0.42853066029322584, + "grad_norm": 0.2292526662349701, + "learning_rate": 3.5284361979910106e-05, + "loss": 0.1542, + "step": 24026 + }, + { + "epoch": 0.42854849641493953, + "grad_norm": 0.1916513293981552, + "learning_rate": 3.528294325557737e-05, + "loss": 0.1536, + "step": 24027 + }, + { + "epoch": 0.4285663325366532, + "grad_norm": 0.2592538595199585, + "learning_rate": 3.52815244913844e-05, + "loss": 0.145, + "step": 24028 + }, + { + "epoch": 0.4285841686583669, + "grad_norm": 0.2909541428089142, + "learning_rate": 3.528010568733668e-05, + "loss": 0.1873, + "step": 24029 + }, + { + "epoch": 0.4286020047800806, + "grad_norm": 0.27688759565353394, + "learning_rate": 3.527868684343972e-05, + "loss": 0.1267, + "step": 24030 + }, + { + "epoch": 0.42861984090179434, + "grad_norm": 0.21342121064662933, + "learning_rate": 3.5277267959699014e-05, + "loss": 0.1118, + "step": 24031 + }, + { + "epoch": 0.428637677023508, + "grad_norm": 0.24847741425037384, + "learning_rate": 3.527584903612008e-05, + "loss": 0.194, + "step": 24032 + }, + { + "epoch": 0.4286555131452217, + "grad_norm": 0.23573623597621918, + "learning_rate": 3.52744300727084e-05, + "loss": 0.1229, + "step": 24033 + }, + { + "epoch": 0.4286733492669354, + "grad_norm": 0.21822835505008698, + "learning_rate": 3.527301106946948e-05, + "loss": 0.1757, + "step": 24034 + }, + { + "epoch": 0.4286911853886491, + "grad_norm": 0.2192905992269516, + "learning_rate": 3.5271592026408815e-05, + "loss": 0.1275, + "step": 24035 + }, + { + "epoch": 0.4287090215103628, + "grad_norm": 0.3949853181838989, + "learning_rate": 3.5270172943531924e-05, + "loss": 0.2046, + "step": 24036 + }, + { + "epoch": 0.42872685763207646, + "grad_norm": 0.21146616339683533, + "learning_rate": 3.526875382084429e-05, + "loss": 0.1114, + "step": 24037 + }, + { + "epoch": 0.42874469375379015, + "grad_norm": 0.22338072955608368, + "learning_rate": 3.526733465835141e-05, + "loss": 0.1303, + "step": 24038 + }, + { + "epoch": 0.4287625298755039, + "grad_norm": 0.30598244071006775, + "learning_rate": 3.526591545605881e-05, + "loss": 0.1044, + "step": 24039 + }, + { + "epoch": 0.4287803659972176, + "grad_norm": 0.2424745261669159, + "learning_rate": 3.526449621397197e-05, + "loss": 0.1644, + "step": 24040 + }, + { + "epoch": 0.42879820211893127, + "grad_norm": 0.2184273898601532, + "learning_rate": 3.52630769320964e-05, + "loss": 0.1386, + "step": 24041 + }, + { + "epoch": 0.42881603824064496, + "grad_norm": 0.21332210302352905, + "learning_rate": 3.5261657610437594e-05, + "loss": 0.164, + "step": 24042 + }, + { + "epoch": 0.42883387436235865, + "grad_norm": 0.32384905219078064, + "learning_rate": 3.5260238249001065e-05, + "loss": 0.1984, + "step": 24043 + }, + { + "epoch": 0.42885171048407233, + "grad_norm": 0.30819737911224365, + "learning_rate": 3.52588188477923e-05, + "loss": 0.1774, + "step": 24044 + }, + { + "epoch": 0.428869546605786, + "grad_norm": 0.24642221629619598, + "learning_rate": 3.5257399406816815e-05, + "loss": 0.1758, + "step": 24045 + }, + { + "epoch": 0.4288873827274997, + "grad_norm": 0.2304585725069046, + "learning_rate": 3.525597992608011e-05, + "loss": 0.1473, + "step": 24046 + }, + { + "epoch": 0.42890521884921345, + "grad_norm": 0.20225901901721954, + "learning_rate": 3.5254560405587676e-05, + "loss": 0.1626, + "step": 24047 + }, + { + "epoch": 0.42892305497092714, + "grad_norm": 0.3168150782585144, + "learning_rate": 3.5253140845345026e-05, + "loss": 0.1789, + "step": 24048 + }, + { + "epoch": 0.42894089109264083, + "grad_norm": 0.2006775587797165, + "learning_rate": 3.525172124535767e-05, + "loss": 0.1479, + "step": 24049 + }, + { + "epoch": 0.4289587272143545, + "grad_norm": 0.2692861557006836, + "learning_rate": 3.525030160563109e-05, + "loss": 0.1794, + "step": 24050 + }, + { + "epoch": 0.4289765633360682, + "grad_norm": 0.3805452287197113, + "learning_rate": 3.5248881926170804e-05, + "loss": 0.1632, + "step": 24051 + }, + { + "epoch": 0.4289943994577819, + "grad_norm": 0.26520583033561707, + "learning_rate": 3.524746220698232e-05, + "loss": 0.1735, + "step": 24052 + }, + { + "epoch": 0.4290122355794956, + "grad_norm": 0.20892249047756195, + "learning_rate": 3.524604244807113e-05, + "loss": 0.145, + "step": 24053 + }, + { + "epoch": 0.42903007170120927, + "grad_norm": 0.26491081714630127, + "learning_rate": 3.524462264944274e-05, + "loss": 0.1476, + "step": 24054 + }, + { + "epoch": 0.42904790782292296, + "grad_norm": 0.2512350082397461, + "learning_rate": 3.524320281110265e-05, + "loss": 0.1618, + "step": 24055 + }, + { + "epoch": 0.4290657439446367, + "grad_norm": 0.26457685232162476, + "learning_rate": 3.524178293305637e-05, + "loss": 0.1167, + "step": 24056 + }, + { + "epoch": 0.4290835800663504, + "grad_norm": 0.27770259976387024, + "learning_rate": 3.52403630153094e-05, + "loss": 0.131, + "step": 24057 + }, + { + "epoch": 0.4291014161880641, + "grad_norm": 0.22504554688930511, + "learning_rate": 3.523894305786725e-05, + "loss": 0.1385, + "step": 24058 + }, + { + "epoch": 0.42911925230977777, + "grad_norm": 0.3124557137489319, + "learning_rate": 3.523752306073541e-05, + "loss": 0.2122, + "step": 24059 + }, + { + "epoch": 0.42913708843149145, + "grad_norm": 0.30401766300201416, + "learning_rate": 3.523610302391941e-05, + "loss": 0.1464, + "step": 24060 + }, + { + "epoch": 0.42915492455320514, + "grad_norm": 0.3035971522331238, + "learning_rate": 3.523468294742473e-05, + "loss": 0.1818, + "step": 24061 + }, + { + "epoch": 0.42917276067491883, + "grad_norm": 0.3898014426231384, + "learning_rate": 3.523326283125689e-05, + "loss": 0.14, + "step": 24062 + }, + { + "epoch": 0.4291905967966325, + "grad_norm": 0.2723587453365326, + "learning_rate": 3.523184267542138e-05, + "loss": 0.1826, + "step": 24063 + }, + { + "epoch": 0.42920843291834626, + "grad_norm": 0.24735338985919952, + "learning_rate": 3.5230422479923726e-05, + "loss": 0.1745, + "step": 24064 + }, + { + "epoch": 0.42922626904005995, + "grad_norm": 0.22546587884426117, + "learning_rate": 3.522900224476941e-05, + "loss": 0.0991, + "step": 24065 + }, + { + "epoch": 0.42924410516177364, + "grad_norm": 0.3785165250301361, + "learning_rate": 3.522758196996395e-05, + "loss": 0.1395, + "step": 24066 + }, + { + "epoch": 0.4292619412834873, + "grad_norm": 0.23490187525749207, + "learning_rate": 3.522616165551286e-05, + "loss": 0.131, + "step": 24067 + }, + { + "epoch": 0.429279777405201, + "grad_norm": 0.23220893740653992, + "learning_rate": 3.522474130142162e-05, + "loss": 0.1722, + "step": 24068 + }, + { + "epoch": 0.4292976135269147, + "grad_norm": 0.22375334799289703, + "learning_rate": 3.522332090769576e-05, + "loss": 0.1535, + "step": 24069 + }, + { + "epoch": 0.4293154496486284, + "grad_norm": 0.2489255964756012, + "learning_rate": 3.522190047434078e-05, + "loss": 0.162, + "step": 24070 + }, + { + "epoch": 0.4293332857703421, + "grad_norm": 0.2375880628824234, + "learning_rate": 3.5220480001362176e-05, + "loss": 0.0928, + "step": 24071 + }, + { + "epoch": 0.42935112189205576, + "grad_norm": 0.26167815923690796, + "learning_rate": 3.5219059488765465e-05, + "loss": 0.1337, + "step": 24072 + }, + { + "epoch": 0.4293689580137695, + "grad_norm": 0.23371227085590363, + "learning_rate": 3.521763893655615e-05, + "loss": 0.1342, + "step": 24073 + }, + { + "epoch": 0.4293867941354832, + "grad_norm": 0.3406495749950409, + "learning_rate": 3.521621834473973e-05, + "loss": 0.1793, + "step": 24074 + }, + { + "epoch": 0.4294046302571969, + "grad_norm": 0.28165772557258606, + "learning_rate": 3.521479771332173e-05, + "loss": 0.1951, + "step": 24075 + }, + { + "epoch": 0.4294224663789106, + "grad_norm": 0.32539570331573486, + "learning_rate": 3.521337704230764e-05, + "loss": 0.2462, + "step": 24076 + }, + { + "epoch": 0.42944030250062426, + "grad_norm": 0.31212443113327026, + "learning_rate": 3.5211956331702975e-05, + "loss": 0.1754, + "step": 24077 + }, + { + "epoch": 0.42945813862233795, + "grad_norm": 0.3066851496696472, + "learning_rate": 3.521053558151324e-05, + "loss": 0.1004, + "step": 24078 + }, + { + "epoch": 0.42947597474405164, + "grad_norm": 0.2351367473602295, + "learning_rate": 3.520911479174394e-05, + "loss": 0.1825, + "step": 24079 + }, + { + "epoch": 0.4294938108657653, + "grad_norm": 0.3087090253829956, + "learning_rate": 3.520769396240058e-05, + "loss": 0.1798, + "step": 24080 + }, + { + "epoch": 0.42951164698747907, + "grad_norm": 0.29715174436569214, + "learning_rate": 3.520627309348869e-05, + "loss": 0.1759, + "step": 24081 + }, + { + "epoch": 0.42952948310919276, + "grad_norm": 0.31474366784095764, + "learning_rate": 3.5204852185013755e-05, + "loss": 0.1765, + "step": 24082 + }, + { + "epoch": 0.42954731923090644, + "grad_norm": 0.26054781675338745, + "learning_rate": 3.520343123698128e-05, + "loss": 0.1492, + "step": 24083 + }, + { + "epoch": 0.42956515535262013, + "grad_norm": 0.33756422996520996, + "learning_rate": 3.520201024939679e-05, + "loss": 0.1337, + "step": 24084 + }, + { + "epoch": 0.4295829914743338, + "grad_norm": 0.3311472535133362, + "learning_rate": 3.5200589222265775e-05, + "loss": 0.1964, + "step": 24085 + }, + { + "epoch": 0.4296008275960475, + "grad_norm": 0.26739028096199036, + "learning_rate": 3.519916815559375e-05, + "loss": 0.1265, + "step": 24086 + }, + { + "epoch": 0.4296186637177612, + "grad_norm": 0.25582271814346313, + "learning_rate": 3.5197747049386234e-05, + "loss": 0.1404, + "step": 24087 + }, + { + "epoch": 0.4296364998394749, + "grad_norm": 0.2394641935825348, + "learning_rate": 3.519632590364873e-05, + "loss": 0.1786, + "step": 24088 + }, + { + "epoch": 0.42965433596118857, + "grad_norm": 0.2776090204715729, + "learning_rate": 3.5194904718386744e-05, + "loss": 0.1849, + "step": 24089 + }, + { + "epoch": 0.4296721720829023, + "grad_norm": 0.2702847123146057, + "learning_rate": 3.5193483493605795e-05, + "loss": 0.1532, + "step": 24090 + }, + { + "epoch": 0.429690008204616, + "grad_norm": 0.2719164490699768, + "learning_rate": 3.519206222931137e-05, + "loss": 0.1577, + "step": 24091 + }, + { + "epoch": 0.4297078443263297, + "grad_norm": 0.23871085047721863, + "learning_rate": 3.5190640925509e-05, + "loss": 0.1551, + "step": 24092 + }, + { + "epoch": 0.4297256804480434, + "grad_norm": 0.2889685034751892, + "learning_rate": 3.518921958220418e-05, + "loss": 0.1347, + "step": 24093 + }, + { + "epoch": 0.42974351656975707, + "grad_norm": 0.2794472873210907, + "learning_rate": 3.518779819940242e-05, + "loss": 0.1274, + "step": 24094 + }, + { + "epoch": 0.42976135269147075, + "grad_norm": 0.23831377923488617, + "learning_rate": 3.5186376777109246e-05, + "loss": 0.1473, + "step": 24095 + }, + { + "epoch": 0.42977918881318444, + "grad_norm": 0.29172539710998535, + "learning_rate": 3.518495531533015e-05, + "loss": 0.1713, + "step": 24096 + }, + { + "epoch": 0.42979702493489813, + "grad_norm": 0.27400290966033936, + "learning_rate": 3.5183533814070656e-05, + "loss": 0.2138, + "step": 24097 + }, + { + "epoch": 0.4298148610566119, + "grad_norm": 0.27783259749412537, + "learning_rate": 3.518211227333627e-05, + "loss": 0.1305, + "step": 24098 + }, + { + "epoch": 0.42983269717832556, + "grad_norm": 0.23592181503772736, + "learning_rate": 3.5180690693132495e-05, + "loss": 0.1516, + "step": 24099 + }, + { + "epoch": 0.42985053330003925, + "grad_norm": 0.21255047619342804, + "learning_rate": 3.517926907346484e-05, + "loss": 0.132, + "step": 24100 + }, + { + "epoch": 0.42986836942175294, + "grad_norm": 0.2559812366962433, + "learning_rate": 3.5177847414338833e-05, + "loss": 0.2054, + "step": 24101 + }, + { + "epoch": 0.4298862055434666, + "grad_norm": 0.1958625167608261, + "learning_rate": 3.517642571575996e-05, + "loss": 0.1056, + "step": 24102 + }, + { + "epoch": 0.4299040416651803, + "grad_norm": 0.24039381742477417, + "learning_rate": 3.5175003977733766e-05, + "loss": 0.0952, + "step": 24103 + }, + { + "epoch": 0.429921877786894, + "grad_norm": 0.18486778438091278, + "learning_rate": 3.517358220026573e-05, + "loss": 0.0977, + "step": 24104 + }, + { + "epoch": 0.4299397139086077, + "grad_norm": 0.38379645347595215, + "learning_rate": 3.517216038336138e-05, + "loss": 0.1363, + "step": 24105 + }, + { + "epoch": 0.42995755003032143, + "grad_norm": 0.2336011677980423, + "learning_rate": 3.517073852702622e-05, + "loss": 0.1431, + "step": 24106 + }, + { + "epoch": 0.4299753861520351, + "grad_norm": 0.3343081772327423, + "learning_rate": 3.5169316631265764e-05, + "loss": 0.1608, + "step": 24107 + }, + { + "epoch": 0.4299932222737488, + "grad_norm": 0.24900609254837036, + "learning_rate": 3.5167894696085526e-05, + "loss": 0.1585, + "step": 24108 + }, + { + "epoch": 0.4300110583954625, + "grad_norm": 0.36066752672195435, + "learning_rate": 3.5166472721491016e-05, + "loss": 0.1924, + "step": 24109 + }, + { + "epoch": 0.4300288945171762, + "grad_norm": 0.3907620906829834, + "learning_rate": 3.516505070748775e-05, + "loss": 0.1667, + "step": 24110 + }, + { + "epoch": 0.4300467306388899, + "grad_norm": 0.2419070452451706, + "learning_rate": 3.5163628654081234e-05, + "loss": 0.1482, + "step": 24111 + }, + { + "epoch": 0.43006456676060356, + "grad_norm": 0.18374262750148773, + "learning_rate": 3.516220656127698e-05, + "loss": 0.1589, + "step": 24112 + }, + { + "epoch": 0.43008240288231725, + "grad_norm": 0.3223990499973297, + "learning_rate": 3.5160784429080504e-05, + "loss": 0.1562, + "step": 24113 + }, + { + "epoch": 0.43010023900403094, + "grad_norm": 0.27670204639434814, + "learning_rate": 3.515936225749732e-05, + "loss": 0.1907, + "step": 24114 + }, + { + "epoch": 0.4301180751257447, + "grad_norm": 0.2766922414302826, + "learning_rate": 3.5157940046532934e-05, + "loss": 0.153, + "step": 24115 + }, + { + "epoch": 0.43013591124745837, + "grad_norm": 0.25355401635169983, + "learning_rate": 3.515651779619287e-05, + "loss": 0.1543, + "step": 24116 + }, + { + "epoch": 0.43015374736917206, + "grad_norm": 0.430664598941803, + "learning_rate": 3.515509550648264e-05, + "loss": 0.1494, + "step": 24117 + }, + { + "epoch": 0.43017158349088574, + "grad_norm": 0.33976903557777405, + "learning_rate": 3.5153673177407745e-05, + "loss": 0.1856, + "step": 24118 + }, + { + "epoch": 0.43018941961259943, + "grad_norm": 0.28228557109832764, + "learning_rate": 3.51522508089737e-05, + "loss": 0.1725, + "step": 24119 + }, + { + "epoch": 0.4302072557343131, + "grad_norm": 0.31134703755378723, + "learning_rate": 3.5150828401186034e-05, + "loss": 0.1549, + "step": 24120 + }, + { + "epoch": 0.4302250918560268, + "grad_norm": 0.3672589361667633, + "learning_rate": 3.5149405954050254e-05, + "loss": 0.1775, + "step": 24121 + }, + { + "epoch": 0.4302429279777405, + "grad_norm": 0.24515683948993683, + "learning_rate": 3.514798346757186e-05, + "loss": 0.1848, + "step": 24122 + }, + { + "epoch": 0.43026076409945424, + "grad_norm": 0.24136663973331451, + "learning_rate": 3.514656094175638e-05, + "loss": 0.1735, + "step": 24123 + }, + { + "epoch": 0.43027860022116793, + "grad_norm": 0.2479276806116104, + "learning_rate": 3.5145138376609335e-05, + "loss": 0.1203, + "step": 24124 + }, + { + "epoch": 0.4302964363428816, + "grad_norm": 0.23985235393047333, + "learning_rate": 3.514371577213622e-05, + "loss": 0.1578, + "step": 24125 + }, + { + "epoch": 0.4303142724645953, + "grad_norm": 0.2573501765727997, + "learning_rate": 3.5142293128342566e-05, + "loss": 0.107, + "step": 24126 + }, + { + "epoch": 0.430332108586309, + "grad_norm": 0.27935051918029785, + "learning_rate": 3.514087044523387e-05, + "loss": 0.1949, + "step": 24127 + }, + { + "epoch": 0.4303499447080227, + "grad_norm": 0.29461532831192017, + "learning_rate": 3.513944772281566e-05, + "loss": 0.1878, + "step": 24128 + }, + { + "epoch": 0.43036778082973637, + "grad_norm": 0.20467537641525269, + "learning_rate": 3.5138024961093456e-05, + "loss": 0.1501, + "step": 24129 + }, + { + "epoch": 0.43038561695145006, + "grad_norm": 0.20023281872272491, + "learning_rate": 3.5136602160072765e-05, + "loss": 0.1275, + "step": 24130 + }, + { + "epoch": 0.43040345307316374, + "grad_norm": 0.23346124589443207, + "learning_rate": 3.5135179319759096e-05, + "loss": 0.157, + "step": 24131 + }, + { + "epoch": 0.4304212891948775, + "grad_norm": 0.22761325538158417, + "learning_rate": 3.513375644015797e-05, + "loss": 0.1411, + "step": 24132 + }, + { + "epoch": 0.4304391253165912, + "grad_norm": 0.45452895760536194, + "learning_rate": 3.513233352127492e-05, + "loss": 0.1637, + "step": 24133 + }, + { + "epoch": 0.43045696143830486, + "grad_norm": 0.2729017734527588, + "learning_rate": 3.513091056311543e-05, + "loss": 0.1747, + "step": 24134 + }, + { + "epoch": 0.43047479756001855, + "grad_norm": 0.19828414916992188, + "learning_rate": 3.512948756568504e-05, + "loss": 0.1598, + "step": 24135 + }, + { + "epoch": 0.43049263368173224, + "grad_norm": 0.22937597334384918, + "learning_rate": 3.5128064528989255e-05, + "loss": 0.142, + "step": 24136 + }, + { + "epoch": 0.4305104698034459, + "grad_norm": 0.27797451615333557, + "learning_rate": 3.512664145303359e-05, + "loss": 0.1442, + "step": 24137 + }, + { + "epoch": 0.4305283059251596, + "grad_norm": 0.2587609589099884, + "learning_rate": 3.512521833782357e-05, + "loss": 0.1396, + "step": 24138 + }, + { + "epoch": 0.4305461420468733, + "grad_norm": 0.29430365562438965, + "learning_rate": 3.512379518336471e-05, + "loss": 0.1195, + "step": 24139 + }, + { + "epoch": 0.43056397816858705, + "grad_norm": 0.6127382516860962, + "learning_rate": 3.512237198966251e-05, + "loss": 0.1425, + "step": 24140 + }, + { + "epoch": 0.43058181429030074, + "grad_norm": 0.26574718952178955, + "learning_rate": 3.5120948756722514e-05, + "loss": 0.1522, + "step": 24141 + }, + { + "epoch": 0.4305996504120144, + "grad_norm": 0.4380887448787689, + "learning_rate": 3.511952548455021e-05, + "loss": 0.1494, + "step": 24142 + }, + { + "epoch": 0.4306174865337281, + "grad_norm": 0.31454208493232727, + "learning_rate": 3.511810217315114e-05, + "loss": 0.1933, + "step": 24143 + }, + { + "epoch": 0.4306353226554418, + "grad_norm": 0.20737364888191223, + "learning_rate": 3.5116678822530814e-05, + "loss": 0.13, + "step": 24144 + }, + { + "epoch": 0.4306531587771555, + "grad_norm": 0.2648198902606964, + "learning_rate": 3.5115255432694736e-05, + "loss": 0.1457, + "step": 24145 + }, + { + "epoch": 0.4306709948988692, + "grad_norm": 0.18288712203502655, + "learning_rate": 3.511383200364845e-05, + "loss": 0.1332, + "step": 24146 + }, + { + "epoch": 0.43068883102058286, + "grad_norm": 0.23093189299106598, + "learning_rate": 3.511240853539745e-05, + "loss": 0.1393, + "step": 24147 + }, + { + "epoch": 0.4307066671422966, + "grad_norm": 0.28162264823913574, + "learning_rate": 3.511098502794726e-05, + "loss": 0.142, + "step": 24148 + }, + { + "epoch": 0.4307245032640103, + "grad_norm": 0.36396291851997375, + "learning_rate": 3.51095614813034e-05, + "loss": 0.1876, + "step": 24149 + }, + { + "epoch": 0.430742339385724, + "grad_norm": 0.280042827129364, + "learning_rate": 3.5108137895471385e-05, + "loss": 0.1493, + "step": 24150 + }, + { + "epoch": 0.43076017550743767, + "grad_norm": 0.3160635828971863, + "learning_rate": 3.5106714270456745e-05, + "loss": 0.2073, + "step": 24151 + }, + { + "epoch": 0.43077801162915136, + "grad_norm": 0.24016617238521576, + "learning_rate": 3.510529060626498e-05, + "loss": 0.1015, + "step": 24152 + }, + { + "epoch": 0.43079584775086505, + "grad_norm": 0.25095823407173157, + "learning_rate": 3.510386690290163e-05, + "loss": 0.0878, + "step": 24153 + }, + { + "epoch": 0.43081368387257873, + "grad_norm": 0.33107972145080566, + "learning_rate": 3.510244316037219e-05, + "loss": 0.2136, + "step": 24154 + }, + { + "epoch": 0.4308315199942924, + "grad_norm": 0.2084999680519104, + "learning_rate": 3.510101937868219e-05, + "loss": 0.1279, + "step": 24155 + }, + { + "epoch": 0.4308493561160061, + "grad_norm": 0.31072115898132324, + "learning_rate": 3.5099595557837154e-05, + "loss": 0.1266, + "step": 24156 + }, + { + "epoch": 0.43086719223771985, + "grad_norm": 0.30046194791793823, + "learning_rate": 3.5098171697842604e-05, + "loss": 0.1735, + "step": 24157 + }, + { + "epoch": 0.43088502835943354, + "grad_norm": 0.37660181522369385, + "learning_rate": 3.509674779870405e-05, + "loss": 0.2102, + "step": 24158 + }, + { + "epoch": 0.43090286448114723, + "grad_norm": 0.26191726326942444, + "learning_rate": 3.509532386042702e-05, + "loss": 0.1417, + "step": 24159 + }, + { + "epoch": 0.4309207006028609, + "grad_norm": 0.2836020588874817, + "learning_rate": 3.509389988301702e-05, + "loss": 0.1806, + "step": 24160 + }, + { + "epoch": 0.4309385367245746, + "grad_norm": 0.2684483528137207, + "learning_rate": 3.509247586647957e-05, + "loss": 0.1514, + "step": 24161 + }, + { + "epoch": 0.4309563728462883, + "grad_norm": 0.28021323680877686, + "learning_rate": 3.509105181082021e-05, + "loss": 0.1504, + "step": 24162 + }, + { + "epoch": 0.430974208968002, + "grad_norm": 0.3087092936038971, + "learning_rate": 3.5089627716044436e-05, + "loss": 0.1399, + "step": 24163 + }, + { + "epoch": 0.43099204508971567, + "grad_norm": 0.24836251139640808, + "learning_rate": 3.50882035821578e-05, + "loss": 0.1884, + "step": 24164 + }, + { + "epoch": 0.4310098812114294, + "grad_norm": 0.24367257952690125, + "learning_rate": 3.5086779409165784e-05, + "loss": 0.1537, + "step": 24165 + }, + { + "epoch": 0.4310277173331431, + "grad_norm": 0.2773861587047577, + "learning_rate": 3.508535519707393e-05, + "loss": 0.1383, + "step": 24166 + }, + { + "epoch": 0.4310455534548568, + "grad_norm": 0.21887654066085815, + "learning_rate": 3.5083930945887764e-05, + "loss": 0.1239, + "step": 24167 + }, + { + "epoch": 0.4310633895765705, + "grad_norm": 0.1991252303123474, + "learning_rate": 3.5082506655612796e-05, + "loss": 0.1422, + "step": 24168 + }, + { + "epoch": 0.43108122569828417, + "grad_norm": 0.2893853187561035, + "learning_rate": 3.508108232625454e-05, + "loss": 0.1292, + "step": 24169 + }, + { + "epoch": 0.43109906181999785, + "grad_norm": 0.27717188000679016, + "learning_rate": 3.507965795781854e-05, + "loss": 0.1207, + "step": 24170 + }, + { + "epoch": 0.43111689794171154, + "grad_norm": 0.25788143277168274, + "learning_rate": 3.5078233550310295e-05, + "loss": 0.1285, + "step": 24171 + }, + { + "epoch": 0.43113473406342523, + "grad_norm": 0.22396482527256012, + "learning_rate": 3.507680910373534e-05, + "loss": 0.1405, + "step": 24172 + }, + { + "epoch": 0.4311525701851389, + "grad_norm": 0.28769373893737793, + "learning_rate": 3.50753846180992e-05, + "loss": 0.1434, + "step": 24173 + }, + { + "epoch": 0.43117040630685266, + "grad_norm": 0.3915373682975769, + "learning_rate": 3.507396009340738e-05, + "loss": 0.1954, + "step": 24174 + }, + { + "epoch": 0.43118824242856635, + "grad_norm": 0.2991529107093811, + "learning_rate": 3.507253552966542e-05, + "loss": 0.133, + "step": 24175 + }, + { + "epoch": 0.43120607855028004, + "grad_norm": 0.20913895964622498, + "learning_rate": 3.507111092687882e-05, + "loss": 0.1265, + "step": 24176 + }, + { + "epoch": 0.4312239146719937, + "grad_norm": 0.2475053369998932, + "learning_rate": 3.506968628505312e-05, + "loss": 0.1359, + "step": 24177 + }, + { + "epoch": 0.4312417507937074, + "grad_norm": 0.2712688148021698, + "learning_rate": 3.5068261604193844e-05, + "loss": 0.1991, + "step": 24178 + }, + { + "epoch": 0.4312595869154211, + "grad_norm": 0.32760435342788696, + "learning_rate": 3.50668368843065e-05, + "loss": 0.1482, + "step": 24179 + }, + { + "epoch": 0.4312774230371348, + "grad_norm": 0.3047506809234619, + "learning_rate": 3.5065412125396625e-05, + "loss": 0.1899, + "step": 24180 + }, + { + "epoch": 0.4312952591588485, + "grad_norm": 0.3048126995563507, + "learning_rate": 3.506398732746974e-05, + "loss": 0.1844, + "step": 24181 + }, + { + "epoch": 0.4313130952805622, + "grad_norm": 0.25498640537261963, + "learning_rate": 3.506256249053136e-05, + "loss": 0.1945, + "step": 24182 + }, + { + "epoch": 0.4313309314022759, + "grad_norm": 0.35354673862457275, + "learning_rate": 3.5061137614587005e-05, + "loss": 0.1012, + "step": 24183 + }, + { + "epoch": 0.4313487675239896, + "grad_norm": 0.25961971282958984, + "learning_rate": 3.505971269964221e-05, + "loss": 0.1693, + "step": 24184 + }, + { + "epoch": 0.4313666036457033, + "grad_norm": 0.28149473667144775, + "learning_rate": 3.50582877457025e-05, + "loss": 0.1389, + "step": 24185 + }, + { + "epoch": 0.43138443976741697, + "grad_norm": 0.2908150851726532, + "learning_rate": 3.505686275277339e-05, + "loss": 0.1537, + "step": 24186 + }, + { + "epoch": 0.43140227588913066, + "grad_norm": 0.40581485629081726, + "learning_rate": 3.505543772086041e-05, + "loss": 0.1668, + "step": 24187 + }, + { + "epoch": 0.43142011201084435, + "grad_norm": 0.19538424909114838, + "learning_rate": 3.505401264996908e-05, + "loss": 0.1355, + "step": 24188 + }, + { + "epoch": 0.43143794813255804, + "grad_norm": 0.282721608877182, + "learning_rate": 3.5052587540104916e-05, + "loss": 0.1657, + "step": 24189 + }, + { + "epoch": 0.4314557842542717, + "grad_norm": 0.20696620643138885, + "learning_rate": 3.505116239127345e-05, + "loss": 0.1284, + "step": 24190 + }, + { + "epoch": 0.43147362037598547, + "grad_norm": 0.23924341797828674, + "learning_rate": 3.504973720348021e-05, + "loss": 0.1502, + "step": 24191 + }, + { + "epoch": 0.43149145649769916, + "grad_norm": 0.23188172280788422, + "learning_rate": 3.504831197673072e-05, + "loss": 0.1191, + "step": 24192 + }, + { + "epoch": 0.43150929261941284, + "grad_norm": 0.3421745300292969, + "learning_rate": 3.5046886711030505e-05, + "loss": 0.1931, + "step": 24193 + }, + { + "epoch": 0.43152712874112653, + "grad_norm": 0.25157660245895386, + "learning_rate": 3.5045461406385085e-05, + "loss": 0.1825, + "step": 24194 + }, + { + "epoch": 0.4315449648628402, + "grad_norm": 0.38536420464515686, + "learning_rate": 3.504403606279998e-05, + "loss": 0.1672, + "step": 24195 + }, + { + "epoch": 0.4315628009845539, + "grad_norm": 0.2484949678182602, + "learning_rate": 3.504261068028073e-05, + "loss": 0.1442, + "step": 24196 + }, + { + "epoch": 0.4315806371062676, + "grad_norm": 0.21039129793643951, + "learning_rate": 3.504118525883286e-05, + "loss": 0.1373, + "step": 24197 + }, + { + "epoch": 0.4315984732279813, + "grad_norm": 0.28027409315109253, + "learning_rate": 3.5039759798461866e-05, + "loss": 0.1919, + "step": 24198 + }, + { + "epoch": 0.431616309349695, + "grad_norm": 0.30816641449928284, + "learning_rate": 3.503833429917332e-05, + "loss": 0.1835, + "step": 24199 + }, + { + "epoch": 0.4316341454714087, + "grad_norm": 0.34088900685310364, + "learning_rate": 3.50369087609727e-05, + "loss": 0.1602, + "step": 24200 + }, + { + "epoch": 0.4316519815931224, + "grad_norm": 0.21640869975090027, + "learning_rate": 3.503548318386557e-05, + "loss": 0.1039, + "step": 24201 + }, + { + "epoch": 0.4316698177148361, + "grad_norm": 0.189417764544487, + "learning_rate": 3.503405756785743e-05, + "loss": 0.1529, + "step": 24202 + }, + { + "epoch": 0.4316876538365498, + "grad_norm": 0.386018842458725, + "learning_rate": 3.503263191295383e-05, + "loss": 0.2075, + "step": 24203 + }, + { + "epoch": 0.43170548995826347, + "grad_norm": 0.2628712058067322, + "learning_rate": 3.503120621916027e-05, + "loss": 0.1434, + "step": 24204 + }, + { + "epoch": 0.43172332607997715, + "grad_norm": 0.31899526715278625, + "learning_rate": 3.50297804864823e-05, + "loss": 0.1411, + "step": 24205 + }, + { + "epoch": 0.43174116220169084, + "grad_norm": 0.2391701340675354, + "learning_rate": 3.502835471492543e-05, + "loss": 0.1539, + "step": 24206 + }, + { + "epoch": 0.4317589983234046, + "grad_norm": 0.25236669182777405, + "learning_rate": 3.502692890449521e-05, + "loss": 0.1211, + "step": 24207 + }, + { + "epoch": 0.4317768344451183, + "grad_norm": 0.3210826516151428, + "learning_rate": 3.5025503055197126e-05, + "loss": 0.1513, + "step": 24208 + }, + { + "epoch": 0.43179467056683196, + "grad_norm": 0.23876696825027466, + "learning_rate": 3.5024077167036746e-05, + "loss": 0.1968, + "step": 24209 + }, + { + "epoch": 0.43181250668854565, + "grad_norm": 0.24284358322620392, + "learning_rate": 3.502265124001958e-05, + "loss": 0.1156, + "step": 24210 + }, + { + "epoch": 0.43183034281025934, + "grad_norm": 0.3190374970436096, + "learning_rate": 3.502122527415114e-05, + "loss": 0.1759, + "step": 24211 + }, + { + "epoch": 0.431848178931973, + "grad_norm": 0.4405178725719452, + "learning_rate": 3.501979926943699e-05, + "loss": 0.2023, + "step": 24212 + }, + { + "epoch": 0.4318660150536867, + "grad_norm": 0.36915671825408936, + "learning_rate": 3.501837322588263e-05, + "loss": 0.1438, + "step": 24213 + }, + { + "epoch": 0.4318838511754004, + "grad_norm": 0.26710161566734314, + "learning_rate": 3.50169471434936e-05, + "loss": 0.1664, + "step": 24214 + }, + { + "epoch": 0.4319016872971141, + "grad_norm": 0.2517680525779724, + "learning_rate": 3.501552102227541e-05, + "loss": 0.1789, + "step": 24215 + }, + { + "epoch": 0.43191952341882783, + "grad_norm": 0.34554535150527954, + "learning_rate": 3.501409486223361e-05, + "loss": 0.0978, + "step": 24216 + }, + { + "epoch": 0.4319373595405415, + "grad_norm": 0.26666978001594543, + "learning_rate": 3.501266866337372e-05, + "loss": 0.1112, + "step": 24217 + }, + { + "epoch": 0.4319551956622552, + "grad_norm": 0.20696979761123657, + "learning_rate": 3.5011242425701266e-05, + "loss": 0.1441, + "step": 24218 + }, + { + "epoch": 0.4319730317839689, + "grad_norm": 0.4186224639415741, + "learning_rate": 3.500981614922177e-05, + "loss": 0.1665, + "step": 24219 + }, + { + "epoch": 0.4319908679056826, + "grad_norm": 0.27967795729637146, + "learning_rate": 3.500838983394078e-05, + "loss": 0.1569, + "step": 24220 + }, + { + "epoch": 0.4320087040273963, + "grad_norm": 0.5172132849693298, + "learning_rate": 3.5006963479863807e-05, + "loss": 0.1595, + "step": 24221 + }, + { + "epoch": 0.43202654014910996, + "grad_norm": 0.2538737952709198, + "learning_rate": 3.50055370869964e-05, + "loss": 0.151, + "step": 24222 + }, + { + "epoch": 0.43204437627082365, + "grad_norm": 0.3613035976886749, + "learning_rate": 3.500411065534407e-05, + "loss": 0.1734, + "step": 24223 + }, + { + "epoch": 0.4320622123925374, + "grad_norm": 0.31018775701522827, + "learning_rate": 3.5002684184912347e-05, + "loss": 0.1463, + "step": 24224 + }, + { + "epoch": 0.4320800485142511, + "grad_norm": 0.2684520184993744, + "learning_rate": 3.5001257675706767e-05, + "loss": 0.1515, + "step": 24225 + }, + { + "epoch": 0.43209788463596477, + "grad_norm": 0.27552419900894165, + "learning_rate": 3.4999831127732854e-05, + "loss": 0.1268, + "step": 24226 + }, + { + "epoch": 0.43211572075767846, + "grad_norm": 0.2797854542732239, + "learning_rate": 3.499840454099615e-05, + "loss": 0.1148, + "step": 24227 + }, + { + "epoch": 0.43213355687939214, + "grad_norm": 0.32367444038391113, + "learning_rate": 3.499697791550217e-05, + "loss": 0.2238, + "step": 24228 + }, + { + "epoch": 0.43215139300110583, + "grad_norm": 0.2627507448196411, + "learning_rate": 3.499555125125647e-05, + "loss": 0.1562, + "step": 24229 + }, + { + "epoch": 0.4321692291228195, + "grad_norm": 0.2851807773113251, + "learning_rate": 3.4994124548264535e-05, + "loss": 0.1497, + "step": 24230 + }, + { + "epoch": 0.4321870652445332, + "grad_norm": 0.2750164866447449, + "learning_rate": 3.499269780653193e-05, + "loss": 0.1554, + "step": 24231 + }, + { + "epoch": 0.4322049013662469, + "grad_norm": 0.24700988829135895, + "learning_rate": 3.499127102606418e-05, + "loss": 0.1287, + "step": 24232 + }, + { + "epoch": 0.43222273748796064, + "grad_norm": 0.2587621510028839, + "learning_rate": 3.4989844206866807e-05, + "loss": 0.1795, + "step": 24233 + }, + { + "epoch": 0.43224057360967433, + "grad_norm": 0.522209107875824, + "learning_rate": 3.498841734894535e-05, + "loss": 0.1916, + "step": 24234 + }, + { + "epoch": 0.432258409731388, + "grad_norm": 0.24742832779884338, + "learning_rate": 3.498699045230534e-05, + "loss": 0.1953, + "step": 24235 + }, + { + "epoch": 0.4322762458531017, + "grad_norm": 0.2535552978515625, + "learning_rate": 3.498556351695231e-05, + "loss": 0.1125, + "step": 24236 + }, + { + "epoch": 0.4322940819748154, + "grad_norm": 0.27269431948661804, + "learning_rate": 3.4984136542891776e-05, + "loss": 0.1095, + "step": 24237 + }, + { + "epoch": 0.4323119180965291, + "grad_norm": 0.27531227469444275, + "learning_rate": 3.498270953012929e-05, + "loss": 0.1251, + "step": 24238 + }, + { + "epoch": 0.43232975421824277, + "grad_norm": 0.2995506525039673, + "learning_rate": 3.498128247867036e-05, + "loss": 0.1533, + "step": 24239 + }, + { + "epoch": 0.43234759033995646, + "grad_norm": 0.21968425810337067, + "learning_rate": 3.497985538852055e-05, + "loss": 0.1373, + "step": 24240 + }, + { + "epoch": 0.4323654264616702, + "grad_norm": 0.4896388351917267, + "learning_rate": 3.497842825968537e-05, + "loss": 0.1508, + "step": 24241 + }, + { + "epoch": 0.4323832625833839, + "grad_norm": 0.19803155958652496, + "learning_rate": 3.497700109217035e-05, + "loss": 0.134, + "step": 24242 + }, + { + "epoch": 0.4324010987050976, + "grad_norm": 0.22787396609783173, + "learning_rate": 3.4975573885981024e-05, + "loss": 0.168, + "step": 24243 + }, + { + "epoch": 0.43241893482681126, + "grad_norm": 0.22332033514976501, + "learning_rate": 3.497414664112294e-05, + "loss": 0.0872, + "step": 24244 + }, + { + "epoch": 0.43243677094852495, + "grad_norm": 0.42162781953811646, + "learning_rate": 3.49727193576016e-05, + "loss": 0.2102, + "step": 24245 + }, + { + "epoch": 0.43245460707023864, + "grad_norm": 0.3007100820541382, + "learning_rate": 3.497129203542257e-05, + "loss": 0.1605, + "step": 24246 + }, + { + "epoch": 0.4324724431919523, + "grad_norm": 0.30453598499298096, + "learning_rate": 3.4969864674591364e-05, + "loss": 0.1426, + "step": 24247 + }, + { + "epoch": 0.432490279313666, + "grad_norm": 0.24274639785289764, + "learning_rate": 3.496843727511352e-05, + "loss": 0.1651, + "step": 24248 + }, + { + "epoch": 0.43250811543537976, + "grad_norm": 0.24301005899906158, + "learning_rate": 3.496700983699457e-05, + "loss": 0.1783, + "step": 24249 + }, + { + "epoch": 0.43252595155709345, + "grad_norm": 0.28563907742500305, + "learning_rate": 3.496558236024004e-05, + "loss": 0.19, + "step": 24250 + }, + { + "epoch": 0.43254378767880713, + "grad_norm": 0.2543340027332306, + "learning_rate": 3.496415484485549e-05, + "loss": 0.1495, + "step": 24251 + }, + { + "epoch": 0.4325616238005208, + "grad_norm": 0.31891199946403503, + "learning_rate": 3.496272729084642e-05, + "loss": 0.162, + "step": 24252 + }, + { + "epoch": 0.4325794599222345, + "grad_norm": 0.21086058020591736, + "learning_rate": 3.496129969821838e-05, + "loss": 0.1361, + "step": 24253 + }, + { + "epoch": 0.4325972960439482, + "grad_norm": 0.28397929668426514, + "learning_rate": 3.4959872066976895e-05, + "loss": 0.173, + "step": 24254 + }, + { + "epoch": 0.4326151321656619, + "grad_norm": 0.2805987596511841, + "learning_rate": 3.495844439712752e-05, + "loss": 0.1465, + "step": 24255 + }, + { + "epoch": 0.4326329682873756, + "grad_norm": 0.23381903767585754, + "learning_rate": 3.495701668867576e-05, + "loss": 0.1534, + "step": 24256 + }, + { + "epoch": 0.43265080440908926, + "grad_norm": 0.3065488040447235, + "learning_rate": 3.495558894162718e-05, + "loss": 0.1355, + "step": 24257 + }, + { + "epoch": 0.432668640530803, + "grad_norm": 0.30921217799186707, + "learning_rate": 3.495416115598729e-05, + "loss": 0.1901, + "step": 24258 + }, + { + "epoch": 0.4326864766525167, + "grad_norm": 0.32827669382095337, + "learning_rate": 3.4952733331761635e-05, + "loss": 0.1771, + "step": 24259 + }, + { + "epoch": 0.4327043127742304, + "grad_norm": 0.21493420004844666, + "learning_rate": 3.4951305468955745e-05, + "loss": 0.1005, + "step": 24260 + }, + { + "epoch": 0.43272214889594407, + "grad_norm": 0.27582138776779175, + "learning_rate": 3.494987756757516e-05, + "loss": 0.0742, + "step": 24261 + }, + { + "epoch": 0.43273998501765776, + "grad_norm": 0.4195863902568817, + "learning_rate": 3.494844962762541e-05, + "loss": 0.1158, + "step": 24262 + }, + { + "epoch": 0.43275782113937145, + "grad_norm": 0.19916103780269623, + "learning_rate": 3.494702164911204e-05, + "loss": 0.1196, + "step": 24263 + }, + { + "epoch": 0.43277565726108513, + "grad_norm": 0.3065303564071655, + "learning_rate": 3.4945593632040577e-05, + "loss": 0.1989, + "step": 24264 + }, + { + "epoch": 0.4327934933827988, + "grad_norm": 0.25682297348976135, + "learning_rate": 3.4944165576416553e-05, + "loss": 0.1704, + "step": 24265 + }, + { + "epoch": 0.43281132950451257, + "grad_norm": 0.2471446394920349, + "learning_rate": 3.494273748224551e-05, + "loss": 0.1596, + "step": 24266 + }, + { + "epoch": 0.43282916562622625, + "grad_norm": 0.27245932817459106, + "learning_rate": 3.494130934953298e-05, + "loss": 0.1577, + "step": 24267 + }, + { + "epoch": 0.43284700174793994, + "grad_norm": 0.22331424057483673, + "learning_rate": 3.493988117828451e-05, + "loss": 0.0874, + "step": 24268 + }, + { + "epoch": 0.43286483786965363, + "grad_norm": 0.3115536570549011, + "learning_rate": 3.493845296850562e-05, + "loss": 0.1924, + "step": 24269 + }, + { + "epoch": 0.4328826739913673, + "grad_norm": 0.2679843604564667, + "learning_rate": 3.493702472020186e-05, + "loss": 0.1563, + "step": 24270 + }, + { + "epoch": 0.432900510113081, + "grad_norm": 0.1973322033882141, + "learning_rate": 3.4935596433378757e-05, + "loss": 0.1501, + "step": 24271 + }, + { + "epoch": 0.4329183462347947, + "grad_norm": 0.26838403940200806, + "learning_rate": 3.493416810804185e-05, + "loss": 0.2326, + "step": 24272 + }, + { + "epoch": 0.4329361823565084, + "grad_norm": 0.2561957836151123, + "learning_rate": 3.4932739744196675e-05, + "loss": 0.152, + "step": 24273 + }, + { + "epoch": 0.43295401847822207, + "grad_norm": 0.2538825273513794, + "learning_rate": 3.4931311341848774e-05, + "loss": 0.1391, + "step": 24274 + }, + { + "epoch": 0.4329718545999358, + "grad_norm": 0.2524832487106323, + "learning_rate": 3.4929882901003674e-05, + "loss": 0.1531, + "step": 24275 + }, + { + "epoch": 0.4329896907216495, + "grad_norm": 0.32115912437438965, + "learning_rate": 3.492845442166692e-05, + "loss": 0.1712, + "step": 24276 + }, + { + "epoch": 0.4330075268433632, + "grad_norm": 0.46569839119911194, + "learning_rate": 3.4927025903844045e-05, + "loss": 0.1601, + "step": 24277 + }, + { + "epoch": 0.4330253629650769, + "grad_norm": 0.28514623641967773, + "learning_rate": 3.49255973475406e-05, + "loss": 0.1382, + "step": 24278 + }, + { + "epoch": 0.43304319908679056, + "grad_norm": 0.24826541543006897, + "learning_rate": 3.492416875276211e-05, + "loss": 0.1541, + "step": 24279 + }, + { + "epoch": 0.43306103520850425, + "grad_norm": 0.29415425658226013, + "learning_rate": 3.4922740119514106e-05, + "loss": 0.1897, + "step": 24280 + }, + { + "epoch": 0.43307887133021794, + "grad_norm": 0.30140557885169983, + "learning_rate": 3.492131144780213e-05, + "loss": 0.1122, + "step": 24281 + }, + { + "epoch": 0.43309670745193163, + "grad_norm": 0.29900699853897095, + "learning_rate": 3.491988273763173e-05, + "loss": 0.1283, + "step": 24282 + }, + { + "epoch": 0.43311454357364537, + "grad_norm": 0.29860368371009827, + "learning_rate": 3.491845398900844e-05, + "loss": 0.1613, + "step": 24283 + }, + { + "epoch": 0.43313237969535906, + "grad_norm": 0.3095369040966034, + "learning_rate": 3.49170252019378e-05, + "loss": 0.1299, + "step": 24284 + }, + { + "epoch": 0.43315021581707275, + "grad_norm": 0.2636614441871643, + "learning_rate": 3.491559637642534e-05, + "loss": 0.1697, + "step": 24285 + }, + { + "epoch": 0.43316805193878644, + "grad_norm": 0.23080094158649445, + "learning_rate": 3.4914167512476605e-05, + "loss": 0.1372, + "step": 24286 + }, + { + "epoch": 0.4331858880605001, + "grad_norm": 0.20795312523841858, + "learning_rate": 3.491273861009713e-05, + "loss": 0.1447, + "step": 24287 + }, + { + "epoch": 0.4332037241822138, + "grad_norm": 0.2338692992925644, + "learning_rate": 3.491130966929246e-05, + "loss": 0.1398, + "step": 24288 + }, + { + "epoch": 0.4332215603039275, + "grad_norm": 0.4019908010959625, + "learning_rate": 3.490988069006813e-05, + "loss": 0.1483, + "step": 24289 + }, + { + "epoch": 0.4332393964256412, + "grad_norm": 0.3317795693874359, + "learning_rate": 3.4908451672429674e-05, + "loss": 0.1244, + "step": 24290 + }, + { + "epoch": 0.4332572325473549, + "grad_norm": 0.3120705783367157, + "learning_rate": 3.490702261638265e-05, + "loss": 0.1439, + "step": 24291 + }, + { + "epoch": 0.4332750686690686, + "grad_norm": 0.2482733130455017, + "learning_rate": 3.4905593521932575e-05, + "loss": 0.1218, + "step": 24292 + }, + { + "epoch": 0.4332929047907823, + "grad_norm": 0.233924001455307, + "learning_rate": 3.4904164389084996e-05, + "loss": 0.1367, + "step": 24293 + }, + { + "epoch": 0.433310740912496, + "grad_norm": 0.32161903381347656, + "learning_rate": 3.490273521784546e-05, + "loss": 0.1596, + "step": 24294 + }, + { + "epoch": 0.4333285770342097, + "grad_norm": 0.39300015568733215, + "learning_rate": 3.490130600821949e-05, + "loss": 0.1387, + "step": 24295 + }, + { + "epoch": 0.43334641315592337, + "grad_norm": 0.33966004848480225, + "learning_rate": 3.4899876760212655e-05, + "loss": 0.2224, + "step": 24296 + }, + { + "epoch": 0.43336424927763706, + "grad_norm": 0.26362356543540955, + "learning_rate": 3.489844747383047e-05, + "loss": 0.1287, + "step": 24297 + }, + { + "epoch": 0.43338208539935075, + "grad_norm": 0.3174605071544647, + "learning_rate": 3.4897018149078494e-05, + "loss": 0.1574, + "step": 24298 + }, + { + "epoch": 0.43339992152106444, + "grad_norm": 0.20997481048107147, + "learning_rate": 3.489558878596224e-05, + "loss": 0.1897, + "step": 24299 + }, + { + "epoch": 0.4334177576427782, + "grad_norm": 0.37025001645088196, + "learning_rate": 3.489415938448728e-05, + "loss": 0.2124, + "step": 24300 + }, + { + "epoch": 0.43343559376449187, + "grad_norm": 0.2173960655927658, + "learning_rate": 3.489272994465914e-05, + "loss": 0.2001, + "step": 24301 + }, + { + "epoch": 0.43345342988620555, + "grad_norm": 0.2510943114757538, + "learning_rate": 3.4891300466483354e-05, + "loss": 0.2245, + "step": 24302 + }, + { + "epoch": 0.43347126600791924, + "grad_norm": 0.22401520609855652, + "learning_rate": 3.488987094996547e-05, + "loss": 0.154, + "step": 24303 + }, + { + "epoch": 0.43348910212963293, + "grad_norm": 0.3415440320968628, + "learning_rate": 3.4888441395111036e-05, + "loss": 0.123, + "step": 24304 + }, + { + "epoch": 0.4335069382513466, + "grad_norm": 0.27179810404777527, + "learning_rate": 3.488701180192559e-05, + "loss": 0.1551, + "step": 24305 + }, + { + "epoch": 0.4335247743730603, + "grad_norm": 0.31604230403900146, + "learning_rate": 3.488558217041467e-05, + "loss": 0.186, + "step": 24306 + }, + { + "epoch": 0.433542610494774, + "grad_norm": 0.3194698989391327, + "learning_rate": 3.488415250058382e-05, + "loss": 0.0988, + "step": 24307 + }, + { + "epoch": 0.43356044661648774, + "grad_norm": 0.2013579159975052, + "learning_rate": 3.4882722792438574e-05, + "loss": 0.1288, + "step": 24308 + }, + { + "epoch": 0.4335782827382014, + "grad_norm": 0.24615490436553955, + "learning_rate": 3.4881293045984485e-05, + "loss": 0.1627, + "step": 24309 + }, + { + "epoch": 0.4335961188599151, + "grad_norm": 0.21517011523246765, + "learning_rate": 3.487986326122709e-05, + "loss": 0.1356, + "step": 24310 + }, + { + "epoch": 0.4336139549816288, + "grad_norm": 0.34740614891052246, + "learning_rate": 3.4878433438171944e-05, + "loss": 0.1811, + "step": 24311 + }, + { + "epoch": 0.4336317911033425, + "grad_norm": 0.28739190101623535, + "learning_rate": 3.4877003576824565e-05, + "loss": 0.1492, + "step": 24312 + }, + { + "epoch": 0.4336496272250562, + "grad_norm": 0.3440515995025635, + "learning_rate": 3.487557367719051e-05, + "loss": 0.173, + "step": 24313 + }, + { + "epoch": 0.43366746334676987, + "grad_norm": 0.30778539180755615, + "learning_rate": 3.487414373927532e-05, + "loss": 0.1887, + "step": 24314 + }, + { + "epoch": 0.43368529946848355, + "grad_norm": 0.2444865107536316, + "learning_rate": 3.487271376308454e-05, + "loss": 0.1526, + "step": 24315 + }, + { + "epoch": 0.43370313559019724, + "grad_norm": 0.21706320345401764, + "learning_rate": 3.487128374862371e-05, + "loss": 0.1047, + "step": 24316 + }, + { + "epoch": 0.433720971711911, + "grad_norm": 0.2686428129673004, + "learning_rate": 3.4869853695898384e-05, + "loss": 0.1823, + "step": 24317 + }, + { + "epoch": 0.4337388078336247, + "grad_norm": 0.24340467154979706, + "learning_rate": 3.486842360491409e-05, + "loss": 0.125, + "step": 24318 + }, + { + "epoch": 0.43375664395533836, + "grad_norm": 0.23119409382343292, + "learning_rate": 3.486699347567638e-05, + "loss": 0.1502, + "step": 24319 + }, + { + "epoch": 0.43377448007705205, + "grad_norm": 0.24016402661800385, + "learning_rate": 3.4865563308190796e-05, + "loss": 0.1656, + "step": 24320 + }, + { + "epoch": 0.43379231619876574, + "grad_norm": 0.2949119806289673, + "learning_rate": 3.486413310246287e-05, + "loss": 0.1833, + "step": 24321 + }, + { + "epoch": 0.4338101523204794, + "grad_norm": 0.2694990038871765, + "learning_rate": 3.486270285849816e-05, + "loss": 0.2416, + "step": 24322 + }, + { + "epoch": 0.4338279884421931, + "grad_norm": 0.23864035308361053, + "learning_rate": 3.486127257630222e-05, + "loss": 0.1727, + "step": 24323 + }, + { + "epoch": 0.4338458245639068, + "grad_norm": 0.32019928097724915, + "learning_rate": 3.485984225588058e-05, + "loss": 0.1307, + "step": 24324 + }, + { + "epoch": 0.43386366068562054, + "grad_norm": 0.2653842568397522, + "learning_rate": 3.4858411897238774e-05, + "loss": 0.1546, + "step": 24325 + }, + { + "epoch": 0.43388149680733423, + "grad_norm": 0.32241615653038025, + "learning_rate": 3.4856981500382365e-05, + "loss": 0.1408, + "step": 24326 + }, + { + "epoch": 0.4338993329290479, + "grad_norm": 0.2626042068004608, + "learning_rate": 3.485555106531689e-05, + "loss": 0.1716, + "step": 24327 + }, + { + "epoch": 0.4339171690507616, + "grad_norm": 0.30881282687187195, + "learning_rate": 3.48541205920479e-05, + "loss": 0.1203, + "step": 24328 + }, + { + "epoch": 0.4339350051724753, + "grad_norm": 0.28870540857315063, + "learning_rate": 3.485269008058093e-05, + "loss": 0.197, + "step": 24329 + }, + { + "epoch": 0.433952841294189, + "grad_norm": 0.36803141236305237, + "learning_rate": 3.485125953092153e-05, + "loss": 0.1556, + "step": 24330 + }, + { + "epoch": 0.4339706774159027, + "grad_norm": 0.18751604855060577, + "learning_rate": 3.4849828943075245e-05, + "loss": 0.0948, + "step": 24331 + }, + { + "epoch": 0.43398851353761636, + "grad_norm": 0.3452558219432831, + "learning_rate": 3.484839831704762e-05, + "loss": 0.1405, + "step": 24332 + }, + { + "epoch": 0.43400634965933005, + "grad_norm": 0.23843905329704285, + "learning_rate": 3.484696765284421e-05, + "loss": 0.1922, + "step": 24333 + }, + { + "epoch": 0.4340241857810438, + "grad_norm": 0.23658090829849243, + "learning_rate": 3.484553695047054e-05, + "loss": 0.1361, + "step": 24334 + }, + { + "epoch": 0.4340420219027575, + "grad_norm": 0.21467217803001404, + "learning_rate": 3.484410620993218e-05, + "loss": 0.1519, + "step": 24335 + }, + { + "epoch": 0.43405985802447117, + "grad_norm": 0.26574647426605225, + "learning_rate": 3.484267543123465e-05, + "loss": 0.1388, + "step": 24336 + }, + { + "epoch": 0.43407769414618486, + "grad_norm": 0.30465880036354065, + "learning_rate": 3.4841244614383525e-05, + "loss": 0.0962, + "step": 24337 + }, + { + "epoch": 0.43409553026789854, + "grad_norm": 0.2994210720062256, + "learning_rate": 3.483981375938433e-05, + "loss": 0.1904, + "step": 24338 + }, + { + "epoch": 0.43411336638961223, + "grad_norm": 0.22072015702724457, + "learning_rate": 3.483838286624262e-05, + "loss": 0.1376, + "step": 24339 + }, + { + "epoch": 0.4341312025113259, + "grad_norm": 0.2772163450717926, + "learning_rate": 3.483695193496394e-05, + "loss": 0.1219, + "step": 24340 + }, + { + "epoch": 0.4341490386330396, + "grad_norm": 0.2658138871192932, + "learning_rate": 3.483552096555384e-05, + "loss": 0.1368, + "step": 24341 + }, + { + "epoch": 0.43416687475475335, + "grad_norm": 0.29843610525131226, + "learning_rate": 3.4834089958017854e-05, + "loss": 0.1522, + "step": 24342 + }, + { + "epoch": 0.43418471087646704, + "grad_norm": 0.21727637946605682, + "learning_rate": 3.4832658912361544e-05, + "loss": 0.1289, + "step": 24343 + }, + { + "epoch": 0.4342025469981807, + "grad_norm": 0.31400787830352783, + "learning_rate": 3.483122782859046e-05, + "loss": 0.1837, + "step": 24344 + }, + { + "epoch": 0.4342203831198944, + "grad_norm": 0.32936444878578186, + "learning_rate": 3.482979670671013e-05, + "loss": 0.1496, + "step": 24345 + }, + { + "epoch": 0.4342382192416081, + "grad_norm": 0.31356382369995117, + "learning_rate": 3.4828365546726114e-05, + "loss": 0.1352, + "step": 24346 + }, + { + "epoch": 0.4342560553633218, + "grad_norm": 0.32013651728630066, + "learning_rate": 3.482693434864396e-05, + "loss": 0.1606, + "step": 24347 + }, + { + "epoch": 0.4342738914850355, + "grad_norm": 0.24525177478790283, + "learning_rate": 3.482550311246922e-05, + "loss": 0.1681, + "step": 24348 + }, + { + "epoch": 0.43429172760674917, + "grad_norm": 0.28695160150527954, + "learning_rate": 3.482407183820743e-05, + "loss": 0.1751, + "step": 24349 + }, + { + "epoch": 0.4343095637284629, + "grad_norm": 0.27639666199684143, + "learning_rate": 3.4822640525864146e-05, + "loss": 0.1215, + "step": 24350 + }, + { + "epoch": 0.4343273998501766, + "grad_norm": 0.28301528096199036, + "learning_rate": 3.4821209175444914e-05, + "loss": 0.2216, + "step": 24351 + }, + { + "epoch": 0.4343452359718903, + "grad_norm": 0.27372410893440247, + "learning_rate": 3.481977778695529e-05, + "loss": 0.113, + "step": 24352 + }, + { + "epoch": 0.434363072093604, + "grad_norm": 0.2885031998157501, + "learning_rate": 3.48183463604008e-05, + "loss": 0.1431, + "step": 24353 + }, + { + "epoch": 0.43438090821531766, + "grad_norm": 0.24308699369430542, + "learning_rate": 3.4816914895787026e-05, + "loss": 0.1446, + "step": 24354 + }, + { + "epoch": 0.43439874433703135, + "grad_norm": 0.2509942650794983, + "learning_rate": 3.481548339311948e-05, + "loss": 0.1755, + "step": 24355 + }, + { + "epoch": 0.43441658045874504, + "grad_norm": 0.19815278053283691, + "learning_rate": 3.481405185240375e-05, + "loss": 0.1075, + "step": 24356 + }, + { + "epoch": 0.4344344165804587, + "grad_norm": 0.30510663986206055, + "learning_rate": 3.481262027364536e-05, + "loss": 0.1764, + "step": 24357 + }, + { + "epoch": 0.4344522527021724, + "grad_norm": 0.26673340797424316, + "learning_rate": 3.4811188656849856e-05, + "loss": 0.1731, + "step": 24358 + }, + { + "epoch": 0.43447008882388616, + "grad_norm": 0.231350839138031, + "learning_rate": 3.4809757002022805e-05, + "loss": 0.1485, + "step": 24359 + }, + { + "epoch": 0.43448792494559985, + "grad_norm": 0.24012552201747894, + "learning_rate": 3.480832530916974e-05, + "loss": 0.1545, + "step": 24360 + }, + { + "epoch": 0.43450576106731353, + "grad_norm": 0.28711503744125366, + "learning_rate": 3.480689357829623e-05, + "loss": 0.1513, + "step": 24361 + }, + { + "epoch": 0.4345235971890272, + "grad_norm": 0.18398378789424896, + "learning_rate": 3.48054618094078e-05, + "loss": 0.1048, + "step": 24362 + }, + { + "epoch": 0.4345414333107409, + "grad_norm": 0.23202653229236603, + "learning_rate": 3.480403000251002e-05, + "loss": 0.1233, + "step": 24363 + }, + { + "epoch": 0.4345592694324546, + "grad_norm": 0.2332935333251953, + "learning_rate": 3.480259815760843e-05, + "loss": 0.1565, + "step": 24364 + }, + { + "epoch": 0.4345771055541683, + "grad_norm": 0.24581611156463623, + "learning_rate": 3.480116627470859e-05, + "loss": 0.1759, + "step": 24365 + }, + { + "epoch": 0.434594941675882, + "grad_norm": 0.21847109496593475, + "learning_rate": 3.479973435381604e-05, + "loss": 0.1049, + "step": 24366 + }, + { + "epoch": 0.4346127777975957, + "grad_norm": 0.35353147983551025, + "learning_rate": 3.4798302394936336e-05, + "loss": 0.1559, + "step": 24367 + }, + { + "epoch": 0.4346306139193094, + "grad_norm": 0.2123904675245285, + "learning_rate": 3.479687039807503e-05, + "loss": 0.1013, + "step": 24368 + }, + { + "epoch": 0.4346484500410231, + "grad_norm": 0.2722563147544861, + "learning_rate": 3.479543836323767e-05, + "loss": 0.1193, + "step": 24369 + }, + { + "epoch": 0.4346662861627368, + "grad_norm": 0.36920273303985596, + "learning_rate": 3.4794006290429806e-05, + "loss": 0.2087, + "step": 24370 + }, + { + "epoch": 0.43468412228445047, + "grad_norm": 0.2945559024810791, + "learning_rate": 3.479257417965699e-05, + "loss": 0.161, + "step": 24371 + }, + { + "epoch": 0.43470195840616416, + "grad_norm": 0.45980262756347656, + "learning_rate": 3.4791142030924774e-05, + "loss": 0.1603, + "step": 24372 + }, + { + "epoch": 0.43471979452787785, + "grad_norm": 0.2901257276535034, + "learning_rate": 3.478970984423871e-05, + "loss": 0.157, + "step": 24373 + }, + { + "epoch": 0.43473763064959153, + "grad_norm": 0.27465295791625977, + "learning_rate": 3.4788277619604354e-05, + "loss": 0.1617, + "step": 24374 + }, + { + "epoch": 0.4347554667713052, + "grad_norm": 0.28169938921928406, + "learning_rate": 3.478684535702725e-05, + "loss": 0.1469, + "step": 24375 + }, + { + "epoch": 0.43477330289301896, + "grad_norm": 0.311553955078125, + "learning_rate": 3.478541305651295e-05, + "loss": 0.1776, + "step": 24376 + }, + { + "epoch": 0.43479113901473265, + "grad_norm": 0.253508061170578, + "learning_rate": 3.4783980718067014e-05, + "loss": 0.1569, + "step": 24377 + }, + { + "epoch": 0.43480897513644634, + "grad_norm": 0.31083858013153076, + "learning_rate": 3.478254834169498e-05, + "loss": 0.1865, + "step": 24378 + }, + { + "epoch": 0.43482681125816003, + "grad_norm": 0.2461954951286316, + "learning_rate": 3.478111592740242e-05, + "loss": 0.107, + "step": 24379 + }, + { + "epoch": 0.4348446473798737, + "grad_norm": 0.20668452978134155, + "learning_rate": 3.477968347519488e-05, + "loss": 0.1171, + "step": 24380 + }, + { + "epoch": 0.4348624835015874, + "grad_norm": 0.2546831965446472, + "learning_rate": 3.477825098507789e-05, + "loss": 0.1533, + "step": 24381 + }, + { + "epoch": 0.4348803196233011, + "grad_norm": 0.4030230939388275, + "learning_rate": 3.4776818457057045e-05, + "loss": 0.128, + "step": 24382 + }, + { + "epoch": 0.4348981557450148, + "grad_norm": 0.26909109950065613, + "learning_rate": 3.477538589113786e-05, + "loss": 0.1504, + "step": 24383 + }, + { + "epoch": 0.4349159918667285, + "grad_norm": 0.22686009109020233, + "learning_rate": 3.477395328732591e-05, + "loss": 0.1434, + "step": 24384 + }, + { + "epoch": 0.4349338279884422, + "grad_norm": 0.22543098032474518, + "learning_rate": 3.477252064562674e-05, + "loss": 0.1363, + "step": 24385 + }, + { + "epoch": 0.4349516641101559, + "grad_norm": 0.31914573907852173, + "learning_rate": 3.4771087966045895e-05, + "loss": 0.1374, + "step": 24386 + }, + { + "epoch": 0.4349695002318696, + "grad_norm": 0.37344828248023987, + "learning_rate": 3.476965524858895e-05, + "loss": 0.2053, + "step": 24387 + }, + { + "epoch": 0.4349873363535833, + "grad_norm": 0.31551864743232727, + "learning_rate": 3.476822249326144e-05, + "loss": 0.1464, + "step": 24388 + }, + { + "epoch": 0.43500517247529696, + "grad_norm": 0.33455032110214233, + "learning_rate": 3.476678970006893e-05, + "loss": 0.1533, + "step": 24389 + }, + { + "epoch": 0.43502300859701065, + "grad_norm": 0.2222704142332077, + "learning_rate": 3.476535686901697e-05, + "loss": 0.1452, + "step": 24390 + }, + { + "epoch": 0.43504084471872434, + "grad_norm": 0.26246216893196106, + "learning_rate": 3.476392400011112e-05, + "loss": 0.1138, + "step": 24391 + }, + { + "epoch": 0.43505868084043803, + "grad_norm": 0.26675572991371155, + "learning_rate": 3.476249109335691e-05, + "loss": 0.121, + "step": 24392 + }, + { + "epoch": 0.43507651696215177, + "grad_norm": 0.31257131695747375, + "learning_rate": 3.476105814875993e-05, + "loss": 0.1382, + "step": 24393 + }, + { + "epoch": 0.43509435308386546, + "grad_norm": 0.2697868347167969, + "learning_rate": 3.475962516632571e-05, + "loss": 0.1276, + "step": 24394 + }, + { + "epoch": 0.43511218920557915, + "grad_norm": 0.22796927392482758, + "learning_rate": 3.475819214605981e-05, + "loss": 0.1111, + "step": 24395 + }, + { + "epoch": 0.43513002532729284, + "grad_norm": 0.3646264672279358, + "learning_rate": 3.4756759087967794e-05, + "loss": 0.1079, + "step": 24396 + }, + { + "epoch": 0.4351478614490065, + "grad_norm": 0.30447709560394287, + "learning_rate": 3.4755325992055204e-05, + "loss": 0.1665, + "step": 24397 + }, + { + "epoch": 0.4351656975707202, + "grad_norm": 0.44530022144317627, + "learning_rate": 3.47538928583276e-05, + "loss": 0.1765, + "step": 24398 + }, + { + "epoch": 0.4351835336924339, + "grad_norm": 0.21506398916244507, + "learning_rate": 3.475245968679054e-05, + "loss": 0.1671, + "step": 24399 + }, + { + "epoch": 0.4352013698141476, + "grad_norm": 0.1718909740447998, + "learning_rate": 3.475102647744958e-05, + "loss": 0.1083, + "step": 24400 + }, + { + "epoch": 0.43521920593586133, + "grad_norm": 0.3857828378677368, + "learning_rate": 3.4749593230310275e-05, + "loss": 0.1162, + "step": 24401 + }, + { + "epoch": 0.435237042057575, + "grad_norm": 0.31044501066207886, + "learning_rate": 3.474815994537818e-05, + "loss": 0.1883, + "step": 24402 + }, + { + "epoch": 0.4352548781792887, + "grad_norm": 0.2804064452648163, + "learning_rate": 3.474672662265884e-05, + "loss": 0.144, + "step": 24403 + }, + { + "epoch": 0.4352727143010024, + "grad_norm": 0.24959516525268555, + "learning_rate": 3.474529326215783e-05, + "loss": 0.1664, + "step": 24404 + }, + { + "epoch": 0.4352905504227161, + "grad_norm": 0.29151400923728943, + "learning_rate": 3.4743859863880696e-05, + "loss": 0.1422, + "step": 24405 + }, + { + "epoch": 0.43530838654442977, + "grad_norm": 0.2321368306875229, + "learning_rate": 3.474242642783299e-05, + "loss": 0.1187, + "step": 24406 + }, + { + "epoch": 0.43532622266614346, + "grad_norm": 0.29965221881866455, + "learning_rate": 3.474099295402028e-05, + "loss": 0.1111, + "step": 24407 + }, + { + "epoch": 0.43534405878785715, + "grad_norm": 0.2102593332529068, + "learning_rate": 3.4739559442448124e-05, + "loss": 0.1484, + "step": 24408 + }, + { + "epoch": 0.4353618949095709, + "grad_norm": 0.3023035228252411, + "learning_rate": 3.4738125893122064e-05, + "loss": 0.1514, + "step": 24409 + }, + { + "epoch": 0.4353797310312846, + "grad_norm": 0.15038491785526276, + "learning_rate": 3.4736692306047655e-05, + "loss": 0.1222, + "step": 24410 + }, + { + "epoch": 0.43539756715299827, + "grad_norm": 0.2755349278450012, + "learning_rate": 3.473525868123048e-05, + "loss": 0.1637, + "step": 24411 + }, + { + "epoch": 0.43541540327471195, + "grad_norm": 0.32242774963378906, + "learning_rate": 3.473382501867608e-05, + "loss": 0.1833, + "step": 24412 + }, + { + "epoch": 0.43543323939642564, + "grad_norm": 0.2398013025522232, + "learning_rate": 3.4732391318389997e-05, + "loss": 0.1611, + "step": 24413 + }, + { + "epoch": 0.43545107551813933, + "grad_norm": 0.21700620651245117, + "learning_rate": 3.473095758037781e-05, + "loss": 0.1624, + "step": 24414 + }, + { + "epoch": 0.435468911639853, + "grad_norm": 0.19714149832725525, + "learning_rate": 3.472952380464508e-05, + "loss": 0.1341, + "step": 24415 + }, + { + "epoch": 0.4354867477615667, + "grad_norm": 0.4227885603904724, + "learning_rate": 3.4728089991197345e-05, + "loss": 0.1762, + "step": 24416 + }, + { + "epoch": 0.4355045838832804, + "grad_norm": 0.24466878175735474, + "learning_rate": 3.4726656140040174e-05, + "loss": 0.1636, + "step": 24417 + }, + { + "epoch": 0.43552242000499414, + "grad_norm": 0.36232542991638184, + "learning_rate": 3.472522225117912e-05, + "loss": 0.1802, + "step": 24418 + }, + { + "epoch": 0.4355402561267078, + "grad_norm": 0.2487695813179016, + "learning_rate": 3.4723788324619754e-05, + "loss": 0.1443, + "step": 24419 + }, + { + "epoch": 0.4355580922484215, + "grad_norm": 0.2208571434020996, + "learning_rate": 3.4722354360367625e-05, + "loss": 0.1516, + "step": 24420 + }, + { + "epoch": 0.4355759283701352, + "grad_norm": 0.3345034718513489, + "learning_rate": 3.4720920358428297e-05, + "loss": 0.1344, + "step": 24421 + }, + { + "epoch": 0.4355937644918489, + "grad_norm": 0.21832901239395142, + "learning_rate": 3.471948631880732e-05, + "loss": 0.1453, + "step": 24422 + }, + { + "epoch": 0.4356116006135626, + "grad_norm": 0.38871943950653076, + "learning_rate": 3.471805224151025e-05, + "loss": 0.1545, + "step": 24423 + }, + { + "epoch": 0.43562943673527627, + "grad_norm": 0.23371315002441406, + "learning_rate": 3.4716618126542665e-05, + "loss": 0.1848, + "step": 24424 + }, + { + "epoch": 0.43564727285698995, + "grad_norm": 0.30884552001953125, + "learning_rate": 3.47151839739101e-05, + "loss": 0.1675, + "step": 24425 + }, + { + "epoch": 0.4356651089787037, + "grad_norm": 0.297853946685791, + "learning_rate": 3.471374978361813e-05, + "loss": 0.1266, + "step": 24426 + }, + { + "epoch": 0.4356829451004174, + "grad_norm": 0.48763370513916016, + "learning_rate": 3.471231555567231e-05, + "loss": 0.1414, + "step": 24427 + }, + { + "epoch": 0.4357007812221311, + "grad_norm": 0.21908225119113922, + "learning_rate": 3.471088129007821e-05, + "loss": 0.1142, + "step": 24428 + }, + { + "epoch": 0.43571861734384476, + "grad_norm": 0.33073118329048157, + "learning_rate": 3.470944698684137e-05, + "loss": 0.1675, + "step": 24429 + }, + { + "epoch": 0.43573645346555845, + "grad_norm": 0.2916359603404999, + "learning_rate": 3.470801264596737e-05, + "loss": 0.1733, + "step": 24430 + }, + { + "epoch": 0.43575428958727214, + "grad_norm": 0.3057893216609955, + "learning_rate": 3.470657826746175e-05, + "loss": 0.1968, + "step": 24431 + }, + { + "epoch": 0.4357721257089858, + "grad_norm": 0.19366583228111267, + "learning_rate": 3.4705143851330086e-05, + "loss": 0.1315, + "step": 24432 + }, + { + "epoch": 0.4357899618306995, + "grad_norm": 0.2801244556903839, + "learning_rate": 3.470370939757793e-05, + "loss": 0.1529, + "step": 24433 + }, + { + "epoch": 0.4358077979524132, + "grad_norm": 0.25890833139419556, + "learning_rate": 3.470227490621084e-05, + "loss": 0.157, + "step": 24434 + }, + { + "epoch": 0.43582563407412694, + "grad_norm": 0.4473634362220764, + "learning_rate": 3.470084037723439e-05, + "loss": 0.105, + "step": 24435 + }, + { + "epoch": 0.43584347019584063, + "grad_norm": 0.2150295078754425, + "learning_rate": 3.469940581065413e-05, + "loss": 0.1579, + "step": 24436 + }, + { + "epoch": 0.4358613063175543, + "grad_norm": 0.2542577087879181, + "learning_rate": 3.4697971206475624e-05, + "loss": 0.1188, + "step": 24437 + }, + { + "epoch": 0.435879142439268, + "grad_norm": 0.21434485912322998, + "learning_rate": 3.4696536564704425e-05, + "loss": 0.1524, + "step": 24438 + }, + { + "epoch": 0.4358969785609817, + "grad_norm": 0.24575546383857727, + "learning_rate": 3.4695101885346106e-05, + "loss": 0.1143, + "step": 24439 + }, + { + "epoch": 0.4359148146826954, + "grad_norm": 0.2871822118759155, + "learning_rate": 3.4693667168406216e-05, + "loss": 0.1465, + "step": 24440 + }, + { + "epoch": 0.43593265080440907, + "grad_norm": 0.25435957312583923, + "learning_rate": 3.469223241389034e-05, + "loss": 0.1464, + "step": 24441 + }, + { + "epoch": 0.43595048692612276, + "grad_norm": 0.2695740759372711, + "learning_rate": 3.469079762180401e-05, + "loss": 0.1536, + "step": 24442 + }, + { + "epoch": 0.4359683230478365, + "grad_norm": 0.2845589220523834, + "learning_rate": 3.4689362792152805e-05, + "loss": 0.1765, + "step": 24443 + }, + { + "epoch": 0.4359861591695502, + "grad_norm": 0.24259649217128754, + "learning_rate": 3.468792792494228e-05, + "loss": 0.143, + "step": 24444 + }, + { + "epoch": 0.4360039952912639, + "grad_norm": 0.44644781947135925, + "learning_rate": 3.4686493020178014e-05, + "loss": 0.1331, + "step": 24445 + }, + { + "epoch": 0.43602183141297757, + "grad_norm": 0.22415953874588013, + "learning_rate": 3.468505807786554e-05, + "loss": 0.1479, + "step": 24446 + }, + { + "epoch": 0.43603966753469126, + "grad_norm": 0.25826209783554077, + "learning_rate": 3.4683623098010444e-05, + "loss": 0.1501, + "step": 24447 + }, + { + "epoch": 0.43605750365640494, + "grad_norm": 0.24463853240013123, + "learning_rate": 3.468218808061828e-05, + "loss": 0.163, + "step": 24448 + }, + { + "epoch": 0.43607533977811863, + "grad_norm": 0.28812387585639954, + "learning_rate": 3.46807530256946e-05, + "loss": 0.1986, + "step": 24449 + }, + { + "epoch": 0.4360931758998323, + "grad_norm": 0.20546510815620422, + "learning_rate": 3.4679317933244996e-05, + "loss": 0.1203, + "step": 24450 + }, + { + "epoch": 0.436111012021546, + "grad_norm": 0.22608225047588348, + "learning_rate": 3.4677882803274994e-05, + "loss": 0.1226, + "step": 24451 + }, + { + "epoch": 0.43612884814325975, + "grad_norm": 0.27659979462623596, + "learning_rate": 3.4676447635790195e-05, + "loss": 0.1158, + "step": 24452 + }, + { + "epoch": 0.43614668426497344, + "grad_norm": 0.4154391884803772, + "learning_rate": 3.467501243079613e-05, + "loss": 0.1683, + "step": 24453 + }, + { + "epoch": 0.4361645203866871, + "grad_norm": 0.2342330664396286, + "learning_rate": 3.4673577188298375e-05, + "loss": 0.1404, + "step": 24454 + }, + { + "epoch": 0.4361823565084008, + "grad_norm": 0.2267889678478241, + "learning_rate": 3.4672141908302495e-05, + "loss": 0.0653, + "step": 24455 + }, + { + "epoch": 0.4362001926301145, + "grad_norm": 0.3321382999420166, + "learning_rate": 3.467070659081405e-05, + "loss": 0.164, + "step": 24456 + }, + { + "epoch": 0.4362180287518282, + "grad_norm": 0.2762233018875122, + "learning_rate": 3.4669271235838615e-05, + "loss": 0.1262, + "step": 24457 + }, + { + "epoch": 0.4362358648735419, + "grad_norm": 0.24827255308628082, + "learning_rate": 3.466783584338174e-05, + "loss": 0.148, + "step": 24458 + }, + { + "epoch": 0.43625370099525557, + "grad_norm": 0.39366522431373596, + "learning_rate": 3.466640041344899e-05, + "loss": 0.2163, + "step": 24459 + }, + { + "epoch": 0.4362715371169693, + "grad_norm": 0.24751730263233185, + "learning_rate": 3.4664964946045945e-05, + "loss": 0.1344, + "step": 24460 + }, + { + "epoch": 0.436289373238683, + "grad_norm": 0.3638309836387634, + "learning_rate": 3.4663529441178144e-05, + "loss": 0.125, + "step": 24461 + }, + { + "epoch": 0.4363072093603967, + "grad_norm": 0.2329244762659073, + "learning_rate": 3.4662093898851166e-05, + "loss": 0.1616, + "step": 24462 + }, + { + "epoch": 0.4363250454821104, + "grad_norm": 0.2134053260087967, + "learning_rate": 3.466065831907058e-05, + "loss": 0.1216, + "step": 24463 + }, + { + "epoch": 0.43634288160382406, + "grad_norm": 0.36467444896698, + "learning_rate": 3.465922270184195e-05, + "loss": 0.1302, + "step": 24464 + }, + { + "epoch": 0.43636071772553775, + "grad_norm": 0.4559136927127838, + "learning_rate": 3.465778704717083e-05, + "loss": 0.1351, + "step": 24465 + }, + { + "epoch": 0.43637855384725144, + "grad_norm": 0.2538378834724426, + "learning_rate": 3.4656351355062796e-05, + "loss": 0.127, + "step": 24466 + }, + { + "epoch": 0.4363963899689651, + "grad_norm": 0.2677273452281952, + "learning_rate": 3.46549156255234e-05, + "loss": 0.103, + "step": 24467 + }, + { + "epoch": 0.43641422609067887, + "grad_norm": 0.2955908179283142, + "learning_rate": 3.4653479858558225e-05, + "loss": 0.2337, + "step": 24468 + }, + { + "epoch": 0.43643206221239256, + "grad_norm": 0.2496703714132309, + "learning_rate": 3.4652044054172826e-05, + "loss": 0.1097, + "step": 24469 + }, + { + "epoch": 0.43644989833410625, + "grad_norm": 0.306892454624176, + "learning_rate": 3.465060821237277e-05, + "loss": 0.1421, + "step": 24470 + }, + { + "epoch": 0.43646773445581993, + "grad_norm": 0.26149633526802063, + "learning_rate": 3.464917233316363e-05, + "loss": 0.1664, + "step": 24471 + }, + { + "epoch": 0.4364855705775336, + "grad_norm": 0.2137775719165802, + "learning_rate": 3.464773641655096e-05, + "loss": 0.1328, + "step": 24472 + }, + { + "epoch": 0.4365034066992473, + "grad_norm": 0.269750714302063, + "learning_rate": 3.464630046254033e-05, + "loss": 0.1248, + "step": 24473 + }, + { + "epoch": 0.436521242820961, + "grad_norm": 0.26732224225997925, + "learning_rate": 3.464486447113731e-05, + "loss": 0.1183, + "step": 24474 + }, + { + "epoch": 0.4365390789426747, + "grad_norm": 0.2814632058143616, + "learning_rate": 3.464342844234746e-05, + "loss": 0.1526, + "step": 24475 + }, + { + "epoch": 0.4365569150643884, + "grad_norm": 0.20892545580863953, + "learning_rate": 3.4641992376176354e-05, + "loss": 0.1411, + "step": 24476 + }, + { + "epoch": 0.4365747511861021, + "grad_norm": 0.2442842721939087, + "learning_rate": 3.4640556272629556e-05, + "loss": 0.1517, + "step": 24477 + }, + { + "epoch": 0.4365925873078158, + "grad_norm": 0.26687225699424744, + "learning_rate": 3.463912013171263e-05, + "loss": 0.1477, + "step": 24478 + }, + { + "epoch": 0.4366104234295295, + "grad_norm": 0.2707096338272095, + "learning_rate": 3.463768395343114e-05, + "loss": 0.1751, + "step": 24479 + }, + { + "epoch": 0.4366282595512432, + "grad_norm": 0.2348027527332306, + "learning_rate": 3.4636247737790675e-05, + "loss": 0.0858, + "step": 24480 + }, + { + "epoch": 0.43664609567295687, + "grad_norm": 0.24467630684375763, + "learning_rate": 3.463481148479677e-05, + "loss": 0.1394, + "step": 24481 + }, + { + "epoch": 0.43666393179467056, + "grad_norm": 0.25188347697257996, + "learning_rate": 3.463337519445501e-05, + "loss": 0.1709, + "step": 24482 + }, + { + "epoch": 0.43668176791638424, + "grad_norm": 0.29841721057891846, + "learning_rate": 3.4631938866770956e-05, + "loss": 0.1824, + "step": 24483 + }, + { + "epoch": 0.43669960403809793, + "grad_norm": 0.2666328251361847, + "learning_rate": 3.46305025017502e-05, + "loss": 0.1518, + "step": 24484 + }, + { + "epoch": 0.4367174401598117, + "grad_norm": 0.2557719945907593, + "learning_rate": 3.462906609939826e-05, + "loss": 0.1764, + "step": 24485 + }, + { + "epoch": 0.43673527628152536, + "grad_norm": 0.31345266103744507, + "learning_rate": 3.462762965972076e-05, + "loss": 0.1676, + "step": 24486 + }, + { + "epoch": 0.43675311240323905, + "grad_norm": 0.2803944945335388, + "learning_rate": 3.462619318272323e-05, + "loss": 0.1516, + "step": 24487 + }, + { + "epoch": 0.43677094852495274, + "grad_norm": 0.26661330461502075, + "learning_rate": 3.462475666841126e-05, + "loss": 0.1724, + "step": 24488 + }, + { + "epoch": 0.43678878464666643, + "grad_norm": 0.22016002237796783, + "learning_rate": 3.46233201167904e-05, + "loss": 0.1776, + "step": 24489 + }, + { + "epoch": 0.4368066207683801, + "grad_norm": 0.35202881693840027, + "learning_rate": 3.4621883527866225e-05, + "loss": 0.1715, + "step": 24490 + }, + { + "epoch": 0.4368244568900938, + "grad_norm": 0.2893364131450653, + "learning_rate": 3.4620446901644316e-05, + "loss": 0.1546, + "step": 24491 + }, + { + "epoch": 0.4368422930118075, + "grad_norm": 0.321759968996048, + "learning_rate": 3.4619010238130224e-05, + "loss": 0.1142, + "step": 24492 + }, + { + "epoch": 0.4368601291335212, + "grad_norm": 0.4297880232334137, + "learning_rate": 3.461757353732953e-05, + "loss": 0.1349, + "step": 24493 + }, + { + "epoch": 0.4368779652552349, + "grad_norm": 0.2464224398136139, + "learning_rate": 3.46161367992478e-05, + "loss": 0.1487, + "step": 24494 + }, + { + "epoch": 0.4368958013769486, + "grad_norm": 0.29403191804885864, + "learning_rate": 3.46147000238906e-05, + "loss": 0.1272, + "step": 24495 + }, + { + "epoch": 0.4369136374986623, + "grad_norm": 0.30291152000427246, + "learning_rate": 3.4613263211263503e-05, + "loss": 0.0841, + "step": 24496 + }, + { + "epoch": 0.436931473620376, + "grad_norm": 0.2908466160297394, + "learning_rate": 3.461182636137208e-05, + "loss": 0.1706, + "step": 24497 + }, + { + "epoch": 0.4369493097420897, + "grad_norm": 0.3156099319458008, + "learning_rate": 3.4610389474221885e-05, + "loss": 0.1646, + "step": 24498 + }, + { + "epoch": 0.43696714586380336, + "grad_norm": 0.20026199519634247, + "learning_rate": 3.460895254981852e-05, + "loss": 0.1395, + "step": 24499 + }, + { + "epoch": 0.43698498198551705, + "grad_norm": 0.32203853130340576, + "learning_rate": 3.460751558816753e-05, + "loss": 0.142, + "step": 24500 + }, + { + "epoch": 0.43700281810723074, + "grad_norm": 0.32851582765579224, + "learning_rate": 3.4606078589274486e-05, + "loss": 0.1438, + "step": 24501 + }, + { + "epoch": 0.4370206542289445, + "grad_norm": 0.21360023319721222, + "learning_rate": 3.460464155314497e-05, + "loss": 0.1638, + "step": 24502 + }, + { + "epoch": 0.43703849035065817, + "grad_norm": 0.3464330732822418, + "learning_rate": 3.460320447978453e-05, + "loss": 0.1225, + "step": 24503 + }, + { + "epoch": 0.43705632647237186, + "grad_norm": 0.18150708079338074, + "learning_rate": 3.460176736919877e-05, + "loss": 0.144, + "step": 24504 + }, + { + "epoch": 0.43707416259408555, + "grad_norm": 0.3969244360923767, + "learning_rate": 3.460033022139324e-05, + "loss": 0.246, + "step": 24505 + }, + { + "epoch": 0.43709199871579923, + "grad_norm": 0.26365572214126587, + "learning_rate": 3.459889303637351e-05, + "loss": 0.1753, + "step": 24506 + }, + { + "epoch": 0.4371098348375129, + "grad_norm": 0.3229334354400635, + "learning_rate": 3.4597455814145164e-05, + "loss": 0.1668, + "step": 24507 + }, + { + "epoch": 0.4371276709592266, + "grad_norm": 0.2189752757549286, + "learning_rate": 3.459601855471376e-05, + "loss": 0.1523, + "step": 24508 + }, + { + "epoch": 0.4371455070809403, + "grad_norm": 0.31201139092445374, + "learning_rate": 3.459458125808487e-05, + "loss": 0.1672, + "step": 24509 + }, + { + "epoch": 0.43716334320265404, + "grad_norm": 0.2635055482387543, + "learning_rate": 3.4593143924264066e-05, + "loss": 0.1342, + "step": 24510 + }, + { + "epoch": 0.43718117932436773, + "grad_norm": 0.2361844927072525, + "learning_rate": 3.459170655325692e-05, + "loss": 0.1145, + "step": 24511 + }, + { + "epoch": 0.4371990154460814, + "grad_norm": 0.24420146644115448, + "learning_rate": 3.4590269145069024e-05, + "loss": 0.1457, + "step": 24512 + }, + { + "epoch": 0.4372168515677951, + "grad_norm": 0.18792513012886047, + "learning_rate": 3.458883169970592e-05, + "loss": 0.1066, + "step": 24513 + }, + { + "epoch": 0.4372346876895088, + "grad_norm": 0.24578148126602173, + "learning_rate": 3.4587394217173194e-05, + "loss": 0.1422, + "step": 24514 + }, + { + "epoch": 0.4372525238112225, + "grad_norm": 0.2214363068342209, + "learning_rate": 3.458595669747643e-05, + "loss": 0.1336, + "step": 24515 + }, + { + "epoch": 0.43727035993293617, + "grad_norm": 0.28363943099975586, + "learning_rate": 3.458451914062117e-05, + "loss": 0.1784, + "step": 24516 + }, + { + "epoch": 0.43728819605464986, + "grad_norm": 0.4099245071411133, + "learning_rate": 3.4583081546613006e-05, + "loss": 0.1431, + "step": 24517 + }, + { + "epoch": 0.43730603217636355, + "grad_norm": 0.23216953873634338, + "learning_rate": 3.458164391545751e-05, + "loss": 0.1441, + "step": 24518 + }, + { + "epoch": 0.4373238682980773, + "grad_norm": 0.23158235847949982, + "learning_rate": 3.458020624716025e-05, + "loss": 0.152, + "step": 24519 + }, + { + "epoch": 0.437341704419791, + "grad_norm": 0.3129504919052124, + "learning_rate": 3.457876854172681e-05, + "loss": 0.1511, + "step": 24520 + }, + { + "epoch": 0.43735954054150467, + "grad_norm": 0.27938613295555115, + "learning_rate": 3.457733079916275e-05, + "loss": 0.101, + "step": 24521 + }, + { + "epoch": 0.43737737666321835, + "grad_norm": 0.23409724235534668, + "learning_rate": 3.457589301947364e-05, + "loss": 0.1018, + "step": 24522 + }, + { + "epoch": 0.43739521278493204, + "grad_norm": 0.32318681478500366, + "learning_rate": 3.457445520266507e-05, + "loss": 0.1863, + "step": 24523 + }, + { + "epoch": 0.43741304890664573, + "grad_norm": 0.2505429685115814, + "learning_rate": 3.45730173487426e-05, + "loss": 0.1487, + "step": 24524 + }, + { + "epoch": 0.4374308850283594, + "grad_norm": 0.26734769344329834, + "learning_rate": 3.457157945771182e-05, + "loss": 0.1433, + "step": 24525 + }, + { + "epoch": 0.4374487211500731, + "grad_norm": 0.2437477856874466, + "learning_rate": 3.457014152957828e-05, + "loss": 0.1376, + "step": 24526 + }, + { + "epoch": 0.43746655727178685, + "grad_norm": 0.29736319184303284, + "learning_rate": 3.456870356434757e-05, + "loss": 0.1286, + "step": 24527 + }, + { + "epoch": 0.43748439339350054, + "grad_norm": 0.31667613983154297, + "learning_rate": 3.456726556202526e-05, + "loss": 0.1752, + "step": 24528 + }, + { + "epoch": 0.4375022295152142, + "grad_norm": 0.2735440731048584, + "learning_rate": 3.456582752261693e-05, + "loss": 0.1444, + "step": 24529 + }, + { + "epoch": 0.4375200656369279, + "grad_norm": 0.2536827027797699, + "learning_rate": 3.4564389446128134e-05, + "loss": 0.1595, + "step": 24530 + }, + { + "epoch": 0.4375379017586416, + "grad_norm": 0.26457610726356506, + "learning_rate": 3.456295133256447e-05, + "loss": 0.1388, + "step": 24531 + }, + { + "epoch": 0.4375557378803553, + "grad_norm": 0.1932777315378189, + "learning_rate": 3.456151318193151e-05, + "loss": 0.1535, + "step": 24532 + }, + { + "epoch": 0.437573574002069, + "grad_norm": 0.3857862651348114, + "learning_rate": 3.4560074994234816e-05, + "loss": 0.1309, + "step": 24533 + }, + { + "epoch": 0.43759141012378266, + "grad_norm": 0.18543091416358948, + "learning_rate": 3.455863676947997e-05, + "loss": 0.1318, + "step": 24534 + }, + { + "epoch": 0.43760924624549635, + "grad_norm": 0.3003429174423218, + "learning_rate": 3.455719850767254e-05, + "loss": 0.1418, + "step": 24535 + }, + { + "epoch": 0.4376270823672101, + "grad_norm": 0.3420927822589874, + "learning_rate": 3.455576020881812e-05, + "loss": 0.1492, + "step": 24536 + }, + { + "epoch": 0.4376449184889238, + "grad_norm": 0.2572770416736603, + "learning_rate": 3.4554321872922265e-05, + "loss": 0.1536, + "step": 24537 + }, + { + "epoch": 0.43766275461063747, + "grad_norm": 0.2656768262386322, + "learning_rate": 3.455288349999056e-05, + "loss": 0.2012, + "step": 24538 + }, + { + "epoch": 0.43768059073235116, + "grad_norm": 0.2661580741405487, + "learning_rate": 3.455144509002857e-05, + "loss": 0.1357, + "step": 24539 + }, + { + "epoch": 0.43769842685406485, + "grad_norm": 0.29712095856666565, + "learning_rate": 3.455000664304189e-05, + "loss": 0.1373, + "step": 24540 + }, + { + "epoch": 0.43771626297577854, + "grad_norm": 0.2587900757789612, + "learning_rate": 3.4548568159036096e-05, + "loss": 0.0936, + "step": 24541 + }, + { + "epoch": 0.4377340990974922, + "grad_norm": 0.3766099512577057, + "learning_rate": 3.454712963801674e-05, + "loss": 0.1425, + "step": 24542 + }, + { + "epoch": 0.4377519352192059, + "grad_norm": 0.28522539138793945, + "learning_rate": 3.454569107998942e-05, + "loss": 0.1802, + "step": 24543 + }, + { + "epoch": 0.43776977134091966, + "grad_norm": 0.25206080079078674, + "learning_rate": 3.4544252484959684e-05, + "loss": 0.1766, + "step": 24544 + }, + { + "epoch": 0.43778760746263334, + "grad_norm": 0.3596685528755188, + "learning_rate": 3.454281385293315e-05, + "loss": 0.1179, + "step": 24545 + }, + { + "epoch": 0.43780544358434703, + "grad_norm": 0.28925469517707825, + "learning_rate": 3.454137518391536e-05, + "loss": 0.1792, + "step": 24546 + }, + { + "epoch": 0.4378232797060607, + "grad_norm": 0.42516639828681946, + "learning_rate": 3.4539936477911916e-05, + "loss": 0.1905, + "step": 24547 + }, + { + "epoch": 0.4378411158277744, + "grad_norm": 0.32832595705986023, + "learning_rate": 3.4538497734928374e-05, + "loss": 0.1689, + "step": 24548 + }, + { + "epoch": 0.4378589519494881, + "grad_norm": 0.24498018622398376, + "learning_rate": 3.453705895497034e-05, + "loss": 0.1372, + "step": 24549 + }, + { + "epoch": 0.4378767880712018, + "grad_norm": 0.3208106458187103, + "learning_rate": 3.453562013804335e-05, + "loss": 0.1533, + "step": 24550 + }, + { + "epoch": 0.43789462419291547, + "grad_norm": 0.2686372995376587, + "learning_rate": 3.4534181284153005e-05, + "loss": 0.2071, + "step": 24551 + }, + { + "epoch": 0.43791246031462916, + "grad_norm": 0.28134483098983765, + "learning_rate": 3.4532742393304886e-05, + "loss": 0.1201, + "step": 24552 + }, + { + "epoch": 0.4379302964363429, + "grad_norm": 0.21770042181015015, + "learning_rate": 3.453130346550457e-05, + "loss": 0.1086, + "step": 24553 + }, + { + "epoch": 0.4379481325580566, + "grad_norm": 0.19623777270317078, + "learning_rate": 3.452986450075762e-05, + "loss": 0.1468, + "step": 24554 + }, + { + "epoch": 0.4379659686797703, + "grad_norm": 0.25837016105651855, + "learning_rate": 3.4528425499069625e-05, + "loss": 0.127, + "step": 24555 + }, + { + "epoch": 0.43798380480148397, + "grad_norm": 0.202680766582489, + "learning_rate": 3.452698646044617e-05, + "loss": 0.0911, + "step": 24556 + }, + { + "epoch": 0.43800164092319765, + "grad_norm": 0.2903212904930115, + "learning_rate": 3.452554738489282e-05, + "loss": 0.1546, + "step": 24557 + }, + { + "epoch": 0.43801947704491134, + "grad_norm": 0.27089986205101013, + "learning_rate": 3.452410827241515e-05, + "loss": 0.204, + "step": 24558 + }, + { + "epoch": 0.43803731316662503, + "grad_norm": 0.2867121696472168, + "learning_rate": 3.452266912301875e-05, + "loss": 0.1661, + "step": 24559 + }, + { + "epoch": 0.4380551492883387, + "grad_norm": 0.21074457466602325, + "learning_rate": 3.45212299367092e-05, + "loss": 0.0843, + "step": 24560 + }, + { + "epoch": 0.43807298541005246, + "grad_norm": 0.26539722084999084, + "learning_rate": 3.451979071349208e-05, + "loss": 0.1901, + "step": 24561 + }, + { + "epoch": 0.43809082153176615, + "grad_norm": 0.28294482827186584, + "learning_rate": 3.451835145337295e-05, + "loss": 0.1707, + "step": 24562 + }, + { + "epoch": 0.43810865765347984, + "grad_norm": 0.2809041738510132, + "learning_rate": 3.451691215635742e-05, + "loss": 0.1798, + "step": 24563 + }, + { + "epoch": 0.4381264937751935, + "grad_norm": 0.3937005400657654, + "learning_rate": 3.4515472822451037e-05, + "loss": 0.2056, + "step": 24564 + }, + { + "epoch": 0.4381443298969072, + "grad_norm": 0.18566061556339264, + "learning_rate": 3.45140334516594e-05, + "loss": 0.1391, + "step": 24565 + }, + { + "epoch": 0.4381621660186209, + "grad_norm": 0.37465575337409973, + "learning_rate": 3.451259404398808e-05, + "loss": 0.1641, + "step": 24566 + }, + { + "epoch": 0.4381800021403346, + "grad_norm": 0.28893840312957764, + "learning_rate": 3.4511154599442666e-05, + "loss": 0.155, + "step": 24567 + }, + { + "epoch": 0.4381978382620483, + "grad_norm": 0.2862136662006378, + "learning_rate": 3.450971511802872e-05, + "loss": 0.1797, + "step": 24568 + }, + { + "epoch": 0.438215674383762, + "grad_norm": 0.31373411417007446, + "learning_rate": 3.450827559975185e-05, + "loss": 0.1659, + "step": 24569 + }, + { + "epoch": 0.4382335105054757, + "grad_norm": 0.2854815125465393, + "learning_rate": 3.450683604461761e-05, + "loss": 0.1598, + "step": 24570 + }, + { + "epoch": 0.4382513466271894, + "grad_norm": 0.27923718094825745, + "learning_rate": 3.450539645263159e-05, + "loss": 0.1418, + "step": 24571 + }, + { + "epoch": 0.4382691827489031, + "grad_norm": 0.33809757232666016, + "learning_rate": 3.450395682379936e-05, + "loss": 0.106, + "step": 24572 + }, + { + "epoch": 0.4382870188706168, + "grad_norm": 0.22958843410015106, + "learning_rate": 3.4502517158126527e-05, + "loss": 0.1633, + "step": 24573 + }, + { + "epoch": 0.43830485499233046, + "grad_norm": 0.28078243136405945, + "learning_rate": 3.450107745561865e-05, + "loss": 0.1625, + "step": 24574 + }, + { + "epoch": 0.43832269111404415, + "grad_norm": 0.2779524624347687, + "learning_rate": 3.449963771628132e-05, + "loss": 0.1436, + "step": 24575 + }, + { + "epoch": 0.43834052723575784, + "grad_norm": 0.23051893711090088, + "learning_rate": 3.4498197940120106e-05, + "loss": 0.1781, + "step": 24576 + }, + { + "epoch": 0.4383583633574715, + "grad_norm": 0.23626069724559784, + "learning_rate": 3.44967581271406e-05, + "loss": 0.1549, + "step": 24577 + }, + { + "epoch": 0.43837619947918527, + "grad_norm": 0.4216504991054535, + "learning_rate": 3.449531827734838e-05, + "loss": 0.137, + "step": 24578 + }, + { + "epoch": 0.43839403560089896, + "grad_norm": 0.3295765817165375, + "learning_rate": 3.4493878390749016e-05, + "loss": 0.2072, + "step": 24579 + }, + { + "epoch": 0.43841187172261264, + "grad_norm": 0.24754376709461212, + "learning_rate": 3.449243846734812e-05, + "loss": 0.1263, + "step": 24580 + }, + { + "epoch": 0.43842970784432633, + "grad_norm": 0.2161509245634079, + "learning_rate": 3.449099850715123e-05, + "loss": 0.1852, + "step": 24581 + }, + { + "epoch": 0.43844754396604, + "grad_norm": 0.27608081698417664, + "learning_rate": 3.4489558510163974e-05, + "loss": 0.1585, + "step": 24582 + }, + { + "epoch": 0.4384653800877537, + "grad_norm": 0.39963284134864807, + "learning_rate": 3.44881184763919e-05, + "loss": 0.1747, + "step": 24583 + }, + { + "epoch": 0.4384832162094674, + "grad_norm": 0.27768880128860474, + "learning_rate": 3.448667840584061e-05, + "loss": 0.1402, + "step": 24584 + }, + { + "epoch": 0.4385010523311811, + "grad_norm": 0.24097082018852234, + "learning_rate": 3.4485238298515665e-05, + "loss": 0.1328, + "step": 24585 + }, + { + "epoch": 0.43851888845289483, + "grad_norm": 0.27341970801353455, + "learning_rate": 3.448379815442267e-05, + "loss": 0.1978, + "step": 24586 + }, + { + "epoch": 0.4385367245746085, + "grad_norm": 0.26556727290153503, + "learning_rate": 3.448235797356719e-05, + "loss": 0.1306, + "step": 24587 + }, + { + "epoch": 0.4385545606963222, + "grad_norm": 0.35834696888923645, + "learning_rate": 3.4480917755954825e-05, + "loss": 0.227, + "step": 24588 + }, + { + "epoch": 0.4385723968180359, + "grad_norm": 0.24175631999969482, + "learning_rate": 3.447947750159114e-05, + "loss": 0.1613, + "step": 24589 + }, + { + "epoch": 0.4385902329397496, + "grad_norm": 0.20927119255065918, + "learning_rate": 3.4478037210481737e-05, + "loss": 0.1202, + "step": 24590 + }, + { + "epoch": 0.43860806906146327, + "grad_norm": 0.3092963695526123, + "learning_rate": 3.447659688263218e-05, + "loss": 0.1408, + "step": 24591 + }, + { + "epoch": 0.43862590518317696, + "grad_norm": 0.26060038805007935, + "learning_rate": 3.4475156518048066e-05, + "loss": 0.1206, + "step": 24592 + }, + { + "epoch": 0.43864374130489064, + "grad_norm": 0.25653448700904846, + "learning_rate": 3.447371611673496e-05, + "loss": 0.1902, + "step": 24593 + }, + { + "epoch": 0.43866157742660433, + "grad_norm": 0.2920917570590973, + "learning_rate": 3.447227567869846e-05, + "loss": 0.2089, + "step": 24594 + }, + { + "epoch": 0.4386794135483181, + "grad_norm": 0.3284919261932373, + "learning_rate": 3.4470835203944166e-05, + "loss": 0.131, + "step": 24595 + }, + { + "epoch": 0.43869724967003176, + "grad_norm": 0.23107919096946716, + "learning_rate": 3.4469394692477626e-05, + "loss": 0.1198, + "step": 24596 + }, + { + "epoch": 0.43871508579174545, + "grad_norm": 0.2584525942802429, + "learning_rate": 3.4467954144304446e-05, + "loss": 0.1476, + "step": 24597 + }, + { + "epoch": 0.43873292191345914, + "grad_norm": 0.2407422959804535, + "learning_rate": 3.446651355943021e-05, + "loss": 0.1221, + "step": 24598 + }, + { + "epoch": 0.4387507580351728, + "grad_norm": 0.26085159182548523, + "learning_rate": 3.446507293786049e-05, + "loss": 0.1597, + "step": 24599 + }, + { + "epoch": 0.4387685941568865, + "grad_norm": 0.3187166750431061, + "learning_rate": 3.446363227960088e-05, + "loss": 0.1808, + "step": 24600 + }, + { + "epoch": 0.4387864302786002, + "grad_norm": 0.1783655285835266, + "learning_rate": 3.446219158465697e-05, + "loss": 0.1238, + "step": 24601 + }, + { + "epoch": 0.4388042664003139, + "grad_norm": 0.3295961022377014, + "learning_rate": 3.446075085303433e-05, + "loss": 0.1156, + "step": 24602 + }, + { + "epoch": 0.43882210252202764, + "grad_norm": 0.3164421319961548, + "learning_rate": 3.445931008473856e-05, + "loss": 0.1384, + "step": 24603 + }, + { + "epoch": 0.4388399386437413, + "grad_norm": 0.24440906941890717, + "learning_rate": 3.445786927977523e-05, + "loss": 0.1914, + "step": 24604 + }, + { + "epoch": 0.438857774765455, + "grad_norm": 0.24379198253154755, + "learning_rate": 3.445642843814994e-05, + "loss": 0.186, + "step": 24605 + }, + { + "epoch": 0.4388756108871687, + "grad_norm": 0.2811765968799591, + "learning_rate": 3.445498755986826e-05, + "loss": 0.1611, + "step": 24606 + }, + { + "epoch": 0.4388934470088824, + "grad_norm": 0.35629937052726746, + "learning_rate": 3.4453546644935776e-05, + "loss": 0.1557, + "step": 24607 + }, + { + "epoch": 0.4389112831305961, + "grad_norm": 0.31019970774650574, + "learning_rate": 3.445210569335809e-05, + "loss": 0.1727, + "step": 24608 + }, + { + "epoch": 0.43892911925230976, + "grad_norm": 0.2642403244972229, + "learning_rate": 3.445066470514078e-05, + "loss": 0.1563, + "step": 24609 + }, + { + "epoch": 0.43894695537402345, + "grad_norm": 0.40901580452919006, + "learning_rate": 3.444922368028942e-05, + "loss": 0.1801, + "step": 24610 + }, + { + "epoch": 0.4389647914957372, + "grad_norm": 0.2804419696331024, + "learning_rate": 3.444778261880961e-05, + "loss": 0.2067, + "step": 24611 + }, + { + "epoch": 0.4389826276174509, + "grad_norm": 0.32510489225387573, + "learning_rate": 3.444634152070694e-05, + "loss": 0.2355, + "step": 24612 + }, + { + "epoch": 0.43900046373916457, + "grad_norm": 0.33013513684272766, + "learning_rate": 3.444490038598697e-05, + "loss": 0.1408, + "step": 24613 + }, + { + "epoch": 0.43901829986087826, + "grad_norm": 0.34253761172294617, + "learning_rate": 3.444345921465532e-05, + "loss": 0.1769, + "step": 24614 + }, + { + "epoch": 0.43903613598259195, + "grad_norm": 0.256433367729187, + "learning_rate": 3.444201800671755e-05, + "loss": 0.1645, + "step": 24615 + }, + { + "epoch": 0.43905397210430563, + "grad_norm": 0.25067293643951416, + "learning_rate": 3.444057676217926e-05, + "loss": 0.1168, + "step": 24616 + }, + { + "epoch": 0.4390718082260193, + "grad_norm": 0.31757479906082153, + "learning_rate": 3.443913548104603e-05, + "loss": 0.1463, + "step": 24617 + }, + { + "epoch": 0.439089644347733, + "grad_norm": 0.29563412070274353, + "learning_rate": 3.4437694163323464e-05, + "loss": 0.1332, + "step": 24618 + }, + { + "epoch": 0.4391074804694467, + "grad_norm": 0.44224053621292114, + "learning_rate": 3.443625280901713e-05, + "loss": 0.1571, + "step": 24619 + }, + { + "epoch": 0.43912531659116044, + "grad_norm": 0.2787693738937378, + "learning_rate": 3.443481141813261e-05, + "loss": 0.162, + "step": 24620 + }, + { + "epoch": 0.43914315271287413, + "grad_norm": 0.2649666965007782, + "learning_rate": 3.4433369990675515e-05, + "loss": 0.1063, + "step": 24621 + }, + { + "epoch": 0.4391609888345878, + "grad_norm": 0.26307302713394165, + "learning_rate": 3.443192852665141e-05, + "loss": 0.14, + "step": 24622 + }, + { + "epoch": 0.4391788249563015, + "grad_norm": 0.22876949608325958, + "learning_rate": 3.44304870260659e-05, + "loss": 0.1396, + "step": 24623 + }, + { + "epoch": 0.4391966610780152, + "grad_norm": 0.3244122564792633, + "learning_rate": 3.442904548892456e-05, + "loss": 0.2014, + "step": 24624 + }, + { + "epoch": 0.4392144971997289, + "grad_norm": 0.37037718296051025, + "learning_rate": 3.442760391523299e-05, + "loss": 0.1614, + "step": 24625 + }, + { + "epoch": 0.43923233332144257, + "grad_norm": 0.2491759955883026, + "learning_rate": 3.442616230499676e-05, + "loss": 0.1962, + "step": 24626 + }, + { + "epoch": 0.43925016944315626, + "grad_norm": 0.23094677925109863, + "learning_rate": 3.4424720658221474e-05, + "loss": 0.1333, + "step": 24627 + }, + { + "epoch": 0.43926800556487, + "grad_norm": 0.31826314330101013, + "learning_rate": 3.4423278974912716e-05, + "loss": 0.175, + "step": 24628 + }, + { + "epoch": 0.4392858416865837, + "grad_norm": 0.24727092683315277, + "learning_rate": 3.4421837255076075e-05, + "loss": 0.1795, + "step": 24629 + }, + { + "epoch": 0.4393036778082974, + "grad_norm": 0.24087651073932648, + "learning_rate": 3.442039549871714e-05, + "loss": 0.1253, + "step": 24630 + }, + { + "epoch": 0.43932151393001106, + "grad_norm": 0.3798896074295044, + "learning_rate": 3.44189537058415e-05, + "loss": 0.1788, + "step": 24631 + }, + { + "epoch": 0.43933935005172475, + "grad_norm": 0.26776382327079773, + "learning_rate": 3.441751187645474e-05, + "loss": 0.1496, + "step": 24632 + }, + { + "epoch": 0.43935718617343844, + "grad_norm": 0.30220523476600647, + "learning_rate": 3.441607001056245e-05, + "loss": 0.139, + "step": 24633 + }, + { + "epoch": 0.43937502229515213, + "grad_norm": 0.2621837258338928, + "learning_rate": 3.4414628108170225e-05, + "loss": 0.1447, + "step": 24634 + }, + { + "epoch": 0.4393928584168658, + "grad_norm": 0.30371183156967163, + "learning_rate": 3.441318616928364e-05, + "loss": 0.073, + "step": 24635 + }, + { + "epoch": 0.4394106945385795, + "grad_norm": 0.25109121203422546, + "learning_rate": 3.44117441939083e-05, + "loss": 0.1442, + "step": 24636 + }, + { + "epoch": 0.43942853066029325, + "grad_norm": 0.2872004508972168, + "learning_rate": 3.4410302182049786e-05, + "loss": 0.1212, + "step": 24637 + }, + { + "epoch": 0.43944636678200694, + "grad_norm": 0.282455712556839, + "learning_rate": 3.44088601337137e-05, + "loss": 0.1764, + "step": 24638 + }, + { + "epoch": 0.4394642029037206, + "grad_norm": 0.23103205859661102, + "learning_rate": 3.440741804890562e-05, + "loss": 0.1326, + "step": 24639 + }, + { + "epoch": 0.4394820390254343, + "grad_norm": 0.27019548416137695, + "learning_rate": 3.440597592763113e-05, + "loss": 0.1543, + "step": 24640 + }, + { + "epoch": 0.439499875147148, + "grad_norm": 0.23728132247924805, + "learning_rate": 3.440453376989583e-05, + "loss": 0.1504, + "step": 24641 + }, + { + "epoch": 0.4395177112688617, + "grad_norm": 0.3127742409706116, + "learning_rate": 3.440309157570531e-05, + "loss": 0.2093, + "step": 24642 + }, + { + "epoch": 0.4395355473905754, + "grad_norm": 0.22227708995342255, + "learning_rate": 3.4401649345065156e-05, + "loss": 0.1367, + "step": 24643 + }, + { + "epoch": 0.43955338351228906, + "grad_norm": 0.3436882495880127, + "learning_rate": 3.440020707798097e-05, + "loss": 0.155, + "step": 24644 + }, + { + "epoch": 0.4395712196340028, + "grad_norm": 0.29666784405708313, + "learning_rate": 3.439876477445834e-05, + "loss": 0.1612, + "step": 24645 + }, + { + "epoch": 0.4395890557557165, + "grad_norm": 0.24896258115768433, + "learning_rate": 3.439732243450284e-05, + "loss": 0.1943, + "step": 24646 + }, + { + "epoch": 0.4396068918774302, + "grad_norm": 0.256962388753891, + "learning_rate": 3.439588005812008e-05, + "loss": 0.1773, + "step": 24647 + }, + { + "epoch": 0.43962472799914387, + "grad_norm": 0.260832816362381, + "learning_rate": 3.4394437645315634e-05, + "loss": 0.1549, + "step": 24648 + }, + { + "epoch": 0.43964256412085756, + "grad_norm": 0.22145824134349823, + "learning_rate": 3.43929951960951e-05, + "loss": 0.134, + "step": 24649 + }, + { + "epoch": 0.43966040024257125, + "grad_norm": 0.30688732862472534, + "learning_rate": 3.4391552710464084e-05, + "loss": 0.1359, + "step": 24650 + }, + { + "epoch": 0.43967823636428494, + "grad_norm": 0.1992262452840805, + "learning_rate": 3.4390110188428166e-05, + "loss": 0.1442, + "step": 24651 + }, + { + "epoch": 0.4396960724859986, + "grad_norm": 0.2667248845100403, + "learning_rate": 3.438866762999293e-05, + "loss": 0.1498, + "step": 24652 + }, + { + "epoch": 0.4397139086077123, + "grad_norm": 0.2612917423248291, + "learning_rate": 3.438722503516399e-05, + "loss": 0.1383, + "step": 24653 + }, + { + "epoch": 0.43973174472942606, + "grad_norm": 0.3943200707435608, + "learning_rate": 3.4385782403946905e-05, + "loss": 0.2004, + "step": 24654 + }, + { + "epoch": 0.43974958085113974, + "grad_norm": 0.3169809579849243, + "learning_rate": 3.438433973634729e-05, + "loss": 0.1094, + "step": 24655 + }, + { + "epoch": 0.43976741697285343, + "grad_norm": 0.30927079916000366, + "learning_rate": 3.438289703237074e-05, + "loss": 0.1607, + "step": 24656 + }, + { + "epoch": 0.4397852530945671, + "grad_norm": 0.3377167880535126, + "learning_rate": 3.438145429202284e-05, + "loss": 0.1987, + "step": 24657 + }, + { + "epoch": 0.4398030892162808, + "grad_norm": 0.31129828095436096, + "learning_rate": 3.438001151530918e-05, + "loss": 0.2062, + "step": 24658 + }, + { + "epoch": 0.4398209253379945, + "grad_norm": 0.26784205436706543, + "learning_rate": 3.437856870223535e-05, + "loss": 0.167, + "step": 24659 + }, + { + "epoch": 0.4398387614597082, + "grad_norm": 0.3814446032047272, + "learning_rate": 3.4377125852806956e-05, + "loss": 0.1448, + "step": 24660 + }, + { + "epoch": 0.43985659758142187, + "grad_norm": 0.2507796287536621, + "learning_rate": 3.4375682967029585e-05, + "loss": 0.1706, + "step": 24661 + }, + { + "epoch": 0.4398744337031356, + "grad_norm": 0.24941763281822205, + "learning_rate": 3.437424004490882e-05, + "loss": 0.1016, + "step": 24662 + }, + { + "epoch": 0.4398922698248493, + "grad_norm": 0.37323588132858276, + "learning_rate": 3.4372797086450265e-05, + "loss": 0.1246, + "step": 24663 + }, + { + "epoch": 0.439910105946563, + "grad_norm": 0.19328205287456512, + "learning_rate": 3.437135409165952e-05, + "loss": 0.1425, + "step": 24664 + }, + { + "epoch": 0.4399279420682767, + "grad_norm": 0.40340539813041687, + "learning_rate": 3.436991106054216e-05, + "loss": 0.2477, + "step": 24665 + }, + { + "epoch": 0.43994577818999037, + "grad_norm": 0.2697802484035492, + "learning_rate": 3.436846799310379e-05, + "loss": 0.1022, + "step": 24666 + }, + { + "epoch": 0.43996361431170405, + "grad_norm": 0.21388310194015503, + "learning_rate": 3.4367024889350006e-05, + "loss": 0.1683, + "step": 24667 + }, + { + "epoch": 0.43998145043341774, + "grad_norm": 0.4298039972782135, + "learning_rate": 3.43655817492864e-05, + "loss": 0.148, + "step": 24668 + }, + { + "epoch": 0.43999928655513143, + "grad_norm": 0.1701425164937973, + "learning_rate": 3.436413857291856e-05, + "loss": 0.0935, + "step": 24669 + }, + { + "epoch": 0.4400171226768452, + "grad_norm": 0.2667483389377594, + "learning_rate": 3.4362695360252086e-05, + "loss": 0.1344, + "step": 24670 + }, + { + "epoch": 0.44003495879855886, + "grad_norm": 0.2699483036994934, + "learning_rate": 3.4361252111292575e-05, + "loss": 0.1451, + "step": 24671 + }, + { + "epoch": 0.44005279492027255, + "grad_norm": 0.21968398988246918, + "learning_rate": 3.435980882604561e-05, + "loss": 0.134, + "step": 24672 + }, + { + "epoch": 0.44007063104198624, + "grad_norm": 0.3413139879703522, + "learning_rate": 3.43583655045168e-05, + "loss": 0.2286, + "step": 24673 + }, + { + "epoch": 0.4400884671636999, + "grad_norm": 0.27021604776382446, + "learning_rate": 3.435692214671172e-05, + "loss": 0.1253, + "step": 24674 + }, + { + "epoch": 0.4401063032854136, + "grad_norm": 0.2780676782131195, + "learning_rate": 3.4355478752636e-05, + "loss": 0.1533, + "step": 24675 + }, + { + "epoch": 0.4401241394071273, + "grad_norm": 0.2237774133682251, + "learning_rate": 3.435403532229519e-05, + "loss": 0.1375, + "step": 24676 + }, + { + "epoch": 0.440141975528841, + "grad_norm": 0.22570393979549408, + "learning_rate": 3.435259185569492e-05, + "loss": 0.116, + "step": 24677 + }, + { + "epoch": 0.4401598116505547, + "grad_norm": 0.46217167377471924, + "learning_rate": 3.435114835284077e-05, + "loss": 0.1292, + "step": 24678 + }, + { + "epoch": 0.4401776477722684, + "grad_norm": 0.2668130397796631, + "learning_rate": 3.434970481373835e-05, + "loss": 0.1282, + "step": 24679 + }, + { + "epoch": 0.4401954838939821, + "grad_norm": 0.36065685749053955, + "learning_rate": 3.434826123839323e-05, + "loss": 0.1298, + "step": 24680 + }, + { + "epoch": 0.4402133200156958, + "grad_norm": 0.24330353736877441, + "learning_rate": 3.4346817626811036e-05, + "loss": 0.1644, + "step": 24681 + }, + { + "epoch": 0.4402311561374095, + "grad_norm": 0.38841748237609863, + "learning_rate": 3.434537397899734e-05, + "loss": 0.1444, + "step": 24682 + }, + { + "epoch": 0.4402489922591232, + "grad_norm": 0.3205896019935608, + "learning_rate": 3.434393029495774e-05, + "loss": 0.1526, + "step": 24683 + }, + { + "epoch": 0.44026682838083686, + "grad_norm": 0.3153390884399414, + "learning_rate": 3.434248657469784e-05, + "loss": 0.1261, + "step": 24684 + }, + { + "epoch": 0.44028466450255055, + "grad_norm": 0.3627713918685913, + "learning_rate": 3.4341042818223246e-05, + "loss": 0.1599, + "step": 24685 + }, + { + "epoch": 0.44030250062426424, + "grad_norm": 0.26903846859931946, + "learning_rate": 3.4339599025539544e-05, + "loss": 0.1924, + "step": 24686 + }, + { + "epoch": 0.440320336745978, + "grad_norm": 0.40315642952919006, + "learning_rate": 3.433815519665232e-05, + "loss": 0.1628, + "step": 24687 + }, + { + "epoch": 0.44033817286769167, + "grad_norm": 0.3624573349952698, + "learning_rate": 3.433671133156719e-05, + "loss": 0.1396, + "step": 24688 + }, + { + "epoch": 0.44035600898940536, + "grad_norm": 0.3920454680919647, + "learning_rate": 3.4335267430289735e-05, + "loss": 0.1536, + "step": 24689 + }, + { + "epoch": 0.44037384511111904, + "grad_norm": 0.19093580543994904, + "learning_rate": 3.4333823492825564e-05, + "loss": 0.1312, + "step": 24690 + }, + { + "epoch": 0.44039168123283273, + "grad_norm": 0.28893616795539856, + "learning_rate": 3.4332379519180266e-05, + "loss": 0.155, + "step": 24691 + }, + { + "epoch": 0.4404095173545464, + "grad_norm": 0.25348326563835144, + "learning_rate": 3.4330935509359444e-05, + "loss": 0.1524, + "step": 24692 + }, + { + "epoch": 0.4404273534762601, + "grad_norm": 0.29898956418037415, + "learning_rate": 3.432949146336869e-05, + "loss": 0.1733, + "step": 24693 + }, + { + "epoch": 0.4404451895979738, + "grad_norm": 0.27690890431404114, + "learning_rate": 3.432804738121361e-05, + "loss": 0.1445, + "step": 24694 + }, + { + "epoch": 0.4404630257196875, + "grad_norm": 0.36643916368484497, + "learning_rate": 3.4326603262899795e-05, + "loss": 0.196, + "step": 24695 + }, + { + "epoch": 0.44048086184140123, + "grad_norm": 0.29113900661468506, + "learning_rate": 3.4325159108432844e-05, + "loss": 0.1509, + "step": 24696 + }, + { + "epoch": 0.4404986979631149, + "grad_norm": 0.3071874976158142, + "learning_rate": 3.4323714917818355e-05, + "loss": 0.1373, + "step": 24697 + }, + { + "epoch": 0.4405165340848286, + "grad_norm": 0.2989560067653656, + "learning_rate": 3.4322270691061926e-05, + "loss": 0.1384, + "step": 24698 + }, + { + "epoch": 0.4405343702065423, + "grad_norm": 0.2771482467651367, + "learning_rate": 3.4320826428169154e-05, + "loss": 0.1657, + "step": 24699 + }, + { + "epoch": 0.440552206328256, + "grad_norm": 0.30179348587989807, + "learning_rate": 3.431938212914564e-05, + "loss": 0.0988, + "step": 24700 + }, + { + "epoch": 0.44057004244996967, + "grad_norm": 0.29966628551483154, + "learning_rate": 3.431793779399699e-05, + "loss": 0.1558, + "step": 24701 + }, + { + "epoch": 0.44058787857168336, + "grad_norm": 0.31375232338905334, + "learning_rate": 3.4316493422728784e-05, + "loss": 0.1745, + "step": 24702 + }, + { + "epoch": 0.44060571469339704, + "grad_norm": 0.26222479343414307, + "learning_rate": 3.431504901534663e-05, + "loss": 0.1555, + "step": 24703 + }, + { + "epoch": 0.4406235508151108, + "grad_norm": 0.27539992332458496, + "learning_rate": 3.431360457185614e-05, + "loss": 0.1679, + "step": 24704 + }, + { + "epoch": 0.4406413869368245, + "grad_norm": 0.48759081959724426, + "learning_rate": 3.43121600922629e-05, + "loss": 0.1947, + "step": 24705 + }, + { + "epoch": 0.44065922305853816, + "grad_norm": 0.24323388934135437, + "learning_rate": 3.4310715576572506e-05, + "loss": 0.1694, + "step": 24706 + }, + { + "epoch": 0.44067705918025185, + "grad_norm": 0.26869115233421326, + "learning_rate": 3.430927102479057e-05, + "loss": 0.1687, + "step": 24707 + }, + { + "epoch": 0.44069489530196554, + "grad_norm": 0.2161533385515213, + "learning_rate": 3.4307826436922676e-05, + "loss": 0.1644, + "step": 24708 + }, + { + "epoch": 0.4407127314236792, + "grad_norm": 0.21348880231380463, + "learning_rate": 3.430638181297444e-05, + "loss": 0.1019, + "step": 24709 + }, + { + "epoch": 0.4407305675453929, + "grad_norm": 0.30145740509033203, + "learning_rate": 3.430493715295144e-05, + "loss": 0.2164, + "step": 24710 + }, + { + "epoch": 0.4407484036671066, + "grad_norm": 0.1956244558095932, + "learning_rate": 3.43034924568593e-05, + "loss": 0.1401, + "step": 24711 + }, + { + "epoch": 0.44076623978882035, + "grad_norm": 0.23823858797550201, + "learning_rate": 3.4302047724703615e-05, + "loss": 0.1284, + "step": 24712 + }, + { + "epoch": 0.44078407591053403, + "grad_norm": 0.31276917457580566, + "learning_rate": 3.430060295648997e-05, + "loss": 0.1235, + "step": 24713 + }, + { + "epoch": 0.4408019120322477, + "grad_norm": 0.21205952763557434, + "learning_rate": 3.429915815222398e-05, + "loss": 0.1315, + "step": 24714 + }, + { + "epoch": 0.4408197481539614, + "grad_norm": 0.3925333023071289, + "learning_rate": 3.429771331191124e-05, + "loss": 0.1959, + "step": 24715 + }, + { + "epoch": 0.4408375842756751, + "grad_norm": 0.2393421232700348, + "learning_rate": 3.429626843555736e-05, + "loss": 0.1886, + "step": 24716 + }, + { + "epoch": 0.4408554203973888, + "grad_norm": 0.20441681146621704, + "learning_rate": 3.429482352316792e-05, + "loss": 0.1081, + "step": 24717 + }, + { + "epoch": 0.4408732565191025, + "grad_norm": 0.2991001009941101, + "learning_rate": 3.4293378574748534e-05, + "loss": 0.151, + "step": 24718 + }, + { + "epoch": 0.44089109264081616, + "grad_norm": 0.23300915956497192, + "learning_rate": 3.42919335903048e-05, + "loss": 0.1753, + "step": 24719 + }, + { + "epoch": 0.44090892876252985, + "grad_norm": 0.25761520862579346, + "learning_rate": 3.429048856984234e-05, + "loss": 0.1374, + "step": 24720 + }, + { + "epoch": 0.4409267648842436, + "grad_norm": 0.30491897463798523, + "learning_rate": 3.428904351336673e-05, + "loss": 0.1902, + "step": 24721 + }, + { + "epoch": 0.4409446010059573, + "grad_norm": 0.26109182834625244, + "learning_rate": 3.428759842088357e-05, + "loss": 0.1635, + "step": 24722 + }, + { + "epoch": 0.44096243712767097, + "grad_norm": 0.2662157118320465, + "learning_rate": 3.428615329239848e-05, + "loss": 0.1718, + "step": 24723 + }, + { + "epoch": 0.44098027324938466, + "grad_norm": 0.33706337213516235, + "learning_rate": 3.428470812791705e-05, + "loss": 0.1498, + "step": 24724 + }, + { + "epoch": 0.44099810937109835, + "grad_norm": 0.2901989221572876, + "learning_rate": 3.428326292744488e-05, + "loss": 0.1666, + "step": 24725 + }, + { + "epoch": 0.44101594549281203, + "grad_norm": 0.3397957384586334, + "learning_rate": 3.428181769098758e-05, + "loss": 0.1322, + "step": 24726 + }, + { + "epoch": 0.4410337816145257, + "grad_norm": 0.3570844233036041, + "learning_rate": 3.428037241855075e-05, + "loss": 0.2176, + "step": 24727 + }, + { + "epoch": 0.4410516177362394, + "grad_norm": 0.34804993867874146, + "learning_rate": 3.4278927110139994e-05, + "loss": 0.1086, + "step": 24728 + }, + { + "epoch": 0.44106945385795315, + "grad_norm": 0.2614504396915436, + "learning_rate": 3.427748176576091e-05, + "loss": 0.1642, + "step": 24729 + }, + { + "epoch": 0.44108728997966684, + "grad_norm": 0.27107861638069153, + "learning_rate": 3.4276036385419094e-05, + "loss": 0.1239, + "step": 24730 + }, + { + "epoch": 0.44110512610138053, + "grad_norm": 0.2110345959663391, + "learning_rate": 3.4274590969120154e-05, + "loss": 0.1847, + "step": 24731 + }, + { + "epoch": 0.4411229622230942, + "grad_norm": 0.24015846848487854, + "learning_rate": 3.427314551686971e-05, + "loss": 0.167, + "step": 24732 + }, + { + "epoch": 0.4411407983448079, + "grad_norm": 0.23190492391586304, + "learning_rate": 3.4271700028673345e-05, + "loss": 0.1443, + "step": 24733 + }, + { + "epoch": 0.4411586344665216, + "grad_norm": 0.20426729321479797, + "learning_rate": 3.427025450453667e-05, + "loss": 0.1313, + "step": 24734 + }, + { + "epoch": 0.4411764705882353, + "grad_norm": 0.35279545187950134, + "learning_rate": 3.426880894446529e-05, + "loss": 0.1829, + "step": 24735 + }, + { + "epoch": 0.44119430670994897, + "grad_norm": 0.28399360179901123, + "learning_rate": 3.4267363348464796e-05, + "loss": 0.1528, + "step": 24736 + }, + { + "epoch": 0.44121214283166266, + "grad_norm": 0.3399839401245117, + "learning_rate": 3.426591771654082e-05, + "loss": 0.1647, + "step": 24737 + }, + { + "epoch": 0.4412299789533764, + "grad_norm": 0.25238320231437683, + "learning_rate": 3.4264472048698926e-05, + "loss": 0.1121, + "step": 24738 + }, + { + "epoch": 0.4412478150750901, + "grad_norm": 0.1852482706308365, + "learning_rate": 3.426302634494474e-05, + "loss": 0.1286, + "step": 24739 + }, + { + "epoch": 0.4412656511968038, + "grad_norm": 0.316701740026474, + "learning_rate": 3.426158060528388e-05, + "loss": 0.1686, + "step": 24740 + }, + { + "epoch": 0.44128348731851746, + "grad_norm": 0.30942216515541077, + "learning_rate": 3.426013482972192e-05, + "loss": 0.1388, + "step": 24741 + }, + { + "epoch": 0.44130132344023115, + "grad_norm": 0.27361252903938293, + "learning_rate": 3.425868901826449e-05, + "loss": 0.1967, + "step": 24742 + }, + { + "epoch": 0.44131915956194484, + "grad_norm": 0.322246253490448, + "learning_rate": 3.425724317091717e-05, + "loss": 0.1363, + "step": 24743 + }, + { + "epoch": 0.44133699568365853, + "grad_norm": 0.2132885754108429, + "learning_rate": 3.425579728768559e-05, + "loss": 0.1571, + "step": 24744 + }, + { + "epoch": 0.4413548318053722, + "grad_norm": 0.22016534209251404, + "learning_rate": 3.4254351368575336e-05, + "loss": 0.1519, + "step": 24745 + }, + { + "epoch": 0.44137266792708596, + "grad_norm": 0.19636160135269165, + "learning_rate": 3.4252905413592025e-05, + "loss": 0.0994, + "step": 24746 + }, + { + "epoch": 0.44139050404879965, + "grad_norm": 0.2573027014732361, + "learning_rate": 3.425145942274125e-05, + "loss": 0.1706, + "step": 24747 + }, + { + "epoch": 0.44140834017051334, + "grad_norm": 0.29500851035118103, + "learning_rate": 3.425001339602863e-05, + "loss": 0.1694, + "step": 24748 + }, + { + "epoch": 0.441426176292227, + "grad_norm": 0.25622713565826416, + "learning_rate": 3.424856733345976e-05, + "loss": 0.1272, + "step": 24749 + }, + { + "epoch": 0.4414440124139407, + "grad_norm": 0.332975298166275, + "learning_rate": 3.424712123504025e-05, + "loss": 0.2292, + "step": 24750 + }, + { + "epoch": 0.4414618485356544, + "grad_norm": 0.27496057748794556, + "learning_rate": 3.4245675100775706e-05, + "loss": 0.176, + "step": 24751 + }, + { + "epoch": 0.4414796846573681, + "grad_norm": 0.16666676104068756, + "learning_rate": 3.424422893067173e-05, + "loss": 0.1322, + "step": 24752 + }, + { + "epoch": 0.4414975207790818, + "grad_norm": 0.40514636039733887, + "learning_rate": 3.424278272473393e-05, + "loss": 0.1453, + "step": 24753 + }, + { + "epoch": 0.44151535690079546, + "grad_norm": 0.2893943786621094, + "learning_rate": 3.42413364829679e-05, + "loss": 0.1637, + "step": 24754 + }, + { + "epoch": 0.4415331930225092, + "grad_norm": 0.23168393969535828, + "learning_rate": 3.423989020537927e-05, + "loss": 0.1309, + "step": 24755 + }, + { + "epoch": 0.4415510291442229, + "grad_norm": 0.242039754986763, + "learning_rate": 3.4238443891973634e-05, + "loss": 0.1419, + "step": 24756 + }, + { + "epoch": 0.4415688652659366, + "grad_norm": 0.21341006457805634, + "learning_rate": 3.42369975427566e-05, + "loss": 0.1209, + "step": 24757 + }, + { + "epoch": 0.44158670138765027, + "grad_norm": 0.29745322465896606, + "learning_rate": 3.423555115773377e-05, + "loss": 0.1738, + "step": 24758 + }, + { + "epoch": 0.44160453750936396, + "grad_norm": 0.33084338903427124, + "learning_rate": 3.423410473691075e-05, + "loss": 0.1601, + "step": 24759 + }, + { + "epoch": 0.44162237363107765, + "grad_norm": 0.24049882590770721, + "learning_rate": 3.423265828029315e-05, + "loss": 0.1668, + "step": 24760 + }, + { + "epoch": 0.44164020975279134, + "grad_norm": 0.22782929241657257, + "learning_rate": 3.423121178788659e-05, + "loss": 0.0955, + "step": 24761 + }, + { + "epoch": 0.441658045874505, + "grad_norm": 0.2691223621368408, + "learning_rate": 3.4229765259696656e-05, + "loss": 0.1319, + "step": 24762 + }, + { + "epoch": 0.44167588199621877, + "grad_norm": 0.2977958917617798, + "learning_rate": 3.4228318695728964e-05, + "loss": 0.1894, + "step": 24763 + }, + { + "epoch": 0.44169371811793245, + "grad_norm": 0.20542508363723755, + "learning_rate": 3.422687209598913e-05, + "loss": 0.1391, + "step": 24764 + }, + { + "epoch": 0.44171155423964614, + "grad_norm": 0.23032750189304352, + "learning_rate": 3.422542546048274e-05, + "loss": 0.1357, + "step": 24765 + }, + { + "epoch": 0.44172939036135983, + "grad_norm": 0.24167431890964508, + "learning_rate": 3.422397878921542e-05, + "loss": 0.1541, + "step": 24766 + }, + { + "epoch": 0.4417472264830735, + "grad_norm": 0.2834479808807373, + "learning_rate": 3.422253208219277e-05, + "loss": 0.1466, + "step": 24767 + }, + { + "epoch": 0.4417650626047872, + "grad_norm": 0.1846838891506195, + "learning_rate": 3.42210853394204e-05, + "loss": 0.1067, + "step": 24768 + }, + { + "epoch": 0.4417828987265009, + "grad_norm": 0.3064740002155304, + "learning_rate": 3.421963856090393e-05, + "loss": 0.2035, + "step": 24769 + }, + { + "epoch": 0.4418007348482146, + "grad_norm": 0.2363947182893753, + "learning_rate": 3.421819174664895e-05, + "loss": 0.1725, + "step": 24770 + }, + { + "epoch": 0.4418185709699283, + "grad_norm": 0.2552568018436432, + "learning_rate": 3.421674489666107e-05, + "loss": 0.0926, + "step": 24771 + }, + { + "epoch": 0.441836407091642, + "grad_norm": 0.32727375626564026, + "learning_rate": 3.421529801094591e-05, + "loss": 0.1598, + "step": 24772 + }, + { + "epoch": 0.4418542432133557, + "grad_norm": 0.321321040391922, + "learning_rate": 3.421385108950906e-05, + "loss": 0.1764, + "step": 24773 + }, + { + "epoch": 0.4418720793350694, + "grad_norm": 0.31286823749542236, + "learning_rate": 3.421240413235615e-05, + "loss": 0.1532, + "step": 24774 + }, + { + "epoch": 0.4418899154567831, + "grad_norm": 0.25946810841560364, + "learning_rate": 3.421095713949278e-05, + "loss": 0.2026, + "step": 24775 + }, + { + "epoch": 0.44190775157849677, + "grad_norm": 0.2240571528673172, + "learning_rate": 3.420951011092456e-05, + "loss": 0.0928, + "step": 24776 + }, + { + "epoch": 0.44192558770021045, + "grad_norm": 0.2431851476430893, + "learning_rate": 3.4208063046657096e-05, + "loss": 0.157, + "step": 24777 + }, + { + "epoch": 0.44194342382192414, + "grad_norm": 0.259525328874588, + "learning_rate": 3.4206615946695996e-05, + "loss": 0.0977, + "step": 24778 + }, + { + "epoch": 0.44196125994363783, + "grad_norm": 0.22379638254642487, + "learning_rate": 3.420516881104688e-05, + "loss": 0.1468, + "step": 24779 + }, + { + "epoch": 0.4419790960653516, + "grad_norm": 0.2746237814426422, + "learning_rate": 3.4203721639715335e-05, + "loss": 0.178, + "step": 24780 + }, + { + "epoch": 0.44199693218706526, + "grad_norm": 0.2116021066904068, + "learning_rate": 3.4202274432707e-05, + "loss": 0.1173, + "step": 24781 + }, + { + "epoch": 0.44201476830877895, + "grad_norm": 0.23600275814533234, + "learning_rate": 3.420082719002747e-05, + "loss": 0.1045, + "step": 24782 + }, + { + "epoch": 0.44203260443049264, + "grad_norm": 0.23731102049350739, + "learning_rate": 3.4199379911682357e-05, + "loss": 0.172, + "step": 24783 + }, + { + "epoch": 0.4420504405522063, + "grad_norm": 0.3204597532749176, + "learning_rate": 3.419793259767726e-05, + "loss": 0.1397, + "step": 24784 + }, + { + "epoch": 0.44206827667392, + "grad_norm": 0.25386473536491394, + "learning_rate": 3.419648524801781e-05, + "loss": 0.189, + "step": 24785 + }, + { + "epoch": 0.4420861127956337, + "grad_norm": 0.2224293202161789, + "learning_rate": 3.4195037862709604e-05, + "loss": 0.169, + "step": 24786 + }, + { + "epoch": 0.4421039489173474, + "grad_norm": 0.211457297205925, + "learning_rate": 3.419359044175825e-05, + "loss": 0.1439, + "step": 24787 + }, + { + "epoch": 0.44212178503906113, + "grad_norm": 0.44649019837379456, + "learning_rate": 3.419214298516937e-05, + "loss": 0.1529, + "step": 24788 + }, + { + "epoch": 0.4421396211607748, + "grad_norm": 0.1499038189649582, + "learning_rate": 3.419069549294857e-05, + "loss": 0.0752, + "step": 24789 + }, + { + "epoch": 0.4421574572824885, + "grad_norm": 0.29040560126304626, + "learning_rate": 3.418924796510146e-05, + "loss": 0.1562, + "step": 24790 + }, + { + "epoch": 0.4421752934042022, + "grad_norm": 0.2621300518512726, + "learning_rate": 3.418780040163365e-05, + "loss": 0.1407, + "step": 24791 + }, + { + "epoch": 0.4421931295259159, + "grad_norm": 0.44027602672576904, + "learning_rate": 3.418635280255075e-05, + "loss": 0.1766, + "step": 24792 + }, + { + "epoch": 0.4422109656476296, + "grad_norm": 0.24036888778209686, + "learning_rate": 3.4184905167858375e-05, + "loss": 0.1373, + "step": 24793 + }, + { + "epoch": 0.44222880176934326, + "grad_norm": 0.2343611866235733, + "learning_rate": 3.4183457497562133e-05, + "loss": 0.1703, + "step": 24794 + }, + { + "epoch": 0.44224663789105695, + "grad_norm": 0.21968717873096466, + "learning_rate": 3.418200979166764e-05, + "loss": 0.1559, + "step": 24795 + }, + { + "epoch": 0.44226447401277064, + "grad_norm": 0.2974361777305603, + "learning_rate": 3.41805620501805e-05, + "loss": 0.1681, + "step": 24796 + }, + { + "epoch": 0.4422823101344844, + "grad_norm": 0.3284761607646942, + "learning_rate": 3.417911427310634e-05, + "loss": 0.1317, + "step": 24797 + }, + { + "epoch": 0.44230014625619807, + "grad_norm": 0.31266823410987854, + "learning_rate": 3.417766646045076e-05, + "loss": 0.1641, + "step": 24798 + }, + { + "epoch": 0.44231798237791176, + "grad_norm": 0.178290456533432, + "learning_rate": 3.417621861221937e-05, + "loss": 0.1133, + "step": 24799 + }, + { + "epoch": 0.44233581849962544, + "grad_norm": 0.3215397298336029, + "learning_rate": 3.4174770728417795e-05, + "loss": 0.1903, + "step": 24800 + }, + { + "epoch": 0.44235365462133913, + "grad_norm": 0.2602700889110565, + "learning_rate": 3.417332280905163e-05, + "loss": 0.0659, + "step": 24801 + }, + { + "epoch": 0.4423714907430528, + "grad_norm": 0.2936047315597534, + "learning_rate": 3.417187485412651e-05, + "loss": 0.1783, + "step": 24802 + }, + { + "epoch": 0.4423893268647665, + "grad_norm": 0.38923677802085876, + "learning_rate": 3.4170426863648025e-05, + "loss": 0.1866, + "step": 24803 + }, + { + "epoch": 0.4424071629864802, + "grad_norm": 0.2908266484737396, + "learning_rate": 3.41689788376218e-05, + "loss": 0.1817, + "step": 24804 + }, + { + "epoch": 0.44242499910819394, + "grad_norm": 0.206027552485466, + "learning_rate": 3.4167530776053446e-05, + "loss": 0.1026, + "step": 24805 + }, + { + "epoch": 0.4424428352299076, + "grad_norm": 0.29428526759147644, + "learning_rate": 3.416608267894858e-05, + "loss": 0.1384, + "step": 24806 + }, + { + "epoch": 0.4424606713516213, + "grad_norm": 0.33642786741256714, + "learning_rate": 3.416463454631281e-05, + "loss": 0.1909, + "step": 24807 + }, + { + "epoch": 0.442478507473335, + "grad_norm": 0.265349417924881, + "learning_rate": 3.416318637815175e-05, + "loss": 0.1581, + "step": 24808 + }, + { + "epoch": 0.4424963435950487, + "grad_norm": 0.21105970442295074, + "learning_rate": 3.416173817447101e-05, + "loss": 0.0958, + "step": 24809 + }, + { + "epoch": 0.4425141797167624, + "grad_norm": 0.33798760175704956, + "learning_rate": 3.416028993527621e-05, + "loss": 0.1012, + "step": 24810 + }, + { + "epoch": 0.44253201583847607, + "grad_norm": 0.32038789987564087, + "learning_rate": 3.415884166057297e-05, + "loss": 0.165, + "step": 24811 + }, + { + "epoch": 0.44254985196018976, + "grad_norm": 0.27280333638191223, + "learning_rate": 3.41573933503669e-05, + "loss": 0.2035, + "step": 24812 + }, + { + "epoch": 0.4425676880819035, + "grad_norm": 0.30833297967910767, + "learning_rate": 3.41559450046636e-05, + "loss": 0.1132, + "step": 24813 + }, + { + "epoch": 0.4425855242036172, + "grad_norm": 0.2255076915025711, + "learning_rate": 3.415449662346869e-05, + "loss": 0.1366, + "step": 24814 + }, + { + "epoch": 0.4426033603253309, + "grad_norm": 0.22436361014842987, + "learning_rate": 3.41530482067878e-05, + "loss": 0.1397, + "step": 24815 + }, + { + "epoch": 0.44262119644704456, + "grad_norm": 0.31262320280075073, + "learning_rate": 3.4151599754626536e-05, + "loss": 0.1618, + "step": 24816 + }, + { + "epoch": 0.44263903256875825, + "grad_norm": 0.2063130885362625, + "learning_rate": 3.415015126699051e-05, + "loss": 0.1197, + "step": 24817 + }, + { + "epoch": 0.44265686869047194, + "grad_norm": 0.2980993390083313, + "learning_rate": 3.414870274388533e-05, + "loss": 0.1187, + "step": 24818 + }, + { + "epoch": 0.4426747048121856, + "grad_norm": 0.251115083694458, + "learning_rate": 3.414725418531662e-05, + "loss": 0.1444, + "step": 24819 + }, + { + "epoch": 0.4426925409338993, + "grad_norm": 0.3356434404850006, + "learning_rate": 3.414580559128999e-05, + "loss": 0.1756, + "step": 24820 + }, + { + "epoch": 0.442710377055613, + "grad_norm": 0.2360800802707672, + "learning_rate": 3.4144356961811066e-05, + "loss": 0.1582, + "step": 24821 + }, + { + "epoch": 0.44272821317732675, + "grad_norm": 0.2754876911640167, + "learning_rate": 3.414290829688545e-05, + "loss": 0.1275, + "step": 24822 + }, + { + "epoch": 0.44274604929904043, + "grad_norm": 0.2772158086299896, + "learning_rate": 3.4141459596518765e-05, + "loss": 0.1151, + "step": 24823 + }, + { + "epoch": 0.4427638854207541, + "grad_norm": 0.21152371168136597, + "learning_rate": 3.414001086071663e-05, + "loss": 0.154, + "step": 24824 + }, + { + "epoch": 0.4427817215424678, + "grad_norm": 0.22168295085430145, + "learning_rate": 3.413856208948465e-05, + "loss": 0.1548, + "step": 24825 + }, + { + "epoch": 0.4427995576641815, + "grad_norm": 0.3185756802558899, + "learning_rate": 3.413711328282845e-05, + "loss": 0.1564, + "step": 24826 + }, + { + "epoch": 0.4428173937858952, + "grad_norm": 0.2771763801574707, + "learning_rate": 3.413566444075364e-05, + "loss": 0.1021, + "step": 24827 + }, + { + "epoch": 0.4428352299076089, + "grad_norm": 0.3360157608985901, + "learning_rate": 3.413421556326585e-05, + "loss": 0.1503, + "step": 24828 + }, + { + "epoch": 0.44285306602932256, + "grad_norm": 0.22309336066246033, + "learning_rate": 3.4132766650370674e-05, + "loss": 0.1059, + "step": 24829 + }, + { + "epoch": 0.4428709021510363, + "grad_norm": 0.3279212415218353, + "learning_rate": 3.4131317702073744e-05, + "loss": 0.1072, + "step": 24830 + }, + { + "epoch": 0.44288873827275, + "grad_norm": 0.3064015805721283, + "learning_rate": 3.4129868718380676e-05, + "loss": 0.208, + "step": 24831 + }, + { + "epoch": 0.4429065743944637, + "grad_norm": 0.24279336631298065, + "learning_rate": 3.4128419699297074e-05, + "loss": 0.1347, + "step": 24832 + }, + { + "epoch": 0.44292441051617737, + "grad_norm": 0.2656693458557129, + "learning_rate": 3.4126970644828576e-05, + "loss": 0.1328, + "step": 24833 + }, + { + "epoch": 0.44294224663789106, + "grad_norm": 0.42791885137557983, + "learning_rate": 3.4125521554980786e-05, + "loss": 0.1509, + "step": 24834 + }, + { + "epoch": 0.44296008275960475, + "grad_norm": 0.3154014050960541, + "learning_rate": 3.4124072429759314e-05, + "loss": 0.1295, + "step": 24835 + }, + { + "epoch": 0.44297791888131843, + "grad_norm": 0.24485813081264496, + "learning_rate": 3.4122623269169785e-05, + "loss": 0.1532, + "step": 24836 + }, + { + "epoch": 0.4429957550030321, + "grad_norm": 0.20850135385990143, + "learning_rate": 3.412117407321783e-05, + "loss": 0.1637, + "step": 24837 + }, + { + "epoch": 0.4430135911247458, + "grad_norm": 0.3197515904903412, + "learning_rate": 3.411972484190904e-05, + "loss": 0.1349, + "step": 24838 + }, + { + "epoch": 0.44303142724645955, + "grad_norm": 0.27301672101020813, + "learning_rate": 3.4118275575249056e-05, + "loss": 0.1689, + "step": 24839 + }, + { + "epoch": 0.44304926336817324, + "grad_norm": 0.3225424885749817, + "learning_rate": 3.411682627324349e-05, + "loss": 0.0943, + "step": 24840 + }, + { + "epoch": 0.44306709948988693, + "grad_norm": 0.305266410112381, + "learning_rate": 3.411537693589795e-05, + "loss": 0.2364, + "step": 24841 + }, + { + "epoch": 0.4430849356116006, + "grad_norm": 0.21491581201553345, + "learning_rate": 3.411392756321806e-05, + "loss": 0.1307, + "step": 24842 + }, + { + "epoch": 0.4431027717333143, + "grad_norm": 0.24535910785198212, + "learning_rate": 3.411247815520944e-05, + "loss": 0.1714, + "step": 24843 + }, + { + "epoch": 0.443120607855028, + "grad_norm": 0.21555083990097046, + "learning_rate": 3.411102871187771e-05, + "loss": 0.134, + "step": 24844 + }, + { + "epoch": 0.4431384439767417, + "grad_norm": 0.1810872107744217, + "learning_rate": 3.410957923322848e-05, + "loss": 0.1564, + "step": 24845 + }, + { + "epoch": 0.44315628009845537, + "grad_norm": 0.25940626859664917, + "learning_rate": 3.410812971926738e-05, + "loss": 0.1243, + "step": 24846 + }, + { + "epoch": 0.4431741162201691, + "grad_norm": 0.2311011552810669, + "learning_rate": 3.4106680170000016e-05, + "loss": 0.1196, + "step": 24847 + }, + { + "epoch": 0.4431919523418828, + "grad_norm": 0.23357993364334106, + "learning_rate": 3.410523058543202e-05, + "loss": 0.1662, + "step": 24848 + }, + { + "epoch": 0.4432097884635965, + "grad_norm": 0.2707495391368866, + "learning_rate": 3.410378096556901e-05, + "loss": 0.1269, + "step": 24849 + }, + { + "epoch": 0.4432276245853102, + "grad_norm": 0.3249301016330719, + "learning_rate": 3.410233131041659e-05, + "loss": 0.196, + "step": 24850 + }, + { + "epoch": 0.44324546070702386, + "grad_norm": 0.2773958444595337, + "learning_rate": 3.41008816199804e-05, + "loss": 0.1936, + "step": 24851 + }, + { + "epoch": 0.44326329682873755, + "grad_norm": 0.2752026617527008, + "learning_rate": 3.4099431894266044e-05, + "loss": 0.1576, + "step": 24852 + }, + { + "epoch": 0.44328113295045124, + "grad_norm": 0.300048828125, + "learning_rate": 3.4097982133279145e-05, + "loss": 0.0947, + "step": 24853 + }, + { + "epoch": 0.44329896907216493, + "grad_norm": 0.23399627208709717, + "learning_rate": 3.409653233702533e-05, + "loss": 0.1142, + "step": 24854 + }, + { + "epoch": 0.4433168051938786, + "grad_norm": 0.22062677145004272, + "learning_rate": 3.4095082505510214e-05, + "loss": 0.0985, + "step": 24855 + }, + { + "epoch": 0.44333464131559236, + "grad_norm": 0.19096003472805023, + "learning_rate": 3.4093632638739417e-05, + "loss": 0.1225, + "step": 24856 + }, + { + "epoch": 0.44335247743730605, + "grad_norm": 0.30973878502845764, + "learning_rate": 3.409218273671855e-05, + "loss": 0.1645, + "step": 24857 + }, + { + "epoch": 0.44337031355901974, + "grad_norm": 0.21248957514762878, + "learning_rate": 3.409073279945325e-05, + "loss": 0.136, + "step": 24858 + }, + { + "epoch": 0.4433881496807334, + "grad_norm": 0.2656228542327881, + "learning_rate": 3.408928282694913e-05, + "loss": 0.1495, + "step": 24859 + }, + { + "epoch": 0.4434059858024471, + "grad_norm": 0.37462103366851807, + "learning_rate": 3.408783281921181e-05, + "loss": 0.1434, + "step": 24860 + }, + { + "epoch": 0.4434238219241608, + "grad_norm": 0.2554827630519867, + "learning_rate": 3.408638277624691e-05, + "loss": 0.1545, + "step": 24861 + }, + { + "epoch": 0.4434416580458745, + "grad_norm": 0.23490700125694275, + "learning_rate": 3.408493269806005e-05, + "loss": 0.1533, + "step": 24862 + }, + { + "epoch": 0.4434594941675882, + "grad_norm": 0.27587994933128357, + "learning_rate": 3.408348258465686e-05, + "loss": 0.1547, + "step": 24863 + }, + { + "epoch": 0.4434773302893019, + "grad_norm": 0.3151644468307495, + "learning_rate": 3.408203243604294e-05, + "loss": 0.1477, + "step": 24864 + }, + { + "epoch": 0.4434951664110156, + "grad_norm": 0.24536073207855225, + "learning_rate": 3.408058225222394e-05, + "loss": 0.1166, + "step": 24865 + }, + { + "epoch": 0.4435130025327293, + "grad_norm": 0.26136505603790283, + "learning_rate": 3.4079132033205465e-05, + "loss": 0.1297, + "step": 24866 + }, + { + "epoch": 0.443530838654443, + "grad_norm": 0.28272610902786255, + "learning_rate": 3.407768177899314e-05, + "loss": 0.1961, + "step": 24867 + }, + { + "epoch": 0.44354867477615667, + "grad_norm": 0.26030322909355164, + "learning_rate": 3.407623148959258e-05, + "loss": 0.1252, + "step": 24868 + }, + { + "epoch": 0.44356651089787036, + "grad_norm": 0.23360766470432281, + "learning_rate": 3.407478116500941e-05, + "loss": 0.1533, + "step": 24869 + }, + { + "epoch": 0.44358434701958405, + "grad_norm": 0.28147628903388977, + "learning_rate": 3.407333080524925e-05, + "loss": 0.1158, + "step": 24870 + }, + { + "epoch": 0.44360218314129773, + "grad_norm": 0.38535448908805847, + "learning_rate": 3.4071880410317735e-05, + "loss": 0.1673, + "step": 24871 + }, + { + "epoch": 0.4436200192630115, + "grad_norm": 0.27549269795417786, + "learning_rate": 3.407042998022047e-05, + "loss": 0.1577, + "step": 24872 + }, + { + "epoch": 0.44363785538472517, + "grad_norm": 0.27959156036376953, + "learning_rate": 3.40689795149631e-05, + "loss": 0.0812, + "step": 24873 + }, + { + "epoch": 0.44365569150643885, + "grad_norm": 0.2094658613204956, + "learning_rate": 3.4067529014551224e-05, + "loss": 0.151, + "step": 24874 + }, + { + "epoch": 0.44367352762815254, + "grad_norm": 0.2217167764902115, + "learning_rate": 3.406607847899047e-05, + "loss": 0.143, + "step": 24875 + }, + { + "epoch": 0.44369136374986623, + "grad_norm": 0.4256473183631897, + "learning_rate": 3.406462790828647e-05, + "loss": 0.1912, + "step": 24876 + }, + { + "epoch": 0.4437091998715799, + "grad_norm": 0.24542111158370972, + "learning_rate": 3.406317730244484e-05, + "loss": 0.1107, + "step": 24877 + }, + { + "epoch": 0.4437270359932936, + "grad_norm": 0.19015569984912872, + "learning_rate": 3.406172666147121e-05, + "loss": 0.1179, + "step": 24878 + }, + { + "epoch": 0.4437448721150073, + "grad_norm": 0.2770199179649353, + "learning_rate": 3.406027598537118e-05, + "loss": 0.1545, + "step": 24879 + }, + { + "epoch": 0.443762708236721, + "grad_norm": 0.26866960525512695, + "learning_rate": 3.405882527415041e-05, + "loss": 0.0878, + "step": 24880 + }, + { + "epoch": 0.4437805443584347, + "grad_norm": 0.2646898925304413, + "learning_rate": 3.40573745278145e-05, + "loss": 0.1561, + "step": 24881 + }, + { + "epoch": 0.4437983804801484, + "grad_norm": 0.3976670205593109, + "learning_rate": 3.4055923746369075e-05, + "loss": 0.1609, + "step": 24882 + }, + { + "epoch": 0.4438162166018621, + "grad_norm": 0.1968865841627121, + "learning_rate": 3.4054472929819766e-05, + "loss": 0.164, + "step": 24883 + }, + { + "epoch": 0.4438340527235758, + "grad_norm": 0.2928166687488556, + "learning_rate": 3.4053022078172184e-05, + "loss": 0.1475, + "step": 24884 + }, + { + "epoch": 0.4438518888452895, + "grad_norm": 0.24959976971149445, + "learning_rate": 3.4051571191431965e-05, + "loss": 0.1539, + "step": 24885 + }, + { + "epoch": 0.44386972496700317, + "grad_norm": 0.279587060213089, + "learning_rate": 3.405012026960473e-05, + "loss": 0.1577, + "step": 24886 + }, + { + "epoch": 0.44388756108871685, + "grad_norm": 0.25099626183509827, + "learning_rate": 3.404866931269611e-05, + "loss": 0.1628, + "step": 24887 + }, + { + "epoch": 0.44390539721043054, + "grad_norm": 0.2726922333240509, + "learning_rate": 3.404721832071171e-05, + "loss": 0.1026, + "step": 24888 + }, + { + "epoch": 0.4439232333321443, + "grad_norm": 0.3385123908519745, + "learning_rate": 3.4045767293657176e-05, + "loss": 0.1853, + "step": 24889 + }, + { + "epoch": 0.443941069453858, + "grad_norm": 0.22980597615242004, + "learning_rate": 3.404431623153812e-05, + "loss": 0.1336, + "step": 24890 + }, + { + "epoch": 0.44395890557557166, + "grad_norm": 0.2664623558521271, + "learning_rate": 3.404286513436017e-05, + "loss": 0.1474, + "step": 24891 + }, + { + "epoch": 0.44397674169728535, + "grad_norm": 0.2742363512516022, + "learning_rate": 3.4041414002128954e-05, + "loss": 0.1343, + "step": 24892 + }, + { + "epoch": 0.44399457781899904, + "grad_norm": 0.2724882662296295, + "learning_rate": 3.4039962834850095e-05, + "loss": 0.1556, + "step": 24893 + }, + { + "epoch": 0.4440124139407127, + "grad_norm": 0.2719927132129669, + "learning_rate": 3.403851163252921e-05, + "loss": 0.1527, + "step": 24894 + }, + { + "epoch": 0.4440302500624264, + "grad_norm": 0.30843448638916016, + "learning_rate": 3.403706039517194e-05, + "loss": 0.1121, + "step": 24895 + }, + { + "epoch": 0.4440480861841401, + "grad_norm": 0.3314504325389862, + "learning_rate": 3.4035609122783905e-05, + "loss": 0.1513, + "step": 24896 + }, + { + "epoch": 0.4440659223058538, + "grad_norm": 0.3164519667625427, + "learning_rate": 3.403415781537073e-05, + "loss": 0.1872, + "step": 24897 + }, + { + "epoch": 0.44408375842756753, + "grad_norm": 0.334339439868927, + "learning_rate": 3.403270647293803e-05, + "loss": 0.1764, + "step": 24898 + }, + { + "epoch": 0.4441015945492812, + "grad_norm": 0.2193564623594284, + "learning_rate": 3.4031255095491436e-05, + "loss": 0.1967, + "step": 24899 + }, + { + "epoch": 0.4441194306709949, + "grad_norm": 0.15048037469387054, + "learning_rate": 3.4029803683036587e-05, + "loss": 0.0838, + "step": 24900 + }, + { + "epoch": 0.4441372667927086, + "grad_norm": 0.2857602536678314, + "learning_rate": 3.40283522355791e-05, + "loss": 0.1668, + "step": 24901 + }, + { + "epoch": 0.4441551029144223, + "grad_norm": 0.2837502360343933, + "learning_rate": 3.4026900753124604e-05, + "loss": 0.1393, + "step": 24902 + }, + { + "epoch": 0.44417293903613597, + "grad_norm": 0.23231053352355957, + "learning_rate": 3.4025449235678713e-05, + "loss": 0.1218, + "step": 24903 + }, + { + "epoch": 0.44419077515784966, + "grad_norm": 0.2585558593273163, + "learning_rate": 3.4023997683247075e-05, + "loss": 0.1392, + "step": 24904 + }, + { + "epoch": 0.44420861127956335, + "grad_norm": 0.21564440429210663, + "learning_rate": 3.4022546095835294e-05, + "loss": 0.1288, + "step": 24905 + }, + { + "epoch": 0.4442264474012771, + "grad_norm": 0.32804688811302185, + "learning_rate": 3.4021094473449014e-05, + "loss": 0.1699, + "step": 24906 + }, + { + "epoch": 0.4442442835229908, + "grad_norm": 0.22931218147277832, + "learning_rate": 3.401964281609385e-05, + "loss": 0.1366, + "step": 24907 + }, + { + "epoch": 0.44426211964470447, + "grad_norm": 0.2769996225833893, + "learning_rate": 3.401819112377544e-05, + "loss": 0.2147, + "step": 24908 + }, + { + "epoch": 0.44427995576641816, + "grad_norm": 0.28523778915405273, + "learning_rate": 3.401673939649942e-05, + "loss": 0.1628, + "step": 24909 + }, + { + "epoch": 0.44429779188813184, + "grad_norm": 0.21927498281002045, + "learning_rate": 3.401528763427139e-05, + "loss": 0.1415, + "step": 24910 + }, + { + "epoch": 0.44431562800984553, + "grad_norm": 0.24998922646045685, + "learning_rate": 3.401383583709699e-05, + "loss": 0.1426, + "step": 24911 + }, + { + "epoch": 0.4443334641315592, + "grad_norm": 0.3166537284851074, + "learning_rate": 3.4012384004981844e-05, + "loss": 0.1688, + "step": 24912 + }, + { + "epoch": 0.4443513002532729, + "grad_norm": 0.2819408178329468, + "learning_rate": 3.4010932137931595e-05, + "loss": 0.1804, + "step": 24913 + }, + { + "epoch": 0.4443691363749866, + "grad_norm": 0.2404019981622696, + "learning_rate": 3.4009480235951855e-05, + "loss": 0.1257, + "step": 24914 + }, + { + "epoch": 0.44438697249670034, + "grad_norm": 0.3003208041191101, + "learning_rate": 3.400802829904827e-05, + "loss": 0.1461, + "step": 24915 + }, + { + "epoch": 0.444404808618414, + "grad_norm": 0.440448135137558, + "learning_rate": 3.4006576327226434e-05, + "loss": 0.1942, + "step": 24916 + }, + { + "epoch": 0.4444226447401277, + "grad_norm": 0.2392699122428894, + "learning_rate": 3.400512432049202e-05, + "loss": 0.1603, + "step": 24917 + }, + { + "epoch": 0.4444404808618414, + "grad_norm": 0.24726980924606323, + "learning_rate": 3.4003672278850617e-05, + "loss": 0.1656, + "step": 24918 + }, + { + "epoch": 0.4444583169835551, + "grad_norm": 0.2180936634540558, + "learning_rate": 3.4002220202307876e-05, + "loss": 0.1311, + "step": 24919 + }, + { + "epoch": 0.4444761531052688, + "grad_norm": 0.20830442011356354, + "learning_rate": 3.4000768090869424e-05, + "loss": 0.1652, + "step": 24920 + }, + { + "epoch": 0.44449398922698247, + "grad_norm": 0.3282788097858429, + "learning_rate": 3.3999315944540886e-05, + "loss": 0.1706, + "step": 24921 + }, + { + "epoch": 0.44451182534869615, + "grad_norm": 0.460666686296463, + "learning_rate": 3.399786376332789e-05, + "loss": 0.1299, + "step": 24922 + }, + { + "epoch": 0.4445296614704099, + "grad_norm": 0.2247573584318161, + "learning_rate": 3.399641154723606e-05, + "loss": 0.2017, + "step": 24923 + }, + { + "epoch": 0.4445474975921236, + "grad_norm": 0.2084415704011917, + "learning_rate": 3.399495929627105e-05, + "loss": 0.1678, + "step": 24924 + }, + { + "epoch": 0.4445653337138373, + "grad_norm": 0.25306323170661926, + "learning_rate": 3.3993507010438446e-05, + "loss": 0.1668, + "step": 24925 + }, + { + "epoch": 0.44458316983555096, + "grad_norm": 0.20952977240085602, + "learning_rate": 3.399205468974391e-05, + "loss": 0.1446, + "step": 24926 + }, + { + "epoch": 0.44460100595726465, + "grad_norm": 0.25841525197029114, + "learning_rate": 3.399060233419307e-05, + "loss": 0.1087, + "step": 24927 + }, + { + "epoch": 0.44461884207897834, + "grad_norm": 0.29320597648620605, + "learning_rate": 3.398914994379155e-05, + "loss": 0.1652, + "step": 24928 + }, + { + "epoch": 0.444636678200692, + "grad_norm": 0.34367331862449646, + "learning_rate": 3.398769751854498e-05, + "loss": 0.1338, + "step": 24929 + }, + { + "epoch": 0.4446545143224057, + "grad_norm": 0.23877593874931335, + "learning_rate": 3.3986245058458995e-05, + "loss": 0.1274, + "step": 24930 + }, + { + "epoch": 0.44467235044411946, + "grad_norm": 0.28455033898353577, + "learning_rate": 3.398479256353921e-05, + "loss": 0.1905, + "step": 24931 + }, + { + "epoch": 0.44469018656583315, + "grad_norm": 0.21717683970928192, + "learning_rate": 3.3983340033791275e-05, + "loss": 0.1219, + "step": 24932 + }, + { + "epoch": 0.44470802268754683, + "grad_norm": 0.22644302248954773, + "learning_rate": 3.3981887469220805e-05, + "loss": 0.1088, + "step": 24933 + }, + { + "epoch": 0.4447258588092605, + "grad_norm": 0.2652421295642853, + "learning_rate": 3.398043486983343e-05, + "loss": 0.1327, + "step": 24934 + }, + { + "epoch": 0.4447436949309742, + "grad_norm": 0.38841715455055237, + "learning_rate": 3.3978982235634807e-05, + "loss": 0.1773, + "step": 24935 + }, + { + "epoch": 0.4447615310526879, + "grad_norm": 0.2776479125022888, + "learning_rate": 3.397752956663053e-05, + "loss": 0.1657, + "step": 24936 + }, + { + "epoch": 0.4447793671744016, + "grad_norm": 0.28849300742149353, + "learning_rate": 3.397607686282626e-05, + "loss": 0.1866, + "step": 24937 + }, + { + "epoch": 0.4447972032961153, + "grad_norm": 0.18944236636161804, + "learning_rate": 3.3974624124227604e-05, + "loss": 0.1538, + "step": 24938 + }, + { + "epoch": 0.44481503941782896, + "grad_norm": 0.300076425075531, + "learning_rate": 3.397317135084021e-05, + "loss": 0.1031, + "step": 24939 + }, + { + "epoch": 0.4448328755395427, + "grad_norm": 0.18755820393562317, + "learning_rate": 3.3971718542669704e-05, + "loss": 0.1048, + "step": 24940 + }, + { + "epoch": 0.4448507116612564, + "grad_norm": 0.25192809104919434, + "learning_rate": 3.397026569972172e-05, + "loss": 0.1615, + "step": 24941 + }, + { + "epoch": 0.4448685477829701, + "grad_norm": 0.2576552927494049, + "learning_rate": 3.396881282200189e-05, + "loss": 0.079, + "step": 24942 + }, + { + "epoch": 0.44488638390468377, + "grad_norm": 0.2265574336051941, + "learning_rate": 3.396735990951585e-05, + "loss": 0.146, + "step": 24943 + }, + { + "epoch": 0.44490422002639746, + "grad_norm": 0.41231682896614075, + "learning_rate": 3.3965906962269214e-05, + "loss": 0.1329, + "step": 24944 + }, + { + "epoch": 0.44492205614811114, + "grad_norm": 0.2770184874534607, + "learning_rate": 3.396445398026763e-05, + "loss": 0.1263, + "step": 24945 + }, + { + "epoch": 0.44493989226982483, + "grad_norm": 0.23502086102962494, + "learning_rate": 3.396300096351672e-05, + "loss": 0.1394, + "step": 24946 + }, + { + "epoch": 0.4449577283915385, + "grad_norm": 0.35654568672180176, + "learning_rate": 3.396154791202213e-05, + "loss": 0.1386, + "step": 24947 + }, + { + "epoch": 0.44497556451325226, + "grad_norm": 0.32461968064308167, + "learning_rate": 3.396009482578949e-05, + "loss": 0.1063, + "step": 24948 + }, + { + "epoch": 0.44499340063496595, + "grad_norm": 0.2688353955745697, + "learning_rate": 3.395864170482441e-05, + "loss": 0.1411, + "step": 24949 + }, + { + "epoch": 0.44501123675667964, + "grad_norm": 0.28183308243751526, + "learning_rate": 3.395718854913256e-05, + "loss": 0.1387, + "step": 24950 + }, + { + "epoch": 0.44502907287839333, + "grad_norm": 0.3091309070587158, + "learning_rate": 3.395573535871954e-05, + "loss": 0.127, + "step": 24951 + }, + { + "epoch": 0.445046909000107, + "grad_norm": 0.42708688974380493, + "learning_rate": 3.3954282133591006e-05, + "loss": 0.2512, + "step": 24952 + }, + { + "epoch": 0.4450647451218207, + "grad_norm": 0.25196170806884766, + "learning_rate": 3.3952828873752576e-05, + "loss": 0.1353, + "step": 24953 + }, + { + "epoch": 0.4450825812435344, + "grad_norm": 0.25167515873908997, + "learning_rate": 3.395137557920989e-05, + "loss": 0.161, + "step": 24954 + }, + { + "epoch": 0.4451004173652481, + "grad_norm": 0.21120524406433105, + "learning_rate": 3.3949922249968576e-05, + "loss": 0.1502, + "step": 24955 + }, + { + "epoch": 0.44511825348696177, + "grad_norm": 0.27936986088752747, + "learning_rate": 3.394846888603428e-05, + "loss": 0.147, + "step": 24956 + }, + { + "epoch": 0.4451360896086755, + "grad_norm": 0.25990229845046997, + "learning_rate": 3.394701548741262e-05, + "loss": 0.1035, + "step": 24957 + }, + { + "epoch": 0.4451539257303892, + "grad_norm": 0.2814280092716217, + "learning_rate": 3.394556205410925e-05, + "loss": 0.139, + "step": 24958 + }, + { + "epoch": 0.4451717618521029, + "grad_norm": 0.23317204415798187, + "learning_rate": 3.394410858612977e-05, + "loss": 0.1793, + "step": 24959 + }, + { + "epoch": 0.4451895979738166, + "grad_norm": 0.40178561210632324, + "learning_rate": 3.394265508347986e-05, + "loss": 0.0678, + "step": 24960 + }, + { + "epoch": 0.44520743409553026, + "grad_norm": 0.2048632651567459, + "learning_rate": 3.394120154616512e-05, + "loss": 0.1408, + "step": 24961 + }, + { + "epoch": 0.44522527021724395, + "grad_norm": 0.3062325716018677, + "learning_rate": 3.393974797419119e-05, + "loss": 0.1576, + "step": 24962 + }, + { + "epoch": 0.44524310633895764, + "grad_norm": 0.2437918484210968, + "learning_rate": 3.3938294367563724e-05, + "loss": 0.1697, + "step": 24963 + }, + { + "epoch": 0.4452609424606713, + "grad_norm": 0.22002071142196655, + "learning_rate": 3.3936840726288326e-05, + "loss": 0.0932, + "step": 24964 + }, + { + "epoch": 0.44527877858238507, + "grad_norm": 0.203473299741745, + "learning_rate": 3.393538705037066e-05, + "loss": 0.1498, + "step": 24965 + }, + { + "epoch": 0.44529661470409876, + "grad_norm": 0.21064774692058563, + "learning_rate": 3.393393333981634e-05, + "loss": 0.1189, + "step": 24966 + }, + { + "epoch": 0.44531445082581245, + "grad_norm": 0.30560675263404846, + "learning_rate": 3.3932479594631e-05, + "loss": 0.1638, + "step": 24967 + }, + { + "epoch": 0.44533228694752613, + "grad_norm": 0.25150179862976074, + "learning_rate": 3.39310258148203e-05, + "loss": 0.0789, + "step": 24968 + }, + { + "epoch": 0.4453501230692398, + "grad_norm": 0.28300243616104126, + "learning_rate": 3.392957200038985e-05, + "loss": 0.1391, + "step": 24969 + }, + { + "epoch": 0.4453679591909535, + "grad_norm": 0.3144187033176422, + "learning_rate": 3.392811815134529e-05, + "loss": 0.1409, + "step": 24970 + }, + { + "epoch": 0.4453857953126672, + "grad_norm": 0.30045562982559204, + "learning_rate": 3.392666426769228e-05, + "loss": 0.1816, + "step": 24971 + }, + { + "epoch": 0.4454036314343809, + "grad_norm": 0.31369584798812866, + "learning_rate": 3.3925210349436414e-05, + "loss": 0.169, + "step": 24972 + }, + { + "epoch": 0.44542146755609463, + "grad_norm": 0.23182500898838043, + "learning_rate": 3.392375639658336e-05, + "loss": 0.1489, + "step": 24973 + }, + { + "epoch": 0.4454393036778083, + "grad_norm": 0.3368208408355713, + "learning_rate": 3.3922302409138744e-05, + "loss": 0.1485, + "step": 24974 + }, + { + "epoch": 0.445457139799522, + "grad_norm": 0.23210720717906952, + "learning_rate": 3.39208483871082e-05, + "loss": 0.1991, + "step": 24975 + }, + { + "epoch": 0.4454749759212357, + "grad_norm": 0.26044806838035583, + "learning_rate": 3.391939433049737e-05, + "loss": 0.121, + "step": 24976 + }, + { + "epoch": 0.4454928120429494, + "grad_norm": 0.2672989070415497, + "learning_rate": 3.3917940239311885e-05, + "loss": 0.1278, + "step": 24977 + }, + { + "epoch": 0.44551064816466307, + "grad_norm": 0.2215174436569214, + "learning_rate": 3.3916486113557385e-05, + "loss": 0.1244, + "step": 24978 + }, + { + "epoch": 0.44552848428637676, + "grad_norm": 0.38108932971954346, + "learning_rate": 3.391503195323951e-05, + "loss": 0.1635, + "step": 24979 + }, + { + "epoch": 0.44554632040809045, + "grad_norm": 0.18883508443832397, + "learning_rate": 3.391357775836388e-05, + "loss": 0.1392, + "step": 24980 + }, + { + "epoch": 0.44556415652980413, + "grad_norm": 0.24974606931209564, + "learning_rate": 3.3912123528936154e-05, + "loss": 0.1078, + "step": 24981 + }, + { + "epoch": 0.4455819926515179, + "grad_norm": 0.3186878561973572, + "learning_rate": 3.391066926496195e-05, + "loss": 0.1234, + "step": 24982 + }, + { + "epoch": 0.44559982877323157, + "grad_norm": 0.2335820198059082, + "learning_rate": 3.3909214966446916e-05, + "loss": 0.17, + "step": 24983 + }, + { + "epoch": 0.44561766489494525, + "grad_norm": 0.25362586975097656, + "learning_rate": 3.3907760633396694e-05, + "loss": 0.0998, + "step": 24984 + }, + { + "epoch": 0.44563550101665894, + "grad_norm": 0.5613580942153931, + "learning_rate": 3.390630626581691e-05, + "loss": 0.1224, + "step": 24985 + }, + { + "epoch": 0.44565333713837263, + "grad_norm": 0.26306864619255066, + "learning_rate": 3.390485186371321e-05, + "loss": 0.108, + "step": 24986 + }, + { + "epoch": 0.4456711732600863, + "grad_norm": 0.2483878880739212, + "learning_rate": 3.3903397427091234e-05, + "loss": 0.1604, + "step": 24987 + }, + { + "epoch": 0.4456890093818, + "grad_norm": 0.1977320909500122, + "learning_rate": 3.39019429559566e-05, + "loss": 0.1613, + "step": 24988 + }, + { + "epoch": 0.4457068455035137, + "grad_norm": 0.1874285489320755, + "learning_rate": 3.390048845031497e-05, + "loss": 0.1164, + "step": 24989 + }, + { + "epoch": 0.44572468162522744, + "grad_norm": 0.28090229630470276, + "learning_rate": 3.389903391017197e-05, + "loss": 0.192, + "step": 24990 + }, + { + "epoch": 0.4457425177469411, + "grad_norm": 0.20751237869262695, + "learning_rate": 3.389757933553324e-05, + "loss": 0.0752, + "step": 24991 + }, + { + "epoch": 0.4457603538686548, + "grad_norm": 0.2270546853542328, + "learning_rate": 3.389612472640442e-05, + "loss": 0.0792, + "step": 24992 + }, + { + "epoch": 0.4457781899903685, + "grad_norm": 0.42721545696258545, + "learning_rate": 3.389467008279116e-05, + "loss": 0.169, + "step": 24993 + }, + { + "epoch": 0.4457960261120822, + "grad_norm": 0.34319695830345154, + "learning_rate": 3.389321540469907e-05, + "loss": 0.1316, + "step": 24994 + }, + { + "epoch": 0.4458138622337959, + "grad_norm": 0.23150020837783813, + "learning_rate": 3.3891760692133806e-05, + "loss": 0.126, + "step": 24995 + }, + { + "epoch": 0.44583169835550956, + "grad_norm": 0.25390133261680603, + "learning_rate": 3.389030594510101e-05, + "loss": 0.1752, + "step": 24996 + }, + { + "epoch": 0.44584953447722325, + "grad_norm": 0.2629392445087433, + "learning_rate": 3.3888851163606324e-05, + "loss": 0.127, + "step": 24997 + }, + { + "epoch": 0.44586737059893694, + "grad_norm": 0.2480379343032837, + "learning_rate": 3.3887396347655375e-05, + "loss": 0.15, + "step": 24998 + }, + { + "epoch": 0.4458852067206507, + "grad_norm": 0.3128427565097809, + "learning_rate": 3.388594149725381e-05, + "loss": 0.1691, + "step": 24999 + }, + { + "epoch": 0.44590304284236437, + "grad_norm": 0.2217392474412918, + "learning_rate": 3.3884486612407266e-05, + "loss": 0.1003, + "step": 25000 + }, + { + "epoch": 0.44590304284236437, + "eval_loss": 0.1438295841217041, + "eval_runtime": 107.275, + "eval_samples_per_second": 9.546, + "eval_steps_per_second": 1.594, + "step": 25000 + }, + { + "epoch": 0.44592087896407806, + "grad_norm": 0.24391217529773712, + "learning_rate": 3.388303169312138e-05, + "loss": 0.1476, + "step": 25001 + }, + { + "epoch": 0.44593871508579175, + "grad_norm": 0.25604453682899475, + "learning_rate": 3.38815767394018e-05, + "loss": 0.1046, + "step": 25002 + }, + { + "epoch": 0.44595655120750544, + "grad_norm": 0.38589605689048767, + "learning_rate": 3.388012175125416e-05, + "loss": 0.1661, + "step": 25003 + }, + { + "epoch": 0.4459743873292191, + "grad_norm": 0.29729077219963074, + "learning_rate": 3.38786667286841e-05, + "loss": 0.154, + "step": 25004 + }, + { + "epoch": 0.4459922234509328, + "grad_norm": 0.335660845041275, + "learning_rate": 3.387721167169726e-05, + "loss": 0.178, + "step": 25005 + }, + { + "epoch": 0.4460100595726465, + "grad_norm": 0.29932448267936707, + "learning_rate": 3.387575658029928e-05, + "loss": 0.14, + "step": 25006 + }, + { + "epoch": 0.44602789569436024, + "grad_norm": 0.2991105318069458, + "learning_rate": 3.387430145449581e-05, + "loss": 0.1648, + "step": 25007 + }, + { + "epoch": 0.44604573181607393, + "grad_norm": 0.24931123852729797, + "learning_rate": 3.3872846294292474e-05, + "loss": 0.127, + "step": 25008 + }, + { + "epoch": 0.4460635679377876, + "grad_norm": 0.40050822496414185, + "learning_rate": 3.387139109969493e-05, + "loss": 0.161, + "step": 25009 + }, + { + "epoch": 0.4460814040595013, + "grad_norm": 0.3428746163845062, + "learning_rate": 3.3869935870708794e-05, + "loss": 0.1141, + "step": 25010 + }, + { + "epoch": 0.446099240181215, + "grad_norm": 0.26931509375572205, + "learning_rate": 3.3868480607339735e-05, + "loss": 0.1563, + "step": 25011 + }, + { + "epoch": 0.4461170763029287, + "grad_norm": 0.21234270930290222, + "learning_rate": 3.386702530959338e-05, + "loss": 0.1497, + "step": 25012 + }, + { + "epoch": 0.44613491242464237, + "grad_norm": 0.25246769189834595, + "learning_rate": 3.3865569977475376e-05, + "loss": 0.1534, + "step": 25013 + }, + { + "epoch": 0.44615274854635606, + "grad_norm": 0.29366806149482727, + "learning_rate": 3.386411461099136e-05, + "loss": 0.213, + "step": 25014 + }, + { + "epoch": 0.44617058466806975, + "grad_norm": 0.20416218042373657, + "learning_rate": 3.3862659210146975e-05, + "loss": 0.1333, + "step": 25015 + }, + { + "epoch": 0.4461884207897835, + "grad_norm": 0.298424631357193, + "learning_rate": 3.386120377494785e-05, + "loss": 0.1739, + "step": 25016 + }, + { + "epoch": 0.4462062569114972, + "grad_norm": 0.23990018665790558, + "learning_rate": 3.3859748305399655e-05, + "loss": 0.1378, + "step": 25017 + }, + { + "epoch": 0.44622409303321087, + "grad_norm": 0.251436322927475, + "learning_rate": 3.3858292801507995e-05, + "loss": 0.1161, + "step": 25018 + }, + { + "epoch": 0.44624192915492455, + "grad_norm": 0.29651856422424316, + "learning_rate": 3.3856837263278554e-05, + "loss": 0.2039, + "step": 25019 + }, + { + "epoch": 0.44625976527663824, + "grad_norm": 0.24433746933937073, + "learning_rate": 3.385538169071694e-05, + "loss": 0.127, + "step": 25020 + }, + { + "epoch": 0.44627760139835193, + "grad_norm": 0.15020044147968292, + "learning_rate": 3.3853926083828814e-05, + "loss": 0.1044, + "step": 25021 + }, + { + "epoch": 0.4462954375200656, + "grad_norm": 0.32814761996269226, + "learning_rate": 3.385247044261981e-05, + "loss": 0.1287, + "step": 25022 + }, + { + "epoch": 0.4463132736417793, + "grad_norm": 0.369859904050827, + "learning_rate": 3.3851014767095565e-05, + "loss": 0.2183, + "step": 25023 + }, + { + "epoch": 0.44633110976349305, + "grad_norm": 0.2809942066669464, + "learning_rate": 3.384955905726174e-05, + "loss": 0.2118, + "step": 25024 + }, + { + "epoch": 0.44634894588520674, + "grad_norm": 0.2394491732120514, + "learning_rate": 3.384810331312397e-05, + "loss": 0.1198, + "step": 25025 + }, + { + "epoch": 0.4463667820069204, + "grad_norm": 0.21639138460159302, + "learning_rate": 3.384664753468789e-05, + "loss": 0.151, + "step": 25026 + }, + { + "epoch": 0.4463846181286341, + "grad_norm": 0.2115909904241562, + "learning_rate": 3.384519172195915e-05, + "loss": 0.1117, + "step": 25027 + }, + { + "epoch": 0.4464024542503478, + "grad_norm": 0.3918338418006897, + "learning_rate": 3.384373587494339e-05, + "loss": 0.2509, + "step": 25028 + }, + { + "epoch": 0.4464202903720615, + "grad_norm": 0.20188231766223907, + "learning_rate": 3.384227999364626e-05, + "loss": 0.1303, + "step": 25029 + }, + { + "epoch": 0.4464381264937752, + "grad_norm": 0.2243097722530365, + "learning_rate": 3.3840824078073394e-05, + "loss": 0.1593, + "step": 25030 + }, + { + "epoch": 0.44645596261548887, + "grad_norm": 0.18480341136455536, + "learning_rate": 3.383936812823044e-05, + "loss": 0.1348, + "step": 25031 + }, + { + "epoch": 0.4464737987372026, + "grad_norm": 0.23222771286964417, + "learning_rate": 3.3837912144123045e-05, + "loss": 0.1349, + "step": 25032 + }, + { + "epoch": 0.4464916348589163, + "grad_norm": 0.3027746081352234, + "learning_rate": 3.383645612575685e-05, + "loss": 0.1282, + "step": 25033 + }, + { + "epoch": 0.44650947098063, + "grad_norm": 0.2174845039844513, + "learning_rate": 3.3835000073137504e-05, + "loss": 0.139, + "step": 25034 + }, + { + "epoch": 0.4465273071023437, + "grad_norm": 0.20630599558353424, + "learning_rate": 3.3833543986270634e-05, + "loss": 0.0871, + "step": 25035 + }, + { + "epoch": 0.44654514322405736, + "grad_norm": 0.2154911458492279, + "learning_rate": 3.383208786516191e-05, + "loss": 0.1475, + "step": 25036 + }, + { + "epoch": 0.44656297934577105, + "grad_norm": 0.344966858625412, + "learning_rate": 3.383063170981695e-05, + "loss": 0.2358, + "step": 25037 + }, + { + "epoch": 0.44658081546748474, + "grad_norm": 0.25561490654945374, + "learning_rate": 3.382917552024141e-05, + "loss": 0.1566, + "step": 25038 + }, + { + "epoch": 0.4465986515891984, + "grad_norm": 0.27829602360725403, + "learning_rate": 3.382771929644095e-05, + "loss": 0.1834, + "step": 25039 + }, + { + "epoch": 0.4466164877109121, + "grad_norm": 0.3486580550670624, + "learning_rate": 3.382626303842119e-05, + "loss": 0.1629, + "step": 25040 + }, + { + "epoch": 0.44663432383262586, + "grad_norm": 0.2230127602815628, + "learning_rate": 3.38248067461878e-05, + "loss": 0.1105, + "step": 25041 + }, + { + "epoch": 0.44665215995433954, + "grad_norm": 0.2597845196723938, + "learning_rate": 3.38233504197464e-05, + "loss": 0.14, + "step": 25042 + }, + { + "epoch": 0.44666999607605323, + "grad_norm": 0.30955132842063904, + "learning_rate": 3.3821894059102645e-05, + "loss": 0.2182, + "step": 25043 + }, + { + "epoch": 0.4466878321977669, + "grad_norm": 0.26418086886405945, + "learning_rate": 3.382043766426218e-05, + "loss": 0.1485, + "step": 25044 + }, + { + "epoch": 0.4467056683194806, + "grad_norm": 0.2648506462574005, + "learning_rate": 3.3818981235230655e-05, + "loss": 0.1206, + "step": 25045 + }, + { + "epoch": 0.4467235044411943, + "grad_norm": 0.2646901607513428, + "learning_rate": 3.381752477201372e-05, + "loss": 0.1807, + "step": 25046 + }, + { + "epoch": 0.446741340562908, + "grad_norm": 0.2794043719768524, + "learning_rate": 3.381606827461701e-05, + "loss": 0.1296, + "step": 25047 + }, + { + "epoch": 0.4467591766846217, + "grad_norm": 0.5966060757637024, + "learning_rate": 3.3814611743046165e-05, + "loss": 0.167, + "step": 25048 + }, + { + "epoch": 0.4467770128063354, + "grad_norm": 0.31819650530815125, + "learning_rate": 3.3813155177306846e-05, + "loss": 0.1813, + "step": 25049 + }, + { + "epoch": 0.4467948489280491, + "grad_norm": 0.1923387199640274, + "learning_rate": 3.3811698577404696e-05, + "loss": 0.1214, + "step": 25050 + }, + { + "epoch": 0.4468126850497628, + "grad_norm": 0.2962595820426941, + "learning_rate": 3.381024194334535e-05, + "loss": 0.1348, + "step": 25051 + }, + { + "epoch": 0.4468305211714765, + "grad_norm": 0.2852820158004761, + "learning_rate": 3.380878527513448e-05, + "loss": 0.1186, + "step": 25052 + }, + { + "epoch": 0.44684835729319017, + "grad_norm": 0.31666702032089233, + "learning_rate": 3.3807328572777706e-05, + "loss": 0.1882, + "step": 25053 + }, + { + "epoch": 0.44686619341490386, + "grad_norm": 0.2562214434146881, + "learning_rate": 3.380587183628069e-05, + "loss": 0.1586, + "step": 25054 + }, + { + "epoch": 0.44688402953661754, + "grad_norm": 0.25463801622390747, + "learning_rate": 3.3804415065649064e-05, + "loss": 0.1699, + "step": 25055 + }, + { + "epoch": 0.44690186565833123, + "grad_norm": 0.260018527507782, + "learning_rate": 3.380295826088849e-05, + "loss": 0.1453, + "step": 25056 + }, + { + "epoch": 0.4469197017800449, + "grad_norm": 0.2361421436071396, + "learning_rate": 3.380150142200461e-05, + "loss": 0.1589, + "step": 25057 + }, + { + "epoch": 0.44693753790175866, + "grad_norm": 0.2917416989803314, + "learning_rate": 3.3800044549003065e-05, + "loss": 0.1161, + "step": 25058 + }, + { + "epoch": 0.44695537402347235, + "grad_norm": 0.22058722376823425, + "learning_rate": 3.379858764188951e-05, + "loss": 0.1286, + "step": 25059 + }, + { + "epoch": 0.44697321014518604, + "grad_norm": 0.17733772099018097, + "learning_rate": 3.37971307006696e-05, + "loss": 0.1376, + "step": 25060 + }, + { + "epoch": 0.4469910462668997, + "grad_norm": 0.27901384234428406, + "learning_rate": 3.379567372534896e-05, + "loss": 0.0938, + "step": 25061 + }, + { + "epoch": 0.4470088823886134, + "grad_norm": 0.284843772649765, + "learning_rate": 3.379421671593326e-05, + "loss": 0.1839, + "step": 25062 + }, + { + "epoch": 0.4470267185103271, + "grad_norm": 0.2234141081571579, + "learning_rate": 3.379275967242813e-05, + "loss": 0.1673, + "step": 25063 + }, + { + "epoch": 0.4470445546320408, + "grad_norm": 0.21907198429107666, + "learning_rate": 3.3791302594839236e-05, + "loss": 0.1233, + "step": 25064 + }, + { + "epoch": 0.4470623907537545, + "grad_norm": 0.3250732123851776, + "learning_rate": 3.3789845483172214e-05, + "loss": 0.1535, + "step": 25065 + }, + { + "epoch": 0.4470802268754682, + "grad_norm": 0.2301318347454071, + "learning_rate": 3.378838833743271e-05, + "loss": 0.1585, + "step": 25066 + }, + { + "epoch": 0.4470980629971819, + "grad_norm": 0.22827045619487762, + "learning_rate": 3.378693115762638e-05, + "loss": 0.1443, + "step": 25067 + }, + { + "epoch": 0.4471158991188956, + "grad_norm": 0.3583378195762634, + "learning_rate": 3.378547394375887e-05, + "loss": 0.1579, + "step": 25068 + }, + { + "epoch": 0.4471337352406093, + "grad_norm": 0.19095134735107422, + "learning_rate": 3.378401669583583e-05, + "loss": 0.1267, + "step": 25069 + }, + { + "epoch": 0.447151571362323, + "grad_norm": 0.27327707409858704, + "learning_rate": 3.37825594138629e-05, + "loss": 0.1753, + "step": 25070 + }, + { + "epoch": 0.44716940748403666, + "grad_norm": 0.21191474795341492, + "learning_rate": 3.378110209784574e-05, + "loss": 0.1572, + "step": 25071 + }, + { + "epoch": 0.44718724360575035, + "grad_norm": 0.2567843496799469, + "learning_rate": 3.377964474779e-05, + "loss": 0.1282, + "step": 25072 + }, + { + "epoch": 0.44720507972746404, + "grad_norm": 0.2781575620174408, + "learning_rate": 3.3778187363701323e-05, + "loss": 0.2073, + "step": 25073 + }, + { + "epoch": 0.4472229158491778, + "grad_norm": 0.30490902066230774, + "learning_rate": 3.3776729945585364e-05, + "loss": 0.2047, + "step": 25074 + }, + { + "epoch": 0.44724075197089147, + "grad_norm": 0.22402484714984894, + "learning_rate": 3.377527249344776e-05, + "loss": 0.1766, + "step": 25075 + }, + { + "epoch": 0.44725858809260516, + "grad_norm": 0.23545986413955688, + "learning_rate": 3.377381500729417e-05, + "loss": 0.1764, + "step": 25076 + }, + { + "epoch": 0.44727642421431885, + "grad_norm": 0.21871119737625122, + "learning_rate": 3.3772357487130245e-05, + "loss": 0.1424, + "step": 25077 + }, + { + "epoch": 0.44729426033603253, + "grad_norm": 0.27139389514923096, + "learning_rate": 3.377089993296164e-05, + "loss": 0.1333, + "step": 25078 + }, + { + "epoch": 0.4473120964577462, + "grad_norm": 0.2520368993282318, + "learning_rate": 3.376944234479398e-05, + "loss": 0.1648, + "step": 25079 + }, + { + "epoch": 0.4473299325794599, + "grad_norm": 0.2421381175518036, + "learning_rate": 3.376798472263295e-05, + "loss": 0.1336, + "step": 25080 + }, + { + "epoch": 0.4473477687011736, + "grad_norm": 0.3012990355491638, + "learning_rate": 3.376652706648417e-05, + "loss": 0.1724, + "step": 25081 + }, + { + "epoch": 0.4473656048228873, + "grad_norm": 0.26099875569343567, + "learning_rate": 3.3765069376353315e-05, + "loss": 0.0883, + "step": 25082 + }, + { + "epoch": 0.44738344094460103, + "grad_norm": 0.22884108126163483, + "learning_rate": 3.376361165224601e-05, + "loss": 0.1318, + "step": 25083 + }, + { + "epoch": 0.4474012770663147, + "grad_norm": 0.3147015869617462, + "learning_rate": 3.376215389416794e-05, + "loss": 0.1369, + "step": 25084 + }, + { + "epoch": 0.4474191131880284, + "grad_norm": 0.2578750550746918, + "learning_rate": 3.376069610212471e-05, + "loss": 0.1263, + "step": 25085 + }, + { + "epoch": 0.4474369493097421, + "grad_norm": 0.2533610165119171, + "learning_rate": 3.375923827612201e-05, + "loss": 0.1963, + "step": 25086 + }, + { + "epoch": 0.4474547854314558, + "grad_norm": 0.2715403437614441, + "learning_rate": 3.375778041616548e-05, + "loss": 0.1646, + "step": 25087 + }, + { + "epoch": 0.44747262155316947, + "grad_norm": 0.4554165005683899, + "learning_rate": 3.375632252226076e-05, + "loss": 0.1114, + "step": 25088 + }, + { + "epoch": 0.44749045767488316, + "grad_norm": 0.27990132570266724, + "learning_rate": 3.375486459441351e-05, + "loss": 0.1508, + "step": 25089 + }, + { + "epoch": 0.44750829379659685, + "grad_norm": 0.28010523319244385, + "learning_rate": 3.375340663262939e-05, + "loss": 0.1291, + "step": 25090 + }, + { + "epoch": 0.4475261299183106, + "grad_norm": 0.2398664802312851, + "learning_rate": 3.3751948636914033e-05, + "loss": 0.1386, + "step": 25091 + }, + { + "epoch": 0.4475439660400243, + "grad_norm": 0.2075163871049881, + "learning_rate": 3.37504906072731e-05, + "loss": 0.1358, + "step": 25092 + }, + { + "epoch": 0.44756180216173796, + "grad_norm": 0.198046013712883, + "learning_rate": 3.374903254371225e-05, + "loss": 0.1533, + "step": 25093 + }, + { + "epoch": 0.44757963828345165, + "grad_norm": 0.32511812448501587, + "learning_rate": 3.374757444623712e-05, + "loss": 0.1015, + "step": 25094 + }, + { + "epoch": 0.44759747440516534, + "grad_norm": 0.2876203656196594, + "learning_rate": 3.3746116314853384e-05, + "loss": 0.1314, + "step": 25095 + }, + { + "epoch": 0.44761531052687903, + "grad_norm": 0.2246880978345871, + "learning_rate": 3.3744658149566666e-05, + "loss": 0.1247, + "step": 25096 + }, + { + "epoch": 0.4476331466485927, + "grad_norm": 0.4224717319011688, + "learning_rate": 3.3743199950382645e-05, + "loss": 0.1575, + "step": 25097 + }, + { + "epoch": 0.4476509827703064, + "grad_norm": 0.2676776349544525, + "learning_rate": 3.374174171730695e-05, + "loss": 0.1273, + "step": 25098 + }, + { + "epoch": 0.4476688188920201, + "grad_norm": 0.35073116421699524, + "learning_rate": 3.374028345034525e-05, + "loss": 0.1305, + "step": 25099 + }, + { + "epoch": 0.44768665501373384, + "grad_norm": 0.3417600095272064, + "learning_rate": 3.373882514950319e-05, + "loss": 0.1209, + "step": 25100 + }, + { + "epoch": 0.4477044911354475, + "grad_norm": 0.26098814606666565, + "learning_rate": 3.373736681478643e-05, + "loss": 0.1647, + "step": 25101 + }, + { + "epoch": 0.4477223272571612, + "grad_norm": 0.2984170913696289, + "learning_rate": 3.373590844620062e-05, + "loss": 0.1723, + "step": 25102 + }, + { + "epoch": 0.4477401633788749, + "grad_norm": 0.2330995351076126, + "learning_rate": 3.3734450043751403e-05, + "loss": 0.1413, + "step": 25103 + }, + { + "epoch": 0.4477579995005886, + "grad_norm": 0.26513171195983887, + "learning_rate": 3.373299160744444e-05, + "loss": 0.1861, + "step": 25104 + }, + { + "epoch": 0.4477758356223023, + "grad_norm": 0.22102831304073334, + "learning_rate": 3.37315331372854e-05, + "loss": 0.1541, + "step": 25105 + }, + { + "epoch": 0.44779367174401596, + "grad_norm": 0.2039075344800949, + "learning_rate": 3.373007463327991e-05, + "loss": 0.1405, + "step": 25106 + }, + { + "epoch": 0.44781150786572965, + "grad_norm": 0.26761913299560547, + "learning_rate": 3.372861609543363e-05, + "loss": 0.1338, + "step": 25107 + }, + { + "epoch": 0.4478293439874434, + "grad_norm": 0.24937881529331207, + "learning_rate": 3.372715752375223e-05, + "loss": 0.1348, + "step": 25108 + }, + { + "epoch": 0.4478471801091571, + "grad_norm": 0.2639763355255127, + "learning_rate": 3.372569891824135e-05, + "loss": 0.1474, + "step": 25109 + }, + { + "epoch": 0.44786501623087077, + "grad_norm": 0.40596500039100647, + "learning_rate": 3.3724240278906646e-05, + "loss": 0.1187, + "step": 25110 + }, + { + "epoch": 0.44788285235258446, + "grad_norm": 0.23569276928901672, + "learning_rate": 3.372278160575377e-05, + "loss": 0.1687, + "step": 25111 + }, + { + "epoch": 0.44790068847429815, + "grad_norm": 0.29724204540252686, + "learning_rate": 3.3721322898788394e-05, + "loss": 0.1677, + "step": 25112 + }, + { + "epoch": 0.44791852459601184, + "grad_norm": 0.2467195838689804, + "learning_rate": 3.371986415801615e-05, + "loss": 0.1944, + "step": 25113 + }, + { + "epoch": 0.4479363607177255, + "grad_norm": 0.41708070039749146, + "learning_rate": 3.3718405383442694e-05, + "loss": 0.1971, + "step": 25114 + }, + { + "epoch": 0.4479541968394392, + "grad_norm": 0.23805159330368042, + "learning_rate": 3.371694657507369e-05, + "loss": 0.1195, + "step": 25115 + }, + { + "epoch": 0.4479720329611529, + "grad_norm": 0.2947307527065277, + "learning_rate": 3.37154877329148e-05, + "loss": 0.1794, + "step": 25116 + }, + { + "epoch": 0.44798986908286664, + "grad_norm": 0.34023165702819824, + "learning_rate": 3.371402885697166e-05, + "loss": 0.1806, + "step": 25117 + }, + { + "epoch": 0.44800770520458033, + "grad_norm": 0.2649897038936615, + "learning_rate": 3.371256994724994e-05, + "loss": 0.1659, + "step": 25118 + }, + { + "epoch": 0.448025541326294, + "grad_norm": 0.2599446773529053, + "learning_rate": 3.371111100375528e-05, + "loss": 0.1653, + "step": 25119 + }, + { + "epoch": 0.4480433774480077, + "grad_norm": 0.29039278626441956, + "learning_rate": 3.370965202649335e-05, + "loss": 0.1642, + "step": 25120 + }, + { + "epoch": 0.4480612135697214, + "grad_norm": 0.2501363158226013, + "learning_rate": 3.37081930154698e-05, + "loss": 0.1195, + "step": 25121 + }, + { + "epoch": 0.4480790496914351, + "grad_norm": 0.38731157779693604, + "learning_rate": 3.370673397069029e-05, + "loss": 0.247, + "step": 25122 + }, + { + "epoch": 0.44809688581314877, + "grad_norm": 0.3547469973564148, + "learning_rate": 3.370527489216048e-05, + "loss": 0.1358, + "step": 25123 + }, + { + "epoch": 0.44811472193486246, + "grad_norm": 0.28627923130989075, + "learning_rate": 3.3703815779886e-05, + "loss": 0.1557, + "step": 25124 + }, + { + "epoch": 0.4481325580565762, + "grad_norm": 0.2766645848751068, + "learning_rate": 3.3702356633872536e-05, + "loss": 0.144, + "step": 25125 + }, + { + "epoch": 0.4481503941782899, + "grad_norm": 0.23358498513698578, + "learning_rate": 3.370089745412572e-05, + "loss": 0.121, + "step": 25126 + }, + { + "epoch": 0.4481682303000036, + "grad_norm": 0.2869694232940674, + "learning_rate": 3.369943824065123e-05, + "loss": 0.1304, + "step": 25127 + }, + { + "epoch": 0.44818606642171727, + "grad_norm": 0.13642704486846924, + "learning_rate": 3.369797899345471e-05, + "loss": 0.1054, + "step": 25128 + }, + { + "epoch": 0.44820390254343095, + "grad_norm": 0.25332969427108765, + "learning_rate": 3.3696519712541825e-05, + "loss": 0.133, + "step": 25129 + }, + { + "epoch": 0.44822173866514464, + "grad_norm": 0.20523595809936523, + "learning_rate": 3.3695060397918216e-05, + "loss": 0.13, + "step": 25130 + }, + { + "epoch": 0.44823957478685833, + "grad_norm": 0.2310948371887207, + "learning_rate": 3.369360104958956e-05, + "loss": 0.1487, + "step": 25131 + }, + { + "epoch": 0.448257410908572, + "grad_norm": 0.2618043124675751, + "learning_rate": 3.36921416675615e-05, + "loss": 0.1664, + "step": 25132 + }, + { + "epoch": 0.44827524703028576, + "grad_norm": 0.30424538254737854, + "learning_rate": 3.369068225183969e-05, + "loss": 0.1566, + "step": 25133 + }, + { + "epoch": 0.44829308315199945, + "grad_norm": 0.29187119007110596, + "learning_rate": 3.36892228024298e-05, + "loss": 0.1664, + "step": 25134 + }, + { + "epoch": 0.44831091927371314, + "grad_norm": 0.2773895561695099, + "learning_rate": 3.368776331933748e-05, + "loss": 0.1563, + "step": 25135 + }, + { + "epoch": 0.4483287553954268, + "grad_norm": 0.2607872784137726, + "learning_rate": 3.3686303802568384e-05, + "loss": 0.1919, + "step": 25136 + }, + { + "epoch": 0.4483465915171405, + "grad_norm": 0.20376341044902802, + "learning_rate": 3.3684844252128186e-05, + "loss": 0.1341, + "step": 25137 + }, + { + "epoch": 0.4483644276388542, + "grad_norm": 0.31892910599708557, + "learning_rate": 3.368338466802252e-05, + "loss": 0.172, + "step": 25138 + }, + { + "epoch": 0.4483822637605679, + "grad_norm": 0.3441879153251648, + "learning_rate": 3.368192505025706e-05, + "loss": 0.1556, + "step": 25139 + }, + { + "epoch": 0.4484000998822816, + "grad_norm": 0.3495695888996124, + "learning_rate": 3.3680465398837465e-05, + "loss": 0.2275, + "step": 25140 + }, + { + "epoch": 0.44841793600399527, + "grad_norm": 0.5058633685112, + "learning_rate": 3.367900571376938e-05, + "loss": 0.1151, + "step": 25141 + }, + { + "epoch": 0.448435772125709, + "grad_norm": 0.27564477920532227, + "learning_rate": 3.367754599505848e-05, + "loss": 0.1348, + "step": 25142 + }, + { + "epoch": 0.4484536082474227, + "grad_norm": 0.2550720274448395, + "learning_rate": 3.36760862427104e-05, + "loss": 0.1876, + "step": 25143 + }, + { + "epoch": 0.4484714443691364, + "grad_norm": 0.2571973502635956, + "learning_rate": 3.3674626456730826e-05, + "loss": 0.1695, + "step": 25144 + }, + { + "epoch": 0.4484892804908501, + "grad_norm": 0.20841741561889648, + "learning_rate": 3.367316663712541e-05, + "loss": 0.1759, + "step": 25145 + }, + { + "epoch": 0.44850711661256376, + "grad_norm": 0.3069261908531189, + "learning_rate": 3.3671706783899795e-05, + "loss": 0.1077, + "step": 25146 + }, + { + "epoch": 0.44852495273427745, + "grad_norm": 0.37278303503990173, + "learning_rate": 3.367024689705965e-05, + "loss": 0.1035, + "step": 25147 + }, + { + "epoch": 0.44854278885599114, + "grad_norm": 0.2994999587535858, + "learning_rate": 3.3668786976610625e-05, + "loss": 0.1421, + "step": 25148 + }, + { + "epoch": 0.4485606249777048, + "grad_norm": 0.3582760691642761, + "learning_rate": 3.36673270225584e-05, + "loss": 0.1724, + "step": 25149 + }, + { + "epoch": 0.44857846109941857, + "grad_norm": 0.21472257375717163, + "learning_rate": 3.3665867034908615e-05, + "loss": 0.1869, + "step": 25150 + }, + { + "epoch": 0.44859629722113226, + "grad_norm": 0.23597067594528198, + "learning_rate": 3.3664407013666946e-05, + "loss": 0.1445, + "step": 25151 + }, + { + "epoch": 0.44861413334284594, + "grad_norm": 0.24522598087787628, + "learning_rate": 3.366294695883903e-05, + "loss": 0.1681, + "step": 25152 + }, + { + "epoch": 0.44863196946455963, + "grad_norm": 0.25341346859931946, + "learning_rate": 3.366148687043055e-05, + "loss": 0.1566, + "step": 25153 + }, + { + "epoch": 0.4486498055862733, + "grad_norm": 0.3028821647167206, + "learning_rate": 3.3660026748447146e-05, + "loss": 0.1463, + "step": 25154 + }, + { + "epoch": 0.448667641707987, + "grad_norm": 0.29849863052368164, + "learning_rate": 3.365856659289449e-05, + "loss": 0.1714, + "step": 25155 + }, + { + "epoch": 0.4486854778297007, + "grad_norm": 0.5199018716812134, + "learning_rate": 3.365710640377824e-05, + "loss": 0.1567, + "step": 25156 + }, + { + "epoch": 0.4487033139514144, + "grad_norm": 0.27346059679985046, + "learning_rate": 3.3655646181104056e-05, + "loss": 0.1868, + "step": 25157 + }, + { + "epoch": 0.44872115007312807, + "grad_norm": 0.2995662987232208, + "learning_rate": 3.36541859248776e-05, + "loss": 0.1333, + "step": 25158 + }, + { + "epoch": 0.4487389861948418, + "grad_norm": 0.22186905145645142, + "learning_rate": 3.365272563510453e-05, + "loss": 0.1153, + "step": 25159 + }, + { + "epoch": 0.4487568223165555, + "grad_norm": 0.26111921668052673, + "learning_rate": 3.365126531179051e-05, + "loss": 0.1798, + "step": 25160 + }, + { + "epoch": 0.4487746584382692, + "grad_norm": 0.28916266560554504, + "learning_rate": 3.3649804954941186e-05, + "loss": 0.1668, + "step": 25161 + }, + { + "epoch": 0.4487924945599829, + "grad_norm": 0.21810342371463776, + "learning_rate": 3.364834456456224e-05, + "loss": 0.1619, + "step": 25162 + }, + { + "epoch": 0.44881033068169657, + "grad_norm": 0.32179510593414307, + "learning_rate": 3.3646884140659315e-05, + "loss": 0.2116, + "step": 25163 + }, + { + "epoch": 0.44882816680341026, + "grad_norm": 0.22521504759788513, + "learning_rate": 3.364542368323809e-05, + "loss": 0.072, + "step": 25164 + }, + { + "epoch": 0.44884600292512394, + "grad_norm": 0.22853060066699982, + "learning_rate": 3.364396319230421e-05, + "loss": 0.1747, + "step": 25165 + }, + { + "epoch": 0.44886383904683763, + "grad_norm": 0.22562773525714874, + "learning_rate": 3.364250266786335e-05, + "loss": 0.1329, + "step": 25166 + }, + { + "epoch": 0.4488816751685514, + "grad_norm": 0.32169923186302185, + "learning_rate": 3.364104210992116e-05, + "loss": 0.1501, + "step": 25167 + }, + { + "epoch": 0.44889951129026506, + "grad_norm": 0.3001664876937866, + "learning_rate": 3.363958151848331e-05, + "loss": 0.0976, + "step": 25168 + }, + { + "epoch": 0.44891734741197875, + "grad_norm": 0.26077699661254883, + "learning_rate": 3.363812089355545e-05, + "loss": 0.1592, + "step": 25169 + }, + { + "epoch": 0.44893518353369244, + "grad_norm": 0.26564738154411316, + "learning_rate": 3.363666023514326e-05, + "loss": 0.1729, + "step": 25170 + }, + { + "epoch": 0.4489530196554061, + "grad_norm": 0.2562617063522339, + "learning_rate": 3.363519954325239e-05, + "loss": 0.1463, + "step": 25171 + }, + { + "epoch": 0.4489708557771198, + "grad_norm": 0.2161594182252884, + "learning_rate": 3.3633738817888495e-05, + "loss": 0.0988, + "step": 25172 + }, + { + "epoch": 0.4489886918988335, + "grad_norm": 0.4524584412574768, + "learning_rate": 3.363227805905725e-05, + "loss": 0.2153, + "step": 25173 + }, + { + "epoch": 0.4490065280205472, + "grad_norm": 0.36006593704223633, + "learning_rate": 3.363081726676432e-05, + "loss": 0.18, + "step": 25174 + }, + { + "epoch": 0.44902436414226093, + "grad_norm": 0.31347933411598206, + "learning_rate": 3.362935644101536e-05, + "loss": 0.152, + "step": 25175 + }, + { + "epoch": 0.4490422002639746, + "grad_norm": 0.2436746209859848, + "learning_rate": 3.3627895581816025e-05, + "loss": 0.1652, + "step": 25176 + }, + { + "epoch": 0.4490600363856883, + "grad_norm": 0.34364891052246094, + "learning_rate": 3.3626434689172e-05, + "loss": 0.1097, + "step": 25177 + }, + { + "epoch": 0.449077872507402, + "grad_norm": 0.2105472981929779, + "learning_rate": 3.362497376308892e-05, + "loss": 0.1337, + "step": 25178 + }, + { + "epoch": 0.4490957086291157, + "grad_norm": 0.2542533576488495, + "learning_rate": 3.362351280357248e-05, + "loss": 0.1936, + "step": 25179 + }, + { + "epoch": 0.4491135447508294, + "grad_norm": 0.25485458970069885, + "learning_rate": 3.362205181062831e-05, + "loss": 0.1562, + "step": 25180 + }, + { + "epoch": 0.44913138087254306, + "grad_norm": 0.3123902678489685, + "learning_rate": 3.36205907842621e-05, + "loss": 0.1603, + "step": 25181 + }, + { + "epoch": 0.44914921699425675, + "grad_norm": 0.25344014167785645, + "learning_rate": 3.3619129724479495e-05, + "loss": 0.1416, + "step": 25182 + }, + { + "epoch": 0.44916705311597044, + "grad_norm": 0.25669312477111816, + "learning_rate": 3.361766863128617e-05, + "loss": 0.1422, + "step": 25183 + }, + { + "epoch": 0.4491848892376842, + "grad_norm": 0.28137296438217163, + "learning_rate": 3.361620750468779e-05, + "loss": 0.1775, + "step": 25184 + }, + { + "epoch": 0.44920272535939787, + "grad_norm": 0.30618470907211304, + "learning_rate": 3.361474634469e-05, + "loss": 0.1601, + "step": 25185 + }, + { + "epoch": 0.44922056148111156, + "grad_norm": 0.2281280755996704, + "learning_rate": 3.361328515129849e-05, + "loss": 0.1266, + "step": 25186 + }, + { + "epoch": 0.44923839760282525, + "grad_norm": 0.2970804274082184, + "learning_rate": 3.361182392451891e-05, + "loss": 0.1932, + "step": 25187 + }, + { + "epoch": 0.44925623372453893, + "grad_norm": 0.22811521589756012, + "learning_rate": 3.3610362664356925e-05, + "loss": 0.1474, + "step": 25188 + }, + { + "epoch": 0.4492740698462526, + "grad_norm": 0.3088405430316925, + "learning_rate": 3.3608901370818196e-05, + "loss": 0.1466, + "step": 25189 + }, + { + "epoch": 0.4492919059679663, + "grad_norm": 0.32938724756240845, + "learning_rate": 3.3607440043908395e-05, + "loss": 0.1593, + "step": 25190 + }, + { + "epoch": 0.44930974208968, + "grad_norm": 0.2268843799829483, + "learning_rate": 3.360597868363318e-05, + "loss": 0.1707, + "step": 25191 + }, + { + "epoch": 0.44932757821139374, + "grad_norm": 0.23305165767669678, + "learning_rate": 3.360451728999823e-05, + "loss": 0.1421, + "step": 25192 + }, + { + "epoch": 0.44934541433310743, + "grad_norm": 0.28758129477500916, + "learning_rate": 3.360305586300919e-05, + "loss": 0.1548, + "step": 25193 + }, + { + "epoch": 0.4493632504548211, + "grad_norm": 0.21277910470962524, + "learning_rate": 3.3601594402671735e-05, + "loss": 0.1202, + "step": 25194 + }, + { + "epoch": 0.4493810865765348, + "grad_norm": 0.2662602663040161, + "learning_rate": 3.360013290899153e-05, + "loss": 0.1862, + "step": 25195 + }, + { + "epoch": 0.4493989226982485, + "grad_norm": 0.2004675716161728, + "learning_rate": 3.359867138197424e-05, + "loss": 0.1165, + "step": 25196 + }, + { + "epoch": 0.4494167588199622, + "grad_norm": 0.22816108167171478, + "learning_rate": 3.359720982162553e-05, + "loss": 0.1877, + "step": 25197 + }, + { + "epoch": 0.44943459494167587, + "grad_norm": 0.24822060763835907, + "learning_rate": 3.359574822795106e-05, + "loss": 0.1655, + "step": 25198 + }, + { + "epoch": 0.44945243106338956, + "grad_norm": 0.303446501493454, + "learning_rate": 3.3594286600956506e-05, + "loss": 0.1152, + "step": 25199 + }, + { + "epoch": 0.44947026718510324, + "grad_norm": 0.2870773673057556, + "learning_rate": 3.359282494064753e-05, + "loss": 0.1713, + "step": 25200 + }, + { + "epoch": 0.449488103306817, + "grad_norm": 0.3076757490634918, + "learning_rate": 3.35913632470298e-05, + "loss": 0.1499, + "step": 25201 + }, + { + "epoch": 0.4495059394285307, + "grad_norm": 0.2821045517921448, + "learning_rate": 3.358990152010897e-05, + "loss": 0.181, + "step": 25202 + }, + { + "epoch": 0.44952377555024436, + "grad_norm": 0.2755041718482971, + "learning_rate": 3.358843975989072e-05, + "loss": 0.1975, + "step": 25203 + }, + { + "epoch": 0.44954161167195805, + "grad_norm": 0.18216709792613983, + "learning_rate": 3.358697796638071e-05, + "loss": 0.1075, + "step": 25204 + }, + { + "epoch": 0.44955944779367174, + "grad_norm": 0.3075617253780365, + "learning_rate": 3.358551613958461e-05, + "loss": 0.1647, + "step": 25205 + }, + { + "epoch": 0.44957728391538543, + "grad_norm": 0.2305593639612198, + "learning_rate": 3.3584054279508076e-05, + "loss": 0.1173, + "step": 25206 + }, + { + "epoch": 0.4495951200370991, + "grad_norm": 0.2448464035987854, + "learning_rate": 3.358259238615679e-05, + "loss": 0.1632, + "step": 25207 + }, + { + "epoch": 0.4496129561588128, + "grad_norm": 0.19121719896793365, + "learning_rate": 3.3581130459536406e-05, + "loss": 0.1178, + "step": 25208 + }, + { + "epoch": 0.44963079228052655, + "grad_norm": 0.2409624606370926, + "learning_rate": 3.3579668499652605e-05, + "loss": 0.145, + "step": 25209 + }, + { + "epoch": 0.44964862840224024, + "grad_norm": 0.22802075743675232, + "learning_rate": 3.357820650651104e-05, + "loss": 0.1153, + "step": 25210 + }, + { + "epoch": 0.4496664645239539, + "grad_norm": 0.2209000289440155, + "learning_rate": 3.3576744480117384e-05, + "loss": 0.1347, + "step": 25211 + }, + { + "epoch": 0.4496843006456676, + "grad_norm": 0.34943556785583496, + "learning_rate": 3.357528242047731e-05, + "loss": 0.1787, + "step": 25212 + }, + { + "epoch": 0.4497021367673813, + "grad_norm": 0.20893093943595886, + "learning_rate": 3.357382032759647e-05, + "loss": 0.1175, + "step": 25213 + }, + { + "epoch": 0.449719972889095, + "grad_norm": 0.2644076943397522, + "learning_rate": 3.3572358201480545e-05, + "loss": 0.1645, + "step": 25214 + }, + { + "epoch": 0.4497378090108087, + "grad_norm": 0.25215739011764526, + "learning_rate": 3.35708960421352e-05, + "loss": 0.127, + "step": 25215 + }, + { + "epoch": 0.44975564513252236, + "grad_norm": 0.20632304251194, + "learning_rate": 3.3569433849566105e-05, + "loss": 0.0626, + "step": 25216 + }, + { + "epoch": 0.44977348125423605, + "grad_norm": 0.21374443173408508, + "learning_rate": 3.356797162377892e-05, + "loss": 0.1094, + "step": 25217 + }, + { + "epoch": 0.4497913173759498, + "grad_norm": 0.22069582343101501, + "learning_rate": 3.3566509364779315e-05, + "loss": 0.1199, + "step": 25218 + }, + { + "epoch": 0.4498091534976635, + "grad_norm": 0.25240421295166016, + "learning_rate": 3.3565047072572966e-05, + "loss": 0.1301, + "step": 25219 + }, + { + "epoch": 0.44982698961937717, + "grad_norm": 0.20521599054336548, + "learning_rate": 3.3563584747165535e-05, + "loss": 0.126, + "step": 25220 + }, + { + "epoch": 0.44984482574109086, + "grad_norm": 0.4605122208595276, + "learning_rate": 3.356212238856269e-05, + "loss": 0.1837, + "step": 25221 + }, + { + "epoch": 0.44986266186280455, + "grad_norm": 0.22142378985881805, + "learning_rate": 3.3560659996770103e-05, + "loss": 0.1468, + "step": 25222 + }, + { + "epoch": 0.44988049798451824, + "grad_norm": 0.2981818914413452, + "learning_rate": 3.355919757179344e-05, + "loss": 0.1744, + "step": 25223 + }, + { + "epoch": 0.4498983341062319, + "grad_norm": 0.2656365931034088, + "learning_rate": 3.3557735113638375e-05, + "loss": 0.1332, + "step": 25224 + }, + { + "epoch": 0.4499161702279456, + "grad_norm": 0.2899135947227478, + "learning_rate": 3.355627262231057e-05, + "loss": 0.1583, + "step": 25225 + }, + { + "epoch": 0.44993400634965935, + "grad_norm": 0.28080734610557556, + "learning_rate": 3.355481009781569e-05, + "loss": 0.1788, + "step": 25226 + }, + { + "epoch": 0.44995184247137304, + "grad_norm": 0.25094074010849, + "learning_rate": 3.355334754015943e-05, + "loss": 0.1913, + "step": 25227 + }, + { + "epoch": 0.44996967859308673, + "grad_norm": 0.2630382776260376, + "learning_rate": 3.3551884949347415e-05, + "loss": 0.1594, + "step": 25228 + }, + { + "epoch": 0.4499875147148004, + "grad_norm": 0.2984447479248047, + "learning_rate": 3.355042232538536e-05, + "loss": 0.1683, + "step": 25229 + }, + { + "epoch": 0.4500053508365141, + "grad_norm": 0.22763073444366455, + "learning_rate": 3.35489596682789e-05, + "loss": 0.1868, + "step": 25230 + }, + { + "epoch": 0.4500231869582278, + "grad_norm": 0.20442494750022888, + "learning_rate": 3.3547496978033734e-05, + "loss": 0.124, + "step": 25231 + }, + { + "epoch": 0.4500410230799415, + "grad_norm": 0.243928924202919, + "learning_rate": 3.354603425465551e-05, + "loss": 0.1297, + "step": 25232 + }, + { + "epoch": 0.45005885920165517, + "grad_norm": 0.2313058078289032, + "learning_rate": 3.354457149814991e-05, + "loss": 0.1001, + "step": 25233 + }, + { + "epoch": 0.4500766953233689, + "grad_norm": 0.34878385066986084, + "learning_rate": 3.35431087085226e-05, + "loss": 0.13, + "step": 25234 + }, + { + "epoch": 0.4500945314450826, + "grad_norm": 0.2583732008934021, + "learning_rate": 3.354164588577925e-05, + "loss": 0.1457, + "step": 25235 + }, + { + "epoch": 0.4501123675667963, + "grad_norm": 0.27169570326805115, + "learning_rate": 3.3540183029925526e-05, + "loss": 0.1823, + "step": 25236 + }, + { + "epoch": 0.45013020368851, + "grad_norm": 0.2585698068141937, + "learning_rate": 3.3538720140967105e-05, + "loss": 0.125, + "step": 25237 + }, + { + "epoch": 0.45014803981022367, + "grad_norm": 0.3255886733531952, + "learning_rate": 3.353725721890965e-05, + "loss": 0.1878, + "step": 25238 + }, + { + "epoch": 0.45016587593193735, + "grad_norm": 0.24168072640895844, + "learning_rate": 3.353579426375884e-05, + "loss": 0.1227, + "step": 25239 + }, + { + "epoch": 0.45018371205365104, + "grad_norm": 0.2801237404346466, + "learning_rate": 3.353433127552035e-05, + "loss": 0.1856, + "step": 25240 + }, + { + "epoch": 0.45020154817536473, + "grad_norm": 0.22060294449329376, + "learning_rate": 3.353286825419984e-05, + "loss": 0.1727, + "step": 25241 + }, + { + "epoch": 0.4502193842970784, + "grad_norm": 0.28346458077430725, + "learning_rate": 3.3531405199802986e-05, + "loss": 0.1658, + "step": 25242 + }, + { + "epoch": 0.45023722041879216, + "grad_norm": 0.22537072002887726, + "learning_rate": 3.3529942112335455e-05, + "loss": 0.1703, + "step": 25243 + }, + { + "epoch": 0.45025505654050585, + "grad_norm": 0.2824644446372986, + "learning_rate": 3.352847899180293e-05, + "loss": 0.175, + "step": 25244 + }, + { + "epoch": 0.45027289266221954, + "grad_norm": 0.3069266676902771, + "learning_rate": 3.3527015838211066e-05, + "loss": 0.2013, + "step": 25245 + }, + { + "epoch": 0.4502907287839332, + "grad_norm": 0.25050270557403564, + "learning_rate": 3.3525552651565544e-05, + "loss": 0.1497, + "step": 25246 + }, + { + "epoch": 0.4503085649056469, + "grad_norm": 0.2628241181373596, + "learning_rate": 3.352408943187204e-05, + "loss": 0.1866, + "step": 25247 + }, + { + "epoch": 0.4503264010273606, + "grad_norm": 0.2206946611404419, + "learning_rate": 3.352262617913622e-05, + "loss": 0.1617, + "step": 25248 + }, + { + "epoch": 0.4503442371490743, + "grad_norm": 0.3750324845314026, + "learning_rate": 3.352116289336376e-05, + "loss": 0.1572, + "step": 25249 + }, + { + "epoch": 0.450362073270788, + "grad_norm": 0.22658652067184448, + "learning_rate": 3.3519699574560325e-05, + "loss": 0.1438, + "step": 25250 + }, + { + "epoch": 0.4503799093925017, + "grad_norm": 0.24892501533031464, + "learning_rate": 3.35182362227316e-05, + "loss": 0.1207, + "step": 25251 + }, + { + "epoch": 0.4503977455142154, + "grad_norm": 0.3095797002315521, + "learning_rate": 3.351677283788324e-05, + "loss": 0.1522, + "step": 25252 + }, + { + "epoch": 0.4504155816359291, + "grad_norm": 0.3030305504798889, + "learning_rate": 3.351530942002093e-05, + "loss": 0.1753, + "step": 25253 + }, + { + "epoch": 0.4504334177576428, + "grad_norm": 0.2439109981060028, + "learning_rate": 3.351384596915034e-05, + "loss": 0.1364, + "step": 25254 + }, + { + "epoch": 0.4504512538793565, + "grad_norm": 0.329201877117157, + "learning_rate": 3.351238248527714e-05, + "loss": 0.1087, + "step": 25255 + }, + { + "epoch": 0.45046909000107016, + "grad_norm": 0.30045226216316223, + "learning_rate": 3.351091896840701e-05, + "loss": 0.1632, + "step": 25256 + }, + { + "epoch": 0.45048692612278385, + "grad_norm": 0.27069029211997986, + "learning_rate": 3.3509455418545615e-05, + "loss": 0.1947, + "step": 25257 + }, + { + "epoch": 0.45050476224449754, + "grad_norm": 0.34230273962020874, + "learning_rate": 3.350799183569863e-05, + "loss": 0.114, + "step": 25258 + }, + { + "epoch": 0.4505225983662112, + "grad_norm": 0.48819881677627563, + "learning_rate": 3.3506528219871733e-05, + "loss": 0.1565, + "step": 25259 + }, + { + "epoch": 0.45054043448792497, + "grad_norm": 0.22449827194213867, + "learning_rate": 3.350506457107059e-05, + "loss": 0.134, + "step": 25260 + }, + { + "epoch": 0.45055827060963866, + "grad_norm": 0.2970493733882904, + "learning_rate": 3.350360088930088e-05, + "loss": 0.2096, + "step": 25261 + }, + { + "epoch": 0.45057610673135234, + "grad_norm": 0.2858191728591919, + "learning_rate": 3.3502137174568286e-05, + "loss": 0.1456, + "step": 25262 + }, + { + "epoch": 0.45059394285306603, + "grad_norm": 0.23565861582756042, + "learning_rate": 3.3500673426878465e-05, + "loss": 0.194, + "step": 25263 + }, + { + "epoch": 0.4506117789747797, + "grad_norm": 0.30705341696739197, + "learning_rate": 3.34992096462371e-05, + "loss": 0.123, + "step": 25264 + }, + { + "epoch": 0.4506296150964934, + "grad_norm": 0.33568763732910156, + "learning_rate": 3.349774583264986e-05, + "loss": 0.1909, + "step": 25265 + }, + { + "epoch": 0.4506474512182071, + "grad_norm": 0.29980939626693726, + "learning_rate": 3.3496281986122426e-05, + "loss": 0.1694, + "step": 25266 + }, + { + "epoch": 0.4506652873399208, + "grad_norm": 0.27742552757263184, + "learning_rate": 3.349481810666046e-05, + "loss": 0.1191, + "step": 25267 + }, + { + "epoch": 0.4506831234616345, + "grad_norm": 0.2611779570579529, + "learning_rate": 3.349335419426966e-05, + "loss": 0.1285, + "step": 25268 + }, + { + "epoch": 0.4507009595833482, + "grad_norm": 0.2133435606956482, + "learning_rate": 3.3491890248955665e-05, + "loss": 0.1137, + "step": 25269 + }, + { + "epoch": 0.4507187957050619, + "grad_norm": 0.3011770248413086, + "learning_rate": 3.349042627072418e-05, + "loss": 0.183, + "step": 25270 + }, + { + "epoch": 0.4507366318267756, + "grad_norm": 0.3069663643836975, + "learning_rate": 3.348896225958088e-05, + "loss": 0.1797, + "step": 25271 + }, + { + "epoch": 0.4507544679484893, + "grad_norm": 0.28608402609825134, + "learning_rate": 3.348749821553142e-05, + "loss": 0.1483, + "step": 25272 + }, + { + "epoch": 0.45077230407020297, + "grad_norm": 0.7763242721557617, + "learning_rate": 3.3486034138581486e-05, + "loss": 0.1449, + "step": 25273 + }, + { + "epoch": 0.45079014019191666, + "grad_norm": 0.27435481548309326, + "learning_rate": 3.348457002873675e-05, + "loss": 0.1446, + "step": 25274 + }, + { + "epoch": 0.45080797631363034, + "grad_norm": 0.21519720554351807, + "learning_rate": 3.34831058860029e-05, + "loss": 0.1579, + "step": 25275 + }, + { + "epoch": 0.45082581243534403, + "grad_norm": 0.26937273144721985, + "learning_rate": 3.3481641710385594e-05, + "loss": 0.1608, + "step": 25276 + }, + { + "epoch": 0.4508436485570578, + "grad_norm": 0.2907882630825043, + "learning_rate": 3.348017750189052e-05, + "loss": 0.1345, + "step": 25277 + }, + { + "epoch": 0.45086148467877146, + "grad_norm": 0.3035869002342224, + "learning_rate": 3.3478713260523344e-05, + "loss": 0.193, + "step": 25278 + }, + { + "epoch": 0.45087932080048515, + "grad_norm": 0.22311025857925415, + "learning_rate": 3.347724898628974e-05, + "loss": 0.1076, + "step": 25279 + }, + { + "epoch": 0.45089715692219884, + "grad_norm": 0.3103754222393036, + "learning_rate": 3.347578467919541e-05, + "loss": 0.1588, + "step": 25280 + }, + { + "epoch": 0.4509149930439125, + "grad_norm": 0.22730982303619385, + "learning_rate": 3.347432033924599e-05, + "loss": 0.1206, + "step": 25281 + }, + { + "epoch": 0.4509328291656262, + "grad_norm": 0.3011152148246765, + "learning_rate": 3.347285596644719e-05, + "loss": 0.1468, + "step": 25282 + }, + { + "epoch": 0.4509506652873399, + "grad_norm": 0.21181604266166687, + "learning_rate": 3.347139156080468e-05, + "loss": 0.1441, + "step": 25283 + }, + { + "epoch": 0.4509685014090536, + "grad_norm": 0.2664339244365692, + "learning_rate": 3.346992712232412e-05, + "loss": 0.1492, + "step": 25284 + }, + { + "epoch": 0.45098633753076733, + "grad_norm": 0.24919430911540985, + "learning_rate": 3.34684626510112e-05, + "loss": 0.1629, + "step": 25285 + }, + { + "epoch": 0.451004173652481, + "grad_norm": 0.27807408571243286, + "learning_rate": 3.34669981468716e-05, + "loss": 0.1261, + "step": 25286 + }, + { + "epoch": 0.4510220097741947, + "grad_norm": 0.33694398403167725, + "learning_rate": 3.3465533609910977e-05, + "loss": 0.1855, + "step": 25287 + }, + { + "epoch": 0.4510398458959084, + "grad_norm": 0.29241684079170227, + "learning_rate": 3.346406904013503e-05, + "loss": 0.1469, + "step": 25288 + }, + { + "epoch": 0.4510576820176221, + "grad_norm": 0.24050629138946533, + "learning_rate": 3.346260443754943e-05, + "loss": 0.1599, + "step": 25289 + }, + { + "epoch": 0.4510755181393358, + "grad_norm": 0.2322806715965271, + "learning_rate": 3.346113980215985e-05, + "loss": 0.1223, + "step": 25290 + }, + { + "epoch": 0.45109335426104946, + "grad_norm": 0.3003292381763458, + "learning_rate": 3.345967513397197e-05, + "loss": 0.1053, + "step": 25291 + }, + { + "epoch": 0.45111119038276315, + "grad_norm": 0.2685421109199524, + "learning_rate": 3.3458210432991464e-05, + "loss": 0.1231, + "step": 25292 + }, + { + "epoch": 0.4511290265044769, + "grad_norm": 0.30601227283477783, + "learning_rate": 3.3456745699224015e-05, + "loss": 0.1829, + "step": 25293 + }, + { + "epoch": 0.4511468626261906, + "grad_norm": 0.29467248916625977, + "learning_rate": 3.34552809326753e-05, + "loss": 0.1267, + "step": 25294 + }, + { + "epoch": 0.45116469874790427, + "grad_norm": 0.29698774218559265, + "learning_rate": 3.345381613335099e-05, + "loss": 0.1347, + "step": 25295 + }, + { + "epoch": 0.45118253486961796, + "grad_norm": 0.3743653893470764, + "learning_rate": 3.345235130125678e-05, + "loss": 0.1226, + "step": 25296 + }, + { + "epoch": 0.45120037099133165, + "grad_norm": 0.2456246316432953, + "learning_rate": 3.3450886436398324e-05, + "loss": 0.1349, + "step": 25297 + }, + { + "epoch": 0.45121820711304533, + "grad_norm": 0.2276928871870041, + "learning_rate": 3.344942153878133e-05, + "loss": 0.1077, + "step": 25298 + }, + { + "epoch": 0.451236043234759, + "grad_norm": 0.3575064241886139, + "learning_rate": 3.344795660841145e-05, + "loss": 0.1363, + "step": 25299 + }, + { + "epoch": 0.4512538793564727, + "grad_norm": 0.2665491998195648, + "learning_rate": 3.344649164529437e-05, + "loss": 0.1741, + "step": 25300 + }, + { + "epoch": 0.4512717154781864, + "grad_norm": 0.35520878434181213, + "learning_rate": 3.3445026649435773e-05, + "loss": 0.1228, + "step": 25301 + }, + { + "epoch": 0.45128955159990014, + "grad_norm": 0.30768412351608276, + "learning_rate": 3.3443561620841336e-05, + "loss": 0.1684, + "step": 25302 + }, + { + "epoch": 0.45130738772161383, + "grad_norm": 0.20322144031524658, + "learning_rate": 3.344209655951674e-05, + "loss": 0.16, + "step": 25303 + }, + { + "epoch": 0.4513252238433275, + "grad_norm": 0.28124427795410156, + "learning_rate": 3.344063146546766e-05, + "loss": 0.0946, + "step": 25304 + }, + { + "epoch": 0.4513430599650412, + "grad_norm": 0.2849627435207367, + "learning_rate": 3.343916633869978e-05, + "loss": 0.1254, + "step": 25305 + }, + { + "epoch": 0.4513608960867549, + "grad_norm": 0.26051315665245056, + "learning_rate": 3.343770117921877e-05, + "loss": 0.145, + "step": 25306 + }, + { + "epoch": 0.4513787322084686, + "grad_norm": 0.2562755346298218, + "learning_rate": 3.3436235987030315e-05, + "loss": 0.2008, + "step": 25307 + }, + { + "epoch": 0.45139656833018227, + "grad_norm": 0.2513726055622101, + "learning_rate": 3.34347707621401e-05, + "loss": 0.1298, + "step": 25308 + }, + { + "epoch": 0.45141440445189596, + "grad_norm": 0.27829447388648987, + "learning_rate": 3.34333055045538e-05, + "loss": 0.1271, + "step": 25309 + }, + { + "epoch": 0.4514322405736097, + "grad_norm": 0.3086654245853424, + "learning_rate": 3.34318402142771e-05, + "loss": 0.1415, + "step": 25310 + }, + { + "epoch": 0.4514500766953234, + "grad_norm": 0.3558303415775299, + "learning_rate": 3.3430374891315674e-05, + "loss": 0.1666, + "step": 25311 + }, + { + "epoch": 0.4514679128170371, + "grad_norm": 0.3510570228099823, + "learning_rate": 3.3428909535675195e-05, + "loss": 0.134, + "step": 25312 + }, + { + "epoch": 0.45148574893875076, + "grad_norm": 0.3693179488182068, + "learning_rate": 3.3427444147361356e-05, + "loss": 0.1587, + "step": 25313 + }, + { + "epoch": 0.45150358506046445, + "grad_norm": 0.25806477665901184, + "learning_rate": 3.3425978726379834e-05, + "loss": 0.1669, + "step": 25314 + }, + { + "epoch": 0.45152142118217814, + "grad_norm": 0.3023644685745239, + "learning_rate": 3.342451327273631e-05, + "loss": 0.1168, + "step": 25315 + }, + { + "epoch": 0.45153925730389183, + "grad_norm": 0.4080870747566223, + "learning_rate": 3.342304778643646e-05, + "loss": 0.1196, + "step": 25316 + }, + { + "epoch": 0.4515570934256055, + "grad_norm": 0.4201013445854187, + "learning_rate": 3.342158226748596e-05, + "loss": 0.2076, + "step": 25317 + }, + { + "epoch": 0.4515749295473192, + "grad_norm": 0.3449398875236511, + "learning_rate": 3.3420116715890515e-05, + "loss": 0.1874, + "step": 25318 + }, + { + "epoch": 0.45159276566903295, + "grad_norm": 0.29861119389533997, + "learning_rate": 3.341865113165577e-05, + "loss": 0.161, + "step": 25319 + }, + { + "epoch": 0.45161060179074664, + "grad_norm": 0.27511903643608093, + "learning_rate": 3.3417185514787446e-05, + "loss": 0.1134, + "step": 25320 + }, + { + "epoch": 0.4516284379124603, + "grad_norm": 0.2599897086620331, + "learning_rate": 3.3415719865291194e-05, + "loss": 0.1182, + "step": 25321 + }, + { + "epoch": 0.451646274034174, + "grad_norm": 0.3110343813896179, + "learning_rate": 3.34142541831727e-05, + "loss": 0.163, + "step": 25322 + }, + { + "epoch": 0.4516641101558877, + "grad_norm": 0.21736828982830048, + "learning_rate": 3.341278846843765e-05, + "loss": 0.1602, + "step": 25323 + }, + { + "epoch": 0.4516819462776014, + "grad_norm": 0.31393927335739136, + "learning_rate": 3.3411322721091735e-05, + "loss": 0.1516, + "step": 25324 + }, + { + "epoch": 0.4516997823993151, + "grad_norm": 0.22412462532520294, + "learning_rate": 3.3409856941140625e-05, + "loss": 0.1602, + "step": 25325 + }, + { + "epoch": 0.45171761852102876, + "grad_norm": 0.3443428874015808, + "learning_rate": 3.340839112859001e-05, + "loss": 0.1623, + "step": 25326 + }, + { + "epoch": 0.4517354546427425, + "grad_norm": 0.25632989406585693, + "learning_rate": 3.340692528344556e-05, + "loss": 0.1751, + "step": 25327 + }, + { + "epoch": 0.4517532907644562, + "grad_norm": 0.23276600241661072, + "learning_rate": 3.340545940571297e-05, + "loss": 0.1188, + "step": 25328 + }, + { + "epoch": 0.4517711268861699, + "grad_norm": 0.30829668045043945, + "learning_rate": 3.340399349539791e-05, + "loss": 0.1342, + "step": 25329 + }, + { + "epoch": 0.45178896300788357, + "grad_norm": 0.35151705145835876, + "learning_rate": 3.3402527552506064e-05, + "loss": 0.1009, + "step": 25330 + }, + { + "epoch": 0.45180679912959726, + "grad_norm": 0.27517446875572205, + "learning_rate": 3.340106157704313e-05, + "loss": 0.1204, + "step": 25331 + }, + { + "epoch": 0.45182463525131095, + "grad_norm": 0.3466947376728058, + "learning_rate": 3.339959556901477e-05, + "loss": 0.1274, + "step": 25332 + }, + { + "epoch": 0.45184247137302463, + "grad_norm": 0.39269569516181946, + "learning_rate": 3.339812952842668e-05, + "loss": 0.1527, + "step": 25333 + }, + { + "epoch": 0.4518603074947383, + "grad_norm": 0.26444125175476074, + "learning_rate": 3.3396663455284536e-05, + "loss": 0.1319, + "step": 25334 + }, + { + "epoch": 0.45187814361645207, + "grad_norm": 0.27987992763519287, + "learning_rate": 3.339519734959403e-05, + "loss": 0.1756, + "step": 25335 + }, + { + "epoch": 0.45189597973816575, + "grad_norm": 0.21436183154582977, + "learning_rate": 3.339373121136084e-05, + "loss": 0.129, + "step": 25336 + }, + { + "epoch": 0.45191381585987944, + "grad_norm": 0.2604813575744629, + "learning_rate": 3.3392265040590645e-05, + "loss": 0.1312, + "step": 25337 + }, + { + "epoch": 0.45193165198159313, + "grad_norm": 0.30031466484069824, + "learning_rate": 3.3390798837289134e-05, + "loss": 0.1372, + "step": 25338 + }, + { + "epoch": 0.4519494881033068, + "grad_norm": 0.29773229360580444, + "learning_rate": 3.338933260146199e-05, + "loss": 0.127, + "step": 25339 + }, + { + "epoch": 0.4519673242250205, + "grad_norm": 0.23106749355793, + "learning_rate": 3.338786633311489e-05, + "loss": 0.132, + "step": 25340 + }, + { + "epoch": 0.4519851603467342, + "grad_norm": 0.2504771649837494, + "learning_rate": 3.3386400032253533e-05, + "loss": 0.1984, + "step": 25341 + }, + { + "epoch": 0.4520029964684479, + "grad_norm": 0.25725501775741577, + "learning_rate": 3.338493369888359e-05, + "loss": 0.1341, + "step": 25342 + }, + { + "epoch": 0.45202083259016157, + "grad_norm": 0.3508862257003784, + "learning_rate": 3.338346733301074e-05, + "loss": 0.1219, + "step": 25343 + }, + { + "epoch": 0.4520386687118753, + "grad_norm": 0.27442172169685364, + "learning_rate": 3.3382000934640684e-05, + "loss": 0.1694, + "step": 25344 + }, + { + "epoch": 0.452056504833589, + "grad_norm": 0.3080499470233917, + "learning_rate": 3.338053450377909e-05, + "loss": 0.1468, + "step": 25345 + }, + { + "epoch": 0.4520743409553027, + "grad_norm": 0.3203141987323761, + "learning_rate": 3.3379068040431653e-05, + "loss": 0.1237, + "step": 25346 + }, + { + "epoch": 0.4520921770770164, + "grad_norm": 0.3010665774345398, + "learning_rate": 3.3377601544604055e-05, + "loss": 0.1242, + "step": 25347 + }, + { + "epoch": 0.45211001319873007, + "grad_norm": 0.3806282579898834, + "learning_rate": 3.337613501630198e-05, + "loss": 0.1127, + "step": 25348 + }, + { + "epoch": 0.45212784932044375, + "grad_norm": 0.24944719672203064, + "learning_rate": 3.337466845553111e-05, + "loss": 0.1308, + "step": 25349 + }, + { + "epoch": 0.45214568544215744, + "grad_norm": 0.38069990277290344, + "learning_rate": 3.3373201862297135e-05, + "loss": 0.1044, + "step": 25350 + }, + { + "epoch": 0.45216352156387113, + "grad_norm": 0.2638382017612457, + "learning_rate": 3.337173523660573e-05, + "loss": 0.1284, + "step": 25351 + }, + { + "epoch": 0.4521813576855849, + "grad_norm": 0.3609374761581421, + "learning_rate": 3.33702685784626e-05, + "loss": 0.1425, + "step": 25352 + }, + { + "epoch": 0.45219919380729856, + "grad_norm": 0.2218160778284073, + "learning_rate": 3.3368801887873414e-05, + "loss": 0.1243, + "step": 25353 + }, + { + "epoch": 0.45221702992901225, + "grad_norm": 0.2859613001346588, + "learning_rate": 3.336733516484386e-05, + "loss": 0.1616, + "step": 25354 + }, + { + "epoch": 0.45223486605072594, + "grad_norm": 0.24077485501766205, + "learning_rate": 3.3365868409379634e-05, + "loss": 0.1704, + "step": 25355 + }, + { + "epoch": 0.4522527021724396, + "grad_norm": 0.22482459247112274, + "learning_rate": 3.336440162148639e-05, + "loss": 0.116, + "step": 25356 + }, + { + "epoch": 0.4522705382941533, + "grad_norm": 0.2575322687625885, + "learning_rate": 3.336293480116985e-05, + "loss": 0.1245, + "step": 25357 + }, + { + "epoch": 0.452288374415867, + "grad_norm": 0.36693382263183594, + "learning_rate": 3.3361467948435685e-05, + "loss": 0.1445, + "step": 25358 + }, + { + "epoch": 0.4523062105375807, + "grad_norm": 0.2518131136894226, + "learning_rate": 3.3360001063289585e-05, + "loss": 0.214, + "step": 25359 + }, + { + "epoch": 0.4523240466592944, + "grad_norm": 0.2738848924636841, + "learning_rate": 3.335853414573723e-05, + "loss": 0.1356, + "step": 25360 + }, + { + "epoch": 0.4523418827810081, + "grad_norm": 0.22676293551921844, + "learning_rate": 3.3357067195784316e-05, + "loss": 0.1189, + "step": 25361 + }, + { + "epoch": 0.4523597189027218, + "grad_norm": 0.2900744080543518, + "learning_rate": 3.335560021343652e-05, + "loss": 0.1888, + "step": 25362 + }, + { + "epoch": 0.4523775550244355, + "grad_norm": 0.22740840911865234, + "learning_rate": 3.335413319869952e-05, + "loss": 0.1274, + "step": 25363 + }, + { + "epoch": 0.4523953911461492, + "grad_norm": 0.2982439696788788, + "learning_rate": 3.335266615157902e-05, + "loss": 0.113, + "step": 25364 + }, + { + "epoch": 0.45241322726786287, + "grad_norm": 0.22127170860767365, + "learning_rate": 3.335119907208071e-05, + "loss": 0.1145, + "step": 25365 + }, + { + "epoch": 0.45243106338957656, + "grad_norm": 0.2754104733467102, + "learning_rate": 3.3349731960210266e-05, + "loss": 0.1335, + "step": 25366 + }, + { + "epoch": 0.45244889951129025, + "grad_norm": 0.2940003573894501, + "learning_rate": 3.3348264815973376e-05, + "loss": 0.1883, + "step": 25367 + }, + { + "epoch": 0.45246673563300394, + "grad_norm": 0.21314938366413116, + "learning_rate": 3.3346797639375735e-05, + "loss": 0.1194, + "step": 25368 + }, + { + "epoch": 0.4524845717547177, + "grad_norm": 0.30342239141464233, + "learning_rate": 3.3345330430423014e-05, + "loss": 0.1218, + "step": 25369 + }, + { + "epoch": 0.45250240787643137, + "grad_norm": 0.23137035965919495, + "learning_rate": 3.3343863189120905e-05, + "loss": 0.1296, + "step": 25370 + }, + { + "epoch": 0.45252024399814506, + "grad_norm": 0.36742621660232544, + "learning_rate": 3.334239591547511e-05, + "loss": 0.17, + "step": 25371 + }, + { + "epoch": 0.45253808011985874, + "grad_norm": 0.1878555566072464, + "learning_rate": 3.33409286094913e-05, + "loss": 0.1174, + "step": 25372 + }, + { + "epoch": 0.45255591624157243, + "grad_norm": 0.20224159955978394, + "learning_rate": 3.3339461271175174e-05, + "loss": 0.1174, + "step": 25373 + }, + { + "epoch": 0.4525737523632861, + "grad_norm": 0.37649932503700256, + "learning_rate": 3.3337993900532415e-05, + "loss": 0.2172, + "step": 25374 + }, + { + "epoch": 0.4525915884849998, + "grad_norm": 0.36614614725112915, + "learning_rate": 3.333652649756871e-05, + "loss": 0.1656, + "step": 25375 + }, + { + "epoch": 0.4526094246067135, + "grad_norm": 0.4283222556114197, + "learning_rate": 3.333505906228975e-05, + "loss": 0.2213, + "step": 25376 + }, + { + "epoch": 0.4526272607284272, + "grad_norm": 0.342178612947464, + "learning_rate": 3.333359159470123e-05, + "loss": 0.1695, + "step": 25377 + }, + { + "epoch": 0.4526450968501409, + "grad_norm": 0.27212056517601013, + "learning_rate": 3.3332124094808816e-05, + "loss": 0.1054, + "step": 25378 + }, + { + "epoch": 0.4526629329718546, + "grad_norm": 0.26995909214019775, + "learning_rate": 3.333065656261822e-05, + "loss": 0.1482, + "step": 25379 + }, + { + "epoch": 0.4526807690935683, + "grad_norm": 0.2667284607887268, + "learning_rate": 3.332918899813512e-05, + "loss": 0.1067, + "step": 25380 + }, + { + "epoch": 0.452698605215282, + "grad_norm": 0.2785443067550659, + "learning_rate": 3.3327721401365214e-05, + "loss": 0.1753, + "step": 25381 + }, + { + "epoch": 0.4527164413369957, + "grad_norm": 0.3091239333152771, + "learning_rate": 3.3326253772314177e-05, + "loss": 0.1201, + "step": 25382 + }, + { + "epoch": 0.45273427745870937, + "grad_norm": 0.2919481694698334, + "learning_rate": 3.33247861109877e-05, + "loss": 0.192, + "step": 25383 + }, + { + "epoch": 0.45275211358042305, + "grad_norm": 0.2751409411430359, + "learning_rate": 3.332331841739148e-05, + "loss": 0.1894, + "step": 25384 + }, + { + "epoch": 0.45276994970213674, + "grad_norm": 0.2278016209602356, + "learning_rate": 3.33218506915312e-05, + "loss": 0.143, + "step": 25385 + }, + { + "epoch": 0.4527877858238505, + "grad_norm": 0.27491307258605957, + "learning_rate": 3.332038293341256e-05, + "loss": 0.1532, + "step": 25386 + }, + { + "epoch": 0.4528056219455642, + "grad_norm": 0.32482025027275085, + "learning_rate": 3.3318915143041244e-05, + "loss": 0.1453, + "step": 25387 + }, + { + "epoch": 0.45282345806727786, + "grad_norm": 0.21392029523849487, + "learning_rate": 3.331744732042293e-05, + "loss": 0.1209, + "step": 25388 + }, + { + "epoch": 0.45284129418899155, + "grad_norm": 0.22954034805297852, + "learning_rate": 3.3315979465563325e-05, + "loss": 0.1547, + "step": 25389 + }, + { + "epoch": 0.45285913031070524, + "grad_norm": 0.30901920795440674, + "learning_rate": 3.3314511578468105e-05, + "loss": 0.1821, + "step": 25390 + }, + { + "epoch": 0.4528769664324189, + "grad_norm": 0.26551201939582825, + "learning_rate": 3.331304365914297e-05, + "loss": 0.1711, + "step": 25391 + }, + { + "epoch": 0.4528948025541326, + "grad_norm": 0.3265780210494995, + "learning_rate": 3.3311575707593604e-05, + "loss": 0.1816, + "step": 25392 + }, + { + "epoch": 0.4529126386758463, + "grad_norm": 0.19142450392246246, + "learning_rate": 3.33101077238257e-05, + "loss": 0.1134, + "step": 25393 + }, + { + "epoch": 0.45293047479756005, + "grad_norm": 0.2555154860019684, + "learning_rate": 3.330863970784496e-05, + "loss": 0.1589, + "step": 25394 + }, + { + "epoch": 0.45294831091927373, + "grad_norm": 0.2754727602005005, + "learning_rate": 3.330717165965704e-05, + "loss": 0.1535, + "step": 25395 + }, + { + "epoch": 0.4529661470409874, + "grad_norm": 0.27245625853538513, + "learning_rate": 3.3305703579267676e-05, + "loss": 0.167, + "step": 25396 + }, + { + "epoch": 0.4529839831627011, + "grad_norm": 0.2841571271419525, + "learning_rate": 3.330423546668252e-05, + "loss": 0.1664, + "step": 25397 + }, + { + "epoch": 0.4530018192844148, + "grad_norm": 0.20171573758125305, + "learning_rate": 3.330276732190729e-05, + "loss": 0.1414, + "step": 25398 + }, + { + "epoch": 0.4530196554061285, + "grad_norm": 0.2928656339645386, + "learning_rate": 3.3301299144947656e-05, + "loss": 0.169, + "step": 25399 + }, + { + "epoch": 0.4530374915278422, + "grad_norm": 0.3740893304347992, + "learning_rate": 3.329983093580933e-05, + "loss": 0.2375, + "step": 25400 + }, + { + "epoch": 0.45305532764955586, + "grad_norm": 0.41993436217308044, + "learning_rate": 3.329836269449799e-05, + "loss": 0.273, + "step": 25401 + }, + { + "epoch": 0.45307316377126955, + "grad_norm": 0.3017175495624542, + "learning_rate": 3.3296894421019326e-05, + "loss": 0.1247, + "step": 25402 + }, + { + "epoch": 0.4530909998929833, + "grad_norm": 0.23109309375286102, + "learning_rate": 3.329542611537904e-05, + "loss": 0.1609, + "step": 25403 + }, + { + "epoch": 0.453108836014697, + "grad_norm": 0.1993561089038849, + "learning_rate": 3.3293957777582804e-05, + "loss": 0.1714, + "step": 25404 + }, + { + "epoch": 0.45312667213641067, + "grad_norm": 0.18567967414855957, + "learning_rate": 3.329248940763634e-05, + "loss": 0.1279, + "step": 25405 + }, + { + "epoch": 0.45314450825812436, + "grad_norm": 0.254224568605423, + "learning_rate": 3.329102100554531e-05, + "loss": 0.1253, + "step": 25406 + }, + { + "epoch": 0.45316234437983804, + "grad_norm": 0.23910973966121674, + "learning_rate": 3.3289552571315425e-05, + "loss": 0.15, + "step": 25407 + }, + { + "epoch": 0.45318018050155173, + "grad_norm": 0.3448658287525177, + "learning_rate": 3.328808410495236e-05, + "loss": 0.147, + "step": 25408 + }, + { + "epoch": 0.4531980166232654, + "grad_norm": 0.2811363637447357, + "learning_rate": 3.3286615606461836e-05, + "loss": 0.1247, + "step": 25409 + }, + { + "epoch": 0.4532158527449791, + "grad_norm": 0.294975608587265, + "learning_rate": 3.328514707584952e-05, + "loss": 0.1115, + "step": 25410 + }, + { + "epoch": 0.45323368886669285, + "grad_norm": 0.3807539641857147, + "learning_rate": 3.328367851312111e-05, + "loss": 0.135, + "step": 25411 + }, + { + "epoch": 0.45325152498840654, + "grad_norm": 0.3476579487323761, + "learning_rate": 3.3282209918282305e-05, + "loss": 0.1568, + "step": 25412 + }, + { + "epoch": 0.45326936111012023, + "grad_norm": 0.22811362147331238, + "learning_rate": 3.328074129133879e-05, + "loss": 0.1104, + "step": 25413 + }, + { + "epoch": 0.4532871972318339, + "grad_norm": 0.2721503674983978, + "learning_rate": 3.327927263229626e-05, + "loss": 0.1288, + "step": 25414 + }, + { + "epoch": 0.4533050333535476, + "grad_norm": 0.34425482153892517, + "learning_rate": 3.3277803941160415e-05, + "loss": 0.1654, + "step": 25415 + }, + { + "epoch": 0.4533228694752613, + "grad_norm": 0.32033097743988037, + "learning_rate": 3.327633521793694e-05, + "loss": 0.1553, + "step": 25416 + }, + { + "epoch": 0.453340705596975, + "grad_norm": 0.2533934414386749, + "learning_rate": 3.3274866462631536e-05, + "loss": 0.1275, + "step": 25417 + }, + { + "epoch": 0.45335854171868867, + "grad_norm": 0.2786405086517334, + "learning_rate": 3.3273397675249886e-05, + "loss": 0.1516, + "step": 25418 + }, + { + "epoch": 0.45337637784040236, + "grad_norm": 0.25266364216804504, + "learning_rate": 3.327192885579769e-05, + "loss": 0.1579, + "step": 25419 + }, + { + "epoch": 0.4533942139621161, + "grad_norm": 0.2841903865337372, + "learning_rate": 3.3270460004280644e-05, + "loss": 0.1715, + "step": 25420 + }, + { + "epoch": 0.4534120500838298, + "grad_norm": 0.2631411552429199, + "learning_rate": 3.3268991120704436e-05, + "loss": 0.1828, + "step": 25421 + }, + { + "epoch": 0.4534298862055435, + "grad_norm": 0.26544615626335144, + "learning_rate": 3.326752220507476e-05, + "loss": 0.0846, + "step": 25422 + }, + { + "epoch": 0.45344772232725716, + "grad_norm": 0.2929244935512543, + "learning_rate": 3.3266053257397315e-05, + "loss": 0.1182, + "step": 25423 + }, + { + "epoch": 0.45346555844897085, + "grad_norm": 0.3162120580673218, + "learning_rate": 3.32645842776778e-05, + "loss": 0.1587, + "step": 25424 + }, + { + "epoch": 0.45348339457068454, + "grad_norm": 0.28302067518234253, + "learning_rate": 3.326311526592189e-05, + "loss": 0.165, + "step": 25425 + }, + { + "epoch": 0.4535012306923982, + "grad_norm": 0.2734776735305786, + "learning_rate": 3.32616462221353e-05, + "loss": 0.1143, + "step": 25426 + }, + { + "epoch": 0.4535190668141119, + "grad_norm": 0.30495041608810425, + "learning_rate": 3.32601771463237e-05, + "loss": 0.131, + "step": 25427 + }, + { + "epoch": 0.45353690293582566, + "grad_norm": 0.18731018900871277, + "learning_rate": 3.325870803849282e-05, + "loss": 0.1258, + "step": 25428 + }, + { + "epoch": 0.45355473905753935, + "grad_norm": 0.2503253221511841, + "learning_rate": 3.3257238898648324e-05, + "loss": 0.0937, + "step": 25429 + }, + { + "epoch": 0.45357257517925303, + "grad_norm": 0.24718806147575378, + "learning_rate": 3.3255769726795925e-05, + "loss": 0.1605, + "step": 25430 + }, + { + "epoch": 0.4535904113009667, + "grad_norm": 0.31829050183296204, + "learning_rate": 3.325430052294131e-05, + "loss": 0.1425, + "step": 25431 + }, + { + "epoch": 0.4536082474226804, + "grad_norm": 0.20550750195980072, + "learning_rate": 3.325283128709017e-05, + "loss": 0.1462, + "step": 25432 + }, + { + "epoch": 0.4536260835443941, + "grad_norm": 0.25150609016418457, + "learning_rate": 3.325136201924821e-05, + "loss": 0.1946, + "step": 25433 + }, + { + "epoch": 0.4536439196661078, + "grad_norm": 0.23848611116409302, + "learning_rate": 3.324989271942111e-05, + "loss": 0.0974, + "step": 25434 + }, + { + "epoch": 0.4536617557878215, + "grad_norm": 0.24768418073654175, + "learning_rate": 3.324842338761459e-05, + "loss": 0.1471, + "step": 25435 + }, + { + "epoch": 0.4536795919095352, + "grad_norm": 0.31885871291160583, + "learning_rate": 3.3246954023834324e-05, + "loss": 0.1494, + "step": 25436 + }, + { + "epoch": 0.4536974280312489, + "grad_norm": 0.2993391156196594, + "learning_rate": 3.324548462808602e-05, + "loss": 0.183, + "step": 25437 + }, + { + "epoch": 0.4537152641529626, + "grad_norm": 0.27632758021354675, + "learning_rate": 3.3244015200375364e-05, + "loss": 0.1606, + "step": 25438 + }, + { + "epoch": 0.4537331002746763, + "grad_norm": 0.29089802503585815, + "learning_rate": 3.324254574070806e-05, + "loss": 0.1895, + "step": 25439 + }, + { + "epoch": 0.45375093639638997, + "grad_norm": 0.23531211912631989, + "learning_rate": 3.324107624908981e-05, + "loss": 0.1146, + "step": 25440 + }, + { + "epoch": 0.45376877251810366, + "grad_norm": 0.27233681082725525, + "learning_rate": 3.32396067255263e-05, + "loss": 0.1677, + "step": 25441 + }, + { + "epoch": 0.45378660863981735, + "grad_norm": 0.2612689733505249, + "learning_rate": 3.323813717002322e-05, + "loss": 0.1873, + "step": 25442 + }, + { + "epoch": 0.45380444476153103, + "grad_norm": 0.31285834312438965, + "learning_rate": 3.323666758258628e-05, + "loss": 0.1435, + "step": 25443 + }, + { + "epoch": 0.4538222808832447, + "grad_norm": 0.26134970784187317, + "learning_rate": 3.323519796322117e-05, + "loss": 0.1281, + "step": 25444 + }, + { + "epoch": 0.45384011700495847, + "grad_norm": 0.2308206707239151, + "learning_rate": 3.32337283119336e-05, + "loss": 0.1602, + "step": 25445 + }, + { + "epoch": 0.45385795312667215, + "grad_norm": 0.35686835646629333, + "learning_rate": 3.323225862872924e-05, + "loss": 0.2012, + "step": 25446 + }, + { + "epoch": 0.45387578924838584, + "grad_norm": 0.26989123225212097, + "learning_rate": 3.3230788913613804e-05, + "loss": 0.1298, + "step": 25447 + }, + { + "epoch": 0.45389362537009953, + "grad_norm": 0.28670790791511536, + "learning_rate": 3.322931916659299e-05, + "loss": 0.1652, + "step": 25448 + }, + { + "epoch": 0.4539114614918132, + "grad_norm": 0.2709404230117798, + "learning_rate": 3.3227849387672494e-05, + "loss": 0.0825, + "step": 25449 + }, + { + "epoch": 0.4539292976135269, + "grad_norm": 0.26748818159103394, + "learning_rate": 3.322637957685801e-05, + "loss": 0.1389, + "step": 25450 + }, + { + "epoch": 0.4539471337352406, + "grad_norm": 0.22117049992084503, + "learning_rate": 3.322490973415524e-05, + "loss": 0.1131, + "step": 25451 + }, + { + "epoch": 0.4539649698569543, + "grad_norm": 0.27598392963409424, + "learning_rate": 3.322343985956988e-05, + "loss": 0.1431, + "step": 25452 + }, + { + "epoch": 0.453982805978668, + "grad_norm": 0.271395742893219, + "learning_rate": 3.3221969953107625e-05, + "loss": 0.156, + "step": 25453 + }, + { + "epoch": 0.4540006421003817, + "grad_norm": 0.3040768504142761, + "learning_rate": 3.322050001477417e-05, + "loss": 0.1716, + "step": 25454 + }, + { + "epoch": 0.4540184782220954, + "grad_norm": 0.34275102615356445, + "learning_rate": 3.3219030044575214e-05, + "loss": 0.1612, + "step": 25455 + }, + { + "epoch": 0.4540363143438091, + "grad_norm": 0.19952526688575745, + "learning_rate": 3.3217560042516476e-05, + "loss": 0.1277, + "step": 25456 + }, + { + "epoch": 0.4540541504655228, + "grad_norm": 0.3267901837825775, + "learning_rate": 3.321609000860362e-05, + "loss": 0.1118, + "step": 25457 + }, + { + "epoch": 0.45407198658723646, + "grad_norm": 0.2913527190685272, + "learning_rate": 3.321461994284237e-05, + "loss": 0.1749, + "step": 25458 + }, + { + "epoch": 0.45408982270895015, + "grad_norm": 0.2528184652328491, + "learning_rate": 3.321314984523842e-05, + "loss": 0.1704, + "step": 25459 + }, + { + "epoch": 0.45410765883066384, + "grad_norm": 0.3290480673313141, + "learning_rate": 3.321167971579746e-05, + "loss": 0.1914, + "step": 25460 + }, + { + "epoch": 0.45412549495237753, + "grad_norm": 0.3445420563220978, + "learning_rate": 3.321020955452519e-05, + "loss": 0.1525, + "step": 25461 + }, + { + "epoch": 0.45414333107409127, + "grad_norm": 0.2904965877532959, + "learning_rate": 3.320873936142732e-05, + "loss": 0.085, + "step": 25462 + }, + { + "epoch": 0.45416116719580496, + "grad_norm": 0.21717512607574463, + "learning_rate": 3.3207269136509536e-05, + "loss": 0.1317, + "step": 25463 + }, + { + "epoch": 0.45417900331751865, + "grad_norm": 0.3402880132198334, + "learning_rate": 3.320579887977754e-05, + "loss": 0.1838, + "step": 25464 + }, + { + "epoch": 0.45419683943923234, + "grad_norm": 0.2852194011211395, + "learning_rate": 3.3204328591237034e-05, + "loss": 0.1623, + "step": 25465 + }, + { + "epoch": 0.454214675560946, + "grad_norm": 0.23262082040309906, + "learning_rate": 3.320285827089372e-05, + "loss": 0.1403, + "step": 25466 + }, + { + "epoch": 0.4542325116826597, + "grad_norm": 0.27028903365135193, + "learning_rate": 3.320138791875329e-05, + "loss": 0.1484, + "step": 25467 + }, + { + "epoch": 0.4542503478043734, + "grad_norm": 0.30091822147369385, + "learning_rate": 3.319991753482145e-05, + "loss": 0.1478, + "step": 25468 + }, + { + "epoch": 0.4542681839260871, + "grad_norm": 0.2841566205024719, + "learning_rate": 3.319844711910389e-05, + "loss": 0.1623, + "step": 25469 + }, + { + "epoch": 0.45428602004780083, + "grad_norm": 0.34597864747047424, + "learning_rate": 3.319697667160633e-05, + "loss": 0.1169, + "step": 25470 + }, + { + "epoch": 0.4543038561695145, + "grad_norm": 0.2742307782173157, + "learning_rate": 3.319550619233445e-05, + "loss": 0.1258, + "step": 25471 + }, + { + "epoch": 0.4543216922912282, + "grad_norm": 0.291324645280838, + "learning_rate": 3.319403568129396e-05, + "loss": 0.1178, + "step": 25472 + }, + { + "epoch": 0.4543395284129419, + "grad_norm": 0.20286288857460022, + "learning_rate": 3.319256513849055e-05, + "loss": 0.1658, + "step": 25473 + }, + { + "epoch": 0.4543573645346556, + "grad_norm": 0.3391698896884918, + "learning_rate": 3.3191094563929944e-05, + "loss": 0.166, + "step": 25474 + }, + { + "epoch": 0.45437520065636927, + "grad_norm": 0.24068965017795563, + "learning_rate": 3.318962395761781e-05, + "loss": 0.1494, + "step": 25475 + }, + { + "epoch": 0.45439303677808296, + "grad_norm": 0.3006434738636017, + "learning_rate": 3.3188153319559876e-05, + "loss": 0.1393, + "step": 25476 + }, + { + "epoch": 0.45441087289979665, + "grad_norm": 0.2690424621105194, + "learning_rate": 3.318668264976182e-05, + "loss": 0.1575, + "step": 25477 + }, + { + "epoch": 0.45442870902151034, + "grad_norm": 0.22099590301513672, + "learning_rate": 3.3185211948229375e-05, + "loss": 0.172, + "step": 25478 + }, + { + "epoch": 0.4544465451432241, + "grad_norm": 0.24531807005405426, + "learning_rate": 3.31837412149682e-05, + "loss": 0.133, + "step": 25479 + }, + { + "epoch": 0.45446438126493777, + "grad_norm": 0.4104959964752197, + "learning_rate": 3.318227044998403e-05, + "loss": 0.0908, + "step": 25480 + }, + { + "epoch": 0.45448221738665145, + "grad_norm": 0.3880630433559418, + "learning_rate": 3.3180799653282546e-05, + "loss": 0.16, + "step": 25481 + }, + { + "epoch": 0.45450005350836514, + "grad_norm": 0.2126411646604538, + "learning_rate": 3.317932882486946e-05, + "loss": 0.1597, + "step": 25482 + }, + { + "epoch": 0.45451788963007883, + "grad_norm": 0.34773823618888855, + "learning_rate": 3.317785796475047e-05, + "loss": 0.2344, + "step": 25483 + }, + { + "epoch": 0.4545357257517925, + "grad_norm": 0.311054527759552, + "learning_rate": 3.317638707293128e-05, + "loss": 0.1486, + "step": 25484 + }, + { + "epoch": 0.4545535618735062, + "grad_norm": 0.2105044275522232, + "learning_rate": 3.317491614941759e-05, + "loss": 0.1268, + "step": 25485 + }, + { + "epoch": 0.4545713979952199, + "grad_norm": 0.33068689703941345, + "learning_rate": 3.31734451942151e-05, + "loss": 0.1361, + "step": 25486 + }, + { + "epoch": 0.45458923411693364, + "grad_norm": 0.27694565057754517, + "learning_rate": 3.317197420732952e-05, + "loss": 0.1548, + "step": 25487 + }, + { + "epoch": 0.4546070702386473, + "grad_norm": 0.31229138374328613, + "learning_rate": 3.317050318876653e-05, + "loss": 0.172, + "step": 25488 + }, + { + "epoch": 0.454624906360361, + "grad_norm": 0.40736308693885803, + "learning_rate": 3.316903213853186e-05, + "loss": 0.2063, + "step": 25489 + }, + { + "epoch": 0.4546427424820747, + "grad_norm": 0.2958024740219116, + "learning_rate": 3.316756105663119e-05, + "loss": 0.1292, + "step": 25490 + }, + { + "epoch": 0.4546605786037884, + "grad_norm": 0.2401237189769745, + "learning_rate": 3.3166089943070245e-05, + "loss": 0.1521, + "step": 25491 + }, + { + "epoch": 0.4546784147255021, + "grad_norm": 0.2923887372016907, + "learning_rate": 3.316461879785471e-05, + "loss": 0.1376, + "step": 25492 + }, + { + "epoch": 0.45469625084721577, + "grad_norm": 0.32773536443710327, + "learning_rate": 3.3163147620990296e-05, + "loss": 0.122, + "step": 25493 + }, + { + "epoch": 0.45471408696892945, + "grad_norm": 0.2533378303050995, + "learning_rate": 3.316167641248269e-05, + "loss": 0.1372, + "step": 25494 + }, + { + "epoch": 0.4547319230906432, + "grad_norm": 0.34814250469207764, + "learning_rate": 3.316020517233761e-05, + "loss": 0.1917, + "step": 25495 + }, + { + "epoch": 0.4547497592123569, + "grad_norm": 0.20848925411701202, + "learning_rate": 3.315873390056076e-05, + "loss": 0.1171, + "step": 25496 + }, + { + "epoch": 0.4547675953340706, + "grad_norm": 0.2148837298154831, + "learning_rate": 3.3157262597157846e-05, + "loss": 0.1493, + "step": 25497 + }, + { + "epoch": 0.45478543145578426, + "grad_norm": 0.297224760055542, + "learning_rate": 3.315579126213455e-05, + "loss": 0.1405, + "step": 25498 + }, + { + "epoch": 0.45480326757749795, + "grad_norm": 0.2944718599319458, + "learning_rate": 3.315431989549661e-05, + "loss": 0.1671, + "step": 25499 + }, + { + "epoch": 0.45482110369921164, + "grad_norm": 0.25866109132766724, + "learning_rate": 3.31528484972497e-05, + "loss": 0.1391, + "step": 25500 + }, + { + "epoch": 0.4548389398209253, + "grad_norm": 0.2821806073188782, + "learning_rate": 3.3151377067399536e-05, + "loss": 0.1221, + "step": 25501 + }, + { + "epoch": 0.454856775942639, + "grad_norm": 0.4539002776145935, + "learning_rate": 3.314990560595181e-05, + "loss": 0.1496, + "step": 25502 + }, + { + "epoch": 0.4548746120643527, + "grad_norm": 0.2803192734718323, + "learning_rate": 3.314843411291224e-05, + "loss": 0.1278, + "step": 25503 + }, + { + "epoch": 0.45489244818606644, + "grad_norm": 0.1840869039297104, + "learning_rate": 3.314696258828653e-05, + "loss": 0.1319, + "step": 25504 + }, + { + "epoch": 0.45491028430778013, + "grad_norm": 0.25788959860801697, + "learning_rate": 3.3145491032080375e-05, + "loss": 0.1359, + "step": 25505 + }, + { + "epoch": 0.4549281204294938, + "grad_norm": 0.40240857005119324, + "learning_rate": 3.3144019444299476e-05, + "loss": 0.1134, + "step": 25506 + }, + { + "epoch": 0.4549459565512075, + "grad_norm": 0.33708715438842773, + "learning_rate": 3.314254782494956e-05, + "loss": 0.1648, + "step": 25507 + }, + { + "epoch": 0.4549637926729212, + "grad_norm": 0.3606436252593994, + "learning_rate": 3.31410761740363e-05, + "loss": 0.1902, + "step": 25508 + }, + { + "epoch": 0.4549816287946349, + "grad_norm": 0.33318817615509033, + "learning_rate": 3.313960449156543e-05, + "loss": 0.122, + "step": 25509 + }, + { + "epoch": 0.4549994649163486, + "grad_norm": 0.194342240691185, + "learning_rate": 3.3138132777542637e-05, + "loss": 0.1376, + "step": 25510 + }, + { + "epoch": 0.45501730103806226, + "grad_norm": 0.24338695406913757, + "learning_rate": 3.313666103197363e-05, + "loss": 0.1637, + "step": 25511 + }, + { + "epoch": 0.455035137159776, + "grad_norm": 0.32704874873161316, + "learning_rate": 3.3135189254864114e-05, + "loss": 0.1578, + "step": 25512 + }, + { + "epoch": 0.4550529732814897, + "grad_norm": 0.3315202295780182, + "learning_rate": 3.31337174462198e-05, + "loss": 0.1532, + "step": 25513 + }, + { + "epoch": 0.4550708094032034, + "grad_norm": 0.25110602378845215, + "learning_rate": 3.313224560604638e-05, + "loss": 0.1346, + "step": 25514 + }, + { + "epoch": 0.45508864552491707, + "grad_norm": 0.2566005289554596, + "learning_rate": 3.313077373434957e-05, + "loss": 0.1269, + "step": 25515 + }, + { + "epoch": 0.45510648164663076, + "grad_norm": 0.31238797307014465, + "learning_rate": 3.312930183113507e-05, + "loss": 0.1426, + "step": 25516 + }, + { + "epoch": 0.45512431776834444, + "grad_norm": 0.2830149233341217, + "learning_rate": 3.3127829896408596e-05, + "loss": 0.1595, + "step": 25517 + }, + { + "epoch": 0.45514215389005813, + "grad_norm": 0.37631404399871826, + "learning_rate": 3.3126357930175835e-05, + "loss": 0.1397, + "step": 25518 + }, + { + "epoch": 0.4551599900117718, + "grad_norm": 0.3661668300628662, + "learning_rate": 3.3124885932442516e-05, + "loss": 0.1107, + "step": 25519 + }, + { + "epoch": 0.4551778261334855, + "grad_norm": 0.34236806631088257, + "learning_rate": 3.312341390321433e-05, + "loss": 0.1393, + "step": 25520 + }, + { + "epoch": 0.45519566225519925, + "grad_norm": 0.32491669058799744, + "learning_rate": 3.312194184249698e-05, + "loss": 0.1281, + "step": 25521 + }, + { + "epoch": 0.45521349837691294, + "grad_norm": 0.25198623538017273, + "learning_rate": 3.3120469750296185e-05, + "loss": 0.118, + "step": 25522 + }, + { + "epoch": 0.4552313344986266, + "grad_norm": 0.2775854468345642, + "learning_rate": 3.311899762661764e-05, + "loss": 0.1318, + "step": 25523 + }, + { + "epoch": 0.4552491706203403, + "grad_norm": 0.31547802686691284, + "learning_rate": 3.3117525471467054e-05, + "loss": 0.0975, + "step": 25524 + }, + { + "epoch": 0.455267006742054, + "grad_norm": 0.2870213985443115, + "learning_rate": 3.311605328485014e-05, + "loss": 0.1494, + "step": 25525 + }, + { + "epoch": 0.4552848428637677, + "grad_norm": 0.22317950427532196, + "learning_rate": 3.31145810667726e-05, + "loss": 0.1255, + "step": 25526 + }, + { + "epoch": 0.4553026789854814, + "grad_norm": 0.21474316716194153, + "learning_rate": 3.311310881724014e-05, + "loss": 0.152, + "step": 25527 + }, + { + "epoch": 0.45532051510719507, + "grad_norm": 0.3272128403186798, + "learning_rate": 3.311163653625847e-05, + "loss": 0.1356, + "step": 25528 + }, + { + "epoch": 0.4553383512289088, + "grad_norm": 0.2720585763454437, + "learning_rate": 3.3110164223833296e-05, + "loss": 0.1392, + "step": 25529 + }, + { + "epoch": 0.4553561873506225, + "grad_norm": 0.2651171088218689, + "learning_rate": 3.310869187997032e-05, + "loss": 0.1458, + "step": 25530 + }, + { + "epoch": 0.4553740234723362, + "grad_norm": 0.4045639932155609, + "learning_rate": 3.310721950467525e-05, + "loss": 0.1968, + "step": 25531 + }, + { + "epoch": 0.4553918595940499, + "grad_norm": 0.3149820864200592, + "learning_rate": 3.31057470979538e-05, + "loss": 0.0789, + "step": 25532 + }, + { + "epoch": 0.45540969571576356, + "grad_norm": 0.23971009254455566, + "learning_rate": 3.310427465981168e-05, + "loss": 0.1031, + "step": 25533 + }, + { + "epoch": 0.45542753183747725, + "grad_norm": 0.22553564608097076, + "learning_rate": 3.3102802190254594e-05, + "loss": 0.1289, + "step": 25534 + }, + { + "epoch": 0.45544536795919094, + "grad_norm": 0.3127795159816742, + "learning_rate": 3.310132968928824e-05, + "loss": 0.0965, + "step": 25535 + }, + { + "epoch": 0.4554632040809046, + "grad_norm": 0.2910325527191162, + "learning_rate": 3.309985715691834e-05, + "loss": 0.1621, + "step": 25536 + }, + { + "epoch": 0.45548104020261837, + "grad_norm": 0.2201349139213562, + "learning_rate": 3.309838459315059e-05, + "loss": 0.1559, + "step": 25537 + }, + { + "epoch": 0.45549887632433206, + "grad_norm": 0.3200796842575073, + "learning_rate": 3.309691199799071e-05, + "loss": 0.1392, + "step": 25538 + }, + { + "epoch": 0.45551671244604575, + "grad_norm": 0.21429410576820374, + "learning_rate": 3.3095439371444407e-05, + "loss": 0.1563, + "step": 25539 + }, + { + "epoch": 0.45553454856775943, + "grad_norm": 0.2324209362268448, + "learning_rate": 3.309396671351737e-05, + "loss": 0.1487, + "step": 25540 + }, + { + "epoch": 0.4555523846894731, + "grad_norm": 0.3225129544734955, + "learning_rate": 3.3092494024215336e-05, + "loss": 0.1525, + "step": 25541 + }, + { + "epoch": 0.4555702208111868, + "grad_norm": 0.24083071947097778, + "learning_rate": 3.309102130354399e-05, + "loss": 0.1389, + "step": 25542 + }, + { + "epoch": 0.4555880569329005, + "grad_norm": 0.3956213593482971, + "learning_rate": 3.3089548551509054e-05, + "loss": 0.1213, + "step": 25543 + }, + { + "epoch": 0.4556058930546142, + "grad_norm": 0.4100399911403656, + "learning_rate": 3.3088075768116233e-05, + "loss": 0.2187, + "step": 25544 + }, + { + "epoch": 0.4556237291763279, + "grad_norm": 0.2815300524234772, + "learning_rate": 3.308660295337124e-05, + "loss": 0.1867, + "step": 25545 + }, + { + "epoch": 0.4556415652980416, + "grad_norm": 0.22833020985126495, + "learning_rate": 3.308513010727978e-05, + "loss": 0.1356, + "step": 25546 + }, + { + "epoch": 0.4556594014197553, + "grad_norm": 0.261622816324234, + "learning_rate": 3.3083657229847566e-05, + "loss": 0.1951, + "step": 25547 + }, + { + "epoch": 0.455677237541469, + "grad_norm": 0.24547246098518372, + "learning_rate": 3.30821843210803e-05, + "loss": 0.1254, + "step": 25548 + }, + { + "epoch": 0.4556950736631827, + "grad_norm": 0.2390258014202118, + "learning_rate": 3.30807113809837e-05, + "loss": 0.1193, + "step": 25549 + }, + { + "epoch": 0.45571290978489637, + "grad_norm": 0.39300480484962463, + "learning_rate": 3.3079238409563467e-05, + "loss": 0.136, + "step": 25550 + }, + { + "epoch": 0.45573074590661006, + "grad_norm": 0.22021755576133728, + "learning_rate": 3.307776540682532e-05, + "loss": 0.1187, + "step": 25551 + }, + { + "epoch": 0.45574858202832375, + "grad_norm": 0.25812453031539917, + "learning_rate": 3.3076292372774963e-05, + "loss": 0.1799, + "step": 25552 + }, + { + "epoch": 0.45576641815003743, + "grad_norm": 0.2631734311580658, + "learning_rate": 3.30748193074181e-05, + "loss": 0.1242, + "step": 25553 + }, + { + "epoch": 0.4557842542717512, + "grad_norm": 0.2188076674938202, + "learning_rate": 3.3073346210760466e-05, + "loss": 0.1357, + "step": 25554 + }, + { + "epoch": 0.45580209039346486, + "grad_norm": 0.3625791668891907, + "learning_rate": 3.3071873082807734e-05, + "loss": 0.1306, + "step": 25555 + }, + { + "epoch": 0.45581992651517855, + "grad_norm": 0.267915815114975, + "learning_rate": 3.3070399923565645e-05, + "loss": 0.1634, + "step": 25556 + }, + { + "epoch": 0.45583776263689224, + "grad_norm": 0.2879968285560608, + "learning_rate": 3.306892673303989e-05, + "loss": 0.1633, + "step": 25557 + }, + { + "epoch": 0.45585559875860593, + "grad_norm": 0.3673526644706726, + "learning_rate": 3.30674535112362e-05, + "loss": 0.1339, + "step": 25558 + }, + { + "epoch": 0.4558734348803196, + "grad_norm": 0.21310196816921234, + "learning_rate": 3.306598025816027e-05, + "loss": 0.1638, + "step": 25559 + }, + { + "epoch": 0.4558912710020333, + "grad_norm": 0.19753588736057281, + "learning_rate": 3.306450697381781e-05, + "loss": 0.1248, + "step": 25560 + }, + { + "epoch": 0.455909107123747, + "grad_norm": 0.19765333831310272, + "learning_rate": 3.306303365821454e-05, + "loss": 0.1091, + "step": 25561 + }, + { + "epoch": 0.4559269432454607, + "grad_norm": 0.29036945104599, + "learning_rate": 3.3061560311356165e-05, + "loss": 0.1625, + "step": 25562 + }, + { + "epoch": 0.4559447793671744, + "grad_norm": 0.3032093346118927, + "learning_rate": 3.30600869332484e-05, + "loss": 0.1787, + "step": 25563 + }, + { + "epoch": 0.4559626154888881, + "grad_norm": 0.27889686822891235, + "learning_rate": 3.305861352389695e-05, + "loss": 0.131, + "step": 25564 + }, + { + "epoch": 0.4559804516106018, + "grad_norm": 0.30255818367004395, + "learning_rate": 3.305714008330753e-05, + "loss": 0.1746, + "step": 25565 + }, + { + "epoch": 0.4559982877323155, + "grad_norm": 0.25976645946502686, + "learning_rate": 3.3055666611485854e-05, + "loss": 0.1207, + "step": 25566 + }, + { + "epoch": 0.4560161238540292, + "grad_norm": 0.3765120208263397, + "learning_rate": 3.305419310843764e-05, + "loss": 0.1992, + "step": 25567 + }, + { + "epoch": 0.45603395997574286, + "grad_norm": 0.2526637017726898, + "learning_rate": 3.3052719574168576e-05, + "loss": 0.1319, + "step": 25568 + }, + { + "epoch": 0.45605179609745655, + "grad_norm": 0.31131696701049805, + "learning_rate": 3.30512460086844e-05, + "loss": 0.1508, + "step": 25569 + }, + { + "epoch": 0.45606963221917024, + "grad_norm": 0.30652379989624023, + "learning_rate": 3.304977241199081e-05, + "loss": 0.1452, + "step": 25570 + }, + { + "epoch": 0.456087468340884, + "grad_norm": 0.26475590467453003, + "learning_rate": 3.304829878409352e-05, + "loss": 0.1306, + "step": 25571 + }, + { + "epoch": 0.45610530446259767, + "grad_norm": 0.3271113336086273, + "learning_rate": 3.304682512499825e-05, + "loss": 0.1443, + "step": 25572 + }, + { + "epoch": 0.45612314058431136, + "grad_norm": 0.25378769636154175, + "learning_rate": 3.3045351434710705e-05, + "loss": 0.1361, + "step": 25573 + }, + { + "epoch": 0.45614097670602505, + "grad_norm": 0.23429793119430542, + "learning_rate": 3.30438777132366e-05, + "loss": 0.0991, + "step": 25574 + }, + { + "epoch": 0.45615881282773874, + "grad_norm": 0.4037632942199707, + "learning_rate": 3.3042403960581646e-05, + "loss": 0.1542, + "step": 25575 + }, + { + "epoch": 0.4561766489494524, + "grad_norm": 0.35451459884643555, + "learning_rate": 3.304093017675155e-05, + "loss": 0.2035, + "step": 25576 + }, + { + "epoch": 0.4561944850711661, + "grad_norm": 0.2875897288322449, + "learning_rate": 3.3039456361752045e-05, + "loss": 0.1262, + "step": 25577 + }, + { + "epoch": 0.4562123211928798, + "grad_norm": 0.25457751750946045, + "learning_rate": 3.3037982515588815e-05, + "loss": 0.1488, + "step": 25578 + }, + { + "epoch": 0.4562301573145935, + "grad_norm": 0.34255313873291016, + "learning_rate": 3.303650863826759e-05, + "loss": 0.1206, + "step": 25579 + }, + { + "epoch": 0.45624799343630723, + "grad_norm": 0.453331857919693, + "learning_rate": 3.30350347297941e-05, + "loss": 0.1487, + "step": 25580 + }, + { + "epoch": 0.4562658295580209, + "grad_norm": 0.23620760440826416, + "learning_rate": 3.303356079017402e-05, + "loss": 0.1615, + "step": 25581 + }, + { + "epoch": 0.4562836656797346, + "grad_norm": 0.18880541622638702, + "learning_rate": 3.3032086819413094e-05, + "loss": 0.1306, + "step": 25582 + }, + { + "epoch": 0.4563015018014483, + "grad_norm": 0.3897157311439514, + "learning_rate": 3.303061281751702e-05, + "loss": 0.1168, + "step": 25583 + }, + { + "epoch": 0.456319337923162, + "grad_norm": 0.26303455233573914, + "learning_rate": 3.302913878449153e-05, + "loss": 0.1024, + "step": 25584 + }, + { + "epoch": 0.45633717404487567, + "grad_norm": 0.2909948527812958, + "learning_rate": 3.302766472034231e-05, + "loss": 0.1377, + "step": 25585 + }, + { + "epoch": 0.45635501016658936, + "grad_norm": 0.21276971697807312, + "learning_rate": 3.302619062507509e-05, + "loss": 0.1333, + "step": 25586 + }, + { + "epoch": 0.45637284628830305, + "grad_norm": 0.26306089758872986, + "learning_rate": 3.302471649869559e-05, + "loss": 0.2211, + "step": 25587 + }, + { + "epoch": 0.4563906824100168, + "grad_norm": 0.25632795691490173, + "learning_rate": 3.302324234120951e-05, + "loss": 0.1043, + "step": 25588 + }, + { + "epoch": 0.4564085185317305, + "grad_norm": 0.28908053040504456, + "learning_rate": 3.302176815262257e-05, + "loss": 0.1798, + "step": 25589 + }, + { + "epoch": 0.45642635465344417, + "grad_norm": 0.22195380926132202, + "learning_rate": 3.302029393294049e-05, + "loss": 0.134, + "step": 25590 + }, + { + "epoch": 0.45644419077515785, + "grad_norm": 0.2504135072231293, + "learning_rate": 3.301881968216899e-05, + "loss": 0.1528, + "step": 25591 + }, + { + "epoch": 0.45646202689687154, + "grad_norm": 0.30597007274627686, + "learning_rate": 3.301734540031376e-05, + "loss": 0.1501, + "step": 25592 + }, + { + "epoch": 0.45647986301858523, + "grad_norm": 0.19958379864692688, + "learning_rate": 3.301587108738054e-05, + "loss": 0.1291, + "step": 25593 + }, + { + "epoch": 0.4564976991402989, + "grad_norm": 0.2568812668323517, + "learning_rate": 3.301439674337503e-05, + "loss": 0.1225, + "step": 25594 + }, + { + "epoch": 0.4565155352620126, + "grad_norm": 0.26551640033721924, + "learning_rate": 3.301292236830295e-05, + "loss": 0.1088, + "step": 25595 + }, + { + "epoch": 0.45653337138372635, + "grad_norm": 0.15709362924098969, + "learning_rate": 3.301144796217002e-05, + "loss": 0.1291, + "step": 25596 + }, + { + "epoch": 0.45655120750544004, + "grad_norm": 0.25126639008522034, + "learning_rate": 3.300997352498195e-05, + "loss": 0.1855, + "step": 25597 + }, + { + "epoch": 0.4565690436271537, + "grad_norm": 0.3434886932373047, + "learning_rate": 3.3008499056744456e-05, + "loss": 0.153, + "step": 25598 + }, + { + "epoch": 0.4565868797488674, + "grad_norm": 0.27629926800727844, + "learning_rate": 3.300702455746325e-05, + "loss": 0.2016, + "step": 25599 + }, + { + "epoch": 0.4566047158705811, + "grad_norm": 0.24353432655334473, + "learning_rate": 3.300555002714405e-05, + "loss": 0.1762, + "step": 25600 + }, + { + "epoch": 0.4566225519922948, + "grad_norm": 0.3592023253440857, + "learning_rate": 3.300407546579258e-05, + "loss": 0.1627, + "step": 25601 + }, + { + "epoch": 0.4566403881140085, + "grad_norm": 0.27998852729797363, + "learning_rate": 3.300260087341455e-05, + "loss": 0.1385, + "step": 25602 + }, + { + "epoch": 0.45665822423572217, + "grad_norm": 0.3741909861564636, + "learning_rate": 3.300112625001567e-05, + "loss": 0.18, + "step": 25603 + }, + { + "epoch": 0.45667606035743585, + "grad_norm": 0.24563151597976685, + "learning_rate": 3.299965159560166e-05, + "loss": 0.1241, + "step": 25604 + }, + { + "epoch": 0.4566938964791496, + "grad_norm": 0.21539169549942017, + "learning_rate": 3.299817691017824e-05, + "loss": 0.1592, + "step": 25605 + }, + { + "epoch": 0.4567117326008633, + "grad_norm": 0.31977224349975586, + "learning_rate": 3.299670219375112e-05, + "loss": 0.1118, + "step": 25606 + }, + { + "epoch": 0.456729568722577, + "grad_norm": 0.2661239206790924, + "learning_rate": 3.299522744632602e-05, + "loss": 0.172, + "step": 25607 + }, + { + "epoch": 0.45674740484429066, + "grad_norm": 0.30055010318756104, + "learning_rate": 3.2993752667908665e-05, + "loss": 0.1808, + "step": 25608 + }, + { + "epoch": 0.45676524096600435, + "grad_norm": 0.19384883344173431, + "learning_rate": 3.299227785850476e-05, + "loss": 0.1122, + "step": 25609 + }, + { + "epoch": 0.45678307708771804, + "grad_norm": 0.3133239150047302, + "learning_rate": 3.299080301812002e-05, + "loss": 0.16, + "step": 25610 + }, + { + "epoch": 0.4568009132094317, + "grad_norm": 0.23757590353488922, + "learning_rate": 3.298932814676017e-05, + "loss": 0.1265, + "step": 25611 + }, + { + "epoch": 0.4568187493311454, + "grad_norm": 0.25030988454818726, + "learning_rate": 3.2987853244430935e-05, + "loss": 0.1857, + "step": 25612 + }, + { + "epoch": 0.45683658545285916, + "grad_norm": 0.22245270013809204, + "learning_rate": 3.2986378311138e-05, + "loss": 0.1612, + "step": 25613 + }, + { + "epoch": 0.45685442157457284, + "grad_norm": 0.28517913818359375, + "learning_rate": 3.298490334688712e-05, + "loss": 0.169, + "step": 25614 + }, + { + "epoch": 0.45687225769628653, + "grad_norm": 0.358346551656723, + "learning_rate": 3.2983428351684e-05, + "loss": 0.1242, + "step": 25615 + }, + { + "epoch": 0.4568900938180002, + "grad_norm": 0.48526445031166077, + "learning_rate": 3.298195332553435e-05, + "loss": 0.1863, + "step": 25616 + }, + { + "epoch": 0.4569079299397139, + "grad_norm": 0.23559698462486267, + "learning_rate": 3.298047826844389e-05, + "loss": 0.1684, + "step": 25617 + }, + { + "epoch": 0.4569257660614276, + "grad_norm": 0.25800377130508423, + "learning_rate": 3.297900318041834e-05, + "loss": 0.1399, + "step": 25618 + }, + { + "epoch": 0.4569436021831413, + "grad_norm": 0.3152902126312256, + "learning_rate": 3.297752806146341e-05, + "loss": 0.0944, + "step": 25619 + }, + { + "epoch": 0.45696143830485497, + "grad_norm": 0.2307669073343277, + "learning_rate": 3.297605291158484e-05, + "loss": 0.1688, + "step": 25620 + }, + { + "epoch": 0.45697927442656866, + "grad_norm": 0.31882476806640625, + "learning_rate": 3.297457773078833e-05, + "loss": 0.0921, + "step": 25621 + }, + { + "epoch": 0.4569971105482824, + "grad_norm": 0.3550775349140167, + "learning_rate": 3.2973102519079595e-05, + "loss": 0.1663, + "step": 25622 + }, + { + "epoch": 0.4570149466699961, + "grad_norm": 0.24725449085235596, + "learning_rate": 3.297162727646437e-05, + "loss": 0.1162, + "step": 25623 + }, + { + "epoch": 0.4570327827917098, + "grad_norm": 0.2853067219257355, + "learning_rate": 3.297015200294836e-05, + "loss": 0.1615, + "step": 25624 + }, + { + "epoch": 0.45705061891342347, + "grad_norm": 0.31117144227027893, + "learning_rate": 3.296867669853729e-05, + "loss": 0.2383, + "step": 25625 + }, + { + "epoch": 0.45706845503513716, + "grad_norm": 0.24993376433849335, + "learning_rate": 3.2967201363236874e-05, + "loss": 0.1578, + "step": 25626 + }, + { + "epoch": 0.45708629115685084, + "grad_norm": 0.22071373462677002, + "learning_rate": 3.296572599705284e-05, + "loss": 0.117, + "step": 25627 + }, + { + "epoch": 0.45710412727856453, + "grad_norm": 0.3490014672279358, + "learning_rate": 3.296425059999089e-05, + "loss": 0.1469, + "step": 25628 + }, + { + "epoch": 0.4571219634002782, + "grad_norm": 0.2714429199695587, + "learning_rate": 3.2962775172056766e-05, + "loss": 0.1429, + "step": 25629 + }, + { + "epoch": 0.45713979952199196, + "grad_norm": 0.3379661738872528, + "learning_rate": 3.296129971325617e-05, + "loss": 0.1922, + "step": 25630 + }, + { + "epoch": 0.45715763564370565, + "grad_norm": 0.2674955129623413, + "learning_rate": 3.2959824223594826e-05, + "loss": 0.132, + "step": 25631 + }, + { + "epoch": 0.45717547176541934, + "grad_norm": 0.32472437620162964, + "learning_rate": 3.2958348703078455e-05, + "loss": 0.157, + "step": 25632 + }, + { + "epoch": 0.457193307887133, + "grad_norm": 0.29699936509132385, + "learning_rate": 3.295687315171278e-05, + "loss": 0.1462, + "step": 25633 + }, + { + "epoch": 0.4572111440088467, + "grad_norm": 0.2153027504682541, + "learning_rate": 3.29553975695035e-05, + "loss": 0.1318, + "step": 25634 + }, + { + "epoch": 0.4572289801305604, + "grad_norm": 0.32456353306770325, + "learning_rate": 3.2953921956456364e-05, + "loss": 0.1715, + "step": 25635 + }, + { + "epoch": 0.4572468162522741, + "grad_norm": 0.295696496963501, + "learning_rate": 3.295244631257708e-05, + "loss": 0.2638, + "step": 25636 + }, + { + "epoch": 0.4572646523739878, + "grad_norm": 0.2974448502063751, + "learning_rate": 3.295097063787136e-05, + "loss": 0.1177, + "step": 25637 + }, + { + "epoch": 0.4572824884957015, + "grad_norm": 0.27639755606651306, + "learning_rate": 3.2949494932344944e-05, + "loss": 0.1465, + "step": 25638 + }, + { + "epoch": 0.4573003246174152, + "grad_norm": 0.21070195734500885, + "learning_rate": 3.2948019196003535e-05, + "loss": 0.1158, + "step": 25639 + }, + { + "epoch": 0.4573181607391289, + "grad_norm": 0.26121270656585693, + "learning_rate": 3.2946543428852854e-05, + "loss": 0.1087, + "step": 25640 + }, + { + "epoch": 0.4573359968608426, + "grad_norm": 0.2552196979522705, + "learning_rate": 3.294506763089863e-05, + "loss": 0.1646, + "step": 25641 + }, + { + "epoch": 0.4573538329825563, + "grad_norm": 0.25155118107795715, + "learning_rate": 3.2943591802146574e-05, + "loss": 0.1557, + "step": 25642 + }, + { + "epoch": 0.45737166910426996, + "grad_norm": 0.34909623861312866, + "learning_rate": 3.294211594260242e-05, + "loss": 0.1354, + "step": 25643 + }, + { + "epoch": 0.45738950522598365, + "grad_norm": 0.2594251334667206, + "learning_rate": 3.294064005227188e-05, + "loss": 0.1467, + "step": 25644 + }, + { + "epoch": 0.45740734134769734, + "grad_norm": 0.3125285506248474, + "learning_rate": 3.293916413116067e-05, + "loss": 0.1207, + "step": 25645 + }, + { + "epoch": 0.457425177469411, + "grad_norm": 0.18811379373073578, + "learning_rate": 3.2937688179274525e-05, + "loss": 0.1178, + "step": 25646 + }, + { + "epoch": 0.45744301359112477, + "grad_norm": 0.2780362665653229, + "learning_rate": 3.2936212196619154e-05, + "loss": 0.1679, + "step": 25647 + }, + { + "epoch": 0.45746084971283846, + "grad_norm": 0.27849629521369934, + "learning_rate": 3.2934736183200284e-05, + "loss": 0.1509, + "step": 25648 + }, + { + "epoch": 0.45747868583455215, + "grad_norm": 0.345282644033432, + "learning_rate": 3.2933260139023635e-05, + "loss": 0.1437, + "step": 25649 + }, + { + "epoch": 0.45749652195626583, + "grad_norm": 0.26852524280548096, + "learning_rate": 3.2931784064094934e-05, + "loss": 0.1473, + "step": 25650 + }, + { + "epoch": 0.4575143580779795, + "grad_norm": 0.41945159435272217, + "learning_rate": 3.29303079584199e-05, + "loss": 0.1387, + "step": 25651 + }, + { + "epoch": 0.4575321941996932, + "grad_norm": 0.2789818048477173, + "learning_rate": 3.292883182200425e-05, + "loss": 0.1411, + "step": 25652 + }, + { + "epoch": 0.4575500303214069, + "grad_norm": 0.19430217146873474, + "learning_rate": 3.292735565485371e-05, + "loss": 0.1002, + "step": 25653 + }, + { + "epoch": 0.4575678664431206, + "grad_norm": 0.24267876148223877, + "learning_rate": 3.2925879456973996e-05, + "loss": 0.103, + "step": 25654 + }, + { + "epoch": 0.45758570256483433, + "grad_norm": 0.29199084639549255, + "learning_rate": 3.292440322837084e-05, + "loss": 0.1338, + "step": 25655 + }, + { + "epoch": 0.457603538686548, + "grad_norm": 0.23551107943058014, + "learning_rate": 3.292292696904996e-05, + "loss": 0.1415, + "step": 25656 + }, + { + "epoch": 0.4576213748082617, + "grad_norm": 0.2686185836791992, + "learning_rate": 3.292145067901708e-05, + "loss": 0.1591, + "step": 25657 + }, + { + "epoch": 0.4576392109299754, + "grad_norm": 0.28387245535850525, + "learning_rate": 3.291997435827793e-05, + "loss": 0.1721, + "step": 25658 + }, + { + "epoch": 0.4576570470516891, + "grad_norm": 0.21101944148540497, + "learning_rate": 3.291849800683821e-05, + "loss": 0.1204, + "step": 25659 + }, + { + "epoch": 0.45767488317340277, + "grad_norm": 0.31178781390190125, + "learning_rate": 3.291702162470366e-05, + "loss": 0.1905, + "step": 25660 + }, + { + "epoch": 0.45769271929511646, + "grad_norm": 0.2514325678348541, + "learning_rate": 3.291554521188e-05, + "loss": 0.1569, + "step": 25661 + }, + { + "epoch": 0.45771055541683014, + "grad_norm": 0.2219846397638321, + "learning_rate": 3.291406876837295e-05, + "loss": 0.1405, + "step": 25662 + }, + { + "epoch": 0.45772839153854383, + "grad_norm": 0.22639405727386475, + "learning_rate": 3.2912592294188236e-05, + "loss": 0.1088, + "step": 25663 + }, + { + "epoch": 0.4577462276602576, + "grad_norm": 0.26393505930900574, + "learning_rate": 3.291111578933159e-05, + "loss": 0.1104, + "step": 25664 + }, + { + "epoch": 0.45776406378197126, + "grad_norm": 0.34459200501441956, + "learning_rate": 3.290963925380872e-05, + "loss": 0.1405, + "step": 25665 + }, + { + "epoch": 0.45778189990368495, + "grad_norm": 0.1711176335811615, + "learning_rate": 3.2908162687625365e-05, + "loss": 0.1219, + "step": 25666 + }, + { + "epoch": 0.45779973602539864, + "grad_norm": 0.21426557004451752, + "learning_rate": 3.290668609078723e-05, + "loss": 0.0925, + "step": 25667 + }, + { + "epoch": 0.45781757214711233, + "grad_norm": 0.2660115361213684, + "learning_rate": 3.2905209463300055e-05, + "loss": 0.1416, + "step": 25668 + }, + { + "epoch": 0.457835408268826, + "grad_norm": 0.34632986783981323, + "learning_rate": 3.290373280516955e-05, + "loss": 0.1243, + "step": 25669 + }, + { + "epoch": 0.4578532443905397, + "grad_norm": 0.22129449248313904, + "learning_rate": 3.2902256116401454e-05, + "loss": 0.1364, + "step": 25670 + }, + { + "epoch": 0.4578710805122534, + "grad_norm": 0.3137301802635193, + "learning_rate": 3.290077939700148e-05, + "loss": 0.1056, + "step": 25671 + }, + { + "epoch": 0.45788891663396714, + "grad_norm": 0.2301984578371048, + "learning_rate": 3.2899302646975363e-05, + "loss": 0.119, + "step": 25672 + }, + { + "epoch": 0.4579067527556808, + "grad_norm": 0.2068316489458084, + "learning_rate": 3.2897825866328816e-05, + "loss": 0.1175, + "step": 25673 + }, + { + "epoch": 0.4579245888773945, + "grad_norm": 0.23619364202022552, + "learning_rate": 3.289634905506756e-05, + "loss": 0.108, + "step": 25674 + }, + { + "epoch": 0.4579424249991082, + "grad_norm": 0.32757872343063354, + "learning_rate": 3.289487221319734e-05, + "loss": 0.2001, + "step": 25675 + }, + { + "epoch": 0.4579602611208219, + "grad_norm": 0.2315714955329895, + "learning_rate": 3.289339534072386e-05, + "loss": 0.1303, + "step": 25676 + }, + { + "epoch": 0.4579780972425356, + "grad_norm": 0.2705220580101013, + "learning_rate": 3.2891918437652856e-05, + "loss": 0.2017, + "step": 25677 + }, + { + "epoch": 0.45799593336424926, + "grad_norm": 0.2685730755329132, + "learning_rate": 3.289044150399005e-05, + "loss": 0.1502, + "step": 25678 + }, + { + "epoch": 0.45801376948596295, + "grad_norm": 0.31895095109939575, + "learning_rate": 3.2888964539741176e-05, + "loss": 0.1508, + "step": 25679 + }, + { + "epoch": 0.45803160560767664, + "grad_norm": 0.24345429241657257, + "learning_rate": 3.288748754491194e-05, + "loss": 0.1813, + "step": 25680 + }, + { + "epoch": 0.4580494417293904, + "grad_norm": 0.24111449718475342, + "learning_rate": 3.288601051950808e-05, + "loss": 0.154, + "step": 25681 + }, + { + "epoch": 0.45806727785110407, + "grad_norm": 0.2795223593711853, + "learning_rate": 3.288453346353532e-05, + "loss": 0.086, + "step": 25682 + }, + { + "epoch": 0.45808511397281776, + "grad_norm": 0.1886620670557022, + "learning_rate": 3.2883056376999386e-05, + "loss": 0.1117, + "step": 25683 + }, + { + "epoch": 0.45810295009453145, + "grad_norm": 0.2418023943901062, + "learning_rate": 3.2881579259906005e-05, + "loss": 0.1297, + "step": 25684 + }, + { + "epoch": 0.45812078621624513, + "grad_norm": 0.26265498995780945, + "learning_rate": 3.28801021122609e-05, + "loss": 0.1355, + "step": 25685 + }, + { + "epoch": 0.4581386223379588, + "grad_norm": 0.2866142988204956, + "learning_rate": 3.28786249340698e-05, + "loss": 0.127, + "step": 25686 + }, + { + "epoch": 0.4581564584596725, + "grad_norm": 0.27637115120887756, + "learning_rate": 3.287714772533842e-05, + "loss": 0.1616, + "step": 25687 + }, + { + "epoch": 0.4581742945813862, + "grad_norm": 0.25364696979522705, + "learning_rate": 3.28756704860725e-05, + "loss": 0.1821, + "step": 25688 + }, + { + "epoch": 0.45819213070309994, + "grad_norm": 0.24076221883296967, + "learning_rate": 3.287419321627776e-05, + "loss": 0.1768, + "step": 25689 + }, + { + "epoch": 0.45820996682481363, + "grad_norm": 0.2482614517211914, + "learning_rate": 3.287271591595993e-05, + "loss": 0.1215, + "step": 25690 + }, + { + "epoch": 0.4582278029465273, + "grad_norm": 0.3314327597618103, + "learning_rate": 3.2871238585124724e-05, + "loss": 0.2124, + "step": 25691 + }, + { + "epoch": 0.458245639068241, + "grad_norm": 0.27108749747276306, + "learning_rate": 3.2869761223777893e-05, + "loss": 0.1797, + "step": 25692 + }, + { + "epoch": 0.4582634751899547, + "grad_norm": 0.22751684486865997, + "learning_rate": 3.286828383192514e-05, + "loss": 0.1774, + "step": 25693 + }, + { + "epoch": 0.4582813113116684, + "grad_norm": 0.26826396584510803, + "learning_rate": 3.28668064095722e-05, + "loss": 0.1614, + "step": 25694 + }, + { + "epoch": 0.45829914743338207, + "grad_norm": 0.24261720478534698, + "learning_rate": 3.2865328956724814e-05, + "loss": 0.1095, + "step": 25695 + }, + { + "epoch": 0.45831698355509576, + "grad_norm": 0.28999802470207214, + "learning_rate": 3.286385147338868e-05, + "loss": 0.1738, + "step": 25696 + }, + { + "epoch": 0.4583348196768095, + "grad_norm": 0.2743673324584961, + "learning_rate": 3.286237395956955e-05, + "loss": 0.1484, + "step": 25697 + }, + { + "epoch": 0.4583526557985232, + "grad_norm": 0.2501561641693115, + "learning_rate": 3.286089641527315e-05, + "loss": 0.1185, + "step": 25698 + }, + { + "epoch": 0.4583704919202369, + "grad_norm": 0.33886897563934326, + "learning_rate": 3.285941884050519e-05, + "loss": 0.1533, + "step": 25699 + }, + { + "epoch": 0.45838832804195057, + "grad_norm": 0.21820585429668427, + "learning_rate": 3.2857941235271405e-05, + "loss": 0.1125, + "step": 25700 + }, + { + "epoch": 0.45840616416366425, + "grad_norm": 0.2532954812049866, + "learning_rate": 3.2856463599577535e-05, + "loss": 0.1382, + "step": 25701 + }, + { + "epoch": 0.45842400028537794, + "grad_norm": 0.23721110820770264, + "learning_rate": 3.2854985933429293e-05, + "loss": 0.127, + "step": 25702 + }, + { + "epoch": 0.45844183640709163, + "grad_norm": 0.26717960834503174, + "learning_rate": 3.285350823683241e-05, + "loss": 0.1609, + "step": 25703 + }, + { + "epoch": 0.4584596725288053, + "grad_norm": 0.2746828496456146, + "learning_rate": 3.2852030509792626e-05, + "loss": 0.1656, + "step": 25704 + }, + { + "epoch": 0.458477508650519, + "grad_norm": 0.2644045650959015, + "learning_rate": 3.285055275231566e-05, + "loss": 0.1164, + "step": 25705 + }, + { + "epoch": 0.45849534477223275, + "grad_norm": 0.29921290278434753, + "learning_rate": 3.284907496440723e-05, + "loss": 0.1783, + "step": 25706 + }, + { + "epoch": 0.45851318089394644, + "grad_norm": 0.2715904116630554, + "learning_rate": 3.284759714607308e-05, + "loss": 0.1323, + "step": 25707 + }, + { + "epoch": 0.4585310170156601, + "grad_norm": 0.24322174489498138, + "learning_rate": 3.284611929731893e-05, + "loss": 0.1721, + "step": 25708 + }, + { + "epoch": 0.4585488531373738, + "grad_norm": 0.20780092477798462, + "learning_rate": 3.284464141815052e-05, + "loss": 0.0977, + "step": 25709 + }, + { + "epoch": 0.4585666892590875, + "grad_norm": 0.23517489433288574, + "learning_rate": 3.284316350857356e-05, + "loss": 0.1111, + "step": 25710 + }, + { + "epoch": 0.4585845253808012, + "grad_norm": 0.238978773355484, + "learning_rate": 3.284168556859379e-05, + "loss": 0.1394, + "step": 25711 + }, + { + "epoch": 0.4586023615025149, + "grad_norm": 0.22924506664276123, + "learning_rate": 3.284020759821694e-05, + "loss": 0.1643, + "step": 25712 + }, + { + "epoch": 0.45862019762422856, + "grad_norm": 0.19677604734897614, + "learning_rate": 3.283872959744874e-05, + "loss": 0.157, + "step": 25713 + }, + { + "epoch": 0.4586380337459423, + "grad_norm": 0.26793423295021057, + "learning_rate": 3.2837251566294926e-05, + "loss": 0.1796, + "step": 25714 + }, + { + "epoch": 0.458655869867656, + "grad_norm": 0.3087601661682129, + "learning_rate": 3.2835773504761205e-05, + "loss": 0.1513, + "step": 25715 + }, + { + "epoch": 0.4586737059893697, + "grad_norm": 0.28473737835884094, + "learning_rate": 3.283429541285332e-05, + "loss": 0.1631, + "step": 25716 + }, + { + "epoch": 0.4586915421110834, + "grad_norm": 0.3098143935203552, + "learning_rate": 3.2832817290577e-05, + "loss": 0.1234, + "step": 25717 + }, + { + "epoch": 0.45870937823279706, + "grad_norm": 0.25014400482177734, + "learning_rate": 3.283133913793798e-05, + "loss": 0.104, + "step": 25718 + }, + { + "epoch": 0.45872721435451075, + "grad_norm": 0.33132413029670715, + "learning_rate": 3.2829860954941976e-05, + "loss": 0.1606, + "step": 25719 + }, + { + "epoch": 0.45874505047622444, + "grad_norm": 0.33311113715171814, + "learning_rate": 3.2828382741594736e-05, + "loss": 0.1864, + "step": 25720 + }, + { + "epoch": 0.4587628865979381, + "grad_norm": 0.25058743357658386, + "learning_rate": 3.282690449790198e-05, + "loss": 0.1192, + "step": 25721 + }, + { + "epoch": 0.4587807227196518, + "grad_norm": 0.22558225691318512, + "learning_rate": 3.2825426223869436e-05, + "loss": 0.1485, + "step": 25722 + }, + { + "epoch": 0.45879855884136556, + "grad_norm": 0.22102046012878418, + "learning_rate": 3.282394791950284e-05, + "loss": 0.1386, + "step": 25723 + }, + { + "epoch": 0.45881639496307924, + "grad_norm": 0.24871978163719177, + "learning_rate": 3.2822469584807906e-05, + "loss": 0.0817, + "step": 25724 + }, + { + "epoch": 0.45883423108479293, + "grad_norm": 0.35498183965682983, + "learning_rate": 3.28209912197904e-05, + "loss": 0.164, + "step": 25725 + }, + { + "epoch": 0.4588520672065066, + "grad_norm": 0.21349261701107025, + "learning_rate": 3.281951282445601e-05, + "loss": 0.135, + "step": 25726 + }, + { + "epoch": 0.4588699033282203, + "grad_norm": 0.36893653869628906, + "learning_rate": 3.2818034398810504e-05, + "loss": 0.1557, + "step": 25727 + }, + { + "epoch": 0.458887739449934, + "grad_norm": 0.3415740132331848, + "learning_rate": 3.281655594285959e-05, + "loss": 0.1804, + "step": 25728 + }, + { + "epoch": 0.4589055755716477, + "grad_norm": 0.25834035873413086, + "learning_rate": 3.281507745660901e-05, + "loss": 0.1252, + "step": 25729 + }, + { + "epoch": 0.45892341169336137, + "grad_norm": 0.26537999510765076, + "learning_rate": 3.2813598940064475e-05, + "loss": 0.1724, + "step": 25730 + }, + { + "epoch": 0.4589412478150751, + "grad_norm": 0.26515117287635803, + "learning_rate": 3.2812120393231744e-05, + "loss": 0.1675, + "step": 25731 + }, + { + "epoch": 0.4589590839367888, + "grad_norm": 0.20561553537845612, + "learning_rate": 3.2810641816116535e-05, + "loss": 0.1086, + "step": 25732 + }, + { + "epoch": 0.4589769200585025, + "grad_norm": 0.40284013748168945, + "learning_rate": 3.280916320872458e-05, + "loss": 0.1636, + "step": 25733 + }, + { + "epoch": 0.4589947561802162, + "grad_norm": 0.23910824954509735, + "learning_rate": 3.280768457106161e-05, + "loss": 0.1183, + "step": 25734 + }, + { + "epoch": 0.45901259230192987, + "grad_norm": 0.23710238933563232, + "learning_rate": 3.280620590313336e-05, + "loss": 0.1631, + "step": 25735 + }, + { + "epoch": 0.45903042842364356, + "grad_norm": 0.27089130878448486, + "learning_rate": 3.280472720494556e-05, + "loss": 0.1262, + "step": 25736 + }, + { + "epoch": 0.45904826454535724, + "grad_norm": 0.21723240613937378, + "learning_rate": 3.2803248476503944e-05, + "loss": 0.1312, + "step": 25737 + }, + { + "epoch": 0.45906610066707093, + "grad_norm": 0.3460243046283722, + "learning_rate": 3.280176971781423e-05, + "loss": 0.1742, + "step": 25738 + }, + { + "epoch": 0.4590839367887846, + "grad_norm": 0.36793309450149536, + "learning_rate": 3.280029092888217e-05, + "loss": 0.1279, + "step": 25739 + }, + { + "epoch": 0.45910177291049836, + "grad_norm": 0.19602982699871063, + "learning_rate": 3.2798812109713496e-05, + "loss": 0.0997, + "step": 25740 + }, + { + "epoch": 0.45911960903221205, + "grad_norm": 0.21076786518096924, + "learning_rate": 3.279733326031392e-05, + "loss": 0.0915, + "step": 25741 + }, + { + "epoch": 0.45913744515392574, + "grad_norm": 0.25963619351387024, + "learning_rate": 3.27958543806892e-05, + "loss": 0.1126, + "step": 25742 + }, + { + "epoch": 0.4591552812756394, + "grad_norm": 0.25655022263526917, + "learning_rate": 3.279437547084504e-05, + "loss": 0.1376, + "step": 25743 + }, + { + "epoch": 0.4591731173973531, + "grad_norm": 0.2974923551082611, + "learning_rate": 3.2792896530787204e-05, + "loss": 0.2223, + "step": 25744 + }, + { + "epoch": 0.4591909535190668, + "grad_norm": 0.2894544005393982, + "learning_rate": 3.2791417560521396e-05, + "loss": 0.138, + "step": 25745 + }, + { + "epoch": 0.4592087896407805, + "grad_norm": 0.30527371168136597, + "learning_rate": 3.278993856005337e-05, + "loss": 0.1602, + "step": 25746 + }, + { + "epoch": 0.4592266257624942, + "grad_norm": 0.19043420255184174, + "learning_rate": 3.278845952938885e-05, + "loss": 0.1585, + "step": 25747 + }, + { + "epoch": 0.4592444618842079, + "grad_norm": 0.22898824512958527, + "learning_rate": 3.278698046853357e-05, + "loss": 0.1364, + "step": 25748 + }, + { + "epoch": 0.4592622980059216, + "grad_norm": 0.22600287199020386, + "learning_rate": 3.278550137749327e-05, + "loss": 0.1362, + "step": 25749 + }, + { + "epoch": 0.4592801341276353, + "grad_norm": 0.29720863699913025, + "learning_rate": 3.278402225627367e-05, + "loss": 0.0885, + "step": 25750 + }, + { + "epoch": 0.459297970249349, + "grad_norm": 0.3840457499027252, + "learning_rate": 3.278254310488051e-05, + "loss": 0.1833, + "step": 25751 + }, + { + "epoch": 0.4593158063710627, + "grad_norm": 0.27006083726882935, + "learning_rate": 3.2781063923319536e-05, + "loss": 0.161, + "step": 25752 + }, + { + "epoch": 0.45933364249277636, + "grad_norm": 0.26204490661621094, + "learning_rate": 3.277958471159646e-05, + "loss": 0.1046, + "step": 25753 + }, + { + "epoch": 0.45935147861449005, + "grad_norm": 0.33187493681907654, + "learning_rate": 3.277810546971703e-05, + "loss": 0.1329, + "step": 25754 + }, + { + "epoch": 0.45936931473620374, + "grad_norm": 0.30578693747520447, + "learning_rate": 3.277662619768698e-05, + "loss": 0.1403, + "step": 25755 + }, + { + "epoch": 0.4593871508579175, + "grad_norm": 0.34515100717544556, + "learning_rate": 3.2775146895512034e-05, + "loss": 0.1595, + "step": 25756 + }, + { + "epoch": 0.45940498697963117, + "grad_norm": 0.27839985489845276, + "learning_rate": 3.2773667563197943e-05, + "loss": 0.146, + "step": 25757 + }, + { + "epoch": 0.45942282310134486, + "grad_norm": 0.2635040283203125, + "learning_rate": 3.277218820075042e-05, + "loss": 0.1192, + "step": 25758 + }, + { + "epoch": 0.45944065922305855, + "grad_norm": 0.30586007237434387, + "learning_rate": 3.277070880817521e-05, + "loss": 0.1492, + "step": 25759 + }, + { + "epoch": 0.45945849534477223, + "grad_norm": 0.27949824929237366, + "learning_rate": 3.2769229385478064e-05, + "loss": 0.1253, + "step": 25760 + }, + { + "epoch": 0.4594763314664859, + "grad_norm": 0.30623701214790344, + "learning_rate": 3.2767749932664694e-05, + "loss": 0.1744, + "step": 25761 + }, + { + "epoch": 0.4594941675881996, + "grad_norm": 0.2526586949825287, + "learning_rate": 3.276627044974084e-05, + "loss": 0.0899, + "step": 25762 + }, + { + "epoch": 0.4595120037099133, + "grad_norm": 0.2833310067653656, + "learning_rate": 3.276479093671224e-05, + "loss": 0.1041, + "step": 25763 + }, + { + "epoch": 0.459529839831627, + "grad_norm": 0.32900676131248474, + "learning_rate": 3.2763311393584635e-05, + "loss": 0.1456, + "step": 25764 + }, + { + "epoch": 0.45954767595334073, + "grad_norm": 0.22483208775520325, + "learning_rate": 3.276183182036375e-05, + "loss": 0.1556, + "step": 25765 + }, + { + "epoch": 0.4595655120750544, + "grad_norm": 0.31574106216430664, + "learning_rate": 3.276035221705532e-05, + "loss": 0.1552, + "step": 25766 + }, + { + "epoch": 0.4595833481967681, + "grad_norm": 0.22517213225364685, + "learning_rate": 3.275887258366508e-05, + "loss": 0.1506, + "step": 25767 + }, + { + "epoch": 0.4596011843184818, + "grad_norm": 0.22395804524421692, + "learning_rate": 3.275739292019878e-05, + "loss": 0.1131, + "step": 25768 + }, + { + "epoch": 0.4596190204401955, + "grad_norm": 0.28659528493881226, + "learning_rate": 3.275591322666214e-05, + "loss": 0.1116, + "step": 25769 + }, + { + "epoch": 0.45963685656190917, + "grad_norm": 0.38826295733451843, + "learning_rate": 3.2754433503060914e-05, + "loss": 0.132, + "step": 25770 + }, + { + "epoch": 0.45965469268362286, + "grad_norm": 0.34235289692878723, + "learning_rate": 3.275295374940081e-05, + "loss": 0.1004, + "step": 25771 + }, + { + "epoch": 0.45967252880533654, + "grad_norm": 0.24566218256950378, + "learning_rate": 3.2751473965687585e-05, + "loss": 0.136, + "step": 25772 + }, + { + "epoch": 0.4596903649270503, + "grad_norm": 0.29846876859664917, + "learning_rate": 3.274999415192696e-05, + "loss": 0.1794, + "step": 25773 + }, + { + "epoch": 0.459708201048764, + "grad_norm": 0.2336914837360382, + "learning_rate": 3.274851430812469e-05, + "loss": 0.1337, + "step": 25774 + }, + { + "epoch": 0.45972603717047766, + "grad_norm": 0.4262526035308838, + "learning_rate": 3.2747034434286514e-05, + "loss": 0.1014, + "step": 25775 + }, + { + "epoch": 0.45974387329219135, + "grad_norm": 0.3195294141769409, + "learning_rate": 3.274555453041814e-05, + "loss": 0.2094, + "step": 25776 + }, + { + "epoch": 0.45976170941390504, + "grad_norm": 0.3057333827018738, + "learning_rate": 3.274407459652533e-05, + "loss": 0.1279, + "step": 25777 + }, + { + "epoch": 0.4597795455356187, + "grad_norm": 0.22899553179740906, + "learning_rate": 3.2742594632613805e-05, + "loss": 0.1405, + "step": 25778 + }, + { + "epoch": 0.4597973816573324, + "grad_norm": 0.24905577301979065, + "learning_rate": 3.274111463868931e-05, + "loss": 0.1393, + "step": 25779 + }, + { + "epoch": 0.4598152177790461, + "grad_norm": 0.21940824389457703, + "learning_rate": 3.273963461475759e-05, + "loss": 0.1432, + "step": 25780 + }, + { + "epoch": 0.4598330539007598, + "grad_norm": 0.32560619711875916, + "learning_rate": 3.273815456082436e-05, + "loss": 0.1551, + "step": 25781 + }, + { + "epoch": 0.45985089002247354, + "grad_norm": 0.2656085789203644, + "learning_rate": 3.273667447689538e-05, + "loss": 0.1616, + "step": 25782 + }, + { + "epoch": 0.4598687261441872, + "grad_norm": 0.27227163314819336, + "learning_rate": 3.2735194362976374e-05, + "loss": 0.0834, + "step": 25783 + }, + { + "epoch": 0.4598865622659009, + "grad_norm": 0.2706531584262848, + "learning_rate": 3.2733714219073084e-05, + "loss": 0.1177, + "step": 25784 + }, + { + "epoch": 0.4599043983876146, + "grad_norm": 0.3090117871761322, + "learning_rate": 3.273223404519125e-05, + "loss": 0.176, + "step": 25785 + }, + { + "epoch": 0.4599222345093283, + "grad_norm": 0.19468435645103455, + "learning_rate": 3.27307538413366e-05, + "loss": 0.1273, + "step": 25786 + }, + { + "epoch": 0.459940070631042, + "grad_norm": 0.31341978907585144, + "learning_rate": 3.272927360751488e-05, + "loss": 0.1941, + "step": 25787 + }, + { + "epoch": 0.45995790675275566, + "grad_norm": 0.24075721204280853, + "learning_rate": 3.272779334373183e-05, + "loss": 0.135, + "step": 25788 + }, + { + "epoch": 0.45997574287446935, + "grad_norm": 0.28112050890922546, + "learning_rate": 3.272631304999317e-05, + "loss": 0.1821, + "step": 25789 + }, + { + "epoch": 0.4599935789961831, + "grad_norm": 0.2784784734249115, + "learning_rate": 3.2724832726304673e-05, + "loss": 0.1521, + "step": 25790 + }, + { + "epoch": 0.4600114151178968, + "grad_norm": 0.27189746499061584, + "learning_rate": 3.272335237267204e-05, + "loss": 0.1511, + "step": 25791 + }, + { + "epoch": 0.46002925123961047, + "grad_norm": 0.253939151763916, + "learning_rate": 3.272187198910104e-05, + "loss": 0.0698, + "step": 25792 + }, + { + "epoch": 0.46004708736132416, + "grad_norm": 0.307739794254303, + "learning_rate": 3.272039157559738e-05, + "loss": 0.1753, + "step": 25793 + }, + { + "epoch": 0.46006492348303785, + "grad_norm": 0.3136581778526306, + "learning_rate": 3.2718911132166826e-05, + "loss": 0.116, + "step": 25794 + }, + { + "epoch": 0.46008275960475153, + "grad_norm": 0.2769233286380768, + "learning_rate": 3.27174306588151e-05, + "loss": 0.1874, + "step": 25795 + }, + { + "epoch": 0.4601005957264652, + "grad_norm": 0.18698105216026306, + "learning_rate": 3.271595015554796e-05, + "loss": 0.0925, + "step": 25796 + }, + { + "epoch": 0.4601184318481789, + "grad_norm": 0.23098380863666534, + "learning_rate": 3.271446962237112e-05, + "loss": 0.1232, + "step": 25797 + }, + { + "epoch": 0.46013626796989265, + "grad_norm": 0.4390886723995209, + "learning_rate": 3.2712989059290334e-05, + "loss": 0.1363, + "step": 25798 + }, + { + "epoch": 0.46015410409160634, + "grad_norm": 0.197047621011734, + "learning_rate": 3.271150846631134e-05, + "loss": 0.1283, + "step": 25799 + }, + { + "epoch": 0.46017194021332003, + "grad_norm": 0.27739617228507996, + "learning_rate": 3.271002784343988e-05, + "loss": 0.1401, + "step": 25800 + }, + { + "epoch": 0.4601897763350337, + "grad_norm": 0.24595533311367035, + "learning_rate": 3.270854719068168e-05, + "loss": 0.1199, + "step": 25801 + }, + { + "epoch": 0.4602076124567474, + "grad_norm": 0.20866097509860992, + "learning_rate": 3.270706650804249e-05, + "loss": 0.1444, + "step": 25802 + }, + { + "epoch": 0.4602254485784611, + "grad_norm": 0.24671515822410583, + "learning_rate": 3.270558579552806e-05, + "loss": 0.0954, + "step": 25803 + }, + { + "epoch": 0.4602432847001748, + "grad_norm": 0.2498752623796463, + "learning_rate": 3.27041050531441e-05, + "loss": 0.1282, + "step": 25804 + }, + { + "epoch": 0.46026112082188847, + "grad_norm": 0.275075227022171, + "learning_rate": 3.270262428089638e-05, + "loss": 0.0933, + "step": 25805 + }, + { + "epoch": 0.46027895694360216, + "grad_norm": 0.3085978627204895, + "learning_rate": 3.270114347879063e-05, + "loss": 0.1541, + "step": 25806 + }, + { + "epoch": 0.4602967930653159, + "grad_norm": 0.32416579127311707, + "learning_rate": 3.269966264683258e-05, + "loss": 0.1247, + "step": 25807 + }, + { + "epoch": 0.4603146291870296, + "grad_norm": 0.2567242383956909, + "learning_rate": 3.269818178502797e-05, + "loss": 0.1654, + "step": 25808 + }, + { + "epoch": 0.4603324653087433, + "grad_norm": 0.2719188928604126, + "learning_rate": 3.269670089338257e-05, + "loss": 0.1675, + "step": 25809 + }, + { + "epoch": 0.46035030143045697, + "grad_norm": 0.3668282628059387, + "learning_rate": 3.269521997190209e-05, + "loss": 0.0978, + "step": 25810 + }, + { + "epoch": 0.46036813755217065, + "grad_norm": 0.25756388902664185, + "learning_rate": 3.269373902059228e-05, + "loss": 0.1716, + "step": 25811 + }, + { + "epoch": 0.46038597367388434, + "grad_norm": 0.26922938227653503, + "learning_rate": 3.269225803945888e-05, + "loss": 0.1125, + "step": 25812 + }, + { + "epoch": 0.46040380979559803, + "grad_norm": 0.27569660544395447, + "learning_rate": 3.269077702850763e-05, + "loss": 0.1258, + "step": 25813 + }, + { + "epoch": 0.4604216459173117, + "grad_norm": 0.26394906640052795, + "learning_rate": 3.268929598774427e-05, + "loss": 0.1753, + "step": 25814 + }, + { + "epoch": 0.46043948203902546, + "grad_norm": 0.33814337849617004, + "learning_rate": 3.268781491717454e-05, + "loss": 0.1942, + "step": 25815 + }, + { + "epoch": 0.46045731816073915, + "grad_norm": 0.2406303882598877, + "learning_rate": 3.2686333816804194e-05, + "loss": 0.1132, + "step": 25816 + }, + { + "epoch": 0.46047515428245284, + "grad_norm": 0.26674601435661316, + "learning_rate": 3.2684852686638956e-05, + "loss": 0.1203, + "step": 25817 + }, + { + "epoch": 0.4604929904041665, + "grad_norm": 0.21297091245651245, + "learning_rate": 3.268337152668458e-05, + "loss": 0.0782, + "step": 25818 + }, + { + "epoch": 0.4605108265258802, + "grad_norm": 0.2654586732387543, + "learning_rate": 3.2681890336946795e-05, + "loss": 0.1428, + "step": 25819 + }, + { + "epoch": 0.4605286626475939, + "grad_norm": 0.23363183438777924, + "learning_rate": 3.268040911743135e-05, + "loss": 0.1557, + "step": 25820 + }, + { + "epoch": 0.4605464987693076, + "grad_norm": 0.2674151659011841, + "learning_rate": 3.2678927868143994e-05, + "loss": 0.132, + "step": 25821 + }, + { + "epoch": 0.4605643348910213, + "grad_norm": 0.2626718580722809, + "learning_rate": 3.2677446589090455e-05, + "loss": 0.1383, + "step": 25822 + }, + { + "epoch": 0.46058217101273496, + "grad_norm": 0.20936977863311768, + "learning_rate": 3.267596528027648e-05, + "loss": 0.1465, + "step": 25823 + }, + { + "epoch": 0.4606000071344487, + "grad_norm": 0.33624109625816345, + "learning_rate": 3.2674483941707826e-05, + "loss": 0.1087, + "step": 25824 + }, + { + "epoch": 0.4606178432561624, + "grad_norm": 0.2843417823314667, + "learning_rate": 3.267300257339021e-05, + "loss": 0.1601, + "step": 25825 + }, + { + "epoch": 0.4606356793778761, + "grad_norm": 0.33800917863845825, + "learning_rate": 3.267152117532939e-05, + "loss": 0.1321, + "step": 25826 + }, + { + "epoch": 0.46065351549958977, + "grad_norm": 0.27965471148490906, + "learning_rate": 3.26700397475311e-05, + "loss": 0.162, + "step": 25827 + }, + { + "epoch": 0.46067135162130346, + "grad_norm": 0.2899395227432251, + "learning_rate": 3.266855829000108e-05, + "loss": 0.1071, + "step": 25828 + }, + { + "epoch": 0.46068918774301715, + "grad_norm": 0.48492705821990967, + "learning_rate": 3.2667076802745096e-05, + "loss": 0.1522, + "step": 25829 + }, + { + "epoch": 0.46070702386473084, + "grad_norm": 0.26405781507492065, + "learning_rate": 3.2665595285768866e-05, + "loss": 0.0881, + "step": 25830 + }, + { + "epoch": 0.4607248599864445, + "grad_norm": 0.2571663558483124, + "learning_rate": 3.2664113739078144e-05, + "loss": 0.15, + "step": 25831 + }, + { + "epoch": 0.46074269610815827, + "grad_norm": 0.2735995650291443, + "learning_rate": 3.266263216267866e-05, + "loss": 0.1405, + "step": 25832 + }, + { + "epoch": 0.46076053222987196, + "grad_norm": 0.3667232096195221, + "learning_rate": 3.266115055657618e-05, + "loss": 0.1304, + "step": 25833 + }, + { + "epoch": 0.46077836835158564, + "grad_norm": 0.38410672545433044, + "learning_rate": 3.265966892077643e-05, + "loss": 0.2069, + "step": 25834 + }, + { + "epoch": 0.46079620447329933, + "grad_norm": 0.2521287202835083, + "learning_rate": 3.2658187255285156e-05, + "loss": 0.1055, + "step": 25835 + }, + { + "epoch": 0.460814040595013, + "grad_norm": 0.25701242685317993, + "learning_rate": 3.26567055601081e-05, + "loss": 0.1528, + "step": 25836 + }, + { + "epoch": 0.4608318767167267, + "grad_norm": 0.2520907521247864, + "learning_rate": 3.265522383525101e-05, + "loss": 0.1429, + "step": 25837 + }, + { + "epoch": 0.4608497128384404, + "grad_norm": 0.24482101202011108, + "learning_rate": 3.2653742080719635e-05, + "loss": 0.0912, + "step": 25838 + }, + { + "epoch": 0.4608675489601541, + "grad_norm": 0.3421719968318939, + "learning_rate": 3.265226029651971e-05, + "loss": 0.1392, + "step": 25839 + }, + { + "epoch": 0.46088538508186777, + "grad_norm": 0.24517175555229187, + "learning_rate": 3.265077848265699e-05, + "loss": 0.1351, + "step": 25840 + }, + { + "epoch": 0.4609032212035815, + "grad_norm": 0.27683916687965393, + "learning_rate": 3.264929663913719e-05, + "loss": 0.178, + "step": 25841 + }, + { + "epoch": 0.4609210573252952, + "grad_norm": 0.28843027353286743, + "learning_rate": 3.264781476596608e-05, + "loss": 0.1654, + "step": 25842 + }, + { + "epoch": 0.4609388934470089, + "grad_norm": 0.21967118978500366, + "learning_rate": 3.26463328631494e-05, + "loss": 0.1503, + "step": 25843 + }, + { + "epoch": 0.4609567295687226, + "grad_norm": 0.33800002932548523, + "learning_rate": 3.26448509306929e-05, + "loss": 0.1768, + "step": 25844 + }, + { + "epoch": 0.46097456569043627, + "grad_norm": 0.28730538487434387, + "learning_rate": 3.264336896860231e-05, + "loss": 0.1014, + "step": 25845 + }, + { + "epoch": 0.46099240181214995, + "grad_norm": 0.2638072669506073, + "learning_rate": 3.264188697688339e-05, + "loss": 0.1497, + "step": 25846 + }, + { + "epoch": 0.46101023793386364, + "grad_norm": 0.2450537234544754, + "learning_rate": 3.264040495554187e-05, + "loss": 0.1339, + "step": 25847 + }, + { + "epoch": 0.46102807405557733, + "grad_norm": 0.2675429582595825, + "learning_rate": 3.26389229045835e-05, + "loss": 0.1293, + "step": 25848 + }, + { + "epoch": 0.4610459101772911, + "grad_norm": 0.3125545084476471, + "learning_rate": 3.263744082401403e-05, + "loss": 0.2092, + "step": 25849 + }, + { + "epoch": 0.46106374629900476, + "grad_norm": 0.25436902046203613, + "learning_rate": 3.26359587138392e-05, + "loss": 0.1066, + "step": 25850 + }, + { + "epoch": 0.46108158242071845, + "grad_norm": 0.2024439424276352, + "learning_rate": 3.263447657406476e-05, + "loss": 0.0882, + "step": 25851 + }, + { + "epoch": 0.46109941854243214, + "grad_norm": 0.2594786286354065, + "learning_rate": 3.2632994404696446e-05, + "loss": 0.1622, + "step": 25852 + }, + { + "epoch": 0.4611172546641458, + "grad_norm": 0.4123919904232025, + "learning_rate": 3.2631512205740014e-05, + "loss": 0.1273, + "step": 25853 + }, + { + "epoch": 0.4611350907858595, + "grad_norm": 0.23608681559562683, + "learning_rate": 3.263002997720121e-05, + "loss": 0.1405, + "step": 25854 + }, + { + "epoch": 0.4611529269075732, + "grad_norm": 0.41029465198516846, + "learning_rate": 3.262854771908576e-05, + "loss": 0.1477, + "step": 25855 + }, + { + "epoch": 0.4611707630292869, + "grad_norm": 0.2541080713272095, + "learning_rate": 3.2627065431399425e-05, + "loss": 0.1281, + "step": 25856 + }, + { + "epoch": 0.46118859915100063, + "grad_norm": 0.2968452572822571, + "learning_rate": 3.2625583114147964e-05, + "loss": 0.1029, + "step": 25857 + }, + { + "epoch": 0.4612064352727143, + "grad_norm": 0.2291620671749115, + "learning_rate": 3.262410076733709e-05, + "loss": 0.1317, + "step": 25858 + }, + { + "epoch": 0.461224271394428, + "grad_norm": 0.21105308830738068, + "learning_rate": 3.262261839097259e-05, + "loss": 0.1566, + "step": 25859 + }, + { + "epoch": 0.4612421075161417, + "grad_norm": 0.4057336747646332, + "learning_rate": 3.262113598506017e-05, + "loss": 0.1841, + "step": 25860 + }, + { + "epoch": 0.4612599436378554, + "grad_norm": 0.28084295988082886, + "learning_rate": 3.261965354960561e-05, + "loss": 0.1334, + "step": 25861 + }, + { + "epoch": 0.4612777797595691, + "grad_norm": 0.29075777530670166, + "learning_rate": 3.2618171084614634e-05, + "loss": 0.0846, + "step": 25862 + }, + { + "epoch": 0.46129561588128276, + "grad_norm": 0.21737127006053925, + "learning_rate": 3.261668859009299e-05, + "loss": 0.1555, + "step": 25863 + }, + { + "epoch": 0.46131345200299645, + "grad_norm": 0.5115792155265808, + "learning_rate": 3.261520606604644e-05, + "loss": 0.1626, + "step": 25864 + }, + { + "epoch": 0.46133128812471014, + "grad_norm": 0.24249915778636932, + "learning_rate": 3.261372351248072e-05, + "loss": 0.13, + "step": 25865 + }, + { + "epoch": 0.4613491242464239, + "grad_norm": 0.30558910965919495, + "learning_rate": 3.261224092940158e-05, + "loss": 0.1214, + "step": 25866 + }, + { + "epoch": 0.46136696036813757, + "grad_norm": 0.32076868414878845, + "learning_rate": 3.261075831681475e-05, + "loss": 0.1446, + "step": 25867 + }, + { + "epoch": 0.46138479648985126, + "grad_norm": 0.38136833906173706, + "learning_rate": 3.260927567472601e-05, + "loss": 0.1579, + "step": 25868 + }, + { + "epoch": 0.46140263261156494, + "grad_norm": 0.27521535754203796, + "learning_rate": 3.260779300314108e-05, + "loss": 0.168, + "step": 25869 + }, + { + "epoch": 0.46142046873327863, + "grad_norm": 0.4319819211959839, + "learning_rate": 3.260631030206572e-05, + "loss": 0.1866, + "step": 25870 + }, + { + "epoch": 0.4614383048549923, + "grad_norm": 0.30179375410079956, + "learning_rate": 3.260482757150567e-05, + "loss": 0.1784, + "step": 25871 + }, + { + "epoch": 0.461456140976706, + "grad_norm": 0.2616255283355713, + "learning_rate": 3.2603344811466685e-05, + "loss": 0.1402, + "step": 25872 + }, + { + "epoch": 0.4614739770984197, + "grad_norm": 0.31946712732315063, + "learning_rate": 3.2601862021954504e-05, + "loss": 0.1547, + "step": 25873 + }, + { + "epoch": 0.46149181322013344, + "grad_norm": 0.21861965954303741, + "learning_rate": 3.260037920297489e-05, + "loss": 0.1873, + "step": 25874 + }, + { + "epoch": 0.46150964934184713, + "grad_norm": 0.29000288248062134, + "learning_rate": 3.259889635453357e-05, + "loss": 0.1459, + "step": 25875 + }, + { + "epoch": 0.4615274854635608, + "grad_norm": 0.3512144088745117, + "learning_rate": 3.259741347663632e-05, + "loss": 0.1646, + "step": 25876 + }, + { + "epoch": 0.4615453215852745, + "grad_norm": 0.47261953353881836, + "learning_rate": 3.2595930569288844e-05, + "loss": 0.1406, + "step": 25877 + }, + { + "epoch": 0.4615631577069882, + "grad_norm": 0.31997692584991455, + "learning_rate": 3.259444763249694e-05, + "loss": 0.1581, + "step": 25878 + }, + { + "epoch": 0.4615809938287019, + "grad_norm": 0.2950047552585602, + "learning_rate": 3.259296466626634e-05, + "loss": 0.1572, + "step": 25879 + }, + { + "epoch": 0.46159882995041557, + "grad_norm": 0.20899878442287445, + "learning_rate": 3.259148167060276e-05, + "loss": 0.1269, + "step": 25880 + }, + { + "epoch": 0.46161666607212926, + "grad_norm": 0.21753248572349548, + "learning_rate": 3.2589998645512e-05, + "loss": 0.1244, + "step": 25881 + }, + { + "epoch": 0.46163450219384294, + "grad_norm": 0.25028809905052185, + "learning_rate": 3.2588515590999765e-05, + "loss": 0.1273, + "step": 25882 + }, + { + "epoch": 0.4616523383155567, + "grad_norm": 0.32519659399986267, + "learning_rate": 3.258703250707183e-05, + "loss": 0.1811, + "step": 25883 + }, + { + "epoch": 0.4616701744372704, + "grad_norm": 0.23123672604560852, + "learning_rate": 3.258554939373394e-05, + "loss": 0.1504, + "step": 25884 + }, + { + "epoch": 0.46168801055898406, + "grad_norm": 0.30120208859443665, + "learning_rate": 3.258406625099184e-05, + "loss": 0.184, + "step": 25885 + }, + { + "epoch": 0.46170584668069775, + "grad_norm": 0.19667299091815948, + "learning_rate": 3.258258307885127e-05, + "loss": 0.1121, + "step": 25886 + }, + { + "epoch": 0.46172368280241144, + "grad_norm": 0.2335060089826584, + "learning_rate": 3.2581099877318e-05, + "loss": 0.1539, + "step": 25887 + }, + { + "epoch": 0.4617415189241251, + "grad_norm": 0.42860716581344604, + "learning_rate": 3.2579616646397763e-05, + "loss": 0.1867, + "step": 25888 + }, + { + "epoch": 0.4617593550458388, + "grad_norm": 0.2365170121192932, + "learning_rate": 3.257813338609632e-05, + "loss": 0.1429, + "step": 25889 + }, + { + "epoch": 0.4617771911675525, + "grad_norm": 0.2708742618560791, + "learning_rate": 3.2576650096419406e-05, + "loss": 0.166, + "step": 25890 + }, + { + "epoch": 0.46179502728926625, + "grad_norm": 0.24951422214508057, + "learning_rate": 3.257516677737278e-05, + "loss": 0.1058, + "step": 25891 + }, + { + "epoch": 0.46181286341097993, + "grad_norm": 0.2135237604379654, + "learning_rate": 3.25736834289622e-05, + "loss": 0.1362, + "step": 25892 + }, + { + "epoch": 0.4618306995326936, + "grad_norm": 0.3428475856781006, + "learning_rate": 3.2572200051193404e-05, + "loss": 0.2107, + "step": 25893 + }, + { + "epoch": 0.4618485356544073, + "grad_norm": 0.36544451117515564, + "learning_rate": 3.2570716644072144e-05, + "loss": 0.1662, + "step": 25894 + }, + { + "epoch": 0.461866371776121, + "grad_norm": 0.24160338938236237, + "learning_rate": 3.2569233207604174e-05, + "loss": 0.1288, + "step": 25895 + }, + { + "epoch": 0.4618842078978347, + "grad_norm": 0.2374226450920105, + "learning_rate": 3.256774974179524e-05, + "loss": 0.1049, + "step": 25896 + }, + { + "epoch": 0.4619020440195484, + "grad_norm": 0.2506173551082611, + "learning_rate": 3.256626624665109e-05, + "loss": 0.1618, + "step": 25897 + }, + { + "epoch": 0.46191988014126206, + "grad_norm": 0.33280596137046814, + "learning_rate": 3.256478272217748e-05, + "loss": 0.1762, + "step": 25898 + }, + { + "epoch": 0.4619377162629758, + "grad_norm": 0.21386969089508057, + "learning_rate": 3.256329916838016e-05, + "loss": 0.0788, + "step": 25899 + }, + { + "epoch": 0.4619555523846895, + "grad_norm": 0.2730884253978729, + "learning_rate": 3.256181558526488e-05, + "loss": 0.1176, + "step": 25900 + }, + { + "epoch": 0.4619733885064032, + "grad_norm": 0.2239217460155487, + "learning_rate": 3.2560331972837396e-05, + "loss": 0.1708, + "step": 25901 + }, + { + "epoch": 0.46199122462811687, + "grad_norm": 0.25614023208618164, + "learning_rate": 3.2558848331103446e-05, + "loss": 0.1937, + "step": 25902 + }, + { + "epoch": 0.46200906074983056, + "grad_norm": 0.2311106026172638, + "learning_rate": 3.25573646600688e-05, + "loss": 0.105, + "step": 25903 + }, + { + "epoch": 0.46202689687154425, + "grad_norm": 0.315411239862442, + "learning_rate": 3.255588095973919e-05, + "loss": 0.1807, + "step": 25904 + }, + { + "epoch": 0.46204473299325793, + "grad_norm": 0.3713129758834839, + "learning_rate": 3.255439723012037e-05, + "loss": 0.1344, + "step": 25905 + }, + { + "epoch": 0.4620625691149716, + "grad_norm": 0.29898330569267273, + "learning_rate": 3.25529134712181e-05, + "loss": 0.1303, + "step": 25906 + }, + { + "epoch": 0.4620804052366853, + "grad_norm": 0.19526520371437073, + "learning_rate": 3.255142968303814e-05, + "loss": 0.1066, + "step": 25907 + }, + { + "epoch": 0.46209824135839905, + "grad_norm": 0.2991268038749695, + "learning_rate": 3.254994586558622e-05, + "loss": 0.1358, + "step": 25908 + }, + { + "epoch": 0.46211607748011274, + "grad_norm": 0.3419950604438782, + "learning_rate": 3.254846201886812e-05, + "loss": 0.1117, + "step": 25909 + }, + { + "epoch": 0.46213391360182643, + "grad_norm": 0.23802126944065094, + "learning_rate": 3.254697814288955e-05, + "loss": 0.1561, + "step": 25910 + }, + { + "epoch": 0.4621517497235401, + "grad_norm": 0.2340325266122818, + "learning_rate": 3.2545494237656295e-05, + "loss": 0.1388, + "step": 25911 + }, + { + "epoch": 0.4621695858452538, + "grad_norm": 0.26105475425720215, + "learning_rate": 3.25440103031741e-05, + "loss": 0.1928, + "step": 25912 + }, + { + "epoch": 0.4621874219669675, + "grad_norm": 0.261292427778244, + "learning_rate": 3.254252633944872e-05, + "loss": 0.1759, + "step": 25913 + }, + { + "epoch": 0.4622052580886812, + "grad_norm": 0.2863219082355499, + "learning_rate": 3.254104234648589e-05, + "loss": 0.1639, + "step": 25914 + }, + { + "epoch": 0.46222309421039487, + "grad_norm": 0.22527258098125458, + "learning_rate": 3.253955832429138e-05, + "loss": 0.1238, + "step": 25915 + }, + { + "epoch": 0.4622409303321086, + "grad_norm": 0.28590673208236694, + "learning_rate": 3.253807427287095e-05, + "loss": 0.14, + "step": 25916 + }, + { + "epoch": 0.4622587664538223, + "grad_norm": 0.2266944795846939, + "learning_rate": 3.253659019223033e-05, + "loss": 0.11, + "step": 25917 + }, + { + "epoch": 0.462276602575536, + "grad_norm": 0.30064821243286133, + "learning_rate": 3.253510608237528e-05, + "loss": 0.1731, + "step": 25918 + }, + { + "epoch": 0.4622944386972497, + "grad_norm": 0.253482460975647, + "learning_rate": 3.253362194331156e-05, + "loss": 0.1901, + "step": 25919 + }, + { + "epoch": 0.46231227481896336, + "grad_norm": 0.2247164398431778, + "learning_rate": 3.2532137775044926e-05, + "loss": 0.1295, + "step": 25920 + }, + { + "epoch": 0.46233011094067705, + "grad_norm": 0.23139981925487518, + "learning_rate": 3.253065357758111e-05, + "loss": 0.11, + "step": 25921 + }, + { + "epoch": 0.46234794706239074, + "grad_norm": 0.28722211718559265, + "learning_rate": 3.2529169350925894e-05, + "loss": 0.1593, + "step": 25922 + }, + { + "epoch": 0.46236578318410443, + "grad_norm": 0.26677945256233215, + "learning_rate": 3.252768509508501e-05, + "loss": 0.1362, + "step": 25923 + }, + { + "epoch": 0.4623836193058181, + "grad_norm": 0.19600580632686615, + "learning_rate": 3.252620081006422e-05, + "loss": 0.0935, + "step": 25924 + }, + { + "epoch": 0.46240145542753186, + "grad_norm": 0.22214733064174652, + "learning_rate": 3.252471649586927e-05, + "loss": 0.1169, + "step": 25925 + }, + { + "epoch": 0.46241929154924555, + "grad_norm": 0.27019229531288147, + "learning_rate": 3.2523232152505925e-05, + "loss": 0.17, + "step": 25926 + }, + { + "epoch": 0.46243712767095924, + "grad_norm": 0.2838871479034424, + "learning_rate": 3.252174777997994e-05, + "loss": 0.154, + "step": 25927 + }, + { + "epoch": 0.4624549637926729, + "grad_norm": 0.2956608831882477, + "learning_rate": 3.252026337829706e-05, + "loss": 0.1467, + "step": 25928 + }, + { + "epoch": 0.4624727999143866, + "grad_norm": 0.25164660811424255, + "learning_rate": 3.251877894746303e-05, + "loss": 0.1297, + "step": 25929 + }, + { + "epoch": 0.4624906360361003, + "grad_norm": 0.2655068039894104, + "learning_rate": 3.251729448748363e-05, + "loss": 0.1561, + "step": 25930 + }, + { + "epoch": 0.462508472157814, + "grad_norm": 0.30581045150756836, + "learning_rate": 3.2515809998364595e-05, + "loss": 0.1809, + "step": 25931 + }, + { + "epoch": 0.4625263082795277, + "grad_norm": 0.26822715997695923, + "learning_rate": 3.251432548011168e-05, + "loss": 0.1672, + "step": 25932 + }, + { + "epoch": 0.4625441444012414, + "grad_norm": 0.2988027334213257, + "learning_rate": 3.251284093273065e-05, + "loss": 0.1328, + "step": 25933 + }, + { + "epoch": 0.4625619805229551, + "grad_norm": 0.24633675813674927, + "learning_rate": 3.251135635622725e-05, + "loss": 0.1612, + "step": 25934 + }, + { + "epoch": 0.4625798166446688, + "grad_norm": 0.29048851132392883, + "learning_rate": 3.2509871750607244e-05, + "loss": 0.1887, + "step": 25935 + }, + { + "epoch": 0.4625976527663825, + "grad_norm": 0.30334049463272095, + "learning_rate": 3.2508387115876384e-05, + "loss": 0.1707, + "step": 25936 + }, + { + "epoch": 0.46261548888809617, + "grad_norm": 0.2526455819606781, + "learning_rate": 3.2506902452040414e-05, + "loss": 0.1164, + "step": 25937 + }, + { + "epoch": 0.46263332500980986, + "grad_norm": 0.29355186223983765, + "learning_rate": 3.25054177591051e-05, + "loss": 0.1679, + "step": 25938 + }, + { + "epoch": 0.46265116113152355, + "grad_norm": 0.17166070640087128, + "learning_rate": 3.2503933037076186e-05, + "loss": 0.1152, + "step": 25939 + }, + { + "epoch": 0.46266899725323724, + "grad_norm": 0.347345769405365, + "learning_rate": 3.250244828595945e-05, + "loss": 0.1546, + "step": 25940 + }, + { + "epoch": 0.4626868333749509, + "grad_norm": 0.41629549860954285, + "learning_rate": 3.2500963505760627e-05, + "loss": 0.1663, + "step": 25941 + }, + { + "epoch": 0.46270466949666467, + "grad_norm": 0.31726574897766113, + "learning_rate": 3.2499478696485474e-05, + "loss": 0.1519, + "step": 25942 + }, + { + "epoch": 0.46272250561837835, + "grad_norm": 0.24121001362800598, + "learning_rate": 3.2497993858139765e-05, + "loss": 0.1636, + "step": 25943 + }, + { + "epoch": 0.46274034174009204, + "grad_norm": 0.38795900344848633, + "learning_rate": 3.2496508990729225e-05, + "loss": 0.1713, + "step": 25944 + }, + { + "epoch": 0.46275817786180573, + "grad_norm": 0.2509864866733551, + "learning_rate": 3.249502409425964e-05, + "loss": 0.1287, + "step": 25945 + }, + { + "epoch": 0.4627760139835194, + "grad_norm": 0.24911074340343475, + "learning_rate": 3.2493539168736745e-05, + "loss": 0.1256, + "step": 25946 + }, + { + "epoch": 0.4627938501052331, + "grad_norm": 0.24461624026298523, + "learning_rate": 3.2492054214166305e-05, + "loss": 0.1323, + "step": 25947 + }, + { + "epoch": 0.4628116862269468, + "grad_norm": 0.32783693075180054, + "learning_rate": 3.249056923055408e-05, + "loss": 0.1052, + "step": 25948 + }, + { + "epoch": 0.4628295223486605, + "grad_norm": 0.3568902313709259, + "learning_rate": 3.2489084217905816e-05, + "loss": 0.2109, + "step": 25949 + }, + { + "epoch": 0.4628473584703742, + "grad_norm": 0.22166769206523895, + "learning_rate": 3.2487599176227286e-05, + "loss": 0.1603, + "step": 25950 + }, + { + "epoch": 0.4628651945920879, + "grad_norm": 0.24175916612148285, + "learning_rate": 3.2486114105524224e-05, + "loss": 0.0798, + "step": 25951 + }, + { + "epoch": 0.4628830307138016, + "grad_norm": 0.24645406007766724, + "learning_rate": 3.24846290058024e-05, + "loss": 0.1331, + "step": 25952 + }, + { + "epoch": 0.4629008668355153, + "grad_norm": 0.27952513098716736, + "learning_rate": 3.248314387706757e-05, + "loss": 0.0735, + "step": 25953 + }, + { + "epoch": 0.462918702957229, + "grad_norm": 0.2802181839942932, + "learning_rate": 3.2481658719325495e-05, + "loss": 0.1233, + "step": 25954 + }, + { + "epoch": 0.46293653907894267, + "grad_norm": 0.20891527831554413, + "learning_rate": 3.2480173532581914e-05, + "loss": 0.1489, + "step": 25955 + }, + { + "epoch": 0.46295437520065635, + "grad_norm": 0.45281511545181274, + "learning_rate": 3.2478688316842606e-05, + "loss": 0.1938, + "step": 25956 + }, + { + "epoch": 0.46297221132237004, + "grad_norm": 0.29913291335105896, + "learning_rate": 3.247720307211332e-05, + "loss": 0.1341, + "step": 25957 + }, + { + "epoch": 0.4629900474440838, + "grad_norm": 0.3208201229572296, + "learning_rate": 3.247571779839981e-05, + "loss": 0.1677, + "step": 25958 + }, + { + "epoch": 0.4630078835657975, + "grad_norm": 0.22196722030639648, + "learning_rate": 3.2474232495707834e-05, + "loss": 0.1012, + "step": 25959 + }, + { + "epoch": 0.46302571968751116, + "grad_norm": 0.38233399391174316, + "learning_rate": 3.247274716404315e-05, + "loss": 0.2327, + "step": 25960 + }, + { + "epoch": 0.46304355580922485, + "grad_norm": 0.2365937978029251, + "learning_rate": 3.2471261803411525e-05, + "loss": 0.112, + "step": 25961 + }, + { + "epoch": 0.46306139193093854, + "grad_norm": 0.2246352732181549, + "learning_rate": 3.24697764138187e-05, + "loss": 0.1381, + "step": 25962 + }, + { + "epoch": 0.4630792280526522, + "grad_norm": 0.25872036814689636, + "learning_rate": 3.246829099527044e-05, + "loss": 0.132, + "step": 25963 + }, + { + "epoch": 0.4630970641743659, + "grad_norm": 0.35274624824523926, + "learning_rate": 3.246680554777251e-05, + "loss": 0.1189, + "step": 25964 + }, + { + "epoch": 0.4631149002960796, + "grad_norm": 0.24399487674236298, + "learning_rate": 3.246532007133067e-05, + "loss": 0.1264, + "step": 25965 + }, + { + "epoch": 0.4631327364177933, + "grad_norm": 0.20977070927619934, + "learning_rate": 3.246383456595066e-05, + "loss": 0.1388, + "step": 25966 + }, + { + "epoch": 0.46315057253950703, + "grad_norm": 0.23945015668869019, + "learning_rate": 3.246234903163825e-05, + "loss": 0.0913, + "step": 25967 + }, + { + "epoch": 0.4631684086612207, + "grad_norm": 0.2062525451183319, + "learning_rate": 3.24608634683992e-05, + "loss": 0.1126, + "step": 25968 + }, + { + "epoch": 0.4631862447829344, + "grad_norm": 0.2679837942123413, + "learning_rate": 3.2459377876239274e-05, + "loss": 0.1377, + "step": 25969 + }, + { + "epoch": 0.4632040809046481, + "grad_norm": 0.31055957078933716, + "learning_rate": 3.2457892255164214e-05, + "loss": 0.1533, + "step": 25970 + }, + { + "epoch": 0.4632219170263618, + "grad_norm": 0.31911930441856384, + "learning_rate": 3.245640660517979e-05, + "loss": 0.136, + "step": 25971 + }, + { + "epoch": 0.4632397531480755, + "grad_norm": 0.21749882400035858, + "learning_rate": 3.245492092629176e-05, + "loss": 0.1551, + "step": 25972 + }, + { + "epoch": 0.46325758926978916, + "grad_norm": 0.26170194149017334, + "learning_rate": 3.2453435218505877e-05, + "loss": 0.1527, + "step": 25973 + }, + { + "epoch": 0.46327542539150285, + "grad_norm": 0.39017996191978455, + "learning_rate": 3.245194948182791e-05, + "loss": 0.2313, + "step": 25974 + }, + { + "epoch": 0.4632932615132166, + "grad_norm": 0.2629111409187317, + "learning_rate": 3.2450463716263606e-05, + "loss": 0.1434, + "step": 25975 + }, + { + "epoch": 0.4633110976349303, + "grad_norm": 0.24127335846424103, + "learning_rate": 3.244897792181874e-05, + "loss": 0.1999, + "step": 25976 + }, + { + "epoch": 0.46332893375664397, + "grad_norm": 0.24200038611888885, + "learning_rate": 3.244749209849906e-05, + "loss": 0.1577, + "step": 25977 + }, + { + "epoch": 0.46334676987835766, + "grad_norm": 0.2859693765640259, + "learning_rate": 3.244600624631032e-05, + "loss": 0.1158, + "step": 25978 + }, + { + "epoch": 0.46336460600007134, + "grad_norm": 0.23622198402881622, + "learning_rate": 3.24445203652583e-05, + "loss": 0.1121, + "step": 25979 + }, + { + "epoch": 0.46338244212178503, + "grad_norm": 0.24148476123809814, + "learning_rate": 3.2443034455348745e-05, + "loss": 0.1285, + "step": 25980 + }, + { + "epoch": 0.4634002782434987, + "grad_norm": 0.22917766869068146, + "learning_rate": 3.244154851658742e-05, + "loss": 0.1711, + "step": 25981 + }, + { + "epoch": 0.4634181143652124, + "grad_norm": 0.22683215141296387, + "learning_rate": 3.244006254898007e-05, + "loss": 0.1141, + "step": 25982 + }, + { + "epoch": 0.4634359504869261, + "grad_norm": 0.2625378668308258, + "learning_rate": 3.243857655253247e-05, + "loss": 0.1204, + "step": 25983 + }, + { + "epoch": 0.46345378660863984, + "grad_norm": 0.3025020360946655, + "learning_rate": 3.243709052725039e-05, + "loss": 0.1906, + "step": 25984 + }, + { + "epoch": 0.4634716227303535, + "grad_norm": 0.34796440601348877, + "learning_rate": 3.243560447313958e-05, + "loss": 0.1577, + "step": 25985 + }, + { + "epoch": 0.4634894588520672, + "grad_norm": 0.28842583298683167, + "learning_rate": 3.243411839020579e-05, + "loss": 0.1493, + "step": 25986 + }, + { + "epoch": 0.4635072949737809, + "grad_norm": 0.3031710684299469, + "learning_rate": 3.243263227845479e-05, + "loss": 0.1257, + "step": 25987 + }, + { + "epoch": 0.4635251310954946, + "grad_norm": 0.2704150080680847, + "learning_rate": 3.243114613789233e-05, + "loss": 0.1353, + "step": 25988 + }, + { + "epoch": 0.4635429672172083, + "grad_norm": 0.29831618070602417, + "learning_rate": 3.24296599685242e-05, + "loss": 0.1387, + "step": 25989 + }, + { + "epoch": 0.46356080333892197, + "grad_norm": 0.3365585505962372, + "learning_rate": 3.242817377035613e-05, + "loss": 0.1688, + "step": 25990 + }, + { + "epoch": 0.46357863946063566, + "grad_norm": 0.355507493019104, + "learning_rate": 3.24266875433939e-05, + "loss": 0.1823, + "step": 25991 + }, + { + "epoch": 0.4635964755823494, + "grad_norm": 0.2670857608318329, + "learning_rate": 3.242520128764326e-05, + "loss": 0.1198, + "step": 25992 + }, + { + "epoch": 0.4636143117040631, + "grad_norm": 0.46618160605430603, + "learning_rate": 3.242371500310998e-05, + "loss": 0.19, + "step": 25993 + }, + { + "epoch": 0.4636321478257768, + "grad_norm": 0.24077323079109192, + "learning_rate": 3.242222868979981e-05, + "loss": 0.1791, + "step": 25994 + }, + { + "epoch": 0.46364998394749046, + "grad_norm": 0.24090790748596191, + "learning_rate": 3.242074234771852e-05, + "loss": 0.1386, + "step": 25995 + }, + { + "epoch": 0.46366782006920415, + "grad_norm": 0.23734600841999054, + "learning_rate": 3.241925597687186e-05, + "loss": 0.1181, + "step": 25996 + }, + { + "epoch": 0.46368565619091784, + "grad_norm": 0.27186650037765503, + "learning_rate": 3.241776957726562e-05, + "loss": 0.1372, + "step": 25997 + }, + { + "epoch": 0.4637034923126315, + "grad_norm": 0.3167542815208435, + "learning_rate": 3.241628314890554e-05, + "loss": 0.2001, + "step": 25998 + }, + { + "epoch": 0.4637213284343452, + "grad_norm": 0.2665323317050934, + "learning_rate": 3.241479669179738e-05, + "loss": 0.1564, + "step": 25999 + }, + { + "epoch": 0.46373916455605896, + "grad_norm": 0.2914828956127167, + "learning_rate": 3.2413310205946904e-05, + "loss": 0.176, + "step": 26000 + }, + { + "epoch": 0.46373916455605896, + "eval_loss": 0.14186939597129822, + "eval_runtime": 108.4364, + "eval_samples_per_second": 9.443, + "eval_steps_per_second": 1.577, + "step": 26000 + }, + { + "epoch": 0.46375700067777265, + "grad_norm": 0.27630114555358887, + "learning_rate": 3.241182369135988e-05, + "loss": 0.1203, + "step": 26001 + }, + { + "epoch": 0.46377483679948633, + "grad_norm": 0.2262500822544098, + "learning_rate": 3.241033714804207e-05, + "loss": 0.141, + "step": 26002 + }, + { + "epoch": 0.4637926729212, + "grad_norm": 0.20729343593120575, + "learning_rate": 3.240885057599923e-05, + "loss": 0.1336, + "step": 26003 + }, + { + "epoch": 0.4638105090429137, + "grad_norm": 0.23321793973445892, + "learning_rate": 3.2407363975237126e-05, + "loss": 0.1561, + "step": 26004 + }, + { + "epoch": 0.4638283451646274, + "grad_norm": 0.29829907417297363, + "learning_rate": 3.2405877345761524e-05, + "loss": 0.1255, + "step": 26005 + }, + { + "epoch": 0.4638461812863411, + "grad_norm": 0.30365437269210815, + "learning_rate": 3.240439068757818e-05, + "loss": 0.1676, + "step": 26006 + }, + { + "epoch": 0.4638640174080548, + "grad_norm": 0.32210665941238403, + "learning_rate": 3.2402904000692865e-05, + "loss": 0.1745, + "step": 26007 + }, + { + "epoch": 0.46388185352976846, + "grad_norm": 0.27390822768211365, + "learning_rate": 3.2401417285111335e-05, + "loss": 0.1943, + "step": 26008 + }, + { + "epoch": 0.4638996896514822, + "grad_norm": 0.3175702691078186, + "learning_rate": 3.239993054083935e-05, + "loss": 0.1561, + "step": 26009 + }, + { + "epoch": 0.4639175257731959, + "grad_norm": 0.2405591607093811, + "learning_rate": 3.239844376788268e-05, + "loss": 0.173, + "step": 26010 + }, + { + "epoch": 0.4639353618949096, + "grad_norm": 0.2824705243110657, + "learning_rate": 3.2396956966247096e-05, + "loss": 0.157, + "step": 26011 + }, + { + "epoch": 0.46395319801662327, + "grad_norm": 0.2979973554611206, + "learning_rate": 3.239547013593834e-05, + "loss": 0.1333, + "step": 26012 + }, + { + "epoch": 0.46397103413833696, + "grad_norm": 0.25221627950668335, + "learning_rate": 3.23939832769622e-05, + "loss": 0.1346, + "step": 26013 + }, + { + "epoch": 0.46398887026005065, + "grad_norm": 0.22971589863300323, + "learning_rate": 3.239249638932441e-05, + "loss": 0.1261, + "step": 26014 + }, + { + "epoch": 0.46400670638176433, + "grad_norm": 0.29105710983276367, + "learning_rate": 3.239100947303077e-05, + "loss": 0.1449, + "step": 26015 + }, + { + "epoch": 0.464024542503478, + "grad_norm": 0.30627772212028503, + "learning_rate": 3.2389522528087006e-05, + "loss": 0.1995, + "step": 26016 + }, + { + "epoch": 0.46404237862519176, + "grad_norm": 0.32698777318000793, + "learning_rate": 3.2388035554498916e-05, + "loss": 0.1283, + "step": 26017 + }, + { + "epoch": 0.46406021474690545, + "grad_norm": 0.2415143847465515, + "learning_rate": 3.2386548552272234e-05, + "loss": 0.1196, + "step": 26018 + }, + { + "epoch": 0.46407805086861914, + "grad_norm": 0.24074943363666534, + "learning_rate": 3.238506152141275e-05, + "loss": 0.1734, + "step": 26019 + }, + { + "epoch": 0.46409588699033283, + "grad_norm": 0.2436971515417099, + "learning_rate": 3.2383574461926214e-05, + "loss": 0.1146, + "step": 26020 + }, + { + "epoch": 0.4641137231120465, + "grad_norm": 0.18990497291088104, + "learning_rate": 3.2382087373818395e-05, + "loss": 0.1326, + "step": 26021 + }, + { + "epoch": 0.4641315592337602, + "grad_norm": 0.27914443612098694, + "learning_rate": 3.238060025709505e-05, + "loss": 0.1401, + "step": 26022 + }, + { + "epoch": 0.4641493953554739, + "grad_norm": 0.35573136806488037, + "learning_rate": 3.237911311176195e-05, + "loss": 0.1776, + "step": 26023 + }, + { + "epoch": 0.4641672314771876, + "grad_norm": 0.31344133615493774, + "learning_rate": 3.2377625937824865e-05, + "loss": 0.1335, + "step": 26024 + }, + { + "epoch": 0.46418506759890127, + "grad_norm": 0.2672535479068756, + "learning_rate": 3.237613873528955e-05, + "loss": 0.1353, + "step": 26025 + }, + { + "epoch": 0.464202903720615, + "grad_norm": 0.30407804250717163, + "learning_rate": 3.2374651504161775e-05, + "loss": 0.1492, + "step": 26026 + }, + { + "epoch": 0.4642207398423287, + "grad_norm": 0.32202044129371643, + "learning_rate": 3.2373164244447303e-05, + "loss": 0.1905, + "step": 26027 + }, + { + "epoch": 0.4642385759640424, + "grad_norm": 0.2623530924320221, + "learning_rate": 3.23716769561519e-05, + "loss": 0.1268, + "step": 26028 + }, + { + "epoch": 0.4642564120857561, + "grad_norm": 0.2711970806121826, + "learning_rate": 3.2370189639281326e-05, + "loss": 0.1789, + "step": 26029 + }, + { + "epoch": 0.46427424820746976, + "grad_norm": 0.23704560101032257, + "learning_rate": 3.236870229384136e-05, + "loss": 0.1129, + "step": 26030 + }, + { + "epoch": 0.46429208432918345, + "grad_norm": 0.2677852213382721, + "learning_rate": 3.236721491983775e-05, + "loss": 0.0899, + "step": 26031 + }, + { + "epoch": 0.46430992045089714, + "grad_norm": 0.2846788465976715, + "learning_rate": 3.236572751727628e-05, + "loss": 0.1505, + "step": 26032 + }, + { + "epoch": 0.46432775657261083, + "grad_norm": 0.36903154850006104, + "learning_rate": 3.236424008616269e-05, + "loss": 0.1548, + "step": 26033 + }, + { + "epoch": 0.46434559269432457, + "grad_norm": 0.30848273634910583, + "learning_rate": 3.236275262650278e-05, + "loss": 0.1804, + "step": 26034 + }, + { + "epoch": 0.46436342881603826, + "grad_norm": 0.2429845929145813, + "learning_rate": 3.236126513830229e-05, + "loss": 0.1142, + "step": 26035 + }, + { + "epoch": 0.46438126493775195, + "grad_norm": 0.26715463399887085, + "learning_rate": 3.235977762156699e-05, + "loss": 0.1419, + "step": 26036 + }, + { + "epoch": 0.46439910105946564, + "grad_norm": 0.36638882756233215, + "learning_rate": 3.2358290076302657e-05, + "loss": 0.1271, + "step": 26037 + }, + { + "epoch": 0.4644169371811793, + "grad_norm": 0.26610100269317627, + "learning_rate": 3.235680250251505e-05, + "loss": 0.1367, + "step": 26038 + }, + { + "epoch": 0.464434773302893, + "grad_norm": 0.23521414399147034, + "learning_rate": 3.235531490020993e-05, + "loss": 0.1083, + "step": 26039 + }, + { + "epoch": 0.4644526094246067, + "grad_norm": 0.3382713794708252, + "learning_rate": 3.235382726939307e-05, + "loss": 0.1976, + "step": 26040 + }, + { + "epoch": 0.4644704455463204, + "grad_norm": 0.2942153215408325, + "learning_rate": 3.235233961007024e-05, + "loss": 0.1865, + "step": 26041 + }, + { + "epoch": 0.4644882816680341, + "grad_norm": 0.2761353850364685, + "learning_rate": 3.23508519222472e-05, + "loss": 0.1734, + "step": 26042 + }, + { + "epoch": 0.4645061177897478, + "grad_norm": 0.26857873797416687, + "learning_rate": 3.2349364205929716e-05, + "loss": 0.1491, + "step": 26043 + }, + { + "epoch": 0.4645239539114615, + "grad_norm": 0.21701617538928986, + "learning_rate": 3.234787646112356e-05, + "loss": 0.1427, + "step": 26044 + }, + { + "epoch": 0.4645417900331752, + "grad_norm": 0.21352718770503998, + "learning_rate": 3.23463886878345e-05, + "loss": 0.1436, + "step": 26045 + }, + { + "epoch": 0.4645596261548889, + "grad_norm": 0.19101294875144958, + "learning_rate": 3.2344900886068294e-05, + "loss": 0.1285, + "step": 26046 + }, + { + "epoch": 0.46457746227660257, + "grad_norm": 0.1992942988872528, + "learning_rate": 3.234341305583072e-05, + "loss": 0.1476, + "step": 26047 + }, + { + "epoch": 0.46459529839831626, + "grad_norm": 0.3391256630420685, + "learning_rate": 3.234192519712754e-05, + "loss": 0.1251, + "step": 26048 + }, + { + "epoch": 0.46461313452002995, + "grad_norm": 0.39006611704826355, + "learning_rate": 3.2340437309964525e-05, + "loss": 0.1321, + "step": 26049 + }, + { + "epoch": 0.46463097064174363, + "grad_norm": 0.24516627192497253, + "learning_rate": 3.233894939434744e-05, + "loss": 0.1479, + "step": 26050 + }, + { + "epoch": 0.4646488067634574, + "grad_norm": 0.3214873969554901, + "learning_rate": 3.2337461450282044e-05, + "loss": 0.1549, + "step": 26051 + }, + { + "epoch": 0.46466664288517107, + "grad_norm": 0.2771398723125458, + "learning_rate": 3.233597347777412e-05, + "loss": 0.0903, + "step": 26052 + }, + { + "epoch": 0.46468447900688475, + "grad_norm": 0.2523135244846344, + "learning_rate": 3.233448547682943e-05, + "loss": 0.1039, + "step": 26053 + }, + { + "epoch": 0.46470231512859844, + "grad_norm": 0.3429401218891144, + "learning_rate": 3.233299744745374e-05, + "loss": 0.1661, + "step": 26054 + }, + { + "epoch": 0.46472015125031213, + "grad_norm": 0.28885892033576965, + "learning_rate": 3.233150938965281e-05, + "loss": 0.1459, + "step": 26055 + }, + { + "epoch": 0.4647379873720258, + "grad_norm": 0.4163936376571655, + "learning_rate": 3.233002130343243e-05, + "loss": 0.19, + "step": 26056 + }, + { + "epoch": 0.4647558234937395, + "grad_norm": 0.2808150351047516, + "learning_rate": 3.232853318879835e-05, + "loss": 0.1527, + "step": 26057 + }, + { + "epoch": 0.4647736596154532, + "grad_norm": 0.29562413692474365, + "learning_rate": 3.232704504575634e-05, + "loss": 0.1255, + "step": 26058 + }, + { + "epoch": 0.46479149573716694, + "grad_norm": 0.4406629502773285, + "learning_rate": 3.232555687431218e-05, + "loss": 0.1042, + "step": 26059 + }, + { + "epoch": 0.4648093318588806, + "grad_norm": 0.24277228116989136, + "learning_rate": 3.232406867447163e-05, + "loss": 0.1272, + "step": 26060 + }, + { + "epoch": 0.4648271679805943, + "grad_norm": 0.3693324327468872, + "learning_rate": 3.2322580446240456e-05, + "loss": 0.2228, + "step": 26061 + }, + { + "epoch": 0.464845004102308, + "grad_norm": 0.2505847215652466, + "learning_rate": 3.2321092189624435e-05, + "loss": 0.1497, + "step": 26062 + }, + { + "epoch": 0.4648628402240217, + "grad_norm": 0.2134181708097458, + "learning_rate": 3.2319603904629334e-05, + "loss": 0.1548, + "step": 26063 + }, + { + "epoch": 0.4648806763457354, + "grad_norm": 0.34179818630218506, + "learning_rate": 3.231811559126091e-05, + "loss": 0.194, + "step": 26064 + }, + { + "epoch": 0.46489851246744907, + "grad_norm": 0.5091789960861206, + "learning_rate": 3.231662724952496e-05, + "loss": 0.1507, + "step": 26065 + }, + { + "epoch": 0.46491634858916275, + "grad_norm": 0.2267841249704361, + "learning_rate": 3.231513887942722e-05, + "loss": 0.1596, + "step": 26066 + }, + { + "epoch": 0.46493418471087644, + "grad_norm": 0.27425557374954224, + "learning_rate": 3.231365048097348e-05, + "loss": 0.1796, + "step": 26067 + }, + { + "epoch": 0.4649520208325902, + "grad_norm": 0.21191570162773132, + "learning_rate": 3.231216205416951e-05, + "loss": 0.1149, + "step": 26068 + }, + { + "epoch": 0.4649698569543039, + "grad_norm": 0.3472452759742737, + "learning_rate": 3.231067359902107e-05, + "loss": 0.1526, + "step": 26069 + }, + { + "epoch": 0.46498769307601756, + "grad_norm": 0.25764933228492737, + "learning_rate": 3.230918511553393e-05, + "loss": 0.1294, + "step": 26070 + }, + { + "epoch": 0.46500552919773125, + "grad_norm": 0.327184796333313, + "learning_rate": 3.230769660371387e-05, + "loss": 0.1379, + "step": 26071 + }, + { + "epoch": 0.46502336531944494, + "grad_norm": 0.25578218698501587, + "learning_rate": 3.2306208063566646e-05, + "loss": 0.1556, + "step": 26072 + }, + { + "epoch": 0.4650412014411586, + "grad_norm": 0.26179003715515137, + "learning_rate": 3.230471949509804e-05, + "loss": 0.1465, + "step": 26073 + }, + { + "epoch": 0.4650590375628723, + "grad_norm": 0.3551662862300873, + "learning_rate": 3.230323089831382e-05, + "loss": 0.1229, + "step": 26074 + }, + { + "epoch": 0.465076873684586, + "grad_norm": 0.2909257709980011, + "learning_rate": 3.230174227321976e-05, + "loss": 0.1796, + "step": 26075 + }, + { + "epoch": 0.46509470980629974, + "grad_norm": 0.26737260818481445, + "learning_rate": 3.230025361982162e-05, + "loss": 0.1555, + "step": 26076 + }, + { + "epoch": 0.46511254592801343, + "grad_norm": 0.27418631315231323, + "learning_rate": 3.229876493812517e-05, + "loss": 0.1571, + "step": 26077 + }, + { + "epoch": 0.4651303820497271, + "grad_norm": 0.377976655960083, + "learning_rate": 3.229727622813619e-05, + "loss": 0.1707, + "step": 26078 + }, + { + "epoch": 0.4651482181714408, + "grad_norm": 0.28598037362098694, + "learning_rate": 3.229578748986045e-05, + "loss": 0.137, + "step": 26079 + }, + { + "epoch": 0.4651660542931545, + "grad_norm": 0.24126529693603516, + "learning_rate": 3.2294298723303715e-05, + "loss": 0.1146, + "step": 26080 + }, + { + "epoch": 0.4651838904148682, + "grad_norm": 0.45753878355026245, + "learning_rate": 3.2292809928471765e-05, + "loss": 0.1837, + "step": 26081 + }, + { + "epoch": 0.46520172653658187, + "grad_norm": 0.25630712509155273, + "learning_rate": 3.229132110537036e-05, + "loss": 0.1432, + "step": 26082 + }, + { + "epoch": 0.46521956265829556, + "grad_norm": 0.22521261870861053, + "learning_rate": 3.228983225400527e-05, + "loss": 0.1659, + "step": 26083 + }, + { + "epoch": 0.46523739878000925, + "grad_norm": 0.2586803436279297, + "learning_rate": 3.2288343374382286e-05, + "loss": 0.1376, + "step": 26084 + }, + { + "epoch": 0.465255234901723, + "grad_norm": 0.22216632962226868, + "learning_rate": 3.2286854466507155e-05, + "loss": 0.1095, + "step": 26085 + }, + { + "epoch": 0.4652730710234367, + "grad_norm": 0.2442510724067688, + "learning_rate": 3.228536553038566e-05, + "loss": 0.1695, + "step": 26086 + }, + { + "epoch": 0.46529090714515037, + "grad_norm": 0.41868990659713745, + "learning_rate": 3.2283876566023565e-05, + "loss": 0.1761, + "step": 26087 + }, + { + "epoch": 0.46530874326686406, + "grad_norm": 0.25471076369285583, + "learning_rate": 3.228238757342667e-05, + "loss": 0.1452, + "step": 26088 + }, + { + "epoch": 0.46532657938857774, + "grad_norm": 0.2155960500240326, + "learning_rate": 3.2280898552600716e-05, + "loss": 0.1125, + "step": 26089 + }, + { + "epoch": 0.46534441551029143, + "grad_norm": 0.25104236602783203, + "learning_rate": 3.227940950355147e-05, + "loss": 0.1527, + "step": 26090 + }, + { + "epoch": 0.4653622516320051, + "grad_norm": 0.3023114502429962, + "learning_rate": 3.227792042628473e-05, + "loss": 0.1748, + "step": 26091 + }, + { + "epoch": 0.4653800877537188, + "grad_norm": 0.257811963558197, + "learning_rate": 3.2276431320806254e-05, + "loss": 0.1379, + "step": 26092 + }, + { + "epoch": 0.46539792387543255, + "grad_norm": 0.24357429146766663, + "learning_rate": 3.227494218712183e-05, + "loss": 0.1585, + "step": 26093 + }, + { + "epoch": 0.46541575999714624, + "grad_norm": 0.3331894278526306, + "learning_rate": 3.22734530252372e-05, + "loss": 0.1613, + "step": 26094 + }, + { + "epoch": 0.4654335961188599, + "grad_norm": 0.2197878062725067, + "learning_rate": 3.2271963835158166e-05, + "loss": 0.1371, + "step": 26095 + }, + { + "epoch": 0.4654514322405736, + "grad_norm": 0.2556401491165161, + "learning_rate": 3.227047461689048e-05, + "loss": 0.1727, + "step": 26096 + }, + { + "epoch": 0.4654692683622873, + "grad_norm": 0.2661169767379761, + "learning_rate": 3.226898537043993e-05, + "loss": 0.1545, + "step": 26097 + }, + { + "epoch": 0.465487104484001, + "grad_norm": 0.34631234407424927, + "learning_rate": 3.226749609581228e-05, + "loss": 0.1539, + "step": 26098 + }, + { + "epoch": 0.4655049406057147, + "grad_norm": 0.29225045442581177, + "learning_rate": 3.22660067930133e-05, + "loss": 0.1213, + "step": 26099 + }, + { + "epoch": 0.46552277672742837, + "grad_norm": 0.23258855938911438, + "learning_rate": 3.226451746204877e-05, + "loss": 0.0938, + "step": 26100 + }, + { + "epoch": 0.4655406128491421, + "grad_norm": 0.3291206359863281, + "learning_rate": 3.226302810292447e-05, + "loss": 0.1993, + "step": 26101 + }, + { + "epoch": 0.4655584489708558, + "grad_norm": 0.32316386699676514, + "learning_rate": 3.226153871564615e-05, + "loss": 0.163, + "step": 26102 + }, + { + "epoch": 0.4655762850925695, + "grad_norm": 0.25505632162094116, + "learning_rate": 3.226004930021961e-05, + "loss": 0.1481, + "step": 26103 + }, + { + "epoch": 0.4655941212142832, + "grad_norm": 0.25649598240852356, + "learning_rate": 3.225855985665061e-05, + "loss": 0.1526, + "step": 26104 + }, + { + "epoch": 0.46561195733599686, + "grad_norm": 0.3256259858608246, + "learning_rate": 3.225707038494492e-05, + "loss": 0.1765, + "step": 26105 + }, + { + "epoch": 0.46562979345771055, + "grad_norm": 0.36218512058258057, + "learning_rate": 3.2255580885108313e-05, + "loss": 0.1235, + "step": 26106 + }, + { + "epoch": 0.46564762957942424, + "grad_norm": 0.29450666904449463, + "learning_rate": 3.225409135714658e-05, + "loss": 0.1375, + "step": 26107 + }, + { + "epoch": 0.4656654657011379, + "grad_norm": 0.29874613881111145, + "learning_rate": 3.2252601801065485e-05, + "loss": 0.1601, + "step": 26108 + }, + { + "epoch": 0.4656833018228516, + "grad_norm": 0.2843853831291199, + "learning_rate": 3.225111221687079e-05, + "loss": 0.1535, + "step": 26109 + }, + { + "epoch": 0.46570113794456536, + "grad_norm": 0.2098836749792099, + "learning_rate": 3.224962260456829e-05, + "loss": 0.1364, + "step": 26110 + }, + { + "epoch": 0.46571897406627905, + "grad_norm": 0.2120732069015503, + "learning_rate": 3.224813296416374e-05, + "loss": 0.1429, + "step": 26111 + }, + { + "epoch": 0.46573681018799273, + "grad_norm": 0.3068566620349884, + "learning_rate": 3.224664329566293e-05, + "loss": 0.124, + "step": 26112 + }, + { + "epoch": 0.4657546463097064, + "grad_norm": 0.23693221807479858, + "learning_rate": 3.224515359907162e-05, + "loss": 0.0939, + "step": 26113 + }, + { + "epoch": 0.4657724824314201, + "grad_norm": 0.31513774394989014, + "learning_rate": 3.22436638743956e-05, + "loss": 0.1736, + "step": 26114 + }, + { + "epoch": 0.4657903185531338, + "grad_norm": 0.28280919790267944, + "learning_rate": 3.224217412164063e-05, + "loss": 0.1424, + "step": 26115 + }, + { + "epoch": 0.4658081546748475, + "grad_norm": 0.3217497169971466, + "learning_rate": 3.2240684340812496e-05, + "loss": 0.1927, + "step": 26116 + }, + { + "epoch": 0.4658259907965612, + "grad_norm": 0.3084224760532379, + "learning_rate": 3.223919453191697e-05, + "loss": 0.1598, + "step": 26117 + }, + { + "epoch": 0.4658438269182749, + "grad_norm": 0.3566245138645172, + "learning_rate": 3.2237704694959826e-05, + "loss": 0.1943, + "step": 26118 + }, + { + "epoch": 0.4658616630399886, + "grad_norm": 0.26105552911758423, + "learning_rate": 3.223621482994683e-05, + "loss": 0.1106, + "step": 26119 + }, + { + "epoch": 0.4658794991617023, + "grad_norm": 0.3417191505432129, + "learning_rate": 3.2234724936883774e-05, + "loss": 0.2076, + "step": 26120 + }, + { + "epoch": 0.465897335283416, + "grad_norm": 0.22240227460861206, + "learning_rate": 3.2233235015776426e-05, + "loss": 0.1533, + "step": 26121 + }, + { + "epoch": 0.46591517140512967, + "grad_norm": 0.2728138566017151, + "learning_rate": 3.223174506663056e-05, + "loss": 0.1457, + "step": 26122 + }, + { + "epoch": 0.46593300752684336, + "grad_norm": 0.2878311276435852, + "learning_rate": 3.223025508945195e-05, + "loss": 0.0855, + "step": 26123 + }, + { + "epoch": 0.46595084364855704, + "grad_norm": 0.25562503933906555, + "learning_rate": 3.222876508424637e-05, + "loss": 0.1163, + "step": 26124 + }, + { + "epoch": 0.46596867977027073, + "grad_norm": 0.2526286542415619, + "learning_rate": 3.2227275051019614e-05, + "loss": 0.1285, + "step": 26125 + }, + { + "epoch": 0.4659865158919844, + "grad_norm": 0.26987364888191223, + "learning_rate": 3.222578498977743e-05, + "loss": 0.1805, + "step": 26126 + }, + { + "epoch": 0.46600435201369816, + "grad_norm": 0.30599212646484375, + "learning_rate": 3.222429490052561e-05, + "loss": 0.1626, + "step": 26127 + }, + { + "epoch": 0.46602218813541185, + "grad_norm": 0.302566260099411, + "learning_rate": 3.222280478326993e-05, + "loss": 0.1576, + "step": 26128 + }, + { + "epoch": 0.46604002425712554, + "grad_norm": 0.22841358184814453, + "learning_rate": 3.2221314638016164e-05, + "loss": 0.1535, + "step": 26129 + }, + { + "epoch": 0.46605786037883923, + "grad_norm": 0.2445744276046753, + "learning_rate": 3.221982446477009e-05, + "loss": 0.1436, + "step": 26130 + }, + { + "epoch": 0.4660756965005529, + "grad_norm": 0.2645500898361206, + "learning_rate": 3.221833426353748e-05, + "loss": 0.1718, + "step": 26131 + }, + { + "epoch": 0.4660935326222666, + "grad_norm": 0.3193371295928955, + "learning_rate": 3.221684403432412e-05, + "loss": 0.1416, + "step": 26132 + }, + { + "epoch": 0.4661113687439803, + "grad_norm": 0.31366005539894104, + "learning_rate": 3.221535377713577e-05, + "loss": 0.196, + "step": 26133 + }, + { + "epoch": 0.466129204865694, + "grad_norm": 0.27750080823898315, + "learning_rate": 3.221386349197822e-05, + "loss": 0.1492, + "step": 26134 + }, + { + "epoch": 0.4661470409874077, + "grad_norm": 0.25234177708625793, + "learning_rate": 3.2212373178857244e-05, + "loss": 0.1518, + "step": 26135 + }, + { + "epoch": 0.4661648771091214, + "grad_norm": 0.4431353211402893, + "learning_rate": 3.2210882837778615e-05, + "loss": 0.131, + "step": 26136 + }, + { + "epoch": 0.4661827132308351, + "grad_norm": 0.21329589188098907, + "learning_rate": 3.2209392468748116e-05, + "loss": 0.1387, + "step": 26137 + }, + { + "epoch": 0.4662005493525488, + "grad_norm": 0.24988535046577454, + "learning_rate": 3.220790207177153e-05, + "loss": 0.1433, + "step": 26138 + }, + { + "epoch": 0.4662183854742625, + "grad_norm": 0.32674267888069153, + "learning_rate": 3.220641164685462e-05, + "loss": 0.1576, + "step": 26139 + }, + { + "epoch": 0.46623622159597616, + "grad_norm": 0.385643869638443, + "learning_rate": 3.220492119400317e-05, + "loss": 0.1767, + "step": 26140 + }, + { + "epoch": 0.46625405771768985, + "grad_norm": 0.2466435730457306, + "learning_rate": 3.2203430713222944e-05, + "loss": 0.1483, + "step": 26141 + }, + { + "epoch": 0.46627189383940354, + "grad_norm": 0.22646495699882507, + "learning_rate": 3.220194020451975e-05, + "loss": 0.1528, + "step": 26142 + }, + { + "epoch": 0.4662897299611172, + "grad_norm": 0.30912965536117554, + "learning_rate": 3.220044966789935e-05, + "loss": 0.1406, + "step": 26143 + }, + { + "epoch": 0.46630756608283097, + "grad_norm": 0.3339691162109375, + "learning_rate": 3.2198959103367506e-05, + "loss": 0.1392, + "step": 26144 + }, + { + "epoch": 0.46632540220454466, + "grad_norm": 0.25986963510513306, + "learning_rate": 3.219746851093002e-05, + "loss": 0.127, + "step": 26145 + }, + { + "epoch": 0.46634323832625835, + "grad_norm": 0.2913098931312561, + "learning_rate": 3.219597789059265e-05, + "loss": 0.1771, + "step": 26146 + }, + { + "epoch": 0.46636107444797203, + "grad_norm": 0.2323073297739029, + "learning_rate": 3.2194487242361194e-05, + "loss": 0.1556, + "step": 26147 + }, + { + "epoch": 0.4663789105696857, + "grad_norm": 0.21654458343982697, + "learning_rate": 3.2192996566241414e-05, + "loss": 0.1557, + "step": 26148 + }, + { + "epoch": 0.4663967466913994, + "grad_norm": 0.2698739171028137, + "learning_rate": 3.2191505862239105e-05, + "loss": 0.1202, + "step": 26149 + }, + { + "epoch": 0.4664145828131131, + "grad_norm": 0.2647920846939087, + "learning_rate": 3.219001513036002e-05, + "loss": 0.1476, + "step": 26150 + }, + { + "epoch": 0.4664324189348268, + "grad_norm": 0.23627696931362152, + "learning_rate": 3.218852437060996e-05, + "loss": 0.172, + "step": 26151 + }, + { + "epoch": 0.46645025505654053, + "grad_norm": 0.31318241357803345, + "learning_rate": 3.2187033582994704e-05, + "loss": 0.1284, + "step": 26152 + }, + { + "epoch": 0.4664680911782542, + "grad_norm": 0.3041784167289734, + "learning_rate": 3.218554276752002e-05, + "loss": 0.1398, + "step": 26153 + }, + { + "epoch": 0.4664859272999679, + "grad_norm": 0.2556712031364441, + "learning_rate": 3.218405192419168e-05, + "loss": 0.1495, + "step": 26154 + }, + { + "epoch": 0.4665037634216816, + "grad_norm": 0.24803361296653748, + "learning_rate": 3.2182561053015486e-05, + "loss": 0.1444, + "step": 26155 + }, + { + "epoch": 0.4665215995433953, + "grad_norm": 0.16730177402496338, + "learning_rate": 3.2181070153997204e-05, + "loss": 0.11, + "step": 26156 + }, + { + "epoch": 0.46653943566510897, + "grad_norm": 0.26929745078086853, + "learning_rate": 3.217957922714261e-05, + "loss": 0.1484, + "step": 26157 + }, + { + "epoch": 0.46655727178682266, + "grad_norm": 0.32538139820098877, + "learning_rate": 3.217808827245748e-05, + "loss": 0.1719, + "step": 26158 + }, + { + "epoch": 0.46657510790853635, + "grad_norm": 0.2770591378211975, + "learning_rate": 3.2176597289947616e-05, + "loss": 0.1129, + "step": 26159 + }, + { + "epoch": 0.4665929440302501, + "grad_norm": 0.27576369047164917, + "learning_rate": 3.2175106279618775e-05, + "loss": 0.1127, + "step": 26160 + }, + { + "epoch": 0.4666107801519638, + "grad_norm": 0.3002508282661438, + "learning_rate": 3.217361524147674e-05, + "loss": 0.1458, + "step": 26161 + }, + { + "epoch": 0.46662861627367747, + "grad_norm": 0.23747000098228455, + "learning_rate": 3.21721241755273e-05, + "loss": 0.1056, + "step": 26162 + }, + { + "epoch": 0.46664645239539115, + "grad_norm": 0.25853633880615234, + "learning_rate": 3.2170633081776224e-05, + "loss": 0.1946, + "step": 26163 + }, + { + "epoch": 0.46666428851710484, + "grad_norm": 0.5431085228919983, + "learning_rate": 3.21691419602293e-05, + "loss": 0.3354, + "step": 26164 + }, + { + "epoch": 0.46668212463881853, + "grad_norm": 0.3621675372123718, + "learning_rate": 3.2167650810892305e-05, + "loss": 0.1782, + "step": 26165 + }, + { + "epoch": 0.4666999607605322, + "grad_norm": 0.4184924364089966, + "learning_rate": 3.216615963377103e-05, + "loss": 0.193, + "step": 26166 + }, + { + "epoch": 0.4667177968822459, + "grad_norm": 0.2539195716381073, + "learning_rate": 3.216466842887123e-05, + "loss": 0.1371, + "step": 26167 + }, + { + "epoch": 0.4667356330039596, + "grad_norm": 0.2213921993970871, + "learning_rate": 3.216317719619871e-05, + "loss": 0.1359, + "step": 26168 + }, + { + "epoch": 0.46675346912567334, + "grad_norm": 0.3268488645553589, + "learning_rate": 3.2161685935759235e-05, + "loss": 0.1401, + "step": 26169 + }, + { + "epoch": 0.466771305247387, + "grad_norm": 0.3819471001625061, + "learning_rate": 3.2160194647558593e-05, + "loss": 0.1795, + "step": 26170 + }, + { + "epoch": 0.4667891413691007, + "grad_norm": 0.27619296312332153, + "learning_rate": 3.2158703331602566e-05, + "loss": 0.1735, + "step": 26171 + }, + { + "epoch": 0.4668069774908144, + "grad_norm": 0.29396742582321167, + "learning_rate": 3.215721198789693e-05, + "loss": 0.1667, + "step": 26172 + }, + { + "epoch": 0.4668248136125281, + "grad_norm": 0.22877304255962372, + "learning_rate": 3.2155720616447474e-05, + "loss": 0.1686, + "step": 26173 + }, + { + "epoch": 0.4668426497342418, + "grad_norm": 0.27977070212364197, + "learning_rate": 3.215422921725997e-05, + "loss": 0.1475, + "step": 26174 + }, + { + "epoch": 0.46686048585595546, + "grad_norm": 0.31222712993621826, + "learning_rate": 3.21527377903402e-05, + "loss": 0.1725, + "step": 26175 + }, + { + "epoch": 0.46687832197766915, + "grad_norm": 0.25478875637054443, + "learning_rate": 3.215124633569395e-05, + "loss": 0.1458, + "step": 26176 + }, + { + "epoch": 0.4668961580993829, + "grad_norm": 0.42479678988456726, + "learning_rate": 3.2149754853326994e-05, + "loss": 0.1654, + "step": 26177 + }, + { + "epoch": 0.4669139942210966, + "grad_norm": 0.2332555651664734, + "learning_rate": 3.214826334324513e-05, + "loss": 0.137, + "step": 26178 + }, + { + "epoch": 0.4669318303428103, + "grad_norm": 0.26375123858451843, + "learning_rate": 3.214677180545412e-05, + "loss": 0.1554, + "step": 26179 + }, + { + "epoch": 0.46694966646452396, + "grad_norm": 0.2504287660121918, + "learning_rate": 3.2145280239959756e-05, + "loss": 0.1419, + "step": 26180 + }, + { + "epoch": 0.46696750258623765, + "grad_norm": 0.23868931829929352, + "learning_rate": 3.2143788646767825e-05, + "loss": 0.2077, + "step": 26181 + }, + { + "epoch": 0.46698533870795134, + "grad_norm": 0.28528350591659546, + "learning_rate": 3.214229702588409e-05, + "loss": 0.1334, + "step": 26182 + }, + { + "epoch": 0.467003174829665, + "grad_norm": 0.28509292006492615, + "learning_rate": 3.2140805377314346e-05, + "loss": 0.2047, + "step": 26183 + }, + { + "epoch": 0.4670210109513787, + "grad_norm": 0.3331129252910614, + "learning_rate": 3.2139313701064384e-05, + "loss": 0.1395, + "step": 26184 + }, + { + "epoch": 0.4670388470730924, + "grad_norm": 0.2526521384716034, + "learning_rate": 3.2137821997139965e-05, + "loss": 0.1272, + "step": 26185 + }, + { + "epoch": 0.46705668319480614, + "grad_norm": 0.21979525685310364, + "learning_rate": 3.213633026554689e-05, + "loss": 0.1307, + "step": 26186 + }, + { + "epoch": 0.46707451931651983, + "grad_norm": 0.20031945407390594, + "learning_rate": 3.213483850629093e-05, + "loss": 0.1178, + "step": 26187 + }, + { + "epoch": 0.4670923554382335, + "grad_norm": 0.19642189145088196, + "learning_rate": 3.213334671937788e-05, + "loss": 0.1432, + "step": 26188 + }, + { + "epoch": 0.4671101915599472, + "grad_norm": 0.36276355385780334, + "learning_rate": 3.213185490481351e-05, + "loss": 0.12, + "step": 26189 + }, + { + "epoch": 0.4671280276816609, + "grad_norm": 0.32859084010124207, + "learning_rate": 3.2130363062603594e-05, + "loss": 0.1636, + "step": 26190 + }, + { + "epoch": 0.4671458638033746, + "grad_norm": 0.2847687900066376, + "learning_rate": 3.212887119275394e-05, + "loss": 0.1652, + "step": 26191 + }, + { + "epoch": 0.46716369992508827, + "grad_norm": 0.3417710065841675, + "learning_rate": 3.212737929527032e-05, + "loss": 0.0908, + "step": 26192 + }, + { + "epoch": 0.46718153604680196, + "grad_norm": 0.24598552286624908, + "learning_rate": 3.2125887370158515e-05, + "loss": 0.1542, + "step": 26193 + }, + { + "epoch": 0.4671993721685157, + "grad_norm": 0.3142749071121216, + "learning_rate": 3.212439541742431e-05, + "loss": 0.1332, + "step": 26194 + }, + { + "epoch": 0.4672172082902294, + "grad_norm": 0.35682913661003113, + "learning_rate": 3.212290343707348e-05, + "loss": 0.2032, + "step": 26195 + }, + { + "epoch": 0.4672350444119431, + "grad_norm": 0.3021930158138275, + "learning_rate": 3.212141142911183e-05, + "loss": 0.1505, + "step": 26196 + }, + { + "epoch": 0.46725288053365677, + "grad_norm": 0.32572516798973083, + "learning_rate": 3.2119919393545117e-05, + "loss": 0.2172, + "step": 26197 + }, + { + "epoch": 0.46727071665537045, + "grad_norm": 0.2606397271156311, + "learning_rate": 3.2118427330379144e-05, + "loss": 0.1411, + "step": 26198 + }, + { + "epoch": 0.46728855277708414, + "grad_norm": 0.29652053117752075, + "learning_rate": 3.2116935239619685e-05, + "loss": 0.0634, + "step": 26199 + }, + { + "epoch": 0.46730638889879783, + "grad_norm": 0.3495809733867645, + "learning_rate": 3.2115443121272524e-05, + "loss": 0.086, + "step": 26200 + }, + { + "epoch": 0.4673242250205115, + "grad_norm": 0.2699562609195709, + "learning_rate": 3.211395097534346e-05, + "loss": 0.1264, + "step": 26201 + }, + { + "epoch": 0.4673420611422252, + "grad_norm": 0.26937612891197205, + "learning_rate": 3.211245880183825e-05, + "loss": 0.1438, + "step": 26202 + }, + { + "epoch": 0.46735989726393895, + "grad_norm": 0.23679888248443604, + "learning_rate": 3.2110966600762704e-05, + "loss": 0.1291, + "step": 26203 + }, + { + "epoch": 0.46737773338565264, + "grad_norm": 0.31273531913757324, + "learning_rate": 3.2109474372122584e-05, + "loss": 0.1436, + "step": 26204 + }, + { + "epoch": 0.4673955695073663, + "grad_norm": 0.31768372654914856, + "learning_rate": 3.21079821159237e-05, + "loss": 0.1285, + "step": 26205 + }, + { + "epoch": 0.46741340562908, + "grad_norm": 0.29872846603393555, + "learning_rate": 3.2106489832171807e-05, + "loss": 0.1618, + "step": 26206 + }, + { + "epoch": 0.4674312417507937, + "grad_norm": 0.2371252179145813, + "learning_rate": 3.210499752087272e-05, + "loss": 0.1422, + "step": 26207 + }, + { + "epoch": 0.4674490778725074, + "grad_norm": 0.2899041175842285, + "learning_rate": 3.210350518203221e-05, + "loss": 0.1996, + "step": 26208 + }, + { + "epoch": 0.4674669139942211, + "grad_norm": 0.41877254843711853, + "learning_rate": 3.2102012815656046e-05, + "loss": 0.1797, + "step": 26209 + }, + { + "epoch": 0.46748475011593477, + "grad_norm": 0.24062591791152954, + "learning_rate": 3.2100520421750034e-05, + "loss": 0.1186, + "step": 26210 + }, + { + "epoch": 0.4675025862376485, + "grad_norm": 0.2736894488334656, + "learning_rate": 3.209902800031995e-05, + "loss": 0.114, + "step": 26211 + }, + { + "epoch": 0.4675204223593622, + "grad_norm": 0.29374274611473083, + "learning_rate": 3.209753555137158e-05, + "loss": 0.1925, + "step": 26212 + }, + { + "epoch": 0.4675382584810759, + "grad_norm": 0.22029341757297516, + "learning_rate": 3.209604307491072e-05, + "loss": 0.1387, + "step": 26213 + }, + { + "epoch": 0.4675560946027896, + "grad_norm": 0.3146451413631439, + "learning_rate": 3.209455057094315e-05, + "loss": 0.179, + "step": 26214 + }, + { + "epoch": 0.46757393072450326, + "grad_norm": 0.26568135619163513, + "learning_rate": 3.2093058039474633e-05, + "loss": 0.181, + "step": 26215 + }, + { + "epoch": 0.46759176684621695, + "grad_norm": 0.3594539165496826, + "learning_rate": 3.209156548051099e-05, + "loss": 0.1788, + "step": 26216 + }, + { + "epoch": 0.46760960296793064, + "grad_norm": 0.23460763692855835, + "learning_rate": 3.2090072894057977e-05, + "loss": 0.1599, + "step": 26217 + }, + { + "epoch": 0.4676274390896443, + "grad_norm": 0.25468727946281433, + "learning_rate": 3.2088580280121394e-05, + "loss": 0.136, + "step": 26218 + }, + { + "epoch": 0.46764527521135807, + "grad_norm": 0.3144964575767517, + "learning_rate": 3.208708763870703e-05, + "loss": 0.1411, + "step": 26219 + }, + { + "epoch": 0.46766311133307176, + "grad_norm": 0.26124510169029236, + "learning_rate": 3.2085594969820666e-05, + "loss": 0.141, + "step": 26220 + }, + { + "epoch": 0.46768094745478545, + "grad_norm": 0.3724759817123413, + "learning_rate": 3.208410227346809e-05, + "loss": 0.1772, + "step": 26221 + }, + { + "epoch": 0.46769878357649913, + "grad_norm": 0.22663894295692444, + "learning_rate": 3.208260954965508e-05, + "loss": 0.1209, + "step": 26222 + }, + { + "epoch": 0.4677166196982128, + "grad_norm": 0.2945179343223572, + "learning_rate": 3.208111679838744e-05, + "loss": 0.1359, + "step": 26223 + }, + { + "epoch": 0.4677344558199265, + "grad_norm": 0.3263687193393707, + "learning_rate": 3.2079624019670933e-05, + "loss": 0.1457, + "step": 26224 + }, + { + "epoch": 0.4677522919416402, + "grad_norm": 0.28166189789772034, + "learning_rate": 3.207813121351137e-05, + "loss": 0.1624, + "step": 26225 + }, + { + "epoch": 0.4677701280633539, + "grad_norm": 0.6706572771072388, + "learning_rate": 3.207663837991452e-05, + "loss": 0.1583, + "step": 26226 + }, + { + "epoch": 0.4677879641850676, + "grad_norm": 0.19967494904994965, + "learning_rate": 3.207514551888618e-05, + "loss": 0.163, + "step": 26227 + }, + { + "epoch": 0.4678058003067813, + "grad_norm": 0.2821843922138214, + "learning_rate": 3.207365263043213e-05, + "loss": 0.1488, + "step": 26228 + }, + { + "epoch": 0.467823636428495, + "grad_norm": 0.2672920227050781, + "learning_rate": 3.207215971455816e-05, + "loss": 0.0851, + "step": 26229 + }, + { + "epoch": 0.4678414725502087, + "grad_norm": 0.26915085315704346, + "learning_rate": 3.207066677127005e-05, + "loss": 0.1639, + "step": 26230 + }, + { + "epoch": 0.4678593086719224, + "grad_norm": 0.28028029203414917, + "learning_rate": 3.206917380057359e-05, + "loss": 0.1305, + "step": 26231 + }, + { + "epoch": 0.46787714479363607, + "grad_norm": 0.24594922363758087, + "learning_rate": 3.206768080247458e-05, + "loss": 0.1267, + "step": 26232 + }, + { + "epoch": 0.46789498091534976, + "grad_norm": 0.3995407819747925, + "learning_rate": 3.20661877769788e-05, + "loss": 0.1798, + "step": 26233 + }, + { + "epoch": 0.46791281703706344, + "grad_norm": 0.30076685547828674, + "learning_rate": 3.206469472409204e-05, + "loss": 0.1124, + "step": 26234 + }, + { + "epoch": 0.46793065315877713, + "grad_norm": 0.2927619516849518, + "learning_rate": 3.206320164382007e-05, + "loss": 0.1535, + "step": 26235 + }, + { + "epoch": 0.4679484892804909, + "grad_norm": 0.19791756570339203, + "learning_rate": 3.20617085361687e-05, + "loss": 0.1173, + "step": 26236 + }, + { + "epoch": 0.46796632540220456, + "grad_norm": 0.33532899618148804, + "learning_rate": 3.20602154011437e-05, + "loss": 0.1516, + "step": 26237 + }, + { + "epoch": 0.46798416152391825, + "grad_norm": 0.3091859519481659, + "learning_rate": 3.205872223875087e-05, + "loss": 0.1259, + "step": 26238 + }, + { + "epoch": 0.46800199764563194, + "grad_norm": 0.3842073380947113, + "learning_rate": 3.2057229048996e-05, + "loss": 0.1364, + "step": 26239 + }, + { + "epoch": 0.4680198337673456, + "grad_norm": 0.24250420928001404, + "learning_rate": 3.205573583188486e-05, + "loss": 0.1332, + "step": 26240 + }, + { + "epoch": 0.4680376698890593, + "grad_norm": 0.2488769143819809, + "learning_rate": 3.205424258742327e-05, + "loss": 0.1598, + "step": 26241 + }, + { + "epoch": 0.468055506010773, + "grad_norm": 0.5432834625244141, + "learning_rate": 3.2052749315616985e-05, + "loss": 0.1751, + "step": 26242 + }, + { + "epoch": 0.4680733421324867, + "grad_norm": 0.4061277210712433, + "learning_rate": 3.205125601647181e-05, + "loss": 0.1791, + "step": 26243 + }, + { + "epoch": 0.4680911782542004, + "grad_norm": 0.2756275534629822, + "learning_rate": 3.2049762689993536e-05, + "loss": 0.1877, + "step": 26244 + }, + { + "epoch": 0.4681090143759141, + "grad_norm": 0.18963421881198883, + "learning_rate": 3.204826933618794e-05, + "loss": 0.1165, + "step": 26245 + }, + { + "epoch": 0.4681268504976278, + "grad_norm": 0.2764233350753784, + "learning_rate": 3.2046775955060823e-05, + "loss": 0.1928, + "step": 26246 + }, + { + "epoch": 0.4681446866193415, + "grad_norm": 0.20233765244483948, + "learning_rate": 3.2045282546617975e-05, + "loss": 0.1189, + "step": 26247 + }, + { + "epoch": 0.4681625227410552, + "grad_norm": 0.2738974392414093, + "learning_rate": 3.2043789110865164e-05, + "loss": 0.1613, + "step": 26248 + }, + { + "epoch": 0.4681803588627689, + "grad_norm": 0.19651754200458527, + "learning_rate": 3.20422956478082e-05, + "loss": 0.1238, + "step": 26249 + }, + { + "epoch": 0.46819819498448256, + "grad_norm": 0.27636095881462097, + "learning_rate": 3.2040802157452866e-05, + "loss": 0.1552, + "step": 26250 + }, + { + "epoch": 0.46821603110619625, + "grad_norm": 0.29355713725090027, + "learning_rate": 3.2039308639804946e-05, + "loss": 0.1198, + "step": 26251 + }, + { + "epoch": 0.46823386722790994, + "grad_norm": 0.26799729466438293, + "learning_rate": 3.203781509487024e-05, + "loss": 0.1602, + "step": 26252 + }, + { + "epoch": 0.4682517033496237, + "grad_norm": 0.2301194667816162, + "learning_rate": 3.203632152265453e-05, + "loss": 0.1404, + "step": 26253 + }, + { + "epoch": 0.46826953947133737, + "grad_norm": 0.28055834770202637, + "learning_rate": 3.2034827923163605e-05, + "loss": 0.1512, + "step": 26254 + }, + { + "epoch": 0.46828737559305106, + "grad_norm": 0.34947171807289124, + "learning_rate": 3.2033334296403266e-05, + "loss": 0.1523, + "step": 26255 + }, + { + "epoch": 0.46830521171476475, + "grad_norm": 0.30112358927726746, + "learning_rate": 3.203184064237929e-05, + "loss": 0.1996, + "step": 26256 + }, + { + "epoch": 0.46832304783647843, + "grad_norm": 0.32969823479652405, + "learning_rate": 3.203034696109748e-05, + "loss": 0.1563, + "step": 26257 + }, + { + "epoch": 0.4683408839581921, + "grad_norm": 0.3396940231323242, + "learning_rate": 3.2028853252563594e-05, + "loss": 0.1402, + "step": 26258 + }, + { + "epoch": 0.4683587200799058, + "grad_norm": 0.22942569851875305, + "learning_rate": 3.202735951678346e-05, + "loss": 0.0856, + "step": 26259 + }, + { + "epoch": 0.4683765562016195, + "grad_norm": 0.20532183349132538, + "learning_rate": 3.2025865753762855e-05, + "loss": 0.1298, + "step": 26260 + }, + { + "epoch": 0.46839439232333324, + "grad_norm": 0.2443859875202179, + "learning_rate": 3.202437196350756e-05, + "loss": 0.1024, + "step": 26261 + }, + { + "epoch": 0.46841222844504693, + "grad_norm": 0.20618800818920135, + "learning_rate": 3.2022878146023386e-05, + "loss": 0.1526, + "step": 26262 + }, + { + "epoch": 0.4684300645667606, + "grad_norm": 0.21313892304897308, + "learning_rate": 3.2021384301316095e-05, + "loss": 0.1424, + "step": 26263 + }, + { + "epoch": 0.4684479006884743, + "grad_norm": 0.22955258190631866, + "learning_rate": 3.20198904293915e-05, + "loss": 0.1159, + "step": 26264 + }, + { + "epoch": 0.468465736810188, + "grad_norm": 0.3054492175579071, + "learning_rate": 3.2018396530255385e-05, + "loss": 0.1339, + "step": 26265 + }, + { + "epoch": 0.4684835729319017, + "grad_norm": 0.3174167573451996, + "learning_rate": 3.201690260391354e-05, + "loss": 0.1639, + "step": 26266 + }, + { + "epoch": 0.46850140905361537, + "grad_norm": 0.27130749821662903, + "learning_rate": 3.201540865037175e-05, + "loss": 0.2028, + "step": 26267 + }, + { + "epoch": 0.46851924517532906, + "grad_norm": 0.25795820355415344, + "learning_rate": 3.201391466963583e-05, + "loss": 0.1609, + "step": 26268 + }, + { + "epoch": 0.46853708129704275, + "grad_norm": 0.29352548718452454, + "learning_rate": 3.201242066171154e-05, + "loss": 0.191, + "step": 26269 + }, + { + "epoch": 0.4685549174187565, + "grad_norm": 0.2807568907737732, + "learning_rate": 3.20109266266047e-05, + "loss": 0.1168, + "step": 26270 + }, + { + "epoch": 0.4685727535404702, + "grad_norm": 0.32258519530296326, + "learning_rate": 3.200943256432107e-05, + "loss": 0.1608, + "step": 26271 + }, + { + "epoch": 0.46859058966218387, + "grad_norm": 0.2827734351158142, + "learning_rate": 3.200793847486647e-05, + "loss": 0.1392, + "step": 26272 + }, + { + "epoch": 0.46860842578389755, + "grad_norm": 0.22582592070102692, + "learning_rate": 3.2006444358246676e-05, + "loss": 0.1416, + "step": 26273 + }, + { + "epoch": 0.46862626190561124, + "grad_norm": 0.24394312500953674, + "learning_rate": 3.200495021446748e-05, + "loss": 0.1146, + "step": 26274 + }, + { + "epoch": 0.46864409802732493, + "grad_norm": 0.2754935026168823, + "learning_rate": 3.200345604353469e-05, + "loss": 0.1435, + "step": 26275 + }, + { + "epoch": 0.4686619341490386, + "grad_norm": 0.27431949973106384, + "learning_rate": 3.200196184545407e-05, + "loss": 0.1158, + "step": 26276 + }, + { + "epoch": 0.4686797702707523, + "grad_norm": 0.2708548903465271, + "learning_rate": 3.200046762023144e-05, + "loss": 0.2015, + "step": 26277 + }, + { + "epoch": 0.46869760639246605, + "grad_norm": 0.2920028269290924, + "learning_rate": 3.199897336787257e-05, + "loss": 0.154, + "step": 26278 + }, + { + "epoch": 0.46871544251417974, + "grad_norm": 0.26886850595474243, + "learning_rate": 3.199747908838326e-05, + "loss": 0.1598, + "step": 26279 + }, + { + "epoch": 0.4687332786358934, + "grad_norm": 0.2931189239025116, + "learning_rate": 3.199598478176931e-05, + "loss": 0.1098, + "step": 26280 + }, + { + "epoch": 0.4687511147576071, + "grad_norm": 0.35463953018188477, + "learning_rate": 3.199449044803651e-05, + "loss": 0.1647, + "step": 26281 + }, + { + "epoch": 0.4687689508793208, + "grad_norm": 0.25132209062576294, + "learning_rate": 3.1992996087190643e-05, + "loss": 0.1402, + "step": 26282 + }, + { + "epoch": 0.4687867870010345, + "grad_norm": 0.29833027720451355, + "learning_rate": 3.199150169923751e-05, + "loss": 0.1286, + "step": 26283 + }, + { + "epoch": 0.4688046231227482, + "grad_norm": 0.1923772394657135, + "learning_rate": 3.1990007284182904e-05, + "loss": 0.1355, + "step": 26284 + }, + { + "epoch": 0.46882245924446186, + "grad_norm": 0.22590340673923492, + "learning_rate": 3.198851284203262e-05, + "loss": 0.1572, + "step": 26285 + }, + { + "epoch": 0.46884029536617555, + "grad_norm": 0.2155126929283142, + "learning_rate": 3.198701837279244e-05, + "loss": 0.1515, + "step": 26286 + }, + { + "epoch": 0.4688581314878893, + "grad_norm": 0.33970415592193604, + "learning_rate": 3.198552387646816e-05, + "loss": 0.139, + "step": 26287 + }, + { + "epoch": 0.468875967609603, + "grad_norm": 0.2502792179584503, + "learning_rate": 3.198402935306558e-05, + "loss": 0.1222, + "step": 26288 + }, + { + "epoch": 0.46889380373131667, + "grad_norm": 0.26113376021385193, + "learning_rate": 3.198253480259049e-05, + "loss": 0.1138, + "step": 26289 + }, + { + "epoch": 0.46891163985303036, + "grad_norm": 0.2515289783477783, + "learning_rate": 3.1981040225048686e-05, + "loss": 0.1391, + "step": 26290 + }, + { + "epoch": 0.46892947597474405, + "grad_norm": 0.29478350281715393, + "learning_rate": 3.197954562044596e-05, + "loss": 0.1481, + "step": 26291 + }, + { + "epoch": 0.46894731209645774, + "grad_norm": 0.24080762267112732, + "learning_rate": 3.19780509887881e-05, + "loss": 0.1387, + "step": 26292 + }, + { + "epoch": 0.4689651482181714, + "grad_norm": 0.25260424613952637, + "learning_rate": 3.197655633008091e-05, + "loss": 0.1626, + "step": 26293 + }, + { + "epoch": 0.4689829843398851, + "grad_norm": 0.23998117446899414, + "learning_rate": 3.197506164433017e-05, + "loss": 0.1262, + "step": 26294 + }, + { + "epoch": 0.46900082046159886, + "grad_norm": 0.30281248688697815, + "learning_rate": 3.1973566931541684e-05, + "loss": 0.1344, + "step": 26295 + }, + { + "epoch": 0.46901865658331254, + "grad_norm": 0.23442330956459045, + "learning_rate": 3.197207219172125e-05, + "loss": 0.1322, + "step": 26296 + }, + { + "epoch": 0.46903649270502623, + "grad_norm": 0.5462908148765564, + "learning_rate": 3.1970577424874646e-05, + "loss": 0.1583, + "step": 26297 + }, + { + "epoch": 0.4690543288267399, + "grad_norm": 0.20979632437229156, + "learning_rate": 3.1969082631007686e-05, + "loss": 0.1652, + "step": 26298 + }, + { + "epoch": 0.4690721649484536, + "grad_norm": 0.27456435561180115, + "learning_rate": 3.196758781012615e-05, + "loss": 0.1754, + "step": 26299 + }, + { + "epoch": 0.4690900010701673, + "grad_norm": 0.36536943912506104, + "learning_rate": 3.1966092962235835e-05, + "loss": 0.1691, + "step": 26300 + }, + { + "epoch": 0.469107837191881, + "grad_norm": 0.22177709639072418, + "learning_rate": 3.1964598087342544e-05, + "loss": 0.1169, + "step": 26301 + }, + { + "epoch": 0.46912567331359467, + "grad_norm": 0.23451322317123413, + "learning_rate": 3.196310318545206e-05, + "loss": 0.0926, + "step": 26302 + }, + { + "epoch": 0.46914350943530836, + "grad_norm": 0.26566970348358154, + "learning_rate": 3.196160825657019e-05, + "loss": 0.1204, + "step": 26303 + }, + { + "epoch": 0.4691613455570221, + "grad_norm": 0.361223429441452, + "learning_rate": 3.196011330070271e-05, + "loss": 0.1998, + "step": 26304 + }, + { + "epoch": 0.4691791816787358, + "grad_norm": 0.18521316349506378, + "learning_rate": 3.195861831785544e-05, + "loss": 0.1454, + "step": 26305 + }, + { + "epoch": 0.4691970178004495, + "grad_norm": 0.23742611706256866, + "learning_rate": 3.195712330803415e-05, + "loss": 0.1282, + "step": 26306 + }, + { + "epoch": 0.46921485392216317, + "grad_norm": 0.27079543471336365, + "learning_rate": 3.1955628271244655e-05, + "loss": 0.111, + "step": 26307 + }, + { + "epoch": 0.46923269004387685, + "grad_norm": 0.3403595983982086, + "learning_rate": 3.195413320749274e-05, + "loss": 0.1566, + "step": 26308 + }, + { + "epoch": 0.46925052616559054, + "grad_norm": 0.2622055411338806, + "learning_rate": 3.19526381167842e-05, + "loss": 0.1825, + "step": 26309 + }, + { + "epoch": 0.46926836228730423, + "grad_norm": 0.20057597756385803, + "learning_rate": 3.1951142999124836e-05, + "loss": 0.0756, + "step": 26310 + }, + { + "epoch": 0.4692861984090179, + "grad_norm": 0.26419612765312195, + "learning_rate": 3.1949647854520446e-05, + "loss": 0.1396, + "step": 26311 + }, + { + "epoch": 0.46930403453073166, + "grad_norm": 0.24994225800037384, + "learning_rate": 3.194815268297681e-05, + "loss": 0.1484, + "step": 26312 + }, + { + "epoch": 0.46932187065244535, + "grad_norm": 0.3066805303096771, + "learning_rate": 3.194665748449975e-05, + "loss": 0.1635, + "step": 26313 + }, + { + "epoch": 0.46933970677415904, + "grad_norm": 0.28367283940315247, + "learning_rate": 3.194516225909503e-05, + "loss": 0.1287, + "step": 26314 + }, + { + "epoch": 0.4693575428958727, + "grad_norm": 0.3292962908744812, + "learning_rate": 3.194366700676847e-05, + "loss": 0.1833, + "step": 26315 + }, + { + "epoch": 0.4693753790175864, + "grad_norm": 0.3581288456916809, + "learning_rate": 3.194217172752586e-05, + "loss": 0.1443, + "step": 26316 + }, + { + "epoch": 0.4693932151393001, + "grad_norm": 0.39648306369781494, + "learning_rate": 3.194067642137299e-05, + "loss": 0.1387, + "step": 26317 + }, + { + "epoch": 0.4694110512610138, + "grad_norm": 0.3037940263748169, + "learning_rate": 3.193918108831566e-05, + "loss": 0.1474, + "step": 26318 + }, + { + "epoch": 0.4694288873827275, + "grad_norm": 0.20100311934947968, + "learning_rate": 3.193768572835967e-05, + "loss": 0.1707, + "step": 26319 + }, + { + "epoch": 0.4694467235044412, + "grad_norm": 0.5518757104873657, + "learning_rate": 3.1936190341510816e-05, + "loss": 0.1501, + "step": 26320 + }, + { + "epoch": 0.4694645596261549, + "grad_norm": 0.23308266699314117, + "learning_rate": 3.1934694927774894e-05, + "loss": 0.1836, + "step": 26321 + }, + { + "epoch": 0.4694823957478686, + "grad_norm": 0.30691757798194885, + "learning_rate": 3.1933199487157696e-05, + "loss": 0.2167, + "step": 26322 + }, + { + "epoch": 0.4695002318695823, + "grad_norm": 0.3114815950393677, + "learning_rate": 3.193170401966502e-05, + "loss": 0.1672, + "step": 26323 + }, + { + "epoch": 0.469518067991296, + "grad_norm": 0.2991527318954468, + "learning_rate": 3.193020852530267e-05, + "loss": 0.1523, + "step": 26324 + }, + { + "epoch": 0.46953590411300966, + "grad_norm": 0.3079780638217926, + "learning_rate": 3.192871300407643e-05, + "loss": 0.1864, + "step": 26325 + }, + { + "epoch": 0.46955374023472335, + "grad_norm": 0.2281007170677185, + "learning_rate": 3.192721745599211e-05, + "loss": 0.1402, + "step": 26326 + }, + { + "epoch": 0.46957157635643704, + "grad_norm": 0.2386164665222168, + "learning_rate": 3.1925721881055505e-05, + "loss": 0.1398, + "step": 26327 + }, + { + "epoch": 0.4695894124781507, + "grad_norm": 0.22394618391990662, + "learning_rate": 3.192422627927241e-05, + "loss": 0.1497, + "step": 26328 + }, + { + "epoch": 0.46960724859986447, + "grad_norm": 0.2283318191766739, + "learning_rate": 3.1922730650648616e-05, + "loss": 0.1351, + "step": 26329 + }, + { + "epoch": 0.46962508472157816, + "grad_norm": 0.31934654712677, + "learning_rate": 3.1921234995189935e-05, + "loss": 0.1785, + "step": 26330 + }, + { + "epoch": 0.46964292084329184, + "grad_norm": 0.5170240998268127, + "learning_rate": 3.1919739312902155e-05, + "loss": 0.1098, + "step": 26331 + }, + { + "epoch": 0.46966075696500553, + "grad_norm": 0.26554417610168457, + "learning_rate": 3.191824360379107e-05, + "loss": 0.1397, + "step": 26332 + }, + { + "epoch": 0.4696785930867192, + "grad_norm": 0.2660197615623474, + "learning_rate": 3.19167478678625e-05, + "loss": 0.1462, + "step": 26333 + }, + { + "epoch": 0.4696964292084329, + "grad_norm": 0.29562854766845703, + "learning_rate": 3.191525210512221e-05, + "loss": 0.1891, + "step": 26334 + }, + { + "epoch": 0.4697142653301466, + "grad_norm": 0.22776257991790771, + "learning_rate": 3.191375631557601e-05, + "loss": 0.1422, + "step": 26335 + }, + { + "epoch": 0.4697321014518603, + "grad_norm": 0.22175659239292145, + "learning_rate": 3.1912260499229706e-05, + "loss": 0.1233, + "step": 26336 + }, + { + "epoch": 0.46974993757357403, + "grad_norm": 0.3063209354877472, + "learning_rate": 3.1910764656089096e-05, + "loss": 0.1813, + "step": 26337 + }, + { + "epoch": 0.4697677736952877, + "grad_norm": 0.2530362904071808, + "learning_rate": 3.190926878615998e-05, + "loss": 0.1182, + "step": 26338 + }, + { + "epoch": 0.4697856098170014, + "grad_norm": 0.307373046875, + "learning_rate": 3.190777288944815e-05, + "loss": 0.1267, + "step": 26339 + }, + { + "epoch": 0.4698034459387151, + "grad_norm": 0.22828654944896698, + "learning_rate": 3.1906276965959406e-05, + "loss": 0.0776, + "step": 26340 + }, + { + "epoch": 0.4698212820604288, + "grad_norm": 0.3093816637992859, + "learning_rate": 3.190478101569955e-05, + "loss": 0.1493, + "step": 26341 + }, + { + "epoch": 0.46983911818214247, + "grad_norm": 0.32637467980384827, + "learning_rate": 3.190328503867437e-05, + "loss": 0.1022, + "step": 26342 + }, + { + "epoch": 0.46985695430385616, + "grad_norm": 0.21642765402793884, + "learning_rate": 3.190178903488967e-05, + "loss": 0.1574, + "step": 26343 + }, + { + "epoch": 0.46987479042556984, + "grad_norm": 0.24719901382923126, + "learning_rate": 3.190029300435127e-05, + "loss": 0.1508, + "step": 26344 + }, + { + "epoch": 0.46989262654728353, + "grad_norm": 0.29670023918151855, + "learning_rate": 3.189879694706493e-05, + "loss": 0.1764, + "step": 26345 + }, + { + "epoch": 0.4699104626689973, + "grad_norm": 0.2982756793498993, + "learning_rate": 3.1897300863036487e-05, + "loss": 0.1753, + "step": 26346 + }, + { + "epoch": 0.46992829879071096, + "grad_norm": 0.2514650225639343, + "learning_rate": 3.189580475227171e-05, + "loss": 0.173, + "step": 26347 + }, + { + "epoch": 0.46994613491242465, + "grad_norm": 0.2989982068538666, + "learning_rate": 3.1894308614776424e-05, + "loss": 0.1556, + "step": 26348 + }, + { + "epoch": 0.46996397103413834, + "grad_norm": 0.30362823605537415, + "learning_rate": 3.18928124505564e-05, + "loss": 0.1192, + "step": 26349 + }, + { + "epoch": 0.469981807155852, + "grad_norm": 0.29709258675575256, + "learning_rate": 3.189131625961747e-05, + "loss": 0.0839, + "step": 26350 + }, + { + "epoch": 0.4699996432775657, + "grad_norm": 0.3220767080783844, + "learning_rate": 3.188982004196541e-05, + "loss": 0.1582, + "step": 26351 + }, + { + "epoch": 0.4700174793992794, + "grad_norm": 0.3034157454967499, + "learning_rate": 3.1888323797606034e-05, + "loss": 0.1448, + "step": 26352 + }, + { + "epoch": 0.4700353155209931, + "grad_norm": 0.28179022669792175, + "learning_rate": 3.188682752654514e-05, + "loss": 0.1399, + "step": 26353 + }, + { + "epoch": 0.47005315164270683, + "grad_norm": 0.2542007565498352, + "learning_rate": 3.1885331228788514e-05, + "loss": 0.139, + "step": 26354 + }, + { + "epoch": 0.4700709877644205, + "grad_norm": 0.2888839542865753, + "learning_rate": 3.188383490434197e-05, + "loss": 0.1799, + "step": 26355 + }, + { + "epoch": 0.4700888238861342, + "grad_norm": 0.37037986516952515, + "learning_rate": 3.18823385532113e-05, + "loss": 0.1666, + "step": 26356 + }, + { + "epoch": 0.4701066600078479, + "grad_norm": 0.26663026213645935, + "learning_rate": 3.188084217540231e-05, + "loss": 0.1067, + "step": 26357 + }, + { + "epoch": 0.4701244961295616, + "grad_norm": 0.3092741072177887, + "learning_rate": 3.18793457709208e-05, + "loss": 0.1224, + "step": 26358 + }, + { + "epoch": 0.4701423322512753, + "grad_norm": 0.25784868001937866, + "learning_rate": 3.187784933977258e-05, + "loss": 0.0975, + "step": 26359 + }, + { + "epoch": 0.47016016837298896, + "grad_norm": 0.31876060366630554, + "learning_rate": 3.187635288196342e-05, + "loss": 0.2221, + "step": 26360 + }, + { + "epoch": 0.47017800449470265, + "grad_norm": 0.31552034616470337, + "learning_rate": 3.187485639749916e-05, + "loss": 0.1451, + "step": 26361 + }, + { + "epoch": 0.4701958406164164, + "grad_norm": 0.24119937419891357, + "learning_rate": 3.1873359886385576e-05, + "loss": 0.1143, + "step": 26362 + }, + { + "epoch": 0.4702136767381301, + "grad_norm": 0.2996779978275299, + "learning_rate": 3.187186334862847e-05, + "loss": 0.1375, + "step": 26363 + }, + { + "epoch": 0.47023151285984377, + "grad_norm": 0.33241936564445496, + "learning_rate": 3.1870366784233646e-05, + "loss": 0.1331, + "step": 26364 + }, + { + "epoch": 0.47024934898155746, + "grad_norm": 0.33816003799438477, + "learning_rate": 3.1868870193206915e-05, + "loss": 0.1336, + "step": 26365 + }, + { + "epoch": 0.47026718510327115, + "grad_norm": 0.2590244710445404, + "learning_rate": 3.186737357555407e-05, + "loss": 0.1405, + "step": 26366 + }, + { + "epoch": 0.47028502122498483, + "grad_norm": 0.2345762848854065, + "learning_rate": 3.1865876931280915e-05, + "loss": 0.1267, + "step": 26367 + }, + { + "epoch": 0.4703028573466985, + "grad_norm": 0.24232934415340424, + "learning_rate": 3.186438026039325e-05, + "loss": 0.1177, + "step": 26368 + }, + { + "epoch": 0.4703206934684122, + "grad_norm": 0.23698510229587555, + "learning_rate": 3.186288356289687e-05, + "loss": 0.1374, + "step": 26369 + }, + { + "epoch": 0.4703385295901259, + "grad_norm": 0.3014860451221466, + "learning_rate": 3.186138683879758e-05, + "loss": 0.1499, + "step": 26370 + }, + { + "epoch": 0.47035636571183964, + "grad_norm": 0.3066621422767639, + "learning_rate": 3.1859890088101193e-05, + "loss": 0.1782, + "step": 26371 + }, + { + "epoch": 0.47037420183355333, + "grad_norm": 0.24974851310253143, + "learning_rate": 3.18583933108135e-05, + "loss": 0.1186, + "step": 26372 + }, + { + "epoch": 0.470392037955267, + "grad_norm": 0.3588750958442688, + "learning_rate": 3.18568965069403e-05, + "loss": 0.161, + "step": 26373 + }, + { + "epoch": 0.4704098740769807, + "grad_norm": 0.20368777215480804, + "learning_rate": 3.185539967648742e-05, + "loss": 0.1233, + "step": 26374 + }, + { + "epoch": 0.4704277101986944, + "grad_norm": 0.2930132746696472, + "learning_rate": 3.1853902819460624e-05, + "loss": 0.127, + "step": 26375 + }, + { + "epoch": 0.4704455463204081, + "grad_norm": 0.23821282386779785, + "learning_rate": 3.185240593586574e-05, + "loss": 0.1589, + "step": 26376 + }, + { + "epoch": 0.47046338244212177, + "grad_norm": 0.22390709817409515, + "learning_rate": 3.1850909025708556e-05, + "loss": 0.0778, + "step": 26377 + }, + { + "epoch": 0.47048121856383546, + "grad_norm": 0.2755146324634552, + "learning_rate": 3.18494120889949e-05, + "loss": 0.0971, + "step": 26378 + }, + { + "epoch": 0.4704990546855492, + "grad_norm": 0.34734922647476196, + "learning_rate": 3.184791512573054e-05, + "loss": 0.1703, + "step": 26379 + }, + { + "epoch": 0.4705168908072629, + "grad_norm": 0.2981151342391968, + "learning_rate": 3.18464181359213e-05, + "loss": 0.1331, + "step": 26380 + }, + { + "epoch": 0.4705347269289766, + "grad_norm": 0.20875748991966248, + "learning_rate": 3.184492111957299e-05, + "loss": 0.1518, + "step": 26381 + }, + { + "epoch": 0.47055256305069026, + "grad_norm": 0.24016188085079193, + "learning_rate": 3.1843424076691386e-05, + "loss": 0.1686, + "step": 26382 + }, + { + "epoch": 0.47057039917240395, + "grad_norm": 0.2279919981956482, + "learning_rate": 3.184192700728231e-05, + "loss": 0.1645, + "step": 26383 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.25243496894836426, + "learning_rate": 3.184042991135157e-05, + "loss": 0.1474, + "step": 26384 + }, + { + "epoch": 0.47060607141583133, + "grad_norm": 0.3161037564277649, + "learning_rate": 3.183893278890496e-05, + "loss": 0.1303, + "step": 26385 + }, + { + "epoch": 0.470623907537545, + "grad_norm": 0.2923678457736969, + "learning_rate": 3.1837435639948276e-05, + "loss": 0.1048, + "step": 26386 + }, + { + "epoch": 0.4706417436592587, + "grad_norm": 0.28258299827575684, + "learning_rate": 3.183593846448734e-05, + "loss": 0.0938, + "step": 26387 + }, + { + "epoch": 0.47065957978097245, + "grad_norm": 0.27700623869895935, + "learning_rate": 3.183444126252793e-05, + "loss": 0.1571, + "step": 26388 + }, + { + "epoch": 0.47067741590268614, + "grad_norm": 0.3193347752094269, + "learning_rate": 3.1832944034075876e-05, + "loss": 0.1678, + "step": 26389 + }, + { + "epoch": 0.4706952520243998, + "grad_norm": 0.27845674753189087, + "learning_rate": 3.1831446779136976e-05, + "loss": 0.1312, + "step": 26390 + }, + { + "epoch": 0.4707130881461135, + "grad_norm": 0.33350154757499695, + "learning_rate": 3.182994949771701e-05, + "loss": 0.1583, + "step": 26391 + }, + { + "epoch": 0.4707309242678272, + "grad_norm": 0.19883500039577484, + "learning_rate": 3.182845218982182e-05, + "loss": 0.0911, + "step": 26392 + }, + { + "epoch": 0.4707487603895409, + "grad_norm": 0.3399393856525421, + "learning_rate": 3.1826954855457184e-05, + "loss": 0.16, + "step": 26393 + }, + { + "epoch": 0.4707665965112546, + "grad_norm": 0.21473821997642517, + "learning_rate": 3.182545749462892e-05, + "loss": 0.1196, + "step": 26394 + }, + { + "epoch": 0.47078443263296826, + "grad_norm": 0.2845939099788666, + "learning_rate": 3.1823960107342815e-05, + "loss": 0.1628, + "step": 26395 + }, + { + "epoch": 0.470802268754682, + "grad_norm": 0.2274755984544754, + "learning_rate": 3.182246269360469e-05, + "loss": 0.1578, + "step": 26396 + }, + { + "epoch": 0.4708201048763957, + "grad_norm": 0.21096496284008026, + "learning_rate": 3.1820965253420335e-05, + "loss": 0.1311, + "step": 26397 + }, + { + "epoch": 0.4708379409981094, + "grad_norm": 0.22262373566627502, + "learning_rate": 3.181946778679557e-05, + "loss": 0.1368, + "step": 26398 + }, + { + "epoch": 0.47085577711982307, + "grad_norm": 0.3591758608818054, + "learning_rate": 3.181797029373619e-05, + "loss": 0.1724, + "step": 26399 + }, + { + "epoch": 0.47087361324153676, + "grad_norm": 0.29298755526542664, + "learning_rate": 3.1816472774248e-05, + "loss": 0.1397, + "step": 26400 + }, + { + "epoch": 0.47089144936325045, + "grad_norm": 0.2678554654121399, + "learning_rate": 3.181497522833681e-05, + "loss": 0.188, + "step": 26401 + }, + { + "epoch": 0.47090928548496414, + "grad_norm": 0.25611796975135803, + "learning_rate": 3.181347765600843e-05, + "loss": 0.1499, + "step": 26402 + }, + { + "epoch": 0.4709271216066778, + "grad_norm": 0.24687770009040833, + "learning_rate": 3.181198005726864e-05, + "loss": 0.1059, + "step": 26403 + }, + { + "epoch": 0.4709449577283915, + "grad_norm": 0.3149312734603882, + "learning_rate": 3.181048243212327e-05, + "loss": 0.1703, + "step": 26404 + }, + { + "epoch": 0.47096279385010525, + "grad_norm": 0.21981625258922577, + "learning_rate": 3.1808984780578126e-05, + "loss": 0.1217, + "step": 26405 + }, + { + "epoch": 0.47098062997181894, + "grad_norm": 0.3896492123603821, + "learning_rate": 3.1807487102639e-05, + "loss": 0.2234, + "step": 26406 + }, + { + "epoch": 0.47099846609353263, + "grad_norm": 0.22677797079086304, + "learning_rate": 3.1805989398311704e-05, + "loss": 0.1281, + "step": 26407 + }, + { + "epoch": 0.4710163022152463, + "grad_norm": 0.2642861306667328, + "learning_rate": 3.180449166760204e-05, + "loss": 0.1053, + "step": 26408 + }, + { + "epoch": 0.47103413833696, + "grad_norm": 0.38626107573509216, + "learning_rate": 3.180299391051582e-05, + "loss": 0.1536, + "step": 26409 + }, + { + "epoch": 0.4710519744586737, + "grad_norm": 0.3027537167072296, + "learning_rate": 3.180149612705884e-05, + "loss": 0.1638, + "step": 26410 + }, + { + "epoch": 0.4710698105803874, + "grad_norm": 0.22869277000427246, + "learning_rate": 3.1799998317236914e-05, + "loss": 0.16, + "step": 26411 + }, + { + "epoch": 0.47108764670210107, + "grad_norm": 0.26541826128959656, + "learning_rate": 3.179850048105584e-05, + "loss": 0.1388, + "step": 26412 + }, + { + "epoch": 0.4711054828238148, + "grad_norm": 0.2785693407058716, + "learning_rate": 3.1797002618521435e-05, + "loss": 0.155, + "step": 26413 + }, + { + "epoch": 0.4711233189455285, + "grad_norm": 0.27491217851638794, + "learning_rate": 3.17955047296395e-05, + "loss": 0.1553, + "step": 26414 + }, + { + "epoch": 0.4711411550672422, + "grad_norm": 0.2087952196598053, + "learning_rate": 3.179400681441585e-05, + "loss": 0.1353, + "step": 26415 + }, + { + "epoch": 0.4711589911889559, + "grad_norm": 0.24017567932605743, + "learning_rate": 3.179250887285628e-05, + "loss": 0.1526, + "step": 26416 + }, + { + "epoch": 0.47117682731066957, + "grad_norm": 0.3256704807281494, + "learning_rate": 3.179101090496659e-05, + "loss": 0.1681, + "step": 26417 + }, + { + "epoch": 0.47119466343238325, + "grad_norm": 0.2272716909646988, + "learning_rate": 3.17895129107526e-05, + "loss": 0.1494, + "step": 26418 + }, + { + "epoch": 0.47121249955409694, + "grad_norm": 0.28907379508018494, + "learning_rate": 3.1788014890220114e-05, + "loss": 0.1806, + "step": 26419 + }, + { + "epoch": 0.47123033567581063, + "grad_norm": 0.3094012141227722, + "learning_rate": 3.178651684337494e-05, + "loss": 0.1529, + "step": 26420 + }, + { + "epoch": 0.4712481717975244, + "grad_norm": 0.27203044295310974, + "learning_rate": 3.178501877022288e-05, + "loss": 0.1896, + "step": 26421 + }, + { + "epoch": 0.47126600791923806, + "grad_norm": 0.19576877355575562, + "learning_rate": 3.178352067076975e-05, + "loss": 0.1271, + "step": 26422 + }, + { + "epoch": 0.47128384404095175, + "grad_norm": 0.3442360758781433, + "learning_rate": 3.1782022545021346e-05, + "loss": 0.1948, + "step": 26423 + }, + { + "epoch": 0.47130168016266544, + "grad_norm": 0.22480064630508423, + "learning_rate": 3.178052439298348e-05, + "loss": 0.1694, + "step": 26424 + }, + { + "epoch": 0.4713195162843791, + "grad_norm": 0.24323417246341705, + "learning_rate": 3.1779026214661953e-05, + "loss": 0.1354, + "step": 26425 + }, + { + "epoch": 0.4713373524060928, + "grad_norm": 0.28382396697998047, + "learning_rate": 3.1777528010062584e-05, + "loss": 0.1322, + "step": 26426 + }, + { + "epoch": 0.4713551885278065, + "grad_norm": 0.21106085181236267, + "learning_rate": 3.177602977919118e-05, + "loss": 0.1582, + "step": 26427 + }, + { + "epoch": 0.4713730246495202, + "grad_norm": 0.288219153881073, + "learning_rate": 3.177453152205354e-05, + "loss": 0.1253, + "step": 26428 + }, + { + "epoch": 0.4713908607712339, + "grad_norm": 0.28570500016212463, + "learning_rate": 3.177303323865548e-05, + "loss": 0.177, + "step": 26429 + }, + { + "epoch": 0.4714086968929476, + "grad_norm": 0.26425454020500183, + "learning_rate": 3.1771534929002804e-05, + "loss": 0.128, + "step": 26430 + }, + { + "epoch": 0.4714265330146613, + "grad_norm": 0.23358196020126343, + "learning_rate": 3.1770036593101314e-05, + "loss": 0.1125, + "step": 26431 + }, + { + "epoch": 0.471444369136375, + "grad_norm": 0.3212023079395294, + "learning_rate": 3.1768538230956824e-05, + "loss": 0.1415, + "step": 26432 + }, + { + "epoch": 0.4714622052580887, + "grad_norm": 0.37274155020713806, + "learning_rate": 3.1767039842575144e-05, + "loss": 0.1486, + "step": 26433 + }, + { + "epoch": 0.4714800413798024, + "grad_norm": 0.22081796824932098, + "learning_rate": 3.176554142796209e-05, + "loss": 0.122, + "step": 26434 + }, + { + "epoch": 0.47149787750151606, + "grad_norm": 0.2711513042449951, + "learning_rate": 3.1764042987123446e-05, + "loss": 0.1492, + "step": 26435 + }, + { + "epoch": 0.47151571362322975, + "grad_norm": 0.2879045307636261, + "learning_rate": 3.176254452006504e-05, + "loss": 0.1784, + "step": 26436 + }, + { + "epoch": 0.47153354974494344, + "grad_norm": 0.23209641873836517, + "learning_rate": 3.1761046026792684e-05, + "loss": 0.1635, + "step": 26437 + }, + { + "epoch": 0.4715513858666572, + "grad_norm": 0.3062979280948639, + "learning_rate": 3.1759547507312164e-05, + "loss": 0.2206, + "step": 26438 + }, + { + "epoch": 0.47156922198837087, + "grad_norm": 0.16119593381881714, + "learning_rate": 3.175804896162931e-05, + "loss": 0.1217, + "step": 26439 + }, + { + "epoch": 0.47158705811008456, + "grad_norm": 0.2624979317188263, + "learning_rate": 3.175655038974992e-05, + "loss": 0.1331, + "step": 26440 + }, + { + "epoch": 0.47160489423179824, + "grad_norm": 1.424967885017395, + "learning_rate": 3.175505179167982e-05, + "loss": 0.2486, + "step": 26441 + }, + { + "epoch": 0.47162273035351193, + "grad_norm": 0.2697075605392456, + "learning_rate": 3.1753553167424795e-05, + "loss": 0.1194, + "step": 26442 + }, + { + "epoch": 0.4716405664752256, + "grad_norm": 0.19480131566524506, + "learning_rate": 3.175205451699067e-05, + "loss": 0.1218, + "step": 26443 + }, + { + "epoch": 0.4716584025969393, + "grad_norm": 0.301111102104187, + "learning_rate": 3.175055584038325e-05, + "loss": 0.1177, + "step": 26444 + }, + { + "epoch": 0.471676238718653, + "grad_norm": 0.24969439208507538, + "learning_rate": 3.174905713760834e-05, + "loss": 0.1505, + "step": 26445 + }, + { + "epoch": 0.4716940748403667, + "grad_norm": 0.3286374509334564, + "learning_rate": 3.174755840867175e-05, + "loss": 0.2119, + "step": 26446 + }, + { + "epoch": 0.4717119109620804, + "grad_norm": 0.2736777067184448, + "learning_rate": 3.1746059653579294e-05, + "loss": 0.203, + "step": 26447 + }, + { + "epoch": 0.4717297470837941, + "grad_norm": 0.21302145719528198, + "learning_rate": 3.1744560872336794e-05, + "loss": 0.0926, + "step": 26448 + }, + { + "epoch": 0.4717475832055078, + "grad_norm": 0.31996867060661316, + "learning_rate": 3.174306206495003e-05, + "loss": 0.2061, + "step": 26449 + }, + { + "epoch": 0.4717654193272215, + "grad_norm": 0.28593701124191284, + "learning_rate": 3.1741563231424844e-05, + "loss": 0.1055, + "step": 26450 + }, + { + "epoch": 0.4717832554489352, + "grad_norm": 0.2543777525424957, + "learning_rate": 3.174006437176702e-05, + "loss": 0.159, + "step": 26451 + }, + { + "epoch": 0.47180109157064887, + "grad_norm": 0.3063381314277649, + "learning_rate": 3.173856548598239e-05, + "loss": 0.1834, + "step": 26452 + }, + { + "epoch": 0.47181892769236256, + "grad_norm": 0.28480780124664307, + "learning_rate": 3.173706657407673e-05, + "loss": 0.1639, + "step": 26453 + }, + { + "epoch": 0.47183676381407624, + "grad_norm": 0.2981882691383362, + "learning_rate": 3.173556763605589e-05, + "loss": 0.1694, + "step": 26454 + }, + { + "epoch": 0.47185459993579, + "grad_norm": 0.19655829668045044, + "learning_rate": 3.1734068671925654e-05, + "loss": 0.1196, + "step": 26455 + }, + { + "epoch": 0.4718724360575037, + "grad_norm": 0.26821285486221313, + "learning_rate": 3.1732569681691855e-05, + "loss": 0.1286, + "step": 26456 + }, + { + "epoch": 0.47189027217921736, + "grad_norm": 0.23773056268692017, + "learning_rate": 3.173107066536029e-05, + "loss": 0.1606, + "step": 26457 + }, + { + "epoch": 0.47190810830093105, + "grad_norm": 0.3065195083618164, + "learning_rate": 3.1729571622936763e-05, + "loss": 0.1725, + "step": 26458 + }, + { + "epoch": 0.47192594442264474, + "grad_norm": 0.39167124032974243, + "learning_rate": 3.1728072554427085e-05, + "loss": 0.1955, + "step": 26459 + }, + { + "epoch": 0.4719437805443584, + "grad_norm": 0.2635594606399536, + "learning_rate": 3.1726573459837085e-05, + "loss": 0.1336, + "step": 26460 + }, + { + "epoch": 0.4719616166660721, + "grad_norm": 0.19853445887565613, + "learning_rate": 3.172507433917256e-05, + "loss": 0.1697, + "step": 26461 + }, + { + "epoch": 0.4719794527877858, + "grad_norm": 0.24068090319633484, + "learning_rate": 3.172357519243932e-05, + "loss": 0.1265, + "step": 26462 + }, + { + "epoch": 0.47199728890949955, + "grad_norm": 0.3256964385509491, + "learning_rate": 3.172207601964319e-05, + "loss": 0.1572, + "step": 26463 + }, + { + "epoch": 0.47201512503121323, + "grad_norm": 0.2837182879447937, + "learning_rate": 3.172057682078996e-05, + "loss": 0.1719, + "step": 26464 + }, + { + "epoch": 0.4720329611529269, + "grad_norm": 0.219710111618042, + "learning_rate": 3.1719077595885466e-05, + "loss": 0.1571, + "step": 26465 + }, + { + "epoch": 0.4720507972746406, + "grad_norm": 0.31797677278518677, + "learning_rate": 3.1717578344935495e-05, + "loss": 0.166, + "step": 26466 + }, + { + "epoch": 0.4720686333963543, + "grad_norm": 0.26471418142318726, + "learning_rate": 3.171607906794587e-05, + "loss": 0.153, + "step": 26467 + }, + { + "epoch": 0.472086469518068, + "grad_norm": 0.2208862602710724, + "learning_rate": 3.1714579764922406e-05, + "loss": 0.1309, + "step": 26468 + }, + { + "epoch": 0.4721043056397817, + "grad_norm": 0.33635571599006653, + "learning_rate": 3.1713080435870915e-05, + "loss": 0.1682, + "step": 26469 + }, + { + "epoch": 0.47212214176149536, + "grad_norm": 0.2618339955806732, + "learning_rate": 3.171158108079721e-05, + "loss": 0.119, + "step": 26470 + }, + { + "epoch": 0.47213997788320905, + "grad_norm": 0.26608848571777344, + "learning_rate": 3.171008169970709e-05, + "loss": 0.1462, + "step": 26471 + }, + { + "epoch": 0.4721578140049228, + "grad_norm": 0.264061838388443, + "learning_rate": 3.170858229260638e-05, + "loss": 0.1114, + "step": 26472 + }, + { + "epoch": 0.4721756501266365, + "grad_norm": 0.20839554071426392, + "learning_rate": 3.1707082859500883e-05, + "loss": 0.1441, + "step": 26473 + }, + { + "epoch": 0.47219348624835017, + "grad_norm": 0.28644636273384094, + "learning_rate": 3.170558340039642e-05, + "loss": 0.1158, + "step": 26474 + }, + { + "epoch": 0.47221132237006386, + "grad_norm": 0.22905872762203217, + "learning_rate": 3.17040839152988e-05, + "loss": 0.1214, + "step": 26475 + }, + { + "epoch": 0.47222915849177755, + "grad_norm": 0.2504383325576782, + "learning_rate": 3.170258440421384e-05, + "loss": 0.1088, + "step": 26476 + }, + { + "epoch": 0.47224699461349123, + "grad_norm": 0.27762576937675476, + "learning_rate": 3.1701084867147334e-05, + "loss": 0.1568, + "step": 26477 + }, + { + "epoch": 0.4722648307352049, + "grad_norm": 0.2974173128604889, + "learning_rate": 3.1699585304105126e-05, + "loss": 0.1722, + "step": 26478 + }, + { + "epoch": 0.4722826668569186, + "grad_norm": 0.1970190703868866, + "learning_rate": 3.1698085715092996e-05, + "loss": 0.1147, + "step": 26479 + }, + { + "epoch": 0.47230050297863235, + "grad_norm": 0.31518593430519104, + "learning_rate": 3.169658610011678e-05, + "loss": 0.1229, + "step": 26480 + }, + { + "epoch": 0.47231833910034604, + "grad_norm": 0.33436256647109985, + "learning_rate": 3.1695086459182276e-05, + "loss": 0.1727, + "step": 26481 + }, + { + "epoch": 0.47233617522205973, + "grad_norm": 0.23115403950214386, + "learning_rate": 3.169358679229531e-05, + "loss": 0.1531, + "step": 26482 + }, + { + "epoch": 0.4723540113437734, + "grad_norm": 0.24127766489982605, + "learning_rate": 3.16920870994617e-05, + "loss": 0.1335, + "step": 26483 + }, + { + "epoch": 0.4723718474654871, + "grad_norm": 0.3136756122112274, + "learning_rate": 3.169058738068723e-05, + "loss": 0.1732, + "step": 26484 + }, + { + "epoch": 0.4723896835872008, + "grad_norm": 0.293433278799057, + "learning_rate": 3.168908763597775e-05, + "loss": 0.1296, + "step": 26485 + }, + { + "epoch": 0.4724075197089145, + "grad_norm": 0.32146790623664856, + "learning_rate": 3.168758786533904e-05, + "loss": 0.1448, + "step": 26486 + }, + { + "epoch": 0.47242535583062817, + "grad_norm": 0.3155902028083801, + "learning_rate": 3.168608806877694e-05, + "loss": 0.1718, + "step": 26487 + }, + { + "epoch": 0.47244319195234186, + "grad_norm": 0.3479894697666168, + "learning_rate": 3.168458824629725e-05, + "loss": 0.1304, + "step": 26488 + }, + { + "epoch": 0.4724610280740556, + "grad_norm": 0.24790723621845245, + "learning_rate": 3.168308839790579e-05, + "loss": 0.1311, + "step": 26489 + }, + { + "epoch": 0.4724788641957693, + "grad_norm": 0.2899687886238098, + "learning_rate": 3.1681588523608364e-05, + "loss": 0.1745, + "step": 26490 + }, + { + "epoch": 0.472496700317483, + "grad_norm": 0.27624866366386414, + "learning_rate": 3.1680088623410803e-05, + "loss": 0.1714, + "step": 26491 + }, + { + "epoch": 0.47251453643919666, + "grad_norm": 0.22364988923072815, + "learning_rate": 3.1678588697318904e-05, + "loss": 0.0864, + "step": 26492 + }, + { + "epoch": 0.47253237256091035, + "grad_norm": 0.3170121908187866, + "learning_rate": 3.167708874533849e-05, + "loss": 0.1783, + "step": 26493 + }, + { + "epoch": 0.47255020868262404, + "grad_norm": 0.35619670152664185, + "learning_rate": 3.167558876747537e-05, + "loss": 0.1507, + "step": 26494 + }, + { + "epoch": 0.47256804480433773, + "grad_norm": 0.2822248637676239, + "learning_rate": 3.167408876373537e-05, + "loss": 0.1183, + "step": 26495 + }, + { + "epoch": 0.4725858809260514, + "grad_norm": 0.3348664343357086, + "learning_rate": 3.1672588734124295e-05, + "loss": 0.1543, + "step": 26496 + }, + { + "epoch": 0.47260371704776516, + "grad_norm": 0.38687536120414734, + "learning_rate": 3.167108867864796e-05, + "loss": 0.144, + "step": 26497 + }, + { + "epoch": 0.47262155316947885, + "grad_norm": 0.28511950373649597, + "learning_rate": 3.1669588597312184e-05, + "loss": 0.1702, + "step": 26498 + }, + { + "epoch": 0.47263938929119254, + "grad_norm": 0.17796646058559418, + "learning_rate": 3.166808849012277e-05, + "loss": 0.0965, + "step": 26499 + }, + { + "epoch": 0.4726572254129062, + "grad_norm": 0.33366072177886963, + "learning_rate": 3.1666588357085556e-05, + "loss": 0.165, + "step": 26500 + }, + { + "epoch": 0.4726750615346199, + "grad_norm": 0.23638556897640228, + "learning_rate": 3.166508819820633e-05, + "loss": 0.1254, + "step": 26501 + }, + { + "epoch": 0.4726928976563336, + "grad_norm": 0.2803076207637787, + "learning_rate": 3.1663588013490924e-05, + "loss": 0.1253, + "step": 26502 + }, + { + "epoch": 0.4727107337780473, + "grad_norm": 0.3209938704967499, + "learning_rate": 3.166208780294515e-05, + "loss": 0.2013, + "step": 26503 + }, + { + "epoch": 0.472728569899761, + "grad_norm": 0.2803567945957184, + "learning_rate": 3.166058756657483e-05, + "loss": 0.1051, + "step": 26504 + }, + { + "epoch": 0.47274640602147466, + "grad_norm": 0.18961292505264282, + "learning_rate": 3.1659087304385756e-05, + "loss": 0.1107, + "step": 26505 + }, + { + "epoch": 0.4727642421431884, + "grad_norm": 0.2234373539686203, + "learning_rate": 3.1657587016383776e-05, + "loss": 0.123, + "step": 26506 + }, + { + "epoch": 0.4727820782649021, + "grad_norm": 0.2961442470550537, + "learning_rate": 3.165608670257468e-05, + "loss": 0.1509, + "step": 26507 + }, + { + "epoch": 0.4727999143866158, + "grad_norm": 0.20827454328536987, + "learning_rate": 3.165458636296429e-05, + "loss": 0.1165, + "step": 26508 + }, + { + "epoch": 0.47281775050832947, + "grad_norm": 0.23899710178375244, + "learning_rate": 3.165308599755843e-05, + "loss": 0.1725, + "step": 26509 + }, + { + "epoch": 0.47283558663004316, + "grad_norm": 0.2794351577758789, + "learning_rate": 3.1651585606362914e-05, + "loss": 0.1591, + "step": 26510 + }, + { + "epoch": 0.47285342275175685, + "grad_norm": 0.217891663312912, + "learning_rate": 3.165008518938355e-05, + "loss": 0.1397, + "step": 26511 + }, + { + "epoch": 0.47287125887347053, + "grad_norm": 0.31379568576812744, + "learning_rate": 3.1648584746626167e-05, + "loss": 0.1559, + "step": 26512 + }, + { + "epoch": 0.4728890949951842, + "grad_norm": 0.20858775079250336, + "learning_rate": 3.164708427809657e-05, + "loss": 0.1375, + "step": 26513 + }, + { + "epoch": 0.47290693111689797, + "grad_norm": 0.17133578658103943, + "learning_rate": 3.164558378380057e-05, + "loss": 0.1072, + "step": 26514 + }, + { + "epoch": 0.47292476723861165, + "grad_norm": 0.27931275963783264, + "learning_rate": 3.1644083263743994e-05, + "loss": 0.1604, + "step": 26515 + }, + { + "epoch": 0.47294260336032534, + "grad_norm": 0.28538671135902405, + "learning_rate": 3.1642582717932656e-05, + "loss": 0.2105, + "step": 26516 + }, + { + "epoch": 0.47296043948203903, + "grad_norm": 0.23764726519584656, + "learning_rate": 3.164108214637238e-05, + "loss": 0.1574, + "step": 26517 + }, + { + "epoch": 0.4729782756037527, + "grad_norm": 0.25231626629829407, + "learning_rate": 3.163958154906898e-05, + "loss": 0.1755, + "step": 26518 + }, + { + "epoch": 0.4729961117254664, + "grad_norm": 0.29099997878074646, + "learning_rate": 3.163808092602826e-05, + "loss": 0.1405, + "step": 26519 + }, + { + "epoch": 0.4730139478471801, + "grad_norm": 0.24946630001068115, + "learning_rate": 3.163658027725604e-05, + "loss": 0.1338, + "step": 26520 + }, + { + "epoch": 0.4730317839688938, + "grad_norm": 0.3589918613433838, + "learning_rate": 3.163507960275815e-05, + "loss": 0.1053, + "step": 26521 + }, + { + "epoch": 0.4730496200906075, + "grad_norm": 0.21702727675437927, + "learning_rate": 3.16335789025404e-05, + "loss": 0.1338, + "step": 26522 + }, + { + "epoch": 0.4730674562123212, + "grad_norm": 0.23356683552265167, + "learning_rate": 3.1632078176608604e-05, + "loss": 0.1392, + "step": 26523 + }, + { + "epoch": 0.4730852923340349, + "grad_norm": 0.22037291526794434, + "learning_rate": 3.1630577424968585e-05, + "loss": 0.0898, + "step": 26524 + }, + { + "epoch": 0.4731031284557486, + "grad_norm": 0.34863466024398804, + "learning_rate": 3.162907664762616e-05, + "loss": 0.2478, + "step": 26525 + }, + { + "epoch": 0.4731209645774623, + "grad_norm": 0.3291824162006378, + "learning_rate": 3.162757584458714e-05, + "loss": 0.1611, + "step": 26526 + }, + { + "epoch": 0.47313880069917597, + "grad_norm": 0.24684789776802063, + "learning_rate": 3.162607501585735e-05, + "loss": 0.1359, + "step": 26527 + }, + { + "epoch": 0.47315663682088965, + "grad_norm": 0.23944151401519775, + "learning_rate": 3.1624574161442614e-05, + "loss": 0.1329, + "step": 26528 + }, + { + "epoch": 0.47317447294260334, + "grad_norm": 0.22520282864570618, + "learning_rate": 3.162307328134872e-05, + "loss": 0.1717, + "step": 26529 + }, + { + "epoch": 0.47319230906431703, + "grad_norm": 0.29841431975364685, + "learning_rate": 3.1621572375581516e-05, + "loss": 0.1314, + "step": 26530 + }, + { + "epoch": 0.4732101451860308, + "grad_norm": 0.3050844371318817, + "learning_rate": 3.162007144414681e-05, + "loss": 0.1835, + "step": 26531 + }, + { + "epoch": 0.47322798130774446, + "grad_norm": 0.2689076364040375, + "learning_rate": 3.161857048705042e-05, + "loss": 0.18, + "step": 26532 + }, + { + "epoch": 0.47324581742945815, + "grad_norm": 0.29678064584732056, + "learning_rate": 3.161706950429817e-05, + "loss": 0.1353, + "step": 26533 + }, + { + "epoch": 0.47326365355117184, + "grad_norm": 0.30316436290740967, + "learning_rate": 3.161556849589587e-05, + "loss": 0.2003, + "step": 26534 + }, + { + "epoch": 0.4732814896728855, + "grad_norm": 0.26531943678855896, + "learning_rate": 3.161406746184934e-05, + "loss": 0.1264, + "step": 26535 + }, + { + "epoch": 0.4732993257945992, + "grad_norm": 0.33824265003204346, + "learning_rate": 3.1612566402164396e-05, + "loss": 0.1212, + "step": 26536 + }, + { + "epoch": 0.4733171619163129, + "grad_norm": 0.2977240979671478, + "learning_rate": 3.161106531684687e-05, + "loss": 0.1368, + "step": 26537 + }, + { + "epoch": 0.4733349980380266, + "grad_norm": 0.37242844700813293, + "learning_rate": 3.160956420590257e-05, + "loss": 0.1722, + "step": 26538 + }, + { + "epoch": 0.47335283415974033, + "grad_norm": 0.21689192950725555, + "learning_rate": 3.160806306933731e-05, + "loss": 0.1159, + "step": 26539 + }, + { + "epoch": 0.473370670281454, + "grad_norm": 0.25579482316970825, + "learning_rate": 3.1606561907156914e-05, + "loss": 0.1159, + "step": 26540 + }, + { + "epoch": 0.4733885064031677, + "grad_norm": 0.1851266771554947, + "learning_rate": 3.160506071936721e-05, + "loss": 0.1217, + "step": 26541 + }, + { + "epoch": 0.4734063425248814, + "grad_norm": 0.3675920367240906, + "learning_rate": 3.1603559505974004e-05, + "loss": 0.1947, + "step": 26542 + }, + { + "epoch": 0.4734241786465951, + "grad_norm": 0.24690604209899902, + "learning_rate": 3.1602058266983123e-05, + "loss": 0.1729, + "step": 26543 + }, + { + "epoch": 0.47344201476830877, + "grad_norm": 0.2714647352695465, + "learning_rate": 3.160055700240038e-05, + "loss": 0.1922, + "step": 26544 + }, + { + "epoch": 0.47345985089002246, + "grad_norm": 0.2930722236633301, + "learning_rate": 3.159905571223161e-05, + "loss": 0.1539, + "step": 26545 + }, + { + "epoch": 0.47347768701173615, + "grad_norm": 0.23162326216697693, + "learning_rate": 3.159755439648261e-05, + "loss": 0.1746, + "step": 26546 + }, + { + "epoch": 0.47349552313344984, + "grad_norm": 0.29259929060935974, + "learning_rate": 3.159605305515921e-05, + "loss": 0.146, + "step": 26547 + }, + { + "epoch": 0.4735133592551636, + "grad_norm": 0.268196165561676, + "learning_rate": 3.1594551688267236e-05, + "loss": 0.1618, + "step": 26548 + }, + { + "epoch": 0.47353119537687727, + "grad_norm": 0.26541832089424133, + "learning_rate": 3.15930502958125e-05, + "loss": 0.2037, + "step": 26549 + }, + { + "epoch": 0.47354903149859096, + "grad_norm": 0.38825973868370056, + "learning_rate": 3.1591548877800825e-05, + "loss": 0.1796, + "step": 26550 + }, + { + "epoch": 0.47356686762030464, + "grad_norm": 0.24469010531902313, + "learning_rate": 3.159004743423802e-05, + "loss": 0.1685, + "step": 26551 + }, + { + "epoch": 0.47358470374201833, + "grad_norm": 0.36846232414245605, + "learning_rate": 3.158854596512993e-05, + "loss": 0.1688, + "step": 26552 + }, + { + "epoch": 0.473602539863732, + "grad_norm": 0.30020350217819214, + "learning_rate": 3.158704447048235e-05, + "loss": 0.1917, + "step": 26553 + }, + { + "epoch": 0.4736203759854457, + "grad_norm": 0.23571336269378662, + "learning_rate": 3.1585542950301116e-05, + "loss": 0.1888, + "step": 26554 + }, + { + "epoch": 0.4736382121071594, + "grad_norm": 0.34698784351348877, + "learning_rate": 3.158404140459204e-05, + "loss": 0.1488, + "step": 26555 + }, + { + "epoch": 0.47365604822887314, + "grad_norm": 0.2997521460056305, + "learning_rate": 3.1582539833360946e-05, + "loss": 0.1726, + "step": 26556 + }, + { + "epoch": 0.4736738843505868, + "grad_norm": 0.25175920128822327, + "learning_rate": 3.158103823661366e-05, + "loss": 0.1172, + "step": 26557 + }, + { + "epoch": 0.4736917204723005, + "grad_norm": 0.2800270617008209, + "learning_rate": 3.157953661435599e-05, + "loss": 0.1829, + "step": 26558 + }, + { + "epoch": 0.4737095565940142, + "grad_norm": 0.23580262064933777, + "learning_rate": 3.1578034966593756e-05, + "loss": 0.1397, + "step": 26559 + }, + { + "epoch": 0.4737273927157279, + "grad_norm": 0.2947693467140198, + "learning_rate": 3.15765332933328e-05, + "loss": 0.1425, + "step": 26560 + }, + { + "epoch": 0.4737452288374416, + "grad_norm": 0.21910950541496277, + "learning_rate": 3.157503159457892e-05, + "loss": 0.1117, + "step": 26561 + }, + { + "epoch": 0.47376306495915527, + "grad_norm": 0.2973783314228058, + "learning_rate": 3.157352987033796e-05, + "loss": 0.1351, + "step": 26562 + }, + { + "epoch": 0.47378090108086895, + "grad_norm": 0.23400671780109406, + "learning_rate": 3.1572028120615715e-05, + "loss": 0.2029, + "step": 26563 + }, + { + "epoch": 0.47379873720258264, + "grad_norm": 0.24722535908222198, + "learning_rate": 3.157052634541802e-05, + "loss": 0.1164, + "step": 26564 + }, + { + "epoch": 0.4738165733242964, + "grad_norm": 0.25167903304100037, + "learning_rate": 3.15690245447507e-05, + "loss": 0.1333, + "step": 26565 + }, + { + "epoch": 0.4738344094460101, + "grad_norm": 0.2707163989543915, + "learning_rate": 3.156752271861957e-05, + "loss": 0.1164, + "step": 26566 + }, + { + "epoch": 0.47385224556772376, + "grad_norm": 0.24458569288253784, + "learning_rate": 3.156602086703046e-05, + "loss": 0.1346, + "step": 26567 + }, + { + "epoch": 0.47387008168943745, + "grad_norm": 0.24327470362186432, + "learning_rate": 3.156451898998917e-05, + "loss": 0.1602, + "step": 26568 + }, + { + "epoch": 0.47388791781115114, + "grad_norm": 0.27301260828971863, + "learning_rate": 3.1563017087501555e-05, + "loss": 0.1544, + "step": 26569 + }, + { + "epoch": 0.4739057539328648, + "grad_norm": 0.24105463922023773, + "learning_rate": 3.1561515159573405e-05, + "loss": 0.1448, + "step": 26570 + }, + { + "epoch": 0.4739235900545785, + "grad_norm": 0.25364959239959717, + "learning_rate": 3.156001320621056e-05, + "loss": 0.1246, + "step": 26571 + }, + { + "epoch": 0.4739414261762922, + "grad_norm": 0.3247831165790558, + "learning_rate": 3.155851122741884e-05, + "loss": 0.1936, + "step": 26572 + }, + { + "epoch": 0.47395926229800595, + "grad_norm": 0.2784326374530792, + "learning_rate": 3.1557009223204065e-05, + "loss": 0.1411, + "step": 26573 + }, + { + "epoch": 0.47397709841971963, + "grad_norm": 0.5130375623703003, + "learning_rate": 3.155550719357205e-05, + "loss": 0.2661, + "step": 26574 + }, + { + "epoch": 0.4739949345414333, + "grad_norm": 0.21962743997573853, + "learning_rate": 3.1554005138528644e-05, + "loss": 0.1297, + "step": 26575 + }, + { + "epoch": 0.474012770663147, + "grad_norm": 0.23967252671718597, + "learning_rate": 3.155250305807964e-05, + "loss": 0.1141, + "step": 26576 + }, + { + "epoch": 0.4740306067848607, + "grad_norm": 0.2910381853580475, + "learning_rate": 3.155100095223087e-05, + "loss": 0.1637, + "step": 26577 + }, + { + "epoch": 0.4740484429065744, + "grad_norm": 0.3584441840648651, + "learning_rate": 3.1549498820988156e-05, + "loss": 0.2059, + "step": 26578 + }, + { + "epoch": 0.4740662790282881, + "grad_norm": 0.2497677505016327, + "learning_rate": 3.1547996664357315e-05, + "loss": 0.1633, + "step": 26579 + }, + { + "epoch": 0.47408411515000176, + "grad_norm": 0.22797313332557678, + "learning_rate": 3.154649448234419e-05, + "loss": 0.1593, + "step": 26580 + }, + { + "epoch": 0.4741019512717155, + "grad_norm": 0.2294081747531891, + "learning_rate": 3.154499227495459e-05, + "loss": 0.132, + "step": 26581 + }, + { + "epoch": 0.4741197873934292, + "grad_norm": 0.23380063474178314, + "learning_rate": 3.1543490042194336e-05, + "loss": 0.1664, + "step": 26582 + }, + { + "epoch": 0.4741376235151429, + "grad_norm": 0.26418888568878174, + "learning_rate": 3.1541987784069256e-05, + "loss": 0.1391, + "step": 26583 + }, + { + "epoch": 0.47415545963685657, + "grad_norm": 0.2400120198726654, + "learning_rate": 3.154048550058517e-05, + "loss": 0.1343, + "step": 26584 + }, + { + "epoch": 0.47417329575857026, + "grad_norm": 0.3527982532978058, + "learning_rate": 3.15389831917479e-05, + "loss": 0.196, + "step": 26585 + }, + { + "epoch": 0.47419113188028394, + "grad_norm": 0.2657168507575989, + "learning_rate": 3.153748085756328e-05, + "loss": 0.1348, + "step": 26586 + }, + { + "epoch": 0.47420896800199763, + "grad_norm": 0.25882789492607117, + "learning_rate": 3.153597849803712e-05, + "loss": 0.1737, + "step": 26587 + }, + { + "epoch": 0.4742268041237113, + "grad_norm": 0.2524459958076477, + "learning_rate": 3.153447611317526e-05, + "loss": 0.1738, + "step": 26588 + }, + { + "epoch": 0.474244640245425, + "grad_norm": 0.3235996961593628, + "learning_rate": 3.1532973702983506e-05, + "loss": 0.1502, + "step": 26589 + }, + { + "epoch": 0.47426247636713875, + "grad_norm": 0.24306495487689972, + "learning_rate": 3.153147126746769e-05, + "loss": 0.1299, + "step": 26590 + }, + { + "epoch": 0.47428031248885244, + "grad_norm": 0.22534674406051636, + "learning_rate": 3.152996880663363e-05, + "loss": 0.1479, + "step": 26591 + }, + { + "epoch": 0.47429814861056613, + "grad_norm": 0.20552285015583038, + "learning_rate": 3.152846632048716e-05, + "loss": 0.0877, + "step": 26592 + }, + { + "epoch": 0.4743159847322798, + "grad_norm": 0.24186621606349945, + "learning_rate": 3.1526963809034104e-05, + "loss": 0.217, + "step": 26593 + }, + { + "epoch": 0.4743338208539935, + "grad_norm": 0.29868051409721375, + "learning_rate": 3.1525461272280274e-05, + "loss": 0.1163, + "step": 26594 + }, + { + "epoch": 0.4743516569757072, + "grad_norm": 0.3221137523651123, + "learning_rate": 3.152395871023152e-05, + "loss": 0.0844, + "step": 26595 + }, + { + "epoch": 0.4743694930974209, + "grad_norm": 0.3390181064605713, + "learning_rate": 3.1522456122893627e-05, + "loss": 0.0881, + "step": 26596 + }, + { + "epoch": 0.47438732921913457, + "grad_norm": 0.29285022616386414, + "learning_rate": 3.152095351027246e-05, + "loss": 0.1522, + "step": 26597 + }, + { + "epoch": 0.4744051653408483, + "grad_norm": 1.3998613357543945, + "learning_rate": 3.151945087237381e-05, + "loss": 0.1752, + "step": 26598 + }, + { + "epoch": 0.474423001462562, + "grad_norm": 0.2050648182630539, + "learning_rate": 3.151794820920352e-05, + "loss": 0.1065, + "step": 26599 + }, + { + "epoch": 0.4744408375842757, + "grad_norm": 0.23183481395244598, + "learning_rate": 3.151644552076741e-05, + "loss": 0.1425, + "step": 26600 + }, + { + "epoch": 0.4744586737059894, + "grad_norm": 0.31935393810272217, + "learning_rate": 3.1514942807071315e-05, + "loss": 0.2051, + "step": 26601 + }, + { + "epoch": 0.47447650982770306, + "grad_norm": 0.29622718691825867, + "learning_rate": 3.1513440068121044e-05, + "loss": 0.1657, + "step": 26602 + }, + { + "epoch": 0.47449434594941675, + "grad_norm": 0.23380009829998016, + "learning_rate": 3.1511937303922435e-05, + "loss": 0.1634, + "step": 26603 + }, + { + "epoch": 0.47451218207113044, + "grad_norm": 0.4176250100135803, + "learning_rate": 3.15104345144813e-05, + "loss": 0.1935, + "step": 26604 + }, + { + "epoch": 0.4745300181928441, + "grad_norm": 0.24143612384796143, + "learning_rate": 3.150893169980348e-05, + "loss": 0.0811, + "step": 26605 + }, + { + "epoch": 0.4745478543145578, + "grad_norm": 0.27351275086402893, + "learning_rate": 3.150742885989478e-05, + "loss": 0.1289, + "step": 26606 + }, + { + "epoch": 0.47456569043627156, + "grad_norm": 0.2599388360977173, + "learning_rate": 3.150592599476105e-05, + "loss": 0.1583, + "step": 26607 + }, + { + "epoch": 0.47458352655798525, + "grad_norm": 0.35098227858543396, + "learning_rate": 3.150442310440811e-05, + "loss": 0.1778, + "step": 26608 + }, + { + "epoch": 0.47460136267969893, + "grad_norm": 0.2600686252117157, + "learning_rate": 3.150292018884177e-05, + "loss": 0.1183, + "step": 26609 + }, + { + "epoch": 0.4746191988014126, + "grad_norm": 0.37164050340652466, + "learning_rate": 3.1501417248067865e-05, + "loss": 0.189, + "step": 26610 + }, + { + "epoch": 0.4746370349231263, + "grad_norm": 0.26129043102264404, + "learning_rate": 3.149991428209222e-05, + "loss": 0.1572, + "step": 26611 + }, + { + "epoch": 0.47465487104484, + "grad_norm": 0.2669675350189209, + "learning_rate": 3.149841129092066e-05, + "loss": 0.0944, + "step": 26612 + }, + { + "epoch": 0.4746727071665537, + "grad_norm": 0.2776889204978943, + "learning_rate": 3.1496908274559016e-05, + "loss": 0.1492, + "step": 26613 + }, + { + "epoch": 0.4746905432882674, + "grad_norm": 0.424248069524765, + "learning_rate": 3.149540523301312e-05, + "loss": 0.1835, + "step": 26614 + }, + { + "epoch": 0.4747083794099811, + "grad_norm": 0.29181039333343506, + "learning_rate": 3.149390216628879e-05, + "loss": 0.1233, + "step": 26615 + }, + { + "epoch": 0.4747262155316948, + "grad_norm": 0.20913194119930267, + "learning_rate": 3.1492399074391835e-05, + "loss": 0.1525, + "step": 26616 + }, + { + "epoch": 0.4747440516534085, + "grad_norm": 0.2814866602420807, + "learning_rate": 3.1490895957328116e-05, + "loss": 0.1444, + "step": 26617 + }, + { + "epoch": 0.4747618877751222, + "grad_norm": 0.22930529713630676, + "learning_rate": 3.148939281510343e-05, + "loss": 0.0575, + "step": 26618 + }, + { + "epoch": 0.47477972389683587, + "grad_norm": 0.2853902280330658, + "learning_rate": 3.1487889647723625e-05, + "loss": 0.1244, + "step": 26619 + }, + { + "epoch": 0.47479756001854956, + "grad_norm": 0.24450060725212097, + "learning_rate": 3.1486386455194515e-05, + "loss": 0.1376, + "step": 26620 + }, + { + "epoch": 0.47481539614026325, + "grad_norm": 0.5030564069747925, + "learning_rate": 3.148488323752193e-05, + "loss": 0.1536, + "step": 26621 + }, + { + "epoch": 0.47483323226197693, + "grad_norm": 0.21783877909183502, + "learning_rate": 3.1483379994711705e-05, + "loss": 0.1256, + "step": 26622 + }, + { + "epoch": 0.4748510683836907, + "grad_norm": 0.20736421644687653, + "learning_rate": 3.148187672676965e-05, + "loss": 0.1254, + "step": 26623 + }, + { + "epoch": 0.47486890450540437, + "grad_norm": 0.36123204231262207, + "learning_rate": 3.14803734337016e-05, + "loss": 0.1179, + "step": 26624 + }, + { + "epoch": 0.47488674062711805, + "grad_norm": 0.36821067333221436, + "learning_rate": 3.1478870115513406e-05, + "loss": 0.112, + "step": 26625 + }, + { + "epoch": 0.47490457674883174, + "grad_norm": 0.29902219772338867, + "learning_rate": 3.1477366772210846e-05, + "loss": 0.1734, + "step": 26626 + }, + { + "epoch": 0.47492241287054543, + "grad_norm": 0.27828463912010193, + "learning_rate": 3.147586340379979e-05, + "loss": 0.1618, + "step": 26627 + }, + { + "epoch": 0.4749402489922591, + "grad_norm": 0.31777480244636536, + "learning_rate": 3.147436001028605e-05, + "loss": 0.1793, + "step": 26628 + }, + { + "epoch": 0.4749580851139728, + "grad_norm": 0.25407347083091736, + "learning_rate": 3.147285659167545e-05, + "loss": 0.1175, + "step": 26629 + }, + { + "epoch": 0.4749759212356865, + "grad_norm": 0.2659320831298828, + "learning_rate": 3.147135314797383e-05, + "loss": 0.1676, + "step": 26630 + }, + { + "epoch": 0.4749937573574002, + "grad_norm": 0.30362290143966675, + "learning_rate": 3.1469849679187e-05, + "loss": 0.1823, + "step": 26631 + }, + { + "epoch": 0.4750115934791139, + "grad_norm": 0.23539355397224426, + "learning_rate": 3.1468346185320804e-05, + "loss": 0.1205, + "step": 26632 + }, + { + "epoch": 0.4750294296008276, + "grad_norm": 0.2805580794811249, + "learning_rate": 3.146684266638107e-05, + "loss": 0.182, + "step": 26633 + }, + { + "epoch": 0.4750472657225413, + "grad_norm": 0.23611187934875488, + "learning_rate": 3.1465339122373606e-05, + "loss": 0.1365, + "step": 26634 + }, + { + "epoch": 0.475065101844255, + "grad_norm": 0.1864871382713318, + "learning_rate": 3.146383555330426e-05, + "loss": 0.1334, + "step": 26635 + }, + { + "epoch": 0.4750829379659687, + "grad_norm": 0.21819111704826355, + "learning_rate": 3.146233195917886e-05, + "loss": 0.1549, + "step": 26636 + }, + { + "epoch": 0.47510077408768236, + "grad_norm": 0.31759074330329895, + "learning_rate": 3.1460828340003225e-05, + "loss": 0.1889, + "step": 26637 + }, + { + "epoch": 0.47511861020939605, + "grad_norm": 0.30069538950920105, + "learning_rate": 3.145932469578319e-05, + "loss": 0.1707, + "step": 26638 + }, + { + "epoch": 0.47513644633110974, + "grad_norm": 0.2910155653953552, + "learning_rate": 3.145782102652458e-05, + "loss": 0.1578, + "step": 26639 + }, + { + "epoch": 0.4751542824528235, + "grad_norm": 0.2799510061740875, + "learning_rate": 3.145631733223323e-05, + "loss": 0.1176, + "step": 26640 + }, + { + "epoch": 0.4751721185745372, + "grad_norm": 0.21098117530345917, + "learning_rate": 3.1454813612914956e-05, + "loss": 0.1382, + "step": 26641 + }, + { + "epoch": 0.47518995469625086, + "grad_norm": 0.2942548394203186, + "learning_rate": 3.14533098685756e-05, + "loss": 0.1069, + "step": 26642 + }, + { + "epoch": 0.47520779081796455, + "grad_norm": 0.2770611345767975, + "learning_rate": 3.1451806099220984e-05, + "loss": 0.1661, + "step": 26643 + }, + { + "epoch": 0.47522562693967824, + "grad_norm": 0.18634948134422302, + "learning_rate": 3.145030230485694e-05, + "loss": 0.1143, + "step": 26644 + }, + { + "epoch": 0.4752434630613919, + "grad_norm": 0.2504843473434448, + "learning_rate": 3.14487984854893e-05, + "loss": 0.118, + "step": 26645 + }, + { + "epoch": 0.4752612991831056, + "grad_norm": 0.2074119597673416, + "learning_rate": 3.144729464112388e-05, + "loss": 0.0969, + "step": 26646 + }, + { + "epoch": 0.4752791353048193, + "grad_norm": 0.2745310664176941, + "learning_rate": 3.1445790771766524e-05, + "loss": 0.1811, + "step": 26647 + }, + { + "epoch": 0.475296971426533, + "grad_norm": 0.3306925296783447, + "learning_rate": 3.1444286877423054e-05, + "loss": 0.173, + "step": 26648 + }, + { + "epoch": 0.47531480754824673, + "grad_norm": 0.23517920076847076, + "learning_rate": 3.1442782958099316e-05, + "loss": 0.1278, + "step": 26649 + }, + { + "epoch": 0.4753326436699604, + "grad_norm": 0.25629228353500366, + "learning_rate": 3.144127901380111e-05, + "loss": 0.0984, + "step": 26650 + }, + { + "epoch": 0.4753504797916741, + "grad_norm": 0.2576800286769867, + "learning_rate": 3.143977504453429e-05, + "loss": 0.1631, + "step": 26651 + }, + { + "epoch": 0.4753683159133878, + "grad_norm": 0.22790028154850006, + "learning_rate": 3.143827105030467e-05, + "loss": 0.1534, + "step": 26652 + }, + { + "epoch": 0.4753861520351015, + "grad_norm": 0.34455347061157227, + "learning_rate": 3.14367670311181e-05, + "loss": 0.1249, + "step": 26653 + }, + { + "epoch": 0.47540398815681517, + "grad_norm": 0.3306369185447693, + "learning_rate": 3.143526298698039e-05, + "loss": 0.1035, + "step": 26654 + }, + { + "epoch": 0.47542182427852886, + "grad_norm": 0.519098162651062, + "learning_rate": 3.143375891789738e-05, + "loss": 0.1819, + "step": 26655 + }, + { + "epoch": 0.47543966040024255, + "grad_norm": 0.22022035717964172, + "learning_rate": 3.14322548238749e-05, + "loss": 0.1629, + "step": 26656 + }, + { + "epoch": 0.4754574965219563, + "grad_norm": 0.21873527765274048, + "learning_rate": 3.143075070491877e-05, + "loss": 0.1546, + "step": 26657 + }, + { + "epoch": 0.47547533264367, + "grad_norm": 0.24396440386772156, + "learning_rate": 3.142924656103484e-05, + "loss": 0.1383, + "step": 26658 + }, + { + "epoch": 0.47549316876538367, + "grad_norm": 0.2099212259054184, + "learning_rate": 3.1427742392228925e-05, + "loss": 0.1401, + "step": 26659 + }, + { + "epoch": 0.47551100488709735, + "grad_norm": 0.2894495725631714, + "learning_rate": 3.1426238198506865e-05, + "loss": 0.1842, + "step": 26660 + }, + { + "epoch": 0.47552884100881104, + "grad_norm": 0.24795302748680115, + "learning_rate": 3.1424733979874474e-05, + "loss": 0.1228, + "step": 26661 + }, + { + "epoch": 0.47554667713052473, + "grad_norm": 0.2640261650085449, + "learning_rate": 3.1423229736337607e-05, + "loss": 0.1527, + "step": 26662 + }, + { + "epoch": 0.4755645132522384, + "grad_norm": 0.1835392266511917, + "learning_rate": 3.142172546790208e-05, + "loss": 0.0815, + "step": 26663 + }, + { + "epoch": 0.4755823493739521, + "grad_norm": 0.22116541862487793, + "learning_rate": 3.142022117457372e-05, + "loss": 0.1137, + "step": 26664 + }, + { + "epoch": 0.4756001854956658, + "grad_norm": 0.37021613121032715, + "learning_rate": 3.141871685635837e-05, + "loss": 0.193, + "step": 26665 + }, + { + "epoch": 0.47561802161737954, + "grad_norm": 0.2605608105659485, + "learning_rate": 3.1417212513261865e-05, + "loss": 0.1421, + "step": 26666 + }, + { + "epoch": 0.4756358577390932, + "grad_norm": 0.32819122076034546, + "learning_rate": 3.141570814529001e-05, + "loss": 0.1668, + "step": 26667 + }, + { + "epoch": 0.4756536938608069, + "grad_norm": 0.25336000323295593, + "learning_rate": 3.141420375244867e-05, + "loss": 0.1502, + "step": 26668 + }, + { + "epoch": 0.4756715299825206, + "grad_norm": 0.29337984323501587, + "learning_rate": 3.1412699334743655e-05, + "loss": 0.1388, + "step": 26669 + }, + { + "epoch": 0.4756893661042343, + "grad_norm": 0.21162143349647522, + "learning_rate": 3.14111948921808e-05, + "loss": 0.1178, + "step": 26670 + }, + { + "epoch": 0.475707202225948, + "grad_norm": 0.28766298294067383, + "learning_rate": 3.140969042476595e-05, + "loss": 0.1829, + "step": 26671 + }, + { + "epoch": 0.47572503834766167, + "grad_norm": 0.23621641099452972, + "learning_rate": 3.1408185932504915e-05, + "loss": 0.1327, + "step": 26672 + }, + { + "epoch": 0.47574287446937535, + "grad_norm": 0.2442992776632309, + "learning_rate": 3.1406681415403545e-05, + "loss": 0.1749, + "step": 26673 + }, + { + "epoch": 0.4757607105910891, + "grad_norm": 0.25909215211868286, + "learning_rate": 3.1405176873467656e-05, + "loss": 0.1801, + "step": 26674 + }, + { + "epoch": 0.4757785467128028, + "grad_norm": 0.2717550992965698, + "learning_rate": 3.14036723067031e-05, + "loss": 0.2002, + "step": 26675 + }, + { + "epoch": 0.4757963828345165, + "grad_norm": 0.1897938847541809, + "learning_rate": 3.140216771511569e-05, + "loss": 0.1229, + "step": 26676 + }, + { + "epoch": 0.47581421895623016, + "grad_norm": 0.32422974705696106, + "learning_rate": 3.140066309871127e-05, + "loss": 0.1357, + "step": 26677 + }, + { + "epoch": 0.47583205507794385, + "grad_norm": 0.28003808856010437, + "learning_rate": 3.139915845749566e-05, + "loss": 0.1385, + "step": 26678 + }, + { + "epoch": 0.47584989119965754, + "grad_norm": 0.27112826704978943, + "learning_rate": 3.139765379147472e-05, + "loss": 0.2031, + "step": 26679 + }, + { + "epoch": 0.4758677273213712, + "grad_norm": 0.24170328676700592, + "learning_rate": 3.1396149100654257e-05, + "loss": 0.1545, + "step": 26680 + }, + { + "epoch": 0.4758855634430849, + "grad_norm": 0.3290562331676483, + "learning_rate": 3.13946443850401e-05, + "loss": 0.1675, + "step": 26681 + }, + { + "epoch": 0.47590339956479866, + "grad_norm": 0.3371567130088806, + "learning_rate": 3.13931396446381e-05, + "loss": 0.1702, + "step": 26682 + }, + { + "epoch": 0.47592123568651234, + "grad_norm": 0.279109925031662, + "learning_rate": 3.139163487945409e-05, + "loss": 0.1471, + "step": 26683 + }, + { + "epoch": 0.47593907180822603, + "grad_norm": 0.24332238733768463, + "learning_rate": 3.139013008949389e-05, + "loss": 0.16, + "step": 26684 + }, + { + "epoch": 0.4759569079299397, + "grad_norm": 0.2929777503013611, + "learning_rate": 3.138862527476334e-05, + "loss": 0.147, + "step": 26685 + }, + { + "epoch": 0.4759747440516534, + "grad_norm": 0.2846218943595886, + "learning_rate": 3.138712043526827e-05, + "loss": 0.1428, + "step": 26686 + }, + { + "epoch": 0.4759925801733671, + "grad_norm": 0.3117104768753052, + "learning_rate": 3.1385615571014516e-05, + "loss": 0.2169, + "step": 26687 + }, + { + "epoch": 0.4760104162950808, + "grad_norm": 0.29619720578193665, + "learning_rate": 3.1384110682007914e-05, + "loss": 0.1469, + "step": 26688 + }, + { + "epoch": 0.4760282524167945, + "grad_norm": 0.2650678753852844, + "learning_rate": 3.138260576825428e-05, + "loss": 0.1139, + "step": 26689 + }, + { + "epoch": 0.47604608853850816, + "grad_norm": 0.28708723187446594, + "learning_rate": 3.1381100829759476e-05, + "loss": 0.1748, + "step": 26690 + }, + { + "epoch": 0.4760639246602219, + "grad_norm": 0.3410675823688507, + "learning_rate": 3.137959586652931e-05, + "loss": 0.0811, + "step": 26691 + }, + { + "epoch": 0.4760817607819356, + "grad_norm": 0.2603628933429718, + "learning_rate": 3.137809087856964e-05, + "loss": 0.1051, + "step": 26692 + }, + { + "epoch": 0.4760995969036493, + "grad_norm": 0.31957879662513733, + "learning_rate": 3.137658586588628e-05, + "loss": 0.1106, + "step": 26693 + }, + { + "epoch": 0.47611743302536297, + "grad_norm": 0.5554287433624268, + "learning_rate": 3.137508082848507e-05, + "loss": 0.1919, + "step": 26694 + }, + { + "epoch": 0.47613526914707666, + "grad_norm": 0.2722870707511902, + "learning_rate": 3.137357576637184e-05, + "loss": 0.1869, + "step": 26695 + }, + { + "epoch": 0.47615310526879034, + "grad_norm": 0.2875826358795166, + "learning_rate": 3.137207067955243e-05, + "loss": 0.1314, + "step": 26696 + }, + { + "epoch": 0.47617094139050403, + "grad_norm": 0.22378547489643097, + "learning_rate": 3.1370565568032687e-05, + "loss": 0.1353, + "step": 26697 + }, + { + "epoch": 0.4761887775122177, + "grad_norm": 0.36931565403938293, + "learning_rate": 3.136906043181842e-05, + "loss": 0.165, + "step": 26698 + }, + { + "epoch": 0.47620661363393146, + "grad_norm": 0.24093963205814362, + "learning_rate": 3.136755527091548e-05, + "loss": 0.1887, + "step": 26699 + }, + { + "epoch": 0.47622444975564515, + "grad_norm": 0.2697581946849823, + "learning_rate": 3.1366050085329694e-05, + "loss": 0.1626, + "step": 26700 + }, + { + "epoch": 0.47624228587735884, + "grad_norm": 0.24962428212165833, + "learning_rate": 3.136454487506691e-05, + "loss": 0.183, + "step": 26701 + }, + { + "epoch": 0.4762601219990725, + "grad_norm": 0.3362979590892792, + "learning_rate": 3.136303964013293e-05, + "loss": 0.1822, + "step": 26702 + }, + { + "epoch": 0.4762779581207862, + "grad_norm": 0.2727450430393219, + "learning_rate": 3.136153438053362e-05, + "loss": 0.1804, + "step": 26703 + }, + { + "epoch": 0.4762957942424999, + "grad_norm": 0.21144120395183563, + "learning_rate": 3.1360029096274806e-05, + "loss": 0.1251, + "step": 26704 + }, + { + "epoch": 0.4763136303642136, + "grad_norm": 0.2462942749261856, + "learning_rate": 3.1358523787362327e-05, + "loss": 0.1693, + "step": 26705 + }, + { + "epoch": 0.4763314664859273, + "grad_norm": 0.3199939429759979, + "learning_rate": 3.135701845380201e-05, + "loss": 0.1641, + "step": 26706 + }, + { + "epoch": 0.47634930260764097, + "grad_norm": 0.34898555278778076, + "learning_rate": 3.13555130955997e-05, + "loss": 0.1611, + "step": 26707 + }, + { + "epoch": 0.4763671387293547, + "grad_norm": 0.2827635407447815, + "learning_rate": 3.135400771276122e-05, + "loss": 0.1526, + "step": 26708 + }, + { + "epoch": 0.4763849748510684, + "grad_norm": 0.2632668912410736, + "learning_rate": 3.1352502305292406e-05, + "loss": 0.1594, + "step": 26709 + }, + { + "epoch": 0.4764028109727821, + "grad_norm": 0.2850704789161682, + "learning_rate": 3.1350996873199104e-05, + "loss": 0.1348, + "step": 26710 + }, + { + "epoch": 0.4764206470944958, + "grad_norm": 0.3156528174877167, + "learning_rate": 3.134949141648715e-05, + "loss": 0.211, + "step": 26711 + }, + { + "epoch": 0.47643848321620946, + "grad_norm": 0.3261871039867401, + "learning_rate": 3.134798593516237e-05, + "loss": 0.1768, + "step": 26712 + }, + { + "epoch": 0.47645631933792315, + "grad_norm": 0.313679575920105, + "learning_rate": 3.13464804292306e-05, + "loss": 0.1972, + "step": 26713 + }, + { + "epoch": 0.47647415545963684, + "grad_norm": 0.3305988907814026, + "learning_rate": 3.134497489869769e-05, + "loss": 0.2131, + "step": 26714 + }, + { + "epoch": 0.4764919915813505, + "grad_norm": 0.2083570957183838, + "learning_rate": 3.1343469343569456e-05, + "loss": 0.1095, + "step": 26715 + }, + { + "epoch": 0.47650982770306427, + "grad_norm": 0.2725713551044464, + "learning_rate": 3.134196376385175e-05, + "loss": 0.1414, + "step": 26716 + }, + { + "epoch": 0.47652766382477796, + "grad_norm": 0.27093926072120667, + "learning_rate": 3.134045815955039e-05, + "loss": 0.1489, + "step": 26717 + }, + { + "epoch": 0.47654549994649165, + "grad_norm": 0.21418976783752441, + "learning_rate": 3.133895253067124e-05, + "loss": 0.1284, + "step": 26718 + }, + { + "epoch": 0.47656333606820533, + "grad_norm": 0.313117653131485, + "learning_rate": 3.133744687722011e-05, + "loss": 0.1493, + "step": 26719 + }, + { + "epoch": 0.476581172189919, + "grad_norm": 0.17176547646522522, + "learning_rate": 3.1335941199202855e-05, + "loss": 0.1314, + "step": 26720 + }, + { + "epoch": 0.4765990083116327, + "grad_norm": 0.3212798833847046, + "learning_rate": 3.13344354966253e-05, + "loss": 0.1568, + "step": 26721 + }, + { + "epoch": 0.4766168444333464, + "grad_norm": 0.2611295282840729, + "learning_rate": 3.133292976949328e-05, + "loss": 0.115, + "step": 26722 + }, + { + "epoch": 0.4766346805550601, + "grad_norm": 0.2615753412246704, + "learning_rate": 3.1331424017812644e-05, + "loss": 0.1556, + "step": 26723 + }, + { + "epoch": 0.47665251667677383, + "grad_norm": 0.33312904834747314, + "learning_rate": 3.1329918241589215e-05, + "loss": 0.151, + "step": 26724 + }, + { + "epoch": 0.4766703527984875, + "grad_norm": 0.2551664710044861, + "learning_rate": 3.132841244082885e-05, + "loss": 0.1592, + "step": 26725 + }, + { + "epoch": 0.4766881889202012, + "grad_norm": 0.27993300557136536, + "learning_rate": 3.1326906615537355e-05, + "loss": 0.1694, + "step": 26726 + }, + { + "epoch": 0.4767060250419149, + "grad_norm": 0.23331375420093536, + "learning_rate": 3.13254007657206e-05, + "loss": 0.1292, + "step": 26727 + }, + { + "epoch": 0.4767238611636286, + "grad_norm": 0.20556502044200897, + "learning_rate": 3.1323894891384396e-05, + "loss": 0.1375, + "step": 26728 + }, + { + "epoch": 0.47674169728534227, + "grad_norm": 0.30760133266448975, + "learning_rate": 3.1322388992534604e-05, + "loss": 0.1186, + "step": 26729 + }, + { + "epoch": 0.47675953340705596, + "grad_norm": 0.18481747806072235, + "learning_rate": 3.132088306917703e-05, + "loss": 0.1028, + "step": 26730 + }, + { + "epoch": 0.47677736952876965, + "grad_norm": 0.3166261911392212, + "learning_rate": 3.131937712131754e-05, + "loss": 0.1684, + "step": 26731 + }, + { + "epoch": 0.47679520565048333, + "grad_norm": 0.3804336190223694, + "learning_rate": 3.131787114896196e-05, + "loss": 0.1433, + "step": 26732 + }, + { + "epoch": 0.4768130417721971, + "grad_norm": 0.3028728663921356, + "learning_rate": 3.1316365152116135e-05, + "loss": 0.1548, + "step": 26733 + }, + { + "epoch": 0.47683087789391077, + "grad_norm": 0.3954489529132843, + "learning_rate": 3.1314859130785894e-05, + "loss": 0.1965, + "step": 26734 + }, + { + "epoch": 0.47684871401562445, + "grad_norm": 0.19537022709846497, + "learning_rate": 3.131335308497708e-05, + "loss": 0.1208, + "step": 26735 + }, + { + "epoch": 0.47686655013733814, + "grad_norm": 0.25616368651390076, + "learning_rate": 3.131184701469553e-05, + "loss": 0.163, + "step": 26736 + }, + { + "epoch": 0.47688438625905183, + "grad_norm": 0.2302490770816803, + "learning_rate": 3.131034091994707e-05, + "loss": 0.1329, + "step": 26737 + }, + { + "epoch": 0.4769022223807655, + "grad_norm": 0.28550970554351807, + "learning_rate": 3.130883480073755e-05, + "loss": 0.1659, + "step": 26738 + }, + { + "epoch": 0.4769200585024792, + "grad_norm": 0.29962217807769775, + "learning_rate": 3.130732865707281e-05, + "loss": 0.132, + "step": 26739 + }, + { + "epoch": 0.4769378946241929, + "grad_norm": 0.24919135868549347, + "learning_rate": 3.1305822488958694e-05, + "loss": 0.1409, + "step": 26740 + }, + { + "epoch": 0.47695573074590664, + "grad_norm": 0.2445518672466278, + "learning_rate": 3.130431629640103e-05, + "loss": 0.1575, + "step": 26741 + }, + { + "epoch": 0.4769735668676203, + "grad_norm": 0.31576380133628845, + "learning_rate": 3.1302810079405654e-05, + "loss": 0.1242, + "step": 26742 + }, + { + "epoch": 0.476991402989334, + "grad_norm": 0.36300721764564514, + "learning_rate": 3.1301303837978415e-05, + "loss": 0.1468, + "step": 26743 + }, + { + "epoch": 0.4770092391110477, + "grad_norm": 0.27243566513061523, + "learning_rate": 3.129979757212513e-05, + "loss": 0.168, + "step": 26744 + }, + { + "epoch": 0.4770270752327614, + "grad_norm": 0.29267990589141846, + "learning_rate": 3.1298291281851664e-05, + "loss": 0.1304, + "step": 26745 + }, + { + "epoch": 0.4770449113544751, + "grad_norm": 0.21837356686592102, + "learning_rate": 3.1296784967163854e-05, + "loss": 0.1917, + "step": 26746 + }, + { + "epoch": 0.47706274747618876, + "grad_norm": 0.22631166875362396, + "learning_rate": 3.129527862806753e-05, + "loss": 0.0898, + "step": 26747 + }, + { + "epoch": 0.47708058359790245, + "grad_norm": 0.20745904743671417, + "learning_rate": 3.1293772264568524e-05, + "loss": 0.0834, + "step": 26748 + }, + { + "epoch": 0.47709841971961614, + "grad_norm": 0.2255771905183792, + "learning_rate": 3.1292265876672686e-05, + "loss": 0.1478, + "step": 26749 + }, + { + "epoch": 0.4771162558413299, + "grad_norm": 0.28668999671936035, + "learning_rate": 3.1290759464385844e-05, + "loss": 0.1301, + "step": 26750 + }, + { + "epoch": 0.47713409196304357, + "grad_norm": 0.24045246839523315, + "learning_rate": 3.128925302771385e-05, + "loss": 0.1882, + "step": 26751 + }, + { + "epoch": 0.47715192808475726, + "grad_norm": 0.3463227450847626, + "learning_rate": 3.128774656666254e-05, + "loss": 0.156, + "step": 26752 + }, + { + "epoch": 0.47716976420647095, + "grad_norm": 0.28448840975761414, + "learning_rate": 3.128624008123775e-05, + "loss": 0.1414, + "step": 26753 + }, + { + "epoch": 0.47718760032818464, + "grad_norm": 0.283145546913147, + "learning_rate": 3.128473357144533e-05, + "loss": 0.1245, + "step": 26754 + }, + { + "epoch": 0.4772054364498983, + "grad_norm": 0.24987533688545227, + "learning_rate": 3.128322703729111e-05, + "loss": 0.1467, + "step": 26755 + }, + { + "epoch": 0.477223272571612, + "grad_norm": 0.3295406699180603, + "learning_rate": 3.128172047878093e-05, + "loss": 0.1493, + "step": 26756 + }, + { + "epoch": 0.4772411086933257, + "grad_norm": 0.3394148647785187, + "learning_rate": 3.1280213895920634e-05, + "loss": 0.1207, + "step": 26757 + }, + { + "epoch": 0.47725894481503944, + "grad_norm": 0.33238524198532104, + "learning_rate": 3.127870728871606e-05, + "loss": 0.1307, + "step": 26758 + }, + { + "epoch": 0.47727678093675313, + "grad_norm": 0.3225302994251251, + "learning_rate": 3.127720065717304e-05, + "loss": 0.1835, + "step": 26759 + }, + { + "epoch": 0.4772946170584668, + "grad_norm": 0.21729303896427155, + "learning_rate": 3.127569400129743e-05, + "loss": 0.1063, + "step": 26760 + }, + { + "epoch": 0.4773124531801805, + "grad_norm": 0.3240685760974884, + "learning_rate": 3.1274187321095056e-05, + "loss": 0.2023, + "step": 26761 + }, + { + "epoch": 0.4773302893018942, + "grad_norm": 0.3455943763256073, + "learning_rate": 3.127268061657177e-05, + "loss": 0.1667, + "step": 26762 + }, + { + "epoch": 0.4773481254236079, + "grad_norm": 0.21467365324497223, + "learning_rate": 3.127117388773341e-05, + "loss": 0.1464, + "step": 26763 + }, + { + "epoch": 0.47736596154532157, + "grad_norm": 0.27158910036087036, + "learning_rate": 3.126966713458581e-05, + "loss": 0.2106, + "step": 26764 + }, + { + "epoch": 0.47738379766703526, + "grad_norm": 0.22854214906692505, + "learning_rate": 3.126816035713481e-05, + "loss": 0.1252, + "step": 26765 + }, + { + "epoch": 0.47740163378874895, + "grad_norm": 0.28195810317993164, + "learning_rate": 3.126665355538626e-05, + "loss": 0.1365, + "step": 26766 + }, + { + "epoch": 0.4774194699104627, + "grad_norm": 0.2766095697879791, + "learning_rate": 3.126514672934599e-05, + "loss": 0.1624, + "step": 26767 + }, + { + "epoch": 0.4774373060321764, + "grad_norm": 0.2377602607011795, + "learning_rate": 3.126363987901986e-05, + "loss": 0.1708, + "step": 26768 + }, + { + "epoch": 0.47745514215389007, + "grad_norm": 0.23954246938228607, + "learning_rate": 3.1262133004413696e-05, + "loss": 0.1572, + "step": 26769 + }, + { + "epoch": 0.47747297827560375, + "grad_norm": 0.22584986686706543, + "learning_rate": 3.126062610553334e-05, + "loss": 0.1169, + "step": 26770 + }, + { + "epoch": 0.47749081439731744, + "grad_norm": 0.23570430278778076, + "learning_rate": 3.125911918238463e-05, + "loss": 0.0877, + "step": 26771 + }, + { + "epoch": 0.47750865051903113, + "grad_norm": 0.2953796088695526, + "learning_rate": 3.125761223497341e-05, + "loss": 0.0967, + "step": 26772 + }, + { + "epoch": 0.4775264866407448, + "grad_norm": 0.2231953889131546, + "learning_rate": 3.125610526330553e-05, + "loss": 0.1599, + "step": 26773 + }, + { + "epoch": 0.4775443227624585, + "grad_norm": 0.33506304025650024, + "learning_rate": 3.1254598267386826e-05, + "loss": 0.1411, + "step": 26774 + }, + { + "epoch": 0.47756215888417225, + "grad_norm": 0.2571059465408325, + "learning_rate": 3.125309124722314e-05, + "loss": 0.11, + "step": 26775 + }, + { + "epoch": 0.47757999500588594, + "grad_norm": 0.2351292371749878, + "learning_rate": 3.125158420282031e-05, + "loss": 0.139, + "step": 26776 + }, + { + "epoch": 0.4775978311275996, + "grad_norm": 0.2855851352214813, + "learning_rate": 3.125007713418418e-05, + "loss": 0.1339, + "step": 26777 + }, + { + "epoch": 0.4776156672493133, + "grad_norm": 0.26687389612197876, + "learning_rate": 3.1248570041320594e-05, + "loss": 0.1525, + "step": 26778 + }, + { + "epoch": 0.477633503371027, + "grad_norm": 0.25127851963043213, + "learning_rate": 3.124706292423539e-05, + "loss": 0.1743, + "step": 26779 + }, + { + "epoch": 0.4776513394927407, + "grad_norm": 0.23393309116363525, + "learning_rate": 3.124555578293441e-05, + "loss": 0.1118, + "step": 26780 + }, + { + "epoch": 0.4776691756144544, + "grad_norm": 0.25191599130630493, + "learning_rate": 3.1244048617423504e-05, + "loss": 0.1743, + "step": 26781 + }, + { + "epoch": 0.47768701173616807, + "grad_norm": 0.3310117721557617, + "learning_rate": 3.1242541427708504e-05, + "loss": 0.1609, + "step": 26782 + }, + { + "epoch": 0.4777048478578818, + "grad_norm": 0.5342959761619568, + "learning_rate": 3.1241034213795264e-05, + "loss": 0.1776, + "step": 26783 + }, + { + "epoch": 0.4777226839795955, + "grad_norm": 0.2696291506290436, + "learning_rate": 3.123952697568962e-05, + "loss": 0.1562, + "step": 26784 + }, + { + "epoch": 0.4777405201013092, + "grad_norm": 0.3045158386230469, + "learning_rate": 3.1238019713397406e-05, + "loss": 0.1707, + "step": 26785 + }, + { + "epoch": 0.4777583562230229, + "grad_norm": 0.22710925340652466, + "learning_rate": 3.123651242692448e-05, + "loss": 0.1729, + "step": 26786 + }, + { + "epoch": 0.47777619234473656, + "grad_norm": 0.3497841954231262, + "learning_rate": 3.123500511627667e-05, + "loss": 0.1491, + "step": 26787 + }, + { + "epoch": 0.47779402846645025, + "grad_norm": 0.3096001446247101, + "learning_rate": 3.123349778145983e-05, + "loss": 0.1477, + "step": 26788 + }, + { + "epoch": 0.47781186458816394, + "grad_norm": 0.20097851753234863, + "learning_rate": 3.12319904224798e-05, + "loss": 0.1251, + "step": 26789 + }, + { + "epoch": 0.4778297007098776, + "grad_norm": 0.2788364887237549, + "learning_rate": 3.123048303934243e-05, + "loss": 0.1519, + "step": 26790 + }, + { + "epoch": 0.4778475368315913, + "grad_norm": 0.21522000432014465, + "learning_rate": 3.1228975632053546e-05, + "loss": 0.1329, + "step": 26791 + }, + { + "epoch": 0.47786537295330506, + "grad_norm": 0.2439911812543869, + "learning_rate": 3.1227468200619006e-05, + "loss": 0.1062, + "step": 26792 + }, + { + "epoch": 0.47788320907501874, + "grad_norm": 0.3866584897041321, + "learning_rate": 3.1225960745044644e-05, + "loss": 0.1775, + "step": 26793 + }, + { + "epoch": 0.47790104519673243, + "grad_norm": 0.27161309123039246, + "learning_rate": 3.1224453265336314e-05, + "loss": 0.1537, + "step": 26794 + }, + { + "epoch": 0.4779188813184461, + "grad_norm": 0.2869950532913208, + "learning_rate": 3.1222945761499843e-05, + "loss": 0.1028, + "step": 26795 + }, + { + "epoch": 0.4779367174401598, + "grad_norm": 0.2506483197212219, + "learning_rate": 3.12214382335411e-05, + "loss": 0.1462, + "step": 26796 + }, + { + "epoch": 0.4779545535618735, + "grad_norm": 0.3127689063549042, + "learning_rate": 3.12199306814659e-05, + "loss": 0.1724, + "step": 26797 + }, + { + "epoch": 0.4779723896835872, + "grad_norm": 0.22565217316150665, + "learning_rate": 3.121842310528011e-05, + "loss": 0.1195, + "step": 26798 + }, + { + "epoch": 0.4779902258053009, + "grad_norm": 0.19420625269412994, + "learning_rate": 3.121691550498956e-05, + "loss": 0.1326, + "step": 26799 + }, + { + "epoch": 0.4780080619270146, + "grad_norm": 0.2743839621543884, + "learning_rate": 3.1215407880600096e-05, + "loss": 0.1545, + "step": 26800 + }, + { + "epoch": 0.4780258980487283, + "grad_norm": 0.2599467635154724, + "learning_rate": 3.121390023211757e-05, + "loss": 0.1313, + "step": 26801 + }, + { + "epoch": 0.478043734170442, + "grad_norm": 0.22616392374038696, + "learning_rate": 3.1212392559547814e-05, + "loss": 0.1225, + "step": 26802 + }, + { + "epoch": 0.4780615702921557, + "grad_norm": 0.23243194818496704, + "learning_rate": 3.121088486289669e-05, + "loss": 0.1832, + "step": 26803 + }, + { + "epoch": 0.47807940641386937, + "grad_norm": 0.23705218732357025, + "learning_rate": 3.120937714217002e-05, + "loss": 0.115, + "step": 26804 + }, + { + "epoch": 0.47809724253558306, + "grad_norm": 0.25697460770606995, + "learning_rate": 3.120786939737367e-05, + "loss": 0.1286, + "step": 26805 + }, + { + "epoch": 0.47811507865729674, + "grad_norm": 0.178166463971138, + "learning_rate": 3.1206361628513456e-05, + "loss": 0.1258, + "step": 26806 + }, + { + "epoch": 0.47813291477901043, + "grad_norm": 0.3773612976074219, + "learning_rate": 3.120485383559525e-05, + "loss": 0.1303, + "step": 26807 + }, + { + "epoch": 0.4781507509007241, + "grad_norm": 0.3702050745487213, + "learning_rate": 3.120334601862489e-05, + "loss": 0.1412, + "step": 26808 + }, + { + "epoch": 0.47816858702243786, + "grad_norm": 0.2967703938484192, + "learning_rate": 3.120183817760822e-05, + "loss": 0.1549, + "step": 26809 + }, + { + "epoch": 0.47818642314415155, + "grad_norm": 0.3417518436908722, + "learning_rate": 3.120033031255108e-05, + "loss": 0.1623, + "step": 26810 + }, + { + "epoch": 0.47820425926586524, + "grad_norm": 0.2530301809310913, + "learning_rate": 3.119882242345932e-05, + "loss": 0.1336, + "step": 26811 + }, + { + "epoch": 0.4782220953875789, + "grad_norm": 0.2536642253398895, + "learning_rate": 3.1197314510338786e-05, + "loss": 0.1089, + "step": 26812 + }, + { + "epoch": 0.4782399315092926, + "grad_norm": 0.3018265664577484, + "learning_rate": 3.1195806573195314e-05, + "loss": 0.1206, + "step": 26813 + }, + { + "epoch": 0.4782577676310063, + "grad_norm": 0.2743295729160309, + "learning_rate": 3.119429861203476e-05, + "loss": 0.1477, + "step": 26814 + }, + { + "epoch": 0.47827560375272, + "grad_norm": 0.23781079053878784, + "learning_rate": 3.119279062686296e-05, + "loss": 0.1188, + "step": 26815 + }, + { + "epoch": 0.4782934398744337, + "grad_norm": 0.33741384744644165, + "learning_rate": 3.119128261768577e-05, + "loss": 0.1691, + "step": 26816 + }, + { + "epoch": 0.4783112759961474, + "grad_norm": 0.33613476157188416, + "learning_rate": 3.1189774584509024e-05, + "loss": 0.1396, + "step": 26817 + }, + { + "epoch": 0.4783291121178611, + "grad_norm": 0.2095946967601776, + "learning_rate": 3.1188266527338584e-05, + "loss": 0.1444, + "step": 26818 + }, + { + "epoch": 0.4783469482395748, + "grad_norm": 0.2285582572221756, + "learning_rate": 3.118675844618027e-05, + "loss": 0.1866, + "step": 26819 + }, + { + "epoch": 0.4783647843612885, + "grad_norm": 0.3779899775981903, + "learning_rate": 3.1185250341039965e-05, + "loss": 0.1456, + "step": 26820 + }, + { + "epoch": 0.4783826204830022, + "grad_norm": 0.23052087426185608, + "learning_rate": 3.1183742211923475e-05, + "loss": 0.1414, + "step": 26821 + }, + { + "epoch": 0.47840045660471586, + "grad_norm": 0.251264750957489, + "learning_rate": 3.118223405883667e-05, + "loss": 0.1401, + "step": 26822 + }, + { + "epoch": 0.47841829272642955, + "grad_norm": 0.3534930646419525, + "learning_rate": 3.118072588178539e-05, + "loss": 0.1341, + "step": 26823 + }, + { + "epoch": 0.47843612884814324, + "grad_norm": 0.23159746825695038, + "learning_rate": 3.1179217680775485e-05, + "loss": 0.1347, + "step": 26824 + }, + { + "epoch": 0.478453964969857, + "grad_norm": 0.23824146389961243, + "learning_rate": 3.11777094558128e-05, + "loss": 0.1206, + "step": 26825 + }, + { + "epoch": 0.47847180109157067, + "grad_norm": 0.29496538639068604, + "learning_rate": 3.117620120690317e-05, + "loss": 0.1087, + "step": 26826 + }, + { + "epoch": 0.47848963721328436, + "grad_norm": 0.27531182765960693, + "learning_rate": 3.117469293405246e-05, + "loss": 0.1474, + "step": 26827 + }, + { + "epoch": 0.47850747333499805, + "grad_norm": 0.3209870755672455, + "learning_rate": 3.11731846372665e-05, + "loss": 0.1638, + "step": 26828 + }, + { + "epoch": 0.47852530945671173, + "grad_norm": 0.23279798030853271, + "learning_rate": 3.1171676316551153e-05, + "loss": 0.1212, + "step": 26829 + }, + { + "epoch": 0.4785431455784254, + "grad_norm": 0.2647162675857544, + "learning_rate": 3.1170167971912246e-05, + "loss": 0.1494, + "step": 26830 + }, + { + "epoch": 0.4785609817001391, + "grad_norm": 0.32629281282424927, + "learning_rate": 3.116865960335565e-05, + "loss": 0.2031, + "step": 26831 + }, + { + "epoch": 0.4785788178218528, + "grad_norm": 0.24953068792819977, + "learning_rate": 3.116715121088718e-05, + "loss": 0.1488, + "step": 26832 + }, + { + "epoch": 0.4785966539435665, + "grad_norm": 0.21855202317237854, + "learning_rate": 3.1165642794512724e-05, + "loss": 0.1352, + "step": 26833 + }, + { + "epoch": 0.47861449006528023, + "grad_norm": 0.3009147346019745, + "learning_rate": 3.11641343542381e-05, + "loss": 0.1691, + "step": 26834 + }, + { + "epoch": 0.4786323261869939, + "grad_norm": 0.25184929370880127, + "learning_rate": 3.1162625890069154e-05, + "loss": 0.1347, + "step": 26835 + }, + { + "epoch": 0.4786501623087076, + "grad_norm": 0.27973997592926025, + "learning_rate": 3.116111740201174e-05, + "loss": 0.1669, + "step": 26836 + }, + { + "epoch": 0.4786679984304213, + "grad_norm": 0.20674775540828705, + "learning_rate": 3.1159608890071715e-05, + "loss": 0.132, + "step": 26837 + }, + { + "epoch": 0.478685834552135, + "grad_norm": 0.24774637818336487, + "learning_rate": 3.1158100354254924e-05, + "loss": 0.1642, + "step": 26838 + }, + { + "epoch": 0.47870367067384867, + "grad_norm": 0.25628334283828735, + "learning_rate": 3.11565917945672e-05, + "loss": 0.147, + "step": 26839 + }, + { + "epoch": 0.47872150679556236, + "grad_norm": 0.21749669313430786, + "learning_rate": 3.11550832110144e-05, + "loss": 0.1401, + "step": 26840 + }, + { + "epoch": 0.47873934291727605, + "grad_norm": 0.29271650314331055, + "learning_rate": 3.1153574603602375e-05, + "loss": 0.1625, + "step": 26841 + }, + { + "epoch": 0.4787571790389898, + "grad_norm": 0.29296764731407166, + "learning_rate": 3.1152065972336964e-05, + "loss": 0.154, + "step": 26842 + }, + { + "epoch": 0.4787750151607035, + "grad_norm": 0.22280484437942505, + "learning_rate": 3.115055731722402e-05, + "loss": 0.0953, + "step": 26843 + }, + { + "epoch": 0.47879285128241716, + "grad_norm": 0.27112606167793274, + "learning_rate": 3.11490486382694e-05, + "loss": 0.2115, + "step": 26844 + }, + { + "epoch": 0.47881068740413085, + "grad_norm": 0.27489474415779114, + "learning_rate": 3.1147539935478935e-05, + "loss": 0.1453, + "step": 26845 + }, + { + "epoch": 0.47882852352584454, + "grad_norm": 0.18704353272914886, + "learning_rate": 3.114603120885849e-05, + "loss": 0.0846, + "step": 26846 + }, + { + "epoch": 0.47884635964755823, + "grad_norm": 0.23803308606147766, + "learning_rate": 3.11445224584139e-05, + "loss": 0.1511, + "step": 26847 + }, + { + "epoch": 0.4788641957692719, + "grad_norm": 0.22296997904777527, + "learning_rate": 3.114301368415102e-05, + "loss": 0.1541, + "step": 26848 + }, + { + "epoch": 0.4788820318909856, + "grad_norm": 0.267245352268219, + "learning_rate": 3.1141504886075695e-05, + "loss": 0.1384, + "step": 26849 + }, + { + "epoch": 0.4788998680126993, + "grad_norm": 0.2892456650733948, + "learning_rate": 3.113999606419378e-05, + "loss": 0.1375, + "step": 26850 + }, + { + "epoch": 0.47891770413441304, + "grad_norm": 0.3024786114692688, + "learning_rate": 3.113848721851113e-05, + "loss": 0.1139, + "step": 26851 + }, + { + "epoch": 0.4789355402561267, + "grad_norm": 0.4629128575325012, + "learning_rate": 3.113697834903356e-05, + "loss": 0.1468, + "step": 26852 + }, + { + "epoch": 0.4789533763778404, + "grad_norm": 0.21727323532104492, + "learning_rate": 3.113546945576696e-05, + "loss": 0.1694, + "step": 26853 + }, + { + "epoch": 0.4789712124995541, + "grad_norm": 0.33874839544296265, + "learning_rate": 3.113396053871715e-05, + "loss": 0.1671, + "step": 26854 + }, + { + "epoch": 0.4789890486212678, + "grad_norm": 0.22276677191257477, + "learning_rate": 3.1132451597889996e-05, + "loss": 0.1516, + "step": 26855 + }, + { + "epoch": 0.4790068847429815, + "grad_norm": 0.26417648792266846, + "learning_rate": 3.113094263329134e-05, + "loss": 0.1837, + "step": 26856 + }, + { + "epoch": 0.47902472086469516, + "grad_norm": 0.299300879240036, + "learning_rate": 3.112943364492703e-05, + "loss": 0.1344, + "step": 26857 + }, + { + "epoch": 0.47904255698640885, + "grad_norm": 0.31141749024391174, + "learning_rate": 3.112792463280292e-05, + "loss": 0.1842, + "step": 26858 + }, + { + "epoch": 0.4790603931081226, + "grad_norm": 0.2354898750782013, + "learning_rate": 3.112641559692486e-05, + "loss": 0.1295, + "step": 26859 + }, + { + "epoch": 0.4790782292298363, + "grad_norm": 0.361444354057312, + "learning_rate": 3.11249065372987e-05, + "loss": 0.1836, + "step": 26860 + }, + { + "epoch": 0.47909606535154997, + "grad_norm": 0.25282618403434753, + "learning_rate": 3.112339745393029e-05, + "loss": 0.1605, + "step": 26861 + }, + { + "epoch": 0.47911390147326366, + "grad_norm": 0.2857251763343811, + "learning_rate": 3.1121888346825465e-05, + "loss": 0.1725, + "step": 26862 + }, + { + "epoch": 0.47913173759497735, + "grad_norm": 0.23793472349643707, + "learning_rate": 3.1120379215990085e-05, + "loss": 0.1511, + "step": 26863 + }, + { + "epoch": 0.47914957371669104, + "grad_norm": 0.2909567952156067, + "learning_rate": 3.111887006143001e-05, + "loss": 0.1242, + "step": 26864 + }, + { + "epoch": 0.4791674098384047, + "grad_norm": 0.2157069891691208, + "learning_rate": 3.1117360883151074e-05, + "loss": 0.1461, + "step": 26865 + }, + { + "epoch": 0.4791852459601184, + "grad_norm": 0.20435114204883575, + "learning_rate": 3.1115851681159147e-05, + "loss": 0.1117, + "step": 26866 + }, + { + "epoch": 0.4792030820818321, + "grad_norm": 0.23241065442562103, + "learning_rate": 3.1114342455460054e-05, + "loss": 0.144, + "step": 26867 + }, + { + "epoch": 0.47922091820354584, + "grad_norm": 0.3007977604866028, + "learning_rate": 3.1112833206059665e-05, + "loss": 0.1938, + "step": 26868 + }, + { + "epoch": 0.47923875432525953, + "grad_norm": 0.23746216297149658, + "learning_rate": 3.111132393296382e-05, + "loss": 0.1428, + "step": 26869 + }, + { + "epoch": 0.4792565904469732, + "grad_norm": 0.23492389917373657, + "learning_rate": 3.110981463617837e-05, + "loss": 0.1942, + "step": 26870 + }, + { + "epoch": 0.4792744265686869, + "grad_norm": 0.2607043981552124, + "learning_rate": 3.110830531570917e-05, + "loss": 0.1229, + "step": 26871 + }, + { + "epoch": 0.4792922626904006, + "grad_norm": 0.21305793523788452, + "learning_rate": 3.1106795971562076e-05, + "loss": 0.1461, + "step": 26872 + }, + { + "epoch": 0.4793100988121143, + "grad_norm": 0.32901135087013245, + "learning_rate": 3.110528660374292e-05, + "loss": 0.142, + "step": 26873 + }, + { + "epoch": 0.47932793493382797, + "grad_norm": 0.26443949341773987, + "learning_rate": 3.1103777212257575e-05, + "loss": 0.1249, + "step": 26874 + }, + { + "epoch": 0.47934577105554166, + "grad_norm": 0.24153360724449158, + "learning_rate": 3.110226779711187e-05, + "loss": 0.1809, + "step": 26875 + }, + { + "epoch": 0.4793636071772554, + "grad_norm": 0.2590068280696869, + "learning_rate": 3.110075835831168e-05, + "loss": 0.1182, + "step": 26876 + }, + { + "epoch": 0.4793814432989691, + "grad_norm": 0.3583972752094269, + "learning_rate": 3.109924889586283e-05, + "loss": 0.223, + "step": 26877 + }, + { + "epoch": 0.4793992794206828, + "grad_norm": 0.39803624153137207, + "learning_rate": 3.1097739409771194e-05, + "loss": 0.1851, + "step": 26878 + }, + { + "epoch": 0.47941711554239647, + "grad_norm": 0.33544570207595825, + "learning_rate": 3.1096229900042615e-05, + "loss": 0.1839, + "step": 26879 + }, + { + "epoch": 0.47943495166411015, + "grad_norm": 0.2965698540210724, + "learning_rate": 3.109472036668294e-05, + "loss": 0.1018, + "step": 26880 + }, + { + "epoch": 0.47945278778582384, + "grad_norm": 0.2768281400203705, + "learning_rate": 3.109321080969803e-05, + "loss": 0.1614, + "step": 26881 + }, + { + "epoch": 0.47947062390753753, + "grad_norm": 0.2305006980895996, + "learning_rate": 3.109170122909372e-05, + "loss": 0.1609, + "step": 26882 + }, + { + "epoch": 0.4794884600292512, + "grad_norm": 0.24510516226291656, + "learning_rate": 3.1090191624875875e-05, + "loss": 0.1272, + "step": 26883 + }, + { + "epoch": 0.47950629615096496, + "grad_norm": 0.2029527723789215, + "learning_rate": 3.108868199705034e-05, + "loss": 0.1564, + "step": 26884 + }, + { + "epoch": 0.47952413227267865, + "grad_norm": 0.317044734954834, + "learning_rate": 3.108717234562298e-05, + "loss": 0.1425, + "step": 26885 + }, + { + "epoch": 0.47954196839439234, + "grad_norm": 0.2231772243976593, + "learning_rate": 3.108566267059963e-05, + "loss": 0.1742, + "step": 26886 + }, + { + "epoch": 0.479559804516106, + "grad_norm": 0.20359310507774353, + "learning_rate": 3.1084152971986155e-05, + "loss": 0.1164, + "step": 26887 + }, + { + "epoch": 0.4795776406378197, + "grad_norm": 0.24313755333423615, + "learning_rate": 3.10826432497884e-05, + "loss": 0.1124, + "step": 26888 + }, + { + "epoch": 0.4795954767595334, + "grad_norm": 0.24253372848033905, + "learning_rate": 3.1081133504012224e-05, + "loss": 0.1406, + "step": 26889 + }, + { + "epoch": 0.4796133128812471, + "grad_norm": 0.20761600136756897, + "learning_rate": 3.1079623734663465e-05, + "loss": 0.1582, + "step": 26890 + }, + { + "epoch": 0.4796311490029608, + "grad_norm": 0.2471642941236496, + "learning_rate": 3.107811394174798e-05, + "loss": 0.1496, + "step": 26891 + }, + { + "epoch": 0.47964898512467447, + "grad_norm": 0.20048244297504425, + "learning_rate": 3.1076604125271644e-05, + "loss": 0.1271, + "step": 26892 + }, + { + "epoch": 0.4796668212463882, + "grad_norm": 0.22641848027706146, + "learning_rate": 3.107509428524028e-05, + "loss": 0.1311, + "step": 26893 + }, + { + "epoch": 0.4796846573681019, + "grad_norm": 0.26793721318244934, + "learning_rate": 3.107358442165976e-05, + "loss": 0.1143, + "step": 26894 + }, + { + "epoch": 0.4797024934898156, + "grad_norm": 0.25153848528862, + "learning_rate": 3.1072074534535916e-05, + "loss": 0.1505, + "step": 26895 + }, + { + "epoch": 0.4797203296115293, + "grad_norm": 0.22778263688087463, + "learning_rate": 3.1070564623874625e-05, + "loss": 0.106, + "step": 26896 + }, + { + "epoch": 0.47973816573324296, + "grad_norm": 0.2666641175746918, + "learning_rate": 3.106905468968172e-05, + "loss": 0.122, + "step": 26897 + }, + { + "epoch": 0.47975600185495665, + "grad_norm": 0.2887571156024933, + "learning_rate": 3.106754473196307e-05, + "loss": 0.1307, + "step": 26898 + }, + { + "epoch": 0.47977383797667034, + "grad_norm": 0.2980011999607086, + "learning_rate": 3.106603475072452e-05, + "loss": 0.1073, + "step": 26899 + }, + { + "epoch": 0.479791674098384, + "grad_norm": 0.31376636028289795, + "learning_rate": 3.106452474597192e-05, + "loss": 0.1096, + "step": 26900 + }, + { + "epoch": 0.47980951022009777, + "grad_norm": 0.30685168504714966, + "learning_rate": 3.106301471771113e-05, + "loss": 0.1168, + "step": 26901 + }, + { + "epoch": 0.47982734634181146, + "grad_norm": 0.3056320548057556, + "learning_rate": 3.106150466594801e-05, + "loss": 0.1571, + "step": 26902 + }, + { + "epoch": 0.47984518246352514, + "grad_norm": 0.42128077149391174, + "learning_rate": 3.105999459068839e-05, + "loss": 0.1397, + "step": 26903 + }, + { + "epoch": 0.47986301858523883, + "grad_norm": 0.37117356061935425, + "learning_rate": 3.105848449193814e-05, + "loss": 0.1407, + "step": 26904 + }, + { + "epoch": 0.4798808547069525, + "grad_norm": 0.2590087354183197, + "learning_rate": 3.1056974369703115e-05, + "loss": 0.1771, + "step": 26905 + }, + { + "epoch": 0.4798986908286662, + "grad_norm": 0.2574358880519867, + "learning_rate": 3.105546422398916e-05, + "loss": 0.1101, + "step": 26906 + }, + { + "epoch": 0.4799165269503799, + "grad_norm": 0.34375739097595215, + "learning_rate": 3.105395405480215e-05, + "loss": 0.1291, + "step": 26907 + }, + { + "epoch": 0.4799343630720936, + "grad_norm": 0.18603846430778503, + "learning_rate": 3.10524438621479e-05, + "loss": 0.1025, + "step": 26908 + }, + { + "epoch": 0.47995219919380727, + "grad_norm": 0.2082604020833969, + "learning_rate": 3.105093364603231e-05, + "loss": 0.1559, + "step": 26909 + }, + { + "epoch": 0.479970035315521, + "grad_norm": 0.3786512613296509, + "learning_rate": 3.10494234064612e-05, + "loss": 0.2184, + "step": 26910 + }, + { + "epoch": 0.4799878714372347, + "grad_norm": 0.200238436460495, + "learning_rate": 3.1047913143440436e-05, + "loss": 0.1297, + "step": 26911 + }, + { + "epoch": 0.4800057075589484, + "grad_norm": 0.22389757633209229, + "learning_rate": 3.104640285697587e-05, + "loss": 0.1229, + "step": 26912 + }, + { + "epoch": 0.4800235436806621, + "grad_norm": 0.21933946013450623, + "learning_rate": 3.104489254707336e-05, + "loss": 0.1544, + "step": 26913 + }, + { + "epoch": 0.48004137980237577, + "grad_norm": 0.26832059025764465, + "learning_rate": 3.104338221373876e-05, + "loss": 0.1536, + "step": 26914 + }, + { + "epoch": 0.48005921592408946, + "grad_norm": 0.24429138004779816, + "learning_rate": 3.104187185697792e-05, + "loss": 0.1235, + "step": 26915 + }, + { + "epoch": 0.48007705204580314, + "grad_norm": 0.32528597116470337, + "learning_rate": 3.1040361476796705e-05, + "loss": 0.1545, + "step": 26916 + }, + { + "epoch": 0.48009488816751683, + "grad_norm": 0.22587130963802338, + "learning_rate": 3.103885107320096e-05, + "loss": 0.1595, + "step": 26917 + }, + { + "epoch": 0.4801127242892306, + "grad_norm": 0.2778416574001312, + "learning_rate": 3.1037340646196534e-05, + "loss": 0.1663, + "step": 26918 + }, + { + "epoch": 0.48013056041094426, + "grad_norm": 0.3093664348125458, + "learning_rate": 3.1035830195789295e-05, + "loss": 0.1392, + "step": 26919 + }, + { + "epoch": 0.48014839653265795, + "grad_norm": 0.1803629845380783, + "learning_rate": 3.10343197219851e-05, + "loss": 0.0912, + "step": 26920 + }, + { + "epoch": 0.48016623265437164, + "grad_norm": 0.305999755859375, + "learning_rate": 3.1032809224789795e-05, + "loss": 0.1372, + "step": 26921 + }, + { + "epoch": 0.4801840687760853, + "grad_norm": 0.231741800904274, + "learning_rate": 3.103129870420923e-05, + "loss": 0.1729, + "step": 26922 + }, + { + "epoch": 0.480201904897799, + "grad_norm": 0.254946231842041, + "learning_rate": 3.1029788160249275e-05, + "loss": 0.1531, + "step": 26923 + }, + { + "epoch": 0.4802197410195127, + "grad_norm": 0.25297650694847107, + "learning_rate": 3.1028277592915785e-05, + "loss": 0.1966, + "step": 26924 + }, + { + "epoch": 0.4802375771412264, + "grad_norm": 0.20638325810432434, + "learning_rate": 3.1026767002214594e-05, + "loss": 0.1033, + "step": 26925 + }, + { + "epoch": 0.48025541326294013, + "grad_norm": 0.2511560916900635, + "learning_rate": 3.102525638815158e-05, + "loss": 0.1599, + "step": 26926 + }, + { + "epoch": 0.4802732493846538, + "grad_norm": 0.22767919301986694, + "learning_rate": 3.102374575073259e-05, + "loss": 0.1474, + "step": 26927 + }, + { + "epoch": 0.4802910855063675, + "grad_norm": 0.39325883984565735, + "learning_rate": 3.102223508996348e-05, + "loss": 0.1644, + "step": 26928 + }, + { + "epoch": 0.4803089216280812, + "grad_norm": 0.2564949691295624, + "learning_rate": 3.102072440585011e-05, + "loss": 0.142, + "step": 26929 + }, + { + "epoch": 0.4803267577497949, + "grad_norm": 0.2197631448507309, + "learning_rate": 3.101921369839833e-05, + "loss": 0.1341, + "step": 26930 + }, + { + "epoch": 0.4803445938715086, + "grad_norm": 0.37748482823371887, + "learning_rate": 3.1017702967614e-05, + "loss": 0.1393, + "step": 26931 + }, + { + "epoch": 0.48036242999322226, + "grad_norm": 0.23145487904548645, + "learning_rate": 3.101619221350298e-05, + "loss": 0.1454, + "step": 26932 + }, + { + "epoch": 0.48038026611493595, + "grad_norm": 0.2263801395893097, + "learning_rate": 3.1014681436071116e-05, + "loss": 0.1584, + "step": 26933 + }, + { + "epoch": 0.48039810223664964, + "grad_norm": 0.23662111163139343, + "learning_rate": 3.101317063532426e-05, + "loss": 0.1336, + "step": 26934 + }, + { + "epoch": 0.4804159383583634, + "grad_norm": 0.260810911655426, + "learning_rate": 3.10116598112683e-05, + "loss": 0.1993, + "step": 26935 + }, + { + "epoch": 0.48043377448007707, + "grad_norm": 0.2356175184249878, + "learning_rate": 3.101014896390905e-05, + "loss": 0.1296, + "step": 26936 + }, + { + "epoch": 0.48045161060179076, + "grad_norm": 0.23916806280612946, + "learning_rate": 3.1008638093252395e-05, + "loss": 0.1538, + "step": 26937 + }, + { + "epoch": 0.48046944672350445, + "grad_norm": 0.3416164219379425, + "learning_rate": 3.100712719930418e-05, + "loss": 0.1982, + "step": 26938 + }, + { + "epoch": 0.48048728284521813, + "grad_norm": 0.23779907822608948, + "learning_rate": 3.100561628207026e-05, + "loss": 0.1245, + "step": 26939 + }, + { + "epoch": 0.4805051189669318, + "grad_norm": 0.30729466676712036, + "learning_rate": 3.100410534155651e-05, + "loss": 0.1666, + "step": 26940 + }, + { + "epoch": 0.4805229550886455, + "grad_norm": 0.45495182275772095, + "learning_rate": 3.100259437776877e-05, + "loss": 0.1716, + "step": 26941 + }, + { + "epoch": 0.4805407912103592, + "grad_norm": 0.23640476167201996, + "learning_rate": 3.10010833907129e-05, + "loss": 0.1222, + "step": 26942 + }, + { + "epoch": 0.48055862733207294, + "grad_norm": 0.2531169354915619, + "learning_rate": 3.099957238039476e-05, + "loss": 0.1341, + "step": 26943 + }, + { + "epoch": 0.48057646345378663, + "grad_norm": 0.3022766709327698, + "learning_rate": 3.0998061346820206e-05, + "loss": 0.2402, + "step": 26944 + }, + { + "epoch": 0.4805942995755003, + "grad_norm": 0.2547677755355835, + "learning_rate": 3.099655028999508e-05, + "loss": 0.144, + "step": 26945 + }, + { + "epoch": 0.480612135697214, + "grad_norm": 0.26292699575424194, + "learning_rate": 3.099503920992527e-05, + "loss": 0.1572, + "step": 26946 + }, + { + "epoch": 0.4806299718189277, + "grad_norm": 0.2665134370326996, + "learning_rate": 3.099352810661661e-05, + "loss": 0.1433, + "step": 26947 + }, + { + "epoch": 0.4806478079406414, + "grad_norm": 0.27100810408592224, + "learning_rate": 3.099201698007497e-05, + "loss": 0.1245, + "step": 26948 + }, + { + "epoch": 0.48066564406235507, + "grad_norm": 0.3097866177558899, + "learning_rate": 3.0990505830306196e-05, + "loss": 0.1747, + "step": 26949 + }, + { + "epoch": 0.48068348018406876, + "grad_norm": 0.3356071412563324, + "learning_rate": 3.098899465731617e-05, + "loss": 0.1092, + "step": 26950 + }, + { + "epoch": 0.48070131630578244, + "grad_norm": 0.32404643297195435, + "learning_rate": 3.098748346111071e-05, + "loss": 0.1325, + "step": 26951 + }, + { + "epoch": 0.4807191524274962, + "grad_norm": 0.27789992094039917, + "learning_rate": 3.09859722416957e-05, + "loss": 0.1427, + "step": 26952 + }, + { + "epoch": 0.4807369885492099, + "grad_norm": 0.2856275141239166, + "learning_rate": 3.0984460999077e-05, + "loss": 0.1299, + "step": 26953 + }, + { + "epoch": 0.48075482467092356, + "grad_norm": 0.2191135734319687, + "learning_rate": 3.098294973326046e-05, + "loss": 0.1154, + "step": 26954 + }, + { + "epoch": 0.48077266079263725, + "grad_norm": 0.1872471272945404, + "learning_rate": 3.098143844425194e-05, + "loss": 0.0617, + "step": 26955 + }, + { + "epoch": 0.48079049691435094, + "grad_norm": 0.3164249360561371, + "learning_rate": 3.097992713205731e-05, + "loss": 0.1459, + "step": 26956 + }, + { + "epoch": 0.48080833303606463, + "grad_norm": 0.3594079315662384, + "learning_rate": 3.097841579668241e-05, + "loss": 0.2561, + "step": 26957 + }, + { + "epoch": 0.4808261691577783, + "grad_norm": 0.3132946193218231, + "learning_rate": 3.09769044381331e-05, + "loss": 0.1655, + "step": 26958 + }, + { + "epoch": 0.480844005279492, + "grad_norm": 0.27492833137512207, + "learning_rate": 3.097539305641524e-05, + "loss": 0.1608, + "step": 26959 + }, + { + "epoch": 0.48086184140120575, + "grad_norm": 0.39647388458251953, + "learning_rate": 3.09738816515347e-05, + "loss": 0.1424, + "step": 26960 + }, + { + "epoch": 0.48087967752291944, + "grad_norm": 0.3303094208240509, + "learning_rate": 3.097237022349734e-05, + "loss": 0.1861, + "step": 26961 + }, + { + "epoch": 0.4808975136446331, + "grad_norm": 0.20275112986564636, + "learning_rate": 3.0970858772309e-05, + "loss": 0.1219, + "step": 26962 + }, + { + "epoch": 0.4809153497663468, + "grad_norm": 0.22518904507160187, + "learning_rate": 3.096934729797555e-05, + "loss": 0.1251, + "step": 26963 + }, + { + "epoch": 0.4809331858880605, + "grad_norm": 0.3026747405529022, + "learning_rate": 3.096783580050284e-05, + "loss": 0.1307, + "step": 26964 + }, + { + "epoch": 0.4809510220097742, + "grad_norm": 0.2873251736164093, + "learning_rate": 3.096632427989675e-05, + "loss": 0.1032, + "step": 26965 + }, + { + "epoch": 0.4809688581314879, + "grad_norm": 0.20177914202213287, + "learning_rate": 3.096481273616312e-05, + "loss": 0.1477, + "step": 26966 + }, + { + "epoch": 0.48098669425320156, + "grad_norm": 0.3231706917285919, + "learning_rate": 3.096330116930782e-05, + "loss": 0.1837, + "step": 26967 + }, + { + "epoch": 0.48100453037491525, + "grad_norm": 0.22386451065540314, + "learning_rate": 3.09617895793367e-05, + "loss": 0.1611, + "step": 26968 + }, + { + "epoch": 0.481022366496629, + "grad_norm": 0.2567024230957031, + "learning_rate": 3.096027796625563e-05, + "loss": 0.1441, + "step": 26969 + }, + { + "epoch": 0.4810402026183427, + "grad_norm": 0.35814276337623596, + "learning_rate": 3.0958766330070463e-05, + "loss": 0.1393, + "step": 26970 + }, + { + "epoch": 0.48105803874005637, + "grad_norm": 0.30965086817741394, + "learning_rate": 3.095725467078706e-05, + "loss": 0.2281, + "step": 26971 + }, + { + "epoch": 0.48107587486177006, + "grad_norm": 0.27966707944869995, + "learning_rate": 3.095574298841128e-05, + "loss": 0.1094, + "step": 26972 + }, + { + "epoch": 0.48109371098348375, + "grad_norm": 0.25909602642059326, + "learning_rate": 3.095423128294898e-05, + "loss": 0.1418, + "step": 26973 + }, + { + "epoch": 0.48111154710519743, + "grad_norm": 0.31567519903182983, + "learning_rate": 3.095271955440602e-05, + "loss": 0.1417, + "step": 26974 + }, + { + "epoch": 0.4811293832269111, + "grad_norm": 0.2649823725223541, + "learning_rate": 3.0951207802788264e-05, + "loss": 0.1107, + "step": 26975 + }, + { + "epoch": 0.4811472193486248, + "grad_norm": 0.27317014336586, + "learning_rate": 3.0949696028101574e-05, + "loss": 0.0916, + "step": 26976 + }, + { + "epoch": 0.48116505547033855, + "grad_norm": 0.32236748933792114, + "learning_rate": 3.094818423035181e-05, + "loss": 0.1384, + "step": 26977 + }, + { + "epoch": 0.48118289159205224, + "grad_norm": 0.1466619372367859, + "learning_rate": 3.094667240954483e-05, + "loss": 0.1034, + "step": 26978 + }, + { + "epoch": 0.48120072771376593, + "grad_norm": 0.34723901748657227, + "learning_rate": 3.094516056568649e-05, + "loss": 0.1322, + "step": 26979 + }, + { + "epoch": 0.4812185638354796, + "grad_norm": 0.3337315618991852, + "learning_rate": 3.094364869878265e-05, + "loss": 0.227, + "step": 26980 + }, + { + "epoch": 0.4812363999571933, + "grad_norm": 0.2675262689590454, + "learning_rate": 3.0942136808839176e-05, + "loss": 0.1116, + "step": 26981 + }, + { + "epoch": 0.481254236078907, + "grad_norm": 0.35388287901878357, + "learning_rate": 3.0940624895861936e-05, + "loss": 0.1275, + "step": 26982 + }, + { + "epoch": 0.4812720722006207, + "grad_norm": 0.2760636806488037, + "learning_rate": 3.093911295985678e-05, + "loss": 0.1237, + "step": 26983 + }, + { + "epoch": 0.48128990832233437, + "grad_norm": 0.3363461494445801, + "learning_rate": 3.0937601000829567e-05, + "loss": 0.1148, + "step": 26984 + }, + { + "epoch": 0.4813077444440481, + "grad_norm": 0.32991641759872437, + "learning_rate": 3.093608901878616e-05, + "loss": 0.1899, + "step": 26985 + }, + { + "epoch": 0.4813255805657618, + "grad_norm": 0.2606499195098877, + "learning_rate": 3.093457701373243e-05, + "loss": 0.1378, + "step": 26986 + }, + { + "epoch": 0.4813434166874755, + "grad_norm": 0.25897732377052307, + "learning_rate": 3.093306498567422e-05, + "loss": 0.1551, + "step": 26987 + }, + { + "epoch": 0.4813612528091892, + "grad_norm": 0.27342841029167175, + "learning_rate": 3.09315529346174e-05, + "loss": 0.1226, + "step": 26988 + }, + { + "epoch": 0.48137908893090287, + "grad_norm": 0.3533625304698944, + "learning_rate": 3.093004086056784e-05, + "loss": 0.112, + "step": 26989 + }, + { + "epoch": 0.48139692505261655, + "grad_norm": 0.29467374086380005, + "learning_rate": 3.092852876353139e-05, + "loss": 0.2224, + "step": 26990 + }, + { + "epoch": 0.48141476117433024, + "grad_norm": 0.26189154386520386, + "learning_rate": 3.092701664351392e-05, + "loss": 0.1847, + "step": 26991 + }, + { + "epoch": 0.48143259729604393, + "grad_norm": 0.22214268147945404, + "learning_rate": 3.092550450052128e-05, + "loss": 0.1015, + "step": 26992 + }, + { + "epoch": 0.4814504334177576, + "grad_norm": 0.2892412841320038, + "learning_rate": 3.092399233455934e-05, + "loss": 0.1618, + "step": 26993 + }, + { + "epoch": 0.48146826953947136, + "grad_norm": 0.25335508584976196, + "learning_rate": 3.0922480145633965e-05, + "loss": 0.1479, + "step": 26994 + }, + { + "epoch": 0.48148610566118505, + "grad_norm": 0.2531585991382599, + "learning_rate": 3.0920967933751e-05, + "loss": 0.1472, + "step": 26995 + }, + { + "epoch": 0.48150394178289874, + "grad_norm": 0.21424445509910583, + "learning_rate": 3.0919455698916326e-05, + "loss": 0.1066, + "step": 26996 + }, + { + "epoch": 0.4815217779046124, + "grad_norm": 0.24043121933937073, + "learning_rate": 3.091794344113579e-05, + "loss": 0.1145, + "step": 26997 + }, + { + "epoch": 0.4815396140263261, + "grad_norm": 0.27867183089256287, + "learning_rate": 3.091643116041528e-05, + "loss": 0.1506, + "step": 26998 + }, + { + "epoch": 0.4815574501480398, + "grad_norm": 0.28481653332710266, + "learning_rate": 3.091491885676062e-05, + "loss": 0.1239, + "step": 26999 + }, + { + "epoch": 0.4815752862697535, + "grad_norm": 0.2043977826833725, + "learning_rate": 3.09134065301777e-05, + "loss": 0.1449, + "step": 27000 + }, + { + "epoch": 0.4815752862697535, + "eval_loss": 0.13947582244873047, + "eval_runtime": 107.8396, + "eval_samples_per_second": 9.496, + "eval_steps_per_second": 1.586, + "step": 27000 + }, + { + "epoch": 0.4815931223914672, + "grad_norm": 0.4226484000682831, + "learning_rate": 3.0911894180672366e-05, + "loss": 0.1076, + "step": 27001 + }, + { + "epoch": 0.4816109585131809, + "grad_norm": 0.22894150018692017, + "learning_rate": 3.0910381808250496e-05, + "loss": 0.1684, + "step": 27002 + }, + { + "epoch": 0.4816287946348946, + "grad_norm": 0.2135767787694931, + "learning_rate": 3.090886941291794e-05, + "loss": 0.0945, + "step": 27003 + }, + { + "epoch": 0.4816466307566083, + "grad_norm": 0.23432646691799164, + "learning_rate": 3.090735699468057e-05, + "loss": 0.1297, + "step": 27004 + }, + { + "epoch": 0.481664466878322, + "grad_norm": 0.25252532958984375, + "learning_rate": 3.0905844553544236e-05, + "loss": 0.1378, + "step": 27005 + }, + { + "epoch": 0.48168230300003567, + "grad_norm": 0.3878104090690613, + "learning_rate": 3.090433208951482e-05, + "loss": 0.1623, + "step": 27006 + }, + { + "epoch": 0.48170013912174936, + "grad_norm": 0.2725524604320526, + "learning_rate": 3.090281960259817e-05, + "loss": 0.1284, + "step": 27007 + }, + { + "epoch": 0.48171797524346305, + "grad_norm": 0.286504864692688, + "learning_rate": 3.0901307092800145e-05, + "loss": 0.1385, + "step": 27008 + }, + { + "epoch": 0.48173581136517674, + "grad_norm": 0.31102314591407776, + "learning_rate": 3.089979456012663e-05, + "loss": 0.1167, + "step": 27009 + }, + { + "epoch": 0.4817536474868904, + "grad_norm": 0.22377517819404602, + "learning_rate": 3.089828200458346e-05, + "loss": 0.1043, + "step": 27010 + }, + { + "epoch": 0.48177148360860417, + "grad_norm": 0.3582703769207001, + "learning_rate": 3.089676942617652e-05, + "loss": 0.1357, + "step": 27011 + }, + { + "epoch": 0.48178931973031786, + "grad_norm": 0.216938778758049, + "learning_rate": 3.0895256824911654e-05, + "loss": 0.1355, + "step": 27012 + }, + { + "epoch": 0.48180715585203154, + "grad_norm": 0.28301793336868286, + "learning_rate": 3.089374420079475e-05, + "loss": 0.1788, + "step": 27013 + }, + { + "epoch": 0.48182499197374523, + "grad_norm": 0.1755129098892212, + "learning_rate": 3.0892231553831646e-05, + "loss": 0.121, + "step": 27014 + }, + { + "epoch": 0.4818428280954589, + "grad_norm": 0.17529180645942688, + "learning_rate": 3.0890718884028224e-05, + "loss": 0.1356, + "step": 27015 + }, + { + "epoch": 0.4818606642171726, + "grad_norm": 0.3269725441932678, + "learning_rate": 3.088920619139033e-05, + "loss": 0.1983, + "step": 27016 + }, + { + "epoch": 0.4818785003388863, + "grad_norm": 0.26815667748451233, + "learning_rate": 3.088769347592386e-05, + "loss": 0.1886, + "step": 27017 + }, + { + "epoch": 0.4818963364606, + "grad_norm": 0.2617287039756775, + "learning_rate": 3.088618073763464e-05, + "loss": 0.1765, + "step": 27018 + }, + { + "epoch": 0.4819141725823137, + "grad_norm": 0.2688720226287842, + "learning_rate": 3.088466797652856e-05, + "loss": 0.1372, + "step": 27019 + }, + { + "epoch": 0.4819320087040274, + "grad_norm": 0.23563289642333984, + "learning_rate": 3.088315519261147e-05, + "loss": 0.1439, + "step": 27020 + }, + { + "epoch": 0.4819498448257411, + "grad_norm": 0.24001914262771606, + "learning_rate": 3.0881642385889245e-05, + "loss": 0.123, + "step": 27021 + }, + { + "epoch": 0.4819676809474548, + "grad_norm": 0.2797130346298218, + "learning_rate": 3.088012955636773e-05, + "loss": 0.1003, + "step": 27022 + }, + { + "epoch": 0.4819855170691685, + "grad_norm": 0.25982335209846497, + "learning_rate": 3.0878616704052806e-05, + "loss": 0.1663, + "step": 27023 + }, + { + "epoch": 0.48200335319088217, + "grad_norm": 0.20879769325256348, + "learning_rate": 3.0877103828950337e-05, + "loss": 0.1085, + "step": 27024 + }, + { + "epoch": 0.48202118931259585, + "grad_norm": 0.30116739869117737, + "learning_rate": 3.087559093106618e-05, + "loss": 0.1366, + "step": 27025 + }, + { + "epoch": 0.48203902543430954, + "grad_norm": 0.29739004373550415, + "learning_rate": 3.087407801040621e-05, + "loss": 0.1394, + "step": 27026 + }, + { + "epoch": 0.48205686155602323, + "grad_norm": 0.22542817890644073, + "learning_rate": 3.0872565066976275e-05, + "loss": 0.1399, + "step": 27027 + }, + { + "epoch": 0.482074697677737, + "grad_norm": 0.2542777359485626, + "learning_rate": 3.087105210078226e-05, + "loss": 0.1196, + "step": 27028 + }, + { + "epoch": 0.48209253379945066, + "grad_norm": 0.3396031856536865, + "learning_rate": 3.0869539111830006e-05, + "loss": 0.1077, + "step": 27029 + }, + { + "epoch": 0.48211036992116435, + "grad_norm": 0.28776851296424866, + "learning_rate": 3.08680261001254e-05, + "loss": 0.1258, + "step": 27030 + }, + { + "epoch": 0.48212820604287804, + "grad_norm": 0.3039550483226776, + "learning_rate": 3.0866513065674295e-05, + "loss": 0.1614, + "step": 27031 + }, + { + "epoch": 0.4821460421645917, + "grad_norm": 0.31225457787513733, + "learning_rate": 3.0865000008482564e-05, + "loss": 0.1693, + "step": 27032 + }, + { + "epoch": 0.4821638782863054, + "grad_norm": 0.3079989552497864, + "learning_rate": 3.086348692855606e-05, + "loss": 0.1385, + "step": 27033 + }, + { + "epoch": 0.4821817144080191, + "grad_norm": 0.24990880489349365, + "learning_rate": 3.086197382590067e-05, + "loss": 0.1223, + "step": 27034 + }, + { + "epoch": 0.4821995505297328, + "grad_norm": 0.20213203132152557, + "learning_rate": 3.086046070052223e-05, + "loss": 0.163, + "step": 27035 + }, + { + "epoch": 0.48221738665144653, + "grad_norm": 0.25982651114463806, + "learning_rate": 3.085894755242662e-05, + "loss": 0.077, + "step": 27036 + }, + { + "epoch": 0.4822352227731602, + "grad_norm": 0.2415047138929367, + "learning_rate": 3.085743438161972e-05, + "loss": 0.1363, + "step": 27037 + }, + { + "epoch": 0.4822530588948739, + "grad_norm": 0.233364999294281, + "learning_rate": 3.0855921188107364e-05, + "loss": 0.1186, + "step": 27038 + }, + { + "epoch": 0.4822708950165876, + "grad_norm": 0.27403080463409424, + "learning_rate": 3.085440797189545e-05, + "loss": 0.0811, + "step": 27039 + }, + { + "epoch": 0.4822887311383013, + "grad_norm": 0.2892562747001648, + "learning_rate": 3.0852894732989815e-05, + "loss": 0.1534, + "step": 27040 + }, + { + "epoch": 0.482306567260015, + "grad_norm": 0.28563547134399414, + "learning_rate": 3.085138147139635e-05, + "loss": 0.1761, + "step": 27041 + }, + { + "epoch": 0.48232440338172866, + "grad_norm": 0.3108138144016266, + "learning_rate": 3.08498681871209e-05, + "loss": 0.1338, + "step": 27042 + }, + { + "epoch": 0.48234223950344235, + "grad_norm": 0.26626822352409363, + "learning_rate": 3.0848354880169346e-05, + "loss": 0.0957, + "step": 27043 + }, + { + "epoch": 0.4823600756251561, + "grad_norm": 0.22019726037979126, + "learning_rate": 3.0846841550547546e-05, + "loss": 0.1388, + "step": 27044 + }, + { + "epoch": 0.4823779117468698, + "grad_norm": 0.33581289649009705, + "learning_rate": 3.084532819826137e-05, + "loss": 0.169, + "step": 27045 + }, + { + "epoch": 0.48239574786858347, + "grad_norm": 0.19992482662200928, + "learning_rate": 3.084381482331668e-05, + "loss": 0.1231, + "step": 27046 + }, + { + "epoch": 0.48241358399029716, + "grad_norm": 0.2791059911251068, + "learning_rate": 3.084230142571935e-05, + "loss": 0.1987, + "step": 27047 + }, + { + "epoch": 0.48243142011201084, + "grad_norm": 0.27307823300361633, + "learning_rate": 3.0840788005475246e-05, + "loss": 0.1054, + "step": 27048 + }, + { + "epoch": 0.48244925623372453, + "grad_norm": 0.22794122993946075, + "learning_rate": 3.083927456259022e-05, + "loss": 0.166, + "step": 27049 + }, + { + "epoch": 0.4824670923554382, + "grad_norm": 0.30295199155807495, + "learning_rate": 3.083776109707015e-05, + "loss": 0.1609, + "step": 27050 + }, + { + "epoch": 0.4824849284771519, + "grad_norm": 0.23303623497486115, + "learning_rate": 3.08362476089209e-05, + "loss": 0.1467, + "step": 27051 + }, + { + "epoch": 0.4825027645988656, + "grad_norm": 0.2229575365781784, + "learning_rate": 3.083473409814835e-05, + "loss": 0.1276, + "step": 27052 + }, + { + "epoch": 0.48252060072057934, + "grad_norm": 0.28668197989463806, + "learning_rate": 3.0833220564758346e-05, + "loss": 0.1584, + "step": 27053 + }, + { + "epoch": 0.48253843684229303, + "grad_norm": 0.23593303561210632, + "learning_rate": 3.083170700875677e-05, + "loss": 0.1379, + "step": 27054 + }, + { + "epoch": 0.4825562729640067, + "grad_norm": 0.3035764992237091, + "learning_rate": 3.0830193430149476e-05, + "loss": 0.1692, + "step": 27055 + }, + { + "epoch": 0.4825741090857204, + "grad_norm": 0.2567789852619171, + "learning_rate": 3.082867982894235e-05, + "loss": 0.1555, + "step": 27056 + }, + { + "epoch": 0.4825919452074341, + "grad_norm": 0.2320074439048767, + "learning_rate": 3.082716620514123e-05, + "loss": 0.1707, + "step": 27057 + }, + { + "epoch": 0.4826097813291478, + "grad_norm": 0.29797905683517456, + "learning_rate": 3.082565255875202e-05, + "loss": 0.16, + "step": 27058 + }, + { + "epoch": 0.48262761745086147, + "grad_norm": 0.2768057584762573, + "learning_rate": 3.082413888978056e-05, + "loss": 0.1478, + "step": 27059 + }, + { + "epoch": 0.48264545357257516, + "grad_norm": 0.2559477388858795, + "learning_rate": 3.082262519823273e-05, + "loss": 0.1898, + "step": 27060 + }, + { + "epoch": 0.4826632896942889, + "grad_norm": 0.22729475796222687, + "learning_rate": 3.0821111484114395e-05, + "loss": 0.1155, + "step": 27061 + }, + { + "epoch": 0.4826811258160026, + "grad_norm": 0.2949030101299286, + "learning_rate": 3.081959774743141e-05, + "loss": 0.1289, + "step": 27062 + }, + { + "epoch": 0.4826989619377163, + "grad_norm": 0.4954430162906647, + "learning_rate": 3.081808398818966e-05, + "loss": 0.2021, + "step": 27063 + }, + { + "epoch": 0.48271679805942996, + "grad_norm": 0.272966593503952, + "learning_rate": 3.0816570206395004e-05, + "loss": 0.1389, + "step": 27064 + }, + { + "epoch": 0.48273463418114365, + "grad_norm": 0.21344424784183502, + "learning_rate": 3.0815056402053325e-05, + "loss": 0.1301, + "step": 27065 + }, + { + "epoch": 0.48275247030285734, + "grad_norm": 0.33520445227622986, + "learning_rate": 3.0813542575170466e-05, + "loss": 0.1551, + "step": 27066 + }, + { + "epoch": 0.482770306424571, + "grad_norm": 0.35573261976242065, + "learning_rate": 3.0812028725752316e-05, + "loss": 0.1598, + "step": 27067 + }, + { + "epoch": 0.4827881425462847, + "grad_norm": 0.3074539601802826, + "learning_rate": 3.081051485380473e-05, + "loss": 0.1607, + "step": 27068 + }, + { + "epoch": 0.4828059786679984, + "grad_norm": 0.3038214147090912, + "learning_rate": 3.080900095933359e-05, + "loss": 0.1679, + "step": 27069 + }, + { + "epoch": 0.48282381478971215, + "grad_norm": 0.3289279639720917, + "learning_rate": 3.0807487042344746e-05, + "loss": 0.1544, + "step": 27070 + }, + { + "epoch": 0.48284165091142583, + "grad_norm": 0.27637359499931335, + "learning_rate": 3.080597310284408e-05, + "loss": 0.1257, + "step": 27071 + }, + { + "epoch": 0.4828594870331395, + "grad_norm": 0.22734405100345612, + "learning_rate": 3.080445914083745e-05, + "loss": 0.132, + "step": 27072 + }, + { + "epoch": 0.4828773231548532, + "grad_norm": 0.22987398505210876, + "learning_rate": 3.0802945156330745e-05, + "loss": 0.1594, + "step": 27073 + }, + { + "epoch": 0.4828951592765669, + "grad_norm": 0.33401110768318176, + "learning_rate": 3.080143114932981e-05, + "loss": 0.2106, + "step": 27074 + }, + { + "epoch": 0.4829129953982806, + "grad_norm": 0.23963074386119843, + "learning_rate": 3.0799917119840535e-05, + "loss": 0.1384, + "step": 27075 + }, + { + "epoch": 0.4829308315199943, + "grad_norm": 0.28968027234077454, + "learning_rate": 3.079840306786877e-05, + "loss": 0.1273, + "step": 27076 + }, + { + "epoch": 0.48294866764170796, + "grad_norm": 0.24933290481567383, + "learning_rate": 3.079688899342039e-05, + "loss": 0.1139, + "step": 27077 + }, + { + "epoch": 0.4829665037634217, + "grad_norm": 0.22661608457565308, + "learning_rate": 3.0795374896501266e-05, + "loss": 0.147, + "step": 27078 + }, + { + "epoch": 0.4829843398851354, + "grad_norm": 0.2771577537059784, + "learning_rate": 3.079386077711727e-05, + "loss": 0.1204, + "step": 27079 + }, + { + "epoch": 0.4830021760068491, + "grad_norm": 0.2487345039844513, + "learning_rate": 3.079234663527427e-05, + "loss": 0.1382, + "step": 27080 + }, + { + "epoch": 0.48302001212856277, + "grad_norm": 0.4305541217327118, + "learning_rate": 3.079083247097813e-05, + "loss": 0.2463, + "step": 27081 + }, + { + "epoch": 0.48303784825027646, + "grad_norm": 0.18258030712604523, + "learning_rate": 3.078931828423473e-05, + "loss": 0.1343, + "step": 27082 + }, + { + "epoch": 0.48305568437199015, + "grad_norm": 0.39898568391799927, + "learning_rate": 3.0787804075049926e-05, + "loss": 0.1215, + "step": 27083 + }, + { + "epoch": 0.48307352049370383, + "grad_norm": 0.2975720465183258, + "learning_rate": 3.078628984342959e-05, + "loss": 0.1792, + "step": 27084 + }, + { + "epoch": 0.4830913566154175, + "grad_norm": 0.273441880941391, + "learning_rate": 3.078477558937961e-05, + "loss": 0.1182, + "step": 27085 + }, + { + "epoch": 0.48310919273713127, + "grad_norm": 0.26858997344970703, + "learning_rate": 3.078326131290583e-05, + "loss": 0.1499, + "step": 27086 + }, + { + "epoch": 0.48312702885884495, + "grad_norm": 0.2902086079120636, + "learning_rate": 3.0781747014014146e-05, + "loss": 0.1386, + "step": 27087 + }, + { + "epoch": 0.48314486498055864, + "grad_norm": 0.2149176150560379, + "learning_rate": 3.0780232692710396e-05, + "loss": 0.1124, + "step": 27088 + }, + { + "epoch": 0.48316270110227233, + "grad_norm": 0.32025763392448425, + "learning_rate": 3.077871834900048e-05, + "loss": 0.1648, + "step": 27089 + }, + { + "epoch": 0.483180537223986, + "grad_norm": 0.30817756056785583, + "learning_rate": 3.077720398289025e-05, + "loss": 0.1602, + "step": 27090 + }, + { + "epoch": 0.4831983733456997, + "grad_norm": 0.23591212928295135, + "learning_rate": 3.0775689594385585e-05, + "loss": 0.1603, + "step": 27091 + }, + { + "epoch": 0.4832162094674134, + "grad_norm": 0.241417795419693, + "learning_rate": 3.077417518349235e-05, + "loss": 0.1565, + "step": 27092 + }, + { + "epoch": 0.4832340455891271, + "grad_norm": 0.25818952918052673, + "learning_rate": 3.077266075021642e-05, + "loss": 0.1776, + "step": 27093 + }, + { + "epoch": 0.48325188171084077, + "grad_norm": 0.31792545318603516, + "learning_rate": 3.0771146294563656e-05, + "loss": 0.1571, + "step": 27094 + }, + { + "epoch": 0.4832697178325545, + "grad_norm": 0.20953741669654846, + "learning_rate": 3.0769631816539947e-05, + "loss": 0.1441, + "step": 27095 + }, + { + "epoch": 0.4832875539542682, + "grad_norm": 0.22194012999534607, + "learning_rate": 3.076811731615114e-05, + "loss": 0.0815, + "step": 27096 + }, + { + "epoch": 0.4833053900759819, + "grad_norm": 0.318523108959198, + "learning_rate": 3.0766602793403134e-05, + "loss": 0.1549, + "step": 27097 + }, + { + "epoch": 0.4833232261976956, + "grad_norm": 0.2722551226615906, + "learning_rate": 3.076508824830177e-05, + "loss": 0.1445, + "step": 27098 + }, + { + "epoch": 0.48334106231940926, + "grad_norm": 0.2218547761440277, + "learning_rate": 3.076357368085293e-05, + "loss": 0.1162, + "step": 27099 + }, + { + "epoch": 0.48335889844112295, + "grad_norm": 0.26550573110580444, + "learning_rate": 3.07620590910625e-05, + "loss": 0.0919, + "step": 27100 + }, + { + "epoch": 0.48337673456283664, + "grad_norm": 0.45003244280815125, + "learning_rate": 3.076054447893633e-05, + "loss": 0.2033, + "step": 27101 + }, + { + "epoch": 0.48339457068455033, + "grad_norm": 0.31724047660827637, + "learning_rate": 3.075902984448031e-05, + "loss": 0.1198, + "step": 27102 + }, + { + "epoch": 0.48341240680626407, + "grad_norm": 0.29529982805252075, + "learning_rate": 3.075751518770029e-05, + "loss": 0.1202, + "step": 27103 + }, + { + "epoch": 0.48343024292797776, + "grad_norm": 0.2640722990036011, + "learning_rate": 3.075600050860216e-05, + "loss": 0.2108, + "step": 27104 + }, + { + "epoch": 0.48344807904969145, + "grad_norm": 0.25682541728019714, + "learning_rate": 3.075448580719178e-05, + "loss": 0.1084, + "step": 27105 + }, + { + "epoch": 0.48346591517140514, + "grad_norm": 0.29415833950042725, + "learning_rate": 3.0752971083475025e-05, + "loss": 0.1155, + "step": 27106 + }, + { + "epoch": 0.4834837512931188, + "grad_norm": 0.2655586898326874, + "learning_rate": 3.075145633745777e-05, + "loss": 0.1648, + "step": 27107 + }, + { + "epoch": 0.4835015874148325, + "grad_norm": 0.2796500325202942, + "learning_rate": 3.074994156914589e-05, + "loss": 0.1207, + "step": 27108 + }, + { + "epoch": 0.4835194235365462, + "grad_norm": 0.32488369941711426, + "learning_rate": 3.0748426778545234e-05, + "loss": 0.1777, + "step": 27109 + }, + { + "epoch": 0.4835372596582599, + "grad_norm": 0.24566836655139923, + "learning_rate": 3.0746911965661706e-05, + "loss": 0.2071, + "step": 27110 + }, + { + "epoch": 0.4835550957799736, + "grad_norm": 0.22565458714962006, + "learning_rate": 3.074539713050115e-05, + "loss": 0.1467, + "step": 27111 + }, + { + "epoch": 0.4835729319016873, + "grad_norm": 0.21871069073677063, + "learning_rate": 3.0743882273069456e-05, + "loss": 0.0904, + "step": 27112 + }, + { + "epoch": 0.483590768023401, + "grad_norm": 0.23211175203323364, + "learning_rate": 3.074236739337249e-05, + "loss": 0.1908, + "step": 27113 + }, + { + "epoch": 0.4836086041451147, + "grad_norm": 0.2156677544116974, + "learning_rate": 3.074085249141613e-05, + "loss": 0.1518, + "step": 27114 + }, + { + "epoch": 0.4836264402668284, + "grad_norm": 0.273124635219574, + "learning_rate": 3.073933756720624e-05, + "loss": 0.1254, + "step": 27115 + }, + { + "epoch": 0.48364427638854207, + "grad_norm": 0.23021823167800903, + "learning_rate": 3.073782262074869e-05, + "loss": 0.1362, + "step": 27116 + }, + { + "epoch": 0.48366211251025576, + "grad_norm": 0.424717515707016, + "learning_rate": 3.073630765204936e-05, + "loss": 0.1478, + "step": 27117 + }, + { + "epoch": 0.48367994863196945, + "grad_norm": 0.29039236903190613, + "learning_rate": 3.073479266111412e-05, + "loss": 0.1898, + "step": 27118 + }, + { + "epoch": 0.48369778475368314, + "grad_norm": 0.27556338906288147, + "learning_rate": 3.073327764794884e-05, + "loss": 0.1304, + "step": 27119 + }, + { + "epoch": 0.4837156208753969, + "grad_norm": 0.24541257321834564, + "learning_rate": 3.07317626125594e-05, + "loss": 0.134, + "step": 27120 + }, + { + "epoch": 0.48373345699711057, + "grad_norm": 0.2787431478500366, + "learning_rate": 3.0730247554951675e-05, + "loss": 0.1568, + "step": 27121 + }, + { + "epoch": 0.48375129311882425, + "grad_norm": 0.24350681900978088, + "learning_rate": 3.072873247513152e-05, + "loss": 0.135, + "step": 27122 + }, + { + "epoch": 0.48376912924053794, + "grad_norm": 0.3390022814273834, + "learning_rate": 3.0727217373104826e-05, + "loss": 0.182, + "step": 27123 + }, + { + "epoch": 0.48378696536225163, + "grad_norm": 0.3546014130115509, + "learning_rate": 3.072570224887746e-05, + "loss": 0.1965, + "step": 27124 + }, + { + "epoch": 0.4838048014839653, + "grad_norm": 0.24808244407176971, + "learning_rate": 3.0724187102455293e-05, + "loss": 0.1546, + "step": 27125 + }, + { + "epoch": 0.483822637605679, + "grad_norm": 0.29959049820899963, + "learning_rate": 3.07226719338442e-05, + "loss": 0.1469, + "step": 27126 + }, + { + "epoch": 0.4838404737273927, + "grad_norm": 0.42251360416412354, + "learning_rate": 3.072115674305005e-05, + "loss": 0.1134, + "step": 27127 + }, + { + "epoch": 0.4838583098491064, + "grad_norm": 0.24596382677555084, + "learning_rate": 3.071964153007872e-05, + "loss": 0.1516, + "step": 27128 + }, + { + "epoch": 0.4838761459708201, + "grad_norm": 0.2794867157936096, + "learning_rate": 3.071812629493609e-05, + "loss": 0.1387, + "step": 27129 + }, + { + "epoch": 0.4838939820925338, + "grad_norm": 0.21823757886886597, + "learning_rate": 3.071661103762803e-05, + "loss": 0.1244, + "step": 27130 + }, + { + "epoch": 0.4839118182142475, + "grad_norm": 0.24210450053215027, + "learning_rate": 3.07150957581604e-05, + "loss": 0.1296, + "step": 27131 + }, + { + "epoch": 0.4839296543359612, + "grad_norm": 0.1930825561285019, + "learning_rate": 3.071358045653909e-05, + "loss": 0.1029, + "step": 27132 + }, + { + "epoch": 0.4839474904576749, + "grad_norm": 0.26027199625968933, + "learning_rate": 3.071206513276997e-05, + "loss": 0.1147, + "step": 27133 + }, + { + "epoch": 0.48396532657938857, + "grad_norm": 0.31662362813949585, + "learning_rate": 3.071054978685891e-05, + "loss": 0.1256, + "step": 27134 + }, + { + "epoch": 0.48398316270110225, + "grad_norm": 0.3016536831855774, + "learning_rate": 3.070903441881179e-05, + "loss": 0.089, + "step": 27135 + }, + { + "epoch": 0.48400099882281594, + "grad_norm": 0.2770494520664215, + "learning_rate": 3.070751902863448e-05, + "loss": 0.1711, + "step": 27136 + }, + { + "epoch": 0.4840188349445297, + "grad_norm": 0.4436330199241638, + "learning_rate": 3.0706003616332853e-05, + "loss": 0.1378, + "step": 27137 + }, + { + "epoch": 0.4840366710662434, + "grad_norm": 0.28773120045661926, + "learning_rate": 3.070448818191279e-05, + "loss": 0.1641, + "step": 27138 + }, + { + "epoch": 0.48405450718795706, + "grad_norm": 0.27296674251556396, + "learning_rate": 3.0702972725380155e-05, + "loss": 0.1238, + "step": 27139 + }, + { + "epoch": 0.48407234330967075, + "grad_norm": 0.39131787419319153, + "learning_rate": 3.0701457246740826e-05, + "loss": 0.1644, + "step": 27140 + }, + { + "epoch": 0.48409017943138444, + "grad_norm": 0.28994521498680115, + "learning_rate": 3.0699941746000686e-05, + "loss": 0.1657, + "step": 27141 + }, + { + "epoch": 0.4841080155530981, + "grad_norm": 0.18912653625011444, + "learning_rate": 3.06984262231656e-05, + "loss": 0.1208, + "step": 27142 + }, + { + "epoch": 0.4841258516748118, + "grad_norm": 0.19882464408874512, + "learning_rate": 3.069691067824145e-05, + "loss": 0.1323, + "step": 27143 + }, + { + "epoch": 0.4841436877965255, + "grad_norm": 0.30060574412345886, + "learning_rate": 3.06953951112341e-05, + "loss": 0.1798, + "step": 27144 + }, + { + "epoch": 0.48416152391823924, + "grad_norm": 0.35380762815475464, + "learning_rate": 3.069387952214944e-05, + "loss": 0.1753, + "step": 27145 + }, + { + "epoch": 0.48417936003995293, + "grad_norm": 0.4506976306438446, + "learning_rate": 3.069236391099333e-05, + "loss": 0.163, + "step": 27146 + }, + { + "epoch": 0.4841971961616666, + "grad_norm": 0.22866684198379517, + "learning_rate": 3.069084827777165e-05, + "loss": 0.1538, + "step": 27147 + }, + { + "epoch": 0.4842150322833803, + "grad_norm": 0.22546716034412384, + "learning_rate": 3.068933262249027e-05, + "loss": 0.1268, + "step": 27148 + }, + { + "epoch": 0.484232868405094, + "grad_norm": 0.2038814276456833, + "learning_rate": 3.0687816945155085e-05, + "loss": 0.1169, + "step": 27149 + }, + { + "epoch": 0.4842507045268077, + "grad_norm": 0.2168342024087906, + "learning_rate": 3.068630124577196e-05, + "loss": 0.1499, + "step": 27150 + }, + { + "epoch": 0.4842685406485214, + "grad_norm": 0.21687765419483185, + "learning_rate": 3.0684785524346754e-05, + "loss": 0.1143, + "step": 27151 + }, + { + "epoch": 0.48428637677023506, + "grad_norm": 0.3289444148540497, + "learning_rate": 3.068326978088536e-05, + "loss": 0.1028, + "step": 27152 + }, + { + "epoch": 0.48430421289194875, + "grad_norm": 0.31328248977661133, + "learning_rate": 3.0681754015393654e-05, + "loss": 0.1231, + "step": 27153 + }, + { + "epoch": 0.4843220490136625, + "grad_norm": 0.2642780840396881, + "learning_rate": 3.06802382278775e-05, + "loss": 0.1128, + "step": 27154 + }, + { + "epoch": 0.4843398851353762, + "grad_norm": 0.23576690256595612, + "learning_rate": 3.0678722418342785e-05, + "loss": 0.1419, + "step": 27155 + }, + { + "epoch": 0.48435772125708987, + "grad_norm": 0.2915872037410736, + "learning_rate": 3.0677206586795384e-05, + "loss": 0.1783, + "step": 27156 + }, + { + "epoch": 0.48437555737880356, + "grad_norm": 0.2980562746524811, + "learning_rate": 3.0675690733241167e-05, + "loss": 0.1425, + "step": 27157 + }, + { + "epoch": 0.48439339350051724, + "grad_norm": 0.2599024474620819, + "learning_rate": 3.0674174857686014e-05, + "loss": 0.1416, + "step": 27158 + }, + { + "epoch": 0.48441122962223093, + "grad_norm": 0.25335493683815, + "learning_rate": 3.0672658960135793e-05, + "loss": 0.1439, + "step": 27159 + }, + { + "epoch": 0.4844290657439446, + "grad_norm": 0.2693977355957031, + "learning_rate": 3.067114304059639e-05, + "loss": 0.1072, + "step": 27160 + }, + { + "epoch": 0.4844469018656583, + "grad_norm": 0.30277130007743835, + "learning_rate": 3.066962709907367e-05, + "loss": 0.1251, + "step": 27161 + }, + { + "epoch": 0.48446473798737205, + "grad_norm": 0.2464374154806137, + "learning_rate": 3.066811113557353e-05, + "loss": 0.1484, + "step": 27162 + }, + { + "epoch": 0.48448257410908574, + "grad_norm": 0.25897088646888733, + "learning_rate": 3.066659515010183e-05, + "loss": 0.1592, + "step": 27163 + }, + { + "epoch": 0.4845004102307994, + "grad_norm": 0.22919850051403046, + "learning_rate": 3.0665079142664446e-05, + "loss": 0.091, + "step": 27164 + }, + { + "epoch": 0.4845182463525131, + "grad_norm": 0.23086853325366974, + "learning_rate": 3.0663563113267266e-05, + "loss": 0.1237, + "step": 27165 + }, + { + "epoch": 0.4845360824742268, + "grad_norm": 0.22809967398643494, + "learning_rate": 3.066204706191616e-05, + "loss": 0.1225, + "step": 27166 + }, + { + "epoch": 0.4845539185959405, + "grad_norm": 0.2634361982345581, + "learning_rate": 3.066053098861699e-05, + "loss": 0.1446, + "step": 27167 + }, + { + "epoch": 0.4845717547176542, + "grad_norm": 0.28495100140571594, + "learning_rate": 3.0659014893375655e-05, + "loss": 0.1178, + "step": 27168 + }, + { + "epoch": 0.48458959083936787, + "grad_norm": 0.26449814438819885, + "learning_rate": 3.0657498776198025e-05, + "loss": 0.1571, + "step": 27169 + }, + { + "epoch": 0.48460742696108156, + "grad_norm": 0.21386142075061798, + "learning_rate": 3.065598263708997e-05, + "loss": 0.0789, + "step": 27170 + }, + { + "epoch": 0.4846252630827953, + "grad_norm": 0.3007747232913971, + "learning_rate": 3.065446647605739e-05, + "loss": 0.1563, + "step": 27171 + }, + { + "epoch": 0.484643099204509, + "grad_norm": 0.33034080266952515, + "learning_rate": 3.0652950293106125e-05, + "loss": 0.1969, + "step": 27172 + }, + { + "epoch": 0.4846609353262227, + "grad_norm": 0.20527958869934082, + "learning_rate": 3.065143408824208e-05, + "loss": 0.0939, + "step": 27173 + }, + { + "epoch": 0.48467877144793636, + "grad_norm": 0.3309456408023834, + "learning_rate": 3.0649917861471125e-05, + "loss": 0.2742, + "step": 27174 + }, + { + "epoch": 0.48469660756965005, + "grad_norm": 0.2243259698152542, + "learning_rate": 3.064840161279913e-05, + "loss": 0.1334, + "step": 27175 + }, + { + "epoch": 0.48471444369136374, + "grad_norm": 0.21642421185970306, + "learning_rate": 3.0646885342231985e-05, + "loss": 0.145, + "step": 27176 + }, + { + "epoch": 0.4847322798130774, + "grad_norm": 0.21992802619934082, + "learning_rate": 3.064536904977557e-05, + "loss": 0.1196, + "step": 27177 + }, + { + "epoch": 0.4847501159347911, + "grad_norm": 0.27333512902259827, + "learning_rate": 3.064385273543574e-05, + "loss": 0.1283, + "step": 27178 + }, + { + "epoch": 0.48476795205650486, + "grad_norm": 0.2596698999404907, + "learning_rate": 3.06423363992184e-05, + "loss": 0.1262, + "step": 27179 + }, + { + "epoch": 0.48478578817821855, + "grad_norm": 0.25320670008659363, + "learning_rate": 3.0640820041129414e-05, + "loss": 0.1464, + "step": 27180 + }, + { + "epoch": 0.48480362429993223, + "grad_norm": 0.31184887886047363, + "learning_rate": 3.063930366117466e-05, + "loss": 0.106, + "step": 27181 + }, + { + "epoch": 0.4848214604216459, + "grad_norm": 0.34344539046287537, + "learning_rate": 3.063778725936001e-05, + "loss": 0.1753, + "step": 27182 + }, + { + "epoch": 0.4848392965433596, + "grad_norm": 0.33374086022377014, + "learning_rate": 3.063627083569135e-05, + "loss": 0.1642, + "step": 27183 + }, + { + "epoch": 0.4848571326650733, + "grad_norm": 0.3592602610588074, + "learning_rate": 3.063475439017456e-05, + "loss": 0.1736, + "step": 27184 + }, + { + "epoch": 0.484874968786787, + "grad_norm": 0.26294299960136414, + "learning_rate": 3.063323792281552e-05, + "loss": 0.2282, + "step": 27185 + }, + { + "epoch": 0.4848928049085007, + "grad_norm": 0.21310549974441528, + "learning_rate": 3.0631721433620104e-05, + "loss": 0.1253, + "step": 27186 + }, + { + "epoch": 0.4849106410302144, + "grad_norm": 0.2431284487247467, + "learning_rate": 3.063020492259418e-05, + "loss": 0.1055, + "step": 27187 + }, + { + "epoch": 0.4849284771519281, + "grad_norm": 0.28226086497306824, + "learning_rate": 3.062868838974365e-05, + "loss": 0.1046, + "step": 27188 + }, + { + "epoch": 0.4849463132736418, + "grad_norm": 0.34102001786231995, + "learning_rate": 3.062717183507437e-05, + "loss": 0.1687, + "step": 27189 + }, + { + "epoch": 0.4849641493953555, + "grad_norm": 0.2814805209636688, + "learning_rate": 3.062565525859224e-05, + "loss": 0.1272, + "step": 27190 + }, + { + "epoch": 0.48498198551706917, + "grad_norm": 0.25693613290786743, + "learning_rate": 3.062413866030311e-05, + "loss": 0.2097, + "step": 27191 + }, + { + "epoch": 0.48499982163878286, + "grad_norm": 0.30521929264068604, + "learning_rate": 3.06226220402129e-05, + "loss": 0.1427, + "step": 27192 + }, + { + "epoch": 0.48501765776049655, + "grad_norm": 0.3133265972137451, + "learning_rate": 3.0621105398327446e-05, + "loss": 0.1067, + "step": 27193 + }, + { + "epoch": 0.48503549388221023, + "grad_norm": 0.2955072820186615, + "learning_rate": 3.0619588734652656e-05, + "loss": 0.1338, + "step": 27194 + }, + { + "epoch": 0.4850533300039239, + "grad_norm": 0.3286081850528717, + "learning_rate": 3.0618072049194394e-05, + "loss": 0.1099, + "step": 27195 + }, + { + "epoch": 0.48507116612563767, + "grad_norm": 0.31009331345558167, + "learning_rate": 3.061655534195854e-05, + "loss": 0.1602, + "step": 27196 + }, + { + "epoch": 0.48508900224735135, + "grad_norm": 0.23093637824058533, + "learning_rate": 3.061503861295099e-05, + "loss": 0.1186, + "step": 27197 + }, + { + "epoch": 0.48510683836906504, + "grad_norm": 0.23135803639888763, + "learning_rate": 3.0613521862177596e-05, + "loss": 0.1703, + "step": 27198 + }, + { + "epoch": 0.48512467449077873, + "grad_norm": 0.2465144395828247, + "learning_rate": 3.0612005089644266e-05, + "loss": 0.1279, + "step": 27199 + }, + { + "epoch": 0.4851425106124924, + "grad_norm": 0.22441820800304413, + "learning_rate": 3.061048829535685e-05, + "loss": 0.12, + "step": 27200 + }, + { + "epoch": 0.4851603467342061, + "grad_norm": 0.2618945837020874, + "learning_rate": 3.0608971479321266e-05, + "loss": 0.1198, + "step": 27201 + }, + { + "epoch": 0.4851781828559198, + "grad_norm": 0.19926472008228302, + "learning_rate": 3.060745464154335e-05, + "loss": 0.1316, + "step": 27202 + }, + { + "epoch": 0.4851960189776335, + "grad_norm": 0.30909961462020874, + "learning_rate": 3.0605937782029015e-05, + "loss": 0.1567, + "step": 27203 + }, + { + "epoch": 0.4852138550993472, + "grad_norm": 0.217269629240036, + "learning_rate": 3.060442090078412e-05, + "loss": 0.1187, + "step": 27204 + }, + { + "epoch": 0.4852316912210609, + "grad_norm": 0.29075273871421814, + "learning_rate": 3.060290399781456e-05, + "loss": 0.1964, + "step": 27205 + }, + { + "epoch": 0.4852495273427746, + "grad_norm": 0.18378223478794098, + "learning_rate": 3.060138707312622e-05, + "loss": 0.0483, + "step": 27206 + }, + { + "epoch": 0.4852673634644883, + "grad_norm": 0.24114476144313812, + "learning_rate": 3.059987012672495e-05, + "loss": 0.1442, + "step": 27207 + }, + { + "epoch": 0.485285199586202, + "grad_norm": 0.3498968183994293, + "learning_rate": 3.059835315861666e-05, + "loss": 0.1545, + "step": 27208 + }, + { + "epoch": 0.48530303570791566, + "grad_norm": 0.21598756313323975, + "learning_rate": 3.059683616880721e-05, + "loss": 0.1493, + "step": 27209 + }, + { + "epoch": 0.48532087182962935, + "grad_norm": 0.2919982969760895, + "learning_rate": 3.0595319157302496e-05, + "loss": 0.1415, + "step": 27210 + }, + { + "epoch": 0.48533870795134304, + "grad_norm": 0.28294768929481506, + "learning_rate": 3.0593802124108396e-05, + "loss": 0.1338, + "step": 27211 + }, + { + "epoch": 0.48535654407305673, + "grad_norm": 0.3219447135925293, + "learning_rate": 3.059228506923079e-05, + "loss": 0.1693, + "step": 27212 + }, + { + "epoch": 0.48537438019477047, + "grad_norm": 0.2757996618747711, + "learning_rate": 3.059076799267554e-05, + "loss": 0.1678, + "step": 27213 + }, + { + "epoch": 0.48539221631648416, + "grad_norm": 0.28392598032951355, + "learning_rate": 3.0589250894448556e-05, + "loss": 0.1969, + "step": 27214 + }, + { + "epoch": 0.48541005243819785, + "grad_norm": 0.26357629895210266, + "learning_rate": 3.05877337745557e-05, + "loss": 0.1293, + "step": 27215 + }, + { + "epoch": 0.48542788855991154, + "grad_norm": 0.27374133467674255, + "learning_rate": 3.058621663300286e-05, + "loss": 0.1789, + "step": 27216 + }, + { + "epoch": 0.4854457246816252, + "grad_norm": 0.2714899182319641, + "learning_rate": 3.058469946979591e-05, + "loss": 0.1853, + "step": 27217 + }, + { + "epoch": 0.4854635608033389, + "grad_norm": 0.27141493558883667, + "learning_rate": 3.058318228494074e-05, + "loss": 0.1519, + "step": 27218 + }, + { + "epoch": 0.4854813969250526, + "grad_norm": 0.3366071581840515, + "learning_rate": 3.058166507844323e-05, + "loss": 0.1488, + "step": 27219 + }, + { + "epoch": 0.4854992330467663, + "grad_norm": 0.2763836979866028, + "learning_rate": 3.058014785030925e-05, + "loss": 0.1612, + "step": 27220 + }, + { + "epoch": 0.48551706916848003, + "grad_norm": 0.3584231436252594, + "learning_rate": 3.05786306005447e-05, + "loss": 0.1502, + "step": 27221 + }, + { + "epoch": 0.4855349052901937, + "grad_norm": 0.28474316000938416, + "learning_rate": 3.057711332915544e-05, + "loss": 0.1573, + "step": 27222 + }, + { + "epoch": 0.4855527414119074, + "grad_norm": 0.2725260257720947, + "learning_rate": 3.057559603614737e-05, + "loss": 0.1354, + "step": 27223 + }, + { + "epoch": 0.4855705775336211, + "grad_norm": 0.3297487795352936, + "learning_rate": 3.057407872152636e-05, + "loss": 0.1032, + "step": 27224 + }, + { + "epoch": 0.4855884136553348, + "grad_norm": 0.31142011284828186, + "learning_rate": 3.05725613852983e-05, + "loss": 0.1947, + "step": 27225 + }, + { + "epoch": 0.48560624977704847, + "grad_norm": 0.31022921204566956, + "learning_rate": 3.057104402746906e-05, + "loss": 0.1045, + "step": 27226 + }, + { + "epoch": 0.48562408589876216, + "grad_norm": 0.4807033836841583, + "learning_rate": 3.0569526648044536e-05, + "loss": 0.1215, + "step": 27227 + }, + { + "epoch": 0.48564192202047585, + "grad_norm": 0.22525370121002197, + "learning_rate": 3.05680092470306e-05, + "loss": 0.1023, + "step": 27228 + }, + { + "epoch": 0.48565975814218953, + "grad_norm": 0.28735658526420593, + "learning_rate": 3.056649182443314e-05, + "loss": 0.1575, + "step": 27229 + }, + { + "epoch": 0.4856775942639033, + "grad_norm": 0.3156735301017761, + "learning_rate": 3.056497438025803e-05, + "loss": 0.1394, + "step": 27230 + }, + { + "epoch": 0.48569543038561697, + "grad_norm": 0.2701871395111084, + "learning_rate": 3.056345691451116e-05, + "loss": 0.1575, + "step": 27231 + }, + { + "epoch": 0.48571326650733065, + "grad_norm": 0.2798452377319336, + "learning_rate": 3.0561939427198414e-05, + "loss": 0.1458, + "step": 27232 + }, + { + "epoch": 0.48573110262904434, + "grad_norm": 0.2689918279647827, + "learning_rate": 3.056042191832567e-05, + "loss": 0.1799, + "step": 27233 + }, + { + "epoch": 0.48574893875075803, + "grad_norm": 0.2624211013317108, + "learning_rate": 3.05589043878988e-05, + "loss": 0.1306, + "step": 27234 + }, + { + "epoch": 0.4857667748724717, + "grad_norm": 0.4467747211456299, + "learning_rate": 3.05573868359237e-05, + "loss": 0.1993, + "step": 27235 + }, + { + "epoch": 0.4857846109941854, + "grad_norm": 0.2985488772392273, + "learning_rate": 3.055586926240626e-05, + "loss": 0.1735, + "step": 27236 + }, + { + "epoch": 0.4858024471158991, + "grad_norm": 0.33233118057250977, + "learning_rate": 3.0554351667352335e-05, + "loss": 0.2119, + "step": 27237 + }, + { + "epoch": 0.48582028323761284, + "grad_norm": 0.25685766339302063, + "learning_rate": 3.055283405076784e-05, + "loss": 0.0981, + "step": 27238 + }, + { + "epoch": 0.4858381193593265, + "grad_norm": 0.24434791505336761, + "learning_rate": 3.0551316412658626e-05, + "loss": 0.1352, + "step": 27239 + }, + { + "epoch": 0.4858559554810402, + "grad_norm": 0.2511981129646301, + "learning_rate": 3.0549798753030604e-05, + "loss": 0.1355, + "step": 27240 + }, + { + "epoch": 0.4858737916027539, + "grad_norm": 0.41593602299690247, + "learning_rate": 3.054828107188964e-05, + "loss": 0.1864, + "step": 27241 + }, + { + "epoch": 0.4858916277244676, + "grad_norm": 0.24475279450416565, + "learning_rate": 3.0546763369241634e-05, + "loss": 0.1515, + "step": 27242 + }, + { + "epoch": 0.4859094638461813, + "grad_norm": 0.25201985239982605, + "learning_rate": 3.054524564509244e-05, + "loss": 0.1166, + "step": 27243 + }, + { + "epoch": 0.48592729996789497, + "grad_norm": 0.3088558614253998, + "learning_rate": 3.054372789944797e-05, + "loss": 0.157, + "step": 27244 + }, + { + "epoch": 0.48594513608960865, + "grad_norm": 0.3116265833377838, + "learning_rate": 3.054221013231409e-05, + "loss": 0.1684, + "step": 27245 + }, + { + "epoch": 0.4859629722113224, + "grad_norm": 0.2777627110481262, + "learning_rate": 3.054069234369669e-05, + "loss": 0.1592, + "step": 27246 + }, + { + "epoch": 0.4859808083330361, + "grad_norm": 0.3016069531440735, + "learning_rate": 3.053917453360166e-05, + "loss": 0.1531, + "step": 27247 + }, + { + "epoch": 0.4859986444547498, + "grad_norm": 0.1716867983341217, + "learning_rate": 3.053765670203487e-05, + "loss": 0.116, + "step": 27248 + }, + { + "epoch": 0.48601648057646346, + "grad_norm": 0.22268109023571014, + "learning_rate": 3.0536138849002215e-05, + "loss": 0.1412, + "step": 27249 + }, + { + "epoch": 0.48603431669817715, + "grad_norm": 0.37913307547569275, + "learning_rate": 3.053462097450956e-05, + "loss": 0.1293, + "step": 27250 + }, + { + "epoch": 0.48605215281989084, + "grad_norm": 0.27252355217933655, + "learning_rate": 3.0533103078562816e-05, + "loss": 0.1845, + "step": 27251 + }, + { + "epoch": 0.4860699889416045, + "grad_norm": 0.2057403028011322, + "learning_rate": 3.053158516116785e-05, + "loss": 0.1379, + "step": 27252 + }, + { + "epoch": 0.4860878250633182, + "grad_norm": 0.3789529800415039, + "learning_rate": 3.053006722233055e-05, + "loss": 0.2164, + "step": 27253 + }, + { + "epoch": 0.4861056611850319, + "grad_norm": 0.25591522455215454, + "learning_rate": 3.05285492620568e-05, + "loss": 0.1123, + "step": 27254 + }, + { + "epoch": 0.48612349730674564, + "grad_norm": 0.2654406726360321, + "learning_rate": 3.052703128035248e-05, + "loss": 0.1261, + "step": 27255 + }, + { + "epoch": 0.48614133342845933, + "grad_norm": 0.26605573296546936, + "learning_rate": 3.052551327722348e-05, + "loss": 0.1655, + "step": 27256 + }, + { + "epoch": 0.486159169550173, + "grad_norm": 0.2640789747238159, + "learning_rate": 3.052399525267569e-05, + "loss": 0.1628, + "step": 27257 + }, + { + "epoch": 0.4861770056718867, + "grad_norm": 0.22468449175357819, + "learning_rate": 3.052247720671497e-05, + "loss": 0.1051, + "step": 27258 + }, + { + "epoch": 0.4861948417936004, + "grad_norm": 0.21288982033729553, + "learning_rate": 3.052095913934723e-05, + "loss": 0.1165, + "step": 27259 + }, + { + "epoch": 0.4862126779153141, + "grad_norm": 0.25478121638298035, + "learning_rate": 3.0519441050578346e-05, + "loss": 0.1855, + "step": 27260 + }, + { + "epoch": 0.4862305140370278, + "grad_norm": 0.2562662661075592, + "learning_rate": 3.05179229404142e-05, + "loss": 0.132, + "step": 27261 + }, + { + "epoch": 0.48624835015874146, + "grad_norm": 0.19263970851898193, + "learning_rate": 3.051640480886068e-05, + "loss": 0.1076, + "step": 27262 + }, + { + "epoch": 0.4862661862804552, + "grad_norm": 0.29618728160858154, + "learning_rate": 3.051488665592367e-05, + "loss": 0.1597, + "step": 27263 + }, + { + "epoch": 0.4862840224021689, + "grad_norm": 0.23893561959266663, + "learning_rate": 3.051336848160906e-05, + "loss": 0.1426, + "step": 27264 + }, + { + "epoch": 0.4863018585238826, + "grad_norm": 0.32210400700569153, + "learning_rate": 3.0511850285922715e-05, + "loss": 0.1889, + "step": 27265 + }, + { + "epoch": 0.48631969464559627, + "grad_norm": 0.22814425826072693, + "learning_rate": 3.0510332068870544e-05, + "loss": 0.1193, + "step": 27266 + }, + { + "epoch": 0.48633753076730996, + "grad_norm": 0.3275805413722992, + "learning_rate": 3.050881383045842e-05, + "loss": 0.1625, + "step": 27267 + }, + { + "epoch": 0.48635536688902364, + "grad_norm": 0.23945313692092896, + "learning_rate": 3.050729557069224e-05, + "loss": 0.1415, + "step": 27268 + }, + { + "epoch": 0.48637320301073733, + "grad_norm": 0.26737454533576965, + "learning_rate": 3.050577728957787e-05, + "loss": 0.1628, + "step": 27269 + }, + { + "epoch": 0.486391039132451, + "grad_norm": 0.2799249589443207, + "learning_rate": 3.050425898712121e-05, + "loss": 0.1253, + "step": 27270 + }, + { + "epoch": 0.4864088752541647, + "grad_norm": 0.22880299389362335, + "learning_rate": 3.0502740663328138e-05, + "loss": 0.1357, + "step": 27271 + }, + { + "epoch": 0.48642671137587845, + "grad_norm": 0.3262365758419037, + "learning_rate": 3.050122231820454e-05, + "loss": 0.1635, + "step": 27272 + }, + { + "epoch": 0.48644454749759214, + "grad_norm": 0.21146617829799652, + "learning_rate": 3.0499703951756313e-05, + "loss": 0.1373, + "step": 27273 + }, + { + "epoch": 0.4864623836193058, + "grad_norm": 0.25496119260787964, + "learning_rate": 3.049818556398933e-05, + "loss": 0.1046, + "step": 27274 + }, + { + "epoch": 0.4864802197410195, + "grad_norm": 0.2394377738237381, + "learning_rate": 3.049666715490948e-05, + "loss": 0.1295, + "step": 27275 + }, + { + "epoch": 0.4864980558627332, + "grad_norm": 0.26413217186927795, + "learning_rate": 3.049514872452265e-05, + "loss": 0.1242, + "step": 27276 + }, + { + "epoch": 0.4865158919844469, + "grad_norm": 0.24838456511497498, + "learning_rate": 3.0493630272834728e-05, + "loss": 0.1334, + "step": 27277 + }, + { + "epoch": 0.4865337281061606, + "grad_norm": 0.29534652829170227, + "learning_rate": 3.049211179985159e-05, + "loss": 0.1522, + "step": 27278 + }, + { + "epoch": 0.48655156422787427, + "grad_norm": 0.2617117464542389, + "learning_rate": 3.0490593305579136e-05, + "loss": 0.101, + "step": 27279 + }, + { + "epoch": 0.486569400349588, + "grad_norm": 0.310689240694046, + "learning_rate": 3.0489074790023244e-05, + "loss": 0.1596, + "step": 27280 + }, + { + "epoch": 0.4865872364713017, + "grad_norm": 0.30059751868247986, + "learning_rate": 3.0487556253189802e-05, + "loss": 0.1624, + "step": 27281 + }, + { + "epoch": 0.4866050725930154, + "grad_norm": 0.2293773740530014, + "learning_rate": 3.0486037695084697e-05, + "loss": 0.1444, + "step": 27282 + }, + { + "epoch": 0.4866229087147291, + "grad_norm": 0.38112732768058777, + "learning_rate": 3.0484519115713816e-05, + "loss": 0.1805, + "step": 27283 + }, + { + "epoch": 0.48664074483644276, + "grad_norm": 0.23375114798545837, + "learning_rate": 3.048300051508305e-05, + "loss": 0.1461, + "step": 27284 + }, + { + "epoch": 0.48665858095815645, + "grad_norm": 0.3298053443431854, + "learning_rate": 3.0481481893198273e-05, + "loss": 0.1842, + "step": 27285 + }, + { + "epoch": 0.48667641707987014, + "grad_norm": 0.24933935701847076, + "learning_rate": 3.0479963250065378e-05, + "loss": 0.1386, + "step": 27286 + }, + { + "epoch": 0.4866942532015838, + "grad_norm": 0.3003977835178375, + "learning_rate": 3.0478444585690252e-05, + "loss": 0.1364, + "step": 27287 + }, + { + "epoch": 0.48671208932329757, + "grad_norm": 0.30580267310142517, + "learning_rate": 3.047692590007879e-05, + "loss": 0.0891, + "step": 27288 + }, + { + "epoch": 0.48672992544501126, + "grad_norm": 0.2590245306491852, + "learning_rate": 3.0475407193236864e-05, + "loss": 0.156, + "step": 27289 + }, + { + "epoch": 0.48674776156672495, + "grad_norm": 0.2779429256916046, + "learning_rate": 3.0473888465170376e-05, + "loss": 0.1801, + "step": 27290 + }, + { + "epoch": 0.48676559768843863, + "grad_norm": 0.2095423936843872, + "learning_rate": 3.04723697158852e-05, + "loss": 0.1498, + "step": 27291 + }, + { + "epoch": 0.4867834338101523, + "grad_norm": 0.25905272364616394, + "learning_rate": 3.047085094538723e-05, + "loss": 0.1397, + "step": 27292 + }, + { + "epoch": 0.486801269931866, + "grad_norm": 0.23495237529277802, + "learning_rate": 3.0469332153682352e-05, + "loss": 0.1353, + "step": 27293 + }, + { + "epoch": 0.4868191060535797, + "grad_norm": 0.2629874348640442, + "learning_rate": 3.046781334077645e-05, + "loss": 0.1354, + "step": 27294 + }, + { + "epoch": 0.4868369421752934, + "grad_norm": 0.2586022615432739, + "learning_rate": 3.0466294506675417e-05, + "loss": 0.1446, + "step": 27295 + }, + { + "epoch": 0.4868547782970071, + "grad_norm": 0.30887869000434875, + "learning_rate": 3.0464775651385147e-05, + "loss": 0.119, + "step": 27296 + }, + { + "epoch": 0.4868726144187208, + "grad_norm": 0.42592737078666687, + "learning_rate": 3.0463256774911514e-05, + "loss": 0.1737, + "step": 27297 + }, + { + "epoch": 0.4868904505404345, + "grad_norm": 0.23727940022945404, + "learning_rate": 3.046173787726041e-05, + "loss": 0.1334, + "step": 27298 + }, + { + "epoch": 0.4869082866621482, + "grad_norm": 0.2822306454181671, + "learning_rate": 3.046021895843772e-05, + "loss": 0.1536, + "step": 27299 + }, + { + "epoch": 0.4869261227838619, + "grad_norm": 0.3753502666950226, + "learning_rate": 3.0458700018449337e-05, + "loss": 0.1118, + "step": 27300 + }, + { + "epoch": 0.48694395890557557, + "grad_norm": 0.22475706040859222, + "learning_rate": 3.0457181057301154e-05, + "loss": 0.1106, + "step": 27301 + }, + { + "epoch": 0.48696179502728926, + "grad_norm": 0.31406259536743164, + "learning_rate": 3.045566207499904e-05, + "loss": 0.1984, + "step": 27302 + }, + { + "epoch": 0.48697963114900295, + "grad_norm": 0.2767631411552429, + "learning_rate": 3.0454143071548908e-05, + "loss": 0.1253, + "step": 27303 + }, + { + "epoch": 0.48699746727071663, + "grad_norm": 0.2754286527633667, + "learning_rate": 3.0452624046956623e-05, + "loss": 0.1483, + "step": 27304 + }, + { + "epoch": 0.4870153033924304, + "grad_norm": 0.36240652203559875, + "learning_rate": 3.0451105001228097e-05, + "loss": 0.1468, + "step": 27305 + }, + { + "epoch": 0.48703313951414406, + "grad_norm": 0.2550050616264343, + "learning_rate": 3.0449585934369196e-05, + "loss": 0.1389, + "step": 27306 + }, + { + "epoch": 0.48705097563585775, + "grad_norm": 0.20303471386432648, + "learning_rate": 3.0448066846385815e-05, + "loss": 0.1174, + "step": 27307 + }, + { + "epoch": 0.48706881175757144, + "grad_norm": 0.4444245994091034, + "learning_rate": 3.044654773728385e-05, + "loss": 0.1363, + "step": 27308 + }, + { + "epoch": 0.48708664787928513, + "grad_norm": 0.2130480259656906, + "learning_rate": 3.0445028607069188e-05, + "loss": 0.1239, + "step": 27309 + }, + { + "epoch": 0.4871044840009988, + "grad_norm": 0.2550525665283203, + "learning_rate": 3.0443509455747706e-05, + "loss": 0.1328, + "step": 27310 + }, + { + "epoch": 0.4871223201227125, + "grad_norm": 0.2371637523174286, + "learning_rate": 3.0441990283325304e-05, + "loss": 0.1323, + "step": 27311 + }, + { + "epoch": 0.4871401562444262, + "grad_norm": 0.26530876755714417, + "learning_rate": 3.0440471089807876e-05, + "loss": 0.148, + "step": 27312 + }, + { + "epoch": 0.4871579923661399, + "grad_norm": 0.2489183098077774, + "learning_rate": 3.0438951875201293e-05, + "loss": 0.1367, + "step": 27313 + }, + { + "epoch": 0.4871758284878536, + "grad_norm": 0.32706326246261597, + "learning_rate": 3.043743263951146e-05, + "loss": 0.1312, + "step": 27314 + }, + { + "epoch": 0.4871936646095673, + "grad_norm": 0.3285485506057739, + "learning_rate": 3.043591338274425e-05, + "loss": 0.1337, + "step": 27315 + }, + { + "epoch": 0.487211500731281, + "grad_norm": 0.2609952390193939, + "learning_rate": 3.0434394104905577e-05, + "loss": 0.1695, + "step": 27316 + }, + { + "epoch": 0.4872293368529947, + "grad_norm": 0.22378557920455933, + "learning_rate": 3.043287480600131e-05, + "loss": 0.1744, + "step": 27317 + }, + { + "epoch": 0.4872471729747084, + "grad_norm": 0.2997693717479706, + "learning_rate": 3.0431355486037343e-05, + "loss": 0.1289, + "step": 27318 + }, + { + "epoch": 0.48726500909642206, + "grad_norm": 0.39024093747138977, + "learning_rate": 3.0429836145019562e-05, + "loss": 0.2342, + "step": 27319 + }, + { + "epoch": 0.48728284521813575, + "grad_norm": 0.22317062318325043, + "learning_rate": 3.0428316782953863e-05, + "loss": 0.1395, + "step": 27320 + }, + { + "epoch": 0.48730068133984944, + "grad_norm": 0.24340015649795532, + "learning_rate": 3.0426797399846134e-05, + "loss": 0.1506, + "step": 27321 + }, + { + "epoch": 0.4873185174615632, + "grad_norm": 0.34011828899383545, + "learning_rate": 3.0425277995702268e-05, + "loss": 0.117, + "step": 27322 + }, + { + "epoch": 0.48733635358327687, + "grad_norm": 0.333850622177124, + "learning_rate": 3.0423758570528142e-05, + "loss": 0.0828, + "step": 27323 + }, + { + "epoch": 0.48735418970499056, + "grad_norm": 0.34431585669517517, + "learning_rate": 3.0422239124329666e-05, + "loss": 0.0981, + "step": 27324 + }, + { + "epoch": 0.48737202582670425, + "grad_norm": 0.634473979473114, + "learning_rate": 3.0420719657112718e-05, + "loss": 0.1293, + "step": 27325 + }, + { + "epoch": 0.48738986194841794, + "grad_norm": 0.2658355236053467, + "learning_rate": 3.0419200168883176e-05, + "loss": 0.1194, + "step": 27326 + }, + { + "epoch": 0.4874076980701316, + "grad_norm": 0.2890503704547882, + "learning_rate": 3.0417680659646946e-05, + "loss": 0.1215, + "step": 27327 + }, + { + "epoch": 0.4874255341918453, + "grad_norm": 0.305399626493454, + "learning_rate": 3.0416161129409916e-05, + "loss": 0.1407, + "step": 27328 + }, + { + "epoch": 0.487443370313559, + "grad_norm": 0.23517367243766785, + "learning_rate": 3.0414641578177982e-05, + "loss": 0.1261, + "step": 27329 + }, + { + "epoch": 0.4874612064352727, + "grad_norm": 0.2857036888599396, + "learning_rate": 3.041312200595702e-05, + "loss": 0.1355, + "step": 27330 + }, + { + "epoch": 0.48747904255698643, + "grad_norm": 0.24496567249298096, + "learning_rate": 3.0411602412752925e-05, + "loss": 0.1594, + "step": 27331 + }, + { + "epoch": 0.4874968786787001, + "grad_norm": 0.3168342709541321, + "learning_rate": 3.0410082798571593e-05, + "loss": 0.1339, + "step": 27332 + }, + { + "epoch": 0.4875147148004138, + "grad_norm": 0.23017621040344238, + "learning_rate": 3.0408563163418918e-05, + "loss": 0.1385, + "step": 27333 + }, + { + "epoch": 0.4875325509221275, + "grad_norm": 0.2749054729938507, + "learning_rate": 3.0407043507300775e-05, + "loss": 0.1886, + "step": 27334 + }, + { + "epoch": 0.4875503870438412, + "grad_norm": 0.16834156215190887, + "learning_rate": 3.0405523830223065e-05, + "loss": 0.0992, + "step": 27335 + }, + { + "epoch": 0.48756822316555487, + "grad_norm": 0.22413797676563263, + "learning_rate": 3.040400413219168e-05, + "loss": 0.14, + "step": 27336 + }, + { + "epoch": 0.48758605928726856, + "grad_norm": 0.31531214714050293, + "learning_rate": 3.0402484413212513e-05, + "loss": 0.1844, + "step": 27337 + }, + { + "epoch": 0.48760389540898225, + "grad_norm": 0.3473220467567444, + "learning_rate": 3.0400964673291444e-05, + "loss": 0.1437, + "step": 27338 + }, + { + "epoch": 0.487621731530696, + "grad_norm": 0.2839626967906952, + "learning_rate": 3.0399444912434373e-05, + "loss": 0.1666, + "step": 27339 + }, + { + "epoch": 0.4876395676524097, + "grad_norm": 0.23532170057296753, + "learning_rate": 3.0397925130647186e-05, + "loss": 0.1724, + "step": 27340 + }, + { + "epoch": 0.48765740377412337, + "grad_norm": 0.2965671420097351, + "learning_rate": 3.0396405327935778e-05, + "loss": 0.1184, + "step": 27341 + }, + { + "epoch": 0.48767523989583705, + "grad_norm": 0.2658535838127136, + "learning_rate": 3.0394885504306037e-05, + "loss": 0.2124, + "step": 27342 + }, + { + "epoch": 0.48769307601755074, + "grad_norm": 0.21778833866119385, + "learning_rate": 3.0393365659763863e-05, + "loss": 0.1314, + "step": 27343 + }, + { + "epoch": 0.48771091213926443, + "grad_norm": 0.2593882977962494, + "learning_rate": 3.0391845794315137e-05, + "loss": 0.1275, + "step": 27344 + }, + { + "epoch": 0.4877287482609781, + "grad_norm": 0.21858806908130646, + "learning_rate": 3.039032590796575e-05, + "loss": 0.1136, + "step": 27345 + }, + { + "epoch": 0.4877465843826918, + "grad_norm": 0.37158480286598206, + "learning_rate": 3.038880600072161e-05, + "loss": 0.1292, + "step": 27346 + }, + { + "epoch": 0.48776442050440555, + "grad_norm": 0.3079637885093689, + "learning_rate": 3.0387286072588584e-05, + "loss": 0.1451, + "step": 27347 + }, + { + "epoch": 0.48778225662611924, + "grad_norm": 0.3274306356906891, + "learning_rate": 3.038576612357258e-05, + "loss": 0.1498, + "step": 27348 + }, + { + "epoch": 0.4878000927478329, + "grad_norm": 0.3026948571205139, + "learning_rate": 3.0384246153679487e-05, + "loss": 0.11, + "step": 27349 + }, + { + "epoch": 0.4878179288695466, + "grad_norm": 0.3100977838039398, + "learning_rate": 3.0382726162915197e-05, + "loss": 0.1491, + "step": 27350 + }, + { + "epoch": 0.4878357649912603, + "grad_norm": 0.22274494171142578, + "learning_rate": 3.0381206151285607e-05, + "loss": 0.1368, + "step": 27351 + }, + { + "epoch": 0.487853601112974, + "grad_norm": 0.20299595594406128, + "learning_rate": 3.0379686118796596e-05, + "loss": 0.2092, + "step": 27352 + }, + { + "epoch": 0.4878714372346877, + "grad_norm": 0.2284993678331375, + "learning_rate": 3.0378166065454068e-05, + "loss": 0.1335, + "step": 27353 + }, + { + "epoch": 0.48788927335640137, + "grad_norm": 0.264803409576416, + "learning_rate": 3.0376645991263908e-05, + "loss": 0.0985, + "step": 27354 + }, + { + "epoch": 0.48790710947811505, + "grad_norm": 0.22501897811889648, + "learning_rate": 3.037512589623201e-05, + "loss": 0.0989, + "step": 27355 + }, + { + "epoch": 0.4879249455998288, + "grad_norm": 0.2307986468076706, + "learning_rate": 3.037360578036426e-05, + "loss": 0.1323, + "step": 27356 + }, + { + "epoch": 0.4879427817215425, + "grad_norm": 0.33018019795417786, + "learning_rate": 3.0372085643666577e-05, + "loss": 0.137, + "step": 27357 + }, + { + "epoch": 0.4879606178432562, + "grad_norm": 0.2736720144748688, + "learning_rate": 3.037056548614482e-05, + "loss": 0.1962, + "step": 27358 + }, + { + "epoch": 0.48797845396496986, + "grad_norm": 0.25060150027275085, + "learning_rate": 3.03690453078049e-05, + "loss": 0.1238, + "step": 27359 + }, + { + "epoch": 0.48799629008668355, + "grad_norm": 0.26136261224746704, + "learning_rate": 3.0367525108652706e-05, + "loss": 0.1641, + "step": 27360 + }, + { + "epoch": 0.48801412620839724, + "grad_norm": 0.24233616888523102, + "learning_rate": 3.0366004888694132e-05, + "loss": 0.1297, + "step": 27361 + }, + { + "epoch": 0.4880319623301109, + "grad_norm": 0.2826780378818512, + "learning_rate": 3.036448464793507e-05, + "loss": 0.1441, + "step": 27362 + }, + { + "epoch": 0.4880497984518246, + "grad_norm": 0.24363841116428375, + "learning_rate": 3.036296438638141e-05, + "loss": 0.1993, + "step": 27363 + }, + { + "epoch": 0.48806763457353836, + "grad_norm": 0.22785243391990662, + "learning_rate": 3.0361444104039055e-05, + "loss": 0.0982, + "step": 27364 + }, + { + "epoch": 0.48808547069525204, + "grad_norm": 0.21507996320724487, + "learning_rate": 3.035992380091388e-05, + "loss": 0.1331, + "step": 27365 + }, + { + "epoch": 0.48810330681696573, + "grad_norm": 0.19429700076580048, + "learning_rate": 3.0358403477011797e-05, + "loss": 0.1262, + "step": 27366 + }, + { + "epoch": 0.4881211429386794, + "grad_norm": 0.2522944211959839, + "learning_rate": 3.0356883132338687e-05, + "loss": 0.1772, + "step": 27367 + }, + { + "epoch": 0.4881389790603931, + "grad_norm": 0.2989320755004883, + "learning_rate": 3.0355362766900453e-05, + "loss": 0.1165, + "step": 27368 + }, + { + "epoch": 0.4881568151821068, + "grad_norm": 0.3227361738681793, + "learning_rate": 3.0353842380702975e-05, + "loss": 0.0887, + "step": 27369 + }, + { + "epoch": 0.4881746513038205, + "grad_norm": 0.4426630437374115, + "learning_rate": 3.035232197375216e-05, + "loss": 0.1368, + "step": 27370 + }, + { + "epoch": 0.48819248742553417, + "grad_norm": 0.21288444101810455, + "learning_rate": 3.0350801546053898e-05, + "loss": 0.0839, + "step": 27371 + }, + { + "epoch": 0.48821032354724786, + "grad_norm": 0.25482863187789917, + "learning_rate": 3.0349281097614078e-05, + "loss": 0.1225, + "step": 27372 + }, + { + "epoch": 0.4882281596689616, + "grad_norm": 0.2689240872859955, + "learning_rate": 3.0347760628438597e-05, + "loss": 0.1689, + "step": 27373 + }, + { + "epoch": 0.4882459957906753, + "grad_norm": 0.25223878026008606, + "learning_rate": 3.0346240138533354e-05, + "loss": 0.1649, + "step": 27374 + }, + { + "epoch": 0.488263831912389, + "grad_norm": 0.2771747410297394, + "learning_rate": 3.034471962790423e-05, + "loss": 0.1555, + "step": 27375 + }, + { + "epoch": 0.48828166803410267, + "grad_norm": 0.2560770511627197, + "learning_rate": 3.034319909655713e-05, + "loss": 0.1375, + "step": 27376 + }, + { + "epoch": 0.48829950415581636, + "grad_norm": 0.2753375172615051, + "learning_rate": 3.0341678544497947e-05, + "loss": 0.1653, + "step": 27377 + }, + { + "epoch": 0.48831734027753004, + "grad_norm": 0.2662998139858246, + "learning_rate": 3.034015797173257e-05, + "loss": 0.1163, + "step": 27378 + }, + { + "epoch": 0.48833517639924373, + "grad_norm": 0.3011186122894287, + "learning_rate": 3.03386373782669e-05, + "loss": 0.193, + "step": 27379 + }, + { + "epoch": 0.4883530125209574, + "grad_norm": 0.3543972969055176, + "learning_rate": 3.033711676410682e-05, + "loss": 0.1501, + "step": 27380 + }, + { + "epoch": 0.48837084864267116, + "grad_norm": 0.28629258275032043, + "learning_rate": 3.033559612925824e-05, + "loss": 0.107, + "step": 27381 + }, + { + "epoch": 0.48838868476438485, + "grad_norm": 0.3023747205734253, + "learning_rate": 3.033407547372704e-05, + "loss": 0.1242, + "step": 27382 + }, + { + "epoch": 0.48840652088609854, + "grad_norm": 0.20361687242984772, + "learning_rate": 3.0332554797519124e-05, + "loss": 0.1347, + "step": 27383 + }, + { + "epoch": 0.4884243570078122, + "grad_norm": 0.24906346201896667, + "learning_rate": 3.0331034100640383e-05, + "loss": 0.1759, + "step": 27384 + }, + { + "epoch": 0.4884421931295259, + "grad_norm": 0.23788444697856903, + "learning_rate": 3.032951338309672e-05, + "loss": 0.1474, + "step": 27385 + }, + { + "epoch": 0.4884600292512396, + "grad_norm": 0.18916882574558258, + "learning_rate": 3.032799264489401e-05, + "loss": 0.1335, + "step": 27386 + }, + { + "epoch": 0.4884778653729533, + "grad_norm": 0.22644850611686707, + "learning_rate": 3.032647188603817e-05, + "loss": 0.1471, + "step": 27387 + }, + { + "epoch": 0.488495701494667, + "grad_norm": 0.35181525349617004, + "learning_rate": 3.0324951106535082e-05, + "loss": 0.1971, + "step": 27388 + }, + { + "epoch": 0.4885135376163807, + "grad_norm": 0.24308420717716217, + "learning_rate": 3.0323430306390642e-05, + "loss": 0.0956, + "step": 27389 + }, + { + "epoch": 0.4885313737380944, + "grad_norm": 0.36524635553359985, + "learning_rate": 3.032190948561075e-05, + "loss": 0.1359, + "step": 27390 + }, + { + "epoch": 0.4885492098598081, + "grad_norm": 0.2894512414932251, + "learning_rate": 3.03203886442013e-05, + "loss": 0.1718, + "step": 27391 + }, + { + "epoch": 0.4885670459815218, + "grad_norm": 0.262677937746048, + "learning_rate": 3.0318867782168186e-05, + "loss": 0.1546, + "step": 27392 + }, + { + "epoch": 0.4885848821032355, + "grad_norm": 0.31588712334632874, + "learning_rate": 3.0317346899517295e-05, + "loss": 0.1508, + "step": 27393 + }, + { + "epoch": 0.48860271822494916, + "grad_norm": 0.2818751633167267, + "learning_rate": 3.0315825996254542e-05, + "loss": 0.1521, + "step": 27394 + }, + { + "epoch": 0.48862055434666285, + "grad_norm": 0.2366069108247757, + "learning_rate": 3.031430507238581e-05, + "loss": 0.0787, + "step": 27395 + }, + { + "epoch": 0.48863839046837654, + "grad_norm": 0.3727762997150421, + "learning_rate": 3.0312784127916993e-05, + "loss": 0.1801, + "step": 27396 + }, + { + "epoch": 0.4886562265900902, + "grad_norm": 0.29055285453796387, + "learning_rate": 3.031126316285398e-05, + "loss": 0.1692, + "step": 27397 + }, + { + "epoch": 0.48867406271180397, + "grad_norm": 0.2740151882171631, + "learning_rate": 3.0309742177202695e-05, + "loss": 0.1494, + "step": 27398 + }, + { + "epoch": 0.48869189883351766, + "grad_norm": 0.2600267231464386, + "learning_rate": 3.0308221170969002e-05, + "loss": 0.1669, + "step": 27399 + }, + { + "epoch": 0.48870973495523135, + "grad_norm": 0.2821851670742035, + "learning_rate": 3.0306700144158817e-05, + "loss": 0.1363, + "step": 27400 + }, + { + "epoch": 0.48872757107694503, + "grad_norm": 0.3379646837711334, + "learning_rate": 3.0305179096778026e-05, + "loss": 0.1555, + "step": 27401 + }, + { + "epoch": 0.4887454071986587, + "grad_norm": 0.25313127040863037, + "learning_rate": 3.030365802883253e-05, + "loss": 0.143, + "step": 27402 + }, + { + "epoch": 0.4887632433203724, + "grad_norm": 0.21434620022773743, + "learning_rate": 3.0302136940328223e-05, + "loss": 0.1444, + "step": 27403 + }, + { + "epoch": 0.4887810794420861, + "grad_norm": 0.23814326524734497, + "learning_rate": 3.0300615831271e-05, + "loss": 0.1035, + "step": 27404 + }, + { + "epoch": 0.4887989155637998, + "grad_norm": 0.3455928564071655, + "learning_rate": 3.0299094701666765e-05, + "loss": 0.1652, + "step": 27405 + }, + { + "epoch": 0.48881675168551353, + "grad_norm": 0.26397237181663513, + "learning_rate": 3.0297573551521406e-05, + "loss": 0.1655, + "step": 27406 + }, + { + "epoch": 0.4888345878072272, + "grad_norm": 0.29323145747184753, + "learning_rate": 3.0296052380840824e-05, + "loss": 0.1346, + "step": 27407 + }, + { + "epoch": 0.4888524239289409, + "grad_norm": 0.2768794894218445, + "learning_rate": 3.0294531189630908e-05, + "loss": 0.1491, + "step": 27408 + }, + { + "epoch": 0.4888702600506546, + "grad_norm": 0.22584381699562073, + "learning_rate": 3.029300997789757e-05, + "loss": 0.151, + "step": 27409 + }, + { + "epoch": 0.4888880961723683, + "grad_norm": 0.25385189056396484, + "learning_rate": 3.0291488745646685e-05, + "loss": 0.17, + "step": 27410 + }, + { + "epoch": 0.48890593229408197, + "grad_norm": 0.2069319486618042, + "learning_rate": 3.028996749288417e-05, + "loss": 0.1355, + "step": 27411 + }, + { + "epoch": 0.48892376841579566, + "grad_norm": 0.24361161887645721, + "learning_rate": 3.0288446219615906e-05, + "loss": 0.1375, + "step": 27412 + }, + { + "epoch": 0.48894160453750934, + "grad_norm": 0.20962321758270264, + "learning_rate": 3.028692492584781e-05, + "loss": 0.1016, + "step": 27413 + }, + { + "epoch": 0.48895944065922303, + "grad_norm": 0.29396456480026245, + "learning_rate": 3.0285403611585755e-05, + "loss": 0.1891, + "step": 27414 + }, + { + "epoch": 0.4889772767809368, + "grad_norm": 0.2472020983695984, + "learning_rate": 3.0283882276835664e-05, + "loss": 0.1157, + "step": 27415 + }, + { + "epoch": 0.48899511290265046, + "grad_norm": 0.22812482714653015, + "learning_rate": 3.0282360921603414e-05, + "loss": 0.1622, + "step": 27416 + }, + { + "epoch": 0.48901294902436415, + "grad_norm": 0.31664395332336426, + "learning_rate": 3.0280839545894903e-05, + "loss": 0.1012, + "step": 27417 + }, + { + "epoch": 0.48903078514607784, + "grad_norm": 0.2688933312892914, + "learning_rate": 3.0279318149716034e-05, + "loss": 0.1178, + "step": 27418 + }, + { + "epoch": 0.48904862126779153, + "grad_norm": 0.23485180735588074, + "learning_rate": 3.0277796733072707e-05, + "loss": 0.07, + "step": 27419 + }, + { + "epoch": 0.4890664573895052, + "grad_norm": 0.26901087164878845, + "learning_rate": 3.0276275295970823e-05, + "loss": 0.1517, + "step": 27420 + }, + { + "epoch": 0.4890842935112189, + "grad_norm": 0.15468831360340118, + "learning_rate": 3.0274753838416266e-05, + "loss": 0.0828, + "step": 27421 + }, + { + "epoch": 0.4891021296329326, + "grad_norm": 0.2339048832654953, + "learning_rate": 3.027323236041495e-05, + "loss": 0.1433, + "step": 27422 + }, + { + "epoch": 0.48911996575464634, + "grad_norm": 0.2595570981502533, + "learning_rate": 3.0271710861972753e-05, + "loss": 0.1971, + "step": 27423 + }, + { + "epoch": 0.48913780187636, + "grad_norm": 0.26092326641082764, + "learning_rate": 3.0270189343095585e-05, + "loss": 0.1634, + "step": 27424 + }, + { + "epoch": 0.4891556379980737, + "grad_norm": 0.29813894629478455, + "learning_rate": 3.0268667803789347e-05, + "loss": 0.2104, + "step": 27425 + }, + { + "epoch": 0.4891734741197874, + "grad_norm": 0.3102382719516754, + "learning_rate": 3.0267146244059936e-05, + "loss": 0.115, + "step": 27426 + }, + { + "epoch": 0.4891913102415011, + "grad_norm": 0.2704358398914337, + "learning_rate": 3.026562466391324e-05, + "loss": 0.1026, + "step": 27427 + }, + { + "epoch": 0.4892091463632148, + "grad_norm": 0.2531728148460388, + "learning_rate": 3.026410306335517e-05, + "loss": 0.133, + "step": 27428 + }, + { + "epoch": 0.48922698248492846, + "grad_norm": 0.29078641533851624, + "learning_rate": 3.026258144239162e-05, + "loss": 0.1399, + "step": 27429 + }, + { + "epoch": 0.48924481860664215, + "grad_norm": 0.23377011716365814, + "learning_rate": 3.026105980102848e-05, + "loss": 0.1403, + "step": 27430 + }, + { + "epoch": 0.48926265472835584, + "grad_norm": 0.22940313816070557, + "learning_rate": 3.0259538139271652e-05, + "loss": 0.1281, + "step": 27431 + }, + { + "epoch": 0.4892804908500696, + "grad_norm": 0.47694000601768494, + "learning_rate": 3.0258016457127047e-05, + "loss": 0.1888, + "step": 27432 + }, + { + "epoch": 0.48929832697178327, + "grad_norm": 0.3156335949897766, + "learning_rate": 3.025649475460055e-05, + "loss": 0.1128, + "step": 27433 + }, + { + "epoch": 0.48931616309349696, + "grad_norm": 0.2901115119457245, + "learning_rate": 3.0254973031698064e-05, + "loss": 0.1204, + "step": 27434 + }, + { + "epoch": 0.48933399921521065, + "grad_norm": 0.2870045304298401, + "learning_rate": 3.0253451288425493e-05, + "loss": 0.1467, + "step": 27435 + }, + { + "epoch": 0.48935183533692433, + "grad_norm": 0.39747709035873413, + "learning_rate": 3.025192952478872e-05, + "loss": 0.2356, + "step": 27436 + }, + { + "epoch": 0.489369671458638, + "grad_norm": 0.33862990140914917, + "learning_rate": 3.0250407740793664e-05, + "loss": 0.1443, + "step": 27437 + }, + { + "epoch": 0.4893875075803517, + "grad_norm": 0.310565322637558, + "learning_rate": 3.024888593644621e-05, + "loss": 0.1044, + "step": 27438 + }, + { + "epoch": 0.4894053437020654, + "grad_norm": 0.2805270850658417, + "learning_rate": 3.0247364111752258e-05, + "loss": 0.193, + "step": 27439 + }, + { + "epoch": 0.48942317982377914, + "grad_norm": 0.20183080434799194, + "learning_rate": 3.024584226671771e-05, + "loss": 0.1507, + "step": 27440 + }, + { + "epoch": 0.48944101594549283, + "grad_norm": 0.30187568068504333, + "learning_rate": 3.024432040134847e-05, + "loss": 0.1669, + "step": 27441 + }, + { + "epoch": 0.4894588520672065, + "grad_norm": 0.2592521905899048, + "learning_rate": 3.0242798515650435e-05, + "loss": 0.1926, + "step": 27442 + }, + { + "epoch": 0.4894766881889202, + "grad_norm": 0.29906854033470154, + "learning_rate": 3.02412766096295e-05, + "loss": 0.1631, + "step": 27443 + }, + { + "epoch": 0.4894945243106339, + "grad_norm": 0.21281097829341888, + "learning_rate": 3.0239754683291567e-05, + "loss": 0.1203, + "step": 27444 + }, + { + "epoch": 0.4895123604323476, + "grad_norm": 0.38983625173568726, + "learning_rate": 3.023823273664253e-05, + "loss": 0.1442, + "step": 27445 + }, + { + "epoch": 0.48953019655406127, + "grad_norm": 0.23149482905864716, + "learning_rate": 3.023671076968829e-05, + "loss": 0.0654, + "step": 27446 + }, + { + "epoch": 0.48954803267577496, + "grad_norm": 0.2669079601764679, + "learning_rate": 3.0235188782434757e-05, + "loss": 0.1142, + "step": 27447 + }, + { + "epoch": 0.4895658687974887, + "grad_norm": 0.3289227783679962, + "learning_rate": 3.0233666774887833e-05, + "loss": 0.1875, + "step": 27448 + }, + { + "epoch": 0.4895837049192024, + "grad_norm": 0.39887478947639465, + "learning_rate": 3.023214474705339e-05, + "loss": 0.1652, + "step": 27449 + }, + { + "epoch": 0.4896015410409161, + "grad_norm": 0.2522299289703369, + "learning_rate": 3.0230622698937366e-05, + "loss": 0.1658, + "step": 27450 + }, + { + "epoch": 0.48961937716262977, + "grad_norm": 0.2716444730758667, + "learning_rate": 3.0229100630545632e-05, + "loss": 0.1804, + "step": 27451 + }, + { + "epoch": 0.48963721328434345, + "grad_norm": 0.23780399560928345, + "learning_rate": 3.0227578541884095e-05, + "loss": 0.1475, + "step": 27452 + }, + { + "epoch": 0.48965504940605714, + "grad_norm": 0.22552287578582764, + "learning_rate": 3.022605643295866e-05, + "loss": 0.1007, + "step": 27453 + }, + { + "epoch": 0.48967288552777083, + "grad_norm": 0.24677103757858276, + "learning_rate": 3.0224534303775232e-05, + "loss": 0.0985, + "step": 27454 + }, + { + "epoch": 0.4896907216494845, + "grad_norm": 0.2954382598400116, + "learning_rate": 3.0223012154339704e-05, + "loss": 0.14, + "step": 27455 + }, + { + "epoch": 0.4897085577711982, + "grad_norm": 0.27266794443130493, + "learning_rate": 3.0221489984657968e-05, + "loss": 0.1604, + "step": 27456 + }, + { + "epoch": 0.48972639389291195, + "grad_norm": 0.27781030535697937, + "learning_rate": 3.021996779473594e-05, + "loss": 0.1985, + "step": 27457 + }, + { + "epoch": 0.48974423001462564, + "grad_norm": 0.2853183448314667, + "learning_rate": 3.021844558457951e-05, + "loss": 0.1739, + "step": 27458 + }, + { + "epoch": 0.4897620661363393, + "grad_norm": 0.3584592938423157, + "learning_rate": 3.021692335419458e-05, + "loss": 0.1238, + "step": 27459 + }, + { + "epoch": 0.489779902258053, + "grad_norm": 0.21173353493213654, + "learning_rate": 3.021540110358706e-05, + "loss": 0.0968, + "step": 27460 + }, + { + "epoch": 0.4897977383797667, + "grad_norm": 0.4052639901638031, + "learning_rate": 3.0213878832762843e-05, + "loss": 0.221, + "step": 27461 + }, + { + "epoch": 0.4898155745014804, + "grad_norm": 0.25534844398498535, + "learning_rate": 3.0212356541727828e-05, + "loss": 0.0455, + "step": 27462 + }, + { + "epoch": 0.4898334106231941, + "grad_norm": 0.20178954303264618, + "learning_rate": 3.0210834230487923e-05, + "loss": 0.1476, + "step": 27463 + }, + { + "epoch": 0.48985124674490776, + "grad_norm": 0.27798745036125183, + "learning_rate": 3.020931189904902e-05, + "loss": 0.1799, + "step": 27464 + }, + { + "epoch": 0.4898690828666215, + "grad_norm": 0.3439313769340515, + "learning_rate": 3.020778954741703e-05, + "loss": 0.1369, + "step": 27465 + }, + { + "epoch": 0.4898869189883352, + "grad_norm": 0.3136689364910126, + "learning_rate": 3.020626717559784e-05, + "loss": 0.1571, + "step": 27466 + }, + { + "epoch": 0.4899047551100489, + "grad_norm": 0.35819634795188904, + "learning_rate": 3.0204744783597365e-05, + "loss": 0.1008, + "step": 27467 + }, + { + "epoch": 0.48992259123176257, + "grad_norm": 0.291364848613739, + "learning_rate": 3.0203222371421507e-05, + "loss": 0.1599, + "step": 27468 + }, + { + "epoch": 0.48994042735347626, + "grad_norm": 0.2212088406085968, + "learning_rate": 3.0201699939076155e-05, + "loss": 0.1075, + "step": 27469 + }, + { + "epoch": 0.48995826347518995, + "grad_norm": 0.2668302357196808, + "learning_rate": 3.020017748656722e-05, + "loss": 0.1444, + "step": 27470 + }, + { + "epoch": 0.48997609959690364, + "grad_norm": 0.3390072286128998, + "learning_rate": 3.01986550139006e-05, + "loss": 0.1113, + "step": 27471 + }, + { + "epoch": 0.4899939357186173, + "grad_norm": 0.3463013768196106, + "learning_rate": 3.0197132521082204e-05, + "loss": 0.1858, + "step": 27472 + }, + { + "epoch": 0.490011771840331, + "grad_norm": 0.3130691349506378, + "learning_rate": 3.019561000811792e-05, + "loss": 0.176, + "step": 27473 + }, + { + "epoch": 0.49002960796204476, + "grad_norm": 0.32623130083084106, + "learning_rate": 3.0194087475013655e-05, + "loss": 0.0689, + "step": 27474 + }, + { + "epoch": 0.49004744408375844, + "grad_norm": 0.28828662633895874, + "learning_rate": 3.0192564921775312e-05, + "loss": 0.1205, + "step": 27475 + }, + { + "epoch": 0.49006528020547213, + "grad_norm": 0.35746586322784424, + "learning_rate": 3.0191042348408803e-05, + "loss": 0.1345, + "step": 27476 + }, + { + "epoch": 0.4900831163271858, + "grad_norm": 0.2819545269012451, + "learning_rate": 3.0189519754920013e-05, + "loss": 0.1398, + "step": 27477 + }, + { + "epoch": 0.4901009524488995, + "grad_norm": 0.20441186428070068, + "learning_rate": 3.0187997141314862e-05, + "loss": 0.1648, + "step": 27478 + }, + { + "epoch": 0.4901187885706132, + "grad_norm": 0.247260183095932, + "learning_rate": 3.0186474507599234e-05, + "loss": 0.1281, + "step": 27479 + }, + { + "epoch": 0.4901366246923269, + "grad_norm": 0.29136478900909424, + "learning_rate": 3.0184951853779042e-05, + "loss": 0.1359, + "step": 27480 + }, + { + "epoch": 0.49015446081404057, + "grad_norm": 0.23198296129703522, + "learning_rate": 3.018342917986019e-05, + "loss": 0.1378, + "step": 27481 + }, + { + "epoch": 0.4901722969357543, + "grad_norm": 0.24682170152664185, + "learning_rate": 3.0181906485848567e-05, + "loss": 0.1384, + "step": 27482 + }, + { + "epoch": 0.490190133057468, + "grad_norm": 0.2727537155151367, + "learning_rate": 3.0180383771750092e-05, + "loss": 0.1637, + "step": 27483 + }, + { + "epoch": 0.4902079691791817, + "grad_norm": 0.2801114618778229, + "learning_rate": 3.0178861037570655e-05, + "loss": 0.1787, + "step": 27484 + }, + { + "epoch": 0.4902258053008954, + "grad_norm": 0.30123138427734375, + "learning_rate": 3.017733828331617e-05, + "loss": 0.1531, + "step": 27485 + }, + { + "epoch": 0.49024364142260907, + "grad_norm": 0.26603224873542786, + "learning_rate": 3.0175815508992528e-05, + "loss": 0.169, + "step": 27486 + }, + { + "epoch": 0.49026147754432275, + "grad_norm": 0.21042069792747498, + "learning_rate": 3.0174292714605636e-05, + "loss": 0.1038, + "step": 27487 + }, + { + "epoch": 0.49027931366603644, + "grad_norm": 0.2981683909893036, + "learning_rate": 3.01727699001614e-05, + "loss": 0.0856, + "step": 27488 + }, + { + "epoch": 0.49029714978775013, + "grad_norm": 0.2670917510986328, + "learning_rate": 3.0171247065665726e-05, + "loss": 0.1033, + "step": 27489 + }, + { + "epoch": 0.4903149859094638, + "grad_norm": 0.24902333319187164, + "learning_rate": 3.0169724211124506e-05, + "loss": 0.1172, + "step": 27490 + }, + { + "epoch": 0.49033282203117756, + "grad_norm": 0.2571159303188324, + "learning_rate": 3.0168201336543655e-05, + "loss": 0.1391, + "step": 27491 + }, + { + "epoch": 0.49035065815289125, + "grad_norm": 0.22074784338474274, + "learning_rate": 3.0166678441929063e-05, + "loss": 0.1413, + "step": 27492 + }, + { + "epoch": 0.49036849427460494, + "grad_norm": 0.2892563045024872, + "learning_rate": 3.0165155527286653e-05, + "loss": 0.1667, + "step": 27493 + }, + { + "epoch": 0.4903863303963186, + "grad_norm": 0.2722818851470947, + "learning_rate": 3.0163632592622303e-05, + "loss": 0.1388, + "step": 27494 + }, + { + "epoch": 0.4904041665180323, + "grad_norm": 0.2804606258869171, + "learning_rate": 3.0162109637941938e-05, + "loss": 0.1964, + "step": 27495 + }, + { + "epoch": 0.490422002639746, + "grad_norm": 0.3598021864891052, + "learning_rate": 3.0160586663251455e-05, + "loss": 0.1094, + "step": 27496 + }, + { + "epoch": 0.4904398387614597, + "grad_norm": 0.2383718192577362, + "learning_rate": 3.015906366855675e-05, + "loss": 0.1325, + "step": 27497 + }, + { + "epoch": 0.4904576748831734, + "grad_norm": 0.22849123179912567, + "learning_rate": 3.0157540653863736e-05, + "loss": 0.1276, + "step": 27498 + }, + { + "epoch": 0.4904755110048871, + "grad_norm": 0.405903697013855, + "learning_rate": 3.0156017619178307e-05, + "loss": 0.1348, + "step": 27499 + }, + { + "epoch": 0.4904933471266008, + "grad_norm": 0.19192469120025635, + "learning_rate": 3.015449456450638e-05, + "loss": 0.1421, + "step": 27500 + }, + { + "epoch": 0.4905111832483145, + "grad_norm": 0.23690274357795715, + "learning_rate": 3.0152971489853842e-05, + "loss": 0.0872, + "step": 27501 + }, + { + "epoch": 0.4905290193700282, + "grad_norm": 0.2556398808956146, + "learning_rate": 3.0151448395226615e-05, + "loss": 0.1546, + "step": 27502 + }, + { + "epoch": 0.4905468554917419, + "grad_norm": 0.23648172616958618, + "learning_rate": 3.0149925280630593e-05, + "loss": 0.1284, + "step": 27503 + }, + { + "epoch": 0.49056469161345556, + "grad_norm": 0.23430638015270233, + "learning_rate": 3.0148402146071685e-05, + "loss": 0.1498, + "step": 27504 + }, + { + "epoch": 0.49058252773516925, + "grad_norm": 0.24758735299110413, + "learning_rate": 3.0146878991555786e-05, + "loss": 0.1492, + "step": 27505 + }, + { + "epoch": 0.49060036385688294, + "grad_norm": 0.3409648835659027, + "learning_rate": 3.014535581708881e-05, + "loss": 0.1167, + "step": 27506 + }, + { + "epoch": 0.4906181999785967, + "grad_norm": 0.23154838383197784, + "learning_rate": 3.014383262267666e-05, + "loss": 0.115, + "step": 27507 + }, + { + "epoch": 0.49063603610031037, + "grad_norm": 0.2456117570400238, + "learning_rate": 3.014230940832523e-05, + "loss": 0.1259, + "step": 27508 + }, + { + "epoch": 0.49065387222202406, + "grad_norm": 0.41577792167663574, + "learning_rate": 3.0140786174040442e-05, + "loss": 0.1777, + "step": 27509 + }, + { + "epoch": 0.49067170834373774, + "grad_norm": 0.2501739263534546, + "learning_rate": 3.0139262919828186e-05, + "loss": 0.1147, + "step": 27510 + }, + { + "epoch": 0.49068954446545143, + "grad_norm": 0.25977393984794617, + "learning_rate": 3.0137739645694376e-05, + "loss": 0.112, + "step": 27511 + }, + { + "epoch": 0.4907073805871651, + "grad_norm": 0.31532108783721924, + "learning_rate": 3.0136216351644906e-05, + "loss": 0.1576, + "step": 27512 + }, + { + "epoch": 0.4907252167088788, + "grad_norm": 0.2619518041610718, + "learning_rate": 3.0134693037685697e-05, + "loss": 0.1338, + "step": 27513 + }, + { + "epoch": 0.4907430528305925, + "grad_norm": 0.2229187786579132, + "learning_rate": 3.0133169703822634e-05, + "loss": 0.1668, + "step": 27514 + }, + { + "epoch": 0.4907608889523062, + "grad_norm": 0.30031391978263855, + "learning_rate": 3.0131646350061638e-05, + "loss": 0.1124, + "step": 27515 + }, + { + "epoch": 0.49077872507401993, + "grad_norm": 0.2917827069759369, + "learning_rate": 3.0130122976408602e-05, + "loss": 0.1176, + "step": 27516 + }, + { + "epoch": 0.4907965611957336, + "grad_norm": 0.23008359968662262, + "learning_rate": 3.0128599582869445e-05, + "loss": 0.1303, + "step": 27517 + }, + { + "epoch": 0.4908143973174473, + "grad_norm": 0.2603848874568939, + "learning_rate": 3.012707616945006e-05, + "loss": 0.1536, + "step": 27518 + }, + { + "epoch": 0.490832233439161, + "grad_norm": 0.2413429319858551, + "learning_rate": 3.012555273615636e-05, + "loss": 0.1495, + "step": 27519 + }, + { + "epoch": 0.4908500695608747, + "grad_norm": 0.25921598076820374, + "learning_rate": 3.0124029282994247e-05, + "loss": 0.1163, + "step": 27520 + }, + { + "epoch": 0.49086790568258837, + "grad_norm": 0.27238136529922485, + "learning_rate": 3.0122505809969627e-05, + "loss": 0.1933, + "step": 27521 + }, + { + "epoch": 0.49088574180430206, + "grad_norm": 0.28039273619651794, + "learning_rate": 3.0120982317088403e-05, + "loss": 0.1186, + "step": 27522 + }, + { + "epoch": 0.49090357792601574, + "grad_norm": 0.26747840642929077, + "learning_rate": 3.0119458804356483e-05, + "loss": 0.0806, + "step": 27523 + }, + { + "epoch": 0.4909214140477295, + "grad_norm": 0.2762358486652374, + "learning_rate": 3.0117935271779775e-05, + "loss": 0.1591, + "step": 27524 + }, + { + "epoch": 0.4909392501694432, + "grad_norm": 0.28732550144195557, + "learning_rate": 3.011641171936418e-05, + "loss": 0.2027, + "step": 27525 + }, + { + "epoch": 0.49095708629115686, + "grad_norm": 0.31902605295181274, + "learning_rate": 3.0114888147115612e-05, + "loss": 0.1622, + "step": 27526 + }, + { + "epoch": 0.49097492241287055, + "grad_norm": 0.2513086199760437, + "learning_rate": 3.011336455503996e-05, + "loss": 0.1298, + "step": 27527 + }, + { + "epoch": 0.49099275853458424, + "grad_norm": 0.3024000823497772, + "learning_rate": 3.0111840943143145e-05, + "loss": 0.1448, + "step": 27528 + }, + { + "epoch": 0.4910105946562979, + "grad_norm": 0.24043670296669006, + "learning_rate": 3.011031731143107e-05, + "loss": 0.1144, + "step": 27529 + }, + { + "epoch": 0.4910284307780116, + "grad_norm": 0.22187557816505432, + "learning_rate": 3.010879365990964e-05, + "loss": 0.1299, + "step": 27530 + }, + { + "epoch": 0.4910462668997253, + "grad_norm": 0.2667667269706726, + "learning_rate": 3.0107269988584764e-05, + "loss": 0.1575, + "step": 27531 + }, + { + "epoch": 0.491064103021439, + "grad_norm": 0.2809300422668457, + "learning_rate": 3.0105746297462346e-05, + "loss": 0.1256, + "step": 27532 + }, + { + "epoch": 0.49108193914315273, + "grad_norm": 0.30818885564804077, + "learning_rate": 3.0104222586548285e-05, + "loss": 0.1787, + "step": 27533 + }, + { + "epoch": 0.4910997752648664, + "grad_norm": 0.3898325264453888, + "learning_rate": 3.01026988558485e-05, + "loss": 0.1831, + "step": 27534 + }, + { + "epoch": 0.4911176113865801, + "grad_norm": 0.2780079245567322, + "learning_rate": 3.010117510536889e-05, + "loss": 0.1018, + "step": 27535 + }, + { + "epoch": 0.4911354475082938, + "grad_norm": 0.2041061669588089, + "learning_rate": 3.0099651335115364e-05, + "loss": 0.14, + "step": 27536 + }, + { + "epoch": 0.4911532836300075, + "grad_norm": 0.2900209128856659, + "learning_rate": 3.009812754509383e-05, + "loss": 0.1018, + "step": 27537 + }, + { + "epoch": 0.4911711197517212, + "grad_norm": 0.2860986292362213, + "learning_rate": 3.0096603735310187e-05, + "loss": 0.1392, + "step": 27538 + }, + { + "epoch": 0.49118895587343486, + "grad_norm": 0.585249662399292, + "learning_rate": 3.009507990577035e-05, + "loss": 0.1587, + "step": 27539 + }, + { + "epoch": 0.49120679199514855, + "grad_norm": 0.26620766520500183, + "learning_rate": 3.0093556056480227e-05, + "loss": 0.1084, + "step": 27540 + }, + { + "epoch": 0.4912246281168623, + "grad_norm": 0.2916359603404999, + "learning_rate": 3.0092032187445725e-05, + "loss": 0.1641, + "step": 27541 + }, + { + "epoch": 0.491242464238576, + "grad_norm": 0.2687723934650421, + "learning_rate": 3.0090508298672736e-05, + "loss": 0.1353, + "step": 27542 + }, + { + "epoch": 0.49126030036028967, + "grad_norm": 0.22274333238601685, + "learning_rate": 3.0088984390167186e-05, + "loss": 0.1402, + "step": 27543 + }, + { + "epoch": 0.49127813648200336, + "grad_norm": 0.2707686722278595, + "learning_rate": 3.0087460461934968e-05, + "loss": 0.1581, + "step": 27544 + }, + { + "epoch": 0.49129597260371705, + "grad_norm": 0.2288791686296463, + "learning_rate": 3.0085936513982006e-05, + "loss": 0.1128, + "step": 27545 + }, + { + "epoch": 0.49131380872543073, + "grad_norm": 0.2257990837097168, + "learning_rate": 3.008441254631419e-05, + "loss": 0.1302, + "step": 27546 + }, + { + "epoch": 0.4913316448471444, + "grad_norm": 0.2584269046783447, + "learning_rate": 3.0082888558937438e-05, + "loss": 0.1727, + "step": 27547 + }, + { + "epoch": 0.4913494809688581, + "grad_norm": 0.24260324239730835, + "learning_rate": 3.008136455185766e-05, + "loss": 0.1493, + "step": 27548 + }, + { + "epoch": 0.49136731709057185, + "grad_norm": 0.30001112818717957, + "learning_rate": 3.0079840525080748e-05, + "loss": 0.1934, + "step": 27549 + }, + { + "epoch": 0.49138515321228554, + "grad_norm": 0.24751876294612885, + "learning_rate": 3.0078316478612623e-05, + "loss": 0.1142, + "step": 27550 + }, + { + "epoch": 0.49140298933399923, + "grad_norm": 0.3574122190475464, + "learning_rate": 3.0076792412459188e-05, + "loss": 0.1174, + "step": 27551 + }, + { + "epoch": 0.4914208254557129, + "grad_norm": 0.22561073303222656, + "learning_rate": 3.0075268326626356e-05, + "loss": 0.1648, + "step": 27552 + }, + { + "epoch": 0.4914386615774266, + "grad_norm": 0.25146549940109253, + "learning_rate": 3.0073744221120025e-05, + "loss": 0.1655, + "step": 27553 + }, + { + "epoch": 0.4914564976991403, + "grad_norm": 0.19077861309051514, + "learning_rate": 3.007222009594612e-05, + "loss": 0.1548, + "step": 27554 + }, + { + "epoch": 0.491474333820854, + "grad_norm": 0.2860119640827179, + "learning_rate": 3.0070695951110528e-05, + "loss": 0.159, + "step": 27555 + }, + { + "epoch": 0.49149216994256767, + "grad_norm": 0.1798303872346878, + "learning_rate": 3.0069171786619167e-05, + "loss": 0.136, + "step": 27556 + }, + { + "epoch": 0.49151000606428136, + "grad_norm": 0.25867077708244324, + "learning_rate": 3.0067647602477946e-05, + "loss": 0.1385, + "step": 27557 + }, + { + "epoch": 0.4915278421859951, + "grad_norm": 0.24283356964588165, + "learning_rate": 3.006612339869278e-05, + "loss": 0.1396, + "step": 27558 + }, + { + "epoch": 0.4915456783077088, + "grad_norm": 0.20348159968852997, + "learning_rate": 3.006459917526956e-05, + "loss": 0.097, + "step": 27559 + }, + { + "epoch": 0.4915635144294225, + "grad_norm": 0.31267812848091125, + "learning_rate": 3.0063074932214215e-05, + "loss": 0.1396, + "step": 27560 + }, + { + "epoch": 0.49158135055113616, + "grad_norm": 0.39648309350013733, + "learning_rate": 3.006155066953264e-05, + "loss": 0.1681, + "step": 27561 + }, + { + "epoch": 0.49159918667284985, + "grad_norm": 0.2350505292415619, + "learning_rate": 3.006002638723074e-05, + "loss": 0.157, + "step": 27562 + }, + { + "epoch": 0.49161702279456354, + "grad_norm": 0.2715829908847809, + "learning_rate": 3.005850208531443e-05, + "loss": 0.2128, + "step": 27563 + }, + { + "epoch": 0.49163485891627723, + "grad_norm": 0.3013695180416107, + "learning_rate": 3.0056977763789623e-05, + "loss": 0.1759, + "step": 27564 + }, + { + "epoch": 0.4916526950379909, + "grad_norm": 0.17881599068641663, + "learning_rate": 3.0055453422662223e-05, + "loss": 0.1114, + "step": 27565 + }, + { + "epoch": 0.49167053115970466, + "grad_norm": 0.23384027183055878, + "learning_rate": 3.0053929061938135e-05, + "loss": 0.1236, + "step": 27566 + }, + { + "epoch": 0.49168836728141835, + "grad_norm": 0.3760800063610077, + "learning_rate": 3.005240468162328e-05, + "loss": 0.17, + "step": 27567 + }, + { + "epoch": 0.49170620340313204, + "grad_norm": 0.3806721270084381, + "learning_rate": 3.005088028172356e-05, + "loss": 0.1155, + "step": 27568 + }, + { + "epoch": 0.4917240395248457, + "grad_norm": 0.3928253650665283, + "learning_rate": 3.004935586224488e-05, + "loss": 0.155, + "step": 27569 + }, + { + "epoch": 0.4917418756465594, + "grad_norm": 0.31668326258659363, + "learning_rate": 3.004783142319315e-05, + "loss": 0.0758, + "step": 27570 + }, + { + "epoch": 0.4917597117682731, + "grad_norm": 0.2437426745891571, + "learning_rate": 3.0046306964574283e-05, + "loss": 0.0983, + "step": 27571 + }, + { + "epoch": 0.4917775478899868, + "grad_norm": 0.26245933771133423, + "learning_rate": 3.0044782486394184e-05, + "loss": 0.1463, + "step": 27572 + }, + { + "epoch": 0.4917953840117005, + "grad_norm": 0.26438385248184204, + "learning_rate": 3.0043257988658774e-05, + "loss": 0.1379, + "step": 27573 + }, + { + "epoch": 0.49181322013341416, + "grad_norm": 0.237799271941185, + "learning_rate": 3.0041733471373955e-05, + "loss": 0.1099, + "step": 27574 + }, + { + "epoch": 0.4918310562551279, + "grad_norm": 0.33988502621650696, + "learning_rate": 3.0040208934545626e-05, + "loss": 0.2688, + "step": 27575 + }, + { + "epoch": 0.4918488923768416, + "grad_norm": 0.23251087963581085, + "learning_rate": 3.0038684378179717e-05, + "loss": 0.1604, + "step": 27576 + }, + { + "epoch": 0.4918667284985553, + "grad_norm": 0.27852708101272583, + "learning_rate": 3.0037159802282122e-05, + "loss": 0.1308, + "step": 27577 + }, + { + "epoch": 0.49188456462026897, + "grad_norm": 0.20254004001617432, + "learning_rate": 3.0035635206858747e-05, + "loss": 0.1465, + "step": 27578 + }, + { + "epoch": 0.49190240074198266, + "grad_norm": 0.24009934067726135, + "learning_rate": 3.0034110591915522e-05, + "loss": 0.1421, + "step": 27579 + }, + { + "epoch": 0.49192023686369635, + "grad_norm": 0.2905464172363281, + "learning_rate": 3.0032585957458342e-05, + "loss": 0.1713, + "step": 27580 + }, + { + "epoch": 0.49193807298541004, + "grad_norm": 0.2547030448913574, + "learning_rate": 3.003106130349312e-05, + "loss": 0.1582, + "step": 27581 + }, + { + "epoch": 0.4919559091071237, + "grad_norm": 0.24700869619846344, + "learning_rate": 3.0029536630025772e-05, + "loss": 0.1137, + "step": 27582 + }, + { + "epoch": 0.49197374522883747, + "grad_norm": 0.2629300355911255, + "learning_rate": 3.0028011937062193e-05, + "loss": 0.1325, + "step": 27583 + }, + { + "epoch": 0.49199158135055115, + "grad_norm": 0.44348493218421936, + "learning_rate": 3.0026487224608312e-05, + "loss": 0.1508, + "step": 27584 + }, + { + "epoch": 0.49200941747226484, + "grad_norm": 0.40411919355392456, + "learning_rate": 3.0024962492670023e-05, + "loss": 0.1575, + "step": 27585 + }, + { + "epoch": 0.49202725359397853, + "grad_norm": 0.2483748495578766, + "learning_rate": 3.0023437741253253e-05, + "loss": 0.1438, + "step": 27586 + }, + { + "epoch": 0.4920450897156922, + "grad_norm": 0.29653963446617126, + "learning_rate": 3.0021912970363903e-05, + "loss": 0.1372, + "step": 27587 + }, + { + "epoch": 0.4920629258374059, + "grad_norm": 0.20083899796009064, + "learning_rate": 3.0020388180007876e-05, + "loss": 0.1275, + "step": 27588 + }, + { + "epoch": 0.4920807619591196, + "grad_norm": 0.22859717905521393, + "learning_rate": 3.0018863370191098e-05, + "loss": 0.1642, + "step": 27589 + }, + { + "epoch": 0.4920985980808333, + "grad_norm": 0.3652629256248474, + "learning_rate": 3.0017338540919464e-05, + "loss": 0.1873, + "step": 27590 + }, + { + "epoch": 0.49211643420254697, + "grad_norm": 0.20172858238220215, + "learning_rate": 3.001581369219889e-05, + "loss": 0.1098, + "step": 27591 + }, + { + "epoch": 0.4921342703242607, + "grad_norm": 0.27154991030693054, + "learning_rate": 3.0014288824035297e-05, + "loss": 0.1451, + "step": 27592 + }, + { + "epoch": 0.4921521064459744, + "grad_norm": 0.24188734591007233, + "learning_rate": 3.0012763936434592e-05, + "loss": 0.1138, + "step": 27593 + }, + { + "epoch": 0.4921699425676881, + "grad_norm": 0.2573421001434326, + "learning_rate": 3.0011239029402677e-05, + "loss": 0.1287, + "step": 27594 + }, + { + "epoch": 0.4921877786894018, + "grad_norm": 0.22781307995319366, + "learning_rate": 3.0009714102945474e-05, + "loss": 0.104, + "step": 27595 + }, + { + "epoch": 0.49220561481111547, + "grad_norm": 0.2729334831237793, + "learning_rate": 3.0008189157068882e-05, + "loss": 0.1604, + "step": 27596 + }, + { + "epoch": 0.49222345093282915, + "grad_norm": 0.29051733016967773, + "learning_rate": 3.0006664191778827e-05, + "loss": 0.1406, + "step": 27597 + }, + { + "epoch": 0.49224128705454284, + "grad_norm": 0.3005540668964386, + "learning_rate": 3.00051392070812e-05, + "loss": 0.1445, + "step": 27598 + }, + { + "epoch": 0.49225912317625653, + "grad_norm": 0.33935075998306274, + "learning_rate": 3.0003614202981932e-05, + "loss": 0.1756, + "step": 27599 + }, + { + "epoch": 0.4922769592979703, + "grad_norm": 0.22830133140087128, + "learning_rate": 3.000208917948693e-05, + "loss": 0.095, + "step": 27600 + }, + { + "epoch": 0.49229479541968396, + "grad_norm": 0.24258065223693848, + "learning_rate": 3.0000564136602098e-05, + "loss": 0.1652, + "step": 27601 + }, + { + "epoch": 0.49231263154139765, + "grad_norm": 0.3200227916240692, + "learning_rate": 2.9999039074333357e-05, + "loss": 0.1237, + "step": 27602 + }, + { + "epoch": 0.49233046766311134, + "grad_norm": 0.2693004012107849, + "learning_rate": 2.999751399268661e-05, + "loss": 0.1336, + "step": 27603 + }, + { + "epoch": 0.492348303784825, + "grad_norm": 0.29738491773605347, + "learning_rate": 2.999598889166778e-05, + "loss": 0.1072, + "step": 27604 + }, + { + "epoch": 0.4923661399065387, + "grad_norm": 0.26984646916389465, + "learning_rate": 2.9994463771282755e-05, + "loss": 0.1046, + "step": 27605 + }, + { + "epoch": 0.4923839760282524, + "grad_norm": 0.32763803005218506, + "learning_rate": 2.9992938631537482e-05, + "loss": 0.1474, + "step": 27606 + }, + { + "epoch": 0.4924018121499661, + "grad_norm": 0.30050089955329895, + "learning_rate": 2.9991413472437842e-05, + "loss": 0.104, + "step": 27607 + }, + { + "epoch": 0.49241964827167983, + "grad_norm": 0.2945813536643982, + "learning_rate": 2.9989888293989763e-05, + "loss": 0.125, + "step": 27608 + }, + { + "epoch": 0.4924374843933935, + "grad_norm": 0.26110172271728516, + "learning_rate": 2.9988363096199156e-05, + "loss": 0.1211, + "step": 27609 + }, + { + "epoch": 0.4924553205151072, + "grad_norm": 0.26588115096092224, + "learning_rate": 2.998683787907193e-05, + "loss": 0.1314, + "step": 27610 + }, + { + "epoch": 0.4924731566368209, + "grad_norm": 0.23548169434070587, + "learning_rate": 2.9985312642613995e-05, + "loss": 0.1015, + "step": 27611 + }, + { + "epoch": 0.4924909927585346, + "grad_norm": 0.3043283522129059, + "learning_rate": 2.998378738683127e-05, + "loss": 0.1502, + "step": 27612 + }, + { + "epoch": 0.4925088288802483, + "grad_norm": 0.29870033264160156, + "learning_rate": 2.9982262111729665e-05, + "loss": 0.1486, + "step": 27613 + }, + { + "epoch": 0.49252666500196196, + "grad_norm": 0.25260472297668457, + "learning_rate": 2.9980736817315082e-05, + "loss": 0.115, + "step": 27614 + }, + { + "epoch": 0.49254450112367565, + "grad_norm": 0.29032596945762634, + "learning_rate": 2.9979211503593453e-05, + "loss": 0.1479, + "step": 27615 + }, + { + "epoch": 0.49256233724538934, + "grad_norm": 0.35725653171539307, + "learning_rate": 2.9977686170570675e-05, + "loss": 0.1433, + "step": 27616 + }, + { + "epoch": 0.4925801733671031, + "grad_norm": 0.20266664028167725, + "learning_rate": 2.997616081825267e-05, + "loss": 0.1032, + "step": 27617 + }, + { + "epoch": 0.49259800948881677, + "grad_norm": 0.2522314488887787, + "learning_rate": 2.9974635446645344e-05, + "loss": 0.1271, + "step": 27618 + }, + { + "epoch": 0.49261584561053046, + "grad_norm": 0.3580878973007202, + "learning_rate": 2.9973110055754612e-05, + "loss": 0.209, + "step": 27619 + }, + { + "epoch": 0.49263368173224414, + "grad_norm": 0.19904747605323792, + "learning_rate": 2.9971584645586387e-05, + "loss": 0.1029, + "step": 27620 + }, + { + "epoch": 0.49265151785395783, + "grad_norm": 0.27453264594078064, + "learning_rate": 2.9970059216146583e-05, + "loss": 0.1591, + "step": 27621 + }, + { + "epoch": 0.4926693539756715, + "grad_norm": 0.22794201970100403, + "learning_rate": 2.9968533767441113e-05, + "loss": 0.1569, + "step": 27622 + }, + { + "epoch": 0.4926871900973852, + "grad_norm": 0.21620889008045197, + "learning_rate": 2.9967008299475896e-05, + "loss": 0.1096, + "step": 27623 + }, + { + "epoch": 0.4927050262190989, + "grad_norm": 0.34933048486709595, + "learning_rate": 2.9965482812256834e-05, + "loss": 0.142, + "step": 27624 + }, + { + "epoch": 0.49272286234081264, + "grad_norm": 0.20878398418426514, + "learning_rate": 2.9963957305789853e-05, + "loss": 0.1291, + "step": 27625 + }, + { + "epoch": 0.4927406984625263, + "grad_norm": 0.30119627714157104, + "learning_rate": 2.9962431780080845e-05, + "loss": 0.1504, + "step": 27626 + }, + { + "epoch": 0.49275853458424, + "grad_norm": 0.23445358872413635, + "learning_rate": 2.9960906235135743e-05, + "loss": 0.133, + "step": 27627 + }, + { + "epoch": 0.4927763707059537, + "grad_norm": 0.27041730284690857, + "learning_rate": 2.9959380670960464e-05, + "loss": 0.147, + "step": 27628 + }, + { + "epoch": 0.4927942068276674, + "grad_norm": 0.34073033928871155, + "learning_rate": 2.9957855087560903e-05, + "loss": 0.1634, + "step": 27629 + }, + { + "epoch": 0.4928120429493811, + "grad_norm": 0.30418482422828674, + "learning_rate": 2.995632948494299e-05, + "loss": 0.1616, + "step": 27630 + }, + { + "epoch": 0.49282987907109477, + "grad_norm": 0.32524460554122925, + "learning_rate": 2.9954803863112622e-05, + "loss": 0.2172, + "step": 27631 + }, + { + "epoch": 0.49284771519280846, + "grad_norm": 0.26666396856307983, + "learning_rate": 2.9953278222075738e-05, + "loss": 0.1423, + "step": 27632 + }, + { + "epoch": 0.49286555131452214, + "grad_norm": 0.4993481934070587, + "learning_rate": 2.995175256183822e-05, + "loss": 0.1531, + "step": 27633 + }, + { + "epoch": 0.4928833874362359, + "grad_norm": 0.18934115767478943, + "learning_rate": 2.9950226882406006e-05, + "loss": 0.1076, + "step": 27634 + }, + { + "epoch": 0.4929012235579496, + "grad_norm": 0.23731137812137604, + "learning_rate": 2.9948701183785e-05, + "loss": 0.1318, + "step": 27635 + }, + { + "epoch": 0.49291905967966326, + "grad_norm": 0.23826810717582703, + "learning_rate": 2.9947175465981124e-05, + "loss": 0.146, + "step": 27636 + }, + { + "epoch": 0.49293689580137695, + "grad_norm": 0.3153833746910095, + "learning_rate": 2.9945649729000286e-05, + "loss": 0.132, + "step": 27637 + }, + { + "epoch": 0.49295473192309064, + "grad_norm": 0.2083572894334793, + "learning_rate": 2.9944123972848402e-05, + "loss": 0.1338, + "step": 27638 + }, + { + "epoch": 0.4929725680448043, + "grad_norm": 0.37656208872795105, + "learning_rate": 2.9942598197531385e-05, + "loss": 0.1179, + "step": 27639 + }, + { + "epoch": 0.492990404166518, + "grad_norm": 0.2984860837459564, + "learning_rate": 2.994107240305515e-05, + "loss": 0.1154, + "step": 27640 + }, + { + "epoch": 0.4930082402882317, + "grad_norm": 0.3810182809829712, + "learning_rate": 2.9939546589425616e-05, + "loss": 0.1936, + "step": 27641 + }, + { + "epoch": 0.49302607640994545, + "grad_norm": 0.25900816917419434, + "learning_rate": 2.993802075664869e-05, + "loss": 0.1149, + "step": 27642 + }, + { + "epoch": 0.49304391253165913, + "grad_norm": 0.35352107882499695, + "learning_rate": 2.9936494904730288e-05, + "loss": 0.1303, + "step": 27643 + }, + { + "epoch": 0.4930617486533728, + "grad_norm": 0.1907660961151123, + "learning_rate": 2.9934969033676325e-05, + "loss": 0.1175, + "step": 27644 + }, + { + "epoch": 0.4930795847750865, + "grad_norm": 0.3007625639438629, + "learning_rate": 2.9933443143492722e-05, + "loss": 0.1575, + "step": 27645 + }, + { + "epoch": 0.4930974208968002, + "grad_norm": 0.21162590384483337, + "learning_rate": 2.9931917234185393e-05, + "loss": 0.111, + "step": 27646 + }, + { + "epoch": 0.4931152570185139, + "grad_norm": 0.2759343385696411, + "learning_rate": 2.993039130576024e-05, + "loss": 0.1182, + "step": 27647 + }, + { + "epoch": 0.4931330931402276, + "grad_norm": 0.31473293900489807, + "learning_rate": 2.992886535822319e-05, + "loss": 0.179, + "step": 27648 + }, + { + "epoch": 0.49315092926194126, + "grad_norm": 0.24355410039424896, + "learning_rate": 2.9927339391580157e-05, + "loss": 0.1423, + "step": 27649 + }, + { + "epoch": 0.493168765383655, + "grad_norm": 0.2825729548931122, + "learning_rate": 2.9925813405837056e-05, + "loss": 0.1715, + "step": 27650 + }, + { + "epoch": 0.4931866015053687, + "grad_norm": 0.2509865462779999, + "learning_rate": 2.9924287400999802e-05, + "loss": 0.1121, + "step": 27651 + }, + { + "epoch": 0.4932044376270824, + "grad_norm": 0.26842087507247925, + "learning_rate": 2.992276137707431e-05, + "loss": 0.1387, + "step": 27652 + }, + { + "epoch": 0.49322227374879607, + "grad_norm": 0.30520492792129517, + "learning_rate": 2.9921235334066487e-05, + "loss": 0.1151, + "step": 27653 + }, + { + "epoch": 0.49324010987050976, + "grad_norm": 0.22917208075523376, + "learning_rate": 2.991970927198226e-05, + "loss": 0.1321, + "step": 27654 + }, + { + "epoch": 0.49325794599222345, + "grad_norm": 0.2446434497833252, + "learning_rate": 2.9918183190827542e-05, + "loss": 0.1225, + "step": 27655 + }, + { + "epoch": 0.49327578211393713, + "grad_norm": 0.20324060320854187, + "learning_rate": 2.9916657090608246e-05, + "loss": 0.1208, + "step": 27656 + }, + { + "epoch": 0.4932936182356508, + "grad_norm": 0.2528868317604065, + "learning_rate": 2.9915130971330292e-05, + "loss": 0.1561, + "step": 27657 + }, + { + "epoch": 0.4933114543573645, + "grad_norm": 0.29077768325805664, + "learning_rate": 2.9913604832999592e-05, + "loss": 0.1313, + "step": 27658 + }, + { + "epoch": 0.49332929047907825, + "grad_norm": 0.43253135681152344, + "learning_rate": 2.991207867562206e-05, + "loss": 0.1786, + "step": 27659 + }, + { + "epoch": 0.49334712660079194, + "grad_norm": 0.2216760367155075, + "learning_rate": 2.9910552499203614e-05, + "loss": 0.1437, + "step": 27660 + }, + { + "epoch": 0.49336496272250563, + "grad_norm": 0.2433215230703354, + "learning_rate": 2.9909026303750164e-05, + "loss": 0.1511, + "step": 27661 + }, + { + "epoch": 0.4933827988442193, + "grad_norm": 0.2987543046474457, + "learning_rate": 2.9907500089267648e-05, + "loss": 0.1448, + "step": 27662 + }, + { + "epoch": 0.493400634965933, + "grad_norm": 0.22920256853103638, + "learning_rate": 2.9905973855761955e-05, + "loss": 0.1385, + "step": 27663 + }, + { + "epoch": 0.4934184710876467, + "grad_norm": 0.28606343269348145, + "learning_rate": 2.9904447603239023e-05, + "loss": 0.1198, + "step": 27664 + }, + { + "epoch": 0.4934363072093604, + "grad_norm": 0.3272809088230133, + "learning_rate": 2.9902921331704752e-05, + "loss": 0.1339, + "step": 27665 + }, + { + "epoch": 0.49345414333107407, + "grad_norm": 0.26525336503982544, + "learning_rate": 2.9901395041165064e-05, + "loss": 0.1417, + "step": 27666 + }, + { + "epoch": 0.4934719794527878, + "grad_norm": 0.2768530547618866, + "learning_rate": 2.9899868731625874e-05, + "loss": 0.177, + "step": 27667 + }, + { + "epoch": 0.4934898155745015, + "grad_norm": 0.3807823657989502, + "learning_rate": 2.98983424030931e-05, + "loss": 0.2308, + "step": 27668 + }, + { + "epoch": 0.4935076516962152, + "grad_norm": 0.24528251588344574, + "learning_rate": 2.9896816055572668e-05, + "loss": 0.1171, + "step": 27669 + }, + { + "epoch": 0.4935254878179289, + "grad_norm": 0.24487866461277008, + "learning_rate": 2.9895289689070476e-05, + "loss": 0.1553, + "step": 27670 + }, + { + "epoch": 0.49354332393964256, + "grad_norm": 0.3777882754802704, + "learning_rate": 2.989376330359246e-05, + "loss": 0.0869, + "step": 27671 + }, + { + "epoch": 0.49356116006135625, + "grad_norm": 0.25599420070648193, + "learning_rate": 2.9892236899144515e-05, + "loss": 0.1539, + "step": 27672 + }, + { + "epoch": 0.49357899618306994, + "grad_norm": 0.31662672758102417, + "learning_rate": 2.989071047573258e-05, + "loss": 0.1741, + "step": 27673 + }, + { + "epoch": 0.49359683230478363, + "grad_norm": 0.31823235750198364, + "learning_rate": 2.9889184033362556e-05, + "loss": 0.1319, + "step": 27674 + }, + { + "epoch": 0.4936146684264973, + "grad_norm": 0.2882575988769531, + "learning_rate": 2.9887657572040368e-05, + "loss": 0.1962, + "step": 27675 + }, + { + "epoch": 0.49363250454821106, + "grad_norm": 0.2059096395969391, + "learning_rate": 2.988613109177193e-05, + "loss": 0.1061, + "step": 27676 + }, + { + "epoch": 0.49365034066992475, + "grad_norm": 0.17289742827415466, + "learning_rate": 2.9884604592563165e-05, + "loss": 0.0821, + "step": 27677 + }, + { + "epoch": 0.49366817679163844, + "grad_norm": 0.23387284576892853, + "learning_rate": 2.9883078074419984e-05, + "loss": 0.1192, + "step": 27678 + }, + { + "epoch": 0.4936860129133521, + "grad_norm": 0.2925841212272644, + "learning_rate": 2.98815515373483e-05, + "loss": 0.1517, + "step": 27679 + }, + { + "epoch": 0.4937038490350658, + "grad_norm": 0.22345633804798126, + "learning_rate": 2.9880024981354044e-05, + "loss": 0.1276, + "step": 27680 + }, + { + "epoch": 0.4937216851567795, + "grad_norm": 0.4029824733734131, + "learning_rate": 2.9878498406443122e-05, + "loss": 0.1731, + "step": 27681 + }, + { + "epoch": 0.4937395212784932, + "grad_norm": 0.224434956908226, + "learning_rate": 2.987697181262145e-05, + "loss": 0.1406, + "step": 27682 + }, + { + "epoch": 0.4937573574002069, + "grad_norm": 0.388540118932724, + "learning_rate": 2.987544519989496e-05, + "loss": 0.153, + "step": 27683 + }, + { + "epoch": 0.4937751935219206, + "grad_norm": 0.30785760283470154, + "learning_rate": 2.9873918568269554e-05, + "loss": 0.1888, + "step": 27684 + }, + { + "epoch": 0.4937930296436343, + "grad_norm": 0.24741925299167633, + "learning_rate": 2.9872391917751156e-05, + "loss": 0.0887, + "step": 27685 + }, + { + "epoch": 0.493810865765348, + "grad_norm": 0.1698487251996994, + "learning_rate": 2.9870865248345693e-05, + "loss": 0.0701, + "step": 27686 + }, + { + "epoch": 0.4938287018870617, + "grad_norm": 0.2524876892566681, + "learning_rate": 2.986933856005906e-05, + "loss": 0.133, + "step": 27687 + }, + { + "epoch": 0.49384653800877537, + "grad_norm": 0.23333775997161865, + "learning_rate": 2.98678118528972e-05, + "loss": 0.1252, + "step": 27688 + }, + { + "epoch": 0.49386437413048906, + "grad_norm": 0.25426897406578064, + "learning_rate": 2.986628512686601e-05, + "loss": 0.1375, + "step": 27689 + }, + { + "epoch": 0.49388221025220275, + "grad_norm": 0.2026497721672058, + "learning_rate": 2.986475838197143e-05, + "loss": 0.1332, + "step": 27690 + }, + { + "epoch": 0.49390004637391643, + "grad_norm": 0.19517701864242554, + "learning_rate": 2.9863231618219367e-05, + "loss": 0.1386, + "step": 27691 + }, + { + "epoch": 0.4939178824956301, + "grad_norm": 0.22057275474071503, + "learning_rate": 2.986170483561573e-05, + "loss": 0.1362, + "step": 27692 + }, + { + "epoch": 0.49393571861734387, + "grad_norm": 0.23425064980983734, + "learning_rate": 2.9860178034166452e-05, + "loss": 0.1142, + "step": 27693 + }, + { + "epoch": 0.49395355473905755, + "grad_norm": 0.3758876621723175, + "learning_rate": 2.9858651213877437e-05, + "loss": 0.14, + "step": 27694 + }, + { + "epoch": 0.49397139086077124, + "grad_norm": 0.19376304745674133, + "learning_rate": 2.9857124374754615e-05, + "loss": 0.1434, + "step": 27695 + }, + { + "epoch": 0.49398922698248493, + "grad_norm": 0.28832682967185974, + "learning_rate": 2.9855597516803903e-05, + "loss": 0.1327, + "step": 27696 + }, + { + "epoch": 0.4940070631041986, + "grad_norm": 0.28042230010032654, + "learning_rate": 2.9854070640031218e-05, + "loss": 0.1753, + "step": 27697 + }, + { + "epoch": 0.4940248992259123, + "grad_norm": 0.2371305227279663, + "learning_rate": 2.9852543744442475e-05, + "loss": 0.1001, + "step": 27698 + }, + { + "epoch": 0.494042735347626, + "grad_norm": 0.24829888343811035, + "learning_rate": 2.9851016830043604e-05, + "loss": 0.0878, + "step": 27699 + }, + { + "epoch": 0.4940605714693397, + "grad_norm": 0.33452898263931274, + "learning_rate": 2.984948989684051e-05, + "loss": 0.1556, + "step": 27700 + }, + { + "epoch": 0.4940784075910534, + "grad_norm": 0.3049828112125397, + "learning_rate": 2.9847962944839126e-05, + "loss": 0.1314, + "step": 27701 + }, + { + "epoch": 0.4940962437127671, + "grad_norm": 0.28771519660949707, + "learning_rate": 2.984643597404535e-05, + "loss": 0.1602, + "step": 27702 + }, + { + "epoch": 0.4941140798344808, + "grad_norm": 0.19846481084823608, + "learning_rate": 2.9844908984465125e-05, + "loss": 0.1109, + "step": 27703 + }, + { + "epoch": 0.4941319159561945, + "grad_norm": 0.343536376953125, + "learning_rate": 2.9843381976104357e-05, + "loss": 0.1925, + "step": 27704 + }, + { + "epoch": 0.4941497520779082, + "grad_norm": 0.3101692795753479, + "learning_rate": 2.9841854948968962e-05, + "loss": 0.1337, + "step": 27705 + }, + { + "epoch": 0.49416758819962187, + "grad_norm": 0.2644140124320984, + "learning_rate": 2.9840327903064875e-05, + "loss": 0.1282, + "step": 27706 + }, + { + "epoch": 0.49418542432133555, + "grad_norm": 0.23767289519309998, + "learning_rate": 2.9838800838397995e-05, + "loss": 0.1652, + "step": 27707 + }, + { + "epoch": 0.49420326044304924, + "grad_norm": 0.2844628691673279, + "learning_rate": 2.9837273754974258e-05, + "loss": 0.1184, + "step": 27708 + }, + { + "epoch": 0.494221096564763, + "grad_norm": 0.31627243757247925, + "learning_rate": 2.9835746652799574e-05, + "loss": 0.091, + "step": 27709 + }, + { + "epoch": 0.4942389326864767, + "grad_norm": 0.2738164961338043, + "learning_rate": 2.983421953187987e-05, + "loss": 0.1286, + "step": 27710 + }, + { + "epoch": 0.49425676880819036, + "grad_norm": 0.2729830741882324, + "learning_rate": 2.9832692392221057e-05, + "loss": 0.1612, + "step": 27711 + }, + { + "epoch": 0.49427460492990405, + "grad_norm": 0.21243572235107422, + "learning_rate": 2.983116523382906e-05, + "loss": 0.1264, + "step": 27712 + }, + { + "epoch": 0.49429244105161774, + "grad_norm": 0.3365642726421356, + "learning_rate": 2.98296380567098e-05, + "loss": 0.1272, + "step": 27713 + }, + { + "epoch": 0.4943102771733314, + "grad_norm": 0.2932683229446411, + "learning_rate": 2.9828110860869197e-05, + "loss": 0.1549, + "step": 27714 + }, + { + "epoch": 0.4943281132950451, + "grad_norm": 0.29253682494163513, + "learning_rate": 2.9826583646313166e-05, + "loss": 0.1713, + "step": 27715 + }, + { + "epoch": 0.4943459494167588, + "grad_norm": 0.27886298298835754, + "learning_rate": 2.9825056413047624e-05, + "loss": 0.1223, + "step": 27716 + }, + { + "epoch": 0.4943637855384725, + "grad_norm": 0.20484280586242676, + "learning_rate": 2.98235291610785e-05, + "loss": 0.0968, + "step": 27717 + }, + { + "epoch": 0.49438162166018623, + "grad_norm": 0.2996967136859894, + "learning_rate": 2.9822001890411716e-05, + "loss": 0.1307, + "step": 27718 + }, + { + "epoch": 0.4943994577818999, + "grad_norm": 0.23785392940044403, + "learning_rate": 2.982047460105319e-05, + "loss": 0.1306, + "step": 27719 + }, + { + "epoch": 0.4944172939036136, + "grad_norm": 0.3025568723678589, + "learning_rate": 2.981894729300883e-05, + "loss": 0.1554, + "step": 27720 + }, + { + "epoch": 0.4944351300253273, + "grad_norm": 0.29137152433395386, + "learning_rate": 2.981741996628457e-05, + "loss": 0.1419, + "step": 27721 + }, + { + "epoch": 0.494452966147041, + "grad_norm": 0.2891000509262085, + "learning_rate": 2.9815892620886326e-05, + "loss": 0.1496, + "step": 27722 + }, + { + "epoch": 0.49447080226875467, + "grad_norm": 0.4134567677974701, + "learning_rate": 2.9814365256820015e-05, + "loss": 0.1739, + "step": 27723 + }, + { + "epoch": 0.49448863839046836, + "grad_norm": 0.3712529242038727, + "learning_rate": 2.9812837874091565e-05, + "loss": 0.1901, + "step": 27724 + }, + { + "epoch": 0.49450647451218205, + "grad_norm": 0.1878279596567154, + "learning_rate": 2.98113104727069e-05, + "loss": 0.1404, + "step": 27725 + }, + { + "epoch": 0.4945243106338958, + "grad_norm": 0.2331874817609787, + "learning_rate": 2.9809783052671918e-05, + "loss": 0.1201, + "step": 27726 + }, + { + "epoch": 0.4945421467556095, + "grad_norm": 0.22781163454055786, + "learning_rate": 2.9808255613992568e-05, + "loss": 0.1489, + "step": 27727 + }, + { + "epoch": 0.49455998287732317, + "grad_norm": 0.3174269199371338, + "learning_rate": 2.9806728156674756e-05, + "loss": 0.1774, + "step": 27728 + }, + { + "epoch": 0.49457781899903686, + "grad_norm": 0.2981187105178833, + "learning_rate": 2.980520068072441e-05, + "loss": 0.1565, + "step": 27729 + }, + { + "epoch": 0.49459565512075054, + "grad_norm": 0.34540411829948425, + "learning_rate": 2.9803673186147433e-05, + "loss": 0.1802, + "step": 27730 + }, + { + "epoch": 0.49461349124246423, + "grad_norm": 0.3372819721698761, + "learning_rate": 2.980214567294977e-05, + "loss": 0.2015, + "step": 27731 + }, + { + "epoch": 0.4946313273641779, + "grad_norm": 0.3403260409832001, + "learning_rate": 2.9800618141137332e-05, + "loss": 0.1245, + "step": 27732 + }, + { + "epoch": 0.4946491634858916, + "grad_norm": 0.2683867812156677, + "learning_rate": 2.9799090590716037e-05, + "loss": 0.1243, + "step": 27733 + }, + { + "epoch": 0.4946669996076053, + "grad_norm": 0.1998906433582306, + "learning_rate": 2.979756302169181e-05, + "loss": 0.1138, + "step": 27734 + }, + { + "epoch": 0.49468483572931904, + "grad_norm": 0.3172931373119354, + "learning_rate": 2.9796035434070575e-05, + "loss": 0.1467, + "step": 27735 + }, + { + "epoch": 0.4947026718510327, + "grad_norm": 0.2523716986179352, + "learning_rate": 2.979450782785825e-05, + "loss": 0.1575, + "step": 27736 + }, + { + "epoch": 0.4947205079727464, + "grad_norm": 0.29659193754196167, + "learning_rate": 2.9792980203060743e-05, + "loss": 0.1648, + "step": 27737 + }, + { + "epoch": 0.4947383440944601, + "grad_norm": 0.32037606835365295, + "learning_rate": 2.9791452559684002e-05, + "loss": 0.1281, + "step": 27738 + }, + { + "epoch": 0.4947561802161738, + "grad_norm": 0.2830572724342346, + "learning_rate": 2.9789924897733934e-05, + "loss": 0.1366, + "step": 27739 + }, + { + "epoch": 0.4947740163378875, + "grad_norm": 0.23512092232704163, + "learning_rate": 2.9788397217216462e-05, + "loss": 0.1196, + "step": 27740 + }, + { + "epoch": 0.49479185245960117, + "grad_norm": 0.2612616717815399, + "learning_rate": 2.978686951813751e-05, + "loss": 0.1752, + "step": 27741 + }, + { + "epoch": 0.49480968858131485, + "grad_norm": 0.2290542721748352, + "learning_rate": 2.9785341800502998e-05, + "loss": 0.1288, + "step": 27742 + }, + { + "epoch": 0.4948275247030286, + "grad_norm": 0.22217997908592224, + "learning_rate": 2.978381406431885e-05, + "loss": 0.1219, + "step": 27743 + }, + { + "epoch": 0.4948453608247423, + "grad_norm": 0.1958005577325821, + "learning_rate": 2.978228630959098e-05, + "loss": 0.1369, + "step": 27744 + }, + { + "epoch": 0.494863196946456, + "grad_norm": 0.2611028254032135, + "learning_rate": 2.9780758536325327e-05, + "loss": 0.1885, + "step": 27745 + }, + { + "epoch": 0.49488103306816966, + "grad_norm": 0.3484054505825043, + "learning_rate": 2.977923074452779e-05, + "loss": 0.1415, + "step": 27746 + }, + { + "epoch": 0.49489886918988335, + "grad_norm": 0.31591013073921204, + "learning_rate": 2.9777702934204316e-05, + "loss": 0.2276, + "step": 27747 + }, + { + "epoch": 0.49491670531159704, + "grad_norm": 0.25888514518737793, + "learning_rate": 2.9776175105360805e-05, + "loss": 0.1456, + "step": 27748 + }, + { + "epoch": 0.4949345414333107, + "grad_norm": 0.23392897844314575, + "learning_rate": 2.9774647258003196e-05, + "loss": 0.0924, + "step": 27749 + }, + { + "epoch": 0.4949523775550244, + "grad_norm": 0.40121981501579285, + "learning_rate": 2.9773119392137406e-05, + "loss": 0.1579, + "step": 27750 + }, + { + "epoch": 0.49497021367673816, + "grad_norm": 0.3367566764354706, + "learning_rate": 2.9771591507769347e-05, + "loss": 0.1771, + "step": 27751 + }, + { + "epoch": 0.49498804979845185, + "grad_norm": 0.2644231617450714, + "learning_rate": 2.9770063604904958e-05, + "loss": 0.2018, + "step": 27752 + }, + { + "epoch": 0.49500588592016553, + "grad_norm": 0.2466106414794922, + "learning_rate": 2.9768535683550158e-05, + "loss": 0.106, + "step": 27753 + }, + { + "epoch": 0.4950237220418792, + "grad_norm": 0.23186130821704865, + "learning_rate": 2.9767007743710863e-05, + "loss": 0.1123, + "step": 27754 + }, + { + "epoch": 0.4950415581635929, + "grad_norm": 0.26247891783714294, + "learning_rate": 2.9765479785393002e-05, + "loss": 0.1182, + "step": 27755 + }, + { + "epoch": 0.4950593942853066, + "grad_norm": 0.26620256900787354, + "learning_rate": 2.976395180860249e-05, + "loss": 0.159, + "step": 27756 + }, + { + "epoch": 0.4950772304070203, + "grad_norm": 0.27991393208503723, + "learning_rate": 2.9762423813345262e-05, + "loss": 0.1584, + "step": 27757 + }, + { + "epoch": 0.495095066528734, + "grad_norm": 0.20036830008029938, + "learning_rate": 2.9760895799627232e-05, + "loss": 0.1158, + "step": 27758 + }, + { + "epoch": 0.49511290265044766, + "grad_norm": 0.4034297466278076, + "learning_rate": 2.9759367767454322e-05, + "loss": 0.1144, + "step": 27759 + }, + { + "epoch": 0.4951307387721614, + "grad_norm": 0.2466028481721878, + "learning_rate": 2.975783971683247e-05, + "loss": 0.1273, + "step": 27760 + }, + { + "epoch": 0.4951485748938751, + "grad_norm": 0.36630794405937195, + "learning_rate": 2.9756311647767577e-05, + "loss": 0.1831, + "step": 27761 + }, + { + "epoch": 0.4951664110155888, + "grad_norm": 0.23322570323944092, + "learning_rate": 2.975478356026558e-05, + "loss": 0.149, + "step": 27762 + }, + { + "epoch": 0.49518424713730247, + "grad_norm": 0.2638810873031616, + "learning_rate": 2.9753255454332395e-05, + "loss": 0.1295, + "step": 27763 + }, + { + "epoch": 0.49520208325901616, + "grad_norm": 0.2458961457014084, + "learning_rate": 2.9751727329973955e-05, + "loss": 0.1053, + "step": 27764 + }, + { + "epoch": 0.49521991938072984, + "grad_norm": 0.3017711639404297, + "learning_rate": 2.9750199187196174e-05, + "loss": 0.0805, + "step": 27765 + }, + { + "epoch": 0.49523775550244353, + "grad_norm": 0.29882705211639404, + "learning_rate": 2.974867102600499e-05, + "loss": 0.2002, + "step": 27766 + }, + { + "epoch": 0.4952555916241572, + "grad_norm": 0.26527103781700134, + "learning_rate": 2.9747142846406312e-05, + "loss": 0.1655, + "step": 27767 + }, + { + "epoch": 0.49527342774587096, + "grad_norm": 0.2811010479927063, + "learning_rate": 2.9745614648406073e-05, + "loss": 0.1368, + "step": 27768 + }, + { + "epoch": 0.49529126386758465, + "grad_norm": 0.27112844586372375, + "learning_rate": 2.974408643201018e-05, + "loss": 0.1755, + "step": 27769 + }, + { + "epoch": 0.49530909998929834, + "grad_norm": 0.22731705009937286, + "learning_rate": 2.9742558197224585e-05, + "loss": 0.2092, + "step": 27770 + }, + { + "epoch": 0.49532693611101203, + "grad_norm": 0.26577648520469666, + "learning_rate": 2.974102994405519e-05, + "loss": 0.1202, + "step": 27771 + }, + { + "epoch": 0.4953447722327257, + "grad_norm": 0.24133983254432678, + "learning_rate": 2.973950167250792e-05, + "loss": 0.1214, + "step": 27772 + }, + { + "epoch": 0.4953626083544394, + "grad_norm": 0.3082998991012573, + "learning_rate": 2.9737973382588714e-05, + "loss": 0.1353, + "step": 27773 + }, + { + "epoch": 0.4953804444761531, + "grad_norm": 0.21491719782352448, + "learning_rate": 2.9736445074303477e-05, + "loss": 0.1397, + "step": 27774 + }, + { + "epoch": 0.4953982805978668, + "grad_norm": 0.34926852583885193, + "learning_rate": 2.9734916747658147e-05, + "loss": 0.1793, + "step": 27775 + }, + { + "epoch": 0.49541611671958047, + "grad_norm": 0.34190186858177185, + "learning_rate": 2.9733388402658646e-05, + "loss": 0.1546, + "step": 27776 + }, + { + "epoch": 0.4954339528412942, + "grad_norm": 0.2932499945163727, + "learning_rate": 2.97318600393109e-05, + "loss": 0.1328, + "step": 27777 + }, + { + "epoch": 0.4954517889630079, + "grad_norm": 0.3202677071094513, + "learning_rate": 2.9730331657620826e-05, + "loss": 0.1207, + "step": 27778 + }, + { + "epoch": 0.4954696250847216, + "grad_norm": 0.30953875184059143, + "learning_rate": 2.9728803257594345e-05, + "loss": 0.165, + "step": 27779 + }, + { + "epoch": 0.4954874612064353, + "grad_norm": 0.29354506731033325, + "learning_rate": 2.9727274839237396e-05, + "loss": 0.1148, + "step": 27780 + }, + { + "epoch": 0.49550529732814896, + "grad_norm": 0.22380825877189636, + "learning_rate": 2.97257464025559e-05, + "loss": 0.1218, + "step": 27781 + }, + { + "epoch": 0.49552313344986265, + "grad_norm": 0.2687002122402191, + "learning_rate": 2.9724217947555777e-05, + "loss": 0.149, + "step": 27782 + }, + { + "epoch": 0.49554096957157634, + "grad_norm": 0.2262597382068634, + "learning_rate": 2.9722689474242955e-05, + "loss": 0.1466, + "step": 27783 + }, + { + "epoch": 0.49555880569329, + "grad_norm": 0.17526397109031677, + "learning_rate": 2.972116098262336e-05, + "loss": 0.097, + "step": 27784 + }, + { + "epoch": 0.49557664181500377, + "grad_norm": 0.3570282757282257, + "learning_rate": 2.9719632472702908e-05, + "loss": 0.1078, + "step": 27785 + }, + { + "epoch": 0.49559447793671746, + "grad_norm": 0.22851315140724182, + "learning_rate": 2.971810394448753e-05, + "loss": 0.1338, + "step": 27786 + }, + { + "epoch": 0.49561231405843115, + "grad_norm": 0.36523857712745667, + "learning_rate": 2.9716575397983148e-05, + "loss": 0.1642, + "step": 27787 + }, + { + "epoch": 0.49563015018014484, + "grad_norm": 0.3585837483406067, + "learning_rate": 2.9715046833195704e-05, + "loss": 0.1287, + "step": 27788 + }, + { + "epoch": 0.4956479863018585, + "grad_norm": 0.25841131806373596, + "learning_rate": 2.9713518250131095e-05, + "loss": 0.1317, + "step": 27789 + }, + { + "epoch": 0.4956658224235722, + "grad_norm": 0.2565012276172638, + "learning_rate": 2.9711989648795273e-05, + "loss": 0.148, + "step": 27790 + }, + { + "epoch": 0.4956836585452859, + "grad_norm": 0.3194092810153961, + "learning_rate": 2.971046102919415e-05, + "loss": 0.1185, + "step": 27791 + }, + { + "epoch": 0.4957014946669996, + "grad_norm": 0.2027805745601654, + "learning_rate": 2.9708932391333643e-05, + "loss": 0.1052, + "step": 27792 + }, + { + "epoch": 0.4957193307887133, + "grad_norm": 0.2346637099981308, + "learning_rate": 2.9707403735219697e-05, + "loss": 0.1163, + "step": 27793 + }, + { + "epoch": 0.495737166910427, + "grad_norm": 0.2699218988418579, + "learning_rate": 2.9705875060858224e-05, + "loss": 0.1413, + "step": 27794 + }, + { + "epoch": 0.4957550030321407, + "grad_norm": 0.26824429631233215, + "learning_rate": 2.9704346368255155e-05, + "loss": 0.1143, + "step": 27795 + }, + { + "epoch": 0.4957728391538544, + "grad_norm": 0.326987624168396, + "learning_rate": 2.9702817657416422e-05, + "loss": 0.1186, + "step": 27796 + }, + { + "epoch": 0.4957906752755681, + "grad_norm": 0.30650341510772705, + "learning_rate": 2.9701288928347938e-05, + "loss": 0.1515, + "step": 27797 + }, + { + "epoch": 0.49580851139728177, + "grad_norm": 0.31967130303382874, + "learning_rate": 2.9699760181055637e-05, + "loss": 0.1177, + "step": 27798 + }, + { + "epoch": 0.49582634751899546, + "grad_norm": 0.3324146866798401, + "learning_rate": 2.969823141554543e-05, + "loss": 0.1683, + "step": 27799 + }, + { + "epoch": 0.49584418364070915, + "grad_norm": 0.26720014214515686, + "learning_rate": 2.9696702631823263e-05, + "loss": 0.1595, + "step": 27800 + }, + { + "epoch": 0.49586201976242283, + "grad_norm": 0.31063705682754517, + "learning_rate": 2.969517382989506e-05, + "loss": 0.1915, + "step": 27801 + }, + { + "epoch": 0.4958798558841366, + "grad_norm": 0.22244539856910706, + "learning_rate": 2.9693645009766737e-05, + "loss": 0.0797, + "step": 27802 + }, + { + "epoch": 0.49589769200585027, + "grad_norm": 0.28870487213134766, + "learning_rate": 2.969211617144423e-05, + "loss": 0.1635, + "step": 27803 + }, + { + "epoch": 0.49591552812756395, + "grad_norm": 0.18169677257537842, + "learning_rate": 2.9690587314933448e-05, + "loss": 0.1022, + "step": 27804 + }, + { + "epoch": 0.49593336424927764, + "grad_norm": 0.30563223361968994, + "learning_rate": 2.968905844024034e-05, + "loss": 0.1199, + "step": 27805 + }, + { + "epoch": 0.49595120037099133, + "grad_norm": 0.20923562347888947, + "learning_rate": 2.968752954737082e-05, + "loss": 0.1462, + "step": 27806 + }, + { + "epoch": 0.495969036492705, + "grad_norm": 0.26435256004333496, + "learning_rate": 2.968600063633081e-05, + "loss": 0.1307, + "step": 27807 + }, + { + "epoch": 0.4959868726144187, + "grad_norm": 0.22661389410495758, + "learning_rate": 2.968447170712625e-05, + "loss": 0.0988, + "step": 27808 + }, + { + "epoch": 0.4960047087361324, + "grad_norm": 0.21868455410003662, + "learning_rate": 2.968294275976306e-05, + "loss": 0.1125, + "step": 27809 + }, + { + "epoch": 0.49602254485784614, + "grad_norm": 0.29917147755622864, + "learning_rate": 2.9681413794247164e-05, + "loss": 0.1526, + "step": 27810 + }, + { + "epoch": 0.4960403809795598, + "grad_norm": 0.20681817829608917, + "learning_rate": 2.967988481058449e-05, + "loss": 0.1014, + "step": 27811 + }, + { + "epoch": 0.4960582171012735, + "grad_norm": 0.2031608521938324, + "learning_rate": 2.9678355808780974e-05, + "loss": 0.1582, + "step": 27812 + }, + { + "epoch": 0.4960760532229872, + "grad_norm": 0.24757565557956696, + "learning_rate": 2.9676826788842527e-05, + "loss": 0.1362, + "step": 27813 + }, + { + "epoch": 0.4960938893447009, + "grad_norm": 0.350021630525589, + "learning_rate": 2.9675297750775082e-05, + "loss": 0.1636, + "step": 27814 + }, + { + "epoch": 0.4961117254664146, + "grad_norm": 0.19894543290138245, + "learning_rate": 2.967376869458457e-05, + "loss": 0.1657, + "step": 27815 + }, + { + "epoch": 0.49612956158812827, + "grad_norm": 0.22180263698101044, + "learning_rate": 2.9672239620276922e-05, + "loss": 0.1456, + "step": 27816 + }, + { + "epoch": 0.49614739770984195, + "grad_norm": 0.3438716232776642, + "learning_rate": 2.9670710527858052e-05, + "loss": 0.1585, + "step": 27817 + }, + { + "epoch": 0.49616523383155564, + "grad_norm": 0.25440147519111633, + "learning_rate": 2.96691814173339e-05, + "loss": 0.1638, + "step": 27818 + }, + { + "epoch": 0.4961830699532694, + "grad_norm": 0.23777243494987488, + "learning_rate": 2.9667652288710383e-05, + "loss": 0.1206, + "step": 27819 + }, + { + "epoch": 0.4962009060749831, + "grad_norm": 0.26888108253479004, + "learning_rate": 2.9666123141993435e-05, + "loss": 0.1266, + "step": 27820 + }, + { + "epoch": 0.49621874219669676, + "grad_norm": 0.2419235110282898, + "learning_rate": 2.966459397718898e-05, + "loss": 0.1874, + "step": 27821 + }, + { + "epoch": 0.49623657831841045, + "grad_norm": 0.2995932102203369, + "learning_rate": 2.966306479430296e-05, + "loss": 0.1937, + "step": 27822 + }, + { + "epoch": 0.49625441444012414, + "grad_norm": 0.20871558785438538, + "learning_rate": 2.9661535593341283e-05, + "loss": 0.142, + "step": 27823 + }, + { + "epoch": 0.4962722505618378, + "grad_norm": 0.22967679798603058, + "learning_rate": 2.966000637430988e-05, + "loss": 0.1348, + "step": 27824 + }, + { + "epoch": 0.4962900866835515, + "grad_norm": 0.22010403871536255, + "learning_rate": 2.965847713721469e-05, + "loss": 0.09, + "step": 27825 + }, + { + "epoch": 0.4963079228052652, + "grad_norm": 0.3423563838005066, + "learning_rate": 2.9656947882061625e-05, + "loss": 0.1666, + "step": 27826 + }, + { + "epoch": 0.49632575892697894, + "grad_norm": 0.24587838351726532, + "learning_rate": 2.9655418608856627e-05, + "loss": 0.1179, + "step": 27827 + }, + { + "epoch": 0.49634359504869263, + "grad_norm": 0.31879591941833496, + "learning_rate": 2.9653889317605616e-05, + "loss": 0.1192, + "step": 27828 + }, + { + "epoch": 0.4963614311704063, + "grad_norm": 0.2893899977207184, + "learning_rate": 2.9652360008314528e-05, + "loss": 0.1397, + "step": 27829 + }, + { + "epoch": 0.49637926729212, + "grad_norm": 0.30011263489723206, + "learning_rate": 2.9650830680989278e-05, + "loss": 0.1153, + "step": 27830 + }, + { + "epoch": 0.4963971034138337, + "grad_norm": 0.2200906127691269, + "learning_rate": 2.964930133563581e-05, + "loss": 0.1555, + "step": 27831 + }, + { + "epoch": 0.4964149395355474, + "grad_norm": 0.2774388790130615, + "learning_rate": 2.9647771972260035e-05, + "loss": 0.1681, + "step": 27832 + }, + { + "epoch": 0.49643277565726107, + "grad_norm": 0.3050696849822998, + "learning_rate": 2.96462425908679e-05, + "loss": 0.1308, + "step": 27833 + }, + { + "epoch": 0.49645061177897476, + "grad_norm": 0.24673138558864594, + "learning_rate": 2.9644713191465316e-05, + "loss": 0.1343, + "step": 27834 + }, + { + "epoch": 0.49646844790068845, + "grad_norm": 0.2587926983833313, + "learning_rate": 2.9643183774058224e-05, + "loss": 0.1542, + "step": 27835 + }, + { + "epoch": 0.4964862840224022, + "grad_norm": 0.3508484363555908, + "learning_rate": 2.9641654338652546e-05, + "loss": 0.1545, + "step": 27836 + }, + { + "epoch": 0.4965041201441159, + "grad_norm": 0.33448004722595215, + "learning_rate": 2.9640124885254218e-05, + "loss": 0.1173, + "step": 27837 + }, + { + "epoch": 0.49652195626582957, + "grad_norm": 0.313690185546875, + "learning_rate": 2.963859541386916e-05, + "loss": 0.1579, + "step": 27838 + }, + { + "epoch": 0.49653979238754326, + "grad_norm": 0.26195430755615234, + "learning_rate": 2.9637065924503298e-05, + "loss": 0.1237, + "step": 27839 + }, + { + "epoch": 0.49655762850925694, + "grad_norm": 0.34702929854393005, + "learning_rate": 2.9635536417162574e-05, + "loss": 0.131, + "step": 27840 + }, + { + "epoch": 0.49657546463097063, + "grad_norm": 0.38884323835372925, + "learning_rate": 2.96340068918529e-05, + "loss": 0.2393, + "step": 27841 + }, + { + "epoch": 0.4965933007526843, + "grad_norm": 0.3461587429046631, + "learning_rate": 2.963247734858023e-05, + "loss": 0.1266, + "step": 27842 + }, + { + "epoch": 0.496611136874398, + "grad_norm": 0.2655712366104126, + "learning_rate": 2.9630947787350467e-05, + "loss": 0.1298, + "step": 27843 + }, + { + "epoch": 0.49662897299611175, + "grad_norm": 0.16946347057819366, + "learning_rate": 2.962941820816956e-05, + "loss": 0.0715, + "step": 27844 + }, + { + "epoch": 0.49664680911782544, + "grad_norm": 0.2938988506793976, + "learning_rate": 2.962788861104342e-05, + "loss": 0.1095, + "step": 27845 + }, + { + "epoch": 0.4966646452395391, + "grad_norm": 0.27131253480911255, + "learning_rate": 2.962635899597799e-05, + "loss": 0.097, + "step": 27846 + }, + { + "epoch": 0.4966824813612528, + "grad_norm": 0.25252237915992737, + "learning_rate": 2.962482936297919e-05, + "loss": 0.143, + "step": 27847 + }, + { + "epoch": 0.4967003174829665, + "grad_norm": 0.2072848677635193, + "learning_rate": 2.9623299712052954e-05, + "loss": 0.1174, + "step": 27848 + }, + { + "epoch": 0.4967181536046802, + "grad_norm": 0.29337868094444275, + "learning_rate": 2.9621770043205215e-05, + "loss": 0.1432, + "step": 27849 + }, + { + "epoch": 0.4967359897263939, + "grad_norm": 0.3335683345794678, + "learning_rate": 2.9620240356441897e-05, + "loss": 0.1371, + "step": 27850 + }, + { + "epoch": 0.49675382584810757, + "grad_norm": 0.22608454525470734, + "learning_rate": 2.9618710651768938e-05, + "loss": 0.1291, + "step": 27851 + }, + { + "epoch": 0.49677166196982125, + "grad_norm": 0.29111024737358093, + "learning_rate": 2.9617180929192257e-05, + "loss": 0.1228, + "step": 27852 + }, + { + "epoch": 0.496789498091535, + "grad_norm": 0.2770387530326843, + "learning_rate": 2.9615651188717786e-05, + "loss": 0.1454, + "step": 27853 + }, + { + "epoch": 0.4968073342132487, + "grad_norm": 0.3288304805755615, + "learning_rate": 2.961412143035146e-05, + "loss": 0.1269, + "step": 27854 + }, + { + "epoch": 0.4968251703349624, + "grad_norm": 0.29333335161209106, + "learning_rate": 2.9612591654099198e-05, + "loss": 0.1242, + "step": 27855 + }, + { + "epoch": 0.49684300645667606, + "grad_norm": 0.2045074701309204, + "learning_rate": 2.9611061859966942e-05, + "loss": 0.1074, + "step": 27856 + }, + { + "epoch": 0.49686084257838975, + "grad_norm": 0.23224858939647675, + "learning_rate": 2.9609532047960624e-05, + "loss": 0.1042, + "step": 27857 + }, + { + "epoch": 0.49687867870010344, + "grad_norm": 0.2689198851585388, + "learning_rate": 2.9608002218086163e-05, + "loss": 0.1584, + "step": 27858 + }, + { + "epoch": 0.4968965148218171, + "grad_norm": 0.36132392287254333, + "learning_rate": 2.960647237034949e-05, + "loss": 0.166, + "step": 27859 + }, + { + "epoch": 0.4969143509435308, + "grad_norm": 0.3105090260505676, + "learning_rate": 2.9604942504756545e-05, + "loss": 0.1507, + "step": 27860 + }, + { + "epoch": 0.49693218706524456, + "grad_norm": 0.2498639076948166, + "learning_rate": 2.9603412621313254e-05, + "loss": 0.139, + "step": 27861 + }, + { + "epoch": 0.49695002318695825, + "grad_norm": 0.27536413073539734, + "learning_rate": 2.9601882720025537e-05, + "loss": 0.1865, + "step": 27862 + }, + { + "epoch": 0.49696785930867193, + "grad_norm": 0.38503387570381165, + "learning_rate": 2.960035280089934e-05, + "loss": 0.184, + "step": 27863 + }, + { + "epoch": 0.4969856954303856, + "grad_norm": 0.24762022495269775, + "learning_rate": 2.9598822863940584e-05, + "loss": 0.1506, + "step": 27864 + }, + { + "epoch": 0.4970035315520993, + "grad_norm": 0.1912391036748886, + "learning_rate": 2.95972929091552e-05, + "loss": 0.0989, + "step": 27865 + }, + { + "epoch": 0.497021367673813, + "grad_norm": 0.3228663206100464, + "learning_rate": 2.9595762936549132e-05, + "loss": 0.1474, + "step": 27866 + }, + { + "epoch": 0.4970392037955267, + "grad_norm": 0.307590126991272, + "learning_rate": 2.9594232946128287e-05, + "loss": 0.1637, + "step": 27867 + }, + { + "epoch": 0.4970570399172404, + "grad_norm": 0.19887320697307587, + "learning_rate": 2.9592702937898607e-05, + "loss": 0.1218, + "step": 27868 + }, + { + "epoch": 0.4970748760389541, + "grad_norm": 0.261941522359848, + "learning_rate": 2.9591172911866027e-05, + "loss": 0.1753, + "step": 27869 + }, + { + "epoch": 0.4970927121606678, + "grad_norm": 0.21067079901695251, + "learning_rate": 2.958964286803648e-05, + "loss": 0.1178, + "step": 27870 + }, + { + "epoch": 0.4971105482823815, + "grad_norm": 0.2668260335922241, + "learning_rate": 2.958811280641589e-05, + "loss": 0.1397, + "step": 27871 + }, + { + "epoch": 0.4971283844040952, + "grad_norm": 0.30177226662635803, + "learning_rate": 2.958658272701019e-05, + "loss": 0.107, + "step": 27872 + }, + { + "epoch": 0.49714622052580887, + "grad_norm": 0.47434917092323303, + "learning_rate": 2.958505262982531e-05, + "loss": 0.1431, + "step": 27873 + }, + { + "epoch": 0.49716405664752256, + "grad_norm": 0.2704737186431885, + "learning_rate": 2.958352251486719e-05, + "loss": 0.132, + "step": 27874 + }, + { + "epoch": 0.49718189276923624, + "grad_norm": 0.2231382578611374, + "learning_rate": 2.9581992382141744e-05, + "loss": 0.1249, + "step": 27875 + }, + { + "epoch": 0.49719972889094993, + "grad_norm": 0.21486921608448029, + "learning_rate": 2.9580462231654914e-05, + "loss": 0.1102, + "step": 27876 + }, + { + "epoch": 0.4972175650126636, + "grad_norm": 0.3190035820007324, + "learning_rate": 2.9578932063412634e-05, + "loss": 0.2299, + "step": 27877 + }, + { + "epoch": 0.49723540113437736, + "grad_norm": 0.20680023729801178, + "learning_rate": 2.957740187742083e-05, + "loss": 0.1434, + "step": 27878 + }, + { + "epoch": 0.49725323725609105, + "grad_norm": 0.48658859729766846, + "learning_rate": 2.957587167368544e-05, + "loss": 0.1483, + "step": 27879 + }, + { + "epoch": 0.49727107337780474, + "grad_norm": 0.22653689980506897, + "learning_rate": 2.9574341452212382e-05, + "loss": 0.1534, + "step": 27880 + }, + { + "epoch": 0.49728890949951843, + "grad_norm": 0.34906628727912903, + "learning_rate": 2.95728112130076e-05, + "loss": 0.1459, + "step": 27881 + }, + { + "epoch": 0.4973067456212321, + "grad_norm": 0.25396373867988586, + "learning_rate": 2.9571280956077026e-05, + "loss": 0.1393, + "step": 27882 + }, + { + "epoch": 0.4973245817429458, + "grad_norm": 0.2544762194156647, + "learning_rate": 2.9569750681426584e-05, + "loss": 0.1212, + "step": 27883 + }, + { + "epoch": 0.4973424178646595, + "grad_norm": 0.25442975759506226, + "learning_rate": 2.9568220389062206e-05, + "loss": 0.0889, + "step": 27884 + }, + { + "epoch": 0.4973602539863732, + "grad_norm": 0.39988458156585693, + "learning_rate": 2.956669007898984e-05, + "loss": 0.1558, + "step": 27885 + }, + { + "epoch": 0.4973780901080869, + "grad_norm": 0.2710377275943756, + "learning_rate": 2.956515975121539e-05, + "loss": 0.1399, + "step": 27886 + }, + { + "epoch": 0.4973959262298006, + "grad_norm": 0.2345605194568634, + "learning_rate": 2.956362940574482e-05, + "loss": 0.1286, + "step": 27887 + }, + { + "epoch": 0.4974137623515143, + "grad_norm": 0.36714980006217957, + "learning_rate": 2.9562099042584046e-05, + "loss": 0.1865, + "step": 27888 + }, + { + "epoch": 0.497431598473228, + "grad_norm": 0.27104154229164124, + "learning_rate": 2.956056866173899e-05, + "loss": 0.0925, + "step": 27889 + }, + { + "epoch": 0.4974494345949417, + "grad_norm": 0.29537317156791687, + "learning_rate": 2.95590382632156e-05, + "loss": 0.1455, + "step": 27890 + }, + { + "epoch": 0.49746727071665536, + "grad_norm": 0.19378677010536194, + "learning_rate": 2.9557507847019793e-05, + "loss": 0.1596, + "step": 27891 + }, + { + "epoch": 0.49748510683836905, + "grad_norm": 0.29182490706443787, + "learning_rate": 2.9555977413157525e-05, + "loss": 0.1782, + "step": 27892 + }, + { + "epoch": 0.49750294296008274, + "grad_norm": 0.276284784078598, + "learning_rate": 2.9554446961634708e-05, + "loss": 0.1852, + "step": 27893 + }, + { + "epoch": 0.4975207790817964, + "grad_norm": 0.2724047005176544, + "learning_rate": 2.9552916492457288e-05, + "loss": 0.13, + "step": 27894 + }, + { + "epoch": 0.49753861520351017, + "grad_norm": 0.3606939911842346, + "learning_rate": 2.9551386005631188e-05, + "loss": 0.1805, + "step": 27895 + }, + { + "epoch": 0.49755645132522386, + "grad_norm": 0.22548851370811462, + "learning_rate": 2.9549855501162337e-05, + "loss": 0.1526, + "step": 27896 + }, + { + "epoch": 0.49757428744693755, + "grad_norm": 0.24227149784564972, + "learning_rate": 2.9548324979056678e-05, + "loss": 0.132, + "step": 27897 + }, + { + "epoch": 0.49759212356865123, + "grad_norm": 0.2506137490272522, + "learning_rate": 2.9546794439320147e-05, + "loss": 0.1553, + "step": 27898 + }, + { + "epoch": 0.4976099596903649, + "grad_norm": 0.25468847155570984, + "learning_rate": 2.9545263881958662e-05, + "loss": 0.1014, + "step": 27899 + }, + { + "epoch": 0.4976277958120786, + "grad_norm": 0.46975842118263245, + "learning_rate": 2.954373330697817e-05, + "loss": 0.1779, + "step": 27900 + }, + { + "epoch": 0.4976456319337923, + "grad_norm": 0.3091696798801422, + "learning_rate": 2.9542202714384605e-05, + "loss": 0.206, + "step": 27901 + }, + { + "epoch": 0.497663468055506, + "grad_norm": 0.2690056562423706, + "learning_rate": 2.954067210418388e-05, + "loss": 0.1393, + "step": 27902 + }, + { + "epoch": 0.49768130417721973, + "grad_norm": 0.47849467396736145, + "learning_rate": 2.953914147638195e-05, + "loss": 0.1618, + "step": 27903 + }, + { + "epoch": 0.4976991402989334, + "grad_norm": 0.2702648937702179, + "learning_rate": 2.953761083098473e-05, + "loss": 0.182, + "step": 27904 + }, + { + "epoch": 0.4977169764206471, + "grad_norm": 0.25787919759750366, + "learning_rate": 2.9536080167998176e-05, + "loss": 0.1976, + "step": 27905 + }, + { + "epoch": 0.4977348125423608, + "grad_norm": 0.29339367151260376, + "learning_rate": 2.9534549487428197e-05, + "loss": 0.2173, + "step": 27906 + }, + { + "epoch": 0.4977526486640745, + "grad_norm": 0.265498548746109, + "learning_rate": 2.9533018789280746e-05, + "loss": 0.1374, + "step": 27907 + }, + { + "epoch": 0.49777048478578817, + "grad_norm": 0.28753966093063354, + "learning_rate": 2.9531488073561746e-05, + "loss": 0.1441, + "step": 27908 + }, + { + "epoch": 0.49778832090750186, + "grad_norm": 0.3045188784599304, + "learning_rate": 2.9529957340277136e-05, + "loss": 0.1064, + "step": 27909 + }, + { + "epoch": 0.49780615702921555, + "grad_norm": 0.26599541306495667, + "learning_rate": 2.9528426589432845e-05, + "loss": 0.1323, + "step": 27910 + }, + { + "epoch": 0.4978239931509293, + "grad_norm": 0.19216345250606537, + "learning_rate": 2.95268958210348e-05, + "loss": 0.0914, + "step": 27911 + }, + { + "epoch": 0.497841829272643, + "grad_norm": 0.22652976214885712, + "learning_rate": 2.9525365035088954e-05, + "loss": 0.1341, + "step": 27912 + }, + { + "epoch": 0.49785966539435667, + "grad_norm": 0.321074903011322, + "learning_rate": 2.9523834231601227e-05, + "loss": 0.2251, + "step": 27913 + }, + { + "epoch": 0.49787750151607035, + "grad_norm": 0.26399973034858704, + "learning_rate": 2.9522303410577552e-05, + "loss": 0.1418, + "step": 27914 + }, + { + "epoch": 0.49789533763778404, + "grad_norm": 0.2515855133533478, + "learning_rate": 2.9520772572023874e-05, + "loss": 0.1267, + "step": 27915 + }, + { + "epoch": 0.49791317375949773, + "grad_norm": 0.21967852115631104, + "learning_rate": 2.951924171594612e-05, + "loss": 0.1344, + "step": 27916 + }, + { + "epoch": 0.4979310098812114, + "grad_norm": 0.3357223868370056, + "learning_rate": 2.951771084235022e-05, + "loss": 0.1358, + "step": 27917 + }, + { + "epoch": 0.4979488460029251, + "grad_norm": 0.30676183104515076, + "learning_rate": 2.9516179951242113e-05, + "loss": 0.2006, + "step": 27918 + }, + { + "epoch": 0.4979666821246388, + "grad_norm": 0.2936556339263916, + "learning_rate": 2.951464904262773e-05, + "loss": 0.1357, + "step": 27919 + }, + { + "epoch": 0.49798451824635254, + "grad_norm": 0.23046720027923584, + "learning_rate": 2.9513118116513017e-05, + "loss": 0.1329, + "step": 27920 + }, + { + "epoch": 0.4980023543680662, + "grad_norm": 0.2276427447795868, + "learning_rate": 2.951158717290389e-05, + "loss": 0.1578, + "step": 27921 + }, + { + "epoch": 0.4980201904897799, + "grad_norm": 0.32960057258605957, + "learning_rate": 2.9510056211806298e-05, + "loss": 0.2434, + "step": 27922 + }, + { + "epoch": 0.4980380266114936, + "grad_norm": 0.25639766454696655, + "learning_rate": 2.9508525233226164e-05, + "loss": 0.1164, + "step": 27923 + }, + { + "epoch": 0.4980558627332073, + "grad_norm": 0.3169064223766327, + "learning_rate": 2.9506994237169433e-05, + "loss": 0.1859, + "step": 27924 + }, + { + "epoch": 0.498073698854921, + "grad_norm": 0.2877647578716278, + "learning_rate": 2.9505463223642037e-05, + "loss": 0.1278, + "step": 27925 + }, + { + "epoch": 0.49809153497663466, + "grad_norm": 0.1823241412639618, + "learning_rate": 2.9503932192649907e-05, + "loss": 0.1247, + "step": 27926 + }, + { + "epoch": 0.49810937109834835, + "grad_norm": 0.23231762647628784, + "learning_rate": 2.9502401144198984e-05, + "loss": 0.1317, + "step": 27927 + }, + { + "epoch": 0.4981272072200621, + "grad_norm": 0.3350907266139984, + "learning_rate": 2.95008700782952e-05, + "loss": 0.1325, + "step": 27928 + }, + { + "epoch": 0.4981450433417758, + "grad_norm": 0.29064926505088806, + "learning_rate": 2.949933899494449e-05, + "loss": 0.196, + "step": 27929 + }, + { + "epoch": 0.49816287946348947, + "grad_norm": 0.18521523475646973, + "learning_rate": 2.949780789415278e-05, + "loss": 0.1689, + "step": 27930 + }, + { + "epoch": 0.49818071558520316, + "grad_norm": 0.29793670773506165, + "learning_rate": 2.9496276775926017e-05, + "loss": 0.1177, + "step": 27931 + }, + { + "epoch": 0.49819855170691685, + "grad_norm": 0.19556596875190735, + "learning_rate": 2.949474564027013e-05, + "loss": 0.0887, + "step": 27932 + }, + { + "epoch": 0.49821638782863054, + "grad_norm": 0.33334097266197205, + "learning_rate": 2.9493214487191063e-05, + "loss": 0.1732, + "step": 27933 + }, + { + "epoch": 0.4982342239503442, + "grad_norm": 0.28000178933143616, + "learning_rate": 2.949168331669474e-05, + "loss": 0.176, + "step": 27934 + }, + { + "epoch": 0.4982520600720579, + "grad_norm": 0.3112078607082367, + "learning_rate": 2.94901521287871e-05, + "loss": 0.1725, + "step": 27935 + }, + { + "epoch": 0.4982698961937716, + "grad_norm": 0.21778663992881775, + "learning_rate": 2.948862092347408e-05, + "loss": 0.1071, + "step": 27936 + }, + { + "epoch": 0.49828773231548534, + "grad_norm": 0.5297493934631348, + "learning_rate": 2.9487089700761624e-05, + "loss": 0.1849, + "step": 27937 + }, + { + "epoch": 0.49830556843719903, + "grad_norm": 0.2754274010658264, + "learning_rate": 2.948555846065565e-05, + "loss": 0.1286, + "step": 27938 + }, + { + "epoch": 0.4983234045589127, + "grad_norm": 0.20975162088871002, + "learning_rate": 2.9484027203162097e-05, + "loss": 0.1566, + "step": 27939 + }, + { + "epoch": 0.4983412406806264, + "grad_norm": 0.3835209012031555, + "learning_rate": 2.9482495928286908e-05, + "loss": 0.1824, + "step": 27940 + }, + { + "epoch": 0.4983590768023401, + "grad_norm": 0.2465917468070984, + "learning_rate": 2.9480964636036023e-05, + "loss": 0.1136, + "step": 27941 + }, + { + "epoch": 0.4983769129240538, + "grad_norm": 0.2265441119670868, + "learning_rate": 2.9479433326415372e-05, + "loss": 0.1536, + "step": 27942 + }, + { + "epoch": 0.49839474904576747, + "grad_norm": 0.2865673303604126, + "learning_rate": 2.947790199943089e-05, + "loss": 0.1274, + "step": 27943 + }, + { + "epoch": 0.49841258516748116, + "grad_norm": 0.25633201003074646, + "learning_rate": 2.9476370655088508e-05, + "loss": 0.1572, + "step": 27944 + }, + { + "epoch": 0.4984304212891949, + "grad_norm": 0.222824826836586, + "learning_rate": 2.9474839293394162e-05, + "loss": 0.1332, + "step": 27945 + }, + { + "epoch": 0.4984482574109086, + "grad_norm": 0.3386722207069397, + "learning_rate": 2.9473307914353808e-05, + "loss": 0.1541, + "step": 27946 + }, + { + "epoch": 0.4984660935326223, + "grad_norm": 0.22178107500076294, + "learning_rate": 2.947177651797336e-05, + "loss": 0.1557, + "step": 27947 + }, + { + "epoch": 0.49848392965433597, + "grad_norm": 0.3116909861564636, + "learning_rate": 2.9470245104258766e-05, + "loss": 0.1103, + "step": 27948 + }, + { + "epoch": 0.49850176577604965, + "grad_norm": 0.32038989663124084, + "learning_rate": 2.946871367321595e-05, + "loss": 0.1539, + "step": 27949 + }, + { + "epoch": 0.49851960189776334, + "grad_norm": 0.37848106026649475, + "learning_rate": 2.9467182224850865e-05, + "loss": 0.1365, + "step": 27950 + }, + { + "epoch": 0.49853743801947703, + "grad_norm": 0.4757857322692871, + "learning_rate": 2.9465650759169428e-05, + "loss": 0.2003, + "step": 27951 + }, + { + "epoch": 0.4985552741411907, + "grad_norm": 0.2234267145395279, + "learning_rate": 2.9464119276177593e-05, + "loss": 0.136, + "step": 27952 + }, + { + "epoch": 0.4985731102629044, + "grad_norm": 0.3395175635814667, + "learning_rate": 2.9462587775881284e-05, + "loss": 0.2647, + "step": 27953 + }, + { + "epoch": 0.49859094638461815, + "grad_norm": 0.2047962099313736, + "learning_rate": 2.9461056258286456e-05, + "loss": 0.1191, + "step": 27954 + }, + { + "epoch": 0.49860878250633184, + "grad_norm": 0.35745835304260254, + "learning_rate": 2.945952472339903e-05, + "loss": 0.1312, + "step": 27955 + }, + { + "epoch": 0.4986266186280455, + "grad_norm": 0.25644558668136597, + "learning_rate": 2.945799317122494e-05, + "loss": 0.155, + "step": 27956 + }, + { + "epoch": 0.4986444547497592, + "grad_norm": 0.3524622619152069, + "learning_rate": 2.945646160177013e-05, + "loss": 0.2007, + "step": 27957 + }, + { + "epoch": 0.4986622908714729, + "grad_norm": 0.31439030170440674, + "learning_rate": 2.9454930015040534e-05, + "loss": 0.1278, + "step": 27958 + }, + { + "epoch": 0.4986801269931866, + "grad_norm": 0.24150004982948303, + "learning_rate": 2.9453398411042094e-05, + "loss": 0.1409, + "step": 27959 + }, + { + "epoch": 0.4986979631149003, + "grad_norm": 0.2034335434436798, + "learning_rate": 2.9451866789780735e-05, + "loss": 0.1131, + "step": 27960 + }, + { + "epoch": 0.49871579923661397, + "grad_norm": 0.20568163692951202, + "learning_rate": 2.9450335151262416e-05, + "loss": 0.158, + "step": 27961 + }, + { + "epoch": 0.4987336353583277, + "grad_norm": 0.23867696523666382, + "learning_rate": 2.9448803495493054e-05, + "loss": 0.1213, + "step": 27962 + }, + { + "epoch": 0.4987514714800414, + "grad_norm": 0.21701084077358246, + "learning_rate": 2.9447271822478596e-05, + "loss": 0.1413, + "step": 27963 + }, + { + "epoch": 0.4987693076017551, + "grad_norm": 0.2105296552181244, + "learning_rate": 2.9445740132224975e-05, + "loss": 0.0792, + "step": 27964 + }, + { + "epoch": 0.4987871437234688, + "grad_norm": 0.3016611933708191, + "learning_rate": 2.944420842473813e-05, + "loss": 0.1429, + "step": 27965 + }, + { + "epoch": 0.49880497984518246, + "grad_norm": 0.1890726089477539, + "learning_rate": 2.9442676700023996e-05, + "loss": 0.1377, + "step": 27966 + }, + { + "epoch": 0.49882281596689615, + "grad_norm": 0.2599705755710602, + "learning_rate": 2.9441144958088513e-05, + "loss": 0.0986, + "step": 27967 + }, + { + "epoch": 0.49884065208860984, + "grad_norm": 0.3319588303565979, + "learning_rate": 2.943961319893762e-05, + "loss": 0.1805, + "step": 27968 + }, + { + "epoch": 0.4988584882103235, + "grad_norm": 0.2364499568939209, + "learning_rate": 2.9438081422577253e-05, + "loss": 0.1404, + "step": 27969 + }, + { + "epoch": 0.49887632433203727, + "grad_norm": 0.4357466697692871, + "learning_rate": 2.943654962901335e-05, + "loss": 0.1712, + "step": 27970 + }, + { + "epoch": 0.49889416045375096, + "grad_norm": 0.23609469830989838, + "learning_rate": 2.9435017818251844e-05, + "loss": 0.1527, + "step": 27971 + }, + { + "epoch": 0.49891199657546464, + "grad_norm": 0.2735869288444519, + "learning_rate": 2.943348599029868e-05, + "loss": 0.1455, + "step": 27972 + }, + { + "epoch": 0.49892983269717833, + "grad_norm": 0.27534282207489014, + "learning_rate": 2.943195414515979e-05, + "loss": 0.1676, + "step": 27973 + }, + { + "epoch": 0.498947668818892, + "grad_norm": 0.2754566967487335, + "learning_rate": 2.943042228284112e-05, + "loss": 0.1453, + "step": 27974 + }, + { + "epoch": 0.4989655049406057, + "grad_norm": 0.3426382541656494, + "learning_rate": 2.9428890403348597e-05, + "loss": 0.201, + "step": 27975 + }, + { + "epoch": 0.4989833410623194, + "grad_norm": 0.2707737684249878, + "learning_rate": 2.9427358506688174e-05, + "loss": 0.1642, + "step": 27976 + }, + { + "epoch": 0.4990011771840331, + "grad_norm": 0.24972344934940338, + "learning_rate": 2.9425826592865773e-05, + "loss": 0.1431, + "step": 27977 + }, + { + "epoch": 0.4990190133057468, + "grad_norm": 0.26107680797576904, + "learning_rate": 2.9424294661887346e-05, + "loss": 0.0909, + "step": 27978 + }, + { + "epoch": 0.4990368494274605, + "grad_norm": 0.21545089781284332, + "learning_rate": 2.9422762713758816e-05, + "loss": 0.1501, + "step": 27979 + }, + { + "epoch": 0.4990546855491742, + "grad_norm": 0.2637833058834076, + "learning_rate": 2.9421230748486133e-05, + "loss": 0.1205, + "step": 27980 + }, + { + "epoch": 0.4990725216708879, + "grad_norm": 0.24294264614582062, + "learning_rate": 2.941969876607524e-05, + "loss": 0.1369, + "step": 27981 + }, + { + "epoch": 0.4990903577926016, + "grad_norm": 0.427143931388855, + "learning_rate": 2.9418166766532064e-05, + "loss": 0.1718, + "step": 27982 + }, + { + "epoch": 0.49910819391431527, + "grad_norm": 0.26317328214645386, + "learning_rate": 2.9416634749862542e-05, + "loss": 0.1589, + "step": 27983 + }, + { + "epoch": 0.49912603003602896, + "grad_norm": 0.43047598004341125, + "learning_rate": 2.9415102716072622e-05, + "loss": 0.1823, + "step": 27984 + }, + { + "epoch": 0.49914386615774264, + "grad_norm": 0.24958008527755737, + "learning_rate": 2.9413570665168245e-05, + "loss": 0.1179, + "step": 27985 + }, + { + "epoch": 0.49916170227945633, + "grad_norm": 0.28731048107147217, + "learning_rate": 2.9412038597155334e-05, + "loss": 0.1377, + "step": 27986 + }, + { + "epoch": 0.4991795384011701, + "grad_norm": 0.21803952753543854, + "learning_rate": 2.9410506512039838e-05, + "loss": 0.1197, + "step": 27987 + }, + { + "epoch": 0.49919737452288376, + "grad_norm": 0.23055711388587952, + "learning_rate": 2.94089744098277e-05, + "loss": 0.1036, + "step": 27988 + }, + { + "epoch": 0.49921521064459745, + "grad_norm": 0.2937275767326355, + "learning_rate": 2.9407442290524854e-05, + "loss": 0.2049, + "step": 27989 + }, + { + "epoch": 0.49923304676631114, + "grad_norm": 0.22427405416965485, + "learning_rate": 2.9405910154137238e-05, + "loss": 0.1503, + "step": 27990 + }, + { + "epoch": 0.4992508828880248, + "grad_norm": 0.2825898230075836, + "learning_rate": 2.9404378000670795e-05, + "loss": 0.1722, + "step": 27991 + }, + { + "epoch": 0.4992687190097385, + "grad_norm": 0.21711206436157227, + "learning_rate": 2.940284583013146e-05, + "loss": 0.1219, + "step": 27992 + }, + { + "epoch": 0.4992865551314522, + "grad_norm": 0.24050657451152802, + "learning_rate": 2.9401313642525175e-05, + "loss": 0.1446, + "step": 27993 + }, + { + "epoch": 0.4993043912531659, + "grad_norm": 0.3528558015823364, + "learning_rate": 2.9399781437857877e-05, + "loss": 0.1245, + "step": 27994 + }, + { + "epoch": 0.4993222273748796, + "grad_norm": 0.2966325879096985, + "learning_rate": 2.9398249216135503e-05, + "loss": 0.1834, + "step": 27995 + }, + { + "epoch": 0.4993400634965933, + "grad_norm": 0.19626571238040924, + "learning_rate": 2.9396716977364002e-05, + "loss": 0.1152, + "step": 27996 + }, + { + "epoch": 0.499357899618307, + "grad_norm": 0.22221416234970093, + "learning_rate": 2.93951847215493e-05, + "loss": 0.1201, + "step": 27997 + }, + { + "epoch": 0.4993757357400207, + "grad_norm": 0.23627491295337677, + "learning_rate": 2.9393652448697355e-05, + "loss": 0.1094, + "step": 27998 + }, + { + "epoch": 0.4993935718617344, + "grad_norm": 0.23147229850292206, + "learning_rate": 2.9392120158814084e-05, + "loss": 0.0889, + "step": 27999 + }, + { + "epoch": 0.4994114079834481, + "grad_norm": 0.38820022344589233, + "learning_rate": 2.9390587851905444e-05, + "loss": 0.2184, + "step": 28000 + }, + { + "epoch": 0.4994114079834481, + "eval_loss": 0.13733762502670288, + "eval_runtime": 106.8398, + "eval_samples_per_second": 9.584, + "eval_steps_per_second": 1.601, + "step": 28000 + }, + { + "epoch": 0.49942924410516176, + "grad_norm": 0.2396186888217926, + "learning_rate": 2.9389055527977367e-05, + "loss": 0.1347, + "step": 28001 + }, + { + "epoch": 0.49944708022687545, + "grad_norm": 0.220523402094841, + "learning_rate": 2.9387523187035797e-05, + "loss": 0.0866, + "step": 28002 + }, + { + "epoch": 0.49946491634858914, + "grad_norm": 0.22361794114112854, + "learning_rate": 2.9385990829086673e-05, + "loss": 0.134, + "step": 28003 + }, + { + "epoch": 0.4994827524703029, + "grad_norm": 0.2309955358505249, + "learning_rate": 2.9384458454135933e-05, + "loss": 0.1628, + "step": 28004 + }, + { + "epoch": 0.49950058859201657, + "grad_norm": 0.22957615554332733, + "learning_rate": 2.9382926062189515e-05, + "loss": 0.1305, + "step": 28005 + }, + { + "epoch": 0.49951842471373026, + "grad_norm": 0.22241808474063873, + "learning_rate": 2.9381393653253368e-05, + "loss": 0.0978, + "step": 28006 + }, + { + "epoch": 0.49953626083544395, + "grad_norm": 0.23145662248134613, + "learning_rate": 2.9379861227333417e-05, + "loss": 0.1658, + "step": 28007 + }, + { + "epoch": 0.49955409695715763, + "grad_norm": 0.2550261616706848, + "learning_rate": 2.9378328784435616e-05, + "loss": 0.18, + "step": 28008 + }, + { + "epoch": 0.4995719330788713, + "grad_norm": 0.25448763370513916, + "learning_rate": 2.9376796324565904e-05, + "loss": 0.1105, + "step": 28009 + }, + { + "epoch": 0.499589769200585, + "grad_norm": 0.26080530881881714, + "learning_rate": 2.937526384773021e-05, + "loss": 0.1501, + "step": 28010 + }, + { + "epoch": 0.4996076053222987, + "grad_norm": 0.20976018905639648, + "learning_rate": 2.9373731353934487e-05, + "loss": 0.1387, + "step": 28011 + }, + { + "epoch": 0.49962544144401244, + "grad_norm": 0.21771611273288727, + "learning_rate": 2.9372198843184667e-05, + "loss": 0.1405, + "step": 28012 + }, + { + "epoch": 0.49964327756572613, + "grad_norm": 0.22740884125232697, + "learning_rate": 2.93706663154867e-05, + "loss": 0.1275, + "step": 28013 + }, + { + "epoch": 0.4996611136874398, + "grad_norm": 0.3152305781841278, + "learning_rate": 2.9369133770846515e-05, + "loss": 0.1187, + "step": 28014 + }, + { + "epoch": 0.4996789498091535, + "grad_norm": 0.3350315988063812, + "learning_rate": 2.9367601209270056e-05, + "loss": 0.2001, + "step": 28015 + }, + { + "epoch": 0.4996967859308672, + "grad_norm": 0.21111929416656494, + "learning_rate": 2.9366068630763273e-05, + "loss": 0.1257, + "step": 28016 + }, + { + "epoch": 0.4997146220525809, + "grad_norm": 0.27207595109939575, + "learning_rate": 2.93645360353321e-05, + "loss": 0.1317, + "step": 28017 + }, + { + "epoch": 0.49973245817429457, + "grad_norm": 0.310823917388916, + "learning_rate": 2.936300342298247e-05, + "loss": 0.1433, + "step": 28018 + }, + { + "epoch": 0.49975029429600826, + "grad_norm": 0.29535984992980957, + "learning_rate": 2.9361470793720346e-05, + "loss": 0.1631, + "step": 28019 + }, + { + "epoch": 0.49976813041772195, + "grad_norm": 0.20434069633483887, + "learning_rate": 2.9359938147551646e-05, + "loss": 0.1134, + "step": 28020 + }, + { + "epoch": 0.4997859665394357, + "grad_norm": 0.248887300491333, + "learning_rate": 2.935840548448232e-05, + "loss": 0.1513, + "step": 28021 + }, + { + "epoch": 0.4998038026611494, + "grad_norm": 0.3449693024158478, + "learning_rate": 2.935687280451831e-05, + "loss": 0.1891, + "step": 28022 + }, + { + "epoch": 0.49982163878286306, + "grad_norm": 0.3072223663330078, + "learning_rate": 2.935534010766555e-05, + "loss": 0.1703, + "step": 28023 + }, + { + "epoch": 0.49983947490457675, + "grad_norm": 0.24689984321594238, + "learning_rate": 2.9353807393929998e-05, + "loss": 0.1385, + "step": 28024 + }, + { + "epoch": 0.49985731102629044, + "grad_norm": 0.36406317353248596, + "learning_rate": 2.9352274663317575e-05, + "loss": 0.1707, + "step": 28025 + }, + { + "epoch": 0.49987514714800413, + "grad_norm": 0.300327867269516, + "learning_rate": 2.935074191583424e-05, + "loss": 0.1543, + "step": 28026 + }, + { + "epoch": 0.4998929832697178, + "grad_norm": 0.26289102435112, + "learning_rate": 2.9349209151485925e-05, + "loss": 0.1576, + "step": 28027 + }, + { + "epoch": 0.4999108193914315, + "grad_norm": 0.25302886962890625, + "learning_rate": 2.9347676370278564e-05, + "loss": 0.2007, + "step": 28028 + }, + { + "epoch": 0.49992865551314525, + "grad_norm": 0.26356586813926697, + "learning_rate": 2.9346143572218116e-05, + "loss": 0.1528, + "step": 28029 + }, + { + "epoch": 0.49994649163485894, + "grad_norm": 0.2535671591758728, + "learning_rate": 2.934461075731052e-05, + "loss": 0.1699, + "step": 28030 + }, + { + "epoch": 0.4999643277565726, + "grad_norm": 0.30741581320762634, + "learning_rate": 2.93430779255617e-05, + "loss": 0.1339, + "step": 28031 + }, + { + "epoch": 0.4999821638782863, + "grad_norm": 0.26062649488449097, + "learning_rate": 2.9341545076977622e-05, + "loss": 0.1114, + "step": 28032 + }, + { + "epoch": 0.5, + "grad_norm": 0.2643240690231323, + "learning_rate": 2.9340012211564212e-05, + "loss": 0.1453, + "step": 28033 + }, + { + "epoch": 0.5000178361217137, + "grad_norm": 0.26569658517837524, + "learning_rate": 2.9338479329327412e-05, + "loss": 0.1136, + "step": 28034 + }, + { + "epoch": 0.5000356722434274, + "grad_norm": 0.23257343471050262, + "learning_rate": 2.9336946430273167e-05, + "loss": 0.125, + "step": 28035 + }, + { + "epoch": 0.5000535083651411, + "grad_norm": 0.19785386323928833, + "learning_rate": 2.9335413514407418e-05, + "loss": 0.1075, + "step": 28036 + }, + { + "epoch": 0.5000713444868548, + "grad_norm": 0.2894573211669922, + "learning_rate": 2.933388058173612e-05, + "loss": 0.1792, + "step": 28037 + }, + { + "epoch": 0.5000891806085684, + "grad_norm": 0.19879817962646484, + "learning_rate": 2.9332347632265188e-05, + "loss": 0.1502, + "step": 28038 + }, + { + "epoch": 0.5001070167302821, + "grad_norm": 0.2950558364391327, + "learning_rate": 2.9330814666000596e-05, + "loss": 0.0814, + "step": 28039 + }, + { + "epoch": 0.5001248528519958, + "grad_norm": 0.31717249751091003, + "learning_rate": 2.932928168294826e-05, + "loss": 0.2298, + "step": 28040 + }, + { + "epoch": 0.5001426889737095, + "grad_norm": 0.24027322232723236, + "learning_rate": 2.9327748683114137e-05, + "loss": 0.147, + "step": 28041 + }, + { + "epoch": 0.5001605250954233, + "grad_norm": 0.2599031329154968, + "learning_rate": 2.9326215666504163e-05, + "loss": 0.1065, + "step": 28042 + }, + { + "epoch": 0.500178361217137, + "grad_norm": 0.2908802628517151, + "learning_rate": 2.9324682633124285e-05, + "loss": 0.0837, + "step": 28043 + }, + { + "epoch": 0.5001961973388507, + "grad_norm": 0.28757938742637634, + "learning_rate": 2.9323149582980436e-05, + "loss": 0.1416, + "step": 28044 + }, + { + "epoch": 0.5002140334605644, + "grad_norm": 0.35413914918899536, + "learning_rate": 2.9321616516078575e-05, + "loss": 0.1395, + "step": 28045 + }, + { + "epoch": 0.500231869582278, + "grad_norm": 0.29072460532188416, + "learning_rate": 2.932008343242464e-05, + "loss": 0.1576, + "step": 28046 + }, + { + "epoch": 0.5002497057039917, + "grad_norm": 0.24231556057929993, + "learning_rate": 2.9318550332024563e-05, + "loss": 0.1536, + "step": 28047 + }, + { + "epoch": 0.5002675418257054, + "grad_norm": 0.24185895919799805, + "learning_rate": 2.9317017214884297e-05, + "loss": 0.1559, + "step": 28048 + }, + { + "epoch": 0.5002853779474191, + "grad_norm": 0.2856906056404114, + "learning_rate": 2.9315484081009774e-05, + "loss": 0.1168, + "step": 28049 + }, + { + "epoch": 0.5003032140691328, + "grad_norm": 0.2460201531648636, + "learning_rate": 2.9313950930406947e-05, + "loss": 0.1381, + "step": 28050 + }, + { + "epoch": 0.5003210501908465, + "grad_norm": 0.27226993441581726, + "learning_rate": 2.931241776308175e-05, + "loss": 0.1041, + "step": 28051 + }, + { + "epoch": 0.5003388863125602, + "grad_norm": 0.3510323762893677, + "learning_rate": 2.9310884579040143e-05, + "loss": 0.1613, + "step": 28052 + }, + { + "epoch": 0.5003567224342739, + "grad_norm": 0.3239705264568329, + "learning_rate": 2.9309351378288052e-05, + "loss": 0.1665, + "step": 28053 + }, + { + "epoch": 0.5003745585559876, + "grad_norm": 0.3006664216518402, + "learning_rate": 2.9307818160831435e-05, + "loss": 0.1006, + "step": 28054 + }, + { + "epoch": 0.5003923946777012, + "grad_norm": 0.21220754086971283, + "learning_rate": 2.930628492667622e-05, + "loss": 0.1015, + "step": 28055 + }, + { + "epoch": 0.5004102307994149, + "grad_norm": 0.16951516270637512, + "learning_rate": 2.9304751675828358e-05, + "loss": 0.1297, + "step": 28056 + }, + { + "epoch": 0.5004280669211286, + "grad_norm": 0.24412105977535248, + "learning_rate": 2.9303218408293787e-05, + "loss": 0.1433, + "step": 28057 + }, + { + "epoch": 0.5004459030428423, + "grad_norm": 0.30340859293937683, + "learning_rate": 2.9301685124078465e-05, + "loss": 0.1274, + "step": 28058 + }, + { + "epoch": 0.5004637391645561, + "grad_norm": 0.3519989550113678, + "learning_rate": 2.9300151823188326e-05, + "loss": 0.136, + "step": 28059 + }, + { + "epoch": 0.5004815752862698, + "grad_norm": 0.31953418254852295, + "learning_rate": 2.9298618505629305e-05, + "loss": 0.1773, + "step": 28060 + }, + { + "epoch": 0.5004994114079835, + "grad_norm": 0.23292499780654907, + "learning_rate": 2.9297085171407363e-05, + "loss": 0.1361, + "step": 28061 + }, + { + "epoch": 0.5005172475296972, + "grad_norm": 0.31692078709602356, + "learning_rate": 2.9295551820528423e-05, + "loss": 0.1573, + "step": 28062 + }, + { + "epoch": 0.5005350836514109, + "grad_norm": 0.24037839472293854, + "learning_rate": 2.9294018452998445e-05, + "loss": 0.1545, + "step": 28063 + }, + { + "epoch": 0.5005529197731245, + "grad_norm": 0.23084452748298645, + "learning_rate": 2.9292485068823365e-05, + "loss": 0.1497, + "step": 28064 + }, + { + "epoch": 0.5005707558948382, + "grad_norm": 0.2341664731502533, + "learning_rate": 2.9290951668009143e-05, + "loss": 0.1326, + "step": 28065 + }, + { + "epoch": 0.5005885920165519, + "grad_norm": 0.17126047611236572, + "learning_rate": 2.92894182505617e-05, + "loss": 0.1162, + "step": 28066 + }, + { + "epoch": 0.5006064281382656, + "grad_norm": 0.30533409118652344, + "learning_rate": 2.9287884816486992e-05, + "loss": 0.1145, + "step": 28067 + }, + { + "epoch": 0.5006242642599793, + "grad_norm": 0.250066876411438, + "learning_rate": 2.9286351365790965e-05, + "loss": 0.1489, + "step": 28068 + }, + { + "epoch": 0.500642100381693, + "grad_norm": 0.32610979676246643, + "learning_rate": 2.928481789847956e-05, + "loss": 0.1515, + "step": 28069 + }, + { + "epoch": 0.5006599365034067, + "grad_norm": 0.22377827763557434, + "learning_rate": 2.9283284414558715e-05, + "loss": 0.0895, + "step": 28070 + }, + { + "epoch": 0.5006777726251204, + "grad_norm": 0.21274669468402863, + "learning_rate": 2.928175091403438e-05, + "loss": 0.132, + "step": 28071 + }, + { + "epoch": 0.500695608746834, + "grad_norm": 0.23008203506469727, + "learning_rate": 2.9280217396912508e-05, + "loss": 0.1428, + "step": 28072 + }, + { + "epoch": 0.5007134448685477, + "grad_norm": 0.22746393084526062, + "learning_rate": 2.9278683863199023e-05, + "loss": 0.1338, + "step": 28073 + }, + { + "epoch": 0.5007312809902614, + "grad_norm": 0.33229076862335205, + "learning_rate": 2.927715031289989e-05, + "loss": 0.1585, + "step": 28074 + }, + { + "epoch": 0.5007491171119751, + "grad_norm": 0.2531786262989044, + "learning_rate": 2.9275616746021045e-05, + "loss": 0.1531, + "step": 28075 + }, + { + "epoch": 0.5007669532336889, + "grad_norm": 0.27786120772361755, + "learning_rate": 2.9274083162568433e-05, + "loss": 0.1494, + "step": 28076 + }, + { + "epoch": 0.5007847893554026, + "grad_norm": 0.3160673975944519, + "learning_rate": 2.927254956254799e-05, + "loss": 0.1645, + "step": 28077 + }, + { + "epoch": 0.5008026254771163, + "grad_norm": 0.2560996115207672, + "learning_rate": 2.927101594596568e-05, + "loss": 0.1539, + "step": 28078 + }, + { + "epoch": 0.50082046159883, + "grad_norm": 0.2621341645717621, + "learning_rate": 2.9269482312827434e-05, + "loss": 0.141, + "step": 28079 + }, + { + "epoch": 0.5008382977205437, + "grad_norm": 0.29709592461586, + "learning_rate": 2.92679486631392e-05, + "loss": 0.1218, + "step": 28080 + }, + { + "epoch": 0.5008561338422574, + "grad_norm": 0.3116741180419922, + "learning_rate": 2.9266414996906923e-05, + "loss": 0.1596, + "step": 28081 + }, + { + "epoch": 0.500873969963971, + "grad_norm": 0.25773724913597107, + "learning_rate": 2.9264881314136544e-05, + "loss": 0.1331, + "step": 28082 + }, + { + "epoch": 0.5008918060856847, + "grad_norm": 0.31263741850852966, + "learning_rate": 2.9263347614834013e-05, + "loss": 0.1309, + "step": 28083 + }, + { + "epoch": 0.5009096422073984, + "grad_norm": 0.25896355509757996, + "learning_rate": 2.9261813899005273e-05, + "loss": 0.1254, + "step": 28084 + }, + { + "epoch": 0.5009274783291121, + "grad_norm": 0.26880183815956116, + "learning_rate": 2.9260280166656277e-05, + "loss": 0.0779, + "step": 28085 + }, + { + "epoch": 0.5009453144508258, + "grad_norm": 0.39075830578804016, + "learning_rate": 2.925874641779296e-05, + "loss": 0.1839, + "step": 28086 + }, + { + "epoch": 0.5009631505725395, + "grad_norm": 0.2828160524368286, + "learning_rate": 2.9257212652421274e-05, + "loss": 0.1238, + "step": 28087 + }, + { + "epoch": 0.5009809866942532, + "grad_norm": 0.3004515469074249, + "learning_rate": 2.9255678870547153e-05, + "loss": 0.1461, + "step": 28088 + }, + { + "epoch": 0.5009988228159669, + "grad_norm": 0.3946534991264343, + "learning_rate": 2.925414507217656e-05, + "loss": 0.1487, + "step": 28089 + }, + { + "epoch": 0.5010166589376805, + "grad_norm": 0.2745427191257477, + "learning_rate": 2.9252611257315428e-05, + "loss": 0.1333, + "step": 28090 + }, + { + "epoch": 0.5010344950593942, + "grad_norm": 0.2747195065021515, + "learning_rate": 2.9251077425969697e-05, + "loss": 0.1347, + "step": 28091 + }, + { + "epoch": 0.501052331181108, + "grad_norm": 0.25806090235710144, + "learning_rate": 2.9249543578145333e-05, + "loss": 0.1472, + "step": 28092 + }, + { + "epoch": 0.5010701673028217, + "grad_norm": 0.2558298408985138, + "learning_rate": 2.9248009713848268e-05, + "loss": 0.1931, + "step": 28093 + }, + { + "epoch": 0.5010880034245354, + "grad_norm": 0.277619868516922, + "learning_rate": 2.9246475833084446e-05, + "loss": 0.1391, + "step": 28094 + }, + { + "epoch": 0.5011058395462491, + "grad_norm": 0.19680927693843842, + "learning_rate": 2.9244941935859826e-05, + "loss": 0.1236, + "step": 28095 + }, + { + "epoch": 0.5011236756679628, + "grad_norm": 0.26855865120887756, + "learning_rate": 2.9243408022180334e-05, + "loss": 0.1138, + "step": 28096 + }, + { + "epoch": 0.5011415117896765, + "grad_norm": 0.2550387978553772, + "learning_rate": 2.9241874092051937e-05, + "loss": 0.1761, + "step": 28097 + }, + { + "epoch": 0.5011593479113902, + "grad_norm": 0.30739662051200867, + "learning_rate": 2.9240340145480566e-05, + "loss": 0.1593, + "step": 28098 + }, + { + "epoch": 0.5011771840331039, + "grad_norm": 0.3022283613681793, + "learning_rate": 2.9238806182472166e-05, + "loss": 0.1008, + "step": 28099 + }, + { + "epoch": 0.5011950201548175, + "grad_norm": 0.3105224370956421, + "learning_rate": 2.9237272203032702e-05, + "loss": 0.1477, + "step": 28100 + }, + { + "epoch": 0.5012128562765312, + "grad_norm": 0.3185834586620331, + "learning_rate": 2.9235738207168097e-05, + "loss": 0.1282, + "step": 28101 + }, + { + "epoch": 0.5012306923982449, + "grad_norm": 0.30447420477867126, + "learning_rate": 2.9234204194884314e-05, + "loss": 0.1537, + "step": 28102 + }, + { + "epoch": 0.5012485285199586, + "grad_norm": 0.26840248703956604, + "learning_rate": 2.923267016618729e-05, + "loss": 0.1867, + "step": 28103 + }, + { + "epoch": 0.5012663646416723, + "grad_norm": 0.2757900059223175, + "learning_rate": 2.923113612108297e-05, + "loss": 0.152, + "step": 28104 + }, + { + "epoch": 0.501284200763386, + "grad_norm": 0.3213600516319275, + "learning_rate": 2.922960205957731e-05, + "loss": 0.1979, + "step": 28105 + }, + { + "epoch": 0.5013020368850997, + "grad_norm": 0.240966334939003, + "learning_rate": 2.9228067981676256e-05, + "loss": 0.1324, + "step": 28106 + }, + { + "epoch": 0.5013198730068134, + "grad_norm": 0.1754409819841385, + "learning_rate": 2.922653388738574e-05, + "loss": 0.1252, + "step": 28107 + }, + { + "epoch": 0.501337709128527, + "grad_norm": 0.2842355966567993, + "learning_rate": 2.9224999776711726e-05, + "loss": 0.1963, + "step": 28108 + }, + { + "epoch": 0.5013555452502408, + "grad_norm": 0.26824405789375305, + "learning_rate": 2.9223465649660152e-05, + "loss": 0.1652, + "step": 28109 + }, + { + "epoch": 0.5013733813719545, + "grad_norm": 0.24093066155910492, + "learning_rate": 2.9221931506236973e-05, + "loss": 0.1239, + "step": 28110 + }, + { + "epoch": 0.5013912174936682, + "grad_norm": 0.2823273539543152, + "learning_rate": 2.922039734644812e-05, + "loss": 0.1501, + "step": 28111 + }, + { + "epoch": 0.5014090536153819, + "grad_norm": 0.3238869607448578, + "learning_rate": 2.921886317029955e-05, + "loss": 0.1873, + "step": 28112 + }, + { + "epoch": 0.5014268897370956, + "grad_norm": 0.2070161998271942, + "learning_rate": 2.9217328977797208e-05, + "loss": 0.1172, + "step": 28113 + }, + { + "epoch": 0.5014447258588093, + "grad_norm": 0.28239917755126953, + "learning_rate": 2.9215794768947046e-05, + "loss": 0.1446, + "step": 28114 + }, + { + "epoch": 0.501462561980523, + "grad_norm": 0.30700185894966125, + "learning_rate": 2.9214260543755013e-05, + "loss": 0.2055, + "step": 28115 + }, + { + "epoch": 0.5014803981022367, + "grad_norm": 0.17265966534614563, + "learning_rate": 2.9212726302227038e-05, + "loss": 0.1165, + "step": 28116 + }, + { + "epoch": 0.5014982342239503, + "grad_norm": 0.27358657121658325, + "learning_rate": 2.9211192044369094e-05, + "loss": 0.1357, + "step": 28117 + }, + { + "epoch": 0.501516070345664, + "grad_norm": 0.2435189187526703, + "learning_rate": 2.9209657770187104e-05, + "loss": 0.1136, + "step": 28118 + }, + { + "epoch": 0.5015339064673777, + "grad_norm": 0.27064594626426697, + "learning_rate": 2.9208123479687032e-05, + "loss": 0.1544, + "step": 28119 + }, + { + "epoch": 0.5015517425890914, + "grad_norm": 0.243107408285141, + "learning_rate": 2.9206589172874816e-05, + "loss": 0.1317, + "step": 28120 + }, + { + "epoch": 0.5015695787108051, + "grad_norm": 0.44382965564727783, + "learning_rate": 2.9205054849756412e-05, + "loss": 0.1056, + "step": 28121 + }, + { + "epoch": 0.5015874148325188, + "grad_norm": 0.2630468010902405, + "learning_rate": 2.920352051033776e-05, + "loss": 0.1409, + "step": 28122 + }, + { + "epoch": 0.5016052509542325, + "grad_norm": 0.2559202015399933, + "learning_rate": 2.9201986154624815e-05, + "loss": 0.1295, + "step": 28123 + }, + { + "epoch": 0.5016230870759462, + "grad_norm": 0.2731707990169525, + "learning_rate": 2.9200451782623517e-05, + "loss": 0.1352, + "step": 28124 + }, + { + "epoch": 0.5016409231976598, + "grad_norm": 0.2575231194496155, + "learning_rate": 2.919891739433982e-05, + "loss": 0.1671, + "step": 28125 + }, + { + "epoch": 0.5016587593193736, + "grad_norm": 0.36431142687797546, + "learning_rate": 2.9197382989779666e-05, + "loss": 0.1618, + "step": 28126 + }, + { + "epoch": 0.5016765954410873, + "grad_norm": 0.23700128495693207, + "learning_rate": 2.9195848568949002e-05, + "loss": 0.1081, + "step": 28127 + }, + { + "epoch": 0.501694431562801, + "grad_norm": 0.22517183423042297, + "learning_rate": 2.9194314131853785e-05, + "loss": 0.1166, + "step": 28128 + }, + { + "epoch": 0.5017122676845147, + "grad_norm": 0.2879955470561981, + "learning_rate": 2.9192779678499956e-05, + "loss": 0.1059, + "step": 28129 + }, + { + "epoch": 0.5017301038062284, + "grad_norm": 0.27152031660079956, + "learning_rate": 2.919124520889347e-05, + "loss": 0.1499, + "step": 28130 + }, + { + "epoch": 0.5017479399279421, + "grad_norm": 0.2744602560997009, + "learning_rate": 2.9189710723040263e-05, + "loss": 0.1517, + "step": 28131 + }, + { + "epoch": 0.5017657760496558, + "grad_norm": 0.22146444022655487, + "learning_rate": 2.918817622094629e-05, + "loss": 0.1808, + "step": 28132 + }, + { + "epoch": 0.5017836121713695, + "grad_norm": 0.35656702518463135, + "learning_rate": 2.91866417026175e-05, + "loss": 0.1111, + "step": 28133 + }, + { + "epoch": 0.5018014482930832, + "grad_norm": 0.2832266688346863, + "learning_rate": 2.9185107168059843e-05, + "loss": 0.1262, + "step": 28134 + }, + { + "epoch": 0.5018192844147968, + "grad_norm": 0.31262874603271484, + "learning_rate": 2.9183572617279265e-05, + "loss": 0.2218, + "step": 28135 + }, + { + "epoch": 0.5018371205365105, + "grad_norm": 0.3318476676940918, + "learning_rate": 2.918203805028172e-05, + "loss": 0.1578, + "step": 28136 + }, + { + "epoch": 0.5018549566582242, + "grad_norm": 0.2821923792362213, + "learning_rate": 2.9180503467073138e-05, + "loss": 0.1653, + "step": 28137 + }, + { + "epoch": 0.5018727927799379, + "grad_norm": 0.20795147120952606, + "learning_rate": 2.9178968867659496e-05, + "loss": 0.1116, + "step": 28138 + }, + { + "epoch": 0.5018906289016516, + "grad_norm": 0.32274147868156433, + "learning_rate": 2.9177434252046714e-05, + "loss": 0.103, + "step": 28139 + }, + { + "epoch": 0.5019084650233653, + "grad_norm": 0.4081878066062927, + "learning_rate": 2.917589962024076e-05, + "loss": 0.0963, + "step": 28140 + }, + { + "epoch": 0.501926301145079, + "grad_norm": 0.3755194842815399, + "learning_rate": 2.917436497224758e-05, + "loss": 0.1182, + "step": 28141 + }, + { + "epoch": 0.5019441372667927, + "grad_norm": 0.24923057854175568, + "learning_rate": 2.917283030807311e-05, + "loss": 0.0863, + "step": 28142 + }, + { + "epoch": 0.5019619733885065, + "grad_norm": 0.19700871407985687, + "learning_rate": 2.9171295627723323e-05, + "loss": 0.1098, + "step": 28143 + }, + { + "epoch": 0.5019798095102201, + "grad_norm": 0.2449912279844284, + "learning_rate": 2.9169760931204136e-05, + "loss": 0.1364, + "step": 28144 + }, + { + "epoch": 0.5019976456319338, + "grad_norm": 0.2614738941192627, + "learning_rate": 2.916822621852153e-05, + "loss": 0.1443, + "step": 28145 + }, + { + "epoch": 0.5020154817536475, + "grad_norm": 0.2839575707912445, + "learning_rate": 2.916669148968143e-05, + "loss": 0.1517, + "step": 28146 + }, + { + "epoch": 0.5020333178753612, + "grad_norm": 0.4459647834300995, + "learning_rate": 2.9165156744689798e-05, + "loss": 0.1994, + "step": 28147 + }, + { + "epoch": 0.5020511539970749, + "grad_norm": 0.22343763709068298, + "learning_rate": 2.9163621983552582e-05, + "loss": 0.1402, + "step": 28148 + }, + { + "epoch": 0.5020689901187886, + "grad_norm": 0.26836588978767395, + "learning_rate": 2.916208720627573e-05, + "loss": 0.128, + "step": 28149 + }, + { + "epoch": 0.5020868262405023, + "grad_norm": 0.23422901332378387, + "learning_rate": 2.9160552412865182e-05, + "loss": 0.0989, + "step": 28150 + }, + { + "epoch": 0.502104662362216, + "grad_norm": 0.24315550923347473, + "learning_rate": 2.9159017603326904e-05, + "loss": 0.1224, + "step": 28151 + }, + { + "epoch": 0.5021224984839296, + "grad_norm": 0.3060329854488373, + "learning_rate": 2.9157482777666838e-05, + "loss": 0.1303, + "step": 28152 + }, + { + "epoch": 0.5021403346056433, + "grad_norm": 0.2087785005569458, + "learning_rate": 2.915594793589093e-05, + "loss": 0.1532, + "step": 28153 + }, + { + "epoch": 0.502158170727357, + "grad_norm": 0.32457199692726135, + "learning_rate": 2.9154413078005126e-05, + "loss": 0.1332, + "step": 28154 + }, + { + "epoch": 0.5021760068490707, + "grad_norm": 0.2215084284543991, + "learning_rate": 2.9152878204015386e-05, + "loss": 0.1148, + "step": 28155 + }, + { + "epoch": 0.5021938429707844, + "grad_norm": 0.2640739679336548, + "learning_rate": 2.9151343313927663e-05, + "loss": 0.143, + "step": 28156 + }, + { + "epoch": 0.5022116790924981, + "grad_norm": 0.22357462346553802, + "learning_rate": 2.9149808407747893e-05, + "loss": 0.1446, + "step": 28157 + }, + { + "epoch": 0.5022295152142118, + "grad_norm": 0.23636764287948608, + "learning_rate": 2.9148273485482036e-05, + "loss": 0.1612, + "step": 28158 + }, + { + "epoch": 0.5022473513359255, + "grad_norm": 0.2029261291027069, + "learning_rate": 2.9146738547136037e-05, + "loss": 0.1144, + "step": 28159 + }, + { + "epoch": 0.5022651874576393, + "grad_norm": 0.23585352301597595, + "learning_rate": 2.9145203592715843e-05, + "loss": 0.1953, + "step": 28160 + }, + { + "epoch": 0.502283023579353, + "grad_norm": 0.24664238095283508, + "learning_rate": 2.914366862222741e-05, + "loss": 0.1368, + "step": 28161 + }, + { + "epoch": 0.5023008597010666, + "grad_norm": 0.4473843574523926, + "learning_rate": 2.914213363567669e-05, + "loss": 0.1382, + "step": 28162 + }, + { + "epoch": 0.5023186958227803, + "grad_norm": 0.2319299876689911, + "learning_rate": 2.9140598633069628e-05, + "loss": 0.1524, + "step": 28163 + }, + { + "epoch": 0.502336531944494, + "grad_norm": 0.21977467834949493, + "learning_rate": 2.9139063614412176e-05, + "loss": 0.0822, + "step": 28164 + }, + { + "epoch": 0.5023543680662077, + "grad_norm": 0.2436297982931137, + "learning_rate": 2.9137528579710284e-05, + "loss": 0.1433, + "step": 28165 + }, + { + "epoch": 0.5023722041879214, + "grad_norm": 0.2653481960296631, + "learning_rate": 2.9135993528969895e-05, + "loss": 0.1572, + "step": 28166 + }, + { + "epoch": 0.5023900403096351, + "grad_norm": 0.268472820520401, + "learning_rate": 2.9134458462196974e-05, + "loss": 0.2238, + "step": 28167 + }, + { + "epoch": 0.5024078764313488, + "grad_norm": 0.3164266049861908, + "learning_rate": 2.9132923379397458e-05, + "loss": 0.208, + "step": 28168 + }, + { + "epoch": 0.5024257125530625, + "grad_norm": 0.31102389097213745, + "learning_rate": 2.913138828057731e-05, + "loss": 0.2051, + "step": 28169 + }, + { + "epoch": 0.5024435486747761, + "grad_norm": 0.22614945471286774, + "learning_rate": 2.9129853165742472e-05, + "loss": 0.1181, + "step": 28170 + }, + { + "epoch": 0.5024613847964898, + "grad_norm": 0.20448987185955048, + "learning_rate": 2.91283180348989e-05, + "loss": 0.1799, + "step": 28171 + }, + { + "epoch": 0.5024792209182035, + "grad_norm": 0.2348880171775818, + "learning_rate": 2.9126782888052535e-05, + "loss": 0.1019, + "step": 28172 + }, + { + "epoch": 0.5024970570399172, + "grad_norm": 0.39700719714164734, + "learning_rate": 2.9125247725209337e-05, + "loss": 0.1301, + "step": 28173 + }, + { + "epoch": 0.5025148931616309, + "grad_norm": 0.4070872366428375, + "learning_rate": 2.9123712546375255e-05, + "loss": 0.1456, + "step": 28174 + }, + { + "epoch": 0.5025327292833446, + "grad_norm": 0.29996219277381897, + "learning_rate": 2.9122177351556235e-05, + "loss": 0.1371, + "step": 28175 + }, + { + "epoch": 0.5025505654050583, + "grad_norm": 0.3382759690284729, + "learning_rate": 2.912064214075823e-05, + "loss": 0.2226, + "step": 28176 + }, + { + "epoch": 0.5025684015267721, + "grad_norm": 0.2362937331199646, + "learning_rate": 2.9119106913987205e-05, + "loss": 0.1617, + "step": 28177 + }, + { + "epoch": 0.5025862376484858, + "grad_norm": 0.21494202315807343, + "learning_rate": 2.9117571671249095e-05, + "loss": 0.1644, + "step": 28178 + }, + { + "epoch": 0.5026040737701994, + "grad_norm": 0.3636217713356018, + "learning_rate": 2.9116036412549845e-05, + "loss": 0.1487, + "step": 28179 + }, + { + "epoch": 0.5026219098919131, + "grad_norm": 0.22337442636489868, + "learning_rate": 2.911450113789543e-05, + "loss": 0.1564, + "step": 28180 + }, + { + "epoch": 0.5026397460136268, + "grad_norm": 0.2899104356765747, + "learning_rate": 2.9112965847291768e-05, + "loss": 0.1852, + "step": 28181 + }, + { + "epoch": 0.5026575821353405, + "grad_norm": 0.3071236312389374, + "learning_rate": 2.9111430540744843e-05, + "loss": 0.2012, + "step": 28182 + }, + { + "epoch": 0.5026754182570542, + "grad_norm": 0.26008909940719604, + "learning_rate": 2.9109895218260597e-05, + "loss": 0.1325, + "step": 28183 + }, + { + "epoch": 0.5026932543787679, + "grad_norm": 0.2218368649482727, + "learning_rate": 2.910835987984497e-05, + "loss": 0.1619, + "step": 28184 + }, + { + "epoch": 0.5027110905004816, + "grad_norm": 0.32528555393218994, + "learning_rate": 2.9106824525503922e-05, + "loss": 0.1404, + "step": 28185 + }, + { + "epoch": 0.5027289266221953, + "grad_norm": 0.24894194304943085, + "learning_rate": 2.910528915524341e-05, + "loss": 0.1128, + "step": 28186 + }, + { + "epoch": 0.502746762743909, + "grad_norm": 0.29863473773002625, + "learning_rate": 2.9103753769069374e-05, + "loss": 0.1777, + "step": 28187 + }, + { + "epoch": 0.5027645988656226, + "grad_norm": 0.24769367277622223, + "learning_rate": 2.910221836698777e-05, + "loss": 0.12, + "step": 28188 + }, + { + "epoch": 0.5027824349873363, + "grad_norm": 0.22818894684314728, + "learning_rate": 2.910068294900455e-05, + "loss": 0.1569, + "step": 28189 + }, + { + "epoch": 0.50280027110905, + "grad_norm": 0.24973297119140625, + "learning_rate": 2.9099147515125675e-05, + "loss": 0.1192, + "step": 28190 + }, + { + "epoch": 0.5028181072307637, + "grad_norm": 0.5419824719429016, + "learning_rate": 2.9097612065357083e-05, + "loss": 0.1244, + "step": 28191 + }, + { + "epoch": 0.5028359433524774, + "grad_norm": 0.178416445851326, + "learning_rate": 2.909607659970473e-05, + "loss": 0.099, + "step": 28192 + }, + { + "epoch": 0.5028537794741912, + "grad_norm": 0.25704649090766907, + "learning_rate": 2.9094541118174574e-05, + "loss": 0.1142, + "step": 28193 + }, + { + "epoch": 0.5028716155959049, + "grad_norm": 0.3065432608127594, + "learning_rate": 2.9093005620772555e-05, + "loss": 0.1248, + "step": 28194 + }, + { + "epoch": 0.5028894517176186, + "grad_norm": 0.3015022277832031, + "learning_rate": 2.9091470107504638e-05, + "loss": 0.1401, + "step": 28195 + }, + { + "epoch": 0.5029072878393323, + "grad_norm": 0.3590488135814667, + "learning_rate": 2.9089934578376767e-05, + "loss": 0.1253, + "step": 28196 + }, + { + "epoch": 0.5029251239610459, + "grad_norm": 0.3722882866859436, + "learning_rate": 2.90883990333949e-05, + "loss": 0.1322, + "step": 28197 + }, + { + "epoch": 0.5029429600827596, + "grad_norm": 0.3550504148006439, + "learning_rate": 2.908686347256498e-05, + "loss": 0.1139, + "step": 28198 + }, + { + "epoch": 0.5029607962044733, + "grad_norm": 0.23661009967327118, + "learning_rate": 2.9085327895892974e-05, + "loss": 0.1097, + "step": 28199 + }, + { + "epoch": 0.502978632326187, + "grad_norm": 0.2474004179239273, + "learning_rate": 2.9083792303384815e-05, + "loss": 0.1441, + "step": 28200 + }, + { + "epoch": 0.5029964684479007, + "grad_norm": 0.20673780143260956, + "learning_rate": 2.908225669504648e-05, + "loss": 0.1023, + "step": 28201 + }, + { + "epoch": 0.5030143045696144, + "grad_norm": 0.31982874870300293, + "learning_rate": 2.90807210708839e-05, + "loss": 0.1562, + "step": 28202 + }, + { + "epoch": 0.5030321406913281, + "grad_norm": 0.44826820492744446, + "learning_rate": 2.9079185430903035e-05, + "loss": 0.2, + "step": 28203 + }, + { + "epoch": 0.5030499768130418, + "grad_norm": 0.22934645414352417, + "learning_rate": 2.9077649775109843e-05, + "loss": 0.1327, + "step": 28204 + }, + { + "epoch": 0.5030678129347554, + "grad_norm": 0.22432422637939453, + "learning_rate": 2.9076114103510265e-05, + "loss": 0.1463, + "step": 28205 + }, + { + "epoch": 0.5030856490564691, + "grad_norm": 0.2702496647834778, + "learning_rate": 2.907457841611027e-05, + "loss": 0.1029, + "step": 28206 + }, + { + "epoch": 0.5031034851781828, + "grad_norm": 0.25634831190109253, + "learning_rate": 2.90730427129158e-05, + "loss": 0.1095, + "step": 28207 + }, + { + "epoch": 0.5031213212998965, + "grad_norm": 0.2893608808517456, + "learning_rate": 2.90715069939328e-05, + "loss": 0.2039, + "step": 28208 + }, + { + "epoch": 0.5031391574216102, + "grad_norm": 0.434030145406723, + "learning_rate": 2.906997125916724e-05, + "loss": 0.14, + "step": 28209 + }, + { + "epoch": 0.503156993543324, + "grad_norm": 0.33444851636886597, + "learning_rate": 2.906843550862507e-05, + "loss": 0.2418, + "step": 28210 + }, + { + "epoch": 0.5031748296650377, + "grad_norm": 0.30654022097587585, + "learning_rate": 2.9066899742312226e-05, + "loss": 0.1692, + "step": 28211 + }, + { + "epoch": 0.5031926657867514, + "grad_norm": 0.23410923779010773, + "learning_rate": 2.9065363960234686e-05, + "loss": 0.139, + "step": 28212 + }, + { + "epoch": 0.5032105019084651, + "grad_norm": 0.2185935229063034, + "learning_rate": 2.906382816239839e-05, + "loss": 0.0798, + "step": 28213 + }, + { + "epoch": 0.5032283380301787, + "grad_norm": 0.2545168995857239, + "learning_rate": 2.906229234880929e-05, + "loss": 0.1167, + "step": 28214 + }, + { + "epoch": 0.5032461741518924, + "grad_norm": 0.2124333381652832, + "learning_rate": 2.9060756519473337e-05, + "loss": 0.1339, + "step": 28215 + }, + { + "epoch": 0.5032640102736061, + "grad_norm": 0.24294736981391907, + "learning_rate": 2.905922067439649e-05, + "loss": 0.1306, + "step": 28216 + }, + { + "epoch": 0.5032818463953198, + "grad_norm": 0.25309163331985474, + "learning_rate": 2.905768481358471e-05, + "loss": 0.1361, + "step": 28217 + }, + { + "epoch": 0.5032996825170335, + "grad_norm": 0.32499727606773376, + "learning_rate": 2.9056148937043936e-05, + "loss": 0.1304, + "step": 28218 + }, + { + "epoch": 0.5033175186387472, + "grad_norm": 0.21370266377925873, + "learning_rate": 2.905461304478013e-05, + "loss": 0.1149, + "step": 28219 + }, + { + "epoch": 0.5033353547604609, + "grad_norm": 0.3214581608772278, + "learning_rate": 2.9053077136799245e-05, + "loss": 0.1526, + "step": 28220 + }, + { + "epoch": 0.5033531908821746, + "grad_norm": 0.2663993537425995, + "learning_rate": 2.9051541213107232e-05, + "loss": 0.1126, + "step": 28221 + }, + { + "epoch": 0.5033710270038882, + "grad_norm": 0.2632231116294861, + "learning_rate": 2.9050005273710045e-05, + "loss": 0.1417, + "step": 28222 + }, + { + "epoch": 0.5033888631256019, + "grad_norm": 0.31506532430648804, + "learning_rate": 2.9048469318613635e-05, + "loss": 0.1822, + "step": 28223 + }, + { + "epoch": 0.5034066992473156, + "grad_norm": 0.18968191742897034, + "learning_rate": 2.904693334782396e-05, + "loss": 0.1211, + "step": 28224 + }, + { + "epoch": 0.5034245353690293, + "grad_norm": 0.3112109899520874, + "learning_rate": 2.9045397361346983e-05, + "loss": 0.1286, + "step": 28225 + }, + { + "epoch": 0.503442371490743, + "grad_norm": 0.22733506560325623, + "learning_rate": 2.904386135918864e-05, + "loss": 0.0959, + "step": 28226 + }, + { + "epoch": 0.5034602076124568, + "grad_norm": 0.3000713884830475, + "learning_rate": 2.9042325341354898e-05, + "loss": 0.1165, + "step": 28227 + }, + { + "epoch": 0.5034780437341705, + "grad_norm": 0.27964308857917786, + "learning_rate": 2.90407893078517e-05, + "loss": 0.1904, + "step": 28228 + }, + { + "epoch": 0.5034958798558842, + "grad_norm": 0.2866894602775574, + "learning_rate": 2.9039253258685017e-05, + "loss": 0.1439, + "step": 28229 + }, + { + "epoch": 0.5035137159775979, + "grad_norm": 0.3150649666786194, + "learning_rate": 2.9037717193860788e-05, + "loss": 0.1577, + "step": 28230 + }, + { + "epoch": 0.5035315520993116, + "grad_norm": 0.31004220247268677, + "learning_rate": 2.9036181113384965e-05, + "loss": 0.185, + "step": 28231 + }, + { + "epoch": 0.5035493882210252, + "grad_norm": 0.29367175698280334, + "learning_rate": 2.9034645017263522e-05, + "loss": 0.1619, + "step": 28232 + }, + { + "epoch": 0.5035672243427389, + "grad_norm": 0.3524821102619171, + "learning_rate": 2.903310890550239e-05, + "loss": 0.1607, + "step": 28233 + }, + { + "epoch": 0.5035850604644526, + "grad_norm": 0.25724029541015625, + "learning_rate": 2.903157277810755e-05, + "loss": 0.1416, + "step": 28234 + }, + { + "epoch": 0.5036028965861663, + "grad_norm": 0.22763384878635406, + "learning_rate": 2.9030036635084928e-05, + "loss": 0.068, + "step": 28235 + }, + { + "epoch": 0.50362073270788, + "grad_norm": 0.27549588680267334, + "learning_rate": 2.9028500476440497e-05, + "loss": 0.1306, + "step": 28236 + }, + { + "epoch": 0.5036385688295937, + "grad_norm": 0.20183268189430237, + "learning_rate": 2.9026964302180203e-05, + "loss": 0.1216, + "step": 28237 + }, + { + "epoch": 0.5036564049513074, + "grad_norm": 0.24084806442260742, + "learning_rate": 2.9025428112310014e-05, + "loss": 0.1604, + "step": 28238 + }, + { + "epoch": 0.5036742410730211, + "grad_norm": 0.25227150321006775, + "learning_rate": 2.9023891906835866e-05, + "loss": 0.1119, + "step": 28239 + }, + { + "epoch": 0.5036920771947347, + "grad_norm": 0.3735066056251526, + "learning_rate": 2.902235568576373e-05, + "loss": 0.1263, + "step": 28240 + }, + { + "epoch": 0.5037099133164484, + "grad_norm": 0.3639550805091858, + "learning_rate": 2.9020819449099552e-05, + "loss": 0.1085, + "step": 28241 + }, + { + "epoch": 0.5037277494381621, + "grad_norm": 0.21803152561187744, + "learning_rate": 2.901928319684929e-05, + "loss": 0.1101, + "step": 28242 + }, + { + "epoch": 0.5037455855598758, + "grad_norm": 0.2249739021062851, + "learning_rate": 2.9017746929018897e-05, + "loss": 0.1194, + "step": 28243 + }, + { + "epoch": 0.5037634216815896, + "grad_norm": 0.3105986714363098, + "learning_rate": 2.9016210645614322e-05, + "loss": 0.1695, + "step": 28244 + }, + { + "epoch": 0.5037812578033033, + "grad_norm": 0.2683711647987366, + "learning_rate": 2.901467434664154e-05, + "loss": 0.1242, + "step": 28245 + }, + { + "epoch": 0.503799093925017, + "grad_norm": 0.319701224565506, + "learning_rate": 2.9013138032106485e-05, + "loss": 0.1428, + "step": 28246 + }, + { + "epoch": 0.5038169300467307, + "grad_norm": 0.43664389848709106, + "learning_rate": 2.9011601702015124e-05, + "loss": 0.1651, + "step": 28247 + }, + { + "epoch": 0.5038347661684444, + "grad_norm": 0.34010130167007446, + "learning_rate": 2.9010065356373405e-05, + "loss": 0.1177, + "step": 28248 + }, + { + "epoch": 0.503852602290158, + "grad_norm": 0.23218445479869843, + "learning_rate": 2.9008528995187296e-05, + "loss": 0.1557, + "step": 28249 + }, + { + "epoch": 0.5038704384118717, + "grad_norm": 0.3078397810459137, + "learning_rate": 2.9006992618462736e-05, + "loss": 0.1593, + "step": 28250 + }, + { + "epoch": 0.5038882745335854, + "grad_norm": 0.2370034009218216, + "learning_rate": 2.900545622620569e-05, + "loss": 0.1081, + "step": 28251 + }, + { + "epoch": 0.5039061106552991, + "grad_norm": 0.2142411470413208, + "learning_rate": 2.9003919818422116e-05, + "loss": 0.1324, + "step": 28252 + }, + { + "epoch": 0.5039239467770128, + "grad_norm": 0.24562764167785645, + "learning_rate": 2.9002383395117965e-05, + "loss": 0.1711, + "step": 28253 + }, + { + "epoch": 0.5039417828987265, + "grad_norm": 0.2884778678417206, + "learning_rate": 2.900084695629919e-05, + "loss": 0.1495, + "step": 28254 + }, + { + "epoch": 0.5039596190204402, + "grad_norm": 0.2911554276943207, + "learning_rate": 2.8999310501971755e-05, + "loss": 0.0916, + "step": 28255 + }, + { + "epoch": 0.5039774551421539, + "grad_norm": 0.23309870064258575, + "learning_rate": 2.899777403214161e-05, + "loss": 0.0948, + "step": 28256 + }, + { + "epoch": 0.5039952912638676, + "grad_norm": 0.21861794590950012, + "learning_rate": 2.8996237546814713e-05, + "loss": 0.1512, + "step": 28257 + }, + { + "epoch": 0.5040131273855812, + "grad_norm": 0.2875767946243286, + "learning_rate": 2.8994701045997014e-05, + "loss": 0.137, + "step": 28258 + }, + { + "epoch": 0.5040309635072949, + "grad_norm": 0.2580637037754059, + "learning_rate": 2.8993164529694476e-05, + "loss": 0.1328, + "step": 28259 + }, + { + "epoch": 0.5040487996290086, + "grad_norm": 0.26604339480400085, + "learning_rate": 2.8991627997913055e-05, + "loss": 0.1949, + "step": 28260 + }, + { + "epoch": 0.5040666357507224, + "grad_norm": 0.20167092978954315, + "learning_rate": 2.8990091450658707e-05, + "loss": 0.099, + "step": 28261 + }, + { + "epoch": 0.5040844718724361, + "grad_norm": 0.3219318389892578, + "learning_rate": 2.8988554887937384e-05, + "loss": 0.1312, + "step": 28262 + }, + { + "epoch": 0.5041023079941498, + "grad_norm": 0.26011037826538086, + "learning_rate": 2.8987018309755044e-05, + "loss": 0.2164, + "step": 28263 + }, + { + "epoch": 0.5041201441158635, + "grad_norm": 0.2656829059123993, + "learning_rate": 2.898548171611764e-05, + "loss": 0.1538, + "step": 28264 + }, + { + "epoch": 0.5041379802375772, + "grad_norm": 0.2955317199230194, + "learning_rate": 2.8983945107031136e-05, + "loss": 0.1893, + "step": 28265 + }, + { + "epoch": 0.5041558163592909, + "grad_norm": 0.249686598777771, + "learning_rate": 2.8982408482501487e-05, + "loss": 0.0937, + "step": 28266 + }, + { + "epoch": 0.5041736524810045, + "grad_norm": 0.2201281189918518, + "learning_rate": 2.8980871842534646e-05, + "loss": 0.1007, + "step": 28267 + }, + { + "epoch": 0.5041914886027182, + "grad_norm": 0.24558743834495544, + "learning_rate": 2.8979335187136576e-05, + "loss": 0.1196, + "step": 28268 + }, + { + "epoch": 0.5042093247244319, + "grad_norm": 0.2904684543609619, + "learning_rate": 2.8977798516313226e-05, + "loss": 0.1576, + "step": 28269 + }, + { + "epoch": 0.5042271608461456, + "grad_norm": 0.20844466984272003, + "learning_rate": 2.897626183007055e-05, + "loss": 0.1248, + "step": 28270 + }, + { + "epoch": 0.5042449969678593, + "grad_norm": 0.32644855976104736, + "learning_rate": 2.897472512841451e-05, + "loss": 0.1425, + "step": 28271 + }, + { + "epoch": 0.504262833089573, + "grad_norm": 0.3606814742088318, + "learning_rate": 2.897318841135106e-05, + "loss": 0.2302, + "step": 28272 + }, + { + "epoch": 0.5042806692112867, + "grad_norm": 0.1962312012910843, + "learning_rate": 2.897165167888617e-05, + "loss": 0.1213, + "step": 28273 + }, + { + "epoch": 0.5042985053330004, + "grad_norm": 0.2603902220726013, + "learning_rate": 2.897011493102578e-05, + "loss": 0.1162, + "step": 28274 + }, + { + "epoch": 0.504316341454714, + "grad_norm": 0.24425619840621948, + "learning_rate": 2.896857816777585e-05, + "loss": 0.156, + "step": 28275 + }, + { + "epoch": 0.5043341775764277, + "grad_norm": 0.2703363299369812, + "learning_rate": 2.8967041389142346e-05, + "loss": 0.1484, + "step": 28276 + }, + { + "epoch": 0.5043520136981414, + "grad_norm": 0.21123532950878143, + "learning_rate": 2.896550459513122e-05, + "loss": 0.0945, + "step": 28277 + }, + { + "epoch": 0.5043698498198552, + "grad_norm": 0.24760466814041138, + "learning_rate": 2.8963967785748425e-05, + "loss": 0.1609, + "step": 28278 + }, + { + "epoch": 0.5043876859415689, + "grad_norm": 0.1943018138408661, + "learning_rate": 2.896243096099992e-05, + "loss": 0.1618, + "step": 28279 + }, + { + "epoch": 0.5044055220632826, + "grad_norm": 0.29512542486190796, + "learning_rate": 2.8960894120891662e-05, + "loss": 0.1593, + "step": 28280 + }, + { + "epoch": 0.5044233581849963, + "grad_norm": 0.28630539774894714, + "learning_rate": 2.895935726542962e-05, + "loss": 0.1401, + "step": 28281 + }, + { + "epoch": 0.50444119430671, + "grad_norm": 0.21738514304161072, + "learning_rate": 2.895782039461974e-05, + "loss": 0.1214, + "step": 28282 + }, + { + "epoch": 0.5044590304284237, + "grad_norm": 0.2422303706407547, + "learning_rate": 2.8956283508467975e-05, + "loss": 0.185, + "step": 28283 + }, + { + "epoch": 0.5044768665501373, + "grad_norm": 0.14903074502944946, + "learning_rate": 2.8954746606980295e-05, + "loss": 0.0847, + "step": 28284 + }, + { + "epoch": 0.504494702671851, + "grad_norm": 0.2515467703342438, + "learning_rate": 2.8953209690162642e-05, + "loss": 0.117, + "step": 28285 + }, + { + "epoch": 0.5045125387935647, + "grad_norm": 0.39284393191337585, + "learning_rate": 2.895167275802099e-05, + "loss": 0.1475, + "step": 28286 + }, + { + "epoch": 0.5045303749152784, + "grad_norm": 0.23131625354290009, + "learning_rate": 2.8950135810561286e-05, + "loss": 0.1638, + "step": 28287 + }, + { + "epoch": 0.5045482110369921, + "grad_norm": 0.23975437879562378, + "learning_rate": 2.8948598847789494e-05, + "loss": 0.1097, + "step": 28288 + }, + { + "epoch": 0.5045660471587058, + "grad_norm": 0.3290809690952301, + "learning_rate": 2.8947061869711566e-05, + "loss": 0.1602, + "step": 28289 + }, + { + "epoch": 0.5045838832804195, + "grad_norm": 0.25540485978126526, + "learning_rate": 2.894552487633347e-05, + "loss": 0.1524, + "step": 28290 + }, + { + "epoch": 0.5046017194021332, + "grad_norm": 0.2619209587574005, + "learning_rate": 2.8943987867661148e-05, + "loss": 0.1292, + "step": 28291 + }, + { + "epoch": 0.5046195555238469, + "grad_norm": 0.32695481181144714, + "learning_rate": 2.894245084370057e-05, + "loss": 0.1407, + "step": 28292 + }, + { + "epoch": 0.5046373916455605, + "grad_norm": 0.2673144042491913, + "learning_rate": 2.894091380445769e-05, + "loss": 0.1397, + "step": 28293 + }, + { + "epoch": 0.5046552277672743, + "grad_norm": 0.22678114473819733, + "learning_rate": 2.8939376749938472e-05, + "loss": 0.1296, + "step": 28294 + }, + { + "epoch": 0.504673063888988, + "grad_norm": 0.28748729825019836, + "learning_rate": 2.8937839680148872e-05, + "loss": 0.1152, + "step": 28295 + }, + { + "epoch": 0.5046909000107017, + "grad_norm": 0.3260626494884491, + "learning_rate": 2.8936302595094833e-05, + "loss": 0.1862, + "step": 28296 + }, + { + "epoch": 0.5047087361324154, + "grad_norm": 0.2241354137659073, + "learning_rate": 2.8934765494782333e-05, + "loss": 0.1071, + "step": 28297 + }, + { + "epoch": 0.5047265722541291, + "grad_norm": 0.26125675439834595, + "learning_rate": 2.8933228379217324e-05, + "loss": 0.1384, + "step": 28298 + }, + { + "epoch": 0.5047444083758428, + "grad_norm": 0.2865285277366638, + "learning_rate": 2.8931691248405756e-05, + "loss": 0.1669, + "step": 28299 + }, + { + "epoch": 0.5047622444975565, + "grad_norm": 0.22164495289325714, + "learning_rate": 2.8930154102353602e-05, + "loss": 0.1405, + "step": 28300 + }, + { + "epoch": 0.5047800806192702, + "grad_norm": 0.24465897679328918, + "learning_rate": 2.892861694106681e-05, + "loss": 0.1087, + "step": 28301 + }, + { + "epoch": 0.5047979167409838, + "grad_norm": 0.23553021252155304, + "learning_rate": 2.8927079764551344e-05, + "loss": 0.1178, + "step": 28302 + }, + { + "epoch": 0.5048157528626975, + "grad_norm": 0.28661873936653137, + "learning_rate": 2.8925542572813164e-05, + "loss": 0.1829, + "step": 28303 + }, + { + "epoch": 0.5048335889844112, + "grad_norm": 0.33187881112098694, + "learning_rate": 2.8924005365858216e-05, + "loss": 0.1375, + "step": 28304 + }, + { + "epoch": 0.5048514251061249, + "grad_norm": 0.3010408282279968, + "learning_rate": 2.8922468143692477e-05, + "loss": 0.1465, + "step": 28305 + }, + { + "epoch": 0.5048692612278386, + "grad_norm": 0.20376047492027283, + "learning_rate": 2.8920930906321887e-05, + "loss": 0.0954, + "step": 28306 + }, + { + "epoch": 0.5048870973495523, + "grad_norm": 0.21004408597946167, + "learning_rate": 2.891939365375242e-05, + "loss": 0.1216, + "step": 28307 + }, + { + "epoch": 0.504904933471266, + "grad_norm": 0.2306356579065323, + "learning_rate": 2.8917856385990034e-05, + "loss": 0.156, + "step": 28308 + }, + { + "epoch": 0.5049227695929797, + "grad_norm": 0.26142948865890503, + "learning_rate": 2.8916319103040672e-05, + "loss": 0.1748, + "step": 28309 + }, + { + "epoch": 0.5049406057146933, + "grad_norm": 0.26618692278862, + "learning_rate": 2.891478180491032e-05, + "loss": 0.1531, + "step": 28310 + }, + { + "epoch": 0.5049584418364071, + "grad_norm": 0.22935238480567932, + "learning_rate": 2.891324449160491e-05, + "loss": 0.1113, + "step": 28311 + }, + { + "epoch": 0.5049762779581208, + "grad_norm": 0.21562381088733673, + "learning_rate": 2.8911707163130413e-05, + "loss": 0.1217, + "step": 28312 + }, + { + "epoch": 0.5049941140798345, + "grad_norm": 0.28413572907447815, + "learning_rate": 2.8910169819492784e-05, + "loss": 0.1567, + "step": 28313 + }, + { + "epoch": 0.5050119502015482, + "grad_norm": 0.3368399441242218, + "learning_rate": 2.8908632460698e-05, + "loss": 0.2026, + "step": 28314 + }, + { + "epoch": 0.5050297863232619, + "grad_norm": 0.3194226920604706, + "learning_rate": 2.8907095086751996e-05, + "loss": 0.1683, + "step": 28315 + }, + { + "epoch": 0.5050476224449756, + "grad_norm": 0.28431981801986694, + "learning_rate": 2.8905557697660746e-05, + "loss": 0.1467, + "step": 28316 + }, + { + "epoch": 0.5050654585666893, + "grad_norm": 0.28889966011047363, + "learning_rate": 2.89040202934302e-05, + "loss": 0.1439, + "step": 28317 + }, + { + "epoch": 0.505083294688403, + "grad_norm": 0.24825017154216766, + "learning_rate": 2.8902482874066333e-05, + "loss": 0.144, + "step": 28318 + }, + { + "epoch": 0.5051011308101166, + "grad_norm": 0.2697658836841583, + "learning_rate": 2.890094543957509e-05, + "loss": 0.1255, + "step": 28319 + }, + { + "epoch": 0.5051189669318303, + "grad_norm": 0.18925027549266815, + "learning_rate": 2.889940798996243e-05, + "loss": 0.1064, + "step": 28320 + }, + { + "epoch": 0.505136803053544, + "grad_norm": 0.25063151121139526, + "learning_rate": 2.889787052523432e-05, + "loss": 0.1348, + "step": 28321 + }, + { + "epoch": 0.5051546391752577, + "grad_norm": 0.4020143151283264, + "learning_rate": 2.8896333045396724e-05, + "loss": 0.1762, + "step": 28322 + }, + { + "epoch": 0.5051724752969714, + "grad_norm": 0.2611508071422577, + "learning_rate": 2.8894795550455595e-05, + "loss": 0.12, + "step": 28323 + }, + { + "epoch": 0.5051903114186851, + "grad_norm": 0.28865477442741394, + "learning_rate": 2.889325804041688e-05, + "loss": 0.1435, + "step": 28324 + }, + { + "epoch": 0.5052081475403988, + "grad_norm": 0.23400035500526428, + "learning_rate": 2.889172051528657e-05, + "loss": 0.1477, + "step": 28325 + }, + { + "epoch": 0.5052259836621125, + "grad_norm": 0.320441871881485, + "learning_rate": 2.8890182975070594e-05, + "loss": 0.1369, + "step": 28326 + }, + { + "epoch": 0.5052438197838262, + "grad_norm": 0.2101871222257614, + "learning_rate": 2.888864541977493e-05, + "loss": 0.1112, + "step": 28327 + }, + { + "epoch": 0.50526165590554, + "grad_norm": 0.4844701290130615, + "learning_rate": 2.8887107849405533e-05, + "loss": 0.1169, + "step": 28328 + }, + { + "epoch": 0.5052794920272536, + "grad_norm": 0.22285085916519165, + "learning_rate": 2.8885570263968363e-05, + "loss": 0.1778, + "step": 28329 + }, + { + "epoch": 0.5052973281489673, + "grad_norm": 0.34672802686691284, + "learning_rate": 2.888403266346938e-05, + "loss": 0.1473, + "step": 28330 + }, + { + "epoch": 0.505315164270681, + "grad_norm": 0.20867227017879486, + "learning_rate": 2.888249504791455e-05, + "loss": 0.1265, + "step": 28331 + }, + { + "epoch": 0.5053330003923947, + "grad_norm": 0.3154470920562744, + "learning_rate": 2.888095741730982e-05, + "loss": 0.1871, + "step": 28332 + }, + { + "epoch": 0.5053508365141084, + "grad_norm": 0.20947100222110748, + "learning_rate": 2.8879419771661166e-05, + "loss": 0.1261, + "step": 28333 + }, + { + "epoch": 0.5053686726358221, + "grad_norm": 0.22799675166606903, + "learning_rate": 2.8877882110974534e-05, + "loss": 0.1309, + "step": 28334 + }, + { + "epoch": 0.5053865087575358, + "grad_norm": 0.1632799655199051, + "learning_rate": 2.8876344435255897e-05, + "loss": 0.1125, + "step": 28335 + }, + { + "epoch": 0.5054043448792495, + "grad_norm": 0.2476794272661209, + "learning_rate": 2.8874806744511206e-05, + "loss": 0.1771, + "step": 28336 + }, + { + "epoch": 0.5054221810009631, + "grad_norm": 0.24419334530830383, + "learning_rate": 2.887326903874643e-05, + "loss": 0.1097, + "step": 28337 + }, + { + "epoch": 0.5054400171226768, + "grad_norm": 0.19359739124774933, + "learning_rate": 2.8871731317967527e-05, + "loss": 0.127, + "step": 28338 + }, + { + "epoch": 0.5054578532443905, + "grad_norm": 0.2425316572189331, + "learning_rate": 2.887019358218045e-05, + "loss": 0.163, + "step": 28339 + }, + { + "epoch": 0.5054756893661042, + "grad_norm": 0.3257550299167633, + "learning_rate": 2.8868655831391166e-05, + "loss": 0.1482, + "step": 28340 + }, + { + "epoch": 0.5054935254878179, + "grad_norm": 0.2643504738807678, + "learning_rate": 2.8867118065605635e-05, + "loss": 0.1716, + "step": 28341 + }, + { + "epoch": 0.5055113616095316, + "grad_norm": 0.20067425072193146, + "learning_rate": 2.886558028482983e-05, + "loss": 0.1005, + "step": 28342 + }, + { + "epoch": 0.5055291977312453, + "grad_norm": 0.28453224897384644, + "learning_rate": 2.8864042489069687e-05, + "loss": 0.1977, + "step": 28343 + }, + { + "epoch": 0.505547033852959, + "grad_norm": 0.25305116176605225, + "learning_rate": 2.8862504678331192e-05, + "loss": 0.1427, + "step": 28344 + }, + { + "epoch": 0.5055648699746728, + "grad_norm": 0.30163607001304626, + "learning_rate": 2.8860966852620286e-05, + "loss": 0.1851, + "step": 28345 + }, + { + "epoch": 0.5055827060963864, + "grad_norm": 0.2377101182937622, + "learning_rate": 2.8859429011942947e-05, + "loss": 0.1403, + "step": 28346 + }, + { + "epoch": 0.5056005422181001, + "grad_norm": 0.31126388907432556, + "learning_rate": 2.8857891156305116e-05, + "loss": 0.1061, + "step": 28347 + }, + { + "epoch": 0.5056183783398138, + "grad_norm": 0.2425568550825119, + "learning_rate": 2.8856353285712777e-05, + "loss": 0.0923, + "step": 28348 + }, + { + "epoch": 0.5056362144615275, + "grad_norm": 0.2648632228374481, + "learning_rate": 2.885481540017188e-05, + "loss": 0.133, + "step": 28349 + }, + { + "epoch": 0.5056540505832412, + "grad_norm": 0.2518453896045685, + "learning_rate": 2.885327749968838e-05, + "loss": 0.1581, + "step": 28350 + }, + { + "epoch": 0.5056718867049549, + "grad_norm": 0.16720138490200043, + "learning_rate": 2.8851739584268257e-05, + "loss": 0.1488, + "step": 28351 + }, + { + "epoch": 0.5056897228266686, + "grad_norm": 0.25937044620513916, + "learning_rate": 2.885020165391745e-05, + "loss": 0.21, + "step": 28352 + }, + { + "epoch": 0.5057075589483823, + "grad_norm": 0.36026614904403687, + "learning_rate": 2.884866370864194e-05, + "loss": 0.1414, + "step": 28353 + }, + { + "epoch": 0.505725395070096, + "grad_norm": 0.27204209566116333, + "learning_rate": 2.8847125748447672e-05, + "loss": 0.1515, + "step": 28354 + }, + { + "epoch": 0.5057432311918096, + "grad_norm": 0.3236941397190094, + "learning_rate": 2.884558777334062e-05, + "loss": 0.1315, + "step": 28355 + }, + { + "epoch": 0.5057610673135233, + "grad_norm": 0.2813650965690613, + "learning_rate": 2.8844049783326736e-05, + "loss": 0.1268, + "step": 28356 + }, + { + "epoch": 0.505778903435237, + "grad_norm": 0.21412873268127441, + "learning_rate": 2.8842511778411997e-05, + "loss": 0.1262, + "step": 28357 + }, + { + "epoch": 0.5057967395569507, + "grad_norm": 0.2654881179332733, + "learning_rate": 2.8840973758602352e-05, + "loss": 0.0994, + "step": 28358 + }, + { + "epoch": 0.5058145756786644, + "grad_norm": 0.2818746268749237, + "learning_rate": 2.883943572390377e-05, + "loss": 0.1089, + "step": 28359 + }, + { + "epoch": 0.5058324118003781, + "grad_norm": 0.1742342859506607, + "learning_rate": 2.88378976743222e-05, + "loss": 0.1293, + "step": 28360 + }, + { + "epoch": 0.5058502479220918, + "grad_norm": 0.23831379413604736, + "learning_rate": 2.8836359609863615e-05, + "loss": 0.1271, + "step": 28361 + }, + { + "epoch": 0.5058680840438056, + "grad_norm": 0.2732433080673218, + "learning_rate": 2.883482153053398e-05, + "loss": 0.1041, + "step": 28362 + }, + { + "epoch": 0.5058859201655193, + "grad_norm": 0.2501196563243866, + "learning_rate": 2.883328343633925e-05, + "loss": 0.1351, + "step": 28363 + }, + { + "epoch": 0.5059037562872329, + "grad_norm": 0.23516647517681122, + "learning_rate": 2.8831745327285386e-05, + "loss": 0.1341, + "step": 28364 + }, + { + "epoch": 0.5059215924089466, + "grad_norm": 0.33683860301971436, + "learning_rate": 2.8830207203378357e-05, + "loss": 0.167, + "step": 28365 + }, + { + "epoch": 0.5059394285306603, + "grad_norm": 0.2710850238800049, + "learning_rate": 2.8828669064624124e-05, + "loss": 0.1, + "step": 28366 + }, + { + "epoch": 0.505957264652374, + "grad_norm": 0.2861916720867157, + "learning_rate": 2.8827130911028644e-05, + "loss": 0.1899, + "step": 28367 + }, + { + "epoch": 0.5059751007740877, + "grad_norm": 0.23177196085453033, + "learning_rate": 2.882559274259788e-05, + "loss": 0.1277, + "step": 28368 + }, + { + "epoch": 0.5059929368958014, + "grad_norm": 0.28973355889320374, + "learning_rate": 2.88240545593378e-05, + "loss": 0.1628, + "step": 28369 + }, + { + "epoch": 0.5060107730175151, + "grad_norm": 0.26916858553886414, + "learning_rate": 2.882251636125437e-05, + "loss": 0.1361, + "step": 28370 + }, + { + "epoch": 0.5060286091392288, + "grad_norm": 0.2533247172832489, + "learning_rate": 2.882097814835354e-05, + "loss": 0.1578, + "step": 28371 + }, + { + "epoch": 0.5060464452609424, + "grad_norm": 0.40038228034973145, + "learning_rate": 2.8819439920641276e-05, + "loss": 0.1232, + "step": 28372 + }, + { + "epoch": 0.5060642813826561, + "grad_norm": 0.27214571833610535, + "learning_rate": 2.881790167812355e-05, + "loss": 0.1674, + "step": 28373 + }, + { + "epoch": 0.5060821175043698, + "grad_norm": 0.2790907025337219, + "learning_rate": 2.881636342080632e-05, + "loss": 0.1524, + "step": 28374 + }, + { + "epoch": 0.5060999536260835, + "grad_norm": 0.41377803683280945, + "learning_rate": 2.881482514869554e-05, + "loss": 0.1551, + "step": 28375 + }, + { + "epoch": 0.5061177897477972, + "grad_norm": 0.23858848214149475, + "learning_rate": 2.8813286861797178e-05, + "loss": 0.1251, + "step": 28376 + }, + { + "epoch": 0.5061356258695109, + "grad_norm": 0.3751448392868042, + "learning_rate": 2.8811748560117208e-05, + "loss": 0.1869, + "step": 28377 + }, + { + "epoch": 0.5061534619912246, + "grad_norm": 0.27639347314834595, + "learning_rate": 2.881021024366158e-05, + "loss": 0.1687, + "step": 28378 + }, + { + "epoch": 0.5061712981129384, + "grad_norm": 0.25463035702705383, + "learning_rate": 2.8808671912436262e-05, + "loss": 0.0999, + "step": 28379 + }, + { + "epoch": 0.5061891342346521, + "grad_norm": 0.32845792174339294, + "learning_rate": 2.8807133566447213e-05, + "loss": 0.1824, + "step": 28380 + }, + { + "epoch": 0.5062069703563657, + "grad_norm": 0.2344057559967041, + "learning_rate": 2.880559520570041e-05, + "loss": 0.151, + "step": 28381 + }, + { + "epoch": 0.5062248064780794, + "grad_norm": 0.38982972502708435, + "learning_rate": 2.8804056830201793e-05, + "loss": 0.1802, + "step": 28382 + }, + { + "epoch": 0.5062426425997931, + "grad_norm": 0.2142881602048874, + "learning_rate": 2.8802518439957342e-05, + "loss": 0.1841, + "step": 28383 + }, + { + "epoch": 0.5062604787215068, + "grad_norm": 0.3108970522880554, + "learning_rate": 2.8800980034973014e-05, + "loss": 0.1515, + "step": 28384 + }, + { + "epoch": 0.5062783148432205, + "grad_norm": 0.26968348026275635, + "learning_rate": 2.879944161525478e-05, + "loss": 0.1831, + "step": 28385 + }, + { + "epoch": 0.5062961509649342, + "grad_norm": 0.2866678535938263, + "learning_rate": 2.8797903180808594e-05, + "loss": 0.1477, + "step": 28386 + }, + { + "epoch": 0.5063139870866479, + "grad_norm": 0.35315167903900146, + "learning_rate": 2.8796364731640428e-05, + "loss": 0.1428, + "step": 28387 + }, + { + "epoch": 0.5063318232083616, + "grad_norm": 0.3027825355529785, + "learning_rate": 2.879482626775624e-05, + "loss": 0.1612, + "step": 28388 + }, + { + "epoch": 0.5063496593300753, + "grad_norm": 0.36291801929473877, + "learning_rate": 2.8793287789161995e-05, + "loss": 0.1158, + "step": 28389 + }, + { + "epoch": 0.5063674954517889, + "grad_norm": 0.21735769510269165, + "learning_rate": 2.8791749295863646e-05, + "loss": 0.1392, + "step": 28390 + }, + { + "epoch": 0.5063853315735026, + "grad_norm": 0.2526581585407257, + "learning_rate": 2.879021078786718e-05, + "loss": 0.098, + "step": 28391 + }, + { + "epoch": 0.5064031676952163, + "grad_norm": 0.35680288076400757, + "learning_rate": 2.878867226517854e-05, + "loss": 0.1956, + "step": 28392 + }, + { + "epoch": 0.50642100381693, + "grad_norm": 0.2951757311820984, + "learning_rate": 2.87871337278037e-05, + "loss": 0.1527, + "step": 28393 + }, + { + "epoch": 0.5064388399386437, + "grad_norm": 0.2806277573108673, + "learning_rate": 2.8785595175748624e-05, + "loss": 0.1279, + "step": 28394 + }, + { + "epoch": 0.5064566760603575, + "grad_norm": 0.21698053181171417, + "learning_rate": 2.878405660901927e-05, + "loss": 0.1361, + "step": 28395 + }, + { + "epoch": 0.5064745121820712, + "grad_norm": 0.25057244300842285, + "learning_rate": 2.878251802762161e-05, + "loss": 0.1388, + "step": 28396 + }, + { + "epoch": 0.5064923483037849, + "grad_norm": 0.2653912901878357, + "learning_rate": 2.8780979431561593e-05, + "loss": 0.1745, + "step": 28397 + }, + { + "epoch": 0.5065101844254986, + "grad_norm": 0.2689267694950104, + "learning_rate": 2.877944082084521e-05, + "loss": 0.1215, + "step": 28398 + }, + { + "epoch": 0.5065280205472122, + "grad_norm": 0.36103591322898865, + "learning_rate": 2.8777902195478396e-05, + "loss": 0.1203, + "step": 28399 + }, + { + "epoch": 0.5065458566689259, + "grad_norm": 0.2123267501592636, + "learning_rate": 2.877636355546714e-05, + "loss": 0.1156, + "step": 28400 + }, + { + "epoch": 0.5065636927906396, + "grad_norm": 0.30385056138038635, + "learning_rate": 2.8774824900817388e-05, + "loss": 0.1828, + "step": 28401 + }, + { + "epoch": 0.5065815289123533, + "grad_norm": 0.2396973818540573, + "learning_rate": 2.8773286231535107e-05, + "loss": 0.1337, + "step": 28402 + }, + { + "epoch": 0.506599365034067, + "grad_norm": 0.33189648389816284, + "learning_rate": 2.877174754762627e-05, + "loss": 0.1545, + "step": 28403 + }, + { + "epoch": 0.5066172011557807, + "grad_norm": 0.23831751942634583, + "learning_rate": 2.877020884909683e-05, + "loss": 0.1211, + "step": 28404 + }, + { + "epoch": 0.5066350372774944, + "grad_norm": 0.36035892367362976, + "learning_rate": 2.8768670135952768e-05, + "loss": 0.1438, + "step": 28405 + }, + { + "epoch": 0.5066528733992081, + "grad_norm": 0.2645803689956665, + "learning_rate": 2.8767131408200033e-05, + "loss": 0.0785, + "step": 28406 + }, + { + "epoch": 0.5066707095209217, + "grad_norm": 0.24531646072864532, + "learning_rate": 2.8765592665844603e-05, + "loss": 0.1409, + "step": 28407 + }, + { + "epoch": 0.5066885456426354, + "grad_norm": 0.2863783836364746, + "learning_rate": 2.876405390889243e-05, + "loss": 0.171, + "step": 28408 + }, + { + "epoch": 0.5067063817643491, + "grad_norm": 0.31837233901023865, + "learning_rate": 2.876251513734948e-05, + "loss": 0.1294, + "step": 28409 + }, + { + "epoch": 0.5067242178860628, + "grad_norm": 0.22746196389198303, + "learning_rate": 2.876097635122173e-05, + "loss": 0.1222, + "step": 28410 + }, + { + "epoch": 0.5067420540077765, + "grad_norm": 0.26692891120910645, + "learning_rate": 2.8759437550515128e-05, + "loss": 0.1436, + "step": 28411 + }, + { + "epoch": 0.5067598901294903, + "grad_norm": 0.266908198595047, + "learning_rate": 2.875789873523565e-05, + "loss": 0.0977, + "step": 28412 + }, + { + "epoch": 0.506777726251204, + "grad_norm": 0.3461291193962097, + "learning_rate": 2.8756359905389264e-05, + "loss": 0.1501, + "step": 28413 + }, + { + "epoch": 0.5067955623729177, + "grad_norm": 0.2428554892539978, + "learning_rate": 2.875482106098193e-05, + "loss": 0.121, + "step": 28414 + }, + { + "epoch": 0.5068133984946314, + "grad_norm": 0.2869994044303894, + "learning_rate": 2.8753282202019603e-05, + "loss": 0.1528, + "step": 28415 + }, + { + "epoch": 0.506831234616345, + "grad_norm": 0.21481558680534363, + "learning_rate": 2.875174332850827e-05, + "loss": 0.1349, + "step": 28416 + }, + { + "epoch": 0.5068490707380587, + "grad_norm": 0.3269020915031433, + "learning_rate": 2.8750204440453872e-05, + "loss": 0.1684, + "step": 28417 + }, + { + "epoch": 0.5068669068597724, + "grad_norm": 0.32559534907341003, + "learning_rate": 2.874866553786239e-05, + "loss": 0.1526, + "step": 28418 + }, + { + "epoch": 0.5068847429814861, + "grad_norm": 0.2766841650009155, + "learning_rate": 2.874712662073979e-05, + "loss": 0.1298, + "step": 28419 + }, + { + "epoch": 0.5069025791031998, + "grad_norm": 0.2073928266763687, + "learning_rate": 2.874558768909203e-05, + "loss": 0.179, + "step": 28420 + }, + { + "epoch": 0.5069204152249135, + "grad_norm": 0.33086246252059937, + "learning_rate": 2.874404874292508e-05, + "loss": 0.1165, + "step": 28421 + }, + { + "epoch": 0.5069382513466272, + "grad_norm": 0.2925296425819397, + "learning_rate": 2.8742509782244904e-05, + "loss": 0.1392, + "step": 28422 + }, + { + "epoch": 0.5069560874683409, + "grad_norm": 0.2516690194606781, + "learning_rate": 2.8740970807057467e-05, + "loss": 0.1243, + "step": 28423 + }, + { + "epoch": 0.5069739235900546, + "grad_norm": 0.2913403809070587, + "learning_rate": 2.8739431817368728e-05, + "loss": 0.1606, + "step": 28424 + }, + { + "epoch": 0.5069917597117682, + "grad_norm": 0.26513007283210754, + "learning_rate": 2.8737892813184663e-05, + "loss": 0.1404, + "step": 28425 + }, + { + "epoch": 0.5070095958334819, + "grad_norm": 0.3299475908279419, + "learning_rate": 2.8736353794511246e-05, + "loss": 0.145, + "step": 28426 + }, + { + "epoch": 0.5070274319551956, + "grad_norm": 0.2627675533294678, + "learning_rate": 2.8734814761354423e-05, + "loss": 0.1394, + "step": 28427 + }, + { + "epoch": 0.5070452680769093, + "grad_norm": 0.2725169360637665, + "learning_rate": 2.8733275713720166e-05, + "loss": 0.1437, + "step": 28428 + }, + { + "epoch": 0.5070631041986231, + "grad_norm": 0.22264286875724792, + "learning_rate": 2.873173665161445e-05, + "loss": 0.0541, + "step": 28429 + }, + { + "epoch": 0.5070809403203368, + "grad_norm": 0.2680964171886444, + "learning_rate": 2.873019757504322e-05, + "loss": 0.1445, + "step": 28430 + }, + { + "epoch": 0.5070987764420505, + "grad_norm": 0.3315582573413849, + "learning_rate": 2.8728658484012465e-05, + "loss": 0.1693, + "step": 28431 + }, + { + "epoch": 0.5071166125637642, + "grad_norm": 0.26628589630126953, + "learning_rate": 2.8727119378528138e-05, + "loss": 0.1336, + "step": 28432 + }, + { + "epoch": 0.5071344486854779, + "grad_norm": 0.38926175236701965, + "learning_rate": 2.8725580258596218e-05, + "loss": 0.1417, + "step": 28433 + }, + { + "epoch": 0.5071522848071915, + "grad_norm": 0.2617027759552002, + "learning_rate": 2.8724041124222657e-05, + "loss": 0.2018, + "step": 28434 + }, + { + "epoch": 0.5071701209289052, + "grad_norm": 0.3515075743198395, + "learning_rate": 2.8722501975413422e-05, + "loss": 0.1508, + "step": 28435 + }, + { + "epoch": 0.5071879570506189, + "grad_norm": 0.23065081238746643, + "learning_rate": 2.872096281217449e-05, + "loss": 0.1546, + "step": 28436 + }, + { + "epoch": 0.5072057931723326, + "grad_norm": 0.27275601029396057, + "learning_rate": 2.8719423634511823e-05, + "loss": 0.0949, + "step": 28437 + }, + { + "epoch": 0.5072236292940463, + "grad_norm": 0.32661938667297363, + "learning_rate": 2.8717884442431374e-05, + "loss": 0.1606, + "step": 28438 + }, + { + "epoch": 0.50724146541576, + "grad_norm": 0.4000622034072876, + "learning_rate": 2.8716345235939128e-05, + "loss": 0.1797, + "step": 28439 + }, + { + "epoch": 0.5072593015374737, + "grad_norm": 0.23305268585681915, + "learning_rate": 2.8714806015041047e-05, + "loss": 0.1224, + "step": 28440 + }, + { + "epoch": 0.5072771376591874, + "grad_norm": 0.2751278877258301, + "learning_rate": 2.8713266779743087e-05, + "loss": 0.1696, + "step": 28441 + }, + { + "epoch": 0.507294973780901, + "grad_norm": 0.28718727827072144, + "learning_rate": 2.871172753005123e-05, + "loss": 0.2082, + "step": 28442 + }, + { + "epoch": 0.5073128099026147, + "grad_norm": 0.19972676038742065, + "learning_rate": 2.8710188265971434e-05, + "loss": 0.1356, + "step": 28443 + }, + { + "epoch": 0.5073306460243284, + "grad_norm": 0.32930871844291687, + "learning_rate": 2.870864898750966e-05, + "loss": 0.1624, + "step": 28444 + }, + { + "epoch": 0.5073484821460421, + "grad_norm": 0.24793067574501038, + "learning_rate": 2.8707109694671886e-05, + "loss": 0.1065, + "step": 28445 + }, + { + "epoch": 0.5073663182677559, + "grad_norm": 0.2668672800064087, + "learning_rate": 2.8705570387464074e-05, + "loss": 0.1249, + "step": 28446 + }, + { + "epoch": 0.5073841543894696, + "grad_norm": 0.3631262183189392, + "learning_rate": 2.8704031065892194e-05, + "loss": 0.1614, + "step": 28447 + }, + { + "epoch": 0.5074019905111833, + "grad_norm": 0.32281559705734253, + "learning_rate": 2.870249172996221e-05, + "loss": 0.0852, + "step": 28448 + }, + { + "epoch": 0.507419826632897, + "grad_norm": 0.3494997024536133, + "learning_rate": 2.8700952379680086e-05, + "loss": 0.1527, + "step": 28449 + }, + { + "epoch": 0.5074376627546107, + "grad_norm": 0.23304596543312073, + "learning_rate": 2.8699413015051796e-05, + "loss": 0.1141, + "step": 28450 + }, + { + "epoch": 0.5074554988763244, + "grad_norm": 0.2574032247066498, + "learning_rate": 2.86978736360833e-05, + "loss": 0.1534, + "step": 28451 + }, + { + "epoch": 0.507473334998038, + "grad_norm": 0.18970376253128052, + "learning_rate": 2.869633424278057e-05, + "loss": 0.1376, + "step": 28452 + }, + { + "epoch": 0.5074911711197517, + "grad_norm": 0.2882007360458374, + "learning_rate": 2.8694794835149575e-05, + "loss": 0.1419, + "step": 28453 + }, + { + "epoch": 0.5075090072414654, + "grad_norm": 0.28692206740379333, + "learning_rate": 2.8693255413196274e-05, + "loss": 0.1327, + "step": 28454 + }, + { + "epoch": 0.5075268433631791, + "grad_norm": 0.26517432928085327, + "learning_rate": 2.8691715976926642e-05, + "loss": 0.082, + "step": 28455 + }, + { + "epoch": 0.5075446794848928, + "grad_norm": 0.28659650683403015, + "learning_rate": 2.869017652634664e-05, + "loss": 0.1863, + "step": 28456 + }, + { + "epoch": 0.5075625156066065, + "grad_norm": 0.2675664722919464, + "learning_rate": 2.868863706146225e-05, + "loss": 0.1419, + "step": 28457 + }, + { + "epoch": 0.5075803517283202, + "grad_norm": 0.21762660145759583, + "learning_rate": 2.8687097582279417e-05, + "loss": 0.1289, + "step": 28458 + }, + { + "epoch": 0.5075981878500339, + "grad_norm": 0.23660603165626526, + "learning_rate": 2.868555808880412e-05, + "loss": 0.1544, + "step": 28459 + }, + { + "epoch": 0.5076160239717475, + "grad_norm": 0.2530672550201416, + "learning_rate": 2.8684018581042333e-05, + "loss": 0.1611, + "step": 28460 + }, + { + "epoch": 0.5076338600934612, + "grad_norm": 0.2534952163696289, + "learning_rate": 2.8682479059000017e-05, + "loss": 0.1222, + "step": 28461 + }, + { + "epoch": 0.5076516962151749, + "grad_norm": 0.24667608737945557, + "learning_rate": 2.8680939522683136e-05, + "loss": 0.1207, + "step": 28462 + }, + { + "epoch": 0.5076695323368887, + "grad_norm": 0.3288152515888214, + "learning_rate": 2.867939997209767e-05, + "loss": 0.1734, + "step": 28463 + }, + { + "epoch": 0.5076873684586024, + "grad_norm": 0.2570081651210785, + "learning_rate": 2.8677860407249574e-05, + "loss": 0.1242, + "step": 28464 + }, + { + "epoch": 0.5077052045803161, + "grad_norm": 0.2850131690502167, + "learning_rate": 2.867632082814482e-05, + "loss": 0.1193, + "step": 28465 + }, + { + "epoch": 0.5077230407020298, + "grad_norm": 0.23317448794841766, + "learning_rate": 2.8674781234789378e-05, + "loss": 0.1543, + "step": 28466 + }, + { + "epoch": 0.5077408768237435, + "grad_norm": 0.21266146004199982, + "learning_rate": 2.8673241627189212e-05, + "loss": 0.1105, + "step": 28467 + }, + { + "epoch": 0.5077587129454572, + "grad_norm": 0.25855812430381775, + "learning_rate": 2.8671702005350298e-05, + "loss": 0.1176, + "step": 28468 + }, + { + "epoch": 0.5077765490671708, + "grad_norm": 0.20435065031051636, + "learning_rate": 2.867016236927859e-05, + "loss": 0.1333, + "step": 28469 + }, + { + "epoch": 0.5077943851888845, + "grad_norm": 0.35112375020980835, + "learning_rate": 2.8668622718980077e-05, + "loss": 0.207, + "step": 28470 + }, + { + "epoch": 0.5078122213105982, + "grad_norm": 0.40474754571914673, + "learning_rate": 2.8667083054460707e-05, + "loss": 0.202, + "step": 28471 + }, + { + "epoch": 0.5078300574323119, + "grad_norm": 0.2701167166233063, + "learning_rate": 2.8665543375726454e-05, + "loss": 0.1236, + "step": 28472 + }, + { + "epoch": 0.5078478935540256, + "grad_norm": 0.22002114355564117, + "learning_rate": 2.8664003682783298e-05, + "loss": 0.0965, + "step": 28473 + }, + { + "epoch": 0.5078657296757393, + "grad_norm": 0.2836028039455414, + "learning_rate": 2.8662463975637195e-05, + "loss": 0.1512, + "step": 28474 + }, + { + "epoch": 0.507883565797453, + "grad_norm": 0.22661170363426208, + "learning_rate": 2.8660924254294115e-05, + "loss": 0.1311, + "step": 28475 + }, + { + "epoch": 0.5079014019191667, + "grad_norm": 0.3529321253299713, + "learning_rate": 2.865938451876003e-05, + "loss": 0.1428, + "step": 28476 + }, + { + "epoch": 0.5079192380408803, + "grad_norm": 0.20662051439285278, + "learning_rate": 2.8657844769040904e-05, + "loss": 0.115, + "step": 28477 + }, + { + "epoch": 0.507937074162594, + "grad_norm": 0.29631802439689636, + "learning_rate": 2.865630500514271e-05, + "loss": 0.1876, + "step": 28478 + }, + { + "epoch": 0.5079549102843077, + "grad_norm": 0.22083105146884918, + "learning_rate": 2.8654765227071413e-05, + "loss": 0.1469, + "step": 28479 + }, + { + "epoch": 0.5079727464060215, + "grad_norm": 0.24177899956703186, + "learning_rate": 2.865322543483298e-05, + "loss": 0.1275, + "step": 28480 + }, + { + "epoch": 0.5079905825277352, + "grad_norm": 0.26579251885414124, + "learning_rate": 2.8651685628433396e-05, + "loss": 0.1149, + "step": 28481 + }, + { + "epoch": 0.5080084186494489, + "grad_norm": 0.3118237555027008, + "learning_rate": 2.865014580787861e-05, + "loss": 0.1271, + "step": 28482 + }, + { + "epoch": 0.5080262547711626, + "grad_norm": 0.3080024719238281, + "learning_rate": 2.8648605973174597e-05, + "loss": 0.1481, + "step": 28483 + }, + { + "epoch": 0.5080440908928763, + "grad_norm": 0.14927025139331818, + "learning_rate": 2.8647066124327326e-05, + "loss": 0.0791, + "step": 28484 + }, + { + "epoch": 0.50806192701459, + "grad_norm": 0.2248726338148117, + "learning_rate": 2.864552626134277e-05, + "loss": 0.1025, + "step": 28485 + }, + { + "epoch": 0.5080797631363037, + "grad_norm": 0.3614726662635803, + "learning_rate": 2.8643986384226895e-05, + "loss": 0.1782, + "step": 28486 + }, + { + "epoch": 0.5080975992580173, + "grad_norm": 0.3290412127971649, + "learning_rate": 2.8642446492985665e-05, + "loss": 0.1325, + "step": 28487 + }, + { + "epoch": 0.508115435379731, + "grad_norm": 0.3148520886898041, + "learning_rate": 2.864090658762506e-05, + "loss": 0.1227, + "step": 28488 + }, + { + "epoch": 0.5081332715014447, + "grad_norm": 0.25307032465934753, + "learning_rate": 2.8639366668151047e-05, + "loss": 0.1439, + "step": 28489 + }, + { + "epoch": 0.5081511076231584, + "grad_norm": 0.3114222288131714, + "learning_rate": 2.8637826734569584e-05, + "loss": 0.1466, + "step": 28490 + }, + { + "epoch": 0.5081689437448721, + "grad_norm": 0.19577671587467194, + "learning_rate": 2.8636286786886653e-05, + "loss": 0.0834, + "step": 28491 + }, + { + "epoch": 0.5081867798665858, + "grad_norm": 0.2320423424243927, + "learning_rate": 2.8634746825108216e-05, + "loss": 0.1019, + "step": 28492 + }, + { + "epoch": 0.5082046159882995, + "grad_norm": 0.24921290576457977, + "learning_rate": 2.8633206849240246e-05, + "loss": 0.1161, + "step": 28493 + }, + { + "epoch": 0.5082224521100132, + "grad_norm": 0.24708881974220276, + "learning_rate": 2.8631666859288707e-05, + "loss": 0.1491, + "step": 28494 + }, + { + "epoch": 0.5082402882317268, + "grad_norm": 0.2295462042093277, + "learning_rate": 2.8630126855259575e-05, + "loss": 0.0968, + "step": 28495 + }, + { + "epoch": 0.5082581243534405, + "grad_norm": 0.2319304198026657, + "learning_rate": 2.862858683715882e-05, + "loss": 0.1322, + "step": 28496 + }, + { + "epoch": 0.5082759604751543, + "grad_norm": 0.30353572964668274, + "learning_rate": 2.8627046804992406e-05, + "loss": 0.1499, + "step": 28497 + }, + { + "epoch": 0.508293796596868, + "grad_norm": 0.20552609860897064, + "learning_rate": 2.862550675876632e-05, + "loss": 0.0737, + "step": 28498 + }, + { + "epoch": 0.5083116327185817, + "grad_norm": 0.2148098349571228, + "learning_rate": 2.86239666984865e-05, + "loss": 0.1245, + "step": 28499 + }, + { + "epoch": 0.5083294688402954, + "grad_norm": 0.2352498173713684, + "learning_rate": 2.8622426624158938e-05, + "loss": 0.1187, + "step": 28500 + }, + { + "epoch": 0.5083473049620091, + "grad_norm": 0.34014391899108887, + "learning_rate": 2.8620886535789597e-05, + "loss": 0.1764, + "step": 28501 + }, + { + "epoch": 0.5083651410837228, + "grad_norm": 0.24722465872764587, + "learning_rate": 2.861934643338446e-05, + "loss": 0.1969, + "step": 28502 + }, + { + "epoch": 0.5083829772054365, + "grad_norm": 0.2576411962509155, + "learning_rate": 2.8617806316949475e-05, + "loss": 0.1372, + "step": 28503 + }, + { + "epoch": 0.5084008133271501, + "grad_norm": 0.2918015420436859, + "learning_rate": 2.8616266186490627e-05, + "loss": 0.1298, + "step": 28504 + }, + { + "epoch": 0.5084186494488638, + "grad_norm": 0.3275103271007538, + "learning_rate": 2.8614726042013885e-05, + "loss": 0.1697, + "step": 28505 + }, + { + "epoch": 0.5084364855705775, + "grad_norm": 0.30394643545150757, + "learning_rate": 2.8613185883525212e-05, + "loss": 0.1108, + "step": 28506 + }, + { + "epoch": 0.5084543216922912, + "grad_norm": 0.22448478639125824, + "learning_rate": 2.861164571103058e-05, + "loss": 0.1437, + "step": 28507 + }, + { + "epoch": 0.5084721578140049, + "grad_norm": 0.30676203966140747, + "learning_rate": 2.8610105524535967e-05, + "loss": 0.1606, + "step": 28508 + }, + { + "epoch": 0.5084899939357186, + "grad_norm": 0.2928142845630646, + "learning_rate": 2.860856532404734e-05, + "loss": 0.1578, + "step": 28509 + }, + { + "epoch": 0.5085078300574323, + "grad_norm": 0.23866738379001617, + "learning_rate": 2.860702510957066e-05, + "loss": 0.1044, + "step": 28510 + }, + { + "epoch": 0.508525666179146, + "grad_norm": 0.4757208228111267, + "learning_rate": 2.8605484881111917e-05, + "loss": 0.1826, + "step": 28511 + }, + { + "epoch": 0.5085435023008597, + "grad_norm": 0.33099836111068726, + "learning_rate": 2.860394463867706e-05, + "loss": 0.1494, + "step": 28512 + }, + { + "epoch": 0.5085613384225735, + "grad_norm": 0.30314525961875916, + "learning_rate": 2.8602404382272075e-05, + "loss": 0.1648, + "step": 28513 + }, + { + "epoch": 0.5085791745442871, + "grad_norm": 0.2715233564376831, + "learning_rate": 2.8600864111902913e-05, + "loss": 0.1461, + "step": 28514 + }, + { + "epoch": 0.5085970106660008, + "grad_norm": 0.2651790380477905, + "learning_rate": 2.859932382757557e-05, + "loss": 0.1876, + "step": 28515 + }, + { + "epoch": 0.5086148467877145, + "grad_norm": 0.2851487398147583, + "learning_rate": 2.8597783529295997e-05, + "loss": 0.1122, + "step": 28516 + }, + { + "epoch": 0.5086326829094282, + "grad_norm": 0.3177490532398224, + "learning_rate": 2.859624321707018e-05, + "loss": 0.1848, + "step": 28517 + }, + { + "epoch": 0.5086505190311419, + "grad_norm": 0.3167870342731476, + "learning_rate": 2.8594702890904078e-05, + "loss": 0.1535, + "step": 28518 + }, + { + "epoch": 0.5086683551528556, + "grad_norm": 0.3049619495868683, + "learning_rate": 2.859316255080367e-05, + "loss": 0.1518, + "step": 28519 + }, + { + "epoch": 0.5086861912745693, + "grad_norm": 0.22853955626487732, + "learning_rate": 2.8591622196774925e-05, + "loss": 0.1153, + "step": 28520 + }, + { + "epoch": 0.508704027396283, + "grad_norm": 0.22894343733787537, + "learning_rate": 2.85900818288238e-05, + "loss": 0.112, + "step": 28521 + }, + { + "epoch": 0.5087218635179966, + "grad_norm": 0.23513081669807434, + "learning_rate": 2.8588541446956286e-05, + "loss": 0.1555, + "step": 28522 + }, + { + "epoch": 0.5087396996397103, + "grad_norm": 0.25258374214172363, + "learning_rate": 2.8587001051178343e-05, + "loss": 0.14, + "step": 28523 + }, + { + "epoch": 0.508757535761424, + "grad_norm": 0.4361419081687927, + "learning_rate": 2.8585460641495947e-05, + "loss": 0.1317, + "step": 28524 + }, + { + "epoch": 0.5087753718831377, + "grad_norm": 0.1856044977903366, + "learning_rate": 2.8583920217915066e-05, + "loss": 0.0932, + "step": 28525 + }, + { + "epoch": 0.5087932080048514, + "grad_norm": 0.28256869316101074, + "learning_rate": 2.8582379780441675e-05, + "loss": 0.1364, + "step": 28526 + }, + { + "epoch": 0.5088110441265651, + "grad_norm": 0.2961440980434418, + "learning_rate": 2.8580839329081742e-05, + "loss": 0.1078, + "step": 28527 + }, + { + "epoch": 0.5088288802482788, + "grad_norm": 0.21577990055084229, + "learning_rate": 2.8579298863841236e-05, + "loss": 0.1299, + "step": 28528 + }, + { + "epoch": 0.5088467163699925, + "grad_norm": 0.2719804644584656, + "learning_rate": 2.8577758384726133e-05, + "loss": 0.1086, + "step": 28529 + }, + { + "epoch": 0.5088645524917063, + "grad_norm": 0.3241913914680481, + "learning_rate": 2.8576217891742408e-05, + "loss": 0.1579, + "step": 28530 + }, + { + "epoch": 0.50888238861342, + "grad_norm": 0.19262385368347168, + "learning_rate": 2.8574677384896016e-05, + "loss": 0.0771, + "step": 28531 + }, + { + "epoch": 0.5089002247351336, + "grad_norm": 0.33867642283439636, + "learning_rate": 2.857313686419295e-05, + "loss": 0.1581, + "step": 28532 + }, + { + "epoch": 0.5089180608568473, + "grad_norm": 0.2797696590423584, + "learning_rate": 2.8571596329639173e-05, + "loss": 0.158, + "step": 28533 + }, + { + "epoch": 0.508935896978561, + "grad_norm": 0.23965167999267578, + "learning_rate": 2.8570055781240645e-05, + "loss": 0.1528, + "step": 28534 + }, + { + "epoch": 0.5089537331002747, + "grad_norm": 0.3141854703426361, + "learning_rate": 2.856851521900335e-05, + "loss": 0.1523, + "step": 28535 + }, + { + "epoch": 0.5089715692219884, + "grad_norm": 0.2644658088684082, + "learning_rate": 2.856697464293326e-05, + "loss": 0.1391, + "step": 28536 + }, + { + "epoch": 0.5089894053437021, + "grad_norm": 0.2711368203163147, + "learning_rate": 2.8565434053036344e-05, + "loss": 0.1473, + "step": 28537 + }, + { + "epoch": 0.5090072414654158, + "grad_norm": 0.23995690047740936, + "learning_rate": 2.8563893449318575e-05, + "loss": 0.1157, + "step": 28538 + }, + { + "epoch": 0.5090250775871294, + "grad_norm": 0.35455048084259033, + "learning_rate": 2.8562352831785925e-05, + "loss": 0.1987, + "step": 28539 + }, + { + "epoch": 0.5090429137088431, + "grad_norm": 0.24381007254123688, + "learning_rate": 2.856081220044436e-05, + "loss": 0.0913, + "step": 28540 + }, + { + "epoch": 0.5090607498305568, + "grad_norm": 0.3457053303718567, + "learning_rate": 2.8559271555299865e-05, + "loss": 0.1, + "step": 28541 + }, + { + "epoch": 0.5090785859522705, + "grad_norm": 0.22499744594097137, + "learning_rate": 2.8557730896358398e-05, + "loss": 0.1109, + "step": 28542 + }, + { + "epoch": 0.5090964220739842, + "grad_norm": 0.2553752064704895, + "learning_rate": 2.8556190223625933e-05, + "loss": 0.1445, + "step": 28543 + }, + { + "epoch": 0.5091142581956979, + "grad_norm": 0.26085275411605835, + "learning_rate": 2.855464953710845e-05, + "loss": 0.1452, + "step": 28544 + }, + { + "epoch": 0.5091320943174116, + "grad_norm": 0.3246316611766815, + "learning_rate": 2.8553108836811927e-05, + "loss": 0.1073, + "step": 28545 + }, + { + "epoch": 0.5091499304391253, + "grad_norm": 0.26541852951049805, + "learning_rate": 2.8551568122742323e-05, + "loss": 0.1714, + "step": 28546 + }, + { + "epoch": 0.5091677665608391, + "grad_norm": 0.25668826699256897, + "learning_rate": 2.8550027394905608e-05, + "loss": 0.1036, + "step": 28547 + }, + { + "epoch": 0.5091856026825528, + "grad_norm": 0.3039669394493103, + "learning_rate": 2.854848665330776e-05, + "loss": 0.11, + "step": 28548 + }, + { + "epoch": 0.5092034388042664, + "grad_norm": 0.2453691065311432, + "learning_rate": 2.8546945897954758e-05, + "loss": 0.075, + "step": 28549 + }, + { + "epoch": 0.5092212749259801, + "grad_norm": 0.48145225644111633, + "learning_rate": 2.8545405128852566e-05, + "loss": 0.1648, + "step": 28550 + }, + { + "epoch": 0.5092391110476938, + "grad_norm": 0.3121007978916168, + "learning_rate": 2.854386434600716e-05, + "loss": 0.135, + "step": 28551 + }, + { + "epoch": 0.5092569471694075, + "grad_norm": 0.23640026152133942, + "learning_rate": 2.854232354942451e-05, + "loss": 0.1614, + "step": 28552 + }, + { + "epoch": 0.5092747832911212, + "grad_norm": 0.29564833641052246, + "learning_rate": 2.8540782739110593e-05, + "loss": 0.1972, + "step": 28553 + }, + { + "epoch": 0.5092926194128349, + "grad_norm": 0.2820846140384674, + "learning_rate": 2.853924191507138e-05, + "loss": 0.1741, + "step": 28554 + }, + { + "epoch": 0.5093104555345486, + "grad_norm": 0.20693929493427277, + "learning_rate": 2.853770107731284e-05, + "loss": 0.1018, + "step": 28555 + }, + { + "epoch": 0.5093282916562623, + "grad_norm": 0.20320770144462585, + "learning_rate": 2.8536160225840946e-05, + "loss": 0.1558, + "step": 28556 + }, + { + "epoch": 0.5093461277779759, + "grad_norm": 0.2440221905708313, + "learning_rate": 2.8534619360661674e-05, + "loss": 0.1045, + "step": 28557 + }, + { + "epoch": 0.5093639638996896, + "grad_norm": 0.3163812756538391, + "learning_rate": 2.8533078481781007e-05, + "loss": 0.1478, + "step": 28558 + }, + { + "epoch": 0.5093818000214033, + "grad_norm": 0.2869386076927185, + "learning_rate": 2.8531537589204904e-05, + "loss": 0.1386, + "step": 28559 + }, + { + "epoch": 0.509399636143117, + "grad_norm": 0.22747382521629333, + "learning_rate": 2.8529996682939337e-05, + "loss": 0.1514, + "step": 28560 + }, + { + "epoch": 0.5094174722648307, + "grad_norm": 0.21826334297657013, + "learning_rate": 2.8528455762990287e-05, + "loss": 0.1249, + "step": 28561 + }, + { + "epoch": 0.5094353083865444, + "grad_norm": 0.258172869682312, + "learning_rate": 2.8526914829363716e-05, + "loss": 0.1497, + "step": 28562 + }, + { + "epoch": 0.5094531445082581, + "grad_norm": 0.361729234457016, + "learning_rate": 2.852537388206561e-05, + "loss": 0.1704, + "step": 28563 + }, + { + "epoch": 0.5094709806299719, + "grad_norm": 0.24234500527381897, + "learning_rate": 2.8523832921101933e-05, + "loss": 0.1657, + "step": 28564 + }, + { + "epoch": 0.5094888167516856, + "grad_norm": 0.23297803103923798, + "learning_rate": 2.8522291946478673e-05, + "loss": 0.1299, + "step": 28565 + }, + { + "epoch": 0.5095066528733992, + "grad_norm": 0.2698739171028137, + "learning_rate": 2.852075095820178e-05, + "loss": 0.1123, + "step": 28566 + }, + { + "epoch": 0.5095244889951129, + "grad_norm": 0.26719093322753906, + "learning_rate": 2.8519209956277254e-05, + "loss": 0.1588, + "step": 28567 + }, + { + "epoch": 0.5095423251168266, + "grad_norm": 0.27653738856315613, + "learning_rate": 2.8517668940711046e-05, + "loss": 0.0995, + "step": 28568 + }, + { + "epoch": 0.5095601612385403, + "grad_norm": 0.21178923547267914, + "learning_rate": 2.8516127911509143e-05, + "loss": 0.127, + "step": 28569 + }, + { + "epoch": 0.509577997360254, + "grad_norm": 0.22715769708156586, + "learning_rate": 2.8514586868677507e-05, + "loss": 0.0907, + "step": 28570 + }, + { + "epoch": 0.5095958334819677, + "grad_norm": 0.3355378806591034, + "learning_rate": 2.8513045812222122e-05, + "loss": 0.1417, + "step": 28571 + }, + { + "epoch": 0.5096136696036814, + "grad_norm": 0.3025701642036438, + "learning_rate": 2.8511504742148958e-05, + "loss": 0.159, + "step": 28572 + }, + { + "epoch": 0.5096315057253951, + "grad_norm": 0.2599545121192932, + "learning_rate": 2.850996365846399e-05, + "loss": 0.1723, + "step": 28573 + }, + { + "epoch": 0.5096493418471087, + "grad_norm": 0.26544317603111267, + "learning_rate": 2.850842256117319e-05, + "loss": 0.1622, + "step": 28574 + }, + { + "epoch": 0.5096671779688224, + "grad_norm": 0.21109437942504883, + "learning_rate": 2.850688145028253e-05, + "loss": 0.1271, + "step": 28575 + }, + { + "epoch": 0.5096850140905361, + "grad_norm": 0.260470986366272, + "learning_rate": 2.850534032579799e-05, + "loss": 0.1763, + "step": 28576 + }, + { + "epoch": 0.5097028502122498, + "grad_norm": 0.2698030471801758, + "learning_rate": 2.8503799187725534e-05, + "loss": 0.1021, + "step": 28577 + }, + { + "epoch": 0.5097206863339635, + "grad_norm": 0.4732128381729126, + "learning_rate": 2.850225803607115e-05, + "loss": 0.1681, + "step": 28578 + }, + { + "epoch": 0.5097385224556772, + "grad_norm": 0.22712358832359314, + "learning_rate": 2.85007168708408e-05, + "loss": 0.1406, + "step": 28579 + }, + { + "epoch": 0.5097563585773909, + "grad_norm": 0.19389115273952484, + "learning_rate": 2.8499175692040465e-05, + "loss": 0.1336, + "step": 28580 + }, + { + "epoch": 0.5097741946991047, + "grad_norm": 0.28978896141052246, + "learning_rate": 2.8497634499676112e-05, + "loss": 0.1558, + "step": 28581 + }, + { + "epoch": 0.5097920308208184, + "grad_norm": 0.21501219272613525, + "learning_rate": 2.8496093293753727e-05, + "loss": 0.1477, + "step": 28582 + }, + { + "epoch": 0.509809866942532, + "grad_norm": 0.25573790073394775, + "learning_rate": 2.849455207427927e-05, + "loss": 0.1353, + "step": 28583 + }, + { + "epoch": 0.5098277030642457, + "grad_norm": 0.21876104176044464, + "learning_rate": 2.849301084125872e-05, + "loss": 0.1711, + "step": 28584 + }, + { + "epoch": 0.5098455391859594, + "grad_norm": 0.34798502922058105, + "learning_rate": 2.8491469594698063e-05, + "loss": 0.17, + "step": 28585 + }, + { + "epoch": 0.5098633753076731, + "grad_norm": 0.22214873135089874, + "learning_rate": 2.8489928334603255e-05, + "loss": 0.1234, + "step": 28586 + }, + { + "epoch": 0.5098812114293868, + "grad_norm": 0.2880497872829437, + "learning_rate": 2.8488387060980288e-05, + "loss": 0.1117, + "step": 28587 + }, + { + "epoch": 0.5098990475511005, + "grad_norm": 0.2621387243270874, + "learning_rate": 2.848684577383512e-05, + "loss": 0.1633, + "step": 28588 + }, + { + "epoch": 0.5099168836728142, + "grad_norm": 0.22223816812038422, + "learning_rate": 2.848530447317374e-05, + "loss": 0.1015, + "step": 28589 + }, + { + "epoch": 0.5099347197945279, + "grad_norm": 0.3039596974849701, + "learning_rate": 2.8483763159002113e-05, + "loss": 0.096, + "step": 28590 + }, + { + "epoch": 0.5099525559162416, + "grad_norm": 0.25319209694862366, + "learning_rate": 2.8482221831326213e-05, + "loss": 0.1389, + "step": 28591 + }, + { + "epoch": 0.5099703920379552, + "grad_norm": 0.25262385606765747, + "learning_rate": 2.848068049015202e-05, + "loss": 0.1647, + "step": 28592 + }, + { + "epoch": 0.5099882281596689, + "grad_norm": 0.19870199263095856, + "learning_rate": 2.847913913548551e-05, + "loss": 0.101, + "step": 28593 + }, + { + "epoch": 0.5100060642813826, + "grad_norm": 0.27417927980422974, + "learning_rate": 2.8477597767332654e-05, + "loss": 0.1921, + "step": 28594 + }, + { + "epoch": 0.5100239004030963, + "grad_norm": 0.4326969087123871, + "learning_rate": 2.847605638569943e-05, + "loss": 0.1727, + "step": 28595 + }, + { + "epoch": 0.51004173652481, + "grad_norm": 0.2687114477157593, + "learning_rate": 2.8474514990591806e-05, + "loss": 0.2068, + "step": 28596 + }, + { + "epoch": 0.5100595726465237, + "grad_norm": 0.29581889510154724, + "learning_rate": 2.8472973582015772e-05, + "loss": 0.1286, + "step": 28597 + }, + { + "epoch": 0.5100774087682375, + "grad_norm": 0.19792883098125458, + "learning_rate": 2.847143215997728e-05, + "loss": 0.1387, + "step": 28598 + }, + { + "epoch": 0.5100952448899512, + "grad_norm": 0.2532748579978943, + "learning_rate": 2.8469890724482322e-05, + "loss": 0.1024, + "step": 28599 + }, + { + "epoch": 0.5101130810116649, + "grad_norm": 0.2700733542442322, + "learning_rate": 2.8468349275536877e-05, + "loss": 0.146, + "step": 28600 + }, + { + "epoch": 0.5101309171333785, + "grad_norm": 0.26082083582878113, + "learning_rate": 2.8466807813146902e-05, + "loss": 0.164, + "step": 28601 + }, + { + "epoch": 0.5101487532550922, + "grad_norm": 0.26076796650886536, + "learning_rate": 2.8465266337318386e-05, + "loss": 0.1446, + "step": 28602 + }, + { + "epoch": 0.5101665893768059, + "grad_norm": 0.2745343744754791, + "learning_rate": 2.84637248480573e-05, + "loss": 0.1486, + "step": 28603 + }, + { + "epoch": 0.5101844254985196, + "grad_norm": 0.21207405626773834, + "learning_rate": 2.846218334536962e-05, + "loss": 0.1316, + "step": 28604 + }, + { + "epoch": 0.5102022616202333, + "grad_norm": 0.3733825385570526, + "learning_rate": 2.846064182926132e-05, + "loss": 0.1324, + "step": 28605 + }, + { + "epoch": 0.510220097741947, + "grad_norm": 0.195739284157753, + "learning_rate": 2.8459100299738384e-05, + "loss": 0.1162, + "step": 28606 + }, + { + "epoch": 0.5102379338636607, + "grad_norm": 0.24634745717048645, + "learning_rate": 2.8457558756806773e-05, + "loss": 0.1546, + "step": 28607 + }, + { + "epoch": 0.5102557699853744, + "grad_norm": 0.31158387660980225, + "learning_rate": 2.8456017200472478e-05, + "loss": 0.1332, + "step": 28608 + }, + { + "epoch": 0.510273606107088, + "grad_norm": 0.3077595829963684, + "learning_rate": 2.8454475630741463e-05, + "loss": 0.1154, + "step": 28609 + }, + { + "epoch": 0.5102914422288017, + "grad_norm": 0.27323028445243835, + "learning_rate": 2.8452934047619707e-05, + "loss": 0.1267, + "step": 28610 + }, + { + "epoch": 0.5103092783505154, + "grad_norm": 0.34198784828186035, + "learning_rate": 2.8451392451113185e-05, + "loss": 0.191, + "step": 28611 + }, + { + "epoch": 0.5103271144722291, + "grad_norm": 0.2741488218307495, + "learning_rate": 2.844985084122787e-05, + "loss": 0.1378, + "step": 28612 + }, + { + "epoch": 0.5103449505939428, + "grad_norm": 0.2992853820323944, + "learning_rate": 2.8448309217969748e-05, + "loss": 0.1613, + "step": 28613 + }, + { + "epoch": 0.5103627867156566, + "grad_norm": 0.21999169886112213, + "learning_rate": 2.8446767581344787e-05, + "loss": 0.1076, + "step": 28614 + }, + { + "epoch": 0.5103806228373703, + "grad_norm": 0.20979440212249756, + "learning_rate": 2.8445225931358965e-05, + "loss": 0.1378, + "step": 28615 + }, + { + "epoch": 0.510398458959084, + "grad_norm": 0.18849465250968933, + "learning_rate": 2.8443684268018256e-05, + "loss": 0.1507, + "step": 28616 + }, + { + "epoch": 0.5104162950807977, + "grad_norm": 0.32718199491500854, + "learning_rate": 2.844214259132864e-05, + "loss": 0.1533, + "step": 28617 + }, + { + "epoch": 0.5104341312025114, + "grad_norm": 0.1834608018398285, + "learning_rate": 2.8440600901296087e-05, + "loss": 0.1264, + "step": 28618 + }, + { + "epoch": 0.510451967324225, + "grad_norm": 0.24632766842842102, + "learning_rate": 2.8439059197926576e-05, + "loss": 0.1537, + "step": 28619 + }, + { + "epoch": 0.5104698034459387, + "grad_norm": 0.38008156418800354, + "learning_rate": 2.8437517481226085e-05, + "loss": 0.174, + "step": 28620 + }, + { + "epoch": 0.5104876395676524, + "grad_norm": 0.24725519120693207, + "learning_rate": 2.843597575120059e-05, + "loss": 0.1926, + "step": 28621 + }, + { + "epoch": 0.5105054756893661, + "grad_norm": 0.3012906610965729, + "learning_rate": 2.8434434007856064e-05, + "loss": 0.1471, + "step": 28622 + }, + { + "epoch": 0.5105233118110798, + "grad_norm": 0.22625456750392914, + "learning_rate": 2.843289225119849e-05, + "loss": 0.1292, + "step": 28623 + }, + { + "epoch": 0.5105411479327935, + "grad_norm": 0.2392953783273697, + "learning_rate": 2.8431350481233837e-05, + "loss": 0.1287, + "step": 28624 + }, + { + "epoch": 0.5105589840545072, + "grad_norm": 0.298268586397171, + "learning_rate": 2.842980869796808e-05, + "loss": 0.1281, + "step": 28625 + }, + { + "epoch": 0.5105768201762209, + "grad_norm": 0.49562084674835205, + "learning_rate": 2.8428266901407208e-05, + "loss": 0.158, + "step": 28626 + }, + { + "epoch": 0.5105946562979345, + "grad_norm": 0.2313910722732544, + "learning_rate": 2.8426725091557182e-05, + "loss": 0.1679, + "step": 28627 + }, + { + "epoch": 0.5106124924196482, + "grad_norm": 0.2302023321390152, + "learning_rate": 2.8425183268423993e-05, + "loss": 0.1354, + "step": 28628 + }, + { + "epoch": 0.5106303285413619, + "grad_norm": 0.28523245453834534, + "learning_rate": 2.8423641432013602e-05, + "loss": 0.1808, + "step": 28629 + }, + { + "epoch": 0.5106481646630756, + "grad_norm": 0.3322940766811371, + "learning_rate": 2.8422099582332008e-05, + "loss": 0.1447, + "step": 28630 + }, + { + "epoch": 0.5106660007847894, + "grad_norm": 0.3432726263999939, + "learning_rate": 2.842055771938516e-05, + "loss": 0.2508, + "step": 28631 + }, + { + "epoch": 0.5106838369065031, + "grad_norm": 0.2665051519870758, + "learning_rate": 2.8419015843179054e-05, + "loss": 0.1281, + "step": 28632 + }, + { + "epoch": 0.5107016730282168, + "grad_norm": 0.2627442479133606, + "learning_rate": 2.841747395371966e-05, + "loss": 0.1217, + "step": 28633 + }, + { + "epoch": 0.5107195091499305, + "grad_norm": 0.29333555698394775, + "learning_rate": 2.841593205101296e-05, + "loss": 0.1636, + "step": 28634 + }, + { + "epoch": 0.5107373452716442, + "grad_norm": 0.2493095099925995, + "learning_rate": 2.8414390135064922e-05, + "loss": 0.1143, + "step": 28635 + }, + { + "epoch": 0.5107551813933578, + "grad_norm": 0.3318060636520386, + "learning_rate": 2.8412848205881537e-05, + "loss": 0.2112, + "step": 28636 + }, + { + "epoch": 0.5107730175150715, + "grad_norm": 0.27241435647010803, + "learning_rate": 2.8411306263468775e-05, + "loss": 0.113, + "step": 28637 + }, + { + "epoch": 0.5107908536367852, + "grad_norm": 0.39097660779953003, + "learning_rate": 2.8409764307832605e-05, + "loss": 0.1383, + "step": 28638 + }, + { + "epoch": 0.5108086897584989, + "grad_norm": 0.290276437997818, + "learning_rate": 2.8408222338979008e-05, + "loss": 0.1646, + "step": 28639 + }, + { + "epoch": 0.5108265258802126, + "grad_norm": 0.23668363690376282, + "learning_rate": 2.8406680356913967e-05, + "loss": 0.1713, + "step": 28640 + }, + { + "epoch": 0.5108443620019263, + "grad_norm": 0.23128637671470642, + "learning_rate": 2.840513836164346e-05, + "loss": 0.1455, + "step": 28641 + }, + { + "epoch": 0.51086219812364, + "grad_norm": 0.23550868034362793, + "learning_rate": 2.8403596353173456e-05, + "loss": 0.1782, + "step": 28642 + }, + { + "epoch": 0.5108800342453537, + "grad_norm": 0.17445312440395355, + "learning_rate": 2.8402054331509943e-05, + "loss": 0.1261, + "step": 28643 + }, + { + "epoch": 0.5108978703670674, + "grad_norm": 0.20496228337287903, + "learning_rate": 2.8400512296658887e-05, + "loss": 0.1285, + "step": 28644 + }, + { + "epoch": 0.510915706488781, + "grad_norm": 0.3230728507041931, + "learning_rate": 2.8398970248626272e-05, + "loss": 0.1989, + "step": 28645 + }, + { + "epoch": 0.5109335426104947, + "grad_norm": 0.26608991622924805, + "learning_rate": 2.8397428187418074e-05, + "loss": 0.1009, + "step": 28646 + }, + { + "epoch": 0.5109513787322084, + "grad_norm": 0.26584088802337646, + "learning_rate": 2.8395886113040272e-05, + "loss": 0.143, + "step": 28647 + }, + { + "epoch": 0.5109692148539222, + "grad_norm": 0.30108442902565, + "learning_rate": 2.8394344025498836e-05, + "loss": 0.1051, + "step": 28648 + }, + { + "epoch": 0.5109870509756359, + "grad_norm": 0.3000594675540924, + "learning_rate": 2.839280192479976e-05, + "loss": 0.1581, + "step": 28649 + }, + { + "epoch": 0.5110048870973496, + "grad_norm": 0.2545219957828522, + "learning_rate": 2.839125981094901e-05, + "loss": 0.1622, + "step": 28650 + }, + { + "epoch": 0.5110227232190633, + "grad_norm": 0.23320001363754272, + "learning_rate": 2.8389717683952566e-05, + "loss": 0.1001, + "step": 28651 + }, + { + "epoch": 0.511040559340777, + "grad_norm": 0.2879813611507416, + "learning_rate": 2.8388175543816405e-05, + "loss": 0.087, + "step": 28652 + }, + { + "epoch": 0.5110583954624907, + "grad_norm": 0.257871150970459, + "learning_rate": 2.8386633390546493e-05, + "loss": 0.1668, + "step": 28653 + }, + { + "epoch": 0.5110762315842043, + "grad_norm": 0.28548383712768555, + "learning_rate": 2.8385091224148834e-05, + "loss": 0.1172, + "step": 28654 + }, + { + "epoch": 0.511094067705918, + "grad_norm": 0.30526936054229736, + "learning_rate": 2.8383549044629388e-05, + "loss": 0.1627, + "step": 28655 + }, + { + "epoch": 0.5111119038276317, + "grad_norm": 0.2569371163845062, + "learning_rate": 2.838200685199414e-05, + "loss": 0.1542, + "step": 28656 + }, + { + "epoch": 0.5111297399493454, + "grad_norm": 0.24707630276679993, + "learning_rate": 2.838046464624906e-05, + "loss": 0.1723, + "step": 28657 + }, + { + "epoch": 0.5111475760710591, + "grad_norm": 0.23386873304843903, + "learning_rate": 2.837892242740014e-05, + "loss": 0.1189, + "step": 28658 + }, + { + "epoch": 0.5111654121927728, + "grad_norm": 0.23468729853630066, + "learning_rate": 2.8377380195453346e-05, + "loss": 0.1238, + "step": 28659 + }, + { + "epoch": 0.5111832483144865, + "grad_norm": 0.21325302124023438, + "learning_rate": 2.837583795041465e-05, + "loss": 0.1123, + "step": 28660 + }, + { + "epoch": 0.5112010844362002, + "grad_norm": 0.21917521953582764, + "learning_rate": 2.8374295692290053e-05, + "loss": 0.1037, + "step": 28661 + }, + { + "epoch": 0.5112189205579138, + "grad_norm": 0.2470393180847168, + "learning_rate": 2.8372753421085518e-05, + "loss": 0.1534, + "step": 28662 + }, + { + "epoch": 0.5112367566796275, + "grad_norm": 0.2687867283821106, + "learning_rate": 2.8371211136807026e-05, + "loss": 0.123, + "step": 28663 + }, + { + "epoch": 0.5112545928013412, + "grad_norm": 0.3077491819858551, + "learning_rate": 2.836966883946055e-05, + "loss": 0.1182, + "step": 28664 + }, + { + "epoch": 0.511272428923055, + "grad_norm": 0.1964159458875656, + "learning_rate": 2.836812652905208e-05, + "loss": 0.1368, + "step": 28665 + }, + { + "epoch": 0.5112902650447687, + "grad_norm": 0.2795601487159729, + "learning_rate": 2.8366584205587588e-05, + "loss": 0.132, + "step": 28666 + }, + { + "epoch": 0.5113081011664824, + "grad_norm": 0.22139166295528412, + "learning_rate": 2.8365041869073045e-05, + "loss": 0.1265, + "step": 28667 + }, + { + "epoch": 0.5113259372881961, + "grad_norm": 0.30906009674072266, + "learning_rate": 2.8363499519514442e-05, + "loss": 0.0853, + "step": 28668 + }, + { + "epoch": 0.5113437734099098, + "grad_norm": 0.2961386442184448, + "learning_rate": 2.8361957156917756e-05, + "loss": 0.1125, + "step": 28669 + }, + { + "epoch": 0.5113616095316235, + "grad_norm": 0.27009424567222595, + "learning_rate": 2.836041478128896e-05, + "loss": 0.1513, + "step": 28670 + }, + { + "epoch": 0.5113794456533372, + "grad_norm": 0.26191455125808716, + "learning_rate": 2.835887239263404e-05, + "loss": 0.1369, + "step": 28671 + }, + { + "epoch": 0.5113972817750508, + "grad_norm": 0.2946007251739502, + "learning_rate": 2.8357329990958963e-05, + "loss": 0.1748, + "step": 28672 + }, + { + "epoch": 0.5114151178967645, + "grad_norm": 0.2295503169298172, + "learning_rate": 2.8355787576269723e-05, + "loss": 0.1266, + "step": 28673 + }, + { + "epoch": 0.5114329540184782, + "grad_norm": 0.4403449296951294, + "learning_rate": 2.8354245148572283e-05, + "loss": 0.1942, + "step": 28674 + }, + { + "epoch": 0.5114507901401919, + "grad_norm": 0.17378412187099457, + "learning_rate": 2.835270270787263e-05, + "loss": 0.095, + "step": 28675 + }, + { + "epoch": 0.5114686262619056, + "grad_norm": 0.29376813769340515, + "learning_rate": 2.8351160254176757e-05, + "loss": 0.1833, + "step": 28676 + }, + { + "epoch": 0.5114864623836193, + "grad_norm": 0.28688934445381165, + "learning_rate": 2.834961778749061e-05, + "loss": 0.1553, + "step": 28677 + }, + { + "epoch": 0.511504298505333, + "grad_norm": 0.2363552302122116, + "learning_rate": 2.8348075307820205e-05, + "loss": 0.1763, + "step": 28678 + }, + { + "epoch": 0.5115221346270467, + "grad_norm": 0.29156407713890076, + "learning_rate": 2.8346532815171496e-05, + "loss": 0.1538, + "step": 28679 + }, + { + "epoch": 0.5115399707487603, + "grad_norm": 0.19915670156478882, + "learning_rate": 2.8344990309550467e-05, + "loss": 0.1316, + "step": 28680 + }, + { + "epoch": 0.511557806870474, + "grad_norm": 0.2654586434364319, + "learning_rate": 2.8343447790963102e-05, + "loss": 0.1589, + "step": 28681 + }, + { + "epoch": 0.5115756429921878, + "grad_norm": 0.283134788274765, + "learning_rate": 2.8341905259415384e-05, + "loss": 0.1719, + "step": 28682 + }, + { + "epoch": 0.5115934791139015, + "grad_norm": 0.2221931517124176, + "learning_rate": 2.8340362714913283e-05, + "loss": 0.1279, + "step": 28683 + }, + { + "epoch": 0.5116113152356152, + "grad_norm": 0.23733840882778168, + "learning_rate": 2.8338820157462786e-05, + "loss": 0.1268, + "step": 28684 + }, + { + "epoch": 0.5116291513573289, + "grad_norm": 0.23013168573379517, + "learning_rate": 2.833727758706986e-05, + "loss": 0.095, + "step": 28685 + }, + { + "epoch": 0.5116469874790426, + "grad_norm": 0.24538858234882355, + "learning_rate": 2.8335735003740504e-05, + "loss": 0.1708, + "step": 28686 + }, + { + "epoch": 0.5116648236007563, + "grad_norm": 0.27597978711128235, + "learning_rate": 2.833419240748068e-05, + "loss": 0.1391, + "step": 28687 + }, + { + "epoch": 0.51168265972247, + "grad_norm": 0.2346007227897644, + "learning_rate": 2.8332649798296373e-05, + "loss": 0.1523, + "step": 28688 + }, + { + "epoch": 0.5117004958441836, + "grad_norm": 0.19158942997455597, + "learning_rate": 2.8331107176193573e-05, + "loss": 0.1112, + "step": 28689 + }, + { + "epoch": 0.5117183319658973, + "grad_norm": 0.24004830420017242, + "learning_rate": 2.8329564541178243e-05, + "loss": 0.1208, + "step": 28690 + }, + { + "epoch": 0.511736168087611, + "grad_norm": 0.24585741758346558, + "learning_rate": 2.832802189325638e-05, + "loss": 0.1404, + "step": 28691 + }, + { + "epoch": 0.5117540042093247, + "grad_norm": 0.2926566004753113, + "learning_rate": 2.8326479232433945e-05, + "loss": 0.1064, + "step": 28692 + }, + { + "epoch": 0.5117718403310384, + "grad_norm": 0.25121474266052246, + "learning_rate": 2.8324936558716934e-05, + "loss": 0.1604, + "step": 28693 + }, + { + "epoch": 0.5117896764527521, + "grad_norm": 0.3177691102027893, + "learning_rate": 2.8323393872111314e-05, + "loss": 0.1291, + "step": 28694 + }, + { + "epoch": 0.5118075125744658, + "grad_norm": 0.2946058213710785, + "learning_rate": 2.8321851172623075e-05, + "loss": 0.1448, + "step": 28695 + }, + { + "epoch": 0.5118253486961795, + "grad_norm": 0.28573042154312134, + "learning_rate": 2.8320308460258194e-05, + "loss": 0.1375, + "step": 28696 + }, + { + "epoch": 0.5118431848178931, + "grad_norm": 0.3390047550201416, + "learning_rate": 2.831876573502265e-05, + "loss": 0.127, + "step": 28697 + }, + { + "epoch": 0.5118610209396068, + "grad_norm": 0.1729322373867035, + "learning_rate": 2.831722299692242e-05, + "loss": 0.0932, + "step": 28698 + }, + { + "epoch": 0.5118788570613206, + "grad_norm": 0.2633526921272278, + "learning_rate": 2.83156802459635e-05, + "loss": 0.1593, + "step": 28699 + }, + { + "epoch": 0.5118966931830343, + "grad_norm": 0.3222138285636902, + "learning_rate": 2.8314137482151847e-05, + "loss": 0.1141, + "step": 28700 + }, + { + "epoch": 0.511914529304748, + "grad_norm": 0.19847998023033142, + "learning_rate": 2.8312594705493455e-05, + "loss": 0.1185, + "step": 28701 + }, + { + "epoch": 0.5119323654264617, + "grad_norm": 0.23680081963539124, + "learning_rate": 2.83110519159943e-05, + "loss": 0.1294, + "step": 28702 + }, + { + "epoch": 0.5119502015481754, + "grad_norm": 0.26284313201904297, + "learning_rate": 2.8309509113660365e-05, + "loss": 0.1454, + "step": 28703 + }, + { + "epoch": 0.5119680376698891, + "grad_norm": 0.2582609951496124, + "learning_rate": 2.8307966298497634e-05, + "loss": 0.1414, + "step": 28704 + }, + { + "epoch": 0.5119858737916028, + "grad_norm": 0.2555851936340332, + "learning_rate": 2.8306423470512078e-05, + "loss": 0.1085, + "step": 28705 + }, + { + "epoch": 0.5120037099133165, + "grad_norm": 0.2710530459880829, + "learning_rate": 2.830488062970969e-05, + "loss": 0.1287, + "step": 28706 + }, + { + "epoch": 0.5120215460350301, + "grad_norm": 0.24250569939613342, + "learning_rate": 2.830333777609644e-05, + "loss": 0.1305, + "step": 28707 + }, + { + "epoch": 0.5120393821567438, + "grad_norm": 0.373189240694046, + "learning_rate": 2.8301794909678302e-05, + "loss": 0.0808, + "step": 28708 + }, + { + "epoch": 0.5120572182784575, + "grad_norm": 0.29538479447364807, + "learning_rate": 2.8300252030461273e-05, + "loss": 0.1275, + "step": 28709 + }, + { + "epoch": 0.5120750544001712, + "grad_norm": 0.25075945258140564, + "learning_rate": 2.8298709138451335e-05, + "loss": 0.1134, + "step": 28710 + }, + { + "epoch": 0.5120928905218849, + "grad_norm": 0.44663530588150024, + "learning_rate": 2.829716623365445e-05, + "loss": 0.1212, + "step": 28711 + }, + { + "epoch": 0.5121107266435986, + "grad_norm": 0.3136363923549652, + "learning_rate": 2.829562331607662e-05, + "loss": 0.2078, + "step": 28712 + }, + { + "epoch": 0.5121285627653123, + "grad_norm": 0.3370586931705475, + "learning_rate": 2.8294080385723808e-05, + "loss": 0.1434, + "step": 28713 + }, + { + "epoch": 0.512146398887026, + "grad_norm": 0.24719923734664917, + "learning_rate": 2.8292537442602008e-05, + "loss": 0.1783, + "step": 28714 + }, + { + "epoch": 0.5121642350087398, + "grad_norm": 0.17714382708072662, + "learning_rate": 2.8290994486717194e-05, + "loss": 0.1159, + "step": 28715 + }, + { + "epoch": 0.5121820711304534, + "grad_norm": 0.27581316232681274, + "learning_rate": 2.828945151807534e-05, + "loss": 0.1695, + "step": 28716 + }, + { + "epoch": 0.5121999072521671, + "grad_norm": 0.3174869120121002, + "learning_rate": 2.828790853668245e-05, + "loss": 0.2087, + "step": 28717 + }, + { + "epoch": 0.5122177433738808, + "grad_norm": 0.22335317730903625, + "learning_rate": 2.828636554254448e-05, + "loss": 0.139, + "step": 28718 + }, + { + "epoch": 0.5122355794955945, + "grad_norm": 0.5008512735366821, + "learning_rate": 2.828482253566743e-05, + "loss": 0.2166, + "step": 28719 + }, + { + "epoch": 0.5122534156173082, + "grad_norm": 0.22603140771389008, + "learning_rate": 2.8283279516057266e-05, + "loss": 0.165, + "step": 28720 + }, + { + "epoch": 0.5122712517390219, + "grad_norm": 0.22612769901752472, + "learning_rate": 2.8281736483719983e-05, + "loss": 0.1707, + "step": 28721 + }, + { + "epoch": 0.5122890878607356, + "grad_norm": 0.3717956840991974, + "learning_rate": 2.828019343866155e-05, + "loss": 0.1535, + "step": 28722 + }, + { + "epoch": 0.5123069239824493, + "grad_norm": 0.27614501118659973, + "learning_rate": 2.8278650380887956e-05, + "loss": 0.1075, + "step": 28723 + }, + { + "epoch": 0.512324760104163, + "grad_norm": 0.22543823719024658, + "learning_rate": 2.827710731040518e-05, + "loss": 0.1443, + "step": 28724 + }, + { + "epoch": 0.5123425962258766, + "grad_norm": 0.3627166748046875, + "learning_rate": 2.827556422721921e-05, + "loss": 0.0981, + "step": 28725 + }, + { + "epoch": 0.5123604323475903, + "grad_norm": 0.277536541223526, + "learning_rate": 2.8274021131336014e-05, + "loss": 0.1323, + "step": 28726 + }, + { + "epoch": 0.512378268469304, + "grad_norm": 0.3194122314453125, + "learning_rate": 2.827247802276159e-05, + "loss": 0.1264, + "step": 28727 + }, + { + "epoch": 0.5123961045910177, + "grad_norm": 0.30672580003738403, + "learning_rate": 2.8270934901501906e-05, + "loss": 0.1662, + "step": 28728 + }, + { + "epoch": 0.5124139407127314, + "grad_norm": 0.49692416191101074, + "learning_rate": 2.8269391767562947e-05, + "loss": 0.1486, + "step": 28729 + }, + { + "epoch": 0.5124317768344451, + "grad_norm": 0.2641540467739105, + "learning_rate": 2.8267848620950697e-05, + "loss": 0.1231, + "step": 28730 + }, + { + "epoch": 0.5124496129561588, + "grad_norm": 0.20069244503974915, + "learning_rate": 2.8266305461671138e-05, + "loss": 0.1016, + "step": 28731 + }, + { + "epoch": 0.5124674490778726, + "grad_norm": 0.30080023407936096, + "learning_rate": 2.8264762289730252e-05, + "loss": 0.1103, + "step": 28732 + }, + { + "epoch": 0.5124852851995862, + "grad_norm": 0.275672048330307, + "learning_rate": 2.826321910513402e-05, + "loss": 0.1384, + "step": 28733 + }, + { + "epoch": 0.5125031213212999, + "grad_norm": 0.17145149409770966, + "learning_rate": 2.8261675907888425e-05, + "loss": 0.0755, + "step": 28734 + }, + { + "epoch": 0.5125209574430136, + "grad_norm": 0.3689696788787842, + "learning_rate": 2.8260132697999443e-05, + "loss": 0.155, + "step": 28735 + }, + { + "epoch": 0.5125387935647273, + "grad_norm": 0.24058884382247925, + "learning_rate": 2.825858947547306e-05, + "loss": 0.1661, + "step": 28736 + }, + { + "epoch": 0.512556629686441, + "grad_norm": 0.3380197584629059, + "learning_rate": 2.825704624031526e-05, + "loss": 0.1562, + "step": 28737 + }, + { + "epoch": 0.5125744658081547, + "grad_norm": 0.23685264587402344, + "learning_rate": 2.8255502992532028e-05, + "loss": 0.086, + "step": 28738 + }, + { + "epoch": 0.5125923019298684, + "grad_norm": 0.28849929571151733, + "learning_rate": 2.825395973212934e-05, + "loss": 0.1409, + "step": 28739 + }, + { + "epoch": 0.5126101380515821, + "grad_norm": 0.2545168995857239, + "learning_rate": 2.825241645911318e-05, + "loss": 0.1142, + "step": 28740 + }, + { + "epoch": 0.5126279741732958, + "grad_norm": 0.20858325064182281, + "learning_rate": 2.825087317348953e-05, + "loss": 0.1255, + "step": 28741 + }, + { + "epoch": 0.5126458102950094, + "grad_norm": 0.25887784361839294, + "learning_rate": 2.8249329875264376e-05, + "loss": 0.1615, + "step": 28742 + }, + { + "epoch": 0.5126636464167231, + "grad_norm": 0.2657526135444641, + "learning_rate": 2.824778656444369e-05, + "loss": 0.1299, + "step": 28743 + }, + { + "epoch": 0.5126814825384368, + "grad_norm": 0.32634103298187256, + "learning_rate": 2.824624324103346e-05, + "loss": 0.1199, + "step": 28744 + }, + { + "epoch": 0.5126993186601505, + "grad_norm": 0.28554731607437134, + "learning_rate": 2.824469990503968e-05, + "loss": 0.1511, + "step": 28745 + }, + { + "epoch": 0.5127171547818642, + "grad_norm": 0.2940383851528168, + "learning_rate": 2.8243156556468314e-05, + "loss": 0.1381, + "step": 28746 + }, + { + "epoch": 0.5127349909035779, + "grad_norm": 0.24180209636688232, + "learning_rate": 2.824161319532536e-05, + "loss": 0.0842, + "step": 28747 + }, + { + "epoch": 0.5127528270252916, + "grad_norm": 0.333478182554245, + "learning_rate": 2.8240069821616788e-05, + "loss": 0.1247, + "step": 28748 + }, + { + "epoch": 0.5127706631470054, + "grad_norm": 0.26317405700683594, + "learning_rate": 2.8238526435348594e-05, + "loss": 0.1552, + "step": 28749 + }, + { + "epoch": 0.5127884992687191, + "grad_norm": 0.25221842527389526, + "learning_rate": 2.8236983036526742e-05, + "loss": 0.1411, + "step": 28750 + }, + { + "epoch": 0.5128063353904327, + "grad_norm": 0.3700776994228363, + "learning_rate": 2.8235439625157235e-05, + "loss": 0.1272, + "step": 28751 + }, + { + "epoch": 0.5128241715121464, + "grad_norm": 0.21291691064834595, + "learning_rate": 2.8233896201246036e-05, + "loss": 0.1165, + "step": 28752 + }, + { + "epoch": 0.5128420076338601, + "grad_norm": 0.24549849331378937, + "learning_rate": 2.8232352764799153e-05, + "loss": 0.1523, + "step": 28753 + }, + { + "epoch": 0.5128598437555738, + "grad_norm": 0.22554031014442444, + "learning_rate": 2.8230809315822542e-05, + "loss": 0.1307, + "step": 28754 + }, + { + "epoch": 0.5128776798772875, + "grad_norm": 0.3477083146572113, + "learning_rate": 2.8229265854322206e-05, + "loss": 0.1703, + "step": 28755 + }, + { + "epoch": 0.5128955159990012, + "grad_norm": 0.2978888750076294, + "learning_rate": 2.8227722380304116e-05, + "loss": 0.1492, + "step": 28756 + }, + { + "epoch": 0.5129133521207149, + "grad_norm": 0.19415195286273956, + "learning_rate": 2.822617889377426e-05, + "loss": 0.1371, + "step": 28757 + }, + { + "epoch": 0.5129311882424286, + "grad_norm": 0.19340279698371887, + "learning_rate": 2.8224635394738617e-05, + "loss": 0.1135, + "step": 28758 + }, + { + "epoch": 0.5129490243641422, + "grad_norm": 0.21840186417102814, + "learning_rate": 2.8223091883203178e-05, + "loss": 0.1527, + "step": 28759 + }, + { + "epoch": 0.5129668604858559, + "grad_norm": 0.22684365510940552, + "learning_rate": 2.8221548359173922e-05, + "loss": 0.1246, + "step": 28760 + }, + { + "epoch": 0.5129846966075696, + "grad_norm": 0.29964983463287354, + "learning_rate": 2.822000482265683e-05, + "loss": 0.1442, + "step": 28761 + }, + { + "epoch": 0.5130025327292833, + "grad_norm": 0.258208304643631, + "learning_rate": 2.821846127365789e-05, + "loss": 0.1426, + "step": 28762 + }, + { + "epoch": 0.513020368850997, + "grad_norm": 0.24436752498149872, + "learning_rate": 2.8216917712183077e-05, + "loss": 0.1582, + "step": 28763 + }, + { + "epoch": 0.5130382049727107, + "grad_norm": 0.21674521267414093, + "learning_rate": 2.8215374138238382e-05, + "loss": 0.0975, + "step": 28764 + }, + { + "epoch": 0.5130560410944244, + "grad_norm": 0.24736134707927704, + "learning_rate": 2.8213830551829784e-05, + "loss": 0.1507, + "step": 28765 + }, + { + "epoch": 0.5130738772161382, + "grad_norm": 0.3155052661895752, + "learning_rate": 2.8212286952963273e-05, + "loss": 0.0944, + "step": 28766 + }, + { + "epoch": 0.5130917133378519, + "grad_norm": 0.2506276071071625, + "learning_rate": 2.821074334164483e-05, + "loss": 0.1681, + "step": 28767 + }, + { + "epoch": 0.5131095494595656, + "grad_norm": 0.28828078508377075, + "learning_rate": 2.8209199717880435e-05, + "loss": 0.1197, + "step": 28768 + }, + { + "epoch": 0.5131273855812792, + "grad_norm": 0.29900696873664856, + "learning_rate": 2.8207656081676078e-05, + "loss": 0.1812, + "step": 28769 + }, + { + "epoch": 0.5131452217029929, + "grad_norm": 0.39720383286476135, + "learning_rate": 2.820611243303773e-05, + "loss": 0.2044, + "step": 28770 + }, + { + "epoch": 0.5131630578247066, + "grad_norm": 0.19839033484458923, + "learning_rate": 2.8204568771971385e-05, + "loss": 0.1195, + "step": 28771 + }, + { + "epoch": 0.5131808939464203, + "grad_norm": 0.3253510594367981, + "learning_rate": 2.8203025098483022e-05, + "loss": 0.1982, + "step": 28772 + }, + { + "epoch": 0.513198730068134, + "grad_norm": 0.28392675518989563, + "learning_rate": 2.8201481412578634e-05, + "loss": 0.1196, + "step": 28773 + }, + { + "epoch": 0.5132165661898477, + "grad_norm": 0.2275567650794983, + "learning_rate": 2.819993771426419e-05, + "loss": 0.1427, + "step": 28774 + }, + { + "epoch": 0.5132344023115614, + "grad_norm": 0.18309307098388672, + "learning_rate": 2.819839400354569e-05, + "loss": 0.1007, + "step": 28775 + }, + { + "epoch": 0.513252238433275, + "grad_norm": 0.25341901183128357, + "learning_rate": 2.8196850280429103e-05, + "loss": 0.14, + "step": 28776 + }, + { + "epoch": 0.5132700745549887, + "grad_norm": 0.2880527377128601, + "learning_rate": 2.819530654492043e-05, + "loss": 0.1697, + "step": 28777 + }, + { + "epoch": 0.5132879106767024, + "grad_norm": 0.27188563346862793, + "learning_rate": 2.819376279702564e-05, + "loss": 0.1272, + "step": 28778 + }, + { + "epoch": 0.5133057467984161, + "grad_norm": 0.3012101352214813, + "learning_rate": 2.8192219036750716e-05, + "loss": 0.1609, + "step": 28779 + }, + { + "epoch": 0.5133235829201298, + "grad_norm": 0.3392595648765564, + "learning_rate": 2.8190675264101657e-05, + "loss": 0.1441, + "step": 28780 + }, + { + "epoch": 0.5133414190418435, + "grad_norm": 0.25709715485572815, + "learning_rate": 2.8189131479084436e-05, + "loss": 0.1336, + "step": 28781 + }, + { + "epoch": 0.5133592551635572, + "grad_norm": 0.26314452290534973, + "learning_rate": 2.8187587681705047e-05, + "loss": 0.0769, + "step": 28782 + }, + { + "epoch": 0.513377091285271, + "grad_norm": 0.32637593150138855, + "learning_rate": 2.818604387196946e-05, + "loss": 0.1257, + "step": 28783 + }, + { + "epoch": 0.5133949274069847, + "grad_norm": 0.2575835883617401, + "learning_rate": 2.8184500049883662e-05, + "loss": 0.1775, + "step": 28784 + }, + { + "epoch": 0.5134127635286984, + "grad_norm": 0.2551773190498352, + "learning_rate": 2.8182956215453642e-05, + "loss": 0.0962, + "step": 28785 + }, + { + "epoch": 0.513430599650412, + "grad_norm": 0.31901121139526367, + "learning_rate": 2.8181412368685395e-05, + "loss": 0.1381, + "step": 28786 + }, + { + "epoch": 0.5134484357721257, + "grad_norm": 0.3186083137989044, + "learning_rate": 2.8179868509584884e-05, + "loss": 0.1079, + "step": 28787 + }, + { + "epoch": 0.5134662718938394, + "grad_norm": 0.24258184432983398, + "learning_rate": 2.8178324638158115e-05, + "loss": 0.1428, + "step": 28788 + }, + { + "epoch": 0.5134841080155531, + "grad_norm": 0.29395216703414917, + "learning_rate": 2.8176780754411053e-05, + "loss": 0.1395, + "step": 28789 + }, + { + "epoch": 0.5135019441372668, + "grad_norm": 0.29596075415611267, + "learning_rate": 2.8175236858349696e-05, + "loss": 0.178, + "step": 28790 + }, + { + "epoch": 0.5135197802589805, + "grad_norm": 0.33360156416893005, + "learning_rate": 2.8173692949980023e-05, + "loss": 0.2014, + "step": 28791 + }, + { + "epoch": 0.5135376163806942, + "grad_norm": 0.3463611900806427, + "learning_rate": 2.8172149029308016e-05, + "loss": 0.1489, + "step": 28792 + }, + { + "epoch": 0.5135554525024079, + "grad_norm": 0.2758581340312958, + "learning_rate": 2.8170605096339665e-05, + "loss": 0.1172, + "step": 28793 + }, + { + "epoch": 0.5135732886241215, + "grad_norm": 0.2756361663341522, + "learning_rate": 2.816906115108096e-05, + "loss": 0.1612, + "step": 28794 + }, + { + "epoch": 0.5135911247458352, + "grad_norm": 0.2590934634208679, + "learning_rate": 2.8167517193537878e-05, + "loss": 0.1609, + "step": 28795 + }, + { + "epoch": 0.5136089608675489, + "grad_norm": 0.28928935527801514, + "learning_rate": 2.81659732237164e-05, + "loss": 0.159, + "step": 28796 + }, + { + "epoch": 0.5136267969892626, + "grad_norm": 0.28954124450683594, + "learning_rate": 2.8164429241622525e-05, + "loss": 0.1501, + "step": 28797 + }, + { + "epoch": 0.5136446331109763, + "grad_norm": 0.24776677787303925, + "learning_rate": 2.8162885247262222e-05, + "loss": 0.1319, + "step": 28798 + }, + { + "epoch": 0.51366246923269, + "grad_norm": 0.20022942125797272, + "learning_rate": 2.816134124064148e-05, + "loss": 0.1087, + "step": 28799 + }, + { + "epoch": 0.5136803053544038, + "grad_norm": 0.2850240170955658, + "learning_rate": 2.8159797221766293e-05, + "loss": 0.1373, + "step": 28800 + }, + { + "epoch": 0.5136981414761175, + "grad_norm": 0.29663124680519104, + "learning_rate": 2.8158253190642643e-05, + "loss": 0.1874, + "step": 28801 + }, + { + "epoch": 0.5137159775978312, + "grad_norm": 0.30229130387306213, + "learning_rate": 2.8156709147276504e-05, + "loss": 0.1656, + "step": 28802 + }, + { + "epoch": 0.5137338137195449, + "grad_norm": 0.21825046837329865, + "learning_rate": 2.815516509167388e-05, + "loss": 0.1447, + "step": 28803 + }, + { + "epoch": 0.5137516498412585, + "grad_norm": 0.1991797536611557, + "learning_rate": 2.8153621023840744e-05, + "loss": 0.1276, + "step": 28804 + }, + { + "epoch": 0.5137694859629722, + "grad_norm": 0.2877190411090851, + "learning_rate": 2.815207694378308e-05, + "loss": 0.0983, + "step": 28805 + }, + { + "epoch": 0.5137873220846859, + "grad_norm": 0.42595672607421875, + "learning_rate": 2.815053285150688e-05, + "loss": 0.1153, + "step": 28806 + }, + { + "epoch": 0.5138051582063996, + "grad_norm": 0.35596680641174316, + "learning_rate": 2.8148988747018124e-05, + "loss": 0.1587, + "step": 28807 + }, + { + "epoch": 0.5138229943281133, + "grad_norm": 0.30313313007354736, + "learning_rate": 2.8147444630322805e-05, + "loss": 0.1167, + "step": 28808 + }, + { + "epoch": 0.513840830449827, + "grad_norm": 0.23832905292510986, + "learning_rate": 2.8145900501426897e-05, + "loss": 0.1077, + "step": 28809 + }, + { + "epoch": 0.5138586665715407, + "grad_norm": 0.26534879207611084, + "learning_rate": 2.81443563603364e-05, + "loss": 0.1696, + "step": 28810 + }, + { + "epoch": 0.5138765026932544, + "grad_norm": 0.20336323976516724, + "learning_rate": 2.814281220705728e-05, + "loss": 0.1211, + "step": 28811 + }, + { + "epoch": 0.513894338814968, + "grad_norm": 0.27276936173439026, + "learning_rate": 2.8141268041595542e-05, + "loss": 0.1533, + "step": 28812 + }, + { + "epoch": 0.5139121749366817, + "grad_norm": 0.24836784601211548, + "learning_rate": 2.8139723863957157e-05, + "loss": 0.0976, + "step": 28813 + }, + { + "epoch": 0.5139300110583954, + "grad_norm": 0.20647704601287842, + "learning_rate": 2.813817967414813e-05, + "loss": 0.1285, + "step": 28814 + }, + { + "epoch": 0.5139478471801091, + "grad_norm": 0.22772224247455597, + "learning_rate": 2.8136635472174427e-05, + "loss": 0.1249, + "step": 28815 + }, + { + "epoch": 0.5139656833018229, + "grad_norm": 0.23963771760463715, + "learning_rate": 2.8135091258042045e-05, + "loss": 0.1704, + "step": 28816 + }, + { + "epoch": 0.5139835194235366, + "grad_norm": 0.24223528802394867, + "learning_rate": 2.8133547031756962e-05, + "loss": 0.1176, + "step": 28817 + }, + { + "epoch": 0.5140013555452503, + "grad_norm": 0.22984950244426727, + "learning_rate": 2.8132002793325173e-05, + "loss": 0.1379, + "step": 28818 + }, + { + "epoch": 0.514019191666964, + "grad_norm": 0.2879546284675598, + "learning_rate": 2.8130458542752657e-05, + "loss": 0.1281, + "step": 28819 + }, + { + "epoch": 0.5140370277886777, + "grad_norm": 0.2941230237483978, + "learning_rate": 2.8128914280045405e-05, + "loss": 0.1844, + "step": 28820 + }, + { + "epoch": 0.5140548639103913, + "grad_norm": 0.23569253087043762, + "learning_rate": 2.8127370005209397e-05, + "loss": 0.1205, + "step": 28821 + }, + { + "epoch": 0.514072700032105, + "grad_norm": 0.29706260561943054, + "learning_rate": 2.8125825718250627e-05, + "loss": 0.1731, + "step": 28822 + }, + { + "epoch": 0.5140905361538187, + "grad_norm": 0.3536146581172943, + "learning_rate": 2.8124281419175073e-05, + "loss": 0.1269, + "step": 28823 + }, + { + "epoch": 0.5141083722755324, + "grad_norm": 0.2389230877161026, + "learning_rate": 2.8122737107988727e-05, + "loss": 0.125, + "step": 28824 + }, + { + "epoch": 0.5141262083972461, + "grad_norm": 0.33298254013061523, + "learning_rate": 2.8121192784697576e-05, + "loss": 0.1626, + "step": 28825 + }, + { + "epoch": 0.5141440445189598, + "grad_norm": 0.3502786457538605, + "learning_rate": 2.8119648449307596e-05, + "loss": 0.1093, + "step": 28826 + }, + { + "epoch": 0.5141618806406735, + "grad_norm": 0.18796348571777344, + "learning_rate": 2.8118104101824784e-05, + "loss": 0.1098, + "step": 28827 + }, + { + "epoch": 0.5141797167623872, + "grad_norm": 0.23002715408802032, + "learning_rate": 2.8116559742255123e-05, + "loss": 0.1131, + "step": 28828 + }, + { + "epoch": 0.5141975528841009, + "grad_norm": 0.24341636896133423, + "learning_rate": 2.8115015370604608e-05, + "loss": 0.1394, + "step": 28829 + }, + { + "epoch": 0.5142153890058145, + "grad_norm": 0.365753173828125, + "learning_rate": 2.8113470986879207e-05, + "loss": 0.1331, + "step": 28830 + }, + { + "epoch": 0.5142332251275282, + "grad_norm": 0.2563750147819519, + "learning_rate": 2.8111926591084926e-05, + "loss": 0.1522, + "step": 28831 + }, + { + "epoch": 0.5142510612492419, + "grad_norm": 0.2598399519920349, + "learning_rate": 2.8110382183227736e-05, + "loss": 0.1143, + "step": 28832 + }, + { + "epoch": 0.5142688973709557, + "grad_norm": 0.3067164123058319, + "learning_rate": 2.810883776331364e-05, + "loss": 0.1051, + "step": 28833 + }, + { + "epoch": 0.5142867334926694, + "grad_norm": 0.20387589931488037, + "learning_rate": 2.8107293331348605e-05, + "loss": 0.1195, + "step": 28834 + }, + { + "epoch": 0.5143045696143831, + "grad_norm": 0.21908961236476898, + "learning_rate": 2.810574888733863e-05, + "loss": 0.1221, + "step": 28835 + }, + { + "epoch": 0.5143224057360968, + "grad_norm": 0.3255521357059479, + "learning_rate": 2.81042044312897e-05, + "loss": 0.174, + "step": 28836 + }, + { + "epoch": 0.5143402418578105, + "grad_norm": 0.24000081419944763, + "learning_rate": 2.81026599632078e-05, + "loss": 0.1527, + "step": 28837 + }, + { + "epoch": 0.5143580779795242, + "grad_norm": 0.24934221804141998, + "learning_rate": 2.810111548309892e-05, + "loss": 0.1142, + "step": 28838 + }, + { + "epoch": 0.5143759141012378, + "grad_norm": 0.28087320923805237, + "learning_rate": 2.809957099096905e-05, + "loss": 0.153, + "step": 28839 + }, + { + "epoch": 0.5143937502229515, + "grad_norm": 0.269821435213089, + "learning_rate": 2.8098026486824165e-05, + "loss": 0.1001, + "step": 28840 + }, + { + "epoch": 0.5144115863446652, + "grad_norm": 0.24234291911125183, + "learning_rate": 2.8096481970670256e-05, + "loss": 0.161, + "step": 28841 + }, + { + "epoch": 0.5144294224663789, + "grad_norm": 0.22775107622146606, + "learning_rate": 2.8094937442513324e-05, + "loss": 0.1224, + "step": 28842 + }, + { + "epoch": 0.5144472585880926, + "grad_norm": 0.332675039768219, + "learning_rate": 2.809339290235934e-05, + "loss": 0.2071, + "step": 28843 + }, + { + "epoch": 0.5144650947098063, + "grad_norm": 0.18648183345794678, + "learning_rate": 2.80918483502143e-05, + "loss": 0.1092, + "step": 28844 + }, + { + "epoch": 0.51448293083152, + "grad_norm": 0.24979731440544128, + "learning_rate": 2.8090303786084183e-05, + "loss": 0.1822, + "step": 28845 + }, + { + "epoch": 0.5145007669532337, + "grad_norm": 0.25101134181022644, + "learning_rate": 2.8088759209974984e-05, + "loss": 0.1212, + "step": 28846 + }, + { + "epoch": 0.5145186030749473, + "grad_norm": 0.25630924105644226, + "learning_rate": 2.8087214621892687e-05, + "loss": 0.2035, + "step": 28847 + }, + { + "epoch": 0.514536439196661, + "grad_norm": 0.23433223366737366, + "learning_rate": 2.808567002184328e-05, + "loss": 0.1194, + "step": 28848 + }, + { + "epoch": 0.5145542753183747, + "grad_norm": 0.2651732265949249, + "learning_rate": 2.808412540983275e-05, + "loss": 0.1289, + "step": 28849 + }, + { + "epoch": 0.5145721114400885, + "grad_norm": 0.29305675625801086, + "learning_rate": 2.8082580785867085e-05, + "loss": 0.2218, + "step": 28850 + }, + { + "epoch": 0.5145899475618022, + "grad_norm": 0.24327361583709717, + "learning_rate": 2.808103614995228e-05, + "loss": 0.116, + "step": 28851 + }, + { + "epoch": 0.5146077836835159, + "grad_norm": 0.3182009756565094, + "learning_rate": 2.80794915020943e-05, + "loss": 0.1883, + "step": 28852 + }, + { + "epoch": 0.5146256198052296, + "grad_norm": 0.21386811137199402, + "learning_rate": 2.8077946842299158e-05, + "loss": 0.1179, + "step": 28853 + }, + { + "epoch": 0.5146434559269433, + "grad_norm": 0.28595349192619324, + "learning_rate": 2.8076402170572825e-05, + "loss": 0.1424, + "step": 28854 + }, + { + "epoch": 0.514661292048657, + "grad_norm": 0.3199631869792938, + "learning_rate": 2.8074857486921298e-05, + "loss": 0.1098, + "step": 28855 + }, + { + "epoch": 0.5146791281703706, + "grad_norm": 0.21425190567970276, + "learning_rate": 2.8073312791350554e-05, + "loss": 0.0952, + "step": 28856 + }, + { + "epoch": 0.5146969642920843, + "grad_norm": 0.20657610893249512, + "learning_rate": 2.80717680838666e-05, + "loss": 0.0961, + "step": 28857 + }, + { + "epoch": 0.514714800413798, + "grad_norm": 0.23270902037620544, + "learning_rate": 2.80702233644754e-05, + "loss": 0.132, + "step": 28858 + }, + { + "epoch": 0.5147326365355117, + "grad_norm": 0.29292598366737366, + "learning_rate": 2.8068678633182967e-05, + "loss": 0.1006, + "step": 28859 + }, + { + "epoch": 0.5147504726572254, + "grad_norm": 0.36392742395401, + "learning_rate": 2.8067133889995273e-05, + "loss": 0.1672, + "step": 28860 + }, + { + "epoch": 0.5147683087789391, + "grad_norm": 0.2515296936035156, + "learning_rate": 2.8065589134918296e-05, + "loss": 0.1099, + "step": 28861 + }, + { + "epoch": 0.5147861449006528, + "grad_norm": 0.46453461050987244, + "learning_rate": 2.806404436795805e-05, + "loss": 0.1492, + "step": 28862 + }, + { + "epoch": 0.5148039810223665, + "grad_norm": 0.28568825125694275, + "learning_rate": 2.80624995891205e-05, + "loss": 0.1408, + "step": 28863 + }, + { + "epoch": 0.5148218171440802, + "grad_norm": 0.21573366224765778, + "learning_rate": 2.8060954798411656e-05, + "loss": 0.1344, + "step": 28864 + }, + { + "epoch": 0.5148396532657938, + "grad_norm": 0.3369169235229492, + "learning_rate": 2.8059409995837482e-05, + "loss": 0.1965, + "step": 28865 + }, + { + "epoch": 0.5148574893875075, + "grad_norm": 0.3836381733417511, + "learning_rate": 2.8057865181403987e-05, + "loss": 0.124, + "step": 28866 + }, + { + "epoch": 0.5148753255092213, + "grad_norm": 0.2967154085636139, + "learning_rate": 2.8056320355117143e-05, + "loss": 0.1414, + "step": 28867 + }, + { + "epoch": 0.514893161630935, + "grad_norm": 0.22564348578453064, + "learning_rate": 2.8054775516982944e-05, + "loss": 0.0991, + "step": 28868 + }, + { + "epoch": 0.5149109977526487, + "grad_norm": 0.3808143734931946, + "learning_rate": 2.8053230667007387e-05, + "loss": 0.2282, + "step": 28869 + }, + { + "epoch": 0.5149288338743624, + "grad_norm": 0.2590887248516083, + "learning_rate": 2.8051685805196455e-05, + "loss": 0.1223, + "step": 28870 + }, + { + "epoch": 0.5149466699960761, + "grad_norm": 0.30746832489967346, + "learning_rate": 2.805014093155613e-05, + "loss": 0.1409, + "step": 28871 + }, + { + "epoch": 0.5149645061177898, + "grad_norm": 0.2892451286315918, + "learning_rate": 2.8048596046092406e-05, + "loss": 0.1193, + "step": 28872 + }, + { + "epoch": 0.5149823422395035, + "grad_norm": 0.2334245890378952, + "learning_rate": 2.8047051148811275e-05, + "loss": 0.0944, + "step": 28873 + }, + { + "epoch": 0.5150001783612171, + "grad_norm": 0.22783909738063812, + "learning_rate": 2.8045506239718717e-05, + "loss": 0.1899, + "step": 28874 + }, + { + "epoch": 0.5150180144829308, + "grad_norm": 0.21311908960342407, + "learning_rate": 2.8043961318820722e-05, + "loss": 0.1148, + "step": 28875 + }, + { + "epoch": 0.5150358506046445, + "grad_norm": 0.268890380859375, + "learning_rate": 2.8042416386123284e-05, + "loss": 0.1612, + "step": 28876 + }, + { + "epoch": 0.5150536867263582, + "grad_norm": 0.4682801365852356, + "learning_rate": 2.8040871441632393e-05, + "loss": 0.1884, + "step": 28877 + }, + { + "epoch": 0.5150715228480719, + "grad_norm": 0.27963343262672424, + "learning_rate": 2.8039326485354028e-05, + "loss": 0.153, + "step": 28878 + }, + { + "epoch": 0.5150893589697856, + "grad_norm": 0.21740885078907013, + "learning_rate": 2.8037781517294194e-05, + "loss": 0.084, + "step": 28879 + }, + { + "epoch": 0.5151071950914993, + "grad_norm": 0.2583710849285126, + "learning_rate": 2.803623653745886e-05, + "loss": 0.1095, + "step": 28880 + }, + { + "epoch": 0.515125031213213, + "grad_norm": 0.22081707417964935, + "learning_rate": 2.803469154585403e-05, + "loss": 0.1078, + "step": 28881 + }, + { + "epoch": 0.5151428673349266, + "grad_norm": 0.22243289649486542, + "learning_rate": 2.8033146542485683e-05, + "loss": 0.1812, + "step": 28882 + }, + { + "epoch": 0.5151607034566403, + "grad_norm": 0.2706349492073059, + "learning_rate": 2.8031601527359813e-05, + "loss": 0.1602, + "step": 28883 + }, + { + "epoch": 0.5151785395783541, + "grad_norm": 0.22824732959270477, + "learning_rate": 2.8030056500482404e-05, + "loss": 0.1276, + "step": 28884 + }, + { + "epoch": 0.5151963757000678, + "grad_norm": 0.186862051486969, + "learning_rate": 2.8028511461859457e-05, + "loss": 0.1105, + "step": 28885 + }, + { + "epoch": 0.5152142118217815, + "grad_norm": 0.27251294255256653, + "learning_rate": 2.802696641149696e-05, + "loss": 0.1169, + "step": 28886 + }, + { + "epoch": 0.5152320479434952, + "grad_norm": 0.3063884377479553, + "learning_rate": 2.8025421349400883e-05, + "loss": 0.1421, + "step": 28887 + }, + { + "epoch": 0.5152498840652089, + "grad_norm": 0.22104158997535706, + "learning_rate": 2.802387627557724e-05, + "loss": 0.1263, + "step": 28888 + }, + { + "epoch": 0.5152677201869226, + "grad_norm": 0.3139147162437439, + "learning_rate": 2.8022331190031987e-05, + "loss": 0.2088, + "step": 28889 + }, + { + "epoch": 0.5152855563086363, + "grad_norm": 0.2928353250026703, + "learning_rate": 2.802078609277115e-05, + "loss": 0.1138, + "step": 28890 + }, + { + "epoch": 0.51530339243035, + "grad_norm": 0.35080447793006897, + "learning_rate": 2.8019240983800698e-05, + "loss": 0.1461, + "step": 28891 + }, + { + "epoch": 0.5153212285520636, + "grad_norm": 0.26810023188591003, + "learning_rate": 2.8017695863126635e-05, + "loss": 0.0917, + "step": 28892 + }, + { + "epoch": 0.5153390646737773, + "grad_norm": 0.2373628467321396, + "learning_rate": 2.8016150730754932e-05, + "loss": 0.1108, + "step": 28893 + }, + { + "epoch": 0.515356900795491, + "grad_norm": 0.3014053702354431, + "learning_rate": 2.801460558669159e-05, + "loss": 0.1879, + "step": 28894 + }, + { + "epoch": 0.5153747369172047, + "grad_norm": 0.2637397348880768, + "learning_rate": 2.801306043094259e-05, + "loss": 0.1452, + "step": 28895 + }, + { + "epoch": 0.5153925730389184, + "grad_norm": 0.28406721353530884, + "learning_rate": 2.8011515263513927e-05, + "loss": 0.1463, + "step": 28896 + }, + { + "epoch": 0.5154104091606321, + "grad_norm": 0.24499158561229706, + "learning_rate": 2.8009970084411596e-05, + "loss": 0.1317, + "step": 28897 + }, + { + "epoch": 0.5154282452823458, + "grad_norm": 0.30253899097442627, + "learning_rate": 2.8008424893641582e-05, + "loss": 0.1543, + "step": 28898 + }, + { + "epoch": 0.5154460814040595, + "grad_norm": 0.47803977131843567, + "learning_rate": 2.8006879691209877e-05, + "loss": 0.1407, + "step": 28899 + }, + { + "epoch": 0.5154639175257731, + "grad_norm": 0.24904556572437286, + "learning_rate": 2.800533447712246e-05, + "loss": 0.1447, + "step": 28900 + }, + { + "epoch": 0.5154817536474869, + "grad_norm": 0.2349046766757965, + "learning_rate": 2.8003789251385333e-05, + "loss": 0.1567, + "step": 28901 + }, + { + "epoch": 0.5154995897692006, + "grad_norm": 0.24299566447734833, + "learning_rate": 2.8002244014004475e-05, + "loss": 0.114, + "step": 28902 + }, + { + "epoch": 0.5155174258909143, + "grad_norm": 0.2111862152814865, + "learning_rate": 2.800069876498589e-05, + "loss": 0.1045, + "step": 28903 + }, + { + "epoch": 0.515535262012628, + "grad_norm": 0.2480131983757019, + "learning_rate": 2.7999153504335552e-05, + "loss": 0.0966, + "step": 28904 + }, + { + "epoch": 0.5155530981343417, + "grad_norm": 0.29171517491340637, + "learning_rate": 2.7997608232059468e-05, + "loss": 0.2029, + "step": 28905 + }, + { + "epoch": 0.5155709342560554, + "grad_norm": 0.25620126724243164, + "learning_rate": 2.7996062948163616e-05, + "loss": 0.1987, + "step": 28906 + }, + { + "epoch": 0.5155887703777691, + "grad_norm": 0.26390090584754944, + "learning_rate": 2.799451765265399e-05, + "loss": 0.1601, + "step": 28907 + }, + { + "epoch": 0.5156066064994828, + "grad_norm": 0.3143174350261688, + "learning_rate": 2.7992972345536572e-05, + "loss": 0.1274, + "step": 28908 + }, + { + "epoch": 0.5156244426211964, + "grad_norm": 0.3383817672729492, + "learning_rate": 2.799142702681737e-05, + "loss": 0.1601, + "step": 28909 + }, + { + "epoch": 0.5156422787429101, + "grad_norm": 0.3445188105106354, + "learning_rate": 2.7989881696502357e-05, + "loss": 0.1802, + "step": 28910 + }, + { + "epoch": 0.5156601148646238, + "grad_norm": 0.22919434309005737, + "learning_rate": 2.798833635459753e-05, + "loss": 0.0902, + "step": 28911 + }, + { + "epoch": 0.5156779509863375, + "grad_norm": 0.2137250304222107, + "learning_rate": 2.7986791001108888e-05, + "loss": 0.1091, + "step": 28912 + }, + { + "epoch": 0.5156957871080512, + "grad_norm": 0.24877431988716125, + "learning_rate": 2.79852456360424e-05, + "loss": 0.158, + "step": 28913 + }, + { + "epoch": 0.5157136232297649, + "grad_norm": 0.2536190152168274, + "learning_rate": 2.798370025940408e-05, + "loss": 0.1133, + "step": 28914 + }, + { + "epoch": 0.5157314593514786, + "grad_norm": 0.2222224920988083, + "learning_rate": 2.7982154871199894e-05, + "loss": 0.1428, + "step": 28915 + }, + { + "epoch": 0.5157492954731923, + "grad_norm": 0.33202672004699707, + "learning_rate": 2.7980609471435854e-05, + "loss": 0.1561, + "step": 28916 + }, + { + "epoch": 0.5157671315949061, + "grad_norm": 0.46549445390701294, + "learning_rate": 2.797906406011794e-05, + "loss": 0.1488, + "step": 28917 + }, + { + "epoch": 0.5157849677166197, + "grad_norm": 0.18265798687934875, + "learning_rate": 2.7977518637252146e-05, + "loss": 0.1164, + "step": 28918 + }, + { + "epoch": 0.5158028038383334, + "grad_norm": 0.25831854343414307, + "learning_rate": 2.7975973202844463e-05, + "loss": 0.1553, + "step": 28919 + }, + { + "epoch": 0.5158206399600471, + "grad_norm": 0.2652304470539093, + "learning_rate": 2.7974427756900878e-05, + "loss": 0.1064, + "step": 28920 + }, + { + "epoch": 0.5158384760817608, + "grad_norm": 0.258872389793396, + "learning_rate": 2.7972882299427382e-05, + "loss": 0.1748, + "step": 28921 + }, + { + "epoch": 0.5158563122034745, + "grad_norm": 0.26258233189582825, + "learning_rate": 2.7971336830429973e-05, + "loss": 0.1822, + "step": 28922 + }, + { + "epoch": 0.5158741483251882, + "grad_norm": 0.27374476194381714, + "learning_rate": 2.796979134991463e-05, + "loss": 0.1309, + "step": 28923 + }, + { + "epoch": 0.5158919844469019, + "grad_norm": 0.31991007924079895, + "learning_rate": 2.7968245857887353e-05, + "loss": 0.1804, + "step": 28924 + }, + { + "epoch": 0.5159098205686156, + "grad_norm": 0.21791322529315948, + "learning_rate": 2.796670035435413e-05, + "loss": 0.1494, + "step": 28925 + }, + { + "epoch": 0.5159276566903293, + "grad_norm": 0.28283265233039856, + "learning_rate": 2.796515483932095e-05, + "loss": 0.1455, + "step": 28926 + }, + { + "epoch": 0.5159454928120429, + "grad_norm": 0.34426149725914, + "learning_rate": 2.7963609312793813e-05, + "loss": 0.1569, + "step": 28927 + }, + { + "epoch": 0.5159633289337566, + "grad_norm": 0.27180472016334534, + "learning_rate": 2.7962063774778696e-05, + "loss": 0.1472, + "step": 28928 + }, + { + "epoch": 0.5159811650554703, + "grad_norm": 0.29504522681236267, + "learning_rate": 2.7960518225281603e-05, + "loss": 0.1326, + "step": 28929 + }, + { + "epoch": 0.515999001177184, + "grad_norm": 0.38925954699516296, + "learning_rate": 2.7958972664308507e-05, + "loss": 0.1284, + "step": 28930 + }, + { + "epoch": 0.5160168372988977, + "grad_norm": 0.21360723674297333, + "learning_rate": 2.7957427091865417e-05, + "loss": 0.0769, + "step": 28931 + }, + { + "epoch": 0.5160346734206114, + "grad_norm": 0.22349615395069122, + "learning_rate": 2.7955881507958316e-05, + "loss": 0.1205, + "step": 28932 + }, + { + "epoch": 0.5160525095423251, + "grad_norm": 0.2895841598510742, + "learning_rate": 2.7954335912593206e-05, + "loss": 0.1502, + "step": 28933 + }, + { + "epoch": 0.5160703456640389, + "grad_norm": 0.22992043197155, + "learning_rate": 2.7952790305776062e-05, + "loss": 0.1245, + "step": 28934 + }, + { + "epoch": 0.5160881817857526, + "grad_norm": 0.3140040934085846, + "learning_rate": 2.7951244687512884e-05, + "loss": 0.1358, + "step": 28935 + }, + { + "epoch": 0.5161060179074662, + "grad_norm": 0.3326644003391266, + "learning_rate": 2.7949699057809665e-05, + "loss": 0.1577, + "step": 28936 + }, + { + "epoch": 0.5161238540291799, + "grad_norm": 0.3024000823497772, + "learning_rate": 2.794815341667239e-05, + "loss": 0.1246, + "step": 28937 + }, + { + "epoch": 0.5161416901508936, + "grad_norm": 0.2273421734571457, + "learning_rate": 2.7946607764107046e-05, + "loss": 0.1294, + "step": 28938 + }, + { + "epoch": 0.5161595262726073, + "grad_norm": 0.27051490545272827, + "learning_rate": 2.7945062100119644e-05, + "loss": 0.1231, + "step": 28939 + }, + { + "epoch": 0.516177362394321, + "grad_norm": 0.2705560624599457, + "learning_rate": 2.7943516424716172e-05, + "loss": 0.1417, + "step": 28940 + }, + { + "epoch": 0.5161951985160347, + "grad_norm": 0.21922354400157928, + "learning_rate": 2.79419707379026e-05, + "loss": 0.1094, + "step": 28941 + }, + { + "epoch": 0.5162130346377484, + "grad_norm": 0.4108131527900696, + "learning_rate": 2.7940425039684943e-05, + "loss": 0.1551, + "step": 28942 + }, + { + "epoch": 0.5162308707594621, + "grad_norm": 0.24285483360290527, + "learning_rate": 2.7938879330069173e-05, + "loss": 0.1516, + "step": 28943 + }, + { + "epoch": 0.5162487068811757, + "grad_norm": 0.23992125689983368, + "learning_rate": 2.793733360906129e-05, + "loss": 0.1225, + "step": 28944 + }, + { + "epoch": 0.5162665430028894, + "grad_norm": 0.27711063623428345, + "learning_rate": 2.793578787666729e-05, + "loss": 0.1758, + "step": 28945 + }, + { + "epoch": 0.5162843791246031, + "grad_norm": 0.2801652252674103, + "learning_rate": 2.7934242132893173e-05, + "loss": 0.1871, + "step": 28946 + }, + { + "epoch": 0.5163022152463168, + "grad_norm": 0.2829630374908447, + "learning_rate": 2.793269637774491e-05, + "loss": 0.1377, + "step": 28947 + }, + { + "epoch": 0.5163200513680305, + "grad_norm": 0.28745269775390625, + "learning_rate": 2.7931150611228507e-05, + "loss": 0.154, + "step": 28948 + }, + { + "epoch": 0.5163378874897442, + "grad_norm": 0.3312828242778778, + "learning_rate": 2.792960483334995e-05, + "loss": 0.1329, + "step": 28949 + }, + { + "epoch": 0.5163557236114579, + "grad_norm": 0.27629029750823975, + "learning_rate": 2.7928059044115233e-05, + "loss": 0.1651, + "step": 28950 + }, + { + "epoch": 0.5163735597331717, + "grad_norm": 0.22156387567520142, + "learning_rate": 2.792651324353035e-05, + "loss": 0.1333, + "step": 28951 + }, + { + "epoch": 0.5163913958548854, + "grad_norm": 0.315434992313385, + "learning_rate": 2.7924967431601285e-05, + "loss": 0.1783, + "step": 28952 + }, + { + "epoch": 0.516409231976599, + "grad_norm": 0.25512686371803284, + "learning_rate": 2.7923421608334043e-05, + "loss": 0.1461, + "step": 28953 + }, + { + "epoch": 0.5164270680983127, + "grad_norm": 0.21157903969287872, + "learning_rate": 2.79218757737346e-05, + "loss": 0.099, + "step": 28954 + }, + { + "epoch": 0.5164449042200264, + "grad_norm": 0.547171950340271, + "learning_rate": 2.792032992780897e-05, + "loss": 0.1832, + "step": 28955 + }, + { + "epoch": 0.5164627403417401, + "grad_norm": 0.30515196919441223, + "learning_rate": 2.791878407056312e-05, + "loss": 0.1639, + "step": 28956 + }, + { + "epoch": 0.5164805764634538, + "grad_norm": 0.3291718661785126, + "learning_rate": 2.7917238202003065e-05, + "loss": 0.1372, + "step": 28957 + }, + { + "epoch": 0.5164984125851675, + "grad_norm": 0.2584816813468933, + "learning_rate": 2.7915692322134777e-05, + "loss": 0.1391, + "step": 28958 + }, + { + "epoch": 0.5165162487068812, + "grad_norm": 0.27601930499076843, + "learning_rate": 2.7914146430964265e-05, + "loss": 0.1998, + "step": 28959 + }, + { + "epoch": 0.5165340848285949, + "grad_norm": 0.20140008628368378, + "learning_rate": 2.791260052849751e-05, + "loss": 0.1309, + "step": 28960 + }, + { + "epoch": 0.5165519209503086, + "grad_norm": 0.24996884167194366, + "learning_rate": 2.7911054614740516e-05, + "loss": 0.0593, + "step": 28961 + }, + { + "epoch": 0.5165697570720222, + "grad_norm": 0.25987932085990906, + "learning_rate": 2.7909508689699264e-05, + "loss": 0.1437, + "step": 28962 + }, + { + "epoch": 0.5165875931937359, + "grad_norm": 0.4900023937225342, + "learning_rate": 2.7907962753379764e-05, + "loss": 0.1584, + "step": 28963 + }, + { + "epoch": 0.5166054293154496, + "grad_norm": 0.2832129895687103, + "learning_rate": 2.7906416805787984e-05, + "loss": 0.1665, + "step": 28964 + }, + { + "epoch": 0.5166232654371633, + "grad_norm": 0.23083336651325226, + "learning_rate": 2.790487084692993e-05, + "loss": 0.1452, + "step": 28965 + }, + { + "epoch": 0.516641101558877, + "grad_norm": 0.30606067180633545, + "learning_rate": 2.7903324876811593e-05, + "loss": 0.1336, + "step": 28966 + }, + { + "epoch": 0.5166589376805907, + "grad_norm": 0.2836165428161621, + "learning_rate": 2.7901778895438963e-05, + "loss": 0.1383, + "step": 28967 + }, + { + "epoch": 0.5166767738023045, + "grad_norm": 0.26837658882141113, + "learning_rate": 2.790023290281804e-05, + "loss": 0.1804, + "step": 28968 + }, + { + "epoch": 0.5166946099240182, + "grad_norm": 0.29959067702293396, + "learning_rate": 2.7898686898954813e-05, + "loss": 0.1142, + "step": 28969 + }, + { + "epoch": 0.5167124460457319, + "grad_norm": 0.17766804993152618, + "learning_rate": 2.7897140883855276e-05, + "loss": 0.1368, + "step": 28970 + }, + { + "epoch": 0.5167302821674455, + "grad_norm": 0.2631548345088959, + "learning_rate": 2.7895594857525413e-05, + "loss": 0.1992, + "step": 28971 + }, + { + "epoch": 0.5167481182891592, + "grad_norm": 0.21336770057678223, + "learning_rate": 2.789404881997123e-05, + "loss": 0.0953, + "step": 28972 + }, + { + "epoch": 0.5167659544108729, + "grad_norm": 0.24261608719825745, + "learning_rate": 2.7892502771198707e-05, + "loss": 0.153, + "step": 28973 + }, + { + "epoch": 0.5167837905325866, + "grad_norm": 0.25070300698280334, + "learning_rate": 2.7890956711213854e-05, + "loss": 0.1566, + "step": 28974 + }, + { + "epoch": 0.5168016266543003, + "grad_norm": 0.1853751540184021, + "learning_rate": 2.788941064002265e-05, + "loss": 0.1083, + "step": 28975 + }, + { + "epoch": 0.516819462776014, + "grad_norm": 0.2470073103904724, + "learning_rate": 2.78878645576311e-05, + "loss": 0.1427, + "step": 28976 + }, + { + "epoch": 0.5168372988977277, + "grad_norm": 0.35444435477256775, + "learning_rate": 2.7886318464045174e-05, + "loss": 0.1034, + "step": 28977 + }, + { + "epoch": 0.5168551350194414, + "grad_norm": 0.23380325734615326, + "learning_rate": 2.7884772359270893e-05, + "loss": 0.1285, + "step": 28978 + }, + { + "epoch": 0.516872971141155, + "grad_norm": 0.22492867708206177, + "learning_rate": 2.7883226243314238e-05, + "loss": 0.1235, + "step": 28979 + }, + { + "epoch": 0.5168908072628687, + "grad_norm": 0.2778223156929016, + "learning_rate": 2.788168011618119e-05, + "loss": 0.119, + "step": 28980 + }, + { + "epoch": 0.5169086433845824, + "grad_norm": 0.22704172134399414, + "learning_rate": 2.788013397787777e-05, + "loss": 0.0989, + "step": 28981 + }, + { + "epoch": 0.5169264795062961, + "grad_norm": 0.242509663105011, + "learning_rate": 2.7878587828409948e-05, + "loss": 0.1184, + "step": 28982 + }, + { + "epoch": 0.5169443156280098, + "grad_norm": 0.2854163944721222, + "learning_rate": 2.7877041667783726e-05, + "loss": 0.1994, + "step": 28983 + }, + { + "epoch": 0.5169621517497235, + "grad_norm": 0.22003023326396942, + "learning_rate": 2.7875495496005093e-05, + "loss": 0.137, + "step": 28984 + }, + { + "epoch": 0.5169799878714373, + "grad_norm": 0.20889660716056824, + "learning_rate": 2.7873949313080055e-05, + "loss": 0.1318, + "step": 28985 + }, + { + "epoch": 0.516997823993151, + "grad_norm": 0.2641153335571289, + "learning_rate": 2.787240311901459e-05, + "loss": 0.1394, + "step": 28986 + }, + { + "epoch": 0.5170156601148647, + "grad_norm": 0.23657356202602386, + "learning_rate": 2.7870856913814703e-05, + "loss": 0.0823, + "step": 28987 + }, + { + "epoch": 0.5170334962365783, + "grad_norm": 0.27231404185295105, + "learning_rate": 2.786931069748638e-05, + "loss": 0.1608, + "step": 28988 + }, + { + "epoch": 0.517051332358292, + "grad_norm": 0.31724461913108826, + "learning_rate": 2.7867764470035617e-05, + "loss": 0.1265, + "step": 28989 + }, + { + "epoch": 0.5170691684800057, + "grad_norm": 0.27262791991233826, + "learning_rate": 2.786621823146841e-05, + "loss": 0.1516, + "step": 28990 + }, + { + "epoch": 0.5170870046017194, + "grad_norm": 0.18548011779785156, + "learning_rate": 2.786467198179075e-05, + "loss": 0.0942, + "step": 28991 + }, + { + "epoch": 0.5171048407234331, + "grad_norm": 0.25160813331604004, + "learning_rate": 2.7863125721008637e-05, + "loss": 0.1057, + "step": 28992 + }, + { + "epoch": 0.5171226768451468, + "grad_norm": 0.20386798679828644, + "learning_rate": 2.786157944912805e-05, + "loss": 0.099, + "step": 28993 + }, + { + "epoch": 0.5171405129668605, + "grad_norm": 0.28697773814201355, + "learning_rate": 2.7860033166155004e-05, + "loss": 0.1494, + "step": 28994 + }, + { + "epoch": 0.5171583490885742, + "grad_norm": 0.3194122910499573, + "learning_rate": 2.785848687209547e-05, + "loss": 0.1452, + "step": 28995 + }, + { + "epoch": 0.5171761852102879, + "grad_norm": 0.2933448553085327, + "learning_rate": 2.7856940566955463e-05, + "loss": 0.1454, + "step": 28996 + }, + { + "epoch": 0.5171940213320015, + "grad_norm": 0.2207578867673874, + "learning_rate": 2.785539425074096e-05, + "loss": 0.1257, + "step": 28997 + }, + { + "epoch": 0.5172118574537152, + "grad_norm": 0.258308470249176, + "learning_rate": 2.7853847923457975e-05, + "loss": 0.1435, + "step": 28998 + }, + { + "epoch": 0.5172296935754289, + "grad_norm": 0.20282821357250214, + "learning_rate": 2.7852301585112478e-05, + "loss": 0.1141, + "step": 28999 + }, + { + "epoch": 0.5172475296971426, + "grad_norm": 0.2658439874649048, + "learning_rate": 2.7850755235710473e-05, + "loss": 0.1725, + "step": 29000 + }, + { + "epoch": 0.5172475296971426, + "eval_loss": 0.13551442325115204, + "eval_runtime": 106.861, + "eval_samples_per_second": 9.583, + "eval_steps_per_second": 1.6, + "step": 29000 + }, + { + "epoch": 0.5172653658188563, + "grad_norm": 0.24678544700145721, + "learning_rate": 2.7849208875257965e-05, + "loss": 0.141, + "step": 29001 + }, + { + "epoch": 0.5172832019405701, + "grad_norm": 0.3237248957157135, + "learning_rate": 2.7847662503760937e-05, + "loss": 0.1341, + "step": 29002 + }, + { + "epoch": 0.5173010380622838, + "grad_norm": 0.21365462243556976, + "learning_rate": 2.7846116121225385e-05, + "loss": 0.1237, + "step": 29003 + }, + { + "epoch": 0.5173188741839975, + "grad_norm": 0.3641394376754761, + "learning_rate": 2.7844569727657304e-05, + "loss": 0.1521, + "step": 29004 + }, + { + "epoch": 0.5173367103057112, + "grad_norm": 0.49921318888664246, + "learning_rate": 2.784302332306269e-05, + "loss": 0.178, + "step": 29005 + }, + { + "epoch": 0.5173545464274248, + "grad_norm": 0.23799487948417664, + "learning_rate": 2.7841476907447534e-05, + "loss": 0.1186, + "step": 29006 + }, + { + "epoch": 0.5173723825491385, + "grad_norm": 0.27667132019996643, + "learning_rate": 2.783993048081783e-05, + "loss": 0.1007, + "step": 29007 + }, + { + "epoch": 0.5173902186708522, + "grad_norm": 0.29321256279945374, + "learning_rate": 2.7838384043179572e-05, + "loss": 0.1526, + "step": 29008 + }, + { + "epoch": 0.5174080547925659, + "grad_norm": 0.2686637043952942, + "learning_rate": 2.783683759453876e-05, + "loss": 0.1477, + "step": 29009 + }, + { + "epoch": 0.5174258909142796, + "grad_norm": 0.36251071095466614, + "learning_rate": 2.7835291134901387e-05, + "loss": 0.1421, + "step": 29010 + }, + { + "epoch": 0.5174437270359933, + "grad_norm": 0.39520522952079773, + "learning_rate": 2.7833744664273454e-05, + "loss": 0.1732, + "step": 29011 + }, + { + "epoch": 0.517461563157707, + "grad_norm": 0.2549844980239868, + "learning_rate": 2.7832198182660936e-05, + "loss": 0.154, + "step": 29012 + }, + { + "epoch": 0.5174793992794207, + "grad_norm": 0.21941344439983368, + "learning_rate": 2.7830651690069848e-05, + "loss": 0.1374, + "step": 29013 + }, + { + "epoch": 0.5174972354011343, + "grad_norm": 0.2945975065231323, + "learning_rate": 2.7829105186506167e-05, + "loss": 0.1344, + "step": 29014 + }, + { + "epoch": 0.517515071522848, + "grad_norm": 0.25093361735343933, + "learning_rate": 2.7827558671975902e-05, + "loss": 0.0928, + "step": 29015 + }, + { + "epoch": 0.5175329076445617, + "grad_norm": 0.2622643709182739, + "learning_rate": 2.782601214648504e-05, + "loss": 0.1334, + "step": 29016 + }, + { + "epoch": 0.5175507437662754, + "grad_norm": 0.30701369047164917, + "learning_rate": 2.7824465610039585e-05, + "loss": 0.1423, + "step": 29017 + }, + { + "epoch": 0.5175685798879892, + "grad_norm": 0.2798667252063751, + "learning_rate": 2.7822919062645526e-05, + "loss": 0.1538, + "step": 29018 + }, + { + "epoch": 0.5175864160097029, + "grad_norm": 0.3005349636077881, + "learning_rate": 2.7821372504308852e-05, + "loss": 0.1775, + "step": 29019 + }, + { + "epoch": 0.5176042521314166, + "grad_norm": 0.2491803914308548, + "learning_rate": 2.7819825935035567e-05, + "loss": 0.0998, + "step": 29020 + }, + { + "epoch": 0.5176220882531303, + "grad_norm": 0.3449251651763916, + "learning_rate": 2.7818279354831656e-05, + "loss": 0.147, + "step": 29021 + }, + { + "epoch": 0.517639924374844, + "grad_norm": 0.33878713846206665, + "learning_rate": 2.781673276370313e-05, + "loss": 0.1641, + "step": 29022 + }, + { + "epoch": 0.5176577604965577, + "grad_norm": 0.23291771113872528, + "learning_rate": 2.781518616165597e-05, + "loss": 0.141, + "step": 29023 + }, + { + "epoch": 0.5176755966182713, + "grad_norm": 0.24511079490184784, + "learning_rate": 2.781363954869618e-05, + "loss": 0.1458, + "step": 29024 + }, + { + "epoch": 0.517693432739985, + "grad_norm": 0.26316049695014954, + "learning_rate": 2.7812092924829748e-05, + "loss": 0.131, + "step": 29025 + }, + { + "epoch": 0.5177112688616987, + "grad_norm": 0.24760164320468903, + "learning_rate": 2.781054629006268e-05, + "loss": 0.081, + "step": 29026 + }, + { + "epoch": 0.5177291049834124, + "grad_norm": 0.3154909610748291, + "learning_rate": 2.780899964440095e-05, + "loss": 0.1492, + "step": 29027 + }, + { + "epoch": 0.5177469411051261, + "grad_norm": 0.33660760521888733, + "learning_rate": 2.7807452987850575e-05, + "loss": 0.1145, + "step": 29028 + }, + { + "epoch": 0.5177647772268398, + "grad_norm": 0.1884072870016098, + "learning_rate": 2.780590632041754e-05, + "loss": 0.1326, + "step": 29029 + }, + { + "epoch": 0.5177826133485535, + "grad_norm": 0.2741774618625641, + "learning_rate": 2.7804359642107846e-05, + "loss": 0.0649, + "step": 29030 + }, + { + "epoch": 0.5178004494702672, + "grad_norm": 0.2877926528453827, + "learning_rate": 2.7802812952927492e-05, + "loss": 0.1806, + "step": 29031 + }, + { + "epoch": 0.5178182855919808, + "grad_norm": 0.20064689218997955, + "learning_rate": 2.7801266252882457e-05, + "loss": 0.1136, + "step": 29032 + }, + { + "epoch": 0.5178361217136945, + "grad_norm": 0.32631775736808777, + "learning_rate": 2.7799719541978754e-05, + "loss": 0.125, + "step": 29033 + }, + { + "epoch": 0.5178539578354082, + "grad_norm": 0.30685681104660034, + "learning_rate": 2.7798172820222363e-05, + "loss": 0.2023, + "step": 29034 + }, + { + "epoch": 0.517871793957122, + "grad_norm": 0.25200074911117554, + "learning_rate": 2.779662608761929e-05, + "loss": 0.1331, + "step": 29035 + }, + { + "epoch": 0.5178896300788357, + "grad_norm": 0.2657131254673004, + "learning_rate": 2.7795079344175524e-05, + "loss": 0.1312, + "step": 29036 + }, + { + "epoch": 0.5179074662005494, + "grad_norm": 0.19913235306739807, + "learning_rate": 2.7793532589897077e-05, + "loss": 0.0629, + "step": 29037 + }, + { + "epoch": 0.5179253023222631, + "grad_norm": 0.25144246220588684, + "learning_rate": 2.7791985824789923e-05, + "loss": 0.1778, + "step": 29038 + }, + { + "epoch": 0.5179431384439768, + "grad_norm": 0.29484477639198303, + "learning_rate": 2.7790439048860073e-05, + "loss": 0.1781, + "step": 29039 + }, + { + "epoch": 0.5179609745656905, + "grad_norm": 0.33371952176094055, + "learning_rate": 2.7788892262113514e-05, + "loss": 0.1563, + "step": 29040 + }, + { + "epoch": 0.5179788106874041, + "grad_norm": 0.19438757002353668, + "learning_rate": 2.7787345464556247e-05, + "loss": 0.122, + "step": 29041 + }, + { + "epoch": 0.5179966468091178, + "grad_norm": 0.20084218680858612, + "learning_rate": 2.7785798656194258e-05, + "loss": 0.1433, + "step": 29042 + }, + { + "epoch": 0.5180144829308315, + "grad_norm": 0.27779170870780945, + "learning_rate": 2.7784251837033564e-05, + "loss": 0.1443, + "step": 29043 + }, + { + "epoch": 0.5180323190525452, + "grad_norm": 0.2793269157409668, + "learning_rate": 2.7782705007080146e-05, + "loss": 0.1246, + "step": 29044 + }, + { + "epoch": 0.5180501551742589, + "grad_norm": 0.28280022740364075, + "learning_rate": 2.7781158166339997e-05, + "loss": 0.1882, + "step": 29045 + }, + { + "epoch": 0.5180679912959726, + "grad_norm": 0.262104868888855, + "learning_rate": 2.7779611314819125e-05, + "loss": 0.17, + "step": 29046 + }, + { + "epoch": 0.5180858274176863, + "grad_norm": 0.30261287093162537, + "learning_rate": 2.7778064452523512e-05, + "loss": 0.1248, + "step": 29047 + }, + { + "epoch": 0.5181036635394, + "grad_norm": 0.2441805750131607, + "learning_rate": 2.777651757945916e-05, + "loss": 0.1328, + "step": 29048 + }, + { + "epoch": 0.5181214996611136, + "grad_norm": 0.3091019093990326, + "learning_rate": 2.7774970695632075e-05, + "loss": 0.1663, + "step": 29049 + }, + { + "epoch": 0.5181393357828273, + "grad_norm": 0.255980908870697, + "learning_rate": 2.7773423801048244e-05, + "loss": 0.1682, + "step": 29050 + }, + { + "epoch": 0.518157171904541, + "grad_norm": 0.28301292657852173, + "learning_rate": 2.777187689571366e-05, + "loss": 0.1742, + "step": 29051 + }, + { + "epoch": 0.5181750080262548, + "grad_norm": 0.23590096831321716, + "learning_rate": 2.777032997963433e-05, + "loss": 0.1414, + "step": 29052 + }, + { + "epoch": 0.5181928441479685, + "grad_norm": 0.26343271136283875, + "learning_rate": 2.776878305281624e-05, + "loss": 0.147, + "step": 29053 + }, + { + "epoch": 0.5182106802696822, + "grad_norm": 0.21160916984081268, + "learning_rate": 2.7767236115265395e-05, + "loss": 0.1092, + "step": 29054 + }, + { + "epoch": 0.5182285163913959, + "grad_norm": 0.25611087679862976, + "learning_rate": 2.776568916698778e-05, + "loss": 0.143, + "step": 29055 + }, + { + "epoch": 0.5182463525131096, + "grad_norm": 0.19979433715343475, + "learning_rate": 2.7764142207989396e-05, + "loss": 0.122, + "step": 29056 + }, + { + "epoch": 0.5182641886348233, + "grad_norm": 0.288221538066864, + "learning_rate": 2.776259523827625e-05, + "loss": 0.1439, + "step": 29057 + }, + { + "epoch": 0.518282024756537, + "grad_norm": 0.2494753897190094, + "learning_rate": 2.7761048257854327e-05, + "loss": 0.1135, + "step": 29058 + }, + { + "epoch": 0.5182998608782506, + "grad_norm": 0.27413010597229004, + "learning_rate": 2.7759501266729632e-05, + "loss": 0.1447, + "step": 29059 + }, + { + "epoch": 0.5183176969999643, + "grad_norm": 0.2546256184577942, + "learning_rate": 2.7757954264908153e-05, + "loss": 0.1294, + "step": 29060 + }, + { + "epoch": 0.518335533121678, + "grad_norm": 0.23120713233947754, + "learning_rate": 2.7756407252395893e-05, + "loss": 0.0793, + "step": 29061 + }, + { + "epoch": 0.5183533692433917, + "grad_norm": 0.23434863984584808, + "learning_rate": 2.7754860229198843e-05, + "loss": 0.1116, + "step": 29062 + }, + { + "epoch": 0.5183712053651054, + "grad_norm": 0.21776700019836426, + "learning_rate": 2.7753313195323005e-05, + "loss": 0.1217, + "step": 29063 + }, + { + "epoch": 0.5183890414868191, + "grad_norm": 0.261416494846344, + "learning_rate": 2.7751766150774373e-05, + "loss": 0.1134, + "step": 29064 + }, + { + "epoch": 0.5184068776085328, + "grad_norm": 0.23069989681243896, + "learning_rate": 2.775021909555895e-05, + "loss": 0.1155, + "step": 29065 + }, + { + "epoch": 0.5184247137302465, + "grad_norm": 0.30875062942504883, + "learning_rate": 2.7748672029682722e-05, + "loss": 0.1332, + "step": 29066 + }, + { + "epoch": 0.5184425498519601, + "grad_norm": 0.23155012726783752, + "learning_rate": 2.7747124953151694e-05, + "loss": 0.1438, + "step": 29067 + }, + { + "epoch": 0.5184603859736738, + "grad_norm": 0.20740477740764618, + "learning_rate": 2.774557786597186e-05, + "loss": 0.1139, + "step": 29068 + }, + { + "epoch": 0.5184782220953876, + "grad_norm": 0.2374761551618576, + "learning_rate": 2.7744030768149222e-05, + "loss": 0.1396, + "step": 29069 + }, + { + "epoch": 0.5184960582171013, + "grad_norm": 0.23060667514801025, + "learning_rate": 2.7742483659689765e-05, + "loss": 0.0981, + "step": 29070 + }, + { + "epoch": 0.518513894338815, + "grad_norm": 0.27698296308517456, + "learning_rate": 2.77409365405995e-05, + "loss": 0.0668, + "step": 29071 + }, + { + "epoch": 0.5185317304605287, + "grad_norm": 0.3301070034503937, + "learning_rate": 2.773938941088442e-05, + "loss": 0.103, + "step": 29072 + }, + { + "epoch": 0.5185495665822424, + "grad_norm": 0.3216072618961334, + "learning_rate": 2.773784227055051e-05, + "loss": 0.1423, + "step": 29073 + }, + { + "epoch": 0.5185674027039561, + "grad_norm": 0.22094807028770447, + "learning_rate": 2.7736295119603794e-05, + "loss": 0.1941, + "step": 29074 + }, + { + "epoch": 0.5185852388256698, + "grad_norm": 0.2325967699289322, + "learning_rate": 2.7734747958050238e-05, + "loss": 0.0998, + "step": 29075 + }, + { + "epoch": 0.5186030749473834, + "grad_norm": 0.22630323469638824, + "learning_rate": 2.773320078589586e-05, + "loss": 0.1234, + "step": 29076 + }, + { + "epoch": 0.5186209110690971, + "grad_norm": 0.25922587513923645, + "learning_rate": 2.773165360314665e-05, + "loss": 0.1545, + "step": 29077 + }, + { + "epoch": 0.5186387471908108, + "grad_norm": 0.23307378590106964, + "learning_rate": 2.7730106409808615e-05, + "loss": 0.1202, + "step": 29078 + }, + { + "epoch": 0.5186565833125245, + "grad_norm": 0.2333347648382187, + "learning_rate": 2.7728559205887735e-05, + "loss": 0.1586, + "step": 29079 + }, + { + "epoch": 0.5186744194342382, + "grad_norm": 0.3560292720794678, + "learning_rate": 2.7727011991390022e-05, + "loss": 0.1561, + "step": 29080 + }, + { + "epoch": 0.5186922555559519, + "grad_norm": 0.2181674987077713, + "learning_rate": 2.7725464766321467e-05, + "loss": 0.138, + "step": 29081 + }, + { + "epoch": 0.5187100916776656, + "grad_norm": 0.2910957932472229, + "learning_rate": 2.7723917530688077e-05, + "loss": 0.1424, + "step": 29082 + }, + { + "epoch": 0.5187279277993793, + "grad_norm": 0.23398922383785248, + "learning_rate": 2.772237028449583e-05, + "loss": 0.1344, + "step": 29083 + }, + { + "epoch": 0.518745763921093, + "grad_norm": 0.23091459274291992, + "learning_rate": 2.7720823027750738e-05, + "loss": 0.1199, + "step": 29084 + }, + { + "epoch": 0.5187636000428066, + "grad_norm": 0.2950792610645294, + "learning_rate": 2.7719275760458803e-05, + "loss": 0.1691, + "step": 29085 + }, + { + "epoch": 0.5187814361645204, + "grad_norm": 0.2599247395992279, + "learning_rate": 2.771772848262601e-05, + "loss": 0.1803, + "step": 29086 + }, + { + "epoch": 0.5187992722862341, + "grad_norm": 0.23546212911605835, + "learning_rate": 2.771618119425837e-05, + "loss": 0.1322, + "step": 29087 + }, + { + "epoch": 0.5188171084079478, + "grad_norm": 0.28911495208740234, + "learning_rate": 2.7714633895361863e-05, + "loss": 0.1238, + "step": 29088 + }, + { + "epoch": 0.5188349445296615, + "grad_norm": 0.20387057960033417, + "learning_rate": 2.7713086585942504e-05, + "loss": 0.0752, + "step": 29089 + }, + { + "epoch": 0.5188527806513752, + "grad_norm": 0.27801382541656494, + "learning_rate": 2.771153926600628e-05, + "loss": 0.0968, + "step": 29090 + }, + { + "epoch": 0.5188706167730889, + "grad_norm": 0.2624642848968506, + "learning_rate": 2.7709991935559192e-05, + "loss": 0.1015, + "step": 29091 + }, + { + "epoch": 0.5188884528948026, + "grad_norm": 0.25566697120666504, + "learning_rate": 2.7708444594607247e-05, + "loss": 0.1366, + "step": 29092 + }, + { + "epoch": 0.5189062890165163, + "grad_norm": 0.2571468949317932, + "learning_rate": 2.7706897243156433e-05, + "loss": 0.1173, + "step": 29093 + }, + { + "epoch": 0.5189241251382299, + "grad_norm": 0.32615068554878235, + "learning_rate": 2.7705349881212744e-05, + "loss": 0.0906, + "step": 29094 + }, + { + "epoch": 0.5189419612599436, + "grad_norm": 0.30232304334640503, + "learning_rate": 2.7703802508782194e-05, + "loss": 0.1591, + "step": 29095 + }, + { + "epoch": 0.5189597973816573, + "grad_norm": 0.2673504948616028, + "learning_rate": 2.7702255125870767e-05, + "loss": 0.1273, + "step": 29096 + }, + { + "epoch": 0.518977633503371, + "grad_norm": 0.24881090223789215, + "learning_rate": 2.770070773248446e-05, + "loss": 0.1444, + "step": 29097 + }, + { + "epoch": 0.5189954696250847, + "grad_norm": 0.2517539858818054, + "learning_rate": 2.7699160328629286e-05, + "loss": 0.1363, + "step": 29098 + }, + { + "epoch": 0.5190133057467984, + "grad_norm": 0.2921959459781647, + "learning_rate": 2.7697612914311233e-05, + "loss": 0.2051, + "step": 29099 + }, + { + "epoch": 0.5190311418685121, + "grad_norm": 0.2753399610519409, + "learning_rate": 2.76960654895363e-05, + "loss": 0.1304, + "step": 29100 + }, + { + "epoch": 0.5190489779902258, + "grad_norm": 0.2771969139575958, + "learning_rate": 2.7694518054310482e-05, + "loss": 0.1829, + "step": 29101 + }, + { + "epoch": 0.5190668141119394, + "grad_norm": 0.29123929142951965, + "learning_rate": 2.7692970608639785e-05, + "loss": 0.1062, + "step": 29102 + }, + { + "epoch": 0.5190846502336532, + "grad_norm": 0.25604504346847534, + "learning_rate": 2.7691423152530203e-05, + "loss": 0.1249, + "step": 29103 + }, + { + "epoch": 0.5191024863553669, + "grad_norm": 0.22162814438343048, + "learning_rate": 2.768987568598773e-05, + "loss": 0.1399, + "step": 29104 + }, + { + "epoch": 0.5191203224770806, + "grad_norm": 0.18678194284439087, + "learning_rate": 2.7688328209018377e-05, + "loss": 0.1101, + "step": 29105 + }, + { + "epoch": 0.5191381585987943, + "grad_norm": 0.217605322599411, + "learning_rate": 2.7686780721628136e-05, + "loss": 0.1188, + "step": 29106 + }, + { + "epoch": 0.519155994720508, + "grad_norm": 0.20708641409873962, + "learning_rate": 2.7685233223823e-05, + "loss": 0.0879, + "step": 29107 + }, + { + "epoch": 0.5191738308422217, + "grad_norm": 0.493600994348526, + "learning_rate": 2.7683685715608975e-05, + "loss": 0.1455, + "step": 29108 + }, + { + "epoch": 0.5191916669639354, + "grad_norm": 0.3135494589805603, + "learning_rate": 2.7682138196992065e-05, + "loss": 0.1341, + "step": 29109 + }, + { + "epoch": 0.5192095030856491, + "grad_norm": 0.3242650330066681, + "learning_rate": 2.7680590667978246e-05, + "loss": 0.1608, + "step": 29110 + }, + { + "epoch": 0.5192273392073627, + "grad_norm": 0.2908034026622772, + "learning_rate": 2.7679043128573538e-05, + "loss": 0.1196, + "step": 29111 + }, + { + "epoch": 0.5192451753290764, + "grad_norm": 0.24959696829319, + "learning_rate": 2.767749557878393e-05, + "loss": 0.079, + "step": 29112 + }, + { + "epoch": 0.5192630114507901, + "grad_norm": 0.248470738530159, + "learning_rate": 2.7675948018615435e-05, + "loss": 0.1372, + "step": 29113 + }, + { + "epoch": 0.5192808475725038, + "grad_norm": 0.3410813808441162, + "learning_rate": 2.7674400448074033e-05, + "loss": 0.1531, + "step": 29114 + }, + { + "epoch": 0.5192986836942175, + "grad_norm": 0.18837669491767883, + "learning_rate": 2.7672852867165734e-05, + "loss": 0.1322, + "step": 29115 + }, + { + "epoch": 0.5193165198159312, + "grad_norm": 0.34150800108909607, + "learning_rate": 2.767130527589653e-05, + "loss": 0.1526, + "step": 29116 + }, + { + "epoch": 0.5193343559376449, + "grad_norm": 0.24078314006328583, + "learning_rate": 2.7669757674272436e-05, + "loss": 0.1193, + "step": 29117 + }, + { + "epoch": 0.5193521920593586, + "grad_norm": 0.3139224052429199, + "learning_rate": 2.766821006229942e-05, + "loss": 0.1463, + "step": 29118 + }, + { + "epoch": 0.5193700281810724, + "grad_norm": 0.30436229705810547, + "learning_rate": 2.766666243998351e-05, + "loss": 0.1565, + "step": 29119 + }, + { + "epoch": 0.519387864302786, + "grad_norm": 0.20879687368869781, + "learning_rate": 2.7665114807330696e-05, + "loss": 0.1221, + "step": 29120 + }, + { + "epoch": 0.5194057004244997, + "grad_norm": 0.2561609447002411, + "learning_rate": 2.766356716434698e-05, + "loss": 0.1266, + "step": 29121 + }, + { + "epoch": 0.5194235365462134, + "grad_norm": 0.1767602264881134, + "learning_rate": 2.766201951103835e-05, + "loss": 0.0932, + "step": 29122 + }, + { + "epoch": 0.5194413726679271, + "grad_norm": 0.37322792410850525, + "learning_rate": 2.7660471847410817e-05, + "loss": 0.1719, + "step": 29123 + }, + { + "epoch": 0.5194592087896408, + "grad_norm": 0.28258487582206726, + "learning_rate": 2.765892417347038e-05, + "loss": 0.1231, + "step": 29124 + }, + { + "epoch": 0.5194770449113545, + "grad_norm": 0.2252558022737503, + "learning_rate": 2.7657376489223023e-05, + "loss": 0.1084, + "step": 29125 + }, + { + "epoch": 0.5194948810330682, + "grad_norm": 0.36280569434165955, + "learning_rate": 2.765582879467477e-05, + "loss": 0.1446, + "step": 29126 + }, + { + "epoch": 0.5195127171547819, + "grad_norm": 0.25010618567466736, + "learning_rate": 2.7654281089831603e-05, + "loss": 0.1232, + "step": 29127 + }, + { + "epoch": 0.5195305532764956, + "grad_norm": 0.3785548210144043, + "learning_rate": 2.7652733374699523e-05, + "loss": 0.2013, + "step": 29128 + }, + { + "epoch": 0.5195483893982092, + "grad_norm": 0.23961438238620758, + "learning_rate": 2.7651185649284533e-05, + "loss": 0.0947, + "step": 29129 + }, + { + "epoch": 0.5195662255199229, + "grad_norm": 0.24644187092781067, + "learning_rate": 2.7649637913592642e-05, + "loss": 0.1389, + "step": 29130 + }, + { + "epoch": 0.5195840616416366, + "grad_norm": 0.34896811842918396, + "learning_rate": 2.7648090167629826e-05, + "loss": 0.1084, + "step": 29131 + }, + { + "epoch": 0.5196018977633503, + "grad_norm": 0.30557531118392944, + "learning_rate": 2.76465424114021e-05, + "loss": 0.1599, + "step": 29132 + }, + { + "epoch": 0.519619733885064, + "grad_norm": 0.2294258177280426, + "learning_rate": 2.7644994644915457e-05, + "loss": 0.1322, + "step": 29133 + }, + { + "epoch": 0.5196375700067777, + "grad_norm": 0.34130120277404785, + "learning_rate": 2.7643446868175916e-05, + "loss": 0.2066, + "step": 29134 + }, + { + "epoch": 0.5196554061284914, + "grad_norm": 0.2521969676017761, + "learning_rate": 2.7641899081189447e-05, + "loss": 0.1307, + "step": 29135 + }, + { + "epoch": 0.5196732422502052, + "grad_norm": 0.3334324061870575, + "learning_rate": 2.7640351283962073e-05, + "loss": 0.1601, + "step": 29136 + }, + { + "epoch": 0.5196910783719189, + "grad_norm": 0.23689663410186768, + "learning_rate": 2.7638803476499793e-05, + "loss": 0.1399, + "step": 29137 + }, + { + "epoch": 0.5197089144936325, + "grad_norm": 0.31018728017807007, + "learning_rate": 2.7637255658808586e-05, + "loss": 0.14, + "step": 29138 + }, + { + "epoch": 0.5197267506153462, + "grad_norm": 0.29798370599746704, + "learning_rate": 2.7635707830894465e-05, + "loss": 0.0937, + "step": 29139 + }, + { + "epoch": 0.5197445867370599, + "grad_norm": 0.23557382822036743, + "learning_rate": 2.763415999276343e-05, + "loss": 0.1255, + "step": 29140 + }, + { + "epoch": 0.5197624228587736, + "grad_norm": 0.2728688716888428, + "learning_rate": 2.763261214442149e-05, + "loss": 0.1028, + "step": 29141 + }, + { + "epoch": 0.5197802589804873, + "grad_norm": 0.24097716808319092, + "learning_rate": 2.763106428587463e-05, + "loss": 0.1255, + "step": 29142 + }, + { + "epoch": 0.519798095102201, + "grad_norm": 0.2832989990711212, + "learning_rate": 2.762951641712886e-05, + "loss": 0.1287, + "step": 29143 + }, + { + "epoch": 0.5198159312239147, + "grad_norm": 0.2786170542240143, + "learning_rate": 2.7627968538190175e-05, + "loss": 0.0946, + "step": 29144 + }, + { + "epoch": 0.5198337673456284, + "grad_norm": 0.3092043697834015, + "learning_rate": 2.7626420649064578e-05, + "loss": 0.1662, + "step": 29145 + }, + { + "epoch": 0.519851603467342, + "grad_norm": 0.365028440952301, + "learning_rate": 2.7624872749758058e-05, + "loss": 0.1975, + "step": 29146 + }, + { + "epoch": 0.5198694395890557, + "grad_norm": 0.3091890513896942, + "learning_rate": 2.7623324840276632e-05, + "loss": 0.1636, + "step": 29147 + }, + { + "epoch": 0.5198872757107694, + "grad_norm": 0.2252136915922165, + "learning_rate": 2.7621776920626285e-05, + "loss": 0.1453, + "step": 29148 + }, + { + "epoch": 0.5199051118324831, + "grad_norm": 0.25473907589912415, + "learning_rate": 2.7620228990813036e-05, + "loss": 0.1226, + "step": 29149 + }, + { + "epoch": 0.5199229479541968, + "grad_norm": 0.2998908460140228, + "learning_rate": 2.7618681050842877e-05, + "loss": 0.1069, + "step": 29150 + }, + { + "epoch": 0.5199407840759105, + "grad_norm": 0.18929672241210938, + "learning_rate": 2.7617133100721793e-05, + "loss": 0.1073, + "step": 29151 + }, + { + "epoch": 0.5199586201976242, + "grad_norm": 0.2511807382106781, + "learning_rate": 2.7615585140455802e-05, + "loss": 0.1323, + "step": 29152 + }, + { + "epoch": 0.519976456319338, + "grad_norm": 0.22078782320022583, + "learning_rate": 2.76140371700509e-05, + "loss": 0.1434, + "step": 29153 + }, + { + "epoch": 0.5199942924410517, + "grad_norm": 0.16528822481632233, + "learning_rate": 2.7612489189513092e-05, + "loss": 0.0767, + "step": 29154 + }, + { + "epoch": 0.5200121285627654, + "grad_norm": 0.206413134932518, + "learning_rate": 2.7610941198848366e-05, + "loss": 0.1483, + "step": 29155 + }, + { + "epoch": 0.520029964684479, + "grad_norm": 0.30228081345558167, + "learning_rate": 2.760939319806274e-05, + "loss": 0.1735, + "step": 29156 + }, + { + "epoch": 0.5200478008061927, + "grad_norm": 0.24667786061763763, + "learning_rate": 2.7607845187162196e-05, + "loss": 0.1105, + "step": 29157 + }, + { + "epoch": 0.5200656369279064, + "grad_norm": 0.3133133053779602, + "learning_rate": 2.7606297166152745e-05, + "loss": 0.1391, + "step": 29158 + }, + { + "epoch": 0.5200834730496201, + "grad_norm": 0.26266568899154663, + "learning_rate": 2.7604749135040387e-05, + "loss": 0.1392, + "step": 29159 + }, + { + "epoch": 0.5201013091713338, + "grad_norm": 0.2189272940158844, + "learning_rate": 2.7603201093831115e-05, + "loss": 0.1236, + "step": 29160 + }, + { + "epoch": 0.5201191452930475, + "grad_norm": 0.31974124908447266, + "learning_rate": 2.7601653042530946e-05, + "loss": 0.1515, + "step": 29161 + }, + { + "epoch": 0.5201369814147612, + "grad_norm": 0.3024396002292633, + "learning_rate": 2.760010498114587e-05, + "loss": 0.1081, + "step": 29162 + }, + { + "epoch": 0.5201548175364749, + "grad_norm": 0.2115083634853363, + "learning_rate": 2.759855690968189e-05, + "loss": 0.1417, + "step": 29163 + }, + { + "epoch": 0.5201726536581885, + "grad_norm": 0.30354738235473633, + "learning_rate": 2.7597008828145e-05, + "loss": 0.1179, + "step": 29164 + }, + { + "epoch": 0.5201904897799022, + "grad_norm": 0.18906204402446747, + "learning_rate": 2.7595460736541214e-05, + "loss": 0.1044, + "step": 29165 + }, + { + "epoch": 0.5202083259016159, + "grad_norm": 0.46158620715141296, + "learning_rate": 2.7593912634876517e-05, + "loss": 0.1332, + "step": 29166 + }, + { + "epoch": 0.5202261620233296, + "grad_norm": 0.2531507909297943, + "learning_rate": 2.7592364523156922e-05, + "loss": 0.1007, + "step": 29167 + }, + { + "epoch": 0.5202439981450433, + "grad_norm": 0.35190489888191223, + "learning_rate": 2.7590816401388426e-05, + "loss": 0.1051, + "step": 29168 + }, + { + "epoch": 0.520261834266757, + "grad_norm": 0.29423987865448, + "learning_rate": 2.758926826957704e-05, + "loss": 0.181, + "step": 29169 + }, + { + "epoch": 0.5202796703884708, + "grad_norm": 0.2975088953971863, + "learning_rate": 2.7587720127728745e-05, + "loss": 0.1165, + "step": 29170 + }, + { + "epoch": 0.5202975065101845, + "grad_norm": 0.2691587209701538, + "learning_rate": 2.7586171975849556e-05, + "loss": 0.1373, + "step": 29171 + }, + { + "epoch": 0.5203153426318982, + "grad_norm": 0.2392454892396927, + "learning_rate": 2.7584623813945472e-05, + "loss": 0.1274, + "step": 29172 + }, + { + "epoch": 0.5203331787536118, + "grad_norm": 0.20323362946510315, + "learning_rate": 2.75830756420225e-05, + "loss": 0.1225, + "step": 29173 + }, + { + "epoch": 0.5203510148753255, + "grad_norm": 0.21179316937923431, + "learning_rate": 2.7581527460086614e-05, + "loss": 0.1458, + "step": 29174 + }, + { + "epoch": 0.5203688509970392, + "grad_norm": 0.22890907526016235, + "learning_rate": 2.7579979268143858e-05, + "loss": 0.1143, + "step": 29175 + }, + { + "epoch": 0.5203866871187529, + "grad_norm": 0.20152054727077484, + "learning_rate": 2.7578431066200204e-05, + "loss": 0.1288, + "step": 29176 + }, + { + "epoch": 0.5204045232404666, + "grad_norm": 0.35340768098831177, + "learning_rate": 2.757688285426166e-05, + "loss": 0.1303, + "step": 29177 + }, + { + "epoch": 0.5204223593621803, + "grad_norm": 0.2403675615787506, + "learning_rate": 2.7575334632334225e-05, + "loss": 0.0759, + "step": 29178 + }, + { + "epoch": 0.520440195483894, + "grad_norm": 0.30929702520370483, + "learning_rate": 2.7573786400423908e-05, + "loss": 0.1185, + "step": 29179 + }, + { + "epoch": 0.5204580316056077, + "grad_norm": 0.2516457736492157, + "learning_rate": 2.75722381585367e-05, + "loss": 0.148, + "step": 29180 + }, + { + "epoch": 0.5204758677273214, + "grad_norm": 0.31668320298194885, + "learning_rate": 2.7570689906678615e-05, + "loss": 0.1023, + "step": 29181 + }, + { + "epoch": 0.520493703849035, + "grad_norm": 0.24320733547210693, + "learning_rate": 2.7569141644855645e-05, + "loss": 0.1016, + "step": 29182 + }, + { + "epoch": 0.5205115399707487, + "grad_norm": 0.2117556780576706, + "learning_rate": 2.7567593373073796e-05, + "loss": 0.0875, + "step": 29183 + }, + { + "epoch": 0.5205293760924624, + "grad_norm": 0.3125084638595581, + "learning_rate": 2.7566045091339073e-05, + "loss": 0.1164, + "step": 29184 + }, + { + "epoch": 0.5205472122141761, + "grad_norm": 0.2813771963119507, + "learning_rate": 2.7564496799657462e-05, + "loss": 0.1099, + "step": 29185 + }, + { + "epoch": 0.5205650483358898, + "grad_norm": 0.3593194782733917, + "learning_rate": 2.7562948498034985e-05, + "loss": 0.2107, + "step": 29186 + }, + { + "epoch": 0.5205828844576036, + "grad_norm": 0.24496445059776306, + "learning_rate": 2.7561400186477627e-05, + "loss": 0.1343, + "step": 29187 + }, + { + "epoch": 0.5206007205793173, + "grad_norm": 0.26318058371543884, + "learning_rate": 2.7559851864991397e-05, + "loss": 0.158, + "step": 29188 + }, + { + "epoch": 0.520618556701031, + "grad_norm": 0.5840705633163452, + "learning_rate": 2.7558303533582308e-05, + "loss": 0.2025, + "step": 29189 + }, + { + "epoch": 0.5206363928227447, + "grad_norm": 0.19877250492572784, + "learning_rate": 2.755675519225634e-05, + "loss": 0.1418, + "step": 29190 + }, + { + "epoch": 0.5206542289444583, + "grad_norm": 0.21138867735862732, + "learning_rate": 2.7555206841019515e-05, + "loss": 0.1066, + "step": 29191 + }, + { + "epoch": 0.520672065066172, + "grad_norm": 0.34063977003097534, + "learning_rate": 2.7553658479877814e-05, + "loss": 0.1652, + "step": 29192 + }, + { + "epoch": 0.5206899011878857, + "grad_norm": 0.24009549617767334, + "learning_rate": 2.7552110108837263e-05, + "loss": 0.133, + "step": 29193 + }, + { + "epoch": 0.5207077373095994, + "grad_norm": 0.242417111992836, + "learning_rate": 2.7550561727903845e-05, + "loss": 0.1395, + "step": 29194 + }, + { + "epoch": 0.5207255734313131, + "grad_norm": 0.21727785468101501, + "learning_rate": 2.7549013337083567e-05, + "loss": 0.1326, + "step": 29195 + }, + { + "epoch": 0.5207434095530268, + "grad_norm": 0.2895382344722748, + "learning_rate": 2.7547464936382433e-05, + "loss": 0.1272, + "step": 29196 + }, + { + "epoch": 0.5207612456747405, + "grad_norm": 0.31297650933265686, + "learning_rate": 2.7545916525806446e-05, + "loss": 0.1758, + "step": 29197 + }, + { + "epoch": 0.5207790817964542, + "grad_norm": 0.3134779930114746, + "learning_rate": 2.754436810536161e-05, + "loss": 0.186, + "step": 29198 + }, + { + "epoch": 0.5207969179181678, + "grad_norm": 0.2492859810590744, + "learning_rate": 2.7542819675053927e-05, + "loss": 0.1513, + "step": 29199 + }, + { + "epoch": 0.5208147540398815, + "grad_norm": 0.26196470856666565, + "learning_rate": 2.7541271234889393e-05, + "loss": 0.0971, + "step": 29200 + }, + { + "epoch": 0.5208325901615952, + "grad_norm": 0.2854453921318054, + "learning_rate": 2.753972278487401e-05, + "loss": 0.1331, + "step": 29201 + }, + { + "epoch": 0.5208504262833089, + "grad_norm": 0.24749097228050232, + "learning_rate": 2.7538174325013788e-05, + "loss": 0.0945, + "step": 29202 + }, + { + "epoch": 0.5208682624050226, + "grad_norm": 0.2798328101634979, + "learning_rate": 2.7536625855314725e-05, + "loss": 0.1631, + "step": 29203 + }, + { + "epoch": 0.5208860985267364, + "grad_norm": 0.3242731988430023, + "learning_rate": 2.753507737578283e-05, + "loss": 0.1241, + "step": 29204 + }, + { + "epoch": 0.5209039346484501, + "grad_norm": 0.37805265188217163, + "learning_rate": 2.7533528886424092e-05, + "loss": 0.0872, + "step": 29205 + }, + { + "epoch": 0.5209217707701638, + "grad_norm": 0.5689181089401245, + "learning_rate": 2.7531980387244526e-05, + "loss": 0.1625, + "step": 29206 + }, + { + "epoch": 0.5209396068918775, + "grad_norm": 0.341597318649292, + "learning_rate": 2.7530431878250124e-05, + "loss": 0.1528, + "step": 29207 + }, + { + "epoch": 0.5209574430135911, + "grad_norm": 0.21227316558361053, + "learning_rate": 2.7528883359446895e-05, + "loss": 0.1148, + "step": 29208 + }, + { + "epoch": 0.5209752791353048, + "grad_norm": 0.2937573790550232, + "learning_rate": 2.7527334830840838e-05, + "loss": 0.1443, + "step": 29209 + }, + { + "epoch": 0.5209931152570185, + "grad_norm": 0.23038238286972046, + "learning_rate": 2.7525786292437967e-05, + "loss": 0.1435, + "step": 29210 + }, + { + "epoch": 0.5210109513787322, + "grad_norm": 0.22587664425373077, + "learning_rate": 2.752423774424427e-05, + "loss": 0.1457, + "step": 29211 + }, + { + "epoch": 0.5210287875004459, + "grad_norm": 0.24479340016841888, + "learning_rate": 2.752268918626576e-05, + "loss": 0.1733, + "step": 29212 + }, + { + "epoch": 0.5210466236221596, + "grad_norm": 0.23688150942325592, + "learning_rate": 2.7521140618508434e-05, + "loss": 0.1631, + "step": 29213 + }, + { + "epoch": 0.5210644597438733, + "grad_norm": 0.245303213596344, + "learning_rate": 2.7519592040978297e-05, + "loss": 0.172, + "step": 29214 + }, + { + "epoch": 0.521082295865587, + "grad_norm": 0.2524663805961609, + "learning_rate": 2.7518043453681347e-05, + "loss": 0.1346, + "step": 29215 + }, + { + "epoch": 0.5211001319873007, + "grad_norm": 0.3133960962295532, + "learning_rate": 2.7516494856623593e-05, + "loss": 0.1394, + "step": 29216 + }, + { + "epoch": 0.5211179681090143, + "grad_norm": 0.24628432095050812, + "learning_rate": 2.7514946249811035e-05, + "loss": 0.1725, + "step": 29217 + }, + { + "epoch": 0.521135804230728, + "grad_norm": 0.240559920668602, + "learning_rate": 2.7513397633249677e-05, + "loss": 0.1251, + "step": 29218 + }, + { + "epoch": 0.5211536403524417, + "grad_norm": 0.3326775133609772, + "learning_rate": 2.751184900694552e-05, + "loss": 0.1541, + "step": 29219 + }, + { + "epoch": 0.5211714764741555, + "grad_norm": 0.2751973569393158, + "learning_rate": 2.7510300370904574e-05, + "loss": 0.1813, + "step": 29220 + }, + { + "epoch": 0.5211893125958692, + "grad_norm": 0.2630683481693268, + "learning_rate": 2.7508751725132837e-05, + "loss": 0.1645, + "step": 29221 + }, + { + "epoch": 0.5212071487175829, + "grad_norm": 0.36335527896881104, + "learning_rate": 2.7507203069636305e-05, + "loss": 0.1944, + "step": 29222 + }, + { + "epoch": 0.5212249848392966, + "grad_norm": 0.31136658787727356, + "learning_rate": 2.7505654404420987e-05, + "loss": 0.168, + "step": 29223 + }, + { + "epoch": 0.5212428209610103, + "grad_norm": 0.24197950959205627, + "learning_rate": 2.750410572949289e-05, + "loss": 0.0985, + "step": 29224 + }, + { + "epoch": 0.521260657082724, + "grad_norm": 0.2367485612630844, + "learning_rate": 2.7502557044858017e-05, + "loss": 0.1347, + "step": 29225 + }, + { + "epoch": 0.5212784932044376, + "grad_norm": 0.2614814341068268, + "learning_rate": 2.7501008350522368e-05, + "loss": 0.1297, + "step": 29226 + }, + { + "epoch": 0.5212963293261513, + "grad_norm": 0.22772814333438873, + "learning_rate": 2.749945964649195e-05, + "loss": 0.1097, + "step": 29227 + }, + { + "epoch": 0.521314165447865, + "grad_norm": 0.28534895181655884, + "learning_rate": 2.7497910932772763e-05, + "loss": 0.1612, + "step": 29228 + }, + { + "epoch": 0.5213320015695787, + "grad_norm": 0.28451213240623474, + "learning_rate": 2.7496362209370796e-05, + "loss": 0.131, + "step": 29229 + }, + { + "epoch": 0.5213498376912924, + "grad_norm": 0.35889771580696106, + "learning_rate": 2.749481347629208e-05, + "loss": 0.1324, + "step": 29230 + }, + { + "epoch": 0.5213676738130061, + "grad_norm": 0.4010239839553833, + "learning_rate": 2.7493264733542602e-05, + "loss": 0.1401, + "step": 29231 + }, + { + "epoch": 0.5213855099347198, + "grad_norm": 0.24921678006649017, + "learning_rate": 2.7491715981128374e-05, + "loss": 0.161, + "step": 29232 + }, + { + "epoch": 0.5214033460564335, + "grad_norm": 0.21963492035865784, + "learning_rate": 2.7490167219055385e-05, + "loss": 0.1274, + "step": 29233 + }, + { + "epoch": 0.5214211821781471, + "grad_norm": 0.25317490100860596, + "learning_rate": 2.7488618447329655e-05, + "loss": 0.1928, + "step": 29234 + }, + { + "epoch": 0.5214390182998608, + "grad_norm": 0.31491875648498535, + "learning_rate": 2.7487069665957176e-05, + "loss": 0.1343, + "step": 29235 + }, + { + "epoch": 0.5214568544215745, + "grad_norm": 0.262983500957489, + "learning_rate": 2.748552087494396e-05, + "loss": 0.1497, + "step": 29236 + }, + { + "epoch": 0.5214746905432883, + "grad_norm": 0.22808369994163513, + "learning_rate": 2.7483972074296e-05, + "loss": 0.1837, + "step": 29237 + }, + { + "epoch": 0.521492526665002, + "grad_norm": 0.2774639129638672, + "learning_rate": 2.7482423264019315e-05, + "loss": 0.1462, + "step": 29238 + }, + { + "epoch": 0.5215103627867157, + "grad_norm": 0.2729647755622864, + "learning_rate": 2.748087444411989e-05, + "loss": 0.1498, + "step": 29239 + }, + { + "epoch": 0.5215281989084294, + "grad_norm": 0.259782999753952, + "learning_rate": 2.747932561460375e-05, + "loss": 0.1505, + "step": 29240 + }, + { + "epoch": 0.5215460350301431, + "grad_norm": 0.20661120116710663, + "learning_rate": 2.747777677547689e-05, + "loss": 0.1396, + "step": 29241 + }, + { + "epoch": 0.5215638711518568, + "grad_norm": 0.47408100962638855, + "learning_rate": 2.7476227926745297e-05, + "loss": 0.1596, + "step": 29242 + }, + { + "epoch": 0.5215817072735704, + "grad_norm": 0.21687044203281403, + "learning_rate": 2.7474679068414992e-05, + "loss": 0.1039, + "step": 29243 + }, + { + "epoch": 0.5215995433952841, + "grad_norm": 0.21080181002616882, + "learning_rate": 2.7473130200491982e-05, + "loss": 0.0841, + "step": 29244 + }, + { + "epoch": 0.5216173795169978, + "grad_norm": 0.23617619276046753, + "learning_rate": 2.7471581322982266e-05, + "loss": 0.1861, + "step": 29245 + }, + { + "epoch": 0.5216352156387115, + "grad_norm": 0.2647800147533417, + "learning_rate": 2.747003243589184e-05, + "loss": 0.1325, + "step": 29246 + }, + { + "epoch": 0.5216530517604252, + "grad_norm": 0.3150482475757599, + "learning_rate": 2.7468483539226724e-05, + "loss": 0.1759, + "step": 29247 + }, + { + "epoch": 0.5216708878821389, + "grad_norm": 0.2943536341190338, + "learning_rate": 2.7466934632992908e-05, + "loss": 0.1252, + "step": 29248 + }, + { + "epoch": 0.5216887240038526, + "grad_norm": 0.3113054931163788, + "learning_rate": 2.7465385717196402e-05, + "loss": 0.1267, + "step": 29249 + }, + { + "epoch": 0.5217065601255663, + "grad_norm": 0.26860058307647705, + "learning_rate": 2.7463836791843206e-05, + "loss": 0.1281, + "step": 29250 + }, + { + "epoch": 0.52172439624728, + "grad_norm": 0.20125220715999603, + "learning_rate": 2.7462287856939334e-05, + "loss": 0.0949, + "step": 29251 + }, + { + "epoch": 0.5217422323689936, + "grad_norm": 0.2599751353263855, + "learning_rate": 2.7460738912490776e-05, + "loss": 0.116, + "step": 29252 + }, + { + "epoch": 0.5217600684907073, + "grad_norm": 0.3939947485923767, + "learning_rate": 2.745918995850355e-05, + "loss": 0.2015, + "step": 29253 + }, + { + "epoch": 0.5217779046124211, + "grad_norm": 0.20629280805587769, + "learning_rate": 2.745764099498366e-05, + "loss": 0.1276, + "step": 29254 + }, + { + "epoch": 0.5217957407341348, + "grad_norm": 0.22894804179668427, + "learning_rate": 2.7456092021937097e-05, + "loss": 0.1373, + "step": 29255 + }, + { + "epoch": 0.5218135768558485, + "grad_norm": 0.37440380454063416, + "learning_rate": 2.7454543039369868e-05, + "loss": 0.1045, + "step": 29256 + }, + { + "epoch": 0.5218314129775622, + "grad_norm": 0.2669689357280731, + "learning_rate": 2.745299404728799e-05, + "loss": 0.1817, + "step": 29257 + }, + { + "epoch": 0.5218492490992759, + "grad_norm": 0.2596376836299896, + "learning_rate": 2.745144504569746e-05, + "loss": 0.1442, + "step": 29258 + }, + { + "epoch": 0.5218670852209896, + "grad_norm": 0.2961863577365875, + "learning_rate": 2.7449896034604276e-05, + "loss": 0.1603, + "step": 29259 + }, + { + "epoch": 0.5218849213427033, + "grad_norm": 0.24457305669784546, + "learning_rate": 2.744834701401446e-05, + "loss": 0.1184, + "step": 29260 + }, + { + "epoch": 0.5219027574644169, + "grad_norm": 0.29331278800964355, + "learning_rate": 2.744679798393399e-05, + "loss": 0.1251, + "step": 29261 + }, + { + "epoch": 0.5219205935861306, + "grad_norm": 0.4266640245914459, + "learning_rate": 2.7445248944368902e-05, + "loss": 0.1462, + "step": 29262 + }, + { + "epoch": 0.5219384297078443, + "grad_norm": 0.2676289677619934, + "learning_rate": 2.7443699895325176e-05, + "loss": 0.1904, + "step": 29263 + }, + { + "epoch": 0.521956265829558, + "grad_norm": 0.2730562686920166, + "learning_rate": 2.7442150836808822e-05, + "loss": 0.1253, + "step": 29264 + }, + { + "epoch": 0.5219741019512717, + "grad_norm": 0.27988651394844055, + "learning_rate": 2.7440601768825852e-05, + "loss": 0.1588, + "step": 29265 + }, + { + "epoch": 0.5219919380729854, + "grad_norm": 0.3188876211643219, + "learning_rate": 2.7439052691382273e-05, + "loss": 0.1372, + "step": 29266 + }, + { + "epoch": 0.5220097741946991, + "grad_norm": 0.25485074520111084, + "learning_rate": 2.7437503604484078e-05, + "loss": 0.1114, + "step": 29267 + }, + { + "epoch": 0.5220276103164128, + "grad_norm": 0.35971009731292725, + "learning_rate": 2.7435954508137275e-05, + "loss": 0.1477, + "step": 29268 + }, + { + "epoch": 0.5220454464381264, + "grad_norm": 0.25260525941848755, + "learning_rate": 2.7434405402347873e-05, + "loss": 0.1092, + "step": 29269 + }, + { + "epoch": 0.5220632825598401, + "grad_norm": 0.26016655564308167, + "learning_rate": 2.743285628712187e-05, + "loss": 0.165, + "step": 29270 + }, + { + "epoch": 0.5220811186815539, + "grad_norm": 0.2793441712856293, + "learning_rate": 2.7431307162465282e-05, + "loss": 0.2104, + "step": 29271 + }, + { + "epoch": 0.5220989548032676, + "grad_norm": 0.29178282618522644, + "learning_rate": 2.74297580283841e-05, + "loss": 0.1044, + "step": 29272 + }, + { + "epoch": 0.5221167909249813, + "grad_norm": 0.32044875621795654, + "learning_rate": 2.7428208884884347e-05, + "loss": 0.1868, + "step": 29273 + }, + { + "epoch": 0.522134627046695, + "grad_norm": 0.2564692497253418, + "learning_rate": 2.742665973197201e-05, + "loss": 0.1178, + "step": 29274 + }, + { + "epoch": 0.5221524631684087, + "grad_norm": 0.22253145277500153, + "learning_rate": 2.7425110569653106e-05, + "loss": 0.1343, + "step": 29275 + }, + { + "epoch": 0.5221702992901224, + "grad_norm": 0.2613027095794678, + "learning_rate": 2.7423561397933635e-05, + "loss": 0.1427, + "step": 29276 + }, + { + "epoch": 0.5221881354118361, + "grad_norm": 0.22599023580551147, + "learning_rate": 2.7422012216819603e-05, + "loss": 0.1203, + "step": 29277 + }, + { + "epoch": 0.5222059715335498, + "grad_norm": 0.30845287442207336, + "learning_rate": 2.742046302631701e-05, + "loss": 0.1195, + "step": 29278 + }, + { + "epoch": 0.5222238076552634, + "grad_norm": 0.2735668420791626, + "learning_rate": 2.7418913826431874e-05, + "loss": 0.143, + "step": 29279 + }, + { + "epoch": 0.5222416437769771, + "grad_norm": 0.2763333320617676, + "learning_rate": 2.7417364617170195e-05, + "loss": 0.1242, + "step": 29280 + }, + { + "epoch": 0.5222594798986908, + "grad_norm": 0.26139646768569946, + "learning_rate": 2.7415815398537964e-05, + "loss": 0.142, + "step": 29281 + }, + { + "epoch": 0.5222773160204045, + "grad_norm": 0.32071706652641296, + "learning_rate": 2.741426617054121e-05, + "loss": 0.1671, + "step": 29282 + }, + { + "epoch": 0.5222951521421182, + "grad_norm": 0.2634124457836151, + "learning_rate": 2.7412716933185918e-05, + "loss": 0.1413, + "step": 29283 + }, + { + "epoch": 0.5223129882638319, + "grad_norm": 0.25278130173683167, + "learning_rate": 2.7411167686478102e-05, + "loss": 0.1116, + "step": 29284 + }, + { + "epoch": 0.5223308243855456, + "grad_norm": 0.25004836916923523, + "learning_rate": 2.740961843042377e-05, + "loss": 0.1082, + "step": 29285 + }, + { + "epoch": 0.5223486605072593, + "grad_norm": 0.29337209463119507, + "learning_rate": 2.7408069165028928e-05, + "loss": 0.1598, + "step": 29286 + }, + { + "epoch": 0.5223664966289729, + "grad_norm": 0.19883732497692108, + "learning_rate": 2.740651989029957e-05, + "loss": 0.1237, + "step": 29287 + }, + { + "epoch": 0.5223843327506867, + "grad_norm": 0.37855368852615356, + "learning_rate": 2.7404970606241715e-05, + "loss": 0.1433, + "step": 29288 + }, + { + "epoch": 0.5224021688724004, + "grad_norm": 0.3001745343208313, + "learning_rate": 2.740342131286136e-05, + "loss": 0.0821, + "step": 29289 + }, + { + "epoch": 0.5224200049941141, + "grad_norm": 0.35412755608558655, + "learning_rate": 2.740187201016452e-05, + "loss": 0.1582, + "step": 29290 + }, + { + "epoch": 0.5224378411158278, + "grad_norm": 0.25114208459854126, + "learning_rate": 2.740032269815719e-05, + "loss": 0.1417, + "step": 29291 + }, + { + "epoch": 0.5224556772375415, + "grad_norm": 0.2741473913192749, + "learning_rate": 2.7398773376845384e-05, + "loss": 0.1382, + "step": 29292 + }, + { + "epoch": 0.5224735133592552, + "grad_norm": 0.25901567935943604, + "learning_rate": 2.73972240462351e-05, + "loss": 0.1593, + "step": 29293 + }, + { + "epoch": 0.5224913494809689, + "grad_norm": 0.23876120150089264, + "learning_rate": 2.7395674706332347e-05, + "loss": 0.1332, + "step": 29294 + }, + { + "epoch": 0.5225091856026826, + "grad_norm": 0.30361801385879517, + "learning_rate": 2.7394125357143135e-05, + "loss": 0.1273, + "step": 29295 + }, + { + "epoch": 0.5225270217243962, + "grad_norm": 0.2836470603942871, + "learning_rate": 2.7392575998673465e-05, + "loss": 0.1423, + "step": 29296 + }, + { + "epoch": 0.5225448578461099, + "grad_norm": 0.2937440276145935, + "learning_rate": 2.7391026630929344e-05, + "loss": 0.1517, + "step": 29297 + }, + { + "epoch": 0.5225626939678236, + "grad_norm": 0.28235161304473877, + "learning_rate": 2.7389477253916777e-05, + "loss": 0.1778, + "step": 29298 + }, + { + "epoch": 0.5225805300895373, + "grad_norm": 0.31325188279151917, + "learning_rate": 2.738792786764177e-05, + "loss": 0.1573, + "step": 29299 + }, + { + "epoch": 0.522598366211251, + "grad_norm": 0.22566524147987366, + "learning_rate": 2.738637847211033e-05, + "loss": 0.1324, + "step": 29300 + }, + { + "epoch": 0.5226162023329647, + "grad_norm": 0.3627447187900543, + "learning_rate": 2.7384829067328465e-05, + "loss": 0.1744, + "step": 29301 + }, + { + "epoch": 0.5226340384546784, + "grad_norm": 0.2041521668434143, + "learning_rate": 2.7383279653302175e-05, + "loss": 0.1194, + "step": 29302 + }, + { + "epoch": 0.5226518745763921, + "grad_norm": 0.2611396312713623, + "learning_rate": 2.7381730230037477e-05, + "loss": 0.1443, + "step": 29303 + }, + { + "epoch": 0.5226697106981057, + "grad_norm": 0.23618021607398987, + "learning_rate": 2.7380180797540363e-05, + "loss": 0.1517, + "step": 29304 + }, + { + "epoch": 0.5226875468198195, + "grad_norm": 0.429051011800766, + "learning_rate": 2.7378631355816854e-05, + "loss": 0.1712, + "step": 29305 + }, + { + "epoch": 0.5227053829415332, + "grad_norm": 0.19206973910331726, + "learning_rate": 2.7377081904872938e-05, + "loss": 0.1587, + "step": 29306 + }, + { + "epoch": 0.5227232190632469, + "grad_norm": 0.3254934549331665, + "learning_rate": 2.7375532444714635e-05, + "loss": 0.1174, + "step": 29307 + }, + { + "epoch": 0.5227410551849606, + "grad_norm": 0.2245730310678482, + "learning_rate": 2.737398297534795e-05, + "loss": 0.181, + "step": 29308 + }, + { + "epoch": 0.5227588913066743, + "grad_norm": 0.24527154862880707, + "learning_rate": 2.7372433496778887e-05, + "loss": 0.1603, + "step": 29309 + }, + { + "epoch": 0.522776727428388, + "grad_norm": 0.2554319202899933, + "learning_rate": 2.7370884009013452e-05, + "loss": 0.1258, + "step": 29310 + }, + { + "epoch": 0.5227945635501017, + "grad_norm": 0.25624948740005493, + "learning_rate": 2.7369334512057653e-05, + "loss": 0.1187, + "step": 29311 + }, + { + "epoch": 0.5228123996718154, + "grad_norm": 0.3218866288661957, + "learning_rate": 2.736778500591749e-05, + "loss": 0.1471, + "step": 29312 + }, + { + "epoch": 0.522830235793529, + "grad_norm": 0.2931661009788513, + "learning_rate": 2.736623549059898e-05, + "loss": 0.1467, + "step": 29313 + }, + { + "epoch": 0.5228480719152427, + "grad_norm": 0.284283310174942, + "learning_rate": 2.7364685966108122e-05, + "loss": 0.1063, + "step": 29314 + }, + { + "epoch": 0.5228659080369564, + "grad_norm": 0.23836104571819305, + "learning_rate": 2.736313643245092e-05, + "loss": 0.1161, + "step": 29315 + }, + { + "epoch": 0.5228837441586701, + "grad_norm": 0.3303387761116028, + "learning_rate": 2.7361586889633396e-05, + "loss": 0.1341, + "step": 29316 + }, + { + "epoch": 0.5229015802803838, + "grad_norm": 0.2390783131122589, + "learning_rate": 2.7360037337661536e-05, + "loss": 0.1362, + "step": 29317 + }, + { + "epoch": 0.5229194164020975, + "grad_norm": 0.21096543967723846, + "learning_rate": 2.7358487776541365e-05, + "loss": 0.095, + "step": 29318 + }, + { + "epoch": 0.5229372525238112, + "grad_norm": 0.2093539834022522, + "learning_rate": 2.735693820627887e-05, + "loss": 0.1408, + "step": 29319 + }, + { + "epoch": 0.5229550886455249, + "grad_norm": 0.30725032091140747, + "learning_rate": 2.7355388626880075e-05, + "loss": 0.1338, + "step": 29320 + }, + { + "epoch": 0.5229729247672386, + "grad_norm": 0.1974681168794632, + "learning_rate": 2.735383903835098e-05, + "loss": 0.0951, + "step": 29321 + }, + { + "epoch": 0.5229907608889524, + "grad_norm": 0.36075061559677124, + "learning_rate": 2.7352289440697587e-05, + "loss": 0.146, + "step": 29322 + }, + { + "epoch": 0.523008597010666, + "grad_norm": 0.22667114436626434, + "learning_rate": 2.7350739833925914e-05, + "loss": 0.1223, + "step": 29323 + }, + { + "epoch": 0.5230264331323797, + "grad_norm": 0.2949930429458618, + "learning_rate": 2.734919021804196e-05, + "loss": 0.1318, + "step": 29324 + }, + { + "epoch": 0.5230442692540934, + "grad_norm": 0.45312586426734924, + "learning_rate": 2.7347640593051732e-05, + "loss": 0.1577, + "step": 29325 + }, + { + "epoch": 0.5230621053758071, + "grad_norm": 0.18610809743404388, + "learning_rate": 2.7346090958961234e-05, + "loss": 0.127, + "step": 29326 + }, + { + "epoch": 0.5230799414975208, + "grad_norm": 0.3281126618385315, + "learning_rate": 2.7344541315776482e-05, + "loss": 0.1535, + "step": 29327 + }, + { + "epoch": 0.5230977776192345, + "grad_norm": 0.23575715720653534, + "learning_rate": 2.7342991663503477e-05, + "loss": 0.1605, + "step": 29328 + }, + { + "epoch": 0.5231156137409482, + "grad_norm": 0.25598421692848206, + "learning_rate": 2.7341442002148225e-05, + "loss": 0.1259, + "step": 29329 + }, + { + "epoch": 0.5231334498626619, + "grad_norm": 0.3533547818660736, + "learning_rate": 2.7339892331716737e-05, + "loss": 0.1431, + "step": 29330 + }, + { + "epoch": 0.5231512859843755, + "grad_norm": 0.31133317947387695, + "learning_rate": 2.733834265221502e-05, + "loss": 0.1339, + "step": 29331 + }, + { + "epoch": 0.5231691221060892, + "grad_norm": 0.25769299268722534, + "learning_rate": 2.7336792963649076e-05, + "loss": 0.116, + "step": 29332 + }, + { + "epoch": 0.5231869582278029, + "grad_norm": 0.20682445168495178, + "learning_rate": 2.7335243266024917e-05, + "loss": 0.139, + "step": 29333 + }, + { + "epoch": 0.5232047943495166, + "grad_norm": 0.3065132200717926, + "learning_rate": 2.7333693559348554e-05, + "loss": 0.1417, + "step": 29334 + }, + { + "epoch": 0.5232226304712303, + "grad_norm": 0.3109782934188843, + "learning_rate": 2.733214384362598e-05, + "loss": 0.1765, + "step": 29335 + }, + { + "epoch": 0.523240466592944, + "grad_norm": 0.23374749720096588, + "learning_rate": 2.7330594118863218e-05, + "loss": 0.1415, + "step": 29336 + }, + { + "epoch": 0.5232583027146577, + "grad_norm": 0.29835987091064453, + "learning_rate": 2.7329044385066266e-05, + "loss": 0.1742, + "step": 29337 + }, + { + "epoch": 0.5232761388363715, + "grad_norm": 0.25371772050857544, + "learning_rate": 2.7327494642241132e-05, + "loss": 0.1149, + "step": 29338 + }, + { + "epoch": 0.5232939749580852, + "grad_norm": 0.2961961627006531, + "learning_rate": 2.7325944890393823e-05, + "loss": 0.1162, + "step": 29339 + }, + { + "epoch": 0.5233118110797988, + "grad_norm": 0.23892581462860107, + "learning_rate": 2.732439512953035e-05, + "loss": 0.1237, + "step": 29340 + }, + { + "epoch": 0.5233296472015125, + "grad_norm": 0.21712057292461395, + "learning_rate": 2.7322845359656717e-05, + "loss": 0.1122, + "step": 29341 + }, + { + "epoch": 0.5233474833232262, + "grad_norm": 0.21730738878250122, + "learning_rate": 2.7321295580778937e-05, + "loss": 0.113, + "step": 29342 + }, + { + "epoch": 0.5233653194449399, + "grad_norm": 0.30356866121292114, + "learning_rate": 2.731974579290301e-05, + "loss": 0.1537, + "step": 29343 + }, + { + "epoch": 0.5233831555666536, + "grad_norm": 0.25511807203292847, + "learning_rate": 2.7318195996034952e-05, + "loss": 0.1031, + "step": 29344 + }, + { + "epoch": 0.5234009916883673, + "grad_norm": 0.2114313542842865, + "learning_rate": 2.731664619018076e-05, + "loss": 0.1112, + "step": 29345 + }, + { + "epoch": 0.523418827810081, + "grad_norm": 0.2681681215763092, + "learning_rate": 2.7315096375346456e-05, + "loss": 0.1144, + "step": 29346 + }, + { + "epoch": 0.5234366639317947, + "grad_norm": 0.30722877383232117, + "learning_rate": 2.7313546551538027e-05, + "loss": 0.1638, + "step": 29347 + }, + { + "epoch": 0.5234545000535084, + "grad_norm": 0.3154377043247223, + "learning_rate": 2.7311996718761494e-05, + "loss": 0.1363, + "step": 29348 + }, + { + "epoch": 0.523472336175222, + "grad_norm": 0.1898418515920639, + "learning_rate": 2.731044687702287e-05, + "loss": 0.1374, + "step": 29349 + }, + { + "epoch": 0.5234901722969357, + "grad_norm": 0.32936128973960876, + "learning_rate": 2.7308897026328156e-05, + "loss": 0.1515, + "step": 29350 + }, + { + "epoch": 0.5235080084186494, + "grad_norm": 0.21824435889720917, + "learning_rate": 2.7307347166683362e-05, + "loss": 0.1338, + "step": 29351 + }, + { + "epoch": 0.5235258445403631, + "grad_norm": 0.2480928599834442, + "learning_rate": 2.7305797298094483e-05, + "loss": 0.1383, + "step": 29352 + }, + { + "epoch": 0.5235436806620768, + "grad_norm": 0.27887973189353943, + "learning_rate": 2.7304247420567546e-05, + "loss": 0.1274, + "step": 29353 + }, + { + "epoch": 0.5235615167837905, + "grad_norm": 0.26893147826194763, + "learning_rate": 2.7302697534108545e-05, + "loss": 0.1282, + "step": 29354 + }, + { + "epoch": 0.5235793529055043, + "grad_norm": 0.23445633053779602, + "learning_rate": 2.730114763872349e-05, + "loss": 0.1175, + "step": 29355 + }, + { + "epoch": 0.523597189027218, + "grad_norm": 0.22287671267986298, + "learning_rate": 2.7299597734418396e-05, + "loss": 0.1091, + "step": 29356 + }, + { + "epoch": 0.5236150251489317, + "grad_norm": 0.3193022906780243, + "learning_rate": 2.729804782119927e-05, + "loss": 0.1483, + "step": 29357 + }, + { + "epoch": 0.5236328612706453, + "grad_norm": 0.2530090808868408, + "learning_rate": 2.7296497899072114e-05, + "loss": 0.1366, + "step": 29358 + }, + { + "epoch": 0.523650697392359, + "grad_norm": 0.2959394156932831, + "learning_rate": 2.7294947968042944e-05, + "loss": 0.1365, + "step": 29359 + }, + { + "epoch": 0.5236685335140727, + "grad_norm": 0.24657073616981506, + "learning_rate": 2.7293398028117757e-05, + "loss": 0.0942, + "step": 29360 + }, + { + "epoch": 0.5236863696357864, + "grad_norm": 0.2343170940876007, + "learning_rate": 2.729184807930256e-05, + "loss": 0.1508, + "step": 29361 + }, + { + "epoch": 0.5237042057575001, + "grad_norm": 0.27657824754714966, + "learning_rate": 2.7290298121603387e-05, + "loss": 0.1351, + "step": 29362 + }, + { + "epoch": 0.5237220418792138, + "grad_norm": 0.38412174582481384, + "learning_rate": 2.7288748155026213e-05, + "loss": 0.1497, + "step": 29363 + }, + { + "epoch": 0.5237398780009275, + "grad_norm": 0.227009579539299, + "learning_rate": 2.7287198179577066e-05, + "loss": 0.139, + "step": 29364 + }, + { + "epoch": 0.5237577141226412, + "grad_norm": 0.2263375222682953, + "learning_rate": 2.7285648195261948e-05, + "loss": 0.1453, + "step": 29365 + }, + { + "epoch": 0.5237755502443548, + "grad_norm": 0.24828296899795532, + "learning_rate": 2.7284098202086872e-05, + "loss": 0.1319, + "step": 29366 + }, + { + "epoch": 0.5237933863660685, + "grad_norm": 0.2947194278240204, + "learning_rate": 2.7282548200057835e-05, + "loss": 0.1392, + "step": 29367 + }, + { + "epoch": 0.5238112224877822, + "grad_norm": 0.2659969627857208, + "learning_rate": 2.7280998189180858e-05, + "loss": 0.1533, + "step": 29368 + }, + { + "epoch": 0.5238290586094959, + "grad_norm": 0.3143240511417389, + "learning_rate": 2.7279448169461934e-05, + "loss": 0.098, + "step": 29369 + }, + { + "epoch": 0.5238468947312096, + "grad_norm": 0.29897889494895935, + "learning_rate": 2.727789814090709e-05, + "loss": 0.1136, + "step": 29370 + }, + { + "epoch": 0.5238647308529233, + "grad_norm": 0.25768452882766724, + "learning_rate": 2.727634810352233e-05, + "loss": 0.1933, + "step": 29371 + }, + { + "epoch": 0.5238825669746371, + "grad_norm": 0.23185980319976807, + "learning_rate": 2.7274798057313654e-05, + "loss": 0.1158, + "step": 29372 + }, + { + "epoch": 0.5239004030963508, + "grad_norm": 0.24120916426181793, + "learning_rate": 2.7273248002287083e-05, + "loss": 0.1081, + "step": 29373 + }, + { + "epoch": 0.5239182392180645, + "grad_norm": 0.24477148056030273, + "learning_rate": 2.727169793844861e-05, + "loss": 0.1545, + "step": 29374 + }, + { + "epoch": 0.5239360753397782, + "grad_norm": 0.21501465141773224, + "learning_rate": 2.7270147865804248e-05, + "loss": 0.1414, + "step": 29375 + }, + { + "epoch": 0.5239539114614918, + "grad_norm": 0.261867880821228, + "learning_rate": 2.7268597784360007e-05, + "loss": 0.2095, + "step": 29376 + }, + { + "epoch": 0.5239717475832055, + "grad_norm": 0.2604585587978363, + "learning_rate": 2.7267047694121905e-05, + "loss": 0.1726, + "step": 29377 + }, + { + "epoch": 0.5239895837049192, + "grad_norm": 0.24295467138290405, + "learning_rate": 2.726549759509594e-05, + "loss": 0.1615, + "step": 29378 + }, + { + "epoch": 0.5240074198266329, + "grad_norm": 0.27478599548339844, + "learning_rate": 2.7263947487288128e-05, + "loss": 0.1132, + "step": 29379 + }, + { + "epoch": 0.5240252559483466, + "grad_norm": 0.21894565224647522, + "learning_rate": 2.7262397370704468e-05, + "loss": 0.1587, + "step": 29380 + }, + { + "epoch": 0.5240430920700603, + "grad_norm": 0.28471311926841736, + "learning_rate": 2.7260847245350978e-05, + "loss": 0.1028, + "step": 29381 + }, + { + "epoch": 0.524060928191774, + "grad_norm": 0.22251319885253906, + "learning_rate": 2.725929711123366e-05, + "loss": 0.1533, + "step": 29382 + }, + { + "epoch": 0.5240787643134877, + "grad_norm": 0.23817679286003113, + "learning_rate": 2.725774696835853e-05, + "loss": 0.1807, + "step": 29383 + }, + { + "epoch": 0.5240966004352013, + "grad_norm": 0.2621144652366638, + "learning_rate": 2.7256196816731588e-05, + "loss": 0.1495, + "step": 29384 + }, + { + "epoch": 0.524114436556915, + "grad_norm": 0.21168941259384155, + "learning_rate": 2.7254646656358852e-05, + "loss": 0.1189, + "step": 29385 + }, + { + "epoch": 0.5241322726786287, + "grad_norm": 0.30611488223075867, + "learning_rate": 2.7253096487246332e-05, + "loss": 0.169, + "step": 29386 + }, + { + "epoch": 0.5241501088003424, + "grad_norm": 0.22719596326351166, + "learning_rate": 2.725154630940002e-05, + "loss": 0.1107, + "step": 29387 + }, + { + "epoch": 0.5241679449220561, + "grad_norm": 0.4431948661804199, + "learning_rate": 2.7249996122825944e-05, + "loss": 0.1026, + "step": 29388 + }, + { + "epoch": 0.5241857810437699, + "grad_norm": 0.3256751298904419, + "learning_rate": 2.7248445927530098e-05, + "loss": 0.1285, + "step": 29389 + }, + { + "epoch": 0.5242036171654836, + "grad_norm": 0.3353295624256134, + "learning_rate": 2.7246895723518513e-05, + "loss": 0.1366, + "step": 29390 + }, + { + "epoch": 0.5242214532871973, + "grad_norm": 0.32560133934020996, + "learning_rate": 2.7245345510797172e-05, + "loss": 0.1232, + "step": 29391 + }, + { + "epoch": 0.524239289408911, + "grad_norm": 0.3134133219718933, + "learning_rate": 2.7243795289372104e-05, + "loss": 0.173, + "step": 29392 + }, + { + "epoch": 0.5242571255306246, + "grad_norm": 0.31444886326789856, + "learning_rate": 2.7242245059249305e-05, + "loss": 0.1311, + "step": 29393 + }, + { + "epoch": 0.5242749616523383, + "grad_norm": 0.1758672595024109, + "learning_rate": 2.7240694820434793e-05, + "loss": 0.126, + "step": 29394 + }, + { + "epoch": 0.524292797774052, + "grad_norm": 0.28313782811164856, + "learning_rate": 2.723914457293457e-05, + "loss": 0.1766, + "step": 29395 + }, + { + "epoch": 0.5243106338957657, + "grad_norm": 0.2278400957584381, + "learning_rate": 2.723759431675465e-05, + "loss": 0.1617, + "step": 29396 + }, + { + "epoch": 0.5243284700174794, + "grad_norm": 0.2927832007408142, + "learning_rate": 2.7236044051901044e-05, + "loss": 0.187, + "step": 29397 + }, + { + "epoch": 0.5243463061391931, + "grad_norm": 0.24268564581871033, + "learning_rate": 2.723449377837976e-05, + "loss": 0.1541, + "step": 29398 + }, + { + "epoch": 0.5243641422609068, + "grad_norm": 0.304359495639801, + "learning_rate": 2.7232943496196806e-05, + "loss": 0.1118, + "step": 29399 + }, + { + "epoch": 0.5243819783826205, + "grad_norm": 0.27364951372146606, + "learning_rate": 2.723139320535819e-05, + "loss": 0.1522, + "step": 29400 + }, + { + "epoch": 0.5243998145043341, + "grad_norm": 0.2424466460943222, + "learning_rate": 2.722984290586993e-05, + "loss": 0.1116, + "step": 29401 + }, + { + "epoch": 0.5244176506260478, + "grad_norm": 0.278877854347229, + "learning_rate": 2.722829259773802e-05, + "loss": 0.1135, + "step": 29402 + }, + { + "epoch": 0.5244354867477615, + "grad_norm": 0.21040643751621246, + "learning_rate": 2.7226742280968475e-05, + "loss": 0.1299, + "step": 29403 + }, + { + "epoch": 0.5244533228694752, + "grad_norm": 0.401019811630249, + "learning_rate": 2.7225191955567314e-05, + "loss": 0.1865, + "step": 29404 + }, + { + "epoch": 0.5244711589911889, + "grad_norm": 0.3226037621498108, + "learning_rate": 2.7223641621540542e-05, + "loss": 0.1821, + "step": 29405 + }, + { + "epoch": 0.5244889951129027, + "grad_norm": 0.2710559368133545, + "learning_rate": 2.7222091278894163e-05, + "loss": 0.17, + "step": 29406 + }, + { + "epoch": 0.5245068312346164, + "grad_norm": 0.22361299395561218, + "learning_rate": 2.7220540927634198e-05, + "loss": 0.1389, + "step": 29407 + }, + { + "epoch": 0.5245246673563301, + "grad_norm": 0.2204882651567459, + "learning_rate": 2.721899056776664e-05, + "loss": 0.1559, + "step": 29408 + }, + { + "epoch": 0.5245425034780438, + "grad_norm": 0.361446738243103, + "learning_rate": 2.7217440199297516e-05, + "loss": 0.1443, + "step": 29409 + }, + { + "epoch": 0.5245603395997575, + "grad_norm": 0.23979291319847107, + "learning_rate": 2.7215889822232817e-05, + "loss": 0.1441, + "step": 29410 + }, + { + "epoch": 0.5245781757214711, + "grad_norm": 0.28126832842826843, + "learning_rate": 2.7214339436578573e-05, + "loss": 0.1148, + "step": 29411 + }, + { + "epoch": 0.5245960118431848, + "grad_norm": 0.24983297288417816, + "learning_rate": 2.7212789042340786e-05, + "loss": 0.1452, + "step": 29412 + }, + { + "epoch": 0.5246138479648985, + "grad_norm": 0.2227584719657898, + "learning_rate": 2.721123863952546e-05, + "loss": 0.1237, + "step": 29413 + }, + { + "epoch": 0.5246316840866122, + "grad_norm": 0.24605344235897064, + "learning_rate": 2.7209688228138612e-05, + "loss": 0.1661, + "step": 29414 + }, + { + "epoch": 0.5246495202083259, + "grad_norm": 0.20384852588176727, + "learning_rate": 2.7208137808186247e-05, + "loss": 0.1141, + "step": 29415 + }, + { + "epoch": 0.5246673563300396, + "grad_norm": 0.26670730113983154, + "learning_rate": 2.7206587379674376e-05, + "loss": 0.151, + "step": 29416 + }, + { + "epoch": 0.5246851924517533, + "grad_norm": 0.2451811134815216, + "learning_rate": 2.7205036942609007e-05, + "loss": 0.1256, + "step": 29417 + }, + { + "epoch": 0.524703028573467, + "grad_norm": 0.2369154691696167, + "learning_rate": 2.720348649699616e-05, + "loss": 0.1511, + "step": 29418 + }, + { + "epoch": 0.5247208646951806, + "grad_norm": 0.2820444107055664, + "learning_rate": 2.7201936042841835e-05, + "loss": 0.1515, + "step": 29419 + }, + { + "epoch": 0.5247387008168943, + "grad_norm": 0.22452563047409058, + "learning_rate": 2.7200385580152056e-05, + "loss": 0.1305, + "step": 29420 + }, + { + "epoch": 0.524756536938608, + "grad_norm": 0.2157796174287796, + "learning_rate": 2.7198835108932808e-05, + "loss": 0.1389, + "step": 29421 + }, + { + "epoch": 0.5247743730603217, + "grad_norm": 0.26535478234291077, + "learning_rate": 2.719728462919012e-05, + "loss": 0.1293, + "step": 29422 + }, + { + "epoch": 0.5247922091820355, + "grad_norm": 0.22747619450092316, + "learning_rate": 2.719573414093e-05, + "loss": 0.1243, + "step": 29423 + }, + { + "epoch": 0.5248100453037492, + "grad_norm": 0.2745887041091919, + "learning_rate": 2.7194183644158454e-05, + "loss": 0.1299, + "step": 29424 + }, + { + "epoch": 0.5248278814254629, + "grad_norm": 0.28680193424224854, + "learning_rate": 2.7192633138881495e-05, + "loss": 0.1361, + "step": 29425 + }, + { + "epoch": 0.5248457175471766, + "grad_norm": 0.25914618372917175, + "learning_rate": 2.7191082625105137e-05, + "loss": 0.1092, + "step": 29426 + }, + { + "epoch": 0.5248635536688903, + "grad_norm": 0.24438418447971344, + "learning_rate": 2.718953210283538e-05, + "loss": 0.1247, + "step": 29427 + }, + { + "epoch": 0.524881389790604, + "grad_norm": 0.30035993456840515, + "learning_rate": 2.7187981572078242e-05, + "loss": 0.1604, + "step": 29428 + }, + { + "epoch": 0.5248992259123176, + "grad_norm": 0.2815878093242645, + "learning_rate": 2.718643103283973e-05, + "loss": 0.0614, + "step": 29429 + }, + { + "epoch": 0.5249170620340313, + "grad_norm": 0.2685695290565491, + "learning_rate": 2.718488048512586e-05, + "loss": 0.1562, + "step": 29430 + }, + { + "epoch": 0.524934898155745, + "grad_norm": 0.27362924814224243, + "learning_rate": 2.7183329928942636e-05, + "loss": 0.114, + "step": 29431 + }, + { + "epoch": 0.5249527342774587, + "grad_norm": 0.2638143002986908, + "learning_rate": 2.7181779364296067e-05, + "loss": 0.1283, + "step": 29432 + }, + { + "epoch": 0.5249705703991724, + "grad_norm": 0.23861396312713623, + "learning_rate": 2.7180228791192176e-05, + "loss": 0.1443, + "step": 29433 + }, + { + "epoch": 0.5249884065208861, + "grad_norm": 0.27368634939193726, + "learning_rate": 2.717867820963696e-05, + "loss": 0.1901, + "step": 29434 + }, + { + "epoch": 0.5250062426425998, + "grad_norm": 0.29530012607574463, + "learning_rate": 2.7177127619636437e-05, + "loss": 0.1261, + "step": 29435 + }, + { + "epoch": 0.5250240787643135, + "grad_norm": 0.36335819959640503, + "learning_rate": 2.7175577021196615e-05, + "loss": 0.2225, + "step": 29436 + }, + { + "epoch": 0.5250419148860271, + "grad_norm": 0.3202860653400421, + "learning_rate": 2.71740264143235e-05, + "loss": 0.1513, + "step": 29437 + }, + { + "epoch": 0.5250597510077408, + "grad_norm": 0.32527410984039307, + "learning_rate": 2.7172475799023116e-05, + "loss": 0.1164, + "step": 29438 + }, + { + "epoch": 0.5250775871294546, + "grad_norm": 0.24065177142620087, + "learning_rate": 2.717092517530146e-05, + "loss": 0.1396, + "step": 29439 + }, + { + "epoch": 0.5250954232511683, + "grad_norm": 0.1808624565601349, + "learning_rate": 2.7169374543164556e-05, + "loss": 0.101, + "step": 29440 + }, + { + "epoch": 0.525113259372882, + "grad_norm": 0.27624067664146423, + "learning_rate": 2.71678239026184e-05, + "loss": 0.1357, + "step": 29441 + }, + { + "epoch": 0.5251310954945957, + "grad_norm": 0.25203272700309753, + "learning_rate": 2.716627325366901e-05, + "loss": 0.1475, + "step": 29442 + }, + { + "epoch": 0.5251489316163094, + "grad_norm": 0.2955394685268402, + "learning_rate": 2.7164722596322394e-05, + "loss": 0.1507, + "step": 29443 + }, + { + "epoch": 0.5251667677380231, + "grad_norm": 0.27341705560684204, + "learning_rate": 2.716317193058457e-05, + "loss": 0.1288, + "step": 29444 + }, + { + "epoch": 0.5251846038597368, + "grad_norm": 0.34787026047706604, + "learning_rate": 2.716162125646154e-05, + "loss": 0.121, + "step": 29445 + }, + { + "epoch": 0.5252024399814504, + "grad_norm": 0.3075961172580719, + "learning_rate": 2.716007057395933e-05, + "loss": 0.1456, + "step": 29446 + }, + { + "epoch": 0.5252202761031641, + "grad_norm": 0.29321229457855225, + "learning_rate": 2.7158519883083926e-05, + "loss": 0.1201, + "step": 29447 + }, + { + "epoch": 0.5252381122248778, + "grad_norm": 0.311604768037796, + "learning_rate": 2.715696918384137e-05, + "loss": 0.1686, + "step": 29448 + }, + { + "epoch": 0.5252559483465915, + "grad_norm": 0.287415474653244, + "learning_rate": 2.7155418476237644e-05, + "loss": 0.1175, + "step": 29449 + }, + { + "epoch": 0.5252737844683052, + "grad_norm": 0.2872602939605713, + "learning_rate": 2.715386776027878e-05, + "loss": 0.1229, + "step": 29450 + }, + { + "epoch": 0.5252916205900189, + "grad_norm": 0.2948637902736664, + "learning_rate": 2.715231703597077e-05, + "loss": 0.1046, + "step": 29451 + }, + { + "epoch": 0.5253094567117326, + "grad_norm": 0.33960095047950745, + "learning_rate": 2.7150766303319637e-05, + "loss": 0.1757, + "step": 29452 + }, + { + "epoch": 0.5253272928334463, + "grad_norm": 0.3231659531593323, + "learning_rate": 2.71492155623314e-05, + "loss": 0.2059, + "step": 29453 + }, + { + "epoch": 0.52534512895516, + "grad_norm": 0.2295851707458496, + "learning_rate": 2.714766481301205e-05, + "loss": 0.1708, + "step": 29454 + }, + { + "epoch": 0.5253629650768736, + "grad_norm": 0.3205930292606354, + "learning_rate": 2.7146114055367622e-05, + "loss": 0.1164, + "step": 29455 + }, + { + "epoch": 0.5253808011985874, + "grad_norm": 0.2763197422027588, + "learning_rate": 2.7144563289404107e-05, + "loss": 0.1028, + "step": 29456 + }, + { + "epoch": 0.5253986373203011, + "grad_norm": 0.37040671706199646, + "learning_rate": 2.714301251512753e-05, + "loss": 0.1638, + "step": 29457 + }, + { + "epoch": 0.5254164734420148, + "grad_norm": 0.23512087762355804, + "learning_rate": 2.714146173254389e-05, + "loss": 0.1503, + "step": 29458 + }, + { + "epoch": 0.5254343095637285, + "grad_norm": 0.3356296420097351, + "learning_rate": 2.7139910941659204e-05, + "loss": 0.1714, + "step": 29459 + }, + { + "epoch": 0.5254521456854422, + "grad_norm": 0.3051658868789673, + "learning_rate": 2.713836014247948e-05, + "loss": 0.1644, + "step": 29460 + }, + { + "epoch": 0.5254699818071559, + "grad_norm": 0.48958054184913635, + "learning_rate": 2.713680933501074e-05, + "loss": 0.1532, + "step": 29461 + }, + { + "epoch": 0.5254878179288696, + "grad_norm": 0.21316632628440857, + "learning_rate": 2.7135258519258992e-05, + "loss": 0.1452, + "step": 29462 + }, + { + "epoch": 0.5255056540505832, + "grad_norm": 0.3810008764266968, + "learning_rate": 2.7133707695230243e-05, + "loss": 0.1792, + "step": 29463 + }, + { + "epoch": 0.5255234901722969, + "grad_norm": 0.30944982171058655, + "learning_rate": 2.7132156862930503e-05, + "loss": 0.1735, + "step": 29464 + }, + { + "epoch": 0.5255413262940106, + "grad_norm": 0.25096839666366577, + "learning_rate": 2.713060602236579e-05, + "loss": 0.1322, + "step": 29465 + }, + { + "epoch": 0.5255591624157243, + "grad_norm": 0.30388572812080383, + "learning_rate": 2.712905517354211e-05, + "loss": 0.171, + "step": 29466 + }, + { + "epoch": 0.525576998537438, + "grad_norm": 0.2303423285484314, + "learning_rate": 2.7127504316465473e-05, + "loss": 0.1428, + "step": 29467 + }, + { + "epoch": 0.5255948346591517, + "grad_norm": 0.18653607368469238, + "learning_rate": 2.7125953451141907e-05, + "loss": 0.1105, + "step": 29468 + }, + { + "epoch": 0.5256126707808654, + "grad_norm": 0.3784129023551941, + "learning_rate": 2.7124402577577397e-05, + "loss": 0.124, + "step": 29469 + }, + { + "epoch": 0.5256305069025791, + "grad_norm": 0.2851652204990387, + "learning_rate": 2.712285169577798e-05, + "loss": 0.1557, + "step": 29470 + }, + { + "epoch": 0.5256483430242928, + "grad_norm": 0.3392007350921631, + "learning_rate": 2.712130080574965e-05, + "loss": 0.1505, + "step": 29471 + }, + { + "epoch": 0.5256661791460064, + "grad_norm": 0.24897627532482147, + "learning_rate": 2.711974990749842e-05, + "loss": 0.1158, + "step": 29472 + }, + { + "epoch": 0.5256840152677202, + "grad_norm": 0.5230845212936401, + "learning_rate": 2.7118199001030315e-05, + "loss": 0.2621, + "step": 29473 + }, + { + "epoch": 0.5257018513894339, + "grad_norm": 0.23026961088180542, + "learning_rate": 2.7116648086351338e-05, + "loss": 0.1139, + "step": 29474 + }, + { + "epoch": 0.5257196875111476, + "grad_norm": 0.2702396810054779, + "learning_rate": 2.7115097163467502e-05, + "loss": 0.1738, + "step": 29475 + }, + { + "epoch": 0.5257375236328613, + "grad_norm": 0.2927078902721405, + "learning_rate": 2.7113546232384824e-05, + "loss": 0.1282, + "step": 29476 + }, + { + "epoch": 0.525755359754575, + "grad_norm": 0.2175307720899582, + "learning_rate": 2.7111995293109304e-05, + "loss": 0.1177, + "step": 29477 + }, + { + "epoch": 0.5257731958762887, + "grad_norm": 0.2614126205444336, + "learning_rate": 2.7110444345646964e-05, + "loss": 0.124, + "step": 29478 + }, + { + "epoch": 0.5257910319980024, + "grad_norm": 0.28760653734207153, + "learning_rate": 2.710889339000381e-05, + "loss": 0.1105, + "step": 29479 + }, + { + "epoch": 0.525808868119716, + "grad_norm": 0.24923793971538544, + "learning_rate": 2.710734242618585e-05, + "loss": 0.1531, + "step": 29480 + }, + { + "epoch": 0.5258267042414297, + "grad_norm": 0.3242799639701843, + "learning_rate": 2.7105791454199113e-05, + "loss": 0.1365, + "step": 29481 + }, + { + "epoch": 0.5258445403631434, + "grad_norm": 0.3814876675605774, + "learning_rate": 2.71042404740496e-05, + "loss": 0.1704, + "step": 29482 + }, + { + "epoch": 0.5258623764848571, + "grad_norm": 0.25780394673347473, + "learning_rate": 2.7102689485743326e-05, + "loss": 0.1415, + "step": 29483 + }, + { + "epoch": 0.5258802126065708, + "grad_norm": 0.3115689754486084, + "learning_rate": 2.7101138489286293e-05, + "loss": 0.1374, + "step": 29484 + }, + { + "epoch": 0.5258980487282845, + "grad_norm": 0.3063698709011078, + "learning_rate": 2.709958748468453e-05, + "loss": 0.1437, + "step": 29485 + }, + { + "epoch": 0.5259158848499982, + "grad_norm": 0.23593035340309143, + "learning_rate": 2.709803647194404e-05, + "loss": 0.1417, + "step": 29486 + }, + { + "epoch": 0.5259337209717119, + "grad_norm": 0.2669139802455902, + "learning_rate": 2.709648545107083e-05, + "loss": 0.1469, + "step": 29487 + }, + { + "epoch": 0.5259515570934256, + "grad_norm": 0.25483858585357666, + "learning_rate": 2.7094934422070917e-05, + "loss": 0.1639, + "step": 29488 + }, + { + "epoch": 0.5259693932151392, + "grad_norm": 0.262802392244339, + "learning_rate": 2.7093383384950322e-05, + "loss": 0.1661, + "step": 29489 + }, + { + "epoch": 0.525987229336853, + "grad_norm": 0.19983015954494476, + "learning_rate": 2.709183233971505e-05, + "loss": 0.1162, + "step": 29490 + }, + { + "epoch": 0.5260050654585667, + "grad_norm": 0.25494176149368286, + "learning_rate": 2.709028128637111e-05, + "loss": 0.0973, + "step": 29491 + }, + { + "epoch": 0.5260229015802804, + "grad_norm": 0.23474548757076263, + "learning_rate": 2.708873022492452e-05, + "loss": 0.123, + "step": 29492 + }, + { + "epoch": 0.5260407377019941, + "grad_norm": 0.24141724407672882, + "learning_rate": 2.7087179155381282e-05, + "loss": 0.1155, + "step": 29493 + }, + { + "epoch": 0.5260585738237078, + "grad_norm": 0.29320254921913147, + "learning_rate": 2.7085628077747427e-05, + "loss": 0.1415, + "step": 29494 + }, + { + "epoch": 0.5260764099454215, + "grad_norm": 0.28299564123153687, + "learning_rate": 2.708407699202895e-05, + "loss": 0.1262, + "step": 29495 + }, + { + "epoch": 0.5260942460671352, + "grad_norm": 0.7342714071273804, + "learning_rate": 2.708252589823188e-05, + "loss": 0.1037, + "step": 29496 + }, + { + "epoch": 0.5261120821888489, + "grad_norm": 0.45992082357406616, + "learning_rate": 2.7080974796362213e-05, + "loss": 0.1468, + "step": 29497 + }, + { + "epoch": 0.5261299183105625, + "grad_norm": 0.4210277795791626, + "learning_rate": 2.7079423686425976e-05, + "loss": 0.1196, + "step": 29498 + }, + { + "epoch": 0.5261477544322762, + "grad_norm": 0.21317805349826813, + "learning_rate": 2.7077872568429164e-05, + "loss": 0.1207, + "step": 29499 + }, + { + "epoch": 0.5261655905539899, + "grad_norm": 0.2553720772266388, + "learning_rate": 2.7076321442377805e-05, + "loss": 0.1382, + "step": 29500 + }, + { + "epoch": 0.5261834266757036, + "grad_norm": 0.3327461779117584, + "learning_rate": 2.7074770308277904e-05, + "loss": 0.1278, + "step": 29501 + }, + { + "epoch": 0.5262012627974173, + "grad_norm": 0.22602009773254395, + "learning_rate": 2.7073219166135483e-05, + "loss": 0.1458, + "step": 29502 + }, + { + "epoch": 0.526219098919131, + "grad_norm": 0.2794208228588104, + "learning_rate": 2.707166801595655e-05, + "loss": 0.0882, + "step": 29503 + }, + { + "epoch": 0.5262369350408447, + "grad_norm": 0.4029618203639984, + "learning_rate": 2.707011685774711e-05, + "loss": 0.2117, + "step": 29504 + }, + { + "epoch": 0.5262547711625584, + "grad_norm": 0.48170581459999084, + "learning_rate": 2.7068565691513186e-05, + "loss": 0.1854, + "step": 29505 + }, + { + "epoch": 0.526272607284272, + "grad_norm": 0.2870563566684723, + "learning_rate": 2.706701451726078e-05, + "loss": 0.1438, + "step": 29506 + }, + { + "epoch": 0.5262904434059859, + "grad_norm": 0.20260033011436462, + "learning_rate": 2.7065463334995916e-05, + "loss": 0.1208, + "step": 29507 + }, + { + "epoch": 0.5263082795276995, + "grad_norm": 0.2535092532634735, + "learning_rate": 2.70639121447246e-05, + "loss": 0.158, + "step": 29508 + }, + { + "epoch": 0.5263261156494132, + "grad_norm": 0.2354477047920227, + "learning_rate": 2.7062360946452858e-05, + "loss": 0.1688, + "step": 29509 + }, + { + "epoch": 0.5263439517711269, + "grad_norm": 0.2548867166042328, + "learning_rate": 2.706080974018668e-05, + "loss": 0.1388, + "step": 29510 + }, + { + "epoch": 0.5263617878928406, + "grad_norm": 0.2875618636608124, + "learning_rate": 2.7059258525932103e-05, + "loss": 0.15, + "step": 29511 + }, + { + "epoch": 0.5263796240145543, + "grad_norm": 0.22793486714363098, + "learning_rate": 2.7057707303695118e-05, + "loss": 0.1571, + "step": 29512 + }, + { + "epoch": 0.526397460136268, + "grad_norm": 0.2482042759656906, + "learning_rate": 2.7056156073481758e-05, + "loss": 0.149, + "step": 29513 + }, + { + "epoch": 0.5264152962579817, + "grad_norm": 0.2522822618484497, + "learning_rate": 2.7054604835298015e-05, + "loss": 0.1271, + "step": 29514 + }, + { + "epoch": 0.5264331323796954, + "grad_norm": 0.27462905645370483, + "learning_rate": 2.7053053589149923e-05, + "loss": 0.1818, + "step": 29515 + }, + { + "epoch": 0.526450968501409, + "grad_norm": 0.2821716070175171, + "learning_rate": 2.7051502335043488e-05, + "loss": 0.2011, + "step": 29516 + }, + { + "epoch": 0.5264688046231227, + "grad_norm": 0.3535119891166687, + "learning_rate": 2.704995107298472e-05, + "loss": 0.1497, + "step": 29517 + }, + { + "epoch": 0.5264866407448364, + "grad_norm": 0.2884179651737213, + "learning_rate": 2.7048399802979635e-05, + "loss": 0.1137, + "step": 29518 + }, + { + "epoch": 0.5265044768665501, + "grad_norm": 0.2683907449245453, + "learning_rate": 2.704684852503424e-05, + "loss": 0.1806, + "step": 29519 + }, + { + "epoch": 0.5265223129882638, + "grad_norm": 0.3006495535373688, + "learning_rate": 2.704529723915455e-05, + "loss": 0.1311, + "step": 29520 + }, + { + "epoch": 0.5265401491099775, + "grad_norm": 0.3350886106491089, + "learning_rate": 2.7043745945346587e-05, + "loss": 0.1889, + "step": 29521 + }, + { + "epoch": 0.5265579852316912, + "grad_norm": 0.263200968503952, + "learning_rate": 2.704219464361636e-05, + "loss": 0.1609, + "step": 29522 + }, + { + "epoch": 0.5265758213534049, + "grad_norm": 0.2262687087059021, + "learning_rate": 2.704064333396988e-05, + "loss": 0.1655, + "step": 29523 + }, + { + "epoch": 0.5265936574751187, + "grad_norm": 0.38764479756355286, + "learning_rate": 2.703909201641316e-05, + "loss": 0.1276, + "step": 29524 + }, + { + "epoch": 0.5266114935968323, + "grad_norm": 0.2452140897512436, + "learning_rate": 2.703754069095221e-05, + "loss": 0.1555, + "step": 29525 + }, + { + "epoch": 0.526629329718546, + "grad_norm": 0.2584201395511627, + "learning_rate": 2.703598935759306e-05, + "loss": 0.1059, + "step": 29526 + }, + { + "epoch": 0.5266471658402597, + "grad_norm": 0.31932732462882996, + "learning_rate": 2.7034438016341706e-05, + "loss": 0.1245, + "step": 29527 + }, + { + "epoch": 0.5266650019619734, + "grad_norm": 0.26007887721061707, + "learning_rate": 2.7032886667204165e-05, + "loss": 0.1917, + "step": 29528 + }, + { + "epoch": 0.5266828380836871, + "grad_norm": 0.2648650109767914, + "learning_rate": 2.7031335310186456e-05, + "loss": 0.1272, + "step": 29529 + }, + { + "epoch": 0.5267006742054008, + "grad_norm": 0.2847784757614136, + "learning_rate": 2.7029783945294597e-05, + "loss": 0.1792, + "step": 29530 + }, + { + "epoch": 0.5267185103271145, + "grad_norm": 0.29020044207572937, + "learning_rate": 2.702823257253459e-05, + "loss": 0.1502, + "step": 29531 + }, + { + "epoch": 0.5267363464488282, + "grad_norm": 0.3325560390949249, + "learning_rate": 2.7026681191912446e-05, + "loss": 0.1535, + "step": 29532 + }, + { + "epoch": 0.5267541825705419, + "grad_norm": 0.21549859642982483, + "learning_rate": 2.7025129803434192e-05, + "loss": 0.1081, + "step": 29533 + }, + { + "epoch": 0.5267720186922555, + "grad_norm": 0.2609044313430786, + "learning_rate": 2.7023578407105833e-05, + "loss": 0.1439, + "step": 29534 + }, + { + "epoch": 0.5267898548139692, + "grad_norm": 0.26960626244544983, + "learning_rate": 2.702202700293338e-05, + "loss": 0.1535, + "step": 29535 + }, + { + "epoch": 0.5268076909356829, + "grad_norm": 0.2711268365383148, + "learning_rate": 2.702047559092286e-05, + "loss": 0.171, + "step": 29536 + }, + { + "epoch": 0.5268255270573966, + "grad_norm": 0.21222415566444397, + "learning_rate": 2.7018924171080278e-05, + "loss": 0.0975, + "step": 29537 + }, + { + "epoch": 0.5268433631791103, + "grad_norm": 0.28257322311401367, + "learning_rate": 2.7017372743411646e-05, + "loss": 0.1564, + "step": 29538 + }, + { + "epoch": 0.526861199300824, + "grad_norm": 0.25108975172042847, + "learning_rate": 2.7015821307922982e-05, + "loss": 0.1491, + "step": 29539 + }, + { + "epoch": 0.5268790354225378, + "grad_norm": 0.29895803332328796, + "learning_rate": 2.70142698646203e-05, + "loss": 0.1501, + "step": 29540 + }, + { + "epoch": 0.5268968715442515, + "grad_norm": 0.1883089244365692, + "learning_rate": 2.701271841350961e-05, + "loss": 0.1175, + "step": 29541 + }, + { + "epoch": 0.5269147076659652, + "grad_norm": 0.21324759721755981, + "learning_rate": 2.7011166954596923e-05, + "loss": 0.1199, + "step": 29542 + }, + { + "epoch": 0.5269325437876788, + "grad_norm": 0.26009321212768555, + "learning_rate": 2.7009615487888268e-05, + "loss": 0.134, + "step": 29543 + }, + { + "epoch": 0.5269503799093925, + "grad_norm": 0.31335973739624023, + "learning_rate": 2.7008064013389644e-05, + "loss": 0.1513, + "step": 29544 + }, + { + "epoch": 0.5269682160311062, + "grad_norm": 0.31468507647514343, + "learning_rate": 2.700651253110707e-05, + "loss": 0.1181, + "step": 29545 + }, + { + "epoch": 0.5269860521528199, + "grad_norm": 0.24568194150924683, + "learning_rate": 2.7004961041046566e-05, + "loss": 0.1637, + "step": 29546 + }, + { + "epoch": 0.5270038882745336, + "grad_norm": 0.3065956234931946, + "learning_rate": 2.7003409543214136e-05, + "loss": 0.1175, + "step": 29547 + }, + { + "epoch": 0.5270217243962473, + "grad_norm": 0.26989251375198364, + "learning_rate": 2.7001858037615796e-05, + "loss": 0.1209, + "step": 29548 + }, + { + "epoch": 0.527039560517961, + "grad_norm": 0.3446468114852905, + "learning_rate": 2.7000306524257564e-05, + "loss": 0.1534, + "step": 29549 + }, + { + "epoch": 0.5270573966396747, + "grad_norm": 0.27404987812042236, + "learning_rate": 2.699875500314546e-05, + "loss": 0.1546, + "step": 29550 + }, + { + "epoch": 0.5270752327613883, + "grad_norm": 0.28950831294059753, + "learning_rate": 2.699720347428548e-05, + "loss": 0.1027, + "step": 29551 + }, + { + "epoch": 0.527093068883102, + "grad_norm": 0.15191729366779327, + "learning_rate": 2.699565193768366e-05, + "loss": 0.1116, + "step": 29552 + }, + { + "epoch": 0.5271109050048157, + "grad_norm": 0.2926084101200104, + "learning_rate": 2.6994100393345993e-05, + "loss": 0.1555, + "step": 29553 + }, + { + "epoch": 0.5271287411265294, + "grad_norm": 0.3048332929611206, + "learning_rate": 2.699254884127852e-05, + "loss": 0.1286, + "step": 29554 + }, + { + "epoch": 0.5271465772482431, + "grad_norm": 0.23489142954349518, + "learning_rate": 2.6990997281487224e-05, + "loss": 0.1303, + "step": 29555 + }, + { + "epoch": 0.5271644133699568, + "grad_norm": 0.2898871600627899, + "learning_rate": 2.698944571397814e-05, + "loss": 0.1194, + "step": 29556 + }, + { + "epoch": 0.5271822494916706, + "grad_norm": 0.26908111572265625, + "learning_rate": 2.698789413875728e-05, + "loss": 0.1384, + "step": 29557 + }, + { + "epoch": 0.5272000856133843, + "grad_norm": 0.22471243143081665, + "learning_rate": 2.698634255583065e-05, + "loss": 0.1266, + "step": 29558 + }, + { + "epoch": 0.527217921735098, + "grad_norm": 0.22601579129695892, + "learning_rate": 2.698479096520428e-05, + "loss": 0.1509, + "step": 29559 + }, + { + "epoch": 0.5272357578568116, + "grad_norm": 0.23397766053676605, + "learning_rate": 2.6983239366884165e-05, + "loss": 0.1265, + "step": 29560 + }, + { + "epoch": 0.5272535939785253, + "grad_norm": 0.22585737705230713, + "learning_rate": 2.698168776087634e-05, + "loss": 0.1266, + "step": 29561 + }, + { + "epoch": 0.527271430100239, + "grad_norm": 0.2820310592651367, + "learning_rate": 2.6980136147186796e-05, + "loss": 0.1338, + "step": 29562 + }, + { + "epoch": 0.5272892662219527, + "grad_norm": 0.23659257590770721, + "learning_rate": 2.6978584525821566e-05, + "loss": 0.1454, + "step": 29563 + }, + { + "epoch": 0.5273071023436664, + "grad_norm": 0.2303420454263687, + "learning_rate": 2.6977032896786657e-05, + "loss": 0.1094, + "step": 29564 + }, + { + "epoch": 0.5273249384653801, + "grad_norm": 0.22334200143814087, + "learning_rate": 2.697548126008809e-05, + "loss": 0.0577, + "step": 29565 + }, + { + "epoch": 0.5273427745870938, + "grad_norm": 0.21660497784614563, + "learning_rate": 2.6973929615731873e-05, + "loss": 0.0816, + "step": 29566 + }, + { + "epoch": 0.5273606107088075, + "grad_norm": 0.38114750385284424, + "learning_rate": 2.6972377963724028e-05, + "loss": 0.1306, + "step": 29567 + }, + { + "epoch": 0.5273784468305212, + "grad_norm": 0.2799783945083618, + "learning_rate": 2.6970826304070558e-05, + "loss": 0.1357, + "step": 29568 + }, + { + "epoch": 0.5273962829522348, + "grad_norm": 0.23243758082389832, + "learning_rate": 2.696927463677748e-05, + "loss": 0.1711, + "step": 29569 + }, + { + "epoch": 0.5274141190739485, + "grad_norm": 0.23562775552272797, + "learning_rate": 2.696772296185082e-05, + "loss": 0.1615, + "step": 29570 + }, + { + "epoch": 0.5274319551956622, + "grad_norm": 0.20844227075576782, + "learning_rate": 2.696617127929659e-05, + "loss": 0.1214, + "step": 29571 + }, + { + "epoch": 0.5274497913173759, + "grad_norm": 0.23749417066574097, + "learning_rate": 2.6964619589120798e-05, + "loss": 0.1121, + "step": 29572 + }, + { + "epoch": 0.5274676274390896, + "grad_norm": 0.27115121483802795, + "learning_rate": 2.696306789132946e-05, + "loss": 0.1387, + "step": 29573 + }, + { + "epoch": 0.5274854635608034, + "grad_norm": 0.2638593912124634, + "learning_rate": 2.6961516185928597e-05, + "loss": 0.1229, + "step": 29574 + }, + { + "epoch": 0.5275032996825171, + "grad_norm": 0.31775814294815063, + "learning_rate": 2.6959964472924215e-05, + "loss": 0.0988, + "step": 29575 + }, + { + "epoch": 0.5275211358042308, + "grad_norm": 0.3245813846588135, + "learning_rate": 2.6958412752322333e-05, + "loss": 0.1409, + "step": 29576 + }, + { + "epoch": 0.5275389719259445, + "grad_norm": 0.24844306707382202, + "learning_rate": 2.6956861024128966e-05, + "loss": 0.178, + "step": 29577 + }, + { + "epoch": 0.5275568080476581, + "grad_norm": 0.23932389914989471, + "learning_rate": 2.6955309288350135e-05, + "loss": 0.1535, + "step": 29578 + }, + { + "epoch": 0.5275746441693718, + "grad_norm": 0.3015996515750885, + "learning_rate": 2.695375754499185e-05, + "loss": 0.1651, + "step": 29579 + }, + { + "epoch": 0.5275924802910855, + "grad_norm": 0.22868657112121582, + "learning_rate": 2.695220579406012e-05, + "loss": 0.1318, + "step": 29580 + }, + { + "epoch": 0.5276103164127992, + "grad_norm": 0.2838252782821655, + "learning_rate": 2.6950654035560967e-05, + "loss": 0.1814, + "step": 29581 + }, + { + "epoch": 0.5276281525345129, + "grad_norm": 0.2079337239265442, + "learning_rate": 2.6949102269500413e-05, + "loss": 0.1209, + "step": 29582 + }, + { + "epoch": 0.5276459886562266, + "grad_norm": 0.29225945472717285, + "learning_rate": 2.694755049588446e-05, + "loss": 0.1612, + "step": 29583 + }, + { + "epoch": 0.5276638247779403, + "grad_norm": 0.40615150332450867, + "learning_rate": 2.6945998714719127e-05, + "loss": 0.1316, + "step": 29584 + }, + { + "epoch": 0.527681660899654, + "grad_norm": 0.2622811794281006, + "learning_rate": 2.6944446926010436e-05, + "loss": 0.1351, + "step": 29585 + }, + { + "epoch": 0.5276994970213676, + "grad_norm": 0.23110252618789673, + "learning_rate": 2.694289512976439e-05, + "loss": 0.1074, + "step": 29586 + }, + { + "epoch": 0.5277173331430813, + "grad_norm": 0.2519962191581726, + "learning_rate": 2.6941343325987016e-05, + "loss": 0.1509, + "step": 29587 + }, + { + "epoch": 0.527735169264795, + "grad_norm": 0.335615873336792, + "learning_rate": 2.6939791514684325e-05, + "loss": 0.1742, + "step": 29588 + }, + { + "epoch": 0.5277530053865087, + "grad_norm": 0.24000419676303864, + "learning_rate": 2.6938239695862332e-05, + "loss": 0.1727, + "step": 29589 + }, + { + "epoch": 0.5277708415082224, + "grad_norm": 0.24141176044940948, + "learning_rate": 2.6936687869527056e-05, + "loss": 0.1193, + "step": 29590 + }, + { + "epoch": 0.5277886776299362, + "grad_norm": 0.40963640809059143, + "learning_rate": 2.69351360356845e-05, + "loss": 0.1632, + "step": 29591 + }, + { + "epoch": 0.5278065137516499, + "grad_norm": 0.27701905369758606, + "learning_rate": 2.693358419434069e-05, + "loss": 0.1825, + "step": 29592 + }, + { + "epoch": 0.5278243498733636, + "grad_norm": 0.23306919634342194, + "learning_rate": 2.6932032345501646e-05, + "loss": 0.1179, + "step": 29593 + }, + { + "epoch": 0.5278421859950773, + "grad_norm": 0.22983503341674805, + "learning_rate": 2.693048048917337e-05, + "loss": 0.1632, + "step": 29594 + }, + { + "epoch": 0.527860022116791, + "grad_norm": 0.26778826117515564, + "learning_rate": 2.6928928625361893e-05, + "loss": 0.0902, + "step": 29595 + }, + { + "epoch": 0.5278778582385046, + "grad_norm": 0.2940341532230377, + "learning_rate": 2.692737675407322e-05, + "loss": 0.1856, + "step": 29596 + }, + { + "epoch": 0.5278956943602183, + "grad_norm": 0.29812338948249817, + "learning_rate": 2.692582487531336e-05, + "loss": 0.1616, + "step": 29597 + }, + { + "epoch": 0.527913530481932, + "grad_norm": 0.19200968742370605, + "learning_rate": 2.692427298908835e-05, + "loss": 0.0886, + "step": 29598 + }, + { + "epoch": 0.5279313666036457, + "grad_norm": 0.28245770931243896, + "learning_rate": 2.6922721095404187e-05, + "loss": 0.1792, + "step": 29599 + }, + { + "epoch": 0.5279492027253594, + "grad_norm": 0.23266640305519104, + "learning_rate": 2.69211691942669e-05, + "loss": 0.1049, + "step": 29600 + }, + { + "epoch": 0.5279670388470731, + "grad_norm": 0.2918422222137451, + "learning_rate": 2.6919617285682487e-05, + "loss": 0.1096, + "step": 29601 + }, + { + "epoch": 0.5279848749687868, + "grad_norm": 0.3453911244869232, + "learning_rate": 2.6918065369656982e-05, + "loss": 0.1528, + "step": 29602 + }, + { + "epoch": 0.5280027110905005, + "grad_norm": 0.24862413108348846, + "learning_rate": 2.691651344619639e-05, + "loss": 0.1857, + "step": 29603 + }, + { + "epoch": 0.5280205472122141, + "grad_norm": 0.28764235973358154, + "learning_rate": 2.6914961515306726e-05, + "loss": 0.1483, + "step": 29604 + }, + { + "epoch": 0.5280383833339278, + "grad_norm": 0.2671939730644226, + "learning_rate": 2.6913409576994016e-05, + "loss": 0.1891, + "step": 29605 + }, + { + "epoch": 0.5280562194556415, + "grad_norm": 0.4348195493221283, + "learning_rate": 2.691185763126427e-05, + "loss": 0.13, + "step": 29606 + }, + { + "epoch": 0.5280740555773552, + "grad_norm": 0.32455480098724365, + "learning_rate": 2.6910305678123498e-05, + "loss": 0.1296, + "step": 29607 + }, + { + "epoch": 0.528091891699069, + "grad_norm": 0.38393643498420715, + "learning_rate": 2.6908753717577728e-05, + "loss": 0.1668, + "step": 29608 + }, + { + "epoch": 0.5281097278207827, + "grad_norm": 0.2736952006816864, + "learning_rate": 2.6907201749632966e-05, + "loss": 0.1148, + "step": 29609 + }, + { + "epoch": 0.5281275639424964, + "grad_norm": 0.34408512711524963, + "learning_rate": 2.6905649774295232e-05, + "loss": 0.1225, + "step": 29610 + }, + { + "epoch": 0.5281454000642101, + "grad_norm": 0.2795182168483734, + "learning_rate": 2.6904097791570538e-05, + "loss": 0.1444, + "step": 29611 + }, + { + "epoch": 0.5281632361859238, + "grad_norm": 0.24202504754066467, + "learning_rate": 2.69025458014649e-05, + "loss": 0.1344, + "step": 29612 + }, + { + "epoch": 0.5281810723076374, + "grad_norm": 0.48582082986831665, + "learning_rate": 2.6900993803984347e-05, + "loss": 0.1911, + "step": 29613 + }, + { + "epoch": 0.5281989084293511, + "grad_norm": 0.28955477476119995, + "learning_rate": 2.6899441799134885e-05, + "loss": 0.1472, + "step": 29614 + }, + { + "epoch": 0.5282167445510648, + "grad_norm": 0.3031627833843231, + "learning_rate": 2.6897889786922524e-05, + "loss": 0.1929, + "step": 29615 + }, + { + "epoch": 0.5282345806727785, + "grad_norm": 0.24173744022846222, + "learning_rate": 2.689633776735329e-05, + "loss": 0.1285, + "step": 29616 + }, + { + "epoch": 0.5282524167944922, + "grad_norm": 0.23135331273078918, + "learning_rate": 2.6894785740433198e-05, + "loss": 0.1228, + "step": 29617 + }, + { + "epoch": 0.5282702529162059, + "grad_norm": 0.2616306245326996, + "learning_rate": 2.6893233706168253e-05, + "loss": 0.1783, + "step": 29618 + }, + { + "epoch": 0.5282880890379196, + "grad_norm": 0.23648065328598022, + "learning_rate": 2.689168166456449e-05, + "loss": 0.0934, + "step": 29619 + }, + { + "epoch": 0.5283059251596333, + "grad_norm": 0.24911677837371826, + "learning_rate": 2.689012961562791e-05, + "loss": 0.104, + "step": 29620 + }, + { + "epoch": 0.528323761281347, + "grad_norm": 0.2668570578098297, + "learning_rate": 2.6888577559364542e-05, + "loss": 0.1233, + "step": 29621 + }, + { + "epoch": 0.5283415974030606, + "grad_norm": 0.3501882553100586, + "learning_rate": 2.6887025495780392e-05, + "loss": 0.1567, + "step": 29622 + }, + { + "epoch": 0.5283594335247743, + "grad_norm": 0.22387148439884186, + "learning_rate": 2.6885473424881475e-05, + "loss": 0.1439, + "step": 29623 + }, + { + "epoch": 0.528377269646488, + "grad_norm": 0.25209546089172363, + "learning_rate": 2.6883921346673813e-05, + "loss": 0.1373, + "step": 29624 + }, + { + "epoch": 0.5283951057682018, + "grad_norm": 0.25188687443733215, + "learning_rate": 2.688236926116342e-05, + "loss": 0.1448, + "step": 29625 + }, + { + "epoch": 0.5284129418899155, + "grad_norm": 0.3165016770362854, + "learning_rate": 2.688081716835632e-05, + "loss": 0.1273, + "step": 29626 + }, + { + "epoch": 0.5284307780116292, + "grad_norm": 0.2478540688753128, + "learning_rate": 2.687926506825852e-05, + "loss": 0.184, + "step": 29627 + }, + { + "epoch": 0.5284486141333429, + "grad_norm": 0.3281519114971161, + "learning_rate": 2.687771296087604e-05, + "loss": 0.1659, + "step": 29628 + }, + { + "epoch": 0.5284664502550566, + "grad_norm": 0.23523716628551483, + "learning_rate": 2.6876160846214892e-05, + "loss": 0.1309, + "step": 29629 + }, + { + "epoch": 0.5284842863767703, + "grad_norm": 0.4224299192428589, + "learning_rate": 2.6874608724281102e-05, + "loss": 0.1256, + "step": 29630 + }, + { + "epoch": 0.5285021224984839, + "grad_norm": 0.33960893750190735, + "learning_rate": 2.6873056595080674e-05, + "loss": 0.1542, + "step": 29631 + }, + { + "epoch": 0.5285199586201976, + "grad_norm": 0.16312552988529205, + "learning_rate": 2.687150445861964e-05, + "loss": 0.0935, + "step": 29632 + }, + { + "epoch": 0.5285377947419113, + "grad_norm": 0.2274068146944046, + "learning_rate": 2.6869952314904e-05, + "loss": 0.119, + "step": 29633 + }, + { + "epoch": 0.528555630863625, + "grad_norm": 0.2133219838142395, + "learning_rate": 2.686840016393979e-05, + "loss": 0.1243, + "step": 29634 + }, + { + "epoch": 0.5285734669853387, + "grad_norm": 0.27035537362098694, + "learning_rate": 2.686684800573301e-05, + "loss": 0.1155, + "step": 29635 + }, + { + "epoch": 0.5285913031070524, + "grad_norm": 0.2978620231151581, + "learning_rate": 2.686529584028968e-05, + "loss": 0.104, + "step": 29636 + }, + { + "epoch": 0.5286091392287661, + "grad_norm": 0.37502193450927734, + "learning_rate": 2.686374366761582e-05, + "loss": 0.1485, + "step": 29637 + }, + { + "epoch": 0.5286269753504798, + "grad_norm": 0.28957220911979675, + "learning_rate": 2.6862191487717448e-05, + "loss": 0.13, + "step": 29638 + }, + { + "epoch": 0.5286448114721934, + "grad_norm": 0.2958044409751892, + "learning_rate": 2.6860639300600577e-05, + "loss": 0.1565, + "step": 29639 + }, + { + "epoch": 0.5286626475939071, + "grad_norm": 0.2503563463687897, + "learning_rate": 2.685908710627122e-05, + "loss": 0.1419, + "step": 29640 + }, + { + "epoch": 0.5286804837156209, + "grad_norm": 0.2666504681110382, + "learning_rate": 2.685753490473541e-05, + "loss": 0.1417, + "step": 29641 + }, + { + "epoch": 0.5286983198373346, + "grad_norm": 0.33652621507644653, + "learning_rate": 2.6855982695999142e-05, + "loss": 0.141, + "step": 29642 + }, + { + "epoch": 0.5287161559590483, + "grad_norm": 0.2587040960788727, + "learning_rate": 2.6854430480068456e-05, + "loss": 0.1308, + "step": 29643 + }, + { + "epoch": 0.528733992080762, + "grad_norm": 0.31061822175979614, + "learning_rate": 2.685287825694935e-05, + "loss": 0.1143, + "step": 29644 + }, + { + "epoch": 0.5287518282024757, + "grad_norm": 0.3000825345516205, + "learning_rate": 2.685132602664785e-05, + "loss": 0.1805, + "step": 29645 + }, + { + "epoch": 0.5287696643241894, + "grad_norm": 0.1863195300102234, + "learning_rate": 2.6849773789169963e-05, + "loss": 0.0964, + "step": 29646 + }, + { + "epoch": 0.5287875004459031, + "grad_norm": 0.23746301233768463, + "learning_rate": 2.6848221544521722e-05, + "loss": 0.1174, + "step": 29647 + }, + { + "epoch": 0.5288053365676167, + "grad_norm": 0.23783206939697266, + "learning_rate": 2.6846669292709135e-05, + "loss": 0.1421, + "step": 29648 + }, + { + "epoch": 0.5288231726893304, + "grad_norm": 0.25436627864837646, + "learning_rate": 2.684511703373822e-05, + "loss": 0.1211, + "step": 29649 + }, + { + "epoch": 0.5288410088110441, + "grad_norm": 0.2817104160785675, + "learning_rate": 2.6843564767615e-05, + "loss": 0.1881, + "step": 29650 + }, + { + "epoch": 0.5288588449327578, + "grad_norm": 0.30766186118125916, + "learning_rate": 2.6842012494345475e-05, + "loss": 0.1358, + "step": 29651 + }, + { + "epoch": 0.5288766810544715, + "grad_norm": 0.20351248979568481, + "learning_rate": 2.6840460213935676e-05, + "loss": 0.1352, + "step": 29652 + }, + { + "epoch": 0.5288945171761852, + "grad_norm": 0.21055904030799866, + "learning_rate": 2.6838907926391614e-05, + "loss": 0.1063, + "step": 29653 + }, + { + "epoch": 0.5289123532978989, + "grad_norm": 0.35401269793510437, + "learning_rate": 2.6837355631719325e-05, + "loss": 0.2398, + "step": 29654 + }, + { + "epoch": 0.5289301894196126, + "grad_norm": 0.2958665192127228, + "learning_rate": 2.6835803329924798e-05, + "loss": 0.1446, + "step": 29655 + }, + { + "epoch": 0.5289480255413262, + "grad_norm": 0.27006688714027405, + "learning_rate": 2.683425102101407e-05, + "loss": 0.161, + "step": 29656 + }, + { + "epoch": 0.5289658616630399, + "grad_norm": 0.20635706186294556, + "learning_rate": 2.683269870499314e-05, + "loss": 0.096, + "step": 29657 + }, + { + "epoch": 0.5289836977847537, + "grad_norm": 0.22154195606708527, + "learning_rate": 2.6831146381868054e-05, + "loss": 0.1421, + "step": 29658 + }, + { + "epoch": 0.5290015339064674, + "grad_norm": 0.23413850367069244, + "learning_rate": 2.6829594051644803e-05, + "loss": 0.1423, + "step": 29659 + }, + { + "epoch": 0.5290193700281811, + "grad_norm": 0.24983985722064972, + "learning_rate": 2.682804171432941e-05, + "loss": 0.1313, + "step": 29660 + }, + { + "epoch": 0.5290372061498948, + "grad_norm": 0.33375123143196106, + "learning_rate": 2.68264893699279e-05, + "loss": 0.2191, + "step": 29661 + }, + { + "epoch": 0.5290550422716085, + "grad_norm": 0.27024921774864197, + "learning_rate": 2.6824937018446283e-05, + "loss": 0.1562, + "step": 29662 + }, + { + "epoch": 0.5290728783933222, + "grad_norm": 0.25165677070617676, + "learning_rate": 2.6823384659890587e-05, + "loss": 0.0682, + "step": 29663 + }, + { + "epoch": 0.5290907145150359, + "grad_norm": 0.2871391773223877, + "learning_rate": 2.6821832294266817e-05, + "loss": 0.1407, + "step": 29664 + }, + { + "epoch": 0.5291085506367496, + "grad_norm": 0.2757948637008667, + "learning_rate": 2.6820279921581e-05, + "loss": 0.1626, + "step": 29665 + }, + { + "epoch": 0.5291263867584632, + "grad_norm": 0.2632501423358917, + "learning_rate": 2.6818727541839145e-05, + "loss": 0.1607, + "step": 29666 + }, + { + "epoch": 0.5291442228801769, + "grad_norm": 0.3199211359024048, + "learning_rate": 2.681717515504727e-05, + "loss": 0.1263, + "step": 29667 + }, + { + "epoch": 0.5291620590018906, + "grad_norm": 0.3743148446083069, + "learning_rate": 2.68156227612114e-05, + "loss": 0.1301, + "step": 29668 + }, + { + "epoch": 0.5291798951236043, + "grad_norm": 0.2938719689846039, + "learning_rate": 2.6814070360337556e-05, + "loss": 0.146, + "step": 29669 + }, + { + "epoch": 0.529197731245318, + "grad_norm": 0.28218135237693787, + "learning_rate": 2.6812517952431738e-05, + "loss": 0.139, + "step": 29670 + }, + { + "epoch": 0.5292155673670317, + "grad_norm": 0.31690269708633423, + "learning_rate": 2.6810965537499982e-05, + "loss": 0.1806, + "step": 29671 + }, + { + "epoch": 0.5292334034887454, + "grad_norm": 0.22488749027252197, + "learning_rate": 2.6809413115548298e-05, + "loss": 0.1303, + "step": 29672 + }, + { + "epoch": 0.5292512396104591, + "grad_norm": 0.2351931631565094, + "learning_rate": 2.6807860686582696e-05, + "loss": 0.1121, + "step": 29673 + }, + { + "epoch": 0.5292690757321727, + "grad_norm": 0.23707011342048645, + "learning_rate": 2.6806308250609212e-05, + "loss": 0.178, + "step": 29674 + }, + { + "epoch": 0.5292869118538865, + "grad_norm": 0.26332613825798035, + "learning_rate": 2.6804755807633848e-05, + "loss": 0.1472, + "step": 29675 + }, + { + "epoch": 0.5293047479756002, + "grad_norm": 0.2231779396533966, + "learning_rate": 2.6803203357662633e-05, + "loss": 0.0987, + "step": 29676 + }, + { + "epoch": 0.5293225840973139, + "grad_norm": 0.24613699316978455, + "learning_rate": 2.680165090070157e-05, + "loss": 0.0976, + "step": 29677 + }, + { + "epoch": 0.5293404202190276, + "grad_norm": 0.20408323407173157, + "learning_rate": 2.6800098436756692e-05, + "loss": 0.1107, + "step": 29678 + }, + { + "epoch": 0.5293582563407413, + "grad_norm": 0.24747255444526672, + "learning_rate": 2.6798545965834005e-05, + "loss": 0.1432, + "step": 29679 + }, + { + "epoch": 0.529376092462455, + "grad_norm": 0.2198696732521057, + "learning_rate": 2.679699348793953e-05, + "loss": 0.131, + "step": 29680 + }, + { + "epoch": 0.5293939285841687, + "grad_norm": 0.2480219304561615, + "learning_rate": 2.6795441003079297e-05, + "loss": 0.188, + "step": 29681 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.22994859516620636, + "learning_rate": 2.6793888511259314e-05, + "loss": 0.1038, + "step": 29682 + }, + { + "epoch": 0.529429600827596, + "grad_norm": 0.29792824387550354, + "learning_rate": 2.679233601248559e-05, + "loss": 0.1203, + "step": 29683 + }, + { + "epoch": 0.5294474369493097, + "grad_norm": 0.21882866322994232, + "learning_rate": 2.6790783506764167e-05, + "loss": 0.1758, + "step": 29684 + }, + { + "epoch": 0.5294652730710234, + "grad_norm": 0.275244802236557, + "learning_rate": 2.6789230994101035e-05, + "loss": 0.1348, + "step": 29685 + }, + { + "epoch": 0.5294831091927371, + "grad_norm": 0.28200778365135193, + "learning_rate": 2.6787678474502237e-05, + "loss": 0.1247, + "step": 29686 + }, + { + "epoch": 0.5295009453144508, + "grad_norm": 0.3244102895259857, + "learning_rate": 2.678612594797377e-05, + "loss": 0.1275, + "step": 29687 + }, + { + "epoch": 0.5295187814361645, + "grad_norm": 0.31253206729888916, + "learning_rate": 2.6784573414521662e-05, + "loss": 0.1195, + "step": 29688 + }, + { + "epoch": 0.5295366175578782, + "grad_norm": 0.21333463490009308, + "learning_rate": 2.6783020874151943e-05, + "loss": 0.1227, + "step": 29689 + }, + { + "epoch": 0.5295544536795919, + "grad_norm": 0.29453352093696594, + "learning_rate": 2.6781468326870607e-05, + "loss": 0.1571, + "step": 29690 + }, + { + "epoch": 0.5295722898013056, + "grad_norm": 0.3719899654388428, + "learning_rate": 2.6779915772683694e-05, + "loss": 0.1878, + "step": 29691 + }, + { + "epoch": 0.5295901259230194, + "grad_norm": 0.20396637916564941, + "learning_rate": 2.6778363211597202e-05, + "loss": 0.0886, + "step": 29692 + }, + { + "epoch": 0.529607962044733, + "grad_norm": 0.26895245909690857, + "learning_rate": 2.6776810643617173e-05, + "loss": 0.1078, + "step": 29693 + }, + { + "epoch": 0.5296257981664467, + "grad_norm": 0.2998652458190918, + "learning_rate": 2.6775258068749598e-05, + "loss": 0.1118, + "step": 29694 + }, + { + "epoch": 0.5296436342881604, + "grad_norm": 0.3036497235298157, + "learning_rate": 2.6773705487000517e-05, + "loss": 0.1487, + "step": 29695 + }, + { + "epoch": 0.5296614704098741, + "grad_norm": 0.2659032642841339, + "learning_rate": 2.6772152898375934e-05, + "loss": 0.1517, + "step": 29696 + }, + { + "epoch": 0.5296793065315878, + "grad_norm": 0.2947097420692444, + "learning_rate": 2.6770600302881886e-05, + "loss": 0.1667, + "step": 29697 + }, + { + "epoch": 0.5296971426533015, + "grad_norm": 0.3916162848472595, + "learning_rate": 2.6769047700524375e-05, + "loss": 0.1186, + "step": 29698 + }, + { + "epoch": 0.5297149787750152, + "grad_norm": 0.3474830687046051, + "learning_rate": 2.6767495091309424e-05, + "loss": 0.1478, + "step": 29699 + }, + { + "epoch": 0.5297328148967289, + "grad_norm": 0.32286831736564636, + "learning_rate": 2.676594247524305e-05, + "loss": 0.1396, + "step": 29700 + }, + { + "epoch": 0.5297506510184425, + "grad_norm": 0.43405163288116455, + "learning_rate": 2.6764389852331275e-05, + "loss": 0.1474, + "step": 29701 + }, + { + "epoch": 0.5297684871401562, + "grad_norm": 0.2987808287143707, + "learning_rate": 2.676283722258011e-05, + "loss": 0.1519, + "step": 29702 + }, + { + "epoch": 0.5297863232618699, + "grad_norm": 0.2750794291496277, + "learning_rate": 2.6761284585995584e-05, + "loss": 0.175, + "step": 29703 + }, + { + "epoch": 0.5298041593835836, + "grad_norm": 0.238461434841156, + "learning_rate": 2.6759731942583714e-05, + "loss": 0.1357, + "step": 29704 + }, + { + "epoch": 0.5298219955052973, + "grad_norm": 0.3430945873260498, + "learning_rate": 2.675817929235051e-05, + "loss": 0.1359, + "step": 29705 + }, + { + "epoch": 0.529839831627011, + "grad_norm": 0.2710132300853729, + "learning_rate": 2.6756626635302e-05, + "loss": 0.1154, + "step": 29706 + }, + { + "epoch": 0.5298576677487247, + "grad_norm": 0.2584246098995209, + "learning_rate": 2.6755073971444195e-05, + "loss": 0.0837, + "step": 29707 + }, + { + "epoch": 0.5298755038704384, + "grad_norm": 0.20218966901302338, + "learning_rate": 2.6753521300783112e-05, + "loss": 0.046, + "step": 29708 + }, + { + "epoch": 0.5298933399921522, + "grad_norm": 0.3276652693748474, + "learning_rate": 2.675196862332478e-05, + "loss": 0.1813, + "step": 29709 + }, + { + "epoch": 0.5299111761138658, + "grad_norm": 0.6983379125595093, + "learning_rate": 2.6750415939075218e-05, + "loss": 0.1238, + "step": 29710 + }, + { + "epoch": 0.5299290122355795, + "grad_norm": 0.24598878622055054, + "learning_rate": 2.674886324804043e-05, + "loss": 0.1885, + "step": 29711 + }, + { + "epoch": 0.5299468483572932, + "grad_norm": 0.2559693157672882, + "learning_rate": 2.674731055022645e-05, + "loss": 0.1202, + "step": 29712 + }, + { + "epoch": 0.5299646844790069, + "grad_norm": 0.24974419176578522, + "learning_rate": 2.6745757845639297e-05, + "loss": 0.1501, + "step": 29713 + }, + { + "epoch": 0.5299825206007206, + "grad_norm": 0.2565857768058777, + "learning_rate": 2.6744205134284972e-05, + "loss": 0.0881, + "step": 29714 + }, + { + "epoch": 0.5300003567224343, + "grad_norm": 0.25720903277397156, + "learning_rate": 2.6742652416169506e-05, + "loss": 0.1274, + "step": 29715 + }, + { + "epoch": 0.530018192844148, + "grad_norm": 0.22872160375118256, + "learning_rate": 2.6741099691298914e-05, + "loss": 0.0985, + "step": 29716 + }, + { + "epoch": 0.5300360289658617, + "grad_norm": 0.25498610734939575, + "learning_rate": 2.6739546959679228e-05, + "loss": 0.0875, + "step": 29717 + }, + { + "epoch": 0.5300538650875753, + "grad_norm": 0.34247514605522156, + "learning_rate": 2.6737994221316452e-05, + "loss": 0.1209, + "step": 29718 + }, + { + "epoch": 0.530071701209289, + "grad_norm": 0.24290986359119415, + "learning_rate": 2.6736441476216616e-05, + "loss": 0.1383, + "step": 29719 + }, + { + "epoch": 0.5300895373310027, + "grad_norm": 0.28888919949531555, + "learning_rate": 2.6734888724385732e-05, + "loss": 0.1683, + "step": 29720 + }, + { + "epoch": 0.5301073734527164, + "grad_norm": 0.30773797631263733, + "learning_rate": 2.673333596582982e-05, + "loss": 0.1082, + "step": 29721 + }, + { + "epoch": 0.5301252095744301, + "grad_norm": 0.2061411738395691, + "learning_rate": 2.6731783200554895e-05, + "loss": 0.1251, + "step": 29722 + }, + { + "epoch": 0.5301430456961438, + "grad_norm": 0.3899582624435425, + "learning_rate": 2.6730230428566978e-05, + "loss": 0.1771, + "step": 29723 + }, + { + "epoch": 0.5301608818178575, + "grad_norm": 0.2877986431121826, + "learning_rate": 2.672867764987209e-05, + "loss": 0.1336, + "step": 29724 + }, + { + "epoch": 0.5301787179395712, + "grad_norm": 0.24312570691108704, + "learning_rate": 2.6727124864476255e-05, + "loss": 0.1526, + "step": 29725 + }, + { + "epoch": 0.530196554061285, + "grad_norm": 0.27368050813674927, + "learning_rate": 2.6725572072385485e-05, + "loss": 0.0723, + "step": 29726 + }, + { + "epoch": 0.5302143901829987, + "grad_norm": 0.227010577917099, + "learning_rate": 2.672401927360581e-05, + "loss": 0.0941, + "step": 29727 + }, + { + "epoch": 0.5302322263047123, + "grad_norm": 0.3303409516811371, + "learning_rate": 2.6722466468143232e-05, + "loss": 0.1767, + "step": 29728 + }, + { + "epoch": 0.530250062426426, + "grad_norm": 0.21052126586437225, + "learning_rate": 2.6720913656003778e-05, + "loss": 0.1243, + "step": 29729 + }, + { + "epoch": 0.5302678985481397, + "grad_norm": 0.2581946551799774, + "learning_rate": 2.6719360837193475e-05, + "loss": 0.1482, + "step": 29730 + }, + { + "epoch": 0.5302857346698534, + "grad_norm": 0.22533701360225677, + "learning_rate": 2.6717808011718327e-05, + "loss": 0.1331, + "step": 29731 + }, + { + "epoch": 0.5303035707915671, + "grad_norm": 0.26006996631622314, + "learning_rate": 2.6716255179584372e-05, + "loss": 0.1714, + "step": 29732 + }, + { + "epoch": 0.5303214069132808, + "grad_norm": 0.27289044857025146, + "learning_rate": 2.671470234079761e-05, + "loss": 0.1056, + "step": 29733 + }, + { + "epoch": 0.5303392430349945, + "grad_norm": 0.37725868821144104, + "learning_rate": 2.6713149495364076e-05, + "loss": 0.1461, + "step": 29734 + }, + { + "epoch": 0.5303570791567082, + "grad_norm": 0.2387055903673172, + "learning_rate": 2.6711596643289783e-05, + "loss": 0.1111, + "step": 29735 + }, + { + "epoch": 0.5303749152784218, + "grad_norm": 0.28241604566574097, + "learning_rate": 2.671004378458074e-05, + "loss": 0.174, + "step": 29736 + }, + { + "epoch": 0.5303927514001355, + "grad_norm": 0.25692611932754517, + "learning_rate": 2.6708490919242983e-05, + "loss": 0.1295, + "step": 29737 + }, + { + "epoch": 0.5304105875218492, + "grad_norm": 0.24634061753749847, + "learning_rate": 2.6706938047282532e-05, + "loss": 0.1471, + "step": 29738 + }, + { + "epoch": 0.5304284236435629, + "grad_norm": 0.22360451519489288, + "learning_rate": 2.6705385168705388e-05, + "loss": 0.1389, + "step": 29739 + }, + { + "epoch": 0.5304462597652766, + "grad_norm": 0.26613274216651917, + "learning_rate": 2.6703832283517595e-05, + "loss": 0.12, + "step": 29740 + }, + { + "epoch": 0.5304640958869903, + "grad_norm": 0.32375869154930115, + "learning_rate": 2.6702279391725155e-05, + "loss": 0.1344, + "step": 29741 + }, + { + "epoch": 0.5304819320087041, + "grad_norm": 0.2760258913040161, + "learning_rate": 2.6700726493334087e-05, + "loss": 0.1294, + "step": 29742 + }, + { + "epoch": 0.5304997681304178, + "grad_norm": 0.3204035758972168, + "learning_rate": 2.6699173588350416e-05, + "loss": 0.1836, + "step": 29743 + }, + { + "epoch": 0.5305176042521315, + "grad_norm": 0.28693145513534546, + "learning_rate": 2.669762067678016e-05, + "loss": 0.1686, + "step": 29744 + }, + { + "epoch": 0.5305354403738451, + "grad_norm": 0.35571402311325073, + "learning_rate": 2.669606775862935e-05, + "loss": 0.1634, + "step": 29745 + }, + { + "epoch": 0.5305532764955588, + "grad_norm": 0.19635014235973358, + "learning_rate": 2.6694514833903982e-05, + "loss": 0.0426, + "step": 29746 + }, + { + "epoch": 0.5305711126172725, + "grad_norm": 0.5483512282371521, + "learning_rate": 2.6692961902610103e-05, + "loss": 0.2172, + "step": 29747 + }, + { + "epoch": 0.5305889487389862, + "grad_norm": 0.20793403685092926, + "learning_rate": 2.669140896475371e-05, + "loss": 0.0944, + "step": 29748 + }, + { + "epoch": 0.5306067848606999, + "grad_norm": 0.2743823826313019, + "learning_rate": 2.6689856020340835e-05, + "loss": 0.1246, + "step": 29749 + }, + { + "epoch": 0.5306246209824136, + "grad_norm": 0.27652284502983093, + "learning_rate": 2.6688303069377492e-05, + "loss": 0.1256, + "step": 29750 + }, + { + "epoch": 0.5306424571041273, + "grad_norm": 0.24813875555992126, + "learning_rate": 2.66867501118697e-05, + "loss": 0.1316, + "step": 29751 + }, + { + "epoch": 0.530660293225841, + "grad_norm": 0.19636662304401398, + "learning_rate": 2.6685197147823487e-05, + "loss": 0.1251, + "step": 29752 + }, + { + "epoch": 0.5306781293475547, + "grad_norm": 0.3265813887119293, + "learning_rate": 2.668364417724487e-05, + "loss": 0.1243, + "step": 29753 + }, + { + "epoch": 0.5306959654692683, + "grad_norm": 0.28732502460479736, + "learning_rate": 2.668209120013987e-05, + "loss": 0.1251, + "step": 29754 + }, + { + "epoch": 0.530713801590982, + "grad_norm": 0.2857404351234436, + "learning_rate": 2.6680538216514493e-05, + "loss": 0.1169, + "step": 29755 + }, + { + "epoch": 0.5307316377126957, + "grad_norm": 0.2612438201904297, + "learning_rate": 2.6678985226374775e-05, + "loss": 0.0855, + "step": 29756 + }, + { + "epoch": 0.5307494738344094, + "grad_norm": 0.338833212852478, + "learning_rate": 2.6677432229726723e-05, + "loss": 0.1922, + "step": 29757 + }, + { + "epoch": 0.5307673099561231, + "grad_norm": 0.1790684461593628, + "learning_rate": 2.667587922657638e-05, + "loss": 0.1214, + "step": 29758 + }, + { + "epoch": 0.5307851460778369, + "grad_norm": 0.28192517161369324, + "learning_rate": 2.6674326216929736e-05, + "loss": 0.1728, + "step": 29759 + }, + { + "epoch": 0.5308029821995506, + "grad_norm": 0.2998497784137726, + "learning_rate": 2.6672773200792832e-05, + "loss": 0.1297, + "step": 29760 + }, + { + "epoch": 0.5308208183212643, + "grad_norm": 0.16805927455425262, + "learning_rate": 2.667122017817168e-05, + "loss": 0.1043, + "step": 29761 + }, + { + "epoch": 0.530838654442978, + "grad_norm": 0.2797226011753082, + "learning_rate": 2.6669667149072303e-05, + "loss": 0.1465, + "step": 29762 + }, + { + "epoch": 0.5308564905646916, + "grad_norm": 0.27864915132522583, + "learning_rate": 2.6668114113500715e-05, + "loss": 0.1243, + "step": 29763 + }, + { + "epoch": 0.5308743266864053, + "grad_norm": 0.2488710731267929, + "learning_rate": 2.6666561071462943e-05, + "loss": 0.1477, + "step": 29764 + }, + { + "epoch": 0.530892162808119, + "grad_norm": 0.25625959038734436, + "learning_rate": 2.6665008022965e-05, + "loss": 0.1583, + "step": 29765 + }, + { + "epoch": 0.5309099989298327, + "grad_norm": 0.24668513238430023, + "learning_rate": 2.666345496801292e-05, + "loss": 0.1245, + "step": 29766 + }, + { + "epoch": 0.5309278350515464, + "grad_norm": 0.3247588872909546, + "learning_rate": 2.6661901906612712e-05, + "loss": 0.0717, + "step": 29767 + }, + { + "epoch": 0.5309456711732601, + "grad_norm": 0.24523504078388214, + "learning_rate": 2.6660348838770392e-05, + "loss": 0.1394, + "step": 29768 + }, + { + "epoch": 0.5309635072949738, + "grad_norm": 0.3074474334716797, + "learning_rate": 2.6658795764491995e-05, + "loss": 0.1392, + "step": 29769 + }, + { + "epoch": 0.5309813434166875, + "grad_norm": 0.24126280844211578, + "learning_rate": 2.6657242683783523e-05, + "loss": 0.1548, + "step": 29770 + }, + { + "epoch": 0.5309991795384011, + "grad_norm": 0.21336881816387177, + "learning_rate": 2.6655689596651014e-05, + "loss": 0.0555, + "step": 29771 + }, + { + "epoch": 0.5310170156601148, + "grad_norm": 0.23083241283893585, + "learning_rate": 2.665413650310047e-05, + "loss": 0.1052, + "step": 29772 + }, + { + "epoch": 0.5310348517818285, + "grad_norm": 0.42378973960876465, + "learning_rate": 2.665258340313793e-05, + "loss": 0.1737, + "step": 29773 + }, + { + "epoch": 0.5310526879035422, + "grad_norm": 0.2683773338794708, + "learning_rate": 2.6651030296769402e-05, + "loss": 0.0945, + "step": 29774 + }, + { + "epoch": 0.5310705240252559, + "grad_norm": 0.25914040207862854, + "learning_rate": 2.664947718400092e-05, + "loss": 0.1695, + "step": 29775 + }, + { + "epoch": 0.5310883601469697, + "grad_norm": 0.2754868268966675, + "learning_rate": 2.6647924064838487e-05, + "loss": 0.1908, + "step": 29776 + }, + { + "epoch": 0.5311061962686834, + "grad_norm": 0.277584046125412, + "learning_rate": 2.6646370939288128e-05, + "loss": 0.1157, + "step": 29777 + }, + { + "epoch": 0.5311240323903971, + "grad_norm": 0.2447315752506256, + "learning_rate": 2.6644817807355867e-05, + "loss": 0.1698, + "step": 29778 + }, + { + "epoch": 0.5311418685121108, + "grad_norm": 0.25382477045059204, + "learning_rate": 2.6643264669047728e-05, + "loss": 0.1165, + "step": 29779 + }, + { + "epoch": 0.5311597046338244, + "grad_norm": 0.22990615665912628, + "learning_rate": 2.6641711524369735e-05, + "loss": 0.1301, + "step": 29780 + }, + { + "epoch": 0.5311775407555381, + "grad_norm": 0.27463415265083313, + "learning_rate": 2.664015837332789e-05, + "loss": 0.122, + "step": 29781 + }, + { + "epoch": 0.5311953768772518, + "grad_norm": 0.28555572032928467, + "learning_rate": 2.663860521592823e-05, + "loss": 0.1084, + "step": 29782 + }, + { + "epoch": 0.5312132129989655, + "grad_norm": 0.26648399233818054, + "learning_rate": 2.6637052052176764e-05, + "loss": 0.1537, + "step": 29783 + }, + { + "epoch": 0.5312310491206792, + "grad_norm": 0.35596129298210144, + "learning_rate": 2.6635498882079522e-05, + "loss": 0.1812, + "step": 29784 + }, + { + "epoch": 0.5312488852423929, + "grad_norm": 0.3660792112350464, + "learning_rate": 2.663394570564252e-05, + "loss": 0.2194, + "step": 29785 + }, + { + "epoch": 0.5312667213641066, + "grad_norm": 0.35417160391807556, + "learning_rate": 2.6632392522871786e-05, + "loss": 0.1558, + "step": 29786 + }, + { + "epoch": 0.5312845574858203, + "grad_norm": 0.2802301347255707, + "learning_rate": 2.663083933377333e-05, + "loss": 0.1853, + "step": 29787 + }, + { + "epoch": 0.531302393607534, + "grad_norm": 0.20734675228595734, + "learning_rate": 2.6629286138353184e-05, + "loss": 0.1531, + "step": 29788 + }, + { + "epoch": 0.5313202297292476, + "grad_norm": 0.28117266297340393, + "learning_rate": 2.662773293661735e-05, + "loss": 0.1478, + "step": 29789 + }, + { + "epoch": 0.5313380658509613, + "grad_norm": 0.27478399872779846, + "learning_rate": 2.662617972857187e-05, + "loss": 0.1511, + "step": 29790 + }, + { + "epoch": 0.531355901972675, + "grad_norm": 0.22104164958000183, + "learning_rate": 2.662462651422275e-05, + "loss": 0.138, + "step": 29791 + }, + { + "epoch": 0.5313737380943887, + "grad_norm": 0.23585538566112518, + "learning_rate": 2.6623073293576018e-05, + "loss": 0.1246, + "step": 29792 + }, + { + "epoch": 0.5313915742161025, + "grad_norm": 0.22908110916614532, + "learning_rate": 2.6621520066637702e-05, + "loss": 0.1173, + "step": 29793 + }, + { + "epoch": 0.5314094103378162, + "grad_norm": 0.29057371616363525, + "learning_rate": 2.6619966833413802e-05, + "loss": 0.1129, + "step": 29794 + }, + { + "epoch": 0.5314272464595299, + "grad_norm": 0.23905280232429504, + "learning_rate": 2.6618413593910363e-05, + "loss": 0.1638, + "step": 29795 + }, + { + "epoch": 0.5314450825812436, + "grad_norm": 0.2564665675163269, + "learning_rate": 2.6616860348133384e-05, + "loss": 0.0953, + "step": 29796 + }, + { + "epoch": 0.5314629187029573, + "grad_norm": 0.2837539613246918, + "learning_rate": 2.6615307096088903e-05, + "loss": 0.1605, + "step": 29797 + }, + { + "epoch": 0.5314807548246709, + "grad_norm": 0.29691603779792786, + "learning_rate": 2.6613753837782928e-05, + "loss": 0.0925, + "step": 29798 + }, + { + "epoch": 0.5314985909463846, + "grad_norm": 0.2663421332836151, + "learning_rate": 2.6612200573221486e-05, + "loss": 0.1285, + "step": 29799 + }, + { + "epoch": 0.5315164270680983, + "grad_norm": 0.3599540889263153, + "learning_rate": 2.6610647302410597e-05, + "loss": 0.1512, + "step": 29800 + }, + { + "epoch": 0.531534263189812, + "grad_norm": 0.23450668156147003, + "learning_rate": 2.6609094025356286e-05, + "loss": 0.1182, + "step": 29801 + }, + { + "epoch": 0.5315520993115257, + "grad_norm": 0.2937523126602173, + "learning_rate": 2.6607540742064567e-05, + "loss": 0.1358, + "step": 29802 + }, + { + "epoch": 0.5315699354332394, + "grad_norm": 0.22504177689552307, + "learning_rate": 2.6605987452541474e-05, + "loss": 0.1122, + "step": 29803 + }, + { + "epoch": 0.5315877715549531, + "grad_norm": 0.2189985066652298, + "learning_rate": 2.6604434156793014e-05, + "loss": 0.0763, + "step": 29804 + }, + { + "epoch": 0.5316056076766668, + "grad_norm": 0.2387552559375763, + "learning_rate": 2.6602880854825208e-05, + "loss": 0.0898, + "step": 29805 + }, + { + "epoch": 0.5316234437983804, + "grad_norm": 0.23725664615631104, + "learning_rate": 2.6601327546644083e-05, + "loss": 0.1453, + "step": 29806 + }, + { + "epoch": 0.5316412799200941, + "grad_norm": 0.2280396670103073, + "learning_rate": 2.6599774232255663e-05, + "loss": 0.1296, + "step": 29807 + }, + { + "epoch": 0.5316591160418078, + "grad_norm": 0.3512893319129944, + "learning_rate": 2.6598220911665973e-05, + "loss": 0.0848, + "step": 29808 + }, + { + "epoch": 0.5316769521635215, + "grad_norm": 0.26400449872016907, + "learning_rate": 2.6596667584881012e-05, + "loss": 0.1198, + "step": 29809 + }, + { + "epoch": 0.5316947882852353, + "grad_norm": 0.368965744972229, + "learning_rate": 2.6595114251906827e-05, + "loss": 0.1685, + "step": 29810 + }, + { + "epoch": 0.531712624406949, + "grad_norm": 0.25596120953559875, + "learning_rate": 2.6593560912749422e-05, + "loss": 0.1499, + "step": 29811 + }, + { + "epoch": 0.5317304605286627, + "grad_norm": 0.3207376301288605, + "learning_rate": 2.6592007567414823e-05, + "loss": 0.1718, + "step": 29812 + }, + { + "epoch": 0.5317482966503764, + "grad_norm": 0.2451477348804474, + "learning_rate": 2.6590454215909054e-05, + "loss": 0.1451, + "step": 29813 + }, + { + "epoch": 0.5317661327720901, + "grad_norm": 0.24433328211307526, + "learning_rate": 2.658890085823814e-05, + "loss": 0.1233, + "step": 29814 + }, + { + "epoch": 0.5317839688938037, + "grad_norm": 0.21517139673233032, + "learning_rate": 2.6587347494408094e-05, + "loss": 0.1412, + "step": 29815 + }, + { + "epoch": 0.5318018050155174, + "grad_norm": 0.2929937243461609, + "learning_rate": 2.6585794124424944e-05, + "loss": 0.1371, + "step": 29816 + }, + { + "epoch": 0.5318196411372311, + "grad_norm": 0.3339700400829315, + "learning_rate": 2.6584240748294704e-05, + "loss": 0.1404, + "step": 29817 + }, + { + "epoch": 0.5318374772589448, + "grad_norm": 0.2703548073768616, + "learning_rate": 2.6582687366023407e-05, + "loss": 0.1635, + "step": 29818 + }, + { + "epoch": 0.5318553133806585, + "grad_norm": 0.21247000992298126, + "learning_rate": 2.6581133977617058e-05, + "loss": 0.1018, + "step": 29819 + }, + { + "epoch": 0.5318731495023722, + "grad_norm": 0.27769261598587036, + "learning_rate": 2.6579580583081686e-05, + "loss": 0.1979, + "step": 29820 + }, + { + "epoch": 0.5318909856240859, + "grad_norm": 0.283643513917923, + "learning_rate": 2.6578027182423327e-05, + "loss": 0.1044, + "step": 29821 + }, + { + "epoch": 0.5319088217457996, + "grad_norm": 0.2439984828233719, + "learning_rate": 2.657647377564798e-05, + "loss": 0.1181, + "step": 29822 + }, + { + "epoch": 0.5319266578675133, + "grad_norm": 0.27115002274513245, + "learning_rate": 2.657492036276168e-05, + "loss": 0.0853, + "step": 29823 + }, + { + "epoch": 0.5319444939892269, + "grad_norm": 0.31924304366111755, + "learning_rate": 2.6573366943770438e-05, + "loss": 0.1259, + "step": 29824 + }, + { + "epoch": 0.5319623301109406, + "grad_norm": 0.24381551146507263, + "learning_rate": 2.6571813518680294e-05, + "loss": 0.1525, + "step": 29825 + }, + { + "epoch": 0.5319801662326543, + "grad_norm": 0.2539726197719574, + "learning_rate": 2.6570260087497246e-05, + "loss": 0.092, + "step": 29826 + }, + { + "epoch": 0.5319980023543681, + "grad_norm": 0.31162208318710327, + "learning_rate": 2.656870665022733e-05, + "loss": 0.1345, + "step": 29827 + }, + { + "epoch": 0.5320158384760818, + "grad_norm": 0.2673042416572571, + "learning_rate": 2.6567153206876566e-05, + "loss": 0.17, + "step": 29828 + }, + { + "epoch": 0.5320336745977955, + "grad_norm": 0.287284255027771, + "learning_rate": 2.6565599757450982e-05, + "loss": 0.1497, + "step": 29829 + }, + { + "epoch": 0.5320515107195092, + "grad_norm": 0.23218514025211334, + "learning_rate": 2.6564046301956584e-05, + "loss": 0.1215, + "step": 29830 + }, + { + "epoch": 0.5320693468412229, + "grad_norm": 0.23794060945510864, + "learning_rate": 2.6562492840399412e-05, + "loss": 0.1534, + "step": 29831 + }, + { + "epoch": 0.5320871829629366, + "grad_norm": 0.2640847861766815, + "learning_rate": 2.6560939372785467e-05, + "loss": 0.1588, + "step": 29832 + }, + { + "epoch": 0.5321050190846502, + "grad_norm": 0.2858348786830902, + "learning_rate": 2.6559385899120785e-05, + "loss": 0.1343, + "step": 29833 + }, + { + "epoch": 0.5321228552063639, + "grad_norm": 0.17319445312023163, + "learning_rate": 2.655783241941139e-05, + "loss": 0.08, + "step": 29834 + }, + { + "epoch": 0.5321406913280776, + "grad_norm": 0.4091813564300537, + "learning_rate": 2.655627893366329e-05, + "loss": 0.1332, + "step": 29835 + }, + { + "epoch": 0.5321585274497913, + "grad_norm": 0.22048220038414001, + "learning_rate": 2.6554725441882526e-05, + "loss": 0.1042, + "step": 29836 + }, + { + "epoch": 0.532176363571505, + "grad_norm": 0.27298569679260254, + "learning_rate": 2.65531719440751e-05, + "loss": 0.124, + "step": 29837 + }, + { + "epoch": 0.5321941996932187, + "grad_norm": 0.3180537521839142, + "learning_rate": 2.655161844024705e-05, + "loss": 0.1959, + "step": 29838 + }, + { + "epoch": 0.5322120358149324, + "grad_norm": 0.3155593276023865, + "learning_rate": 2.6550064930404384e-05, + "loss": 0.16, + "step": 29839 + }, + { + "epoch": 0.5322298719366461, + "grad_norm": 0.2793496251106262, + "learning_rate": 2.6548511414553136e-05, + "loss": 0.1217, + "step": 29840 + }, + { + "epoch": 0.5322477080583597, + "grad_norm": 0.2504490613937378, + "learning_rate": 2.6546957892699313e-05, + "loss": 0.143, + "step": 29841 + }, + { + "epoch": 0.5322655441800734, + "grad_norm": 0.21099653840065002, + "learning_rate": 2.6545404364848965e-05, + "loss": 0.1174, + "step": 29842 + }, + { + "epoch": 0.5322833803017872, + "grad_norm": 0.26392391324043274, + "learning_rate": 2.654385083100808e-05, + "loss": 0.1004, + "step": 29843 + }, + { + "epoch": 0.5323012164235009, + "grad_norm": 0.23668253421783447, + "learning_rate": 2.6542297291182704e-05, + "loss": 0.1493, + "step": 29844 + }, + { + "epoch": 0.5323190525452146, + "grad_norm": 0.2799191474914551, + "learning_rate": 2.6540743745378855e-05, + "loss": 0.1354, + "step": 29845 + }, + { + "epoch": 0.5323368886669283, + "grad_norm": 0.23976179957389832, + "learning_rate": 2.653919019360254e-05, + "loss": 0.1821, + "step": 29846 + }, + { + "epoch": 0.532354724788642, + "grad_norm": 0.28765764832496643, + "learning_rate": 2.6537636635859797e-05, + "loss": 0.0989, + "step": 29847 + }, + { + "epoch": 0.5323725609103557, + "grad_norm": 0.35809677839279175, + "learning_rate": 2.653608307215664e-05, + "loss": 0.1578, + "step": 29848 + }, + { + "epoch": 0.5323903970320694, + "grad_norm": 0.44214388728141785, + "learning_rate": 2.6534529502499105e-05, + "loss": 0.1761, + "step": 29849 + }, + { + "epoch": 0.532408233153783, + "grad_norm": 0.38853806257247925, + "learning_rate": 2.6532975926893195e-05, + "loss": 0.1776, + "step": 29850 + }, + { + "epoch": 0.5324260692754967, + "grad_norm": 0.2627245783805847, + "learning_rate": 2.6531422345344943e-05, + "loss": 0.144, + "step": 29851 + }, + { + "epoch": 0.5324439053972104, + "grad_norm": 0.26276645064353943, + "learning_rate": 2.6529868757860365e-05, + "loss": 0.1406, + "step": 29852 + }, + { + "epoch": 0.5324617415189241, + "grad_norm": 0.27102428674697876, + "learning_rate": 2.6528315164445493e-05, + "loss": 0.199, + "step": 29853 + }, + { + "epoch": 0.5324795776406378, + "grad_norm": 0.21290412545204163, + "learning_rate": 2.652676156510634e-05, + "loss": 0.1238, + "step": 29854 + }, + { + "epoch": 0.5324974137623515, + "grad_norm": 0.18232671916484833, + "learning_rate": 2.6525207959848934e-05, + "loss": 0.1155, + "step": 29855 + }, + { + "epoch": 0.5325152498840652, + "grad_norm": 0.2418159693479538, + "learning_rate": 2.6523654348679296e-05, + "loss": 0.1393, + "step": 29856 + }, + { + "epoch": 0.5325330860057789, + "grad_norm": 0.30120325088500977, + "learning_rate": 2.6522100731603446e-05, + "loss": 0.1769, + "step": 29857 + }, + { + "epoch": 0.5325509221274926, + "grad_norm": 0.48727932572364807, + "learning_rate": 2.6520547108627408e-05, + "loss": 0.1995, + "step": 29858 + }, + { + "epoch": 0.5325687582492062, + "grad_norm": 0.22200995683670044, + "learning_rate": 2.6518993479757204e-05, + "loss": 0.1206, + "step": 29859 + }, + { + "epoch": 0.53258659437092, + "grad_norm": 0.42962202429771423, + "learning_rate": 2.6517439844998853e-05, + "loss": 0.161, + "step": 29860 + }, + { + "epoch": 0.5326044304926337, + "grad_norm": 0.3077550232410431, + "learning_rate": 2.6515886204358387e-05, + "loss": 0.1597, + "step": 29861 + }, + { + "epoch": 0.5326222666143474, + "grad_norm": 0.2222198098897934, + "learning_rate": 2.651433255784182e-05, + "loss": 0.1206, + "step": 29862 + }, + { + "epoch": 0.5326401027360611, + "grad_norm": 0.21353916823863983, + "learning_rate": 2.6512778905455176e-05, + "loss": 0.1198, + "step": 29863 + }, + { + "epoch": 0.5326579388577748, + "grad_norm": 0.3128848373889923, + "learning_rate": 2.6511225247204485e-05, + "loss": 0.1392, + "step": 29864 + }, + { + "epoch": 0.5326757749794885, + "grad_norm": 0.2974700629711151, + "learning_rate": 2.650967158309575e-05, + "loss": 0.1377, + "step": 29865 + }, + { + "epoch": 0.5326936111012022, + "grad_norm": 0.2117987871170044, + "learning_rate": 2.6508117913135023e-05, + "loss": 0.1397, + "step": 29866 + }, + { + "epoch": 0.5327114472229159, + "grad_norm": 0.27953919768333435, + "learning_rate": 2.6506564237328297e-05, + "loss": 0.1442, + "step": 29867 + }, + { + "epoch": 0.5327292833446295, + "grad_norm": 0.18889255821704865, + "learning_rate": 2.6505010555681608e-05, + "loss": 0.139, + "step": 29868 + }, + { + "epoch": 0.5327471194663432, + "grad_norm": 0.2776961028575897, + "learning_rate": 2.6503456868200983e-05, + "loss": 0.146, + "step": 29869 + }, + { + "epoch": 0.5327649555880569, + "grad_norm": 0.2660003900527954, + "learning_rate": 2.650190317489244e-05, + "loss": 0.153, + "step": 29870 + }, + { + "epoch": 0.5327827917097706, + "grad_norm": 0.25715041160583496, + "learning_rate": 2.6500349475762003e-05, + "loss": 0.1067, + "step": 29871 + }, + { + "epoch": 0.5328006278314843, + "grad_norm": 0.21787290275096893, + "learning_rate": 2.6498795770815694e-05, + "loss": 0.0863, + "step": 29872 + }, + { + "epoch": 0.532818463953198, + "grad_norm": 0.2636333405971527, + "learning_rate": 2.6497242060059534e-05, + "loss": 0.0907, + "step": 29873 + }, + { + "epoch": 0.5328363000749117, + "grad_norm": 0.2915153503417969, + "learning_rate": 2.6495688343499543e-05, + "loss": 0.121, + "step": 29874 + }, + { + "epoch": 0.5328541361966254, + "grad_norm": 0.2586997449398041, + "learning_rate": 2.6494134621141743e-05, + "loss": 0.1219, + "step": 29875 + }, + { + "epoch": 0.532871972318339, + "grad_norm": 0.2664002478122711, + "learning_rate": 2.6492580892992165e-05, + "loss": 0.1605, + "step": 29876 + }, + { + "epoch": 0.5328898084400528, + "grad_norm": 0.29478463530540466, + "learning_rate": 2.6491027159056834e-05, + "loss": 0.1458, + "step": 29877 + }, + { + "epoch": 0.5329076445617665, + "grad_norm": 0.20806017518043518, + "learning_rate": 2.648947341934176e-05, + "loss": 0.1218, + "step": 29878 + }, + { + "epoch": 0.5329254806834802, + "grad_norm": 0.2276647537946701, + "learning_rate": 2.6487919673852978e-05, + "loss": 0.1051, + "step": 29879 + }, + { + "epoch": 0.5329433168051939, + "grad_norm": 0.31864044070243835, + "learning_rate": 2.64863659225965e-05, + "loss": 0.181, + "step": 29880 + }, + { + "epoch": 0.5329611529269076, + "grad_norm": 0.24166466295719147, + "learning_rate": 2.648481216557836e-05, + "loss": 0.1445, + "step": 29881 + }, + { + "epoch": 0.5329789890486213, + "grad_norm": 0.2853158116340637, + "learning_rate": 2.6483258402804566e-05, + "loss": 0.108, + "step": 29882 + }, + { + "epoch": 0.532996825170335, + "grad_norm": 0.32880693674087524, + "learning_rate": 2.6481704634281163e-05, + "loss": 0.1699, + "step": 29883 + }, + { + "epoch": 0.5330146612920487, + "grad_norm": 0.24686409533023834, + "learning_rate": 2.648015086001415e-05, + "loss": 0.1173, + "step": 29884 + }, + { + "epoch": 0.5330324974137624, + "grad_norm": 0.3073842525482178, + "learning_rate": 2.647859708000957e-05, + "loss": 0.1895, + "step": 29885 + }, + { + "epoch": 0.533050333535476, + "grad_norm": 0.2646964490413666, + "learning_rate": 2.6477043294273434e-05, + "loss": 0.1197, + "step": 29886 + }, + { + "epoch": 0.5330681696571897, + "grad_norm": 0.3958754241466522, + "learning_rate": 2.6475489502811762e-05, + "loss": 0.1571, + "step": 29887 + }, + { + "epoch": 0.5330860057789034, + "grad_norm": 0.24821855127811432, + "learning_rate": 2.6473935705630588e-05, + "loss": 0.1514, + "step": 29888 + }, + { + "epoch": 0.5331038419006171, + "grad_norm": 0.24609972536563873, + "learning_rate": 2.6472381902735927e-05, + "loss": 0.1458, + "step": 29889 + }, + { + "epoch": 0.5331216780223308, + "grad_norm": 0.25449633598327637, + "learning_rate": 2.6470828094133808e-05, + "loss": 0.1124, + "step": 29890 + }, + { + "epoch": 0.5331395141440445, + "grad_norm": 0.27728673815727234, + "learning_rate": 2.6469274279830253e-05, + "loss": 0.0901, + "step": 29891 + }, + { + "epoch": 0.5331573502657582, + "grad_norm": 0.3672589063644409, + "learning_rate": 2.6467720459831285e-05, + "loss": 0.1397, + "step": 29892 + }, + { + "epoch": 0.5331751863874719, + "grad_norm": 0.2867465317249298, + "learning_rate": 2.646616663414292e-05, + "loss": 0.1947, + "step": 29893 + }, + { + "epoch": 0.5331930225091857, + "grad_norm": 0.40466591715812683, + "learning_rate": 2.6464612802771193e-05, + "loss": 0.1654, + "step": 29894 + }, + { + "epoch": 0.5332108586308993, + "grad_norm": 0.2633300721645355, + "learning_rate": 2.6463058965722113e-05, + "loss": 0.1259, + "step": 29895 + }, + { + "epoch": 0.533228694752613, + "grad_norm": 0.17879962921142578, + "learning_rate": 2.646150512300171e-05, + "loss": 0.1153, + "step": 29896 + }, + { + "epoch": 0.5332465308743267, + "grad_norm": 0.2622550427913666, + "learning_rate": 2.645995127461602e-05, + "loss": 0.1032, + "step": 29897 + }, + { + "epoch": 0.5332643669960404, + "grad_norm": 0.3151096701622009, + "learning_rate": 2.6458397420571048e-05, + "loss": 0.1808, + "step": 29898 + }, + { + "epoch": 0.5332822031177541, + "grad_norm": 0.2530626058578491, + "learning_rate": 2.6456843560872825e-05, + "loss": 0.1445, + "step": 29899 + }, + { + "epoch": 0.5333000392394678, + "grad_norm": 0.2448253482580185, + "learning_rate": 2.645528969552737e-05, + "loss": 0.1322, + "step": 29900 + }, + { + "epoch": 0.5333178753611815, + "grad_norm": 0.2782707214355469, + "learning_rate": 2.6453735824540715e-05, + "loss": 0.1103, + "step": 29901 + }, + { + "epoch": 0.5333357114828952, + "grad_norm": 0.265190988779068, + "learning_rate": 2.645218194791887e-05, + "loss": 0.1508, + "step": 29902 + }, + { + "epoch": 0.5333535476046088, + "grad_norm": 0.35700690746307373, + "learning_rate": 2.645062806566787e-05, + "loss": 0.1444, + "step": 29903 + }, + { + "epoch": 0.5333713837263225, + "grad_norm": 0.3671668469905853, + "learning_rate": 2.644907417779373e-05, + "loss": 0.1281, + "step": 29904 + }, + { + "epoch": 0.5333892198480362, + "grad_norm": 0.25174978375434875, + "learning_rate": 2.644752028430249e-05, + "loss": 0.171, + "step": 29905 + }, + { + "epoch": 0.5334070559697499, + "grad_norm": 0.3751481771469116, + "learning_rate": 2.644596638520015e-05, + "loss": 0.149, + "step": 29906 + }, + { + "epoch": 0.5334248920914636, + "grad_norm": 0.312002032995224, + "learning_rate": 2.644441248049275e-05, + "loss": 0.1372, + "step": 29907 + }, + { + "epoch": 0.5334427282131773, + "grad_norm": 0.2236240953207016, + "learning_rate": 2.644285857018631e-05, + "loss": 0.1352, + "step": 29908 + }, + { + "epoch": 0.533460564334891, + "grad_norm": 0.3119545578956604, + "learning_rate": 2.6441304654286847e-05, + "loss": 0.2075, + "step": 29909 + }, + { + "epoch": 0.5334784004566047, + "grad_norm": 0.3627402186393738, + "learning_rate": 2.6439750732800394e-05, + "loss": 0.1621, + "step": 29910 + }, + { + "epoch": 0.5334962365783185, + "grad_norm": 0.2859326899051666, + "learning_rate": 2.643819680573297e-05, + "loss": 0.1618, + "step": 29911 + }, + { + "epoch": 0.5335140727000321, + "grad_norm": 0.22822651267051697, + "learning_rate": 2.6436642873090593e-05, + "loss": 0.1191, + "step": 29912 + }, + { + "epoch": 0.5335319088217458, + "grad_norm": 0.42542552947998047, + "learning_rate": 2.6435088934879298e-05, + "loss": 0.2156, + "step": 29913 + }, + { + "epoch": 0.5335497449434595, + "grad_norm": 0.1848410815000534, + "learning_rate": 2.6433534991105103e-05, + "loss": 0.118, + "step": 29914 + }, + { + "epoch": 0.5335675810651732, + "grad_norm": 0.24543717503547668, + "learning_rate": 2.6431981041774023e-05, + "loss": 0.1256, + "step": 29915 + }, + { + "epoch": 0.5335854171868869, + "grad_norm": 0.3678569197654724, + "learning_rate": 2.6430427086892096e-05, + "loss": 0.2001, + "step": 29916 + }, + { + "epoch": 0.5336032533086006, + "grad_norm": 0.24542857706546783, + "learning_rate": 2.6428873126465336e-05, + "loss": 0.1202, + "step": 29917 + }, + { + "epoch": 0.5336210894303143, + "grad_norm": 0.2463257908821106, + "learning_rate": 2.6427319160499776e-05, + "loss": 0.1619, + "step": 29918 + }, + { + "epoch": 0.533638925552028, + "grad_norm": 0.4380140006542206, + "learning_rate": 2.6425765189001427e-05, + "loss": 0.1066, + "step": 29919 + }, + { + "epoch": 0.5336567616737417, + "grad_norm": 0.3083948791027069, + "learning_rate": 2.642421121197633e-05, + "loss": 0.1328, + "step": 29920 + }, + { + "epoch": 0.5336745977954553, + "grad_norm": 0.29016420245170593, + "learning_rate": 2.6422657229430488e-05, + "loss": 0.1385, + "step": 29921 + }, + { + "epoch": 0.533692433917169, + "grad_norm": 0.20196279883384705, + "learning_rate": 2.6421103241369945e-05, + "loss": 0.0941, + "step": 29922 + }, + { + "epoch": 0.5337102700388827, + "grad_norm": 0.21342800557613373, + "learning_rate": 2.6419549247800702e-05, + "loss": 0.0794, + "step": 29923 + }, + { + "epoch": 0.5337281061605964, + "grad_norm": 0.22464700043201447, + "learning_rate": 2.64179952487288e-05, + "loss": 0.1438, + "step": 29924 + }, + { + "epoch": 0.5337459422823101, + "grad_norm": 0.2330685704946518, + "learning_rate": 2.6416441244160268e-05, + "loss": 0.0566, + "step": 29925 + }, + { + "epoch": 0.5337637784040238, + "grad_norm": 0.2709701657295227, + "learning_rate": 2.641488723410111e-05, + "loss": 0.113, + "step": 29926 + }, + { + "epoch": 0.5337816145257375, + "grad_norm": 0.28197982907295227, + "learning_rate": 2.6413333218557367e-05, + "loss": 0.1463, + "step": 29927 + }, + { + "epoch": 0.5337994506474513, + "grad_norm": 0.27952587604522705, + "learning_rate": 2.6411779197535052e-05, + "loss": 0.149, + "step": 29928 + }, + { + "epoch": 0.533817286769165, + "grad_norm": 0.2955726385116577, + "learning_rate": 2.64102251710402e-05, + "loss": 0.1669, + "step": 29929 + }, + { + "epoch": 0.5338351228908786, + "grad_norm": 0.2965660095214844, + "learning_rate": 2.6408671139078823e-05, + "loss": 0.1007, + "step": 29930 + }, + { + "epoch": 0.5338529590125923, + "grad_norm": 0.25237980484962463, + "learning_rate": 2.6407117101656947e-05, + "loss": 0.1413, + "step": 29931 + }, + { + "epoch": 0.533870795134306, + "grad_norm": 0.31322088837623596, + "learning_rate": 2.6405563058780596e-05, + "loss": 0.1752, + "step": 29932 + }, + { + "epoch": 0.5338886312560197, + "grad_norm": 0.4370499551296234, + "learning_rate": 2.640400901045581e-05, + "loss": 0.1133, + "step": 29933 + }, + { + "epoch": 0.5339064673777334, + "grad_norm": 0.26866334676742554, + "learning_rate": 2.6402454956688593e-05, + "loss": 0.0732, + "step": 29934 + }, + { + "epoch": 0.5339243034994471, + "grad_norm": 0.2273993194103241, + "learning_rate": 2.640090089748498e-05, + "loss": 0.1192, + "step": 29935 + }, + { + "epoch": 0.5339421396211608, + "grad_norm": 0.22413517534732819, + "learning_rate": 2.6399346832850985e-05, + "loss": 0.1164, + "step": 29936 + }, + { + "epoch": 0.5339599757428745, + "grad_norm": 0.2783486247062683, + "learning_rate": 2.639779276279264e-05, + "loss": 0.1615, + "step": 29937 + }, + { + "epoch": 0.5339778118645881, + "grad_norm": 0.34598222374916077, + "learning_rate": 2.639623868731597e-05, + "loss": 0.1844, + "step": 29938 + }, + { + "epoch": 0.5339956479863018, + "grad_norm": 0.3014484941959381, + "learning_rate": 2.6394684606426994e-05, + "loss": 0.1181, + "step": 29939 + }, + { + "epoch": 0.5340134841080155, + "grad_norm": 0.26582470536231995, + "learning_rate": 2.6393130520131744e-05, + "loss": 0.0887, + "step": 29940 + }, + { + "epoch": 0.5340313202297292, + "grad_norm": 0.21539784967899323, + "learning_rate": 2.639157642843623e-05, + "loss": 0.1153, + "step": 29941 + }, + { + "epoch": 0.5340491563514429, + "grad_norm": 0.27531206607818604, + "learning_rate": 2.63900223313465e-05, + "loss": 0.1273, + "step": 29942 + }, + { + "epoch": 0.5340669924731566, + "grad_norm": 0.25267964601516724, + "learning_rate": 2.6388468228868545e-05, + "loss": 0.0781, + "step": 29943 + }, + { + "epoch": 0.5340848285948704, + "grad_norm": 0.21789827942848206, + "learning_rate": 2.6386914121008416e-05, + "loss": 0.1112, + "step": 29944 + }, + { + "epoch": 0.5341026647165841, + "grad_norm": 0.2215350717306137, + "learning_rate": 2.638536000777213e-05, + "loss": 0.1445, + "step": 29945 + }, + { + "epoch": 0.5341205008382978, + "grad_norm": 0.36032167077064514, + "learning_rate": 2.6383805889165715e-05, + "loss": 0.1018, + "step": 29946 + }, + { + "epoch": 0.5341383369600115, + "grad_norm": 0.30838868021965027, + "learning_rate": 2.6382251765195178e-05, + "loss": 0.1778, + "step": 29947 + }, + { + "epoch": 0.5341561730817251, + "grad_norm": 0.23818707466125488, + "learning_rate": 2.6380697635866568e-05, + "loss": 0.1178, + "step": 29948 + }, + { + "epoch": 0.5341740092034388, + "grad_norm": 0.2256084531545639, + "learning_rate": 2.637914350118589e-05, + "loss": 0.1509, + "step": 29949 + }, + { + "epoch": 0.5341918453251525, + "grad_norm": 0.26305562257766724, + "learning_rate": 2.6377589361159178e-05, + "loss": 0.0861, + "step": 29950 + }, + { + "epoch": 0.5342096814468662, + "grad_norm": 0.26467767357826233, + "learning_rate": 2.637603521579245e-05, + "loss": 0.1338, + "step": 29951 + }, + { + "epoch": 0.5342275175685799, + "grad_norm": 0.3854021430015564, + "learning_rate": 2.637448106509174e-05, + "loss": 0.1433, + "step": 29952 + }, + { + "epoch": 0.5342453536902936, + "grad_norm": 0.22443512082099915, + "learning_rate": 2.637292690906306e-05, + "loss": 0.1563, + "step": 29953 + }, + { + "epoch": 0.5342631898120073, + "grad_norm": 0.26604029536247253, + "learning_rate": 2.637137274771245e-05, + "loss": 0.1044, + "step": 29954 + }, + { + "epoch": 0.534281025933721, + "grad_norm": 0.2421019822359085, + "learning_rate": 2.636981858104592e-05, + "loss": 0.1617, + "step": 29955 + }, + { + "epoch": 0.5342988620554346, + "grad_norm": 0.27613556385040283, + "learning_rate": 2.6368264409069498e-05, + "loss": 0.1303, + "step": 29956 + }, + { + "epoch": 0.5343166981771483, + "grad_norm": 0.26005810499191284, + "learning_rate": 2.6366710231789214e-05, + "loss": 0.1662, + "step": 29957 + }, + { + "epoch": 0.534334534298862, + "grad_norm": 0.23246777057647705, + "learning_rate": 2.6365156049211082e-05, + "loss": 0.1385, + "step": 29958 + }, + { + "epoch": 0.5343523704205757, + "grad_norm": 0.2797565162181854, + "learning_rate": 2.6363601861341142e-05, + "loss": 0.1426, + "step": 29959 + }, + { + "epoch": 0.5343702065422894, + "grad_norm": 0.2667892873287201, + "learning_rate": 2.6362047668185408e-05, + "loss": 0.1468, + "step": 29960 + }, + { + "epoch": 0.5343880426640032, + "grad_norm": 0.3596689701080322, + "learning_rate": 2.6360493469749907e-05, + "loss": 0.1453, + "step": 29961 + }, + { + "epoch": 0.5344058787857169, + "grad_norm": 0.22444729506969452, + "learning_rate": 2.635893926604066e-05, + "loss": 0.0812, + "step": 29962 + }, + { + "epoch": 0.5344237149074306, + "grad_norm": 0.21742577850818634, + "learning_rate": 2.6357385057063703e-05, + "loss": 0.1207, + "step": 29963 + }, + { + "epoch": 0.5344415510291443, + "grad_norm": 0.3289489150047302, + "learning_rate": 2.6355830842825048e-05, + "loss": 0.1125, + "step": 29964 + }, + { + "epoch": 0.534459387150858, + "grad_norm": 0.4211141765117645, + "learning_rate": 2.635427662333072e-05, + "loss": 0.1103, + "step": 29965 + }, + { + "epoch": 0.5344772232725716, + "grad_norm": 0.28083640336990356, + "learning_rate": 2.6352722398586755e-05, + "loss": 0.1161, + "step": 29966 + }, + { + "epoch": 0.5344950593942853, + "grad_norm": 0.24796617031097412, + "learning_rate": 2.6351168168599166e-05, + "loss": 0.1748, + "step": 29967 + }, + { + "epoch": 0.534512895515999, + "grad_norm": 0.38269057869911194, + "learning_rate": 2.634961393337399e-05, + "loss": 0.1394, + "step": 29968 + }, + { + "epoch": 0.5345307316377127, + "grad_norm": 0.26489534974098206, + "learning_rate": 2.6348059692917238e-05, + "loss": 0.1319, + "step": 29969 + }, + { + "epoch": 0.5345485677594264, + "grad_norm": 0.36083126068115234, + "learning_rate": 2.6346505447234944e-05, + "loss": 0.1525, + "step": 29970 + }, + { + "epoch": 0.5345664038811401, + "grad_norm": 0.28692829608917236, + "learning_rate": 2.6344951196333122e-05, + "loss": 0.1578, + "step": 29971 + }, + { + "epoch": 0.5345842400028538, + "grad_norm": 0.20260398089885712, + "learning_rate": 2.634339694021781e-05, + "loss": 0.1012, + "step": 29972 + }, + { + "epoch": 0.5346020761245674, + "grad_norm": 0.21179082989692688, + "learning_rate": 2.6341842678895027e-05, + "loss": 0.0928, + "step": 29973 + }, + { + "epoch": 0.5346199122462811, + "grad_norm": 0.24858523905277252, + "learning_rate": 2.63402884123708e-05, + "loss": 0.1533, + "step": 29974 + }, + { + "epoch": 0.5346377483679948, + "grad_norm": 0.2500287890434265, + "learning_rate": 2.633873414065115e-05, + "loss": 0.1609, + "step": 29975 + }, + { + "epoch": 0.5346555844897085, + "grad_norm": 0.29444488883018494, + "learning_rate": 2.633717986374211e-05, + "loss": 0.0953, + "step": 29976 + }, + { + "epoch": 0.5346734206114222, + "grad_norm": 0.17601536214351654, + "learning_rate": 2.63356255816497e-05, + "loss": 0.0706, + "step": 29977 + }, + { + "epoch": 0.534691256733136, + "grad_norm": 0.20715098083019257, + "learning_rate": 2.6334071294379938e-05, + "loss": 0.153, + "step": 29978 + }, + { + "epoch": 0.5347090928548497, + "grad_norm": 0.331758052110672, + "learning_rate": 2.633251700193885e-05, + "loss": 0.1597, + "step": 29979 + }, + { + "epoch": 0.5347269289765634, + "grad_norm": 0.2710334062576294, + "learning_rate": 2.6330962704332468e-05, + "loss": 0.1184, + "step": 29980 + }, + { + "epoch": 0.5347447650982771, + "grad_norm": 0.3113744556903839, + "learning_rate": 2.6329408401566825e-05, + "loss": 0.1223, + "step": 29981 + }, + { + "epoch": 0.5347626012199908, + "grad_norm": 0.25887617468833923, + "learning_rate": 2.6327854093647925e-05, + "loss": 0.1047, + "step": 29982 + }, + { + "epoch": 0.5347804373417044, + "grad_norm": 0.22559833526611328, + "learning_rate": 2.632629978058181e-05, + "loss": 0.1396, + "step": 29983 + }, + { + "epoch": 0.5347982734634181, + "grad_norm": 0.227296382188797, + "learning_rate": 2.6324745462374495e-05, + "loss": 0.1488, + "step": 29984 + }, + { + "epoch": 0.5348161095851318, + "grad_norm": 0.2206769436597824, + "learning_rate": 2.632319113903201e-05, + "loss": 0.1124, + "step": 29985 + }, + { + "epoch": 0.5348339457068455, + "grad_norm": 0.22993215918540955, + "learning_rate": 2.6321636810560373e-05, + "loss": 0.1447, + "step": 29986 + }, + { + "epoch": 0.5348517818285592, + "grad_norm": 0.24782823026180267, + "learning_rate": 2.6320082476965628e-05, + "loss": 0.1504, + "step": 29987 + }, + { + "epoch": 0.5348696179502729, + "grad_norm": 0.40900227427482605, + "learning_rate": 2.6318528138253777e-05, + "loss": 0.1561, + "step": 29988 + }, + { + "epoch": 0.5348874540719866, + "grad_norm": 0.29222583770751953, + "learning_rate": 2.6316973794430865e-05, + "loss": 0.1042, + "step": 29989 + }, + { + "epoch": 0.5349052901937003, + "grad_norm": 0.2259809374809265, + "learning_rate": 2.6315419445502902e-05, + "loss": 0.1326, + "step": 29990 + }, + { + "epoch": 0.5349231263154139, + "grad_norm": 0.30716872215270996, + "learning_rate": 2.631386509147592e-05, + "loss": 0.1155, + "step": 29991 + }, + { + "epoch": 0.5349409624371276, + "grad_norm": 0.2817372977733612, + "learning_rate": 2.6312310732355944e-05, + "loss": 0.1516, + "step": 29992 + }, + { + "epoch": 0.5349587985588413, + "grad_norm": 0.25902578234672546, + "learning_rate": 2.631075636814899e-05, + "loss": 0.1536, + "step": 29993 + }, + { + "epoch": 0.534976634680555, + "grad_norm": 0.2492329627275467, + "learning_rate": 2.6309201998861104e-05, + "loss": 0.167, + "step": 29994 + }, + { + "epoch": 0.5349944708022688, + "grad_norm": 0.2663809657096863, + "learning_rate": 2.6307647624498294e-05, + "loss": 0.1438, + "step": 29995 + }, + { + "epoch": 0.5350123069239825, + "grad_norm": 0.24601788818836212, + "learning_rate": 2.6306093245066594e-05, + "loss": 0.1316, + "step": 29996 + }, + { + "epoch": 0.5350301430456962, + "grad_norm": 0.286398708820343, + "learning_rate": 2.6304538860572016e-05, + "loss": 0.1861, + "step": 29997 + }, + { + "epoch": 0.5350479791674099, + "grad_norm": 0.29543009400367737, + "learning_rate": 2.6302984471020604e-05, + "loss": 0.1404, + "step": 29998 + }, + { + "epoch": 0.5350658152891236, + "grad_norm": 0.25596728920936584, + "learning_rate": 2.6301430076418365e-05, + "loss": 0.1137, + "step": 29999 + }, + { + "epoch": 0.5350836514108372, + "grad_norm": 0.2510005533695221, + "learning_rate": 2.629987567677134e-05, + "loss": 0.1072, + "step": 30000 + }, + { + "epoch": 0.5350836514108372, + "eval_loss": 0.13253282010555267, + "eval_runtime": 106.3992, + "eval_samples_per_second": 9.624, + "eval_steps_per_second": 1.607, + "step": 30000 + }, + { + "epoch": 0.5351014875325509, + "grad_norm": 0.23676490783691406, + "learning_rate": 2.6298321272085542e-05, + "loss": 0.1496, + "step": 30001 + }, + { + "epoch": 0.5351193236542646, + "grad_norm": 0.30027639865875244, + "learning_rate": 2.629676686236701e-05, + "loss": 0.1888, + "step": 30002 + }, + { + "epoch": 0.5351371597759783, + "grad_norm": 0.21820507943630219, + "learning_rate": 2.6295212447621766e-05, + "loss": 0.1165, + "step": 30003 + }, + { + "epoch": 0.535154995897692, + "grad_norm": 0.3790316879749298, + "learning_rate": 2.6293658027855817e-05, + "loss": 0.1541, + "step": 30004 + }, + { + "epoch": 0.5351728320194057, + "grad_norm": 0.2574770748615265, + "learning_rate": 2.6292103603075214e-05, + "loss": 0.1156, + "step": 30005 + }, + { + "epoch": 0.5351906681411194, + "grad_norm": 0.2600179612636566, + "learning_rate": 2.6290549173285966e-05, + "loss": 0.1073, + "step": 30006 + }, + { + "epoch": 0.5352085042628331, + "grad_norm": 0.23078328371047974, + "learning_rate": 2.62889947384941e-05, + "loss": 0.1648, + "step": 30007 + }, + { + "epoch": 0.5352263403845468, + "grad_norm": 0.32244226336479187, + "learning_rate": 2.6287440298705645e-05, + "loss": 0.113, + "step": 30008 + }, + { + "epoch": 0.5352441765062604, + "grad_norm": 0.37785470485687256, + "learning_rate": 2.628588585392664e-05, + "loss": 0.1938, + "step": 30009 + }, + { + "epoch": 0.5352620126279741, + "grad_norm": 0.19090965390205383, + "learning_rate": 2.6284331404163082e-05, + "loss": 0.1313, + "step": 30010 + }, + { + "epoch": 0.5352798487496878, + "grad_norm": 0.2262149155139923, + "learning_rate": 2.6282776949421022e-05, + "loss": 0.1318, + "step": 30011 + }, + { + "epoch": 0.5352976848714016, + "grad_norm": 0.251688152551651, + "learning_rate": 2.628122248970647e-05, + "loss": 0.1035, + "step": 30012 + }, + { + "epoch": 0.5353155209931153, + "grad_norm": 0.2728239893913269, + "learning_rate": 2.627966802502546e-05, + "loss": 0.186, + "step": 30013 + }, + { + "epoch": 0.535333357114829, + "grad_norm": 0.3314443528652191, + "learning_rate": 2.627811355538401e-05, + "loss": 0.1941, + "step": 30014 + }, + { + "epoch": 0.5353511932365427, + "grad_norm": 0.2868655323982239, + "learning_rate": 2.627655908078816e-05, + "loss": 0.114, + "step": 30015 + }, + { + "epoch": 0.5353690293582564, + "grad_norm": 0.2744075059890747, + "learning_rate": 2.6275004601243918e-05, + "loss": 0.1816, + "step": 30016 + }, + { + "epoch": 0.53538686547997, + "grad_norm": 0.2588185966014862, + "learning_rate": 2.627345011675732e-05, + "loss": 0.1857, + "step": 30017 + }, + { + "epoch": 0.5354047016016837, + "grad_norm": 0.20969374477863312, + "learning_rate": 2.627189562733439e-05, + "loss": 0.1199, + "step": 30018 + }, + { + "epoch": 0.5354225377233974, + "grad_norm": 0.28712162375450134, + "learning_rate": 2.627034113298115e-05, + "loss": 0.1394, + "step": 30019 + }, + { + "epoch": 0.5354403738451111, + "grad_norm": 0.24129994213581085, + "learning_rate": 2.6268786633703634e-05, + "loss": 0.1095, + "step": 30020 + }, + { + "epoch": 0.5354582099668248, + "grad_norm": 0.29542097449302673, + "learning_rate": 2.6267232129507857e-05, + "loss": 0.1032, + "step": 30021 + }, + { + "epoch": 0.5354760460885385, + "grad_norm": 0.2944839298725128, + "learning_rate": 2.626567762039986e-05, + "loss": 0.1575, + "step": 30022 + }, + { + "epoch": 0.5354938822102522, + "grad_norm": 0.306369811296463, + "learning_rate": 2.6264123106385652e-05, + "loss": 0.1733, + "step": 30023 + }, + { + "epoch": 0.5355117183319659, + "grad_norm": 0.20029355585575104, + "learning_rate": 2.6262568587471272e-05, + "loss": 0.1501, + "step": 30024 + }, + { + "epoch": 0.5355295544536796, + "grad_norm": 0.3794811964035034, + "learning_rate": 2.6261014063662732e-05, + "loss": 0.1384, + "step": 30025 + }, + { + "epoch": 0.5355473905753932, + "grad_norm": 0.27813950181007385, + "learning_rate": 2.6259459534966073e-05, + "loss": 0.1379, + "step": 30026 + }, + { + "epoch": 0.5355652266971069, + "grad_norm": 0.17722640931606293, + "learning_rate": 2.6257905001387306e-05, + "loss": 0.1379, + "step": 30027 + }, + { + "epoch": 0.5355830628188206, + "grad_norm": 0.4119080603122711, + "learning_rate": 2.6256350462932472e-05, + "loss": 0.2011, + "step": 30028 + }, + { + "epoch": 0.5356008989405344, + "grad_norm": 0.22568555176258087, + "learning_rate": 2.625479591960759e-05, + "loss": 0.1827, + "step": 30029 + }, + { + "epoch": 0.5356187350622481, + "grad_norm": 0.2339983731508255, + "learning_rate": 2.625324137141868e-05, + "loss": 0.0693, + "step": 30030 + }, + { + "epoch": 0.5356365711839618, + "grad_norm": 0.19217249751091003, + "learning_rate": 2.6251686818371778e-05, + "loss": 0.0946, + "step": 30031 + }, + { + "epoch": 0.5356544073056755, + "grad_norm": 0.30612075328826904, + "learning_rate": 2.6250132260472903e-05, + "loss": 0.168, + "step": 30032 + }, + { + "epoch": 0.5356722434273892, + "grad_norm": 0.30206209421157837, + "learning_rate": 2.6248577697728088e-05, + "loss": 0.1534, + "step": 30033 + }, + { + "epoch": 0.5356900795491029, + "grad_norm": 0.2501528859138489, + "learning_rate": 2.624702313014335e-05, + "loss": 0.1379, + "step": 30034 + }, + { + "epoch": 0.5357079156708165, + "grad_norm": 0.19195348024368286, + "learning_rate": 2.6245468557724718e-05, + "loss": 0.116, + "step": 30035 + }, + { + "epoch": 0.5357257517925302, + "grad_norm": 0.2196555882692337, + "learning_rate": 2.6243913980478218e-05, + "loss": 0.1494, + "step": 30036 + }, + { + "epoch": 0.5357435879142439, + "grad_norm": 0.25943294167518616, + "learning_rate": 2.6242359398409888e-05, + "loss": 0.1237, + "step": 30037 + }, + { + "epoch": 0.5357614240359576, + "grad_norm": 0.2966499626636505, + "learning_rate": 2.624080481152573e-05, + "loss": 0.1281, + "step": 30038 + }, + { + "epoch": 0.5357792601576713, + "grad_norm": 0.19807657599449158, + "learning_rate": 2.62392502198318e-05, + "loss": 0.1278, + "step": 30039 + }, + { + "epoch": 0.535797096279385, + "grad_norm": 0.2528213560581207, + "learning_rate": 2.6237695623334098e-05, + "loss": 0.1604, + "step": 30040 + }, + { + "epoch": 0.5358149324010987, + "grad_norm": 0.22685837745666504, + "learning_rate": 2.623614102203866e-05, + "loss": 0.1341, + "step": 30041 + }, + { + "epoch": 0.5358327685228124, + "grad_norm": 0.30094918608665466, + "learning_rate": 2.6234586415951517e-05, + "loss": 0.1883, + "step": 30042 + }, + { + "epoch": 0.535850604644526, + "grad_norm": 0.27206870913505554, + "learning_rate": 2.623303180507869e-05, + "loss": 0.0841, + "step": 30043 + }, + { + "epoch": 0.5358684407662397, + "grad_norm": 0.2474784404039383, + "learning_rate": 2.6231477189426208e-05, + "loss": 0.1116, + "step": 30044 + }, + { + "epoch": 0.5358862768879535, + "grad_norm": 0.20598961412906647, + "learning_rate": 2.6229922569000086e-05, + "loss": 0.1273, + "step": 30045 + }, + { + "epoch": 0.5359041130096672, + "grad_norm": 0.22235436737537384, + "learning_rate": 2.622836794380637e-05, + "loss": 0.1575, + "step": 30046 + }, + { + "epoch": 0.5359219491313809, + "grad_norm": 0.17955812811851501, + "learning_rate": 2.6226813313851067e-05, + "loss": 0.1078, + "step": 30047 + }, + { + "epoch": 0.5359397852530946, + "grad_norm": 0.23530493676662445, + "learning_rate": 2.6225258679140212e-05, + "loss": 0.1173, + "step": 30048 + }, + { + "epoch": 0.5359576213748083, + "grad_norm": 0.2665049731731415, + "learning_rate": 2.622370403967983e-05, + "loss": 0.1466, + "step": 30049 + }, + { + "epoch": 0.535975457496522, + "grad_norm": 0.3786855936050415, + "learning_rate": 2.6222149395475958e-05, + "loss": 0.1668, + "step": 30050 + }, + { + "epoch": 0.5359932936182357, + "grad_norm": 0.323861688375473, + "learning_rate": 2.6220594746534604e-05, + "loss": 0.1634, + "step": 30051 + }, + { + "epoch": 0.5360111297399494, + "grad_norm": 0.3055436909198761, + "learning_rate": 2.621904009286181e-05, + "loss": 0.1199, + "step": 30052 + }, + { + "epoch": 0.536028965861663, + "grad_norm": 0.2409042865037918, + "learning_rate": 2.6217485434463595e-05, + "loss": 0.0958, + "step": 30053 + }, + { + "epoch": 0.5360468019833767, + "grad_norm": 0.3947082757949829, + "learning_rate": 2.6215930771345986e-05, + "loss": 0.1385, + "step": 30054 + }, + { + "epoch": 0.5360646381050904, + "grad_norm": 0.2209540456533432, + "learning_rate": 2.6214376103515004e-05, + "loss": 0.1126, + "step": 30055 + }, + { + "epoch": 0.5360824742268041, + "grad_norm": 0.4303555488586426, + "learning_rate": 2.621282143097668e-05, + "loss": 0.1537, + "step": 30056 + }, + { + "epoch": 0.5361003103485178, + "grad_norm": 0.2560977041721344, + "learning_rate": 2.621126675373705e-05, + "loss": 0.1225, + "step": 30057 + }, + { + "epoch": 0.5361181464702315, + "grad_norm": 0.3105206787586212, + "learning_rate": 2.6209712071802123e-05, + "loss": 0.1543, + "step": 30058 + }, + { + "epoch": 0.5361359825919452, + "grad_norm": 0.2338428646326065, + "learning_rate": 2.620815738517794e-05, + "loss": 0.1353, + "step": 30059 + }, + { + "epoch": 0.5361538187136589, + "grad_norm": 0.34728971123695374, + "learning_rate": 2.6206602693870518e-05, + "loss": 0.1193, + "step": 30060 + }, + { + "epoch": 0.5361716548353725, + "grad_norm": 0.24602587521076202, + "learning_rate": 2.620504799788589e-05, + "loss": 0.1236, + "step": 30061 + }, + { + "epoch": 0.5361894909570863, + "grad_norm": 0.2876373827457428, + "learning_rate": 2.620349329723008e-05, + "loss": 0.1391, + "step": 30062 + }, + { + "epoch": 0.5362073270788, + "grad_norm": 0.27557122707366943, + "learning_rate": 2.620193859190911e-05, + "loss": 0.1289, + "step": 30063 + }, + { + "epoch": 0.5362251632005137, + "grad_norm": 0.22649092972278595, + "learning_rate": 2.6200383881929008e-05, + "loss": 0.1093, + "step": 30064 + }, + { + "epoch": 0.5362429993222274, + "grad_norm": 0.28372225165367126, + "learning_rate": 2.6198829167295814e-05, + "loss": 0.2103, + "step": 30065 + }, + { + "epoch": 0.5362608354439411, + "grad_norm": 0.2724907696247101, + "learning_rate": 2.6197274448015536e-05, + "loss": 0.1299, + "step": 30066 + }, + { + "epoch": 0.5362786715656548, + "grad_norm": 0.19539503753185272, + "learning_rate": 2.6195719724094215e-05, + "loss": 0.1357, + "step": 30067 + }, + { + "epoch": 0.5362965076873685, + "grad_norm": 0.2573840022087097, + "learning_rate": 2.6194164995537866e-05, + "loss": 0.1668, + "step": 30068 + }, + { + "epoch": 0.5363143438090822, + "grad_norm": 0.3429783582687378, + "learning_rate": 2.6192610262352522e-05, + "loss": 0.1078, + "step": 30069 + }, + { + "epoch": 0.5363321799307958, + "grad_norm": 0.314705491065979, + "learning_rate": 2.6191055524544212e-05, + "loss": 0.1762, + "step": 30070 + }, + { + "epoch": 0.5363500160525095, + "grad_norm": 0.24936902523040771, + "learning_rate": 2.618950078211896e-05, + "loss": 0.1112, + "step": 30071 + }, + { + "epoch": 0.5363678521742232, + "grad_norm": 0.4061737358570099, + "learning_rate": 2.618794603508279e-05, + "loss": 0.1711, + "step": 30072 + }, + { + "epoch": 0.5363856882959369, + "grad_norm": 0.26980453729629517, + "learning_rate": 2.618639128344173e-05, + "loss": 0.1328, + "step": 30073 + }, + { + "epoch": 0.5364035244176506, + "grad_norm": 0.2761056423187256, + "learning_rate": 2.6184836527201813e-05, + "loss": 0.1074, + "step": 30074 + }, + { + "epoch": 0.5364213605393643, + "grad_norm": 0.34335461258888245, + "learning_rate": 2.618328176636905e-05, + "loss": 0.1143, + "step": 30075 + }, + { + "epoch": 0.536439196661078, + "grad_norm": 0.23472779989242554, + "learning_rate": 2.618172700094948e-05, + "loss": 0.0967, + "step": 30076 + }, + { + "epoch": 0.5364570327827917, + "grad_norm": 0.36837902665138245, + "learning_rate": 2.618017223094913e-05, + "loss": 0.144, + "step": 30077 + }, + { + "epoch": 0.5364748689045054, + "grad_norm": 0.30010440945625305, + "learning_rate": 2.6178617456374027e-05, + "loss": 0.1903, + "step": 30078 + }, + { + "epoch": 0.5364927050262192, + "grad_norm": 0.20942270755767822, + "learning_rate": 2.6177062677230197e-05, + "loss": 0.0799, + "step": 30079 + }, + { + "epoch": 0.5365105411479328, + "grad_norm": 0.31642064452171326, + "learning_rate": 2.6175507893523665e-05, + "loss": 0.1384, + "step": 30080 + }, + { + "epoch": 0.5365283772696465, + "grad_norm": 0.23298269510269165, + "learning_rate": 2.617395310526046e-05, + "loss": 0.1309, + "step": 30081 + }, + { + "epoch": 0.5365462133913602, + "grad_norm": 0.2235119491815567, + "learning_rate": 2.61723983124466e-05, + "loss": 0.1052, + "step": 30082 + }, + { + "epoch": 0.5365640495130739, + "grad_norm": 0.22040526568889618, + "learning_rate": 2.6170843515088123e-05, + "loss": 0.133, + "step": 30083 + }, + { + "epoch": 0.5365818856347876, + "grad_norm": 0.2211795300245285, + "learning_rate": 2.6169288713191053e-05, + "loss": 0.1595, + "step": 30084 + }, + { + "epoch": 0.5365997217565013, + "grad_norm": 0.24885423481464386, + "learning_rate": 2.6167733906761415e-05, + "loss": 0.0994, + "step": 30085 + }, + { + "epoch": 0.536617557878215, + "grad_norm": 0.34601157903671265, + "learning_rate": 2.6166179095805236e-05, + "loss": 0.117, + "step": 30086 + }, + { + "epoch": 0.5366353939999287, + "grad_norm": 0.2754441499710083, + "learning_rate": 2.6164624280328548e-05, + "loss": 0.1403, + "step": 30087 + }, + { + "epoch": 0.5366532301216423, + "grad_norm": 0.23783928155899048, + "learning_rate": 2.616306946033737e-05, + "loss": 0.1268, + "step": 30088 + }, + { + "epoch": 0.536671066243356, + "grad_norm": 0.25563597679138184, + "learning_rate": 2.6161514635837742e-05, + "loss": 0.1221, + "step": 30089 + }, + { + "epoch": 0.5366889023650697, + "grad_norm": 0.285347044467926, + "learning_rate": 2.6159959806835665e-05, + "loss": 0.1017, + "step": 30090 + }, + { + "epoch": 0.5367067384867834, + "grad_norm": 0.20604771375656128, + "learning_rate": 2.6158404973337196e-05, + "loss": 0.1583, + "step": 30091 + }, + { + "epoch": 0.5367245746084971, + "grad_norm": 0.33926403522491455, + "learning_rate": 2.6156850135348343e-05, + "loss": 0.1479, + "step": 30092 + }, + { + "epoch": 0.5367424107302108, + "grad_norm": 0.22118711471557617, + "learning_rate": 2.615529529287515e-05, + "loss": 0.1572, + "step": 30093 + }, + { + "epoch": 0.5367602468519245, + "grad_norm": 0.2924410104751587, + "learning_rate": 2.6153740445923625e-05, + "loss": 0.1297, + "step": 30094 + }, + { + "epoch": 0.5367780829736382, + "grad_norm": 0.24608610570430756, + "learning_rate": 2.6152185594499807e-05, + "loss": 0.1712, + "step": 30095 + }, + { + "epoch": 0.536795919095352, + "grad_norm": 0.18926946818828583, + "learning_rate": 2.615063073860971e-05, + "loss": 0.1252, + "step": 30096 + }, + { + "epoch": 0.5368137552170656, + "grad_norm": 0.2684064507484436, + "learning_rate": 2.6149075878259378e-05, + "loss": 0.1373, + "step": 30097 + }, + { + "epoch": 0.5368315913387793, + "grad_norm": 0.2129097431898117, + "learning_rate": 2.6147521013454835e-05, + "loss": 0.1461, + "step": 30098 + }, + { + "epoch": 0.536849427460493, + "grad_norm": 0.2931802570819855, + "learning_rate": 2.61459661442021e-05, + "loss": 0.1609, + "step": 30099 + }, + { + "epoch": 0.5368672635822067, + "grad_norm": 0.2953733801841736, + "learning_rate": 2.614441127050721e-05, + "loss": 0.1512, + "step": 30100 + }, + { + "epoch": 0.5368850997039204, + "grad_norm": 0.2706487774848938, + "learning_rate": 2.6142856392376174e-05, + "loss": 0.0953, + "step": 30101 + }, + { + "epoch": 0.5369029358256341, + "grad_norm": 0.2619917690753937, + "learning_rate": 2.6141301509815042e-05, + "loss": 0.1596, + "step": 30102 + }, + { + "epoch": 0.5369207719473478, + "grad_norm": 0.17842665314674377, + "learning_rate": 2.613974662282983e-05, + "loss": 0.1196, + "step": 30103 + }, + { + "epoch": 0.5369386080690615, + "grad_norm": 0.309428870677948, + "learning_rate": 2.613819173142656e-05, + "loss": 0.1672, + "step": 30104 + }, + { + "epoch": 0.5369564441907752, + "grad_norm": 0.22137954831123352, + "learning_rate": 2.6136636835611265e-05, + "loss": 0.0996, + "step": 30105 + }, + { + "epoch": 0.5369742803124888, + "grad_norm": 0.2813457250595093, + "learning_rate": 2.6135081935389988e-05, + "loss": 0.0876, + "step": 30106 + }, + { + "epoch": 0.5369921164342025, + "grad_norm": 0.17232254147529602, + "learning_rate": 2.6133527030768733e-05, + "loss": 0.12, + "step": 30107 + }, + { + "epoch": 0.5370099525559162, + "grad_norm": 0.3035275936126709, + "learning_rate": 2.6131972121753536e-05, + "loss": 0.1718, + "step": 30108 + }, + { + "epoch": 0.5370277886776299, + "grad_norm": 0.30437931418418884, + "learning_rate": 2.6130417208350423e-05, + "loss": 0.2239, + "step": 30109 + }, + { + "epoch": 0.5370456247993436, + "grad_norm": 0.2418203055858612, + "learning_rate": 2.612886229056542e-05, + "loss": 0.1273, + "step": 30110 + }, + { + "epoch": 0.5370634609210573, + "grad_norm": 0.2719059884548187, + "learning_rate": 2.612730736840456e-05, + "loss": 0.1128, + "step": 30111 + }, + { + "epoch": 0.537081297042771, + "grad_norm": 0.22249765694141388, + "learning_rate": 2.612575244187386e-05, + "loss": 0.0664, + "step": 30112 + }, + { + "epoch": 0.5370991331644848, + "grad_norm": 0.28104168176651, + "learning_rate": 2.6124197510979366e-05, + "loss": 0.1348, + "step": 30113 + }, + { + "epoch": 0.5371169692861985, + "grad_norm": 0.3130894601345062, + "learning_rate": 2.6122642575727086e-05, + "loss": 0.2115, + "step": 30114 + }, + { + "epoch": 0.5371348054079121, + "grad_norm": 0.4007646143436432, + "learning_rate": 2.6121087636123066e-05, + "loss": 0.1529, + "step": 30115 + }, + { + "epoch": 0.5371526415296258, + "grad_norm": 0.24136139452457428, + "learning_rate": 2.611953269217331e-05, + "loss": 0.1712, + "step": 30116 + }, + { + "epoch": 0.5371704776513395, + "grad_norm": 0.25347164273262024, + "learning_rate": 2.6117977743883864e-05, + "loss": 0.1384, + "step": 30117 + }, + { + "epoch": 0.5371883137730532, + "grad_norm": 0.20396308600902557, + "learning_rate": 2.6116422791260747e-05, + "loss": 0.1432, + "step": 30118 + }, + { + "epoch": 0.5372061498947669, + "grad_norm": 0.2601173520088196, + "learning_rate": 2.6114867834309997e-05, + "loss": 0.1458, + "step": 30119 + }, + { + "epoch": 0.5372239860164806, + "grad_norm": 0.23643594980239868, + "learning_rate": 2.6113312873037634e-05, + "loss": 0.1273, + "step": 30120 + }, + { + "epoch": 0.5372418221381943, + "grad_norm": 0.4889412820339203, + "learning_rate": 2.6111757907449678e-05, + "loss": 0.1214, + "step": 30121 + }, + { + "epoch": 0.537259658259908, + "grad_norm": 0.28934311866760254, + "learning_rate": 2.6110202937552175e-05, + "loss": 0.1431, + "step": 30122 + }, + { + "epoch": 0.5372774943816216, + "grad_norm": 0.31065303087234497, + "learning_rate": 2.6108647963351135e-05, + "loss": 0.1707, + "step": 30123 + }, + { + "epoch": 0.5372953305033353, + "grad_norm": 0.2623595595359802, + "learning_rate": 2.6107092984852588e-05, + "loss": 0.1369, + "step": 30124 + }, + { + "epoch": 0.537313166625049, + "grad_norm": 0.26367178559303284, + "learning_rate": 2.610553800206257e-05, + "loss": 0.1536, + "step": 30125 + }, + { + "epoch": 0.5373310027467627, + "grad_norm": 0.23547498881816864, + "learning_rate": 2.610398301498711e-05, + "loss": 0.1333, + "step": 30126 + }, + { + "epoch": 0.5373488388684764, + "grad_norm": 0.2615172564983368, + "learning_rate": 2.6102428023632225e-05, + "loss": 0.1648, + "step": 30127 + }, + { + "epoch": 0.5373666749901901, + "grad_norm": 0.23980942368507385, + "learning_rate": 2.610087302800395e-05, + "loss": 0.1201, + "step": 30128 + }, + { + "epoch": 0.5373845111119038, + "grad_norm": 0.2491067349910736, + "learning_rate": 2.609931802810831e-05, + "loss": 0.1029, + "step": 30129 + }, + { + "epoch": 0.5374023472336176, + "grad_norm": 0.17991548776626587, + "learning_rate": 2.609776302395134e-05, + "loss": 0.0941, + "step": 30130 + }, + { + "epoch": 0.5374201833553313, + "grad_norm": 0.22439929842948914, + "learning_rate": 2.6096208015539054e-05, + "loss": 0.126, + "step": 30131 + }, + { + "epoch": 0.537438019477045, + "grad_norm": 0.2057967334985733, + "learning_rate": 2.609465300287749e-05, + "loss": 0.1166, + "step": 30132 + }, + { + "epoch": 0.5374558555987586, + "grad_norm": 0.3045075833797455, + "learning_rate": 2.609309798597267e-05, + "loss": 0.188, + "step": 30133 + }, + { + "epoch": 0.5374736917204723, + "grad_norm": 0.25291216373443604, + "learning_rate": 2.609154296483063e-05, + "loss": 0.1608, + "step": 30134 + }, + { + "epoch": 0.537491527842186, + "grad_norm": 0.2918686270713806, + "learning_rate": 2.6089987939457388e-05, + "loss": 0.1831, + "step": 30135 + }, + { + "epoch": 0.5375093639638997, + "grad_norm": 0.30761343240737915, + "learning_rate": 2.608843290985898e-05, + "loss": 0.1665, + "step": 30136 + }, + { + "epoch": 0.5375272000856134, + "grad_norm": 0.25711649656295776, + "learning_rate": 2.6086877876041427e-05, + "loss": 0.1112, + "step": 30137 + }, + { + "epoch": 0.5375450362073271, + "grad_norm": 0.25996163487434387, + "learning_rate": 2.608532283801076e-05, + "loss": 0.1641, + "step": 30138 + }, + { + "epoch": 0.5375628723290408, + "grad_norm": 0.2545626163482666, + "learning_rate": 2.6083767795773006e-05, + "loss": 0.1102, + "step": 30139 + }, + { + "epoch": 0.5375807084507545, + "grad_norm": 0.2902357578277588, + "learning_rate": 2.608221274933419e-05, + "loss": 0.1323, + "step": 30140 + }, + { + "epoch": 0.5375985445724681, + "grad_norm": 0.2788262367248535, + "learning_rate": 2.6080657698700356e-05, + "loss": 0.1411, + "step": 30141 + }, + { + "epoch": 0.5376163806941818, + "grad_norm": 0.25920966267585754, + "learning_rate": 2.6079102643877507e-05, + "loss": 0.1828, + "step": 30142 + }, + { + "epoch": 0.5376342168158955, + "grad_norm": 0.290206640958786, + "learning_rate": 2.607754758487169e-05, + "loss": 0.112, + "step": 30143 + }, + { + "epoch": 0.5376520529376092, + "grad_norm": 0.20573769509792328, + "learning_rate": 2.6075992521688924e-05, + "loss": 0.1547, + "step": 30144 + }, + { + "epoch": 0.5376698890593229, + "grad_norm": 0.26334241032600403, + "learning_rate": 2.607443745433524e-05, + "loss": 0.1506, + "step": 30145 + }, + { + "epoch": 0.5376877251810367, + "grad_norm": 0.3441622257232666, + "learning_rate": 2.607288238281666e-05, + "loss": 0.1253, + "step": 30146 + }, + { + "epoch": 0.5377055613027504, + "grad_norm": 0.31098997592926025, + "learning_rate": 2.607132730713923e-05, + "loss": 0.1946, + "step": 30147 + }, + { + "epoch": 0.5377233974244641, + "grad_norm": 0.22055085003376007, + "learning_rate": 2.6069772227308958e-05, + "loss": 0.1064, + "step": 30148 + }, + { + "epoch": 0.5377412335461778, + "grad_norm": 0.26129046082496643, + "learning_rate": 2.606821714333188e-05, + "loss": 0.0877, + "step": 30149 + }, + { + "epoch": 0.5377590696678914, + "grad_norm": 0.28059816360473633, + "learning_rate": 2.6066662055214024e-05, + "loss": 0.1255, + "step": 30150 + }, + { + "epoch": 0.5377769057896051, + "grad_norm": 0.3769588768482208, + "learning_rate": 2.6065106962961416e-05, + "loss": 0.169, + "step": 30151 + }, + { + "epoch": 0.5377947419113188, + "grad_norm": 0.27034151554107666, + "learning_rate": 2.6063551866580084e-05, + "loss": 0.1637, + "step": 30152 + }, + { + "epoch": 0.5378125780330325, + "grad_norm": 0.3192272186279297, + "learning_rate": 2.6061996766076057e-05, + "loss": 0.1272, + "step": 30153 + }, + { + "epoch": 0.5378304141547462, + "grad_norm": 0.28916579484939575, + "learning_rate": 2.606044166145537e-05, + "loss": 0.1584, + "step": 30154 + }, + { + "epoch": 0.5378482502764599, + "grad_norm": 0.3682737648487091, + "learning_rate": 2.605888655272404e-05, + "loss": 0.1519, + "step": 30155 + }, + { + "epoch": 0.5378660863981736, + "grad_norm": 0.20910081267356873, + "learning_rate": 2.6057331439888106e-05, + "loss": 0.1095, + "step": 30156 + }, + { + "epoch": 0.5378839225198873, + "grad_norm": 0.23869255185127258, + "learning_rate": 2.6055776322953585e-05, + "loss": 0.1462, + "step": 30157 + }, + { + "epoch": 0.537901758641601, + "grad_norm": 0.2382553219795227, + "learning_rate": 2.605422120192651e-05, + "loss": 0.1099, + "step": 30158 + }, + { + "epoch": 0.5379195947633146, + "grad_norm": 0.214219331741333, + "learning_rate": 2.605266607681291e-05, + "loss": 0.1353, + "step": 30159 + }, + { + "epoch": 0.5379374308850283, + "grad_norm": 0.28148606419563293, + "learning_rate": 2.605111094761881e-05, + "loss": 0.1112, + "step": 30160 + }, + { + "epoch": 0.537955267006742, + "grad_norm": 0.22239606082439423, + "learning_rate": 2.6049555814350248e-05, + "loss": 0.0909, + "step": 30161 + }, + { + "epoch": 0.5379731031284557, + "grad_norm": 0.26567742228507996, + "learning_rate": 2.604800067701324e-05, + "loss": 0.1038, + "step": 30162 + }, + { + "epoch": 0.5379909392501695, + "grad_norm": 0.19751587510108948, + "learning_rate": 2.6046445535613822e-05, + "loss": 0.1043, + "step": 30163 + }, + { + "epoch": 0.5380087753718832, + "grad_norm": 0.2616390883922577, + "learning_rate": 2.6044890390158017e-05, + "loss": 0.1221, + "step": 30164 + }, + { + "epoch": 0.5380266114935969, + "grad_norm": 0.2807294726371765, + "learning_rate": 2.6043335240651863e-05, + "loss": 0.1379, + "step": 30165 + }, + { + "epoch": 0.5380444476153106, + "grad_norm": 0.31500375270843506, + "learning_rate": 2.604178008710137e-05, + "loss": 0.1216, + "step": 30166 + }, + { + "epoch": 0.5380622837370242, + "grad_norm": 0.28421783447265625, + "learning_rate": 2.604022492951258e-05, + "loss": 0.1119, + "step": 30167 + }, + { + "epoch": 0.5380801198587379, + "grad_norm": 0.22175948321819305, + "learning_rate": 2.6038669767891522e-05, + "loss": 0.1153, + "step": 30168 + }, + { + "epoch": 0.5380979559804516, + "grad_norm": 0.2739332318305969, + "learning_rate": 2.6037114602244224e-05, + "loss": 0.0976, + "step": 30169 + }, + { + "epoch": 0.5381157921021653, + "grad_norm": 0.23701195418834686, + "learning_rate": 2.6035559432576707e-05, + "loss": 0.1702, + "step": 30170 + }, + { + "epoch": 0.538133628223879, + "grad_norm": 0.2555486857891083, + "learning_rate": 2.6034004258895006e-05, + "loss": 0.1654, + "step": 30171 + }, + { + "epoch": 0.5381514643455927, + "grad_norm": 0.328472763299942, + "learning_rate": 2.6032449081205146e-05, + "loss": 0.0959, + "step": 30172 + }, + { + "epoch": 0.5381693004673064, + "grad_norm": 0.29574477672576904, + "learning_rate": 2.6030893899513153e-05, + "loss": 0.1528, + "step": 30173 + }, + { + "epoch": 0.5381871365890201, + "grad_norm": 0.2431219220161438, + "learning_rate": 2.602933871382507e-05, + "loss": 0.1446, + "step": 30174 + }, + { + "epoch": 0.5382049727107338, + "grad_norm": 0.19734881818294525, + "learning_rate": 2.6027783524146903e-05, + "loss": 0.1229, + "step": 30175 + }, + { + "epoch": 0.5382228088324474, + "grad_norm": 0.3010072112083435, + "learning_rate": 2.6026228330484702e-05, + "loss": 0.1618, + "step": 30176 + }, + { + "epoch": 0.5382406449541611, + "grad_norm": 0.21486851572990417, + "learning_rate": 2.6024673132844478e-05, + "loss": 0.1431, + "step": 30177 + }, + { + "epoch": 0.5382584810758748, + "grad_norm": 0.2521578073501587, + "learning_rate": 2.6023117931232276e-05, + "loss": 0.1456, + "step": 30178 + }, + { + "epoch": 0.5382763171975885, + "grad_norm": 0.2575049102306366, + "learning_rate": 2.602156272565411e-05, + "loss": 0.1444, + "step": 30179 + }, + { + "epoch": 0.5382941533193023, + "grad_norm": 0.30389899015426636, + "learning_rate": 2.6020007516116007e-05, + "loss": 0.1386, + "step": 30180 + }, + { + "epoch": 0.538311989441016, + "grad_norm": 0.26248690485954285, + "learning_rate": 2.601845230262401e-05, + "loss": 0.0853, + "step": 30181 + }, + { + "epoch": 0.5383298255627297, + "grad_norm": 0.23721909523010254, + "learning_rate": 2.6016897085184145e-05, + "loss": 0.1285, + "step": 30182 + }, + { + "epoch": 0.5383476616844434, + "grad_norm": 0.3314650356769562, + "learning_rate": 2.601534186380242e-05, + "loss": 0.1629, + "step": 30183 + }, + { + "epoch": 0.5383654978061571, + "grad_norm": 0.27695146203041077, + "learning_rate": 2.60137866384849e-05, + "loss": 0.1539, + "step": 30184 + }, + { + "epoch": 0.5383833339278707, + "grad_norm": 0.2582625448703766, + "learning_rate": 2.6012231409237575e-05, + "loss": 0.1689, + "step": 30185 + }, + { + "epoch": 0.5384011700495844, + "grad_norm": 0.3072316348552704, + "learning_rate": 2.6010676176066507e-05, + "loss": 0.146, + "step": 30186 + }, + { + "epoch": 0.5384190061712981, + "grad_norm": 0.24565082788467407, + "learning_rate": 2.6009120938977692e-05, + "loss": 0.1432, + "step": 30187 + }, + { + "epoch": 0.5384368422930118, + "grad_norm": 0.2465829700231552, + "learning_rate": 2.6007565697977187e-05, + "loss": 0.1235, + "step": 30188 + }, + { + "epoch": 0.5384546784147255, + "grad_norm": 0.345730721950531, + "learning_rate": 2.600601045307101e-05, + "loss": 0.207, + "step": 30189 + }, + { + "epoch": 0.5384725145364392, + "grad_norm": 0.6362784504890442, + "learning_rate": 2.6004455204265182e-05, + "loss": 0.2045, + "step": 30190 + }, + { + "epoch": 0.5384903506581529, + "grad_norm": 0.2542247474193573, + "learning_rate": 2.6002899951565746e-05, + "loss": 0.1379, + "step": 30191 + }, + { + "epoch": 0.5385081867798666, + "grad_norm": 0.2850508987903595, + "learning_rate": 2.6001344694978717e-05, + "loss": 0.153, + "step": 30192 + }, + { + "epoch": 0.5385260229015802, + "grad_norm": 0.3003283143043518, + "learning_rate": 2.5999789434510136e-05, + "loss": 0.0988, + "step": 30193 + }, + { + "epoch": 0.5385438590232939, + "grad_norm": 0.22373245656490326, + "learning_rate": 2.5998234170166013e-05, + "loss": 0.1382, + "step": 30194 + }, + { + "epoch": 0.5385616951450076, + "grad_norm": 0.29682183265686035, + "learning_rate": 2.5996678901952403e-05, + "loss": 0.1302, + "step": 30195 + }, + { + "epoch": 0.5385795312667213, + "grad_norm": 0.22597390413284302, + "learning_rate": 2.5995123629875317e-05, + "loss": 0.1404, + "step": 30196 + }, + { + "epoch": 0.5385973673884351, + "grad_norm": 0.3263636827468872, + "learning_rate": 2.5993568353940795e-05, + "loss": 0.1298, + "step": 30197 + }, + { + "epoch": 0.5386152035101488, + "grad_norm": 0.23312269151210785, + "learning_rate": 2.5992013074154848e-05, + "loss": 0.1563, + "step": 30198 + }, + { + "epoch": 0.5386330396318625, + "grad_norm": 0.23365218937397003, + "learning_rate": 2.5990457790523527e-05, + "loss": 0.1451, + "step": 30199 + }, + { + "epoch": 0.5386508757535762, + "grad_norm": 0.24646709859371185, + "learning_rate": 2.5988902503052837e-05, + "loss": 0.1618, + "step": 30200 + }, + { + "epoch": 0.5386687118752899, + "grad_norm": 0.2120083123445511, + "learning_rate": 2.5987347211748824e-05, + "loss": 0.1298, + "step": 30201 + }, + { + "epoch": 0.5386865479970036, + "grad_norm": 0.23787184059619904, + "learning_rate": 2.598579191661752e-05, + "loss": 0.1307, + "step": 30202 + }, + { + "epoch": 0.5387043841187172, + "grad_norm": 0.26815640926361084, + "learning_rate": 2.5984236617664936e-05, + "loss": 0.1277, + "step": 30203 + }, + { + "epoch": 0.5387222202404309, + "grad_norm": 0.3354211151599884, + "learning_rate": 2.5982681314897118e-05, + "loss": 0.1802, + "step": 30204 + }, + { + "epoch": 0.5387400563621446, + "grad_norm": 0.21570003032684326, + "learning_rate": 2.5981126008320078e-05, + "loss": 0.0878, + "step": 30205 + }, + { + "epoch": 0.5387578924838583, + "grad_norm": 0.3057674169540405, + "learning_rate": 2.5979570697939864e-05, + "loss": 0.1484, + "step": 30206 + }, + { + "epoch": 0.538775728605572, + "grad_norm": 0.14569173753261566, + "learning_rate": 2.5978015383762495e-05, + "loss": 0.0774, + "step": 30207 + }, + { + "epoch": 0.5387935647272857, + "grad_norm": 0.20691975951194763, + "learning_rate": 2.5976460065793994e-05, + "loss": 0.1085, + "step": 30208 + }, + { + "epoch": 0.5388114008489994, + "grad_norm": 0.18781836330890656, + "learning_rate": 2.5974904744040397e-05, + "loss": 0.0624, + "step": 30209 + }, + { + "epoch": 0.538829236970713, + "grad_norm": 0.3202458620071411, + "learning_rate": 2.5973349418507742e-05, + "loss": 0.1566, + "step": 30210 + }, + { + "epoch": 0.5388470730924267, + "grad_norm": 0.2658301293849945, + "learning_rate": 2.597179408920204e-05, + "loss": 0.1645, + "step": 30211 + }, + { + "epoch": 0.5388649092141404, + "grad_norm": 0.2889425754547119, + "learning_rate": 2.5970238756129332e-05, + "loss": 0.1679, + "step": 30212 + }, + { + "epoch": 0.5388827453358541, + "grad_norm": 0.2121240496635437, + "learning_rate": 2.596868341929565e-05, + "loss": 0.1353, + "step": 30213 + }, + { + "epoch": 0.5389005814575679, + "grad_norm": 0.25801408290863037, + "learning_rate": 2.5967128078707004e-05, + "loss": 0.1484, + "step": 30214 + }, + { + "epoch": 0.5389184175792816, + "grad_norm": 0.2459481954574585, + "learning_rate": 2.5965572734369438e-05, + "loss": 0.1263, + "step": 30215 + }, + { + "epoch": 0.5389362537009953, + "grad_norm": 0.3302626609802246, + "learning_rate": 2.596401738628898e-05, + "loss": 0.1191, + "step": 30216 + }, + { + "epoch": 0.538954089822709, + "grad_norm": 0.31630364060401917, + "learning_rate": 2.5962462034471663e-05, + "loss": 0.1298, + "step": 30217 + }, + { + "epoch": 0.5389719259444227, + "grad_norm": 0.2410750389099121, + "learning_rate": 2.59609066789235e-05, + "loss": 0.0959, + "step": 30218 + }, + { + "epoch": 0.5389897620661364, + "grad_norm": 0.3196088969707489, + "learning_rate": 2.5959351319650543e-05, + "loss": 0.1573, + "step": 30219 + }, + { + "epoch": 0.53900759818785, + "grad_norm": 0.2422807663679123, + "learning_rate": 2.5957795956658804e-05, + "loss": 0.133, + "step": 30220 + }, + { + "epoch": 0.5390254343095637, + "grad_norm": 0.3216957151889801, + "learning_rate": 2.5956240589954318e-05, + "loss": 0.1602, + "step": 30221 + }, + { + "epoch": 0.5390432704312774, + "grad_norm": 0.34926286339759827, + "learning_rate": 2.5954685219543106e-05, + "loss": 0.1891, + "step": 30222 + }, + { + "epoch": 0.5390611065529911, + "grad_norm": 0.43251731991767883, + "learning_rate": 2.595312984543121e-05, + "loss": 0.1514, + "step": 30223 + }, + { + "epoch": 0.5390789426747048, + "grad_norm": 0.25458455085754395, + "learning_rate": 2.595157446762465e-05, + "loss": 0.143, + "step": 30224 + }, + { + "epoch": 0.5390967787964185, + "grad_norm": 0.28090301156044006, + "learning_rate": 2.5950019086129467e-05, + "loss": 0.1347, + "step": 30225 + }, + { + "epoch": 0.5391146149181322, + "grad_norm": 0.32432064414024353, + "learning_rate": 2.594846370095168e-05, + "loss": 0.1852, + "step": 30226 + }, + { + "epoch": 0.5391324510398459, + "grad_norm": 0.20416459441184998, + "learning_rate": 2.594690831209732e-05, + "loss": 0.112, + "step": 30227 + }, + { + "epoch": 0.5391502871615595, + "grad_norm": 0.17926311492919922, + "learning_rate": 2.5945352919572408e-05, + "loss": 0.0929, + "step": 30228 + }, + { + "epoch": 0.5391681232832732, + "grad_norm": 0.28817903995513916, + "learning_rate": 2.5943797523382985e-05, + "loss": 0.1671, + "step": 30229 + }, + { + "epoch": 0.5391859594049869, + "grad_norm": 0.3736797869205475, + "learning_rate": 2.5942242123535082e-05, + "loss": 0.2002, + "step": 30230 + }, + { + "epoch": 0.5392037955267007, + "grad_norm": 0.28220483660697937, + "learning_rate": 2.5940686720034718e-05, + "loss": 0.151, + "step": 30231 + }, + { + "epoch": 0.5392216316484144, + "grad_norm": 0.2664179801940918, + "learning_rate": 2.593913131288793e-05, + "loss": 0.1667, + "step": 30232 + }, + { + "epoch": 0.5392394677701281, + "grad_norm": 0.20628675818443298, + "learning_rate": 2.5937575902100747e-05, + "loss": 0.123, + "step": 30233 + }, + { + "epoch": 0.5392573038918418, + "grad_norm": 0.22482234239578247, + "learning_rate": 2.593602048767919e-05, + "loss": 0.078, + "step": 30234 + }, + { + "epoch": 0.5392751400135555, + "grad_norm": 0.24880845844745636, + "learning_rate": 2.5934465069629298e-05, + "loss": 0.1608, + "step": 30235 + }, + { + "epoch": 0.5392929761352692, + "grad_norm": 0.26692065596580505, + "learning_rate": 2.5932909647957092e-05, + "loss": 0.1364, + "step": 30236 + }, + { + "epoch": 0.5393108122569829, + "grad_norm": 0.22033682465553284, + "learning_rate": 2.593135422266861e-05, + "loss": 0.1082, + "step": 30237 + }, + { + "epoch": 0.5393286483786965, + "grad_norm": 0.3624100983142853, + "learning_rate": 2.5929798793769884e-05, + "loss": 0.127, + "step": 30238 + }, + { + "epoch": 0.5393464845004102, + "grad_norm": 0.2656458020210266, + "learning_rate": 2.592824336126693e-05, + "loss": 0.1301, + "step": 30239 + }, + { + "epoch": 0.5393643206221239, + "grad_norm": 0.29909950494766235, + "learning_rate": 2.5926687925165782e-05, + "loss": 0.1415, + "step": 30240 + }, + { + "epoch": 0.5393821567438376, + "grad_norm": 0.3570648431777954, + "learning_rate": 2.5925132485472476e-05, + "loss": 0.2211, + "step": 30241 + }, + { + "epoch": 0.5393999928655513, + "grad_norm": 0.23444794118404388, + "learning_rate": 2.5923577042193033e-05, + "loss": 0.0894, + "step": 30242 + }, + { + "epoch": 0.539417828987265, + "grad_norm": 0.2331296056509018, + "learning_rate": 2.5922021595333486e-05, + "loss": 0.096, + "step": 30243 + }, + { + "epoch": 0.5394356651089787, + "grad_norm": 0.41303956508636475, + "learning_rate": 2.5920466144899864e-05, + "loss": 0.1378, + "step": 30244 + }, + { + "epoch": 0.5394535012306924, + "grad_norm": 0.2473756968975067, + "learning_rate": 2.5918910690898206e-05, + "loss": 0.1428, + "step": 30245 + }, + { + "epoch": 0.539471337352406, + "grad_norm": 0.3279547393321991, + "learning_rate": 2.591735523333452e-05, + "loss": 0.1161, + "step": 30246 + }, + { + "epoch": 0.5394891734741197, + "grad_norm": 0.2827765643596649, + "learning_rate": 2.591579977221486e-05, + "loss": 0.1355, + "step": 30247 + }, + { + "epoch": 0.5395070095958335, + "grad_norm": 0.20733055472373962, + "learning_rate": 2.591424430754524e-05, + "loss": 0.1476, + "step": 30248 + }, + { + "epoch": 0.5395248457175472, + "grad_norm": 0.28367963433265686, + "learning_rate": 2.591268883933169e-05, + "loss": 0.1334, + "step": 30249 + }, + { + "epoch": 0.5395426818392609, + "grad_norm": 0.2542972266674042, + "learning_rate": 2.5911133367580244e-05, + "loss": 0.1188, + "step": 30250 + }, + { + "epoch": 0.5395605179609746, + "grad_norm": 0.2309126853942871, + "learning_rate": 2.590957789229693e-05, + "loss": 0.1272, + "step": 30251 + }, + { + "epoch": 0.5395783540826883, + "grad_norm": 0.25583431124687195, + "learning_rate": 2.5908022413487787e-05, + "loss": 0.1716, + "step": 30252 + }, + { + "epoch": 0.539596190204402, + "grad_norm": 0.20859479904174805, + "learning_rate": 2.5906466931158823e-05, + "loss": 0.108, + "step": 30253 + }, + { + "epoch": 0.5396140263261157, + "grad_norm": 0.24227586388587952, + "learning_rate": 2.5904911445316086e-05, + "loss": 0.1567, + "step": 30254 + }, + { + "epoch": 0.5396318624478293, + "grad_norm": 0.2867540717124939, + "learning_rate": 2.5903355955965603e-05, + "loss": 0.1571, + "step": 30255 + }, + { + "epoch": 0.539649698569543, + "grad_norm": 0.2770766317844391, + "learning_rate": 2.590180046311339e-05, + "loss": 0.1291, + "step": 30256 + }, + { + "epoch": 0.5396675346912567, + "grad_norm": 0.2663654088973999, + "learning_rate": 2.590024496676549e-05, + "loss": 0.165, + "step": 30257 + }, + { + "epoch": 0.5396853708129704, + "grad_norm": 0.2213406264781952, + "learning_rate": 2.589868946692794e-05, + "loss": 0.137, + "step": 30258 + }, + { + "epoch": 0.5397032069346841, + "grad_norm": 0.2811971604824066, + "learning_rate": 2.5897133963606744e-05, + "loss": 0.1488, + "step": 30259 + }, + { + "epoch": 0.5397210430563978, + "grad_norm": 0.2854959964752197, + "learning_rate": 2.589557845680796e-05, + "loss": 0.14, + "step": 30260 + }, + { + "epoch": 0.5397388791781115, + "grad_norm": 0.26762497425079346, + "learning_rate": 2.58940229465376e-05, + "loss": 0.1595, + "step": 30261 + }, + { + "epoch": 0.5397567152998252, + "grad_norm": 0.2501121163368225, + "learning_rate": 2.58924674328017e-05, + "loss": 0.1329, + "step": 30262 + }, + { + "epoch": 0.5397745514215389, + "grad_norm": 0.2629779279232025, + "learning_rate": 2.589091191560628e-05, + "loss": 0.136, + "step": 30263 + }, + { + "epoch": 0.5397923875432526, + "grad_norm": 0.29275083541870117, + "learning_rate": 2.588935639495738e-05, + "loss": 0.1453, + "step": 30264 + }, + { + "epoch": 0.5398102236649663, + "grad_norm": 0.28022679686546326, + "learning_rate": 2.588780087086104e-05, + "loss": 0.1353, + "step": 30265 + }, + { + "epoch": 0.53982805978668, + "grad_norm": 0.2810835838317871, + "learning_rate": 2.5886245343323263e-05, + "loss": 0.1593, + "step": 30266 + }, + { + "epoch": 0.5398458959083937, + "grad_norm": 0.3558344841003418, + "learning_rate": 2.58846898123501e-05, + "loss": 0.0792, + "step": 30267 + }, + { + "epoch": 0.5398637320301074, + "grad_norm": 0.2560634911060333, + "learning_rate": 2.588313427794757e-05, + "loss": 0.1374, + "step": 30268 + }, + { + "epoch": 0.5398815681518211, + "grad_norm": 0.26828914880752563, + "learning_rate": 2.588157874012171e-05, + "loss": 0.141, + "step": 30269 + }, + { + "epoch": 0.5398994042735348, + "grad_norm": 0.23489776253700256, + "learning_rate": 2.5880023198878544e-05, + "loss": 0.199, + "step": 30270 + }, + { + "epoch": 0.5399172403952485, + "grad_norm": 0.2857406735420227, + "learning_rate": 2.5878467654224104e-05, + "loss": 0.1525, + "step": 30271 + }, + { + "epoch": 0.5399350765169622, + "grad_norm": 0.25609347224235535, + "learning_rate": 2.5876912106164415e-05, + "loss": 0.1443, + "step": 30272 + }, + { + "epoch": 0.5399529126386758, + "grad_norm": 0.26739031076431274, + "learning_rate": 2.5875356554705522e-05, + "loss": 0.1295, + "step": 30273 + }, + { + "epoch": 0.5399707487603895, + "grad_norm": 0.3170407712459564, + "learning_rate": 2.587380099985344e-05, + "loss": 0.1599, + "step": 30274 + }, + { + "epoch": 0.5399885848821032, + "grad_norm": 0.2341240644454956, + "learning_rate": 2.5872245441614202e-05, + "loss": 0.1277, + "step": 30275 + }, + { + "epoch": 0.5400064210038169, + "grad_norm": 0.213741272687912, + "learning_rate": 2.587068987999384e-05, + "loss": 0.121, + "step": 30276 + }, + { + "epoch": 0.5400242571255306, + "grad_norm": 0.26799917221069336, + "learning_rate": 2.5869134314998382e-05, + "loss": 0.1149, + "step": 30277 + }, + { + "epoch": 0.5400420932472443, + "grad_norm": 0.2628558278083801, + "learning_rate": 2.5867578746633864e-05, + "loss": 0.0884, + "step": 30278 + }, + { + "epoch": 0.540059929368958, + "grad_norm": 0.2583215832710266, + "learning_rate": 2.5866023174906306e-05, + "loss": 0.0893, + "step": 30279 + }, + { + "epoch": 0.5400777654906717, + "grad_norm": 0.2964015603065491, + "learning_rate": 2.5864467599821753e-05, + "loss": 0.1313, + "step": 30280 + }, + { + "epoch": 0.5400956016123855, + "grad_norm": 0.39377182722091675, + "learning_rate": 2.586291202138621e-05, + "loss": 0.1669, + "step": 30281 + }, + { + "epoch": 0.5401134377340991, + "grad_norm": 0.19919002056121826, + "learning_rate": 2.586135643960573e-05, + "loss": 0.1247, + "step": 30282 + }, + { + "epoch": 0.5401312738558128, + "grad_norm": 0.2302965670824051, + "learning_rate": 2.585980085448634e-05, + "loss": 0.1231, + "step": 30283 + }, + { + "epoch": 0.5401491099775265, + "grad_norm": 0.24823573231697083, + "learning_rate": 2.5858245266034054e-05, + "loss": 0.0878, + "step": 30284 + }, + { + "epoch": 0.5401669460992402, + "grad_norm": 0.2385394126176834, + "learning_rate": 2.585668967425492e-05, + "loss": 0.1705, + "step": 30285 + }, + { + "epoch": 0.5401847822209539, + "grad_norm": 0.2751266360282898, + "learning_rate": 2.5855134079154957e-05, + "loss": 0.1358, + "step": 30286 + }, + { + "epoch": 0.5402026183426676, + "grad_norm": 0.3314450681209564, + "learning_rate": 2.5853578480740203e-05, + "loss": 0.1688, + "step": 30287 + }, + { + "epoch": 0.5402204544643813, + "grad_norm": 0.23528346419334412, + "learning_rate": 2.5852022879016684e-05, + "loss": 0.1569, + "step": 30288 + }, + { + "epoch": 0.540238290586095, + "grad_norm": 0.3035106658935547, + "learning_rate": 2.5850467273990424e-05, + "loss": 0.1463, + "step": 30289 + }, + { + "epoch": 0.5402561267078086, + "grad_norm": 0.4092877209186554, + "learning_rate": 2.5848911665667468e-05, + "loss": 0.1503, + "step": 30290 + }, + { + "epoch": 0.5402739628295223, + "grad_norm": 0.23853865265846252, + "learning_rate": 2.584735605405383e-05, + "loss": 0.1328, + "step": 30291 + }, + { + "epoch": 0.540291798951236, + "grad_norm": 0.2787911891937256, + "learning_rate": 2.5845800439155547e-05, + "loss": 0.1624, + "step": 30292 + }, + { + "epoch": 0.5403096350729497, + "grad_norm": 0.22847092151641846, + "learning_rate": 2.5844244820978657e-05, + "loss": 0.1847, + "step": 30293 + }, + { + "epoch": 0.5403274711946634, + "grad_norm": 0.31058481335639954, + "learning_rate": 2.5842689199529174e-05, + "loss": 0.1144, + "step": 30294 + }, + { + "epoch": 0.5403453073163771, + "grad_norm": 0.2743181586265564, + "learning_rate": 2.5841133574813142e-05, + "loss": 0.1191, + "step": 30295 + }, + { + "epoch": 0.5403631434380908, + "grad_norm": 0.2613810896873474, + "learning_rate": 2.5839577946836586e-05, + "loss": 0.13, + "step": 30296 + }, + { + "epoch": 0.5403809795598045, + "grad_norm": 0.27584362030029297, + "learning_rate": 2.5838022315605536e-05, + "loss": 0.1309, + "step": 30297 + }, + { + "epoch": 0.5403988156815183, + "grad_norm": 0.30723848938941956, + "learning_rate": 2.5836466681126016e-05, + "loss": 0.1095, + "step": 30298 + }, + { + "epoch": 0.540416651803232, + "grad_norm": 0.3005426824092865, + "learning_rate": 2.5834911043404064e-05, + "loss": 0.2067, + "step": 30299 + }, + { + "epoch": 0.5404344879249456, + "grad_norm": 0.21849948167800903, + "learning_rate": 2.583335540244571e-05, + "loss": 0.066, + "step": 30300 + }, + { + "epoch": 0.5404523240466593, + "grad_norm": 0.26733192801475525, + "learning_rate": 2.5831799758256985e-05, + "loss": 0.1004, + "step": 30301 + }, + { + "epoch": 0.540470160168373, + "grad_norm": 0.33891624212265015, + "learning_rate": 2.5830244110843916e-05, + "loss": 0.1587, + "step": 30302 + }, + { + "epoch": 0.5404879962900867, + "grad_norm": 0.3988088369369507, + "learning_rate": 2.5828688460212536e-05, + "loss": 0.1447, + "step": 30303 + }, + { + "epoch": 0.5405058324118004, + "grad_norm": 0.3887338638305664, + "learning_rate": 2.5827132806368864e-05, + "loss": 0.1319, + "step": 30304 + }, + { + "epoch": 0.5405236685335141, + "grad_norm": 0.22638940811157227, + "learning_rate": 2.5825577149318948e-05, + "loss": 0.1431, + "step": 30305 + }, + { + "epoch": 0.5405415046552278, + "grad_norm": 0.21133919060230255, + "learning_rate": 2.5824021489068807e-05, + "loss": 0.1482, + "step": 30306 + }, + { + "epoch": 0.5405593407769415, + "grad_norm": 0.25019216537475586, + "learning_rate": 2.5822465825624477e-05, + "loss": 0.1466, + "step": 30307 + }, + { + "epoch": 0.5405771768986551, + "grad_norm": 0.26719704270362854, + "learning_rate": 2.5820910158991983e-05, + "loss": 0.1256, + "step": 30308 + }, + { + "epoch": 0.5405950130203688, + "grad_norm": 0.2063877135515213, + "learning_rate": 2.5819354489177356e-05, + "loss": 0.1507, + "step": 30309 + }, + { + "epoch": 0.5406128491420825, + "grad_norm": 0.29922378063201904, + "learning_rate": 2.5817798816186633e-05, + "loss": 0.1246, + "step": 30310 + }, + { + "epoch": 0.5406306852637962, + "grad_norm": 0.2416669875383377, + "learning_rate": 2.5816243140025835e-05, + "loss": 0.1699, + "step": 30311 + }, + { + "epoch": 0.5406485213855099, + "grad_norm": 0.23381966352462769, + "learning_rate": 2.5814687460700993e-05, + "loss": 0.0865, + "step": 30312 + }, + { + "epoch": 0.5406663575072236, + "grad_norm": 0.2589220702648163, + "learning_rate": 2.5813131778218148e-05, + "loss": 0.1394, + "step": 30313 + }, + { + "epoch": 0.5406841936289373, + "grad_norm": 0.2585050165653229, + "learning_rate": 2.5811576092583323e-05, + "loss": 0.1436, + "step": 30314 + }, + { + "epoch": 0.5407020297506511, + "grad_norm": 0.2663988173007965, + "learning_rate": 2.5810020403802544e-05, + "loss": 0.1696, + "step": 30315 + }, + { + "epoch": 0.5407198658723648, + "grad_norm": 0.2469911426305771, + "learning_rate": 2.580846471188185e-05, + "loss": 0.1426, + "step": 30316 + }, + { + "epoch": 0.5407377019940784, + "grad_norm": 0.2628594636917114, + "learning_rate": 2.5806909016827275e-05, + "loss": 0.1712, + "step": 30317 + }, + { + "epoch": 0.5407555381157921, + "grad_norm": 0.23110628128051758, + "learning_rate": 2.5805353318644833e-05, + "loss": 0.1249, + "step": 30318 + }, + { + "epoch": 0.5407733742375058, + "grad_norm": 0.23697690665721893, + "learning_rate": 2.5803797617340557e-05, + "loss": 0.1552, + "step": 30319 + }, + { + "epoch": 0.5407912103592195, + "grad_norm": 0.30695194005966187, + "learning_rate": 2.580224191292049e-05, + "loss": 0.182, + "step": 30320 + }, + { + "epoch": 0.5408090464809332, + "grad_norm": 0.25207746028900146, + "learning_rate": 2.580068620539066e-05, + "loss": 0.1666, + "step": 30321 + }, + { + "epoch": 0.5408268826026469, + "grad_norm": 0.290650337934494, + "learning_rate": 2.579913049475709e-05, + "loss": 0.1302, + "step": 30322 + }, + { + "epoch": 0.5408447187243606, + "grad_norm": 0.2963530123233795, + "learning_rate": 2.579757478102582e-05, + "loss": 0.1491, + "step": 30323 + }, + { + "epoch": 0.5408625548460743, + "grad_norm": 0.23589985072612762, + "learning_rate": 2.5796019064202865e-05, + "loss": 0.0971, + "step": 30324 + }, + { + "epoch": 0.540880390967788, + "grad_norm": 0.3055668771266937, + "learning_rate": 2.5794463344294272e-05, + "loss": 0.1642, + "step": 30325 + }, + { + "epoch": 0.5408982270895016, + "grad_norm": 0.2741827070713043, + "learning_rate": 2.5792907621306057e-05, + "loss": 0.185, + "step": 30326 + }, + { + "epoch": 0.5409160632112153, + "grad_norm": 0.23917505145072937, + "learning_rate": 2.5791351895244265e-05, + "loss": 0.1275, + "step": 30327 + }, + { + "epoch": 0.540933899332929, + "grad_norm": 0.2493560016155243, + "learning_rate": 2.5789796166114915e-05, + "loss": 0.1586, + "step": 30328 + }, + { + "epoch": 0.5409517354546427, + "grad_norm": 0.238856241106987, + "learning_rate": 2.5788240433924053e-05, + "loss": 0.1515, + "step": 30329 + }, + { + "epoch": 0.5409695715763564, + "grad_norm": 0.29860156774520874, + "learning_rate": 2.5786684698677693e-05, + "loss": 0.1131, + "step": 30330 + }, + { + "epoch": 0.5409874076980701, + "grad_norm": 0.2356102019548416, + "learning_rate": 2.578512896038187e-05, + "loss": 0.1871, + "step": 30331 + }, + { + "epoch": 0.5410052438197839, + "grad_norm": 0.22272637486457825, + "learning_rate": 2.5783573219042613e-05, + "loss": 0.1193, + "step": 30332 + }, + { + "epoch": 0.5410230799414976, + "grad_norm": 0.22675439715385437, + "learning_rate": 2.5782017474665955e-05, + "loss": 0.1191, + "step": 30333 + }, + { + "epoch": 0.5410409160632113, + "grad_norm": 0.2381807565689087, + "learning_rate": 2.5780461727257936e-05, + "loss": 0.1224, + "step": 30334 + }, + { + "epoch": 0.5410587521849249, + "grad_norm": 0.182336688041687, + "learning_rate": 2.577890597682457e-05, + "loss": 0.095, + "step": 30335 + }, + { + "epoch": 0.5410765883066386, + "grad_norm": 0.2791365087032318, + "learning_rate": 2.57773502233719e-05, + "loss": 0.1449, + "step": 30336 + }, + { + "epoch": 0.5410944244283523, + "grad_norm": 0.21785160899162292, + "learning_rate": 2.577579446690595e-05, + "loss": 0.1061, + "step": 30337 + }, + { + "epoch": 0.541112260550066, + "grad_norm": 0.20653729140758514, + "learning_rate": 2.5774238707432758e-05, + "loss": 0.1034, + "step": 30338 + }, + { + "epoch": 0.5411300966717797, + "grad_norm": 0.37819990515708923, + "learning_rate": 2.5772682944958337e-05, + "loss": 0.2114, + "step": 30339 + }, + { + "epoch": 0.5411479327934934, + "grad_norm": 0.20874670147895813, + "learning_rate": 2.5771127179488736e-05, + "loss": 0.1244, + "step": 30340 + }, + { + "epoch": 0.5411657689152071, + "grad_norm": 0.31306561827659607, + "learning_rate": 2.576957141102998e-05, + "loss": 0.0999, + "step": 30341 + }, + { + "epoch": 0.5411836050369208, + "grad_norm": 0.26542919874191284, + "learning_rate": 2.5768015639588106e-05, + "loss": 0.1362, + "step": 30342 + }, + { + "epoch": 0.5412014411586344, + "grad_norm": 0.21382597088813782, + "learning_rate": 2.5766459865169128e-05, + "loss": 0.1231, + "step": 30343 + }, + { + "epoch": 0.5412192772803481, + "grad_norm": 0.32082098722457886, + "learning_rate": 2.5764904087779097e-05, + "loss": 0.1159, + "step": 30344 + }, + { + "epoch": 0.5412371134020618, + "grad_norm": 0.29864999651908875, + "learning_rate": 2.5763348307424028e-05, + "loss": 0.1295, + "step": 30345 + }, + { + "epoch": 0.5412549495237755, + "grad_norm": 0.1638377606868744, + "learning_rate": 2.5761792524109957e-05, + "loss": 0.0988, + "step": 30346 + }, + { + "epoch": 0.5412727856454892, + "grad_norm": 0.25024187564849854, + "learning_rate": 2.576023673784291e-05, + "loss": 0.1444, + "step": 30347 + }, + { + "epoch": 0.5412906217672029, + "grad_norm": 0.33483538031578064, + "learning_rate": 2.5758680948628928e-05, + "loss": 0.1113, + "step": 30348 + }, + { + "epoch": 0.5413084578889167, + "grad_norm": 0.31227901577949524, + "learning_rate": 2.575712515647404e-05, + "loss": 0.1559, + "step": 30349 + }, + { + "epoch": 0.5413262940106304, + "grad_norm": 0.2563287019729614, + "learning_rate": 2.5755569361384262e-05, + "loss": 0.1664, + "step": 30350 + }, + { + "epoch": 0.5413441301323441, + "grad_norm": 0.3479805886745453, + "learning_rate": 2.5754013563365652e-05, + "loss": 0.1112, + "step": 30351 + }, + { + "epoch": 0.5413619662540577, + "grad_norm": 0.24797408282756805, + "learning_rate": 2.5752457762424216e-05, + "loss": 0.1281, + "step": 30352 + }, + { + "epoch": 0.5413798023757714, + "grad_norm": 0.2663181722164154, + "learning_rate": 2.5750901958565994e-05, + "loss": 0.163, + "step": 30353 + }, + { + "epoch": 0.5413976384974851, + "grad_norm": 0.25221049785614014, + "learning_rate": 2.5749346151797012e-05, + "loss": 0.1359, + "step": 30354 + }, + { + "epoch": 0.5414154746191988, + "grad_norm": 0.21352344751358032, + "learning_rate": 2.5747790342123313e-05, + "loss": 0.1281, + "step": 30355 + }, + { + "epoch": 0.5414333107409125, + "grad_norm": 0.2809578478336334, + "learning_rate": 2.5746234529550918e-05, + "loss": 0.1336, + "step": 30356 + }, + { + "epoch": 0.5414511468626262, + "grad_norm": 0.2282564342021942, + "learning_rate": 2.5744678714085862e-05, + "loss": 0.0828, + "step": 30357 + }, + { + "epoch": 0.5414689829843399, + "grad_norm": 0.2017303854227066, + "learning_rate": 2.5743122895734178e-05, + "loss": 0.1083, + "step": 30358 + }, + { + "epoch": 0.5414868191060536, + "grad_norm": 0.26987993717193604, + "learning_rate": 2.5741567074501888e-05, + "loss": 0.2113, + "step": 30359 + }, + { + "epoch": 0.5415046552277673, + "grad_norm": 0.2673798203468323, + "learning_rate": 2.5740011250395026e-05, + "loss": 0.1316, + "step": 30360 + }, + { + "epoch": 0.5415224913494809, + "grad_norm": 0.29162001609802246, + "learning_rate": 2.5738455423419622e-05, + "loss": 0.1559, + "step": 30361 + }, + { + "epoch": 0.5415403274711946, + "grad_norm": 0.2928520441055298, + "learning_rate": 2.5736899593581714e-05, + "loss": 0.1369, + "step": 30362 + }, + { + "epoch": 0.5415581635929083, + "grad_norm": 0.24529993534088135, + "learning_rate": 2.573534376088733e-05, + "loss": 0.1252, + "step": 30363 + }, + { + "epoch": 0.541575999714622, + "grad_norm": 0.31121334433555603, + "learning_rate": 2.57337879253425e-05, + "loss": 0.1689, + "step": 30364 + }, + { + "epoch": 0.5415938358363358, + "grad_norm": 0.24328245222568512, + "learning_rate": 2.573223208695325e-05, + "loss": 0.1449, + "step": 30365 + }, + { + "epoch": 0.5416116719580495, + "grad_norm": 0.25867602229118347, + "learning_rate": 2.5730676245725622e-05, + "loss": 0.16, + "step": 30366 + }, + { + "epoch": 0.5416295080797632, + "grad_norm": 0.36305147409439087, + "learning_rate": 2.5729120401665635e-05, + "loss": 0.1427, + "step": 30367 + }, + { + "epoch": 0.5416473442014769, + "grad_norm": 0.3836963176727295, + "learning_rate": 2.5727564554779325e-05, + "loss": 0.2233, + "step": 30368 + }, + { + "epoch": 0.5416651803231906, + "grad_norm": 0.2705952227115631, + "learning_rate": 2.5726008705072725e-05, + "loss": 0.1498, + "step": 30369 + }, + { + "epoch": 0.5416830164449042, + "grad_norm": 0.25841236114501953, + "learning_rate": 2.5724452852551868e-05, + "loss": 0.1668, + "step": 30370 + }, + { + "epoch": 0.5417008525666179, + "grad_norm": 0.2395922690629959, + "learning_rate": 2.5722896997222785e-05, + "loss": 0.1035, + "step": 30371 + }, + { + "epoch": 0.5417186886883316, + "grad_norm": 0.3051241338253021, + "learning_rate": 2.5721341139091493e-05, + "loss": 0.1668, + "step": 30372 + }, + { + "epoch": 0.5417365248100453, + "grad_norm": 0.2469150722026825, + "learning_rate": 2.5719785278164044e-05, + "loss": 0.084, + "step": 30373 + }, + { + "epoch": 0.541754360931759, + "grad_norm": 0.27553534507751465, + "learning_rate": 2.5718229414446448e-05, + "loss": 0.1475, + "step": 30374 + }, + { + "epoch": 0.5417721970534727, + "grad_norm": 0.24740992486476898, + "learning_rate": 2.5716673547944753e-05, + "loss": 0.1371, + "step": 30375 + }, + { + "epoch": 0.5417900331751864, + "grad_norm": 0.2842337191104889, + "learning_rate": 2.571511767866498e-05, + "loss": 0.1437, + "step": 30376 + }, + { + "epoch": 0.5418078692969001, + "grad_norm": 0.23383453488349915, + "learning_rate": 2.571356180661317e-05, + "loss": 0.1091, + "step": 30377 + }, + { + "epoch": 0.5418257054186137, + "grad_norm": 0.2744031846523285, + "learning_rate": 2.5712005931795342e-05, + "loss": 0.1176, + "step": 30378 + }, + { + "epoch": 0.5418435415403274, + "grad_norm": 0.24337030947208405, + "learning_rate": 2.5710450054217532e-05, + "loss": 0.0945, + "step": 30379 + }, + { + "epoch": 0.5418613776620411, + "grad_norm": 0.24219100177288055, + "learning_rate": 2.570889417388578e-05, + "loss": 0.1219, + "step": 30380 + }, + { + "epoch": 0.5418792137837548, + "grad_norm": 0.26978981494903564, + "learning_rate": 2.57073382908061e-05, + "loss": 0.1516, + "step": 30381 + }, + { + "epoch": 0.5418970499054686, + "grad_norm": 0.22270672023296356, + "learning_rate": 2.5705782404984536e-05, + "loss": 0.1334, + "step": 30382 + }, + { + "epoch": 0.5419148860271823, + "grad_norm": 0.3178529739379883, + "learning_rate": 2.5704226516427123e-05, + "loss": 0.1762, + "step": 30383 + }, + { + "epoch": 0.541932722148896, + "grad_norm": 0.4411005973815918, + "learning_rate": 2.5702670625139878e-05, + "loss": 0.1183, + "step": 30384 + }, + { + "epoch": 0.5419505582706097, + "grad_norm": 0.34993645548820496, + "learning_rate": 2.570111473112884e-05, + "loss": 0.1347, + "step": 30385 + }, + { + "epoch": 0.5419683943923234, + "grad_norm": 0.22318151593208313, + "learning_rate": 2.569955883440004e-05, + "loss": 0.1056, + "step": 30386 + }, + { + "epoch": 0.541986230514037, + "grad_norm": 0.28538092970848083, + "learning_rate": 2.5698002934959503e-05, + "loss": 0.1775, + "step": 30387 + }, + { + "epoch": 0.5420040666357507, + "grad_norm": 0.24118423461914062, + "learning_rate": 2.569644703281327e-05, + "loss": 0.1358, + "step": 30388 + }, + { + "epoch": 0.5420219027574644, + "grad_norm": 0.3231845498085022, + "learning_rate": 2.569489112796736e-05, + "loss": 0.1363, + "step": 30389 + }, + { + "epoch": 0.5420397388791781, + "grad_norm": 0.27154645323753357, + "learning_rate": 2.5693335220427828e-05, + "loss": 0.103, + "step": 30390 + }, + { + "epoch": 0.5420575750008918, + "grad_norm": 0.37285539507865906, + "learning_rate": 2.5691779310200676e-05, + "loss": 0.1349, + "step": 30391 + }, + { + "epoch": 0.5420754111226055, + "grad_norm": 0.2843606770038605, + "learning_rate": 2.5690223397291957e-05, + "loss": 0.1644, + "step": 30392 + }, + { + "epoch": 0.5420932472443192, + "grad_norm": 0.29346659779548645, + "learning_rate": 2.568866748170769e-05, + "loss": 0.1086, + "step": 30393 + }, + { + "epoch": 0.5421110833660329, + "grad_norm": 0.23756791651248932, + "learning_rate": 2.5687111563453913e-05, + "loss": 0.1246, + "step": 30394 + }, + { + "epoch": 0.5421289194877466, + "grad_norm": 0.308624267578125, + "learning_rate": 2.5685555642536646e-05, + "loss": 0.1112, + "step": 30395 + }, + { + "epoch": 0.5421467556094602, + "grad_norm": 0.25421419739723206, + "learning_rate": 2.5683999718961932e-05, + "loss": 0.1963, + "step": 30396 + }, + { + "epoch": 0.5421645917311739, + "grad_norm": 0.22110804915428162, + "learning_rate": 2.5682443792735805e-05, + "loss": 0.1281, + "step": 30397 + }, + { + "epoch": 0.5421824278528876, + "grad_norm": 0.2372094690799713, + "learning_rate": 2.5680887863864283e-05, + "loss": 0.1336, + "step": 30398 + }, + { + "epoch": 0.5422002639746014, + "grad_norm": 0.226642906665802, + "learning_rate": 2.5679331932353412e-05, + "loss": 0.1442, + "step": 30399 + }, + { + "epoch": 0.5422181000963151, + "grad_norm": 0.23188887536525726, + "learning_rate": 2.5677775998209207e-05, + "loss": 0.1348, + "step": 30400 + }, + { + "epoch": 0.5422359362180288, + "grad_norm": 0.30838584899902344, + "learning_rate": 2.5676220061437717e-05, + "loss": 0.1631, + "step": 30401 + }, + { + "epoch": 0.5422537723397425, + "grad_norm": 0.2719489336013794, + "learning_rate": 2.5674664122044957e-05, + "loss": 0.1007, + "step": 30402 + }, + { + "epoch": 0.5422716084614562, + "grad_norm": 0.46597820520401, + "learning_rate": 2.5673108180036963e-05, + "loss": 0.1256, + "step": 30403 + }, + { + "epoch": 0.5422894445831699, + "grad_norm": 0.25430217385292053, + "learning_rate": 2.5671552235419776e-05, + "loss": 0.1366, + "step": 30404 + }, + { + "epoch": 0.5423072807048835, + "grad_norm": 0.340923547744751, + "learning_rate": 2.5669996288199422e-05, + "loss": 0.1385, + "step": 30405 + }, + { + "epoch": 0.5423251168265972, + "grad_norm": 0.2916175425052643, + "learning_rate": 2.5668440338381926e-05, + "loss": 0.1117, + "step": 30406 + }, + { + "epoch": 0.5423429529483109, + "grad_norm": 0.32697227597236633, + "learning_rate": 2.5666884385973327e-05, + "loss": 0.1037, + "step": 30407 + }, + { + "epoch": 0.5423607890700246, + "grad_norm": 0.33345985412597656, + "learning_rate": 2.566532843097965e-05, + "loss": 0.1764, + "step": 30408 + }, + { + "epoch": 0.5423786251917383, + "grad_norm": 0.21775665879249573, + "learning_rate": 2.5663772473406934e-05, + "loss": 0.1266, + "step": 30409 + }, + { + "epoch": 0.542396461313452, + "grad_norm": 0.318934828042984, + "learning_rate": 2.566221651326121e-05, + "loss": 0.1442, + "step": 30410 + }, + { + "epoch": 0.5424142974351657, + "grad_norm": 0.47422051429748535, + "learning_rate": 2.56606605505485e-05, + "loss": 0.1671, + "step": 30411 + }, + { + "epoch": 0.5424321335568794, + "grad_norm": 0.2716493308544159, + "learning_rate": 2.5659104585274844e-05, + "loss": 0.14, + "step": 30412 + }, + { + "epoch": 0.542449969678593, + "grad_norm": 0.3493393063545227, + "learning_rate": 2.565754861744627e-05, + "loss": 0.0959, + "step": 30413 + }, + { + "epoch": 0.5424678058003067, + "grad_norm": 0.3317636251449585, + "learning_rate": 2.5655992647068815e-05, + "loss": 0.1289, + "step": 30414 + }, + { + "epoch": 0.5424856419220204, + "grad_norm": 0.21200741827487946, + "learning_rate": 2.5654436674148503e-05, + "loss": 0.1255, + "step": 30415 + }, + { + "epoch": 0.5425034780437342, + "grad_norm": 0.35558727383613586, + "learning_rate": 2.565288069869136e-05, + "loss": 0.1129, + "step": 30416 + }, + { + "epoch": 0.5425213141654479, + "grad_norm": 0.30525141954421997, + "learning_rate": 2.5651324720703433e-05, + "loss": 0.1611, + "step": 30417 + }, + { + "epoch": 0.5425391502871616, + "grad_norm": 0.25791317224502563, + "learning_rate": 2.5649768740190755e-05, + "loss": 0.1654, + "step": 30418 + }, + { + "epoch": 0.5425569864088753, + "grad_norm": 0.24684809148311615, + "learning_rate": 2.5648212757159335e-05, + "loss": 0.1391, + "step": 30419 + }, + { + "epoch": 0.542574822530589, + "grad_norm": 0.2556397318840027, + "learning_rate": 2.564665677161523e-05, + "loss": 0.1189, + "step": 30420 + }, + { + "epoch": 0.5425926586523027, + "grad_norm": 0.22299419343471527, + "learning_rate": 2.5645100783564453e-05, + "loss": 0.0947, + "step": 30421 + }, + { + "epoch": 0.5426104947740163, + "grad_norm": 0.22191090881824493, + "learning_rate": 2.5643544793013048e-05, + "loss": 0.1475, + "step": 30422 + }, + { + "epoch": 0.54262833089573, + "grad_norm": 0.2882049083709717, + "learning_rate": 2.5641988799967033e-05, + "loss": 0.1277, + "step": 30423 + }, + { + "epoch": 0.5426461670174437, + "grad_norm": 0.29855647683143616, + "learning_rate": 2.5640432804432445e-05, + "loss": 0.1474, + "step": 30424 + }, + { + "epoch": 0.5426640031391574, + "grad_norm": 0.19883571565151215, + "learning_rate": 2.563887680641533e-05, + "loss": 0.0946, + "step": 30425 + }, + { + "epoch": 0.5426818392608711, + "grad_norm": 0.2799416482448578, + "learning_rate": 2.56373208059217e-05, + "loss": 0.0912, + "step": 30426 + }, + { + "epoch": 0.5426996753825848, + "grad_norm": 0.45761919021606445, + "learning_rate": 2.5635764802957602e-05, + "loss": 0.1379, + "step": 30427 + }, + { + "epoch": 0.5427175115042985, + "grad_norm": 0.24477513134479523, + "learning_rate": 2.5634208797529053e-05, + "loss": 0.1285, + "step": 30428 + }, + { + "epoch": 0.5427353476260122, + "grad_norm": 0.3262573778629303, + "learning_rate": 2.56326527896421e-05, + "loss": 0.1673, + "step": 30429 + }, + { + "epoch": 0.5427531837477259, + "grad_norm": 0.3951232135295868, + "learning_rate": 2.5631096779302748e-05, + "loss": 0.2156, + "step": 30430 + }, + { + "epoch": 0.5427710198694395, + "grad_norm": 0.1818213015794754, + "learning_rate": 2.5629540766517063e-05, + "loss": 0.1523, + "step": 30431 + }, + { + "epoch": 0.5427888559911532, + "grad_norm": 0.2641490697860718, + "learning_rate": 2.5627984751291057e-05, + "loss": 0.1106, + "step": 30432 + }, + { + "epoch": 0.542806692112867, + "grad_norm": 0.24945363402366638, + "learning_rate": 2.5626428733630762e-05, + "loss": 0.1194, + "step": 30433 + }, + { + "epoch": 0.5428245282345807, + "grad_norm": 0.2434176206588745, + "learning_rate": 2.562487271354222e-05, + "loss": 0.1712, + "step": 30434 + }, + { + "epoch": 0.5428423643562944, + "grad_norm": 0.23235736787319183, + "learning_rate": 2.562331669103145e-05, + "loss": 0.1177, + "step": 30435 + }, + { + "epoch": 0.5428602004780081, + "grad_norm": 0.28597357869148254, + "learning_rate": 2.5621760666104487e-05, + "loss": 0.1569, + "step": 30436 + }, + { + "epoch": 0.5428780365997218, + "grad_norm": 0.2535983622074127, + "learning_rate": 2.562020463876737e-05, + "loss": 0.1095, + "step": 30437 + }, + { + "epoch": 0.5428958727214355, + "grad_norm": 0.32386693358421326, + "learning_rate": 2.5618648609026125e-05, + "loss": 0.0916, + "step": 30438 + }, + { + "epoch": 0.5429137088431492, + "grad_norm": 0.19447185099124908, + "learning_rate": 2.5617092576886782e-05, + "loss": 0.1437, + "step": 30439 + }, + { + "epoch": 0.5429315449648628, + "grad_norm": 0.22546543180942535, + "learning_rate": 2.561553654235538e-05, + "loss": 0.1348, + "step": 30440 + }, + { + "epoch": 0.5429493810865765, + "grad_norm": 0.25543931126594543, + "learning_rate": 2.561398050543794e-05, + "loss": 0.1329, + "step": 30441 + }, + { + "epoch": 0.5429672172082902, + "grad_norm": 0.29706892371177673, + "learning_rate": 2.5612424466140504e-05, + "loss": 0.1356, + "step": 30442 + }, + { + "epoch": 0.5429850533300039, + "grad_norm": 0.2604389488697052, + "learning_rate": 2.56108684244691e-05, + "loss": 0.1773, + "step": 30443 + }, + { + "epoch": 0.5430028894517176, + "grad_norm": 0.30576810240745544, + "learning_rate": 2.560931238042975e-05, + "loss": 0.124, + "step": 30444 + }, + { + "epoch": 0.5430207255734313, + "grad_norm": 0.25967666506767273, + "learning_rate": 2.56077563340285e-05, + "loss": 0.1527, + "step": 30445 + }, + { + "epoch": 0.543038561695145, + "grad_norm": 0.2925667464733124, + "learning_rate": 2.5606200285271375e-05, + "loss": 0.1214, + "step": 30446 + }, + { + "epoch": 0.5430563978168587, + "grad_norm": 0.3637361228466034, + "learning_rate": 2.560464423416441e-05, + "loss": 0.1348, + "step": 30447 + }, + { + "epoch": 0.5430742339385723, + "grad_norm": 0.23158197104930878, + "learning_rate": 2.5603088180713642e-05, + "loss": 0.1017, + "step": 30448 + }, + { + "epoch": 0.543092070060286, + "grad_norm": 0.26995033025741577, + "learning_rate": 2.5601532124925094e-05, + "loss": 0.1462, + "step": 30449 + }, + { + "epoch": 0.5431099061819998, + "grad_norm": 0.26165539026260376, + "learning_rate": 2.5599976066804787e-05, + "loss": 0.1205, + "step": 30450 + }, + { + "epoch": 0.5431277423037135, + "grad_norm": 0.25607991218566895, + "learning_rate": 2.5598420006358775e-05, + "loss": 0.146, + "step": 30451 + }, + { + "epoch": 0.5431455784254272, + "grad_norm": 0.2899633049964905, + "learning_rate": 2.5596863943593075e-05, + "loss": 0.1241, + "step": 30452 + }, + { + "epoch": 0.5431634145471409, + "grad_norm": 0.23187774419784546, + "learning_rate": 2.559530787851373e-05, + "loss": 0.1261, + "step": 30453 + }, + { + "epoch": 0.5431812506688546, + "grad_norm": 0.256250262260437, + "learning_rate": 2.5593751811126765e-05, + "loss": 0.1463, + "step": 30454 + }, + { + "epoch": 0.5431990867905683, + "grad_norm": 0.23451587557792664, + "learning_rate": 2.5592195741438213e-05, + "loss": 0.1119, + "step": 30455 + }, + { + "epoch": 0.543216922912282, + "grad_norm": 0.32399609684944153, + "learning_rate": 2.55906396694541e-05, + "loss": 0.156, + "step": 30456 + }, + { + "epoch": 0.5432347590339957, + "grad_norm": 0.2855326235294342, + "learning_rate": 2.558908359518047e-05, + "loss": 0.1214, + "step": 30457 + }, + { + "epoch": 0.5432525951557093, + "grad_norm": 0.2582947611808777, + "learning_rate": 2.5587527518623345e-05, + "loss": 0.1254, + "step": 30458 + }, + { + "epoch": 0.543270431277423, + "grad_norm": 0.2899855077266693, + "learning_rate": 2.558597143978877e-05, + "loss": 0.1353, + "step": 30459 + }, + { + "epoch": 0.5432882673991367, + "grad_norm": 0.22167515754699707, + "learning_rate": 2.5584415358682755e-05, + "loss": 0.1203, + "step": 30460 + }, + { + "epoch": 0.5433061035208504, + "grad_norm": 0.2114895135164261, + "learning_rate": 2.558285927531135e-05, + "loss": 0.1255, + "step": 30461 + }, + { + "epoch": 0.5433239396425641, + "grad_norm": 0.32345715165138245, + "learning_rate": 2.5581303189680584e-05, + "loss": 0.1191, + "step": 30462 + }, + { + "epoch": 0.5433417757642778, + "grad_norm": 0.28123369812965393, + "learning_rate": 2.557974710179648e-05, + "loss": 0.1924, + "step": 30463 + }, + { + "epoch": 0.5433596118859915, + "grad_norm": 0.46767711639404297, + "learning_rate": 2.5578191011665077e-05, + "loss": 0.1753, + "step": 30464 + }, + { + "epoch": 0.5433774480077052, + "grad_norm": 0.2891663610935211, + "learning_rate": 2.5576634919292408e-05, + "loss": 0.1981, + "step": 30465 + }, + { + "epoch": 0.543395284129419, + "grad_norm": 0.3217829465866089, + "learning_rate": 2.55750788246845e-05, + "loss": 0.0849, + "step": 30466 + }, + { + "epoch": 0.5434131202511326, + "grad_norm": 0.1882012039422989, + "learning_rate": 2.5573522727847394e-05, + "loss": 0.1173, + "step": 30467 + }, + { + "epoch": 0.5434309563728463, + "grad_norm": 0.34905749559402466, + "learning_rate": 2.5571966628787114e-05, + "loss": 0.1452, + "step": 30468 + }, + { + "epoch": 0.54344879249456, + "grad_norm": 0.21599042415618896, + "learning_rate": 2.5570410527509686e-05, + "loss": 0.1405, + "step": 30469 + }, + { + "epoch": 0.5434666286162737, + "grad_norm": 0.28630906343460083, + "learning_rate": 2.556885442402116e-05, + "loss": 0.1624, + "step": 30470 + }, + { + "epoch": 0.5434844647379874, + "grad_norm": 0.2142532616853714, + "learning_rate": 2.5567298318327548e-05, + "loss": 0.1713, + "step": 30471 + }, + { + "epoch": 0.5435023008597011, + "grad_norm": 0.29085907340049744, + "learning_rate": 2.5565742210434897e-05, + "loss": 0.1854, + "step": 30472 + }, + { + "epoch": 0.5435201369814148, + "grad_norm": 0.21413438022136688, + "learning_rate": 2.556418610034923e-05, + "loss": 0.1107, + "step": 30473 + }, + { + "epoch": 0.5435379731031285, + "grad_norm": 0.18788836896419525, + "learning_rate": 2.5562629988076592e-05, + "loss": 0.1287, + "step": 30474 + }, + { + "epoch": 0.5435558092248421, + "grad_norm": 0.2793657183647156, + "learning_rate": 2.5561073873623003e-05, + "loss": 0.1074, + "step": 30475 + }, + { + "epoch": 0.5435736453465558, + "grad_norm": 0.319446325302124, + "learning_rate": 2.5559517756994495e-05, + "loss": 0.1811, + "step": 30476 + }, + { + "epoch": 0.5435914814682695, + "grad_norm": 0.298800528049469, + "learning_rate": 2.5557961638197104e-05, + "loss": 0.1785, + "step": 30477 + }, + { + "epoch": 0.5436093175899832, + "grad_norm": 0.2088286280632019, + "learning_rate": 2.555640551723686e-05, + "loss": 0.1208, + "step": 30478 + }, + { + "epoch": 0.5436271537116969, + "grad_norm": 0.3150528073310852, + "learning_rate": 2.5554849394119794e-05, + "loss": 0.099, + "step": 30479 + }, + { + "epoch": 0.5436449898334106, + "grad_norm": 0.20364587008953094, + "learning_rate": 2.5553293268851943e-05, + "loss": 0.1305, + "step": 30480 + }, + { + "epoch": 0.5436628259551243, + "grad_norm": 0.33581048250198364, + "learning_rate": 2.5551737141439336e-05, + "loss": 0.1764, + "step": 30481 + }, + { + "epoch": 0.543680662076838, + "grad_norm": 0.23047466576099396, + "learning_rate": 2.5550181011888002e-05, + "loss": 0.123, + "step": 30482 + }, + { + "epoch": 0.5436984981985518, + "grad_norm": 0.3257729709148407, + "learning_rate": 2.554862488020398e-05, + "loss": 0.1458, + "step": 30483 + }, + { + "epoch": 0.5437163343202654, + "grad_norm": 0.24259547889232635, + "learning_rate": 2.5547068746393297e-05, + "loss": 0.1067, + "step": 30484 + }, + { + "epoch": 0.5437341704419791, + "grad_norm": 0.2064991295337677, + "learning_rate": 2.5545512610461986e-05, + "loss": 0.1204, + "step": 30485 + }, + { + "epoch": 0.5437520065636928, + "grad_norm": 0.19359250366687775, + "learning_rate": 2.5543956472416076e-05, + "loss": 0.0974, + "step": 30486 + }, + { + "epoch": 0.5437698426854065, + "grad_norm": 0.2196130007505417, + "learning_rate": 2.554240033226161e-05, + "loss": 0.1534, + "step": 30487 + }, + { + "epoch": 0.5437876788071202, + "grad_norm": 0.2541401982307434, + "learning_rate": 2.5540844190004616e-05, + "loss": 0.1384, + "step": 30488 + }, + { + "epoch": 0.5438055149288339, + "grad_norm": 0.2838333547115326, + "learning_rate": 2.5539288045651117e-05, + "loss": 0.1543, + "step": 30489 + }, + { + "epoch": 0.5438233510505476, + "grad_norm": 0.2721298635005951, + "learning_rate": 2.5537731899207157e-05, + "loss": 0.1838, + "step": 30490 + }, + { + "epoch": 0.5438411871722613, + "grad_norm": 0.27650579810142517, + "learning_rate": 2.5536175750678754e-05, + "loss": 0.137, + "step": 30491 + }, + { + "epoch": 0.543859023293975, + "grad_norm": 0.2977495789527893, + "learning_rate": 2.5534619600071946e-05, + "loss": 0.1746, + "step": 30492 + }, + { + "epoch": 0.5438768594156886, + "grad_norm": 0.2611187994480133, + "learning_rate": 2.5533063447392774e-05, + "loss": 0.1421, + "step": 30493 + }, + { + "epoch": 0.5438946955374023, + "grad_norm": 0.2547118067741394, + "learning_rate": 2.5531507292647268e-05, + "loss": 0.1623, + "step": 30494 + }, + { + "epoch": 0.543912531659116, + "grad_norm": 0.324241042137146, + "learning_rate": 2.5529951135841453e-05, + "loss": 0.1912, + "step": 30495 + }, + { + "epoch": 0.5439303677808297, + "grad_norm": 0.2348569631576538, + "learning_rate": 2.5528394976981362e-05, + "loss": 0.1077, + "step": 30496 + }, + { + "epoch": 0.5439482039025434, + "grad_norm": 0.20454934239387512, + "learning_rate": 2.5526838816073027e-05, + "loss": 0.1546, + "step": 30497 + }, + { + "epoch": 0.5439660400242571, + "grad_norm": 0.2143382579088211, + "learning_rate": 2.5525282653122496e-05, + "loss": 0.0956, + "step": 30498 + }, + { + "epoch": 0.5439838761459708, + "grad_norm": 0.2045070379972458, + "learning_rate": 2.5523726488135776e-05, + "loss": 0.1146, + "step": 30499 + }, + { + "epoch": 0.5440017122676846, + "grad_norm": 0.21712088584899902, + "learning_rate": 2.552217032111891e-05, + "loss": 0.1074, + "step": 30500 + }, + { + "epoch": 0.5440195483893983, + "grad_norm": 0.2839733064174652, + "learning_rate": 2.5520614152077938e-05, + "loss": 0.126, + "step": 30501 + }, + { + "epoch": 0.5440373845111119, + "grad_norm": 0.30920034646987915, + "learning_rate": 2.551905798101888e-05, + "loss": 0.1601, + "step": 30502 + }, + { + "epoch": 0.5440552206328256, + "grad_norm": 0.37711143493652344, + "learning_rate": 2.551750180794778e-05, + "loss": 0.1334, + "step": 30503 + }, + { + "epoch": 0.5440730567545393, + "grad_norm": 0.425048828125, + "learning_rate": 2.551594563287066e-05, + "loss": 0.1727, + "step": 30504 + }, + { + "epoch": 0.544090892876253, + "grad_norm": 0.20313811302185059, + "learning_rate": 2.5514389455793563e-05, + "loss": 0.1036, + "step": 30505 + }, + { + "epoch": 0.5441087289979667, + "grad_norm": 0.2686541974544525, + "learning_rate": 2.5512833276722507e-05, + "loss": 0.1677, + "step": 30506 + }, + { + "epoch": 0.5441265651196804, + "grad_norm": 0.2547666132450104, + "learning_rate": 2.5511277095663533e-05, + "loss": 0.1215, + "step": 30507 + }, + { + "epoch": 0.5441444012413941, + "grad_norm": 0.2346441149711609, + "learning_rate": 2.550972091262267e-05, + "loss": 0.123, + "step": 30508 + }, + { + "epoch": 0.5441622373631078, + "grad_norm": 0.22550863027572632, + "learning_rate": 2.550816472760596e-05, + "loss": 0.1442, + "step": 30509 + }, + { + "epoch": 0.5441800734848214, + "grad_norm": 0.20683231949806213, + "learning_rate": 2.5506608540619425e-05, + "loss": 0.1489, + "step": 30510 + }, + { + "epoch": 0.5441979096065351, + "grad_norm": 0.2775627672672272, + "learning_rate": 2.5505052351669102e-05, + "loss": 0.1667, + "step": 30511 + }, + { + "epoch": 0.5442157457282488, + "grad_norm": 0.3498181104660034, + "learning_rate": 2.550349616076102e-05, + "loss": 0.122, + "step": 30512 + }, + { + "epoch": 0.5442335818499625, + "grad_norm": 0.2168666273355484, + "learning_rate": 2.5501939967901212e-05, + "loss": 0.1081, + "step": 30513 + }, + { + "epoch": 0.5442514179716762, + "grad_norm": 0.2527812421321869, + "learning_rate": 2.5500383773095717e-05, + "loss": 0.109, + "step": 30514 + }, + { + "epoch": 0.5442692540933899, + "grad_norm": 0.2178495228290558, + "learning_rate": 2.549882757635055e-05, + "loss": 0.1202, + "step": 30515 + }, + { + "epoch": 0.5442870902151036, + "grad_norm": 0.2771246135234833, + "learning_rate": 2.5497271377671765e-05, + "loss": 0.1298, + "step": 30516 + }, + { + "epoch": 0.5443049263368174, + "grad_norm": 0.31199607253074646, + "learning_rate": 2.5495715177065378e-05, + "loss": 0.1631, + "step": 30517 + }, + { + "epoch": 0.5443227624585311, + "grad_norm": 0.36720263957977295, + "learning_rate": 2.5494158974537436e-05, + "loss": 0.1154, + "step": 30518 + }, + { + "epoch": 0.5443405985802447, + "grad_norm": 0.2538905143737793, + "learning_rate": 2.5492602770093956e-05, + "loss": 0.1551, + "step": 30519 + }, + { + "epoch": 0.5443584347019584, + "grad_norm": 0.3278164863586426, + "learning_rate": 2.549104656374098e-05, + "loss": 0.1336, + "step": 30520 + }, + { + "epoch": 0.5443762708236721, + "grad_norm": 0.3136005699634552, + "learning_rate": 2.548949035548453e-05, + "loss": 0.1388, + "step": 30521 + }, + { + "epoch": 0.5443941069453858, + "grad_norm": 0.22819246351718903, + "learning_rate": 2.548793414533066e-05, + "loss": 0.106, + "step": 30522 + }, + { + "epoch": 0.5444119430670995, + "grad_norm": 0.28130069375038147, + "learning_rate": 2.548637793328538e-05, + "loss": 0.1155, + "step": 30523 + }, + { + "epoch": 0.5444297791888132, + "grad_norm": 0.25665944814682007, + "learning_rate": 2.5484821719354736e-05, + "loss": 0.1307, + "step": 30524 + }, + { + "epoch": 0.5444476153105269, + "grad_norm": 0.3511230945587158, + "learning_rate": 2.548326550354475e-05, + "loss": 0.1413, + "step": 30525 + }, + { + "epoch": 0.5444654514322406, + "grad_norm": 0.24509581923484802, + "learning_rate": 2.548170928586147e-05, + "loss": 0.1617, + "step": 30526 + }, + { + "epoch": 0.5444832875539543, + "grad_norm": 0.22838634252548218, + "learning_rate": 2.5480153066310908e-05, + "loss": 0.114, + "step": 30527 + }, + { + "epoch": 0.5445011236756679, + "grad_norm": 0.22285029292106628, + "learning_rate": 2.547859684489911e-05, + "loss": 0.1572, + "step": 30528 + }, + { + "epoch": 0.5445189597973816, + "grad_norm": 0.27975815534591675, + "learning_rate": 2.547704062163211e-05, + "loss": 0.1258, + "step": 30529 + }, + { + "epoch": 0.5445367959190953, + "grad_norm": 0.24844714999198914, + "learning_rate": 2.547548439651593e-05, + "loss": 0.1418, + "step": 30530 + }, + { + "epoch": 0.544554632040809, + "grad_norm": 0.292751282453537, + "learning_rate": 2.547392816955661e-05, + "loss": 0.1609, + "step": 30531 + }, + { + "epoch": 0.5445724681625227, + "grad_norm": 0.344444215297699, + "learning_rate": 2.5472371940760183e-05, + "loss": 0.0961, + "step": 30532 + }, + { + "epoch": 0.5445903042842364, + "grad_norm": 0.2207648754119873, + "learning_rate": 2.5470815710132678e-05, + "loss": 0.1639, + "step": 30533 + }, + { + "epoch": 0.5446081404059502, + "grad_norm": 0.28195247054100037, + "learning_rate": 2.546925947768012e-05, + "loss": 0.1045, + "step": 30534 + }, + { + "epoch": 0.5446259765276639, + "grad_norm": 0.3072606325149536, + "learning_rate": 2.546770324340856e-05, + "loss": 0.1046, + "step": 30535 + }, + { + "epoch": 0.5446438126493776, + "grad_norm": 0.3027358651161194, + "learning_rate": 2.5466147007324015e-05, + "loss": 0.1138, + "step": 30536 + }, + { + "epoch": 0.5446616487710912, + "grad_norm": 0.5015240907669067, + "learning_rate": 2.5464590769432535e-05, + "loss": 0.184, + "step": 30537 + }, + { + "epoch": 0.5446794848928049, + "grad_norm": 0.22844772040843964, + "learning_rate": 2.5463034529740127e-05, + "loss": 0.1228, + "step": 30538 + }, + { + "epoch": 0.5446973210145186, + "grad_norm": 0.31912747025489807, + "learning_rate": 2.5461478288252844e-05, + "loss": 0.1394, + "step": 30539 + }, + { + "epoch": 0.5447151571362323, + "grad_norm": 0.25199249386787415, + "learning_rate": 2.5459922044976708e-05, + "loss": 0.1538, + "step": 30540 + }, + { + "epoch": 0.544732993257946, + "grad_norm": 0.31932389736175537, + "learning_rate": 2.5458365799917754e-05, + "loss": 0.1663, + "step": 30541 + }, + { + "epoch": 0.5447508293796597, + "grad_norm": 0.26404711604118347, + "learning_rate": 2.545680955308203e-05, + "loss": 0.1406, + "step": 30542 + }, + { + "epoch": 0.5447686655013734, + "grad_norm": 0.2169080525636673, + "learning_rate": 2.545525330447554e-05, + "loss": 0.1146, + "step": 30543 + }, + { + "epoch": 0.5447865016230871, + "grad_norm": 0.2407020628452301, + "learning_rate": 2.5453697054104336e-05, + "loss": 0.1274, + "step": 30544 + }, + { + "epoch": 0.5448043377448007, + "grad_norm": 0.3030557930469513, + "learning_rate": 2.5452140801974446e-05, + "loss": 0.1283, + "step": 30545 + }, + { + "epoch": 0.5448221738665144, + "grad_norm": 0.3027314841747284, + "learning_rate": 2.54505845480919e-05, + "loss": 0.2079, + "step": 30546 + }, + { + "epoch": 0.5448400099882281, + "grad_norm": 0.1790473610162735, + "learning_rate": 2.544902829246273e-05, + "loss": 0.1592, + "step": 30547 + }, + { + "epoch": 0.5448578461099418, + "grad_norm": 0.2134745568037033, + "learning_rate": 2.544747203509297e-05, + "loss": 0.1289, + "step": 30548 + }, + { + "epoch": 0.5448756822316555, + "grad_norm": 0.23085126280784607, + "learning_rate": 2.544591577598866e-05, + "loss": 0.2047, + "step": 30549 + }, + { + "epoch": 0.5448935183533692, + "grad_norm": 0.24027085304260254, + "learning_rate": 2.544435951515583e-05, + "loss": 0.1223, + "step": 30550 + }, + { + "epoch": 0.544911354475083, + "grad_norm": 0.23477111756801605, + "learning_rate": 2.54428032526005e-05, + "loss": 0.1369, + "step": 30551 + }, + { + "epoch": 0.5449291905967967, + "grad_norm": 0.25696438550949097, + "learning_rate": 2.544124698832872e-05, + "loss": 0.1588, + "step": 30552 + }, + { + "epoch": 0.5449470267185104, + "grad_norm": 0.20204702019691467, + "learning_rate": 2.5439690722346505e-05, + "loss": 0.133, + "step": 30553 + }, + { + "epoch": 0.544964862840224, + "grad_norm": 0.3292404115200043, + "learning_rate": 2.5438134454659906e-05, + "loss": 0.1143, + "step": 30554 + }, + { + "epoch": 0.5449826989619377, + "grad_norm": 0.22995255887508392, + "learning_rate": 2.5436578185274938e-05, + "loss": 0.1485, + "step": 30555 + }, + { + "epoch": 0.5450005350836514, + "grad_norm": 0.20696400105953217, + "learning_rate": 2.5435021914197643e-05, + "loss": 0.117, + "step": 30556 + }, + { + "epoch": 0.5450183712053651, + "grad_norm": 0.2578725814819336, + "learning_rate": 2.5433465641434064e-05, + "loss": 0.1561, + "step": 30557 + }, + { + "epoch": 0.5450362073270788, + "grad_norm": 0.2952697277069092, + "learning_rate": 2.543190936699021e-05, + "loss": 0.0982, + "step": 30558 + }, + { + "epoch": 0.5450540434487925, + "grad_norm": 0.21113072335720062, + "learning_rate": 2.5430353090872132e-05, + "loss": 0.1454, + "step": 30559 + }, + { + "epoch": 0.5450718795705062, + "grad_norm": 0.2743690013885498, + "learning_rate": 2.5428796813085854e-05, + "loss": 0.1378, + "step": 30560 + }, + { + "epoch": 0.5450897156922199, + "grad_norm": 0.2805323600769043, + "learning_rate": 2.542724053363742e-05, + "loss": 0.2072, + "step": 30561 + }, + { + "epoch": 0.5451075518139336, + "grad_norm": 0.27272406220436096, + "learning_rate": 2.542568425253284e-05, + "loss": 0.1526, + "step": 30562 + }, + { + "epoch": 0.5451253879356472, + "grad_norm": 0.22355371713638306, + "learning_rate": 2.5424127969778165e-05, + "loss": 0.1136, + "step": 30563 + }, + { + "epoch": 0.5451432240573609, + "grad_norm": 0.2965892553329468, + "learning_rate": 2.5422571685379427e-05, + "loss": 0.1696, + "step": 30564 + }, + { + "epoch": 0.5451610601790746, + "grad_norm": 0.6076902151107788, + "learning_rate": 2.542101539934266e-05, + "loss": 0.1566, + "step": 30565 + }, + { + "epoch": 0.5451788963007883, + "grad_norm": 0.26456040143966675, + "learning_rate": 2.541945911167388e-05, + "loss": 0.1716, + "step": 30566 + }, + { + "epoch": 0.5451967324225021, + "grad_norm": 0.2619446814060211, + "learning_rate": 2.541790282237914e-05, + "loss": 0.1405, + "step": 30567 + }, + { + "epoch": 0.5452145685442158, + "grad_norm": 0.27856355905532837, + "learning_rate": 2.5416346531464465e-05, + "loss": 0.1589, + "step": 30568 + }, + { + "epoch": 0.5452324046659295, + "grad_norm": 0.4120332598686218, + "learning_rate": 2.541479023893588e-05, + "loss": 0.1287, + "step": 30569 + }, + { + "epoch": 0.5452502407876432, + "grad_norm": 0.29790472984313965, + "learning_rate": 2.5413233944799438e-05, + "loss": 0.1311, + "step": 30570 + }, + { + "epoch": 0.5452680769093569, + "grad_norm": 0.2873964011669159, + "learning_rate": 2.5411677649061144e-05, + "loss": 0.1255, + "step": 30571 + }, + { + "epoch": 0.5452859130310705, + "grad_norm": 0.26232606172561646, + "learning_rate": 2.541012135172705e-05, + "loss": 0.1438, + "step": 30572 + }, + { + "epoch": 0.5453037491527842, + "grad_norm": 0.2812102735042572, + "learning_rate": 2.5408565052803184e-05, + "loss": 0.1747, + "step": 30573 + }, + { + "epoch": 0.5453215852744979, + "grad_norm": 0.2334454357624054, + "learning_rate": 2.5407008752295586e-05, + "loss": 0.1511, + "step": 30574 + }, + { + "epoch": 0.5453394213962116, + "grad_norm": 0.303078830242157, + "learning_rate": 2.540545245021027e-05, + "loss": 0.1181, + "step": 30575 + }, + { + "epoch": 0.5453572575179253, + "grad_norm": 0.2523733079433441, + "learning_rate": 2.5403896146553286e-05, + "loss": 0.1707, + "step": 30576 + }, + { + "epoch": 0.545375093639639, + "grad_norm": 0.27870509028434753, + "learning_rate": 2.540233984133066e-05, + "loss": 0.1617, + "step": 30577 + }, + { + "epoch": 0.5453929297613527, + "grad_norm": 0.28360575437545776, + "learning_rate": 2.5400783534548428e-05, + "loss": 0.0939, + "step": 30578 + }, + { + "epoch": 0.5454107658830664, + "grad_norm": 0.2450939416885376, + "learning_rate": 2.5399227226212613e-05, + "loss": 0.1697, + "step": 30579 + }, + { + "epoch": 0.54542860200478, + "grad_norm": 0.270203173160553, + "learning_rate": 2.539767091632927e-05, + "loss": 0.1369, + "step": 30580 + }, + { + "epoch": 0.5454464381264937, + "grad_norm": 0.22776833176612854, + "learning_rate": 2.539611460490441e-05, + "loss": 0.138, + "step": 30581 + }, + { + "epoch": 0.5454642742482074, + "grad_norm": 0.23696552217006683, + "learning_rate": 2.5394558291944072e-05, + "loss": 0.1793, + "step": 30582 + }, + { + "epoch": 0.5454821103699211, + "grad_norm": 0.19679997861385345, + "learning_rate": 2.5393001977454285e-05, + "loss": 0.0973, + "step": 30583 + }, + { + "epoch": 0.5454999464916349, + "grad_norm": 0.19741256535053253, + "learning_rate": 2.5391445661441088e-05, + "loss": 0.124, + "step": 30584 + }, + { + "epoch": 0.5455177826133486, + "grad_norm": 0.20444637537002563, + "learning_rate": 2.5389889343910517e-05, + "loss": 0.0823, + "step": 30585 + }, + { + "epoch": 0.5455356187350623, + "grad_norm": 0.3365132808685303, + "learning_rate": 2.5388333024868595e-05, + "loss": 0.1782, + "step": 30586 + }, + { + "epoch": 0.545553454856776, + "grad_norm": 0.5505183339118958, + "learning_rate": 2.538677670432137e-05, + "loss": 0.2799, + "step": 30587 + }, + { + "epoch": 0.5455712909784897, + "grad_norm": 0.2781968116760254, + "learning_rate": 2.5385220382274853e-05, + "loss": 0.177, + "step": 30588 + }, + { + "epoch": 0.5455891271002034, + "grad_norm": 0.2326662242412567, + "learning_rate": 2.538366405873509e-05, + "loss": 0.1446, + "step": 30589 + }, + { + "epoch": 0.545606963221917, + "grad_norm": 0.1863633692264557, + "learning_rate": 2.5382107733708116e-05, + "loss": 0.0767, + "step": 30590 + }, + { + "epoch": 0.5456247993436307, + "grad_norm": 0.17594373226165771, + "learning_rate": 2.5380551407199964e-05, + "loss": 0.1099, + "step": 30591 + }, + { + "epoch": 0.5456426354653444, + "grad_norm": 0.20853294432163239, + "learning_rate": 2.537899507921666e-05, + "loss": 0.1262, + "step": 30592 + }, + { + "epoch": 0.5456604715870581, + "grad_norm": 0.3568669259548187, + "learning_rate": 2.5377438749764243e-05, + "loss": 0.175, + "step": 30593 + }, + { + "epoch": 0.5456783077087718, + "grad_norm": 0.2863282263278961, + "learning_rate": 2.5375882418848746e-05, + "loss": 0.1643, + "step": 30594 + }, + { + "epoch": 0.5456961438304855, + "grad_norm": 0.30819424986839294, + "learning_rate": 2.5374326086476192e-05, + "loss": 0.1427, + "step": 30595 + }, + { + "epoch": 0.5457139799521992, + "grad_norm": 0.26100361347198486, + "learning_rate": 2.537276975265262e-05, + "loss": 0.1662, + "step": 30596 + }, + { + "epoch": 0.5457318160739129, + "grad_norm": 0.2631935179233551, + "learning_rate": 2.5371213417384064e-05, + "loss": 0.121, + "step": 30597 + }, + { + "epoch": 0.5457496521956265, + "grad_norm": 0.240121990442276, + "learning_rate": 2.5369657080676562e-05, + "loss": 0.1514, + "step": 30598 + }, + { + "epoch": 0.5457674883173402, + "grad_norm": 0.22561949491500854, + "learning_rate": 2.5368100742536134e-05, + "loss": 0.1455, + "step": 30599 + }, + { + "epoch": 0.5457853244390539, + "grad_norm": 0.2451285719871521, + "learning_rate": 2.536654440296883e-05, + "loss": 0.1061, + "step": 30600 + }, + { + "epoch": 0.5458031605607677, + "grad_norm": 0.31559446454048157, + "learning_rate": 2.5364988061980666e-05, + "loss": 0.1468, + "step": 30601 + }, + { + "epoch": 0.5458209966824814, + "grad_norm": 0.2675754427909851, + "learning_rate": 2.5363431719577684e-05, + "loss": 0.1178, + "step": 30602 + }, + { + "epoch": 0.5458388328041951, + "grad_norm": 0.24018284678459167, + "learning_rate": 2.5361875375765913e-05, + "loss": 0.1207, + "step": 30603 + }, + { + "epoch": 0.5458566689259088, + "grad_norm": 0.2545557916164398, + "learning_rate": 2.5360319030551388e-05, + "loss": 0.1284, + "step": 30604 + }, + { + "epoch": 0.5458745050476225, + "grad_norm": 0.2843206226825714, + "learning_rate": 2.5358762683940146e-05, + "loss": 0.1541, + "step": 30605 + }, + { + "epoch": 0.5458923411693362, + "grad_norm": 0.31861063838005066, + "learning_rate": 2.5357206335938215e-05, + "loss": 0.0971, + "step": 30606 + }, + { + "epoch": 0.5459101772910498, + "grad_norm": 0.20014692842960358, + "learning_rate": 2.5355649986551633e-05, + "loss": 0.0905, + "step": 30607 + }, + { + "epoch": 0.5459280134127635, + "grad_norm": 0.21438264846801758, + "learning_rate": 2.535409363578642e-05, + "loss": 0.089, + "step": 30608 + }, + { + "epoch": 0.5459458495344772, + "grad_norm": 0.21107247471809387, + "learning_rate": 2.5352537283648626e-05, + "loss": 0.1327, + "step": 30609 + }, + { + "epoch": 0.5459636856561909, + "grad_norm": 0.26052507758140564, + "learning_rate": 2.5350980930144268e-05, + "loss": 0.1621, + "step": 30610 + }, + { + "epoch": 0.5459815217779046, + "grad_norm": 0.24816282093524933, + "learning_rate": 2.534942457527939e-05, + "loss": 0.1395, + "step": 30611 + }, + { + "epoch": 0.5459993578996183, + "grad_norm": 0.30408936738967896, + "learning_rate": 2.5347868219060017e-05, + "loss": 0.147, + "step": 30612 + }, + { + "epoch": 0.546017194021332, + "grad_norm": 0.2077506184577942, + "learning_rate": 2.5346311861492194e-05, + "loss": 0.0985, + "step": 30613 + }, + { + "epoch": 0.5460350301430457, + "grad_norm": 0.3313791751861572, + "learning_rate": 2.534475550258194e-05, + "loss": 0.1689, + "step": 30614 + }, + { + "epoch": 0.5460528662647594, + "grad_norm": 0.22992636263370514, + "learning_rate": 2.5343199142335307e-05, + "loss": 0.0933, + "step": 30615 + }, + { + "epoch": 0.546070702386473, + "grad_norm": 0.2696564197540283, + "learning_rate": 2.53416427807583e-05, + "loss": 0.1682, + "step": 30616 + }, + { + "epoch": 0.5460885385081867, + "grad_norm": 0.3920508325099945, + "learning_rate": 2.5340086417856972e-05, + "loss": 0.1667, + "step": 30617 + }, + { + "epoch": 0.5461063746299005, + "grad_norm": 0.2885618805885315, + "learning_rate": 2.5338530053637355e-05, + "loss": 0.0963, + "step": 30618 + }, + { + "epoch": 0.5461242107516142, + "grad_norm": 0.28976091742515564, + "learning_rate": 2.533697368810548e-05, + "loss": 0.1497, + "step": 30619 + }, + { + "epoch": 0.5461420468733279, + "grad_norm": 0.2502298057079315, + "learning_rate": 2.533541732126738e-05, + "loss": 0.1282, + "step": 30620 + }, + { + "epoch": 0.5461598829950416, + "grad_norm": 0.2730318009853363, + "learning_rate": 2.5333860953129084e-05, + "loss": 0.1285, + "step": 30621 + }, + { + "epoch": 0.5461777191167553, + "grad_norm": 0.22559292614459991, + "learning_rate": 2.533230458369663e-05, + "loss": 0.124, + "step": 30622 + }, + { + "epoch": 0.546195555238469, + "grad_norm": 0.3198586702346802, + "learning_rate": 2.5330748212976042e-05, + "loss": 0.133, + "step": 30623 + }, + { + "epoch": 0.5462133913601827, + "grad_norm": 0.23066243529319763, + "learning_rate": 2.532919184097336e-05, + "loss": 0.151, + "step": 30624 + }, + { + "epoch": 0.5462312274818963, + "grad_norm": 0.3152429461479187, + "learning_rate": 2.5327635467694616e-05, + "loss": 0.1003, + "step": 30625 + }, + { + "epoch": 0.54624906360361, + "grad_norm": 0.25440070033073425, + "learning_rate": 2.5326079093145856e-05, + "loss": 0.1183, + "step": 30626 + }, + { + "epoch": 0.5462668997253237, + "grad_norm": 0.2893313765525818, + "learning_rate": 2.532452271733309e-05, + "loss": 0.146, + "step": 30627 + }, + { + "epoch": 0.5462847358470374, + "grad_norm": 0.27915453910827637, + "learning_rate": 2.5322966340262366e-05, + "loss": 0.1599, + "step": 30628 + }, + { + "epoch": 0.5463025719687511, + "grad_norm": 0.28688129782676697, + "learning_rate": 2.5321409961939708e-05, + "loss": 0.1266, + "step": 30629 + }, + { + "epoch": 0.5463204080904648, + "grad_norm": 0.22225314378738403, + "learning_rate": 2.531985358237116e-05, + "loss": 0.1673, + "step": 30630 + }, + { + "epoch": 0.5463382442121785, + "grad_norm": 0.6175563335418701, + "learning_rate": 2.5318297201562745e-05, + "loss": 0.1635, + "step": 30631 + }, + { + "epoch": 0.5463560803338922, + "grad_norm": 0.3103273808956146, + "learning_rate": 2.5316740819520494e-05, + "loss": 0.1645, + "step": 30632 + }, + { + "epoch": 0.5463739164556058, + "grad_norm": 0.2526172697544098, + "learning_rate": 2.531518443625046e-05, + "loss": 0.1423, + "step": 30633 + }, + { + "epoch": 0.5463917525773195, + "grad_norm": 0.41474178433418274, + "learning_rate": 2.5313628051758654e-05, + "loss": 0.1432, + "step": 30634 + }, + { + "epoch": 0.5464095886990333, + "grad_norm": 0.23666517436504364, + "learning_rate": 2.5312071666051122e-05, + "loss": 0.1184, + "step": 30635 + }, + { + "epoch": 0.546427424820747, + "grad_norm": 0.24965380132198334, + "learning_rate": 2.531051527913389e-05, + "loss": 0.0966, + "step": 30636 + }, + { + "epoch": 0.5464452609424607, + "grad_norm": 0.2909402847290039, + "learning_rate": 2.5308958891012995e-05, + "loss": 0.166, + "step": 30637 + }, + { + "epoch": 0.5464630970641744, + "grad_norm": 0.2088889479637146, + "learning_rate": 2.5307402501694467e-05, + "loss": 0.0941, + "step": 30638 + }, + { + "epoch": 0.5464809331858881, + "grad_norm": 0.24103598296642303, + "learning_rate": 2.5305846111184333e-05, + "loss": 0.1212, + "step": 30639 + }, + { + "epoch": 0.5464987693076018, + "grad_norm": 0.29101163148880005, + "learning_rate": 2.5304289719488638e-05, + "loss": 0.1218, + "step": 30640 + }, + { + "epoch": 0.5465166054293155, + "grad_norm": 0.2213044911623001, + "learning_rate": 2.530273332661342e-05, + "loss": 0.1155, + "step": 30641 + }, + { + "epoch": 0.5465344415510291, + "grad_norm": 0.2774907350540161, + "learning_rate": 2.5301176932564696e-05, + "loss": 0.1533, + "step": 30642 + }, + { + "epoch": 0.5465522776727428, + "grad_norm": 0.32976096868515015, + "learning_rate": 2.5299620537348512e-05, + "loss": 0.1608, + "step": 30643 + }, + { + "epoch": 0.5465701137944565, + "grad_norm": 0.26635968685150146, + "learning_rate": 2.5298064140970883e-05, + "loss": 0.0899, + "step": 30644 + }, + { + "epoch": 0.5465879499161702, + "grad_norm": 0.23259654641151428, + "learning_rate": 2.5296507743437863e-05, + "loss": 0.1139, + "step": 30645 + }, + { + "epoch": 0.5466057860378839, + "grad_norm": 0.23877407610416412, + "learning_rate": 2.529495134475548e-05, + "loss": 0.1381, + "step": 30646 + }, + { + "epoch": 0.5466236221595976, + "grad_norm": 0.29260697960853577, + "learning_rate": 2.5293394944929754e-05, + "loss": 0.1563, + "step": 30647 + }, + { + "epoch": 0.5466414582813113, + "grad_norm": 0.21915209293365479, + "learning_rate": 2.529183854396674e-05, + "loss": 0.1013, + "step": 30648 + }, + { + "epoch": 0.546659294403025, + "grad_norm": 0.4737933874130249, + "learning_rate": 2.529028214187245e-05, + "loss": 0.1373, + "step": 30649 + }, + { + "epoch": 0.5466771305247387, + "grad_norm": 0.2935916781425476, + "learning_rate": 2.528872573865293e-05, + "loss": 0.1555, + "step": 30650 + }, + { + "epoch": 0.5466949666464523, + "grad_norm": 0.2658814787864685, + "learning_rate": 2.528716933431421e-05, + "loss": 0.1459, + "step": 30651 + }, + { + "epoch": 0.5467128027681661, + "grad_norm": 0.22968004643917084, + "learning_rate": 2.5285612928862317e-05, + "loss": 0.1294, + "step": 30652 + }, + { + "epoch": 0.5467306388898798, + "grad_norm": 0.3674127757549286, + "learning_rate": 2.528405652230329e-05, + "loss": 0.0873, + "step": 30653 + }, + { + "epoch": 0.5467484750115935, + "grad_norm": 0.19489099085330963, + "learning_rate": 2.5282500114643166e-05, + "loss": 0.0921, + "step": 30654 + }, + { + "epoch": 0.5467663111333072, + "grad_norm": 0.2663986086845398, + "learning_rate": 2.5280943705887973e-05, + "loss": 0.0878, + "step": 30655 + }, + { + "epoch": 0.5467841472550209, + "grad_norm": 0.2378002554178238, + "learning_rate": 2.527938729604375e-05, + "loss": 0.1579, + "step": 30656 + }, + { + "epoch": 0.5468019833767346, + "grad_norm": 0.3102874159812927, + "learning_rate": 2.5277830885116517e-05, + "loss": 0.1256, + "step": 30657 + }, + { + "epoch": 0.5468198194984483, + "grad_norm": 0.25617241859436035, + "learning_rate": 2.5276274473112322e-05, + "loss": 0.1455, + "step": 30658 + }, + { + "epoch": 0.546837655620162, + "grad_norm": 0.25545215606689453, + "learning_rate": 2.5274718060037183e-05, + "loss": 0.1232, + "step": 30659 + }, + { + "epoch": 0.5468554917418756, + "grad_norm": 0.34630346298217773, + "learning_rate": 2.5273161645897148e-05, + "loss": 0.1841, + "step": 30660 + }, + { + "epoch": 0.5468733278635893, + "grad_norm": 0.32550060749053955, + "learning_rate": 2.5271605230698247e-05, + "loss": 0.14, + "step": 30661 + }, + { + "epoch": 0.546891163985303, + "grad_norm": 0.24155452847480774, + "learning_rate": 2.52700488144465e-05, + "loss": 0.1294, + "step": 30662 + }, + { + "epoch": 0.5469090001070167, + "grad_norm": 0.3801244795322418, + "learning_rate": 2.5268492397147964e-05, + "loss": 0.1058, + "step": 30663 + }, + { + "epoch": 0.5469268362287304, + "grad_norm": 0.26613855361938477, + "learning_rate": 2.5266935978808647e-05, + "loss": 0.1782, + "step": 30664 + }, + { + "epoch": 0.5469446723504441, + "grad_norm": 0.22232182323932648, + "learning_rate": 2.5265379559434605e-05, + "loss": 0.0668, + "step": 30665 + }, + { + "epoch": 0.5469625084721578, + "grad_norm": 0.2696954607963562, + "learning_rate": 2.5263823139031846e-05, + "loss": 0.1174, + "step": 30666 + }, + { + "epoch": 0.5469803445938715, + "grad_norm": 0.25743570923805237, + "learning_rate": 2.5262266717606432e-05, + "loss": 0.1738, + "step": 30667 + }, + { + "epoch": 0.5469981807155853, + "grad_norm": 0.25451594591140747, + "learning_rate": 2.526071029516437e-05, + "loss": 0.0917, + "step": 30668 + }, + { + "epoch": 0.547016016837299, + "grad_norm": 0.2859238386154175, + "learning_rate": 2.5259153871711715e-05, + "loss": 0.1411, + "step": 30669 + }, + { + "epoch": 0.5470338529590126, + "grad_norm": 0.312207967042923, + "learning_rate": 2.5257597447254483e-05, + "loss": 0.1664, + "step": 30670 + }, + { + "epoch": 0.5470516890807263, + "grad_norm": 0.2456086426973343, + "learning_rate": 2.5256041021798714e-05, + "loss": 0.123, + "step": 30671 + }, + { + "epoch": 0.54706952520244, + "grad_norm": 0.34301429986953735, + "learning_rate": 2.5254484595350446e-05, + "loss": 0.1661, + "step": 30672 + }, + { + "epoch": 0.5470873613241537, + "grad_norm": 0.33208051323890686, + "learning_rate": 2.5252928167915703e-05, + "loss": 0.1969, + "step": 30673 + }, + { + "epoch": 0.5471051974458674, + "grad_norm": 0.38960370421409607, + "learning_rate": 2.525137173950053e-05, + "loss": 0.1444, + "step": 30674 + }, + { + "epoch": 0.5471230335675811, + "grad_norm": 0.2728498876094818, + "learning_rate": 2.5249815310110947e-05, + "loss": 0.2019, + "step": 30675 + }, + { + "epoch": 0.5471408696892948, + "grad_norm": 0.3494815230369568, + "learning_rate": 2.5248258879753002e-05, + "loss": 0.0997, + "step": 30676 + }, + { + "epoch": 0.5471587058110085, + "grad_norm": 0.2780648469924927, + "learning_rate": 2.524670244843271e-05, + "loss": 0.1554, + "step": 30677 + }, + { + "epoch": 0.5471765419327221, + "grad_norm": 0.3877607583999634, + "learning_rate": 2.524514601615612e-05, + "loss": 0.1757, + "step": 30678 + }, + { + "epoch": 0.5471943780544358, + "grad_norm": 0.3019520342350006, + "learning_rate": 2.5243589582929257e-05, + "loss": 0.1369, + "step": 30679 + }, + { + "epoch": 0.5472122141761495, + "grad_norm": 0.2377205193042755, + "learning_rate": 2.5242033148758152e-05, + "loss": 0.1284, + "step": 30680 + }, + { + "epoch": 0.5472300502978632, + "grad_norm": 0.2232901155948639, + "learning_rate": 2.5240476713648847e-05, + "loss": 0.1111, + "step": 30681 + }, + { + "epoch": 0.5472478864195769, + "grad_norm": 0.29449349641799927, + "learning_rate": 2.5238920277607376e-05, + "loss": 0.099, + "step": 30682 + }, + { + "epoch": 0.5472657225412906, + "grad_norm": 0.25763311982154846, + "learning_rate": 2.523736384063976e-05, + "loss": 0.1411, + "step": 30683 + }, + { + "epoch": 0.5472835586630043, + "grad_norm": 0.28363871574401855, + "learning_rate": 2.5235807402752043e-05, + "loss": 0.1052, + "step": 30684 + }, + { + "epoch": 0.5473013947847181, + "grad_norm": 0.19277070462703705, + "learning_rate": 2.5234250963950263e-05, + "loss": 0.0842, + "step": 30685 + }, + { + "epoch": 0.5473192309064318, + "grad_norm": 0.25599876046180725, + "learning_rate": 2.523269452424043e-05, + "loss": 0.1174, + "step": 30686 + }, + { + "epoch": 0.5473370670281454, + "grad_norm": 0.22644832730293274, + "learning_rate": 2.5231138083628603e-05, + "loss": 0.1631, + "step": 30687 + }, + { + "epoch": 0.5473549031498591, + "grad_norm": 0.2773655951023102, + "learning_rate": 2.5229581642120798e-05, + "loss": 0.118, + "step": 30688 + }, + { + "epoch": 0.5473727392715728, + "grad_norm": 0.25694385170936584, + "learning_rate": 2.522802519972306e-05, + "loss": 0.1185, + "step": 30689 + }, + { + "epoch": 0.5473905753932865, + "grad_norm": 0.26030248403549194, + "learning_rate": 2.5226468756441417e-05, + "loss": 0.1688, + "step": 30690 + }, + { + "epoch": 0.5474084115150002, + "grad_norm": 0.18752585351467133, + "learning_rate": 2.5224912312281907e-05, + "loss": 0.1214, + "step": 30691 + }, + { + "epoch": 0.5474262476367139, + "grad_norm": 0.2060348093509674, + "learning_rate": 2.5223355867250553e-05, + "loss": 0.1245, + "step": 30692 + }, + { + "epoch": 0.5474440837584276, + "grad_norm": 0.2473519891500473, + "learning_rate": 2.5221799421353392e-05, + "loss": 0.1694, + "step": 30693 + }, + { + "epoch": 0.5474619198801413, + "grad_norm": 0.34526124596595764, + "learning_rate": 2.5220242974596463e-05, + "loss": 0.141, + "step": 30694 + }, + { + "epoch": 0.547479756001855, + "grad_norm": 0.2736712396144867, + "learning_rate": 2.52186865269858e-05, + "loss": 0.1525, + "step": 30695 + }, + { + "epoch": 0.5474975921235686, + "grad_norm": 0.3155522346496582, + "learning_rate": 2.521713007852743e-05, + "loss": 0.1077, + "step": 30696 + }, + { + "epoch": 0.5475154282452823, + "grad_norm": 0.2977176308631897, + "learning_rate": 2.521557362922739e-05, + "loss": 0.1787, + "step": 30697 + }, + { + "epoch": 0.547533264366996, + "grad_norm": 0.24261996150016785, + "learning_rate": 2.5214017179091716e-05, + "loss": 0.1573, + "step": 30698 + }, + { + "epoch": 0.5475511004887097, + "grad_norm": 0.35513001680374146, + "learning_rate": 2.5212460728126432e-05, + "loss": 0.1152, + "step": 30699 + }, + { + "epoch": 0.5475689366104234, + "grad_norm": 0.24917559325695038, + "learning_rate": 2.521090427633757e-05, + "loss": 0.1271, + "step": 30700 + }, + { + "epoch": 0.5475867727321371, + "grad_norm": 0.24163921177387238, + "learning_rate": 2.5209347823731177e-05, + "loss": 0.0997, + "step": 30701 + }, + { + "epoch": 0.5476046088538509, + "grad_norm": 0.2742210626602173, + "learning_rate": 2.5207791370313282e-05, + "loss": 0.1415, + "step": 30702 + }, + { + "epoch": 0.5476224449755646, + "grad_norm": 0.22208933532238007, + "learning_rate": 2.5206234916089916e-05, + "loss": 0.1119, + "step": 30703 + }, + { + "epoch": 0.5476402810972782, + "grad_norm": 0.3882651627063751, + "learning_rate": 2.520467846106711e-05, + "loss": 0.184, + "step": 30704 + }, + { + "epoch": 0.5476581172189919, + "grad_norm": 0.24553602933883667, + "learning_rate": 2.5203122005250902e-05, + "loss": 0.096, + "step": 30705 + }, + { + "epoch": 0.5476759533407056, + "grad_norm": 0.43698596954345703, + "learning_rate": 2.520156554864732e-05, + "loss": 0.1092, + "step": 30706 + }, + { + "epoch": 0.5476937894624193, + "grad_norm": 0.28606539964675903, + "learning_rate": 2.52000090912624e-05, + "loss": 0.1197, + "step": 30707 + }, + { + "epoch": 0.547711625584133, + "grad_norm": 0.20325832068920135, + "learning_rate": 2.5198452633102177e-05, + "loss": 0.1456, + "step": 30708 + }, + { + "epoch": 0.5477294617058467, + "grad_norm": 0.30555295944213867, + "learning_rate": 2.519689617417268e-05, + "loss": 0.1244, + "step": 30709 + }, + { + "epoch": 0.5477472978275604, + "grad_norm": 0.2689393162727356, + "learning_rate": 2.5195339714479947e-05, + "loss": 0.1296, + "step": 30710 + }, + { + "epoch": 0.5477651339492741, + "grad_norm": 0.24193690717220306, + "learning_rate": 2.519378325403002e-05, + "loss": 0.1298, + "step": 30711 + }, + { + "epoch": 0.5477829700709878, + "grad_norm": 0.2649729549884796, + "learning_rate": 2.519222679282891e-05, + "loss": 0.1412, + "step": 30712 + }, + { + "epoch": 0.5478008061927014, + "grad_norm": 0.20728227496147156, + "learning_rate": 2.5190670330882664e-05, + "loss": 0.0884, + "step": 30713 + }, + { + "epoch": 0.5478186423144151, + "grad_norm": 0.32075342535972595, + "learning_rate": 2.5189113868197317e-05, + "loss": 0.1501, + "step": 30714 + }, + { + "epoch": 0.5478364784361288, + "grad_norm": 0.2802213430404663, + "learning_rate": 2.5187557404778893e-05, + "loss": 0.2015, + "step": 30715 + }, + { + "epoch": 0.5478543145578425, + "grad_norm": 0.33109137415885925, + "learning_rate": 2.5186000940633438e-05, + "loss": 0.1622, + "step": 30716 + }, + { + "epoch": 0.5478721506795562, + "grad_norm": 0.19377684593200684, + "learning_rate": 2.5184444475766984e-05, + "loss": 0.1096, + "step": 30717 + }, + { + "epoch": 0.5478899868012699, + "grad_norm": 0.18932682275772095, + "learning_rate": 2.5182888010185547e-05, + "loss": 0.0919, + "step": 30718 + }, + { + "epoch": 0.5479078229229837, + "grad_norm": 0.20289833843708038, + "learning_rate": 2.518133154389518e-05, + "loss": 0.157, + "step": 30719 + }, + { + "epoch": 0.5479256590446974, + "grad_norm": 0.2738206386566162, + "learning_rate": 2.517977507690191e-05, + "loss": 0.1547, + "step": 30720 + }, + { + "epoch": 0.547943495166411, + "grad_norm": 0.23485751450061798, + "learning_rate": 2.517821860921177e-05, + "loss": 0.1245, + "step": 30721 + }, + { + "epoch": 0.5479613312881247, + "grad_norm": 0.2354004979133606, + "learning_rate": 2.5176662140830786e-05, + "loss": 0.1383, + "step": 30722 + }, + { + "epoch": 0.5479791674098384, + "grad_norm": 0.3201492130756378, + "learning_rate": 2.5175105671765004e-05, + "loss": 0.1083, + "step": 30723 + }, + { + "epoch": 0.5479970035315521, + "grad_norm": 0.23176920413970947, + "learning_rate": 2.5173549202020458e-05, + "loss": 0.141, + "step": 30724 + }, + { + "epoch": 0.5480148396532658, + "grad_norm": 0.2387719750404358, + "learning_rate": 2.5171992731603167e-05, + "loss": 0.0748, + "step": 30725 + }, + { + "epoch": 0.5480326757749795, + "grad_norm": 0.2699023485183716, + "learning_rate": 2.5170436260519176e-05, + "loss": 0.1609, + "step": 30726 + }, + { + "epoch": 0.5480505118966932, + "grad_norm": 0.3339270055294037, + "learning_rate": 2.5168879788774514e-05, + "loss": 0.1164, + "step": 30727 + }, + { + "epoch": 0.5480683480184069, + "grad_norm": 0.29143330454826355, + "learning_rate": 2.5167323316375213e-05, + "loss": 0.1372, + "step": 30728 + }, + { + "epoch": 0.5480861841401206, + "grad_norm": 0.21815279126167297, + "learning_rate": 2.516576684332731e-05, + "loss": 0.1109, + "step": 30729 + }, + { + "epoch": 0.5481040202618342, + "grad_norm": 0.36182475090026855, + "learning_rate": 2.5164210369636843e-05, + "loss": 0.13, + "step": 30730 + }, + { + "epoch": 0.5481218563835479, + "grad_norm": 0.26641643047332764, + "learning_rate": 2.5162653895309834e-05, + "loss": 0.0967, + "step": 30731 + }, + { + "epoch": 0.5481396925052616, + "grad_norm": 0.2650129199028015, + "learning_rate": 2.5161097420352325e-05, + "loss": 0.1242, + "step": 30732 + }, + { + "epoch": 0.5481575286269753, + "grad_norm": 0.2204132229089737, + "learning_rate": 2.5159540944770342e-05, + "loss": 0.142, + "step": 30733 + }, + { + "epoch": 0.548175364748689, + "grad_norm": 0.28929421305656433, + "learning_rate": 2.5157984468569934e-05, + "loss": 0.1066, + "step": 30734 + }, + { + "epoch": 0.5481932008704027, + "grad_norm": 0.24537834525108337, + "learning_rate": 2.5156427991757114e-05, + "loss": 0.105, + "step": 30735 + }, + { + "epoch": 0.5482110369921165, + "grad_norm": 0.18724027276039124, + "learning_rate": 2.5154871514337924e-05, + "loss": 0.1089, + "step": 30736 + }, + { + "epoch": 0.5482288731138302, + "grad_norm": 0.2552861273288727, + "learning_rate": 2.5153315036318404e-05, + "loss": 0.117, + "step": 30737 + }, + { + "epoch": 0.5482467092355439, + "grad_norm": 0.30879995226860046, + "learning_rate": 2.5151758557704586e-05, + "loss": 0.156, + "step": 30738 + }, + { + "epoch": 0.5482645453572575, + "grad_norm": 0.2633190155029297, + "learning_rate": 2.5150202078502492e-05, + "loss": 0.1008, + "step": 30739 + }, + { + "epoch": 0.5482823814789712, + "grad_norm": 0.2739719748497009, + "learning_rate": 2.514864559871817e-05, + "loss": 0.1465, + "step": 30740 + }, + { + "epoch": 0.5483002176006849, + "grad_norm": 0.27851369976997375, + "learning_rate": 2.5147089118357643e-05, + "loss": 0.1116, + "step": 30741 + }, + { + "epoch": 0.5483180537223986, + "grad_norm": 0.2841547131538391, + "learning_rate": 2.514553263742694e-05, + "loss": 0.1486, + "step": 30742 + }, + { + "epoch": 0.5483358898441123, + "grad_norm": 0.3047911524772644, + "learning_rate": 2.5143976155932107e-05, + "loss": 0.166, + "step": 30743 + }, + { + "epoch": 0.548353725965826, + "grad_norm": 0.2939079999923706, + "learning_rate": 2.514241967387917e-05, + "loss": 0.1203, + "step": 30744 + }, + { + "epoch": 0.5483715620875397, + "grad_norm": 0.19341906905174255, + "learning_rate": 2.5140863191274172e-05, + "loss": 0.092, + "step": 30745 + }, + { + "epoch": 0.5483893982092534, + "grad_norm": 0.3536452054977417, + "learning_rate": 2.5139306708123133e-05, + "loss": 0.147, + "step": 30746 + }, + { + "epoch": 0.548407234330967, + "grad_norm": 0.31168386340141296, + "learning_rate": 2.5137750224432105e-05, + "loss": 0.1296, + "step": 30747 + }, + { + "epoch": 0.5484250704526807, + "grad_norm": 0.34788137674331665, + "learning_rate": 2.51361937402071e-05, + "loss": 0.1281, + "step": 30748 + }, + { + "epoch": 0.5484429065743944, + "grad_norm": 0.29051393270492554, + "learning_rate": 2.513463725545416e-05, + "loss": 0.1491, + "step": 30749 + }, + { + "epoch": 0.5484607426961081, + "grad_norm": 0.24871081113815308, + "learning_rate": 2.5133080770179317e-05, + "loss": 0.1408, + "step": 30750 + }, + { + "epoch": 0.5484785788178218, + "grad_norm": 0.25557181239128113, + "learning_rate": 2.5131524284388612e-05, + "loss": 0.1102, + "step": 30751 + }, + { + "epoch": 0.5484964149395355, + "grad_norm": 0.23606055974960327, + "learning_rate": 2.5129967798088078e-05, + "loss": 0.1409, + "step": 30752 + }, + { + "epoch": 0.5485142510612493, + "grad_norm": 0.2911280691623688, + "learning_rate": 2.512841131128374e-05, + "loss": 0.1273, + "step": 30753 + }, + { + "epoch": 0.548532087182963, + "grad_norm": 0.25190410017967224, + "learning_rate": 2.512685482398164e-05, + "loss": 0.1291, + "step": 30754 + }, + { + "epoch": 0.5485499233046767, + "grad_norm": 0.2589171528816223, + "learning_rate": 2.5125298336187796e-05, + "loss": 0.1163, + "step": 30755 + }, + { + "epoch": 0.5485677594263904, + "grad_norm": 0.2403724640607834, + "learning_rate": 2.5123741847908254e-05, + "loss": 0.1048, + "step": 30756 + }, + { + "epoch": 0.548585595548104, + "grad_norm": 0.24996334314346313, + "learning_rate": 2.5122185359149048e-05, + "loss": 0.1581, + "step": 30757 + }, + { + "epoch": 0.5486034316698177, + "grad_norm": 0.28113648295402527, + "learning_rate": 2.5120628869916214e-05, + "loss": 0.1182, + "step": 30758 + }, + { + "epoch": 0.5486212677915314, + "grad_norm": 0.3192913234233856, + "learning_rate": 2.5119072380215775e-05, + "loss": 0.1248, + "step": 30759 + }, + { + "epoch": 0.5486391039132451, + "grad_norm": 0.3085576891899109, + "learning_rate": 2.5117515890053777e-05, + "loss": 0.1608, + "step": 30760 + }, + { + "epoch": 0.5486569400349588, + "grad_norm": 0.29562899470329285, + "learning_rate": 2.5115959399436244e-05, + "loss": 0.1293, + "step": 30761 + }, + { + "epoch": 0.5486747761566725, + "grad_norm": 0.2797335088253021, + "learning_rate": 2.5114402908369212e-05, + "loss": 0.1248, + "step": 30762 + }, + { + "epoch": 0.5486926122783862, + "grad_norm": 0.31900107860565186, + "learning_rate": 2.5112846416858714e-05, + "loss": 0.089, + "step": 30763 + }, + { + "epoch": 0.5487104484000999, + "grad_norm": 0.35078883171081543, + "learning_rate": 2.511128992491078e-05, + "loss": 0.2087, + "step": 30764 + }, + { + "epoch": 0.5487282845218135, + "grad_norm": 0.8601611852645874, + "learning_rate": 2.5109733432531457e-05, + "loss": 0.1888, + "step": 30765 + }, + { + "epoch": 0.5487461206435272, + "grad_norm": 0.26061251759529114, + "learning_rate": 2.5108176939726763e-05, + "loss": 0.1492, + "step": 30766 + }, + { + "epoch": 0.5487639567652409, + "grad_norm": 0.32434362173080444, + "learning_rate": 2.5106620446502745e-05, + "loss": 0.1008, + "step": 30767 + }, + { + "epoch": 0.5487817928869546, + "grad_norm": 0.2760969400405884, + "learning_rate": 2.510506395286542e-05, + "loss": 0.1325, + "step": 30768 + }, + { + "epoch": 0.5487996290086684, + "grad_norm": 0.2676783502101898, + "learning_rate": 2.5103507458820834e-05, + "loss": 0.1376, + "step": 30769 + }, + { + "epoch": 0.5488174651303821, + "grad_norm": 0.26040154695510864, + "learning_rate": 2.5101950964375014e-05, + "loss": 0.1168, + "step": 30770 + }, + { + "epoch": 0.5488353012520958, + "grad_norm": 0.2696743309497833, + "learning_rate": 2.5100394469534006e-05, + "loss": 0.1805, + "step": 30771 + }, + { + "epoch": 0.5488531373738095, + "grad_norm": 0.28089988231658936, + "learning_rate": 2.5098837974303824e-05, + "loss": 0.1477, + "step": 30772 + }, + { + "epoch": 0.5488709734955232, + "grad_norm": 0.31052690744400024, + "learning_rate": 2.5097281478690522e-05, + "loss": 0.1641, + "step": 30773 + }, + { + "epoch": 0.5488888096172369, + "grad_norm": 0.30707675218582153, + "learning_rate": 2.5095724982700114e-05, + "loss": 0.0983, + "step": 30774 + }, + { + "epoch": 0.5489066457389505, + "grad_norm": 0.3314061164855957, + "learning_rate": 2.5094168486338648e-05, + "loss": 0.138, + "step": 30775 + }, + { + "epoch": 0.5489244818606642, + "grad_norm": 0.33650079369544983, + "learning_rate": 2.509261198961215e-05, + "loss": 0.1796, + "step": 30776 + }, + { + "epoch": 0.5489423179823779, + "grad_norm": 0.3104059398174286, + "learning_rate": 2.5091055492526655e-05, + "loss": 0.1741, + "step": 30777 + }, + { + "epoch": 0.5489601541040916, + "grad_norm": 0.33527112007141113, + "learning_rate": 2.5089498995088207e-05, + "loss": 0.1393, + "step": 30778 + }, + { + "epoch": 0.5489779902258053, + "grad_norm": 0.31261470913887024, + "learning_rate": 2.5087942497302817e-05, + "loss": 0.138, + "step": 30779 + }, + { + "epoch": 0.548995826347519, + "grad_norm": 0.27892300486564636, + "learning_rate": 2.5086385999176544e-05, + "loss": 0.1118, + "step": 30780 + }, + { + "epoch": 0.5490136624692327, + "grad_norm": 0.30686426162719727, + "learning_rate": 2.5084829500715402e-05, + "loss": 0.1776, + "step": 30781 + }, + { + "epoch": 0.5490314985909464, + "grad_norm": 0.23882944881916046, + "learning_rate": 2.5083273001925435e-05, + "loss": 0.1385, + "step": 30782 + }, + { + "epoch": 0.54904933471266, + "grad_norm": 0.3119361996650696, + "learning_rate": 2.508171650281267e-05, + "loss": 0.0854, + "step": 30783 + }, + { + "epoch": 0.5490671708343737, + "grad_norm": 0.2971063554286957, + "learning_rate": 2.508016000338314e-05, + "loss": 0.1361, + "step": 30784 + }, + { + "epoch": 0.5490850069560874, + "grad_norm": 0.2854631543159485, + "learning_rate": 2.5078603503642882e-05, + "loss": 0.2034, + "step": 30785 + }, + { + "epoch": 0.5491028430778012, + "grad_norm": 0.253529816865921, + "learning_rate": 2.5077047003597938e-05, + "loss": 0.1278, + "step": 30786 + }, + { + "epoch": 0.5491206791995149, + "grad_norm": 0.3247106373310089, + "learning_rate": 2.507549050325433e-05, + "loss": 0.1938, + "step": 30787 + }, + { + "epoch": 0.5491385153212286, + "grad_norm": 0.2657196819782257, + "learning_rate": 2.5073934002618094e-05, + "loss": 0.1302, + "step": 30788 + }, + { + "epoch": 0.5491563514429423, + "grad_norm": 0.21578383445739746, + "learning_rate": 2.507237750169526e-05, + "loss": 0.1387, + "step": 30789 + }, + { + "epoch": 0.549174187564656, + "grad_norm": 0.34705445170402527, + "learning_rate": 2.5070821000491873e-05, + "loss": 0.172, + "step": 30790 + }, + { + "epoch": 0.5491920236863697, + "grad_norm": 0.20326213538646698, + "learning_rate": 2.5069264499013956e-05, + "loss": 0.1314, + "step": 30791 + }, + { + "epoch": 0.5492098598080833, + "grad_norm": 0.2628207206726074, + "learning_rate": 2.5067707997267542e-05, + "loss": 0.1713, + "step": 30792 + }, + { + "epoch": 0.549227695929797, + "grad_norm": 0.23542331159114838, + "learning_rate": 2.5066151495258677e-05, + "loss": 0.1561, + "step": 30793 + }, + { + "epoch": 0.5492455320515107, + "grad_norm": 0.2329222708940506, + "learning_rate": 2.5064594992993378e-05, + "loss": 0.1186, + "step": 30794 + }, + { + "epoch": 0.5492633681732244, + "grad_norm": 0.3014250695705414, + "learning_rate": 2.5063038490477692e-05, + "loss": 0.1092, + "step": 30795 + }, + { + "epoch": 0.5492812042949381, + "grad_norm": 0.24184347689151764, + "learning_rate": 2.5061481987717644e-05, + "loss": 0.1218, + "step": 30796 + }, + { + "epoch": 0.5492990404166518, + "grad_norm": 0.26139211654663086, + "learning_rate": 2.5059925484719275e-05, + "loss": 0.129, + "step": 30797 + }, + { + "epoch": 0.5493168765383655, + "grad_norm": 0.241389662027359, + "learning_rate": 2.50583689814886e-05, + "loss": 0.0675, + "step": 30798 + }, + { + "epoch": 0.5493347126600792, + "grad_norm": 0.2729874849319458, + "learning_rate": 2.5056812478031683e-05, + "loss": 0.1503, + "step": 30799 + }, + { + "epoch": 0.5493525487817928, + "grad_norm": 0.26856231689453125, + "learning_rate": 2.5055255974354536e-05, + "loss": 0.1005, + "step": 30800 + }, + { + "epoch": 0.5493703849035065, + "grad_norm": 0.26102638244628906, + "learning_rate": 2.5053699470463198e-05, + "loss": 0.1703, + "step": 30801 + }, + { + "epoch": 0.5493882210252202, + "grad_norm": 0.3327820897102356, + "learning_rate": 2.50521429663637e-05, + "loss": 0.1842, + "step": 30802 + }, + { + "epoch": 0.549406057146934, + "grad_norm": 0.3525446057319641, + "learning_rate": 2.5050586462062082e-05, + "loss": 0.1633, + "step": 30803 + }, + { + "epoch": 0.5494238932686477, + "grad_norm": 0.19760239124298096, + "learning_rate": 2.504902995756437e-05, + "loss": 0.1205, + "step": 30804 + }, + { + "epoch": 0.5494417293903614, + "grad_norm": 0.24560704827308655, + "learning_rate": 2.5047473452876597e-05, + "loss": 0.1142, + "step": 30805 + }, + { + "epoch": 0.5494595655120751, + "grad_norm": 0.1904924213886261, + "learning_rate": 2.504591694800481e-05, + "loss": 0.1179, + "step": 30806 + }, + { + "epoch": 0.5494774016337888, + "grad_norm": 0.25329336524009705, + "learning_rate": 2.5044360442955023e-05, + "loss": 0.1206, + "step": 30807 + }, + { + "epoch": 0.5494952377555025, + "grad_norm": 0.2768438160419464, + "learning_rate": 2.5042803937733288e-05, + "loss": 0.1526, + "step": 30808 + }, + { + "epoch": 0.5495130738772162, + "grad_norm": 0.2582390606403351, + "learning_rate": 2.504124743234563e-05, + "loss": 0.1073, + "step": 30809 + }, + { + "epoch": 0.5495309099989298, + "grad_norm": 0.295486718416214, + "learning_rate": 2.503969092679808e-05, + "loss": 0.1422, + "step": 30810 + }, + { + "epoch": 0.5495487461206435, + "grad_norm": 0.363986611366272, + "learning_rate": 2.503813442109667e-05, + "loss": 0.1586, + "step": 30811 + }, + { + "epoch": 0.5495665822423572, + "grad_norm": 0.25413811206817627, + "learning_rate": 2.503657791524744e-05, + "loss": 0.1312, + "step": 30812 + }, + { + "epoch": 0.5495844183640709, + "grad_norm": 0.3368874192237854, + "learning_rate": 2.503502140925642e-05, + "loss": 0.13, + "step": 30813 + }, + { + "epoch": 0.5496022544857846, + "grad_norm": 0.2822263836860657, + "learning_rate": 2.5033464903129654e-05, + "loss": 0.1492, + "step": 30814 + }, + { + "epoch": 0.5496200906074983, + "grad_norm": 0.2521396279335022, + "learning_rate": 2.503190839687316e-05, + "loss": 0.1483, + "step": 30815 + }, + { + "epoch": 0.549637926729212, + "grad_norm": 0.27061811089515686, + "learning_rate": 2.503035189049298e-05, + "loss": 0.1386, + "step": 30816 + }, + { + "epoch": 0.5496557628509257, + "grad_norm": 0.35903432965278625, + "learning_rate": 2.5028795383995146e-05, + "loss": 0.1612, + "step": 30817 + }, + { + "epoch": 0.5496735989726393, + "grad_norm": 0.2776688039302826, + "learning_rate": 2.502723887738569e-05, + "loss": 0.1653, + "step": 30818 + }, + { + "epoch": 0.549691435094353, + "grad_norm": 0.2492254227399826, + "learning_rate": 2.5025682370670644e-05, + "loss": 0.1731, + "step": 30819 + }, + { + "epoch": 0.5497092712160668, + "grad_norm": 0.25500962138175964, + "learning_rate": 2.502412586385604e-05, + "loss": 0.1182, + "step": 30820 + }, + { + "epoch": 0.5497271073377805, + "grad_norm": 0.2514731287956238, + "learning_rate": 2.502256935694793e-05, + "loss": 0.0896, + "step": 30821 + }, + { + "epoch": 0.5497449434594942, + "grad_norm": 0.2468254715204239, + "learning_rate": 2.502101284995232e-05, + "loss": 0.1253, + "step": 30822 + }, + { + "epoch": 0.5497627795812079, + "grad_norm": 0.292104035615921, + "learning_rate": 2.501945634287527e-05, + "loss": 0.1338, + "step": 30823 + }, + { + "epoch": 0.5497806157029216, + "grad_norm": 0.26707273721694946, + "learning_rate": 2.5017899835722784e-05, + "loss": 0.1173, + "step": 30824 + }, + { + "epoch": 0.5497984518246353, + "grad_norm": 0.32471519708633423, + "learning_rate": 2.5016343328500923e-05, + "loss": 0.1128, + "step": 30825 + }, + { + "epoch": 0.549816287946349, + "grad_norm": 0.20818696916103363, + "learning_rate": 2.5014786821215703e-05, + "loss": 0.129, + "step": 30826 + }, + { + "epoch": 0.5498341240680626, + "grad_norm": 0.28682759404182434, + "learning_rate": 2.5013230313873176e-05, + "loss": 0.2049, + "step": 30827 + }, + { + "epoch": 0.5498519601897763, + "grad_norm": 0.25129520893096924, + "learning_rate": 2.5011673806479354e-05, + "loss": 0.1282, + "step": 30828 + }, + { + "epoch": 0.54986979631149, + "grad_norm": 0.23705129325389862, + "learning_rate": 2.5010117299040286e-05, + "loss": 0.1204, + "step": 30829 + }, + { + "epoch": 0.5498876324332037, + "grad_norm": 0.2183847725391388, + "learning_rate": 2.5008560791561997e-05, + "loss": 0.0953, + "step": 30830 + }, + { + "epoch": 0.5499054685549174, + "grad_norm": 0.21628688275814056, + "learning_rate": 2.5007004284050524e-05, + "loss": 0.1185, + "step": 30831 + }, + { + "epoch": 0.5499233046766311, + "grad_norm": 0.4334314465522766, + "learning_rate": 2.5005447776511893e-05, + "loss": 0.1468, + "step": 30832 + }, + { + "epoch": 0.5499411407983448, + "grad_norm": 0.30514198541641235, + "learning_rate": 2.500389126895215e-05, + "loss": 0.1456, + "step": 30833 + }, + { + "epoch": 0.5499589769200585, + "grad_norm": 0.2574711740016937, + "learning_rate": 2.5002334761377326e-05, + "loss": 0.119, + "step": 30834 + }, + { + "epoch": 0.5499768130417722, + "grad_norm": 0.23198111355304718, + "learning_rate": 2.500077825379345e-05, + "loss": 0.1455, + "step": 30835 + }, + { + "epoch": 0.5499946491634858, + "grad_norm": 0.2556779682636261, + "learning_rate": 2.4999221746206557e-05, + "loss": 0.0968, + "step": 30836 + }, + { + "epoch": 0.5500124852851996, + "grad_norm": 0.22976148128509521, + "learning_rate": 2.4997665238622683e-05, + "loss": 0.1397, + "step": 30837 + }, + { + "epoch": 0.5500303214069133, + "grad_norm": 0.2004426270723343, + "learning_rate": 2.4996108731047853e-05, + "loss": 0.0765, + "step": 30838 + }, + { + "epoch": 0.550048157528627, + "grad_norm": 0.263600617647171, + "learning_rate": 2.499455222348811e-05, + "loss": 0.1204, + "step": 30839 + }, + { + "epoch": 0.5500659936503407, + "grad_norm": 0.27715352177619934, + "learning_rate": 2.499299571594948e-05, + "loss": 0.1507, + "step": 30840 + }, + { + "epoch": 0.5500838297720544, + "grad_norm": 0.25137487053871155, + "learning_rate": 2.4991439208438013e-05, + "loss": 0.1484, + "step": 30841 + }, + { + "epoch": 0.5501016658937681, + "grad_norm": 0.28624987602233887, + "learning_rate": 2.4989882700959717e-05, + "loss": 0.1232, + "step": 30842 + }, + { + "epoch": 0.5501195020154818, + "grad_norm": 0.3630903959274292, + "learning_rate": 2.498832619352065e-05, + "loss": 0.1041, + "step": 30843 + }, + { + "epoch": 0.5501373381371955, + "grad_norm": 0.31580740213394165, + "learning_rate": 2.4986769686126833e-05, + "loss": 0.113, + "step": 30844 + }, + { + "epoch": 0.5501551742589091, + "grad_norm": 0.39485299587249756, + "learning_rate": 2.49852131787843e-05, + "loss": 0.114, + "step": 30845 + }, + { + "epoch": 0.5501730103806228, + "grad_norm": 0.3004516065120697, + "learning_rate": 2.4983656671499083e-05, + "loss": 0.1784, + "step": 30846 + }, + { + "epoch": 0.5501908465023365, + "grad_norm": 0.24809834361076355, + "learning_rate": 2.498210016427721e-05, + "loss": 0.1296, + "step": 30847 + }, + { + "epoch": 0.5502086826240502, + "grad_norm": 0.28357893228530884, + "learning_rate": 2.498054365712474e-05, + "loss": 0.0939, + "step": 30848 + }, + { + "epoch": 0.5502265187457639, + "grad_norm": 0.18697531521320343, + "learning_rate": 2.4978987150047682e-05, + "loss": 0.1068, + "step": 30849 + }, + { + "epoch": 0.5502443548674776, + "grad_norm": 0.34882503747940063, + "learning_rate": 2.4977430643052083e-05, + "loss": 0.1418, + "step": 30850 + }, + { + "epoch": 0.5502621909891913, + "grad_norm": 0.26318806409835815, + "learning_rate": 2.4975874136143956e-05, + "loss": 0.1161, + "step": 30851 + }, + { + "epoch": 0.550280027110905, + "grad_norm": 0.3377297818660736, + "learning_rate": 2.4974317629329365e-05, + "loss": 0.1343, + "step": 30852 + }, + { + "epoch": 0.5502978632326186, + "grad_norm": 0.26560068130493164, + "learning_rate": 2.4972761122614317e-05, + "loss": 0.1038, + "step": 30853 + }, + { + "epoch": 0.5503156993543324, + "grad_norm": 0.2568463087081909, + "learning_rate": 2.4971204616004863e-05, + "loss": 0.1563, + "step": 30854 + }, + { + "epoch": 0.5503335354760461, + "grad_norm": 0.22217004001140594, + "learning_rate": 2.4969648109507022e-05, + "loss": 0.1093, + "step": 30855 + }, + { + "epoch": 0.5503513715977598, + "grad_norm": 0.3076801300048828, + "learning_rate": 2.4968091603126843e-05, + "loss": 0.1781, + "step": 30856 + }, + { + "epoch": 0.5503692077194735, + "grad_norm": 0.17820556461811066, + "learning_rate": 2.4966535096870352e-05, + "loss": 0.1029, + "step": 30857 + }, + { + "epoch": 0.5503870438411872, + "grad_norm": 0.2968607544898987, + "learning_rate": 2.4964978590743585e-05, + "loss": 0.1403, + "step": 30858 + }, + { + "epoch": 0.5504048799629009, + "grad_norm": 0.2898006737232208, + "learning_rate": 2.4963422084752564e-05, + "loss": 0.1071, + "step": 30859 + }, + { + "epoch": 0.5504227160846146, + "grad_norm": 0.3475673496723175, + "learning_rate": 2.4961865578903336e-05, + "loss": 0.1497, + "step": 30860 + }, + { + "epoch": 0.5504405522063283, + "grad_norm": 0.282784104347229, + "learning_rate": 2.4960309073201926e-05, + "loss": 0.1019, + "step": 30861 + }, + { + "epoch": 0.550458388328042, + "grad_norm": 0.29330962896347046, + "learning_rate": 2.4958752567654377e-05, + "loss": 0.1623, + "step": 30862 + }, + { + "epoch": 0.5504762244497556, + "grad_norm": 0.20964418351650238, + "learning_rate": 2.495719606226672e-05, + "loss": 0.1089, + "step": 30863 + }, + { + "epoch": 0.5504940605714693, + "grad_norm": 0.20337319374084473, + "learning_rate": 2.4955639557044976e-05, + "loss": 0.1145, + "step": 30864 + }, + { + "epoch": 0.550511896693183, + "grad_norm": 0.26945674419403076, + "learning_rate": 2.4954083051995198e-05, + "loss": 0.1341, + "step": 30865 + }, + { + "epoch": 0.5505297328148967, + "grad_norm": 0.20108583569526672, + "learning_rate": 2.495252654712341e-05, + "loss": 0.1156, + "step": 30866 + }, + { + "epoch": 0.5505475689366104, + "grad_norm": 0.20346830785274506, + "learning_rate": 2.4950970042435636e-05, + "loss": 0.1458, + "step": 30867 + }, + { + "epoch": 0.5505654050583241, + "grad_norm": 0.24866527318954468, + "learning_rate": 2.4949413537937924e-05, + "loss": 0.0829, + "step": 30868 + }, + { + "epoch": 0.5505832411800378, + "grad_norm": 0.29681286215782166, + "learning_rate": 2.4947857033636303e-05, + "loss": 0.1154, + "step": 30869 + }, + { + "epoch": 0.5506010773017516, + "grad_norm": 0.20020225644111633, + "learning_rate": 2.4946300529536808e-05, + "loss": 0.0829, + "step": 30870 + }, + { + "epoch": 0.5506189134234653, + "grad_norm": 0.3476930260658264, + "learning_rate": 2.4944744025645473e-05, + "loss": 0.1363, + "step": 30871 + }, + { + "epoch": 0.5506367495451789, + "grad_norm": 0.2925347089767456, + "learning_rate": 2.494318752196833e-05, + "loss": 0.1111, + "step": 30872 + }, + { + "epoch": 0.5506545856668926, + "grad_norm": 0.262098491191864, + "learning_rate": 2.4941631018511397e-05, + "loss": 0.1433, + "step": 30873 + }, + { + "epoch": 0.5506724217886063, + "grad_norm": 0.29183250665664673, + "learning_rate": 2.494007451528073e-05, + "loss": 0.1548, + "step": 30874 + }, + { + "epoch": 0.55069025791032, + "grad_norm": 0.2617546021938324, + "learning_rate": 2.493851801228236e-05, + "loss": 0.1716, + "step": 30875 + }, + { + "epoch": 0.5507080940320337, + "grad_norm": 0.332061231136322, + "learning_rate": 2.493696150952232e-05, + "loss": 0.1518, + "step": 30876 + }, + { + "epoch": 0.5507259301537474, + "grad_norm": 0.3245917856693268, + "learning_rate": 2.493540500700662e-05, + "loss": 0.1575, + "step": 30877 + }, + { + "epoch": 0.5507437662754611, + "grad_norm": 0.3222822844982147, + "learning_rate": 2.493384850474133e-05, + "loss": 0.0951, + "step": 30878 + }, + { + "epoch": 0.5507616023971748, + "grad_norm": 0.21622516214847565, + "learning_rate": 2.4932292002732464e-05, + "loss": 0.1492, + "step": 30879 + }, + { + "epoch": 0.5507794385188884, + "grad_norm": 0.22095699608325958, + "learning_rate": 2.4930735500986054e-05, + "loss": 0.1349, + "step": 30880 + }, + { + "epoch": 0.5507972746406021, + "grad_norm": 0.24120375514030457, + "learning_rate": 2.4929178999508132e-05, + "loss": 0.078, + "step": 30881 + }, + { + "epoch": 0.5508151107623158, + "grad_norm": 0.2538900375366211, + "learning_rate": 2.4927622498304738e-05, + "loss": 0.1186, + "step": 30882 + }, + { + "epoch": 0.5508329468840295, + "grad_norm": 0.28807610273361206, + "learning_rate": 2.492606599738191e-05, + "loss": 0.1998, + "step": 30883 + }, + { + "epoch": 0.5508507830057432, + "grad_norm": 0.2732655704021454, + "learning_rate": 2.492450949674568e-05, + "loss": 0.1356, + "step": 30884 + }, + { + "epoch": 0.5508686191274569, + "grad_norm": 0.2930537760257721, + "learning_rate": 2.4922952996402075e-05, + "loss": 0.1536, + "step": 30885 + }, + { + "epoch": 0.5508864552491706, + "grad_norm": 0.3630143702030182, + "learning_rate": 2.492139649635712e-05, + "loss": 0.1656, + "step": 30886 + }, + { + "epoch": 0.5509042913708844, + "grad_norm": 0.28624990582466125, + "learning_rate": 2.4919839996616867e-05, + "loss": 0.1649, + "step": 30887 + }, + { + "epoch": 0.5509221274925981, + "grad_norm": 0.3337860405445099, + "learning_rate": 2.491828349718734e-05, + "loss": 0.0846, + "step": 30888 + }, + { + "epoch": 0.5509399636143117, + "grad_norm": 0.24200084805488586, + "learning_rate": 2.4916726998074578e-05, + "loss": 0.1079, + "step": 30889 + }, + { + "epoch": 0.5509577997360254, + "grad_norm": 0.30300071835517883, + "learning_rate": 2.49151704992846e-05, + "loss": 0.1623, + "step": 30890 + }, + { + "epoch": 0.5509756358577391, + "grad_norm": 0.23886799812316895, + "learning_rate": 2.491361400082346e-05, + "loss": 0.1253, + "step": 30891 + }, + { + "epoch": 0.5509934719794528, + "grad_norm": 0.2942934036254883, + "learning_rate": 2.4912057502697185e-05, + "loss": 0.1886, + "step": 30892 + }, + { + "epoch": 0.5510113081011665, + "grad_norm": 0.24329522252082825, + "learning_rate": 2.4910501004911803e-05, + "loss": 0.1352, + "step": 30893 + }, + { + "epoch": 0.5510291442228802, + "grad_norm": 0.2849752902984619, + "learning_rate": 2.490894450747334e-05, + "loss": 0.11, + "step": 30894 + }, + { + "epoch": 0.5510469803445939, + "grad_norm": 0.26910942792892456, + "learning_rate": 2.490738801038785e-05, + "loss": 0.15, + "step": 30895 + }, + { + "epoch": 0.5510648164663076, + "grad_norm": 0.2877151668071747, + "learning_rate": 2.4905831513661355e-05, + "loss": 0.1739, + "step": 30896 + }, + { + "epoch": 0.5510826525880212, + "grad_norm": 0.31718170642852783, + "learning_rate": 2.4904275017299892e-05, + "loss": 0.1838, + "step": 30897 + }, + { + "epoch": 0.5511004887097349, + "grad_norm": 0.20561911165714264, + "learning_rate": 2.490271852130949e-05, + "loss": 0.1015, + "step": 30898 + }, + { + "epoch": 0.5511183248314486, + "grad_norm": 0.24936097860336304, + "learning_rate": 2.4901162025696175e-05, + "loss": 0.1286, + "step": 30899 + }, + { + "epoch": 0.5511361609531623, + "grad_norm": 0.31895044445991516, + "learning_rate": 2.4899605530466003e-05, + "loss": 0.0918, + "step": 30900 + }, + { + "epoch": 0.551153997074876, + "grad_norm": 0.2934606969356537, + "learning_rate": 2.489804903562499e-05, + "loss": 0.1563, + "step": 30901 + }, + { + "epoch": 0.5511718331965897, + "grad_norm": 0.24587039649486542, + "learning_rate": 2.4896492541179175e-05, + "loss": 0.1147, + "step": 30902 + }, + { + "epoch": 0.5511896693183034, + "grad_norm": 0.2652030885219574, + "learning_rate": 2.489493604713458e-05, + "loss": 0.1504, + "step": 30903 + }, + { + "epoch": 0.5512075054400172, + "grad_norm": 0.24747587740421295, + "learning_rate": 2.4893379553497264e-05, + "loss": 0.1701, + "step": 30904 + }, + { + "epoch": 0.5512253415617309, + "grad_norm": 0.31529778242111206, + "learning_rate": 2.4891823060273242e-05, + "loss": 0.0831, + "step": 30905 + }, + { + "epoch": 0.5512431776834446, + "grad_norm": 0.22627204656600952, + "learning_rate": 2.4890266567468552e-05, + "loss": 0.1134, + "step": 30906 + }, + { + "epoch": 0.5512610138051582, + "grad_norm": 0.23765310645103455, + "learning_rate": 2.488871007508922e-05, + "loss": 0.1426, + "step": 30907 + }, + { + "epoch": 0.5512788499268719, + "grad_norm": 0.26153743267059326, + "learning_rate": 2.4887153583141292e-05, + "loss": 0.1237, + "step": 30908 + }, + { + "epoch": 0.5512966860485856, + "grad_norm": 0.22858697175979614, + "learning_rate": 2.488559709163079e-05, + "loss": 0.1556, + "step": 30909 + }, + { + "epoch": 0.5513145221702993, + "grad_norm": 0.29707035422325134, + "learning_rate": 2.488404060056376e-05, + "loss": 0.1259, + "step": 30910 + }, + { + "epoch": 0.551332358292013, + "grad_norm": 0.2790110409259796, + "learning_rate": 2.4882484109946232e-05, + "loss": 0.1631, + "step": 30911 + }, + { + "epoch": 0.5513501944137267, + "grad_norm": 0.22752730548381805, + "learning_rate": 2.488092761978422e-05, + "loss": 0.1577, + "step": 30912 + }, + { + "epoch": 0.5513680305354404, + "grad_norm": 0.2974559962749481, + "learning_rate": 2.4879371130083788e-05, + "loss": 0.141, + "step": 30913 + }, + { + "epoch": 0.5513858666571541, + "grad_norm": 0.22686950862407684, + "learning_rate": 2.4877814640850958e-05, + "loss": 0.1464, + "step": 30914 + }, + { + "epoch": 0.5514037027788677, + "grad_norm": 0.36017489433288574, + "learning_rate": 2.487625815209175e-05, + "loss": 0.1381, + "step": 30915 + }, + { + "epoch": 0.5514215389005814, + "grad_norm": 0.3199823498725891, + "learning_rate": 2.487470166381221e-05, + "loss": 0.1342, + "step": 30916 + }, + { + "epoch": 0.5514393750222951, + "grad_norm": 0.3009137213230133, + "learning_rate": 2.487314517601837e-05, + "loss": 0.1347, + "step": 30917 + }, + { + "epoch": 0.5514572111440088, + "grad_norm": 0.29534175992012024, + "learning_rate": 2.4871588688716267e-05, + "loss": 0.1324, + "step": 30918 + }, + { + "epoch": 0.5514750472657225, + "grad_norm": 0.2668975591659546, + "learning_rate": 2.487003220191193e-05, + "loss": 0.1025, + "step": 30919 + }, + { + "epoch": 0.5514928833874362, + "grad_norm": 0.26687297224998474, + "learning_rate": 2.4868475715611383e-05, + "loss": 0.1317, + "step": 30920 + }, + { + "epoch": 0.55151071950915, + "grad_norm": 0.2760293185710907, + "learning_rate": 2.4866919229820682e-05, + "loss": 0.1999, + "step": 30921 + }, + { + "epoch": 0.5515285556308637, + "grad_norm": 0.33544641733169556, + "learning_rate": 2.4865362744545847e-05, + "loss": 0.1301, + "step": 30922 + }, + { + "epoch": 0.5515463917525774, + "grad_norm": 0.9272142648696899, + "learning_rate": 2.4863806259792906e-05, + "loss": 0.2047, + "step": 30923 + }, + { + "epoch": 0.551564227874291, + "grad_norm": 0.26097604632377625, + "learning_rate": 2.4862249775567908e-05, + "loss": 0.1181, + "step": 30924 + }, + { + "epoch": 0.5515820639960047, + "grad_norm": 0.24084815382957458, + "learning_rate": 2.4860693291876862e-05, + "loss": 0.0997, + "step": 30925 + }, + { + "epoch": 0.5515999001177184, + "grad_norm": 0.2701628506183624, + "learning_rate": 2.485913680872583e-05, + "loss": 0.1293, + "step": 30926 + }, + { + "epoch": 0.5516177362394321, + "grad_norm": 0.18621596693992615, + "learning_rate": 2.4857580326120833e-05, + "loss": 0.1203, + "step": 30927 + }, + { + "epoch": 0.5516355723611458, + "grad_norm": 0.3313795328140259, + "learning_rate": 2.4856023844067905e-05, + "loss": 0.1261, + "step": 30928 + }, + { + "epoch": 0.5516534084828595, + "grad_norm": 0.2614094913005829, + "learning_rate": 2.4854467362573063e-05, + "loss": 0.1586, + "step": 30929 + }, + { + "epoch": 0.5516712446045732, + "grad_norm": 0.19842149317264557, + "learning_rate": 2.4852910881642366e-05, + "loss": 0.1114, + "step": 30930 + }, + { + "epoch": 0.5516890807262869, + "grad_norm": 0.21920661628246307, + "learning_rate": 2.485135440128184e-05, + "loss": 0.1026, + "step": 30931 + }, + { + "epoch": 0.5517069168480006, + "grad_norm": 0.49742481112480164, + "learning_rate": 2.4849797921497517e-05, + "loss": 0.1146, + "step": 30932 + }, + { + "epoch": 0.5517247529697142, + "grad_norm": 0.36548441648483276, + "learning_rate": 2.4848241442295417e-05, + "loss": 0.1393, + "step": 30933 + }, + { + "epoch": 0.5517425890914279, + "grad_norm": 0.285293847322464, + "learning_rate": 2.4846684963681595e-05, + "loss": 0.1337, + "step": 30934 + }, + { + "epoch": 0.5517604252131416, + "grad_norm": 0.3216269314289093, + "learning_rate": 2.484512848566208e-05, + "loss": 0.183, + "step": 30935 + }, + { + "epoch": 0.5517782613348553, + "grad_norm": 0.19250985980033875, + "learning_rate": 2.484357200824289e-05, + "loss": 0.1174, + "step": 30936 + }, + { + "epoch": 0.551796097456569, + "grad_norm": 0.20033006370067596, + "learning_rate": 2.4842015531430075e-05, + "loss": 0.1139, + "step": 30937 + }, + { + "epoch": 0.5518139335782828, + "grad_norm": 0.2517346143722534, + "learning_rate": 2.4840459055229653e-05, + "loss": 0.1462, + "step": 30938 + }, + { + "epoch": 0.5518317696999965, + "grad_norm": 0.2926196753978729, + "learning_rate": 2.483890257964768e-05, + "loss": 0.1243, + "step": 30939 + }, + { + "epoch": 0.5518496058217102, + "grad_norm": 0.27074941992759705, + "learning_rate": 2.4837346104690175e-05, + "loss": 0.1201, + "step": 30940 + }, + { + "epoch": 0.5518674419434239, + "grad_norm": 0.33330586552619934, + "learning_rate": 2.483578963036317e-05, + "loss": 0.1612, + "step": 30941 + }, + { + "epoch": 0.5518852780651375, + "grad_norm": 0.27145063877105713, + "learning_rate": 2.483423315667269e-05, + "loss": 0.1292, + "step": 30942 + }, + { + "epoch": 0.5519031141868512, + "grad_norm": 0.2024078220129013, + "learning_rate": 2.4832676683624792e-05, + "loss": 0.1356, + "step": 30943 + }, + { + "epoch": 0.5519209503085649, + "grad_norm": 0.2600943446159363, + "learning_rate": 2.4831120211225492e-05, + "loss": 0.1573, + "step": 30944 + }, + { + "epoch": 0.5519387864302786, + "grad_norm": 0.29306846857070923, + "learning_rate": 2.4829563739480836e-05, + "loss": 0.1176, + "step": 30945 + }, + { + "epoch": 0.5519566225519923, + "grad_norm": 0.21002456545829773, + "learning_rate": 2.4828007268396832e-05, + "loss": 0.1033, + "step": 30946 + }, + { + "epoch": 0.551974458673706, + "grad_norm": 0.22434687614440918, + "learning_rate": 2.4826450797979548e-05, + "loss": 0.1242, + "step": 30947 + }, + { + "epoch": 0.5519922947954197, + "grad_norm": 0.20229731500148773, + "learning_rate": 2.4824894328234998e-05, + "loss": 0.0977, + "step": 30948 + }, + { + "epoch": 0.5520101309171334, + "grad_norm": 0.2914944589138031, + "learning_rate": 2.482333785916922e-05, + "loss": 0.1592, + "step": 30949 + }, + { + "epoch": 0.552027967038847, + "grad_norm": 0.19878222048282623, + "learning_rate": 2.482178139078824e-05, + "loss": 0.1131, + "step": 30950 + }, + { + "epoch": 0.5520458031605607, + "grad_norm": 0.3126569092273712, + "learning_rate": 2.482022492309809e-05, + "loss": 0.18, + "step": 30951 + }, + { + "epoch": 0.5520636392822744, + "grad_norm": 0.2392844557762146, + "learning_rate": 2.4818668456104822e-05, + "loss": 0.1201, + "step": 30952 + }, + { + "epoch": 0.5520814754039881, + "grad_norm": 0.19470766186714172, + "learning_rate": 2.4817111989814455e-05, + "loss": 0.1282, + "step": 30953 + }, + { + "epoch": 0.5520993115257018, + "grad_norm": 0.3420150578022003, + "learning_rate": 2.481555552423303e-05, + "loss": 0.1997, + "step": 30954 + }, + { + "epoch": 0.5521171476474156, + "grad_norm": 0.2529189884662628, + "learning_rate": 2.4813999059366565e-05, + "loss": 0.1412, + "step": 30955 + }, + { + "epoch": 0.5521349837691293, + "grad_norm": 0.20657360553741455, + "learning_rate": 2.4812442595221112e-05, + "loss": 0.1562, + "step": 30956 + }, + { + "epoch": 0.552152819890843, + "grad_norm": 0.23202857375144958, + "learning_rate": 2.481088613180269e-05, + "loss": 0.1508, + "step": 30957 + }, + { + "epoch": 0.5521706560125567, + "grad_norm": 0.2111150026321411, + "learning_rate": 2.4809329669117342e-05, + "loss": 0.1196, + "step": 30958 + }, + { + "epoch": 0.5521884921342703, + "grad_norm": 0.1910131722688675, + "learning_rate": 2.4807773207171092e-05, + "loss": 0.0794, + "step": 30959 + }, + { + "epoch": 0.552206328255984, + "grad_norm": 0.22991007566452026, + "learning_rate": 2.4806216745969987e-05, + "loss": 0.1388, + "step": 30960 + }, + { + "epoch": 0.5522241643776977, + "grad_norm": 0.24920505285263062, + "learning_rate": 2.4804660285520055e-05, + "loss": 0.1181, + "step": 30961 + }, + { + "epoch": 0.5522420004994114, + "grad_norm": 0.1558464765548706, + "learning_rate": 2.4803103825827326e-05, + "loss": 0.0724, + "step": 30962 + }, + { + "epoch": 0.5522598366211251, + "grad_norm": 0.22510498762130737, + "learning_rate": 2.4801547366897832e-05, + "loss": 0.1168, + "step": 30963 + }, + { + "epoch": 0.5522776727428388, + "grad_norm": 0.23172415792942047, + "learning_rate": 2.4799990908737606e-05, + "loss": 0.1096, + "step": 30964 + }, + { + "epoch": 0.5522955088645525, + "grad_norm": 0.29675814509391785, + "learning_rate": 2.4798434451352683e-05, + "loss": 0.139, + "step": 30965 + }, + { + "epoch": 0.5523133449862662, + "grad_norm": 0.31687185168266296, + "learning_rate": 2.4796877994749107e-05, + "loss": 0.1614, + "step": 30966 + }, + { + "epoch": 0.5523311811079799, + "grad_norm": 0.22057831287384033, + "learning_rate": 2.47953215389329e-05, + "loss": 0.1121, + "step": 30967 + }, + { + "epoch": 0.5523490172296935, + "grad_norm": 0.19451525807380676, + "learning_rate": 2.4793765083910086e-05, + "loss": 0.107, + "step": 30968 + }, + { + "epoch": 0.5523668533514072, + "grad_norm": 0.3405996263027191, + "learning_rate": 2.479220862968672e-05, + "loss": 0.1305, + "step": 30969 + }, + { + "epoch": 0.5523846894731209, + "grad_norm": 0.24071982502937317, + "learning_rate": 2.479065217626883e-05, + "loss": 0.1372, + "step": 30970 + }, + { + "epoch": 0.5524025255948347, + "grad_norm": 0.2571866512298584, + "learning_rate": 2.478909572366243e-05, + "loss": 0.1239, + "step": 30971 + }, + { + "epoch": 0.5524203617165484, + "grad_norm": 0.37197786569595337, + "learning_rate": 2.4787539271873574e-05, + "loss": 0.1413, + "step": 30972 + }, + { + "epoch": 0.5524381978382621, + "grad_norm": 0.22080372273921967, + "learning_rate": 2.478598282090829e-05, + "loss": 0.1394, + "step": 30973 + }, + { + "epoch": 0.5524560339599758, + "grad_norm": 0.2480211853981018, + "learning_rate": 2.4784426370772613e-05, + "loss": 0.1485, + "step": 30974 + }, + { + "epoch": 0.5524738700816895, + "grad_norm": 0.21872583031654358, + "learning_rate": 2.4782869921472576e-05, + "loss": 0.1341, + "step": 30975 + }, + { + "epoch": 0.5524917062034032, + "grad_norm": 0.19811439514160156, + "learning_rate": 2.4781313473014208e-05, + "loss": 0.0775, + "step": 30976 + }, + { + "epoch": 0.5525095423251168, + "grad_norm": 0.2877408266067505, + "learning_rate": 2.4779757025403536e-05, + "loss": 0.1397, + "step": 30977 + }, + { + "epoch": 0.5525273784468305, + "grad_norm": 0.5443234443664551, + "learning_rate": 2.4778200578646613e-05, + "loss": 0.2372, + "step": 30978 + }, + { + "epoch": 0.5525452145685442, + "grad_norm": 0.17202936112880707, + "learning_rate": 2.4776644132749456e-05, + "loss": 0.0774, + "step": 30979 + }, + { + "epoch": 0.5525630506902579, + "grad_norm": 0.22290056943893433, + "learning_rate": 2.4775087687718105e-05, + "loss": 0.1076, + "step": 30980 + }, + { + "epoch": 0.5525808868119716, + "grad_norm": 0.2118106335401535, + "learning_rate": 2.4773531243558585e-05, + "loss": 0.1274, + "step": 30981 + }, + { + "epoch": 0.5525987229336853, + "grad_norm": 0.23682989180088043, + "learning_rate": 2.4771974800276944e-05, + "loss": 0.1237, + "step": 30982 + }, + { + "epoch": 0.552616559055399, + "grad_norm": 0.38061296939849854, + "learning_rate": 2.4770418357879208e-05, + "loss": 0.1766, + "step": 30983 + }, + { + "epoch": 0.5526343951771127, + "grad_norm": 0.2852959930896759, + "learning_rate": 2.4768861916371406e-05, + "loss": 0.1723, + "step": 30984 + }, + { + "epoch": 0.5526522312988263, + "grad_norm": 0.2268906533718109, + "learning_rate": 2.476730547575957e-05, + "loss": 0.0862, + "step": 30985 + }, + { + "epoch": 0.55267006742054, + "grad_norm": 0.24655452370643616, + "learning_rate": 2.4765749036049746e-05, + "loss": 0.1429, + "step": 30986 + }, + { + "epoch": 0.5526879035422537, + "grad_norm": 0.35229599475860596, + "learning_rate": 2.476419259724796e-05, + "loss": 0.1565, + "step": 30987 + }, + { + "epoch": 0.5527057396639675, + "grad_norm": 0.23618173599243164, + "learning_rate": 2.4762636159360248e-05, + "loss": 0.1337, + "step": 30988 + }, + { + "epoch": 0.5527235757856812, + "grad_norm": 0.25198864936828613, + "learning_rate": 2.4761079722392637e-05, + "loss": 0.1131, + "step": 30989 + }, + { + "epoch": 0.5527414119073949, + "grad_norm": 0.26145991683006287, + "learning_rate": 2.4759523286351155e-05, + "loss": 0.11, + "step": 30990 + }, + { + "epoch": 0.5527592480291086, + "grad_norm": 0.4384521245956421, + "learning_rate": 2.4757966851241853e-05, + "loss": 0.2093, + "step": 30991 + }, + { + "epoch": 0.5527770841508223, + "grad_norm": 0.339275985956192, + "learning_rate": 2.4756410417070752e-05, + "loss": 0.2122, + "step": 30992 + }, + { + "epoch": 0.552794920272536, + "grad_norm": 0.26871371269226074, + "learning_rate": 2.4754853983843893e-05, + "loss": 0.1298, + "step": 30993 + }, + { + "epoch": 0.5528127563942496, + "grad_norm": 0.21961291134357452, + "learning_rate": 2.4753297551567293e-05, + "loss": 0.1165, + "step": 30994 + }, + { + "epoch": 0.5528305925159633, + "grad_norm": 0.26469185948371887, + "learning_rate": 2.4751741120247007e-05, + "loss": 0.1484, + "step": 30995 + }, + { + "epoch": 0.552848428637677, + "grad_norm": 0.32913029193878174, + "learning_rate": 2.475018468988906e-05, + "loss": 0.1705, + "step": 30996 + }, + { + "epoch": 0.5528662647593907, + "grad_norm": 0.2835606038570404, + "learning_rate": 2.4748628260499483e-05, + "loss": 0.1428, + "step": 30997 + }, + { + "epoch": 0.5528841008811044, + "grad_norm": 0.22081094980239868, + "learning_rate": 2.4747071832084296e-05, + "loss": 0.112, + "step": 30998 + }, + { + "epoch": 0.5529019370028181, + "grad_norm": 0.27278465032577515, + "learning_rate": 2.474551540464956e-05, + "loss": 0.0811, + "step": 30999 + }, + { + "epoch": 0.5529197731245318, + "grad_norm": 0.28471705317497253, + "learning_rate": 2.474395897820129e-05, + "loss": 0.1299, + "step": 31000 + }, + { + "epoch": 0.5529197731245318, + "eval_loss": 0.13027334213256836, + "eval_runtime": 108.7216, + "eval_samples_per_second": 9.419, + "eval_steps_per_second": 1.573, + "step": 31000 + }, + { + "epoch": 0.5529376092462455, + "grad_norm": 0.29438138008117676, + "learning_rate": 2.4742402552745527e-05, + "loss": 0.1509, + "step": 31001 + }, + { + "epoch": 0.5529554453679592, + "grad_norm": 0.2521360516548157, + "learning_rate": 2.4740846128288298e-05, + "loss": 0.1508, + "step": 31002 + }, + { + "epoch": 0.5529732814896728, + "grad_norm": 0.30396267771720886, + "learning_rate": 2.473928970483563e-05, + "loss": 0.1261, + "step": 31003 + }, + { + "epoch": 0.5529911176113865, + "grad_norm": 0.3621559739112854, + "learning_rate": 2.4737733282393577e-05, + "loss": 0.1691, + "step": 31004 + }, + { + "epoch": 0.5530089537331003, + "grad_norm": 0.37630075216293335, + "learning_rate": 2.473617686096816e-05, + "loss": 0.086, + "step": 31005 + }, + { + "epoch": 0.553026789854814, + "grad_norm": 0.3095848560333252, + "learning_rate": 2.4734620440565407e-05, + "loss": 0.105, + "step": 31006 + }, + { + "epoch": 0.5530446259765277, + "grad_norm": 0.22440184652805328, + "learning_rate": 2.4733064021191352e-05, + "loss": 0.1235, + "step": 31007 + }, + { + "epoch": 0.5530624620982414, + "grad_norm": 0.24018505215644836, + "learning_rate": 2.4731507602852042e-05, + "loss": 0.1143, + "step": 31008 + }, + { + "epoch": 0.5530802982199551, + "grad_norm": 0.2785538136959076, + "learning_rate": 2.4729951185553502e-05, + "loss": 0.1196, + "step": 31009 + }, + { + "epoch": 0.5530981343416688, + "grad_norm": 0.28385409712791443, + "learning_rate": 2.4728394769301766e-05, + "loss": 0.1839, + "step": 31010 + }, + { + "epoch": 0.5531159704633825, + "grad_norm": 0.23382726311683655, + "learning_rate": 2.4726838354102855e-05, + "loss": 0.1021, + "step": 31011 + }, + { + "epoch": 0.5531338065850961, + "grad_norm": 0.32426387071609497, + "learning_rate": 2.4725281939962822e-05, + "loss": 0.0833, + "step": 31012 + }, + { + "epoch": 0.5531516427068098, + "grad_norm": 0.2664237320423126, + "learning_rate": 2.4723725526887687e-05, + "loss": 0.107, + "step": 31013 + }, + { + "epoch": 0.5531694788285235, + "grad_norm": 0.3490878939628601, + "learning_rate": 2.472216911488349e-05, + "loss": 0.166, + "step": 31014 + }, + { + "epoch": 0.5531873149502372, + "grad_norm": 0.3135441839694977, + "learning_rate": 2.4720612703956263e-05, + "loss": 0.1486, + "step": 31015 + }, + { + "epoch": 0.5532051510719509, + "grad_norm": 0.21774740517139435, + "learning_rate": 2.4719056294112026e-05, + "loss": 0.1288, + "step": 31016 + }, + { + "epoch": 0.5532229871936646, + "grad_norm": 0.20806358754634857, + "learning_rate": 2.4717499885356837e-05, + "loss": 0.1106, + "step": 31017 + }, + { + "epoch": 0.5532408233153783, + "grad_norm": 0.27105849981307983, + "learning_rate": 2.4715943477696715e-05, + "loss": 0.1412, + "step": 31018 + }, + { + "epoch": 0.553258659437092, + "grad_norm": 0.21384112536907196, + "learning_rate": 2.471438707113769e-05, + "loss": 0.1027, + "step": 31019 + }, + { + "epoch": 0.5532764955588056, + "grad_norm": 0.25188562273979187, + "learning_rate": 2.47128306656858e-05, + "loss": 0.1219, + "step": 31020 + }, + { + "epoch": 0.5532943316805193, + "grad_norm": 0.2596520781517029, + "learning_rate": 2.4711274261347073e-05, + "loss": 0.1066, + "step": 31021 + }, + { + "epoch": 0.5533121678022331, + "grad_norm": 0.31644824147224426, + "learning_rate": 2.4709717858127556e-05, + "loss": 0.1848, + "step": 31022 + }, + { + "epoch": 0.5533300039239468, + "grad_norm": 0.24661748111248016, + "learning_rate": 2.470816145603327e-05, + "loss": 0.1632, + "step": 31023 + }, + { + "epoch": 0.5533478400456605, + "grad_norm": 0.3832049071788788, + "learning_rate": 2.4706605055070242e-05, + "loss": 0.135, + "step": 31024 + }, + { + "epoch": 0.5533656761673742, + "grad_norm": 0.23447707295417786, + "learning_rate": 2.4705048655244525e-05, + "loss": 0.1169, + "step": 31025 + }, + { + "epoch": 0.5533835122890879, + "grad_norm": 0.2833011746406555, + "learning_rate": 2.4703492256562142e-05, + "loss": 0.1259, + "step": 31026 + }, + { + "epoch": 0.5534013484108016, + "grad_norm": 0.22857952117919922, + "learning_rate": 2.470193585902912e-05, + "loss": 0.1924, + "step": 31027 + }, + { + "epoch": 0.5534191845325153, + "grad_norm": 0.3064655065536499, + "learning_rate": 2.47003794626515e-05, + "loss": 0.2353, + "step": 31028 + }, + { + "epoch": 0.553437020654229, + "grad_norm": 0.15799081325531006, + "learning_rate": 2.4698823067435303e-05, + "loss": 0.0725, + "step": 31029 + }, + { + "epoch": 0.5534548567759426, + "grad_norm": 0.2504464089870453, + "learning_rate": 2.4697266673386584e-05, + "loss": 0.1401, + "step": 31030 + }, + { + "epoch": 0.5534726928976563, + "grad_norm": 0.22667838633060455, + "learning_rate": 2.4695710280511365e-05, + "loss": 0.1101, + "step": 31031 + }, + { + "epoch": 0.55349052901937, + "grad_norm": 0.27978646755218506, + "learning_rate": 2.4694153888815676e-05, + "loss": 0.1506, + "step": 31032 + }, + { + "epoch": 0.5535083651410837, + "grad_norm": 0.2594587206840515, + "learning_rate": 2.4692597498305542e-05, + "loss": 0.0991, + "step": 31033 + }, + { + "epoch": 0.5535262012627974, + "grad_norm": 0.27300968766212463, + "learning_rate": 2.469104110898701e-05, + "loss": 0.1958, + "step": 31034 + }, + { + "epoch": 0.5535440373845111, + "grad_norm": 0.26017341017723083, + "learning_rate": 2.4689484720866117e-05, + "loss": 0.1194, + "step": 31035 + }, + { + "epoch": 0.5535618735062248, + "grad_norm": 0.25154468417167664, + "learning_rate": 2.468792833394889e-05, + "loss": 0.1448, + "step": 31036 + }, + { + "epoch": 0.5535797096279385, + "grad_norm": 0.347650408744812, + "learning_rate": 2.4686371948241345e-05, + "loss": 0.1506, + "step": 31037 + }, + { + "epoch": 0.5535975457496521, + "grad_norm": 0.27731940150260925, + "learning_rate": 2.4684815563749543e-05, + "loss": 0.1662, + "step": 31038 + }, + { + "epoch": 0.5536153818713659, + "grad_norm": 0.21744950115680695, + "learning_rate": 2.4683259180479508e-05, + "loss": 0.1078, + "step": 31039 + }, + { + "epoch": 0.5536332179930796, + "grad_norm": 0.22725103795528412, + "learning_rate": 2.468170279843726e-05, + "loss": 0.1215, + "step": 31040 + }, + { + "epoch": 0.5536510541147933, + "grad_norm": 0.22972755134105682, + "learning_rate": 2.4680146417628853e-05, + "loss": 0.1357, + "step": 31041 + }, + { + "epoch": 0.553668890236507, + "grad_norm": 0.298816055059433, + "learning_rate": 2.467859003806029e-05, + "loss": 0.0911, + "step": 31042 + }, + { + "epoch": 0.5536867263582207, + "grad_norm": 0.24556376039981842, + "learning_rate": 2.4677033659737643e-05, + "loss": 0.1329, + "step": 31043 + }, + { + "epoch": 0.5537045624799344, + "grad_norm": 0.3007456064224243, + "learning_rate": 2.4675477282666917e-05, + "loss": 0.1314, + "step": 31044 + }, + { + "epoch": 0.5537223986016481, + "grad_norm": 0.23909218609333038, + "learning_rate": 2.467392090685416e-05, + "loss": 0.1282, + "step": 31045 + }, + { + "epoch": 0.5537402347233618, + "grad_norm": 0.26538655161857605, + "learning_rate": 2.467236453230538e-05, + "loss": 0.1578, + "step": 31046 + }, + { + "epoch": 0.5537580708450754, + "grad_norm": 0.3117910325527191, + "learning_rate": 2.4670808159026644e-05, + "loss": 0.1287, + "step": 31047 + }, + { + "epoch": 0.5537759069667891, + "grad_norm": 0.344798743724823, + "learning_rate": 2.4669251787023963e-05, + "loss": 0.1803, + "step": 31048 + }, + { + "epoch": 0.5537937430885028, + "grad_norm": 0.3242945671081543, + "learning_rate": 2.4667695416303382e-05, + "loss": 0.1382, + "step": 31049 + }, + { + "epoch": 0.5538115792102165, + "grad_norm": 0.27023184299468994, + "learning_rate": 2.4666139046870918e-05, + "loss": 0.1517, + "step": 31050 + }, + { + "epoch": 0.5538294153319302, + "grad_norm": 0.1973017156124115, + "learning_rate": 2.4664582678732622e-05, + "loss": 0.1107, + "step": 31051 + }, + { + "epoch": 0.5538472514536439, + "grad_norm": 0.2021416574716568, + "learning_rate": 2.4663026311894525e-05, + "loss": 0.1338, + "step": 31052 + }, + { + "epoch": 0.5538650875753576, + "grad_norm": 0.1999203860759735, + "learning_rate": 2.466146994636265e-05, + "loss": 0.0912, + "step": 31053 + }, + { + "epoch": 0.5538829236970713, + "grad_norm": 0.30637460947036743, + "learning_rate": 2.465991358214303e-05, + "loss": 0.158, + "step": 31054 + }, + { + "epoch": 0.553900759818785, + "grad_norm": 0.2713467478752136, + "learning_rate": 2.4658357219241702e-05, + "loss": 0.1261, + "step": 31055 + }, + { + "epoch": 0.5539185959404987, + "grad_norm": 0.384473979473114, + "learning_rate": 2.46568008576647e-05, + "loss": 0.1749, + "step": 31056 + }, + { + "epoch": 0.5539364320622124, + "grad_norm": 0.261934369802475, + "learning_rate": 2.4655244497418063e-05, + "loss": 0.1295, + "step": 31057 + }, + { + "epoch": 0.5539542681839261, + "grad_norm": 0.3581433892250061, + "learning_rate": 2.4653688138507815e-05, + "loss": 0.1174, + "step": 31058 + }, + { + "epoch": 0.5539721043056398, + "grad_norm": 0.24308906495571136, + "learning_rate": 2.465213178093998e-05, + "loss": 0.1407, + "step": 31059 + }, + { + "epoch": 0.5539899404273535, + "grad_norm": 0.23988047242164612, + "learning_rate": 2.4650575424720618e-05, + "loss": 0.1307, + "step": 31060 + }, + { + "epoch": 0.5540077765490672, + "grad_norm": 0.3088117241859436, + "learning_rate": 2.4649019069855735e-05, + "loss": 0.1466, + "step": 31061 + }, + { + "epoch": 0.5540256126707809, + "grad_norm": 0.33846238255500793, + "learning_rate": 2.4647462716351386e-05, + "loss": 0.1324, + "step": 31062 + }, + { + "epoch": 0.5540434487924946, + "grad_norm": 0.19548974931240082, + "learning_rate": 2.464590636421358e-05, + "loss": 0.1199, + "step": 31063 + }, + { + "epoch": 0.5540612849142083, + "grad_norm": 0.21766163408756256, + "learning_rate": 2.4644350013448373e-05, + "loss": 0.133, + "step": 31064 + }, + { + "epoch": 0.5540791210359219, + "grad_norm": 0.19711840152740479, + "learning_rate": 2.464279366406179e-05, + "loss": 0.0668, + "step": 31065 + }, + { + "epoch": 0.5540969571576356, + "grad_norm": 0.36362069845199585, + "learning_rate": 2.4641237316059863e-05, + "loss": 0.107, + "step": 31066 + }, + { + "epoch": 0.5541147932793493, + "grad_norm": 0.3641241490840912, + "learning_rate": 2.4639680969448618e-05, + "loss": 0.1598, + "step": 31067 + }, + { + "epoch": 0.554132629401063, + "grad_norm": 0.16779862344264984, + "learning_rate": 2.463812462423409e-05, + "loss": 0.0708, + "step": 31068 + }, + { + "epoch": 0.5541504655227767, + "grad_norm": 0.324424147605896, + "learning_rate": 2.463656828042232e-05, + "loss": 0.1874, + "step": 31069 + }, + { + "epoch": 0.5541683016444904, + "grad_norm": 0.18128371238708496, + "learning_rate": 2.4635011938019343e-05, + "loss": 0.0839, + "step": 31070 + }, + { + "epoch": 0.5541861377662041, + "grad_norm": 0.25565630197525024, + "learning_rate": 2.4633455597031184e-05, + "loss": 0.0935, + "step": 31071 + }, + { + "epoch": 0.5542039738879178, + "grad_norm": 0.27423450350761414, + "learning_rate": 2.4631899257463865e-05, + "loss": 0.0831, + "step": 31072 + }, + { + "epoch": 0.5542218100096316, + "grad_norm": 0.22155149281024933, + "learning_rate": 2.4630342919323447e-05, + "loss": 0.0737, + "step": 31073 + }, + { + "epoch": 0.5542396461313452, + "grad_norm": 0.26568421721458435, + "learning_rate": 2.462878658261594e-05, + "loss": 0.1175, + "step": 31074 + }, + { + "epoch": 0.5542574822530589, + "grad_norm": 0.2958373427391052, + "learning_rate": 2.4627230247347386e-05, + "loss": 0.1271, + "step": 31075 + }, + { + "epoch": 0.5542753183747726, + "grad_norm": 0.25283315777778625, + "learning_rate": 2.4625673913523814e-05, + "loss": 0.1073, + "step": 31076 + }, + { + "epoch": 0.5542931544964863, + "grad_norm": 0.19242306053638458, + "learning_rate": 2.462411758115126e-05, + "loss": 0.087, + "step": 31077 + }, + { + "epoch": 0.5543109906182, + "grad_norm": 0.2242615669965744, + "learning_rate": 2.4622561250235763e-05, + "loss": 0.1313, + "step": 31078 + }, + { + "epoch": 0.5543288267399137, + "grad_norm": 0.28264737129211426, + "learning_rate": 2.4621004920783345e-05, + "loss": 0.1041, + "step": 31079 + }, + { + "epoch": 0.5543466628616274, + "grad_norm": 0.2663622498512268, + "learning_rate": 2.4619448592800045e-05, + "loss": 0.0854, + "step": 31080 + }, + { + "epoch": 0.5543644989833411, + "grad_norm": 0.31599247455596924, + "learning_rate": 2.461789226629188e-05, + "loss": 0.1229, + "step": 31081 + }, + { + "epoch": 0.5543823351050547, + "grad_norm": 0.24944135546684265, + "learning_rate": 2.461633594126491e-05, + "loss": 0.1177, + "step": 31082 + }, + { + "epoch": 0.5544001712267684, + "grad_norm": 0.25315147638320923, + "learning_rate": 2.461477961772515e-05, + "loss": 0.107, + "step": 31083 + }, + { + "epoch": 0.5544180073484821, + "grad_norm": 0.2580139935016632, + "learning_rate": 2.4613223295678642e-05, + "loss": 0.1386, + "step": 31084 + }, + { + "epoch": 0.5544358434701958, + "grad_norm": 0.2543202340602875, + "learning_rate": 2.4611666975131404e-05, + "loss": 0.1045, + "step": 31085 + }, + { + "epoch": 0.5544536795919095, + "grad_norm": 0.28528475761413574, + "learning_rate": 2.461011065608949e-05, + "loss": 0.1359, + "step": 31086 + }, + { + "epoch": 0.5544715157136232, + "grad_norm": 0.2835729718208313, + "learning_rate": 2.460855433855892e-05, + "loss": 0.1519, + "step": 31087 + }, + { + "epoch": 0.5544893518353369, + "grad_norm": 0.2815497815608978, + "learning_rate": 2.460699802254572e-05, + "loss": 0.1274, + "step": 31088 + }, + { + "epoch": 0.5545071879570507, + "grad_norm": 0.3152524530887604, + "learning_rate": 2.4605441708055937e-05, + "loss": 0.1628, + "step": 31089 + }, + { + "epoch": 0.5545250240787644, + "grad_norm": 0.28345367312431335, + "learning_rate": 2.4603885395095597e-05, + "loss": 0.1422, + "step": 31090 + }, + { + "epoch": 0.554542860200478, + "grad_norm": 0.45184335112571716, + "learning_rate": 2.4602329083670737e-05, + "loss": 0.1416, + "step": 31091 + }, + { + "epoch": 0.5545606963221917, + "grad_norm": 0.24651899933815002, + "learning_rate": 2.460077277378739e-05, + "loss": 0.1417, + "step": 31092 + }, + { + "epoch": 0.5545785324439054, + "grad_norm": 0.2692287564277649, + "learning_rate": 2.4599216465451584e-05, + "loss": 0.0912, + "step": 31093 + }, + { + "epoch": 0.5545963685656191, + "grad_norm": 0.26124873757362366, + "learning_rate": 2.4597660158669343e-05, + "loss": 0.173, + "step": 31094 + }, + { + "epoch": 0.5546142046873328, + "grad_norm": 0.28615447878837585, + "learning_rate": 2.459610385344672e-05, + "loss": 0.1263, + "step": 31095 + }, + { + "epoch": 0.5546320408090465, + "grad_norm": 0.29463401436805725, + "learning_rate": 2.4594547549789735e-05, + "loss": 0.162, + "step": 31096 + }, + { + "epoch": 0.5546498769307602, + "grad_norm": 0.2976105511188507, + "learning_rate": 2.4592991247704426e-05, + "loss": 0.1113, + "step": 31097 + }, + { + "epoch": 0.5546677130524739, + "grad_norm": 0.2727324068546295, + "learning_rate": 2.4591434947196815e-05, + "loss": 0.1031, + "step": 31098 + }, + { + "epoch": 0.5546855491741876, + "grad_norm": 0.28413137793540955, + "learning_rate": 2.4589878648272952e-05, + "loss": 0.1278, + "step": 31099 + }, + { + "epoch": 0.5547033852959012, + "grad_norm": 0.28571563959121704, + "learning_rate": 2.4588322350938865e-05, + "loss": 0.1431, + "step": 31100 + }, + { + "epoch": 0.5547212214176149, + "grad_norm": 0.25055447220802307, + "learning_rate": 2.4586766055200578e-05, + "loss": 0.1598, + "step": 31101 + }, + { + "epoch": 0.5547390575393286, + "grad_norm": 0.22316960990428925, + "learning_rate": 2.458520976106412e-05, + "loss": 0.1122, + "step": 31102 + }, + { + "epoch": 0.5547568936610423, + "grad_norm": 0.26231351494789124, + "learning_rate": 2.458365346853554e-05, + "loss": 0.1289, + "step": 31103 + }, + { + "epoch": 0.554774729782756, + "grad_norm": 0.294583261013031, + "learning_rate": 2.4582097177620862e-05, + "loss": 0.1476, + "step": 31104 + }, + { + "epoch": 0.5547925659044697, + "grad_norm": 0.20654277503490448, + "learning_rate": 2.4580540888326124e-05, + "loss": 0.1304, + "step": 31105 + }, + { + "epoch": 0.5548104020261835, + "grad_norm": 0.28445523977279663, + "learning_rate": 2.4578984600657354e-05, + "loss": 0.1336, + "step": 31106 + }, + { + "epoch": 0.5548282381478972, + "grad_norm": 0.2922994792461395, + "learning_rate": 2.4577428314620572e-05, + "loss": 0.1049, + "step": 31107 + }, + { + "epoch": 0.5548460742696109, + "grad_norm": 0.22151844203472137, + "learning_rate": 2.4575872030221837e-05, + "loss": 0.1275, + "step": 31108 + }, + { + "epoch": 0.5548639103913245, + "grad_norm": 0.22371706366539001, + "learning_rate": 2.457431574746717e-05, + "loss": 0.1166, + "step": 31109 + }, + { + "epoch": 0.5548817465130382, + "grad_norm": 0.2557823359966278, + "learning_rate": 2.4572759466362593e-05, + "loss": 0.1667, + "step": 31110 + }, + { + "epoch": 0.5548995826347519, + "grad_norm": 0.23507322371006012, + "learning_rate": 2.4571203186914145e-05, + "loss": 0.1384, + "step": 31111 + }, + { + "epoch": 0.5549174187564656, + "grad_norm": 0.2234814167022705, + "learning_rate": 2.456964690912787e-05, + "loss": 0.118, + "step": 31112 + }, + { + "epoch": 0.5549352548781793, + "grad_norm": 0.21652117371559143, + "learning_rate": 2.4568090633009797e-05, + "loss": 0.1365, + "step": 31113 + }, + { + "epoch": 0.554953090999893, + "grad_norm": 0.22298663854599, + "learning_rate": 2.456653435856595e-05, + "loss": 0.0743, + "step": 31114 + }, + { + "epoch": 0.5549709271216067, + "grad_norm": 0.20688222348690033, + "learning_rate": 2.4564978085802353e-05, + "loss": 0.0949, + "step": 31115 + }, + { + "epoch": 0.5549887632433204, + "grad_norm": 0.29829275608062744, + "learning_rate": 2.4563421814725064e-05, + "loss": 0.1537, + "step": 31116 + }, + { + "epoch": 0.555006599365034, + "grad_norm": 0.2555026113986969, + "learning_rate": 2.45618655453401e-05, + "loss": 0.1734, + "step": 31117 + }, + { + "epoch": 0.5550244354867477, + "grad_norm": 0.49102213978767395, + "learning_rate": 2.45603092776535e-05, + "loss": 0.1202, + "step": 31118 + }, + { + "epoch": 0.5550422716084614, + "grad_norm": 0.2527959942817688, + "learning_rate": 2.455875301167129e-05, + "loss": 0.1524, + "step": 31119 + }, + { + "epoch": 0.5550601077301751, + "grad_norm": 0.27550408244132996, + "learning_rate": 2.45571967473995e-05, + "loss": 0.1602, + "step": 31120 + }, + { + "epoch": 0.5550779438518888, + "grad_norm": 0.2326967865228653, + "learning_rate": 2.455564048484418e-05, + "loss": 0.1114, + "step": 31121 + }, + { + "epoch": 0.5550957799736025, + "grad_norm": 0.23051267862319946, + "learning_rate": 2.4554084224011346e-05, + "loss": 0.1149, + "step": 31122 + }, + { + "epoch": 0.5551136160953163, + "grad_norm": 0.23868350684642792, + "learning_rate": 2.455252796490703e-05, + "loss": 0.1022, + "step": 31123 + }, + { + "epoch": 0.55513145221703, + "grad_norm": 0.22512511909008026, + "learning_rate": 2.4550971707537273e-05, + "loss": 0.1071, + "step": 31124 + }, + { + "epoch": 0.5551492883387437, + "grad_norm": 0.27983564138412476, + "learning_rate": 2.4549415451908107e-05, + "loss": 0.1112, + "step": 31125 + }, + { + "epoch": 0.5551671244604574, + "grad_norm": 0.23378156125545502, + "learning_rate": 2.4547859198025563e-05, + "loss": 0.126, + "step": 31126 + }, + { + "epoch": 0.555184960582171, + "grad_norm": 0.237847700715065, + "learning_rate": 2.4546302945895673e-05, + "loss": 0.105, + "step": 31127 + }, + { + "epoch": 0.5552027967038847, + "grad_norm": 0.45227423310279846, + "learning_rate": 2.4544746695524464e-05, + "loss": 0.1279, + "step": 31128 + }, + { + "epoch": 0.5552206328255984, + "grad_norm": 0.26082584261894226, + "learning_rate": 2.454319044691798e-05, + "loss": 0.1119, + "step": 31129 + }, + { + "epoch": 0.5552384689473121, + "grad_norm": 0.303725004196167, + "learning_rate": 2.454163420008225e-05, + "loss": 0.2165, + "step": 31130 + }, + { + "epoch": 0.5552563050690258, + "grad_norm": 0.2925204336643219, + "learning_rate": 2.4540077955023298e-05, + "loss": 0.1454, + "step": 31131 + }, + { + "epoch": 0.5552741411907395, + "grad_norm": 0.22085171937942505, + "learning_rate": 2.4538521711747166e-05, + "loss": 0.1023, + "step": 31132 + }, + { + "epoch": 0.5552919773124532, + "grad_norm": 0.33117154240608215, + "learning_rate": 2.4536965470259875e-05, + "loss": 0.1185, + "step": 31133 + }, + { + "epoch": 0.5553098134341669, + "grad_norm": 0.25302746891975403, + "learning_rate": 2.4535409230567474e-05, + "loss": 0.1658, + "step": 31134 + }, + { + "epoch": 0.5553276495558805, + "grad_norm": 0.2912423014640808, + "learning_rate": 2.453385299267599e-05, + "loss": 0.127, + "step": 31135 + }, + { + "epoch": 0.5553454856775942, + "grad_norm": 0.34991511702537537, + "learning_rate": 2.453229675659145e-05, + "loss": 0.1013, + "step": 31136 + }, + { + "epoch": 0.5553633217993079, + "grad_norm": 0.2708688974380493, + "learning_rate": 2.453074052231988e-05, + "loss": 0.1276, + "step": 31137 + }, + { + "epoch": 0.5553811579210216, + "grad_norm": 0.2998257577419281, + "learning_rate": 2.452918428986733e-05, + "loss": 0.1706, + "step": 31138 + }, + { + "epoch": 0.5553989940427353, + "grad_norm": 0.24800308048725128, + "learning_rate": 2.4527628059239826e-05, + "loss": 0.1166, + "step": 31139 + }, + { + "epoch": 0.5554168301644491, + "grad_norm": 0.21276402473449707, + "learning_rate": 2.45260718304434e-05, + "loss": 0.088, + "step": 31140 + }, + { + "epoch": 0.5554346662861628, + "grad_norm": 0.19047676026821136, + "learning_rate": 2.4524515603484073e-05, + "loss": 0.0666, + "step": 31141 + }, + { + "epoch": 0.5554525024078765, + "grad_norm": 0.24123834073543549, + "learning_rate": 2.4522959378367896e-05, + "loss": 0.2002, + "step": 31142 + }, + { + "epoch": 0.5554703385295902, + "grad_norm": 0.19772756099700928, + "learning_rate": 2.4521403155100898e-05, + "loss": 0.1002, + "step": 31143 + }, + { + "epoch": 0.5554881746513038, + "grad_norm": 0.31100645661354065, + "learning_rate": 2.4519846933689098e-05, + "loss": 0.1728, + "step": 31144 + }, + { + "epoch": 0.5555060107730175, + "grad_norm": 0.15906788408756256, + "learning_rate": 2.451829071413854e-05, + "loss": 0.0853, + "step": 31145 + }, + { + "epoch": 0.5555238468947312, + "grad_norm": 0.2505277991294861, + "learning_rate": 2.4516734496455248e-05, + "loss": 0.1264, + "step": 31146 + }, + { + "epoch": 0.5555416830164449, + "grad_norm": 0.3088262677192688, + "learning_rate": 2.4515178280645267e-05, + "loss": 0.1652, + "step": 31147 + }, + { + "epoch": 0.5555595191381586, + "grad_norm": 0.3293002247810364, + "learning_rate": 2.4513622066714624e-05, + "loss": 0.1787, + "step": 31148 + }, + { + "epoch": 0.5555773552598723, + "grad_norm": 0.2221067100763321, + "learning_rate": 2.4512065854669352e-05, + "loss": 0.0803, + "step": 31149 + }, + { + "epoch": 0.555595191381586, + "grad_norm": 0.24624836444854736, + "learning_rate": 2.4510509644515467e-05, + "loss": 0.1207, + "step": 31150 + }, + { + "epoch": 0.5556130275032997, + "grad_norm": 0.250946044921875, + "learning_rate": 2.4508953436259027e-05, + "loss": 0.1309, + "step": 31151 + }, + { + "epoch": 0.5556308636250133, + "grad_norm": 0.26991331577301025, + "learning_rate": 2.450739722990605e-05, + "loss": 0.1474, + "step": 31152 + }, + { + "epoch": 0.555648699746727, + "grad_norm": 0.24678856134414673, + "learning_rate": 2.4505841025462577e-05, + "loss": 0.1641, + "step": 31153 + }, + { + "epoch": 0.5556665358684407, + "grad_norm": 0.2615037262439728, + "learning_rate": 2.450428482293462e-05, + "loss": 0.1134, + "step": 31154 + }, + { + "epoch": 0.5556843719901544, + "grad_norm": 0.2891719937324524, + "learning_rate": 2.450272862232824e-05, + "loss": 0.1386, + "step": 31155 + }, + { + "epoch": 0.5557022081118681, + "grad_norm": 0.22479921579360962, + "learning_rate": 2.4501172423649453e-05, + "loss": 0.0839, + "step": 31156 + }, + { + "epoch": 0.5557200442335819, + "grad_norm": 0.27169883251190186, + "learning_rate": 2.44996162269043e-05, + "loss": 0.1376, + "step": 31157 + }, + { + "epoch": 0.5557378803552956, + "grad_norm": 0.2793879806995392, + "learning_rate": 2.4498060032098797e-05, + "loss": 0.1455, + "step": 31158 + }, + { + "epoch": 0.5557557164770093, + "grad_norm": 0.30989184975624084, + "learning_rate": 2.4496503839238987e-05, + "loss": 0.1301, + "step": 31159 + }, + { + "epoch": 0.555773552598723, + "grad_norm": 0.26302963495254517, + "learning_rate": 2.44949476483309e-05, + "loss": 0.1291, + "step": 31160 + }, + { + "epoch": 0.5557913887204367, + "grad_norm": 0.2543509006500244, + "learning_rate": 2.449339145938058e-05, + "loss": 0.1653, + "step": 31161 + }, + { + "epoch": 0.5558092248421503, + "grad_norm": 0.32371747493743896, + "learning_rate": 2.449183527239405e-05, + "loss": 0.0999, + "step": 31162 + }, + { + "epoch": 0.555827060963864, + "grad_norm": 0.3102806508541107, + "learning_rate": 2.449027908737733e-05, + "loss": 0.1152, + "step": 31163 + }, + { + "epoch": 0.5558448970855777, + "grad_norm": 0.2877858877182007, + "learning_rate": 2.4488722904336473e-05, + "loss": 0.1497, + "step": 31164 + }, + { + "epoch": 0.5558627332072914, + "grad_norm": 0.2872796356678009, + "learning_rate": 2.44871667232775e-05, + "loss": 0.1274, + "step": 31165 + }, + { + "epoch": 0.5558805693290051, + "grad_norm": 0.299836665391922, + "learning_rate": 2.448561054420645e-05, + "loss": 0.1083, + "step": 31166 + }, + { + "epoch": 0.5558984054507188, + "grad_norm": 0.2736889123916626, + "learning_rate": 2.448405436712934e-05, + "loss": 0.1796, + "step": 31167 + }, + { + "epoch": 0.5559162415724325, + "grad_norm": 0.2884184420108795, + "learning_rate": 2.4482498192052225e-05, + "loss": 0.1343, + "step": 31168 + }, + { + "epoch": 0.5559340776941462, + "grad_norm": 0.2364308089017868, + "learning_rate": 2.4480942018981124e-05, + "loss": 0.1339, + "step": 31169 + }, + { + "epoch": 0.5559519138158598, + "grad_norm": 0.25484520196914673, + "learning_rate": 2.4479385847922075e-05, + "loss": 0.1896, + "step": 31170 + }, + { + "epoch": 0.5559697499375735, + "grad_norm": 0.3305186331272125, + "learning_rate": 2.4477829678881095e-05, + "loss": 0.1364, + "step": 31171 + }, + { + "epoch": 0.5559875860592872, + "grad_norm": 0.26606112718582153, + "learning_rate": 2.447627351186423e-05, + "loss": 0.1602, + "step": 31172 + }, + { + "epoch": 0.5560054221810009, + "grad_norm": 0.2041306495666504, + "learning_rate": 2.4474717346877513e-05, + "loss": 0.1123, + "step": 31173 + }, + { + "epoch": 0.5560232583027147, + "grad_norm": 0.3697538375854492, + "learning_rate": 2.4473161183926975e-05, + "loss": 0.1457, + "step": 31174 + }, + { + "epoch": 0.5560410944244284, + "grad_norm": 0.22401094436645508, + "learning_rate": 2.4471605023018647e-05, + "loss": 0.0702, + "step": 31175 + }, + { + "epoch": 0.5560589305461421, + "grad_norm": 0.24855849146842957, + "learning_rate": 2.447004886415855e-05, + "loss": 0.1422, + "step": 31176 + }, + { + "epoch": 0.5560767666678558, + "grad_norm": 0.3434732258319855, + "learning_rate": 2.4468492707352738e-05, + "loss": 0.1493, + "step": 31177 + }, + { + "epoch": 0.5560946027895695, + "grad_norm": 0.26988792419433594, + "learning_rate": 2.4466936552607232e-05, + "loss": 0.1295, + "step": 31178 + }, + { + "epoch": 0.5561124389112831, + "grad_norm": 0.2510938346385956, + "learning_rate": 2.4465380399928056e-05, + "loss": 0.1657, + "step": 31179 + }, + { + "epoch": 0.5561302750329968, + "grad_norm": 0.30789700150489807, + "learning_rate": 2.446382424932125e-05, + "loss": 0.1356, + "step": 31180 + }, + { + "epoch": 0.5561481111547105, + "grad_norm": 0.20323596894741058, + "learning_rate": 2.4462268100792852e-05, + "loss": 0.1174, + "step": 31181 + }, + { + "epoch": 0.5561659472764242, + "grad_norm": 0.2767643630504608, + "learning_rate": 2.446071195434889e-05, + "loss": 0.1892, + "step": 31182 + }, + { + "epoch": 0.5561837833981379, + "grad_norm": 0.3257175087928772, + "learning_rate": 2.4459155809995393e-05, + "loss": 0.1606, + "step": 31183 + }, + { + "epoch": 0.5562016195198516, + "grad_norm": 0.1835612803697586, + "learning_rate": 2.44575996677384e-05, + "loss": 0.1435, + "step": 31184 + }, + { + "epoch": 0.5562194556415653, + "grad_norm": 0.27789485454559326, + "learning_rate": 2.4456043527583923e-05, + "loss": 0.1353, + "step": 31185 + }, + { + "epoch": 0.556237291763279, + "grad_norm": 0.2492380142211914, + "learning_rate": 2.445448738953802e-05, + "loss": 0.0676, + "step": 31186 + }, + { + "epoch": 0.5562551278849927, + "grad_norm": 0.25265613198280334, + "learning_rate": 2.445293125360671e-05, + "loss": 0.136, + "step": 31187 + }, + { + "epoch": 0.5562729640067063, + "grad_norm": 0.46785983443260193, + "learning_rate": 2.445137511979603e-05, + "loss": 0.119, + "step": 31188 + }, + { + "epoch": 0.55629080012842, + "grad_norm": 0.2458163946866989, + "learning_rate": 2.4449818988112e-05, + "loss": 0.1703, + "step": 31189 + }, + { + "epoch": 0.5563086362501338, + "grad_norm": 0.3483573794364929, + "learning_rate": 2.4448262858560673e-05, + "loss": 0.1786, + "step": 31190 + }, + { + "epoch": 0.5563264723718475, + "grad_norm": 0.19383332133293152, + "learning_rate": 2.4446706731148066e-05, + "loss": 0.0777, + "step": 31191 + }, + { + "epoch": 0.5563443084935612, + "grad_norm": 0.2786974608898163, + "learning_rate": 2.444515060588021e-05, + "loss": 0.0997, + "step": 31192 + }, + { + "epoch": 0.5563621446152749, + "grad_norm": 0.24330143630504608, + "learning_rate": 2.4443594482763146e-05, + "loss": 0.1582, + "step": 31193 + }, + { + "epoch": 0.5563799807369886, + "grad_norm": 0.3224984407424927, + "learning_rate": 2.4442038361802898e-05, + "loss": 0.1418, + "step": 31194 + }, + { + "epoch": 0.5563978168587023, + "grad_norm": 0.22933678328990936, + "learning_rate": 2.444048224300551e-05, + "loss": 0.1498, + "step": 31195 + }, + { + "epoch": 0.556415652980416, + "grad_norm": 0.36137765645980835, + "learning_rate": 2.4438926126377006e-05, + "loss": 0.191, + "step": 31196 + }, + { + "epoch": 0.5564334891021296, + "grad_norm": 0.19844384491443634, + "learning_rate": 2.4437370011923417e-05, + "loss": 0.1402, + "step": 31197 + }, + { + "epoch": 0.5564513252238433, + "grad_norm": 0.3075028955936432, + "learning_rate": 2.4435813899650766e-05, + "loss": 0.1688, + "step": 31198 + }, + { + "epoch": 0.556469161345557, + "grad_norm": 0.25540030002593994, + "learning_rate": 2.4434257789565106e-05, + "loss": 0.1169, + "step": 31199 + }, + { + "epoch": 0.5564869974672707, + "grad_norm": 0.329569935798645, + "learning_rate": 2.4432701681672455e-05, + "loss": 0.1199, + "step": 31200 + }, + { + "epoch": 0.5565048335889844, + "grad_norm": 0.22274599969387054, + "learning_rate": 2.443114557597885e-05, + "loss": 0.1077, + "step": 31201 + }, + { + "epoch": 0.5565226697106981, + "grad_norm": 0.2922258973121643, + "learning_rate": 2.4429589472490313e-05, + "loss": 0.1027, + "step": 31202 + }, + { + "epoch": 0.5565405058324118, + "grad_norm": 0.20232409238815308, + "learning_rate": 2.4428033371212895e-05, + "loss": 0.1056, + "step": 31203 + }, + { + "epoch": 0.5565583419541255, + "grad_norm": 0.4353424608707428, + "learning_rate": 2.4426477272152615e-05, + "loss": 0.1437, + "step": 31204 + }, + { + "epoch": 0.5565761780758391, + "grad_norm": 0.42640116810798645, + "learning_rate": 2.4424921175315506e-05, + "loss": 0.1189, + "step": 31205 + }, + { + "epoch": 0.5565940141975528, + "grad_norm": 0.2993132472038269, + "learning_rate": 2.4423365080707595e-05, + "loss": 0.1731, + "step": 31206 + }, + { + "epoch": 0.5566118503192666, + "grad_norm": 0.2619737386703491, + "learning_rate": 2.4421808988334925e-05, + "loss": 0.1102, + "step": 31207 + }, + { + "epoch": 0.5566296864409803, + "grad_norm": 0.2965092658996582, + "learning_rate": 2.4420252898203525e-05, + "loss": 0.1467, + "step": 31208 + }, + { + "epoch": 0.556647522562694, + "grad_norm": 0.1895340383052826, + "learning_rate": 2.4418696810319425e-05, + "loss": 0.1009, + "step": 31209 + }, + { + "epoch": 0.5566653586844077, + "grad_norm": 0.29556336998939514, + "learning_rate": 2.441714072468866e-05, + "loss": 0.1726, + "step": 31210 + }, + { + "epoch": 0.5566831948061214, + "grad_norm": 0.2298395186662674, + "learning_rate": 2.4415584641317247e-05, + "loss": 0.1851, + "step": 31211 + }, + { + "epoch": 0.5567010309278351, + "grad_norm": 0.3290596008300781, + "learning_rate": 2.4414028560211237e-05, + "loss": 0.1886, + "step": 31212 + }, + { + "epoch": 0.5567188670495488, + "grad_norm": 0.246909499168396, + "learning_rate": 2.4412472481376658e-05, + "loss": 0.1674, + "step": 31213 + }, + { + "epoch": 0.5567367031712624, + "grad_norm": 0.22312992811203003, + "learning_rate": 2.4410916404819533e-05, + "loss": 0.108, + "step": 31214 + }, + { + "epoch": 0.5567545392929761, + "grad_norm": 0.2371286302804947, + "learning_rate": 2.4409360330545894e-05, + "loss": 0.2067, + "step": 31215 + }, + { + "epoch": 0.5567723754146898, + "grad_norm": 0.20558395981788635, + "learning_rate": 2.440780425856179e-05, + "loss": 0.0875, + "step": 31216 + }, + { + "epoch": 0.5567902115364035, + "grad_norm": 0.22025008499622345, + "learning_rate": 2.440624818887324e-05, + "loss": 0.1176, + "step": 31217 + }, + { + "epoch": 0.5568080476581172, + "grad_norm": 0.29906705021858215, + "learning_rate": 2.4404692121486278e-05, + "loss": 0.128, + "step": 31218 + }, + { + "epoch": 0.5568258837798309, + "grad_norm": 0.26025450229644775, + "learning_rate": 2.4403136056406924e-05, + "loss": 0.0914, + "step": 31219 + }, + { + "epoch": 0.5568437199015446, + "grad_norm": 0.23228274285793304, + "learning_rate": 2.440157999364123e-05, + "loss": 0.1302, + "step": 31220 + }, + { + "epoch": 0.5568615560232583, + "grad_norm": 0.33225101232528687, + "learning_rate": 2.4400023933195215e-05, + "loss": 0.11, + "step": 31221 + }, + { + "epoch": 0.556879392144972, + "grad_norm": 0.24343401193618774, + "learning_rate": 2.439846787507492e-05, + "loss": 0.0844, + "step": 31222 + }, + { + "epoch": 0.5568972282666856, + "grad_norm": 0.2668202519416809, + "learning_rate": 2.439691181928637e-05, + "loss": 0.1144, + "step": 31223 + }, + { + "epoch": 0.5569150643883994, + "grad_norm": 0.4529821574687958, + "learning_rate": 2.439535576583559e-05, + "loss": 0.146, + "step": 31224 + }, + { + "epoch": 0.5569329005101131, + "grad_norm": 0.2705513536930084, + "learning_rate": 2.4393799714728628e-05, + "loss": 0.1577, + "step": 31225 + }, + { + "epoch": 0.5569507366318268, + "grad_norm": 0.24749356508255005, + "learning_rate": 2.439224366597151e-05, + "loss": 0.1397, + "step": 31226 + }, + { + "epoch": 0.5569685727535405, + "grad_norm": 0.2518272399902344, + "learning_rate": 2.4390687619570257e-05, + "loss": 0.144, + "step": 31227 + }, + { + "epoch": 0.5569864088752542, + "grad_norm": 0.27374425530433655, + "learning_rate": 2.438913157553091e-05, + "loss": 0.1283, + "step": 31228 + }, + { + "epoch": 0.5570042449969679, + "grad_norm": 0.2797556519508362, + "learning_rate": 2.4387575533859498e-05, + "loss": 0.1165, + "step": 31229 + }, + { + "epoch": 0.5570220811186816, + "grad_norm": 0.229637011885643, + "learning_rate": 2.4386019494562065e-05, + "loss": 0.1549, + "step": 31230 + }, + { + "epoch": 0.5570399172403953, + "grad_norm": 0.2515673339366913, + "learning_rate": 2.438446345764463e-05, + "loss": 0.1498, + "step": 31231 + }, + { + "epoch": 0.5570577533621089, + "grad_norm": 0.34347423911094666, + "learning_rate": 2.4382907423113217e-05, + "loss": 0.0765, + "step": 31232 + }, + { + "epoch": 0.5570755894838226, + "grad_norm": 0.3063320815563202, + "learning_rate": 2.4381351390973877e-05, + "loss": 0.1888, + "step": 31233 + }, + { + "epoch": 0.5570934256055363, + "grad_norm": 0.22773747146129608, + "learning_rate": 2.4379795361232636e-05, + "loss": 0.1485, + "step": 31234 + }, + { + "epoch": 0.55711126172725, + "grad_norm": 0.2152549773454666, + "learning_rate": 2.4378239333895515e-05, + "loss": 0.098, + "step": 31235 + }, + { + "epoch": 0.5571290978489637, + "grad_norm": 0.23356224596500397, + "learning_rate": 2.437668330896856e-05, + "loss": 0.1388, + "step": 31236 + }, + { + "epoch": 0.5571469339706774, + "grad_norm": 0.22563858330249786, + "learning_rate": 2.437512728645778e-05, + "loss": 0.1139, + "step": 31237 + }, + { + "epoch": 0.5571647700923911, + "grad_norm": 0.457992285490036, + "learning_rate": 2.437357126636924e-05, + "loss": 0.1092, + "step": 31238 + }, + { + "epoch": 0.5571826062141048, + "grad_norm": 0.31593573093414307, + "learning_rate": 2.4372015248708952e-05, + "loss": 0.1503, + "step": 31239 + }, + { + "epoch": 0.5572004423358184, + "grad_norm": 0.2338608354330063, + "learning_rate": 2.437045923348295e-05, + "loss": 0.0909, + "step": 31240 + }, + { + "epoch": 0.5572182784575322, + "grad_norm": 0.243311807513237, + "learning_rate": 2.436890322069725e-05, + "loss": 0.1218, + "step": 31241 + }, + { + "epoch": 0.5572361145792459, + "grad_norm": 0.2441393882036209, + "learning_rate": 2.4367347210357907e-05, + "loss": 0.1595, + "step": 31242 + }, + { + "epoch": 0.5572539507009596, + "grad_norm": 0.22858485579490662, + "learning_rate": 2.4365791202470953e-05, + "loss": 0.1423, + "step": 31243 + }, + { + "epoch": 0.5572717868226733, + "grad_norm": 0.30238595604896545, + "learning_rate": 2.436423519704241e-05, + "loss": 0.1768, + "step": 31244 + }, + { + "epoch": 0.557289622944387, + "grad_norm": 0.2489360272884369, + "learning_rate": 2.4362679194078297e-05, + "loss": 0.1327, + "step": 31245 + }, + { + "epoch": 0.5573074590661007, + "grad_norm": 0.3165079951286316, + "learning_rate": 2.4361123193584673e-05, + "loss": 0.1728, + "step": 31246 + }, + { + "epoch": 0.5573252951878144, + "grad_norm": 0.26704007387161255, + "learning_rate": 2.4359567195567557e-05, + "loss": 0.098, + "step": 31247 + }, + { + "epoch": 0.5573431313095281, + "grad_norm": 0.24416470527648926, + "learning_rate": 2.4358011200032973e-05, + "loss": 0.1203, + "step": 31248 + }, + { + "epoch": 0.5573609674312417, + "grad_norm": 0.27771514654159546, + "learning_rate": 2.435645520698696e-05, + "loss": 0.1227, + "step": 31249 + }, + { + "epoch": 0.5573788035529554, + "grad_norm": 0.28032785654067993, + "learning_rate": 2.435489921643555e-05, + "loss": 0.1299, + "step": 31250 + }, + { + "epoch": 0.5573966396746691, + "grad_norm": 0.2338113933801651, + "learning_rate": 2.4353343228384776e-05, + "loss": 0.1119, + "step": 31251 + }, + { + "epoch": 0.5574144757963828, + "grad_norm": 0.21300457417964935, + "learning_rate": 2.435178724284067e-05, + "loss": 0.1066, + "step": 31252 + }, + { + "epoch": 0.5574323119180965, + "grad_norm": 0.31338635087013245, + "learning_rate": 2.4350231259809258e-05, + "loss": 0.1526, + "step": 31253 + }, + { + "epoch": 0.5574501480398102, + "grad_norm": 0.35606101155281067, + "learning_rate": 2.4348675279296566e-05, + "loss": 0.1415, + "step": 31254 + }, + { + "epoch": 0.5574679841615239, + "grad_norm": 0.24667878448963165, + "learning_rate": 2.434711930130864e-05, + "loss": 0.1318, + "step": 31255 + }, + { + "epoch": 0.5574858202832376, + "grad_norm": 0.19740404188632965, + "learning_rate": 2.4345563325851503e-05, + "loss": 0.0701, + "step": 31256 + }, + { + "epoch": 0.5575036564049513, + "grad_norm": 0.3051196038722992, + "learning_rate": 2.4344007352931194e-05, + "loss": 0.1237, + "step": 31257 + }, + { + "epoch": 0.557521492526665, + "grad_norm": 0.23537857830524445, + "learning_rate": 2.434245138255373e-05, + "loss": 0.1538, + "step": 31258 + }, + { + "epoch": 0.5575393286483787, + "grad_norm": 0.2209349423646927, + "learning_rate": 2.4340895414725158e-05, + "loss": 0.1217, + "step": 31259 + }, + { + "epoch": 0.5575571647700924, + "grad_norm": 0.25894731283187866, + "learning_rate": 2.4339339449451504e-05, + "loss": 0.1648, + "step": 31260 + }, + { + "epoch": 0.5575750008918061, + "grad_norm": 0.22068540751934052, + "learning_rate": 2.43377834867388e-05, + "loss": 0.0931, + "step": 31261 + }, + { + "epoch": 0.5575928370135198, + "grad_norm": 0.40496519207954407, + "learning_rate": 2.4336227526593065e-05, + "loss": 0.1509, + "step": 31262 + }, + { + "epoch": 0.5576106731352335, + "grad_norm": 0.2762773633003235, + "learning_rate": 2.4334671569020352e-05, + "loss": 0.1244, + "step": 31263 + }, + { + "epoch": 0.5576285092569472, + "grad_norm": 0.2728966772556305, + "learning_rate": 2.4333115614026676e-05, + "loss": 0.1939, + "step": 31264 + }, + { + "epoch": 0.5576463453786609, + "grad_norm": 0.3698302209377289, + "learning_rate": 2.433155966161808e-05, + "loss": 0.1973, + "step": 31265 + }, + { + "epoch": 0.5576641815003746, + "grad_norm": 0.26120084524154663, + "learning_rate": 2.433000371180059e-05, + "loss": 0.089, + "step": 31266 + }, + { + "epoch": 0.5576820176220882, + "grad_norm": 0.28453847765922546, + "learning_rate": 2.4328447764580227e-05, + "loss": 0.1935, + "step": 31267 + }, + { + "epoch": 0.5576998537438019, + "grad_norm": 0.2795289158821106, + "learning_rate": 2.432689181996304e-05, + "loss": 0.1126, + "step": 31268 + }, + { + "epoch": 0.5577176898655156, + "grad_norm": 0.2534651458263397, + "learning_rate": 2.432533587795505e-05, + "loss": 0.1066, + "step": 31269 + }, + { + "epoch": 0.5577355259872293, + "grad_norm": 0.31744736433029175, + "learning_rate": 2.4323779938562292e-05, + "loss": 0.1334, + "step": 31270 + }, + { + "epoch": 0.557753362108943, + "grad_norm": 0.3402377963066101, + "learning_rate": 2.432222400179079e-05, + "loss": 0.1963, + "step": 31271 + }, + { + "epoch": 0.5577711982306567, + "grad_norm": 0.25750455260276794, + "learning_rate": 2.4320668067646594e-05, + "loss": 0.1652, + "step": 31272 + }, + { + "epoch": 0.5577890343523704, + "grad_norm": 0.2741274833679199, + "learning_rate": 2.4319112136135723e-05, + "loss": 0.1066, + "step": 31273 + }, + { + "epoch": 0.5578068704740841, + "grad_norm": 0.25941941142082214, + "learning_rate": 2.4317556207264204e-05, + "loss": 0.1234, + "step": 31274 + }, + { + "epoch": 0.5578247065957979, + "grad_norm": 0.25675323605537415, + "learning_rate": 2.4316000281038063e-05, + "loss": 0.1555, + "step": 31275 + }, + { + "epoch": 0.5578425427175115, + "grad_norm": 0.26543018221855164, + "learning_rate": 2.4314444357463356e-05, + "loss": 0.0824, + "step": 31276 + }, + { + "epoch": 0.5578603788392252, + "grad_norm": 0.3261551856994629, + "learning_rate": 2.4312888436546093e-05, + "loss": 0.1137, + "step": 31277 + }, + { + "epoch": 0.5578782149609389, + "grad_norm": 0.4933760166168213, + "learning_rate": 2.4311332518292315e-05, + "loss": 0.1287, + "step": 31278 + }, + { + "epoch": 0.5578960510826526, + "grad_norm": 0.2757829427719116, + "learning_rate": 2.4309776602708055e-05, + "loss": 0.1117, + "step": 31279 + }, + { + "epoch": 0.5579138872043663, + "grad_norm": 0.27446433901786804, + "learning_rate": 2.4308220689799323e-05, + "loss": 0.0958, + "step": 31280 + }, + { + "epoch": 0.55793172332608, + "grad_norm": 0.3416633903980255, + "learning_rate": 2.4306664779572177e-05, + "loss": 0.1447, + "step": 31281 + }, + { + "epoch": 0.5579495594477937, + "grad_norm": 0.21480360627174377, + "learning_rate": 2.430510887203264e-05, + "loss": 0.1091, + "step": 31282 + }, + { + "epoch": 0.5579673955695074, + "grad_norm": 0.2571437954902649, + "learning_rate": 2.4303552967186737e-05, + "loss": 0.1308, + "step": 31283 + }, + { + "epoch": 0.557985231691221, + "grad_norm": 0.33606624603271484, + "learning_rate": 2.43019970650405e-05, + "loss": 0.1223, + "step": 31284 + }, + { + "epoch": 0.5580030678129347, + "grad_norm": 0.22661249339580536, + "learning_rate": 2.4300441165599967e-05, + "loss": 0.0671, + "step": 31285 + }, + { + "epoch": 0.5580209039346484, + "grad_norm": 0.2513117492198944, + "learning_rate": 2.429888526887117e-05, + "loss": 0.1551, + "step": 31286 + }, + { + "epoch": 0.5580387400563621, + "grad_norm": 0.249809131026268, + "learning_rate": 2.4297329374860134e-05, + "loss": 0.1236, + "step": 31287 + }, + { + "epoch": 0.5580565761780758, + "grad_norm": 0.2533338963985443, + "learning_rate": 2.429577348357288e-05, + "loss": 0.1206, + "step": 31288 + }, + { + "epoch": 0.5580744122997895, + "grad_norm": 0.27693861722946167, + "learning_rate": 2.4294217595015463e-05, + "loss": 0.1198, + "step": 31289 + }, + { + "epoch": 0.5580922484215032, + "grad_norm": 0.23216712474822998, + "learning_rate": 2.4292661709193906e-05, + "loss": 0.16, + "step": 31290 + }, + { + "epoch": 0.558110084543217, + "grad_norm": 0.25312209129333496, + "learning_rate": 2.429110582611423e-05, + "loss": 0.1396, + "step": 31291 + }, + { + "epoch": 0.5581279206649307, + "grad_norm": 0.23938021063804626, + "learning_rate": 2.4289549945782473e-05, + "loss": 0.1877, + "step": 31292 + }, + { + "epoch": 0.5581457567866444, + "grad_norm": 0.3467380404472351, + "learning_rate": 2.428799406820466e-05, + "loss": 0.2085, + "step": 31293 + }, + { + "epoch": 0.558163592908358, + "grad_norm": 0.2804919183254242, + "learning_rate": 2.4286438193386837e-05, + "loss": 0.1895, + "step": 31294 + }, + { + "epoch": 0.5581814290300717, + "grad_norm": 0.3616204857826233, + "learning_rate": 2.428488232133503e-05, + "loss": 0.0976, + "step": 31295 + }, + { + "epoch": 0.5581992651517854, + "grad_norm": 0.21757498383522034, + "learning_rate": 2.4283326452055256e-05, + "loss": 0.0909, + "step": 31296 + }, + { + "epoch": 0.5582171012734991, + "grad_norm": 0.36979496479034424, + "learning_rate": 2.4281770585553558e-05, + "loss": 0.1721, + "step": 31297 + }, + { + "epoch": 0.5582349373952128, + "grad_norm": 0.2841634154319763, + "learning_rate": 2.4280214721835965e-05, + "loss": 0.1395, + "step": 31298 + }, + { + "epoch": 0.5582527735169265, + "grad_norm": 0.23487070202827454, + "learning_rate": 2.4278658860908513e-05, + "loss": 0.1255, + "step": 31299 + }, + { + "epoch": 0.5582706096386402, + "grad_norm": 0.16677440702915192, + "learning_rate": 2.4277103002777228e-05, + "loss": 0.0967, + "step": 31300 + }, + { + "epoch": 0.5582884457603539, + "grad_norm": 0.22451193630695343, + "learning_rate": 2.427554714744813e-05, + "loss": 0.1233, + "step": 31301 + }, + { + "epoch": 0.5583062818820675, + "grad_norm": 0.2914694845676422, + "learning_rate": 2.4273991294927274e-05, + "loss": 0.1579, + "step": 31302 + }, + { + "epoch": 0.5583241180037812, + "grad_norm": 0.20527410507202148, + "learning_rate": 2.4272435445220677e-05, + "loss": 0.1005, + "step": 31303 + }, + { + "epoch": 0.5583419541254949, + "grad_norm": 0.18512022495269775, + "learning_rate": 2.427087959833437e-05, + "loss": 0.1056, + "step": 31304 + }, + { + "epoch": 0.5583597902472086, + "grad_norm": 0.24566857516765594, + "learning_rate": 2.4269323754274387e-05, + "loss": 0.1247, + "step": 31305 + }, + { + "epoch": 0.5583776263689223, + "grad_norm": 0.25929155945777893, + "learning_rate": 2.426776791304675e-05, + "loss": 0.1729, + "step": 31306 + }, + { + "epoch": 0.558395462490636, + "grad_norm": 0.2857249677181244, + "learning_rate": 2.4266212074657505e-05, + "loss": 0.174, + "step": 31307 + }, + { + "epoch": 0.5584132986123498, + "grad_norm": 0.2965015769004822, + "learning_rate": 2.4264656239112676e-05, + "loss": 0.1511, + "step": 31308 + }, + { + "epoch": 0.5584311347340635, + "grad_norm": 0.29612934589385986, + "learning_rate": 2.4263100406418295e-05, + "loss": 0.1008, + "step": 31309 + }, + { + "epoch": 0.5584489708557772, + "grad_norm": 0.33067581057548523, + "learning_rate": 2.4261544576580377e-05, + "loss": 0.1457, + "step": 31310 + }, + { + "epoch": 0.5584668069774908, + "grad_norm": 0.21364837884902954, + "learning_rate": 2.4259988749604984e-05, + "loss": 0.1031, + "step": 31311 + }, + { + "epoch": 0.5584846430992045, + "grad_norm": 0.35552164912223816, + "learning_rate": 2.425843292549812e-05, + "loss": 0.163, + "step": 31312 + }, + { + "epoch": 0.5585024792209182, + "grad_norm": 0.2339756190776825, + "learning_rate": 2.425687710426583e-05, + "loss": 0.0841, + "step": 31313 + }, + { + "epoch": 0.5585203153426319, + "grad_norm": 0.4499208331108093, + "learning_rate": 2.4255321285914137e-05, + "loss": 0.1466, + "step": 31314 + }, + { + "epoch": 0.5585381514643456, + "grad_norm": 0.29159289598464966, + "learning_rate": 2.4253765470449084e-05, + "loss": 0.1015, + "step": 31315 + }, + { + "epoch": 0.5585559875860593, + "grad_norm": 0.22079280018806458, + "learning_rate": 2.425220965787669e-05, + "loss": 0.1227, + "step": 31316 + }, + { + "epoch": 0.558573823707773, + "grad_norm": 0.2803367078304291, + "learning_rate": 2.4250653848202994e-05, + "loss": 0.1132, + "step": 31317 + }, + { + "epoch": 0.5585916598294867, + "grad_norm": 0.2252812683582306, + "learning_rate": 2.4249098041434015e-05, + "loss": 0.1458, + "step": 31318 + }, + { + "epoch": 0.5586094959512004, + "grad_norm": 0.31765860319137573, + "learning_rate": 2.4247542237575784e-05, + "loss": 0.1242, + "step": 31319 + }, + { + "epoch": 0.558627332072914, + "grad_norm": 0.23371125757694244, + "learning_rate": 2.4245986436634354e-05, + "loss": 0.1356, + "step": 31320 + }, + { + "epoch": 0.5586451681946277, + "grad_norm": 0.21731913089752197, + "learning_rate": 2.424443063861574e-05, + "loss": 0.1373, + "step": 31321 + }, + { + "epoch": 0.5586630043163414, + "grad_norm": 0.24007317423820496, + "learning_rate": 2.4242874843525973e-05, + "loss": 0.1632, + "step": 31322 + }, + { + "epoch": 0.5586808404380551, + "grad_norm": 0.2753141522407532, + "learning_rate": 2.4241319051371074e-05, + "loss": 0.0983, + "step": 31323 + }, + { + "epoch": 0.5586986765597688, + "grad_norm": 0.23878073692321777, + "learning_rate": 2.4239763262157094e-05, + "loss": 0.13, + "step": 31324 + }, + { + "epoch": 0.5587165126814826, + "grad_norm": 0.31503576040267944, + "learning_rate": 2.4238207475890052e-05, + "loss": 0.1439, + "step": 31325 + }, + { + "epoch": 0.5587343488031963, + "grad_norm": 0.3762138783931732, + "learning_rate": 2.4236651692575985e-05, + "loss": 0.0864, + "step": 31326 + }, + { + "epoch": 0.55875218492491, + "grad_norm": 0.25332483649253845, + "learning_rate": 2.4235095912220905e-05, + "loss": 0.0681, + "step": 31327 + }, + { + "epoch": 0.5587700210466237, + "grad_norm": 0.27977555990219116, + "learning_rate": 2.4233540134830874e-05, + "loss": 0.1156, + "step": 31328 + }, + { + "epoch": 0.5587878571683373, + "grad_norm": 0.19785727560520172, + "learning_rate": 2.4231984360411903e-05, + "loss": 0.0633, + "step": 31329 + }, + { + "epoch": 0.558805693290051, + "grad_norm": 0.2541104555130005, + "learning_rate": 2.4230428588970026e-05, + "loss": 0.1187, + "step": 31330 + }, + { + "epoch": 0.5588235294117647, + "grad_norm": 0.2536443769931793, + "learning_rate": 2.422887282051127e-05, + "loss": 0.1072, + "step": 31331 + }, + { + "epoch": 0.5588413655334784, + "grad_norm": 0.23140481114387512, + "learning_rate": 2.422731705504167e-05, + "loss": 0.1097, + "step": 31332 + }, + { + "epoch": 0.5588592016551921, + "grad_norm": 0.26262009143829346, + "learning_rate": 2.422576129256725e-05, + "loss": 0.1303, + "step": 31333 + }, + { + "epoch": 0.5588770377769058, + "grad_norm": 0.3466799557209015, + "learning_rate": 2.4224205533094058e-05, + "loss": 0.1591, + "step": 31334 + }, + { + "epoch": 0.5588948738986195, + "grad_norm": 0.3482324481010437, + "learning_rate": 2.422264977662811e-05, + "loss": 0.2553, + "step": 31335 + }, + { + "epoch": 0.5589127100203332, + "grad_norm": 0.27339667081832886, + "learning_rate": 2.422109402317543e-05, + "loss": 0.1135, + "step": 31336 + }, + { + "epoch": 0.5589305461420468, + "grad_norm": 0.23287396132946014, + "learning_rate": 2.421953827274207e-05, + "loss": 0.1449, + "step": 31337 + }, + { + "epoch": 0.5589483822637605, + "grad_norm": 0.3708248436450958, + "learning_rate": 2.4217982525334047e-05, + "loss": 0.1823, + "step": 31338 + }, + { + "epoch": 0.5589662183854742, + "grad_norm": 0.24411886930465698, + "learning_rate": 2.4216426780957393e-05, + "loss": 0.1332, + "step": 31339 + }, + { + "epoch": 0.5589840545071879, + "grad_norm": 0.3579823076725006, + "learning_rate": 2.4214871039618138e-05, + "loss": 0.1483, + "step": 31340 + }, + { + "epoch": 0.5590018906289016, + "grad_norm": 0.3297712206840515, + "learning_rate": 2.4213315301322313e-05, + "loss": 0.1554, + "step": 31341 + }, + { + "epoch": 0.5590197267506154, + "grad_norm": 0.3117685914039612, + "learning_rate": 2.4211759566075953e-05, + "loss": 0.1232, + "step": 31342 + }, + { + "epoch": 0.5590375628723291, + "grad_norm": 0.24371536076068878, + "learning_rate": 2.4210203833885087e-05, + "loss": 0.1491, + "step": 31343 + }, + { + "epoch": 0.5590553989940428, + "grad_norm": 0.3649060130119324, + "learning_rate": 2.4208648104755745e-05, + "loss": 0.141, + "step": 31344 + }, + { + "epoch": 0.5590732351157565, + "grad_norm": 0.23793701827526093, + "learning_rate": 2.4207092378693942e-05, + "loss": 0.1024, + "step": 31345 + }, + { + "epoch": 0.5590910712374701, + "grad_norm": 0.22641438245773315, + "learning_rate": 2.4205536655705734e-05, + "loss": 0.1109, + "step": 31346 + }, + { + "epoch": 0.5591089073591838, + "grad_norm": 0.2940894067287445, + "learning_rate": 2.420398093579714e-05, + "loss": 0.1167, + "step": 31347 + }, + { + "epoch": 0.5591267434808975, + "grad_norm": 0.2984673082828522, + "learning_rate": 2.420242521897419e-05, + "loss": 0.1558, + "step": 31348 + }, + { + "epoch": 0.5591445796026112, + "grad_norm": 0.27737459540367126, + "learning_rate": 2.420086950524291e-05, + "loss": 0.1169, + "step": 31349 + }, + { + "epoch": 0.5591624157243249, + "grad_norm": 0.2794012129306793, + "learning_rate": 2.4199313794609342e-05, + "loss": 0.1392, + "step": 31350 + }, + { + "epoch": 0.5591802518460386, + "grad_norm": 0.2727005183696747, + "learning_rate": 2.4197758087079513e-05, + "loss": 0.1399, + "step": 31351 + }, + { + "epoch": 0.5591980879677523, + "grad_norm": 0.24773304164409637, + "learning_rate": 2.4196202382659446e-05, + "loss": 0.1556, + "step": 31352 + }, + { + "epoch": 0.559215924089466, + "grad_norm": 0.2942865490913391, + "learning_rate": 2.4194646681355176e-05, + "loss": 0.1276, + "step": 31353 + }, + { + "epoch": 0.5592337602111797, + "grad_norm": 0.34158968925476074, + "learning_rate": 2.419309098317273e-05, + "loss": 0.1646, + "step": 31354 + }, + { + "epoch": 0.5592515963328933, + "grad_norm": 0.27357548475265503, + "learning_rate": 2.419153528811815e-05, + "loss": 0.1754, + "step": 31355 + }, + { + "epoch": 0.559269432454607, + "grad_norm": 0.2923235595226288, + "learning_rate": 2.418997959619746e-05, + "loss": 0.1142, + "step": 31356 + }, + { + "epoch": 0.5592872685763207, + "grad_norm": 0.268451988697052, + "learning_rate": 2.418842390741669e-05, + "loss": 0.1326, + "step": 31357 + }, + { + "epoch": 0.5593051046980344, + "grad_norm": 0.20715366303920746, + "learning_rate": 2.418686822178185e-05, + "loss": 0.1287, + "step": 31358 + }, + { + "epoch": 0.5593229408197482, + "grad_norm": 0.22980226576328278, + "learning_rate": 2.418531253929901e-05, + "loss": 0.1162, + "step": 31359 + }, + { + "epoch": 0.5593407769414619, + "grad_norm": 0.3117606043815613, + "learning_rate": 2.418375685997417e-05, + "loss": 0.1826, + "step": 31360 + }, + { + "epoch": 0.5593586130631756, + "grad_norm": 0.24659809470176697, + "learning_rate": 2.4182201183813376e-05, + "loss": 0.1456, + "step": 31361 + }, + { + "epoch": 0.5593764491848893, + "grad_norm": 0.21749085187911987, + "learning_rate": 2.4180645510822643e-05, + "loss": 0.1277, + "step": 31362 + }, + { + "epoch": 0.559394285306603, + "grad_norm": 0.3348606824874878, + "learning_rate": 2.4179089841008022e-05, + "loss": 0.1296, + "step": 31363 + }, + { + "epoch": 0.5594121214283166, + "grad_norm": 0.24284961819648743, + "learning_rate": 2.4177534174375532e-05, + "loss": 0.1354, + "step": 31364 + }, + { + "epoch": 0.5594299575500303, + "grad_norm": 0.2766849100589752, + "learning_rate": 2.4175978510931202e-05, + "loss": 0.1027, + "step": 31365 + }, + { + "epoch": 0.559447793671744, + "grad_norm": 0.2365037202835083, + "learning_rate": 2.4174422850681054e-05, + "loss": 0.1203, + "step": 31366 + }, + { + "epoch": 0.5594656297934577, + "grad_norm": 0.2361527979373932, + "learning_rate": 2.4172867193631142e-05, + "loss": 0.1325, + "step": 31367 + }, + { + "epoch": 0.5594834659151714, + "grad_norm": 0.28245800733566284, + "learning_rate": 2.4171311539787474e-05, + "loss": 0.1538, + "step": 31368 + }, + { + "epoch": 0.5595013020368851, + "grad_norm": 0.2114911824464798, + "learning_rate": 2.4169755889156093e-05, + "loss": 0.0824, + "step": 31369 + }, + { + "epoch": 0.5595191381585988, + "grad_norm": 0.27884936332702637, + "learning_rate": 2.4168200241743024e-05, + "loss": 0.1024, + "step": 31370 + }, + { + "epoch": 0.5595369742803125, + "grad_norm": 0.32638683915138245, + "learning_rate": 2.416664459755429e-05, + "loss": 0.126, + "step": 31371 + }, + { + "epoch": 0.5595548104020261, + "grad_norm": 0.25983357429504395, + "learning_rate": 2.416508895659594e-05, + "loss": 0.0625, + "step": 31372 + }, + { + "epoch": 0.5595726465237398, + "grad_norm": 0.29169797897338867, + "learning_rate": 2.416353331887399e-05, + "loss": 0.132, + "step": 31373 + }, + { + "epoch": 0.5595904826454535, + "grad_norm": 0.3536311984062195, + "learning_rate": 2.4161977684394477e-05, + "loss": 0.1278, + "step": 31374 + }, + { + "epoch": 0.5596083187671672, + "grad_norm": 0.2661953866481781, + "learning_rate": 2.4160422053163417e-05, + "loss": 0.1349, + "step": 31375 + }, + { + "epoch": 0.559626154888881, + "grad_norm": 0.32716691493988037, + "learning_rate": 2.415886642518686e-05, + "loss": 0.1213, + "step": 31376 + }, + { + "epoch": 0.5596439910105947, + "grad_norm": 0.2951219379901886, + "learning_rate": 2.4157310800470832e-05, + "loss": 0.154, + "step": 31377 + }, + { + "epoch": 0.5596618271323084, + "grad_norm": 0.3086981773376465, + "learning_rate": 2.4155755179021356e-05, + "loss": 0.1987, + "step": 31378 + }, + { + "epoch": 0.5596796632540221, + "grad_norm": 0.2586064040660858, + "learning_rate": 2.4154199560844455e-05, + "loss": 0.1098, + "step": 31379 + }, + { + "epoch": 0.5596974993757358, + "grad_norm": 0.2792659401893616, + "learning_rate": 2.4152643945946173e-05, + "loss": 0.1649, + "step": 31380 + }, + { + "epoch": 0.5597153354974495, + "grad_norm": 0.27620840072631836, + "learning_rate": 2.4151088334332538e-05, + "loss": 0.207, + "step": 31381 + }, + { + "epoch": 0.5597331716191631, + "grad_norm": 0.27956655621528625, + "learning_rate": 2.414953272600958e-05, + "loss": 0.1389, + "step": 31382 + }, + { + "epoch": 0.5597510077408768, + "grad_norm": 0.39406895637512207, + "learning_rate": 2.414797712098333e-05, + "loss": 0.1383, + "step": 31383 + }, + { + "epoch": 0.5597688438625905, + "grad_norm": 0.2809605002403259, + "learning_rate": 2.41464215192598e-05, + "loss": 0.1417, + "step": 31384 + }, + { + "epoch": 0.5597866799843042, + "grad_norm": 0.32095375657081604, + "learning_rate": 2.4144865920845046e-05, + "loss": 0.1161, + "step": 31385 + }, + { + "epoch": 0.5598045161060179, + "grad_norm": 0.22089307010173798, + "learning_rate": 2.4143310325745087e-05, + "loss": 0.1239, + "step": 31386 + }, + { + "epoch": 0.5598223522277316, + "grad_norm": 0.25564903020858765, + "learning_rate": 2.4141754733965952e-05, + "loss": 0.0696, + "step": 31387 + }, + { + "epoch": 0.5598401883494453, + "grad_norm": 0.2967927157878876, + "learning_rate": 2.4140199145513667e-05, + "loss": 0.189, + "step": 31388 + }, + { + "epoch": 0.559858024471159, + "grad_norm": 0.33322104811668396, + "learning_rate": 2.413864356039427e-05, + "loss": 0.1818, + "step": 31389 + }, + { + "epoch": 0.5598758605928726, + "grad_norm": 0.24041935801506042, + "learning_rate": 2.4137087978613794e-05, + "loss": 0.1629, + "step": 31390 + }, + { + "epoch": 0.5598936967145863, + "grad_norm": 0.23456811904907227, + "learning_rate": 2.4135532400178263e-05, + "loss": 0.1234, + "step": 31391 + }, + { + "epoch": 0.5599115328363001, + "grad_norm": 0.2907894253730774, + "learning_rate": 2.4133976825093693e-05, + "loss": 0.1741, + "step": 31392 + }, + { + "epoch": 0.5599293689580138, + "grad_norm": 0.22135528922080994, + "learning_rate": 2.4132421253366142e-05, + "loss": 0.1577, + "step": 31393 + }, + { + "epoch": 0.5599472050797275, + "grad_norm": 0.33190566301345825, + "learning_rate": 2.4130865685001623e-05, + "loss": 0.1944, + "step": 31394 + }, + { + "epoch": 0.5599650412014412, + "grad_norm": 0.28703323006629944, + "learning_rate": 2.4129310120006165e-05, + "loss": 0.1479, + "step": 31395 + }, + { + "epoch": 0.5599828773231549, + "grad_norm": 0.28238335251808167, + "learning_rate": 2.4127754558385807e-05, + "loss": 0.1392, + "step": 31396 + }, + { + "epoch": 0.5600007134448686, + "grad_norm": 0.25788432359695435, + "learning_rate": 2.4126199000146564e-05, + "loss": 0.1412, + "step": 31397 + }, + { + "epoch": 0.5600185495665823, + "grad_norm": 0.21524713933467865, + "learning_rate": 2.4124643445294484e-05, + "loss": 0.1134, + "step": 31398 + }, + { + "epoch": 0.560036385688296, + "grad_norm": 0.22623580694198608, + "learning_rate": 2.4123087893835587e-05, + "loss": 0.1156, + "step": 31399 + }, + { + "epoch": 0.5600542218100096, + "grad_norm": 0.27302345633506775, + "learning_rate": 2.412153234577591e-05, + "loss": 0.1778, + "step": 31400 + }, + { + "epoch": 0.5600720579317233, + "grad_norm": 0.3543337881565094, + "learning_rate": 2.4119976801121465e-05, + "loss": 0.1561, + "step": 31401 + }, + { + "epoch": 0.560089894053437, + "grad_norm": 0.3550248444080353, + "learning_rate": 2.4118421259878294e-05, + "loss": 0.1934, + "step": 31402 + }, + { + "epoch": 0.5601077301751507, + "grad_norm": 0.17590175569057465, + "learning_rate": 2.4116865722052433e-05, + "loss": 0.1129, + "step": 31403 + }, + { + "epoch": 0.5601255662968644, + "grad_norm": 0.3290773630142212, + "learning_rate": 2.4115310187649907e-05, + "loss": 0.1012, + "step": 31404 + }, + { + "epoch": 0.5601434024185781, + "grad_norm": 0.4822506308555603, + "learning_rate": 2.4113754656676736e-05, + "loss": 0.1741, + "step": 31405 + }, + { + "epoch": 0.5601612385402918, + "grad_norm": 0.2730311155319214, + "learning_rate": 2.4112199129138968e-05, + "loss": 0.145, + "step": 31406 + }, + { + "epoch": 0.5601790746620054, + "grad_norm": 0.30690997838974, + "learning_rate": 2.411064360504262e-05, + "loss": 0.1346, + "step": 31407 + }, + { + "epoch": 0.5601969107837191, + "grad_norm": 0.2378779798746109, + "learning_rate": 2.4109088084393724e-05, + "loss": 0.125, + "step": 31408 + }, + { + "epoch": 0.5602147469054329, + "grad_norm": 0.32127776741981506, + "learning_rate": 2.4107532567198313e-05, + "loss": 0.1233, + "step": 31409 + }, + { + "epoch": 0.5602325830271466, + "grad_norm": 0.2980946898460388, + "learning_rate": 2.4105977053462402e-05, + "loss": 0.1342, + "step": 31410 + }, + { + "epoch": 0.5602504191488603, + "grad_norm": 0.1978059560060501, + "learning_rate": 2.4104421543192044e-05, + "loss": 0.1377, + "step": 31411 + }, + { + "epoch": 0.560268255270574, + "grad_norm": 0.2657274901866913, + "learning_rate": 2.4102866036393258e-05, + "loss": 0.1315, + "step": 31412 + }, + { + "epoch": 0.5602860913922877, + "grad_norm": 0.43789440393447876, + "learning_rate": 2.4101310533072074e-05, + "loss": 0.1327, + "step": 31413 + }, + { + "epoch": 0.5603039275140014, + "grad_norm": 0.32028093934059143, + "learning_rate": 2.409975503323451e-05, + "loss": 0.1276, + "step": 31414 + }, + { + "epoch": 0.5603217636357151, + "grad_norm": 0.2650982141494751, + "learning_rate": 2.4098199536886616e-05, + "loss": 0.1556, + "step": 31415 + }, + { + "epoch": 0.5603395997574288, + "grad_norm": 0.29290321469306946, + "learning_rate": 2.4096644044034406e-05, + "loss": 0.1159, + "step": 31416 + }, + { + "epoch": 0.5603574358791424, + "grad_norm": 0.26071709394454956, + "learning_rate": 2.409508855468392e-05, + "loss": 0.0988, + "step": 31417 + }, + { + "epoch": 0.5603752720008561, + "grad_norm": 0.24011465907096863, + "learning_rate": 2.4093533068841176e-05, + "loss": 0.1082, + "step": 31418 + }, + { + "epoch": 0.5603931081225698, + "grad_norm": 0.21061640977859497, + "learning_rate": 2.4091977586512223e-05, + "loss": 0.117, + "step": 31419 + }, + { + "epoch": 0.5604109442442835, + "grad_norm": 0.3192479610443115, + "learning_rate": 2.4090422107703074e-05, + "loss": 0.1421, + "step": 31420 + }, + { + "epoch": 0.5604287803659972, + "grad_norm": 0.24465888738632202, + "learning_rate": 2.408886663241976e-05, + "loss": 0.1264, + "step": 31421 + }, + { + "epoch": 0.5604466164877109, + "grad_norm": 0.22927770018577576, + "learning_rate": 2.4087311160668315e-05, + "loss": 0.1111, + "step": 31422 + }, + { + "epoch": 0.5604644526094246, + "grad_norm": 0.36364415287971497, + "learning_rate": 2.4085755692454763e-05, + "loss": 0.1195, + "step": 31423 + }, + { + "epoch": 0.5604822887311383, + "grad_norm": 0.20706453919410706, + "learning_rate": 2.4084200227785143e-05, + "loss": 0.1032, + "step": 31424 + }, + { + "epoch": 0.5605001248528519, + "grad_norm": 0.2741907238960266, + "learning_rate": 2.408264476666548e-05, + "loss": 0.1141, + "step": 31425 + }, + { + "epoch": 0.5605179609745657, + "grad_norm": 0.23388290405273438, + "learning_rate": 2.4081089309101806e-05, + "loss": 0.1257, + "step": 31426 + }, + { + "epoch": 0.5605357970962794, + "grad_norm": 0.3187490403652191, + "learning_rate": 2.4079533855100132e-05, + "loss": 0.1578, + "step": 31427 + }, + { + "epoch": 0.5605536332179931, + "grad_norm": 0.28279832005500793, + "learning_rate": 2.4077978404666516e-05, + "loss": 0.1487, + "step": 31428 + }, + { + "epoch": 0.5605714693397068, + "grad_norm": 0.313147634267807, + "learning_rate": 2.407642295780697e-05, + "loss": 0.1286, + "step": 31429 + }, + { + "epoch": 0.5605893054614205, + "grad_norm": 0.31667324900627136, + "learning_rate": 2.4074867514527533e-05, + "loss": 0.0985, + "step": 31430 + }, + { + "epoch": 0.5606071415831342, + "grad_norm": 0.24084815382957458, + "learning_rate": 2.4073312074834217e-05, + "loss": 0.0968, + "step": 31431 + }, + { + "epoch": 0.5606249777048479, + "grad_norm": 0.2430635690689087, + "learning_rate": 2.4071756638733077e-05, + "loss": 0.1148, + "step": 31432 + }, + { + "epoch": 0.5606428138265616, + "grad_norm": 0.3483937978744507, + "learning_rate": 2.4070201206230125e-05, + "loss": 0.14, + "step": 31433 + }, + { + "epoch": 0.5606606499482752, + "grad_norm": 0.3065212368965149, + "learning_rate": 2.4068645777331394e-05, + "loss": 0.2544, + "step": 31434 + }, + { + "epoch": 0.5606784860699889, + "grad_norm": 0.28280800580978394, + "learning_rate": 2.406709035204291e-05, + "loss": 0.1354, + "step": 31435 + }, + { + "epoch": 0.5606963221917026, + "grad_norm": 0.5980995297431946, + "learning_rate": 2.4065534930370704e-05, + "loss": 0.1478, + "step": 31436 + }, + { + "epoch": 0.5607141583134163, + "grad_norm": 0.2971732020378113, + "learning_rate": 2.4063979512320812e-05, + "loss": 0.1532, + "step": 31437 + }, + { + "epoch": 0.56073199443513, + "grad_norm": 0.30833563208580017, + "learning_rate": 2.4062424097899263e-05, + "loss": 0.1498, + "step": 31438 + }, + { + "epoch": 0.5607498305568437, + "grad_norm": 0.37537726759910583, + "learning_rate": 2.4060868687112078e-05, + "loss": 0.2028, + "step": 31439 + }, + { + "epoch": 0.5607676666785574, + "grad_norm": 0.2662407457828522, + "learning_rate": 2.405931327996528e-05, + "loss": 0.1564, + "step": 31440 + }, + { + "epoch": 0.5607855028002711, + "grad_norm": 0.25998353958129883, + "learning_rate": 2.4057757876464924e-05, + "loss": 0.188, + "step": 31441 + }, + { + "epoch": 0.5608033389219848, + "grad_norm": 0.23177604377269745, + "learning_rate": 2.405620247661702e-05, + "loss": 0.1339, + "step": 31442 + }, + { + "epoch": 0.5608211750436985, + "grad_norm": 0.31647083163261414, + "learning_rate": 2.4054647080427598e-05, + "loss": 0.1516, + "step": 31443 + }, + { + "epoch": 0.5608390111654122, + "grad_norm": 0.38324934244155884, + "learning_rate": 2.4053091687902687e-05, + "loss": 0.1683, + "step": 31444 + }, + { + "epoch": 0.5608568472871259, + "grad_norm": 0.18483775854110718, + "learning_rate": 2.4051536299048323e-05, + "loss": 0.1005, + "step": 31445 + }, + { + "epoch": 0.5608746834088396, + "grad_norm": 0.2281084954738617, + "learning_rate": 2.404998091387054e-05, + "loss": 0.102, + "step": 31446 + }, + { + "epoch": 0.5608925195305533, + "grad_norm": 0.2561064064502716, + "learning_rate": 2.4048425532375352e-05, + "loss": 0.1334, + "step": 31447 + }, + { + "epoch": 0.560910355652267, + "grad_norm": 0.3684125542640686, + "learning_rate": 2.4046870154568798e-05, + "loss": 0.1451, + "step": 31448 + }, + { + "epoch": 0.5609281917739807, + "grad_norm": 0.29495108127593994, + "learning_rate": 2.4045314780456896e-05, + "loss": 0.2184, + "step": 31449 + }, + { + "epoch": 0.5609460278956944, + "grad_norm": 0.26864001154899597, + "learning_rate": 2.4043759410045688e-05, + "loss": 0.1198, + "step": 31450 + }, + { + "epoch": 0.560963864017408, + "grad_norm": 0.28136900067329407, + "learning_rate": 2.40422040433412e-05, + "loss": 0.1467, + "step": 31451 + }, + { + "epoch": 0.5609817001391217, + "grad_norm": 0.2807161808013916, + "learning_rate": 2.4040648680349467e-05, + "loss": 0.1331, + "step": 31452 + }, + { + "epoch": 0.5609995362608354, + "grad_norm": 0.1994100958108902, + "learning_rate": 2.4039093321076494e-05, + "loss": 0.1138, + "step": 31453 + }, + { + "epoch": 0.5610173723825491, + "grad_norm": 0.22997941076755524, + "learning_rate": 2.4037537965528343e-05, + "loss": 0.1614, + "step": 31454 + }, + { + "epoch": 0.5610352085042628, + "grad_norm": 0.23174940049648285, + "learning_rate": 2.4035982613711026e-05, + "loss": 0.1259, + "step": 31455 + }, + { + "epoch": 0.5610530446259765, + "grad_norm": 0.3028358221054077, + "learning_rate": 2.4034427265630564e-05, + "loss": 0.0863, + "step": 31456 + }, + { + "epoch": 0.5610708807476902, + "grad_norm": 0.31821393966674805, + "learning_rate": 2.4032871921293002e-05, + "loss": 0.1826, + "step": 31457 + }, + { + "epoch": 0.5610887168694039, + "grad_norm": 0.2550991475582123, + "learning_rate": 2.4031316580704357e-05, + "loss": 0.0723, + "step": 31458 + }, + { + "epoch": 0.5611065529911176, + "grad_norm": 0.29452019929885864, + "learning_rate": 2.402976124387067e-05, + "loss": 0.1164, + "step": 31459 + }, + { + "epoch": 0.5611243891128314, + "grad_norm": 0.28820693492889404, + "learning_rate": 2.4028205910797965e-05, + "loss": 0.098, + "step": 31460 + }, + { + "epoch": 0.561142225234545, + "grad_norm": 0.22527167201042175, + "learning_rate": 2.402665058149227e-05, + "loss": 0.1627, + "step": 31461 + }, + { + "epoch": 0.5611600613562587, + "grad_norm": 0.29793596267700195, + "learning_rate": 2.4025095255959602e-05, + "loss": 0.1407, + "step": 31462 + }, + { + "epoch": 0.5611778974779724, + "grad_norm": 0.2924523651599884, + "learning_rate": 2.4023539934206012e-05, + "loss": 0.1459, + "step": 31463 + }, + { + "epoch": 0.5611957335996861, + "grad_norm": 0.23874494433403015, + "learning_rate": 2.402198461623751e-05, + "loss": 0.1313, + "step": 31464 + }, + { + "epoch": 0.5612135697213998, + "grad_norm": 0.35290271043777466, + "learning_rate": 2.4020429302060142e-05, + "loss": 0.1292, + "step": 31465 + }, + { + "epoch": 0.5612314058431135, + "grad_norm": 0.29797959327697754, + "learning_rate": 2.4018873991679917e-05, + "loss": 0.1872, + "step": 31466 + }, + { + "epoch": 0.5612492419648272, + "grad_norm": 0.28676173090934753, + "learning_rate": 2.4017318685102888e-05, + "loss": 0.1154, + "step": 31467 + }, + { + "epoch": 0.5612670780865409, + "grad_norm": 0.26434823870658875, + "learning_rate": 2.401576338233507e-05, + "loss": 0.1503, + "step": 31468 + }, + { + "epoch": 0.5612849142082545, + "grad_norm": 0.3844640254974365, + "learning_rate": 2.4014208083382496e-05, + "loss": 0.1533, + "step": 31469 + }, + { + "epoch": 0.5613027503299682, + "grad_norm": 0.28696200251579285, + "learning_rate": 2.4012652788251175e-05, + "loss": 0.2161, + "step": 31470 + }, + { + "epoch": 0.5613205864516819, + "grad_norm": 0.2041654735803604, + "learning_rate": 2.4011097496947166e-05, + "loss": 0.1288, + "step": 31471 + }, + { + "epoch": 0.5613384225733956, + "grad_norm": 0.26276299357414246, + "learning_rate": 2.400954220947648e-05, + "loss": 0.1454, + "step": 31472 + }, + { + "epoch": 0.5613562586951093, + "grad_norm": 0.25883156061172485, + "learning_rate": 2.4007986925845154e-05, + "loss": 0.1309, + "step": 31473 + }, + { + "epoch": 0.561374094816823, + "grad_norm": 0.33456939458847046, + "learning_rate": 2.4006431646059218e-05, + "loss": 0.1275, + "step": 31474 + }, + { + "epoch": 0.5613919309385367, + "grad_norm": 0.23295243084430695, + "learning_rate": 2.4004876370124682e-05, + "loss": 0.1196, + "step": 31475 + }, + { + "epoch": 0.5614097670602504, + "grad_norm": 0.2399265170097351, + "learning_rate": 2.40033210980476e-05, + "loss": 0.085, + "step": 31476 + }, + { + "epoch": 0.5614276031819642, + "grad_norm": 0.23919454216957092, + "learning_rate": 2.400176582983399e-05, + "loss": 0.1595, + "step": 31477 + }, + { + "epoch": 0.5614454393036779, + "grad_norm": 0.33691704273223877, + "learning_rate": 2.4000210565489877e-05, + "loss": 0.145, + "step": 31478 + }, + { + "epoch": 0.5614632754253915, + "grad_norm": 0.23280148208141327, + "learning_rate": 2.3998655305021282e-05, + "loss": 0.1136, + "step": 31479 + }, + { + "epoch": 0.5614811115471052, + "grad_norm": 0.26939666271209717, + "learning_rate": 2.399710004843426e-05, + "loss": 0.1611, + "step": 31480 + }, + { + "epoch": 0.5614989476688189, + "grad_norm": 0.2814178466796875, + "learning_rate": 2.3995544795734824e-05, + "loss": 0.1318, + "step": 31481 + }, + { + "epoch": 0.5615167837905326, + "grad_norm": 0.25734108686447144, + "learning_rate": 2.3993989546929002e-05, + "loss": 0.1375, + "step": 31482 + }, + { + "epoch": 0.5615346199122463, + "grad_norm": 0.248263418674469, + "learning_rate": 2.3992434302022815e-05, + "loss": 0.1053, + "step": 31483 + }, + { + "epoch": 0.56155245603396, + "grad_norm": 0.25484323501586914, + "learning_rate": 2.399087906102231e-05, + "loss": 0.1065, + "step": 31484 + }, + { + "epoch": 0.5615702921556737, + "grad_norm": 0.29617932438850403, + "learning_rate": 2.3989323823933502e-05, + "loss": 0.1417, + "step": 31485 + }, + { + "epoch": 0.5615881282773874, + "grad_norm": 0.25645482540130615, + "learning_rate": 2.3987768590762428e-05, + "loss": 0.1698, + "step": 31486 + }, + { + "epoch": 0.561605964399101, + "grad_norm": 0.24056769907474518, + "learning_rate": 2.3986213361515115e-05, + "loss": 0.1102, + "step": 31487 + }, + { + "epoch": 0.5616238005208147, + "grad_norm": 0.2023090273141861, + "learning_rate": 2.3984658136197575e-05, + "loss": 0.1153, + "step": 31488 + }, + { + "epoch": 0.5616416366425284, + "grad_norm": 0.41713064908981323, + "learning_rate": 2.3983102914815864e-05, + "loss": 0.1225, + "step": 31489 + }, + { + "epoch": 0.5616594727642421, + "grad_norm": 0.2384224534034729, + "learning_rate": 2.3981547697375997e-05, + "loss": 0.0912, + "step": 31490 + }, + { + "epoch": 0.5616773088859558, + "grad_norm": 0.2911020815372467, + "learning_rate": 2.3979992483884e-05, + "loss": 0.1037, + "step": 31491 + }, + { + "epoch": 0.5616951450076695, + "grad_norm": 0.26076075434684753, + "learning_rate": 2.39784372743459e-05, + "loss": 0.1607, + "step": 31492 + }, + { + "epoch": 0.5617129811293833, + "grad_norm": 0.19151878356933594, + "learning_rate": 2.397688206876773e-05, + "loss": 0.1202, + "step": 31493 + }, + { + "epoch": 0.561730817251097, + "grad_norm": 0.3104279935359955, + "learning_rate": 2.3975326867155525e-05, + "loss": 0.1364, + "step": 31494 + }, + { + "epoch": 0.5617486533728107, + "grad_norm": 0.2121114730834961, + "learning_rate": 2.3973771669515307e-05, + "loss": 0.1585, + "step": 31495 + }, + { + "epoch": 0.5617664894945243, + "grad_norm": 0.20801205933094025, + "learning_rate": 2.3972216475853092e-05, + "loss": 0.0916, + "step": 31496 + }, + { + "epoch": 0.561784325616238, + "grad_norm": 0.29379209876060486, + "learning_rate": 2.3970661286174937e-05, + "loss": 0.1195, + "step": 31497 + }, + { + "epoch": 0.5618021617379517, + "grad_norm": 0.286279559135437, + "learning_rate": 2.3969106100486853e-05, + "loss": 0.1321, + "step": 31498 + }, + { + "epoch": 0.5618199978596654, + "grad_norm": 0.2575676739215851, + "learning_rate": 2.396755091879486e-05, + "loss": 0.0872, + "step": 31499 + }, + { + "epoch": 0.5618378339813791, + "grad_norm": 0.36126649379730225, + "learning_rate": 2.3965995741105006e-05, + "loss": 0.1446, + "step": 31500 + }, + { + "epoch": 0.5618556701030928, + "grad_norm": 0.27827003598213196, + "learning_rate": 2.3964440567423295e-05, + "loss": 0.1698, + "step": 31501 + }, + { + "epoch": 0.5618735062248065, + "grad_norm": 0.23590479791164398, + "learning_rate": 2.3962885397755782e-05, + "loss": 0.1479, + "step": 31502 + }, + { + "epoch": 0.5618913423465202, + "grad_norm": 0.2628158926963806, + "learning_rate": 2.3961330232108487e-05, + "loss": 0.14, + "step": 31503 + }, + { + "epoch": 0.5619091784682338, + "grad_norm": 0.4789707362651825, + "learning_rate": 2.395977507048743e-05, + "loss": 0.2047, + "step": 31504 + }, + { + "epoch": 0.5619270145899475, + "grad_norm": 0.2957553565502167, + "learning_rate": 2.3958219912898635e-05, + "loss": 0.1629, + "step": 31505 + }, + { + "epoch": 0.5619448507116612, + "grad_norm": 0.23507602512836456, + "learning_rate": 2.3956664759348146e-05, + "loss": 0.1538, + "step": 31506 + }, + { + "epoch": 0.5619626868333749, + "grad_norm": 0.2927339971065521, + "learning_rate": 2.395510960984199e-05, + "loss": 0.1163, + "step": 31507 + }, + { + "epoch": 0.5619805229550886, + "grad_norm": 0.17802639305591583, + "learning_rate": 2.3953554464386187e-05, + "loss": 0.126, + "step": 31508 + }, + { + "epoch": 0.5619983590768023, + "grad_norm": 0.2092919945716858, + "learning_rate": 2.3951999322986762e-05, + "loss": 0.1244, + "step": 31509 + }, + { + "epoch": 0.5620161951985161, + "grad_norm": 0.44034096598625183, + "learning_rate": 2.3950444185649758e-05, + "loss": 0.1617, + "step": 31510 + }, + { + "epoch": 0.5620340313202298, + "grad_norm": 0.29422372579574585, + "learning_rate": 2.3948889052381195e-05, + "loss": 0.12, + "step": 31511 + }, + { + "epoch": 0.5620518674419435, + "grad_norm": 0.2691257894039154, + "learning_rate": 2.3947333923187095e-05, + "loss": 0.0886, + "step": 31512 + }, + { + "epoch": 0.5620697035636572, + "grad_norm": 0.2979081869125366, + "learning_rate": 2.39457787980735e-05, + "loss": 0.0941, + "step": 31513 + }, + { + "epoch": 0.5620875396853708, + "grad_norm": 0.33796852827072144, + "learning_rate": 2.3944223677046418e-05, + "loss": 0.12, + "step": 31514 + }, + { + "epoch": 0.5621053758070845, + "grad_norm": 0.26629310846328735, + "learning_rate": 2.39426685601119e-05, + "loss": 0.1534, + "step": 31515 + }, + { + "epoch": 0.5621232119287982, + "grad_norm": 0.25867897272109985, + "learning_rate": 2.3941113447275964e-05, + "loss": 0.1296, + "step": 31516 + }, + { + "epoch": 0.5621410480505119, + "grad_norm": 0.2649631202220917, + "learning_rate": 2.393955833854464e-05, + "loss": 0.1467, + "step": 31517 + }, + { + "epoch": 0.5621588841722256, + "grad_norm": 0.28785979747772217, + "learning_rate": 2.3938003233923942e-05, + "loss": 0.1556, + "step": 31518 + }, + { + "epoch": 0.5621767202939393, + "grad_norm": 0.34189853072166443, + "learning_rate": 2.393644813341992e-05, + "loss": 0.2038, + "step": 31519 + }, + { + "epoch": 0.562194556415653, + "grad_norm": 0.33835071325302124, + "learning_rate": 2.393489303703859e-05, + "loss": 0.1185, + "step": 31520 + }, + { + "epoch": 0.5622123925373667, + "grad_norm": 0.2385687381029129, + "learning_rate": 2.3933337944785982e-05, + "loss": 0.1691, + "step": 31521 + }, + { + "epoch": 0.5622302286590803, + "grad_norm": 0.18524384498596191, + "learning_rate": 2.393178285666812e-05, + "loss": 0.0813, + "step": 31522 + }, + { + "epoch": 0.562248064780794, + "grad_norm": 0.19806908071041107, + "learning_rate": 2.3930227772691045e-05, + "loss": 0.1207, + "step": 31523 + }, + { + "epoch": 0.5622659009025077, + "grad_norm": 0.32618603110313416, + "learning_rate": 2.3928672692860777e-05, + "loss": 0.1575, + "step": 31524 + }, + { + "epoch": 0.5622837370242214, + "grad_norm": 0.3744017779827118, + "learning_rate": 2.3927117617183344e-05, + "loss": 0.1242, + "step": 31525 + }, + { + "epoch": 0.5623015731459351, + "grad_norm": 0.2930123507976532, + "learning_rate": 2.3925562545664763e-05, + "loss": 0.1431, + "step": 31526 + }, + { + "epoch": 0.5623194092676489, + "grad_norm": 0.24760590493679047, + "learning_rate": 2.3924007478311075e-05, + "loss": 0.1592, + "step": 31527 + }, + { + "epoch": 0.5623372453893626, + "grad_norm": 0.22133322060108185, + "learning_rate": 2.3922452415128312e-05, + "loss": 0.1122, + "step": 31528 + }, + { + "epoch": 0.5623550815110763, + "grad_norm": 0.34430113434791565, + "learning_rate": 2.39208973561225e-05, + "loss": 0.1653, + "step": 31529 + }, + { + "epoch": 0.56237291763279, + "grad_norm": 0.23589089512825012, + "learning_rate": 2.3919342301299656e-05, + "loss": 0.1209, + "step": 31530 + }, + { + "epoch": 0.5623907537545036, + "grad_norm": 0.21339066326618195, + "learning_rate": 2.391778725066581e-05, + "loss": 0.149, + "step": 31531 + }, + { + "epoch": 0.5624085898762173, + "grad_norm": 0.3414429724216461, + "learning_rate": 2.3916232204227e-05, + "loss": 0.1262, + "step": 31532 + }, + { + "epoch": 0.562426425997931, + "grad_norm": 0.2576408088207245, + "learning_rate": 2.3914677161989248e-05, + "loss": 0.1664, + "step": 31533 + }, + { + "epoch": 0.5624442621196447, + "grad_norm": 0.32501357793807983, + "learning_rate": 2.3913122123958583e-05, + "loss": 0.1715, + "step": 31534 + }, + { + "epoch": 0.5624620982413584, + "grad_norm": 0.22479450702667236, + "learning_rate": 2.3911567090141022e-05, + "loss": 0.0836, + "step": 31535 + }, + { + "epoch": 0.5624799343630721, + "grad_norm": 0.20873993635177612, + "learning_rate": 2.3910012060542618e-05, + "loss": 0.1213, + "step": 31536 + }, + { + "epoch": 0.5624977704847858, + "grad_norm": 0.23867380619049072, + "learning_rate": 2.390845703516938e-05, + "loss": 0.1517, + "step": 31537 + }, + { + "epoch": 0.5625156066064995, + "grad_norm": 0.25853532552719116, + "learning_rate": 2.390690201402734e-05, + "loss": 0.1644, + "step": 31538 + }, + { + "epoch": 0.5625334427282132, + "grad_norm": 0.22269342839717865, + "learning_rate": 2.390534699712252e-05, + "loss": 0.0977, + "step": 31539 + }, + { + "epoch": 0.5625512788499268, + "grad_norm": 0.2442581206560135, + "learning_rate": 2.390379198446095e-05, + "loss": 0.1287, + "step": 31540 + }, + { + "epoch": 0.5625691149716405, + "grad_norm": 0.21370096504688263, + "learning_rate": 2.3902236976048664e-05, + "loss": 0.1322, + "step": 31541 + }, + { + "epoch": 0.5625869510933542, + "grad_norm": 0.25343915820121765, + "learning_rate": 2.3900681971891693e-05, + "loss": 0.1053, + "step": 31542 + }, + { + "epoch": 0.5626047872150679, + "grad_norm": 0.1930512934923172, + "learning_rate": 2.3899126971996058e-05, + "loss": 0.1081, + "step": 31543 + }, + { + "epoch": 0.5626226233367817, + "grad_norm": 0.2629535496234894, + "learning_rate": 2.3897571976367774e-05, + "loss": 0.1135, + "step": 31544 + }, + { + "epoch": 0.5626404594584954, + "grad_norm": 0.2897069752216339, + "learning_rate": 2.3896016985012897e-05, + "loss": 0.1237, + "step": 31545 + }, + { + "epoch": 0.5626582955802091, + "grad_norm": 0.25566011667251587, + "learning_rate": 2.3894461997937433e-05, + "loss": 0.1522, + "step": 31546 + }, + { + "epoch": 0.5626761317019228, + "grad_norm": 0.35920479893684387, + "learning_rate": 2.3892907015147418e-05, + "loss": 0.1527, + "step": 31547 + }, + { + "epoch": 0.5626939678236365, + "grad_norm": 0.21773487329483032, + "learning_rate": 2.3891352036648874e-05, + "loss": 0.0874, + "step": 31548 + }, + { + "epoch": 0.5627118039453501, + "grad_norm": 0.36515769362449646, + "learning_rate": 2.388979706244783e-05, + "loss": 0.1415, + "step": 31549 + }, + { + "epoch": 0.5627296400670638, + "grad_norm": 0.24259336292743683, + "learning_rate": 2.3888242092550324e-05, + "loss": 0.1734, + "step": 31550 + }, + { + "epoch": 0.5627474761887775, + "grad_norm": 0.2675774395465851, + "learning_rate": 2.3886687126962376e-05, + "loss": 0.1233, + "step": 31551 + }, + { + "epoch": 0.5627653123104912, + "grad_norm": 0.21952860057353973, + "learning_rate": 2.3885132165690012e-05, + "loss": 0.1465, + "step": 31552 + }, + { + "epoch": 0.5627831484322049, + "grad_norm": 0.2538287341594696, + "learning_rate": 2.3883577208739252e-05, + "loss": 0.1548, + "step": 31553 + }, + { + "epoch": 0.5628009845539186, + "grad_norm": 0.23038217425346375, + "learning_rate": 2.3882022256116142e-05, + "loss": 0.1288, + "step": 31554 + }, + { + "epoch": 0.5628188206756323, + "grad_norm": 0.3215599060058594, + "learning_rate": 2.3880467307826697e-05, + "loss": 0.1397, + "step": 31555 + }, + { + "epoch": 0.562836656797346, + "grad_norm": 0.24070176482200623, + "learning_rate": 2.3878912363876947e-05, + "loss": 0.1671, + "step": 31556 + }, + { + "epoch": 0.5628544929190596, + "grad_norm": 0.33674654364585876, + "learning_rate": 2.3877357424272913e-05, + "loss": 0.1607, + "step": 31557 + }, + { + "epoch": 0.5628723290407733, + "grad_norm": 0.27031153440475464, + "learning_rate": 2.387580248902064e-05, + "loss": 0.1013, + "step": 31558 + }, + { + "epoch": 0.562890165162487, + "grad_norm": 0.22360815107822418, + "learning_rate": 2.3874247558126144e-05, + "loss": 0.1084, + "step": 31559 + }, + { + "epoch": 0.5629080012842007, + "grad_norm": 0.28280171751976013, + "learning_rate": 2.3872692631595447e-05, + "loss": 0.158, + "step": 31560 + }, + { + "epoch": 0.5629258374059145, + "grad_norm": 0.2862938344478607, + "learning_rate": 2.3871137709434587e-05, + "loss": 0.1525, + "step": 31561 + }, + { + "epoch": 0.5629436735276282, + "grad_norm": 0.21458853781223297, + "learning_rate": 2.3869582791649583e-05, + "loss": 0.1382, + "step": 31562 + }, + { + "epoch": 0.5629615096493419, + "grad_norm": 0.26677051186561584, + "learning_rate": 2.3868027878246474e-05, + "loss": 0.1548, + "step": 31563 + }, + { + "epoch": 0.5629793457710556, + "grad_norm": 0.21544599533081055, + "learning_rate": 2.3866472969231276e-05, + "loss": 0.1286, + "step": 31564 + }, + { + "epoch": 0.5629971818927693, + "grad_norm": 0.30368542671203613, + "learning_rate": 2.3864918064610025e-05, + "loss": 0.1498, + "step": 31565 + }, + { + "epoch": 0.563015018014483, + "grad_norm": 0.26172858476638794, + "learning_rate": 2.386336316438873e-05, + "loss": 0.1019, + "step": 31566 + }, + { + "epoch": 0.5630328541361966, + "grad_norm": 0.25865015387535095, + "learning_rate": 2.3861808268573447e-05, + "loss": 0.1318, + "step": 31567 + }, + { + "epoch": 0.5630506902579103, + "grad_norm": 0.28514692187309265, + "learning_rate": 2.386025337717018e-05, + "loss": 0.1161, + "step": 31568 + }, + { + "epoch": 0.563068526379624, + "grad_norm": 0.29225876927375793, + "learning_rate": 2.3858698490184967e-05, + "loss": 0.1199, + "step": 31569 + }, + { + "epoch": 0.5630863625013377, + "grad_norm": 0.3115609884262085, + "learning_rate": 2.3857143607623825e-05, + "loss": 0.1212, + "step": 31570 + }, + { + "epoch": 0.5631041986230514, + "grad_norm": 0.288664847612381, + "learning_rate": 2.38555887294928e-05, + "loss": 0.169, + "step": 31571 + }, + { + "epoch": 0.5631220347447651, + "grad_norm": 0.26004230976104736, + "learning_rate": 2.3854033855797908e-05, + "loss": 0.0906, + "step": 31572 + }, + { + "epoch": 0.5631398708664788, + "grad_norm": 0.3064267039299011, + "learning_rate": 2.3852478986545178e-05, + "loss": 0.1581, + "step": 31573 + }, + { + "epoch": 0.5631577069881925, + "grad_norm": 0.3240392208099365, + "learning_rate": 2.385092412174062e-05, + "loss": 0.1467, + "step": 31574 + }, + { + "epoch": 0.5631755431099061, + "grad_norm": 0.5359348058700562, + "learning_rate": 2.3849369261390293e-05, + "loss": 0.1329, + "step": 31575 + }, + { + "epoch": 0.5631933792316198, + "grad_norm": 0.2657002806663513, + "learning_rate": 2.3847814405500202e-05, + "loss": 0.1194, + "step": 31576 + }, + { + "epoch": 0.5632112153533335, + "grad_norm": 0.30079349875450134, + "learning_rate": 2.3846259554076384e-05, + "loss": 0.1212, + "step": 31577 + }, + { + "epoch": 0.5632290514750473, + "grad_norm": 0.3471679091453552, + "learning_rate": 2.3844704707124863e-05, + "loss": 0.1431, + "step": 31578 + }, + { + "epoch": 0.563246887596761, + "grad_norm": 0.2554193139076233, + "learning_rate": 2.3843149864651656e-05, + "loss": 0.1475, + "step": 31579 + }, + { + "epoch": 0.5632647237184747, + "grad_norm": 0.25658100843429565, + "learning_rate": 2.3841595026662807e-05, + "loss": 0.1241, + "step": 31580 + }, + { + "epoch": 0.5632825598401884, + "grad_norm": 0.28506821393966675, + "learning_rate": 2.384004019316434e-05, + "loss": 0.1535, + "step": 31581 + }, + { + "epoch": 0.5633003959619021, + "grad_norm": 0.30916842818260193, + "learning_rate": 2.383848536416227e-05, + "loss": 0.1059, + "step": 31582 + }, + { + "epoch": 0.5633182320836158, + "grad_norm": 0.26445603370666504, + "learning_rate": 2.3836930539662628e-05, + "loss": 0.1342, + "step": 31583 + }, + { + "epoch": 0.5633360682053294, + "grad_norm": 0.2803240120410919, + "learning_rate": 2.3835375719671454e-05, + "loss": 0.1491, + "step": 31584 + }, + { + "epoch": 0.5633539043270431, + "grad_norm": 0.16185447573661804, + "learning_rate": 2.3833820904194766e-05, + "loss": 0.1203, + "step": 31585 + }, + { + "epoch": 0.5633717404487568, + "grad_norm": 0.23894047737121582, + "learning_rate": 2.3832266093238594e-05, + "loss": 0.1365, + "step": 31586 + }, + { + "epoch": 0.5633895765704705, + "grad_norm": 0.3779670298099518, + "learning_rate": 2.383071128680895e-05, + "loss": 0.1468, + "step": 31587 + }, + { + "epoch": 0.5634074126921842, + "grad_norm": 0.46665042638778687, + "learning_rate": 2.3829156484911883e-05, + "loss": 0.1008, + "step": 31588 + }, + { + "epoch": 0.5634252488138979, + "grad_norm": 0.34893810749053955, + "learning_rate": 2.3827601687553402e-05, + "loss": 0.1506, + "step": 31589 + }, + { + "epoch": 0.5634430849356116, + "grad_norm": 0.3829846978187561, + "learning_rate": 2.382604689473955e-05, + "loss": 0.0985, + "step": 31590 + }, + { + "epoch": 0.5634609210573253, + "grad_norm": 0.320425808429718, + "learning_rate": 2.3824492106476345e-05, + "loss": 0.1113, + "step": 31591 + }, + { + "epoch": 0.563478757179039, + "grad_norm": 0.24195371568202972, + "learning_rate": 2.3822937322769805e-05, + "loss": 0.0892, + "step": 31592 + }, + { + "epoch": 0.5634965933007526, + "grad_norm": 0.24868498742580414, + "learning_rate": 2.3821382543625975e-05, + "loss": 0.1047, + "step": 31593 + }, + { + "epoch": 0.5635144294224664, + "grad_norm": 0.33224624395370483, + "learning_rate": 2.3819827769050875e-05, + "loss": 0.0948, + "step": 31594 + }, + { + "epoch": 0.5635322655441801, + "grad_norm": 0.24193434417247772, + "learning_rate": 2.3818272999050525e-05, + "loss": 0.1465, + "step": 31595 + }, + { + "epoch": 0.5635501016658938, + "grad_norm": 0.36690574884414673, + "learning_rate": 2.3816718233630956e-05, + "loss": 0.1723, + "step": 31596 + }, + { + "epoch": 0.5635679377876075, + "grad_norm": 0.25640392303466797, + "learning_rate": 2.3815163472798196e-05, + "loss": 0.1293, + "step": 31597 + }, + { + "epoch": 0.5635857739093212, + "grad_norm": 0.35884809494018555, + "learning_rate": 2.381360871655828e-05, + "loss": 0.1376, + "step": 31598 + }, + { + "epoch": 0.5636036100310349, + "grad_norm": 0.25123217701911926, + "learning_rate": 2.3812053964917223e-05, + "loss": 0.0984, + "step": 31599 + }, + { + "epoch": 0.5636214461527486, + "grad_norm": 0.31737473607063293, + "learning_rate": 2.3810499217881044e-05, + "loss": 0.1674, + "step": 31600 + }, + { + "epoch": 0.5636392822744622, + "grad_norm": 0.3481954038143158, + "learning_rate": 2.380894447545579e-05, + "loss": 0.191, + "step": 31601 + }, + { + "epoch": 0.5636571183961759, + "grad_norm": 0.24566833674907684, + "learning_rate": 2.3807389737647483e-05, + "loss": 0.1392, + "step": 31602 + }, + { + "epoch": 0.5636749545178896, + "grad_norm": 0.24603791534900665, + "learning_rate": 2.3805835004462136e-05, + "loss": 0.1113, + "step": 31603 + }, + { + "epoch": 0.5636927906396033, + "grad_norm": 0.22214391827583313, + "learning_rate": 2.3804280275905794e-05, + "loss": 0.1416, + "step": 31604 + }, + { + "epoch": 0.563710626761317, + "grad_norm": 0.32133549451828003, + "learning_rate": 2.3802725551984463e-05, + "loss": 0.1509, + "step": 31605 + }, + { + "epoch": 0.5637284628830307, + "grad_norm": 0.2267196774482727, + "learning_rate": 2.3801170832704192e-05, + "loss": 0.0849, + "step": 31606 + }, + { + "epoch": 0.5637462990047444, + "grad_norm": 0.32285070419311523, + "learning_rate": 2.3799616118070994e-05, + "loss": 0.0792, + "step": 31607 + }, + { + "epoch": 0.5637641351264581, + "grad_norm": 0.2434544563293457, + "learning_rate": 2.37980614080909e-05, + "loss": 0.1143, + "step": 31608 + }, + { + "epoch": 0.5637819712481718, + "grad_norm": 0.23506079614162445, + "learning_rate": 2.3796506702769926e-05, + "loss": 0.0958, + "step": 31609 + }, + { + "epoch": 0.5637998073698854, + "grad_norm": 0.22474223375320435, + "learning_rate": 2.3794952002114113e-05, + "loss": 0.1651, + "step": 31610 + }, + { + "epoch": 0.5638176434915992, + "grad_norm": 0.21540921926498413, + "learning_rate": 2.3793397306129488e-05, + "loss": 0.1069, + "step": 31611 + }, + { + "epoch": 0.5638354796133129, + "grad_norm": 0.22561131417751312, + "learning_rate": 2.379184261482207e-05, + "loss": 0.1322, + "step": 31612 + }, + { + "epoch": 0.5638533157350266, + "grad_norm": 0.3039524257183075, + "learning_rate": 2.3790287928197876e-05, + "loss": 0.1484, + "step": 31613 + }, + { + "epoch": 0.5638711518567403, + "grad_norm": 0.24317194521427155, + "learning_rate": 2.3788733246262956e-05, + "loss": 0.179, + "step": 31614 + }, + { + "epoch": 0.563888987978454, + "grad_norm": 0.36901363730430603, + "learning_rate": 2.3787178569023325e-05, + "loss": 0.1516, + "step": 31615 + }, + { + "epoch": 0.5639068241001677, + "grad_norm": 0.2813870906829834, + "learning_rate": 2.3785623896485002e-05, + "loss": 0.1315, + "step": 31616 + }, + { + "epoch": 0.5639246602218814, + "grad_norm": 0.249116912484169, + "learning_rate": 2.3784069228654027e-05, + "loss": 0.1714, + "step": 31617 + }, + { + "epoch": 0.5639424963435951, + "grad_norm": 0.35946303606033325, + "learning_rate": 2.3782514565536407e-05, + "loss": 0.1597, + "step": 31618 + }, + { + "epoch": 0.5639603324653087, + "grad_norm": 0.27216583490371704, + "learning_rate": 2.378095990713819e-05, + "loss": 0.1367, + "step": 31619 + }, + { + "epoch": 0.5639781685870224, + "grad_norm": 0.20986083149909973, + "learning_rate": 2.37794052534654e-05, + "loss": 0.1279, + "step": 31620 + }, + { + "epoch": 0.5639960047087361, + "grad_norm": 0.3030647933483124, + "learning_rate": 2.377785060452405e-05, + "loss": 0.1728, + "step": 31621 + }, + { + "epoch": 0.5640138408304498, + "grad_norm": 0.29480674862861633, + "learning_rate": 2.3776295960320166e-05, + "loss": 0.1521, + "step": 31622 + }, + { + "epoch": 0.5640316769521635, + "grad_norm": 0.2918318808078766, + "learning_rate": 2.377474132085979e-05, + "loss": 0.107, + "step": 31623 + }, + { + "epoch": 0.5640495130738772, + "grad_norm": 0.3118656277656555, + "learning_rate": 2.3773186686148942e-05, + "loss": 0.1272, + "step": 31624 + }, + { + "epoch": 0.5640673491955909, + "grad_norm": 0.272695928812027, + "learning_rate": 2.3771632056193645e-05, + "loss": 0.1707, + "step": 31625 + }, + { + "epoch": 0.5640851853173046, + "grad_norm": 0.245751291513443, + "learning_rate": 2.3770077430999917e-05, + "loss": 0.1334, + "step": 31626 + }, + { + "epoch": 0.5641030214390182, + "grad_norm": 0.3448629379272461, + "learning_rate": 2.37685228105738e-05, + "loss": 0.1723, + "step": 31627 + }, + { + "epoch": 0.564120857560732, + "grad_norm": 0.19837304949760437, + "learning_rate": 2.376696819492132e-05, + "loss": 0.1538, + "step": 31628 + }, + { + "epoch": 0.5641386936824457, + "grad_norm": 0.2494966685771942, + "learning_rate": 2.3765413584048492e-05, + "loss": 0.1038, + "step": 31629 + }, + { + "epoch": 0.5641565298041594, + "grad_norm": 0.228762686252594, + "learning_rate": 2.3763858977961344e-05, + "loss": 0.1606, + "step": 31630 + }, + { + "epoch": 0.5641743659258731, + "grad_norm": 0.20529478788375854, + "learning_rate": 2.3762304376665908e-05, + "loss": 0.0634, + "step": 31631 + }, + { + "epoch": 0.5641922020475868, + "grad_norm": 0.3570258617401123, + "learning_rate": 2.3760749780168205e-05, + "loss": 0.1452, + "step": 31632 + }, + { + "epoch": 0.5642100381693005, + "grad_norm": 0.2126232236623764, + "learning_rate": 2.375919518847427e-05, + "loss": 0.1472, + "step": 31633 + }, + { + "epoch": 0.5642278742910142, + "grad_norm": 0.2873368561267853, + "learning_rate": 2.3757640601590125e-05, + "loss": 0.1112, + "step": 31634 + }, + { + "epoch": 0.5642457104127279, + "grad_norm": 0.26218634843826294, + "learning_rate": 2.375608601952178e-05, + "loss": 0.0989, + "step": 31635 + }, + { + "epoch": 0.5642635465344416, + "grad_norm": 0.23815672099590302, + "learning_rate": 2.375453144227529e-05, + "loss": 0.0932, + "step": 31636 + }, + { + "epoch": 0.5642813826561552, + "grad_norm": 0.402576744556427, + "learning_rate": 2.375297686985666e-05, + "loss": 0.1609, + "step": 31637 + }, + { + "epoch": 0.5642992187778689, + "grad_norm": 0.21550101041793823, + "learning_rate": 2.3751422302271925e-05, + "loss": 0.0999, + "step": 31638 + }, + { + "epoch": 0.5643170548995826, + "grad_norm": 0.30787888169288635, + "learning_rate": 2.37498677395271e-05, + "loss": 0.1269, + "step": 31639 + }, + { + "epoch": 0.5643348910212963, + "grad_norm": 0.26787036657333374, + "learning_rate": 2.3748313181628228e-05, + "loss": 0.1833, + "step": 31640 + }, + { + "epoch": 0.56435272714301, + "grad_norm": 0.33512693643569946, + "learning_rate": 2.3746758628581327e-05, + "loss": 0.1576, + "step": 31641 + }, + { + "epoch": 0.5643705632647237, + "grad_norm": 0.2764591872692108, + "learning_rate": 2.374520408039242e-05, + "loss": 0.107, + "step": 31642 + }, + { + "epoch": 0.5643883993864374, + "grad_norm": 0.28388968110084534, + "learning_rate": 2.374364953706753e-05, + "loss": 0.1359, + "step": 31643 + }, + { + "epoch": 0.564406235508151, + "grad_norm": 0.266609251499176, + "learning_rate": 2.3742094998612696e-05, + "loss": 0.112, + "step": 31644 + }, + { + "epoch": 0.5644240716298649, + "grad_norm": 0.2514941394329071, + "learning_rate": 2.3740540465033933e-05, + "loss": 0.1063, + "step": 31645 + }, + { + "epoch": 0.5644419077515785, + "grad_norm": 0.5056187510490417, + "learning_rate": 2.3738985936337274e-05, + "loss": 0.1778, + "step": 31646 + }, + { + "epoch": 0.5644597438732922, + "grad_norm": 0.3707565665245056, + "learning_rate": 2.373743141252874e-05, + "loss": 0.1366, + "step": 31647 + }, + { + "epoch": 0.5644775799950059, + "grad_norm": 0.3683274984359741, + "learning_rate": 2.3735876893614347e-05, + "loss": 0.1302, + "step": 31648 + }, + { + "epoch": 0.5644954161167196, + "grad_norm": 0.2075379192829132, + "learning_rate": 2.3734322379600147e-05, + "loss": 0.1054, + "step": 31649 + }, + { + "epoch": 0.5645132522384333, + "grad_norm": 0.24443985521793365, + "learning_rate": 2.3732767870492145e-05, + "loss": 0.1584, + "step": 31650 + }, + { + "epoch": 0.564531088360147, + "grad_norm": 0.26057642698287964, + "learning_rate": 2.3731213366296372e-05, + "loss": 0.1289, + "step": 31651 + }, + { + "epoch": 0.5645489244818607, + "grad_norm": 0.21860824525356293, + "learning_rate": 2.372965886701885e-05, + "loss": 0.1378, + "step": 31652 + }, + { + "epoch": 0.5645667606035744, + "grad_norm": 0.2887742221355438, + "learning_rate": 2.372810437266561e-05, + "loss": 0.1795, + "step": 31653 + }, + { + "epoch": 0.564584596725288, + "grad_norm": 0.23349636793136597, + "learning_rate": 2.3726549883242685e-05, + "loss": 0.1284, + "step": 31654 + }, + { + "epoch": 0.5646024328470017, + "grad_norm": 0.2939760982990265, + "learning_rate": 2.3724995398756088e-05, + "loss": 0.1302, + "step": 31655 + }, + { + "epoch": 0.5646202689687154, + "grad_norm": 0.3866825997829437, + "learning_rate": 2.3723440919211843e-05, + "loss": 0.1186, + "step": 31656 + }, + { + "epoch": 0.5646381050904291, + "grad_norm": 0.30090954899787903, + "learning_rate": 2.372188644461599e-05, + "loss": 0.1266, + "step": 31657 + }, + { + "epoch": 0.5646559412121428, + "grad_norm": 0.22703874111175537, + "learning_rate": 2.3720331974974545e-05, + "loss": 0.1107, + "step": 31658 + }, + { + "epoch": 0.5646737773338565, + "grad_norm": 0.2614491879940033, + "learning_rate": 2.3718777510293533e-05, + "loss": 0.1144, + "step": 31659 + }, + { + "epoch": 0.5646916134555702, + "grad_norm": 0.23779863119125366, + "learning_rate": 2.3717223050578987e-05, + "loss": 0.1522, + "step": 31660 + }, + { + "epoch": 0.5647094495772839, + "grad_norm": 0.24018393456935883, + "learning_rate": 2.3715668595836914e-05, + "loss": 0.1078, + "step": 31661 + }, + { + "epoch": 0.5647272856989977, + "grad_norm": 0.3777134120464325, + "learning_rate": 2.3714114146073368e-05, + "loss": 0.1733, + "step": 31662 + }, + { + "epoch": 0.5647451218207113, + "grad_norm": 0.251590758562088, + "learning_rate": 2.3712559701294358e-05, + "loss": 0.1868, + "step": 31663 + }, + { + "epoch": 0.564762957942425, + "grad_norm": 0.31605085730552673, + "learning_rate": 2.3711005261505902e-05, + "loss": 0.1329, + "step": 31664 + }, + { + "epoch": 0.5647807940641387, + "grad_norm": 0.24037417769432068, + "learning_rate": 2.370945082671404e-05, + "loss": 0.1477, + "step": 31665 + }, + { + "epoch": 0.5647986301858524, + "grad_norm": 0.3526400327682495, + "learning_rate": 2.3707896396924792e-05, + "loss": 0.165, + "step": 31666 + }, + { + "epoch": 0.5648164663075661, + "grad_norm": 0.1747768372297287, + "learning_rate": 2.3706341972144185e-05, + "loss": 0.1146, + "step": 31667 + }, + { + "epoch": 0.5648343024292798, + "grad_norm": 0.30727043747901917, + "learning_rate": 2.370478755237825e-05, + "loss": 0.1341, + "step": 31668 + }, + { + "epoch": 0.5648521385509935, + "grad_norm": 0.23793411254882812, + "learning_rate": 2.3703233137632988e-05, + "loss": 0.0998, + "step": 31669 + }, + { + "epoch": 0.5648699746727072, + "grad_norm": 0.18020571768283844, + "learning_rate": 2.3701678727914457e-05, + "loss": 0.0935, + "step": 31670 + }, + { + "epoch": 0.5648878107944209, + "grad_norm": 0.2109021544456482, + "learning_rate": 2.3700124323228666e-05, + "loss": 0.1395, + "step": 31671 + }, + { + "epoch": 0.5649056469161345, + "grad_norm": 0.24290896952152252, + "learning_rate": 2.3698569923581638e-05, + "loss": 0.1086, + "step": 31672 + }, + { + "epoch": 0.5649234830378482, + "grad_norm": 0.32073161005973816, + "learning_rate": 2.369701552897941e-05, + "loss": 0.1675, + "step": 31673 + }, + { + "epoch": 0.5649413191595619, + "grad_norm": 0.27767741680145264, + "learning_rate": 2.3695461139427986e-05, + "loss": 0.1662, + "step": 31674 + }, + { + "epoch": 0.5649591552812756, + "grad_norm": 0.22632794082164764, + "learning_rate": 2.3693906754933415e-05, + "loss": 0.1301, + "step": 31675 + }, + { + "epoch": 0.5649769914029893, + "grad_norm": 0.23737747967243195, + "learning_rate": 2.3692352375501715e-05, + "loss": 0.1311, + "step": 31676 + }, + { + "epoch": 0.564994827524703, + "grad_norm": 0.2506340444087982, + "learning_rate": 2.3690798001138905e-05, + "loss": 0.1596, + "step": 31677 + }, + { + "epoch": 0.5650126636464167, + "grad_norm": 0.30249613523483276, + "learning_rate": 2.3689243631851008e-05, + "loss": 0.1343, + "step": 31678 + }, + { + "epoch": 0.5650304997681305, + "grad_norm": 0.36731263995170593, + "learning_rate": 2.3687689267644065e-05, + "loss": 0.1945, + "step": 31679 + }, + { + "epoch": 0.5650483358898442, + "grad_norm": 0.2557585835456848, + "learning_rate": 2.3686134908524086e-05, + "loss": 0.1438, + "step": 31680 + }, + { + "epoch": 0.5650661720115578, + "grad_norm": 0.3712798058986664, + "learning_rate": 2.3684580554497104e-05, + "loss": 0.1038, + "step": 31681 + }, + { + "epoch": 0.5650840081332715, + "grad_norm": 0.20703443884849548, + "learning_rate": 2.3683026205569138e-05, + "loss": 0.1525, + "step": 31682 + }, + { + "epoch": 0.5651018442549852, + "grad_norm": 0.29868629574775696, + "learning_rate": 2.3681471861746222e-05, + "loss": 0.1479, + "step": 31683 + }, + { + "epoch": 0.5651196803766989, + "grad_norm": 0.2785952389240265, + "learning_rate": 2.3679917523034378e-05, + "loss": 0.1336, + "step": 31684 + }, + { + "epoch": 0.5651375164984126, + "grad_norm": 0.26928675174713135, + "learning_rate": 2.3678363189439632e-05, + "loss": 0.1377, + "step": 31685 + }, + { + "epoch": 0.5651553526201263, + "grad_norm": 0.3255443572998047, + "learning_rate": 2.3676808860967996e-05, + "loss": 0.14, + "step": 31686 + }, + { + "epoch": 0.56517318874184, + "grad_norm": 0.395115464925766, + "learning_rate": 2.3675254537625507e-05, + "loss": 0.075, + "step": 31687 + }, + { + "epoch": 0.5651910248635537, + "grad_norm": 0.3141167163848877, + "learning_rate": 2.3673700219418196e-05, + "loss": 0.1156, + "step": 31688 + }, + { + "epoch": 0.5652088609852673, + "grad_norm": 0.25743556022644043, + "learning_rate": 2.3672145906352084e-05, + "loss": 0.1249, + "step": 31689 + }, + { + "epoch": 0.565226697106981, + "grad_norm": 0.29879993200302124, + "learning_rate": 2.367059159843319e-05, + "loss": 0.1274, + "step": 31690 + }, + { + "epoch": 0.5652445332286947, + "grad_norm": 0.2265031486749649, + "learning_rate": 2.366903729566753e-05, + "loss": 0.1053, + "step": 31691 + }, + { + "epoch": 0.5652623693504084, + "grad_norm": 0.1926048845052719, + "learning_rate": 2.3667482998061155e-05, + "loss": 0.1019, + "step": 31692 + }, + { + "epoch": 0.5652802054721221, + "grad_norm": 0.20879815518856049, + "learning_rate": 2.366592870562007e-05, + "loss": 0.1017, + "step": 31693 + }, + { + "epoch": 0.5652980415938358, + "grad_norm": 0.23162026703357697, + "learning_rate": 2.3664374418350313e-05, + "loss": 0.1075, + "step": 31694 + }, + { + "epoch": 0.5653158777155496, + "grad_norm": 0.26945286989212036, + "learning_rate": 2.3662820136257892e-05, + "loss": 0.1505, + "step": 31695 + }, + { + "epoch": 0.5653337138372633, + "grad_norm": 0.20930252969264984, + "learning_rate": 2.366126585934885e-05, + "loss": 0.0911, + "step": 31696 + }, + { + "epoch": 0.565351549958977, + "grad_norm": 0.30685189366340637, + "learning_rate": 2.3659711587629202e-05, + "loss": 0.0946, + "step": 31697 + }, + { + "epoch": 0.5653693860806907, + "grad_norm": 0.2428974211215973, + "learning_rate": 2.3658157321104978e-05, + "loss": 0.1724, + "step": 31698 + }, + { + "epoch": 0.5653872222024043, + "grad_norm": 0.2702333629131317, + "learning_rate": 2.3656603059782194e-05, + "loss": 0.1277, + "step": 31699 + }, + { + "epoch": 0.565405058324118, + "grad_norm": 0.23337192833423615, + "learning_rate": 2.365504880366688e-05, + "loss": 0.1267, + "step": 31700 + }, + { + "epoch": 0.5654228944458317, + "grad_norm": 0.2820528447628021, + "learning_rate": 2.365349455276506e-05, + "loss": 0.161, + "step": 31701 + }, + { + "epoch": 0.5654407305675454, + "grad_norm": 0.22095970809459686, + "learning_rate": 2.3651940307082768e-05, + "loss": 0.0814, + "step": 31702 + }, + { + "epoch": 0.5654585666892591, + "grad_norm": 0.29226115345954895, + "learning_rate": 2.365038606662602e-05, + "loss": 0.1553, + "step": 31703 + }, + { + "epoch": 0.5654764028109728, + "grad_norm": 0.25860917568206787, + "learning_rate": 2.364883183140083e-05, + "loss": 0.2047, + "step": 31704 + }, + { + "epoch": 0.5654942389326865, + "grad_norm": 0.3201715350151062, + "learning_rate": 2.3647277601413247e-05, + "loss": 0.1221, + "step": 31705 + }, + { + "epoch": 0.5655120750544002, + "grad_norm": 0.28690049052238464, + "learning_rate": 2.3645723376669286e-05, + "loss": 0.1056, + "step": 31706 + }, + { + "epoch": 0.5655299111761138, + "grad_norm": 0.2842472791671753, + "learning_rate": 2.3644169157174958e-05, + "loss": 0.1755, + "step": 31707 + }, + { + "epoch": 0.5655477472978275, + "grad_norm": 0.35904672741889954, + "learning_rate": 2.36426149429363e-05, + "loss": 0.1602, + "step": 31708 + }, + { + "epoch": 0.5655655834195412, + "grad_norm": 0.27771344780921936, + "learning_rate": 2.3641060733959335e-05, + "loss": 0.1644, + "step": 31709 + }, + { + "epoch": 0.5655834195412549, + "grad_norm": 0.277072936296463, + "learning_rate": 2.3639506530250095e-05, + "loss": 0.1714, + "step": 31710 + }, + { + "epoch": 0.5656012556629686, + "grad_norm": 0.2892456650733948, + "learning_rate": 2.3637952331814598e-05, + "loss": 0.1027, + "step": 31711 + }, + { + "epoch": 0.5656190917846824, + "grad_norm": 0.23468153178691864, + "learning_rate": 2.3636398138658867e-05, + "loss": 0.1374, + "step": 31712 + }, + { + "epoch": 0.5656369279063961, + "grad_norm": 0.21396775543689728, + "learning_rate": 2.3634843950788917e-05, + "loss": 0.1246, + "step": 31713 + }, + { + "epoch": 0.5656547640281098, + "grad_norm": 0.25983208417892456, + "learning_rate": 2.3633289768210788e-05, + "loss": 0.1649, + "step": 31714 + }, + { + "epoch": 0.5656726001498235, + "grad_norm": 0.508179247379303, + "learning_rate": 2.3631735590930508e-05, + "loss": 0.1637, + "step": 31715 + }, + { + "epoch": 0.5656904362715371, + "grad_norm": 0.22736430168151855, + "learning_rate": 2.363018141895409e-05, + "loss": 0.1183, + "step": 31716 + }, + { + "epoch": 0.5657082723932508, + "grad_norm": 0.40757668018341064, + "learning_rate": 2.3628627252287554e-05, + "loss": 0.1033, + "step": 31717 + }, + { + "epoch": 0.5657261085149645, + "grad_norm": 0.27163437008857727, + "learning_rate": 2.3627073090936945e-05, + "loss": 0.1532, + "step": 31718 + }, + { + "epoch": 0.5657439446366782, + "grad_norm": 0.2641112804412842, + "learning_rate": 2.362551893490827e-05, + "loss": 0.1104, + "step": 31719 + }, + { + "epoch": 0.5657617807583919, + "grad_norm": 0.294924259185791, + "learning_rate": 2.3623964784207556e-05, + "loss": 0.1575, + "step": 31720 + }, + { + "epoch": 0.5657796168801056, + "grad_norm": 0.2325085550546646, + "learning_rate": 2.3622410638840828e-05, + "loss": 0.1337, + "step": 31721 + }, + { + "epoch": 0.5657974530018193, + "grad_norm": 0.3161733150482178, + "learning_rate": 2.362085649881411e-05, + "loss": 0.1484, + "step": 31722 + }, + { + "epoch": 0.565815289123533, + "grad_norm": 0.3015326261520386, + "learning_rate": 2.3619302364133438e-05, + "loss": 0.136, + "step": 31723 + }, + { + "epoch": 0.5658331252452466, + "grad_norm": 0.2192542999982834, + "learning_rate": 2.3617748234804825e-05, + "loss": 0.14, + "step": 31724 + }, + { + "epoch": 0.5658509613669603, + "grad_norm": 0.3219265043735504, + "learning_rate": 2.36161941108343e-05, + "loss": 0.1201, + "step": 31725 + }, + { + "epoch": 0.565868797488674, + "grad_norm": 0.2728806734085083, + "learning_rate": 2.361463999222787e-05, + "loss": 0.1396, + "step": 31726 + }, + { + "epoch": 0.5658866336103877, + "grad_norm": 0.25127914547920227, + "learning_rate": 2.3613085878991587e-05, + "loss": 0.1148, + "step": 31727 + }, + { + "epoch": 0.5659044697321014, + "grad_norm": 0.23592594265937805, + "learning_rate": 2.3611531771131457e-05, + "loss": 0.125, + "step": 31728 + }, + { + "epoch": 0.5659223058538152, + "grad_norm": 0.24304333329200745, + "learning_rate": 2.3609977668653514e-05, + "loss": 0.1695, + "step": 31729 + }, + { + "epoch": 0.5659401419755289, + "grad_norm": 0.2560407519340515, + "learning_rate": 2.3608423571563767e-05, + "loss": 0.1232, + "step": 31730 + }, + { + "epoch": 0.5659579780972426, + "grad_norm": 0.19703637063503265, + "learning_rate": 2.360686947986826e-05, + "loss": 0.1334, + "step": 31731 + }, + { + "epoch": 0.5659758142189563, + "grad_norm": 0.3853825628757477, + "learning_rate": 2.360531539357301e-05, + "loss": 0.1288, + "step": 31732 + }, + { + "epoch": 0.56599365034067, + "grad_norm": 0.22643814980983734, + "learning_rate": 2.3603761312684038e-05, + "loss": 0.1141, + "step": 31733 + }, + { + "epoch": 0.5660114864623836, + "grad_norm": 0.2896316945552826, + "learning_rate": 2.360220723720736e-05, + "loss": 0.2069, + "step": 31734 + }, + { + "epoch": 0.5660293225840973, + "grad_norm": 0.2831781208515167, + "learning_rate": 2.360065316714902e-05, + "loss": 0.1198, + "step": 31735 + }, + { + "epoch": 0.566047158705811, + "grad_norm": 0.2595368027687073, + "learning_rate": 2.3599099102515025e-05, + "loss": 0.106, + "step": 31736 + }, + { + "epoch": 0.5660649948275247, + "grad_norm": 0.2445223331451416, + "learning_rate": 2.3597545043311416e-05, + "loss": 0.1361, + "step": 31737 + }, + { + "epoch": 0.5660828309492384, + "grad_norm": 0.3353172242641449, + "learning_rate": 2.35959909895442e-05, + "loss": 0.119, + "step": 31738 + }, + { + "epoch": 0.5661006670709521, + "grad_norm": 0.24241621792316437, + "learning_rate": 2.35944369412194e-05, + "loss": 0.108, + "step": 31739 + }, + { + "epoch": 0.5661185031926658, + "grad_norm": 0.2791469693183899, + "learning_rate": 2.3592882898343062e-05, + "loss": 0.1495, + "step": 31740 + }, + { + "epoch": 0.5661363393143795, + "grad_norm": 0.18953026831150055, + "learning_rate": 2.3591328860921186e-05, + "loss": 0.1259, + "step": 31741 + }, + { + "epoch": 0.5661541754360931, + "grad_norm": 0.24994948506355286, + "learning_rate": 2.3589774828959813e-05, + "loss": 0.1407, + "step": 31742 + }, + { + "epoch": 0.5661720115578068, + "grad_norm": 0.3137910068035126, + "learning_rate": 2.3588220802464947e-05, + "loss": 0.1007, + "step": 31743 + }, + { + "epoch": 0.5661898476795205, + "grad_norm": 0.2321649044752121, + "learning_rate": 2.358666678144264e-05, + "loss": 0.1315, + "step": 31744 + }, + { + "epoch": 0.5662076838012342, + "grad_norm": 0.23086529970169067, + "learning_rate": 2.3585112765898895e-05, + "loss": 0.1085, + "step": 31745 + }, + { + "epoch": 0.566225519922948, + "grad_norm": 0.29069772362709045, + "learning_rate": 2.3583558755839744e-05, + "loss": 0.1281, + "step": 31746 + }, + { + "epoch": 0.5662433560446617, + "grad_norm": 0.18334349989891052, + "learning_rate": 2.3582004751271197e-05, + "loss": 0.092, + "step": 31747 + }, + { + "epoch": 0.5662611921663754, + "grad_norm": 0.3791230022907257, + "learning_rate": 2.35804507521993e-05, + "loss": 0.2191, + "step": 31748 + }, + { + "epoch": 0.5662790282880891, + "grad_norm": 0.2948589324951172, + "learning_rate": 2.3578896758630065e-05, + "loss": 0.1546, + "step": 31749 + }, + { + "epoch": 0.5662968644098028, + "grad_norm": 0.29607513546943665, + "learning_rate": 2.3577342770569518e-05, + "loss": 0.151, + "step": 31750 + }, + { + "epoch": 0.5663147005315164, + "grad_norm": 0.27405473589897156, + "learning_rate": 2.3575788788023685e-05, + "loss": 0.1376, + "step": 31751 + }, + { + "epoch": 0.5663325366532301, + "grad_norm": 0.19494296610355377, + "learning_rate": 2.357423481099857e-05, + "loss": 0.1041, + "step": 31752 + }, + { + "epoch": 0.5663503727749438, + "grad_norm": 0.2988523840904236, + "learning_rate": 2.357268083950023e-05, + "loss": 0.1439, + "step": 31753 + }, + { + "epoch": 0.5663682088966575, + "grad_norm": 0.2279803305864334, + "learning_rate": 2.357112687353467e-05, + "loss": 0.1446, + "step": 31754 + }, + { + "epoch": 0.5663860450183712, + "grad_norm": 0.18937614560127258, + "learning_rate": 2.3569572913107913e-05, + "loss": 0.1139, + "step": 31755 + }, + { + "epoch": 0.5664038811400849, + "grad_norm": 0.30287057161331177, + "learning_rate": 2.3568018958225982e-05, + "loss": 0.1598, + "step": 31756 + }, + { + "epoch": 0.5664217172617986, + "grad_norm": 0.26082295179367065, + "learning_rate": 2.3566465008894903e-05, + "loss": 0.1332, + "step": 31757 + }, + { + "epoch": 0.5664395533835123, + "grad_norm": 0.38175928592681885, + "learning_rate": 2.3564911065120708e-05, + "loss": 0.149, + "step": 31758 + }, + { + "epoch": 0.566457389505226, + "grad_norm": 0.36254653334617615, + "learning_rate": 2.3563357126909413e-05, + "loss": 0.1751, + "step": 31759 + }, + { + "epoch": 0.5664752256269396, + "grad_norm": 0.2453150451183319, + "learning_rate": 2.356180319426703e-05, + "loss": 0.116, + "step": 31760 + }, + { + "epoch": 0.5664930617486533, + "grad_norm": 0.2619331479072571, + "learning_rate": 2.3560249267199612e-05, + "loss": 0.1479, + "step": 31761 + }, + { + "epoch": 0.566510897870367, + "grad_norm": 0.23770898580551147, + "learning_rate": 2.355869534571316e-05, + "loss": 0.1466, + "step": 31762 + }, + { + "epoch": 0.5665287339920808, + "grad_norm": 0.28270354866981506, + "learning_rate": 2.3557141429813693e-05, + "loss": 0.1443, + "step": 31763 + }, + { + "epoch": 0.5665465701137945, + "grad_norm": 0.2974907159805298, + "learning_rate": 2.3555587519507258e-05, + "loss": 0.1177, + "step": 31764 + }, + { + "epoch": 0.5665644062355082, + "grad_norm": 0.22263066470623016, + "learning_rate": 2.3554033614799846e-05, + "loss": 0.1477, + "step": 31765 + }, + { + "epoch": 0.5665822423572219, + "grad_norm": 0.22978141903877258, + "learning_rate": 2.355247971569752e-05, + "loss": 0.1351, + "step": 31766 + }, + { + "epoch": 0.5666000784789356, + "grad_norm": 0.3551715612411499, + "learning_rate": 2.3550925822206273e-05, + "loss": 0.1021, + "step": 31767 + }, + { + "epoch": 0.5666179146006493, + "grad_norm": 0.31905874609947205, + "learning_rate": 2.3549371934332136e-05, + "loss": 0.125, + "step": 31768 + }, + { + "epoch": 0.5666357507223629, + "grad_norm": 0.2632167637348175, + "learning_rate": 2.3547818052081133e-05, + "loss": 0.1566, + "step": 31769 + }, + { + "epoch": 0.5666535868440766, + "grad_norm": 0.2459852248430252, + "learning_rate": 2.3546264175459294e-05, + "loss": 0.1493, + "step": 31770 + }, + { + "epoch": 0.5666714229657903, + "grad_norm": 0.24205629527568817, + "learning_rate": 2.3544710304472636e-05, + "loss": 0.1165, + "step": 31771 + }, + { + "epoch": 0.566689259087504, + "grad_norm": 0.2550499141216278, + "learning_rate": 2.3543156439127187e-05, + "loss": 0.1261, + "step": 31772 + }, + { + "epoch": 0.5667070952092177, + "grad_norm": 0.25032225251197815, + "learning_rate": 2.3541602579428955e-05, + "loss": 0.1405, + "step": 31773 + }, + { + "epoch": 0.5667249313309314, + "grad_norm": 0.24248740077018738, + "learning_rate": 2.3540048725383986e-05, + "loss": 0.1423, + "step": 31774 + }, + { + "epoch": 0.5667427674526451, + "grad_norm": 0.20904380083084106, + "learning_rate": 2.3538494876998295e-05, + "loss": 0.1286, + "step": 31775 + }, + { + "epoch": 0.5667606035743588, + "grad_norm": 0.2057378888130188, + "learning_rate": 2.3536941034277892e-05, + "loss": 0.1282, + "step": 31776 + }, + { + "epoch": 0.5667784396960724, + "grad_norm": 0.41894152760505676, + "learning_rate": 2.353538719722882e-05, + "loss": 0.1706, + "step": 31777 + }, + { + "epoch": 0.5667962758177861, + "grad_norm": 0.2069336622953415, + "learning_rate": 2.353383336585708e-05, + "loss": 0.1119, + "step": 31778 + }, + { + "epoch": 0.5668141119394998, + "grad_norm": 0.2987953722476959, + "learning_rate": 2.3532279540168724e-05, + "loss": 0.1285, + "step": 31779 + }, + { + "epoch": 0.5668319480612136, + "grad_norm": 0.29955628514289856, + "learning_rate": 2.3530725720169753e-05, + "loss": 0.151, + "step": 31780 + }, + { + "epoch": 0.5668497841829273, + "grad_norm": 0.2698342800140381, + "learning_rate": 2.3529171905866198e-05, + "loss": 0.1455, + "step": 31781 + }, + { + "epoch": 0.566867620304641, + "grad_norm": 0.20955033600330353, + "learning_rate": 2.3527618097264072e-05, + "loss": 0.1261, + "step": 31782 + }, + { + "epoch": 0.5668854564263547, + "grad_norm": 0.3072643280029297, + "learning_rate": 2.3526064294369418e-05, + "loss": 0.1743, + "step": 31783 + }, + { + "epoch": 0.5669032925480684, + "grad_norm": 0.3178693652153015, + "learning_rate": 2.352451049718824e-05, + "loss": 0.0975, + "step": 31784 + }, + { + "epoch": 0.5669211286697821, + "grad_norm": 0.27388060092926025, + "learning_rate": 2.3522956705726575e-05, + "loss": 0.1122, + "step": 31785 + }, + { + "epoch": 0.5669389647914957, + "grad_norm": 0.2072443813085556, + "learning_rate": 2.3521402919990433e-05, + "loss": 0.0983, + "step": 31786 + }, + { + "epoch": 0.5669568009132094, + "grad_norm": 0.275480717420578, + "learning_rate": 2.351984913998585e-05, + "loss": 0.0596, + "step": 31787 + }, + { + "epoch": 0.5669746370349231, + "grad_norm": 0.3250078558921814, + "learning_rate": 2.3518295365718846e-05, + "loss": 0.1457, + "step": 31788 + }, + { + "epoch": 0.5669924731566368, + "grad_norm": 0.2418910712003708, + "learning_rate": 2.3516741597195436e-05, + "loss": 0.125, + "step": 31789 + }, + { + "epoch": 0.5670103092783505, + "grad_norm": 0.2838447690010071, + "learning_rate": 2.351518783442165e-05, + "loss": 0.1399, + "step": 31790 + }, + { + "epoch": 0.5670281454000642, + "grad_norm": 0.2699092626571655, + "learning_rate": 2.3513634077403498e-05, + "loss": 0.1128, + "step": 31791 + }, + { + "epoch": 0.5670459815217779, + "grad_norm": 0.5057222247123718, + "learning_rate": 2.3512080326147028e-05, + "loss": 0.137, + "step": 31792 + }, + { + "epoch": 0.5670638176434916, + "grad_norm": 0.22933563590049744, + "learning_rate": 2.3510526580658245e-05, + "loss": 0.1345, + "step": 31793 + }, + { + "epoch": 0.5670816537652053, + "grad_norm": 0.20409952104091644, + "learning_rate": 2.3508972840943178e-05, + "loss": 0.0944, + "step": 31794 + }, + { + "epoch": 0.5670994898869189, + "grad_norm": 0.35851675271987915, + "learning_rate": 2.3507419107007834e-05, + "loss": 0.1124, + "step": 31795 + }, + { + "epoch": 0.5671173260086327, + "grad_norm": 0.25478753447532654, + "learning_rate": 2.350586537885826e-05, + "loss": 0.193, + "step": 31796 + }, + { + "epoch": 0.5671351621303464, + "grad_norm": 0.34525060653686523, + "learning_rate": 2.3504311656500466e-05, + "loss": 0.1628, + "step": 31797 + }, + { + "epoch": 0.5671529982520601, + "grad_norm": 0.24626538157463074, + "learning_rate": 2.3502757939940478e-05, + "loss": 0.1331, + "step": 31798 + }, + { + "epoch": 0.5671708343737738, + "grad_norm": 0.27038154006004333, + "learning_rate": 2.350120422918431e-05, + "loss": 0.1437, + "step": 31799 + }, + { + "epoch": 0.5671886704954875, + "grad_norm": 0.36471810936927795, + "learning_rate": 2.3499650524238e-05, + "loss": 0.0896, + "step": 31800 + }, + { + "epoch": 0.5672065066172012, + "grad_norm": 0.20968887209892273, + "learning_rate": 2.3498096825107564e-05, + "loss": 0.1239, + "step": 31801 + }, + { + "epoch": 0.5672243427389149, + "grad_norm": 0.22653204202651978, + "learning_rate": 2.3496543131799023e-05, + "loss": 0.1195, + "step": 31802 + }, + { + "epoch": 0.5672421788606286, + "grad_norm": 0.36928629875183105, + "learning_rate": 2.3494989444318398e-05, + "loss": 0.1745, + "step": 31803 + }, + { + "epoch": 0.5672600149823422, + "grad_norm": 0.3669426441192627, + "learning_rate": 2.3493435762671708e-05, + "loss": 0.1337, + "step": 31804 + }, + { + "epoch": 0.5672778511040559, + "grad_norm": 0.293701708316803, + "learning_rate": 2.3491882086864982e-05, + "loss": 0.1636, + "step": 31805 + }, + { + "epoch": 0.5672956872257696, + "grad_norm": 0.24530857801437378, + "learning_rate": 2.349032841690425e-05, + "loss": 0.1976, + "step": 31806 + }, + { + "epoch": 0.5673135233474833, + "grad_norm": 0.304094135761261, + "learning_rate": 2.3488774752795528e-05, + "loss": 0.1476, + "step": 31807 + }, + { + "epoch": 0.567331359469197, + "grad_norm": 0.2270730435848236, + "learning_rate": 2.3487221094544823e-05, + "loss": 0.13, + "step": 31808 + }, + { + "epoch": 0.5673491955909107, + "grad_norm": 0.26939326524734497, + "learning_rate": 2.3485667442158186e-05, + "loss": 0.1602, + "step": 31809 + }, + { + "epoch": 0.5673670317126244, + "grad_norm": 0.21226643025875092, + "learning_rate": 2.3484113795641622e-05, + "loss": 0.0952, + "step": 31810 + }, + { + "epoch": 0.5673848678343381, + "grad_norm": 0.268289178609848, + "learning_rate": 2.3482560155001153e-05, + "loss": 0.1723, + "step": 31811 + }, + { + "epoch": 0.5674027039560517, + "grad_norm": 0.2653530538082123, + "learning_rate": 2.3481006520242802e-05, + "loss": 0.1076, + "step": 31812 + }, + { + "epoch": 0.5674205400777655, + "grad_norm": 0.22172777354717255, + "learning_rate": 2.3479452891372594e-05, + "loss": 0.0963, + "step": 31813 + }, + { + "epoch": 0.5674383761994792, + "grad_norm": 0.20208679139614105, + "learning_rate": 2.347789926839656e-05, + "loss": 0.0771, + "step": 31814 + }, + { + "epoch": 0.5674562123211929, + "grad_norm": 0.21570773422718048, + "learning_rate": 2.3476345651320713e-05, + "loss": 0.1293, + "step": 31815 + }, + { + "epoch": 0.5674740484429066, + "grad_norm": 0.24860739707946777, + "learning_rate": 2.347479204015108e-05, + "loss": 0.1052, + "step": 31816 + }, + { + "epoch": 0.5674918845646203, + "grad_norm": 0.31034550070762634, + "learning_rate": 2.3473238434893662e-05, + "loss": 0.1394, + "step": 31817 + }, + { + "epoch": 0.567509720686334, + "grad_norm": 0.2236240953207016, + "learning_rate": 2.347168483555451e-05, + "loss": 0.1212, + "step": 31818 + }, + { + "epoch": 0.5675275568080477, + "grad_norm": 0.46751800179481506, + "learning_rate": 2.3470131242139637e-05, + "loss": 0.1669, + "step": 31819 + }, + { + "epoch": 0.5675453929297614, + "grad_norm": 0.30457568168640137, + "learning_rate": 2.3468577654655066e-05, + "loss": 0.0997, + "step": 31820 + }, + { + "epoch": 0.567563229051475, + "grad_norm": 0.26304295659065247, + "learning_rate": 2.3467024073106807e-05, + "loss": 0.1565, + "step": 31821 + }, + { + "epoch": 0.5675810651731887, + "grad_norm": 0.3105275630950928, + "learning_rate": 2.34654704975009e-05, + "loss": 0.1089, + "step": 31822 + }, + { + "epoch": 0.5675989012949024, + "grad_norm": 0.23342853784561157, + "learning_rate": 2.3463916927843364e-05, + "loss": 0.1311, + "step": 31823 + }, + { + "epoch": 0.5676167374166161, + "grad_norm": 0.2506275475025177, + "learning_rate": 2.346236336414021e-05, + "loss": 0.1197, + "step": 31824 + }, + { + "epoch": 0.5676345735383298, + "grad_norm": 0.29939284920692444, + "learning_rate": 2.3460809806397462e-05, + "loss": 0.1589, + "step": 31825 + }, + { + "epoch": 0.5676524096600435, + "grad_norm": 0.3076852560043335, + "learning_rate": 2.345925625462115e-05, + "loss": 0.1414, + "step": 31826 + }, + { + "epoch": 0.5676702457817572, + "grad_norm": 0.35731741786003113, + "learning_rate": 2.34577027088173e-05, + "loss": 0.2079, + "step": 31827 + }, + { + "epoch": 0.5676880819034709, + "grad_norm": 0.2032073438167572, + "learning_rate": 2.3456149168991924e-05, + "loss": 0.1229, + "step": 31828 + }, + { + "epoch": 0.5677059180251846, + "grad_norm": 0.18575747311115265, + "learning_rate": 2.345459563515105e-05, + "loss": 0.1095, + "step": 31829 + }, + { + "epoch": 0.5677237541468984, + "grad_norm": 0.2545779049396515, + "learning_rate": 2.3453042107300682e-05, + "loss": 0.1396, + "step": 31830 + }, + { + "epoch": 0.567741590268612, + "grad_norm": 0.25183358788490295, + "learning_rate": 2.345148858544687e-05, + "loss": 0.1648, + "step": 31831 + }, + { + "epoch": 0.5677594263903257, + "grad_norm": 0.35085731744766235, + "learning_rate": 2.3449935069595618e-05, + "loss": 0.1143, + "step": 31832 + }, + { + "epoch": 0.5677772625120394, + "grad_norm": 0.28099489212036133, + "learning_rate": 2.344838155975296e-05, + "loss": 0.1137, + "step": 31833 + }, + { + "epoch": 0.5677950986337531, + "grad_norm": 0.27445825934410095, + "learning_rate": 2.34468280559249e-05, + "loss": 0.0834, + "step": 31834 + }, + { + "epoch": 0.5678129347554668, + "grad_norm": 0.2711659073829651, + "learning_rate": 2.344527455811748e-05, + "loss": 0.1134, + "step": 31835 + }, + { + "epoch": 0.5678307708771805, + "grad_norm": 0.34612905979156494, + "learning_rate": 2.344372106633671e-05, + "loss": 0.1334, + "step": 31836 + }, + { + "epoch": 0.5678486069988942, + "grad_norm": 0.18164853751659393, + "learning_rate": 2.3442167580588624e-05, + "loss": 0.0841, + "step": 31837 + }, + { + "epoch": 0.5678664431206079, + "grad_norm": 0.29229211807250977, + "learning_rate": 2.3440614100879217e-05, + "loss": 0.1462, + "step": 31838 + }, + { + "epoch": 0.5678842792423215, + "grad_norm": 0.28702273964881897, + "learning_rate": 2.3439060627214536e-05, + "loss": 0.2186, + "step": 31839 + }, + { + "epoch": 0.5679021153640352, + "grad_norm": 0.2673148810863495, + "learning_rate": 2.3437507159600597e-05, + "loss": 0.1317, + "step": 31840 + }, + { + "epoch": 0.5679199514857489, + "grad_norm": 0.2983056604862213, + "learning_rate": 2.3435953698043418e-05, + "loss": 0.1997, + "step": 31841 + }, + { + "epoch": 0.5679377876074626, + "grad_norm": 0.3157918453216553, + "learning_rate": 2.343440024254903e-05, + "loss": 0.1529, + "step": 31842 + }, + { + "epoch": 0.5679556237291763, + "grad_norm": 0.21640507876873016, + "learning_rate": 2.343284679312343e-05, + "loss": 0.145, + "step": 31843 + }, + { + "epoch": 0.56797345985089, + "grad_norm": 0.23252245783805847, + "learning_rate": 2.3431293349772672e-05, + "loss": 0.1157, + "step": 31844 + }, + { + "epoch": 0.5679912959726037, + "grad_norm": 0.28265056014060974, + "learning_rate": 2.3429739912502757e-05, + "loss": 0.1789, + "step": 31845 + }, + { + "epoch": 0.5680091320943174, + "grad_norm": 0.2644653618335724, + "learning_rate": 2.342818648131972e-05, + "loss": 0.139, + "step": 31846 + }, + { + "epoch": 0.5680269682160312, + "grad_norm": 0.1959562748670578, + "learning_rate": 2.342663305622956e-05, + "loss": 0.0951, + "step": 31847 + }, + { + "epoch": 0.5680448043377448, + "grad_norm": 0.24504858255386353, + "learning_rate": 2.3425079637238326e-05, + "loss": 0.1548, + "step": 31848 + }, + { + "epoch": 0.5680626404594585, + "grad_norm": 0.21574705839157104, + "learning_rate": 2.3423526224352026e-05, + "loss": 0.1129, + "step": 31849 + }, + { + "epoch": 0.5680804765811722, + "grad_norm": 0.2897961437702179, + "learning_rate": 2.3421972817576686e-05, + "loss": 0.0994, + "step": 31850 + }, + { + "epoch": 0.5680983127028859, + "grad_norm": 0.23364970088005066, + "learning_rate": 2.342041941691831e-05, + "loss": 0.1182, + "step": 31851 + }, + { + "epoch": 0.5681161488245996, + "grad_norm": 0.4226117432117462, + "learning_rate": 2.3418866022382948e-05, + "loss": 0.1823, + "step": 31852 + }, + { + "epoch": 0.5681339849463133, + "grad_norm": 0.239666149020195, + "learning_rate": 2.34173126339766e-05, + "loss": 0.1228, + "step": 31853 + }, + { + "epoch": 0.568151821068027, + "grad_norm": 0.3042081892490387, + "learning_rate": 2.34157592517053e-05, + "loss": 0.1316, + "step": 31854 + }, + { + "epoch": 0.5681696571897407, + "grad_norm": 0.24170371890068054, + "learning_rate": 2.3414205875575068e-05, + "loss": 0.1067, + "step": 31855 + }, + { + "epoch": 0.5681874933114544, + "grad_norm": 0.2811267077922821, + "learning_rate": 2.3412652505591905e-05, + "loss": 0.1101, + "step": 31856 + }, + { + "epoch": 0.568205329433168, + "grad_norm": 0.29492050409317017, + "learning_rate": 2.3411099141761864e-05, + "loss": 0.1344, + "step": 31857 + }, + { + "epoch": 0.5682231655548817, + "grad_norm": 0.3043202757835388, + "learning_rate": 2.3409545784090952e-05, + "loss": 0.1036, + "step": 31858 + }, + { + "epoch": 0.5682410016765954, + "grad_norm": 0.27893373370170593, + "learning_rate": 2.3407992432585183e-05, + "loss": 0.1489, + "step": 31859 + }, + { + "epoch": 0.5682588377983091, + "grad_norm": 0.23858894407749176, + "learning_rate": 2.3406439087250584e-05, + "loss": 0.1242, + "step": 31860 + }, + { + "epoch": 0.5682766739200228, + "grad_norm": 0.31661051511764526, + "learning_rate": 2.340488574809318e-05, + "loss": 0.1698, + "step": 31861 + }, + { + "epoch": 0.5682945100417365, + "grad_norm": 0.20748351514339447, + "learning_rate": 2.3403332415118993e-05, + "loss": 0.0778, + "step": 31862 + }, + { + "epoch": 0.5683123461634502, + "grad_norm": 0.24186772108078003, + "learning_rate": 2.3401779088334043e-05, + "loss": 0.1236, + "step": 31863 + }, + { + "epoch": 0.568330182285164, + "grad_norm": 0.3661872446537018, + "learning_rate": 2.3400225767744336e-05, + "loss": 0.1579, + "step": 31864 + }, + { + "epoch": 0.5683480184068777, + "grad_norm": 0.3736518919467926, + "learning_rate": 2.339867245335592e-05, + "loss": 0.1422, + "step": 31865 + }, + { + "epoch": 0.5683658545285913, + "grad_norm": 0.21076396107673645, + "learning_rate": 2.33971191451748e-05, + "loss": 0.0995, + "step": 31866 + }, + { + "epoch": 0.568383690650305, + "grad_norm": 0.38257306814193726, + "learning_rate": 2.3395565843206995e-05, + "loss": 0.1414, + "step": 31867 + }, + { + "epoch": 0.5684015267720187, + "grad_norm": 0.2380020171403885, + "learning_rate": 2.339401254745854e-05, + "loss": 0.1093, + "step": 31868 + }, + { + "epoch": 0.5684193628937324, + "grad_norm": 0.3140031397342682, + "learning_rate": 2.3392459257935432e-05, + "loss": 0.1513, + "step": 31869 + }, + { + "epoch": 0.5684371990154461, + "grad_norm": 0.3794325590133667, + "learning_rate": 2.3390905974643716e-05, + "loss": 0.2061, + "step": 31870 + }, + { + "epoch": 0.5684550351371598, + "grad_norm": 0.3635885417461395, + "learning_rate": 2.338935269758941e-05, + "loss": 0.1426, + "step": 31871 + }, + { + "epoch": 0.5684728712588735, + "grad_norm": 0.39303216338157654, + "learning_rate": 2.338779942677852e-05, + "loss": 0.167, + "step": 31872 + }, + { + "epoch": 0.5684907073805872, + "grad_norm": 0.20632973313331604, + "learning_rate": 2.3386246162217078e-05, + "loss": 0.15, + "step": 31873 + }, + { + "epoch": 0.5685085435023008, + "grad_norm": 0.22460660338401794, + "learning_rate": 2.3384692903911103e-05, + "loss": 0.1138, + "step": 31874 + }, + { + "epoch": 0.5685263796240145, + "grad_norm": 0.21528342366218567, + "learning_rate": 2.3383139651866622e-05, + "loss": 0.1034, + "step": 31875 + }, + { + "epoch": 0.5685442157457282, + "grad_norm": 0.2516711950302124, + "learning_rate": 2.338158640608965e-05, + "loss": 0.0816, + "step": 31876 + }, + { + "epoch": 0.5685620518674419, + "grad_norm": 0.2805202305316925, + "learning_rate": 2.3380033166586197e-05, + "loss": 0.0803, + "step": 31877 + }, + { + "epoch": 0.5685798879891556, + "grad_norm": 0.2692238390445709, + "learning_rate": 2.3378479933362304e-05, + "loss": 0.0922, + "step": 31878 + }, + { + "epoch": 0.5685977241108693, + "grad_norm": 0.2968035042285919, + "learning_rate": 2.3376926706423985e-05, + "loss": 0.1776, + "step": 31879 + }, + { + "epoch": 0.568615560232583, + "grad_norm": 0.2722340524196625, + "learning_rate": 2.3375373485777254e-05, + "loss": 0.1227, + "step": 31880 + }, + { + "epoch": 0.5686333963542968, + "grad_norm": 0.23171938955783844, + "learning_rate": 2.337382027142814e-05, + "loss": 0.1121, + "step": 31881 + }, + { + "epoch": 0.5686512324760105, + "grad_norm": 0.41601529717445374, + "learning_rate": 2.3372267063382648e-05, + "loss": 0.1537, + "step": 31882 + }, + { + "epoch": 0.5686690685977241, + "grad_norm": 0.22648778557777405, + "learning_rate": 2.3370713861646826e-05, + "loss": 0.1703, + "step": 31883 + }, + { + "epoch": 0.5686869047194378, + "grad_norm": 0.224659726023674, + "learning_rate": 2.3369160666226677e-05, + "loss": 0.1118, + "step": 31884 + }, + { + "epoch": 0.5687047408411515, + "grad_norm": 0.2781287133693695, + "learning_rate": 2.3367607477128227e-05, + "loss": 0.16, + "step": 31885 + }, + { + "epoch": 0.5687225769628652, + "grad_norm": 0.29262882471084595, + "learning_rate": 2.3366054294357482e-05, + "loss": 0.1595, + "step": 31886 + }, + { + "epoch": 0.5687404130845789, + "grad_norm": 0.22359992563724518, + "learning_rate": 2.3364501117920484e-05, + "loss": 0.1064, + "step": 31887 + }, + { + "epoch": 0.5687582492062926, + "grad_norm": 0.24614740908145905, + "learning_rate": 2.3362947947823242e-05, + "loss": 0.163, + "step": 31888 + }, + { + "epoch": 0.5687760853280063, + "grad_norm": 0.2586507201194763, + "learning_rate": 2.3361394784071782e-05, + "loss": 0.1544, + "step": 31889 + }, + { + "epoch": 0.56879392144972, + "grad_norm": 0.24406969547271729, + "learning_rate": 2.3359841626672113e-05, + "loss": 0.1329, + "step": 31890 + }, + { + "epoch": 0.5688117575714337, + "grad_norm": 0.24728265404701233, + "learning_rate": 2.3358288475630274e-05, + "loss": 0.1359, + "step": 31891 + }, + { + "epoch": 0.5688295936931473, + "grad_norm": 0.3328385353088379, + "learning_rate": 2.3356735330952275e-05, + "loss": 0.1788, + "step": 31892 + }, + { + "epoch": 0.568847429814861, + "grad_norm": 0.29807713627815247, + "learning_rate": 2.335518219264414e-05, + "loss": 0.1841, + "step": 31893 + }, + { + "epoch": 0.5688652659365747, + "grad_norm": 0.22984002530574799, + "learning_rate": 2.3353629060711878e-05, + "loss": 0.1167, + "step": 31894 + }, + { + "epoch": 0.5688831020582884, + "grad_norm": 0.3549318015575409, + "learning_rate": 2.3352075935161516e-05, + "loss": 0.1444, + "step": 31895 + }, + { + "epoch": 0.5689009381800021, + "grad_norm": 0.24268245697021484, + "learning_rate": 2.3350522815999086e-05, + "loss": 0.1241, + "step": 31896 + }, + { + "epoch": 0.5689187743017158, + "grad_norm": 0.24521784484386444, + "learning_rate": 2.33489697032306e-05, + "loss": 0.0854, + "step": 31897 + }, + { + "epoch": 0.5689366104234296, + "grad_norm": 0.2668326497077942, + "learning_rate": 2.3347416596862078e-05, + "loss": 0.0893, + "step": 31898 + }, + { + "epoch": 0.5689544465451433, + "grad_norm": 0.20976288616657257, + "learning_rate": 2.3345863496899527e-05, + "loss": 0.1322, + "step": 31899 + }, + { + "epoch": 0.568972282666857, + "grad_norm": 0.2963721752166748, + "learning_rate": 2.3344310403348995e-05, + "loss": 0.1077, + "step": 31900 + }, + { + "epoch": 0.5689901187885706, + "grad_norm": 0.2996422350406647, + "learning_rate": 2.3342757316216483e-05, + "loss": 0.0912, + "step": 31901 + }, + { + "epoch": 0.5690079549102843, + "grad_norm": 0.2290738970041275, + "learning_rate": 2.3341204235508018e-05, + "loss": 0.1502, + "step": 31902 + }, + { + "epoch": 0.569025791031998, + "grad_norm": 0.2685301899909973, + "learning_rate": 2.333965116122961e-05, + "loss": 0.1368, + "step": 31903 + }, + { + "epoch": 0.5690436271537117, + "grad_norm": 0.21077746152877808, + "learning_rate": 2.3338098093387294e-05, + "loss": 0.1316, + "step": 31904 + }, + { + "epoch": 0.5690614632754254, + "grad_norm": 0.3262901306152344, + "learning_rate": 2.3336545031987087e-05, + "loss": 0.0929, + "step": 31905 + }, + { + "epoch": 0.5690792993971391, + "grad_norm": 0.34701892733573914, + "learning_rate": 2.3334991977035006e-05, + "loss": 0.1654, + "step": 31906 + }, + { + "epoch": 0.5690971355188528, + "grad_norm": 0.20195747911930084, + "learning_rate": 2.3333438928537066e-05, + "loss": 0.1275, + "step": 31907 + }, + { + "epoch": 0.5691149716405665, + "grad_norm": 0.16705262660980225, + "learning_rate": 2.333188588649929e-05, + "loss": 0.1099, + "step": 31908 + }, + { + "epoch": 0.5691328077622801, + "grad_norm": 0.2527308166027069, + "learning_rate": 2.3330332850927703e-05, + "loss": 0.1506, + "step": 31909 + }, + { + "epoch": 0.5691506438839938, + "grad_norm": 0.2541621923446655, + "learning_rate": 2.332877982182833e-05, + "loss": 0.1472, + "step": 31910 + }, + { + "epoch": 0.5691684800057075, + "grad_norm": 0.2527715265750885, + "learning_rate": 2.3327226799207177e-05, + "loss": 0.1368, + "step": 31911 + }, + { + "epoch": 0.5691863161274212, + "grad_norm": 0.3632810711860657, + "learning_rate": 2.3325673783070266e-05, + "loss": 0.1062, + "step": 31912 + }, + { + "epoch": 0.5692041522491349, + "grad_norm": 0.27961620688438416, + "learning_rate": 2.332412077342363e-05, + "loss": 0.1489, + "step": 31913 + }, + { + "epoch": 0.5692219883708487, + "grad_norm": 0.22823116183280945, + "learning_rate": 2.332256777027328e-05, + "loss": 0.1038, + "step": 31914 + }, + { + "epoch": 0.5692398244925624, + "grad_norm": 0.3571709096431732, + "learning_rate": 2.3321014773625234e-05, + "loss": 0.1755, + "step": 31915 + }, + { + "epoch": 0.5692576606142761, + "grad_norm": 0.18647009134292603, + "learning_rate": 2.331946178348551e-05, + "loss": 0.1023, + "step": 31916 + }, + { + "epoch": 0.5692754967359898, + "grad_norm": 0.2772287428379059, + "learning_rate": 2.3317908799860135e-05, + "loss": 0.0915, + "step": 31917 + }, + { + "epoch": 0.5692933328577034, + "grad_norm": 0.25653931498527527, + "learning_rate": 2.3316355822755136e-05, + "loss": 0.1312, + "step": 31918 + }, + { + "epoch": 0.5693111689794171, + "grad_norm": 0.24136759340763092, + "learning_rate": 2.331480285217652e-05, + "loss": 0.1246, + "step": 31919 + }, + { + "epoch": 0.5693290051011308, + "grad_norm": 0.22177313268184662, + "learning_rate": 2.3313249888130308e-05, + "loss": 0.147, + "step": 31920 + }, + { + "epoch": 0.5693468412228445, + "grad_norm": 0.21226876974105835, + "learning_rate": 2.331169693062251e-05, + "loss": 0.0793, + "step": 31921 + }, + { + "epoch": 0.5693646773445582, + "grad_norm": 0.3017657995223999, + "learning_rate": 2.3310143979659167e-05, + "loss": 0.1554, + "step": 31922 + }, + { + "epoch": 0.5693825134662719, + "grad_norm": 0.20766964554786682, + "learning_rate": 2.3308591035246292e-05, + "loss": 0.0575, + "step": 31923 + }, + { + "epoch": 0.5694003495879856, + "grad_norm": 0.3533168435096741, + "learning_rate": 2.3307038097389906e-05, + "loss": 0.1226, + "step": 31924 + }, + { + "epoch": 0.5694181857096993, + "grad_norm": 0.35080260038375854, + "learning_rate": 2.3305485166096013e-05, + "loss": 0.1395, + "step": 31925 + }, + { + "epoch": 0.569436021831413, + "grad_norm": 0.4067175090312958, + "learning_rate": 2.3303932241370658e-05, + "loss": 0.117, + "step": 31926 + }, + { + "epoch": 0.5694538579531266, + "grad_norm": 0.20107346773147583, + "learning_rate": 2.3302379323219846e-05, + "loss": 0.1126, + "step": 31927 + }, + { + "epoch": 0.5694716940748403, + "grad_norm": 0.21098102629184723, + "learning_rate": 2.330082641164959e-05, + "loss": 0.1516, + "step": 31928 + }, + { + "epoch": 0.569489530196554, + "grad_norm": 0.3123009204864502, + "learning_rate": 2.329927350666592e-05, + "loss": 0.132, + "step": 31929 + }, + { + "epoch": 0.5695073663182677, + "grad_norm": 0.26454970240592957, + "learning_rate": 2.329772060827485e-05, + "loss": 0.123, + "step": 31930 + }, + { + "epoch": 0.5695252024399815, + "grad_norm": 0.308457612991333, + "learning_rate": 2.329616771648241e-05, + "loss": 0.1505, + "step": 31931 + }, + { + "epoch": 0.5695430385616952, + "grad_norm": 0.22915974259376526, + "learning_rate": 2.3294614831294615e-05, + "loss": 0.0846, + "step": 31932 + }, + { + "epoch": 0.5695608746834089, + "grad_norm": 0.2821044921875, + "learning_rate": 2.329306195271748e-05, + "loss": 0.1022, + "step": 31933 + }, + { + "epoch": 0.5695787108051226, + "grad_norm": 0.2712126672267914, + "learning_rate": 2.3291509080757016e-05, + "loss": 0.1705, + "step": 31934 + }, + { + "epoch": 0.5695965469268363, + "grad_norm": 0.2786289155483246, + "learning_rate": 2.328995621541926e-05, + "loss": 0.1308, + "step": 31935 + }, + { + "epoch": 0.5696143830485499, + "grad_norm": 0.22536014020442963, + "learning_rate": 2.3288403356710227e-05, + "loss": 0.1025, + "step": 31936 + }, + { + "epoch": 0.5696322191702636, + "grad_norm": 0.22865964472293854, + "learning_rate": 2.3286850504635933e-05, + "loss": 0.1401, + "step": 31937 + }, + { + "epoch": 0.5696500552919773, + "grad_norm": 0.22847989201545715, + "learning_rate": 2.328529765920239e-05, + "loss": 0.1056, + "step": 31938 + }, + { + "epoch": 0.569667891413691, + "grad_norm": 0.24454420804977417, + "learning_rate": 2.3283744820415634e-05, + "loss": 0.1302, + "step": 31939 + }, + { + "epoch": 0.5696857275354047, + "grad_norm": 0.2526075541973114, + "learning_rate": 2.3282191988281676e-05, + "loss": 0.0834, + "step": 31940 + }, + { + "epoch": 0.5697035636571184, + "grad_norm": 0.38525134325027466, + "learning_rate": 2.3280639162806538e-05, + "loss": 0.1491, + "step": 31941 + }, + { + "epoch": 0.5697213997788321, + "grad_norm": 0.48143163323402405, + "learning_rate": 2.3279086343996225e-05, + "loss": 0.1437, + "step": 31942 + }, + { + "epoch": 0.5697392359005458, + "grad_norm": 0.31480303406715393, + "learning_rate": 2.3277533531856777e-05, + "loss": 0.1154, + "step": 31943 + }, + { + "epoch": 0.5697570720222594, + "grad_norm": 0.320762038230896, + "learning_rate": 2.32759807263942e-05, + "loss": 0.2208, + "step": 31944 + }, + { + "epoch": 0.5697749081439731, + "grad_norm": 0.25504228472709656, + "learning_rate": 2.3274427927614518e-05, + "loss": 0.1716, + "step": 31945 + }, + { + "epoch": 0.5697927442656868, + "grad_norm": 0.1853378564119339, + "learning_rate": 2.3272875135523754e-05, + "loss": 0.0676, + "step": 31946 + }, + { + "epoch": 0.5698105803874005, + "grad_norm": 0.23926015198230743, + "learning_rate": 2.327132235012791e-05, + "loss": 0.1285, + "step": 31947 + }, + { + "epoch": 0.5698284165091143, + "grad_norm": 0.2526625692844391, + "learning_rate": 2.3269769571433028e-05, + "loss": 0.1028, + "step": 31948 + }, + { + "epoch": 0.569846252630828, + "grad_norm": 0.32967737317085266, + "learning_rate": 2.3268216799445114e-05, + "loss": 0.1818, + "step": 31949 + }, + { + "epoch": 0.5698640887525417, + "grad_norm": 0.2688600420951843, + "learning_rate": 2.3266664034170194e-05, + "loss": 0.1025, + "step": 31950 + }, + { + "epoch": 0.5698819248742554, + "grad_norm": 0.3574877083301544, + "learning_rate": 2.326511127561427e-05, + "loss": 0.1226, + "step": 31951 + }, + { + "epoch": 0.5698997609959691, + "grad_norm": 0.2183462381362915, + "learning_rate": 2.3263558523783386e-05, + "loss": 0.1615, + "step": 31952 + }, + { + "epoch": 0.5699175971176828, + "grad_norm": 0.24285347759723663, + "learning_rate": 2.326200577868355e-05, + "loss": 0.132, + "step": 31953 + }, + { + "epoch": 0.5699354332393964, + "grad_norm": 0.20640622079372406, + "learning_rate": 2.3260453040320778e-05, + "loss": 0.0971, + "step": 31954 + }, + { + "epoch": 0.5699532693611101, + "grad_norm": 0.2130829393863678, + "learning_rate": 2.325890030870108e-05, + "loss": 0.1391, + "step": 31955 + }, + { + "epoch": 0.5699711054828238, + "grad_norm": 0.277995228767395, + "learning_rate": 2.32573475838305e-05, + "loss": 0.1785, + "step": 31956 + }, + { + "epoch": 0.5699889416045375, + "grad_norm": 0.2410154789686203, + "learning_rate": 2.3255794865715037e-05, + "loss": 0.1381, + "step": 31957 + }, + { + "epoch": 0.5700067777262512, + "grad_norm": 0.21570318937301636, + "learning_rate": 2.3254242154360716e-05, + "loss": 0.1228, + "step": 31958 + }, + { + "epoch": 0.5700246138479649, + "grad_norm": 0.3246050775051117, + "learning_rate": 2.3252689449773558e-05, + "loss": 0.1417, + "step": 31959 + }, + { + "epoch": 0.5700424499696786, + "grad_norm": 0.20825032889842987, + "learning_rate": 2.325113675195957e-05, + "loss": 0.0807, + "step": 31960 + }, + { + "epoch": 0.5700602860913923, + "grad_norm": 0.273908406496048, + "learning_rate": 2.324958406092479e-05, + "loss": 0.1563, + "step": 31961 + }, + { + "epoch": 0.5700781222131059, + "grad_norm": 0.31246012449264526, + "learning_rate": 2.3248031376675226e-05, + "loss": 0.1729, + "step": 31962 + }, + { + "epoch": 0.5700959583348196, + "grad_norm": 0.2307450920343399, + "learning_rate": 2.324647869921689e-05, + "loss": 0.0948, + "step": 31963 + }, + { + "epoch": 0.5701137944565333, + "grad_norm": 0.234095498919487, + "learning_rate": 2.3244926028555815e-05, + "loss": 0.1093, + "step": 31964 + }, + { + "epoch": 0.5701316305782471, + "grad_norm": 0.2592236399650574, + "learning_rate": 2.3243373364698006e-05, + "loss": 0.1204, + "step": 31965 + }, + { + "epoch": 0.5701494666999608, + "grad_norm": 0.25265398621559143, + "learning_rate": 2.3241820707649495e-05, + "loss": 0.146, + "step": 31966 + }, + { + "epoch": 0.5701673028216745, + "grad_norm": 0.3033604919910431, + "learning_rate": 2.32402680574163e-05, + "loss": 0.1488, + "step": 31967 + }, + { + "epoch": 0.5701851389433882, + "grad_norm": 0.3133663237094879, + "learning_rate": 2.3238715414004415e-05, + "loss": 0.1684, + "step": 31968 + }, + { + "epoch": 0.5702029750651019, + "grad_norm": 0.3297819197177887, + "learning_rate": 2.323716277741989e-05, + "loss": 0.1415, + "step": 31969 + }, + { + "epoch": 0.5702208111868156, + "grad_norm": 0.2560020387172699, + "learning_rate": 2.3235610147668735e-05, + "loss": 0.0864, + "step": 31970 + }, + { + "epoch": 0.5702386473085292, + "grad_norm": 0.23315803706645966, + "learning_rate": 2.3234057524756956e-05, + "loss": 0.1196, + "step": 31971 + }, + { + "epoch": 0.5702564834302429, + "grad_norm": 0.32611286640167236, + "learning_rate": 2.3232504908690585e-05, + "loss": 0.151, + "step": 31972 + }, + { + "epoch": 0.5702743195519566, + "grad_norm": 0.24620841443538666, + "learning_rate": 2.3230952299475628e-05, + "loss": 0.0744, + "step": 31973 + }, + { + "epoch": 0.5702921556736703, + "grad_norm": 0.24318484961986542, + "learning_rate": 2.3229399697118116e-05, + "loss": 0.1477, + "step": 31974 + }, + { + "epoch": 0.570309991795384, + "grad_norm": 0.24427008628845215, + "learning_rate": 2.322784710162407e-05, + "loss": 0.1613, + "step": 31975 + }, + { + "epoch": 0.5703278279170977, + "grad_norm": 0.23330405354499817, + "learning_rate": 2.3226294512999496e-05, + "loss": 0.1493, + "step": 31976 + }, + { + "epoch": 0.5703456640388114, + "grad_norm": 0.252483069896698, + "learning_rate": 2.3224741931250404e-05, + "loss": 0.1353, + "step": 31977 + }, + { + "epoch": 0.5703635001605251, + "grad_norm": 0.3242998421192169, + "learning_rate": 2.3223189356382836e-05, + "loss": 0.1127, + "step": 31978 + }, + { + "epoch": 0.5703813362822387, + "grad_norm": 0.2655237913131714, + "learning_rate": 2.32216367884028e-05, + "loss": 0.1298, + "step": 31979 + }, + { + "epoch": 0.5703991724039524, + "grad_norm": 0.27564719319343567, + "learning_rate": 2.322008422731632e-05, + "loss": 0.1236, + "step": 31980 + }, + { + "epoch": 0.5704170085256661, + "grad_norm": 0.29023295640945435, + "learning_rate": 2.321853167312939e-05, + "loss": 0.1997, + "step": 31981 + }, + { + "epoch": 0.5704348446473799, + "grad_norm": 0.2064339965581894, + "learning_rate": 2.3216979125848067e-05, + "loss": 0.0732, + "step": 31982 + }, + { + "epoch": 0.5704526807690936, + "grad_norm": 0.3277190625667572, + "learning_rate": 2.321542658547834e-05, + "loss": 0.1234, + "step": 31983 + }, + { + "epoch": 0.5704705168908073, + "grad_norm": 0.2326844185590744, + "learning_rate": 2.3213874052026234e-05, + "loss": 0.1093, + "step": 31984 + }, + { + "epoch": 0.570488353012521, + "grad_norm": 0.31463688611984253, + "learning_rate": 2.3212321525497776e-05, + "loss": 0.146, + "step": 31985 + }, + { + "epoch": 0.5705061891342347, + "grad_norm": 0.217011496424675, + "learning_rate": 2.3210769005898964e-05, + "loss": 0.087, + "step": 31986 + }, + { + "epoch": 0.5705240252559484, + "grad_norm": 0.36521267890930176, + "learning_rate": 2.3209216493235842e-05, + "loss": 0.1446, + "step": 31987 + }, + { + "epoch": 0.570541861377662, + "grad_norm": 0.22679482400417328, + "learning_rate": 2.3207663987514412e-05, + "loss": 0.1876, + "step": 31988 + }, + { + "epoch": 0.5705596974993757, + "grad_norm": 0.24948854744434357, + "learning_rate": 2.3206111488740702e-05, + "loss": 0.1131, + "step": 31989 + }, + { + "epoch": 0.5705775336210894, + "grad_norm": 0.2821401059627533, + "learning_rate": 2.3204558996920706e-05, + "loss": 0.1105, + "step": 31990 + }, + { + "epoch": 0.5705953697428031, + "grad_norm": 0.27452602982521057, + "learning_rate": 2.320300651206047e-05, + "loss": 0.1048, + "step": 31991 + }, + { + "epoch": 0.5706132058645168, + "grad_norm": 0.2265673726797104, + "learning_rate": 2.3201454034166e-05, + "loss": 0.1382, + "step": 31992 + }, + { + "epoch": 0.5706310419862305, + "grad_norm": 0.18851260840892792, + "learning_rate": 2.319990156324332e-05, + "loss": 0.1029, + "step": 31993 + }, + { + "epoch": 0.5706488781079442, + "grad_norm": 0.3642235994338989, + "learning_rate": 2.3198349099298433e-05, + "loss": 0.1312, + "step": 31994 + }, + { + "epoch": 0.5706667142296579, + "grad_norm": 0.31986698508262634, + "learning_rate": 2.3196796642337376e-05, + "loss": 0.1692, + "step": 31995 + }, + { + "epoch": 0.5706845503513716, + "grad_norm": 0.3594389855861664, + "learning_rate": 2.3195244192366158e-05, + "loss": 0.2242, + "step": 31996 + }, + { + "epoch": 0.5707023864730852, + "grad_norm": 0.3043067157268524, + "learning_rate": 2.31936917493908e-05, + "loss": 0.1267, + "step": 31997 + }, + { + "epoch": 0.5707202225947989, + "grad_norm": 0.23148998618125916, + "learning_rate": 2.3192139313417306e-05, + "loss": 0.1303, + "step": 31998 + }, + { + "epoch": 0.5707380587165127, + "grad_norm": 0.22920602560043335, + "learning_rate": 2.3190586884451705e-05, + "loss": 0.1424, + "step": 31999 + }, + { + "epoch": 0.5707558948382264, + "grad_norm": 0.27284783124923706, + "learning_rate": 2.318903446250002e-05, + "loss": 0.1461, + "step": 32000 + }, + { + "epoch": 0.5707558948382264, + "eval_loss": 0.1288762390613556, + "eval_runtime": 107.6868, + "eval_samples_per_second": 9.509, + "eval_steps_per_second": 1.588, + "step": 32000 + }, + { + "epoch": 0.5707737309599401, + "grad_norm": 0.24406161904335022, + "learning_rate": 2.3187482047568264e-05, + "loss": 0.1575, + "step": 32001 + }, + { + "epoch": 0.5707915670816538, + "grad_norm": 0.2429829239845276, + "learning_rate": 2.3185929639662457e-05, + "loss": 0.0983, + "step": 32002 + }, + { + "epoch": 0.5708094032033675, + "grad_norm": 0.29681476950645447, + "learning_rate": 2.3184377238788598e-05, + "loss": 0.1032, + "step": 32003 + }, + { + "epoch": 0.5708272393250812, + "grad_norm": 0.33057114481925964, + "learning_rate": 2.3182824844952733e-05, + "loss": 0.1621, + "step": 32004 + }, + { + "epoch": 0.5708450754467949, + "grad_norm": 0.2808226943016052, + "learning_rate": 2.3181272458160864e-05, + "loss": 0.1406, + "step": 32005 + }, + { + "epoch": 0.5708629115685085, + "grad_norm": 0.2762940526008606, + "learning_rate": 2.317972007841901e-05, + "loss": 0.1391, + "step": 32006 + }, + { + "epoch": 0.5708807476902222, + "grad_norm": 0.2615235447883606, + "learning_rate": 2.3178167705733185e-05, + "loss": 0.1292, + "step": 32007 + }, + { + "epoch": 0.5708985838119359, + "grad_norm": 0.26972535252571106, + "learning_rate": 2.3176615340109416e-05, + "loss": 0.103, + "step": 32008 + }, + { + "epoch": 0.5709164199336496, + "grad_norm": 0.26495981216430664, + "learning_rate": 2.3175062981553723e-05, + "loss": 0.1524, + "step": 32009 + }, + { + "epoch": 0.5709342560553633, + "grad_norm": 0.22156774997711182, + "learning_rate": 2.317351063007211e-05, + "loss": 0.1419, + "step": 32010 + }, + { + "epoch": 0.570952092177077, + "grad_norm": 0.33964696526527405, + "learning_rate": 2.31719582856706e-05, + "loss": 0.1563, + "step": 32011 + }, + { + "epoch": 0.5709699282987907, + "grad_norm": 0.2056892216205597, + "learning_rate": 2.3170405948355206e-05, + "loss": 0.0784, + "step": 32012 + }, + { + "epoch": 0.5709877644205044, + "grad_norm": 0.20111960172653198, + "learning_rate": 2.3168853618131952e-05, + "loss": 0.1085, + "step": 32013 + }, + { + "epoch": 0.571005600542218, + "grad_norm": 0.24330471456050873, + "learning_rate": 2.316730129500686e-05, + "loss": 0.1232, + "step": 32014 + }, + { + "epoch": 0.5710234366639318, + "grad_norm": 0.2663096487522125, + "learning_rate": 2.3165748978985944e-05, + "loss": 0.1221, + "step": 32015 + }, + { + "epoch": 0.5710412727856455, + "grad_norm": 0.2402796596288681, + "learning_rate": 2.31641966700752e-05, + "loss": 0.0941, + "step": 32016 + }, + { + "epoch": 0.5710591089073592, + "grad_norm": 0.2702605128288269, + "learning_rate": 2.3162644368280684e-05, + "loss": 0.1172, + "step": 32017 + }, + { + "epoch": 0.5710769450290729, + "grad_norm": 0.2644036114215851, + "learning_rate": 2.316109207360839e-05, + "loss": 0.1294, + "step": 32018 + }, + { + "epoch": 0.5710947811507866, + "grad_norm": 0.2989738881587982, + "learning_rate": 2.315953978606433e-05, + "loss": 0.1195, + "step": 32019 + }, + { + "epoch": 0.5711126172725003, + "grad_norm": 0.1917884647846222, + "learning_rate": 2.315798750565453e-05, + "loss": 0.0947, + "step": 32020 + }, + { + "epoch": 0.571130453394214, + "grad_norm": 0.27884235978126526, + "learning_rate": 2.3156435232385007e-05, + "loss": 0.1653, + "step": 32021 + }, + { + "epoch": 0.5711482895159277, + "grad_norm": 0.2796356976032257, + "learning_rate": 2.3154882966261784e-05, + "loss": 0.0953, + "step": 32022 + }, + { + "epoch": 0.5711661256376414, + "grad_norm": 0.32896968722343445, + "learning_rate": 2.315333070729087e-05, + "loss": 0.1703, + "step": 32023 + }, + { + "epoch": 0.571183961759355, + "grad_norm": 0.24676883220672607, + "learning_rate": 2.3151778455478287e-05, + "loss": 0.1382, + "step": 32024 + }, + { + "epoch": 0.5712017978810687, + "grad_norm": 0.23007610440254211, + "learning_rate": 2.315022621083004e-05, + "loss": 0.0919, + "step": 32025 + }, + { + "epoch": 0.5712196340027824, + "grad_norm": 0.25032544136047363, + "learning_rate": 2.3148673973352157e-05, + "loss": 0.1297, + "step": 32026 + }, + { + "epoch": 0.5712374701244961, + "grad_norm": 0.3636959493160248, + "learning_rate": 2.314712174305066e-05, + "loss": 0.1164, + "step": 32027 + }, + { + "epoch": 0.5712553062462098, + "grad_norm": 0.272922545671463, + "learning_rate": 2.3145569519931557e-05, + "loss": 0.1306, + "step": 32028 + }, + { + "epoch": 0.5712731423679235, + "grad_norm": 0.2876552641391754, + "learning_rate": 2.3144017304000857e-05, + "loss": 0.1617, + "step": 32029 + }, + { + "epoch": 0.5712909784896372, + "grad_norm": 0.4710802137851715, + "learning_rate": 2.3142465095264598e-05, + "loss": 0.1255, + "step": 32030 + }, + { + "epoch": 0.5713088146113509, + "grad_norm": 0.2886894643306732, + "learning_rate": 2.3140912893728786e-05, + "loss": 0.0996, + "step": 32031 + }, + { + "epoch": 0.5713266507330647, + "grad_norm": 0.27970659732818604, + "learning_rate": 2.3139360699399433e-05, + "loss": 0.1217, + "step": 32032 + }, + { + "epoch": 0.5713444868547783, + "grad_norm": 0.25713902711868286, + "learning_rate": 2.3137808512282558e-05, + "loss": 0.1298, + "step": 32033 + }, + { + "epoch": 0.571362322976492, + "grad_norm": 0.24366679787635803, + "learning_rate": 2.3136256332384182e-05, + "loss": 0.1432, + "step": 32034 + }, + { + "epoch": 0.5713801590982057, + "grad_norm": 0.3444467782974243, + "learning_rate": 2.3134704159710327e-05, + "loss": 0.1781, + "step": 32035 + }, + { + "epoch": 0.5713979952199194, + "grad_norm": 0.2424277663230896, + "learning_rate": 2.3133151994267003e-05, + "loss": 0.119, + "step": 32036 + }, + { + "epoch": 0.5714158313416331, + "grad_norm": 0.42974215745925903, + "learning_rate": 2.3131599836060226e-05, + "loss": 0.1411, + "step": 32037 + }, + { + "epoch": 0.5714336674633468, + "grad_norm": 0.21793238818645477, + "learning_rate": 2.3130047685095998e-05, + "loss": 0.1341, + "step": 32038 + }, + { + "epoch": 0.5714515035850605, + "grad_norm": 0.23208774626255035, + "learning_rate": 2.312849554138037e-05, + "loss": 0.1417, + "step": 32039 + }, + { + "epoch": 0.5714693397067742, + "grad_norm": 0.29587915539741516, + "learning_rate": 2.3126943404919328e-05, + "loss": 0.0714, + "step": 32040 + }, + { + "epoch": 0.5714871758284878, + "grad_norm": 0.5470567941665649, + "learning_rate": 2.3125391275718907e-05, + "loss": 0.1491, + "step": 32041 + }, + { + "epoch": 0.5715050119502015, + "grad_norm": 0.25324326753616333, + "learning_rate": 2.3123839153785107e-05, + "loss": 0.1425, + "step": 32042 + }, + { + "epoch": 0.5715228480719152, + "grad_norm": 0.3722260296344757, + "learning_rate": 2.3122287039123966e-05, + "loss": 0.1209, + "step": 32043 + }, + { + "epoch": 0.5715406841936289, + "grad_norm": 0.2965531051158905, + "learning_rate": 2.3120734931741487e-05, + "loss": 0.1132, + "step": 32044 + }, + { + "epoch": 0.5715585203153426, + "grad_norm": 0.2074170708656311, + "learning_rate": 2.311918283164369e-05, + "loss": 0.1377, + "step": 32045 + }, + { + "epoch": 0.5715763564370563, + "grad_norm": 0.24770928919315338, + "learning_rate": 2.311763073883658e-05, + "loss": 0.1758, + "step": 32046 + }, + { + "epoch": 0.57159419255877, + "grad_norm": 0.35495057702064514, + "learning_rate": 2.3116078653326192e-05, + "loss": 0.174, + "step": 32047 + }, + { + "epoch": 0.5716120286804837, + "grad_norm": 0.2525014877319336, + "learning_rate": 2.3114526575118527e-05, + "loss": 0.1753, + "step": 32048 + }, + { + "epoch": 0.5716298648021975, + "grad_norm": 0.20264236629009247, + "learning_rate": 2.3112974504219617e-05, + "loss": 0.1315, + "step": 32049 + }, + { + "epoch": 0.5716477009239112, + "grad_norm": 0.3439137041568756, + "learning_rate": 2.3111422440635457e-05, + "loss": 0.1561, + "step": 32050 + }, + { + "epoch": 0.5716655370456248, + "grad_norm": 0.24801819026470184, + "learning_rate": 2.310987038437209e-05, + "loss": 0.1004, + "step": 32051 + }, + { + "epoch": 0.5716833731673385, + "grad_norm": 0.28539639711380005, + "learning_rate": 2.3108318335435513e-05, + "loss": 0.1909, + "step": 32052 + }, + { + "epoch": 0.5717012092890522, + "grad_norm": 0.27709439396858215, + "learning_rate": 2.3106766293831752e-05, + "loss": 0.1241, + "step": 32053 + }, + { + "epoch": 0.5717190454107659, + "grad_norm": 0.3075057566165924, + "learning_rate": 2.310521425956681e-05, + "loss": 0.1156, + "step": 32054 + }, + { + "epoch": 0.5717368815324796, + "grad_norm": 0.5115877985954285, + "learning_rate": 2.310366223264671e-05, + "loss": 0.1382, + "step": 32055 + }, + { + "epoch": 0.5717547176541933, + "grad_norm": 0.2903379499912262, + "learning_rate": 2.3102110213077478e-05, + "loss": 0.1106, + "step": 32056 + }, + { + "epoch": 0.571772553775907, + "grad_norm": 0.24777671694755554, + "learning_rate": 2.3100558200865124e-05, + "loss": 0.0839, + "step": 32057 + }, + { + "epoch": 0.5717903898976207, + "grad_norm": 0.20852568745613098, + "learning_rate": 2.3099006196015662e-05, + "loss": 0.1269, + "step": 32058 + }, + { + "epoch": 0.5718082260193343, + "grad_norm": 0.31661632657051086, + "learning_rate": 2.3097454198535098e-05, + "loss": 0.1835, + "step": 32059 + }, + { + "epoch": 0.571826062141048, + "grad_norm": 0.24231091141700745, + "learning_rate": 2.3095902208429468e-05, + "loss": 0.0581, + "step": 32060 + }, + { + "epoch": 0.5718438982627617, + "grad_norm": 0.2693176567554474, + "learning_rate": 2.3094350225704773e-05, + "loss": 0.1361, + "step": 32061 + }, + { + "epoch": 0.5718617343844754, + "grad_norm": 0.20122888684272766, + "learning_rate": 2.3092798250367044e-05, + "loss": 0.0929, + "step": 32062 + }, + { + "epoch": 0.5718795705061891, + "grad_norm": 0.3333703577518463, + "learning_rate": 2.3091246282422275e-05, + "loss": 0.1147, + "step": 32063 + }, + { + "epoch": 0.5718974066279028, + "grad_norm": 0.28907883167266846, + "learning_rate": 2.3089694321876505e-05, + "loss": 0.1281, + "step": 32064 + }, + { + "epoch": 0.5719152427496165, + "grad_norm": 0.45795542001724243, + "learning_rate": 2.3088142368735738e-05, + "loss": 0.1485, + "step": 32065 + }, + { + "epoch": 0.5719330788713303, + "grad_norm": 0.2489219754934311, + "learning_rate": 2.3086590423005993e-05, + "loss": 0.1095, + "step": 32066 + }, + { + "epoch": 0.571950914993044, + "grad_norm": 0.2608923316001892, + "learning_rate": 2.3085038484693276e-05, + "loss": 0.1487, + "step": 32067 + }, + { + "epoch": 0.5719687511147576, + "grad_norm": 0.2482975870370865, + "learning_rate": 2.3083486553803617e-05, + "loss": 0.1135, + "step": 32068 + }, + { + "epoch": 0.5719865872364713, + "grad_norm": 0.22487591207027435, + "learning_rate": 2.308193463034302e-05, + "loss": 0.124, + "step": 32069 + }, + { + "epoch": 0.572004423358185, + "grad_norm": 0.22936218976974487, + "learning_rate": 2.308038271431752e-05, + "loss": 0.1364, + "step": 32070 + }, + { + "epoch": 0.5720222594798987, + "grad_norm": 0.28268882632255554, + "learning_rate": 2.3078830805733114e-05, + "loss": 0.165, + "step": 32071 + }, + { + "epoch": 0.5720400956016124, + "grad_norm": 0.3238881528377533, + "learning_rate": 2.3077278904595815e-05, + "loss": 0.1552, + "step": 32072 + }, + { + "epoch": 0.5720579317233261, + "grad_norm": 0.3021112382411957, + "learning_rate": 2.3075727010911655e-05, + "loss": 0.092, + "step": 32073 + }, + { + "epoch": 0.5720757678450398, + "grad_norm": 0.31802070140838623, + "learning_rate": 2.3074175124686643e-05, + "loss": 0.1321, + "step": 32074 + }, + { + "epoch": 0.5720936039667535, + "grad_norm": 0.27723920345306396, + "learning_rate": 2.307262324592679e-05, + "loss": 0.1191, + "step": 32075 + }, + { + "epoch": 0.5721114400884671, + "grad_norm": 0.31892484426498413, + "learning_rate": 2.3071071374638106e-05, + "loss": 0.1319, + "step": 32076 + }, + { + "epoch": 0.5721292762101808, + "grad_norm": 0.26982811093330383, + "learning_rate": 2.306951951082663e-05, + "loss": 0.1498, + "step": 32077 + }, + { + "epoch": 0.5721471123318945, + "grad_norm": 0.20521079003810883, + "learning_rate": 2.306796765449836e-05, + "loss": 0.1022, + "step": 32078 + }, + { + "epoch": 0.5721649484536082, + "grad_norm": 0.27309471368789673, + "learning_rate": 2.3066415805659316e-05, + "loss": 0.1624, + "step": 32079 + }, + { + "epoch": 0.5721827845753219, + "grad_norm": 0.23783299326896667, + "learning_rate": 2.306486396431551e-05, + "loss": 0.0787, + "step": 32080 + }, + { + "epoch": 0.5722006206970356, + "grad_norm": 0.34868064522743225, + "learning_rate": 2.3063312130472953e-05, + "loss": 0.1518, + "step": 32081 + }, + { + "epoch": 0.5722184568187493, + "grad_norm": 0.23732613027095795, + "learning_rate": 2.306176030413767e-05, + "loss": 0.1222, + "step": 32082 + }, + { + "epoch": 0.5722362929404631, + "grad_norm": 0.3081076145172119, + "learning_rate": 2.306020848531568e-05, + "loss": 0.1439, + "step": 32083 + }, + { + "epoch": 0.5722541290621768, + "grad_norm": 0.18338099122047424, + "learning_rate": 2.305865667401299e-05, + "loss": 0.1427, + "step": 32084 + }, + { + "epoch": 0.5722719651838905, + "grad_norm": 0.24480365216732025, + "learning_rate": 2.3057104870235606e-05, + "loss": 0.1032, + "step": 32085 + }, + { + "epoch": 0.5722898013056041, + "grad_norm": 0.2501121759414673, + "learning_rate": 2.305555307398957e-05, + "loss": 0.1679, + "step": 32086 + }, + { + "epoch": 0.5723076374273178, + "grad_norm": 0.32427743077278137, + "learning_rate": 2.305400128528088e-05, + "loss": 0.1343, + "step": 32087 + }, + { + "epoch": 0.5723254735490315, + "grad_norm": 0.33027711510658264, + "learning_rate": 2.3052449504115545e-05, + "loss": 0.1109, + "step": 32088 + }, + { + "epoch": 0.5723433096707452, + "grad_norm": 0.20214340090751648, + "learning_rate": 2.305089773049959e-05, + "loss": 0.1018, + "step": 32089 + }, + { + "epoch": 0.5723611457924589, + "grad_norm": 0.25955909490585327, + "learning_rate": 2.3049345964439028e-05, + "loss": 0.0906, + "step": 32090 + }, + { + "epoch": 0.5723789819141726, + "grad_norm": 0.24333368241786957, + "learning_rate": 2.3047794205939883e-05, + "loss": 0.1298, + "step": 32091 + }, + { + "epoch": 0.5723968180358863, + "grad_norm": 0.19318167865276337, + "learning_rate": 2.304624245500816e-05, + "loss": 0.1212, + "step": 32092 + }, + { + "epoch": 0.5724146541576, + "grad_norm": 0.24397949874401093, + "learning_rate": 2.3044690711649875e-05, + "loss": 0.1529, + "step": 32093 + }, + { + "epoch": 0.5724324902793136, + "grad_norm": 0.24525824189186096, + "learning_rate": 2.3043138975871033e-05, + "loss": 0.1114, + "step": 32094 + }, + { + "epoch": 0.5724503264010273, + "grad_norm": 0.2584737539291382, + "learning_rate": 2.3041587247677673e-05, + "loss": 0.0793, + "step": 32095 + }, + { + "epoch": 0.572468162522741, + "grad_norm": 0.2366800457239151, + "learning_rate": 2.3040035527075794e-05, + "loss": 0.1644, + "step": 32096 + }, + { + "epoch": 0.5724859986444547, + "grad_norm": 0.23789042234420776, + "learning_rate": 2.3038483814071416e-05, + "loss": 0.1215, + "step": 32097 + }, + { + "epoch": 0.5725038347661684, + "grad_norm": 0.2214551717042923, + "learning_rate": 2.3036932108670543e-05, + "loss": 0.1402, + "step": 32098 + }, + { + "epoch": 0.5725216708878821, + "grad_norm": 0.280853271484375, + "learning_rate": 2.3035380410879208e-05, + "loss": 0.1227, + "step": 32099 + }, + { + "epoch": 0.5725395070095959, + "grad_norm": 0.3992501497268677, + "learning_rate": 2.3033828720703417e-05, + "loss": 0.0877, + "step": 32100 + }, + { + "epoch": 0.5725573431313096, + "grad_norm": 0.24538998305797577, + "learning_rate": 2.3032277038149185e-05, + "loss": 0.1155, + "step": 32101 + }, + { + "epoch": 0.5725751792530233, + "grad_norm": 0.27168336510658264, + "learning_rate": 2.3030725363222518e-05, + "loss": 0.0808, + "step": 32102 + }, + { + "epoch": 0.572593015374737, + "grad_norm": 0.33248263597488403, + "learning_rate": 2.3029173695929445e-05, + "loss": 0.1536, + "step": 32103 + }, + { + "epoch": 0.5726108514964506, + "grad_norm": 0.2076752632856369, + "learning_rate": 2.302762203627598e-05, + "loss": 0.1611, + "step": 32104 + }, + { + "epoch": 0.5726286876181643, + "grad_norm": 0.21981672942638397, + "learning_rate": 2.3026070384268132e-05, + "loss": 0.1324, + "step": 32105 + }, + { + "epoch": 0.572646523739878, + "grad_norm": 0.3746435046195984, + "learning_rate": 2.302451873991192e-05, + "loss": 0.0712, + "step": 32106 + }, + { + "epoch": 0.5726643598615917, + "grad_norm": 0.23974384367465973, + "learning_rate": 2.3022967103213346e-05, + "loss": 0.1133, + "step": 32107 + }, + { + "epoch": 0.5726821959833054, + "grad_norm": 0.3026624023914337, + "learning_rate": 2.302141547417844e-05, + "loss": 0.1432, + "step": 32108 + }, + { + "epoch": 0.5727000321050191, + "grad_norm": 0.22384096682071686, + "learning_rate": 2.3019863852813207e-05, + "loss": 0.1138, + "step": 32109 + }, + { + "epoch": 0.5727178682267328, + "grad_norm": 0.25508683919906616, + "learning_rate": 2.3018312239123675e-05, + "loss": 0.117, + "step": 32110 + }, + { + "epoch": 0.5727357043484465, + "grad_norm": 0.694191575050354, + "learning_rate": 2.3016760633115834e-05, + "loss": 0.2424, + "step": 32111 + }, + { + "epoch": 0.5727535404701601, + "grad_norm": 0.3212147653102875, + "learning_rate": 2.3015209034795725e-05, + "loss": 0.1275, + "step": 32112 + }, + { + "epoch": 0.5727713765918738, + "grad_norm": 0.22423692047595978, + "learning_rate": 2.301365744416935e-05, + "loss": 0.1405, + "step": 32113 + }, + { + "epoch": 0.5727892127135875, + "grad_norm": 0.2901543378829956, + "learning_rate": 2.301210586124273e-05, + "loss": 0.2205, + "step": 32114 + }, + { + "epoch": 0.5728070488353012, + "grad_norm": 0.23535673320293427, + "learning_rate": 2.301055428602186e-05, + "loss": 0.1353, + "step": 32115 + }, + { + "epoch": 0.572824884957015, + "grad_norm": 0.2565169930458069, + "learning_rate": 2.300900271851278e-05, + "loss": 0.1001, + "step": 32116 + }, + { + "epoch": 0.5728427210787287, + "grad_norm": 0.23519715666770935, + "learning_rate": 2.3007451158721488e-05, + "loss": 0.0915, + "step": 32117 + }, + { + "epoch": 0.5728605572004424, + "grad_norm": 0.20799244940280914, + "learning_rate": 2.300589960665401e-05, + "loss": 0.1578, + "step": 32118 + }, + { + "epoch": 0.5728783933221561, + "grad_norm": 0.23612584173679352, + "learning_rate": 2.300434806231635e-05, + "loss": 0.1592, + "step": 32119 + }, + { + "epoch": 0.5728962294438698, + "grad_norm": 0.27974092960357666, + "learning_rate": 2.3002796525714517e-05, + "loss": 0.1455, + "step": 32120 + }, + { + "epoch": 0.5729140655655834, + "grad_norm": 0.20070381462574005, + "learning_rate": 2.300124499685455e-05, + "loss": 0.1346, + "step": 32121 + }, + { + "epoch": 0.5729319016872971, + "grad_norm": 0.24519819021224976, + "learning_rate": 2.299969347574244e-05, + "loss": 0.0985, + "step": 32122 + }, + { + "epoch": 0.5729497378090108, + "grad_norm": 0.260475218296051, + "learning_rate": 2.299814196238421e-05, + "loss": 0.1423, + "step": 32123 + }, + { + "epoch": 0.5729675739307245, + "grad_norm": 0.27158546447753906, + "learning_rate": 2.299659045678587e-05, + "loss": 0.1368, + "step": 32124 + }, + { + "epoch": 0.5729854100524382, + "grad_norm": 0.19083476066589355, + "learning_rate": 2.2995038958953437e-05, + "loss": 0.111, + "step": 32125 + }, + { + "epoch": 0.5730032461741519, + "grad_norm": 0.2080836296081543, + "learning_rate": 2.2993487468892934e-05, + "loss": 0.1352, + "step": 32126 + }, + { + "epoch": 0.5730210822958656, + "grad_norm": 0.23523162305355072, + "learning_rate": 2.2991935986610365e-05, + "loss": 0.1105, + "step": 32127 + }, + { + "epoch": 0.5730389184175793, + "grad_norm": 0.2426353543996811, + "learning_rate": 2.2990384512111735e-05, + "loss": 0.1048, + "step": 32128 + }, + { + "epoch": 0.573056754539293, + "grad_norm": 0.20853424072265625, + "learning_rate": 2.2988833045403076e-05, + "loss": 0.0951, + "step": 32129 + }, + { + "epoch": 0.5730745906610066, + "grad_norm": 0.24359837174415588, + "learning_rate": 2.29872815864904e-05, + "loss": 0.1272, + "step": 32130 + }, + { + "epoch": 0.5730924267827203, + "grad_norm": 0.25653213262557983, + "learning_rate": 2.298573013537971e-05, + "loss": 0.1057, + "step": 32131 + }, + { + "epoch": 0.573110262904434, + "grad_norm": 0.292688250541687, + "learning_rate": 2.2984178692077027e-05, + "loss": 0.1614, + "step": 32132 + }, + { + "epoch": 0.5731280990261478, + "grad_norm": 0.22703872621059418, + "learning_rate": 2.2982627256588353e-05, + "loss": 0.0945, + "step": 32133 + }, + { + "epoch": 0.5731459351478615, + "grad_norm": 0.29195770621299744, + "learning_rate": 2.2981075828919728e-05, + "loss": 0.1472, + "step": 32134 + }, + { + "epoch": 0.5731637712695752, + "grad_norm": 0.3011448383331299, + "learning_rate": 2.2979524409077145e-05, + "loss": 0.1406, + "step": 32135 + }, + { + "epoch": 0.5731816073912889, + "grad_norm": 0.3198857009410858, + "learning_rate": 2.2977972997066622e-05, + "loss": 0.184, + "step": 32136 + }, + { + "epoch": 0.5731994435130026, + "grad_norm": 0.21236436069011688, + "learning_rate": 2.2976421592894172e-05, + "loss": 0.0994, + "step": 32137 + }, + { + "epoch": 0.5732172796347162, + "grad_norm": 0.2372962385416031, + "learning_rate": 2.297487019656581e-05, + "loss": 0.1371, + "step": 32138 + }, + { + "epoch": 0.5732351157564299, + "grad_norm": 0.371499240398407, + "learning_rate": 2.297331880808756e-05, + "loss": 0.1349, + "step": 32139 + }, + { + "epoch": 0.5732529518781436, + "grad_norm": 0.2742388844490051, + "learning_rate": 2.2971767427465425e-05, + "loss": 0.1859, + "step": 32140 + }, + { + "epoch": 0.5732707879998573, + "grad_norm": 0.26087868213653564, + "learning_rate": 2.2970216054705406e-05, + "loss": 0.1091, + "step": 32141 + }, + { + "epoch": 0.573288624121571, + "grad_norm": 0.2959029972553253, + "learning_rate": 2.2968664689813543e-05, + "loss": 0.1544, + "step": 32142 + }, + { + "epoch": 0.5733064602432847, + "grad_norm": 0.26370272040367126, + "learning_rate": 2.2967113332795838e-05, + "loss": 0.1608, + "step": 32143 + }, + { + "epoch": 0.5733242963649984, + "grad_norm": 0.36787140369415283, + "learning_rate": 2.29655619836583e-05, + "loss": 0.1148, + "step": 32144 + }, + { + "epoch": 0.5733421324867121, + "grad_norm": 0.3151266872882843, + "learning_rate": 2.2964010642406948e-05, + "loss": 0.1205, + "step": 32145 + }, + { + "epoch": 0.5733599686084258, + "grad_norm": 0.2222466766834259, + "learning_rate": 2.2962459309047784e-05, + "loss": 0.1555, + "step": 32146 + }, + { + "epoch": 0.5733778047301394, + "grad_norm": 0.2472175806760788, + "learning_rate": 2.2960907983586844e-05, + "loss": 0.1094, + "step": 32147 + }, + { + "epoch": 0.5733956408518531, + "grad_norm": 0.23866930603981018, + "learning_rate": 2.295935666603013e-05, + "loss": 0.1108, + "step": 32148 + }, + { + "epoch": 0.5734134769735668, + "grad_norm": 0.21956735849380493, + "learning_rate": 2.2957805356383654e-05, + "loss": 0.1442, + "step": 32149 + }, + { + "epoch": 0.5734313130952806, + "grad_norm": 0.21173205971717834, + "learning_rate": 2.2956254054653415e-05, + "loss": 0.102, + "step": 32150 + }, + { + "epoch": 0.5734491492169943, + "grad_norm": 0.2393854707479477, + "learning_rate": 2.2954702760845452e-05, + "loss": 0.1332, + "step": 32151 + }, + { + "epoch": 0.573466985338708, + "grad_norm": 0.22613154351711273, + "learning_rate": 2.2953151474965768e-05, + "loss": 0.1505, + "step": 32152 + }, + { + "epoch": 0.5734848214604217, + "grad_norm": 0.311028391122818, + "learning_rate": 2.2951600197020377e-05, + "loss": 0.1001, + "step": 32153 + }, + { + "epoch": 0.5735026575821354, + "grad_norm": 0.2462809830904007, + "learning_rate": 2.295004892701528e-05, + "loss": 0.1326, + "step": 32154 + }, + { + "epoch": 0.573520493703849, + "grad_norm": 0.2720085680484772, + "learning_rate": 2.2948497664956514e-05, + "loss": 0.138, + "step": 32155 + }, + { + "epoch": 0.5735383298255627, + "grad_norm": 0.32655903697013855, + "learning_rate": 2.294694641085008e-05, + "loss": 0.1129, + "step": 32156 + }, + { + "epoch": 0.5735561659472764, + "grad_norm": 0.17895837128162384, + "learning_rate": 2.294539516470199e-05, + "loss": 0.1469, + "step": 32157 + }, + { + "epoch": 0.5735740020689901, + "grad_norm": 0.2596011161804199, + "learning_rate": 2.294384392651825e-05, + "loss": 0.1651, + "step": 32158 + }, + { + "epoch": 0.5735918381907038, + "grad_norm": 0.21749746799468994, + "learning_rate": 2.2942292696304878e-05, + "loss": 0.1599, + "step": 32159 + }, + { + "epoch": 0.5736096743124175, + "grad_norm": 0.21060678362846375, + "learning_rate": 2.2940741474067902e-05, + "loss": 0.1095, + "step": 32160 + }, + { + "epoch": 0.5736275104341312, + "grad_norm": 0.27059638500213623, + "learning_rate": 2.2939190259813324e-05, + "loss": 0.1457, + "step": 32161 + }, + { + "epoch": 0.5736453465558449, + "grad_norm": 0.3091186583042145, + "learning_rate": 2.2937639053547155e-05, + "loss": 0.1242, + "step": 32162 + }, + { + "epoch": 0.5736631826775586, + "grad_norm": 0.3126698136329651, + "learning_rate": 2.2936087855275398e-05, + "loss": 0.1473, + "step": 32163 + }, + { + "epoch": 0.5736810187992722, + "grad_norm": 0.31462979316711426, + "learning_rate": 2.293453666500409e-05, + "loss": 0.1228, + "step": 32164 + }, + { + "epoch": 0.5736988549209859, + "grad_norm": 0.362069308757782, + "learning_rate": 2.2932985482739223e-05, + "loss": 0.1807, + "step": 32165 + }, + { + "epoch": 0.5737166910426996, + "grad_norm": 0.39632242918014526, + "learning_rate": 2.2931434308486826e-05, + "loss": 0.1493, + "step": 32166 + }, + { + "epoch": 0.5737345271644134, + "grad_norm": 0.28127777576446533, + "learning_rate": 2.292988314225289e-05, + "loss": 0.1157, + "step": 32167 + }, + { + "epoch": 0.5737523632861271, + "grad_norm": 0.23270058631896973, + "learning_rate": 2.2928331984043454e-05, + "loss": 0.1053, + "step": 32168 + }, + { + "epoch": 0.5737701994078408, + "grad_norm": 0.23575817048549652, + "learning_rate": 2.2926780833864523e-05, + "loss": 0.1141, + "step": 32169 + }, + { + "epoch": 0.5737880355295545, + "grad_norm": 0.23141250014305115, + "learning_rate": 2.2925229691722102e-05, + "loss": 0.1524, + "step": 32170 + }, + { + "epoch": 0.5738058716512682, + "grad_norm": 0.24286416172981262, + "learning_rate": 2.29236785576222e-05, + "loss": 0.109, + "step": 32171 + }, + { + "epoch": 0.5738237077729819, + "grad_norm": 0.22819821536540985, + "learning_rate": 2.292212743157084e-05, + "loss": 0.1425, + "step": 32172 + }, + { + "epoch": 0.5738415438946955, + "grad_norm": 0.5288501977920532, + "learning_rate": 2.2920576313574033e-05, + "loss": 0.1163, + "step": 32173 + }, + { + "epoch": 0.5738593800164092, + "grad_norm": 0.2385987788438797, + "learning_rate": 2.2919025203637793e-05, + "loss": 0.0881, + "step": 32174 + }, + { + "epoch": 0.5738772161381229, + "grad_norm": 0.3185761868953705, + "learning_rate": 2.291747410176813e-05, + "loss": 0.1443, + "step": 32175 + }, + { + "epoch": 0.5738950522598366, + "grad_norm": 0.20291152596473694, + "learning_rate": 2.2915923007971046e-05, + "loss": 0.1127, + "step": 32176 + }, + { + "epoch": 0.5739128883815503, + "grad_norm": 0.2372930943965912, + "learning_rate": 2.291437192225258e-05, + "loss": 0.1167, + "step": 32177 + }, + { + "epoch": 0.573930724503264, + "grad_norm": 0.2707439064979553, + "learning_rate": 2.291282084461872e-05, + "loss": 0.1334, + "step": 32178 + }, + { + "epoch": 0.5739485606249777, + "grad_norm": 0.2781921625137329, + "learning_rate": 2.291126977507549e-05, + "loss": 0.1281, + "step": 32179 + }, + { + "epoch": 0.5739663967466914, + "grad_norm": 0.21838612854480743, + "learning_rate": 2.2909718713628888e-05, + "loss": 0.1075, + "step": 32180 + }, + { + "epoch": 0.573984232868405, + "grad_norm": 0.23490910232067108, + "learning_rate": 2.2908167660284952e-05, + "loss": 0.1509, + "step": 32181 + }, + { + "epoch": 0.5740020689901187, + "grad_norm": 0.22514356672763824, + "learning_rate": 2.290661661504968e-05, + "loss": 0.1067, + "step": 32182 + }, + { + "epoch": 0.5740199051118324, + "grad_norm": 0.25914818048477173, + "learning_rate": 2.2905065577929085e-05, + "loss": 0.1273, + "step": 32183 + }, + { + "epoch": 0.5740377412335462, + "grad_norm": 0.25469735264778137, + "learning_rate": 2.2903514548929185e-05, + "loss": 0.1143, + "step": 32184 + }, + { + "epoch": 0.5740555773552599, + "grad_norm": 0.23247644305229187, + "learning_rate": 2.2901963528055966e-05, + "loss": 0.1127, + "step": 32185 + }, + { + "epoch": 0.5740734134769736, + "grad_norm": 0.3107673227787018, + "learning_rate": 2.2900412515315473e-05, + "loss": 0.1773, + "step": 32186 + }, + { + "epoch": 0.5740912495986873, + "grad_norm": 0.27206945419311523, + "learning_rate": 2.289886151071371e-05, + "loss": 0.1132, + "step": 32187 + }, + { + "epoch": 0.574109085720401, + "grad_norm": 0.41328123211860657, + "learning_rate": 2.2897310514256687e-05, + "loss": 0.1392, + "step": 32188 + }, + { + "epoch": 0.5741269218421147, + "grad_norm": 0.3105325698852539, + "learning_rate": 2.28957595259504e-05, + "loss": 0.13, + "step": 32189 + }, + { + "epoch": 0.5741447579638284, + "grad_norm": 0.22780832648277283, + "learning_rate": 2.289420854580089e-05, + "loss": 0.0973, + "step": 32190 + }, + { + "epoch": 0.574162594085542, + "grad_norm": 0.2111360728740692, + "learning_rate": 2.2892657573814153e-05, + "loss": 0.0907, + "step": 32191 + }, + { + "epoch": 0.5741804302072557, + "grad_norm": 0.24277439713478088, + "learning_rate": 2.28911066099962e-05, + "loss": 0.1429, + "step": 32192 + }, + { + "epoch": 0.5741982663289694, + "grad_norm": 0.25361335277557373, + "learning_rate": 2.2889555654353046e-05, + "loss": 0.1214, + "step": 32193 + }, + { + "epoch": 0.5742161024506831, + "grad_norm": 0.2720125615596771, + "learning_rate": 2.2888004706890702e-05, + "loss": 0.1039, + "step": 32194 + }, + { + "epoch": 0.5742339385723968, + "grad_norm": 0.26404550671577454, + "learning_rate": 2.2886453767615185e-05, + "loss": 0.1523, + "step": 32195 + }, + { + "epoch": 0.5742517746941105, + "grad_norm": 0.3610498905181885, + "learning_rate": 2.2884902836532504e-05, + "loss": 0.1361, + "step": 32196 + }, + { + "epoch": 0.5742696108158242, + "grad_norm": 0.28425687551498413, + "learning_rate": 2.288335191364867e-05, + "loss": 0.1539, + "step": 32197 + }, + { + "epoch": 0.5742874469375379, + "grad_norm": 0.23330342769622803, + "learning_rate": 2.2881800998969687e-05, + "loss": 0.1322, + "step": 32198 + }, + { + "epoch": 0.5743052830592515, + "grad_norm": 0.2934582531452179, + "learning_rate": 2.2880250092501583e-05, + "loss": 0.149, + "step": 32199 + }, + { + "epoch": 0.5743231191809652, + "grad_norm": 0.2155960649251938, + "learning_rate": 2.287869919425036e-05, + "loss": 0.1272, + "step": 32200 + }, + { + "epoch": 0.574340955302679, + "grad_norm": 0.21304568648338318, + "learning_rate": 2.2877148304222033e-05, + "loss": 0.1113, + "step": 32201 + }, + { + "epoch": 0.5743587914243927, + "grad_norm": 0.3859685957431793, + "learning_rate": 2.2875597422422602e-05, + "loss": 0.1098, + "step": 32202 + }, + { + "epoch": 0.5743766275461064, + "grad_norm": 0.24139446020126343, + "learning_rate": 2.2874046548858103e-05, + "loss": 0.1257, + "step": 32203 + }, + { + "epoch": 0.5743944636678201, + "grad_norm": 0.23132754862308502, + "learning_rate": 2.287249568353453e-05, + "loss": 0.0869, + "step": 32204 + }, + { + "epoch": 0.5744122997895338, + "grad_norm": 0.23843498528003693, + "learning_rate": 2.28709448264579e-05, + "loss": 0.1416, + "step": 32205 + }, + { + "epoch": 0.5744301359112475, + "grad_norm": 0.2811092138290405, + "learning_rate": 2.2869393977634212e-05, + "loss": 0.1327, + "step": 32206 + }, + { + "epoch": 0.5744479720329612, + "grad_norm": 0.2144453227519989, + "learning_rate": 2.28678431370695e-05, + "loss": 0.0985, + "step": 32207 + }, + { + "epoch": 0.5744658081546749, + "grad_norm": 0.33172857761383057, + "learning_rate": 2.286629230476976e-05, + "loss": 0.1485, + "step": 32208 + }, + { + "epoch": 0.5744836442763885, + "grad_norm": 0.2953266203403473, + "learning_rate": 2.2864741480741014e-05, + "loss": 0.1626, + "step": 32209 + }, + { + "epoch": 0.5745014803981022, + "grad_norm": 0.2817229926586151, + "learning_rate": 2.2863190664989264e-05, + "loss": 0.1411, + "step": 32210 + }, + { + "epoch": 0.5745193165198159, + "grad_norm": 0.31147894263267517, + "learning_rate": 2.2861639857520518e-05, + "loss": 0.179, + "step": 32211 + }, + { + "epoch": 0.5745371526415296, + "grad_norm": 0.3016244173049927, + "learning_rate": 2.2860089058340802e-05, + "loss": 0.1514, + "step": 32212 + }, + { + "epoch": 0.5745549887632433, + "grad_norm": 0.3923414945602417, + "learning_rate": 2.2858538267456117e-05, + "loss": 0.1341, + "step": 32213 + }, + { + "epoch": 0.574572824884957, + "grad_norm": 0.3515370488166809, + "learning_rate": 2.2856987484872484e-05, + "loss": 0.1453, + "step": 32214 + }, + { + "epoch": 0.5745906610066707, + "grad_norm": 0.24974365532398224, + "learning_rate": 2.2855436710595892e-05, + "loss": 0.1069, + "step": 32215 + }, + { + "epoch": 0.5746084971283844, + "grad_norm": 0.2985091805458069, + "learning_rate": 2.2853885944632383e-05, + "loss": 0.0915, + "step": 32216 + }, + { + "epoch": 0.5746263332500982, + "grad_norm": 0.2817144989967346, + "learning_rate": 2.285233518698795e-05, + "loss": 0.1239, + "step": 32217 + }, + { + "epoch": 0.5746441693718118, + "grad_norm": 0.3197835087776184, + "learning_rate": 2.2850784437668613e-05, + "loss": 0.1594, + "step": 32218 + }, + { + "epoch": 0.5746620054935255, + "grad_norm": 0.3060835003852844, + "learning_rate": 2.2849233696680362e-05, + "loss": 0.1126, + "step": 32219 + }, + { + "epoch": 0.5746798416152392, + "grad_norm": 0.2791560888290405, + "learning_rate": 2.2847682964029236e-05, + "loss": 0.185, + "step": 32220 + }, + { + "epoch": 0.5746976777369529, + "grad_norm": 0.28000256419181824, + "learning_rate": 2.284613223972123e-05, + "loss": 0.1343, + "step": 32221 + }, + { + "epoch": 0.5747155138586666, + "grad_norm": 0.25441911816596985, + "learning_rate": 2.2844581523762365e-05, + "loss": 0.155, + "step": 32222 + }, + { + "epoch": 0.5747333499803803, + "grad_norm": 0.2840367555618286, + "learning_rate": 2.2843030816158644e-05, + "loss": 0.1878, + "step": 32223 + }, + { + "epoch": 0.574751186102094, + "grad_norm": 0.3270607888698578, + "learning_rate": 2.284148011691607e-05, + "loss": 0.1382, + "step": 32224 + }, + { + "epoch": 0.5747690222238077, + "grad_norm": 0.3001517653465271, + "learning_rate": 2.283992942604068e-05, + "loss": 0.0993, + "step": 32225 + }, + { + "epoch": 0.5747868583455213, + "grad_norm": 0.37920400500297546, + "learning_rate": 2.283837874353846e-05, + "loss": 0.1394, + "step": 32226 + }, + { + "epoch": 0.574804694467235, + "grad_norm": 0.21001584827899933, + "learning_rate": 2.2836828069415435e-05, + "loss": 0.128, + "step": 32227 + }, + { + "epoch": 0.5748225305889487, + "grad_norm": 0.25147759914398193, + "learning_rate": 2.283527740367761e-05, + "loss": 0.1445, + "step": 32228 + }, + { + "epoch": 0.5748403667106624, + "grad_norm": 0.26965656876564026, + "learning_rate": 2.2833726746330995e-05, + "loss": 0.1321, + "step": 32229 + }, + { + "epoch": 0.5748582028323761, + "grad_norm": 0.24212568998336792, + "learning_rate": 2.283217609738161e-05, + "loss": 0.0776, + "step": 32230 + }, + { + "epoch": 0.5748760389540898, + "grad_norm": 0.1760663092136383, + "learning_rate": 2.2830625456835456e-05, + "loss": 0.0613, + "step": 32231 + }, + { + "epoch": 0.5748938750758035, + "grad_norm": 0.272725909948349, + "learning_rate": 2.282907482469854e-05, + "loss": 0.1953, + "step": 32232 + }, + { + "epoch": 0.5749117111975172, + "grad_norm": 0.23237788677215576, + "learning_rate": 2.2827524200976887e-05, + "loss": 0.108, + "step": 32233 + }, + { + "epoch": 0.574929547319231, + "grad_norm": 0.23438678681850433, + "learning_rate": 2.2825973585676504e-05, + "loss": 0.1062, + "step": 32234 + }, + { + "epoch": 0.5749473834409446, + "grad_norm": 0.2420775592327118, + "learning_rate": 2.282442297880339e-05, + "loss": 0.1212, + "step": 32235 + }, + { + "epoch": 0.5749652195626583, + "grad_norm": 0.26243868470191956, + "learning_rate": 2.2822872380363572e-05, + "loss": 0.116, + "step": 32236 + }, + { + "epoch": 0.574983055684372, + "grad_norm": 0.2642301023006439, + "learning_rate": 2.282132179036304e-05, + "loss": 0.1473, + "step": 32237 + }, + { + "epoch": 0.5750008918060857, + "grad_norm": 0.19997823238372803, + "learning_rate": 2.281977120880783e-05, + "loss": 0.1308, + "step": 32238 + }, + { + "epoch": 0.5750187279277994, + "grad_norm": 0.2773137092590332, + "learning_rate": 2.281822063570394e-05, + "loss": 0.13, + "step": 32239 + }, + { + "epoch": 0.5750365640495131, + "grad_norm": 0.418454647064209, + "learning_rate": 2.2816670071057373e-05, + "loss": 0.1025, + "step": 32240 + }, + { + "epoch": 0.5750544001712268, + "grad_norm": 0.32250580191612244, + "learning_rate": 2.2815119514874144e-05, + "loss": 0.1277, + "step": 32241 + }, + { + "epoch": 0.5750722362929405, + "grad_norm": 0.3185238838195801, + "learning_rate": 2.281356896716027e-05, + "loss": 0.1632, + "step": 32242 + }, + { + "epoch": 0.5750900724146542, + "grad_norm": 0.24539116024971008, + "learning_rate": 2.2812018427921767e-05, + "loss": 0.2008, + "step": 32243 + }, + { + "epoch": 0.5751079085363678, + "grad_norm": 0.24439223110675812, + "learning_rate": 2.281046789716463e-05, + "loss": 0.0915, + "step": 32244 + }, + { + "epoch": 0.5751257446580815, + "grad_norm": 0.26436668634414673, + "learning_rate": 2.2808917374894866e-05, + "loss": 0.1539, + "step": 32245 + }, + { + "epoch": 0.5751435807797952, + "grad_norm": 0.27153873443603516, + "learning_rate": 2.280736686111851e-05, + "loss": 0.1698, + "step": 32246 + }, + { + "epoch": 0.5751614169015089, + "grad_norm": 0.26940545439720154, + "learning_rate": 2.280581635584155e-05, + "loss": 0.1216, + "step": 32247 + }, + { + "epoch": 0.5751792530232226, + "grad_norm": 0.22699624300003052, + "learning_rate": 2.2804265859070006e-05, + "loss": 0.1065, + "step": 32248 + }, + { + "epoch": 0.5751970891449363, + "grad_norm": 0.22343863546848297, + "learning_rate": 2.2802715370809888e-05, + "loss": 0.0993, + "step": 32249 + }, + { + "epoch": 0.57521492526665, + "grad_norm": 0.27126502990722656, + "learning_rate": 2.280116489106719e-05, + "loss": 0.1248, + "step": 32250 + }, + { + "epoch": 0.5752327613883638, + "grad_norm": 0.231038436293602, + "learning_rate": 2.2799614419847953e-05, + "loss": 0.1411, + "step": 32251 + }, + { + "epoch": 0.5752505975100775, + "grad_norm": 0.2828325033187866, + "learning_rate": 2.279806395715817e-05, + "loss": 0.1709, + "step": 32252 + }, + { + "epoch": 0.5752684336317911, + "grad_norm": 0.27043285965919495, + "learning_rate": 2.2796513503003848e-05, + "loss": 0.0826, + "step": 32253 + }, + { + "epoch": 0.5752862697535048, + "grad_norm": 0.23741087317466736, + "learning_rate": 2.279496305739099e-05, + "loss": 0.1213, + "step": 32254 + }, + { + "epoch": 0.5753041058752185, + "grad_norm": 0.2832827866077423, + "learning_rate": 2.279341262032563e-05, + "loss": 0.1685, + "step": 32255 + }, + { + "epoch": 0.5753219419969322, + "grad_norm": 0.29893118143081665, + "learning_rate": 2.279186219181376e-05, + "loss": 0.1193, + "step": 32256 + }, + { + "epoch": 0.5753397781186459, + "grad_norm": 0.28755274415016174, + "learning_rate": 2.2790311771861397e-05, + "loss": 0.1035, + "step": 32257 + }, + { + "epoch": 0.5753576142403596, + "grad_norm": 0.22959105670452118, + "learning_rate": 2.278876136047454e-05, + "loss": 0.1221, + "step": 32258 + }, + { + "epoch": 0.5753754503620733, + "grad_norm": 0.191476970911026, + "learning_rate": 2.278721095765922e-05, + "loss": 0.1004, + "step": 32259 + }, + { + "epoch": 0.575393286483787, + "grad_norm": 0.49067234992980957, + "learning_rate": 2.2785660563421432e-05, + "loss": 0.1197, + "step": 32260 + }, + { + "epoch": 0.5754111226055006, + "grad_norm": 0.30326080322265625, + "learning_rate": 2.2784110177767186e-05, + "loss": 0.1522, + "step": 32261 + }, + { + "epoch": 0.5754289587272143, + "grad_norm": 0.20653213560581207, + "learning_rate": 2.2782559800702494e-05, + "loss": 0.0968, + "step": 32262 + }, + { + "epoch": 0.575446794848928, + "grad_norm": 0.2553901672363281, + "learning_rate": 2.278100943223336e-05, + "loss": 0.1475, + "step": 32263 + }, + { + "epoch": 0.5754646309706417, + "grad_norm": 0.21338555216789246, + "learning_rate": 2.2779459072365808e-05, + "loss": 0.1258, + "step": 32264 + }, + { + "epoch": 0.5754824670923554, + "grad_norm": 0.28063151240348816, + "learning_rate": 2.2777908721105843e-05, + "loss": 0.1185, + "step": 32265 + }, + { + "epoch": 0.5755003032140691, + "grad_norm": 0.3025532066822052, + "learning_rate": 2.277635837845947e-05, + "loss": 0.0916, + "step": 32266 + }, + { + "epoch": 0.5755181393357828, + "grad_norm": 0.30818408727645874, + "learning_rate": 2.2774808044432688e-05, + "loss": 0.1837, + "step": 32267 + }, + { + "epoch": 0.5755359754574966, + "grad_norm": 0.21512390673160553, + "learning_rate": 2.277325771903153e-05, + "loss": 0.1575, + "step": 32268 + }, + { + "epoch": 0.5755538115792103, + "grad_norm": 0.31319689750671387, + "learning_rate": 2.2771707402261988e-05, + "loss": 0.1036, + "step": 32269 + }, + { + "epoch": 0.575571647700924, + "grad_norm": 0.27915140986442566, + "learning_rate": 2.2770157094130084e-05, + "loss": 0.1317, + "step": 32270 + }, + { + "epoch": 0.5755894838226376, + "grad_norm": 0.19175270199775696, + "learning_rate": 2.276860679464181e-05, + "loss": 0.0915, + "step": 32271 + }, + { + "epoch": 0.5756073199443513, + "grad_norm": 0.2898296117782593, + "learning_rate": 2.2767056503803197e-05, + "loss": 0.1748, + "step": 32272 + }, + { + "epoch": 0.575625156066065, + "grad_norm": 0.21256984770298004, + "learning_rate": 2.2765506221620245e-05, + "loss": 0.0868, + "step": 32273 + }, + { + "epoch": 0.5756429921877787, + "grad_norm": 0.2054421603679657, + "learning_rate": 2.2763955948098965e-05, + "loss": 0.1103, + "step": 32274 + }, + { + "epoch": 0.5756608283094924, + "grad_norm": 0.3292624056339264, + "learning_rate": 2.2762405683245355e-05, + "loss": 0.1189, + "step": 32275 + }, + { + "epoch": 0.5756786644312061, + "grad_norm": 0.25626716017723083, + "learning_rate": 2.2760855427065434e-05, + "loss": 0.1148, + "step": 32276 + }, + { + "epoch": 0.5756965005529198, + "grad_norm": 0.26967746019363403, + "learning_rate": 2.2759305179565213e-05, + "loss": 0.1372, + "step": 32277 + }, + { + "epoch": 0.5757143366746335, + "grad_norm": 0.2222234308719635, + "learning_rate": 2.2757754940750704e-05, + "loss": 0.1239, + "step": 32278 + }, + { + "epoch": 0.5757321727963471, + "grad_norm": 0.274392694234848, + "learning_rate": 2.275620471062791e-05, + "loss": 0.141, + "step": 32279 + }, + { + "epoch": 0.5757500089180608, + "grad_norm": 0.289774090051651, + "learning_rate": 2.275465448920283e-05, + "loss": 0.1249, + "step": 32280 + }, + { + "epoch": 0.5757678450397745, + "grad_norm": 0.24564850330352783, + "learning_rate": 2.2753104276481496e-05, + "loss": 0.1401, + "step": 32281 + }, + { + "epoch": 0.5757856811614882, + "grad_norm": 0.29305174946784973, + "learning_rate": 2.2751554072469904e-05, + "loss": 0.1134, + "step": 32282 + }, + { + "epoch": 0.5758035172832019, + "grad_norm": 0.2242419719696045, + "learning_rate": 2.2750003877174065e-05, + "loss": 0.1336, + "step": 32283 + }, + { + "epoch": 0.5758213534049156, + "grad_norm": 0.2604919672012329, + "learning_rate": 2.2748453690599977e-05, + "loss": 0.0976, + "step": 32284 + }, + { + "epoch": 0.5758391895266294, + "grad_norm": 0.2729603350162506, + "learning_rate": 2.2746903512753677e-05, + "loss": 0.1064, + "step": 32285 + }, + { + "epoch": 0.5758570256483431, + "grad_norm": 0.2426808625459671, + "learning_rate": 2.274535334364115e-05, + "loss": 0.1417, + "step": 32286 + }, + { + "epoch": 0.5758748617700568, + "grad_norm": 0.2500886023044586, + "learning_rate": 2.2743803183268418e-05, + "loss": 0.1016, + "step": 32287 + }, + { + "epoch": 0.5758926978917704, + "grad_norm": 0.32663434743881226, + "learning_rate": 2.2742253031641484e-05, + "loss": 0.2067, + "step": 32288 + }, + { + "epoch": 0.5759105340134841, + "grad_norm": 0.23015238344669342, + "learning_rate": 2.274070288876634e-05, + "loss": 0.1339, + "step": 32289 + }, + { + "epoch": 0.5759283701351978, + "grad_norm": 0.3159938454627991, + "learning_rate": 2.2739152754649025e-05, + "loss": 0.1412, + "step": 32290 + }, + { + "epoch": 0.5759462062569115, + "grad_norm": 0.24782168865203857, + "learning_rate": 2.2737602629295535e-05, + "loss": 0.1035, + "step": 32291 + }, + { + "epoch": 0.5759640423786252, + "grad_norm": 0.24251790344715118, + "learning_rate": 2.273605251271188e-05, + "loss": 0.1304, + "step": 32292 + }, + { + "epoch": 0.5759818785003389, + "grad_norm": 0.22524601221084595, + "learning_rate": 2.273450240490406e-05, + "loss": 0.0901, + "step": 32293 + }, + { + "epoch": 0.5759997146220526, + "grad_norm": 0.29124411940574646, + "learning_rate": 2.2732952305878098e-05, + "loss": 0.1211, + "step": 32294 + }, + { + "epoch": 0.5760175507437663, + "grad_norm": 0.24226750433444977, + "learning_rate": 2.2731402215639995e-05, + "loss": 0.1424, + "step": 32295 + }, + { + "epoch": 0.57603538686548, + "grad_norm": 0.2564210593700409, + "learning_rate": 2.2729852134195758e-05, + "loss": 0.1443, + "step": 32296 + }, + { + "epoch": 0.5760532229871936, + "grad_norm": 0.22292165458202362, + "learning_rate": 2.27283020615514e-05, + "loss": 0.094, + "step": 32297 + }, + { + "epoch": 0.5760710591089073, + "grad_norm": 0.26974618434906006, + "learning_rate": 2.2726751997712922e-05, + "loss": 0.1365, + "step": 32298 + }, + { + "epoch": 0.576088895230621, + "grad_norm": 0.2666942775249481, + "learning_rate": 2.272520194268635e-05, + "loss": 0.1485, + "step": 32299 + }, + { + "epoch": 0.5761067313523347, + "grad_norm": 0.3342462182044983, + "learning_rate": 2.2723651896477676e-05, + "loss": 0.1473, + "step": 32300 + }, + { + "epoch": 0.5761245674740484, + "grad_norm": 0.27080947160720825, + "learning_rate": 2.2722101859092914e-05, + "loss": 0.1132, + "step": 32301 + }, + { + "epoch": 0.5761424035957622, + "grad_norm": 0.4054609537124634, + "learning_rate": 2.2720551830538065e-05, + "loss": 0.1572, + "step": 32302 + }, + { + "epoch": 0.5761602397174759, + "grad_norm": 0.21025905013084412, + "learning_rate": 2.271900181081915e-05, + "loss": 0.1356, + "step": 32303 + }, + { + "epoch": 0.5761780758391896, + "grad_norm": 0.2511407732963562, + "learning_rate": 2.271745179994217e-05, + "loss": 0.1376, + "step": 32304 + }, + { + "epoch": 0.5761959119609033, + "grad_norm": 0.3078734576702118, + "learning_rate": 2.271590179791314e-05, + "loss": 0.1433, + "step": 32305 + }, + { + "epoch": 0.5762137480826169, + "grad_norm": 0.2085607796907425, + "learning_rate": 2.2714351804738054e-05, + "loss": 0.1632, + "step": 32306 + }, + { + "epoch": 0.5762315842043306, + "grad_norm": 0.3166585862636566, + "learning_rate": 2.271280182042294e-05, + "loss": 0.1412, + "step": 32307 + }, + { + "epoch": 0.5762494203260443, + "grad_norm": 0.23929819464683533, + "learning_rate": 2.2711251844973793e-05, + "loss": 0.1149, + "step": 32308 + }, + { + "epoch": 0.576267256447758, + "grad_norm": 0.26305484771728516, + "learning_rate": 2.270970187839663e-05, + "loss": 0.1546, + "step": 32309 + }, + { + "epoch": 0.5762850925694717, + "grad_norm": 0.2060820460319519, + "learning_rate": 2.2708151920697434e-05, + "loss": 0.1282, + "step": 32310 + }, + { + "epoch": 0.5763029286911854, + "grad_norm": 0.18784496188163757, + "learning_rate": 2.2706601971882253e-05, + "loss": 0.1005, + "step": 32311 + }, + { + "epoch": 0.5763207648128991, + "grad_norm": 0.3025131821632385, + "learning_rate": 2.2705052031957065e-05, + "loss": 0.1328, + "step": 32312 + }, + { + "epoch": 0.5763386009346128, + "grad_norm": 0.3148513734340668, + "learning_rate": 2.270350210092789e-05, + "loss": 0.1657, + "step": 32313 + }, + { + "epoch": 0.5763564370563264, + "grad_norm": 0.2889856696128845, + "learning_rate": 2.270195217880074e-05, + "loss": 0.1316, + "step": 32314 + }, + { + "epoch": 0.5763742731780401, + "grad_norm": 0.19084741175174713, + "learning_rate": 2.2700402265581606e-05, + "loss": 0.0864, + "step": 32315 + }, + { + "epoch": 0.5763921092997538, + "grad_norm": 0.3317219913005829, + "learning_rate": 2.2698852361276512e-05, + "loss": 0.2156, + "step": 32316 + }, + { + "epoch": 0.5764099454214675, + "grad_norm": 0.21589666604995728, + "learning_rate": 2.269730246589146e-05, + "loss": 0.0568, + "step": 32317 + }, + { + "epoch": 0.5764277815431813, + "grad_norm": 0.2767912447452545, + "learning_rate": 2.2695752579432467e-05, + "loss": 0.1245, + "step": 32318 + }, + { + "epoch": 0.576445617664895, + "grad_norm": 0.2305850386619568, + "learning_rate": 2.2694202701905516e-05, + "loss": 0.0695, + "step": 32319 + }, + { + "epoch": 0.5764634537866087, + "grad_norm": 0.5760293006896973, + "learning_rate": 2.2692652833316647e-05, + "loss": 0.1479, + "step": 32320 + }, + { + "epoch": 0.5764812899083224, + "grad_norm": 0.22271789610385895, + "learning_rate": 2.2691102973671853e-05, + "loss": 0.135, + "step": 32321 + }, + { + "epoch": 0.5764991260300361, + "grad_norm": 0.34600135684013367, + "learning_rate": 2.2689553122977138e-05, + "loss": 0.1476, + "step": 32322 + }, + { + "epoch": 0.5765169621517497, + "grad_norm": 0.2297000139951706, + "learning_rate": 2.2688003281238505e-05, + "loss": 0.1, + "step": 32323 + }, + { + "epoch": 0.5765347982734634, + "grad_norm": 0.2916584610939026, + "learning_rate": 2.268645344846198e-05, + "loss": 0.2058, + "step": 32324 + }, + { + "epoch": 0.5765526343951771, + "grad_norm": 0.23188558220863342, + "learning_rate": 2.2684903624653553e-05, + "loss": 0.1224, + "step": 32325 + }, + { + "epoch": 0.5765704705168908, + "grad_norm": 0.2858402729034424, + "learning_rate": 2.2683353809819244e-05, + "loss": 0.1454, + "step": 32326 + }, + { + "epoch": 0.5765883066386045, + "grad_norm": 0.27780696749687195, + "learning_rate": 2.2681804003965057e-05, + "loss": 0.1215, + "step": 32327 + }, + { + "epoch": 0.5766061427603182, + "grad_norm": 0.245209738612175, + "learning_rate": 2.2680254207096992e-05, + "loss": 0.1607, + "step": 32328 + }, + { + "epoch": 0.5766239788820319, + "grad_norm": 0.5074828267097473, + "learning_rate": 2.2678704419221066e-05, + "loss": 0.2376, + "step": 32329 + }, + { + "epoch": 0.5766418150037456, + "grad_norm": 0.3205380439758301, + "learning_rate": 2.267715464034329e-05, + "loss": 0.1327, + "step": 32330 + }, + { + "epoch": 0.5766596511254592, + "grad_norm": 0.23271895945072174, + "learning_rate": 2.267560487046966e-05, + "loss": 0.1457, + "step": 32331 + }, + { + "epoch": 0.5766774872471729, + "grad_norm": 0.23441344499588013, + "learning_rate": 2.2674055109606183e-05, + "loss": 0.102, + "step": 32332 + }, + { + "epoch": 0.5766953233688866, + "grad_norm": 0.23528803884983063, + "learning_rate": 2.267250535775887e-05, + "loss": 0.064, + "step": 32333 + }, + { + "epoch": 0.5767131594906003, + "grad_norm": 0.34019744396209717, + "learning_rate": 2.2670955614933743e-05, + "loss": 0.1291, + "step": 32334 + }, + { + "epoch": 0.5767309956123141, + "grad_norm": 0.24590329825878143, + "learning_rate": 2.2669405881136795e-05, + "loss": 0.1486, + "step": 32335 + }, + { + "epoch": 0.5767488317340278, + "grad_norm": 0.3161316514015198, + "learning_rate": 2.266785615637402e-05, + "loss": 0.1303, + "step": 32336 + }, + { + "epoch": 0.5767666678557415, + "grad_norm": 0.2143571823835373, + "learning_rate": 2.2666306440651452e-05, + "loss": 0.1343, + "step": 32337 + }, + { + "epoch": 0.5767845039774552, + "grad_norm": 0.28213709592819214, + "learning_rate": 2.2664756733975086e-05, + "loss": 0.1254, + "step": 32338 + }, + { + "epoch": 0.5768023400991689, + "grad_norm": 0.18796256184577942, + "learning_rate": 2.2663207036350926e-05, + "loss": 0.0971, + "step": 32339 + }, + { + "epoch": 0.5768201762208826, + "grad_norm": 0.36780208349227905, + "learning_rate": 2.266165734778499e-05, + "loss": 0.1238, + "step": 32340 + }, + { + "epoch": 0.5768380123425962, + "grad_norm": 0.30121201276779175, + "learning_rate": 2.2660107668283262e-05, + "loss": 0.1269, + "step": 32341 + }, + { + "epoch": 0.5768558484643099, + "grad_norm": 0.46117663383483887, + "learning_rate": 2.2658557997851777e-05, + "loss": 0.1618, + "step": 32342 + }, + { + "epoch": 0.5768736845860236, + "grad_norm": 0.27574294805526733, + "learning_rate": 2.2657008336496532e-05, + "loss": 0.1404, + "step": 32343 + }, + { + "epoch": 0.5768915207077373, + "grad_norm": 0.24472962319850922, + "learning_rate": 2.2655458684223523e-05, + "loss": 0.1387, + "step": 32344 + }, + { + "epoch": 0.576909356829451, + "grad_norm": 0.2560674250125885, + "learning_rate": 2.265390904103877e-05, + "loss": 0.0876, + "step": 32345 + }, + { + "epoch": 0.5769271929511647, + "grad_norm": 0.23681677877902985, + "learning_rate": 2.2652359406948273e-05, + "loss": 0.1161, + "step": 32346 + }, + { + "epoch": 0.5769450290728784, + "grad_norm": 0.27823352813720703, + "learning_rate": 2.265080978195805e-05, + "loss": 0.164, + "step": 32347 + }, + { + "epoch": 0.5769628651945921, + "grad_norm": 0.22411201894283295, + "learning_rate": 2.2649260166074095e-05, + "loss": 0.1097, + "step": 32348 + }, + { + "epoch": 0.5769807013163057, + "grad_norm": 0.2635782063007355, + "learning_rate": 2.2647710559302412e-05, + "loss": 0.1394, + "step": 32349 + }, + { + "epoch": 0.5769985374380194, + "grad_norm": 0.24432113766670227, + "learning_rate": 2.2646160961649028e-05, + "loss": 0.1375, + "step": 32350 + }, + { + "epoch": 0.5770163735597331, + "grad_norm": 0.18400879204273224, + "learning_rate": 2.2644611373119934e-05, + "loss": 0.1159, + "step": 32351 + }, + { + "epoch": 0.5770342096814469, + "grad_norm": 0.22061029076576233, + "learning_rate": 2.2643061793721134e-05, + "loss": 0.1237, + "step": 32352 + }, + { + "epoch": 0.5770520458031606, + "grad_norm": 0.25662875175476074, + "learning_rate": 2.2641512223458648e-05, + "loss": 0.1133, + "step": 32353 + }, + { + "epoch": 0.5770698819248743, + "grad_norm": 0.20533181726932526, + "learning_rate": 2.2639962662338466e-05, + "loss": 0.0927, + "step": 32354 + }, + { + "epoch": 0.577087718046588, + "grad_norm": 0.31295469403266907, + "learning_rate": 2.263841311036661e-05, + "loss": 0.1318, + "step": 32355 + }, + { + "epoch": 0.5771055541683017, + "grad_norm": 0.2255227118730545, + "learning_rate": 2.2636863567549082e-05, + "loss": 0.1432, + "step": 32356 + }, + { + "epoch": 0.5771233902900154, + "grad_norm": 0.17697836458683014, + "learning_rate": 2.2635314033891887e-05, + "loss": 0.1064, + "step": 32357 + }, + { + "epoch": 0.577141226411729, + "grad_norm": 0.2235167920589447, + "learning_rate": 2.2633764509401022e-05, + "loss": 0.1375, + "step": 32358 + }, + { + "epoch": 0.5771590625334427, + "grad_norm": 0.2680342495441437, + "learning_rate": 2.2632214994082514e-05, + "loss": 0.1587, + "step": 32359 + }, + { + "epoch": 0.5771768986551564, + "grad_norm": 0.34540942311286926, + "learning_rate": 2.2630665487942353e-05, + "loss": 0.0837, + "step": 32360 + }, + { + "epoch": 0.5771947347768701, + "grad_norm": 0.22852060198783875, + "learning_rate": 2.2629115990986553e-05, + "loss": 0.0948, + "step": 32361 + }, + { + "epoch": 0.5772125708985838, + "grad_norm": 0.24823763966560364, + "learning_rate": 2.2627566503221112e-05, + "loss": 0.1024, + "step": 32362 + }, + { + "epoch": 0.5772304070202975, + "grad_norm": 0.27845174074172974, + "learning_rate": 2.2626017024652053e-05, + "loss": 0.139, + "step": 32363 + }, + { + "epoch": 0.5772482431420112, + "grad_norm": 0.2625521123409271, + "learning_rate": 2.262446755528537e-05, + "loss": 0.0898, + "step": 32364 + }, + { + "epoch": 0.5772660792637249, + "grad_norm": 0.23189617693424225, + "learning_rate": 2.262291809512707e-05, + "loss": 0.1198, + "step": 32365 + }, + { + "epoch": 0.5772839153854386, + "grad_norm": 0.1825329214334488, + "learning_rate": 2.2621368644183158e-05, + "loss": 0.1378, + "step": 32366 + }, + { + "epoch": 0.5773017515071522, + "grad_norm": 0.2309417873620987, + "learning_rate": 2.2619819202459636e-05, + "loss": 0.1346, + "step": 32367 + }, + { + "epoch": 0.5773195876288659, + "grad_norm": 0.22425667941570282, + "learning_rate": 2.261826976996253e-05, + "loss": 0.1211, + "step": 32368 + }, + { + "epoch": 0.5773374237505797, + "grad_norm": 0.3045188784599304, + "learning_rate": 2.261672034669783e-05, + "loss": 0.1261, + "step": 32369 + }, + { + "epoch": 0.5773552598722934, + "grad_norm": 0.34467121958732605, + "learning_rate": 2.2615170932671544e-05, + "loss": 0.1003, + "step": 32370 + }, + { + "epoch": 0.5773730959940071, + "grad_norm": 0.1908007562160492, + "learning_rate": 2.2613621527889673e-05, + "loss": 0.1012, + "step": 32371 + }, + { + "epoch": 0.5773909321157208, + "grad_norm": 0.2497616857290268, + "learning_rate": 2.2612072132358236e-05, + "loss": 0.1163, + "step": 32372 + }, + { + "epoch": 0.5774087682374345, + "grad_norm": 0.3948986530303955, + "learning_rate": 2.261052274608323e-05, + "loss": 0.1301, + "step": 32373 + }, + { + "epoch": 0.5774266043591482, + "grad_norm": 0.28548774123191833, + "learning_rate": 2.2608973369070666e-05, + "loss": 0.1267, + "step": 32374 + }, + { + "epoch": 0.5774444404808619, + "grad_norm": 0.3578229546546936, + "learning_rate": 2.2607424001326537e-05, + "loss": 0.1558, + "step": 32375 + }, + { + "epoch": 0.5774622766025755, + "grad_norm": 0.2500741183757782, + "learning_rate": 2.260587464285687e-05, + "loss": 0.1213, + "step": 32376 + }, + { + "epoch": 0.5774801127242892, + "grad_norm": 0.3045952618122101, + "learning_rate": 2.260432529366766e-05, + "loss": 0.159, + "step": 32377 + }, + { + "epoch": 0.5774979488460029, + "grad_norm": 0.2800423502922058, + "learning_rate": 2.260277595376491e-05, + "loss": 0.1178, + "step": 32378 + }, + { + "epoch": 0.5775157849677166, + "grad_norm": 0.6150677800178528, + "learning_rate": 2.2601226623154625e-05, + "loss": 0.1378, + "step": 32379 + }, + { + "epoch": 0.5775336210894303, + "grad_norm": 0.25808438658714294, + "learning_rate": 2.2599677301842813e-05, + "loss": 0.1337, + "step": 32380 + }, + { + "epoch": 0.577551457211144, + "grad_norm": 0.28601396083831787, + "learning_rate": 2.2598127989835482e-05, + "loss": 0.1315, + "step": 32381 + }, + { + "epoch": 0.5775692933328577, + "grad_norm": 0.22821418941020966, + "learning_rate": 2.2596578687138642e-05, + "loss": 0.0986, + "step": 32382 + }, + { + "epoch": 0.5775871294545714, + "grad_norm": 0.23115041851997375, + "learning_rate": 2.2595029393758294e-05, + "loss": 0.0956, + "step": 32383 + }, + { + "epoch": 0.577604965576285, + "grad_norm": 0.27500250935554504, + "learning_rate": 2.259348010970043e-05, + "loss": 0.1488, + "step": 32384 + }, + { + "epoch": 0.5776228016979987, + "grad_norm": 0.3548430800437927, + "learning_rate": 2.259193083497108e-05, + "loss": 0.0995, + "step": 32385 + }, + { + "epoch": 0.5776406378197125, + "grad_norm": 0.350884348154068, + "learning_rate": 2.2590381569576237e-05, + "loss": 0.1162, + "step": 32386 + }, + { + "epoch": 0.5776584739414262, + "grad_norm": 0.2650716304779053, + "learning_rate": 2.2588832313521904e-05, + "loss": 0.1265, + "step": 32387 + }, + { + "epoch": 0.5776763100631399, + "grad_norm": 0.22411689162254333, + "learning_rate": 2.2587283066814088e-05, + "loss": 0.0905, + "step": 32388 + }, + { + "epoch": 0.5776941461848536, + "grad_norm": 0.32652562856674194, + "learning_rate": 2.2585733829458796e-05, + "loss": 0.1306, + "step": 32389 + }, + { + "epoch": 0.5777119823065673, + "grad_norm": 0.29913076758384705, + "learning_rate": 2.2584184601462042e-05, + "loss": 0.1058, + "step": 32390 + }, + { + "epoch": 0.577729818428281, + "grad_norm": 0.42833200097084045, + "learning_rate": 2.258263538282982e-05, + "loss": 0.1677, + "step": 32391 + }, + { + "epoch": 0.5777476545499947, + "grad_norm": 0.31012651324272156, + "learning_rate": 2.2581086173568138e-05, + "loss": 0.148, + "step": 32392 + }, + { + "epoch": 0.5777654906717083, + "grad_norm": 0.20969222486019135, + "learning_rate": 2.2579536973682992e-05, + "loss": 0.1604, + "step": 32393 + }, + { + "epoch": 0.577783326793422, + "grad_norm": 0.2786845266819, + "learning_rate": 2.2577987783180403e-05, + "loss": 0.1268, + "step": 32394 + }, + { + "epoch": 0.5778011629151357, + "grad_norm": 0.3049396574497223, + "learning_rate": 2.257643860206637e-05, + "loss": 0.1898, + "step": 32395 + }, + { + "epoch": 0.5778189990368494, + "grad_norm": 0.23088280856609344, + "learning_rate": 2.2574889430346903e-05, + "loss": 0.1102, + "step": 32396 + }, + { + "epoch": 0.5778368351585631, + "grad_norm": 0.24143581092357635, + "learning_rate": 2.257334026802799e-05, + "loss": 0.1312, + "step": 32397 + }, + { + "epoch": 0.5778546712802768, + "grad_norm": 0.3001772165298462, + "learning_rate": 2.257179111511566e-05, + "loss": 0.1872, + "step": 32398 + }, + { + "epoch": 0.5778725074019905, + "grad_norm": 0.23794269561767578, + "learning_rate": 2.2570241971615903e-05, + "loss": 0.1221, + "step": 32399 + }, + { + "epoch": 0.5778903435237042, + "grad_norm": 0.20139265060424805, + "learning_rate": 2.2568692837534724e-05, + "loss": 0.1183, + "step": 32400 + }, + { + "epoch": 0.5779081796454179, + "grad_norm": 0.28829342126846313, + "learning_rate": 2.2567143712878135e-05, + "loss": 0.145, + "step": 32401 + }, + { + "epoch": 0.5779260157671315, + "grad_norm": 0.26882028579711914, + "learning_rate": 2.2565594597652133e-05, + "loss": 0.1549, + "step": 32402 + }, + { + "epoch": 0.5779438518888453, + "grad_norm": 0.3388108015060425, + "learning_rate": 2.256404549186273e-05, + "loss": 0.1362, + "step": 32403 + }, + { + "epoch": 0.577961688010559, + "grad_norm": 0.33914968371391296, + "learning_rate": 2.2562496395515935e-05, + "loss": 0.1498, + "step": 32404 + }, + { + "epoch": 0.5779795241322727, + "grad_norm": 0.3868884742259979, + "learning_rate": 2.256094730861774e-05, + "loss": 0.1174, + "step": 32405 + }, + { + "epoch": 0.5779973602539864, + "grad_norm": 0.2845574617385864, + "learning_rate": 2.2559398231174147e-05, + "loss": 0.1387, + "step": 32406 + }, + { + "epoch": 0.5780151963757001, + "grad_norm": 0.25631916522979736, + "learning_rate": 2.2557849163191184e-05, + "loss": 0.106, + "step": 32407 + }, + { + "epoch": 0.5780330324974138, + "grad_norm": 0.3559955358505249, + "learning_rate": 2.255630010467483e-05, + "loss": 0.1167, + "step": 32408 + }, + { + "epoch": 0.5780508686191275, + "grad_norm": 0.2889615297317505, + "learning_rate": 2.2554751055631107e-05, + "loss": 0.1705, + "step": 32409 + }, + { + "epoch": 0.5780687047408412, + "grad_norm": 0.32917043566703796, + "learning_rate": 2.2553202016066004e-05, + "loss": 0.1252, + "step": 32410 + }, + { + "epoch": 0.5780865408625548, + "grad_norm": 0.26466673612594604, + "learning_rate": 2.2551652985985546e-05, + "loss": 0.1176, + "step": 32411 + }, + { + "epoch": 0.5781043769842685, + "grad_norm": 0.26158544421195984, + "learning_rate": 2.2550103965395726e-05, + "loss": 0.1075, + "step": 32412 + }, + { + "epoch": 0.5781222131059822, + "grad_norm": 0.22283294796943665, + "learning_rate": 2.2548554954302552e-05, + "loss": 0.0809, + "step": 32413 + }, + { + "epoch": 0.5781400492276959, + "grad_norm": 0.30263715982437134, + "learning_rate": 2.2547005952712013e-05, + "loss": 0.1499, + "step": 32414 + }, + { + "epoch": 0.5781578853494096, + "grad_norm": 0.2318008989095688, + "learning_rate": 2.2545456960630134e-05, + "loss": 0.1196, + "step": 32415 + }, + { + "epoch": 0.5781757214711233, + "grad_norm": 0.28474387526512146, + "learning_rate": 2.2543907978062912e-05, + "loss": 0.1372, + "step": 32416 + }, + { + "epoch": 0.578193557592837, + "grad_norm": 0.22017902135849, + "learning_rate": 2.2542359005016353e-05, + "loss": 0.1422, + "step": 32417 + }, + { + "epoch": 0.5782113937145507, + "grad_norm": 0.27815932035446167, + "learning_rate": 2.2540810041496456e-05, + "loss": 0.071, + "step": 32418 + }, + { + "epoch": 0.5782292298362645, + "grad_norm": 0.26125043630599976, + "learning_rate": 2.2539261087509223e-05, + "loss": 0.1602, + "step": 32419 + }, + { + "epoch": 0.5782470659579781, + "grad_norm": 0.23184041678905487, + "learning_rate": 2.2537712143060675e-05, + "loss": 0.1161, + "step": 32420 + }, + { + "epoch": 0.5782649020796918, + "grad_norm": 0.3446803689002991, + "learning_rate": 2.2536163208156797e-05, + "loss": 0.1358, + "step": 32421 + }, + { + "epoch": 0.5782827382014055, + "grad_norm": 0.24228334426879883, + "learning_rate": 2.2534614282803607e-05, + "loss": 0.0953, + "step": 32422 + }, + { + "epoch": 0.5783005743231192, + "grad_norm": 0.322874516248703, + "learning_rate": 2.2533065367007094e-05, + "loss": 0.1005, + "step": 32423 + }, + { + "epoch": 0.5783184104448329, + "grad_norm": 0.3061692416667938, + "learning_rate": 2.2531516460773282e-05, + "loss": 0.1236, + "step": 32424 + }, + { + "epoch": 0.5783362465665466, + "grad_norm": 0.24308371543884277, + "learning_rate": 2.2529967564108164e-05, + "loss": 0.1428, + "step": 32425 + }, + { + "epoch": 0.5783540826882603, + "grad_norm": 0.2664937376976013, + "learning_rate": 2.2528418677017746e-05, + "loss": 0.1133, + "step": 32426 + }, + { + "epoch": 0.578371918809974, + "grad_norm": 0.22126761078834534, + "learning_rate": 2.252686979950802e-05, + "loss": 0.1204, + "step": 32427 + }, + { + "epoch": 0.5783897549316876, + "grad_norm": 0.25034287571907043, + "learning_rate": 2.252532093158501e-05, + "loss": 0.1152, + "step": 32428 + }, + { + "epoch": 0.5784075910534013, + "grad_norm": 0.2439475655555725, + "learning_rate": 2.252377207325471e-05, + "loss": 0.1298, + "step": 32429 + }, + { + "epoch": 0.578425427175115, + "grad_norm": 0.3364785611629486, + "learning_rate": 2.2522223224523127e-05, + "loss": 0.1418, + "step": 32430 + }, + { + "epoch": 0.5784432632968287, + "grad_norm": 0.2142474353313446, + "learning_rate": 2.252067438539626e-05, + "loss": 0.1033, + "step": 32431 + }, + { + "epoch": 0.5784610994185424, + "grad_norm": 0.30729854106903076, + "learning_rate": 2.2519125555880108e-05, + "loss": 0.0981, + "step": 32432 + }, + { + "epoch": 0.5784789355402561, + "grad_norm": 0.285013347864151, + "learning_rate": 2.251757673598069e-05, + "loss": 0.1697, + "step": 32433 + }, + { + "epoch": 0.5784967716619698, + "grad_norm": 0.23829345405101776, + "learning_rate": 2.2516027925704005e-05, + "loss": 0.1067, + "step": 32434 + }, + { + "epoch": 0.5785146077836835, + "grad_norm": 0.23749589920043945, + "learning_rate": 2.2514479125056048e-05, + "loss": 0.0967, + "step": 32435 + }, + { + "epoch": 0.5785324439053973, + "grad_norm": 0.23260420560836792, + "learning_rate": 2.2512930334042826e-05, + "loss": 0.1127, + "step": 32436 + }, + { + "epoch": 0.578550280027111, + "grad_norm": 0.3212168216705322, + "learning_rate": 2.2511381552670347e-05, + "loss": 0.1618, + "step": 32437 + }, + { + "epoch": 0.5785681161488246, + "grad_norm": 0.2994755208492279, + "learning_rate": 2.2509832780944618e-05, + "loss": 0.1751, + "step": 32438 + }, + { + "epoch": 0.5785859522705383, + "grad_norm": 0.24331361055374146, + "learning_rate": 2.2508284018871635e-05, + "loss": 0.0889, + "step": 32439 + }, + { + "epoch": 0.578603788392252, + "grad_norm": 0.2054992914199829, + "learning_rate": 2.2506735266457397e-05, + "loss": 0.1391, + "step": 32440 + }, + { + "epoch": 0.5786216245139657, + "grad_norm": 0.29034626483917236, + "learning_rate": 2.2505186523707923e-05, + "loss": 0.121, + "step": 32441 + }, + { + "epoch": 0.5786394606356794, + "grad_norm": 0.18746566772460938, + "learning_rate": 2.250363779062921e-05, + "loss": 0.1208, + "step": 32442 + }, + { + "epoch": 0.5786572967573931, + "grad_norm": 0.20350977778434753, + "learning_rate": 2.250208906722725e-05, + "loss": 0.1132, + "step": 32443 + }, + { + "epoch": 0.5786751328791068, + "grad_norm": 0.2571881115436554, + "learning_rate": 2.250054035350806e-05, + "loss": 0.1247, + "step": 32444 + }, + { + "epoch": 0.5786929690008205, + "grad_norm": 0.33046942949295044, + "learning_rate": 2.249899164947763e-05, + "loss": 0.1379, + "step": 32445 + }, + { + "epoch": 0.5787108051225341, + "grad_norm": 0.3338419795036316, + "learning_rate": 2.2497442955141986e-05, + "loss": 0.1815, + "step": 32446 + }, + { + "epoch": 0.5787286412442478, + "grad_norm": 0.33364900946617126, + "learning_rate": 2.2495894270507113e-05, + "loss": 0.1732, + "step": 32447 + }, + { + "epoch": 0.5787464773659615, + "grad_norm": 0.3296375870704651, + "learning_rate": 2.2494345595579015e-05, + "loss": 0.1019, + "step": 32448 + }, + { + "epoch": 0.5787643134876752, + "grad_norm": 0.29567357897758484, + "learning_rate": 2.2492796930363698e-05, + "loss": 0.1187, + "step": 32449 + }, + { + "epoch": 0.5787821496093889, + "grad_norm": 0.2151576429605484, + "learning_rate": 2.249124827486717e-05, + "loss": 0.1083, + "step": 32450 + }, + { + "epoch": 0.5787999857311026, + "grad_norm": 0.33929628133773804, + "learning_rate": 2.2489699629095432e-05, + "loss": 0.1297, + "step": 32451 + }, + { + "epoch": 0.5788178218528163, + "grad_norm": 0.2873328924179077, + "learning_rate": 2.2488150993054484e-05, + "loss": 0.1927, + "step": 32452 + }, + { + "epoch": 0.5788356579745301, + "grad_norm": 0.30493971705436707, + "learning_rate": 2.2486602366750322e-05, + "loss": 0.1498, + "step": 32453 + }, + { + "epoch": 0.5788534940962438, + "grad_norm": 0.23510156571865082, + "learning_rate": 2.248505375018897e-05, + "loss": 0.1286, + "step": 32454 + }, + { + "epoch": 0.5788713302179574, + "grad_norm": 0.21999810636043549, + "learning_rate": 2.2483505143376416e-05, + "loss": 0.1391, + "step": 32455 + }, + { + "epoch": 0.5788891663396711, + "grad_norm": 0.24669304490089417, + "learning_rate": 2.248195654631866e-05, + "loss": 0.1003, + "step": 32456 + }, + { + "epoch": 0.5789070024613848, + "grad_norm": 0.29634609818458557, + "learning_rate": 2.248040795902171e-05, + "loss": 0.1001, + "step": 32457 + }, + { + "epoch": 0.5789248385830985, + "grad_norm": 0.33521899580955505, + "learning_rate": 2.2478859381491568e-05, + "loss": 0.1302, + "step": 32458 + }, + { + "epoch": 0.5789426747048122, + "grad_norm": 0.26556825637817383, + "learning_rate": 2.2477310813734242e-05, + "loss": 0.1692, + "step": 32459 + }, + { + "epoch": 0.5789605108265259, + "grad_norm": 0.2117352932691574, + "learning_rate": 2.2475762255755732e-05, + "loss": 0.1016, + "step": 32460 + }, + { + "epoch": 0.5789783469482396, + "grad_norm": 0.3086357116699219, + "learning_rate": 2.247421370756204e-05, + "loss": 0.1153, + "step": 32461 + }, + { + "epoch": 0.5789961830699533, + "grad_norm": 0.2631135582923889, + "learning_rate": 2.247266516915916e-05, + "loss": 0.1228, + "step": 32462 + }, + { + "epoch": 0.579014019191667, + "grad_norm": 0.18638308346271515, + "learning_rate": 2.247111664055311e-05, + "loss": 0.1271, + "step": 32463 + }, + { + "epoch": 0.5790318553133806, + "grad_norm": 0.3414674997329712, + "learning_rate": 2.246956812174988e-05, + "loss": 0.1986, + "step": 32464 + }, + { + "epoch": 0.5790496914350943, + "grad_norm": 0.17765916883945465, + "learning_rate": 2.2468019612755487e-05, + "loss": 0.1212, + "step": 32465 + }, + { + "epoch": 0.579067527556808, + "grad_norm": 0.22507187724113464, + "learning_rate": 2.246647111357591e-05, + "loss": 0.1297, + "step": 32466 + }, + { + "epoch": 0.5790853636785217, + "grad_norm": 0.26670798659324646, + "learning_rate": 2.246492262421718e-05, + "loss": 0.1125, + "step": 32467 + }, + { + "epoch": 0.5791031998002354, + "grad_norm": 0.28797197341918945, + "learning_rate": 2.2463374144685277e-05, + "loss": 0.1079, + "step": 32468 + }, + { + "epoch": 0.5791210359219491, + "grad_norm": 0.2610745131969452, + "learning_rate": 2.246182567498622e-05, + "loss": 0.158, + "step": 32469 + }, + { + "epoch": 0.5791388720436629, + "grad_norm": 0.2122703343629837, + "learning_rate": 2.2460277215125992e-05, + "loss": 0.0729, + "step": 32470 + }, + { + "epoch": 0.5791567081653766, + "grad_norm": 0.3137718737125397, + "learning_rate": 2.245872876511061e-05, + "loss": 0.1646, + "step": 32471 + }, + { + "epoch": 0.5791745442870903, + "grad_norm": 0.2896655201911926, + "learning_rate": 2.2457180324946078e-05, + "loss": 0.1874, + "step": 32472 + }, + { + "epoch": 0.5791923804088039, + "grad_norm": 0.33308401703834534, + "learning_rate": 2.2455631894638395e-05, + "loss": 0.1579, + "step": 32473 + }, + { + "epoch": 0.5792102165305176, + "grad_norm": 0.2378171980381012, + "learning_rate": 2.245408347419356e-05, + "loss": 0.1393, + "step": 32474 + }, + { + "epoch": 0.5792280526522313, + "grad_norm": 0.2899779975414276, + "learning_rate": 2.245253506361757e-05, + "loss": 0.1351, + "step": 32475 + }, + { + "epoch": 0.579245888773945, + "grad_norm": 0.2345377653837204, + "learning_rate": 2.245098666291644e-05, + "loss": 0.1069, + "step": 32476 + }, + { + "epoch": 0.5792637248956587, + "grad_norm": 0.2698560953140259, + "learning_rate": 2.2449438272096164e-05, + "loss": 0.133, + "step": 32477 + }, + { + "epoch": 0.5792815610173724, + "grad_norm": 0.2843421697616577, + "learning_rate": 2.244788989116275e-05, + "loss": 0.1202, + "step": 32478 + }, + { + "epoch": 0.5792993971390861, + "grad_norm": 0.27611538767814636, + "learning_rate": 2.244634152012218e-05, + "loss": 0.1557, + "step": 32479 + }, + { + "epoch": 0.5793172332607998, + "grad_norm": 0.26356732845306396, + "learning_rate": 2.244479315898049e-05, + "loss": 0.1474, + "step": 32480 + }, + { + "epoch": 0.5793350693825134, + "grad_norm": 0.28101345896720886, + "learning_rate": 2.2443244807743665e-05, + "loss": 0.1653, + "step": 32481 + }, + { + "epoch": 0.5793529055042271, + "grad_norm": 0.28042471408843994, + "learning_rate": 2.24416964664177e-05, + "loss": 0.1305, + "step": 32482 + }, + { + "epoch": 0.5793707416259408, + "grad_norm": 0.23569683730602264, + "learning_rate": 2.24401481350086e-05, + "loss": 0.1578, + "step": 32483 + }, + { + "epoch": 0.5793885777476545, + "grad_norm": 0.34091052412986755, + "learning_rate": 2.2438599813522375e-05, + "loss": 0.173, + "step": 32484 + }, + { + "epoch": 0.5794064138693682, + "grad_norm": 0.2379416525363922, + "learning_rate": 2.243705150196502e-05, + "loss": 0.1181, + "step": 32485 + }, + { + "epoch": 0.5794242499910819, + "grad_norm": 0.2575030028820038, + "learning_rate": 2.243550320034254e-05, + "loss": 0.1356, + "step": 32486 + }, + { + "epoch": 0.5794420861127957, + "grad_norm": 0.21261177957057953, + "learning_rate": 2.2433954908660943e-05, + "loss": 0.1235, + "step": 32487 + }, + { + "epoch": 0.5794599222345094, + "grad_norm": 0.22560103237628937, + "learning_rate": 2.2432406626926207e-05, + "loss": 0.173, + "step": 32488 + }, + { + "epoch": 0.5794777583562231, + "grad_norm": 0.24423782527446747, + "learning_rate": 2.2430858355144358e-05, + "loss": 0.1458, + "step": 32489 + }, + { + "epoch": 0.5794955944779367, + "grad_norm": 0.3535788953304291, + "learning_rate": 2.2429310093321394e-05, + "loss": 0.1887, + "step": 32490 + }, + { + "epoch": 0.5795134305996504, + "grad_norm": 0.2025614231824875, + "learning_rate": 2.24277618414633e-05, + "loss": 0.1213, + "step": 32491 + }, + { + "epoch": 0.5795312667213641, + "grad_norm": 0.2531871795654297, + "learning_rate": 2.2426213599576095e-05, + "loss": 0.1716, + "step": 32492 + }, + { + "epoch": 0.5795491028430778, + "grad_norm": 0.24019251763820648, + "learning_rate": 2.2424665367665777e-05, + "loss": 0.122, + "step": 32493 + }, + { + "epoch": 0.5795669389647915, + "grad_norm": 0.31495600938796997, + "learning_rate": 2.242311714573835e-05, + "loss": 0.1664, + "step": 32494 + }, + { + "epoch": 0.5795847750865052, + "grad_norm": 0.20914030075073242, + "learning_rate": 2.2421568933799805e-05, + "loss": 0.1135, + "step": 32495 + }, + { + "epoch": 0.5796026112082189, + "grad_norm": 0.3683305084705353, + "learning_rate": 2.2420020731856145e-05, + "loss": 0.1826, + "step": 32496 + }, + { + "epoch": 0.5796204473299326, + "grad_norm": 0.20640255510807037, + "learning_rate": 2.2418472539913382e-05, + "loss": 0.0893, + "step": 32497 + }, + { + "epoch": 0.5796382834516463, + "grad_norm": 0.24882817268371582, + "learning_rate": 2.241692435797751e-05, + "loss": 0.1184, + "step": 32498 + }, + { + "epoch": 0.5796561195733599, + "grad_norm": 0.3185776174068451, + "learning_rate": 2.241537618605453e-05, + "loss": 0.1087, + "step": 32499 + }, + { + "epoch": 0.5796739556950736, + "grad_norm": 0.23715680837631226, + "learning_rate": 2.2413828024150453e-05, + "loss": 0.1367, + "step": 32500 + }, + { + "epoch": 0.5796917918167873, + "grad_norm": 0.23971588909626007, + "learning_rate": 2.2412279872271254e-05, + "loss": 0.0904, + "step": 32501 + }, + { + "epoch": 0.579709627938501, + "grad_norm": 0.27716606855392456, + "learning_rate": 2.241073173042297e-05, + "loss": 0.1338, + "step": 32502 + }, + { + "epoch": 0.5797274640602147, + "grad_norm": 0.5109320878982544, + "learning_rate": 2.2409183598611576e-05, + "loss": 0.1947, + "step": 32503 + }, + { + "epoch": 0.5797453001819285, + "grad_norm": 0.26904845237731934, + "learning_rate": 2.240763547684308e-05, + "loss": 0.0817, + "step": 32504 + }, + { + "epoch": 0.5797631363036422, + "grad_norm": 0.24545560777187347, + "learning_rate": 2.2406087365123486e-05, + "loss": 0.1638, + "step": 32505 + }, + { + "epoch": 0.5797809724253559, + "grad_norm": 0.49714016914367676, + "learning_rate": 2.240453926345879e-05, + "loss": 0.1396, + "step": 32506 + }, + { + "epoch": 0.5797988085470696, + "grad_norm": 0.24684901535511017, + "learning_rate": 2.2402991171855003e-05, + "loss": 0.1294, + "step": 32507 + }, + { + "epoch": 0.5798166446687832, + "grad_norm": 0.22937579452991486, + "learning_rate": 2.2401443090318123e-05, + "loss": 0.0887, + "step": 32508 + }, + { + "epoch": 0.5798344807904969, + "grad_norm": 0.24646563827991486, + "learning_rate": 2.2399895018854132e-05, + "loss": 0.1634, + "step": 32509 + }, + { + "epoch": 0.5798523169122106, + "grad_norm": 0.31046807765960693, + "learning_rate": 2.2398346957469057e-05, + "loss": 0.1536, + "step": 32510 + }, + { + "epoch": 0.5798701530339243, + "grad_norm": 0.27808254957199097, + "learning_rate": 2.2396798906168888e-05, + "loss": 0.1182, + "step": 32511 + }, + { + "epoch": 0.579887989155638, + "grad_norm": 0.27484357357025146, + "learning_rate": 2.239525086495962e-05, + "loss": 0.1193, + "step": 32512 + }, + { + "epoch": 0.5799058252773517, + "grad_norm": 0.31249159574508667, + "learning_rate": 2.2393702833847264e-05, + "loss": 0.119, + "step": 32513 + }, + { + "epoch": 0.5799236613990654, + "grad_norm": 0.20423823595046997, + "learning_rate": 2.2392154812837807e-05, + "loss": 0.1026, + "step": 32514 + }, + { + "epoch": 0.5799414975207791, + "grad_norm": 0.27444329857826233, + "learning_rate": 2.239060680193727e-05, + "loss": 0.1281, + "step": 32515 + }, + { + "epoch": 0.5799593336424927, + "grad_norm": 0.21621066331863403, + "learning_rate": 2.238905880115164e-05, + "loss": 0.0878, + "step": 32516 + }, + { + "epoch": 0.5799771697642064, + "grad_norm": 0.4129605293273926, + "learning_rate": 2.238751081048692e-05, + "loss": 0.1234, + "step": 32517 + }, + { + "epoch": 0.5799950058859201, + "grad_norm": 0.3110547363758087, + "learning_rate": 2.23859628299491e-05, + "loss": 0.185, + "step": 32518 + }, + { + "epoch": 0.5800128420076338, + "grad_norm": 0.2133747637271881, + "learning_rate": 2.2384414859544204e-05, + "loss": 0.0705, + "step": 32519 + }, + { + "epoch": 0.5800306781293476, + "grad_norm": 0.2495887279510498, + "learning_rate": 2.238286689927821e-05, + "loss": 0.1426, + "step": 32520 + }, + { + "epoch": 0.5800485142510613, + "grad_norm": 0.31482580304145813, + "learning_rate": 2.2381318949157136e-05, + "loss": 0.1074, + "step": 32521 + }, + { + "epoch": 0.580066350372775, + "grad_norm": 0.23461973667144775, + "learning_rate": 2.2379771009186963e-05, + "loss": 0.1594, + "step": 32522 + }, + { + "epoch": 0.5800841864944887, + "grad_norm": 0.2245056927204132, + "learning_rate": 2.237822307937371e-05, + "loss": 0.162, + "step": 32523 + }, + { + "epoch": 0.5801020226162024, + "grad_norm": 0.2365114986896515, + "learning_rate": 2.2376675159723377e-05, + "loss": 0.1594, + "step": 32524 + }, + { + "epoch": 0.580119858737916, + "grad_norm": 0.24609144032001495, + "learning_rate": 2.2375127250241945e-05, + "loss": 0.1301, + "step": 32525 + }, + { + "epoch": 0.5801376948596297, + "grad_norm": 0.22959747910499573, + "learning_rate": 2.2373579350935438e-05, + "loss": 0.1206, + "step": 32526 + }, + { + "epoch": 0.5801555309813434, + "grad_norm": 0.24409672617912292, + "learning_rate": 2.2372031461809827e-05, + "loss": 0.1072, + "step": 32527 + }, + { + "epoch": 0.5801733671030571, + "grad_norm": 0.27131932973861694, + "learning_rate": 2.2370483582871146e-05, + "loss": 0.1113, + "step": 32528 + }, + { + "epoch": 0.5801912032247708, + "grad_norm": 0.25985661149024963, + "learning_rate": 2.2368935714125374e-05, + "loss": 0.127, + "step": 32529 + }, + { + "epoch": 0.5802090393464845, + "grad_norm": 0.26481959223747253, + "learning_rate": 2.236738785557852e-05, + "loss": 0.1245, + "step": 32530 + }, + { + "epoch": 0.5802268754681982, + "grad_norm": 0.28355562686920166, + "learning_rate": 2.236584000723657e-05, + "loss": 0.0727, + "step": 32531 + }, + { + "epoch": 0.5802447115899119, + "grad_norm": 0.39319220185279846, + "learning_rate": 2.236429216910554e-05, + "loss": 0.1827, + "step": 32532 + }, + { + "epoch": 0.5802625477116256, + "grad_norm": 0.27143603563308716, + "learning_rate": 2.2362744341191423e-05, + "loss": 0.1551, + "step": 32533 + }, + { + "epoch": 0.5802803838333392, + "grad_norm": 0.2402048259973526, + "learning_rate": 2.2361196523500223e-05, + "loss": 0.1651, + "step": 32534 + }, + { + "epoch": 0.5802982199550529, + "grad_norm": 0.23064440488815308, + "learning_rate": 2.2359648716037926e-05, + "loss": 0.1135, + "step": 32535 + }, + { + "epoch": 0.5803160560767666, + "grad_norm": 0.2531764805316925, + "learning_rate": 2.235810091881055e-05, + "loss": 0.1158, + "step": 32536 + }, + { + "epoch": 0.5803338921984804, + "grad_norm": 0.23505248129367828, + "learning_rate": 2.2356553131824093e-05, + "loss": 0.102, + "step": 32537 + }, + { + "epoch": 0.5803517283201941, + "grad_norm": 0.25851181149482727, + "learning_rate": 2.2355005355084545e-05, + "loss": 0.1176, + "step": 32538 + }, + { + "epoch": 0.5803695644419078, + "grad_norm": 0.22408215701580048, + "learning_rate": 2.235345758859791e-05, + "loss": 0.1076, + "step": 32539 + }, + { + "epoch": 0.5803874005636215, + "grad_norm": 0.2597883641719818, + "learning_rate": 2.2351909832370183e-05, + "loss": 0.1359, + "step": 32540 + }, + { + "epoch": 0.5804052366853352, + "grad_norm": 0.31105777621269226, + "learning_rate": 2.2350362086407367e-05, + "loss": 0.1094, + "step": 32541 + }, + { + "epoch": 0.5804230728070489, + "grad_norm": 0.32001444697380066, + "learning_rate": 2.234881435071547e-05, + "loss": 0.1197, + "step": 32542 + }, + { + "epoch": 0.5804409089287625, + "grad_norm": 0.21128469705581665, + "learning_rate": 2.2347266625300483e-05, + "loss": 0.1162, + "step": 32543 + }, + { + "epoch": 0.5804587450504762, + "grad_norm": 0.33103376626968384, + "learning_rate": 2.23457189101684e-05, + "loss": 0.1001, + "step": 32544 + }, + { + "epoch": 0.5804765811721899, + "grad_norm": 0.3422320783138275, + "learning_rate": 2.2344171205325234e-05, + "loss": 0.1577, + "step": 32545 + }, + { + "epoch": 0.5804944172939036, + "grad_norm": 0.24828532338142395, + "learning_rate": 2.234262351077698e-05, + "loss": 0.1562, + "step": 32546 + }, + { + "epoch": 0.5805122534156173, + "grad_norm": 0.22820478677749634, + "learning_rate": 2.2341075826529627e-05, + "loss": 0.1282, + "step": 32547 + }, + { + "epoch": 0.580530089537331, + "grad_norm": 0.3037009537220001, + "learning_rate": 2.233952815258918e-05, + "loss": 0.1515, + "step": 32548 + }, + { + "epoch": 0.5805479256590447, + "grad_norm": 0.18708738684654236, + "learning_rate": 2.2337980488961648e-05, + "loss": 0.123, + "step": 32549 + }, + { + "epoch": 0.5805657617807584, + "grad_norm": 0.269703209400177, + "learning_rate": 2.2336432835653025e-05, + "loss": 0.1482, + "step": 32550 + }, + { + "epoch": 0.580583597902472, + "grad_norm": 0.24714531004428864, + "learning_rate": 2.233488519266931e-05, + "loss": 0.0766, + "step": 32551 + }, + { + "epoch": 0.5806014340241857, + "grad_norm": 0.2163441777229309, + "learning_rate": 2.2333337560016497e-05, + "loss": 0.1151, + "step": 32552 + }, + { + "epoch": 0.5806192701458994, + "grad_norm": 0.21663017570972443, + "learning_rate": 2.2331789937700582e-05, + "loss": 0.1529, + "step": 32553 + }, + { + "epoch": 0.5806371062676132, + "grad_norm": 0.27300775051116943, + "learning_rate": 2.2330242325727574e-05, + "loss": 0.1695, + "step": 32554 + }, + { + "epoch": 0.5806549423893269, + "grad_norm": 0.20462629199028015, + "learning_rate": 2.2328694724103475e-05, + "loss": 0.1232, + "step": 32555 + }, + { + "epoch": 0.5806727785110406, + "grad_norm": 0.2590431272983551, + "learning_rate": 2.232714713283428e-05, + "loss": 0.1313, + "step": 32556 + }, + { + "epoch": 0.5806906146327543, + "grad_norm": 0.2840895354747772, + "learning_rate": 2.232559955192597e-05, + "loss": 0.1604, + "step": 32557 + }, + { + "epoch": 0.580708450754468, + "grad_norm": 0.2970852851867676, + "learning_rate": 2.232405198138457e-05, + "loss": 0.1134, + "step": 32558 + }, + { + "epoch": 0.5807262868761817, + "grad_norm": 0.31411030888557434, + "learning_rate": 2.2322504421216074e-05, + "loss": 0.1581, + "step": 32559 + }, + { + "epoch": 0.5807441229978954, + "grad_norm": 0.28287172317504883, + "learning_rate": 2.2320956871426468e-05, + "loss": 0.1238, + "step": 32560 + }, + { + "epoch": 0.580761959119609, + "grad_norm": 0.27553895115852356, + "learning_rate": 2.231940933202176e-05, + "loss": 0.0806, + "step": 32561 + }, + { + "epoch": 0.5807797952413227, + "grad_norm": 0.2762424945831299, + "learning_rate": 2.2317861803007944e-05, + "loss": 0.11, + "step": 32562 + }, + { + "epoch": 0.5807976313630364, + "grad_norm": 0.2315920889377594, + "learning_rate": 2.231631428439103e-05, + "loss": 0.105, + "step": 32563 + }, + { + "epoch": 0.5808154674847501, + "grad_norm": 0.23663736879825592, + "learning_rate": 2.2314766776177007e-05, + "loss": 0.1547, + "step": 32564 + }, + { + "epoch": 0.5808333036064638, + "grad_norm": 0.22240598499774933, + "learning_rate": 2.2313219278371876e-05, + "loss": 0.1086, + "step": 32565 + }, + { + "epoch": 0.5808511397281775, + "grad_norm": 0.31067079305648804, + "learning_rate": 2.2311671790981625e-05, + "loss": 0.0923, + "step": 32566 + }, + { + "epoch": 0.5808689758498912, + "grad_norm": 0.2335953712463379, + "learning_rate": 2.2310124314012272e-05, + "loss": 0.1251, + "step": 32567 + }, + { + "epoch": 0.5808868119716049, + "grad_norm": 0.30275240540504456, + "learning_rate": 2.2308576847469802e-05, + "loss": 0.1347, + "step": 32568 + }, + { + "epoch": 0.5809046480933185, + "grad_norm": 0.25733524560928345, + "learning_rate": 2.2307029391360224e-05, + "loss": 0.0968, + "step": 32569 + }, + { + "epoch": 0.5809224842150322, + "grad_norm": 0.30112549662590027, + "learning_rate": 2.2305481945689517e-05, + "loss": 0.1442, + "step": 32570 + }, + { + "epoch": 0.580940320336746, + "grad_norm": 0.2403465062379837, + "learning_rate": 2.2303934510463706e-05, + "loss": 0.1165, + "step": 32571 + }, + { + "epoch": 0.5809581564584597, + "grad_norm": 0.24589656293392181, + "learning_rate": 2.2302387085688776e-05, + "loss": 0.1269, + "step": 32572 + }, + { + "epoch": 0.5809759925801734, + "grad_norm": 0.23497413098812103, + "learning_rate": 2.2300839671370723e-05, + "loss": 0.1373, + "step": 32573 + }, + { + "epoch": 0.5809938287018871, + "grad_norm": 0.21979612112045288, + "learning_rate": 2.229929226751554e-05, + "loss": 0.1223, + "step": 32574 + }, + { + "epoch": 0.5810116648236008, + "grad_norm": 0.2341279536485672, + "learning_rate": 2.2297744874129235e-05, + "loss": 0.1635, + "step": 32575 + }, + { + "epoch": 0.5810295009453145, + "grad_norm": 0.4703749716281891, + "learning_rate": 2.229619749121781e-05, + "loss": 0.1178, + "step": 32576 + }, + { + "epoch": 0.5810473370670282, + "grad_norm": 0.2579607367515564, + "learning_rate": 2.2294650118787262e-05, + "loss": 0.1474, + "step": 32577 + }, + { + "epoch": 0.5810651731887418, + "grad_norm": 0.3578774034976959, + "learning_rate": 2.229310275684358e-05, + "loss": 0.1239, + "step": 32578 + }, + { + "epoch": 0.5810830093104555, + "grad_norm": 0.2348346710205078, + "learning_rate": 2.2291555405392756e-05, + "loss": 0.1656, + "step": 32579 + }, + { + "epoch": 0.5811008454321692, + "grad_norm": 0.215261310338974, + "learning_rate": 2.229000806444081e-05, + "loss": 0.082, + "step": 32580 + }, + { + "epoch": 0.5811186815538829, + "grad_norm": 0.2548074722290039, + "learning_rate": 2.2288460733993723e-05, + "loss": 0.1051, + "step": 32581 + }, + { + "epoch": 0.5811365176755966, + "grad_norm": 0.31394532322883606, + "learning_rate": 2.2286913414057505e-05, + "loss": 0.0837, + "step": 32582 + }, + { + "epoch": 0.5811543537973103, + "grad_norm": 0.19115255773067474, + "learning_rate": 2.2285366104638136e-05, + "loss": 0.0986, + "step": 32583 + }, + { + "epoch": 0.581172189919024, + "grad_norm": 0.28062525391578674, + "learning_rate": 2.2283818805741637e-05, + "loss": 0.1476, + "step": 32584 + }, + { + "epoch": 0.5811900260407377, + "grad_norm": 0.24052348732948303, + "learning_rate": 2.2282271517373995e-05, + "loss": 0.1201, + "step": 32585 + }, + { + "epoch": 0.5812078621624513, + "grad_norm": 0.4481671154499054, + "learning_rate": 2.228072423954121e-05, + "loss": 0.2107, + "step": 32586 + }, + { + "epoch": 0.581225698284165, + "grad_norm": 0.44187813997268677, + "learning_rate": 2.227917697224926e-05, + "loss": 0.1524, + "step": 32587 + }, + { + "epoch": 0.5812435344058788, + "grad_norm": 0.27583783864974976, + "learning_rate": 2.2277629715504175e-05, + "loss": 0.176, + "step": 32588 + }, + { + "epoch": 0.5812613705275925, + "grad_norm": 0.23410040140151978, + "learning_rate": 2.2276082469311932e-05, + "loss": 0.1377, + "step": 32589 + }, + { + "epoch": 0.5812792066493062, + "grad_norm": 0.3143663704395294, + "learning_rate": 2.2274535233678535e-05, + "loss": 0.1447, + "step": 32590 + }, + { + "epoch": 0.5812970427710199, + "grad_norm": 0.20452502369880676, + "learning_rate": 2.2272988008609984e-05, + "loss": 0.156, + "step": 32591 + }, + { + "epoch": 0.5813148788927336, + "grad_norm": 0.23313471674919128, + "learning_rate": 2.2271440794112264e-05, + "loss": 0.1153, + "step": 32592 + }, + { + "epoch": 0.5813327150144473, + "grad_norm": 0.2702648937702179, + "learning_rate": 2.2269893590191395e-05, + "loss": 0.1148, + "step": 32593 + }, + { + "epoch": 0.581350551136161, + "grad_norm": 0.19469690322875977, + "learning_rate": 2.2268346396853354e-05, + "loss": 0.1192, + "step": 32594 + }, + { + "epoch": 0.5813683872578747, + "grad_norm": 0.31677988171577454, + "learning_rate": 2.2266799214104144e-05, + "loss": 0.1142, + "step": 32595 + }, + { + "epoch": 0.5813862233795883, + "grad_norm": 0.8086874485015869, + "learning_rate": 2.2265252041949765e-05, + "loss": 0.1796, + "step": 32596 + }, + { + "epoch": 0.581404059501302, + "grad_norm": 0.2221411019563675, + "learning_rate": 2.2263704880396212e-05, + "loss": 0.0916, + "step": 32597 + }, + { + "epoch": 0.5814218956230157, + "grad_norm": 0.27320221066474915, + "learning_rate": 2.226215772944949e-05, + "loss": 0.152, + "step": 32598 + }, + { + "epoch": 0.5814397317447294, + "grad_norm": 0.2692977488040924, + "learning_rate": 2.2260610589115595e-05, + "loss": 0.0969, + "step": 32599 + }, + { + "epoch": 0.5814575678664431, + "grad_norm": 0.2220585197210312, + "learning_rate": 2.2259063459400503e-05, + "loss": 0.1319, + "step": 32600 + }, + { + "epoch": 0.5814754039881568, + "grad_norm": 0.26261645555496216, + "learning_rate": 2.2257516340310237e-05, + "loss": 0.1536, + "step": 32601 + }, + { + "epoch": 0.5814932401098705, + "grad_norm": 0.2868455648422241, + "learning_rate": 2.2255969231850784e-05, + "loss": 0.1225, + "step": 32602 + }, + { + "epoch": 0.5815110762315842, + "grad_norm": 0.24366708099842072, + "learning_rate": 2.2254422134028146e-05, + "loss": 0.1646, + "step": 32603 + }, + { + "epoch": 0.5815289123532978, + "grad_norm": 0.26461949944496155, + "learning_rate": 2.2252875046848318e-05, + "loss": 0.1334, + "step": 32604 + }, + { + "epoch": 0.5815467484750116, + "grad_norm": 0.2557846009731293, + "learning_rate": 2.225132797031728e-05, + "loss": 0.1281, + "step": 32605 + }, + { + "epoch": 0.5815645845967253, + "grad_norm": 0.23969800770282745, + "learning_rate": 2.224978090444106e-05, + "loss": 0.1209, + "step": 32606 + }, + { + "epoch": 0.581582420718439, + "grad_norm": 0.24777325987815857, + "learning_rate": 2.2248233849225636e-05, + "loss": 0.1574, + "step": 32607 + }, + { + "epoch": 0.5816002568401527, + "grad_norm": 0.26014086604118347, + "learning_rate": 2.2246686804677004e-05, + "loss": 0.1713, + "step": 32608 + }, + { + "epoch": 0.5816180929618664, + "grad_norm": 0.19621795415878296, + "learning_rate": 2.2245139770801163e-05, + "loss": 0.1108, + "step": 32609 + }, + { + "epoch": 0.5816359290835801, + "grad_norm": 0.14991632103919983, + "learning_rate": 2.224359274760411e-05, + "loss": 0.0583, + "step": 32610 + }, + { + "epoch": 0.5816537652052938, + "grad_norm": 0.24359093606472015, + "learning_rate": 2.2242045735091856e-05, + "loss": 0.07, + "step": 32611 + }, + { + "epoch": 0.5816716013270075, + "grad_norm": 0.24627834558486938, + "learning_rate": 2.2240498733270377e-05, + "loss": 0.1371, + "step": 32612 + }, + { + "epoch": 0.5816894374487211, + "grad_norm": 0.2858821749687195, + "learning_rate": 2.223895174214567e-05, + "loss": 0.1467, + "step": 32613 + }, + { + "epoch": 0.5817072735704348, + "grad_norm": 0.29005980491638184, + "learning_rate": 2.2237404761723755e-05, + "loss": 0.1175, + "step": 32614 + }, + { + "epoch": 0.5817251096921485, + "grad_norm": 0.22842881083488464, + "learning_rate": 2.223585779201061e-05, + "loss": 0.1225, + "step": 32615 + }, + { + "epoch": 0.5817429458138622, + "grad_norm": 0.2409631758928299, + "learning_rate": 2.223431083301223e-05, + "loss": 0.1331, + "step": 32616 + }, + { + "epoch": 0.5817607819355759, + "grad_norm": 0.2476070672273636, + "learning_rate": 2.223276388473462e-05, + "loss": 0.1332, + "step": 32617 + }, + { + "epoch": 0.5817786180572896, + "grad_norm": 0.2819904685020447, + "learning_rate": 2.2231216947183763e-05, + "loss": 0.0912, + "step": 32618 + }, + { + "epoch": 0.5817964541790033, + "grad_norm": 0.25920283794403076, + "learning_rate": 2.2229670020365677e-05, + "loss": 0.1286, + "step": 32619 + }, + { + "epoch": 0.581814290300717, + "grad_norm": 0.3017318546772003, + "learning_rate": 2.222812310428635e-05, + "loss": 0.1153, + "step": 32620 + }, + { + "epoch": 0.5818321264224308, + "grad_norm": 0.19782114028930664, + "learning_rate": 2.222657619895177e-05, + "loss": 0.0501, + "step": 32621 + }, + { + "epoch": 0.5818499625441445, + "grad_norm": 0.24279803037643433, + "learning_rate": 2.2225029304367928e-05, + "loss": 0.1084, + "step": 32622 + }, + { + "epoch": 0.5818677986658581, + "grad_norm": 0.36125853657722473, + "learning_rate": 2.2223482420540842e-05, + "loss": 0.1765, + "step": 32623 + }, + { + "epoch": 0.5818856347875718, + "grad_norm": 0.2242000550031662, + "learning_rate": 2.2221935547476493e-05, + "loss": 0.1181, + "step": 32624 + }, + { + "epoch": 0.5819034709092855, + "grad_norm": 0.2644370496273041, + "learning_rate": 2.2220388685180888e-05, + "loss": 0.1236, + "step": 32625 + }, + { + "epoch": 0.5819213070309992, + "grad_norm": 0.2659998834133148, + "learning_rate": 2.2218841833660005e-05, + "loss": 0.1615, + "step": 32626 + }, + { + "epoch": 0.5819391431527129, + "grad_norm": 0.24852383136749268, + "learning_rate": 2.221729499291986e-05, + "loss": 0.1483, + "step": 32627 + }, + { + "epoch": 0.5819569792744266, + "grad_norm": 0.21230261027812958, + "learning_rate": 2.2215748162966445e-05, + "loss": 0.1627, + "step": 32628 + }, + { + "epoch": 0.5819748153961403, + "grad_norm": 0.2367599904537201, + "learning_rate": 2.2214201343805744e-05, + "loss": 0.0937, + "step": 32629 + }, + { + "epoch": 0.581992651517854, + "grad_norm": 0.32215067744255066, + "learning_rate": 2.2212654535443762e-05, + "loss": 0.1222, + "step": 32630 + }, + { + "epoch": 0.5820104876395676, + "grad_norm": 0.21733912825584412, + "learning_rate": 2.2211107737886488e-05, + "loss": 0.1288, + "step": 32631 + }, + { + "epoch": 0.5820283237612813, + "grad_norm": 0.32909077405929565, + "learning_rate": 2.2209560951139936e-05, + "loss": 0.1802, + "step": 32632 + }, + { + "epoch": 0.582046159882995, + "grad_norm": 0.2339247465133667, + "learning_rate": 2.2208014175210083e-05, + "loss": 0.0921, + "step": 32633 + }, + { + "epoch": 0.5820639960047087, + "grad_norm": 0.3000536561012268, + "learning_rate": 2.220646741010294e-05, + "loss": 0.1533, + "step": 32634 + }, + { + "epoch": 0.5820818321264224, + "grad_norm": 0.18525777757167816, + "learning_rate": 2.2204920655824478e-05, + "loss": 0.1147, + "step": 32635 + }, + { + "epoch": 0.5820996682481361, + "grad_norm": 0.2412268966436386, + "learning_rate": 2.2203373912380717e-05, + "loss": 0.0987, + "step": 32636 + }, + { + "epoch": 0.5821175043698498, + "grad_norm": 0.2635987102985382, + "learning_rate": 2.2201827179777643e-05, + "loss": 0.184, + "step": 32637 + }, + { + "epoch": 0.5821353404915636, + "grad_norm": 0.33686697483062744, + "learning_rate": 2.220028045802126e-05, + "loss": 0.1325, + "step": 32638 + }, + { + "epoch": 0.5821531766132773, + "grad_norm": 0.21539783477783203, + "learning_rate": 2.2198733747117546e-05, + "loss": 0.1264, + "step": 32639 + }, + { + "epoch": 0.582171012734991, + "grad_norm": 0.314988911151886, + "learning_rate": 2.2197187047072514e-05, + "loss": 0.18, + "step": 32640 + }, + { + "epoch": 0.5821888488567046, + "grad_norm": 0.2584691643714905, + "learning_rate": 2.2195640357892156e-05, + "loss": 0.1548, + "step": 32641 + }, + { + "epoch": 0.5822066849784183, + "grad_norm": 0.2685898244380951, + "learning_rate": 2.2194093679582464e-05, + "loss": 0.1427, + "step": 32642 + }, + { + "epoch": 0.582224521100132, + "grad_norm": 0.40392282605171204, + "learning_rate": 2.219254701214943e-05, + "loss": 0.1509, + "step": 32643 + }, + { + "epoch": 0.5822423572218457, + "grad_norm": 0.4018021523952484, + "learning_rate": 2.2191000355599053e-05, + "loss": 0.171, + "step": 32644 + }, + { + "epoch": 0.5822601933435594, + "grad_norm": 0.2413187026977539, + "learning_rate": 2.2189453709937327e-05, + "loss": 0.106, + "step": 32645 + }, + { + "epoch": 0.5822780294652731, + "grad_norm": 0.19793254137039185, + "learning_rate": 2.2187907075170255e-05, + "loss": 0.1146, + "step": 32646 + }, + { + "epoch": 0.5822958655869868, + "grad_norm": 0.3355720639228821, + "learning_rate": 2.218636045130383e-05, + "loss": 0.1907, + "step": 32647 + }, + { + "epoch": 0.5823137017087004, + "grad_norm": 0.2272433191537857, + "learning_rate": 2.218481383834403e-05, + "loss": 0.1571, + "step": 32648 + }, + { + "epoch": 0.5823315378304141, + "grad_norm": 0.251446396112442, + "learning_rate": 2.2183267236296874e-05, + "loss": 0.0966, + "step": 32649 + }, + { + "epoch": 0.5823493739521278, + "grad_norm": 0.21056394279003143, + "learning_rate": 2.218172064516835e-05, + "loss": 0.1622, + "step": 32650 + }, + { + "epoch": 0.5823672100738415, + "grad_norm": 0.33742034435272217, + "learning_rate": 2.218017406496444e-05, + "loss": 0.2206, + "step": 32651 + }, + { + "epoch": 0.5823850461955552, + "grad_norm": 0.2827896475791931, + "learning_rate": 2.2178627495691147e-05, + "loss": 0.1065, + "step": 32652 + }, + { + "epoch": 0.5824028823172689, + "grad_norm": 0.2870829105377197, + "learning_rate": 2.217708093735448e-05, + "loss": 0.153, + "step": 32653 + }, + { + "epoch": 0.5824207184389826, + "grad_norm": 0.27159231901168823, + "learning_rate": 2.217553438996042e-05, + "loss": 0.1211, + "step": 32654 + }, + { + "epoch": 0.5824385545606964, + "grad_norm": 0.23261034488677979, + "learning_rate": 2.2173987853514964e-05, + "loss": 0.1223, + "step": 32655 + }, + { + "epoch": 0.5824563906824101, + "grad_norm": 0.2524707019329071, + "learning_rate": 2.217244132802411e-05, + "loss": 0.1129, + "step": 32656 + }, + { + "epoch": 0.5824742268041238, + "grad_norm": 0.29308706521987915, + "learning_rate": 2.2170894813493836e-05, + "loss": 0.1545, + "step": 32657 + }, + { + "epoch": 0.5824920629258374, + "grad_norm": 0.26639047265052795, + "learning_rate": 2.216934830993016e-05, + "loss": 0.1401, + "step": 32658 + }, + { + "epoch": 0.5825098990475511, + "grad_norm": 0.3612046539783478, + "learning_rate": 2.216780181733907e-05, + "loss": 0.1245, + "step": 32659 + }, + { + "epoch": 0.5825277351692648, + "grad_norm": 0.30385059118270874, + "learning_rate": 2.216625533572656e-05, + "loss": 0.1333, + "step": 32660 + }, + { + "epoch": 0.5825455712909785, + "grad_norm": 0.31221461296081543, + "learning_rate": 2.216470886509861e-05, + "loss": 0.1156, + "step": 32661 + }, + { + "epoch": 0.5825634074126922, + "grad_norm": 0.27799734473228455, + "learning_rate": 2.2163162405461242e-05, + "loss": 0.1421, + "step": 32662 + }, + { + "epoch": 0.5825812435344059, + "grad_norm": 0.3311552405357361, + "learning_rate": 2.2161615956820434e-05, + "loss": 0.162, + "step": 32663 + }, + { + "epoch": 0.5825990796561196, + "grad_norm": 0.2670689821243286, + "learning_rate": 2.2160069519182177e-05, + "loss": 0.1387, + "step": 32664 + }, + { + "epoch": 0.5826169157778333, + "grad_norm": 0.2806749939918518, + "learning_rate": 2.2158523092552476e-05, + "loss": 0.1508, + "step": 32665 + }, + { + "epoch": 0.5826347518995469, + "grad_norm": 0.22325634956359863, + "learning_rate": 2.2156976676937313e-05, + "loss": 0.1797, + "step": 32666 + }, + { + "epoch": 0.5826525880212606, + "grad_norm": 0.2963971495628357, + "learning_rate": 2.21554302723427e-05, + "loss": 0.1076, + "step": 32667 + }, + { + "epoch": 0.5826704241429743, + "grad_norm": 0.5901766419410706, + "learning_rate": 2.2153883878774624e-05, + "loss": 0.154, + "step": 32668 + }, + { + "epoch": 0.582688260264688, + "grad_norm": 0.28280287981033325, + "learning_rate": 2.2152337496239073e-05, + "loss": 0.1569, + "step": 32669 + }, + { + "epoch": 0.5827060963864017, + "grad_norm": 0.275160014629364, + "learning_rate": 2.2150791124742037e-05, + "loss": 0.1821, + "step": 32670 + }, + { + "epoch": 0.5827239325081154, + "grad_norm": 0.2983241081237793, + "learning_rate": 2.214924476428953e-05, + "loss": 0.1412, + "step": 32671 + }, + { + "epoch": 0.5827417686298292, + "grad_norm": 0.26532986760139465, + "learning_rate": 2.2147698414887528e-05, + "loss": 0.1163, + "step": 32672 + }, + { + "epoch": 0.5827596047515429, + "grad_norm": 0.19336272776126862, + "learning_rate": 2.2146152076542038e-05, + "loss": 0.1103, + "step": 32673 + }, + { + "epoch": 0.5827774408732566, + "grad_norm": 0.23490720987319946, + "learning_rate": 2.2144605749259038e-05, + "loss": 0.0891, + "step": 32674 + }, + { + "epoch": 0.5827952769949702, + "grad_norm": 0.25227001309394836, + "learning_rate": 2.214305943304454e-05, + "loss": 0.1409, + "step": 32675 + }, + { + "epoch": 0.5828131131166839, + "grad_norm": 0.23049236834049225, + "learning_rate": 2.2141513127904533e-05, + "loss": 0.1073, + "step": 32676 + }, + { + "epoch": 0.5828309492383976, + "grad_norm": 0.249522864818573, + "learning_rate": 2.2139966833845012e-05, + "loss": 0.1197, + "step": 32677 + }, + { + "epoch": 0.5828487853601113, + "grad_norm": 0.2541482448577881, + "learning_rate": 2.213842055087195e-05, + "loss": 0.1539, + "step": 32678 + }, + { + "epoch": 0.582866621481825, + "grad_norm": 0.2356705665588379, + "learning_rate": 2.213687427899137e-05, + "loss": 0.1259, + "step": 32679 + }, + { + "epoch": 0.5828844576035387, + "grad_norm": 0.3538076877593994, + "learning_rate": 2.2135328018209255e-05, + "loss": 0.1501, + "step": 32680 + }, + { + "epoch": 0.5829022937252524, + "grad_norm": 0.30948957800865173, + "learning_rate": 2.2133781768531597e-05, + "loss": 0.172, + "step": 32681 + }, + { + "epoch": 0.5829201298469661, + "grad_norm": 0.22080104053020477, + "learning_rate": 2.2132235529964392e-05, + "loss": 0.13, + "step": 32682 + }, + { + "epoch": 0.5829379659686797, + "grad_norm": 0.2092370241880417, + "learning_rate": 2.2130689302513624e-05, + "loss": 0.0844, + "step": 32683 + }, + { + "epoch": 0.5829558020903934, + "grad_norm": 0.3171740174293518, + "learning_rate": 2.2129143086185306e-05, + "loss": 0.1485, + "step": 32684 + }, + { + "epoch": 0.5829736382121071, + "grad_norm": 0.2533302903175354, + "learning_rate": 2.2127596880985413e-05, + "loss": 0.1536, + "step": 32685 + }, + { + "epoch": 0.5829914743338208, + "grad_norm": 0.24403366446495056, + "learning_rate": 2.2126050686919954e-05, + "loss": 0.1583, + "step": 32686 + }, + { + "epoch": 0.5830093104555345, + "grad_norm": 0.30744120478630066, + "learning_rate": 2.2124504503994902e-05, + "loss": 0.0661, + "step": 32687 + }, + { + "epoch": 0.5830271465772482, + "grad_norm": 0.24740423262119293, + "learning_rate": 2.212295833221628e-05, + "loss": 0.1025, + "step": 32688 + }, + { + "epoch": 0.583044982698962, + "grad_norm": 0.32939428091049194, + "learning_rate": 2.2121412171590058e-05, + "loss": 0.1427, + "step": 32689 + }, + { + "epoch": 0.5830628188206757, + "grad_norm": 0.35520297288894653, + "learning_rate": 2.2119866022122242e-05, + "loss": 0.1687, + "step": 32690 + }, + { + "epoch": 0.5830806549423894, + "grad_norm": 0.30466994643211365, + "learning_rate": 2.211831988381881e-05, + "loss": 0.1658, + "step": 32691 + }, + { + "epoch": 0.583098491064103, + "grad_norm": 0.21075256168842316, + "learning_rate": 2.211677375668577e-05, + "loss": 0.1554, + "step": 32692 + }, + { + "epoch": 0.5831163271858167, + "grad_norm": 0.2382170408964157, + "learning_rate": 2.211522764072911e-05, + "loss": 0.0959, + "step": 32693 + }, + { + "epoch": 0.5831341633075304, + "grad_norm": 0.3092350661754608, + "learning_rate": 2.2113681535954828e-05, + "loss": 0.1642, + "step": 32694 + }, + { + "epoch": 0.5831519994292441, + "grad_norm": 0.2672624886035919, + "learning_rate": 2.2112135442368913e-05, + "loss": 0.1401, + "step": 32695 + }, + { + "epoch": 0.5831698355509578, + "grad_norm": 0.34344083070755005, + "learning_rate": 2.211058935997735e-05, + "loss": 0.1362, + "step": 32696 + }, + { + "epoch": 0.5831876716726715, + "grad_norm": 0.2562764585018158, + "learning_rate": 2.2109043288786148e-05, + "loss": 0.1317, + "step": 32697 + }, + { + "epoch": 0.5832055077943852, + "grad_norm": 0.32705238461494446, + "learning_rate": 2.2107497228801295e-05, + "loss": 0.161, + "step": 32698 + }, + { + "epoch": 0.5832233439160989, + "grad_norm": 0.3412044644355774, + "learning_rate": 2.2105951180028776e-05, + "loss": 0.1686, + "step": 32699 + }, + { + "epoch": 0.5832411800378126, + "grad_norm": 0.3075982630252838, + "learning_rate": 2.210440514247459e-05, + "loss": 0.1191, + "step": 32700 + }, + { + "epoch": 0.5832590161595262, + "grad_norm": 0.2473858892917633, + "learning_rate": 2.210285911614473e-05, + "loss": 0.1503, + "step": 32701 + }, + { + "epoch": 0.5832768522812399, + "grad_norm": 0.22259639203548431, + "learning_rate": 2.2101313101045193e-05, + "loss": 0.1348, + "step": 32702 + }, + { + "epoch": 0.5832946884029536, + "grad_norm": 0.25709596276283264, + "learning_rate": 2.2099767097181968e-05, + "loss": 0.1407, + "step": 32703 + }, + { + "epoch": 0.5833125245246673, + "grad_norm": 0.26394790410995483, + "learning_rate": 2.2098221104561036e-05, + "loss": 0.1161, + "step": 32704 + }, + { + "epoch": 0.583330360646381, + "grad_norm": 0.318134605884552, + "learning_rate": 2.2096675123188416e-05, + "loss": 0.1573, + "step": 32705 + }, + { + "epoch": 0.5833481967680948, + "grad_norm": 0.2347719967365265, + "learning_rate": 2.2095129153070076e-05, + "loss": 0.1192, + "step": 32706 + }, + { + "epoch": 0.5833660328898085, + "grad_norm": 0.2109861522912979, + "learning_rate": 2.2093583194212025e-05, + "loss": 0.1126, + "step": 32707 + }, + { + "epoch": 0.5833838690115222, + "grad_norm": 0.33003291487693787, + "learning_rate": 2.2092037246620252e-05, + "loss": 0.1619, + "step": 32708 + }, + { + "epoch": 0.5834017051332359, + "grad_norm": 0.23176167905330658, + "learning_rate": 2.2090491310300732e-05, + "loss": 0.1069, + "step": 32709 + }, + { + "epoch": 0.5834195412549495, + "grad_norm": 0.25414010882377625, + "learning_rate": 2.2088945385259486e-05, + "loss": 0.1715, + "step": 32710 + }, + { + "epoch": 0.5834373773766632, + "grad_norm": 0.21339866518974304, + "learning_rate": 2.208739947150249e-05, + "loss": 0.1374, + "step": 32711 + }, + { + "epoch": 0.5834552134983769, + "grad_norm": 0.30355340242385864, + "learning_rate": 2.2085853569035737e-05, + "loss": 0.1231, + "step": 32712 + }, + { + "epoch": 0.5834730496200906, + "grad_norm": 0.2227211892604828, + "learning_rate": 2.2084307677865225e-05, + "loss": 0.117, + "step": 32713 + }, + { + "epoch": 0.5834908857418043, + "grad_norm": 0.40996474027633667, + "learning_rate": 2.208276179799694e-05, + "loss": 0.2058, + "step": 32714 + }, + { + "epoch": 0.583508721863518, + "grad_norm": 0.19010716676712036, + "learning_rate": 2.2081215929436882e-05, + "loss": 0.1022, + "step": 32715 + }, + { + "epoch": 0.5835265579852317, + "grad_norm": 0.1919294148683548, + "learning_rate": 2.2079670072191042e-05, + "loss": 0.11, + "step": 32716 + }, + { + "epoch": 0.5835443941069454, + "grad_norm": 0.2056245654821396, + "learning_rate": 2.2078124226265397e-05, + "loss": 0.1287, + "step": 32717 + }, + { + "epoch": 0.583562230228659, + "grad_norm": 0.22734293341636658, + "learning_rate": 2.2076578391665962e-05, + "loss": 0.1322, + "step": 32718 + }, + { + "epoch": 0.5835800663503727, + "grad_norm": 0.20983117818832397, + "learning_rate": 2.207503256839872e-05, + "loss": 0.1232, + "step": 32719 + }, + { + "epoch": 0.5835979024720864, + "grad_norm": 0.2745174467563629, + "learning_rate": 2.2073486756469658e-05, + "loss": 0.1495, + "step": 32720 + }, + { + "epoch": 0.5836157385938001, + "grad_norm": 0.2101171612739563, + "learning_rate": 2.2071940955884776e-05, + "loss": 0.0974, + "step": 32721 + }, + { + "epoch": 0.5836335747155138, + "grad_norm": 0.2162742018699646, + "learning_rate": 2.207039516665005e-05, + "loss": 0.1293, + "step": 32722 + }, + { + "epoch": 0.5836514108372276, + "grad_norm": 0.34841805696487427, + "learning_rate": 2.20688493887715e-05, + "loss": 0.1794, + "step": 32723 + }, + { + "epoch": 0.5836692469589413, + "grad_norm": 0.23516437411308289, + "learning_rate": 2.20673036222551e-05, + "loss": 0.1815, + "step": 32724 + }, + { + "epoch": 0.583687083080655, + "grad_norm": 0.18162007629871368, + "learning_rate": 2.206575786710684e-05, + "loss": 0.0877, + "step": 32725 + }, + { + "epoch": 0.5837049192023687, + "grad_norm": 0.31141769886016846, + "learning_rate": 2.2064212123332707e-05, + "loss": 0.1652, + "step": 32726 + }, + { + "epoch": 0.5837227553240824, + "grad_norm": 0.251302570104599, + "learning_rate": 2.2062666390938714e-05, + "loss": 0.1381, + "step": 32727 + }, + { + "epoch": 0.583740591445796, + "grad_norm": 0.2424483299255371, + "learning_rate": 2.2061120669930836e-05, + "loss": 0.1003, + "step": 32728 + }, + { + "epoch": 0.5837584275675097, + "grad_norm": 0.31126800179481506, + "learning_rate": 2.2059574960315073e-05, + "loss": 0.1488, + "step": 32729 + }, + { + "epoch": 0.5837762636892234, + "grad_norm": 0.3306005299091339, + "learning_rate": 2.2058029262097402e-05, + "loss": 0.1885, + "step": 32730 + }, + { + "epoch": 0.5837940998109371, + "grad_norm": 0.2621316909790039, + "learning_rate": 2.2056483575283837e-05, + "loss": 0.163, + "step": 32731 + }, + { + "epoch": 0.5838119359326508, + "grad_norm": 0.30854377150535583, + "learning_rate": 2.205493789988036e-05, + "loss": 0.125, + "step": 32732 + }, + { + "epoch": 0.5838297720543645, + "grad_norm": 0.23702551424503326, + "learning_rate": 2.2053392235892956e-05, + "loss": 0.141, + "step": 32733 + }, + { + "epoch": 0.5838476081760782, + "grad_norm": 0.2055118978023529, + "learning_rate": 2.205184658332762e-05, + "loss": 0.1041, + "step": 32734 + }, + { + "epoch": 0.5838654442977919, + "grad_norm": 0.2355274111032486, + "learning_rate": 2.2050300942190337e-05, + "loss": 0.1114, + "step": 32735 + }, + { + "epoch": 0.5838832804195055, + "grad_norm": 0.25135165452957153, + "learning_rate": 2.2048755312487122e-05, + "loss": 0.1709, + "step": 32736 + }, + { + "epoch": 0.5839011165412192, + "grad_norm": 0.2206704020500183, + "learning_rate": 2.2047209694223947e-05, + "loss": 0.1723, + "step": 32737 + }, + { + "epoch": 0.5839189526629329, + "grad_norm": 0.26961734890937805, + "learning_rate": 2.204566408740681e-05, + "loss": 0.0855, + "step": 32738 + }, + { + "epoch": 0.5839367887846467, + "grad_norm": 0.2480851411819458, + "learning_rate": 2.2044118492041683e-05, + "loss": 0.1264, + "step": 32739 + }, + { + "epoch": 0.5839546249063604, + "grad_norm": 0.34333324432373047, + "learning_rate": 2.204257290813459e-05, + "loss": 0.1268, + "step": 32740 + }, + { + "epoch": 0.5839724610280741, + "grad_norm": 0.2474355250597, + "learning_rate": 2.20410273356915e-05, + "loss": 0.1162, + "step": 32741 + }, + { + "epoch": 0.5839902971497878, + "grad_norm": 0.2821494936943054, + "learning_rate": 2.203948177471841e-05, + "loss": 0.1325, + "step": 32742 + }, + { + "epoch": 0.5840081332715015, + "grad_norm": 0.30933400988578796, + "learning_rate": 2.2037936225221303e-05, + "loss": 0.181, + "step": 32743 + }, + { + "epoch": 0.5840259693932152, + "grad_norm": 0.185426265001297, + "learning_rate": 2.203639068720619e-05, + "loss": 0.1325, + "step": 32744 + }, + { + "epoch": 0.5840438055149288, + "grad_norm": 0.2748141288757324, + "learning_rate": 2.203484516067905e-05, + "loss": 0.1602, + "step": 32745 + }, + { + "epoch": 0.5840616416366425, + "grad_norm": 0.24127641320228577, + "learning_rate": 2.203329964564588e-05, + "loss": 0.1033, + "step": 32746 + }, + { + "epoch": 0.5840794777583562, + "grad_norm": 0.23739008605480194, + "learning_rate": 2.2031754142112652e-05, + "loss": 0.1444, + "step": 32747 + }, + { + "epoch": 0.5840973138800699, + "grad_norm": 0.22942371666431427, + "learning_rate": 2.2030208650085373e-05, + "loss": 0.1023, + "step": 32748 + }, + { + "epoch": 0.5841151500017836, + "grad_norm": 0.1496797502040863, + "learning_rate": 2.202866316957003e-05, + "loss": 0.0817, + "step": 32749 + }, + { + "epoch": 0.5841329861234973, + "grad_norm": 0.2373179942369461, + "learning_rate": 2.202711770057262e-05, + "loss": 0.1269, + "step": 32750 + }, + { + "epoch": 0.584150822245211, + "grad_norm": 0.2223374992609024, + "learning_rate": 2.2025572243099128e-05, + "loss": 0.092, + "step": 32751 + }, + { + "epoch": 0.5841686583669247, + "grad_norm": 0.24049773812294006, + "learning_rate": 2.202402679715554e-05, + "loss": 0.1296, + "step": 32752 + }, + { + "epoch": 0.5841864944886384, + "grad_norm": 0.2524278163909912, + "learning_rate": 2.2022481362747856e-05, + "loss": 0.0977, + "step": 32753 + }, + { + "epoch": 0.584204330610352, + "grad_norm": 0.280766099691391, + "learning_rate": 2.2020935939882067e-05, + "loss": 0.1218, + "step": 32754 + }, + { + "epoch": 0.5842221667320657, + "grad_norm": 0.2215835452079773, + "learning_rate": 2.2019390528564152e-05, + "loss": 0.1263, + "step": 32755 + }, + { + "epoch": 0.5842400028537795, + "grad_norm": 0.17980517446994781, + "learning_rate": 2.2017845128800105e-05, + "loss": 0.0973, + "step": 32756 + }, + { + "epoch": 0.5842578389754932, + "grad_norm": 0.23257394134998322, + "learning_rate": 2.2016299740595927e-05, + "loss": 0.1005, + "step": 32757 + }, + { + "epoch": 0.5842756750972069, + "grad_norm": 0.2657771706581116, + "learning_rate": 2.2014754363957608e-05, + "loss": 0.0448, + "step": 32758 + }, + { + "epoch": 0.5842935112189206, + "grad_norm": 0.21145270764827728, + "learning_rate": 2.2013208998891128e-05, + "loss": 0.0756, + "step": 32759 + }, + { + "epoch": 0.5843113473406343, + "grad_norm": 0.3095283806324005, + "learning_rate": 2.201166364540248e-05, + "loss": 0.1284, + "step": 32760 + }, + { + "epoch": 0.584329183462348, + "grad_norm": 0.2519843578338623, + "learning_rate": 2.201011830349765e-05, + "loss": 0.1153, + "step": 32761 + }, + { + "epoch": 0.5843470195840617, + "grad_norm": 0.2932208478450775, + "learning_rate": 2.2008572973182633e-05, + "loss": 0.1414, + "step": 32762 + }, + { + "epoch": 0.5843648557057753, + "grad_norm": 0.30687496066093445, + "learning_rate": 2.200702765446343e-05, + "loss": 0.174, + "step": 32763 + }, + { + "epoch": 0.584382691827489, + "grad_norm": 0.2801288068294525, + "learning_rate": 2.2005482347346025e-05, + "loss": 0.161, + "step": 32764 + }, + { + "epoch": 0.5844005279492027, + "grad_norm": 0.30687326192855835, + "learning_rate": 2.2003937051836386e-05, + "loss": 0.0798, + "step": 32765 + }, + { + "epoch": 0.5844183640709164, + "grad_norm": 0.24588850140571594, + "learning_rate": 2.2002391767940538e-05, + "loss": 0.1265, + "step": 32766 + }, + { + "epoch": 0.5844362001926301, + "grad_norm": 0.31414589285850525, + "learning_rate": 2.2000846495664453e-05, + "loss": 0.1126, + "step": 32767 + }, + { + "epoch": 0.5844540363143438, + "grad_norm": 0.28846508264541626, + "learning_rate": 2.199930123501412e-05, + "loss": 0.0993, + "step": 32768 + }, + { + "epoch": 0.5844718724360575, + "grad_norm": 0.17866621911525726, + "learning_rate": 2.1997755985995528e-05, + "loss": 0.1037, + "step": 32769 + }, + { + "epoch": 0.5844897085577712, + "grad_norm": 0.28960931301116943, + "learning_rate": 2.1996210748614673e-05, + "loss": 0.1602, + "step": 32770 + }, + { + "epoch": 0.5845075446794848, + "grad_norm": 0.2598286271095276, + "learning_rate": 2.1994665522877547e-05, + "loss": 0.0612, + "step": 32771 + }, + { + "epoch": 0.5845253808011985, + "grad_norm": 0.20510664582252502, + "learning_rate": 2.1993120308790136e-05, + "loss": 0.106, + "step": 32772 + }, + { + "epoch": 0.5845432169229123, + "grad_norm": 0.24668174982070923, + "learning_rate": 2.199157510635843e-05, + "loss": 0.1482, + "step": 32773 + }, + { + "epoch": 0.584561053044626, + "grad_norm": 0.2542809844017029, + "learning_rate": 2.1990029915588406e-05, + "loss": 0.1075, + "step": 32774 + }, + { + "epoch": 0.5845788891663397, + "grad_norm": 0.3045494556427002, + "learning_rate": 2.1988484736486075e-05, + "loss": 0.0926, + "step": 32775 + }, + { + "epoch": 0.5845967252880534, + "grad_norm": 0.23481105268001556, + "learning_rate": 2.1986939569057416e-05, + "loss": 0.0682, + "step": 32776 + }, + { + "epoch": 0.5846145614097671, + "grad_norm": 0.346417635679245, + "learning_rate": 2.198539441330842e-05, + "loss": 0.1392, + "step": 32777 + }, + { + "epoch": 0.5846323975314808, + "grad_norm": 0.24586959183216095, + "learning_rate": 2.198384926924507e-05, + "loss": 0.0864, + "step": 32778 + }, + { + "epoch": 0.5846502336531945, + "grad_norm": 0.24564705789089203, + "learning_rate": 2.198230413687337e-05, + "loss": 0.1379, + "step": 32779 + }, + { + "epoch": 0.5846680697749082, + "grad_norm": 0.2375066876411438, + "learning_rate": 2.1980759016199304e-05, + "loss": 0.0936, + "step": 32780 + }, + { + "epoch": 0.5846859058966218, + "grad_norm": 0.2650754749774933, + "learning_rate": 2.197921390722886e-05, + "loss": 0.1956, + "step": 32781 + }, + { + "epoch": 0.5847037420183355, + "grad_norm": 0.280928373336792, + "learning_rate": 2.197766880996801e-05, + "loss": 0.1248, + "step": 32782 + }, + { + "epoch": 0.5847215781400492, + "grad_norm": 0.18832944333553314, + "learning_rate": 2.197612372442277e-05, + "loss": 0.1073, + "step": 32783 + }, + { + "epoch": 0.5847394142617629, + "grad_norm": 0.303588330745697, + "learning_rate": 2.1974578650599123e-05, + "loss": 0.118, + "step": 32784 + }, + { + "epoch": 0.5847572503834766, + "grad_norm": 0.251571387052536, + "learning_rate": 2.1973033588503054e-05, + "loss": 0.1572, + "step": 32785 + }, + { + "epoch": 0.5847750865051903, + "grad_norm": 0.25179558992385864, + "learning_rate": 2.197148853814055e-05, + "loss": 0.1555, + "step": 32786 + }, + { + "epoch": 0.584792922626904, + "grad_norm": 0.24583813548088074, + "learning_rate": 2.1969943499517595e-05, + "loss": 0.1483, + "step": 32787 + }, + { + "epoch": 0.5848107587486177, + "grad_norm": 0.19594620168209076, + "learning_rate": 2.1968398472640196e-05, + "loss": 0.1307, + "step": 32788 + }, + { + "epoch": 0.5848285948703313, + "grad_norm": 0.384545236825943, + "learning_rate": 2.1966853457514322e-05, + "loss": 0.1234, + "step": 32789 + }, + { + "epoch": 0.5848464309920451, + "grad_norm": 0.23136396706104279, + "learning_rate": 2.196530845414598e-05, + "loss": 0.0821, + "step": 32790 + }, + { + "epoch": 0.5848642671137588, + "grad_norm": 0.22859366238117218, + "learning_rate": 2.196376346254114e-05, + "loss": 0.1243, + "step": 32791 + }, + { + "epoch": 0.5848821032354725, + "grad_norm": 0.3239055275917053, + "learning_rate": 2.1962218482705812e-05, + "loss": 0.1712, + "step": 32792 + }, + { + "epoch": 0.5848999393571862, + "grad_norm": 0.2586090564727783, + "learning_rate": 2.1960673514645974e-05, + "loss": 0.1524, + "step": 32793 + }, + { + "epoch": 0.5849177754788999, + "grad_norm": 0.3600848615169525, + "learning_rate": 2.1959128558367617e-05, + "loss": 0.1712, + "step": 32794 + }, + { + "epoch": 0.5849356116006136, + "grad_norm": 0.2688920497894287, + "learning_rate": 2.1957583613876715e-05, + "loss": 0.2064, + "step": 32795 + }, + { + "epoch": 0.5849534477223273, + "grad_norm": 0.2552483081817627, + "learning_rate": 2.195603868117928e-05, + "loss": 0.0927, + "step": 32796 + }, + { + "epoch": 0.584971283844041, + "grad_norm": 0.2931370139122009, + "learning_rate": 2.195449376028129e-05, + "loss": 0.1073, + "step": 32797 + }, + { + "epoch": 0.5849891199657546, + "grad_norm": 0.26654571294784546, + "learning_rate": 2.1952948851188734e-05, + "loss": 0.1286, + "step": 32798 + }, + { + "epoch": 0.5850069560874683, + "grad_norm": 0.3052487373352051, + "learning_rate": 2.1951403953907603e-05, + "loss": 0.138, + "step": 32799 + }, + { + "epoch": 0.585024792209182, + "grad_norm": 0.3467080295085907, + "learning_rate": 2.1949859068443873e-05, + "loss": 0.1843, + "step": 32800 + }, + { + "epoch": 0.5850426283308957, + "grad_norm": 0.23049597442150116, + "learning_rate": 2.194831419480355e-05, + "loss": 0.1662, + "step": 32801 + }, + { + "epoch": 0.5850604644526094, + "grad_norm": 0.42516759037971497, + "learning_rate": 2.194676933299262e-05, + "loss": 0.1521, + "step": 32802 + }, + { + "epoch": 0.5850783005743231, + "grad_norm": 0.2271551638841629, + "learning_rate": 2.194522448301706e-05, + "loss": 0.1575, + "step": 32803 + }, + { + "epoch": 0.5850961366960368, + "grad_norm": 0.2876285910606384, + "learning_rate": 2.194367964488286e-05, + "loss": 0.1419, + "step": 32804 + }, + { + "epoch": 0.5851139728177505, + "grad_norm": 0.24481911957263947, + "learning_rate": 2.194213481859602e-05, + "loss": 0.1039, + "step": 32805 + }, + { + "epoch": 0.5851318089394641, + "grad_norm": 0.32447394728660583, + "learning_rate": 2.1940590004162524e-05, + "loss": 0.1707, + "step": 32806 + }, + { + "epoch": 0.585149645061178, + "grad_norm": 0.2697230577468872, + "learning_rate": 2.1939045201588357e-05, + "loss": 0.1636, + "step": 32807 + }, + { + "epoch": 0.5851674811828916, + "grad_norm": 0.18803580105304718, + "learning_rate": 2.19375004108795e-05, + "loss": 0.1165, + "step": 32808 + }, + { + "epoch": 0.5851853173046053, + "grad_norm": 0.28365233540534973, + "learning_rate": 2.1935955632041957e-05, + "loss": 0.1859, + "step": 32809 + }, + { + "epoch": 0.585203153426319, + "grad_norm": 0.31464138627052307, + "learning_rate": 2.193441086508171e-05, + "loss": 0.1167, + "step": 32810 + }, + { + "epoch": 0.5852209895480327, + "grad_norm": 0.2586153447628021, + "learning_rate": 2.1932866110004736e-05, + "loss": 0.1127, + "step": 32811 + }, + { + "epoch": 0.5852388256697464, + "grad_norm": 0.2822292745113373, + "learning_rate": 2.1931321366817042e-05, + "loss": 0.0737, + "step": 32812 + }, + { + "epoch": 0.5852566617914601, + "grad_norm": 0.29303503036499023, + "learning_rate": 2.1929776635524594e-05, + "loss": 0.0901, + "step": 32813 + }, + { + "epoch": 0.5852744979131738, + "grad_norm": 0.1761719435453415, + "learning_rate": 2.1928231916133407e-05, + "loss": 0.114, + "step": 32814 + }, + { + "epoch": 0.5852923340348875, + "grad_norm": 0.3096100091934204, + "learning_rate": 2.192668720864945e-05, + "loss": 0.1644, + "step": 32815 + }, + { + "epoch": 0.5853101701566011, + "grad_norm": 0.25290408730506897, + "learning_rate": 2.1925142513078708e-05, + "loss": 0.1218, + "step": 32816 + }, + { + "epoch": 0.5853280062783148, + "grad_norm": 0.34670233726501465, + "learning_rate": 2.1923597829427177e-05, + "loss": 0.1262, + "step": 32817 + }, + { + "epoch": 0.5853458424000285, + "grad_norm": 0.21767553687095642, + "learning_rate": 2.1922053157700848e-05, + "loss": 0.1394, + "step": 32818 + }, + { + "epoch": 0.5853636785217422, + "grad_norm": 0.36319202184677124, + "learning_rate": 2.1920508497905702e-05, + "loss": 0.157, + "step": 32819 + }, + { + "epoch": 0.5853815146434559, + "grad_norm": 0.23719051480293274, + "learning_rate": 2.1918963850047734e-05, + "loss": 0.1393, + "step": 32820 + }, + { + "epoch": 0.5853993507651696, + "grad_norm": 0.2927936315536499, + "learning_rate": 2.1917419214132914e-05, + "loss": 0.1234, + "step": 32821 + }, + { + "epoch": 0.5854171868868833, + "grad_norm": 0.2914182245731354, + "learning_rate": 2.1915874590167252e-05, + "loss": 0.1339, + "step": 32822 + }, + { + "epoch": 0.585435023008597, + "grad_norm": 0.22109076380729675, + "learning_rate": 2.1914329978156724e-05, + "loss": 0.1194, + "step": 32823 + }, + { + "epoch": 0.5854528591303108, + "grad_norm": 0.2898158133029938, + "learning_rate": 2.191278537810732e-05, + "loss": 0.1333, + "step": 32824 + }, + { + "epoch": 0.5854706952520244, + "grad_norm": 0.37327486276626587, + "learning_rate": 2.191124079002502e-05, + "loss": 0.1907, + "step": 32825 + }, + { + "epoch": 0.5854885313737381, + "grad_norm": 0.23889465630054474, + "learning_rate": 2.1909696213915816e-05, + "loss": 0.1153, + "step": 32826 + }, + { + "epoch": 0.5855063674954518, + "grad_norm": 0.25527429580688477, + "learning_rate": 2.1908151649785704e-05, + "loss": 0.1113, + "step": 32827 + }, + { + "epoch": 0.5855242036171655, + "grad_norm": 0.2556540071964264, + "learning_rate": 2.1906607097640666e-05, + "loss": 0.1584, + "step": 32828 + }, + { + "epoch": 0.5855420397388792, + "grad_norm": 0.20545242726802826, + "learning_rate": 2.1905062557486685e-05, + "loss": 0.1078, + "step": 32829 + }, + { + "epoch": 0.5855598758605929, + "grad_norm": 0.2756780683994293, + "learning_rate": 2.1903518029329743e-05, + "loss": 0.1225, + "step": 32830 + }, + { + "epoch": 0.5855777119823066, + "grad_norm": 0.34050872921943665, + "learning_rate": 2.1901973513175844e-05, + "loss": 0.1469, + "step": 32831 + }, + { + "epoch": 0.5855955481040203, + "grad_norm": 0.24623584747314453, + "learning_rate": 2.190042900903096e-05, + "loss": 0.1128, + "step": 32832 + }, + { + "epoch": 0.585613384225734, + "grad_norm": 0.29396793246269226, + "learning_rate": 2.1898884516901088e-05, + "loss": 0.1214, + "step": 32833 + }, + { + "epoch": 0.5856312203474476, + "grad_norm": 0.2876686453819275, + "learning_rate": 2.1897340036792198e-05, + "loss": 0.2136, + "step": 32834 + }, + { + "epoch": 0.5856490564691613, + "grad_norm": 0.3147815763950348, + "learning_rate": 2.1895795568710305e-05, + "loss": 0.1358, + "step": 32835 + }, + { + "epoch": 0.585666892590875, + "grad_norm": 0.24319984018802643, + "learning_rate": 2.189425111266138e-05, + "loss": 0.1103, + "step": 32836 + }, + { + "epoch": 0.5856847287125887, + "grad_norm": 0.3565587103366852, + "learning_rate": 2.1892706668651408e-05, + "loss": 0.1238, + "step": 32837 + }, + { + "epoch": 0.5857025648343024, + "grad_norm": 0.20736780762672424, + "learning_rate": 2.1891162236686373e-05, + "loss": 0.1176, + "step": 32838 + }, + { + "epoch": 0.5857204009560161, + "grad_norm": 0.26560136675834656, + "learning_rate": 2.1889617816772263e-05, + "loss": 0.1397, + "step": 32839 + }, + { + "epoch": 0.5857382370777299, + "grad_norm": 0.23852647840976715, + "learning_rate": 2.188807340891508e-05, + "loss": 0.1234, + "step": 32840 + }, + { + "epoch": 0.5857560731994436, + "grad_norm": 0.23946654796600342, + "learning_rate": 2.1886529013120795e-05, + "loss": 0.1077, + "step": 32841 + }, + { + "epoch": 0.5857739093211572, + "grad_norm": 0.2291734665632248, + "learning_rate": 2.1884984629395405e-05, + "loss": 0.1144, + "step": 32842 + }, + { + "epoch": 0.5857917454428709, + "grad_norm": 0.2994038164615631, + "learning_rate": 2.1883440257744876e-05, + "loss": 0.096, + "step": 32843 + }, + { + "epoch": 0.5858095815645846, + "grad_norm": 0.33408018946647644, + "learning_rate": 2.188189589817522e-05, + "loss": 0.1977, + "step": 32844 + }, + { + "epoch": 0.5858274176862983, + "grad_norm": 0.23636189103126526, + "learning_rate": 2.1880351550692407e-05, + "loss": 0.1266, + "step": 32845 + }, + { + "epoch": 0.585845253808012, + "grad_norm": 0.2544075548648834, + "learning_rate": 2.1878807215302437e-05, + "loss": 0.1381, + "step": 32846 + }, + { + "epoch": 0.5858630899297257, + "grad_norm": 0.26502206921577454, + "learning_rate": 2.1877262892011275e-05, + "loss": 0.1526, + "step": 32847 + }, + { + "epoch": 0.5858809260514394, + "grad_norm": 0.2717975676059723, + "learning_rate": 2.187571858082493e-05, + "loss": 0.1015, + "step": 32848 + }, + { + "epoch": 0.5858987621731531, + "grad_norm": 0.3843843340873718, + "learning_rate": 2.1874174281749383e-05, + "loss": 0.1008, + "step": 32849 + }, + { + "epoch": 0.5859165982948668, + "grad_norm": 0.255064457654953, + "learning_rate": 2.1872629994790612e-05, + "loss": 0.1169, + "step": 32850 + }, + { + "epoch": 0.5859344344165804, + "grad_norm": 0.3429819345474243, + "learning_rate": 2.1871085719954604e-05, + "loss": 0.1476, + "step": 32851 + }, + { + "epoch": 0.5859522705382941, + "grad_norm": 0.2847726345062256, + "learning_rate": 2.186954145724735e-05, + "loss": 0.1478, + "step": 32852 + }, + { + "epoch": 0.5859701066600078, + "grad_norm": 0.2902240753173828, + "learning_rate": 2.186799720667483e-05, + "loss": 0.2203, + "step": 32853 + }, + { + "epoch": 0.5859879427817215, + "grad_norm": 0.34646478295326233, + "learning_rate": 2.1866452968243044e-05, + "loss": 0.0979, + "step": 32854 + }, + { + "epoch": 0.5860057789034352, + "grad_norm": 0.26700881123542786, + "learning_rate": 2.1864908741957965e-05, + "loss": 0.1525, + "step": 32855 + }, + { + "epoch": 0.5860236150251489, + "grad_norm": 0.30656698346138, + "learning_rate": 2.1863364527825575e-05, + "loss": 0.1815, + "step": 32856 + }, + { + "epoch": 0.5860414511468627, + "grad_norm": 0.30681997537612915, + "learning_rate": 2.1861820325851877e-05, + "loss": 0.162, + "step": 32857 + }, + { + "epoch": 0.5860592872685764, + "grad_norm": 0.30271705985069275, + "learning_rate": 2.1860276136042845e-05, + "loss": 0.1931, + "step": 32858 + }, + { + "epoch": 0.5860771233902901, + "grad_norm": 0.2553333342075348, + "learning_rate": 2.1858731958404467e-05, + "loss": 0.1518, + "step": 32859 + }, + { + "epoch": 0.5860949595120037, + "grad_norm": 0.2667693495750427, + "learning_rate": 2.1857187792942717e-05, + "loss": 0.1796, + "step": 32860 + }, + { + "epoch": 0.5861127956337174, + "grad_norm": 0.22701171040534973, + "learning_rate": 2.185564363966361e-05, + "loss": 0.1057, + "step": 32861 + }, + { + "epoch": 0.5861306317554311, + "grad_norm": 0.22908222675323486, + "learning_rate": 2.1854099498573112e-05, + "loss": 0.0846, + "step": 32862 + }, + { + "epoch": 0.5861484678771448, + "grad_norm": 0.30423903465270996, + "learning_rate": 2.1852555369677208e-05, + "loss": 0.1547, + "step": 32863 + }, + { + "epoch": 0.5861663039988585, + "grad_norm": 0.22806040942668915, + "learning_rate": 2.1851011252981878e-05, + "loss": 0.1516, + "step": 32864 + }, + { + "epoch": 0.5861841401205722, + "grad_norm": 0.3106060028076172, + "learning_rate": 2.1849467148493127e-05, + "loss": 0.1529, + "step": 32865 + }, + { + "epoch": 0.5862019762422859, + "grad_norm": 0.2221730500459671, + "learning_rate": 2.1847923056216924e-05, + "loss": 0.0887, + "step": 32866 + }, + { + "epoch": 0.5862198123639996, + "grad_norm": 0.24242134392261505, + "learning_rate": 2.1846378976159266e-05, + "loss": 0.108, + "step": 32867 + }, + { + "epoch": 0.5862376484857132, + "grad_norm": 0.2813638150691986, + "learning_rate": 2.1844834908326133e-05, + "loss": 0.1465, + "step": 32868 + }, + { + "epoch": 0.5862554846074269, + "grad_norm": 0.2639690637588501, + "learning_rate": 2.1843290852723495e-05, + "loss": 0.1462, + "step": 32869 + }, + { + "epoch": 0.5862733207291406, + "grad_norm": 0.2633322775363922, + "learning_rate": 2.1841746809357363e-05, + "loss": 0.1052, + "step": 32870 + }, + { + "epoch": 0.5862911568508543, + "grad_norm": 0.23971325159072876, + "learning_rate": 2.1840202778233716e-05, + "loss": 0.1417, + "step": 32871 + }, + { + "epoch": 0.586308992972568, + "grad_norm": 0.4095028042793274, + "learning_rate": 2.1838658759358525e-05, + "loss": 0.0737, + "step": 32872 + }, + { + "epoch": 0.5863268290942817, + "grad_norm": 0.3204917311668396, + "learning_rate": 2.1837114752737787e-05, + "loss": 0.1108, + "step": 32873 + }, + { + "epoch": 0.5863446652159955, + "grad_norm": 0.2623700201511383, + "learning_rate": 2.1835570758377484e-05, + "loss": 0.125, + "step": 32874 + }, + { + "epoch": 0.5863625013377092, + "grad_norm": 0.37184256315231323, + "learning_rate": 2.1834026776283605e-05, + "loss": 0.1038, + "step": 32875 + }, + { + "epoch": 0.5863803374594229, + "grad_norm": 0.35463324189186096, + "learning_rate": 2.183248280646213e-05, + "loss": 0.0976, + "step": 32876 + }, + { + "epoch": 0.5863981735811366, + "grad_norm": 0.24924468994140625, + "learning_rate": 2.183093884891904e-05, + "loss": 0.0647, + "step": 32877 + }, + { + "epoch": 0.5864160097028502, + "grad_norm": 0.28524017333984375, + "learning_rate": 2.1829394903660334e-05, + "loss": 0.1728, + "step": 32878 + }, + { + "epoch": 0.5864338458245639, + "grad_norm": 0.3683130145072937, + "learning_rate": 2.182785097069199e-05, + "loss": 0.1655, + "step": 32879 + }, + { + "epoch": 0.5864516819462776, + "grad_norm": 0.2757599949836731, + "learning_rate": 2.1826307050019983e-05, + "loss": 0.094, + "step": 32880 + }, + { + "epoch": 0.5864695180679913, + "grad_norm": 0.2617654502391815, + "learning_rate": 2.1824763141650316e-05, + "loss": 0.1363, + "step": 32881 + }, + { + "epoch": 0.586487354189705, + "grad_norm": 0.30277103185653687, + "learning_rate": 2.182321924558895e-05, + "loss": 0.1637, + "step": 32882 + }, + { + "epoch": 0.5865051903114187, + "grad_norm": 0.25579097867012024, + "learning_rate": 2.1821675361841894e-05, + "loss": 0.1479, + "step": 32883 + }, + { + "epoch": 0.5865230264331324, + "grad_norm": 0.24641653895378113, + "learning_rate": 2.1820131490415122e-05, + "loss": 0.0902, + "step": 32884 + }, + { + "epoch": 0.586540862554846, + "grad_norm": 0.2992168068885803, + "learning_rate": 2.1818587631314617e-05, + "loss": 0.1242, + "step": 32885 + }, + { + "epoch": 0.5865586986765597, + "grad_norm": 0.17372293770313263, + "learning_rate": 2.1817043784546357e-05, + "loss": 0.0995, + "step": 32886 + }, + { + "epoch": 0.5865765347982734, + "grad_norm": 0.27276352047920227, + "learning_rate": 2.1815499950116347e-05, + "loss": 0.1047, + "step": 32887 + }, + { + "epoch": 0.5865943709199871, + "grad_norm": 0.2772451639175415, + "learning_rate": 2.181395612803055e-05, + "loss": 0.1047, + "step": 32888 + }, + { + "epoch": 0.5866122070417008, + "grad_norm": 0.32433974742889404, + "learning_rate": 2.1812412318294965e-05, + "loss": 0.2032, + "step": 32889 + }, + { + "epoch": 0.5866300431634145, + "grad_norm": 0.25707507133483887, + "learning_rate": 2.1810868520915563e-05, + "loss": 0.1136, + "step": 32890 + }, + { + "epoch": 0.5866478792851283, + "grad_norm": 0.3069787323474884, + "learning_rate": 2.1809324735898346e-05, + "loss": 0.1296, + "step": 32891 + }, + { + "epoch": 0.586665715406842, + "grad_norm": 0.30653896927833557, + "learning_rate": 2.1807780963249286e-05, + "loss": 0.1496, + "step": 32892 + }, + { + "epoch": 0.5866835515285557, + "grad_norm": 0.2532758414745331, + "learning_rate": 2.1806237202974365e-05, + "loss": 0.1688, + "step": 32893 + }, + { + "epoch": 0.5867013876502694, + "grad_norm": 0.24667032063007355, + "learning_rate": 2.1804693455079576e-05, + "loss": 0.1263, + "step": 32894 + }, + { + "epoch": 0.586719223771983, + "grad_norm": 0.2821042537689209, + "learning_rate": 2.1803149719570893e-05, + "loss": 0.1768, + "step": 32895 + }, + { + "epoch": 0.5867370598936967, + "grad_norm": 0.22775308787822723, + "learning_rate": 2.1801605996454315e-05, + "loss": 0.1261, + "step": 32896 + }, + { + "epoch": 0.5867548960154104, + "grad_norm": 0.22688975930213928, + "learning_rate": 2.1800062285735815e-05, + "loss": 0.1223, + "step": 32897 + }, + { + "epoch": 0.5867727321371241, + "grad_norm": 0.20398736000061035, + "learning_rate": 2.1798518587421378e-05, + "loss": 0.0926, + "step": 32898 + }, + { + "epoch": 0.5867905682588378, + "grad_norm": 0.29975900053977966, + "learning_rate": 2.179697490151698e-05, + "loss": 0.1343, + "step": 32899 + }, + { + "epoch": 0.5868084043805515, + "grad_norm": 0.20599398016929626, + "learning_rate": 2.1795431228028625e-05, + "loss": 0.1155, + "step": 32900 + }, + { + "epoch": 0.5868262405022652, + "grad_norm": 0.23313100636005402, + "learning_rate": 2.1793887566962275e-05, + "loss": 0.1188, + "step": 32901 + }, + { + "epoch": 0.5868440766239789, + "grad_norm": 0.2681022882461548, + "learning_rate": 2.1792343918323935e-05, + "loss": 0.1137, + "step": 32902 + }, + { + "epoch": 0.5868619127456925, + "grad_norm": 0.25823819637298584, + "learning_rate": 2.1790800282119564e-05, + "loss": 0.1451, + "step": 32903 + }, + { + "epoch": 0.5868797488674062, + "grad_norm": 0.2917575538158417, + "learning_rate": 2.1789256658355174e-05, + "loss": 0.1439, + "step": 32904 + }, + { + "epoch": 0.5868975849891199, + "grad_norm": 0.23296421766281128, + "learning_rate": 2.178771304703673e-05, + "loss": 0.1409, + "step": 32905 + }, + { + "epoch": 0.5869154211108336, + "grad_norm": 0.2006133496761322, + "learning_rate": 2.178616944817022e-05, + "loss": 0.0925, + "step": 32906 + }, + { + "epoch": 0.5869332572325473, + "grad_norm": 0.2315547913312912, + "learning_rate": 2.1784625861761624e-05, + "loss": 0.141, + "step": 32907 + }, + { + "epoch": 0.5869510933542611, + "grad_norm": 0.2253972887992859, + "learning_rate": 2.178308228781693e-05, + "loss": 0.1746, + "step": 32908 + }, + { + "epoch": 0.5869689294759748, + "grad_norm": 0.3042829930782318, + "learning_rate": 2.1781538726342115e-05, + "loss": 0.1722, + "step": 32909 + }, + { + "epoch": 0.5869867655976885, + "grad_norm": 0.2084905058145523, + "learning_rate": 2.1779995177343178e-05, + "loss": 0.0904, + "step": 32910 + }, + { + "epoch": 0.5870046017194022, + "grad_norm": 0.26686906814575195, + "learning_rate": 2.1778451640826087e-05, + "loss": 0.1174, + "step": 32911 + }, + { + "epoch": 0.5870224378411159, + "grad_norm": 0.21772006154060364, + "learning_rate": 2.1776908116796824e-05, + "loss": 0.0924, + "step": 32912 + }, + { + "epoch": 0.5870402739628295, + "grad_norm": 0.2813258767127991, + "learning_rate": 2.1775364605261385e-05, + "loss": 0.1617, + "step": 32913 + }, + { + "epoch": 0.5870581100845432, + "grad_norm": 0.21322877705097198, + "learning_rate": 2.177382110622575e-05, + "loss": 0.1144, + "step": 32914 + }, + { + "epoch": 0.5870759462062569, + "grad_norm": 0.1968001127243042, + "learning_rate": 2.1772277619695893e-05, + "loss": 0.1205, + "step": 32915 + }, + { + "epoch": 0.5870937823279706, + "grad_norm": 0.277910441160202, + "learning_rate": 2.1770734145677796e-05, + "loss": 0.1793, + "step": 32916 + }, + { + "epoch": 0.5871116184496843, + "grad_norm": 0.29438337683677673, + "learning_rate": 2.176919068417746e-05, + "loss": 0.1032, + "step": 32917 + }, + { + "epoch": 0.587129454571398, + "grad_norm": 0.23804901540279388, + "learning_rate": 2.1767647235200856e-05, + "loss": 0.0904, + "step": 32918 + }, + { + "epoch": 0.5871472906931117, + "grad_norm": 0.3984792232513428, + "learning_rate": 2.1766103798753966e-05, + "loss": 0.1457, + "step": 32919 + }, + { + "epoch": 0.5871651268148254, + "grad_norm": 0.5022168159484863, + "learning_rate": 2.1764560374842774e-05, + "loss": 0.2379, + "step": 32920 + }, + { + "epoch": 0.587182962936539, + "grad_norm": 0.2859339416027069, + "learning_rate": 2.176301696347326e-05, + "loss": 0.1428, + "step": 32921 + }, + { + "epoch": 0.5872007990582527, + "grad_norm": 0.25966086983680725, + "learning_rate": 2.1761473564651412e-05, + "loss": 0.0891, + "step": 32922 + }, + { + "epoch": 0.5872186351799664, + "grad_norm": 0.322607159614563, + "learning_rate": 2.1759930178383218e-05, + "loss": 0.1516, + "step": 32923 + }, + { + "epoch": 0.5872364713016801, + "grad_norm": 0.2912963926792145, + "learning_rate": 2.1758386804674652e-05, + "loss": 0.1056, + "step": 32924 + }, + { + "epoch": 0.5872543074233939, + "grad_norm": 0.28418809175491333, + "learning_rate": 2.1756843443531685e-05, + "loss": 0.1334, + "step": 32925 + }, + { + "epoch": 0.5872721435451076, + "grad_norm": 0.2073073387145996, + "learning_rate": 2.1755300094960327e-05, + "loss": 0.088, + "step": 32926 + }, + { + "epoch": 0.5872899796668213, + "grad_norm": 0.2851669490337372, + "learning_rate": 2.1753756758966547e-05, + "loss": 0.1231, + "step": 32927 + }, + { + "epoch": 0.587307815788535, + "grad_norm": 0.22561010718345642, + "learning_rate": 2.175221343555632e-05, + "loss": 0.108, + "step": 32928 + }, + { + "epoch": 0.5873256519102487, + "grad_norm": 0.36879727244377136, + "learning_rate": 2.1750670124735633e-05, + "loss": 0.157, + "step": 32929 + }, + { + "epoch": 0.5873434880319623, + "grad_norm": 0.29005277156829834, + "learning_rate": 2.1749126826510473e-05, + "loss": 0.1254, + "step": 32930 + }, + { + "epoch": 0.587361324153676, + "grad_norm": 0.2758479416370392, + "learning_rate": 2.1747583540886826e-05, + "loss": 0.117, + "step": 32931 + }, + { + "epoch": 0.5873791602753897, + "grad_norm": 0.26588526368141174, + "learning_rate": 2.174604026787067e-05, + "loss": 0.1844, + "step": 32932 + }, + { + "epoch": 0.5873969963971034, + "grad_norm": 0.30634355545043945, + "learning_rate": 2.1744497007467985e-05, + "loss": 0.1326, + "step": 32933 + }, + { + "epoch": 0.5874148325188171, + "grad_norm": 0.33632099628448486, + "learning_rate": 2.174295375968474e-05, + "loss": 0.1459, + "step": 32934 + }, + { + "epoch": 0.5874326686405308, + "grad_norm": 0.23401671648025513, + "learning_rate": 2.1741410524526944e-05, + "loss": 0.1204, + "step": 32935 + }, + { + "epoch": 0.5874505047622445, + "grad_norm": 0.24698470532894135, + "learning_rate": 2.1739867302000563e-05, + "loss": 0.0907, + "step": 32936 + }, + { + "epoch": 0.5874683408839582, + "grad_norm": 0.20571982860565186, + "learning_rate": 2.1738324092111588e-05, + "loss": 0.1365, + "step": 32937 + }, + { + "epoch": 0.5874861770056719, + "grad_norm": 0.24413610994815826, + "learning_rate": 2.1736780894865982e-05, + "loss": 0.1106, + "step": 32938 + }, + { + "epoch": 0.5875040131273855, + "grad_norm": 0.25464290380477905, + "learning_rate": 2.173523771026975e-05, + "loss": 0.1285, + "step": 32939 + }, + { + "epoch": 0.5875218492490992, + "grad_norm": 0.30646461248397827, + "learning_rate": 2.1733694538328868e-05, + "loss": 0.1141, + "step": 32940 + }, + { + "epoch": 0.587539685370813, + "grad_norm": 0.24935881793498993, + "learning_rate": 2.1732151379049312e-05, + "loss": 0.1756, + "step": 32941 + }, + { + "epoch": 0.5875575214925267, + "grad_norm": 0.27392441034317017, + "learning_rate": 2.1730608232437056e-05, + "loss": 0.1338, + "step": 32942 + }, + { + "epoch": 0.5875753576142404, + "grad_norm": 0.2358497679233551, + "learning_rate": 2.1729065098498096e-05, + "loss": 0.1216, + "step": 32943 + }, + { + "epoch": 0.5875931937359541, + "grad_norm": 0.3070487082004547, + "learning_rate": 2.1727521977238417e-05, + "loss": 0.1446, + "step": 32944 + }, + { + "epoch": 0.5876110298576678, + "grad_norm": 0.31453925371170044, + "learning_rate": 2.172597886866399e-05, + "loss": 0.1343, + "step": 32945 + }, + { + "epoch": 0.5876288659793815, + "grad_norm": 0.28357356786727905, + "learning_rate": 2.1724435772780803e-05, + "loss": 0.1348, + "step": 32946 + }, + { + "epoch": 0.5876467021010952, + "grad_norm": 0.303285151720047, + "learning_rate": 2.172289268959482e-05, + "loss": 0.1004, + "step": 32947 + }, + { + "epoch": 0.5876645382228088, + "grad_norm": 0.26704922318458557, + "learning_rate": 2.1721349619112047e-05, + "loss": 0.115, + "step": 32948 + }, + { + "epoch": 0.5876823743445225, + "grad_norm": 0.22708401083946228, + "learning_rate": 2.1719806561338453e-05, + "loss": 0.1355, + "step": 32949 + }, + { + "epoch": 0.5877002104662362, + "grad_norm": 0.3322717547416687, + "learning_rate": 2.1718263516280026e-05, + "loss": 0.1504, + "step": 32950 + }, + { + "epoch": 0.5877180465879499, + "grad_norm": 0.2664613425731659, + "learning_rate": 2.1716720483942733e-05, + "loss": 0.0897, + "step": 32951 + }, + { + "epoch": 0.5877358827096636, + "grad_norm": 0.2527411878108978, + "learning_rate": 2.1715177464332574e-05, + "loss": 0.1288, + "step": 32952 + }, + { + "epoch": 0.5877537188313773, + "grad_norm": 0.22989559173583984, + "learning_rate": 2.1713634457455527e-05, + "loss": 0.0958, + "step": 32953 + }, + { + "epoch": 0.587771554953091, + "grad_norm": 0.298786461353302, + "learning_rate": 2.1712091463317563e-05, + "loss": 0.1087, + "step": 32954 + }, + { + "epoch": 0.5877893910748047, + "grad_norm": 0.2342507690191269, + "learning_rate": 2.171054848192466e-05, + "loss": 0.1508, + "step": 32955 + }, + { + "epoch": 0.5878072271965183, + "grad_norm": 0.2711906135082245, + "learning_rate": 2.1709005513282815e-05, + "loss": 0.1102, + "step": 32956 + }, + { + "epoch": 0.587825063318232, + "grad_norm": 0.2384423017501831, + "learning_rate": 2.1707462557397998e-05, + "loss": 0.189, + "step": 32957 + }, + { + "epoch": 0.5878428994399458, + "grad_norm": 0.33205151557922363, + "learning_rate": 2.1705919614276198e-05, + "loss": 0.1645, + "step": 32958 + }, + { + "epoch": 0.5878607355616595, + "grad_norm": 0.24081368744373322, + "learning_rate": 2.170437668392339e-05, + "loss": 0.122, + "step": 32959 + }, + { + "epoch": 0.5878785716833732, + "grad_norm": 0.3218442499637604, + "learning_rate": 2.170283376634555e-05, + "loss": 0.1172, + "step": 32960 + }, + { + "epoch": 0.5878964078050869, + "grad_norm": 0.377908855676651, + "learning_rate": 2.1701290861548674e-05, + "loss": 0.0869, + "step": 32961 + }, + { + "epoch": 0.5879142439268006, + "grad_norm": 0.22866292297840118, + "learning_rate": 2.169974796953873e-05, + "loss": 0.1229, + "step": 32962 + }, + { + "epoch": 0.5879320800485143, + "grad_norm": 0.21684928238391876, + "learning_rate": 2.1698205090321704e-05, + "loss": 0.1193, + "step": 32963 + }, + { + "epoch": 0.587949916170228, + "grad_norm": 0.27293986082077026, + "learning_rate": 2.169666222390357e-05, + "loss": 0.1601, + "step": 32964 + }, + { + "epoch": 0.5879677522919416, + "grad_norm": 0.2788138687610626, + "learning_rate": 2.1695119370290316e-05, + "loss": 0.1112, + "step": 32965 + }, + { + "epoch": 0.5879855884136553, + "grad_norm": 0.2058860957622528, + "learning_rate": 2.1693576529487925e-05, + "loss": 0.0955, + "step": 32966 + }, + { + "epoch": 0.588003424535369, + "grad_norm": 0.2647230625152588, + "learning_rate": 2.1692033701502372e-05, + "loss": 0.0747, + "step": 32967 + }, + { + "epoch": 0.5880212606570827, + "grad_norm": 0.23579511046409607, + "learning_rate": 2.169049088633963e-05, + "loss": 0.107, + "step": 32968 + }, + { + "epoch": 0.5880390967787964, + "grad_norm": 0.24442631006240845, + "learning_rate": 2.1688948084005702e-05, + "loss": 0.0979, + "step": 32969 + }, + { + "epoch": 0.5880569329005101, + "grad_norm": 0.2332504242658615, + "learning_rate": 2.1687405294506548e-05, + "loss": 0.1505, + "step": 32970 + }, + { + "epoch": 0.5880747690222238, + "grad_norm": 0.2690383791923523, + "learning_rate": 2.168586251784816e-05, + "loss": 0.07, + "step": 32971 + }, + { + "epoch": 0.5880926051439375, + "grad_norm": 0.39139053225517273, + "learning_rate": 2.1684319754036513e-05, + "loss": 0.1601, + "step": 32972 + }, + { + "epoch": 0.5881104412656512, + "grad_norm": 0.23805727064609528, + "learning_rate": 2.1682777003077578e-05, + "loss": 0.0746, + "step": 32973 + }, + { + "epoch": 0.5881282773873648, + "grad_norm": 0.4265058934688568, + "learning_rate": 2.1681234264977356e-05, + "loss": 0.1239, + "step": 32974 + }, + { + "epoch": 0.5881461135090786, + "grad_norm": 0.20426680147647858, + "learning_rate": 2.1679691539741815e-05, + "loss": 0.0997, + "step": 32975 + }, + { + "epoch": 0.5881639496307923, + "grad_norm": 0.21748527884483337, + "learning_rate": 2.167814882737693e-05, + "loss": 0.1442, + "step": 32976 + }, + { + "epoch": 0.588181785752506, + "grad_norm": 0.19584710896015167, + "learning_rate": 2.1676606127888692e-05, + "loss": 0.1391, + "step": 32977 + }, + { + "epoch": 0.5881996218742197, + "grad_norm": 0.209767684340477, + "learning_rate": 2.167506344128307e-05, + "loss": 0.1177, + "step": 32978 + }, + { + "epoch": 0.5882174579959334, + "grad_norm": 0.231951043009758, + "learning_rate": 2.167352076756606e-05, + "loss": 0.1165, + "step": 32979 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.658835768699646, + "learning_rate": 2.1671978106743632e-05, + "loss": 0.1854, + "step": 32980 + }, + { + "epoch": 0.5882531302393608, + "grad_norm": 0.2097320854663849, + "learning_rate": 2.1670435458821756e-05, + "loss": 0.0957, + "step": 32981 + }, + { + "epoch": 0.5882709663610745, + "grad_norm": 0.2828176021575928, + "learning_rate": 2.1668892823806433e-05, + "loss": 0.1088, + "step": 32982 + }, + { + "epoch": 0.5882888024827881, + "grad_norm": 0.2525877058506012, + "learning_rate": 2.166735020170363e-05, + "loss": 0.1019, + "step": 32983 + }, + { + "epoch": 0.5883066386045018, + "grad_norm": 0.26961150765419006, + "learning_rate": 2.1665807592519324e-05, + "loss": 0.1458, + "step": 32984 + }, + { + "epoch": 0.5883244747262155, + "grad_norm": 0.2963247001171112, + "learning_rate": 2.1664264996259505e-05, + "loss": 0.115, + "step": 32985 + }, + { + "epoch": 0.5883423108479292, + "grad_norm": 0.2993464469909668, + "learning_rate": 2.1662722412930136e-05, + "loss": 0.1835, + "step": 32986 + }, + { + "epoch": 0.5883601469696429, + "grad_norm": 0.2097903937101364, + "learning_rate": 2.1661179842537223e-05, + "loss": 0.1332, + "step": 32987 + }, + { + "epoch": 0.5883779830913566, + "grad_norm": 0.2858603894710541, + "learning_rate": 2.1659637285086722e-05, + "loss": 0.1081, + "step": 32988 + }, + { + "epoch": 0.5883958192130703, + "grad_norm": 0.22751972079277039, + "learning_rate": 2.1658094740584628e-05, + "loss": 0.1378, + "step": 32989 + }, + { + "epoch": 0.588413655334784, + "grad_norm": 0.3043662905693054, + "learning_rate": 2.1656552209036897e-05, + "loss": 0.1377, + "step": 32990 + }, + { + "epoch": 0.5884314914564976, + "grad_norm": 0.3018110394477844, + "learning_rate": 2.165500969044954e-05, + "loss": 0.1095, + "step": 32991 + }, + { + "epoch": 0.5884493275782114, + "grad_norm": 0.2185465544462204, + "learning_rate": 2.165346718482851e-05, + "loss": 0.1256, + "step": 32992 + }, + { + "epoch": 0.5884671636999251, + "grad_norm": 0.2335309088230133, + "learning_rate": 2.1651924692179804e-05, + "loss": 0.1088, + "step": 32993 + }, + { + "epoch": 0.5884849998216388, + "grad_norm": 0.412971168756485, + "learning_rate": 2.1650382212509385e-05, + "loss": 0.1455, + "step": 32994 + }, + { + "epoch": 0.5885028359433525, + "grad_norm": 0.25159329175949097, + "learning_rate": 2.1648839745823252e-05, + "loss": 0.1444, + "step": 32995 + }, + { + "epoch": 0.5885206720650662, + "grad_norm": 0.3027758300304413, + "learning_rate": 2.1647297292127373e-05, + "loss": 0.1288, + "step": 32996 + }, + { + "epoch": 0.5885385081867799, + "grad_norm": 0.21371954679489136, + "learning_rate": 2.1645754851427723e-05, + "loss": 0.0805, + "step": 32997 + }, + { + "epoch": 0.5885563443084936, + "grad_norm": 0.40860676765441895, + "learning_rate": 2.164421242373029e-05, + "loss": 0.1316, + "step": 32998 + }, + { + "epoch": 0.5885741804302073, + "grad_norm": 0.31980836391448975, + "learning_rate": 2.164267000904104e-05, + "loss": 0.1778, + "step": 32999 + }, + { + "epoch": 0.588592016551921, + "grad_norm": 0.2727595567703247, + "learning_rate": 2.164112760736597e-05, + "loss": 0.111, + "step": 33000 + }, + { + "epoch": 0.588592016551921, + "eval_loss": 0.1261141002178192, + "eval_runtime": 108.1661, + "eval_samples_per_second": 9.467, + "eval_steps_per_second": 1.581, + "step": 33000 + }, + { + "epoch": 0.5886098526736346, + "grad_norm": 0.19662021100521088, + "learning_rate": 2.1639585218711047e-05, + "loss": 0.1202, + "step": 33001 + }, + { + "epoch": 0.5886276887953483, + "grad_norm": 0.24519965052604675, + "learning_rate": 2.1638042843082257e-05, + "loss": 0.1111, + "step": 33002 + }, + { + "epoch": 0.588645524917062, + "grad_norm": 0.2554810345172882, + "learning_rate": 2.163650048048556e-05, + "loss": 0.1099, + "step": 33003 + }, + { + "epoch": 0.5886633610387757, + "grad_norm": 0.2318880259990692, + "learning_rate": 2.163495813092696e-05, + "loss": 0.1224, + "step": 33004 + }, + { + "epoch": 0.5886811971604894, + "grad_norm": 0.21858911216259003, + "learning_rate": 2.163341579441242e-05, + "loss": 0.1571, + "step": 33005 + }, + { + "epoch": 0.5886990332822031, + "grad_norm": 0.3522951602935791, + "learning_rate": 2.1631873470947928e-05, + "loss": 0.1202, + "step": 33006 + }, + { + "epoch": 0.5887168694039168, + "grad_norm": 0.22781698405742645, + "learning_rate": 2.1630331160539445e-05, + "loss": 0.1457, + "step": 33007 + }, + { + "epoch": 0.5887347055256305, + "grad_norm": 0.23906351625919342, + "learning_rate": 2.1628788863192976e-05, + "loss": 0.0934, + "step": 33008 + }, + { + "epoch": 0.5887525416473443, + "grad_norm": 0.2531311810016632, + "learning_rate": 2.162724657891449e-05, + "loss": 0.1355, + "step": 33009 + }, + { + "epoch": 0.5887703777690579, + "grad_norm": 0.2691013813018799, + "learning_rate": 2.1625704307709952e-05, + "loss": 0.1535, + "step": 33010 + }, + { + "epoch": 0.5887882138907716, + "grad_norm": 0.17697560787200928, + "learning_rate": 2.162416204958535e-05, + "loss": 0.0761, + "step": 33011 + }, + { + "epoch": 0.5888060500124853, + "grad_norm": 0.25484609603881836, + "learning_rate": 2.1622619804546663e-05, + "loss": 0.1137, + "step": 33012 + }, + { + "epoch": 0.588823886134199, + "grad_norm": 0.3665103018283844, + "learning_rate": 2.1621077572599862e-05, + "loss": 0.1083, + "step": 33013 + }, + { + "epoch": 0.5888417222559127, + "grad_norm": 0.37395647168159485, + "learning_rate": 2.161953535375094e-05, + "loss": 0.1766, + "step": 33014 + }, + { + "epoch": 0.5888595583776264, + "grad_norm": 0.1853387951850891, + "learning_rate": 2.1617993148005868e-05, + "loss": 0.1093, + "step": 33015 + }, + { + "epoch": 0.5888773944993401, + "grad_norm": 0.27346721291542053, + "learning_rate": 2.161645095537061e-05, + "loss": 0.1097, + "step": 33016 + }, + { + "epoch": 0.5888952306210538, + "grad_norm": 0.27654165029525757, + "learning_rate": 2.161490877585117e-05, + "loss": 0.1263, + "step": 33017 + }, + { + "epoch": 0.5889130667427674, + "grad_norm": 0.3392234742641449, + "learning_rate": 2.161336660945351e-05, + "loss": 0.1515, + "step": 33018 + }, + { + "epoch": 0.5889309028644811, + "grad_norm": 0.22535693645477295, + "learning_rate": 2.1611824456183608e-05, + "loss": 0.1225, + "step": 33019 + }, + { + "epoch": 0.5889487389861948, + "grad_norm": 0.2584347426891327, + "learning_rate": 2.1610282316047437e-05, + "loss": 0.1421, + "step": 33020 + }, + { + "epoch": 0.5889665751079085, + "grad_norm": 0.26880693435668945, + "learning_rate": 2.1608740189050993e-05, + "loss": 0.1311, + "step": 33021 + }, + { + "epoch": 0.5889844112296222, + "grad_norm": 0.18865562975406647, + "learning_rate": 2.1607198075200246e-05, + "loss": 0.0603, + "step": 33022 + }, + { + "epoch": 0.5890022473513359, + "grad_norm": 0.2707449495792389, + "learning_rate": 2.160565597450117e-05, + "loss": 0.1246, + "step": 33023 + }, + { + "epoch": 0.5890200834730496, + "grad_norm": 0.3323516547679901, + "learning_rate": 2.1604113886959737e-05, + "loss": 0.1399, + "step": 33024 + }, + { + "epoch": 0.5890379195947633, + "grad_norm": 0.2800370156764984, + "learning_rate": 2.160257181258193e-05, + "loss": 0.1318, + "step": 33025 + }, + { + "epoch": 0.5890557557164771, + "grad_norm": 0.30118528008461, + "learning_rate": 2.1601029751373733e-05, + "loss": 0.1684, + "step": 33026 + }, + { + "epoch": 0.5890735918381907, + "grad_norm": 0.23637090623378754, + "learning_rate": 2.159948770334112e-05, + "loss": 0.1354, + "step": 33027 + }, + { + "epoch": 0.5890914279599044, + "grad_norm": 0.27454230189323425, + "learning_rate": 2.159794566849007e-05, + "loss": 0.1039, + "step": 33028 + }, + { + "epoch": 0.5891092640816181, + "grad_norm": 0.33771610260009766, + "learning_rate": 2.1596403646826543e-05, + "loss": 0.1776, + "step": 33029 + }, + { + "epoch": 0.5891271002033318, + "grad_norm": 0.28984037041664124, + "learning_rate": 2.1594861638356544e-05, + "loss": 0.1603, + "step": 33030 + }, + { + "epoch": 0.5891449363250455, + "grad_norm": 0.2714102864265442, + "learning_rate": 2.1593319643086042e-05, + "loss": 0.1764, + "step": 33031 + }, + { + "epoch": 0.5891627724467592, + "grad_norm": 0.24412472546100616, + "learning_rate": 2.1591777661020998e-05, + "loss": 0.0756, + "step": 33032 + }, + { + "epoch": 0.5891806085684729, + "grad_norm": 0.3281966745853424, + "learning_rate": 2.15902356921674e-05, + "loss": 0.1142, + "step": 33033 + }, + { + "epoch": 0.5891984446901866, + "grad_norm": 0.32212576270103455, + "learning_rate": 2.158869373653123e-05, + "loss": 0.1457, + "step": 33034 + }, + { + "epoch": 0.5892162808119003, + "grad_norm": 0.2897561490535736, + "learning_rate": 2.1587151794118465e-05, + "loss": 0.1751, + "step": 33035 + }, + { + "epoch": 0.5892341169336139, + "grad_norm": 0.24207904934883118, + "learning_rate": 2.158560986493508e-05, + "loss": 0.1885, + "step": 33036 + }, + { + "epoch": 0.5892519530553276, + "grad_norm": 0.2025081217288971, + "learning_rate": 2.158406794898705e-05, + "loss": 0.1137, + "step": 33037 + }, + { + "epoch": 0.5892697891770413, + "grad_norm": 0.24720114469528198, + "learning_rate": 2.1582526046280342e-05, + "loss": 0.0953, + "step": 33038 + }, + { + "epoch": 0.589287625298755, + "grad_norm": 0.255377858877182, + "learning_rate": 2.1580984156820952e-05, + "loss": 0.0721, + "step": 33039 + }, + { + "epoch": 0.5893054614204687, + "grad_norm": 0.24377861618995667, + "learning_rate": 2.1579442280614846e-05, + "loss": 0.1438, + "step": 33040 + }, + { + "epoch": 0.5893232975421824, + "grad_norm": 0.2838647663593292, + "learning_rate": 2.1577900417668008e-05, + "loss": 0.0886, + "step": 33041 + }, + { + "epoch": 0.5893411336638962, + "grad_norm": 0.23067690432071686, + "learning_rate": 2.1576358567986397e-05, + "loss": 0.1315, + "step": 33042 + }, + { + "epoch": 0.5893589697856099, + "grad_norm": 0.24995240569114685, + "learning_rate": 2.1574816731576013e-05, + "loss": 0.0782, + "step": 33043 + }, + { + "epoch": 0.5893768059073236, + "grad_norm": 0.2691757380962372, + "learning_rate": 2.1573274908442827e-05, + "loss": 0.164, + "step": 33044 + }, + { + "epoch": 0.5893946420290372, + "grad_norm": 0.29734519124031067, + "learning_rate": 2.1571733098592805e-05, + "loss": 0.1863, + "step": 33045 + }, + { + "epoch": 0.5894124781507509, + "grad_norm": 0.24343740940093994, + "learning_rate": 2.1570191302031923e-05, + "loss": 0.1301, + "step": 33046 + }, + { + "epoch": 0.5894303142724646, + "grad_norm": 0.2541879415512085, + "learning_rate": 2.1568649518766165e-05, + "loss": 0.1749, + "step": 33047 + }, + { + "epoch": 0.5894481503941783, + "grad_norm": 0.27241525053977966, + "learning_rate": 2.1567107748801518e-05, + "loss": 0.1709, + "step": 33048 + }, + { + "epoch": 0.589465986515892, + "grad_norm": 0.3554043471813202, + "learning_rate": 2.1565565992143945e-05, + "loss": 0.1084, + "step": 33049 + }, + { + "epoch": 0.5894838226376057, + "grad_norm": 0.25861111283302307, + "learning_rate": 2.1564024248799422e-05, + "loss": 0.128, + "step": 33050 + }, + { + "epoch": 0.5895016587593194, + "grad_norm": 0.30657315254211426, + "learning_rate": 2.1562482518773917e-05, + "loss": 0.2006, + "step": 33051 + }, + { + "epoch": 0.5895194948810331, + "grad_norm": 0.31410300731658936, + "learning_rate": 2.1560940802073433e-05, + "loss": 0.1223, + "step": 33052 + }, + { + "epoch": 0.5895373310027467, + "grad_norm": 0.21022747457027435, + "learning_rate": 2.155939909870392e-05, + "loss": 0.1048, + "step": 33053 + }, + { + "epoch": 0.5895551671244604, + "grad_norm": 0.23118728399276733, + "learning_rate": 2.155785740867137e-05, + "loss": 0.1332, + "step": 33054 + }, + { + "epoch": 0.5895730032461741, + "grad_norm": 0.20418299734592438, + "learning_rate": 2.1556315731981743e-05, + "loss": 0.1176, + "step": 33055 + }, + { + "epoch": 0.5895908393678878, + "grad_norm": 0.24226684868335724, + "learning_rate": 2.1554774068641037e-05, + "loss": 0.1598, + "step": 33056 + }, + { + "epoch": 0.5896086754896015, + "grad_norm": 0.21234562993049622, + "learning_rate": 2.155323241865522e-05, + "loss": 0.0776, + "step": 33057 + }, + { + "epoch": 0.5896265116113152, + "grad_norm": 0.2150479108095169, + "learning_rate": 2.1551690782030258e-05, + "loss": 0.1232, + "step": 33058 + }, + { + "epoch": 0.589644347733029, + "grad_norm": 0.2894500195980072, + "learning_rate": 2.1550149158772128e-05, + "loss": 0.1188, + "step": 33059 + }, + { + "epoch": 0.5896621838547427, + "grad_norm": 0.24211075901985168, + "learning_rate": 2.154860754888682e-05, + "loss": 0.1247, + "step": 33060 + }, + { + "epoch": 0.5896800199764564, + "grad_norm": 0.26943182945251465, + "learning_rate": 2.1547065952380295e-05, + "loss": 0.1398, + "step": 33061 + }, + { + "epoch": 0.58969785609817, + "grad_norm": 0.2406005561351776, + "learning_rate": 2.1545524369258543e-05, + "loss": 0.1333, + "step": 33062 + }, + { + "epoch": 0.5897156922198837, + "grad_norm": 0.30333927273750305, + "learning_rate": 2.154398279952753e-05, + "loss": 0.1911, + "step": 33063 + }, + { + "epoch": 0.5897335283415974, + "grad_norm": 0.198168084025383, + "learning_rate": 2.1542441243193223e-05, + "loss": 0.0999, + "step": 33064 + }, + { + "epoch": 0.5897513644633111, + "grad_norm": 0.42712706327438354, + "learning_rate": 2.1540899700261622e-05, + "loss": 0.129, + "step": 33065 + }, + { + "epoch": 0.5897692005850248, + "grad_norm": 0.26657959818840027, + "learning_rate": 2.1539358170738682e-05, + "loss": 0.1049, + "step": 33066 + }, + { + "epoch": 0.5897870367067385, + "grad_norm": 0.2324095219373703, + "learning_rate": 2.1537816654630385e-05, + "loss": 0.123, + "step": 33067 + }, + { + "epoch": 0.5898048728284522, + "grad_norm": 0.22534257173538208, + "learning_rate": 2.1536275151942702e-05, + "loss": 0.1097, + "step": 33068 + }, + { + "epoch": 0.5898227089501659, + "grad_norm": 0.28968656063079834, + "learning_rate": 2.1534733662681616e-05, + "loss": 0.1912, + "step": 33069 + }, + { + "epoch": 0.5898405450718796, + "grad_norm": 0.38640910387039185, + "learning_rate": 2.1533192186853104e-05, + "loss": 0.0936, + "step": 33070 + }, + { + "epoch": 0.5898583811935932, + "grad_norm": 0.2354934960603714, + "learning_rate": 2.153165072446314e-05, + "loss": 0.1364, + "step": 33071 + }, + { + "epoch": 0.5898762173153069, + "grad_norm": 0.298694372177124, + "learning_rate": 2.153010927551768e-05, + "loss": 0.1579, + "step": 33072 + }, + { + "epoch": 0.5898940534370206, + "grad_norm": 0.27349498867988586, + "learning_rate": 2.1528567840022722e-05, + "loss": 0.1645, + "step": 33073 + }, + { + "epoch": 0.5899118895587343, + "grad_norm": 0.2971816658973694, + "learning_rate": 2.1527026417984237e-05, + "loss": 0.1145, + "step": 33074 + }, + { + "epoch": 0.589929725680448, + "grad_norm": 0.22058752179145813, + "learning_rate": 2.1525485009408196e-05, + "loss": 0.1099, + "step": 33075 + }, + { + "epoch": 0.5899475618021618, + "grad_norm": 0.26899510622024536, + "learning_rate": 2.152394361430058e-05, + "loss": 0.1407, + "step": 33076 + }, + { + "epoch": 0.5899653979238755, + "grad_norm": 0.2096758633852005, + "learning_rate": 2.1522402232667345e-05, + "loss": 0.1305, + "step": 33077 + }, + { + "epoch": 0.5899832340455892, + "grad_norm": 0.2950533926486969, + "learning_rate": 2.1520860864514494e-05, + "loss": 0.1063, + "step": 33078 + }, + { + "epoch": 0.5900010701673029, + "grad_norm": 0.19849883019924164, + "learning_rate": 2.1519319509847984e-05, + "loss": 0.0943, + "step": 33079 + }, + { + "epoch": 0.5900189062890165, + "grad_norm": 0.34837716817855835, + "learning_rate": 2.1517778168673793e-05, + "loss": 0.171, + "step": 33080 + }, + { + "epoch": 0.5900367424107302, + "grad_norm": 0.2350277453660965, + "learning_rate": 2.1516236840997893e-05, + "loss": 0.1433, + "step": 33081 + }, + { + "epoch": 0.5900545785324439, + "grad_norm": 0.25818994641304016, + "learning_rate": 2.1514695526826265e-05, + "loss": 0.128, + "step": 33082 + }, + { + "epoch": 0.5900724146541576, + "grad_norm": 0.24907296895980835, + "learning_rate": 2.151315422616488e-05, + "loss": 0.1362, + "step": 33083 + }, + { + "epoch": 0.5900902507758713, + "grad_norm": 0.24716128408908844, + "learning_rate": 2.1511612939019724e-05, + "loss": 0.1009, + "step": 33084 + }, + { + "epoch": 0.590108086897585, + "grad_norm": 0.28970226645469666, + "learning_rate": 2.151007166539674e-05, + "loss": 0.1638, + "step": 33085 + }, + { + "epoch": 0.5901259230192987, + "grad_norm": 0.21572163701057434, + "learning_rate": 2.150853040530194e-05, + "loss": 0.0895, + "step": 33086 + }, + { + "epoch": 0.5901437591410124, + "grad_norm": 0.22393271327018738, + "learning_rate": 2.1506989158741285e-05, + "loss": 0.1068, + "step": 33087 + }, + { + "epoch": 0.590161595262726, + "grad_norm": 0.23315875232219696, + "learning_rate": 2.1505447925720734e-05, + "loss": 0.1095, + "step": 33088 + }, + { + "epoch": 0.5901794313844397, + "grad_norm": 0.28150245547294617, + "learning_rate": 2.1503906706246285e-05, + "loss": 0.1484, + "step": 33089 + }, + { + "epoch": 0.5901972675061534, + "grad_norm": 0.1934724897146225, + "learning_rate": 2.1502365500323887e-05, + "loss": 0.1068, + "step": 33090 + }, + { + "epoch": 0.5902151036278671, + "grad_norm": 0.22217927873134613, + "learning_rate": 2.1500824307959538e-05, + "loss": 0.1447, + "step": 33091 + }, + { + "epoch": 0.5902329397495808, + "grad_norm": 0.23097416758537292, + "learning_rate": 2.1499283129159205e-05, + "loss": 0.123, + "step": 33092 + }, + { + "epoch": 0.5902507758712946, + "grad_norm": 0.2799910604953766, + "learning_rate": 2.1497741963928862e-05, + "loss": 0.1272, + "step": 33093 + }, + { + "epoch": 0.5902686119930083, + "grad_norm": 0.22545427083969116, + "learning_rate": 2.1496200812274465e-05, + "loss": 0.1405, + "step": 33094 + }, + { + "epoch": 0.590286448114722, + "grad_norm": 0.2801673114299774, + "learning_rate": 2.1494659674202018e-05, + "loss": 0.1398, + "step": 33095 + }, + { + "epoch": 0.5903042842364357, + "grad_norm": 0.18933075666427612, + "learning_rate": 2.1493118549717473e-05, + "loss": 0.1188, + "step": 33096 + }, + { + "epoch": 0.5903221203581493, + "grad_norm": 0.23201805353164673, + "learning_rate": 2.1491577438826818e-05, + "loss": 0.1298, + "step": 33097 + }, + { + "epoch": 0.590339956479863, + "grad_norm": 0.2574159502983093, + "learning_rate": 2.149003634153601e-05, + "loss": 0.1398, + "step": 33098 + }, + { + "epoch": 0.5903577926015767, + "grad_norm": 0.28991690278053284, + "learning_rate": 2.1488495257851045e-05, + "loss": 0.1524, + "step": 33099 + }, + { + "epoch": 0.5903756287232904, + "grad_norm": 0.22774146497249603, + "learning_rate": 2.1486954187777887e-05, + "loss": 0.1063, + "step": 33100 + }, + { + "epoch": 0.5903934648450041, + "grad_norm": 0.30327367782592773, + "learning_rate": 2.1485413131322496e-05, + "loss": 0.1603, + "step": 33101 + }, + { + "epoch": 0.5904113009667178, + "grad_norm": 0.311737984418869, + "learning_rate": 2.148387208849087e-05, + "loss": 0.1408, + "step": 33102 + }, + { + "epoch": 0.5904291370884315, + "grad_norm": 0.29760342836380005, + "learning_rate": 2.1482331059288953e-05, + "loss": 0.1073, + "step": 33103 + }, + { + "epoch": 0.5904469732101452, + "grad_norm": 0.41577932238578796, + "learning_rate": 2.1480790043722752e-05, + "loss": 0.135, + "step": 33104 + }, + { + "epoch": 0.5904648093318589, + "grad_norm": 0.2204284369945526, + "learning_rate": 2.1479249041798223e-05, + "loss": 0.108, + "step": 33105 + }, + { + "epoch": 0.5904826454535725, + "grad_norm": 0.44028276205062866, + "learning_rate": 2.147770805352134e-05, + "loss": 0.1592, + "step": 33106 + }, + { + "epoch": 0.5905004815752862, + "grad_norm": 0.2741379737854004, + "learning_rate": 2.1476167078898066e-05, + "loss": 0.124, + "step": 33107 + }, + { + "epoch": 0.5905183176969999, + "grad_norm": 0.3365446925163269, + "learning_rate": 2.1474626117934397e-05, + "loss": 0.0945, + "step": 33108 + }, + { + "epoch": 0.5905361538187136, + "grad_norm": 0.24904195964336395, + "learning_rate": 2.147308517063629e-05, + "loss": 0.1379, + "step": 33109 + }, + { + "epoch": 0.5905539899404274, + "grad_norm": 0.26429814100265503, + "learning_rate": 2.147154423700973e-05, + "loss": 0.1086, + "step": 33110 + }, + { + "epoch": 0.5905718260621411, + "grad_norm": 0.26478275656700134, + "learning_rate": 2.1470003317060666e-05, + "loss": 0.1482, + "step": 33111 + }, + { + "epoch": 0.5905896621838548, + "grad_norm": 0.18958696722984314, + "learning_rate": 2.1468462410795105e-05, + "loss": 0.0939, + "step": 33112 + }, + { + "epoch": 0.5906074983055685, + "grad_norm": 0.2733018398284912, + "learning_rate": 2.1466921518219e-05, + "loss": 0.1197, + "step": 33113 + }, + { + "epoch": 0.5906253344272822, + "grad_norm": 0.21467597782611847, + "learning_rate": 2.146538063933833e-05, + "loss": 0.0808, + "step": 33114 + }, + { + "epoch": 0.5906431705489958, + "grad_norm": 0.19865699112415314, + "learning_rate": 2.146383977415906e-05, + "loss": 0.1393, + "step": 33115 + }, + { + "epoch": 0.5906610066707095, + "grad_norm": 0.24648387730121613, + "learning_rate": 2.1462298922687166e-05, + "loss": 0.0997, + "step": 33116 + }, + { + "epoch": 0.5906788427924232, + "grad_norm": 0.24080605804920197, + "learning_rate": 2.1460758084928624e-05, + "loss": 0.2018, + "step": 33117 + }, + { + "epoch": 0.5906966789141369, + "grad_norm": 0.18410654366016388, + "learning_rate": 2.1459217260889413e-05, + "loss": 0.1034, + "step": 33118 + }, + { + "epoch": 0.5907145150358506, + "grad_norm": 0.242695152759552, + "learning_rate": 2.1457676450575497e-05, + "loss": 0.1454, + "step": 33119 + }, + { + "epoch": 0.5907323511575643, + "grad_norm": 0.24289849400520325, + "learning_rate": 2.1456135653992843e-05, + "loss": 0.1683, + "step": 33120 + }, + { + "epoch": 0.590750187279278, + "grad_norm": 0.23219898343086243, + "learning_rate": 2.1454594871147436e-05, + "loss": 0.1256, + "step": 33121 + }, + { + "epoch": 0.5907680234009917, + "grad_norm": 0.202924907207489, + "learning_rate": 2.145305410204525e-05, + "loss": 0.0917, + "step": 33122 + }, + { + "epoch": 0.5907858595227053, + "grad_norm": 0.21039503812789917, + "learning_rate": 2.1451513346692244e-05, + "loss": 0.1268, + "step": 33123 + }, + { + "epoch": 0.590803695644419, + "grad_norm": 0.2628650367259979, + "learning_rate": 2.1449972605094394e-05, + "loss": 0.0922, + "step": 33124 + }, + { + "epoch": 0.5908215317661327, + "grad_norm": 0.3127400279045105, + "learning_rate": 2.1448431877257686e-05, + "loss": 0.1875, + "step": 33125 + }, + { + "epoch": 0.5908393678878464, + "grad_norm": 0.25862622261047363, + "learning_rate": 2.144689116318808e-05, + "loss": 0.1941, + "step": 33126 + }, + { + "epoch": 0.5908572040095602, + "grad_norm": 0.2257784754037857, + "learning_rate": 2.1445350462891552e-05, + "loss": 0.1398, + "step": 33127 + }, + { + "epoch": 0.5908750401312739, + "grad_norm": 0.2474130243062973, + "learning_rate": 2.144380977637407e-05, + "loss": 0.1426, + "step": 33128 + }, + { + "epoch": 0.5908928762529876, + "grad_norm": 0.27754154801368713, + "learning_rate": 2.144226910364161e-05, + "loss": 0.1452, + "step": 33129 + }, + { + "epoch": 0.5909107123747013, + "grad_norm": 0.36767107248306274, + "learning_rate": 2.1440728444700144e-05, + "loss": 0.1816, + "step": 33130 + }, + { + "epoch": 0.590928548496415, + "grad_norm": 0.2943879961967468, + "learning_rate": 2.1439187799555643e-05, + "loss": 0.1192, + "step": 33131 + }, + { + "epoch": 0.5909463846181287, + "grad_norm": 0.19691762328147888, + "learning_rate": 2.1437647168214087e-05, + "loss": 0.1088, + "step": 33132 + }, + { + "epoch": 0.5909642207398423, + "grad_norm": 0.20330274105072021, + "learning_rate": 2.1436106550681424e-05, + "loss": 0.1288, + "step": 33133 + }, + { + "epoch": 0.590982056861556, + "grad_norm": 0.2210235446691513, + "learning_rate": 2.1434565946963662e-05, + "loss": 0.1704, + "step": 33134 + }, + { + "epoch": 0.5909998929832697, + "grad_norm": 0.259302020072937, + "learning_rate": 2.1433025357066747e-05, + "loss": 0.1239, + "step": 33135 + }, + { + "epoch": 0.5910177291049834, + "grad_norm": 0.2674104571342468, + "learning_rate": 2.1431484780996655e-05, + "loss": 0.1035, + "step": 33136 + }, + { + "epoch": 0.5910355652266971, + "grad_norm": 0.2931918799877167, + "learning_rate": 2.142994421875936e-05, + "loss": 0.1052, + "step": 33137 + }, + { + "epoch": 0.5910534013484108, + "grad_norm": 0.24653010070323944, + "learning_rate": 2.1428403670360836e-05, + "loss": 0.1111, + "step": 33138 + }, + { + "epoch": 0.5910712374701245, + "grad_norm": 0.2379283457994461, + "learning_rate": 2.1426863135807058e-05, + "loss": 0.1412, + "step": 33139 + }, + { + "epoch": 0.5910890735918382, + "grad_norm": 0.47157180309295654, + "learning_rate": 2.1425322615103986e-05, + "loss": 0.1638, + "step": 33140 + }, + { + "epoch": 0.5911069097135518, + "grad_norm": 0.23331566154956818, + "learning_rate": 2.1423782108257608e-05, + "loss": 0.131, + "step": 33141 + }, + { + "epoch": 0.5911247458352655, + "grad_norm": 0.3077402114868164, + "learning_rate": 2.142224161527387e-05, + "loss": 0.1736, + "step": 33142 + }, + { + "epoch": 0.5911425819569793, + "grad_norm": 0.27864784002304077, + "learning_rate": 2.142070113615877e-05, + "loss": 0.1374, + "step": 33143 + }, + { + "epoch": 0.591160418078693, + "grad_norm": 0.2880363464355469, + "learning_rate": 2.1419160670918264e-05, + "loss": 0.1374, + "step": 33144 + }, + { + "epoch": 0.5911782542004067, + "grad_norm": 0.32880255579948425, + "learning_rate": 2.141762021955833e-05, + "loss": 0.1356, + "step": 33145 + }, + { + "epoch": 0.5911960903221204, + "grad_norm": 0.19545045495033264, + "learning_rate": 2.1416079782084933e-05, + "loss": 0.111, + "step": 33146 + }, + { + "epoch": 0.5912139264438341, + "grad_norm": 0.29984405636787415, + "learning_rate": 2.1414539358504055e-05, + "loss": 0.1429, + "step": 33147 + }, + { + "epoch": 0.5912317625655478, + "grad_norm": 0.23452989757061005, + "learning_rate": 2.1412998948821663e-05, + "loss": 0.0702, + "step": 33148 + }, + { + "epoch": 0.5912495986872615, + "grad_norm": 0.23591265082359314, + "learning_rate": 2.1411458553043727e-05, + "loss": 0.0892, + "step": 33149 + }, + { + "epoch": 0.5912674348089751, + "grad_norm": 0.24769672751426697, + "learning_rate": 2.1409918171176203e-05, + "loss": 0.1454, + "step": 33150 + }, + { + "epoch": 0.5912852709306888, + "grad_norm": 0.27837643027305603, + "learning_rate": 2.140837780322508e-05, + "loss": 0.1206, + "step": 33151 + }, + { + "epoch": 0.5913031070524025, + "grad_norm": 0.2272465080022812, + "learning_rate": 2.1406837449196335e-05, + "loss": 0.1474, + "step": 33152 + }, + { + "epoch": 0.5913209431741162, + "grad_norm": 0.1675087958574295, + "learning_rate": 2.1405297109095928e-05, + "loss": 0.0863, + "step": 33153 + }, + { + "epoch": 0.5913387792958299, + "grad_norm": 0.27321112155914307, + "learning_rate": 2.140375678292983e-05, + "loss": 0.1465, + "step": 33154 + }, + { + "epoch": 0.5913566154175436, + "grad_norm": 0.23717403411865234, + "learning_rate": 2.1402216470704002e-05, + "loss": 0.1119, + "step": 33155 + }, + { + "epoch": 0.5913744515392573, + "grad_norm": 0.2761916518211365, + "learning_rate": 2.140067617242444e-05, + "loss": 0.1021, + "step": 33156 + }, + { + "epoch": 0.591392287660971, + "grad_norm": 0.362943559885025, + "learning_rate": 2.139913588809709e-05, + "loss": 0.1478, + "step": 33157 + }, + { + "epoch": 0.5914101237826846, + "grad_norm": 0.20055672526359558, + "learning_rate": 2.1397595617727938e-05, + "loss": 0.1414, + "step": 33158 + }, + { + "epoch": 0.5914279599043983, + "grad_norm": 0.3138004541397095, + "learning_rate": 2.139605536132294e-05, + "loss": 0.1577, + "step": 33159 + }, + { + "epoch": 0.5914457960261121, + "grad_norm": 0.1988336145877838, + "learning_rate": 2.139451511888809e-05, + "loss": 0.1207, + "step": 33160 + }, + { + "epoch": 0.5914636321478258, + "grad_norm": 0.18993769586086273, + "learning_rate": 2.1392974890429344e-05, + "loss": 0.0875, + "step": 33161 + }, + { + "epoch": 0.5914814682695395, + "grad_norm": 0.2395271509885788, + "learning_rate": 2.139143467595267e-05, + "loss": 0.1175, + "step": 33162 + }, + { + "epoch": 0.5914993043912532, + "grad_norm": 0.32086557149887085, + "learning_rate": 2.1389894475464036e-05, + "loss": 0.1572, + "step": 33163 + }, + { + "epoch": 0.5915171405129669, + "grad_norm": 0.22084881365299225, + "learning_rate": 2.1388354288969424e-05, + "loss": 0.0874, + "step": 33164 + }, + { + "epoch": 0.5915349766346806, + "grad_norm": 0.2549118399620056, + "learning_rate": 2.1386814116474794e-05, + "loss": 0.1229, + "step": 33165 + }, + { + "epoch": 0.5915528127563943, + "grad_norm": 0.33142685890197754, + "learning_rate": 2.1385273957986125e-05, + "loss": 0.1197, + "step": 33166 + }, + { + "epoch": 0.591570648878108, + "grad_norm": 0.26726052165031433, + "learning_rate": 2.1383733813509382e-05, + "loss": 0.0824, + "step": 33167 + }, + { + "epoch": 0.5915884849998216, + "grad_norm": 0.2754552364349365, + "learning_rate": 2.1382193683050527e-05, + "loss": 0.1529, + "step": 33168 + }, + { + "epoch": 0.5916063211215353, + "grad_norm": 0.3096371293067932, + "learning_rate": 2.138065356661555e-05, + "loss": 0.1211, + "step": 33169 + }, + { + "epoch": 0.591624157243249, + "grad_norm": 0.2355424165725708, + "learning_rate": 2.137911346421041e-05, + "loss": 0.1711, + "step": 33170 + }, + { + "epoch": 0.5916419933649627, + "grad_norm": 0.22811809182167053, + "learning_rate": 2.137757337584107e-05, + "loss": 0.097, + "step": 33171 + }, + { + "epoch": 0.5916598294866764, + "grad_norm": 0.23363593220710754, + "learning_rate": 2.1376033301513504e-05, + "loss": 0.0937, + "step": 33172 + }, + { + "epoch": 0.5916776656083901, + "grad_norm": 0.21910591423511505, + "learning_rate": 2.137449324123369e-05, + "loss": 0.1081, + "step": 33173 + }, + { + "epoch": 0.5916955017301038, + "grad_norm": 0.25870445370674133, + "learning_rate": 2.1372953195007596e-05, + "loss": 0.1207, + "step": 33174 + }, + { + "epoch": 0.5917133378518175, + "grad_norm": 0.27380484342575073, + "learning_rate": 2.1371413162841185e-05, + "loss": 0.1311, + "step": 33175 + }, + { + "epoch": 0.5917311739735311, + "grad_norm": 0.1943562626838684, + "learning_rate": 2.1369873144740424e-05, + "loss": 0.0986, + "step": 33176 + }, + { + "epoch": 0.5917490100952449, + "grad_norm": 0.3256903290748596, + "learning_rate": 2.1368333140711295e-05, + "loss": 0.1545, + "step": 33177 + }, + { + "epoch": 0.5917668462169586, + "grad_norm": 0.21299290657043457, + "learning_rate": 2.136679315075976e-05, + "loss": 0.1013, + "step": 33178 + }, + { + "epoch": 0.5917846823386723, + "grad_norm": 0.32878273725509644, + "learning_rate": 2.136525317489179e-05, + "loss": 0.1196, + "step": 33179 + }, + { + "epoch": 0.591802518460386, + "grad_norm": 0.3366239666938782, + "learning_rate": 2.136371321311336e-05, + "loss": 0.122, + "step": 33180 + }, + { + "epoch": 0.5918203545820997, + "grad_norm": 0.3108285963535309, + "learning_rate": 2.1362173265430418e-05, + "loss": 0.106, + "step": 33181 + }, + { + "epoch": 0.5918381907038134, + "grad_norm": 0.35554245114326477, + "learning_rate": 2.1360633331848962e-05, + "loss": 0.2326, + "step": 33182 + }, + { + "epoch": 0.5918560268255271, + "grad_norm": 0.23072634637355804, + "learning_rate": 2.135909341237495e-05, + "loss": 0.1185, + "step": 33183 + }, + { + "epoch": 0.5918738629472408, + "grad_norm": 0.3261723518371582, + "learning_rate": 2.1357553507014337e-05, + "loss": 0.158, + "step": 33184 + }, + { + "epoch": 0.5918916990689544, + "grad_norm": 0.26431435346603394, + "learning_rate": 2.135601361577311e-05, + "loss": 0.1418, + "step": 33185 + }, + { + "epoch": 0.5919095351906681, + "grad_norm": 0.2369711548089981, + "learning_rate": 2.1354473738657233e-05, + "loss": 0.1012, + "step": 33186 + }, + { + "epoch": 0.5919273713123818, + "grad_norm": 0.3591991364955902, + "learning_rate": 2.135293387567268e-05, + "loss": 0.1384, + "step": 33187 + }, + { + "epoch": 0.5919452074340955, + "grad_norm": 0.278198778629303, + "learning_rate": 2.1351394026825415e-05, + "loss": 0.1316, + "step": 33188 + }, + { + "epoch": 0.5919630435558092, + "grad_norm": 0.2619699239730835, + "learning_rate": 2.1349854192121392e-05, + "loss": 0.1274, + "step": 33189 + }, + { + "epoch": 0.5919808796775229, + "grad_norm": 0.35606810450553894, + "learning_rate": 2.134831437156661e-05, + "loss": 0.1375, + "step": 33190 + }, + { + "epoch": 0.5919987157992366, + "grad_norm": 0.25631991028785706, + "learning_rate": 2.1346774565167022e-05, + "loss": 0.1843, + "step": 33191 + }, + { + "epoch": 0.5920165519209503, + "grad_norm": 0.21016977727413177, + "learning_rate": 2.134523477292859e-05, + "loss": 0.1186, + "step": 33192 + }, + { + "epoch": 0.592034388042664, + "grad_norm": 0.25714293122291565, + "learning_rate": 2.13436949948573e-05, + "loss": 0.1232, + "step": 33193 + }, + { + "epoch": 0.5920522241643777, + "grad_norm": 0.24614916741847992, + "learning_rate": 2.13421552309591e-05, + "loss": 0.1201, + "step": 33194 + }, + { + "epoch": 0.5920700602860914, + "grad_norm": 0.2710455358028412, + "learning_rate": 2.1340615481239975e-05, + "loss": 0.1175, + "step": 33195 + }, + { + "epoch": 0.5920878964078051, + "grad_norm": 0.3321419954299927, + "learning_rate": 2.1339075745705894e-05, + "loss": 0.181, + "step": 33196 + }, + { + "epoch": 0.5921057325295188, + "grad_norm": 0.2175750583410263, + "learning_rate": 2.1337536024362818e-05, + "loss": 0.1124, + "step": 33197 + }, + { + "epoch": 0.5921235686512325, + "grad_norm": 0.19029946625232697, + "learning_rate": 2.1335996317216705e-05, + "loss": 0.1369, + "step": 33198 + }, + { + "epoch": 0.5921414047729462, + "grad_norm": 0.3677079975605011, + "learning_rate": 2.1334456624273548e-05, + "loss": 0.1125, + "step": 33199 + }, + { + "epoch": 0.5921592408946599, + "grad_norm": 0.2554343342781067, + "learning_rate": 2.13329169455393e-05, + "loss": 0.1047, + "step": 33200 + }, + { + "epoch": 0.5921770770163736, + "grad_norm": 0.3062489330768585, + "learning_rate": 2.1331377281019932e-05, + "loss": 0.1156, + "step": 33201 + }, + { + "epoch": 0.5921949131380873, + "grad_norm": 0.26012712717056274, + "learning_rate": 2.1329837630721405e-05, + "loss": 0.1473, + "step": 33202 + }, + { + "epoch": 0.5922127492598009, + "grad_norm": 0.22849048674106598, + "learning_rate": 2.1328297994649708e-05, + "loss": 0.1321, + "step": 33203 + }, + { + "epoch": 0.5922305853815146, + "grad_norm": 0.26811352372169495, + "learning_rate": 2.1326758372810793e-05, + "loss": 0.1249, + "step": 33204 + }, + { + "epoch": 0.5922484215032283, + "grad_norm": 0.28943127393722534, + "learning_rate": 2.1325218765210628e-05, + "loss": 0.1687, + "step": 33205 + }, + { + "epoch": 0.592266257624942, + "grad_norm": 0.18160155415534973, + "learning_rate": 2.132367917185519e-05, + "loss": 0.0659, + "step": 33206 + }, + { + "epoch": 0.5922840937466557, + "grad_norm": 0.3300116956233978, + "learning_rate": 2.1322139592750428e-05, + "loss": 0.1284, + "step": 33207 + }, + { + "epoch": 0.5923019298683694, + "grad_norm": 0.3537009060382843, + "learning_rate": 2.1320600027902336e-05, + "loss": 0.1192, + "step": 33208 + }, + { + "epoch": 0.5923197659900831, + "grad_norm": 0.29702624678611755, + "learning_rate": 2.1319060477316867e-05, + "loss": 0.0956, + "step": 33209 + }, + { + "epoch": 0.5923376021117968, + "grad_norm": 0.24897907674312592, + "learning_rate": 2.1317520940999992e-05, + "loss": 0.121, + "step": 33210 + }, + { + "epoch": 0.5923554382335106, + "grad_norm": 0.3004474937915802, + "learning_rate": 2.1315981418957666e-05, + "loss": 0.1506, + "step": 33211 + }, + { + "epoch": 0.5923732743552242, + "grad_norm": 0.21354342997074127, + "learning_rate": 2.1314441911195882e-05, + "loss": 0.121, + "step": 33212 + }, + { + "epoch": 0.5923911104769379, + "grad_norm": 0.2853353023529053, + "learning_rate": 2.1312902417720586e-05, + "loss": 0.1517, + "step": 33213 + }, + { + "epoch": 0.5924089465986516, + "grad_norm": 0.20831961929798126, + "learning_rate": 2.1311362938537764e-05, + "loss": 0.1212, + "step": 33214 + }, + { + "epoch": 0.5924267827203653, + "grad_norm": 0.290397047996521, + "learning_rate": 2.1309823473653357e-05, + "loss": 0.11, + "step": 33215 + }, + { + "epoch": 0.592444618842079, + "grad_norm": 0.2503226101398468, + "learning_rate": 2.130828402307336e-05, + "loss": 0.1584, + "step": 33216 + }, + { + "epoch": 0.5924624549637927, + "grad_norm": 0.25400683283805847, + "learning_rate": 2.1306744586803735e-05, + "loss": 0.1318, + "step": 33217 + }, + { + "epoch": 0.5924802910855064, + "grad_norm": 0.27401265501976013, + "learning_rate": 2.1305205164850437e-05, + "loss": 0.1127, + "step": 33218 + }, + { + "epoch": 0.5924981272072201, + "grad_norm": 0.22648422420024872, + "learning_rate": 2.1303665757219437e-05, + "loss": 0.1564, + "step": 33219 + }, + { + "epoch": 0.5925159633289337, + "grad_norm": 0.27828386425971985, + "learning_rate": 2.1302126363916706e-05, + "loss": 0.1429, + "step": 33220 + }, + { + "epoch": 0.5925337994506474, + "grad_norm": 0.24349278211593628, + "learning_rate": 2.1300586984948206e-05, + "loss": 0.1573, + "step": 33221 + }, + { + "epoch": 0.5925516355723611, + "grad_norm": 0.2145342230796814, + "learning_rate": 2.1299047620319916e-05, + "loss": 0.0828, + "step": 33222 + }, + { + "epoch": 0.5925694716940748, + "grad_norm": 0.233333021402359, + "learning_rate": 2.12975082700378e-05, + "loss": 0.1058, + "step": 33223 + }, + { + "epoch": 0.5925873078157885, + "grad_norm": 0.268494188785553, + "learning_rate": 2.129596893410781e-05, + "loss": 0.1383, + "step": 33224 + }, + { + "epoch": 0.5926051439375022, + "grad_norm": 0.2279106229543686, + "learning_rate": 2.1294429612535928e-05, + "loss": 0.0904, + "step": 33225 + }, + { + "epoch": 0.5926229800592159, + "grad_norm": 0.32699477672576904, + "learning_rate": 2.129289030532812e-05, + "loss": 0.1768, + "step": 33226 + }, + { + "epoch": 0.5926408161809296, + "grad_norm": 0.25072968006134033, + "learning_rate": 2.1291351012490345e-05, + "loss": 0.1309, + "step": 33227 + }, + { + "epoch": 0.5926586523026434, + "grad_norm": 0.3012182414531708, + "learning_rate": 2.1289811734028568e-05, + "loss": 0.1198, + "step": 33228 + }, + { + "epoch": 0.592676488424357, + "grad_norm": 0.3182644248008728, + "learning_rate": 2.128827246994877e-05, + "loss": 0.1187, + "step": 33229 + }, + { + "epoch": 0.5926943245460707, + "grad_norm": 0.21471263468265533, + "learning_rate": 2.1286733220256916e-05, + "loss": 0.1387, + "step": 33230 + }, + { + "epoch": 0.5927121606677844, + "grad_norm": 0.26514896750450134, + "learning_rate": 2.1285193984958966e-05, + "loss": 0.1157, + "step": 33231 + }, + { + "epoch": 0.5927299967894981, + "grad_norm": 0.23476961255073547, + "learning_rate": 2.1283654764060885e-05, + "loss": 0.135, + "step": 33232 + }, + { + "epoch": 0.5927478329112118, + "grad_norm": 0.30298492312431335, + "learning_rate": 2.128211555756863e-05, + "loss": 0.1451, + "step": 33233 + }, + { + "epoch": 0.5927656690329255, + "grad_norm": 0.2590707242488861, + "learning_rate": 2.1280576365488186e-05, + "loss": 0.1196, + "step": 33234 + }, + { + "epoch": 0.5927835051546392, + "grad_norm": 0.2447502166032791, + "learning_rate": 2.127903718782552e-05, + "loss": 0.0814, + "step": 33235 + }, + { + "epoch": 0.5928013412763529, + "grad_norm": 0.32594069838523865, + "learning_rate": 2.1277498024586583e-05, + "loss": 0.1291, + "step": 33236 + }, + { + "epoch": 0.5928191773980666, + "grad_norm": 0.261898398399353, + "learning_rate": 2.1275958875777345e-05, + "loss": 0.1656, + "step": 33237 + }, + { + "epoch": 0.5928370135197802, + "grad_norm": 0.22964385151863098, + "learning_rate": 2.1274419741403788e-05, + "loss": 0.081, + "step": 33238 + }, + { + "epoch": 0.5928548496414939, + "grad_norm": 0.31449735164642334, + "learning_rate": 2.1272880621471868e-05, + "loss": 0.1983, + "step": 33239 + }, + { + "epoch": 0.5928726857632076, + "grad_norm": 0.24741274118423462, + "learning_rate": 2.1271341515987538e-05, + "loss": 0.1264, + "step": 33240 + }, + { + "epoch": 0.5928905218849213, + "grad_norm": 0.22285273671150208, + "learning_rate": 2.126980242495678e-05, + "loss": 0.1227, + "step": 33241 + }, + { + "epoch": 0.592908358006635, + "grad_norm": 0.252570241689682, + "learning_rate": 2.126826334838556e-05, + "loss": 0.1071, + "step": 33242 + }, + { + "epoch": 0.5929261941283487, + "grad_norm": 0.29839420318603516, + "learning_rate": 2.126672428627984e-05, + "loss": 0.1808, + "step": 33243 + }, + { + "epoch": 0.5929440302500625, + "grad_norm": 0.3151942789554596, + "learning_rate": 2.1265185238645587e-05, + "loss": 0.1161, + "step": 33244 + }, + { + "epoch": 0.5929618663717762, + "grad_norm": 0.2681471109390259, + "learning_rate": 2.126364620548877e-05, + "loss": 0.0971, + "step": 33245 + }, + { + "epoch": 0.5929797024934899, + "grad_norm": 0.20755575597286224, + "learning_rate": 2.1262107186815332e-05, + "loss": 0.1093, + "step": 33246 + }, + { + "epoch": 0.5929975386152035, + "grad_norm": 0.2835428714752197, + "learning_rate": 2.1260568182631278e-05, + "loss": 0.1113, + "step": 33247 + }, + { + "epoch": 0.5930153747369172, + "grad_norm": 0.28805971145629883, + "learning_rate": 2.1259029192942542e-05, + "loss": 0.105, + "step": 33248 + }, + { + "epoch": 0.5930332108586309, + "grad_norm": 0.23881368339061737, + "learning_rate": 2.125749021775511e-05, + "loss": 0.1335, + "step": 33249 + }, + { + "epoch": 0.5930510469803446, + "grad_norm": 0.343791127204895, + "learning_rate": 2.125595125707492e-05, + "loss": 0.1481, + "step": 33250 + }, + { + "epoch": 0.5930688831020583, + "grad_norm": 0.2349422574043274, + "learning_rate": 2.1254412310907974e-05, + "loss": 0.1234, + "step": 33251 + }, + { + "epoch": 0.593086719223772, + "grad_norm": 0.3011320233345032, + "learning_rate": 2.125287337926022e-05, + "loss": 0.1532, + "step": 33252 + }, + { + "epoch": 0.5931045553454857, + "grad_norm": 0.2773435115814209, + "learning_rate": 2.125133446213762e-05, + "loss": 0.1304, + "step": 33253 + }, + { + "epoch": 0.5931223914671994, + "grad_norm": 0.22966253757476807, + "learning_rate": 2.124979555954613e-05, + "loss": 0.1214, + "step": 33254 + }, + { + "epoch": 0.593140227588913, + "grad_norm": 0.2792550027370453, + "learning_rate": 2.1248256671491736e-05, + "loss": 0.1606, + "step": 33255 + }, + { + "epoch": 0.5931580637106267, + "grad_norm": 0.19948892295360565, + "learning_rate": 2.12467177979804e-05, + "loss": 0.075, + "step": 33256 + }, + { + "epoch": 0.5931758998323404, + "grad_norm": 0.2976965308189392, + "learning_rate": 2.124517893901808e-05, + "loss": 0.2024, + "step": 33257 + }, + { + "epoch": 0.5931937359540541, + "grad_norm": 0.31936338543891907, + "learning_rate": 2.1243640094610738e-05, + "loss": 0.1153, + "step": 33258 + }, + { + "epoch": 0.5932115720757678, + "grad_norm": 0.28131014108657837, + "learning_rate": 2.124210126476435e-05, + "loss": 0.0815, + "step": 33259 + }, + { + "epoch": 0.5932294081974815, + "grad_norm": 0.2174483686685562, + "learning_rate": 2.1240562449484878e-05, + "loss": 0.104, + "step": 33260 + }, + { + "epoch": 0.5932472443191953, + "grad_norm": 0.34584856033325195, + "learning_rate": 2.1239023648778276e-05, + "loss": 0.091, + "step": 33261 + }, + { + "epoch": 0.593265080440909, + "grad_norm": 0.2531794011592865, + "learning_rate": 2.1237484862650525e-05, + "loss": 0.1126, + "step": 33262 + }, + { + "epoch": 0.5932829165626227, + "grad_norm": 0.2942150831222534, + "learning_rate": 2.1235946091107572e-05, + "loss": 0.1479, + "step": 33263 + }, + { + "epoch": 0.5933007526843364, + "grad_norm": 0.26067620515823364, + "learning_rate": 2.1234407334155403e-05, + "loss": 0.1134, + "step": 33264 + }, + { + "epoch": 0.59331858880605, + "grad_norm": 0.23999664187431335, + "learning_rate": 2.123286859179997e-05, + "loss": 0.1282, + "step": 33265 + }, + { + "epoch": 0.5933364249277637, + "grad_norm": 0.2643487751483917, + "learning_rate": 2.123132986404724e-05, + "loss": 0.1483, + "step": 33266 + }, + { + "epoch": 0.5933542610494774, + "grad_norm": 0.246018186211586, + "learning_rate": 2.1229791150903168e-05, + "loss": 0.1292, + "step": 33267 + }, + { + "epoch": 0.5933720971711911, + "grad_norm": 0.23561933636665344, + "learning_rate": 2.1228252452373738e-05, + "loss": 0.1335, + "step": 33268 + }, + { + "epoch": 0.5933899332929048, + "grad_norm": 0.2946788966655731, + "learning_rate": 2.1226713768464895e-05, + "loss": 0.1326, + "step": 33269 + }, + { + "epoch": 0.5934077694146185, + "grad_norm": 0.24714688956737518, + "learning_rate": 2.1225175099182625e-05, + "loss": 0.1222, + "step": 33270 + }, + { + "epoch": 0.5934256055363322, + "grad_norm": 0.37364429235458374, + "learning_rate": 2.122363644453286e-05, + "loss": 0.1289, + "step": 33271 + }, + { + "epoch": 0.5934434416580459, + "grad_norm": 0.27294230461120605, + "learning_rate": 2.1222097804521603e-05, + "loss": 0.1842, + "step": 33272 + }, + { + "epoch": 0.5934612777797595, + "grad_norm": 0.2676510214805603, + "learning_rate": 2.1220559179154796e-05, + "loss": 0.149, + "step": 33273 + }, + { + "epoch": 0.5934791139014732, + "grad_norm": 0.1891804039478302, + "learning_rate": 2.121902056843841e-05, + "loss": 0.085, + "step": 33274 + }, + { + "epoch": 0.5934969500231869, + "grad_norm": 0.22598101198673248, + "learning_rate": 2.12174819723784e-05, + "loss": 0.1187, + "step": 33275 + }, + { + "epoch": 0.5935147861449006, + "grad_norm": 0.24087947607040405, + "learning_rate": 2.1215943390980734e-05, + "loss": 0.1438, + "step": 33276 + }, + { + "epoch": 0.5935326222666143, + "grad_norm": 0.24859794974327087, + "learning_rate": 2.121440482425138e-05, + "loss": 0.1221, + "step": 33277 + }, + { + "epoch": 0.5935504583883281, + "grad_norm": 0.25371286273002625, + "learning_rate": 2.1212866272196306e-05, + "loss": 0.1408, + "step": 33278 + }, + { + "epoch": 0.5935682945100418, + "grad_norm": 0.22118690609931946, + "learning_rate": 2.1211327734821467e-05, + "loss": 0.1543, + "step": 33279 + }, + { + "epoch": 0.5935861306317555, + "grad_norm": 0.20435787737369537, + "learning_rate": 2.1209789212132823e-05, + "loss": 0.1168, + "step": 33280 + }, + { + "epoch": 0.5936039667534692, + "grad_norm": 0.26853296160697937, + "learning_rate": 2.1208250704136356e-05, + "loss": 0.0991, + "step": 33281 + }, + { + "epoch": 0.5936218028751828, + "grad_norm": 0.21009793877601624, + "learning_rate": 2.1206712210838014e-05, + "loss": 0.0905, + "step": 33282 + }, + { + "epoch": 0.5936396389968965, + "grad_norm": 0.2654861509799957, + "learning_rate": 2.120517373224377e-05, + "loss": 0.1391, + "step": 33283 + }, + { + "epoch": 0.5936574751186102, + "grad_norm": 0.2824491560459137, + "learning_rate": 2.1203635268359574e-05, + "loss": 0.138, + "step": 33284 + }, + { + "epoch": 0.5936753112403239, + "grad_norm": 0.2922595143318176, + "learning_rate": 2.1202096819191405e-05, + "loss": 0.1387, + "step": 33285 + }, + { + "epoch": 0.5936931473620376, + "grad_norm": 0.4046544134616852, + "learning_rate": 2.1200558384745224e-05, + "loss": 0.1128, + "step": 33286 + }, + { + "epoch": 0.5937109834837513, + "grad_norm": 0.219674214720726, + "learning_rate": 2.1199019965026988e-05, + "loss": 0.1052, + "step": 33287 + }, + { + "epoch": 0.593728819605465, + "grad_norm": 0.2652757465839386, + "learning_rate": 2.1197481560042663e-05, + "loss": 0.1132, + "step": 33288 + }, + { + "epoch": 0.5937466557271787, + "grad_norm": 0.2525189518928528, + "learning_rate": 2.119594316979821e-05, + "loss": 0.1223, + "step": 33289 + }, + { + "epoch": 0.5937644918488924, + "grad_norm": 0.2301081269979477, + "learning_rate": 2.1194404794299596e-05, + "loss": 0.1202, + "step": 33290 + }, + { + "epoch": 0.593782327970606, + "grad_norm": 0.2703513503074646, + "learning_rate": 2.119286643355279e-05, + "loss": 0.1107, + "step": 33291 + }, + { + "epoch": 0.5938001640923197, + "grad_norm": 0.3026869297027588, + "learning_rate": 2.1191328087563743e-05, + "loss": 0.1378, + "step": 33292 + }, + { + "epoch": 0.5938180002140334, + "grad_norm": 0.32537534832954407, + "learning_rate": 2.1189789756338417e-05, + "loss": 0.1657, + "step": 33293 + }, + { + "epoch": 0.5938358363357471, + "grad_norm": 0.34009039402008057, + "learning_rate": 2.1188251439882794e-05, + "loss": 0.1728, + "step": 33294 + }, + { + "epoch": 0.5938536724574609, + "grad_norm": 0.22713157534599304, + "learning_rate": 2.1186713138202825e-05, + "loss": 0.1316, + "step": 33295 + }, + { + "epoch": 0.5938715085791746, + "grad_norm": 0.4985451400279999, + "learning_rate": 2.1185174851304467e-05, + "loss": 0.2686, + "step": 33296 + }, + { + "epoch": 0.5938893447008883, + "grad_norm": 0.2969025671482086, + "learning_rate": 2.118363657919369e-05, + "loss": 0.1032, + "step": 33297 + }, + { + "epoch": 0.593907180822602, + "grad_norm": 0.2373484969139099, + "learning_rate": 2.118209832187645e-05, + "loss": 0.1491, + "step": 33298 + }, + { + "epoch": 0.5939250169443157, + "grad_norm": 0.273009717464447, + "learning_rate": 2.1180560079358726e-05, + "loss": 0.1272, + "step": 33299 + }, + { + "epoch": 0.5939428530660293, + "grad_norm": 0.2795901596546173, + "learning_rate": 2.117902185164647e-05, + "loss": 0.1289, + "step": 33300 + }, + { + "epoch": 0.593960689187743, + "grad_norm": 0.24349498748779297, + "learning_rate": 2.1177483638745643e-05, + "loss": 0.1095, + "step": 33301 + }, + { + "epoch": 0.5939785253094567, + "grad_norm": 0.3849669396877289, + "learning_rate": 2.11759454406622e-05, + "loss": 0.1098, + "step": 33302 + }, + { + "epoch": 0.5939963614311704, + "grad_norm": 0.28109389543533325, + "learning_rate": 2.1174407257402123e-05, + "loss": 0.1234, + "step": 33303 + }, + { + "epoch": 0.5940141975528841, + "grad_norm": 0.2173144370317459, + "learning_rate": 2.1172869088971362e-05, + "loss": 0.1219, + "step": 33304 + }, + { + "epoch": 0.5940320336745978, + "grad_norm": 0.26749375462532043, + "learning_rate": 2.1171330935375885e-05, + "loss": 0.1006, + "step": 33305 + }, + { + "epoch": 0.5940498697963115, + "grad_norm": 0.3091845214366913, + "learning_rate": 2.1169792796621642e-05, + "loss": 0.1239, + "step": 33306 + }, + { + "epoch": 0.5940677059180252, + "grad_norm": 0.24436207115650177, + "learning_rate": 2.1168254672714616e-05, + "loss": 0.1681, + "step": 33307 + }, + { + "epoch": 0.5940855420397388, + "grad_norm": 0.3007710576057434, + "learning_rate": 2.116671656366076e-05, + "loss": 0.1298, + "step": 33308 + }, + { + "epoch": 0.5941033781614525, + "grad_norm": 0.33093568682670593, + "learning_rate": 2.116517846946603e-05, + "loss": 0.1179, + "step": 33309 + }, + { + "epoch": 0.5941212142831662, + "grad_norm": 0.313926100730896, + "learning_rate": 2.1163640390136387e-05, + "loss": 0.1884, + "step": 33310 + }, + { + "epoch": 0.5941390504048799, + "grad_norm": 0.21629378199577332, + "learning_rate": 2.1162102325677803e-05, + "loss": 0.1333, + "step": 33311 + }, + { + "epoch": 0.5941568865265937, + "grad_norm": 0.22283364832401276, + "learning_rate": 2.116056427609624e-05, + "loss": 0.1107, + "step": 33312 + }, + { + "epoch": 0.5941747226483074, + "grad_norm": 0.2408323436975479, + "learning_rate": 2.1159026241397657e-05, + "loss": 0.1306, + "step": 33313 + }, + { + "epoch": 0.5941925587700211, + "grad_norm": 0.30166885256767273, + "learning_rate": 2.1157488221588013e-05, + "loss": 0.0941, + "step": 33314 + }, + { + "epoch": 0.5942103948917348, + "grad_norm": 0.2446063756942749, + "learning_rate": 2.1155950216673263e-05, + "loss": 0.096, + "step": 33315 + }, + { + "epoch": 0.5942282310134485, + "grad_norm": 0.2665475904941559, + "learning_rate": 2.1154412226659388e-05, + "loss": 0.1321, + "step": 33316 + }, + { + "epoch": 0.5942460671351621, + "grad_norm": 0.20058229565620422, + "learning_rate": 2.1152874251552333e-05, + "loss": 0.1159, + "step": 33317 + }, + { + "epoch": 0.5942639032568758, + "grad_norm": 0.33355823159217834, + "learning_rate": 2.115133629135807e-05, + "loss": 0.2198, + "step": 33318 + }, + { + "epoch": 0.5942817393785895, + "grad_norm": 0.24582815170288086, + "learning_rate": 2.114979834608255e-05, + "loss": 0.1305, + "step": 33319 + }, + { + "epoch": 0.5942995755003032, + "grad_norm": 0.2085101157426834, + "learning_rate": 2.114826041573175e-05, + "loss": 0.1076, + "step": 33320 + }, + { + "epoch": 0.5943174116220169, + "grad_norm": 0.2698776125907898, + "learning_rate": 2.1146722500311624e-05, + "loss": 0.1046, + "step": 33321 + }, + { + "epoch": 0.5943352477437306, + "grad_norm": 0.30865195393562317, + "learning_rate": 2.114518459982813e-05, + "loss": 0.1145, + "step": 33322 + }, + { + "epoch": 0.5943530838654443, + "grad_norm": 0.35019543766975403, + "learning_rate": 2.1143646714287226e-05, + "loss": 0.1861, + "step": 33323 + }, + { + "epoch": 0.594370919987158, + "grad_norm": 0.2837424576282501, + "learning_rate": 2.1142108843694886e-05, + "loss": 0.1243, + "step": 33324 + }, + { + "epoch": 0.5943887561088717, + "grad_norm": 0.21246322989463806, + "learning_rate": 2.114057098805706e-05, + "loss": 0.1121, + "step": 33325 + }, + { + "epoch": 0.5944065922305853, + "grad_norm": 0.44531095027923584, + "learning_rate": 2.113903314737972e-05, + "loss": 0.1534, + "step": 33326 + }, + { + "epoch": 0.594424428352299, + "grad_norm": 0.25767385959625244, + "learning_rate": 2.113749532166882e-05, + "loss": 0.1031, + "step": 33327 + }, + { + "epoch": 0.5944422644740127, + "grad_norm": 0.32219457626342773, + "learning_rate": 2.1135957510930312e-05, + "loss": 0.1226, + "step": 33328 + }, + { + "epoch": 0.5944601005957265, + "grad_norm": 0.30908921360969543, + "learning_rate": 2.113441971517018e-05, + "loss": 0.1359, + "step": 33329 + }, + { + "epoch": 0.5944779367174402, + "grad_norm": 0.21614624559879303, + "learning_rate": 2.113288193439437e-05, + "loss": 0.1206, + "step": 33330 + }, + { + "epoch": 0.5944957728391539, + "grad_norm": 0.27616238594055176, + "learning_rate": 2.113134416860884e-05, + "loss": 0.1943, + "step": 33331 + }, + { + "epoch": 0.5945136089608676, + "grad_norm": 0.27395099401474, + "learning_rate": 2.112980641781955e-05, + "loss": 0.1448, + "step": 33332 + }, + { + "epoch": 0.5945314450825813, + "grad_norm": 0.3325318694114685, + "learning_rate": 2.112826868203248e-05, + "loss": 0.1472, + "step": 33333 + }, + { + "epoch": 0.594549281204295, + "grad_norm": 0.2864380478858948, + "learning_rate": 2.1126730961253576e-05, + "loss": 0.1497, + "step": 33334 + }, + { + "epoch": 0.5945671173260086, + "grad_norm": 0.3155297040939331, + "learning_rate": 2.11251932554888e-05, + "loss": 0.1583, + "step": 33335 + }, + { + "epoch": 0.5945849534477223, + "grad_norm": 0.2774326205253601, + "learning_rate": 2.1123655564744106e-05, + "loss": 0.1158, + "step": 33336 + }, + { + "epoch": 0.594602789569436, + "grad_norm": 0.27363941073417664, + "learning_rate": 2.112211788902547e-05, + "loss": 0.0904, + "step": 33337 + }, + { + "epoch": 0.5946206256911497, + "grad_norm": 0.2833259105682373, + "learning_rate": 2.112058022833884e-05, + "loss": 0.1192, + "step": 33338 + }, + { + "epoch": 0.5946384618128634, + "grad_norm": 0.21289248764514923, + "learning_rate": 2.1119042582690184e-05, + "loss": 0.1013, + "step": 33339 + }, + { + "epoch": 0.5946562979345771, + "grad_norm": 0.25423264503479004, + "learning_rate": 2.1117504952085463e-05, + "loss": 0.1413, + "step": 33340 + }, + { + "epoch": 0.5946741340562908, + "grad_norm": 0.2733125388622284, + "learning_rate": 2.111596733653062e-05, + "loss": 0.2077, + "step": 33341 + }, + { + "epoch": 0.5946919701780045, + "grad_norm": 0.29845696687698364, + "learning_rate": 2.1114429736031643e-05, + "loss": 0.2034, + "step": 33342 + }, + { + "epoch": 0.5947098062997181, + "grad_norm": 0.2779388427734375, + "learning_rate": 2.1112892150594476e-05, + "loss": 0.1165, + "step": 33343 + }, + { + "epoch": 0.5947276424214318, + "grad_norm": 0.2692381739616394, + "learning_rate": 2.1111354580225077e-05, + "loss": 0.1429, + "step": 33344 + }, + { + "epoch": 0.5947454785431456, + "grad_norm": 0.1827811747789383, + "learning_rate": 2.110981702492941e-05, + "loss": 0.0748, + "step": 33345 + }, + { + "epoch": 0.5947633146648593, + "grad_norm": 0.17644137144088745, + "learning_rate": 2.1108279484713437e-05, + "loss": 0.0894, + "step": 33346 + }, + { + "epoch": 0.594781150786573, + "grad_norm": 0.21733225882053375, + "learning_rate": 2.1106741959583122e-05, + "loss": 0.1039, + "step": 33347 + }, + { + "epoch": 0.5947989869082867, + "grad_norm": 0.24388693273067474, + "learning_rate": 2.110520444954442e-05, + "loss": 0.1347, + "step": 33348 + }, + { + "epoch": 0.5948168230300004, + "grad_norm": 0.275558739900589, + "learning_rate": 2.1103666954603278e-05, + "loss": 0.0931, + "step": 33349 + }, + { + "epoch": 0.5948346591517141, + "grad_norm": 0.25211426615715027, + "learning_rate": 2.1102129474765682e-05, + "loss": 0.1247, + "step": 33350 + }, + { + "epoch": 0.5948524952734278, + "grad_norm": 0.27250900864601135, + "learning_rate": 2.1100592010037575e-05, + "loss": 0.1499, + "step": 33351 + }, + { + "epoch": 0.5948703313951414, + "grad_norm": 0.22814081609249115, + "learning_rate": 2.109905456042492e-05, + "loss": 0.0994, + "step": 33352 + }, + { + "epoch": 0.5948881675168551, + "grad_norm": 0.44190946221351624, + "learning_rate": 2.1097517125933676e-05, + "loss": 0.1893, + "step": 33353 + }, + { + "epoch": 0.5949060036385688, + "grad_norm": 0.1724148988723755, + "learning_rate": 2.1095979706569797e-05, + "loss": 0.1342, + "step": 33354 + }, + { + "epoch": 0.5949238397602825, + "grad_norm": 0.4163680970668793, + "learning_rate": 2.109444230233926e-05, + "loss": 0.1964, + "step": 33355 + }, + { + "epoch": 0.5949416758819962, + "grad_norm": 0.289826363325119, + "learning_rate": 2.1092904913248013e-05, + "loss": 0.1629, + "step": 33356 + }, + { + "epoch": 0.5949595120037099, + "grad_norm": 0.24717707931995392, + "learning_rate": 2.1091367539302014e-05, + "loss": 0.1576, + "step": 33357 + }, + { + "epoch": 0.5949773481254236, + "grad_norm": 0.3352966606616974, + "learning_rate": 2.1089830180507215e-05, + "loss": 0.1558, + "step": 33358 + }, + { + "epoch": 0.5949951842471373, + "grad_norm": 0.3377259075641632, + "learning_rate": 2.108829283686959e-05, + "loss": 0.0998, + "step": 33359 + }, + { + "epoch": 0.595013020368851, + "grad_norm": 0.3050462305545807, + "learning_rate": 2.10867555083951e-05, + "loss": 0.1626, + "step": 33360 + }, + { + "epoch": 0.5950308564905646, + "grad_norm": 0.408711701631546, + "learning_rate": 2.1085218195089694e-05, + "loss": 0.1507, + "step": 33361 + }, + { + "epoch": 0.5950486926122784, + "grad_norm": 0.2460470199584961, + "learning_rate": 2.1083680896959327e-05, + "loss": 0.1037, + "step": 33362 + }, + { + "epoch": 0.5950665287339921, + "grad_norm": 0.3225964307785034, + "learning_rate": 2.1082143614009976e-05, + "loss": 0.1138, + "step": 33363 + }, + { + "epoch": 0.5950843648557058, + "grad_norm": 0.18813158571720123, + "learning_rate": 2.1080606346247586e-05, + "loss": 0.1068, + "step": 33364 + }, + { + "epoch": 0.5951022009774195, + "grad_norm": 0.37332475185394287, + "learning_rate": 2.1079069093678115e-05, + "loss": 0.2153, + "step": 33365 + }, + { + "epoch": 0.5951200370991332, + "grad_norm": 0.28281331062316895, + "learning_rate": 2.1077531856307535e-05, + "loss": 0.1248, + "step": 33366 + }, + { + "epoch": 0.5951378732208469, + "grad_norm": 0.25799229741096497, + "learning_rate": 2.1075994634141787e-05, + "loss": 0.1422, + "step": 33367 + }, + { + "epoch": 0.5951557093425606, + "grad_norm": 0.19244201481342316, + "learning_rate": 2.1074457427186846e-05, + "loss": 0.1143, + "step": 33368 + }, + { + "epoch": 0.5951735454642743, + "grad_norm": 0.3046590983867645, + "learning_rate": 2.107292023544866e-05, + "loss": 0.0733, + "step": 33369 + }, + { + "epoch": 0.5951913815859879, + "grad_norm": 0.28025656938552856, + "learning_rate": 2.1071383058933198e-05, + "loss": 0.1024, + "step": 33370 + }, + { + "epoch": 0.5952092177077016, + "grad_norm": 0.18772763013839722, + "learning_rate": 2.10698458976464e-05, + "loss": 0.1443, + "step": 33371 + }, + { + "epoch": 0.5952270538294153, + "grad_norm": 0.26078474521636963, + "learning_rate": 2.1068308751594247e-05, + "loss": 0.107, + "step": 33372 + }, + { + "epoch": 0.595244889951129, + "grad_norm": 0.29275834560394287, + "learning_rate": 2.1066771620782682e-05, + "loss": 0.1261, + "step": 33373 + }, + { + "epoch": 0.5952627260728427, + "grad_norm": 0.25916218757629395, + "learning_rate": 2.1065234505217673e-05, + "loss": 0.0741, + "step": 33374 + }, + { + "epoch": 0.5952805621945564, + "grad_norm": 0.2652100920677185, + "learning_rate": 2.1063697404905162e-05, + "loss": 0.1134, + "step": 33375 + }, + { + "epoch": 0.5952983983162701, + "grad_norm": 0.2871285080909729, + "learning_rate": 2.1062160319851134e-05, + "loss": 0.0927, + "step": 33376 + }, + { + "epoch": 0.5953162344379838, + "grad_norm": 0.2495378851890564, + "learning_rate": 2.1060623250061533e-05, + "loss": 0.1472, + "step": 33377 + }, + { + "epoch": 0.5953340705596974, + "grad_norm": 0.2431400567293167, + "learning_rate": 2.1059086195542314e-05, + "loss": 0.0966, + "step": 33378 + }, + { + "epoch": 0.5953519066814112, + "grad_norm": 0.23413266241550446, + "learning_rate": 2.1057549156299432e-05, + "loss": 0.1348, + "step": 33379 + }, + { + "epoch": 0.5953697428031249, + "grad_norm": 0.20966389775276184, + "learning_rate": 2.1056012132338855e-05, + "loss": 0.11, + "step": 33380 + }, + { + "epoch": 0.5953875789248386, + "grad_norm": 0.28601932525634766, + "learning_rate": 2.1054475123666534e-05, + "loss": 0.1482, + "step": 33381 + }, + { + "epoch": 0.5954054150465523, + "grad_norm": 0.240091472864151, + "learning_rate": 2.105293813028844e-05, + "loss": 0.1023, + "step": 33382 + }, + { + "epoch": 0.595423251168266, + "grad_norm": 0.24070826172828674, + "learning_rate": 2.1051401152210515e-05, + "loss": 0.1135, + "step": 33383 + }, + { + "epoch": 0.5954410872899797, + "grad_norm": 0.3157764971256256, + "learning_rate": 2.1049864189438713e-05, + "loss": 0.1471, + "step": 33384 + }, + { + "epoch": 0.5954589234116934, + "grad_norm": 0.37929174304008484, + "learning_rate": 2.1048327241979014e-05, + "loss": 0.1747, + "step": 33385 + }, + { + "epoch": 0.5954767595334071, + "grad_norm": 0.2279956042766571, + "learning_rate": 2.1046790309837367e-05, + "loss": 0.0797, + "step": 33386 + }, + { + "epoch": 0.5954945956551208, + "grad_norm": 0.30363374948501587, + "learning_rate": 2.1045253393019718e-05, + "loss": 0.1187, + "step": 33387 + }, + { + "epoch": 0.5955124317768344, + "grad_norm": 0.25819629430770874, + "learning_rate": 2.1043716491532027e-05, + "loss": 0.1616, + "step": 33388 + }, + { + "epoch": 0.5955302678985481, + "grad_norm": 0.23750442266464233, + "learning_rate": 2.1042179605380267e-05, + "loss": 0.1144, + "step": 33389 + }, + { + "epoch": 0.5955481040202618, + "grad_norm": 0.4008646607398987, + "learning_rate": 2.104064273457039e-05, + "loss": 0.1571, + "step": 33390 + }, + { + "epoch": 0.5955659401419755, + "grad_norm": 0.1990721970796585, + "learning_rate": 2.1039105879108344e-05, + "loss": 0.0821, + "step": 33391 + }, + { + "epoch": 0.5955837762636892, + "grad_norm": 0.2515743672847748, + "learning_rate": 2.103756903900009e-05, + "loss": 0.1208, + "step": 33392 + }, + { + "epoch": 0.5956016123854029, + "grad_norm": 0.3623788058757782, + "learning_rate": 2.1036032214251584e-05, + "loss": 0.1851, + "step": 33393 + }, + { + "epoch": 0.5956194485071166, + "grad_norm": 0.2713909447193146, + "learning_rate": 2.1034495404868785e-05, + "loss": 0.1443, + "step": 33394 + }, + { + "epoch": 0.5956372846288303, + "grad_norm": 0.3002442419528961, + "learning_rate": 2.1032958610857663e-05, + "loss": 0.1442, + "step": 33395 + }, + { + "epoch": 0.595655120750544, + "grad_norm": 0.21712704002857208, + "learning_rate": 2.1031421832224156e-05, + "loss": 0.124, + "step": 33396 + }, + { + "epoch": 0.5956729568722577, + "grad_norm": 0.35082346200942993, + "learning_rate": 2.1029885068974224e-05, + "loss": 0.0981, + "step": 33397 + }, + { + "epoch": 0.5956907929939714, + "grad_norm": 0.24211251735687256, + "learning_rate": 2.1028348321113837e-05, + "loss": 0.0879, + "step": 33398 + }, + { + "epoch": 0.5957086291156851, + "grad_norm": 0.31062939763069153, + "learning_rate": 2.1026811588648945e-05, + "loss": 0.1071, + "step": 33399 + }, + { + "epoch": 0.5957264652373988, + "grad_norm": 0.38054537773132324, + "learning_rate": 2.1025274871585497e-05, + "loss": 0.176, + "step": 33400 + }, + { + "epoch": 0.5957443013591125, + "grad_norm": 0.26598379015922546, + "learning_rate": 2.1023738169929457e-05, + "loss": 0.1102, + "step": 33401 + }, + { + "epoch": 0.5957621374808262, + "grad_norm": 0.25477680563926697, + "learning_rate": 2.1022201483686783e-05, + "loss": 0.1151, + "step": 33402 + }, + { + "epoch": 0.5957799736025399, + "grad_norm": 0.33847224712371826, + "learning_rate": 2.102066481286343e-05, + "loss": 0.1444, + "step": 33403 + }, + { + "epoch": 0.5957978097242536, + "grad_norm": 0.3492162823677063, + "learning_rate": 2.101912815746536e-05, + "loss": 0.1283, + "step": 33404 + }, + { + "epoch": 0.5958156458459672, + "grad_norm": 0.31831786036491394, + "learning_rate": 2.101759151749852e-05, + "loss": 0.1629, + "step": 33405 + }, + { + "epoch": 0.5958334819676809, + "grad_norm": 0.3120494484901428, + "learning_rate": 2.1016054892968863e-05, + "loss": 0.1237, + "step": 33406 + }, + { + "epoch": 0.5958513180893946, + "grad_norm": 0.2621499300003052, + "learning_rate": 2.1014518283882363e-05, + "loss": 0.1145, + "step": 33407 + }, + { + "epoch": 0.5958691542111083, + "grad_norm": 0.28799012303352356, + "learning_rate": 2.101298169024496e-05, + "loss": 0.0874, + "step": 33408 + }, + { + "epoch": 0.595886990332822, + "grad_norm": 0.25026583671569824, + "learning_rate": 2.1011445112062625e-05, + "loss": 0.1322, + "step": 33409 + }, + { + "epoch": 0.5959048264545357, + "grad_norm": 0.2893786132335663, + "learning_rate": 2.1009908549341296e-05, + "loss": 0.1563, + "step": 33410 + }, + { + "epoch": 0.5959226625762494, + "grad_norm": 0.23970358073711395, + "learning_rate": 2.1008372002086947e-05, + "loss": 0.1054, + "step": 33411 + }, + { + "epoch": 0.5959404986979631, + "grad_norm": 0.2955992519855499, + "learning_rate": 2.100683547030553e-05, + "loss": 0.1415, + "step": 33412 + }, + { + "epoch": 0.5959583348196769, + "grad_norm": 0.33711910247802734, + "learning_rate": 2.1005298954002995e-05, + "loss": 0.1696, + "step": 33413 + }, + { + "epoch": 0.5959761709413905, + "grad_norm": 0.28058022260665894, + "learning_rate": 2.1003762453185293e-05, + "loss": 0.1622, + "step": 33414 + }, + { + "epoch": 0.5959940070631042, + "grad_norm": 0.29343101382255554, + "learning_rate": 2.100222596785839e-05, + "loss": 0.1366, + "step": 33415 + }, + { + "epoch": 0.5960118431848179, + "grad_norm": 0.2745125889778137, + "learning_rate": 2.1000689498028248e-05, + "loss": 0.1391, + "step": 33416 + }, + { + "epoch": 0.5960296793065316, + "grad_norm": 0.27711477875709534, + "learning_rate": 2.0999153043700814e-05, + "loss": 0.1516, + "step": 33417 + }, + { + "epoch": 0.5960475154282453, + "grad_norm": 0.24365398287773132, + "learning_rate": 2.0997616604882044e-05, + "loss": 0.1031, + "step": 33418 + }, + { + "epoch": 0.596065351549959, + "grad_norm": 0.20161396265029907, + "learning_rate": 2.0996080181577886e-05, + "loss": 0.1145, + "step": 33419 + }, + { + "epoch": 0.5960831876716727, + "grad_norm": 0.23411734402179718, + "learning_rate": 2.0994543773794316e-05, + "loss": 0.1331, + "step": 33420 + }, + { + "epoch": 0.5961010237933864, + "grad_norm": 0.2648693025112152, + "learning_rate": 2.0993007381537266e-05, + "loss": 0.1795, + "step": 33421 + }, + { + "epoch": 0.5961188599151, + "grad_norm": 0.209220290184021, + "learning_rate": 2.0991471004812714e-05, + "loss": 0.1217, + "step": 33422 + }, + { + "epoch": 0.5961366960368137, + "grad_norm": 0.3890189230442047, + "learning_rate": 2.098993464362659e-05, + "loss": 0.1424, + "step": 33423 + }, + { + "epoch": 0.5961545321585274, + "grad_norm": 0.26895254850387573, + "learning_rate": 2.0988398297984878e-05, + "loss": 0.1421, + "step": 33424 + }, + { + "epoch": 0.5961723682802411, + "grad_norm": 0.23200629651546478, + "learning_rate": 2.0986861967893524e-05, + "loss": 0.1164, + "step": 33425 + }, + { + "epoch": 0.5961902044019548, + "grad_norm": 0.2543470859527588, + "learning_rate": 2.0985325653358473e-05, + "loss": 0.0801, + "step": 33426 + }, + { + "epoch": 0.5962080405236685, + "grad_norm": 0.30066829919815063, + "learning_rate": 2.0983789354385677e-05, + "loss": 0.1875, + "step": 33427 + }, + { + "epoch": 0.5962258766453822, + "grad_norm": 0.28343334794044495, + "learning_rate": 2.0982253070981112e-05, + "loss": 0.1537, + "step": 33428 + }, + { + "epoch": 0.5962437127670959, + "grad_norm": 0.356293261051178, + "learning_rate": 2.0980716803150716e-05, + "loss": 0.1492, + "step": 33429 + }, + { + "epoch": 0.5962615488888097, + "grad_norm": 0.2436731904745102, + "learning_rate": 2.0979180550900453e-05, + "loss": 0.1077, + "step": 33430 + }, + { + "epoch": 0.5962793850105234, + "grad_norm": 0.2647886276245117, + "learning_rate": 2.0977644314236278e-05, + "loss": 0.0954, + "step": 33431 + }, + { + "epoch": 0.596297221132237, + "grad_norm": 0.20411249995231628, + "learning_rate": 2.0976108093164133e-05, + "loss": 0.0756, + "step": 33432 + }, + { + "epoch": 0.5963150572539507, + "grad_norm": 0.3440779447555542, + "learning_rate": 2.097457188768999e-05, + "loss": 0.154, + "step": 33433 + }, + { + "epoch": 0.5963328933756644, + "grad_norm": 0.251858651638031, + "learning_rate": 2.09730356978198e-05, + "loss": 0.1252, + "step": 33434 + }, + { + "epoch": 0.5963507294973781, + "grad_norm": 0.3432500958442688, + "learning_rate": 2.0971499523559506e-05, + "loss": 0.1285, + "step": 33435 + }, + { + "epoch": 0.5963685656190918, + "grad_norm": 0.22653472423553467, + "learning_rate": 2.0969963364915068e-05, + "loss": 0.0866, + "step": 33436 + }, + { + "epoch": 0.5963864017408055, + "grad_norm": 0.3579009175300598, + "learning_rate": 2.0968427221892455e-05, + "loss": 0.1734, + "step": 33437 + }, + { + "epoch": 0.5964042378625192, + "grad_norm": 0.23884211480617523, + "learning_rate": 2.096689109449761e-05, + "loss": 0.1586, + "step": 33438 + }, + { + "epoch": 0.5964220739842329, + "grad_norm": 0.1790805160999298, + "learning_rate": 2.096535498273649e-05, + "loss": 0.0926, + "step": 33439 + }, + { + "epoch": 0.5964399101059465, + "grad_norm": 0.20240965485572815, + "learning_rate": 2.0963818886615034e-05, + "loss": 0.0715, + "step": 33440 + }, + { + "epoch": 0.5964577462276602, + "grad_norm": 0.4511924386024475, + "learning_rate": 2.096228280613922e-05, + "loss": 0.1523, + "step": 33441 + }, + { + "epoch": 0.5964755823493739, + "grad_norm": 0.4020020067691803, + "learning_rate": 2.0960746741314992e-05, + "loss": 0.1524, + "step": 33442 + }, + { + "epoch": 0.5964934184710876, + "grad_norm": 0.3994465470314026, + "learning_rate": 2.0959210692148306e-05, + "loss": 0.2365, + "step": 33443 + }, + { + "epoch": 0.5965112545928013, + "grad_norm": 0.3291572332382202, + "learning_rate": 2.0957674658645115e-05, + "loss": 0.113, + "step": 33444 + }, + { + "epoch": 0.596529090714515, + "grad_norm": 0.21668535470962524, + "learning_rate": 2.0956138640811363e-05, + "loss": 0.1603, + "step": 33445 + }, + { + "epoch": 0.5965469268362288, + "grad_norm": 0.25522488355636597, + "learning_rate": 2.0954602638653027e-05, + "loss": 0.127, + "step": 33446 + }, + { + "epoch": 0.5965647629579425, + "grad_norm": 0.2664676010608673, + "learning_rate": 2.0953066652176045e-05, + "loss": 0.1381, + "step": 33447 + }, + { + "epoch": 0.5965825990796562, + "grad_norm": 0.2657136023044586, + "learning_rate": 2.095153068138637e-05, + "loss": 0.1276, + "step": 33448 + }, + { + "epoch": 0.5966004352013698, + "grad_norm": 0.2838156521320343, + "learning_rate": 2.094999472628996e-05, + "loss": 0.0919, + "step": 33449 + }, + { + "epoch": 0.5966182713230835, + "grad_norm": 0.2860822379589081, + "learning_rate": 2.094845878689277e-05, + "loss": 0.1354, + "step": 33450 + }, + { + "epoch": 0.5966361074447972, + "grad_norm": 0.2965041995048523, + "learning_rate": 2.0946922863200764e-05, + "loss": 0.1647, + "step": 33451 + }, + { + "epoch": 0.5966539435665109, + "grad_norm": 0.2065976858139038, + "learning_rate": 2.0945386955219878e-05, + "loss": 0.1204, + "step": 33452 + }, + { + "epoch": 0.5966717796882246, + "grad_norm": 0.24486243724822998, + "learning_rate": 2.0943851062956063e-05, + "loss": 0.1554, + "step": 33453 + }, + { + "epoch": 0.5966896158099383, + "grad_norm": 0.2656596004962921, + "learning_rate": 2.0942315186415294e-05, + "loss": 0.1264, + "step": 33454 + }, + { + "epoch": 0.596707451931652, + "grad_norm": 0.2900265157222748, + "learning_rate": 2.0940779325603514e-05, + "loss": 0.175, + "step": 33455 + }, + { + "epoch": 0.5967252880533657, + "grad_norm": 0.23209281265735626, + "learning_rate": 2.0939243480526665e-05, + "loss": 0.1579, + "step": 33456 + }, + { + "epoch": 0.5967431241750794, + "grad_norm": 0.24760572612285614, + "learning_rate": 2.093770765119072e-05, + "loss": 0.1151, + "step": 33457 + }, + { + "epoch": 0.596760960296793, + "grad_norm": 0.20232835412025452, + "learning_rate": 2.0936171837601613e-05, + "loss": 0.1399, + "step": 33458 + }, + { + "epoch": 0.5967787964185067, + "grad_norm": 0.2508339583873749, + "learning_rate": 2.0934636039765317e-05, + "loss": 0.1242, + "step": 33459 + }, + { + "epoch": 0.5967966325402204, + "grad_norm": 0.2698204219341278, + "learning_rate": 2.0933100257687776e-05, + "loss": 0.0805, + "step": 33460 + }, + { + "epoch": 0.5968144686619341, + "grad_norm": 0.235343798995018, + "learning_rate": 2.0931564491374946e-05, + "loss": 0.1015, + "step": 33461 + }, + { + "epoch": 0.5968323047836478, + "grad_norm": 0.3445269763469696, + "learning_rate": 2.0930028740832764e-05, + "loss": 0.1336, + "step": 33462 + }, + { + "epoch": 0.5968501409053616, + "grad_norm": 0.21841862797737122, + "learning_rate": 2.0928493006067204e-05, + "loss": 0.098, + "step": 33463 + }, + { + "epoch": 0.5968679770270753, + "grad_norm": 0.25828567147254944, + "learning_rate": 2.092695728708421e-05, + "loss": 0.1447, + "step": 33464 + }, + { + "epoch": 0.596885813148789, + "grad_norm": 0.23380765318870544, + "learning_rate": 2.092542158388974e-05, + "loss": 0.1772, + "step": 33465 + }, + { + "epoch": 0.5969036492705027, + "grad_norm": 0.29929211735725403, + "learning_rate": 2.0923885896489734e-05, + "loss": 0.1315, + "step": 33466 + }, + { + "epoch": 0.5969214853922163, + "grad_norm": 0.2961951792240143, + "learning_rate": 2.0922350224890163e-05, + "loss": 0.1137, + "step": 33467 + }, + { + "epoch": 0.59693932151393, + "grad_norm": 0.23490402102470398, + "learning_rate": 2.092081456909697e-05, + "loss": 0.1172, + "step": 33468 + }, + { + "epoch": 0.5969571576356437, + "grad_norm": 0.26800209283828735, + "learning_rate": 2.0919278929116106e-05, + "loss": 0.1379, + "step": 33469 + }, + { + "epoch": 0.5969749937573574, + "grad_norm": 0.21255186200141907, + "learning_rate": 2.0917743304953534e-05, + "loss": 0.1449, + "step": 33470 + }, + { + "epoch": 0.5969928298790711, + "grad_norm": 0.3051043450832367, + "learning_rate": 2.091620769661518e-05, + "loss": 0.1378, + "step": 33471 + }, + { + "epoch": 0.5970106660007848, + "grad_norm": 0.2557990550994873, + "learning_rate": 2.0914672104107032e-05, + "loss": 0.1095, + "step": 33472 + }, + { + "epoch": 0.5970285021224985, + "grad_norm": 0.31415480375289917, + "learning_rate": 2.0913136527435026e-05, + "loss": 0.129, + "step": 33473 + }, + { + "epoch": 0.5970463382442122, + "grad_norm": 0.2764354944229126, + "learning_rate": 2.0911600966605114e-05, + "loss": 0.0963, + "step": 33474 + }, + { + "epoch": 0.5970641743659258, + "grad_norm": 0.2895379662513733, + "learning_rate": 2.0910065421623236e-05, + "loss": 0.1437, + "step": 33475 + }, + { + "epoch": 0.5970820104876395, + "grad_norm": 0.257869154214859, + "learning_rate": 2.090852989249537e-05, + "loss": 0.1338, + "step": 33476 + }, + { + "epoch": 0.5970998466093532, + "grad_norm": 0.20041821897029877, + "learning_rate": 2.090699437922745e-05, + "loss": 0.1147, + "step": 33477 + }, + { + "epoch": 0.5971176827310669, + "grad_norm": 0.26313722133636475, + "learning_rate": 2.090545888182544e-05, + "loss": 0.0902, + "step": 33478 + }, + { + "epoch": 0.5971355188527806, + "grad_norm": 0.4446222484111786, + "learning_rate": 2.0903923400295273e-05, + "loss": 0.1261, + "step": 33479 + }, + { + "epoch": 0.5971533549744944, + "grad_norm": 0.19148489832878113, + "learning_rate": 2.0902387934642923e-05, + "loss": 0.1077, + "step": 33480 + }, + { + "epoch": 0.5971711910962081, + "grad_norm": 0.3146808445453644, + "learning_rate": 2.0900852484874335e-05, + "loss": 0.0833, + "step": 33481 + }, + { + "epoch": 0.5971890272179218, + "grad_norm": 0.21696031093597412, + "learning_rate": 2.0899317050995454e-05, + "loss": 0.0937, + "step": 33482 + }, + { + "epoch": 0.5972068633396355, + "grad_norm": 0.1917061060667038, + "learning_rate": 2.0897781633012238e-05, + "loss": 0.0897, + "step": 33483 + }, + { + "epoch": 0.5972246994613492, + "grad_norm": 0.31284597516059875, + "learning_rate": 2.0896246230930632e-05, + "loss": 0.1646, + "step": 33484 + }, + { + "epoch": 0.5972425355830628, + "grad_norm": 0.3136140704154968, + "learning_rate": 2.0894710844756593e-05, + "loss": 0.19, + "step": 33485 + }, + { + "epoch": 0.5972603717047765, + "grad_norm": 0.30459582805633545, + "learning_rate": 2.089317547449608e-05, + "loss": 0.1533, + "step": 33486 + }, + { + "epoch": 0.5972782078264902, + "grad_norm": 0.35404688119888306, + "learning_rate": 2.0891640120155037e-05, + "loss": 0.1404, + "step": 33487 + }, + { + "epoch": 0.5972960439482039, + "grad_norm": 0.2684485912322998, + "learning_rate": 2.0890104781739405e-05, + "loss": 0.1098, + "step": 33488 + }, + { + "epoch": 0.5973138800699176, + "grad_norm": 0.39728739857673645, + "learning_rate": 2.088856945925516e-05, + "loss": 0.1215, + "step": 33489 + }, + { + "epoch": 0.5973317161916313, + "grad_norm": 0.31502604484558105, + "learning_rate": 2.0887034152708234e-05, + "loss": 0.128, + "step": 33490 + }, + { + "epoch": 0.597349552313345, + "grad_norm": 0.25949475169181824, + "learning_rate": 2.0885498862104584e-05, + "loss": 0.1328, + "step": 33491 + }, + { + "epoch": 0.5973673884350587, + "grad_norm": 0.34783926606178284, + "learning_rate": 2.0883963587450154e-05, + "loss": 0.0974, + "step": 33492 + }, + { + "epoch": 0.5973852245567723, + "grad_norm": 0.22386687994003296, + "learning_rate": 2.0882428328750914e-05, + "loss": 0.1278, + "step": 33493 + }, + { + "epoch": 0.597403060678486, + "grad_norm": 0.23003214597702026, + "learning_rate": 2.0880893086012804e-05, + "loss": 0.1089, + "step": 33494 + }, + { + "epoch": 0.5974208968001997, + "grad_norm": 0.21393248438835144, + "learning_rate": 2.0879357859241773e-05, + "loss": 0.1036, + "step": 33495 + }, + { + "epoch": 0.5974387329219134, + "grad_norm": 0.24486044049263, + "learning_rate": 2.087782264844377e-05, + "loss": 0.167, + "step": 33496 + }, + { + "epoch": 0.5974565690436272, + "grad_norm": 0.300487756729126, + "learning_rate": 2.087628745362475e-05, + "loss": 0.1555, + "step": 33497 + }, + { + "epoch": 0.5974744051653409, + "grad_norm": 0.29788637161254883, + "learning_rate": 2.0874752274790665e-05, + "loss": 0.153, + "step": 33498 + }, + { + "epoch": 0.5974922412870546, + "grad_norm": 0.19125089049339294, + "learning_rate": 2.087321711194747e-05, + "loss": 0.0831, + "step": 33499 + }, + { + "epoch": 0.5975100774087683, + "grad_norm": 0.3453262150287628, + "learning_rate": 2.0871681965101114e-05, + "loss": 0.182, + "step": 33500 + }, + { + "epoch": 0.597527913530482, + "grad_norm": 0.31526249647140503, + "learning_rate": 2.087014683425753e-05, + "loss": 0.1612, + "step": 33501 + }, + { + "epoch": 0.5975457496521956, + "grad_norm": 0.21088182926177979, + "learning_rate": 2.0868611719422696e-05, + "loss": 0.0618, + "step": 33502 + }, + { + "epoch": 0.5975635857739093, + "grad_norm": 0.36538219451904297, + "learning_rate": 2.0867076620602548e-05, + "loss": 0.1237, + "step": 33503 + }, + { + "epoch": 0.597581421895623, + "grad_norm": 0.21200306713581085, + "learning_rate": 2.0865541537803032e-05, + "loss": 0.1167, + "step": 33504 + }, + { + "epoch": 0.5975992580173367, + "grad_norm": 0.17556160688400269, + "learning_rate": 2.0864006471030108e-05, + "loss": 0.1343, + "step": 33505 + }, + { + "epoch": 0.5976170941390504, + "grad_norm": 0.2599734365940094, + "learning_rate": 2.086247142028972e-05, + "loss": 0.111, + "step": 33506 + }, + { + "epoch": 0.5976349302607641, + "grad_norm": 0.21359191834926605, + "learning_rate": 2.0860936385587833e-05, + "loss": 0.1201, + "step": 33507 + }, + { + "epoch": 0.5976527663824778, + "grad_norm": 0.23952512443065643, + "learning_rate": 2.085940136693038e-05, + "loss": 0.0894, + "step": 33508 + }, + { + "epoch": 0.5976706025041915, + "grad_norm": 0.20351237058639526, + "learning_rate": 2.0857866364323324e-05, + "loss": 0.1061, + "step": 33509 + }, + { + "epoch": 0.5976884386259051, + "grad_norm": 0.2951543927192688, + "learning_rate": 2.0856331377772592e-05, + "loss": 0.1574, + "step": 33510 + }, + { + "epoch": 0.5977062747476188, + "grad_norm": 0.3727482855319977, + "learning_rate": 2.0854796407284163e-05, + "loss": 0.1197, + "step": 33511 + }, + { + "epoch": 0.5977241108693325, + "grad_norm": 0.3370814919471741, + "learning_rate": 2.085326145286397e-05, + "loss": 0.1528, + "step": 33512 + }, + { + "epoch": 0.5977419469910462, + "grad_norm": 0.2609003484249115, + "learning_rate": 2.0851726514517973e-05, + "loss": 0.1433, + "step": 33513 + }, + { + "epoch": 0.59775978311276, + "grad_norm": 0.27603965997695923, + "learning_rate": 2.0850191592252106e-05, + "loss": 0.1227, + "step": 33514 + }, + { + "epoch": 0.5977776192344737, + "grad_norm": 0.2421277016401291, + "learning_rate": 2.084865668607234e-05, + "loss": 0.1268, + "step": 33515 + }, + { + "epoch": 0.5977954553561874, + "grad_norm": 0.3370356857776642, + "learning_rate": 2.0847121795984616e-05, + "loss": 0.1436, + "step": 33516 + }, + { + "epoch": 0.5978132914779011, + "grad_norm": 0.28814664483070374, + "learning_rate": 2.084558692199488e-05, + "loss": 0.1243, + "step": 33517 + }, + { + "epoch": 0.5978311275996148, + "grad_norm": 0.2421230524778366, + "learning_rate": 2.0844052064109076e-05, + "loss": 0.051, + "step": 33518 + }, + { + "epoch": 0.5978489637213285, + "grad_norm": 0.21642711758613586, + "learning_rate": 2.0842517222333168e-05, + "loss": 0.1429, + "step": 33519 + }, + { + "epoch": 0.5978667998430421, + "grad_norm": 0.3548171818256378, + "learning_rate": 2.0840982396673098e-05, + "loss": 0.1707, + "step": 33520 + }, + { + "epoch": 0.5978846359647558, + "grad_norm": 0.22144341468811035, + "learning_rate": 2.0839447587134824e-05, + "loss": 0.0972, + "step": 33521 + }, + { + "epoch": 0.5979024720864695, + "grad_norm": 0.3229910135269165, + "learning_rate": 2.0837912793724284e-05, + "loss": 0.1417, + "step": 33522 + }, + { + "epoch": 0.5979203082081832, + "grad_norm": 0.33225852251052856, + "learning_rate": 2.083637801644742e-05, + "loss": 0.1249, + "step": 33523 + }, + { + "epoch": 0.5979381443298969, + "grad_norm": 0.28374913334846497, + "learning_rate": 2.0834843255310208e-05, + "loss": 0.1629, + "step": 33524 + }, + { + "epoch": 0.5979559804516106, + "grad_norm": 0.1922423243522644, + "learning_rate": 2.083330851031857e-05, + "loss": 0.0805, + "step": 33525 + }, + { + "epoch": 0.5979738165733243, + "grad_norm": 0.23374253511428833, + "learning_rate": 2.083177378147848e-05, + "loss": 0.1491, + "step": 33526 + }, + { + "epoch": 0.597991652695038, + "grad_norm": 0.28973060846328735, + "learning_rate": 2.083023906879586e-05, + "loss": 0.1215, + "step": 33527 + }, + { + "epoch": 0.5980094888167516, + "grad_norm": 0.23298515379428864, + "learning_rate": 2.0828704372276686e-05, + "loss": 0.1105, + "step": 33528 + }, + { + "epoch": 0.5980273249384653, + "grad_norm": 0.16594116389751434, + "learning_rate": 2.0827169691926894e-05, + "loss": 0.0932, + "step": 33529 + }, + { + "epoch": 0.598045161060179, + "grad_norm": 0.2171420007944107, + "learning_rate": 2.0825635027752433e-05, + "loss": 0.1023, + "step": 33530 + }, + { + "epoch": 0.5980629971818928, + "grad_norm": 0.2784365117549896, + "learning_rate": 2.082410037975924e-05, + "loss": 0.1553, + "step": 33531 + }, + { + "epoch": 0.5980808333036065, + "grad_norm": 0.26501089334487915, + "learning_rate": 2.082256574795329e-05, + "loss": 0.167, + "step": 33532 + }, + { + "epoch": 0.5980986694253202, + "grad_norm": 0.28261762857437134, + "learning_rate": 2.082103113234051e-05, + "loss": 0.123, + "step": 33533 + }, + { + "epoch": 0.5981165055470339, + "grad_norm": 0.28261658549308777, + "learning_rate": 2.0819496532926864e-05, + "loss": 0.1854, + "step": 33534 + }, + { + "epoch": 0.5981343416687476, + "grad_norm": 0.344864159822464, + "learning_rate": 2.0817961949718294e-05, + "loss": 0.1237, + "step": 33535 + }, + { + "epoch": 0.5981521777904613, + "grad_norm": 0.35241538286209106, + "learning_rate": 2.0816427382720734e-05, + "loss": 0.186, + "step": 33536 + }, + { + "epoch": 0.598170013912175, + "grad_norm": 0.2554865777492523, + "learning_rate": 2.081489283194016e-05, + "loss": 0.1417, + "step": 33537 + }, + { + "epoch": 0.5981878500338886, + "grad_norm": 0.25521984696388245, + "learning_rate": 2.081335829738251e-05, + "loss": 0.1297, + "step": 33538 + }, + { + "epoch": 0.5982056861556023, + "grad_norm": 0.18651452660560608, + "learning_rate": 2.0811823779053714e-05, + "loss": 0.0904, + "step": 33539 + }, + { + "epoch": 0.598223522277316, + "grad_norm": 0.2694258689880371, + "learning_rate": 2.0810289276959743e-05, + "loss": 0.1532, + "step": 33540 + }, + { + "epoch": 0.5982413583990297, + "grad_norm": 0.24189093708992004, + "learning_rate": 2.0808754791106537e-05, + "loss": 0.1289, + "step": 33541 + }, + { + "epoch": 0.5982591945207434, + "grad_norm": 0.27592676877975464, + "learning_rate": 2.0807220321500047e-05, + "loss": 0.1672, + "step": 33542 + }, + { + "epoch": 0.5982770306424571, + "grad_norm": 0.251250684261322, + "learning_rate": 2.0805685868146225e-05, + "loss": 0.1144, + "step": 33543 + }, + { + "epoch": 0.5982948667641708, + "grad_norm": 0.3173721730709076, + "learning_rate": 2.0804151431051e-05, + "loss": 0.1539, + "step": 33544 + }, + { + "epoch": 0.5983127028858845, + "grad_norm": 0.2538539469242096, + "learning_rate": 2.0802617010220343e-05, + "loss": 0.1707, + "step": 33545 + }, + { + "epoch": 0.5983305390075981, + "grad_norm": 0.4023657739162445, + "learning_rate": 2.0801082605660186e-05, + "loss": 0.132, + "step": 33546 + }, + { + "epoch": 0.5983483751293119, + "grad_norm": 0.3657127618789673, + "learning_rate": 2.0799548217376492e-05, + "loss": 0.1358, + "step": 33547 + }, + { + "epoch": 0.5983662112510256, + "grad_norm": 0.23266048729419708, + "learning_rate": 2.0798013845375197e-05, + "loss": 0.1228, + "step": 33548 + }, + { + "epoch": 0.5983840473727393, + "grad_norm": 0.25415685772895813, + "learning_rate": 2.079647948966224e-05, + "loss": 0.1237, + "step": 33549 + }, + { + "epoch": 0.598401883494453, + "grad_norm": 0.2588161826133728, + "learning_rate": 2.0794945150243593e-05, + "loss": 0.1152, + "step": 33550 + }, + { + "epoch": 0.5984197196161667, + "grad_norm": 0.24725402891635895, + "learning_rate": 2.079341082712519e-05, + "loss": 0.1146, + "step": 33551 + }, + { + "epoch": 0.5984375557378804, + "grad_norm": 0.37954816222190857, + "learning_rate": 2.0791876520312974e-05, + "loss": 0.1536, + "step": 33552 + }, + { + "epoch": 0.5984553918595941, + "grad_norm": 0.41757869720458984, + "learning_rate": 2.0790342229812898e-05, + "loss": 0.1306, + "step": 33553 + }, + { + "epoch": 0.5984732279813078, + "grad_norm": 0.2830967307090759, + "learning_rate": 2.0788807955630912e-05, + "loss": 0.1099, + "step": 33554 + }, + { + "epoch": 0.5984910641030214, + "grad_norm": 0.25304535031318665, + "learning_rate": 2.0787273697772965e-05, + "loss": 0.1444, + "step": 33555 + }, + { + "epoch": 0.5985089002247351, + "grad_norm": 0.298365980386734, + "learning_rate": 2.0785739456245e-05, + "loss": 0.1762, + "step": 33556 + }, + { + "epoch": 0.5985267363464488, + "grad_norm": 0.2548024654388428, + "learning_rate": 2.0784205231052953e-05, + "loss": 0.0839, + "step": 33557 + }, + { + "epoch": 0.5985445724681625, + "grad_norm": 0.2621406018733978, + "learning_rate": 2.0782671022202794e-05, + "loss": 0.1558, + "step": 33558 + }, + { + "epoch": 0.5985624085898762, + "grad_norm": 0.26424023509025574, + "learning_rate": 2.0781136829700456e-05, + "loss": 0.0989, + "step": 33559 + }, + { + "epoch": 0.5985802447115899, + "grad_norm": 0.27558988332748413, + "learning_rate": 2.0779602653551888e-05, + "loss": 0.1342, + "step": 33560 + }, + { + "epoch": 0.5985980808333036, + "grad_norm": 0.22411102056503296, + "learning_rate": 2.0778068493763043e-05, + "loss": 0.1433, + "step": 33561 + }, + { + "epoch": 0.5986159169550173, + "grad_norm": 0.2516861855983734, + "learning_rate": 2.077653435033985e-05, + "loss": 0.1465, + "step": 33562 + }, + { + "epoch": 0.598633753076731, + "grad_norm": 0.28461748361587524, + "learning_rate": 2.0775000223288276e-05, + "loss": 0.16, + "step": 33563 + }, + { + "epoch": 0.5986515891984447, + "grad_norm": 0.37802982330322266, + "learning_rate": 2.0773466112614264e-05, + "loss": 0.0974, + "step": 33564 + }, + { + "epoch": 0.5986694253201584, + "grad_norm": 0.3002423346042633, + "learning_rate": 2.077193201832376e-05, + "loss": 0.1923, + "step": 33565 + }, + { + "epoch": 0.5986872614418721, + "grad_norm": 0.2853030264377594, + "learning_rate": 2.0770397940422692e-05, + "loss": 0.125, + "step": 33566 + }, + { + "epoch": 0.5987050975635858, + "grad_norm": 0.2667273283004761, + "learning_rate": 2.0768863878917032e-05, + "loss": 0.1164, + "step": 33567 + }, + { + "epoch": 0.5987229336852995, + "grad_norm": 0.2134263515472412, + "learning_rate": 2.0767329833812717e-05, + "loss": 0.1015, + "step": 33568 + }, + { + "epoch": 0.5987407698070132, + "grad_norm": 0.40813490748405457, + "learning_rate": 2.0765795805115696e-05, + "loss": 0.1395, + "step": 33569 + }, + { + "epoch": 0.5987586059287269, + "grad_norm": 0.2575003206729889, + "learning_rate": 2.0764261792831906e-05, + "loss": 0.151, + "step": 33570 + }, + { + "epoch": 0.5987764420504406, + "grad_norm": 0.32282647490501404, + "learning_rate": 2.0762727796967304e-05, + "loss": 0.1564, + "step": 33571 + }, + { + "epoch": 0.5987942781721542, + "grad_norm": 0.29961487650871277, + "learning_rate": 2.0761193817527836e-05, + "loss": 0.1702, + "step": 33572 + }, + { + "epoch": 0.5988121142938679, + "grad_norm": 0.2437695413827896, + "learning_rate": 2.0759659854519443e-05, + "loss": 0.0923, + "step": 33573 + }, + { + "epoch": 0.5988299504155816, + "grad_norm": 0.26290687918663025, + "learning_rate": 2.0758125907948073e-05, + "loss": 0.1354, + "step": 33574 + }, + { + "epoch": 0.5988477865372953, + "grad_norm": 0.3356347382068634, + "learning_rate": 2.0756591977819665e-05, + "loss": 0.1146, + "step": 33575 + }, + { + "epoch": 0.598865622659009, + "grad_norm": 0.33639901876449585, + "learning_rate": 2.0755058064140183e-05, + "loss": 0.1561, + "step": 33576 + }, + { + "epoch": 0.5988834587807227, + "grad_norm": 0.21176671981811523, + "learning_rate": 2.075352416691556e-05, + "loss": 0.1189, + "step": 33577 + }, + { + "epoch": 0.5989012949024364, + "grad_norm": 0.28039446473121643, + "learning_rate": 2.0751990286151744e-05, + "loss": 0.1272, + "step": 33578 + }, + { + "epoch": 0.5989191310241501, + "grad_norm": 0.25911805033683777, + "learning_rate": 2.075045642185467e-05, + "loss": 0.1402, + "step": 33579 + }, + { + "epoch": 0.5989369671458638, + "grad_norm": 0.2090800702571869, + "learning_rate": 2.0748922574030306e-05, + "loss": 0.119, + "step": 33580 + }, + { + "epoch": 0.5989548032675776, + "grad_norm": 0.2648777961730957, + "learning_rate": 2.074738874268458e-05, + "loss": 0.0934, + "step": 33581 + }, + { + "epoch": 0.5989726393892912, + "grad_norm": 0.27204152941703796, + "learning_rate": 2.074585492782345e-05, + "loss": 0.1448, + "step": 33582 + }, + { + "epoch": 0.5989904755110049, + "grad_norm": 0.24797779321670532, + "learning_rate": 2.0744321129452846e-05, + "loss": 0.1169, + "step": 33583 + }, + { + "epoch": 0.5990083116327186, + "grad_norm": 0.2660035490989685, + "learning_rate": 2.074278734757873e-05, + "loss": 0.1262, + "step": 33584 + }, + { + "epoch": 0.5990261477544323, + "grad_norm": 0.2804333567619324, + "learning_rate": 2.0741253582207045e-05, + "loss": 0.1417, + "step": 33585 + }, + { + "epoch": 0.599043983876146, + "grad_norm": 0.3176865875720978, + "learning_rate": 2.0739719833343732e-05, + "loss": 0.1361, + "step": 33586 + }, + { + "epoch": 0.5990618199978597, + "grad_norm": 0.2855227589607239, + "learning_rate": 2.073818610099473e-05, + "loss": 0.1329, + "step": 33587 + }, + { + "epoch": 0.5990796561195734, + "grad_norm": 0.29159054160118103, + "learning_rate": 2.0736652385165993e-05, + "loss": 0.1169, + "step": 33588 + }, + { + "epoch": 0.599097492241287, + "grad_norm": 0.25642481446266174, + "learning_rate": 2.073511868586346e-05, + "loss": 0.0432, + "step": 33589 + }, + { + "epoch": 0.5991153283630007, + "grad_norm": 0.573711633682251, + "learning_rate": 2.0733585003093086e-05, + "loss": 0.2194, + "step": 33590 + }, + { + "epoch": 0.5991331644847144, + "grad_norm": 0.28695085644721985, + "learning_rate": 2.0732051336860812e-05, + "loss": 0.1025, + "step": 33591 + }, + { + "epoch": 0.5991510006064281, + "grad_norm": 0.21565885841846466, + "learning_rate": 2.073051768717257e-05, + "loss": 0.1193, + "step": 33592 + }, + { + "epoch": 0.5991688367281418, + "grad_norm": 0.23005762696266174, + "learning_rate": 2.0728984054034323e-05, + "loss": 0.1775, + "step": 33593 + }, + { + "epoch": 0.5991866728498555, + "grad_norm": 0.2618167996406555, + "learning_rate": 2.0727450437452013e-05, + "loss": 0.1688, + "step": 33594 + }, + { + "epoch": 0.5992045089715692, + "grad_norm": 0.3125520944595337, + "learning_rate": 2.0725916837431576e-05, + "loss": 0.1419, + "step": 33595 + }, + { + "epoch": 0.5992223450932829, + "grad_norm": 0.46696290373802185, + "learning_rate": 2.072438325397895e-05, + "loss": 0.1088, + "step": 33596 + }, + { + "epoch": 0.5992401812149966, + "grad_norm": 0.19843453168869019, + "learning_rate": 2.072284968710011e-05, + "loss": 0.1163, + "step": 33597 + }, + { + "epoch": 0.5992580173367104, + "grad_norm": 0.2652958035469055, + "learning_rate": 2.072131613680098e-05, + "loss": 0.1154, + "step": 33598 + }, + { + "epoch": 0.599275853458424, + "grad_norm": 0.24856826663017273, + "learning_rate": 2.0719782603087504e-05, + "loss": 0.1073, + "step": 33599 + }, + { + "epoch": 0.5992936895801377, + "grad_norm": 0.5437614321708679, + "learning_rate": 2.0718249085965623e-05, + "loss": 0.2109, + "step": 33600 + }, + { + "epoch": 0.5993115257018514, + "grad_norm": 0.2498537302017212, + "learning_rate": 2.071671558544129e-05, + "loss": 0.0947, + "step": 33601 + }, + { + "epoch": 0.5993293618235651, + "grad_norm": 0.3388812243938446, + "learning_rate": 2.0715182101520448e-05, + "loss": 0.1609, + "step": 33602 + }, + { + "epoch": 0.5993471979452788, + "grad_norm": 0.4093707501888275, + "learning_rate": 2.0713648634209044e-05, + "loss": 0.1098, + "step": 33603 + }, + { + "epoch": 0.5993650340669925, + "grad_norm": 0.33917126059532166, + "learning_rate": 2.0712115183513014e-05, + "loss": 0.1524, + "step": 33604 + }, + { + "epoch": 0.5993828701887062, + "grad_norm": 0.2906486988067627, + "learning_rate": 2.07105817494383e-05, + "loss": 0.1419, + "step": 33605 + }, + { + "epoch": 0.5994007063104199, + "grad_norm": 0.27075502276420593, + "learning_rate": 2.0709048331990863e-05, + "loss": 0.1334, + "step": 33606 + }, + { + "epoch": 0.5994185424321335, + "grad_norm": 0.24697193503379822, + "learning_rate": 2.0707514931176637e-05, + "loss": 0.131, + "step": 33607 + }, + { + "epoch": 0.5994363785538472, + "grad_norm": 0.22174084186553955, + "learning_rate": 2.070598154700156e-05, + "loss": 0.1139, + "step": 33608 + }, + { + "epoch": 0.5994542146755609, + "grad_norm": 0.24717064201831818, + "learning_rate": 2.070444817947158e-05, + "loss": 0.1576, + "step": 33609 + }, + { + "epoch": 0.5994720507972746, + "grad_norm": 0.255853533744812, + "learning_rate": 2.0702914828592647e-05, + "loss": 0.1453, + "step": 33610 + }, + { + "epoch": 0.5994898869189883, + "grad_norm": 0.2678378224372864, + "learning_rate": 2.07013814943707e-05, + "loss": 0.1831, + "step": 33611 + }, + { + "epoch": 0.599507723040702, + "grad_norm": 0.24620047211647034, + "learning_rate": 2.0699848176811686e-05, + "loss": 0.0922, + "step": 33612 + }, + { + "epoch": 0.5995255591624157, + "grad_norm": 0.3019634187221527, + "learning_rate": 2.0698314875921544e-05, + "loss": 0.1352, + "step": 33613 + }, + { + "epoch": 0.5995433952841294, + "grad_norm": 0.24899795651435852, + "learning_rate": 2.069678159170621e-05, + "loss": 0.1333, + "step": 33614 + }, + { + "epoch": 0.5995612314058432, + "grad_norm": 0.27833905816078186, + "learning_rate": 2.069524832417165e-05, + "loss": 0.1996, + "step": 33615 + }, + { + "epoch": 0.5995790675275569, + "grad_norm": 0.3196410834789276, + "learning_rate": 2.0693715073323786e-05, + "loss": 0.1179, + "step": 33616 + }, + { + "epoch": 0.5995969036492705, + "grad_norm": 0.32081323862075806, + "learning_rate": 2.0692181839168575e-05, + "loss": 0.1426, + "step": 33617 + }, + { + "epoch": 0.5996147397709842, + "grad_norm": 0.32498452067375183, + "learning_rate": 2.0690648621711944e-05, + "loss": 0.13, + "step": 33618 + }, + { + "epoch": 0.5996325758926979, + "grad_norm": 0.25471729040145874, + "learning_rate": 2.068911542095986e-05, + "loss": 0.0906, + "step": 33619 + }, + { + "epoch": 0.5996504120144116, + "grad_norm": 0.3257145583629608, + "learning_rate": 2.0687582236918252e-05, + "loss": 0.1501, + "step": 33620 + }, + { + "epoch": 0.5996682481361253, + "grad_norm": 0.34932589530944824, + "learning_rate": 2.0686049069593066e-05, + "loss": 0.1342, + "step": 33621 + }, + { + "epoch": 0.599686084257839, + "grad_norm": 0.23956450819969177, + "learning_rate": 2.0684515918990232e-05, + "loss": 0.1375, + "step": 33622 + }, + { + "epoch": 0.5997039203795527, + "grad_norm": 0.30068206787109375, + "learning_rate": 2.0682982785115712e-05, + "loss": 0.1309, + "step": 33623 + }, + { + "epoch": 0.5997217565012664, + "grad_norm": 0.21349970996379852, + "learning_rate": 2.0681449667975443e-05, + "loss": 0.1373, + "step": 33624 + }, + { + "epoch": 0.59973959262298, + "grad_norm": 0.2551549971103668, + "learning_rate": 2.067991656757537e-05, + "loss": 0.1728, + "step": 33625 + }, + { + "epoch": 0.5997574287446937, + "grad_norm": 0.274891197681427, + "learning_rate": 2.067838348392143e-05, + "loss": 0.1353, + "step": 33626 + }, + { + "epoch": 0.5997752648664074, + "grad_norm": 0.2301880568265915, + "learning_rate": 2.067685041701956e-05, + "loss": 0.139, + "step": 33627 + }, + { + "epoch": 0.5997931009881211, + "grad_norm": 0.2576291859149933, + "learning_rate": 2.0675317366875724e-05, + "loss": 0.1411, + "step": 33628 + }, + { + "epoch": 0.5998109371098348, + "grad_norm": 0.2454328089952469, + "learning_rate": 2.067378433349584e-05, + "loss": 0.0862, + "step": 33629 + }, + { + "epoch": 0.5998287732315485, + "grad_norm": 0.2625868618488312, + "learning_rate": 2.0672251316885872e-05, + "loss": 0.0978, + "step": 33630 + }, + { + "epoch": 0.5998466093532622, + "grad_norm": 0.23759739100933075, + "learning_rate": 2.067071831705174e-05, + "loss": 0.1208, + "step": 33631 + }, + { + "epoch": 0.599864445474976, + "grad_norm": 0.3765805959701538, + "learning_rate": 2.066918533399941e-05, + "loss": 0.1195, + "step": 33632 + }, + { + "epoch": 0.5998822815966897, + "grad_norm": 0.22054898738861084, + "learning_rate": 2.0667652367734815e-05, + "loss": 0.0887, + "step": 33633 + }, + { + "epoch": 0.5999001177184033, + "grad_norm": 0.2293272465467453, + "learning_rate": 2.0666119418263897e-05, + "loss": 0.1121, + "step": 33634 + }, + { + "epoch": 0.599917953840117, + "grad_norm": 0.30799606442451477, + "learning_rate": 2.066458648559258e-05, + "loss": 0.1532, + "step": 33635 + }, + { + "epoch": 0.5999357899618307, + "grad_norm": 0.3114411532878876, + "learning_rate": 2.066305356972684e-05, + "loss": 0.1422, + "step": 33636 + }, + { + "epoch": 0.5999536260835444, + "grad_norm": 0.259997695684433, + "learning_rate": 2.0661520670672597e-05, + "loss": 0.1584, + "step": 33637 + }, + { + "epoch": 0.5999714622052581, + "grad_norm": 0.3229637145996094, + "learning_rate": 2.06599877884358e-05, + "loss": 0.1241, + "step": 33638 + }, + { + "epoch": 0.5999892983269718, + "grad_norm": 0.28554773330688477, + "learning_rate": 2.065845492302239e-05, + "loss": 0.15, + "step": 33639 + }, + { + "epoch": 0.6000071344486855, + "grad_norm": 0.31168311834335327, + "learning_rate": 2.0656922074438298e-05, + "loss": 0.1242, + "step": 33640 + }, + { + "epoch": 0.6000249705703992, + "grad_norm": 0.22958698868751526, + "learning_rate": 2.0655389242689487e-05, + "loss": 0.1216, + "step": 33641 + }, + { + "epoch": 0.6000428066921129, + "grad_norm": 0.2597629725933075, + "learning_rate": 2.0653856427781886e-05, + "loss": 0.1105, + "step": 33642 + }, + { + "epoch": 0.6000606428138265, + "grad_norm": 0.22892449796199799, + "learning_rate": 2.065232362972144e-05, + "loss": 0.1098, + "step": 33643 + }, + { + "epoch": 0.6000784789355402, + "grad_norm": 0.30714118480682373, + "learning_rate": 2.065079084851408e-05, + "loss": 0.1643, + "step": 33644 + }, + { + "epoch": 0.6000963150572539, + "grad_norm": 0.3327876329421997, + "learning_rate": 2.064925808416576e-05, + "loss": 0.1282, + "step": 33645 + }, + { + "epoch": 0.6001141511789676, + "grad_norm": 0.2873730957508087, + "learning_rate": 2.0647725336682427e-05, + "loss": 0.0702, + "step": 33646 + }, + { + "epoch": 0.6001319873006813, + "grad_norm": 0.3141648769378662, + "learning_rate": 2.0646192606070014e-05, + "loss": 0.1433, + "step": 33647 + }, + { + "epoch": 0.600149823422395, + "grad_norm": 0.333107590675354, + "learning_rate": 2.064465989233445e-05, + "loss": 0.1521, + "step": 33648 + }, + { + "epoch": 0.6001676595441088, + "grad_norm": 0.24174322187900543, + "learning_rate": 2.06431271954817e-05, + "loss": 0.1026, + "step": 33649 + }, + { + "epoch": 0.6001854956658225, + "grad_norm": 0.26976731419563293, + "learning_rate": 2.0641594515517685e-05, + "loss": 0.1204, + "step": 33650 + }, + { + "epoch": 0.6002033317875362, + "grad_norm": 0.2513549029827118, + "learning_rate": 2.064006185244836e-05, + "loss": 0.1413, + "step": 33651 + }, + { + "epoch": 0.6002211679092498, + "grad_norm": 0.238325297832489, + "learning_rate": 2.0638529206279667e-05, + "loss": 0.1142, + "step": 33652 + }, + { + "epoch": 0.6002390040309635, + "grad_norm": 0.22856846451759338, + "learning_rate": 2.0636996577017524e-05, + "loss": 0.12, + "step": 33653 + }, + { + "epoch": 0.6002568401526772, + "grad_norm": 0.3029005527496338, + "learning_rate": 2.0635463964667905e-05, + "loss": 0.1454, + "step": 33654 + }, + { + "epoch": 0.6002746762743909, + "grad_norm": 0.2216426283121109, + "learning_rate": 2.0633931369236733e-05, + "loss": 0.1172, + "step": 33655 + }, + { + "epoch": 0.6002925123961046, + "grad_norm": 0.2906281650066376, + "learning_rate": 2.0632398790729946e-05, + "loss": 0.0831, + "step": 33656 + }, + { + "epoch": 0.6003103485178183, + "grad_norm": 0.24564191699028015, + "learning_rate": 2.0630866229153488e-05, + "loss": 0.0933, + "step": 33657 + }, + { + "epoch": 0.600328184639532, + "grad_norm": 0.27084264159202576, + "learning_rate": 2.0629333684513304e-05, + "loss": 0.1753, + "step": 33658 + }, + { + "epoch": 0.6003460207612457, + "grad_norm": 0.35575029253959656, + "learning_rate": 2.0627801156815338e-05, + "loss": 0.1634, + "step": 33659 + }, + { + "epoch": 0.6003638568829593, + "grad_norm": 0.2876966893672943, + "learning_rate": 2.0626268646065522e-05, + "loss": 0.1347, + "step": 33660 + }, + { + "epoch": 0.600381693004673, + "grad_norm": 0.297993928194046, + "learning_rate": 2.0624736152269793e-05, + "loss": 0.1622, + "step": 33661 + }, + { + "epoch": 0.6003995291263867, + "grad_norm": 0.34068602323532104, + "learning_rate": 2.0623203675434105e-05, + "loss": 0.1878, + "step": 33662 + }, + { + "epoch": 0.6004173652481004, + "grad_norm": 0.30879369378089905, + "learning_rate": 2.0621671215564393e-05, + "loss": 0.1434, + "step": 33663 + }, + { + "epoch": 0.6004352013698141, + "grad_norm": 0.23983246088027954, + "learning_rate": 2.0620138772666585e-05, + "loss": 0.1205, + "step": 33664 + }, + { + "epoch": 0.6004530374915279, + "grad_norm": 0.33153408765792847, + "learning_rate": 2.0618606346746638e-05, + "loss": 0.105, + "step": 33665 + }, + { + "epoch": 0.6004708736132416, + "grad_norm": 0.30795496702194214, + "learning_rate": 2.0617073937810484e-05, + "loss": 0.0838, + "step": 33666 + }, + { + "epoch": 0.6004887097349553, + "grad_norm": 0.2563682198524475, + "learning_rate": 2.061554154586407e-05, + "loss": 0.1129, + "step": 33667 + }, + { + "epoch": 0.600506545856669, + "grad_norm": 0.26605424284935, + "learning_rate": 2.0614009170913333e-05, + "loss": 0.1146, + "step": 33668 + }, + { + "epoch": 0.6005243819783826, + "grad_norm": 0.28893157839775085, + "learning_rate": 2.061247681296421e-05, + "loss": 0.1232, + "step": 33669 + }, + { + "epoch": 0.6005422181000963, + "grad_norm": 0.247714102268219, + "learning_rate": 2.0610944472022632e-05, + "loss": 0.141, + "step": 33670 + }, + { + "epoch": 0.60056005422181, + "grad_norm": 0.2717185914516449, + "learning_rate": 2.060941214809456e-05, + "loss": 0.1269, + "step": 33671 + }, + { + "epoch": 0.6005778903435237, + "grad_norm": 0.26414069533348083, + "learning_rate": 2.060787984118592e-05, + "loss": 0.1483, + "step": 33672 + }, + { + "epoch": 0.6005957264652374, + "grad_norm": 0.253976970911026, + "learning_rate": 2.0606347551302654e-05, + "loss": 0.1227, + "step": 33673 + }, + { + "epoch": 0.6006135625869511, + "grad_norm": 0.2299121469259262, + "learning_rate": 2.0604815278450695e-05, + "loss": 0.1344, + "step": 33674 + }, + { + "epoch": 0.6006313987086648, + "grad_norm": 0.3089067041873932, + "learning_rate": 2.0603283022636003e-05, + "loss": 0.1037, + "step": 33675 + }, + { + "epoch": 0.6006492348303785, + "grad_norm": 0.22295323014259338, + "learning_rate": 2.0601750783864503e-05, + "loss": 0.1066, + "step": 33676 + }, + { + "epoch": 0.6006670709520922, + "grad_norm": 0.28311818838119507, + "learning_rate": 2.060021856214213e-05, + "loss": 0.0847, + "step": 33677 + }, + { + "epoch": 0.6006849070738058, + "grad_norm": 0.2758885622024536, + "learning_rate": 2.0598686357474827e-05, + "loss": 0.1876, + "step": 33678 + }, + { + "epoch": 0.6007027431955195, + "grad_norm": 0.28104543685913086, + "learning_rate": 2.059715416986854e-05, + "loss": 0.1038, + "step": 33679 + }, + { + "epoch": 0.6007205793172332, + "grad_norm": 0.24740703403949738, + "learning_rate": 2.059562199932921e-05, + "loss": 0.1203, + "step": 33680 + }, + { + "epoch": 0.6007384154389469, + "grad_norm": 0.31523147225379944, + "learning_rate": 2.0594089845862768e-05, + "loss": 0.1008, + "step": 33681 + }, + { + "epoch": 0.6007562515606607, + "grad_norm": 0.29176992177963257, + "learning_rate": 2.0592557709475155e-05, + "loss": 0.1693, + "step": 33682 + }, + { + "epoch": 0.6007740876823744, + "grad_norm": 0.23750586807727814, + "learning_rate": 2.05910255901723e-05, + "loss": 0.148, + "step": 33683 + }, + { + "epoch": 0.6007919238040881, + "grad_norm": 0.23496584594249725, + "learning_rate": 2.0589493487960164e-05, + "loss": 0.1505, + "step": 33684 + }, + { + "epoch": 0.6008097599258018, + "grad_norm": 0.22927476465702057, + "learning_rate": 2.0587961402844672e-05, + "loss": 0.0851, + "step": 33685 + }, + { + "epoch": 0.6008275960475155, + "grad_norm": 0.23963990807533264, + "learning_rate": 2.0586429334831768e-05, + "loss": 0.1341, + "step": 33686 + }, + { + "epoch": 0.6008454321692291, + "grad_norm": 0.29195401072502136, + "learning_rate": 2.0584897283927377e-05, + "loss": 0.1863, + "step": 33687 + }, + { + "epoch": 0.6008632682909428, + "grad_norm": 0.2602557837963104, + "learning_rate": 2.058336525013746e-05, + "loss": 0.1306, + "step": 33688 + }, + { + "epoch": 0.6008811044126565, + "grad_norm": 0.27246448397636414, + "learning_rate": 2.0581833233467946e-05, + "loss": 0.1509, + "step": 33689 + }, + { + "epoch": 0.6008989405343702, + "grad_norm": 0.2719744145870209, + "learning_rate": 2.058030123392477e-05, + "loss": 0.1268, + "step": 33690 + }, + { + "epoch": 0.6009167766560839, + "grad_norm": 0.3758862614631653, + "learning_rate": 2.0578769251513866e-05, + "loss": 0.1405, + "step": 33691 + }, + { + "epoch": 0.6009346127777976, + "grad_norm": 0.27474191784858704, + "learning_rate": 2.057723728624119e-05, + "loss": 0.158, + "step": 33692 + }, + { + "epoch": 0.6009524488995113, + "grad_norm": 0.325585275888443, + "learning_rate": 2.057570533811266e-05, + "loss": 0.1564, + "step": 33693 + }, + { + "epoch": 0.600970285021225, + "grad_norm": 0.3095877766609192, + "learning_rate": 2.0574173407134233e-05, + "loss": 0.1687, + "step": 33694 + }, + { + "epoch": 0.6009881211429386, + "grad_norm": 0.2678094506263733, + "learning_rate": 2.0572641493311835e-05, + "loss": 0.112, + "step": 33695 + }, + { + "epoch": 0.6010059572646523, + "grad_norm": 0.3076682984828949, + "learning_rate": 2.05711095966514e-05, + "loss": 0.1155, + "step": 33696 + }, + { + "epoch": 0.601023793386366, + "grad_norm": 0.2864644527435303, + "learning_rate": 2.0569577717158887e-05, + "loss": 0.1337, + "step": 33697 + }, + { + "epoch": 0.6010416295080797, + "grad_norm": 0.34131529927253723, + "learning_rate": 2.056804585484022e-05, + "loss": 0.1445, + "step": 33698 + }, + { + "epoch": 0.6010594656297935, + "grad_norm": 0.22036266326904297, + "learning_rate": 2.0566514009701328e-05, + "loss": 0.0817, + "step": 33699 + }, + { + "epoch": 0.6010773017515072, + "grad_norm": 0.3300800919532776, + "learning_rate": 2.0564982181748155e-05, + "loss": 0.1148, + "step": 33700 + }, + { + "epoch": 0.6010951378732209, + "grad_norm": 0.22799605131149292, + "learning_rate": 2.0563450370986655e-05, + "loss": 0.0775, + "step": 33701 + }, + { + "epoch": 0.6011129739949346, + "grad_norm": 0.2662222385406494, + "learning_rate": 2.0561918577422756e-05, + "loss": 0.1146, + "step": 33702 + }, + { + "epoch": 0.6011308101166483, + "grad_norm": 0.22732426226139069, + "learning_rate": 2.056038680106239e-05, + "loss": 0.1046, + "step": 33703 + }, + { + "epoch": 0.601148646238362, + "grad_norm": 0.27283963561058044, + "learning_rate": 2.055885504191149e-05, + "loss": 0.1473, + "step": 33704 + }, + { + "epoch": 0.6011664823600756, + "grad_norm": 0.21563753485679626, + "learning_rate": 2.055732329997601e-05, + "loss": 0.1294, + "step": 33705 + }, + { + "epoch": 0.6011843184817893, + "grad_norm": 0.2540137469768524, + "learning_rate": 2.0555791575261877e-05, + "loss": 0.1579, + "step": 33706 + }, + { + "epoch": 0.601202154603503, + "grad_norm": 0.2659685015678406, + "learning_rate": 2.0554259867775034e-05, + "loss": 0.1387, + "step": 33707 + }, + { + "epoch": 0.6012199907252167, + "grad_norm": 0.27802035212516785, + "learning_rate": 2.0552728177521413e-05, + "loss": 0.1672, + "step": 33708 + }, + { + "epoch": 0.6012378268469304, + "grad_norm": 0.29357317090034485, + "learning_rate": 2.0551196504506948e-05, + "loss": 0.1612, + "step": 33709 + }, + { + "epoch": 0.6012556629686441, + "grad_norm": 0.3730638921260834, + "learning_rate": 2.0549664848737586e-05, + "loss": 0.0681, + "step": 33710 + }, + { + "epoch": 0.6012734990903578, + "grad_norm": 0.21985360980033875, + "learning_rate": 2.0548133210219267e-05, + "loss": 0.1164, + "step": 33711 + }, + { + "epoch": 0.6012913352120715, + "grad_norm": 0.22377026081085205, + "learning_rate": 2.0546601588957916e-05, + "loss": 0.1112, + "step": 33712 + }, + { + "epoch": 0.6013091713337851, + "grad_norm": 0.2902832329273224, + "learning_rate": 2.054506998495947e-05, + "loss": 0.1001, + "step": 33713 + }, + { + "epoch": 0.6013270074554988, + "grad_norm": 0.231264129281044, + "learning_rate": 2.0543538398229875e-05, + "loss": 0.1216, + "step": 33714 + }, + { + "epoch": 0.6013448435772125, + "grad_norm": 0.27825841307640076, + "learning_rate": 2.0542006828775068e-05, + "loss": 0.1693, + "step": 33715 + }, + { + "epoch": 0.6013626796989263, + "grad_norm": 0.3316711187362671, + "learning_rate": 2.0540475276600983e-05, + "loss": 0.1469, + "step": 33716 + }, + { + "epoch": 0.60138051582064, + "grad_norm": 0.2353348284959793, + "learning_rate": 2.0538943741713547e-05, + "loss": 0.1411, + "step": 33717 + }, + { + "epoch": 0.6013983519423537, + "grad_norm": 0.29443326592445374, + "learning_rate": 2.053741222411871e-05, + "loss": 0.1717, + "step": 33718 + }, + { + "epoch": 0.6014161880640674, + "grad_norm": 0.22235290706157684, + "learning_rate": 2.0535880723822416e-05, + "loss": 0.1132, + "step": 33719 + }, + { + "epoch": 0.6014340241857811, + "grad_norm": 0.23972205817699432, + "learning_rate": 2.0534349240830574e-05, + "loss": 0.1204, + "step": 33720 + }, + { + "epoch": 0.6014518603074948, + "grad_norm": 0.2309301346540451, + "learning_rate": 2.0532817775149148e-05, + "loss": 0.1521, + "step": 33721 + }, + { + "epoch": 0.6014696964292084, + "grad_norm": 0.2577821910381317, + "learning_rate": 2.0531286326784054e-05, + "loss": 0.1363, + "step": 33722 + }, + { + "epoch": 0.6014875325509221, + "grad_norm": 0.3417529761791229, + "learning_rate": 2.0529754895741244e-05, + "loss": 0.2097, + "step": 33723 + }, + { + "epoch": 0.6015053686726358, + "grad_norm": 0.22946985065937042, + "learning_rate": 2.0528223482026647e-05, + "loss": 0.1248, + "step": 33724 + }, + { + "epoch": 0.6015232047943495, + "grad_norm": 0.29907146096229553, + "learning_rate": 2.0526692085646204e-05, + "loss": 0.1472, + "step": 33725 + }, + { + "epoch": 0.6015410409160632, + "grad_norm": 0.2472325712442398, + "learning_rate": 2.0525160706605834e-05, + "loss": 0.1375, + "step": 33726 + }, + { + "epoch": 0.6015588770377769, + "grad_norm": 0.21015435457229614, + "learning_rate": 2.0523629344911495e-05, + "loss": 0.1436, + "step": 33727 + }, + { + "epoch": 0.6015767131594906, + "grad_norm": 0.21222342550754547, + "learning_rate": 2.0522098000569117e-05, + "loss": 0.1267, + "step": 33728 + }, + { + "epoch": 0.6015945492812043, + "grad_norm": 0.3917984366416931, + "learning_rate": 2.0520566673584637e-05, + "loss": 0.1433, + "step": 33729 + }, + { + "epoch": 0.601612385402918, + "grad_norm": 0.3214591443538666, + "learning_rate": 2.0519035363963973e-05, + "loss": 0.1412, + "step": 33730 + }, + { + "epoch": 0.6016302215246316, + "grad_norm": 0.3727131187915802, + "learning_rate": 2.0517504071713088e-05, + "loss": 0.1439, + "step": 33731 + }, + { + "epoch": 0.6016480576463453, + "grad_norm": 0.25211301445961, + "learning_rate": 2.0515972796837905e-05, + "loss": 0.1034, + "step": 33732 + }, + { + "epoch": 0.6016658937680591, + "grad_norm": 0.28403568267822266, + "learning_rate": 2.051444153934436e-05, + "loss": 0.1043, + "step": 33733 + }, + { + "epoch": 0.6016837298897728, + "grad_norm": 0.2361334264278412, + "learning_rate": 2.051291029923839e-05, + "loss": 0.1797, + "step": 33734 + }, + { + "epoch": 0.6017015660114865, + "grad_norm": 0.2434931844472885, + "learning_rate": 2.0511379076525917e-05, + "loss": 0.1476, + "step": 33735 + }, + { + "epoch": 0.6017194021332002, + "grad_norm": 0.2450096309185028, + "learning_rate": 2.0509847871212902e-05, + "loss": 0.1014, + "step": 33736 + }, + { + "epoch": 0.6017372382549139, + "grad_norm": 0.26669010519981384, + "learning_rate": 2.050831668330527e-05, + "loss": 0.1177, + "step": 33737 + }, + { + "epoch": 0.6017550743766276, + "grad_norm": 0.228338822722435, + "learning_rate": 2.050678551280895e-05, + "loss": 0.1443, + "step": 33738 + }, + { + "epoch": 0.6017729104983413, + "grad_norm": 0.29480478167533875, + "learning_rate": 2.050525435972987e-05, + "loss": 0.2194, + "step": 33739 + }, + { + "epoch": 0.6017907466200549, + "grad_norm": 0.30891695618629456, + "learning_rate": 2.050372322407399e-05, + "loss": 0.1428, + "step": 33740 + }, + { + "epoch": 0.6018085827417686, + "grad_norm": 0.24988296627998352, + "learning_rate": 2.0502192105847222e-05, + "loss": 0.1006, + "step": 33741 + }, + { + "epoch": 0.6018264188634823, + "grad_norm": 0.569564938545227, + "learning_rate": 2.050066100505552e-05, + "loss": 0.121, + "step": 33742 + }, + { + "epoch": 0.601844254985196, + "grad_norm": 0.25561919808387756, + "learning_rate": 2.04991299217048e-05, + "loss": 0.1149, + "step": 33743 + }, + { + "epoch": 0.6018620911069097, + "grad_norm": 0.2002534568309784, + "learning_rate": 2.049759885580102e-05, + "loss": 0.0952, + "step": 33744 + }, + { + "epoch": 0.6018799272286234, + "grad_norm": 0.19361424446105957, + "learning_rate": 2.0496067807350096e-05, + "loss": 0.1052, + "step": 33745 + }, + { + "epoch": 0.6018977633503371, + "grad_norm": 0.1840454638004303, + "learning_rate": 2.049453677635797e-05, + "loss": 0.0782, + "step": 33746 + }, + { + "epoch": 0.6019155994720508, + "grad_norm": 0.3458082675933838, + "learning_rate": 2.0493005762830573e-05, + "loss": 0.1884, + "step": 33747 + }, + { + "epoch": 0.6019334355937644, + "grad_norm": 0.18853051960468292, + "learning_rate": 2.049147476677384e-05, + "loss": 0.1029, + "step": 33748 + }, + { + "epoch": 0.6019512717154781, + "grad_norm": 0.2220182716846466, + "learning_rate": 2.0489943788193708e-05, + "loss": 0.1139, + "step": 33749 + }, + { + "epoch": 0.6019691078371919, + "grad_norm": 0.2086227536201477, + "learning_rate": 2.0488412827096117e-05, + "loss": 0.0996, + "step": 33750 + }, + { + "epoch": 0.6019869439589056, + "grad_norm": 0.2594011127948761, + "learning_rate": 2.0486881883486996e-05, + "loss": 0.1199, + "step": 33751 + }, + { + "epoch": 0.6020047800806193, + "grad_norm": 0.2105686515569687, + "learning_rate": 2.0485350957372272e-05, + "loss": 0.0998, + "step": 33752 + }, + { + "epoch": 0.602022616202333, + "grad_norm": 0.24959707260131836, + "learning_rate": 2.0483820048757893e-05, + "loss": 0.1221, + "step": 33753 + }, + { + "epoch": 0.6020404523240467, + "grad_norm": 0.28625673055648804, + "learning_rate": 2.0482289157649788e-05, + "loss": 0.0892, + "step": 33754 + }, + { + "epoch": 0.6020582884457604, + "grad_norm": 0.2425151914358139, + "learning_rate": 2.048075828405389e-05, + "loss": 0.0821, + "step": 33755 + }, + { + "epoch": 0.6020761245674741, + "grad_norm": 0.22902196645736694, + "learning_rate": 2.0479227427976125e-05, + "loss": 0.1162, + "step": 33756 + }, + { + "epoch": 0.6020939606891877, + "grad_norm": 0.2373112291097641, + "learning_rate": 2.0477696589422447e-05, + "loss": 0.1133, + "step": 33757 + }, + { + "epoch": 0.6021117968109014, + "grad_norm": 0.27441054582595825, + "learning_rate": 2.047616576839878e-05, + "loss": 0.0879, + "step": 33758 + }, + { + "epoch": 0.6021296329326151, + "grad_norm": 0.2369239181280136, + "learning_rate": 2.0474634964911055e-05, + "loss": 0.1124, + "step": 33759 + }, + { + "epoch": 0.6021474690543288, + "grad_norm": 0.2260851263999939, + "learning_rate": 2.0473104178965204e-05, + "loss": 0.1235, + "step": 33760 + }, + { + "epoch": 0.6021653051760425, + "grad_norm": 0.22847603261470795, + "learning_rate": 2.0471573410567164e-05, + "loss": 0.1193, + "step": 33761 + }, + { + "epoch": 0.6021831412977562, + "grad_norm": 0.2566198408603668, + "learning_rate": 2.0470042659722867e-05, + "loss": 0.1115, + "step": 33762 + }, + { + "epoch": 0.6022009774194699, + "grad_norm": 0.2882314622402191, + "learning_rate": 2.046851192643826e-05, + "loss": 0.1417, + "step": 33763 + }, + { + "epoch": 0.6022188135411836, + "grad_norm": 0.20216074585914612, + "learning_rate": 2.0466981210719263e-05, + "loss": 0.1085, + "step": 33764 + }, + { + "epoch": 0.6022366496628972, + "grad_norm": 0.2686808705329895, + "learning_rate": 2.0465450512571802e-05, + "loss": 0.1518, + "step": 33765 + }, + { + "epoch": 0.602254485784611, + "grad_norm": 0.24801988899707794, + "learning_rate": 2.0463919832001834e-05, + "loss": 0.1188, + "step": 33766 + }, + { + "epoch": 0.6022723219063247, + "grad_norm": 0.2675819396972656, + "learning_rate": 2.0462389169015273e-05, + "loss": 0.1464, + "step": 33767 + }, + { + "epoch": 0.6022901580280384, + "grad_norm": 0.382968932390213, + "learning_rate": 2.046085852361806e-05, + "loss": 0.095, + "step": 33768 + }, + { + "epoch": 0.6023079941497521, + "grad_norm": 0.7312154769897461, + "learning_rate": 2.0459327895816122e-05, + "loss": 0.1518, + "step": 33769 + }, + { + "epoch": 0.6023258302714658, + "grad_norm": 0.2627183496952057, + "learning_rate": 2.04577972856154e-05, + "loss": 0.1269, + "step": 33770 + }, + { + "epoch": 0.6023436663931795, + "grad_norm": 0.28598958253860474, + "learning_rate": 2.0456266693021832e-05, + "loss": 0.1335, + "step": 33771 + }, + { + "epoch": 0.6023615025148932, + "grad_norm": 0.2967975437641144, + "learning_rate": 2.045473611804134e-05, + "loss": 0.1099, + "step": 33772 + }, + { + "epoch": 0.6023793386366069, + "grad_norm": 0.3294130861759186, + "learning_rate": 2.0453205560679862e-05, + "loss": 0.1175, + "step": 33773 + }, + { + "epoch": 0.6023971747583206, + "grad_norm": 0.22721338272094727, + "learning_rate": 2.045167502094332e-05, + "loss": 0.0825, + "step": 33774 + }, + { + "epoch": 0.6024150108800342, + "grad_norm": 0.32131215929985046, + "learning_rate": 2.045014449883767e-05, + "loss": 0.141, + "step": 33775 + }, + { + "epoch": 0.6024328470017479, + "grad_norm": 0.5400431752204895, + "learning_rate": 2.044861399436882e-05, + "loss": 0.1895, + "step": 33776 + }, + { + "epoch": 0.6024506831234616, + "grad_norm": 0.25084590911865234, + "learning_rate": 2.044708350754272e-05, + "loss": 0.0854, + "step": 33777 + }, + { + "epoch": 0.6024685192451753, + "grad_norm": 0.1929490864276886, + "learning_rate": 2.0445553038365288e-05, + "loss": 0.1409, + "step": 33778 + }, + { + "epoch": 0.602486355366889, + "grad_norm": 0.24766503274440765, + "learning_rate": 2.0444022586842477e-05, + "loss": 0.1122, + "step": 33779 + }, + { + "epoch": 0.6025041914886027, + "grad_norm": 0.2801390588283539, + "learning_rate": 2.044249215298021e-05, + "loss": 0.1135, + "step": 33780 + }, + { + "epoch": 0.6025220276103164, + "grad_norm": 0.29283440113067627, + "learning_rate": 2.044096173678441e-05, + "loss": 0.1722, + "step": 33781 + }, + { + "epoch": 0.6025398637320301, + "grad_norm": 0.24493587017059326, + "learning_rate": 2.0439431338261013e-05, + "loss": 0.0924, + "step": 33782 + }, + { + "epoch": 0.6025576998537439, + "grad_norm": 0.21509939432144165, + "learning_rate": 2.0437900957415963e-05, + "loss": 0.1262, + "step": 33783 + }, + { + "epoch": 0.6025755359754575, + "grad_norm": 0.2098754346370697, + "learning_rate": 2.0436370594255183e-05, + "loss": 0.0651, + "step": 33784 + }, + { + "epoch": 0.6025933720971712, + "grad_norm": 0.3117353022098541, + "learning_rate": 2.043484024878461e-05, + "loss": 0.1048, + "step": 33785 + }, + { + "epoch": 0.6026112082188849, + "grad_norm": 0.2930700480937958, + "learning_rate": 2.0433309921010173e-05, + "loss": 0.1625, + "step": 33786 + }, + { + "epoch": 0.6026290443405986, + "grad_norm": 0.2191721796989441, + "learning_rate": 2.0431779610937793e-05, + "loss": 0.1037, + "step": 33787 + }, + { + "epoch": 0.6026468804623123, + "grad_norm": 0.3632800877094269, + "learning_rate": 2.0430249318573425e-05, + "loss": 0.1163, + "step": 33788 + }, + { + "epoch": 0.602664716584026, + "grad_norm": 0.3090948462486267, + "learning_rate": 2.042871904392298e-05, + "loss": 0.1476, + "step": 33789 + }, + { + "epoch": 0.6026825527057397, + "grad_norm": 0.29852980375289917, + "learning_rate": 2.0427188786992406e-05, + "loss": 0.1406, + "step": 33790 + }, + { + "epoch": 0.6027003888274534, + "grad_norm": 0.1763669103384018, + "learning_rate": 2.042565854778762e-05, + "loss": 0.0965, + "step": 33791 + }, + { + "epoch": 0.602718224949167, + "grad_norm": 0.30537042021751404, + "learning_rate": 2.0424128326314567e-05, + "loss": 0.1376, + "step": 33792 + }, + { + "epoch": 0.6027360610708807, + "grad_norm": 0.2414328008890152, + "learning_rate": 2.0422598122579177e-05, + "loss": 0.135, + "step": 33793 + }, + { + "epoch": 0.6027538971925944, + "grad_norm": 0.19762182235717773, + "learning_rate": 2.0421067936587376e-05, + "loss": 0.0854, + "step": 33794 + }, + { + "epoch": 0.6027717333143081, + "grad_norm": 0.24279852211475372, + "learning_rate": 2.041953776834509e-05, + "loss": 0.1528, + "step": 33795 + }, + { + "epoch": 0.6027895694360218, + "grad_norm": 0.20025098323822021, + "learning_rate": 2.0418007617858262e-05, + "loss": 0.0959, + "step": 33796 + }, + { + "epoch": 0.6028074055577355, + "grad_norm": 0.22615863382816315, + "learning_rate": 2.041647748513282e-05, + "loss": 0.1196, + "step": 33797 + }, + { + "epoch": 0.6028252416794492, + "grad_norm": 0.236972376704216, + "learning_rate": 2.041494737017469e-05, + "loss": 0.1132, + "step": 33798 + }, + { + "epoch": 0.6028430778011629, + "grad_norm": 0.2631887197494507, + "learning_rate": 2.0413417272989818e-05, + "loss": 0.1182, + "step": 33799 + }, + { + "epoch": 0.6028609139228767, + "grad_norm": 0.2501896619796753, + "learning_rate": 2.041188719358411e-05, + "loss": 0.1206, + "step": 33800 + }, + { + "epoch": 0.6028787500445904, + "grad_norm": 0.23473411798477173, + "learning_rate": 2.0410357131963524e-05, + "loss": 0.1509, + "step": 33801 + }, + { + "epoch": 0.602896586166304, + "grad_norm": 0.3168741464614868, + "learning_rate": 2.0408827088133975e-05, + "loss": 0.1184, + "step": 33802 + }, + { + "epoch": 0.6029144222880177, + "grad_norm": 0.25983482599258423, + "learning_rate": 2.0407297062101395e-05, + "loss": 0.1145, + "step": 33803 + }, + { + "epoch": 0.6029322584097314, + "grad_norm": 0.2601884603500366, + "learning_rate": 2.0405767053871712e-05, + "loss": 0.1017, + "step": 33804 + }, + { + "epoch": 0.6029500945314451, + "grad_norm": 0.3014456629753113, + "learning_rate": 2.0404237063450877e-05, + "loss": 0.1378, + "step": 33805 + }, + { + "epoch": 0.6029679306531588, + "grad_norm": 0.2662687301635742, + "learning_rate": 2.04027070908448e-05, + "loss": 0.1088, + "step": 33806 + }, + { + "epoch": 0.6029857667748725, + "grad_norm": 0.33728307485580444, + "learning_rate": 2.0401177136059425e-05, + "loss": 0.1549, + "step": 33807 + }, + { + "epoch": 0.6030036028965862, + "grad_norm": 0.2911766469478607, + "learning_rate": 2.0399647199100663e-05, + "loss": 0.1311, + "step": 33808 + }, + { + "epoch": 0.6030214390182999, + "grad_norm": 0.2578064203262329, + "learning_rate": 2.0398117279974466e-05, + "loss": 0.1247, + "step": 33809 + }, + { + "epoch": 0.6030392751400135, + "grad_norm": 0.2399481236934662, + "learning_rate": 2.0396587378686752e-05, + "loss": 0.0991, + "step": 33810 + }, + { + "epoch": 0.6030571112617272, + "grad_norm": 0.22368918359279633, + "learning_rate": 2.039505749524346e-05, + "loss": 0.0966, + "step": 33811 + }, + { + "epoch": 0.6030749473834409, + "grad_norm": 0.23402705788612366, + "learning_rate": 2.0393527629650518e-05, + "loss": 0.0917, + "step": 33812 + }, + { + "epoch": 0.6030927835051546, + "grad_norm": 0.2707475423812866, + "learning_rate": 2.039199778191384e-05, + "loss": 0.0804, + "step": 33813 + }, + { + "epoch": 0.6031106196268683, + "grad_norm": 0.24490302801132202, + "learning_rate": 2.039046795203938e-05, + "loss": 0.1271, + "step": 33814 + }, + { + "epoch": 0.603128455748582, + "grad_norm": 0.34024158120155334, + "learning_rate": 2.0388938140033064e-05, + "loss": 0.0903, + "step": 33815 + }, + { + "epoch": 0.6031462918702957, + "grad_norm": 0.3218767046928406, + "learning_rate": 2.0387408345900804e-05, + "loss": 0.1325, + "step": 33816 + }, + { + "epoch": 0.6031641279920095, + "grad_norm": 0.2643524706363678, + "learning_rate": 2.0385878569648546e-05, + "loss": 0.1221, + "step": 33817 + }, + { + "epoch": 0.6031819641137232, + "grad_norm": 0.2380962073802948, + "learning_rate": 2.0384348811282216e-05, + "loss": 0.156, + "step": 33818 + }, + { + "epoch": 0.6031998002354368, + "grad_norm": 0.2571446895599365, + "learning_rate": 2.0382819070807752e-05, + "loss": 0.1111, + "step": 33819 + }, + { + "epoch": 0.6032176363571505, + "grad_norm": 0.24764426052570343, + "learning_rate": 2.038128934823107e-05, + "loss": 0.1309, + "step": 33820 + }, + { + "epoch": 0.6032354724788642, + "grad_norm": 0.2795545756816864, + "learning_rate": 2.03797596435581e-05, + "loss": 0.1593, + "step": 33821 + }, + { + "epoch": 0.6032533086005779, + "grad_norm": 0.32767194509506226, + "learning_rate": 2.0378229956794787e-05, + "loss": 0.1665, + "step": 33822 + }, + { + "epoch": 0.6032711447222916, + "grad_norm": 0.2888633608818054, + "learning_rate": 2.0376700287947052e-05, + "loss": 0.1505, + "step": 33823 + }, + { + "epoch": 0.6032889808440053, + "grad_norm": 0.2893064618110657, + "learning_rate": 2.0375170637020817e-05, + "loss": 0.0845, + "step": 33824 + }, + { + "epoch": 0.603306816965719, + "grad_norm": 0.2887312173843384, + "learning_rate": 2.0373641004022024e-05, + "loss": 0.1336, + "step": 33825 + }, + { + "epoch": 0.6033246530874327, + "grad_norm": 0.2215019166469574, + "learning_rate": 2.0372111388956582e-05, + "loss": 0.1256, + "step": 33826 + }, + { + "epoch": 0.6033424892091463, + "grad_norm": 0.20559757947921753, + "learning_rate": 2.037058179183045e-05, + "loss": 0.1463, + "step": 33827 + }, + { + "epoch": 0.60336032533086, + "grad_norm": 0.3059665560722351, + "learning_rate": 2.036905221264954e-05, + "loss": 0.1102, + "step": 33828 + }, + { + "epoch": 0.6033781614525737, + "grad_norm": 0.2694060504436493, + "learning_rate": 2.0367522651419783e-05, + "loss": 0.0967, + "step": 33829 + }, + { + "epoch": 0.6033959975742874, + "grad_norm": 0.3812882602214813, + "learning_rate": 2.0365993108147095e-05, + "loss": 0.1279, + "step": 33830 + }, + { + "epoch": 0.6034138336960011, + "grad_norm": 0.2730935513973236, + "learning_rate": 2.0364463582837428e-05, + "loss": 0.1261, + "step": 33831 + }, + { + "epoch": 0.6034316698177148, + "grad_norm": 0.28607073426246643, + "learning_rate": 2.0362934075496705e-05, + "loss": 0.0997, + "step": 33832 + }, + { + "epoch": 0.6034495059394285, + "grad_norm": 0.2214580774307251, + "learning_rate": 2.0361404586130854e-05, + "loss": 0.1234, + "step": 33833 + }, + { + "epoch": 0.6034673420611423, + "grad_norm": 0.3885900378227234, + "learning_rate": 2.0359875114745784e-05, + "loss": 0.1922, + "step": 33834 + }, + { + "epoch": 0.603485178182856, + "grad_norm": 0.2388555407524109, + "learning_rate": 2.0358345661347456e-05, + "loss": 0.0879, + "step": 33835 + }, + { + "epoch": 0.6035030143045697, + "grad_norm": 0.2469319850206375, + "learning_rate": 2.035681622594178e-05, + "loss": 0.1084, + "step": 33836 + }, + { + "epoch": 0.6035208504262833, + "grad_norm": 0.37272289395332336, + "learning_rate": 2.0355286808534686e-05, + "loss": 0.1444, + "step": 33837 + }, + { + "epoch": 0.603538686547997, + "grad_norm": 0.2836235761642456, + "learning_rate": 2.035375740913211e-05, + "loss": 0.1177, + "step": 33838 + }, + { + "epoch": 0.6035565226697107, + "grad_norm": 0.2795677185058594, + "learning_rate": 2.0352228027739964e-05, + "loss": 0.0925, + "step": 33839 + }, + { + "epoch": 0.6035743587914244, + "grad_norm": 0.2673434913158417, + "learning_rate": 2.0350698664364196e-05, + "loss": 0.1391, + "step": 33840 + }, + { + "epoch": 0.6035921949131381, + "grad_norm": 0.1964213103055954, + "learning_rate": 2.0349169319010728e-05, + "loss": 0.1113, + "step": 33841 + }, + { + "epoch": 0.6036100310348518, + "grad_norm": 0.25359031558036804, + "learning_rate": 2.0347639991685485e-05, + "loss": 0.0855, + "step": 33842 + }, + { + "epoch": 0.6036278671565655, + "grad_norm": 0.25008174777030945, + "learning_rate": 2.0346110682394383e-05, + "loss": 0.1187, + "step": 33843 + }, + { + "epoch": 0.6036457032782792, + "grad_norm": 0.21026980876922607, + "learning_rate": 2.034458139114338e-05, + "loss": 0.114, + "step": 33844 + }, + { + "epoch": 0.6036635393999928, + "grad_norm": 0.2965630292892456, + "learning_rate": 2.0343052117938378e-05, + "loss": 0.0858, + "step": 33845 + }, + { + "epoch": 0.6036813755217065, + "grad_norm": 0.1811700463294983, + "learning_rate": 2.034152286278532e-05, + "loss": 0.1248, + "step": 33846 + }, + { + "epoch": 0.6036992116434202, + "grad_norm": 0.29956483840942383, + "learning_rate": 2.033999362569012e-05, + "loss": 0.1357, + "step": 33847 + }, + { + "epoch": 0.6037170477651339, + "grad_norm": 0.26156920194625854, + "learning_rate": 2.0338464406658722e-05, + "loss": 0.116, + "step": 33848 + }, + { + "epoch": 0.6037348838868476, + "grad_norm": 0.27455633878707886, + "learning_rate": 2.0336935205697046e-05, + "loss": 0.1237, + "step": 33849 + }, + { + "epoch": 0.6037527200085613, + "grad_norm": 0.23272190988063812, + "learning_rate": 2.0335406022811022e-05, + "loss": 0.1157, + "step": 33850 + }, + { + "epoch": 0.6037705561302751, + "grad_norm": 0.3226031959056854, + "learning_rate": 2.0333876858006567e-05, + "loss": 0.2059, + "step": 33851 + }, + { + "epoch": 0.6037883922519888, + "grad_norm": 0.2438773512840271, + "learning_rate": 2.033234771128962e-05, + "loss": 0.1119, + "step": 33852 + }, + { + "epoch": 0.6038062283737025, + "grad_norm": 0.32348883152008057, + "learning_rate": 2.0330818582666105e-05, + "loss": 0.1398, + "step": 33853 + }, + { + "epoch": 0.6038240644954161, + "grad_norm": 0.18496102094650269, + "learning_rate": 2.0329289472141954e-05, + "loss": 0.1195, + "step": 33854 + }, + { + "epoch": 0.6038419006171298, + "grad_norm": 0.21836508810520172, + "learning_rate": 2.032776037972309e-05, + "loss": 0.0993, + "step": 33855 + }, + { + "epoch": 0.6038597367388435, + "grad_norm": 0.2764545977115631, + "learning_rate": 2.0326231305415432e-05, + "loss": 0.1546, + "step": 33856 + }, + { + "epoch": 0.6038775728605572, + "grad_norm": 0.2795056700706482, + "learning_rate": 2.0324702249224924e-05, + "loss": 0.158, + "step": 33857 + }, + { + "epoch": 0.6038954089822709, + "grad_norm": 0.3521125614643097, + "learning_rate": 2.0323173211157482e-05, + "loss": 0.0996, + "step": 33858 + }, + { + "epoch": 0.6039132451039846, + "grad_norm": 0.24373279511928558, + "learning_rate": 2.032164419121904e-05, + "loss": 0.0603, + "step": 33859 + }, + { + "epoch": 0.6039310812256983, + "grad_norm": 0.25983476638793945, + "learning_rate": 2.032011518941551e-05, + "loss": 0.126, + "step": 33860 + }, + { + "epoch": 0.603948917347412, + "grad_norm": 0.2526746988296509, + "learning_rate": 2.031858620575284e-05, + "loss": 0.1612, + "step": 33861 + }, + { + "epoch": 0.6039667534691257, + "grad_norm": 0.2785739004611969, + "learning_rate": 2.0317057240236946e-05, + "loss": 0.1431, + "step": 33862 + }, + { + "epoch": 0.6039845895908393, + "grad_norm": 0.24862068891525269, + "learning_rate": 2.031552829287376e-05, + "loss": 0.116, + "step": 33863 + }, + { + "epoch": 0.604002425712553, + "grad_norm": 0.2915647327899933, + "learning_rate": 2.0313999363669194e-05, + "loss": 0.1121, + "step": 33864 + }, + { + "epoch": 0.6040202618342667, + "grad_norm": 0.3019426465034485, + "learning_rate": 2.0312470452629186e-05, + "loss": 0.1056, + "step": 33865 + }, + { + "epoch": 0.6040380979559804, + "grad_norm": 0.26330095529556274, + "learning_rate": 2.0310941559759664e-05, + "loss": 0.0832, + "step": 33866 + }, + { + "epoch": 0.6040559340776942, + "grad_norm": 0.22627967596054077, + "learning_rate": 2.0309412685066555e-05, + "loss": 0.0694, + "step": 33867 + }, + { + "epoch": 0.6040737701994079, + "grad_norm": 0.20052380859851837, + "learning_rate": 2.0307883828555783e-05, + "loss": 0.0475, + "step": 33868 + }, + { + "epoch": 0.6040916063211216, + "grad_norm": 0.21634647250175476, + "learning_rate": 2.0306354990233266e-05, + "loss": 0.1278, + "step": 33869 + }, + { + "epoch": 0.6041094424428353, + "grad_norm": 0.23765288293361664, + "learning_rate": 2.0304826170104945e-05, + "loss": 0.0769, + "step": 33870 + }, + { + "epoch": 0.604127278564549, + "grad_norm": 0.31639793515205383, + "learning_rate": 2.030329736817674e-05, + "loss": 0.1165, + "step": 33871 + }, + { + "epoch": 0.6041451146862626, + "grad_norm": 0.2135034203529358, + "learning_rate": 2.0301768584454572e-05, + "loss": 0.1266, + "step": 33872 + }, + { + "epoch": 0.6041629508079763, + "grad_norm": 0.3173833191394806, + "learning_rate": 2.0300239818944372e-05, + "loss": 0.1449, + "step": 33873 + }, + { + "epoch": 0.60418078692969, + "grad_norm": 0.25829756259918213, + "learning_rate": 2.0298711071652065e-05, + "loss": 0.1219, + "step": 33874 + }, + { + "epoch": 0.6041986230514037, + "grad_norm": 0.2784562408924103, + "learning_rate": 2.0297182342583584e-05, + "loss": 0.0998, + "step": 33875 + }, + { + "epoch": 0.6042164591731174, + "grad_norm": 0.2562209367752075, + "learning_rate": 2.0295653631744847e-05, + "loss": 0.1683, + "step": 33876 + }, + { + "epoch": 0.6042342952948311, + "grad_norm": 0.265676349401474, + "learning_rate": 2.0294124939141785e-05, + "loss": 0.1391, + "step": 33877 + }, + { + "epoch": 0.6042521314165448, + "grad_norm": 0.24899938702583313, + "learning_rate": 2.0292596264780306e-05, + "loss": 0.1114, + "step": 33878 + }, + { + "epoch": 0.6042699675382585, + "grad_norm": 0.47234484553337097, + "learning_rate": 2.0291067608666362e-05, + "loss": 0.1856, + "step": 33879 + }, + { + "epoch": 0.6042878036599721, + "grad_norm": 0.4021255075931549, + "learning_rate": 2.028953897080586e-05, + "loss": 0.0839, + "step": 33880 + }, + { + "epoch": 0.6043056397816858, + "grad_norm": 0.21968500316143036, + "learning_rate": 2.0288010351204736e-05, + "loss": 0.1216, + "step": 33881 + }, + { + "epoch": 0.6043234759033995, + "grad_norm": 0.3248167037963867, + "learning_rate": 2.02864817498689e-05, + "loss": 0.145, + "step": 33882 + }, + { + "epoch": 0.6043413120251132, + "grad_norm": 0.24360419809818268, + "learning_rate": 2.0284953166804306e-05, + "loss": 0.1271, + "step": 33883 + }, + { + "epoch": 0.604359148146827, + "grad_norm": 0.21658781170845032, + "learning_rate": 2.0283424602016858e-05, + "loss": 0.1612, + "step": 33884 + }, + { + "epoch": 0.6043769842685407, + "grad_norm": 0.2616976797580719, + "learning_rate": 2.0281896055512485e-05, + "loss": 0.1589, + "step": 33885 + }, + { + "epoch": 0.6043948203902544, + "grad_norm": 0.45536941289901733, + "learning_rate": 2.02803675272971e-05, + "loss": 0.1408, + "step": 33886 + }, + { + "epoch": 0.6044126565119681, + "grad_norm": 0.34238550066947937, + "learning_rate": 2.0278839017376647e-05, + "loss": 0.1163, + "step": 33887 + }, + { + "epoch": 0.6044304926336818, + "grad_norm": 0.32174745202064514, + "learning_rate": 2.027731052575705e-05, + "loss": 0.1145, + "step": 33888 + }, + { + "epoch": 0.6044483287553954, + "grad_norm": 0.1972050964832306, + "learning_rate": 2.0275782052444232e-05, + "loss": 0.0798, + "step": 33889 + }, + { + "epoch": 0.6044661648771091, + "grad_norm": 0.16877669095993042, + "learning_rate": 2.027425359744411e-05, + "loss": 0.1081, + "step": 33890 + }, + { + "epoch": 0.6044840009988228, + "grad_norm": 0.29525187611579895, + "learning_rate": 2.0272725160762603e-05, + "loss": 0.1108, + "step": 33891 + }, + { + "epoch": 0.6045018371205365, + "grad_norm": 0.21393455564975739, + "learning_rate": 2.027119674240566e-05, + "loss": 0.133, + "step": 33892 + }, + { + "epoch": 0.6045196732422502, + "grad_norm": 0.3553031086921692, + "learning_rate": 2.0269668342379183e-05, + "loss": 0.1705, + "step": 33893 + }, + { + "epoch": 0.6045375093639639, + "grad_norm": 0.29390406608581543, + "learning_rate": 2.0268139960689112e-05, + "loss": 0.1674, + "step": 33894 + }, + { + "epoch": 0.6045553454856776, + "grad_norm": 0.3952547311782837, + "learning_rate": 2.0266611597341353e-05, + "loss": 0.1113, + "step": 33895 + }, + { + "epoch": 0.6045731816073913, + "grad_norm": 0.21549092233181, + "learning_rate": 2.0265083252341856e-05, + "loss": 0.0924, + "step": 33896 + }, + { + "epoch": 0.604591017729105, + "grad_norm": 0.26260876655578613, + "learning_rate": 2.026355492569653e-05, + "loss": 0.1063, + "step": 33897 + }, + { + "epoch": 0.6046088538508186, + "grad_norm": 0.25324922800064087, + "learning_rate": 2.0262026617411302e-05, + "loss": 0.1513, + "step": 33898 + }, + { + "epoch": 0.6046266899725323, + "grad_norm": 0.26757606863975525, + "learning_rate": 2.026049832749208e-05, + "loss": 0.1257, + "step": 33899 + }, + { + "epoch": 0.604644526094246, + "grad_norm": 0.2973572611808777, + "learning_rate": 2.025897005594482e-05, + "loss": 0.1352, + "step": 33900 + }, + { + "epoch": 0.6046623622159598, + "grad_norm": 0.31855741143226624, + "learning_rate": 2.025744180277542e-05, + "loss": 0.173, + "step": 33901 + }, + { + "epoch": 0.6046801983376735, + "grad_norm": 0.2872883975505829, + "learning_rate": 2.025591356798982e-05, + "loss": 0.105, + "step": 33902 + }, + { + "epoch": 0.6046980344593872, + "grad_norm": 0.27869299054145813, + "learning_rate": 2.025438535159394e-05, + "loss": 0.1346, + "step": 33903 + }, + { + "epoch": 0.6047158705811009, + "grad_norm": 0.30435711145401, + "learning_rate": 2.0252857153593687e-05, + "loss": 0.1301, + "step": 33904 + }, + { + "epoch": 0.6047337067028146, + "grad_norm": 0.25051939487457275, + "learning_rate": 2.0251328973995014e-05, + "loss": 0.1275, + "step": 33905 + }, + { + "epoch": 0.6047515428245283, + "grad_norm": 0.2828250825405121, + "learning_rate": 2.0249800812803828e-05, + "loss": 0.1254, + "step": 33906 + }, + { + "epoch": 0.6047693789462419, + "grad_norm": 0.2737036347389221, + "learning_rate": 2.024827267002605e-05, + "loss": 0.1273, + "step": 33907 + }, + { + "epoch": 0.6047872150679556, + "grad_norm": 0.3362065851688385, + "learning_rate": 2.02467445456676e-05, + "loss": 0.0723, + "step": 33908 + }, + { + "epoch": 0.6048050511896693, + "grad_norm": 0.2804470956325531, + "learning_rate": 2.0245216439734425e-05, + "loss": 0.1172, + "step": 33909 + }, + { + "epoch": 0.604822887311383, + "grad_norm": 0.33394521474838257, + "learning_rate": 2.0243688352232432e-05, + "loss": 0.1608, + "step": 33910 + }, + { + "epoch": 0.6048407234330967, + "grad_norm": 0.25664955377578735, + "learning_rate": 2.0242160283167544e-05, + "loss": 0.1339, + "step": 33911 + }, + { + "epoch": 0.6048585595548104, + "grad_norm": 0.25859886407852173, + "learning_rate": 2.0240632232545677e-05, + "loss": 0.0995, + "step": 33912 + }, + { + "epoch": 0.6048763956765241, + "grad_norm": 0.24843475222587585, + "learning_rate": 2.023910420037277e-05, + "loss": 0.1509, + "step": 33913 + }, + { + "epoch": 0.6048942317982378, + "grad_norm": 0.2508089542388916, + "learning_rate": 2.023757618665474e-05, + "loss": 0.1652, + "step": 33914 + }, + { + "epoch": 0.6049120679199514, + "grad_norm": 0.21523480117321014, + "learning_rate": 2.0236048191397515e-05, + "loss": 0.1239, + "step": 33915 + }, + { + "epoch": 0.6049299040416651, + "grad_norm": 0.2109798938035965, + "learning_rate": 2.0234520214607007e-05, + "loss": 0.0884, + "step": 33916 + }, + { + "epoch": 0.6049477401633788, + "grad_norm": 0.22996476292610168, + "learning_rate": 2.0232992256289136e-05, + "loss": 0.1049, + "step": 33917 + }, + { + "epoch": 0.6049655762850926, + "grad_norm": 0.24631816148757935, + "learning_rate": 2.0231464316449848e-05, + "loss": 0.1028, + "step": 33918 + }, + { + "epoch": 0.6049834124068063, + "grad_norm": 0.25303933024406433, + "learning_rate": 2.0229936395095048e-05, + "loss": 0.1351, + "step": 33919 + }, + { + "epoch": 0.60500124852852, + "grad_norm": 0.3592248857021332, + "learning_rate": 2.0228408492230656e-05, + "loss": 0.1241, + "step": 33920 + }, + { + "epoch": 0.6050190846502337, + "grad_norm": 0.21240203082561493, + "learning_rate": 2.0226880607862603e-05, + "loss": 0.0698, + "step": 33921 + }, + { + "epoch": 0.6050369207719474, + "grad_norm": 0.28455227613449097, + "learning_rate": 2.0225352741996806e-05, + "loss": 0.1326, + "step": 33922 + }, + { + "epoch": 0.6050547568936611, + "grad_norm": 0.1921006143093109, + "learning_rate": 2.0223824894639197e-05, + "loss": 0.1256, + "step": 33923 + }, + { + "epoch": 0.6050725930153747, + "grad_norm": 0.23239262402057648, + "learning_rate": 2.0222297065795696e-05, + "loss": 0.1137, + "step": 33924 + }, + { + "epoch": 0.6050904291370884, + "grad_norm": 0.19406414031982422, + "learning_rate": 2.022076925547221e-05, + "loss": 0.0683, + "step": 33925 + }, + { + "epoch": 0.6051082652588021, + "grad_norm": 0.35909807682037354, + "learning_rate": 2.0219241463674682e-05, + "loss": 0.1217, + "step": 33926 + }, + { + "epoch": 0.6051261013805158, + "grad_norm": 0.23722302913665771, + "learning_rate": 2.0217713690409027e-05, + "loss": 0.0949, + "step": 33927 + }, + { + "epoch": 0.6051439375022295, + "grad_norm": 0.4289957880973816, + "learning_rate": 2.021618593568116e-05, + "loss": 0.1334, + "step": 33928 + }, + { + "epoch": 0.6051617736239432, + "grad_norm": 0.2533237338066101, + "learning_rate": 2.021465819949701e-05, + "loss": 0.1007, + "step": 33929 + }, + { + "epoch": 0.6051796097456569, + "grad_norm": 0.25178685784339905, + "learning_rate": 2.0213130481862492e-05, + "loss": 0.1371, + "step": 33930 + }, + { + "epoch": 0.6051974458673706, + "grad_norm": 0.2986135184764862, + "learning_rate": 2.021160278278354e-05, + "loss": 0.1255, + "step": 33931 + }, + { + "epoch": 0.6052152819890843, + "grad_norm": 0.2928605079650879, + "learning_rate": 2.0210075102266075e-05, + "loss": 0.1172, + "step": 33932 + }, + { + "epoch": 0.6052331181107979, + "grad_norm": 0.44925788044929504, + "learning_rate": 2.020854744031601e-05, + "loss": 0.1825, + "step": 33933 + }, + { + "epoch": 0.6052509542325116, + "grad_norm": 0.267938494682312, + "learning_rate": 2.020701979693926e-05, + "loss": 0.1755, + "step": 33934 + }, + { + "epoch": 0.6052687903542254, + "grad_norm": 0.2114374041557312, + "learning_rate": 2.020549217214176e-05, + "loss": 0.1189, + "step": 33935 + }, + { + "epoch": 0.6052866264759391, + "grad_norm": 0.2029339224100113, + "learning_rate": 2.0203964565929434e-05, + "loss": 0.1328, + "step": 33936 + }, + { + "epoch": 0.6053044625976528, + "grad_norm": 0.24222591519355774, + "learning_rate": 2.0202436978308197e-05, + "loss": 0.0997, + "step": 33937 + }, + { + "epoch": 0.6053222987193665, + "grad_norm": 0.33572790026664734, + "learning_rate": 2.0200909409283965e-05, + "loss": 0.1658, + "step": 33938 + }, + { + "epoch": 0.6053401348410802, + "grad_norm": 0.28817689418792725, + "learning_rate": 2.0199381858862674e-05, + "loss": 0.1063, + "step": 33939 + }, + { + "epoch": 0.6053579709627939, + "grad_norm": 0.1996411681175232, + "learning_rate": 2.0197854327050235e-05, + "loss": 0.1061, + "step": 33940 + }, + { + "epoch": 0.6053758070845076, + "grad_norm": 0.3642538785934448, + "learning_rate": 2.019632681385257e-05, + "loss": 0.1754, + "step": 33941 + }, + { + "epoch": 0.6053936432062212, + "grad_norm": 0.24938619136810303, + "learning_rate": 2.0194799319275604e-05, + "loss": 0.1148, + "step": 33942 + }, + { + "epoch": 0.6054114793279349, + "grad_norm": 0.21470627188682556, + "learning_rate": 2.0193271843325243e-05, + "loss": 0.0988, + "step": 33943 + }, + { + "epoch": 0.6054293154496486, + "grad_norm": 0.25092217326164246, + "learning_rate": 2.0191744386007434e-05, + "loss": 0.133, + "step": 33944 + }, + { + "epoch": 0.6054471515713623, + "grad_norm": 0.3918848931789398, + "learning_rate": 2.0190216947328085e-05, + "loss": 0.104, + "step": 33945 + }, + { + "epoch": 0.605464987693076, + "grad_norm": 0.2911170423030853, + "learning_rate": 2.0188689527293116e-05, + "loss": 0.1594, + "step": 33946 + }, + { + "epoch": 0.6054828238147897, + "grad_norm": 0.22725218534469604, + "learning_rate": 2.0187162125908434e-05, + "loss": 0.1087, + "step": 33947 + }, + { + "epoch": 0.6055006599365034, + "grad_norm": 0.30462467670440674, + "learning_rate": 2.0185634743179988e-05, + "loss": 0.1372, + "step": 33948 + }, + { + "epoch": 0.6055184960582171, + "grad_norm": 0.33564862608909607, + "learning_rate": 2.018410737911368e-05, + "loss": 0.1278, + "step": 33949 + }, + { + "epoch": 0.6055363321799307, + "grad_norm": 0.23036031424999237, + "learning_rate": 2.0182580033715436e-05, + "loss": 0.1408, + "step": 33950 + }, + { + "epoch": 0.6055541683016444, + "grad_norm": 0.21163055300712585, + "learning_rate": 2.018105270699117e-05, + "loss": 0.1238, + "step": 33951 + }, + { + "epoch": 0.6055720044233582, + "grad_norm": 0.30165475606918335, + "learning_rate": 2.017952539894682e-05, + "loss": 0.1263, + "step": 33952 + }, + { + "epoch": 0.6055898405450719, + "grad_norm": 0.3575689494609833, + "learning_rate": 2.017799810958829e-05, + "loss": 0.1797, + "step": 33953 + }, + { + "epoch": 0.6056076766667856, + "grad_norm": 0.26383379101753235, + "learning_rate": 2.0176470838921506e-05, + "loss": 0.1212, + "step": 33954 + }, + { + "epoch": 0.6056255127884993, + "grad_norm": 0.24447055160999298, + "learning_rate": 2.017494358695238e-05, + "loss": 0.1275, + "step": 33955 + }, + { + "epoch": 0.605643348910213, + "grad_norm": 0.2355479598045349, + "learning_rate": 2.0173416353686843e-05, + "loss": 0.1487, + "step": 33956 + }, + { + "epoch": 0.6056611850319267, + "grad_norm": 0.22544480860233307, + "learning_rate": 2.017188913913081e-05, + "loss": 0.1255, + "step": 33957 + }, + { + "epoch": 0.6056790211536404, + "grad_norm": 0.23199716210365295, + "learning_rate": 2.0170361943290204e-05, + "loss": 0.1048, + "step": 33958 + }, + { + "epoch": 0.605696857275354, + "grad_norm": 0.22903956472873688, + "learning_rate": 2.0168834766170946e-05, + "loss": 0.0951, + "step": 33959 + }, + { + "epoch": 0.6057146933970677, + "grad_norm": 0.2908589243888855, + "learning_rate": 2.0167307607778942e-05, + "loss": 0.0983, + "step": 33960 + }, + { + "epoch": 0.6057325295187814, + "grad_norm": 0.26266583800315857, + "learning_rate": 2.0165780468120136e-05, + "loss": 0.0999, + "step": 33961 + }, + { + "epoch": 0.6057503656404951, + "grad_norm": 0.2423059046268463, + "learning_rate": 2.0164253347200432e-05, + "loss": 0.1278, + "step": 33962 + }, + { + "epoch": 0.6057682017622088, + "grad_norm": 0.43719765543937683, + "learning_rate": 2.0162726245025748e-05, + "loss": 0.155, + "step": 33963 + }, + { + "epoch": 0.6057860378839225, + "grad_norm": 0.2557755410671234, + "learning_rate": 2.0161199161602e-05, + "loss": 0.159, + "step": 33964 + }, + { + "epoch": 0.6058038740056362, + "grad_norm": 0.2540648281574249, + "learning_rate": 2.015967209693513e-05, + "loss": 0.1166, + "step": 33965 + }, + { + "epoch": 0.6058217101273499, + "grad_norm": 0.3005029559135437, + "learning_rate": 2.015814505103104e-05, + "loss": 0.1268, + "step": 33966 + }, + { + "epoch": 0.6058395462490636, + "grad_norm": 0.207055002450943, + "learning_rate": 2.0156618023895655e-05, + "loss": 0.0992, + "step": 33967 + }, + { + "epoch": 0.6058573823707774, + "grad_norm": 0.26175299286842346, + "learning_rate": 2.0155091015534884e-05, + "loss": 0.122, + "step": 33968 + }, + { + "epoch": 0.605875218492491, + "grad_norm": 0.2624555230140686, + "learning_rate": 2.0153564025954653e-05, + "loss": 0.1581, + "step": 33969 + }, + { + "epoch": 0.6058930546142047, + "grad_norm": 0.17795804142951965, + "learning_rate": 2.0152037055160884e-05, + "loss": 0.108, + "step": 33970 + }, + { + "epoch": 0.6059108907359184, + "grad_norm": 0.3625028431415558, + "learning_rate": 2.0150510103159496e-05, + "loss": 0.1378, + "step": 33971 + }, + { + "epoch": 0.6059287268576321, + "grad_norm": 0.49435219168663025, + "learning_rate": 2.0148983169956405e-05, + "loss": 0.1431, + "step": 33972 + }, + { + "epoch": 0.6059465629793458, + "grad_norm": 0.3415945768356323, + "learning_rate": 2.0147456255557524e-05, + "loss": 0.0993, + "step": 33973 + }, + { + "epoch": 0.6059643991010595, + "grad_norm": 0.31059375405311584, + "learning_rate": 2.0145929359968788e-05, + "loss": 0.1254, + "step": 33974 + }, + { + "epoch": 0.6059822352227732, + "grad_norm": 0.20938971638679504, + "learning_rate": 2.0144402483196106e-05, + "loss": 0.1153, + "step": 33975 + }, + { + "epoch": 0.6060000713444869, + "grad_norm": 0.21535004675388336, + "learning_rate": 2.014287562524539e-05, + "loss": 0.1185, + "step": 33976 + }, + { + "epoch": 0.6060179074662005, + "grad_norm": 0.2150115668773651, + "learning_rate": 2.014134878612257e-05, + "loss": 0.1271, + "step": 33977 + }, + { + "epoch": 0.6060357435879142, + "grad_norm": 0.5401460528373718, + "learning_rate": 2.0139821965833554e-05, + "loss": 0.192, + "step": 33978 + }, + { + "epoch": 0.6060535797096279, + "grad_norm": 0.35790112614631653, + "learning_rate": 2.0138295164384277e-05, + "loss": 0.1438, + "step": 33979 + }, + { + "epoch": 0.6060714158313416, + "grad_norm": 0.2671012580394745, + "learning_rate": 2.0136768381780645e-05, + "loss": 0.1448, + "step": 33980 + }, + { + "epoch": 0.6060892519530553, + "grad_norm": 0.37059590220451355, + "learning_rate": 2.013524161802858e-05, + "loss": 0.1706, + "step": 33981 + }, + { + "epoch": 0.606107088074769, + "grad_norm": 0.26011282205581665, + "learning_rate": 2.0133714873133985e-05, + "loss": 0.1657, + "step": 33982 + }, + { + "epoch": 0.6061249241964827, + "grad_norm": 0.23580026626586914, + "learning_rate": 2.0132188147102807e-05, + "loss": 0.122, + "step": 33983 + }, + { + "epoch": 0.6061427603181964, + "grad_norm": 0.22677592933177948, + "learning_rate": 2.013066143994094e-05, + "loss": 0.1333, + "step": 33984 + }, + { + "epoch": 0.6061605964399102, + "grad_norm": 0.2703092694282532, + "learning_rate": 2.012913475165432e-05, + "loss": 0.1546, + "step": 33985 + }, + { + "epoch": 0.6061784325616238, + "grad_norm": 0.2766292095184326, + "learning_rate": 2.0127608082248843e-05, + "loss": 0.1175, + "step": 33986 + }, + { + "epoch": 0.6061962686833375, + "grad_norm": 0.25880423188209534, + "learning_rate": 2.012608143173045e-05, + "loss": 0.1167, + "step": 33987 + }, + { + "epoch": 0.6062141048050512, + "grad_norm": 0.3839293122291565, + "learning_rate": 2.012455480010505e-05, + "loss": 0.17, + "step": 33988 + }, + { + "epoch": 0.6062319409267649, + "grad_norm": 0.32370656728744507, + "learning_rate": 2.012302818737856e-05, + "loss": 0.1364, + "step": 33989 + }, + { + "epoch": 0.6062497770484786, + "grad_norm": 0.2363174557685852, + "learning_rate": 2.0121501593556884e-05, + "loss": 0.1344, + "step": 33990 + }, + { + "epoch": 0.6062676131701923, + "grad_norm": 0.28111395239830017, + "learning_rate": 2.011997501864596e-05, + "loss": 0.167, + "step": 33991 + }, + { + "epoch": 0.606285449291906, + "grad_norm": 0.30178555846214294, + "learning_rate": 2.0118448462651705e-05, + "loss": 0.113, + "step": 33992 + }, + { + "epoch": 0.6063032854136197, + "grad_norm": 0.3490726053714752, + "learning_rate": 2.0116921925580025e-05, + "loss": 0.1593, + "step": 33993 + }, + { + "epoch": 0.6063211215353334, + "grad_norm": 0.307981014251709, + "learning_rate": 2.0115395407436848e-05, + "loss": 0.1455, + "step": 33994 + }, + { + "epoch": 0.606338957657047, + "grad_norm": 0.26569628715515137, + "learning_rate": 2.0113868908228072e-05, + "loss": 0.1808, + "step": 33995 + }, + { + "epoch": 0.6063567937787607, + "grad_norm": 0.26798880100250244, + "learning_rate": 2.0112342427959638e-05, + "loss": 0.1418, + "step": 33996 + }, + { + "epoch": 0.6063746299004744, + "grad_norm": 0.2148667722940445, + "learning_rate": 2.0110815966637447e-05, + "loss": 0.1472, + "step": 33997 + }, + { + "epoch": 0.6063924660221881, + "grad_norm": 0.25280389189720154, + "learning_rate": 2.010928952426743e-05, + "loss": 0.1038, + "step": 33998 + }, + { + "epoch": 0.6064103021439018, + "grad_norm": 0.2429462969303131, + "learning_rate": 2.0107763100855484e-05, + "loss": 0.1198, + "step": 33999 + }, + { + "epoch": 0.6064281382656155, + "grad_norm": 0.30507683753967285, + "learning_rate": 2.0106236696407547e-05, + "loss": 0.1417, + "step": 34000 + }, + { + "epoch": 0.6064281382656155, + "eval_loss": 0.1242346316576004, + "eval_runtime": 107.0263, + "eval_samples_per_second": 9.568, + "eval_steps_per_second": 1.598, + "step": 34000 + }, + { + "epoch": 0.6064459743873292, + "grad_norm": 0.2689165472984314, + "learning_rate": 2.0104710310929527e-05, + "loss": 0.1506, + "step": 34001 + }, + { + "epoch": 0.606463810509043, + "grad_norm": 0.31818389892578125, + "learning_rate": 2.0103183944427344e-05, + "loss": 0.1521, + "step": 34002 + }, + { + "epoch": 0.6064816466307567, + "grad_norm": 0.29521316289901733, + "learning_rate": 2.0101657596906896e-05, + "loss": 0.0735, + "step": 34003 + }, + { + "epoch": 0.6064994827524703, + "grad_norm": 0.3413086235523224, + "learning_rate": 2.010013126837413e-05, + "loss": 0.1599, + "step": 34004 + }, + { + "epoch": 0.606517318874184, + "grad_norm": 0.21740014851093292, + "learning_rate": 2.0098604958834942e-05, + "loss": 0.1141, + "step": 34005 + }, + { + "epoch": 0.6065351549958977, + "grad_norm": 0.25479939579963684, + "learning_rate": 2.0097078668295257e-05, + "loss": 0.1308, + "step": 34006 + }, + { + "epoch": 0.6065529911176114, + "grad_norm": 0.25989586114883423, + "learning_rate": 2.009555239676099e-05, + "loss": 0.1305, + "step": 34007 + }, + { + "epoch": 0.6065708272393251, + "grad_norm": 0.2964211106300354, + "learning_rate": 2.0094026144238044e-05, + "loss": 0.1695, + "step": 34008 + }, + { + "epoch": 0.6065886633610388, + "grad_norm": 0.23744186758995056, + "learning_rate": 2.009249991073236e-05, + "loss": 0.1314, + "step": 34009 + }, + { + "epoch": 0.6066064994827525, + "grad_norm": 0.25520065426826477, + "learning_rate": 2.0090973696249838e-05, + "loss": 0.1051, + "step": 34010 + }, + { + "epoch": 0.6066243356044662, + "grad_norm": 0.36073005199432373, + "learning_rate": 2.0089447500796395e-05, + "loss": 0.1386, + "step": 34011 + }, + { + "epoch": 0.6066421717261798, + "grad_norm": 0.28548458218574524, + "learning_rate": 2.0087921324377944e-05, + "loss": 0.0898, + "step": 34012 + }, + { + "epoch": 0.6066600078478935, + "grad_norm": 0.24678723514080048, + "learning_rate": 2.0086395167000414e-05, + "loss": 0.1279, + "step": 34013 + }, + { + "epoch": 0.6066778439696072, + "grad_norm": 0.3923526704311371, + "learning_rate": 2.0084869028669717e-05, + "loss": 0.1274, + "step": 34014 + }, + { + "epoch": 0.6066956800913209, + "grad_norm": 0.21978864073753357, + "learning_rate": 2.008334290939176e-05, + "loss": 0.0898, + "step": 34015 + }, + { + "epoch": 0.6067135162130346, + "grad_norm": 0.2535965144634247, + "learning_rate": 2.008181680917246e-05, + "loss": 0.1188, + "step": 34016 + }, + { + "epoch": 0.6067313523347483, + "grad_norm": 0.22535917162895203, + "learning_rate": 2.0080290728017745e-05, + "loss": 0.1228, + "step": 34017 + }, + { + "epoch": 0.606749188456462, + "grad_norm": 0.34669050574302673, + "learning_rate": 2.0078764665933515e-05, + "loss": 0.1017, + "step": 34018 + }, + { + "epoch": 0.6067670245781758, + "grad_norm": 0.29563677310943604, + "learning_rate": 2.00772386229257e-05, + "loss": 0.1188, + "step": 34019 + }, + { + "epoch": 0.6067848606998895, + "grad_norm": 0.24344807863235474, + "learning_rate": 2.007571259900021e-05, + "loss": 0.1641, + "step": 34020 + }, + { + "epoch": 0.6068026968216031, + "grad_norm": 0.24273742735385895, + "learning_rate": 2.0074186594162947e-05, + "loss": 0.1025, + "step": 34021 + }, + { + "epoch": 0.6068205329433168, + "grad_norm": 0.3006818890571594, + "learning_rate": 2.0072660608419845e-05, + "loss": 0.1333, + "step": 34022 + }, + { + "epoch": 0.6068383690650305, + "grad_norm": 0.25352567434310913, + "learning_rate": 2.0071134641776818e-05, + "loss": 0.1666, + "step": 34023 + }, + { + "epoch": 0.6068562051867442, + "grad_norm": 0.34692496061325073, + "learning_rate": 2.0069608694239768e-05, + "loss": 0.0932, + "step": 34024 + }, + { + "epoch": 0.6068740413084579, + "grad_norm": 0.26554763317108154, + "learning_rate": 2.0068082765814616e-05, + "loss": 0.1389, + "step": 34025 + }, + { + "epoch": 0.6068918774301716, + "grad_norm": 0.1897977888584137, + "learning_rate": 2.006655685650728e-05, + "loss": 0.1423, + "step": 34026 + }, + { + "epoch": 0.6069097135518853, + "grad_norm": 0.29043421149253845, + "learning_rate": 2.0065030966323678e-05, + "loss": 0.1189, + "step": 34027 + }, + { + "epoch": 0.606927549673599, + "grad_norm": 0.3475072383880615, + "learning_rate": 2.006350509526972e-05, + "loss": 0.1736, + "step": 34028 + }, + { + "epoch": 0.6069453857953127, + "grad_norm": 0.28601470589637756, + "learning_rate": 2.0061979243351313e-05, + "loss": 0.1087, + "step": 34029 + }, + { + "epoch": 0.6069632219170263, + "grad_norm": 0.27227428555488586, + "learning_rate": 2.006045341057439e-05, + "loss": 0.1418, + "step": 34030 + }, + { + "epoch": 0.60698105803874, + "grad_norm": 0.23274704813957214, + "learning_rate": 2.0058927596944853e-05, + "loss": 0.0936, + "step": 34031 + }, + { + "epoch": 0.6069988941604537, + "grad_norm": 0.2962673604488373, + "learning_rate": 2.0057401802468618e-05, + "loss": 0.1494, + "step": 34032 + }, + { + "epoch": 0.6070167302821674, + "grad_norm": 0.22344911098480225, + "learning_rate": 2.0055876027151604e-05, + "loss": 0.1089, + "step": 34033 + }, + { + "epoch": 0.6070345664038811, + "grad_norm": 0.33765116333961487, + "learning_rate": 2.005435027099971e-05, + "loss": 0.1437, + "step": 34034 + }, + { + "epoch": 0.6070524025255948, + "grad_norm": 0.22132675349712372, + "learning_rate": 2.005282453401888e-05, + "loss": 0.1363, + "step": 34035 + }, + { + "epoch": 0.6070702386473086, + "grad_norm": 0.29878923296928406, + "learning_rate": 2.0051298816215002e-05, + "loss": 0.1148, + "step": 34036 + }, + { + "epoch": 0.6070880747690223, + "grad_norm": 0.3090602457523346, + "learning_rate": 2.0049773117594003e-05, + "loss": 0.1295, + "step": 34037 + }, + { + "epoch": 0.607105910890736, + "grad_norm": 0.275724858045578, + "learning_rate": 2.0048247438161783e-05, + "loss": 0.1388, + "step": 34038 + }, + { + "epoch": 0.6071237470124496, + "grad_norm": 0.19276633858680725, + "learning_rate": 2.004672177792427e-05, + "loss": 0.109, + "step": 34039 + }, + { + "epoch": 0.6071415831341633, + "grad_norm": 0.3063611090183258, + "learning_rate": 2.004519613688738e-05, + "loss": 0.1526, + "step": 34040 + }, + { + "epoch": 0.607159419255877, + "grad_norm": 0.2948013246059418, + "learning_rate": 2.0043670515057022e-05, + "loss": 0.1926, + "step": 34041 + }, + { + "epoch": 0.6071772553775907, + "grad_norm": 0.20293796062469482, + "learning_rate": 2.00421449124391e-05, + "loss": 0.1019, + "step": 34042 + }, + { + "epoch": 0.6071950914993044, + "grad_norm": 0.26538240909576416, + "learning_rate": 2.004061932903954e-05, + "loss": 0.1449, + "step": 34043 + }, + { + "epoch": 0.6072129276210181, + "grad_norm": 0.27516084909439087, + "learning_rate": 2.003909376486426e-05, + "loss": 0.1479, + "step": 34044 + }, + { + "epoch": 0.6072307637427318, + "grad_norm": 0.23916086554527283, + "learning_rate": 2.0037568219919157e-05, + "loss": 0.0884, + "step": 34045 + }, + { + "epoch": 0.6072485998644455, + "grad_norm": 0.24960020184516907, + "learning_rate": 2.003604269421016e-05, + "loss": 0.1873, + "step": 34046 + }, + { + "epoch": 0.6072664359861591, + "grad_norm": 0.17460691928863525, + "learning_rate": 2.0034517187743165e-05, + "loss": 0.0854, + "step": 34047 + }, + { + "epoch": 0.6072842721078728, + "grad_norm": 0.23467344045639038, + "learning_rate": 2.0032991700524106e-05, + "loss": 0.1356, + "step": 34048 + }, + { + "epoch": 0.6073021082295865, + "grad_norm": 0.2633321285247803, + "learning_rate": 2.003146623255889e-05, + "loss": 0.067, + "step": 34049 + }, + { + "epoch": 0.6073199443513002, + "grad_norm": 0.27493250370025635, + "learning_rate": 2.0029940783853423e-05, + "loss": 0.1435, + "step": 34050 + }, + { + "epoch": 0.6073377804730139, + "grad_norm": 0.3249135911464691, + "learning_rate": 2.0028415354413615e-05, + "loss": 0.1388, + "step": 34051 + }, + { + "epoch": 0.6073556165947276, + "grad_norm": 0.3141963481903076, + "learning_rate": 2.0026889944245397e-05, + "loss": 0.1225, + "step": 34052 + }, + { + "epoch": 0.6073734527164414, + "grad_norm": 0.21605466306209564, + "learning_rate": 2.0025364553354666e-05, + "loss": 0.0989, + "step": 34053 + }, + { + "epoch": 0.6073912888381551, + "grad_norm": 0.279005229473114, + "learning_rate": 2.002383918174734e-05, + "loss": 0.1279, + "step": 34054 + }, + { + "epoch": 0.6074091249598688, + "grad_norm": 0.28412842750549316, + "learning_rate": 2.0022313829429328e-05, + "loss": 0.1607, + "step": 34055 + }, + { + "epoch": 0.6074269610815825, + "grad_norm": 0.2515621781349182, + "learning_rate": 2.002078849640655e-05, + "loss": 0.0837, + "step": 34056 + }, + { + "epoch": 0.6074447972032961, + "grad_norm": 0.2768721878528595, + "learning_rate": 2.001926318268492e-05, + "loss": 0.1623, + "step": 34057 + }, + { + "epoch": 0.6074626333250098, + "grad_norm": 0.27684301137924194, + "learning_rate": 2.001773788827035e-05, + "loss": 0.096, + "step": 34058 + }, + { + "epoch": 0.6074804694467235, + "grad_norm": 0.34042295813560486, + "learning_rate": 2.001621261316874e-05, + "loss": 0.1516, + "step": 34059 + }, + { + "epoch": 0.6074983055684372, + "grad_norm": 0.24979668855667114, + "learning_rate": 2.0014687357386007e-05, + "loss": 0.1554, + "step": 34060 + }, + { + "epoch": 0.6075161416901509, + "grad_norm": 0.2412799447774887, + "learning_rate": 2.0013162120928074e-05, + "loss": 0.152, + "step": 34061 + }, + { + "epoch": 0.6075339778118646, + "grad_norm": 0.23882125318050385, + "learning_rate": 2.001163690380085e-05, + "loss": 0.1459, + "step": 34062 + }, + { + "epoch": 0.6075518139335783, + "grad_norm": 0.2327205240726471, + "learning_rate": 2.0010111706010246e-05, + "loss": 0.1369, + "step": 34063 + }, + { + "epoch": 0.607569650055292, + "grad_norm": 0.25749075412750244, + "learning_rate": 2.000858652756216e-05, + "loss": 0.093, + "step": 34064 + }, + { + "epoch": 0.6075874861770056, + "grad_norm": 0.24764353036880493, + "learning_rate": 2.0007061368462527e-05, + "loss": 0.146, + "step": 34065 + }, + { + "epoch": 0.6076053222987193, + "grad_norm": 0.2832137942314148, + "learning_rate": 2.0005536228717248e-05, + "loss": 0.1352, + "step": 34066 + }, + { + "epoch": 0.607623158420433, + "grad_norm": 0.2774147689342499, + "learning_rate": 2.000401110833223e-05, + "loss": 0.1892, + "step": 34067 + }, + { + "epoch": 0.6076409945421467, + "grad_norm": 0.31670087575912476, + "learning_rate": 2.0002486007313386e-05, + "loss": 0.1522, + "step": 34068 + }, + { + "epoch": 0.6076588306638605, + "grad_norm": 0.20794054865837097, + "learning_rate": 2.0000960925666645e-05, + "loss": 0.128, + "step": 34069 + }, + { + "epoch": 0.6076766667855742, + "grad_norm": 0.25698500871658325, + "learning_rate": 1.9999435863397904e-05, + "loss": 0.1666, + "step": 34070 + }, + { + "epoch": 0.6076945029072879, + "grad_norm": 0.2686476409435272, + "learning_rate": 1.999791082051308e-05, + "loss": 0.1747, + "step": 34071 + }, + { + "epoch": 0.6077123390290016, + "grad_norm": 0.2534348666667938, + "learning_rate": 1.9996385797018067e-05, + "loss": 0.1083, + "step": 34072 + }, + { + "epoch": 0.6077301751507153, + "grad_norm": 0.4016781151294708, + "learning_rate": 1.9994860792918802e-05, + "loss": 0.1703, + "step": 34073 + }, + { + "epoch": 0.607748011272429, + "grad_norm": 0.2587624192237854, + "learning_rate": 1.9993335808221182e-05, + "loss": 0.1367, + "step": 34074 + }, + { + "epoch": 0.6077658473941426, + "grad_norm": 0.3383089601993561, + "learning_rate": 1.9991810842931124e-05, + "loss": 0.1752, + "step": 34075 + }, + { + "epoch": 0.6077836835158563, + "grad_norm": 0.26160141825675964, + "learning_rate": 1.999028589705454e-05, + "loss": 0.0497, + "step": 34076 + }, + { + "epoch": 0.60780151963757, + "grad_norm": 0.2654273808002472, + "learning_rate": 1.9988760970597322e-05, + "loss": 0.1649, + "step": 34077 + }, + { + "epoch": 0.6078193557592837, + "grad_norm": 0.29998132586479187, + "learning_rate": 1.998723606356541e-05, + "loss": 0.115, + "step": 34078 + }, + { + "epoch": 0.6078371918809974, + "grad_norm": 0.2516030967235565, + "learning_rate": 1.998571117596471e-05, + "loss": 0.1638, + "step": 34079 + }, + { + "epoch": 0.6078550280027111, + "grad_norm": 0.24922418594360352, + "learning_rate": 1.9984186307801113e-05, + "loss": 0.1112, + "step": 34080 + }, + { + "epoch": 0.6078728641244248, + "grad_norm": 0.3222910463809967, + "learning_rate": 1.9982661459080542e-05, + "loss": 0.1405, + "step": 34081 + }, + { + "epoch": 0.6078907002461384, + "grad_norm": 0.286417156457901, + "learning_rate": 1.998113662980891e-05, + "loss": 0.1348, + "step": 34082 + }, + { + "epoch": 0.6079085363678521, + "grad_norm": 0.3120495676994324, + "learning_rate": 1.997961181999213e-05, + "loss": 0.1489, + "step": 34083 + }, + { + "epoch": 0.6079263724895658, + "grad_norm": 0.19435276091098785, + "learning_rate": 1.997808702963611e-05, + "loss": 0.1184, + "step": 34084 + }, + { + "epoch": 0.6079442086112795, + "grad_norm": 0.21123048663139343, + "learning_rate": 1.9976562258746746e-05, + "loss": 0.1583, + "step": 34085 + }, + { + "epoch": 0.6079620447329933, + "grad_norm": 0.36681869626045227, + "learning_rate": 1.9975037507329976e-05, + "loss": 0.127, + "step": 34086 + }, + { + "epoch": 0.607979880854707, + "grad_norm": 0.2873871326446533, + "learning_rate": 1.9973512775391694e-05, + "loss": 0.1262, + "step": 34087 + }, + { + "epoch": 0.6079977169764207, + "grad_norm": 0.24000637233257294, + "learning_rate": 1.997198806293781e-05, + "loss": 0.1069, + "step": 34088 + }, + { + "epoch": 0.6080155530981344, + "grad_norm": 0.2373301237821579, + "learning_rate": 1.997046336997424e-05, + "loss": 0.0747, + "step": 34089 + }, + { + "epoch": 0.6080333892198481, + "grad_norm": 0.24301819503307343, + "learning_rate": 1.9968938696506876e-05, + "loss": 0.1761, + "step": 34090 + }, + { + "epoch": 0.6080512253415618, + "grad_norm": 0.7439576387405396, + "learning_rate": 1.996741404254166e-05, + "loss": 0.1731, + "step": 34091 + }, + { + "epoch": 0.6080690614632754, + "grad_norm": 0.2787877917289734, + "learning_rate": 1.9965889408084483e-05, + "loss": 0.1359, + "step": 34092 + }, + { + "epoch": 0.6080868975849891, + "grad_norm": 0.2211727648973465, + "learning_rate": 1.996436479314126e-05, + "loss": 0.0982, + "step": 34093 + }, + { + "epoch": 0.6081047337067028, + "grad_norm": 0.23949767649173737, + "learning_rate": 1.9962840197717887e-05, + "loss": 0.1078, + "step": 34094 + }, + { + "epoch": 0.6081225698284165, + "grad_norm": 0.2994820773601532, + "learning_rate": 1.9961315621820286e-05, + "loss": 0.1266, + "step": 34095 + }, + { + "epoch": 0.6081404059501302, + "grad_norm": 0.22102081775665283, + "learning_rate": 1.9959791065454376e-05, + "loss": 0.1099, + "step": 34096 + }, + { + "epoch": 0.6081582420718439, + "grad_norm": 0.25049301981925964, + "learning_rate": 1.9958266528626058e-05, + "loss": 0.1639, + "step": 34097 + }, + { + "epoch": 0.6081760781935576, + "grad_norm": 0.3949906826019287, + "learning_rate": 1.9956742011341225e-05, + "loss": 0.1561, + "step": 34098 + }, + { + "epoch": 0.6081939143152713, + "grad_norm": 0.24044504761695862, + "learning_rate": 1.9955217513605815e-05, + "loss": 0.1068, + "step": 34099 + }, + { + "epoch": 0.6082117504369849, + "grad_norm": 0.270826131105423, + "learning_rate": 1.9953693035425726e-05, + "loss": 0.1076, + "step": 34100 + }, + { + "epoch": 0.6082295865586986, + "grad_norm": 0.25290408730506897, + "learning_rate": 1.9952168576806856e-05, + "loss": 0.0949, + "step": 34101 + }, + { + "epoch": 0.6082474226804123, + "grad_norm": 0.22967013716697693, + "learning_rate": 1.9950644137755132e-05, + "loss": 0.1298, + "step": 34102 + }, + { + "epoch": 0.6082652588021261, + "grad_norm": 0.2955591380596161, + "learning_rate": 1.9949119718276442e-05, + "loss": 0.2006, + "step": 34103 + }, + { + "epoch": 0.6082830949238398, + "grad_norm": 0.27504876255989075, + "learning_rate": 1.9947595318376722e-05, + "loss": 0.137, + "step": 34104 + }, + { + "epoch": 0.6083009310455535, + "grad_norm": 0.30607736110687256, + "learning_rate": 1.9946070938061867e-05, + "loss": 0.135, + "step": 34105 + }, + { + "epoch": 0.6083187671672672, + "grad_norm": 0.22454895079135895, + "learning_rate": 1.9944546577337787e-05, + "loss": 0.1128, + "step": 34106 + }, + { + "epoch": 0.6083366032889809, + "grad_norm": 0.2026168256998062, + "learning_rate": 1.994302223621038e-05, + "loss": 0.1249, + "step": 34107 + }, + { + "epoch": 0.6083544394106946, + "grad_norm": 0.18978428840637207, + "learning_rate": 1.9941497914685574e-05, + "loss": 0.0853, + "step": 34108 + }, + { + "epoch": 0.6083722755324082, + "grad_norm": 0.2684705853462219, + "learning_rate": 1.9939973612769267e-05, + "loss": 0.1369, + "step": 34109 + }, + { + "epoch": 0.6083901116541219, + "grad_norm": 0.330782413482666, + "learning_rate": 1.9938449330467373e-05, + "loss": 0.1358, + "step": 34110 + }, + { + "epoch": 0.6084079477758356, + "grad_norm": 0.27003681659698486, + "learning_rate": 1.9936925067785787e-05, + "loss": 0.1153, + "step": 34111 + }, + { + "epoch": 0.6084257838975493, + "grad_norm": 0.22729693353176117, + "learning_rate": 1.9935400824730437e-05, + "loss": 0.0943, + "step": 34112 + }, + { + "epoch": 0.608443620019263, + "grad_norm": 0.2702178955078125, + "learning_rate": 1.993387660130723e-05, + "loss": 0.0668, + "step": 34113 + }, + { + "epoch": 0.6084614561409767, + "grad_norm": 0.24327760934829712, + "learning_rate": 1.9932352397522057e-05, + "loss": 0.1077, + "step": 34114 + }, + { + "epoch": 0.6084792922626904, + "grad_norm": 0.2638218402862549, + "learning_rate": 1.993082821338084e-05, + "loss": 0.0992, + "step": 34115 + }, + { + "epoch": 0.6084971283844041, + "grad_norm": 0.4522806704044342, + "learning_rate": 1.9929304048889475e-05, + "loss": 0.1623, + "step": 34116 + }, + { + "epoch": 0.6085149645061178, + "grad_norm": 0.354140043258667, + "learning_rate": 1.992777990405389e-05, + "loss": 0.2001, + "step": 34117 + }, + { + "epoch": 0.6085328006278314, + "grad_norm": 0.25301435589790344, + "learning_rate": 1.9926255778879978e-05, + "loss": 0.067, + "step": 34118 + }, + { + "epoch": 0.6085506367495451, + "grad_norm": 0.27323105931282043, + "learning_rate": 1.9924731673373657e-05, + "loss": 0.1098, + "step": 34119 + }, + { + "epoch": 0.6085684728712589, + "grad_norm": 0.2349899411201477, + "learning_rate": 1.9923207587540814e-05, + "loss": 0.103, + "step": 34120 + }, + { + "epoch": 0.6085863089929726, + "grad_norm": 0.3228157162666321, + "learning_rate": 1.9921683521387386e-05, + "loss": 0.1176, + "step": 34121 + }, + { + "epoch": 0.6086041451146863, + "grad_norm": 0.3161437511444092, + "learning_rate": 1.9920159474919255e-05, + "loss": 0.111, + "step": 34122 + }, + { + "epoch": 0.6086219812364, + "grad_norm": 0.31503742933273315, + "learning_rate": 1.9918635448142352e-05, + "loss": 0.1351, + "step": 34123 + }, + { + "epoch": 0.6086398173581137, + "grad_norm": 0.21428050100803375, + "learning_rate": 1.991711144106256e-05, + "loss": 0.1409, + "step": 34124 + }, + { + "epoch": 0.6086576534798274, + "grad_norm": 0.24637004733085632, + "learning_rate": 1.991558745368581e-05, + "loss": 0.1115, + "step": 34125 + }, + { + "epoch": 0.608675489601541, + "grad_norm": 0.2616561949253082, + "learning_rate": 1.9914063486018e-05, + "loss": 0.1417, + "step": 34126 + }, + { + "epoch": 0.6086933257232547, + "grad_norm": 0.23726029694080353, + "learning_rate": 1.9912539538065038e-05, + "loss": 0.0882, + "step": 34127 + }, + { + "epoch": 0.6087111618449684, + "grad_norm": 0.22764617204666138, + "learning_rate": 1.9911015609832823e-05, + "loss": 0.152, + "step": 34128 + }, + { + "epoch": 0.6087289979666821, + "grad_norm": 0.18785883486270905, + "learning_rate": 1.9909491701327266e-05, + "loss": 0.1016, + "step": 34129 + }, + { + "epoch": 0.6087468340883958, + "grad_norm": 0.21322157979011536, + "learning_rate": 1.9907967812554284e-05, + "loss": 0.1329, + "step": 34130 + }, + { + "epoch": 0.6087646702101095, + "grad_norm": 0.26051679253578186, + "learning_rate": 1.990644394351978e-05, + "loss": 0.1578, + "step": 34131 + }, + { + "epoch": 0.6087825063318232, + "grad_norm": 0.2870309352874756, + "learning_rate": 1.9904920094229655e-05, + "loss": 0.1322, + "step": 34132 + }, + { + "epoch": 0.6088003424535369, + "grad_norm": 0.250116765499115, + "learning_rate": 1.9903396264689813e-05, + "loss": 0.1447, + "step": 34133 + }, + { + "epoch": 0.6088181785752506, + "grad_norm": 0.30739596486091614, + "learning_rate": 1.9901872454906176e-05, + "loss": 0.2093, + "step": 34134 + }, + { + "epoch": 0.6088360146969642, + "grad_norm": 0.2513252794742584, + "learning_rate": 1.9900348664884642e-05, + "loss": 0.1389, + "step": 34135 + }, + { + "epoch": 0.6088538508186779, + "grad_norm": 0.26735758781433105, + "learning_rate": 1.9898824894631116e-05, + "loss": 0.1346, + "step": 34136 + }, + { + "epoch": 0.6088716869403917, + "grad_norm": 0.3010508418083191, + "learning_rate": 1.9897301144151502e-05, + "loss": 0.1581, + "step": 34137 + }, + { + "epoch": 0.6088895230621054, + "grad_norm": 0.27822795510292053, + "learning_rate": 1.9895777413451717e-05, + "loss": 0.0965, + "step": 34138 + }, + { + "epoch": 0.6089073591838191, + "grad_norm": 0.34074705839157104, + "learning_rate": 1.989425370253766e-05, + "loss": 0.1304, + "step": 34139 + }, + { + "epoch": 0.6089251953055328, + "grad_norm": 0.2117132693529129, + "learning_rate": 1.9892730011415245e-05, + "loss": 0.1317, + "step": 34140 + }, + { + "epoch": 0.6089430314272465, + "grad_norm": 0.30902761220932007, + "learning_rate": 1.989120634009037e-05, + "loss": 0.1181, + "step": 34141 + }, + { + "epoch": 0.6089608675489602, + "grad_norm": 0.2999575138092041, + "learning_rate": 1.988968268856893e-05, + "loss": 0.1224, + "step": 34142 + }, + { + "epoch": 0.6089787036706739, + "grad_norm": 0.35089242458343506, + "learning_rate": 1.9888159056856858e-05, + "loss": 0.1854, + "step": 34143 + }, + { + "epoch": 0.6089965397923875, + "grad_norm": 0.2679131031036377, + "learning_rate": 1.9886635444960044e-05, + "loss": 0.1278, + "step": 34144 + }, + { + "epoch": 0.6090143759141012, + "grad_norm": 0.42240259051322937, + "learning_rate": 1.9885111852884404e-05, + "loss": 0.1643, + "step": 34145 + }, + { + "epoch": 0.6090322120358149, + "grad_norm": 0.2190408706665039, + "learning_rate": 1.988358828063582e-05, + "loss": 0.1285, + "step": 34146 + }, + { + "epoch": 0.6090500481575286, + "grad_norm": 0.24016138911247253, + "learning_rate": 1.9882064728220227e-05, + "loss": 0.1616, + "step": 34147 + }, + { + "epoch": 0.6090678842792423, + "grad_norm": 0.23907403647899628, + "learning_rate": 1.9880541195643522e-05, + "loss": 0.1572, + "step": 34148 + }, + { + "epoch": 0.609085720400956, + "grad_norm": 0.254570871591568, + "learning_rate": 1.9879017682911603e-05, + "loss": 0.0794, + "step": 34149 + }, + { + "epoch": 0.6091035565226697, + "grad_norm": 0.30045750737190247, + "learning_rate": 1.9877494190030376e-05, + "loss": 0.1975, + "step": 34150 + }, + { + "epoch": 0.6091213926443834, + "grad_norm": 0.30831149220466614, + "learning_rate": 1.9875970717005755e-05, + "loss": 0.1631, + "step": 34151 + }, + { + "epoch": 0.609139228766097, + "grad_norm": 0.3792189359664917, + "learning_rate": 1.9874447263843644e-05, + "loss": 0.2206, + "step": 34152 + }, + { + "epoch": 0.6091570648878107, + "grad_norm": 0.2211562693119049, + "learning_rate": 1.9872923830549944e-05, + "loss": 0.0924, + "step": 34153 + }, + { + "epoch": 0.6091749010095245, + "grad_norm": 0.2657011151313782, + "learning_rate": 1.9871400417130564e-05, + "loss": 0.1238, + "step": 34154 + }, + { + "epoch": 0.6091927371312382, + "grad_norm": 0.311235636472702, + "learning_rate": 1.98698770235914e-05, + "loss": 0.1411, + "step": 34155 + }, + { + "epoch": 0.6092105732529519, + "grad_norm": 0.2832774817943573, + "learning_rate": 1.986835364993837e-05, + "loss": 0.1304, + "step": 34156 + }, + { + "epoch": 0.6092284093746656, + "grad_norm": 0.25852981209754944, + "learning_rate": 1.9866830296177372e-05, + "loss": 0.119, + "step": 34157 + }, + { + "epoch": 0.6092462454963793, + "grad_norm": 0.350294828414917, + "learning_rate": 1.986530696231432e-05, + "loss": 0.1365, + "step": 34158 + }, + { + "epoch": 0.609264081618093, + "grad_norm": 0.2197064906358719, + "learning_rate": 1.9863783648355096e-05, + "loss": 0.0999, + "step": 34159 + }, + { + "epoch": 0.6092819177398067, + "grad_norm": 0.183275043964386, + "learning_rate": 1.9862260354305627e-05, + "loss": 0.1393, + "step": 34160 + }, + { + "epoch": 0.6092997538615204, + "grad_norm": 0.2630709409713745, + "learning_rate": 1.986073708017182e-05, + "loss": 0.0853, + "step": 34161 + }, + { + "epoch": 0.609317589983234, + "grad_norm": 0.34906816482543945, + "learning_rate": 1.985921382595957e-05, + "loss": 0.1438, + "step": 34162 + }, + { + "epoch": 0.6093354261049477, + "grad_norm": 0.2369636744260788, + "learning_rate": 1.9857690591674768e-05, + "loss": 0.1259, + "step": 34163 + }, + { + "epoch": 0.6093532622266614, + "grad_norm": 0.350037157535553, + "learning_rate": 1.9856167377323347e-05, + "loss": 0.195, + "step": 34164 + }, + { + "epoch": 0.6093710983483751, + "grad_norm": 0.18668416142463684, + "learning_rate": 1.9854644182911193e-05, + "loss": 0.1089, + "step": 34165 + }, + { + "epoch": 0.6093889344700888, + "grad_norm": 0.21468549966812134, + "learning_rate": 1.985312100844422e-05, + "loss": 0.1294, + "step": 34166 + }, + { + "epoch": 0.6094067705918025, + "grad_norm": 0.35791632533073425, + "learning_rate": 1.9851597853928327e-05, + "loss": 0.136, + "step": 34167 + }, + { + "epoch": 0.6094246067135162, + "grad_norm": 0.239421084523201, + "learning_rate": 1.985007471936941e-05, + "loss": 0.1353, + "step": 34168 + }, + { + "epoch": 0.6094424428352299, + "grad_norm": 0.2497035413980484, + "learning_rate": 1.9848551604773387e-05, + "loss": 0.0773, + "step": 34169 + }, + { + "epoch": 0.6094602789569437, + "grad_norm": 0.22092542052268982, + "learning_rate": 1.9847028510146167e-05, + "loss": 0.1334, + "step": 34170 + }, + { + "epoch": 0.6094781150786573, + "grad_norm": 0.22181285917758942, + "learning_rate": 1.984550543549363e-05, + "loss": 0.1151, + "step": 34171 + }, + { + "epoch": 0.609495951200371, + "grad_norm": 0.3666284382343292, + "learning_rate": 1.9843982380821692e-05, + "loss": 0.1349, + "step": 34172 + }, + { + "epoch": 0.6095137873220847, + "grad_norm": 0.26535218954086304, + "learning_rate": 1.9842459346136273e-05, + "loss": 0.1896, + "step": 34173 + }, + { + "epoch": 0.6095316234437984, + "grad_norm": 0.3155516982078552, + "learning_rate": 1.9840936331443257e-05, + "loss": 0.1502, + "step": 34174 + }, + { + "epoch": 0.6095494595655121, + "grad_norm": 0.24373160302639008, + "learning_rate": 1.9839413336748557e-05, + "loss": 0.1228, + "step": 34175 + }, + { + "epoch": 0.6095672956872258, + "grad_norm": 0.2694872319698334, + "learning_rate": 1.983789036205806e-05, + "loss": 0.1657, + "step": 34176 + }, + { + "epoch": 0.6095851318089395, + "grad_norm": 0.2525143325328827, + "learning_rate": 1.98363674073777e-05, + "loss": 0.1219, + "step": 34177 + }, + { + "epoch": 0.6096029679306532, + "grad_norm": 0.3405756950378418, + "learning_rate": 1.9834844472713352e-05, + "loss": 0.186, + "step": 34178 + }, + { + "epoch": 0.6096208040523668, + "grad_norm": 0.16753199696540833, + "learning_rate": 1.983332155807094e-05, + "loss": 0.0856, + "step": 34179 + }, + { + "epoch": 0.6096386401740805, + "grad_norm": 0.21299999952316284, + "learning_rate": 1.9831798663456354e-05, + "loss": 0.1491, + "step": 34180 + }, + { + "epoch": 0.6096564762957942, + "grad_norm": 0.193110853433609, + "learning_rate": 1.9830275788875493e-05, + "loss": 0.0893, + "step": 34181 + }, + { + "epoch": 0.6096743124175079, + "grad_norm": 0.23878173530101776, + "learning_rate": 1.982875293433428e-05, + "loss": 0.1487, + "step": 34182 + }, + { + "epoch": 0.6096921485392216, + "grad_norm": 0.26100605726242065, + "learning_rate": 1.9827230099838606e-05, + "loss": 0.1289, + "step": 34183 + }, + { + "epoch": 0.6097099846609353, + "grad_norm": 0.27328264713287354, + "learning_rate": 1.9825707285394367e-05, + "loss": 0.1483, + "step": 34184 + }, + { + "epoch": 0.609727820782649, + "grad_norm": 0.30536216497421265, + "learning_rate": 1.982418449100748e-05, + "loss": 0.1395, + "step": 34185 + }, + { + "epoch": 0.6097456569043627, + "grad_norm": 0.27769824862480164, + "learning_rate": 1.982266171668384e-05, + "loss": 0.1278, + "step": 34186 + }, + { + "epoch": 0.6097634930260765, + "grad_norm": 0.1696663647890091, + "learning_rate": 1.982113896242935e-05, + "loss": 0.0707, + "step": 34187 + }, + { + "epoch": 0.6097813291477902, + "grad_norm": 0.2461685836315155, + "learning_rate": 1.981961622824992e-05, + "loss": 0.0882, + "step": 34188 + }, + { + "epoch": 0.6097991652695038, + "grad_norm": 0.3408156931400299, + "learning_rate": 1.9818093514151435e-05, + "loss": 0.0945, + "step": 34189 + }, + { + "epoch": 0.6098170013912175, + "grad_norm": 0.3597780466079712, + "learning_rate": 1.9816570820139818e-05, + "loss": 0.1091, + "step": 34190 + }, + { + "epoch": 0.6098348375129312, + "grad_norm": 0.1795509308576584, + "learning_rate": 1.9815048146220967e-05, + "loss": 0.1037, + "step": 34191 + }, + { + "epoch": 0.6098526736346449, + "grad_norm": 0.22486886382102966, + "learning_rate": 1.981352549240077e-05, + "loss": 0.0914, + "step": 34192 + }, + { + "epoch": 0.6098705097563586, + "grad_norm": 0.30734267830848694, + "learning_rate": 1.9812002858685147e-05, + "loss": 0.1615, + "step": 34193 + }, + { + "epoch": 0.6098883458780723, + "grad_norm": 0.2527114450931549, + "learning_rate": 1.9810480245079982e-05, + "loss": 0.1404, + "step": 34194 + }, + { + "epoch": 0.609906181999786, + "grad_norm": 0.23037639260292053, + "learning_rate": 1.98089576515912e-05, + "loss": 0.0911, + "step": 34195 + }, + { + "epoch": 0.6099240181214997, + "grad_norm": 0.24946050345897675, + "learning_rate": 1.980743507822469e-05, + "loss": 0.1245, + "step": 34196 + }, + { + "epoch": 0.6099418542432133, + "grad_norm": 0.2214963585138321, + "learning_rate": 1.9805912524986354e-05, + "loss": 0.129, + "step": 34197 + }, + { + "epoch": 0.609959690364927, + "grad_norm": 0.22918926179409027, + "learning_rate": 1.9804389991882086e-05, + "loss": 0.1153, + "step": 34198 + }, + { + "epoch": 0.6099775264866407, + "grad_norm": 0.35305845737457275, + "learning_rate": 1.98028674789178e-05, + "loss": 0.115, + "step": 34199 + }, + { + "epoch": 0.6099953626083544, + "grad_norm": 0.2801823318004608, + "learning_rate": 1.9801344986099403e-05, + "loss": 0.1569, + "step": 34200 + }, + { + "epoch": 0.6100131987300681, + "grad_norm": 0.22940704226493835, + "learning_rate": 1.9799822513432785e-05, + "loss": 0.1385, + "step": 34201 + }, + { + "epoch": 0.6100310348517818, + "grad_norm": 0.34577682614326477, + "learning_rate": 1.9798300060923847e-05, + "loss": 0.1282, + "step": 34202 + }, + { + "epoch": 0.6100488709734955, + "grad_norm": 0.2817285358905792, + "learning_rate": 1.97967776285785e-05, + "loss": 0.1624, + "step": 34203 + }, + { + "epoch": 0.6100667070952093, + "grad_norm": 0.3031929135322571, + "learning_rate": 1.9795255216402637e-05, + "loss": 0.1474, + "step": 34204 + }, + { + "epoch": 0.610084543216923, + "grad_norm": 0.19765150547027588, + "learning_rate": 1.9793732824402166e-05, + "loss": 0.1099, + "step": 34205 + }, + { + "epoch": 0.6101023793386366, + "grad_norm": 0.31663721799850464, + "learning_rate": 1.9792210452582983e-05, + "loss": 0.1244, + "step": 34206 + }, + { + "epoch": 0.6101202154603503, + "grad_norm": 0.2439422607421875, + "learning_rate": 1.9790688100950983e-05, + "loss": 0.1514, + "step": 34207 + }, + { + "epoch": 0.610138051582064, + "grad_norm": 0.2734171450138092, + "learning_rate": 1.9789165769512083e-05, + "loss": 0.1354, + "step": 34208 + }, + { + "epoch": 0.6101558877037777, + "grad_norm": 0.23721948266029358, + "learning_rate": 1.9787643458272178e-05, + "loss": 0.1209, + "step": 34209 + }, + { + "epoch": 0.6101737238254914, + "grad_norm": 0.2097400724887848, + "learning_rate": 1.978612116723717e-05, + "loss": 0.1291, + "step": 34210 + }, + { + "epoch": 0.6101915599472051, + "grad_norm": 0.23914022743701935, + "learning_rate": 1.9784598896412943e-05, + "loss": 0.1679, + "step": 34211 + }, + { + "epoch": 0.6102093960689188, + "grad_norm": 0.27593016624450684, + "learning_rate": 1.9783076645805422e-05, + "loss": 0.1552, + "step": 34212 + }, + { + "epoch": 0.6102272321906325, + "grad_norm": 0.27197569608688354, + "learning_rate": 1.9781554415420493e-05, + "loss": 0.127, + "step": 34213 + }, + { + "epoch": 0.6102450683123462, + "grad_norm": 0.22706715762615204, + "learning_rate": 1.978003220526407e-05, + "loss": 0.1118, + "step": 34214 + }, + { + "epoch": 0.6102629044340598, + "grad_norm": 0.3037537634372711, + "learning_rate": 1.9778510015342034e-05, + "loss": 0.1413, + "step": 34215 + }, + { + "epoch": 0.6102807405557735, + "grad_norm": 0.2389712780714035, + "learning_rate": 1.9776987845660305e-05, + "loss": 0.1133, + "step": 34216 + }, + { + "epoch": 0.6102985766774872, + "grad_norm": 0.23319010436534882, + "learning_rate": 1.9775465696224777e-05, + "loss": 0.1128, + "step": 34217 + }, + { + "epoch": 0.6103164127992009, + "grad_norm": 0.2975747585296631, + "learning_rate": 1.9773943567041346e-05, + "loss": 0.0963, + "step": 34218 + }, + { + "epoch": 0.6103342489209146, + "grad_norm": 0.25807785987854004, + "learning_rate": 1.977242145811591e-05, + "loss": 0.1057, + "step": 34219 + }, + { + "epoch": 0.6103520850426283, + "grad_norm": 0.24133466184139252, + "learning_rate": 1.9770899369454377e-05, + "loss": 0.1077, + "step": 34220 + }, + { + "epoch": 0.6103699211643421, + "grad_norm": 0.19542334973812103, + "learning_rate": 1.9769377301062643e-05, + "loss": 0.0914, + "step": 34221 + }, + { + "epoch": 0.6103877572860558, + "grad_norm": 0.3509823977947235, + "learning_rate": 1.976785525294661e-05, + "loss": 0.1649, + "step": 34222 + }, + { + "epoch": 0.6104055934077695, + "grad_norm": 0.27924787998199463, + "learning_rate": 1.9766333225112183e-05, + "loss": 0.0883, + "step": 34223 + }, + { + "epoch": 0.6104234295294831, + "grad_norm": 0.31095796823501587, + "learning_rate": 1.976481121756524e-05, + "loss": 0.1443, + "step": 34224 + }, + { + "epoch": 0.6104412656511968, + "grad_norm": 0.29152435064315796, + "learning_rate": 1.9763289230311712e-05, + "loss": 0.0874, + "step": 34225 + }, + { + "epoch": 0.6104591017729105, + "grad_norm": 0.2683859169483185, + "learning_rate": 1.976176726335748e-05, + "loss": 0.127, + "step": 34226 + }, + { + "epoch": 0.6104769378946242, + "grad_norm": 0.26801446080207825, + "learning_rate": 1.9760245316708445e-05, + "loss": 0.1143, + "step": 34227 + }, + { + "epoch": 0.6104947740163379, + "grad_norm": 0.27824270725250244, + "learning_rate": 1.9758723390370503e-05, + "loss": 0.1502, + "step": 34228 + }, + { + "epoch": 0.6105126101380516, + "grad_norm": 0.3395063281059265, + "learning_rate": 1.9757201484349567e-05, + "loss": 0.1607, + "step": 34229 + }, + { + "epoch": 0.6105304462597653, + "grad_norm": 0.23764997720718384, + "learning_rate": 1.9755679598651534e-05, + "loss": 0.1029, + "step": 34230 + }, + { + "epoch": 0.610548282381479, + "grad_norm": 0.4214462637901306, + "learning_rate": 1.9754157733282298e-05, + "loss": 0.1329, + "step": 34231 + }, + { + "epoch": 0.6105661185031926, + "grad_norm": 0.3583911955356598, + "learning_rate": 1.975263588824775e-05, + "loss": 0.1016, + "step": 34232 + }, + { + "epoch": 0.6105839546249063, + "grad_norm": 0.25084757804870605, + "learning_rate": 1.97511140635538e-05, + "loss": 0.1512, + "step": 34233 + }, + { + "epoch": 0.61060179074662, + "grad_norm": 0.28068971633911133, + "learning_rate": 1.974959225920634e-05, + "loss": 0.1105, + "step": 34234 + }, + { + "epoch": 0.6106196268683337, + "grad_norm": 0.2599231004714966, + "learning_rate": 1.9748070475211283e-05, + "loss": 0.1225, + "step": 34235 + }, + { + "epoch": 0.6106374629900474, + "grad_norm": 0.2763570249080658, + "learning_rate": 1.974654871157452e-05, + "loss": 0.1211, + "step": 34236 + }, + { + "epoch": 0.6106552991117611, + "grad_norm": 0.19660866260528564, + "learning_rate": 1.9745026968301935e-05, + "loss": 0.1044, + "step": 34237 + }, + { + "epoch": 0.6106731352334749, + "grad_norm": 0.25641003251075745, + "learning_rate": 1.9743505245399453e-05, + "loss": 0.1464, + "step": 34238 + }, + { + "epoch": 0.6106909713551886, + "grad_norm": 0.26230141520500183, + "learning_rate": 1.9741983542872962e-05, + "loss": 0.1119, + "step": 34239 + }, + { + "epoch": 0.6107088074769023, + "grad_norm": 0.24544131755828857, + "learning_rate": 1.974046186072835e-05, + "loss": 0.1611, + "step": 34240 + }, + { + "epoch": 0.610726643598616, + "grad_norm": 0.21997612714767456, + "learning_rate": 1.9738940198971527e-05, + "loss": 0.1438, + "step": 34241 + }, + { + "epoch": 0.6107444797203296, + "grad_norm": 0.3547186851501465, + "learning_rate": 1.9737418557608387e-05, + "loss": 0.1408, + "step": 34242 + }, + { + "epoch": 0.6107623158420433, + "grad_norm": 0.2614040970802307, + "learning_rate": 1.9735896936644836e-05, + "loss": 0.1023, + "step": 34243 + }, + { + "epoch": 0.610780151963757, + "grad_norm": 0.31797677278518677, + "learning_rate": 1.9734375336086766e-05, + "loss": 0.1389, + "step": 34244 + }, + { + "epoch": 0.6107979880854707, + "grad_norm": 0.29432475566864014, + "learning_rate": 1.973285375594008e-05, + "loss": 0.1878, + "step": 34245 + }, + { + "epoch": 0.6108158242071844, + "grad_norm": 0.22685424983501434, + "learning_rate": 1.9731332196210655e-05, + "loss": 0.1235, + "step": 34246 + }, + { + "epoch": 0.6108336603288981, + "grad_norm": 0.21128126978874207, + "learning_rate": 1.9729810656904418e-05, + "loss": 0.1546, + "step": 34247 + }, + { + "epoch": 0.6108514964506118, + "grad_norm": 0.3172430694103241, + "learning_rate": 1.9728289138027253e-05, + "loss": 0.1062, + "step": 34248 + }, + { + "epoch": 0.6108693325723255, + "grad_norm": 0.33118540048599243, + "learning_rate": 1.9726767639585063e-05, + "loss": 0.1491, + "step": 34249 + }, + { + "epoch": 0.6108871686940391, + "grad_norm": 0.21668967604637146, + "learning_rate": 1.9725246161583737e-05, + "loss": 0.1533, + "step": 34250 + }, + { + "epoch": 0.6109050048157528, + "grad_norm": 0.2716294229030609, + "learning_rate": 1.9723724704029182e-05, + "loss": 0.147, + "step": 34251 + }, + { + "epoch": 0.6109228409374665, + "grad_norm": 0.31951966881752014, + "learning_rate": 1.9722203266927296e-05, + "loss": 0.1432, + "step": 34252 + }, + { + "epoch": 0.6109406770591802, + "grad_norm": 0.25519925355911255, + "learning_rate": 1.972068185028397e-05, + "loss": 0.1288, + "step": 34253 + }, + { + "epoch": 0.6109585131808939, + "grad_norm": 0.24568498134613037, + "learning_rate": 1.97191604541051e-05, + "loss": 0.0943, + "step": 34254 + }, + { + "epoch": 0.6109763493026077, + "grad_norm": 0.24958018958568573, + "learning_rate": 1.9717639078396595e-05, + "loss": 0.1045, + "step": 34255 + }, + { + "epoch": 0.6109941854243214, + "grad_norm": 0.34653839468955994, + "learning_rate": 1.9716117723164346e-05, + "loss": 0.1455, + "step": 34256 + }, + { + "epoch": 0.6110120215460351, + "grad_norm": 0.2604621350765228, + "learning_rate": 1.9714596388414248e-05, + "loss": 0.1176, + "step": 34257 + }, + { + "epoch": 0.6110298576677488, + "grad_norm": 0.22539140284061432, + "learning_rate": 1.9713075074152203e-05, + "loss": 0.1233, + "step": 34258 + }, + { + "epoch": 0.6110476937894624, + "grad_norm": 0.29068025946617126, + "learning_rate": 1.9711553780384093e-05, + "loss": 0.1277, + "step": 34259 + }, + { + "epoch": 0.6110655299111761, + "grad_norm": 0.2469399869441986, + "learning_rate": 1.9710032507115837e-05, + "loss": 0.123, + "step": 34260 + }, + { + "epoch": 0.6110833660328898, + "grad_norm": 0.3095710277557373, + "learning_rate": 1.9708511254353318e-05, + "loss": 0.1083, + "step": 34261 + }, + { + "epoch": 0.6111012021546035, + "grad_norm": 0.23690742254257202, + "learning_rate": 1.9706990022102443e-05, + "loss": 0.1661, + "step": 34262 + }, + { + "epoch": 0.6111190382763172, + "grad_norm": 0.32723286747932434, + "learning_rate": 1.970546881036909e-05, + "loss": 0.123, + "step": 34263 + }, + { + "epoch": 0.6111368743980309, + "grad_norm": 0.28426942229270935, + "learning_rate": 1.970394761915918e-05, + "loss": 0.1627, + "step": 34264 + }, + { + "epoch": 0.6111547105197446, + "grad_norm": 0.23602648079395294, + "learning_rate": 1.97024264484786e-05, + "loss": 0.1219, + "step": 34265 + }, + { + "epoch": 0.6111725466414583, + "grad_norm": 0.26101139187812805, + "learning_rate": 1.9700905298333244e-05, + "loss": 0.1462, + "step": 34266 + }, + { + "epoch": 0.611190382763172, + "grad_norm": 0.23656578361988068, + "learning_rate": 1.9699384168729e-05, + "loss": 0.0578, + "step": 34267 + }, + { + "epoch": 0.6112082188848856, + "grad_norm": 0.2915533483028412, + "learning_rate": 1.9697863059671783e-05, + "loss": 0.1806, + "step": 34268 + }, + { + "epoch": 0.6112260550065993, + "grad_norm": 0.2514471411705017, + "learning_rate": 1.9696341971167474e-05, + "loss": 0.1252, + "step": 34269 + }, + { + "epoch": 0.611243891128313, + "grad_norm": 0.228690505027771, + "learning_rate": 1.9694820903221977e-05, + "loss": 0.1373, + "step": 34270 + }, + { + "epoch": 0.6112617272500268, + "grad_norm": 0.32270339131355286, + "learning_rate": 1.9693299855841193e-05, + "loss": 0.1129, + "step": 34271 + }, + { + "epoch": 0.6112795633717405, + "grad_norm": 0.31586652994155884, + "learning_rate": 1.9691778829031e-05, + "loss": 0.1563, + "step": 34272 + }, + { + "epoch": 0.6112973994934542, + "grad_norm": 0.2652791440486908, + "learning_rate": 1.9690257822797315e-05, + "loss": 0.1156, + "step": 34273 + }, + { + "epoch": 0.6113152356151679, + "grad_norm": 0.2195061594247818, + "learning_rate": 1.9688736837146025e-05, + "loss": 0.0932, + "step": 34274 + }, + { + "epoch": 0.6113330717368816, + "grad_norm": 0.35406753420829773, + "learning_rate": 1.9687215872083016e-05, + "loss": 0.1623, + "step": 34275 + }, + { + "epoch": 0.6113509078585952, + "grad_norm": 0.2465466558933258, + "learning_rate": 1.9685694927614194e-05, + "loss": 0.1597, + "step": 34276 + }, + { + "epoch": 0.6113687439803089, + "grad_norm": 0.23225544393062592, + "learning_rate": 1.968417400374546e-05, + "loss": 0.1112, + "step": 34277 + }, + { + "epoch": 0.6113865801020226, + "grad_norm": 0.25865229964256287, + "learning_rate": 1.9682653100482707e-05, + "loss": 0.1227, + "step": 34278 + }, + { + "epoch": 0.6114044162237363, + "grad_norm": 0.24608168005943298, + "learning_rate": 1.9681132217831827e-05, + "loss": 0.1574, + "step": 34279 + }, + { + "epoch": 0.61142225234545, + "grad_norm": 0.22632557153701782, + "learning_rate": 1.96796113557987e-05, + "loss": 0.1078, + "step": 34280 + }, + { + "epoch": 0.6114400884671637, + "grad_norm": 0.23244720697402954, + "learning_rate": 1.9678090514389255e-05, + "loss": 0.0939, + "step": 34281 + }, + { + "epoch": 0.6114579245888774, + "grad_norm": 0.26269736886024475, + "learning_rate": 1.967656969360936e-05, + "loss": 0.1544, + "step": 34282 + }, + { + "epoch": 0.6114757607105911, + "grad_norm": 0.19003522396087646, + "learning_rate": 1.9675048893464927e-05, + "loss": 0.0948, + "step": 34283 + }, + { + "epoch": 0.6114935968323048, + "grad_norm": 0.30170726776123047, + "learning_rate": 1.967352811396184e-05, + "loss": 0.1636, + "step": 34284 + }, + { + "epoch": 0.6115114329540184, + "grad_norm": 0.3415099084377289, + "learning_rate": 1.967200735510599e-05, + "loss": 0.1257, + "step": 34285 + }, + { + "epoch": 0.6115292690757321, + "grad_norm": 0.34446874260902405, + "learning_rate": 1.967048661690329e-05, + "loss": 0.188, + "step": 34286 + }, + { + "epoch": 0.6115471051974458, + "grad_norm": 0.21127624809741974, + "learning_rate": 1.9668965899359622e-05, + "loss": 0.1457, + "step": 34287 + }, + { + "epoch": 0.6115649413191596, + "grad_norm": 0.2781338095664978, + "learning_rate": 1.966744520248088e-05, + "loss": 0.1149, + "step": 34288 + }, + { + "epoch": 0.6115827774408733, + "grad_norm": 0.2464647889137268, + "learning_rate": 1.9665924526272964e-05, + "loss": 0.0987, + "step": 34289 + }, + { + "epoch": 0.611600613562587, + "grad_norm": 0.364133358001709, + "learning_rate": 1.9664403870741765e-05, + "loss": 0.1304, + "step": 34290 + }, + { + "epoch": 0.6116184496843007, + "grad_norm": 0.21656657755374908, + "learning_rate": 1.9662883235893185e-05, + "loss": 0.1182, + "step": 34291 + }, + { + "epoch": 0.6116362858060144, + "grad_norm": 0.23903073370456696, + "learning_rate": 1.966136262173311e-05, + "loss": 0.1361, + "step": 34292 + }, + { + "epoch": 0.6116541219277281, + "grad_norm": 0.213217630982399, + "learning_rate": 1.9659842028267433e-05, + "loss": 0.112, + "step": 34293 + }, + { + "epoch": 0.6116719580494417, + "grad_norm": 0.28249478340148926, + "learning_rate": 1.965832145550206e-05, + "loss": 0.1917, + "step": 34294 + }, + { + "epoch": 0.6116897941711554, + "grad_norm": 0.26392725110054016, + "learning_rate": 1.965680090344288e-05, + "loss": 0.1329, + "step": 34295 + }, + { + "epoch": 0.6117076302928691, + "grad_norm": 0.23368078470230103, + "learning_rate": 1.9655280372095773e-05, + "loss": 0.0913, + "step": 34296 + }, + { + "epoch": 0.6117254664145828, + "grad_norm": 0.3019675612449646, + "learning_rate": 1.965375986146666e-05, + "loss": 0.103, + "step": 34297 + }, + { + "epoch": 0.6117433025362965, + "grad_norm": 0.2722953259944916, + "learning_rate": 1.9652239371561405e-05, + "loss": 0.1641, + "step": 34298 + }, + { + "epoch": 0.6117611386580102, + "grad_norm": 0.22746914625167847, + "learning_rate": 1.9650718902385924e-05, + "loss": 0.0903, + "step": 34299 + }, + { + "epoch": 0.6117789747797239, + "grad_norm": 0.27003300189971924, + "learning_rate": 1.964919845394611e-05, + "loss": 0.1058, + "step": 34300 + }, + { + "epoch": 0.6117968109014376, + "grad_norm": 0.2875027656555176, + "learning_rate": 1.964767802624785e-05, + "loss": 0.1324, + "step": 34301 + }, + { + "epoch": 0.6118146470231512, + "grad_norm": 0.20354218780994415, + "learning_rate": 1.9646157619297027e-05, + "loss": 0.1125, + "step": 34302 + }, + { + "epoch": 0.6118324831448649, + "grad_norm": 0.35294947028160095, + "learning_rate": 1.964463723309955e-05, + "loss": 0.1317, + "step": 34303 + }, + { + "epoch": 0.6118503192665786, + "grad_norm": 0.26902082562446594, + "learning_rate": 1.964311686766132e-05, + "loss": 0.1115, + "step": 34304 + }, + { + "epoch": 0.6118681553882924, + "grad_norm": 0.1982295662164688, + "learning_rate": 1.9641596522988212e-05, + "loss": 0.1012, + "step": 34305 + }, + { + "epoch": 0.6118859915100061, + "grad_norm": 0.23082873225212097, + "learning_rate": 1.964007619908612e-05, + "loss": 0.0953, + "step": 34306 + }, + { + "epoch": 0.6119038276317198, + "grad_norm": 0.26585277915000916, + "learning_rate": 1.9638555895960954e-05, + "loss": 0.1138, + "step": 34307 + }, + { + "epoch": 0.6119216637534335, + "grad_norm": 0.34653255343437195, + "learning_rate": 1.9637035613618596e-05, + "loss": 0.1574, + "step": 34308 + }, + { + "epoch": 0.6119394998751472, + "grad_norm": 0.37732672691345215, + "learning_rate": 1.9635515352064934e-05, + "loss": 0.1242, + "step": 34309 + }, + { + "epoch": 0.6119573359968609, + "grad_norm": 0.2179662585258484, + "learning_rate": 1.9633995111305874e-05, + "loss": 0.0796, + "step": 34310 + }, + { + "epoch": 0.6119751721185746, + "grad_norm": 0.3336751163005829, + "learning_rate": 1.9632474891347293e-05, + "loss": 0.1132, + "step": 34311 + }, + { + "epoch": 0.6119930082402882, + "grad_norm": 0.2968040108680725, + "learning_rate": 1.9630954692195104e-05, + "loss": 0.0928, + "step": 34312 + }, + { + "epoch": 0.6120108443620019, + "grad_norm": 0.269959956407547, + "learning_rate": 1.9629434513855185e-05, + "loss": 0.103, + "step": 34313 + }, + { + "epoch": 0.6120286804837156, + "grad_norm": 0.26234179735183716, + "learning_rate": 1.962791435633344e-05, + "loss": 0.1198, + "step": 34314 + }, + { + "epoch": 0.6120465166054293, + "grad_norm": 0.22992199659347534, + "learning_rate": 1.9626394219635734e-05, + "loss": 0.1624, + "step": 34315 + }, + { + "epoch": 0.612064352727143, + "grad_norm": 0.2733350992202759, + "learning_rate": 1.9624874103768e-05, + "loss": 0.0902, + "step": 34316 + }, + { + "epoch": 0.6120821888488567, + "grad_norm": 0.18465390801429749, + "learning_rate": 1.96233540087361e-05, + "loss": 0.1095, + "step": 34317 + }, + { + "epoch": 0.6121000249705704, + "grad_norm": 0.270947128534317, + "learning_rate": 1.962183393454594e-05, + "loss": 0.1048, + "step": 34318 + }, + { + "epoch": 0.612117861092284, + "grad_norm": 0.20678196847438812, + "learning_rate": 1.9620313881203406e-05, + "loss": 0.1061, + "step": 34319 + }, + { + "epoch": 0.6121356972139977, + "grad_norm": 0.34201112389564514, + "learning_rate": 1.96187938487144e-05, + "loss": 0.1642, + "step": 34320 + }, + { + "epoch": 0.6121535333357114, + "grad_norm": 0.27690356969833374, + "learning_rate": 1.961727383708481e-05, + "loss": 0.1429, + "step": 34321 + }, + { + "epoch": 0.6121713694574252, + "grad_norm": 0.26979929208755493, + "learning_rate": 1.961575384632052e-05, + "loss": 0.1395, + "step": 34322 + }, + { + "epoch": 0.6121892055791389, + "grad_norm": 0.29312068223953247, + "learning_rate": 1.9614233876427423e-05, + "loss": 0.1107, + "step": 34323 + }, + { + "epoch": 0.6122070417008526, + "grad_norm": 0.28420430421829224, + "learning_rate": 1.961271392741142e-05, + "loss": 0.1312, + "step": 34324 + }, + { + "epoch": 0.6122248778225663, + "grad_norm": 0.3592130243778229, + "learning_rate": 1.96111939992784e-05, + "loss": 0.1331, + "step": 34325 + }, + { + "epoch": 0.61224271394428, + "grad_norm": 0.24376356601715088, + "learning_rate": 1.960967409203425e-05, + "loss": 0.0853, + "step": 34326 + }, + { + "epoch": 0.6122605500659937, + "grad_norm": 0.2419992834329605, + "learning_rate": 1.9608154205684876e-05, + "loss": 0.0771, + "step": 34327 + }, + { + "epoch": 0.6122783861877074, + "grad_norm": 0.27290013432502747, + "learning_rate": 1.960663434023614e-05, + "loss": 0.1363, + "step": 34328 + }, + { + "epoch": 0.612296222309421, + "grad_norm": 0.22209122776985168, + "learning_rate": 1.9605114495693965e-05, + "loss": 0.1201, + "step": 34329 + }, + { + "epoch": 0.6123140584311347, + "grad_norm": 0.2678241729736328, + "learning_rate": 1.9603594672064225e-05, + "loss": 0.1493, + "step": 34330 + }, + { + "epoch": 0.6123318945528484, + "grad_norm": 0.21880501508712769, + "learning_rate": 1.960207486935282e-05, + "loss": 0.1167, + "step": 34331 + }, + { + "epoch": 0.6123497306745621, + "grad_norm": 0.25183209776878357, + "learning_rate": 1.960055508756563e-05, + "loss": 0.1271, + "step": 34332 + }, + { + "epoch": 0.6123675667962758, + "grad_norm": 0.30424416065216064, + "learning_rate": 1.959903532670856e-05, + "loss": 0.131, + "step": 34333 + }, + { + "epoch": 0.6123854029179895, + "grad_norm": 0.32900524139404297, + "learning_rate": 1.9597515586787496e-05, + "loss": 0.1343, + "step": 34334 + }, + { + "epoch": 0.6124032390397032, + "grad_norm": 0.25981438159942627, + "learning_rate": 1.9595995867808324e-05, + "loss": 0.1148, + "step": 34335 + }, + { + "epoch": 0.6124210751614169, + "grad_norm": 0.26695725321769714, + "learning_rate": 1.959447616977694e-05, + "loss": 0.1253, + "step": 34336 + }, + { + "epoch": 0.6124389112831305, + "grad_norm": 0.277535080909729, + "learning_rate": 1.959295649269923e-05, + "loss": 0.1613, + "step": 34337 + }, + { + "epoch": 0.6124567474048442, + "grad_norm": 0.336956262588501, + "learning_rate": 1.9591436836581088e-05, + "loss": 0.1542, + "step": 34338 + }, + { + "epoch": 0.612474583526558, + "grad_norm": 0.20411263406276703, + "learning_rate": 1.958991720142841e-05, + "loss": 0.1426, + "step": 34339 + }, + { + "epoch": 0.6124924196482717, + "grad_norm": 0.2600489854812622, + "learning_rate": 1.958839758724708e-05, + "loss": 0.1099, + "step": 34340 + }, + { + "epoch": 0.6125102557699854, + "grad_norm": 0.33901867270469666, + "learning_rate": 1.9586877994042984e-05, + "loss": 0.1769, + "step": 34341 + }, + { + "epoch": 0.6125280918916991, + "grad_norm": 0.2957024872303009, + "learning_rate": 1.9585358421822024e-05, + "loss": 0.1391, + "step": 34342 + }, + { + "epoch": 0.6125459280134128, + "grad_norm": 0.27155688405036926, + "learning_rate": 1.9583838870590087e-05, + "loss": 0.0925, + "step": 34343 + }, + { + "epoch": 0.6125637641351265, + "grad_norm": 0.4036776125431061, + "learning_rate": 1.958231934035306e-05, + "loss": 0.2128, + "step": 34344 + }, + { + "epoch": 0.6125816002568402, + "grad_norm": 0.2016754448413849, + "learning_rate": 1.9580799831116827e-05, + "loss": 0.0806, + "step": 34345 + }, + { + "epoch": 0.6125994363785539, + "grad_norm": 0.29734769463539124, + "learning_rate": 1.957928034288729e-05, + "loss": 0.1161, + "step": 34346 + }, + { + "epoch": 0.6126172725002675, + "grad_norm": 0.3466891348361969, + "learning_rate": 1.957776087567034e-05, + "loss": 0.1477, + "step": 34347 + }, + { + "epoch": 0.6126351086219812, + "grad_norm": 0.2298237383365631, + "learning_rate": 1.957624142947186e-05, + "loss": 0.1321, + "step": 34348 + }, + { + "epoch": 0.6126529447436949, + "grad_norm": 0.28484708070755005, + "learning_rate": 1.9574722004297745e-05, + "loss": 0.1147, + "step": 34349 + }, + { + "epoch": 0.6126707808654086, + "grad_norm": 0.38168346881866455, + "learning_rate": 1.9573202600153868e-05, + "loss": 0.1607, + "step": 34350 + }, + { + "epoch": 0.6126886169871223, + "grad_norm": 0.38014307618141174, + "learning_rate": 1.9571683217046143e-05, + "loss": 0.1344, + "step": 34351 + }, + { + "epoch": 0.612706453108836, + "grad_norm": 0.1755140721797943, + "learning_rate": 1.957016385498044e-05, + "loss": 0.1133, + "step": 34352 + }, + { + "epoch": 0.6127242892305497, + "grad_norm": 0.2478196620941162, + "learning_rate": 1.9568644513962667e-05, + "loss": 0.1372, + "step": 34353 + }, + { + "epoch": 0.6127421253522634, + "grad_norm": 0.2102089375257492, + "learning_rate": 1.9567125193998693e-05, + "loss": 0.1131, + "step": 34354 + }, + { + "epoch": 0.612759961473977, + "grad_norm": 0.29458630084991455, + "learning_rate": 1.956560589509443e-05, + "loss": 0.1728, + "step": 34355 + }, + { + "epoch": 0.6127777975956908, + "grad_norm": 0.2509872317314148, + "learning_rate": 1.956408661725575e-05, + "loss": 0.1361, + "step": 34356 + }, + { + "epoch": 0.6127956337174045, + "grad_norm": 0.1840127557516098, + "learning_rate": 1.9562567360488546e-05, + "loss": 0.0811, + "step": 34357 + }, + { + "epoch": 0.6128134698391182, + "grad_norm": 0.2356569468975067, + "learning_rate": 1.956104812479871e-05, + "loss": 0.1137, + "step": 34358 + }, + { + "epoch": 0.6128313059608319, + "grad_norm": 0.3009377717971802, + "learning_rate": 1.9559528910192126e-05, + "loss": 0.1735, + "step": 34359 + }, + { + "epoch": 0.6128491420825456, + "grad_norm": 0.24567048251628876, + "learning_rate": 1.9558009716674698e-05, + "loss": 0.1497, + "step": 34360 + }, + { + "epoch": 0.6128669782042593, + "grad_norm": 0.29931673407554626, + "learning_rate": 1.9556490544252297e-05, + "loss": 0.1251, + "step": 34361 + }, + { + "epoch": 0.612884814325973, + "grad_norm": 0.23354960978031158, + "learning_rate": 1.9554971392930828e-05, + "loss": 0.1071, + "step": 34362 + }, + { + "epoch": 0.6129026504476867, + "grad_norm": 0.25356778502464294, + "learning_rate": 1.9553452262716153e-05, + "loss": 0.1515, + "step": 34363 + }, + { + "epoch": 0.6129204865694003, + "grad_norm": 0.32689082622528076, + "learning_rate": 1.955193315361419e-05, + "loss": 0.1667, + "step": 34364 + }, + { + "epoch": 0.612938322691114, + "grad_norm": 0.2577555179595947, + "learning_rate": 1.9550414065630813e-05, + "loss": 0.1415, + "step": 34365 + }, + { + "epoch": 0.6129561588128277, + "grad_norm": 0.2391408085823059, + "learning_rate": 1.9548894998771916e-05, + "loss": 0.1365, + "step": 34366 + }, + { + "epoch": 0.6129739949345414, + "grad_norm": 0.3446679413318634, + "learning_rate": 1.9547375953043373e-05, + "loss": 0.1255, + "step": 34367 + }, + { + "epoch": 0.6129918310562551, + "grad_norm": 0.28184783458709717, + "learning_rate": 1.9545856928451098e-05, + "loss": 0.1754, + "step": 34368 + }, + { + "epoch": 0.6130096671779688, + "grad_norm": 0.2451029121875763, + "learning_rate": 1.9544337925000964e-05, + "loss": 0.144, + "step": 34369 + }, + { + "epoch": 0.6130275032996825, + "grad_norm": 0.24050399661064148, + "learning_rate": 1.954281894269886e-05, + "loss": 0.1273, + "step": 34370 + }, + { + "epoch": 0.6130453394213962, + "grad_norm": 0.32015708088874817, + "learning_rate": 1.9541299981550666e-05, + "loss": 0.1199, + "step": 34371 + }, + { + "epoch": 0.61306317554311, + "grad_norm": 0.23027445375919342, + "learning_rate": 1.9539781041562284e-05, + "loss": 0.1209, + "step": 34372 + }, + { + "epoch": 0.6130810116648236, + "grad_norm": 0.21623660624027252, + "learning_rate": 1.9538262122739596e-05, + "loss": 0.1234, + "step": 34373 + }, + { + "epoch": 0.6130988477865373, + "grad_norm": 0.3044774532318115, + "learning_rate": 1.9536743225088496e-05, + "loss": 0.1246, + "step": 34374 + }, + { + "epoch": 0.613116683908251, + "grad_norm": 0.267084002494812, + "learning_rate": 1.9535224348614862e-05, + "loss": 0.1039, + "step": 34375 + }, + { + "epoch": 0.6131345200299647, + "grad_norm": 0.32404825091362, + "learning_rate": 1.953370549332458e-05, + "loss": 0.1614, + "step": 34376 + }, + { + "epoch": 0.6131523561516784, + "grad_norm": 0.3082652986049652, + "learning_rate": 1.953218665922355e-05, + "loss": 0.1478, + "step": 34377 + }, + { + "epoch": 0.6131701922733921, + "grad_norm": 0.22887858748435974, + "learning_rate": 1.9530667846317657e-05, + "loss": 0.0915, + "step": 34378 + }, + { + "epoch": 0.6131880283951058, + "grad_norm": 0.34719282388687134, + "learning_rate": 1.9529149054612778e-05, + "loss": 0.1594, + "step": 34379 + }, + { + "epoch": 0.6132058645168195, + "grad_norm": 0.2712959945201874, + "learning_rate": 1.95276302841148e-05, + "loss": 0.1132, + "step": 34380 + }, + { + "epoch": 0.6132237006385332, + "grad_norm": 0.1935320645570755, + "learning_rate": 1.952611153482963e-05, + "loss": 0.1423, + "step": 34381 + }, + { + "epoch": 0.6132415367602468, + "grad_norm": 0.6655855774879456, + "learning_rate": 1.952459280676314e-05, + "loss": 0.1128, + "step": 34382 + }, + { + "epoch": 0.6132593728819605, + "grad_norm": 0.2346189171075821, + "learning_rate": 1.9523074099921223e-05, + "loss": 0.1043, + "step": 34383 + }, + { + "epoch": 0.6132772090036742, + "grad_norm": 0.28837835788726807, + "learning_rate": 1.9521555414309747e-05, + "loss": 0.1335, + "step": 34384 + }, + { + "epoch": 0.6132950451253879, + "grad_norm": 0.25568902492523193, + "learning_rate": 1.9520036749934628e-05, + "loss": 0.149, + "step": 34385 + }, + { + "epoch": 0.6133128812471016, + "grad_norm": 0.2111838310956955, + "learning_rate": 1.9518518106801733e-05, + "loss": 0.0832, + "step": 34386 + }, + { + "epoch": 0.6133307173688153, + "grad_norm": 0.23745548725128174, + "learning_rate": 1.951699948491696e-05, + "loss": 0.1274, + "step": 34387 + }, + { + "epoch": 0.613348553490529, + "grad_norm": 0.24505575001239777, + "learning_rate": 1.951548088428619e-05, + "loss": 0.1125, + "step": 34388 + }, + { + "epoch": 0.6133663896122428, + "grad_norm": 0.28381475806236267, + "learning_rate": 1.9513962304915302e-05, + "loss": 0.1155, + "step": 34389 + }, + { + "epoch": 0.6133842257339565, + "grad_norm": 0.2749321162700653, + "learning_rate": 1.9512443746810204e-05, + "loss": 0.1905, + "step": 34390 + }, + { + "epoch": 0.6134020618556701, + "grad_norm": 0.27395883202552795, + "learning_rate": 1.9510925209976762e-05, + "loss": 0.1436, + "step": 34391 + }, + { + "epoch": 0.6134198979773838, + "grad_norm": 0.35737577080726624, + "learning_rate": 1.950940669442087e-05, + "loss": 0.1411, + "step": 34392 + }, + { + "epoch": 0.6134377340990975, + "grad_norm": 0.30439358949661255, + "learning_rate": 1.9507888200148414e-05, + "loss": 0.1414, + "step": 34393 + }, + { + "epoch": 0.6134555702208112, + "grad_norm": 0.31353920698165894, + "learning_rate": 1.9506369727165278e-05, + "loss": 0.1468, + "step": 34394 + }, + { + "epoch": 0.6134734063425249, + "grad_norm": 0.24509234726428986, + "learning_rate": 1.9504851275477357e-05, + "loss": 0.0892, + "step": 34395 + }, + { + "epoch": 0.6134912424642386, + "grad_norm": 0.2553241550922394, + "learning_rate": 1.950333284509053e-05, + "loss": 0.1261, + "step": 34396 + }, + { + "epoch": 0.6135090785859523, + "grad_norm": 0.32922300696372986, + "learning_rate": 1.9501814436010672e-05, + "loss": 0.1199, + "step": 34397 + }, + { + "epoch": 0.613526914707666, + "grad_norm": 0.31108734011650085, + "learning_rate": 1.950029604824369e-05, + "loss": 0.1462, + "step": 34398 + }, + { + "epoch": 0.6135447508293796, + "grad_norm": 0.1704423576593399, + "learning_rate": 1.949877768179546e-05, + "loss": 0.1186, + "step": 34399 + }, + { + "epoch": 0.6135625869510933, + "grad_norm": 0.3232390582561493, + "learning_rate": 1.9497259336671868e-05, + "loss": 0.1092, + "step": 34400 + }, + { + "epoch": 0.613580423072807, + "grad_norm": 0.26013556122779846, + "learning_rate": 1.94957410128788e-05, + "loss": 0.0965, + "step": 34401 + }, + { + "epoch": 0.6135982591945207, + "grad_norm": 0.24986091256141663, + "learning_rate": 1.949422271042213e-05, + "loss": 0.1225, + "step": 34402 + }, + { + "epoch": 0.6136160953162344, + "grad_norm": 0.24714888632297516, + "learning_rate": 1.9492704429307768e-05, + "loss": 0.0926, + "step": 34403 + }, + { + "epoch": 0.6136339314379481, + "grad_norm": 0.28164413571357727, + "learning_rate": 1.9491186169541585e-05, + "loss": 0.1164, + "step": 34404 + }, + { + "epoch": 0.6136517675596618, + "grad_norm": 0.2647169530391693, + "learning_rate": 1.9489667931129465e-05, + "loss": 0.1958, + "step": 34405 + }, + { + "epoch": 0.6136696036813756, + "grad_norm": 0.24187271296977997, + "learning_rate": 1.9488149714077287e-05, + "loss": 0.128, + "step": 34406 + }, + { + "epoch": 0.6136874398030893, + "grad_norm": 0.29026854038238525, + "learning_rate": 1.9486631518390946e-05, + "loss": 0.1544, + "step": 34407 + }, + { + "epoch": 0.613705275924803, + "grad_norm": 0.25430163741111755, + "learning_rate": 1.9485113344076333e-05, + "loss": 0.1133, + "step": 34408 + }, + { + "epoch": 0.6137231120465166, + "grad_norm": 0.2896972894668579, + "learning_rate": 1.9483595191139324e-05, + "loss": 0.1259, + "step": 34409 + }, + { + "epoch": 0.6137409481682303, + "grad_norm": 0.3382795453071594, + "learning_rate": 1.9482077059585798e-05, + "loss": 0.1295, + "step": 34410 + }, + { + "epoch": 0.613758784289944, + "grad_norm": 0.37541109323501587, + "learning_rate": 1.9480558949421656e-05, + "loss": 0.2029, + "step": 34411 + }, + { + "epoch": 0.6137766204116577, + "grad_norm": 0.20420439541339874, + "learning_rate": 1.9479040860652777e-05, + "loss": 0.1261, + "step": 34412 + }, + { + "epoch": 0.6137944565333714, + "grad_norm": 0.2869111895561218, + "learning_rate": 1.9477522793285032e-05, + "loss": 0.1405, + "step": 34413 + }, + { + "epoch": 0.6138122926550851, + "grad_norm": 0.26251155138015747, + "learning_rate": 1.9476004747324324e-05, + "loss": 0.1087, + "step": 34414 + }, + { + "epoch": 0.6138301287767988, + "grad_norm": 0.2096259891986847, + "learning_rate": 1.947448672277652e-05, + "loss": 0.1356, + "step": 34415 + }, + { + "epoch": 0.6138479648985125, + "grad_norm": 0.2769961655139923, + "learning_rate": 1.947296871964752e-05, + "loss": 0.1445, + "step": 34416 + }, + { + "epoch": 0.6138658010202261, + "grad_norm": 0.27338385581970215, + "learning_rate": 1.947145073794321e-05, + "loss": 0.1554, + "step": 34417 + }, + { + "epoch": 0.6138836371419398, + "grad_norm": 0.27455809712409973, + "learning_rate": 1.946993277766946e-05, + "loss": 0.1414, + "step": 34418 + }, + { + "epoch": 0.6139014732636535, + "grad_norm": 0.24713504314422607, + "learning_rate": 1.946841483883215e-05, + "loss": 0.1728, + "step": 34419 + }, + { + "epoch": 0.6139193093853672, + "grad_norm": 0.2966313660144806, + "learning_rate": 1.946689692143719e-05, + "loss": 0.13, + "step": 34420 + }, + { + "epoch": 0.6139371455070809, + "grad_norm": 0.35055744647979736, + "learning_rate": 1.9465379025490438e-05, + "loss": 0.1001, + "step": 34421 + }, + { + "epoch": 0.6139549816287946, + "grad_norm": 0.3221801221370697, + "learning_rate": 1.9463861150997797e-05, + "loss": 0.1242, + "step": 34422 + }, + { + "epoch": 0.6139728177505084, + "grad_norm": 0.22965948283672333, + "learning_rate": 1.946234329796513e-05, + "loss": 0.1102, + "step": 34423 + }, + { + "epoch": 0.6139906538722221, + "grad_norm": 0.31719475984573364, + "learning_rate": 1.9460825466398343e-05, + "loss": 0.1534, + "step": 34424 + }, + { + "epoch": 0.6140084899939358, + "grad_norm": 0.2211807817220688, + "learning_rate": 1.945930765630331e-05, + "loss": 0.1578, + "step": 34425 + }, + { + "epoch": 0.6140263261156494, + "grad_norm": 0.23441603779792786, + "learning_rate": 1.9457789867685916e-05, + "loss": 0.1062, + "step": 34426 + }, + { + "epoch": 0.6140441622373631, + "grad_norm": 0.23012542724609375, + "learning_rate": 1.9456272100552035e-05, + "loss": 0.1185, + "step": 34427 + }, + { + "epoch": 0.6140619983590768, + "grad_norm": 0.2291250377893448, + "learning_rate": 1.945475435490756e-05, + "loss": 0.0959, + "step": 34428 + }, + { + "epoch": 0.6140798344807905, + "grad_norm": 0.28275078535079956, + "learning_rate": 1.9453236630758375e-05, + "loss": 0.1624, + "step": 34429 + }, + { + "epoch": 0.6140976706025042, + "grad_norm": 0.2316838800907135, + "learning_rate": 1.9451718928110363e-05, + "loss": 0.1045, + "step": 34430 + }, + { + "epoch": 0.6141155067242179, + "grad_norm": 0.37640243768692017, + "learning_rate": 1.9450201246969405e-05, + "loss": 0.1487, + "step": 34431 + }, + { + "epoch": 0.6141333428459316, + "grad_norm": 0.27081558108329773, + "learning_rate": 1.944868358734137e-05, + "loss": 0.1399, + "step": 34432 + }, + { + "epoch": 0.6141511789676453, + "grad_norm": 0.28149375319480896, + "learning_rate": 1.944716594923217e-05, + "loss": 0.1747, + "step": 34433 + }, + { + "epoch": 0.614169015089359, + "grad_norm": 0.3042205274105072, + "learning_rate": 1.9445648332647667e-05, + "loss": 0.129, + "step": 34434 + }, + { + "epoch": 0.6141868512110726, + "grad_norm": 0.2766510844230652, + "learning_rate": 1.944413073759375e-05, + "loss": 0.132, + "step": 34435 + }, + { + "epoch": 0.6142046873327863, + "grad_norm": 0.33994901180267334, + "learning_rate": 1.9442613164076294e-05, + "loss": 0.1366, + "step": 34436 + }, + { + "epoch": 0.6142225234545, + "grad_norm": 0.21081985533237457, + "learning_rate": 1.9441095612101202e-05, + "loss": 0.1333, + "step": 34437 + }, + { + "epoch": 0.6142403595762137, + "grad_norm": 0.3470522463321686, + "learning_rate": 1.943957808167434e-05, + "loss": 0.1094, + "step": 34438 + }, + { + "epoch": 0.6142581956979274, + "grad_norm": 0.20546455681324005, + "learning_rate": 1.94380605728016e-05, + "loss": 0.1395, + "step": 34439 + }, + { + "epoch": 0.6142760318196412, + "grad_norm": 0.30680495500564575, + "learning_rate": 1.9436543085488847e-05, + "loss": 0.1306, + "step": 34440 + }, + { + "epoch": 0.6142938679413549, + "grad_norm": 0.24016529321670532, + "learning_rate": 1.9435025619741974e-05, + "loss": 0.1367, + "step": 34441 + }, + { + "epoch": 0.6143117040630686, + "grad_norm": 0.3430418074131012, + "learning_rate": 1.9433508175566865e-05, + "loss": 0.1101, + "step": 34442 + }, + { + "epoch": 0.6143295401847823, + "grad_norm": 0.2906799018383026, + "learning_rate": 1.9431990752969402e-05, + "loss": 0.1512, + "step": 34443 + }, + { + "epoch": 0.6143473763064959, + "grad_norm": 0.26818475127220154, + "learning_rate": 1.9430473351955474e-05, + "loss": 0.1171, + "step": 34444 + }, + { + "epoch": 0.6143652124282096, + "grad_norm": 0.3054639399051666, + "learning_rate": 1.9428955972530938e-05, + "loss": 0.1464, + "step": 34445 + }, + { + "epoch": 0.6143830485499233, + "grad_norm": 0.23612850904464722, + "learning_rate": 1.9427438614701707e-05, + "loss": 0.147, + "step": 34446 + }, + { + "epoch": 0.614400884671637, + "grad_norm": 0.310243159532547, + "learning_rate": 1.9425921278473646e-05, + "loss": 0.0852, + "step": 34447 + }, + { + "epoch": 0.6144187207933507, + "grad_norm": 0.23815155029296875, + "learning_rate": 1.942440396385264e-05, + "loss": 0.1498, + "step": 34448 + }, + { + "epoch": 0.6144365569150644, + "grad_norm": 0.37250688672065735, + "learning_rate": 1.9422886670844563e-05, + "loss": 0.1528, + "step": 34449 + }, + { + "epoch": 0.6144543930367781, + "grad_norm": 0.30396950244903564, + "learning_rate": 1.942136939945531e-05, + "loss": 0.0858, + "step": 34450 + }, + { + "epoch": 0.6144722291584918, + "grad_norm": 0.24318405985832214, + "learning_rate": 1.9419852149690755e-05, + "loss": 0.1134, + "step": 34451 + }, + { + "epoch": 0.6144900652802054, + "grad_norm": 0.29396671056747437, + "learning_rate": 1.9418334921556783e-05, + "loss": 0.127, + "step": 34452 + }, + { + "epoch": 0.6145079014019191, + "grad_norm": 0.27991798520088196, + "learning_rate": 1.941681771505927e-05, + "loss": 0.1275, + "step": 34453 + }, + { + "epoch": 0.6145257375236328, + "grad_norm": 0.2710343301296234, + "learning_rate": 1.9415300530204095e-05, + "loss": 0.12, + "step": 34454 + }, + { + "epoch": 0.6145435736453465, + "grad_norm": 0.2413530945777893, + "learning_rate": 1.941378336699715e-05, + "loss": 0.0984, + "step": 34455 + }, + { + "epoch": 0.6145614097670602, + "grad_norm": 0.2243719846010208, + "learning_rate": 1.9412266225444305e-05, + "loss": 0.1147, + "step": 34456 + }, + { + "epoch": 0.614579245888774, + "grad_norm": 0.2554042339324951, + "learning_rate": 1.9410749105551453e-05, + "loss": 0.117, + "step": 34457 + }, + { + "epoch": 0.6145970820104877, + "grad_norm": 0.33550775051116943, + "learning_rate": 1.9409232007324458e-05, + "loss": 0.0943, + "step": 34458 + }, + { + "epoch": 0.6146149181322014, + "grad_norm": 0.20211301743984222, + "learning_rate": 1.940771493076922e-05, + "loss": 0.1141, + "step": 34459 + }, + { + "epoch": 0.6146327542539151, + "grad_norm": 0.2155083864927292, + "learning_rate": 1.9406197875891613e-05, + "loss": 0.1186, + "step": 34460 + }, + { + "epoch": 0.6146505903756287, + "grad_norm": 0.3749031126499176, + "learning_rate": 1.9404680842697507e-05, + "loss": 0.1727, + "step": 34461 + }, + { + "epoch": 0.6146684264973424, + "grad_norm": 0.3554801642894745, + "learning_rate": 1.940316383119279e-05, + "loss": 0.1177, + "step": 34462 + }, + { + "epoch": 0.6146862626190561, + "grad_norm": 0.27102598547935486, + "learning_rate": 1.9401646841383346e-05, + "loss": 0.1127, + "step": 34463 + }, + { + "epoch": 0.6147040987407698, + "grad_norm": 0.30681324005126953, + "learning_rate": 1.9400129873275055e-05, + "loss": 0.1572, + "step": 34464 + }, + { + "epoch": 0.6147219348624835, + "grad_norm": 0.293938010931015, + "learning_rate": 1.9398612926873792e-05, + "loss": 0.1629, + "step": 34465 + }, + { + "epoch": 0.6147397709841972, + "grad_norm": 0.6156755089759827, + "learning_rate": 1.9397096002185447e-05, + "loss": 0.1008, + "step": 34466 + }, + { + "epoch": 0.6147576071059109, + "grad_norm": 0.24818457663059235, + "learning_rate": 1.939557909921588e-05, + "loss": 0.148, + "step": 34467 + }, + { + "epoch": 0.6147754432276246, + "grad_norm": 0.29153475165367126, + "learning_rate": 1.9394062217970994e-05, + "loss": 0.1014, + "step": 34468 + }, + { + "epoch": 0.6147932793493383, + "grad_norm": 0.3758023679256439, + "learning_rate": 1.9392545358456653e-05, + "loss": 0.138, + "step": 34469 + }, + { + "epoch": 0.6148111154710519, + "grad_norm": 0.2715372145175934, + "learning_rate": 1.939102852067875e-05, + "loss": 0.1495, + "step": 34470 + }, + { + "epoch": 0.6148289515927656, + "grad_norm": 0.24557875096797943, + "learning_rate": 1.9389511704643143e-05, + "loss": 0.1348, + "step": 34471 + }, + { + "epoch": 0.6148467877144793, + "grad_norm": 0.18844127655029297, + "learning_rate": 1.9387994910355743e-05, + "loss": 0.1308, + "step": 34472 + }, + { + "epoch": 0.614864623836193, + "grad_norm": 0.2698347568511963, + "learning_rate": 1.938647813782241e-05, + "loss": 0.1283, + "step": 34473 + }, + { + "epoch": 0.6148824599579068, + "grad_norm": 0.2662133276462555, + "learning_rate": 1.9384961387049023e-05, + "loss": 0.1165, + "step": 34474 + }, + { + "epoch": 0.6149002960796205, + "grad_norm": 0.23293600976467133, + "learning_rate": 1.938344465804146e-05, + "loss": 0.0728, + "step": 34475 + }, + { + "epoch": 0.6149181322013342, + "grad_norm": 0.26369717717170715, + "learning_rate": 1.9381927950805615e-05, + "loss": 0.1187, + "step": 34476 + }, + { + "epoch": 0.6149359683230479, + "grad_norm": 0.2170003056526184, + "learning_rate": 1.938041126534735e-05, + "loss": 0.0846, + "step": 34477 + }, + { + "epoch": 0.6149538044447616, + "grad_norm": 0.2619723975658417, + "learning_rate": 1.937889460167256e-05, + "loss": 0.1615, + "step": 34478 + }, + { + "epoch": 0.6149716405664752, + "grad_norm": 0.3334028422832489, + "learning_rate": 1.9377377959787106e-05, + "loss": 0.1099, + "step": 34479 + }, + { + "epoch": 0.6149894766881889, + "grad_norm": 0.22388894855976105, + "learning_rate": 1.9375861339696884e-05, + "loss": 0.0987, + "step": 34480 + }, + { + "epoch": 0.6150073128099026, + "grad_norm": 0.26746198534965515, + "learning_rate": 1.9374344741407768e-05, + "loss": 0.1226, + "step": 34481 + }, + { + "epoch": 0.6150251489316163, + "grad_norm": 0.3266472816467285, + "learning_rate": 1.937282816492563e-05, + "loss": 0.1769, + "step": 34482 + }, + { + "epoch": 0.61504298505333, + "grad_norm": 0.23216617107391357, + "learning_rate": 1.9371311610256358e-05, + "loss": 0.1134, + "step": 34483 + }, + { + "epoch": 0.6150608211750437, + "grad_norm": 0.2086256444454193, + "learning_rate": 1.9369795077405813e-05, + "loss": 0.0978, + "step": 34484 + }, + { + "epoch": 0.6150786572967574, + "grad_norm": 0.29555922746658325, + "learning_rate": 1.9368278566379902e-05, + "loss": 0.0864, + "step": 34485 + }, + { + "epoch": 0.6150964934184711, + "grad_norm": 0.23405064642429352, + "learning_rate": 1.9366762077184487e-05, + "loss": 0.1443, + "step": 34486 + }, + { + "epoch": 0.6151143295401847, + "grad_norm": 0.23380424082279205, + "learning_rate": 1.9365245609825448e-05, + "loss": 0.0673, + "step": 34487 + }, + { + "epoch": 0.6151321656618984, + "grad_norm": 0.19976286590099335, + "learning_rate": 1.936372916430865e-05, + "loss": 0.0942, + "step": 34488 + }, + { + "epoch": 0.6151500017836121, + "grad_norm": 0.2814916670322418, + "learning_rate": 1.9362212740639997e-05, + "loss": 0.119, + "step": 34489 + }, + { + "epoch": 0.6151678379053259, + "grad_norm": 0.2697890102863312, + "learning_rate": 1.936069633882535e-05, + "loss": 0.156, + "step": 34490 + }, + { + "epoch": 0.6151856740270396, + "grad_norm": 0.32851386070251465, + "learning_rate": 1.9359179958870595e-05, + "loss": 0.2139, + "step": 34491 + }, + { + "epoch": 0.6152035101487533, + "grad_norm": 0.2962421476840973, + "learning_rate": 1.93576636007816e-05, + "loss": 0.1382, + "step": 34492 + }, + { + "epoch": 0.615221346270467, + "grad_norm": 0.3056343197822571, + "learning_rate": 1.9356147264564257e-05, + "loss": 0.1818, + "step": 34493 + }, + { + "epoch": 0.6152391823921807, + "grad_norm": 0.22571781277656555, + "learning_rate": 1.9354630950224438e-05, + "loss": 0.1378, + "step": 34494 + }, + { + "epoch": 0.6152570185138944, + "grad_norm": 0.28213798999786377, + "learning_rate": 1.9353114657768017e-05, + "loss": 0.1575, + "step": 34495 + }, + { + "epoch": 0.615274854635608, + "grad_norm": 0.3141067624092102, + "learning_rate": 1.9351598387200872e-05, + "loss": 0.1092, + "step": 34496 + }, + { + "epoch": 0.6152926907573217, + "grad_norm": 0.2912456691265106, + "learning_rate": 1.935008213852888e-05, + "loss": 0.1753, + "step": 34497 + }, + { + "epoch": 0.6153105268790354, + "grad_norm": 0.28660398721694946, + "learning_rate": 1.9348565911757924e-05, + "loss": 0.158, + "step": 34498 + }, + { + "epoch": 0.6153283630007491, + "grad_norm": 0.23358003795146942, + "learning_rate": 1.9347049706893884e-05, + "loss": 0.1094, + "step": 34499 + }, + { + "epoch": 0.6153461991224628, + "grad_norm": 0.23171283304691315, + "learning_rate": 1.9345533523942628e-05, + "loss": 0.1096, + "step": 34500 + }, + { + "epoch": 0.6153640352441765, + "grad_norm": 0.24595412611961365, + "learning_rate": 1.9344017362910026e-05, + "loss": 0.129, + "step": 34501 + }, + { + "epoch": 0.6153818713658902, + "grad_norm": 0.22029787302017212, + "learning_rate": 1.934250122380198e-05, + "loss": 0.1019, + "step": 34502 + }, + { + "epoch": 0.6153997074876039, + "grad_norm": 0.26852864027023315, + "learning_rate": 1.9340985106624354e-05, + "loss": 0.128, + "step": 34503 + }, + { + "epoch": 0.6154175436093176, + "grad_norm": 0.2965337634086609, + "learning_rate": 1.9339469011383017e-05, + "loss": 0.1563, + "step": 34504 + }, + { + "epoch": 0.6154353797310312, + "grad_norm": 0.35104653239250183, + "learning_rate": 1.933795293808385e-05, + "loss": 0.1768, + "step": 34505 + }, + { + "epoch": 0.6154532158527449, + "grad_norm": 0.2872408628463745, + "learning_rate": 1.933643688673274e-05, + "loss": 0.1758, + "step": 34506 + }, + { + "epoch": 0.6154710519744587, + "grad_norm": 0.37812554836273193, + "learning_rate": 1.933492085733556e-05, + "loss": 0.1516, + "step": 34507 + }, + { + "epoch": 0.6154888880961724, + "grad_norm": 0.18025898933410645, + "learning_rate": 1.933340484989818e-05, + "loss": 0.122, + "step": 34508 + }, + { + "epoch": 0.6155067242178861, + "grad_norm": 0.32844042778015137, + "learning_rate": 1.9331888864426482e-05, + "loss": 0.1333, + "step": 34509 + }, + { + "epoch": 0.6155245603395998, + "grad_norm": 0.24264642596244812, + "learning_rate": 1.933037290092633e-05, + "loss": 0.1221, + "step": 34510 + }, + { + "epoch": 0.6155423964613135, + "grad_norm": 0.240075945854187, + "learning_rate": 1.932885695940361e-05, + "loss": 0.1256, + "step": 34511 + }, + { + "epoch": 0.6155602325830272, + "grad_norm": 0.23587296903133392, + "learning_rate": 1.9327341039864216e-05, + "loss": 0.1066, + "step": 34512 + }, + { + "epoch": 0.6155780687047409, + "grad_norm": 0.2907547950744629, + "learning_rate": 1.9325825142314002e-05, + "loss": 0.1326, + "step": 34513 + }, + { + "epoch": 0.6155959048264545, + "grad_norm": 0.3378159999847412, + "learning_rate": 1.932430926675884e-05, + "loss": 0.109, + "step": 34514 + }, + { + "epoch": 0.6156137409481682, + "grad_norm": 0.21095429360866547, + "learning_rate": 1.932279341320462e-05, + "loss": 0.0832, + "step": 34515 + }, + { + "epoch": 0.6156315770698819, + "grad_norm": 0.31158697605133057, + "learning_rate": 1.932127758165722e-05, + "loss": 0.1304, + "step": 34516 + }, + { + "epoch": 0.6156494131915956, + "grad_norm": 0.20838488638401031, + "learning_rate": 1.9319761772122503e-05, + "loss": 0.1048, + "step": 34517 + }, + { + "epoch": 0.6156672493133093, + "grad_norm": 0.21778038144111633, + "learning_rate": 1.9318245984606352e-05, + "loss": 0.1258, + "step": 34518 + }, + { + "epoch": 0.615685085435023, + "grad_norm": 0.28368258476257324, + "learning_rate": 1.931673021911464e-05, + "loss": 0.1583, + "step": 34519 + }, + { + "epoch": 0.6157029215567367, + "grad_norm": 0.2694331705570221, + "learning_rate": 1.931521447565325e-05, + "loss": 0.117, + "step": 34520 + }, + { + "epoch": 0.6157207576784504, + "grad_norm": 0.24522793292999268, + "learning_rate": 1.9313698754228056e-05, + "loss": 0.0907, + "step": 34521 + }, + { + "epoch": 0.615738593800164, + "grad_norm": 0.27080926299095154, + "learning_rate": 1.9312183054844924e-05, + "loss": 0.1173, + "step": 34522 + }, + { + "epoch": 0.6157564299218777, + "grad_norm": 0.242206871509552, + "learning_rate": 1.9310667377509728e-05, + "loss": 0.1594, + "step": 34523 + }, + { + "epoch": 0.6157742660435915, + "grad_norm": 0.25656843185424805, + "learning_rate": 1.930915172222836e-05, + "loss": 0.1226, + "step": 34524 + }, + { + "epoch": 0.6157921021653052, + "grad_norm": 0.3067161440849304, + "learning_rate": 1.9307636089006683e-05, + "loss": 0.1251, + "step": 34525 + }, + { + "epoch": 0.6158099382870189, + "grad_norm": 0.2224901020526886, + "learning_rate": 1.9306120477850574e-05, + "loss": 0.1497, + "step": 34526 + }, + { + "epoch": 0.6158277744087326, + "grad_norm": 0.23707859218120575, + "learning_rate": 1.93046048887659e-05, + "loss": 0.125, + "step": 34527 + }, + { + "epoch": 0.6158456105304463, + "grad_norm": 0.3091038167476654, + "learning_rate": 1.9303089321758555e-05, + "loss": 0.1575, + "step": 34528 + }, + { + "epoch": 0.61586344665216, + "grad_norm": 0.35471346974372864, + "learning_rate": 1.9301573776834405e-05, + "loss": 0.1107, + "step": 34529 + }, + { + "epoch": 0.6158812827738737, + "grad_norm": 0.24750669300556183, + "learning_rate": 1.9300058253999323e-05, + "loss": 0.1243, + "step": 34530 + }, + { + "epoch": 0.6158991188955873, + "grad_norm": 0.3448469936847687, + "learning_rate": 1.9298542753259173e-05, + "loss": 0.09, + "step": 34531 + }, + { + "epoch": 0.615916955017301, + "grad_norm": 0.2808707654476166, + "learning_rate": 1.9297027274619854e-05, + "loss": 0.1859, + "step": 34532 + }, + { + "epoch": 0.6159347911390147, + "grad_norm": 0.24901944398880005, + "learning_rate": 1.9295511818087218e-05, + "loss": 0.1059, + "step": 34533 + }, + { + "epoch": 0.6159526272607284, + "grad_norm": 0.2689917981624603, + "learning_rate": 1.9293996383667152e-05, + "loss": 0.1603, + "step": 34534 + }, + { + "epoch": 0.6159704633824421, + "grad_norm": 0.27367252111434937, + "learning_rate": 1.9292480971365533e-05, + "loss": 0.1224, + "step": 34535 + }, + { + "epoch": 0.6159882995041558, + "grad_norm": 0.3604848086833954, + "learning_rate": 1.9290965581188214e-05, + "loss": 0.1777, + "step": 34536 + }, + { + "epoch": 0.6160061356258695, + "grad_norm": 0.20487985014915466, + "learning_rate": 1.9289450213141095e-05, + "loss": 0.1232, + "step": 34537 + }, + { + "epoch": 0.6160239717475832, + "grad_norm": 0.30140063166618347, + "learning_rate": 1.9287934867230033e-05, + "loss": 0.1012, + "step": 34538 + }, + { + "epoch": 0.6160418078692969, + "grad_norm": 0.2638993263244629, + "learning_rate": 1.9286419543460917e-05, + "loss": 0.1395, + "step": 34539 + }, + { + "epoch": 0.6160596439910105, + "grad_norm": 0.280154824256897, + "learning_rate": 1.92849042418396e-05, + "loss": 0.1135, + "step": 34540 + }, + { + "epoch": 0.6160774801127243, + "grad_norm": 0.362231582403183, + "learning_rate": 1.928338896237198e-05, + "loss": 0.1523, + "step": 34541 + }, + { + "epoch": 0.616095316234438, + "grad_norm": 0.35468077659606934, + "learning_rate": 1.9281873705063915e-05, + "loss": 0.1052, + "step": 34542 + }, + { + "epoch": 0.6161131523561517, + "grad_norm": 0.25615227222442627, + "learning_rate": 1.9280358469921286e-05, + "loss": 0.1474, + "step": 34543 + }, + { + "epoch": 0.6161309884778654, + "grad_norm": 0.2822414040565491, + "learning_rate": 1.9278843256949952e-05, + "loss": 0.1223, + "step": 34544 + }, + { + "epoch": 0.6161488245995791, + "grad_norm": 0.2659647762775421, + "learning_rate": 1.927732806615581e-05, + "loss": 0.0817, + "step": 34545 + }, + { + "epoch": 0.6161666607212928, + "grad_norm": 0.20857706665992737, + "learning_rate": 1.9275812897544712e-05, + "loss": 0.1019, + "step": 34546 + }, + { + "epoch": 0.6161844968430065, + "grad_norm": 0.6639837622642517, + "learning_rate": 1.9274297751122547e-05, + "loss": 0.1182, + "step": 34547 + }, + { + "epoch": 0.6162023329647202, + "grad_norm": 0.28585055470466614, + "learning_rate": 1.927278262689518e-05, + "loss": 0.1752, + "step": 34548 + }, + { + "epoch": 0.6162201690864338, + "grad_norm": 0.25726133584976196, + "learning_rate": 1.9271267524868478e-05, + "loss": 0.1788, + "step": 34549 + }, + { + "epoch": 0.6162380052081475, + "grad_norm": 0.2813021242618561, + "learning_rate": 1.926975244504833e-05, + "loss": 0.1245, + "step": 34550 + }, + { + "epoch": 0.6162558413298612, + "grad_norm": 0.31017810106277466, + "learning_rate": 1.9268237387440603e-05, + "loss": 0.181, + "step": 34551 + }, + { + "epoch": 0.6162736774515749, + "grad_norm": 0.22194547951221466, + "learning_rate": 1.926672235205116e-05, + "loss": 0.1123, + "step": 34552 + }, + { + "epoch": 0.6162915135732886, + "grad_norm": 0.2474585324525833, + "learning_rate": 1.9265207338885884e-05, + "loss": 0.1269, + "step": 34553 + }, + { + "epoch": 0.6163093496950023, + "grad_norm": 0.3163282871246338, + "learning_rate": 1.926369234795064e-05, + "loss": 0.1281, + "step": 34554 + }, + { + "epoch": 0.616327185816716, + "grad_norm": 0.24996168911457062, + "learning_rate": 1.9262177379251318e-05, + "loss": 0.1267, + "step": 34555 + }, + { + "epoch": 0.6163450219384297, + "grad_norm": 0.26003187894821167, + "learning_rate": 1.926066243279377e-05, + "loss": 0.1216, + "step": 34556 + }, + { + "epoch": 0.6163628580601433, + "grad_norm": 0.26683512330055237, + "learning_rate": 1.9259147508583876e-05, + "loss": 0.1633, + "step": 34557 + }, + { + "epoch": 0.6163806941818571, + "grad_norm": 0.18827903270721436, + "learning_rate": 1.925763260662751e-05, + "loss": 0.0968, + "step": 34558 + }, + { + "epoch": 0.6163985303035708, + "grad_norm": 0.28749945759773254, + "learning_rate": 1.925611772693055e-05, + "loss": 0.1482, + "step": 34559 + }, + { + "epoch": 0.6164163664252845, + "grad_norm": 0.24320589005947113, + "learning_rate": 1.9254602869498852e-05, + "loss": 0.1399, + "step": 34560 + }, + { + "epoch": 0.6164342025469982, + "grad_norm": 0.21242789924144745, + "learning_rate": 1.9253088034338307e-05, + "loss": 0.1048, + "step": 34561 + }, + { + "epoch": 0.6164520386687119, + "grad_norm": 0.3348024785518646, + "learning_rate": 1.9251573221454765e-05, + "loss": 0.1667, + "step": 34562 + }, + { + "epoch": 0.6164698747904256, + "grad_norm": 0.24434778094291687, + "learning_rate": 1.925005843085412e-05, + "loss": 0.1507, + "step": 34563 + }, + { + "epoch": 0.6164877109121393, + "grad_norm": 0.22308313846588135, + "learning_rate": 1.9248543662542237e-05, + "loss": 0.1261, + "step": 34564 + }, + { + "epoch": 0.616505547033853, + "grad_norm": 0.29116660356521606, + "learning_rate": 1.9247028916524984e-05, + "loss": 0.1287, + "step": 34565 + }, + { + "epoch": 0.6165233831555667, + "grad_norm": 0.2406667321920395, + "learning_rate": 1.924551419280822e-05, + "loss": 0.1029, + "step": 34566 + }, + { + "epoch": 0.6165412192772803, + "grad_norm": 0.2553635537624359, + "learning_rate": 1.9243999491397844e-05, + "loss": 0.1153, + "step": 34567 + }, + { + "epoch": 0.616559055398994, + "grad_norm": 0.2551077902317047, + "learning_rate": 1.9242484812299715e-05, + "loss": 0.1841, + "step": 34568 + }, + { + "epoch": 0.6165768915207077, + "grad_norm": 0.2785532474517822, + "learning_rate": 1.92409701555197e-05, + "loss": 0.1535, + "step": 34569 + }, + { + "epoch": 0.6165947276424214, + "grad_norm": 0.3574562966823578, + "learning_rate": 1.923945552106367e-05, + "loss": 0.1461, + "step": 34570 + }, + { + "epoch": 0.6166125637641351, + "grad_norm": 0.21311809122562408, + "learning_rate": 1.9237940908937504e-05, + "loss": 0.1181, + "step": 34571 + }, + { + "epoch": 0.6166303998858488, + "grad_norm": 0.2024383842945099, + "learning_rate": 1.923642631914707e-05, + "loss": 0.1096, + "step": 34572 + }, + { + "epoch": 0.6166482360075625, + "grad_norm": 0.27098020911216736, + "learning_rate": 1.9234911751698238e-05, + "loss": 0.1451, + "step": 34573 + }, + { + "epoch": 0.6166660721292762, + "grad_norm": 0.417580783367157, + "learning_rate": 1.9233397206596882e-05, + "loss": 0.1281, + "step": 34574 + }, + { + "epoch": 0.61668390825099, + "grad_norm": 0.3413103520870209, + "learning_rate": 1.9231882683848857e-05, + "loss": 0.1513, + "step": 34575 + }, + { + "epoch": 0.6167017443727036, + "grad_norm": 0.2721801996231079, + "learning_rate": 1.923036818346006e-05, + "loss": 0.1043, + "step": 34576 + }, + { + "epoch": 0.6167195804944173, + "grad_norm": 0.36068153381347656, + "learning_rate": 1.9228853705436346e-05, + "loss": 0.2121, + "step": 34577 + }, + { + "epoch": 0.616737416616131, + "grad_norm": 0.2523716688156128, + "learning_rate": 1.9227339249783594e-05, + "loss": 0.1594, + "step": 34578 + }, + { + "epoch": 0.6167552527378447, + "grad_norm": 0.282390832901001, + "learning_rate": 1.9225824816507652e-05, + "loss": 0.1155, + "step": 34579 + }, + { + "epoch": 0.6167730888595584, + "grad_norm": 0.22251537442207336, + "learning_rate": 1.9224310405614424e-05, + "loss": 0.1649, + "step": 34580 + }, + { + "epoch": 0.6167909249812721, + "grad_norm": 0.24038361012935638, + "learning_rate": 1.9222796017109755e-05, + "loss": 0.1264, + "step": 34581 + }, + { + "epoch": 0.6168087611029858, + "grad_norm": 0.3214409351348877, + "learning_rate": 1.9221281650999528e-05, + "loss": 0.1094, + "step": 34582 + }, + { + "epoch": 0.6168265972246995, + "grad_norm": 0.2674230635166168, + "learning_rate": 1.9219767307289603e-05, + "loss": 0.1233, + "step": 34583 + }, + { + "epoch": 0.6168444333464131, + "grad_norm": 0.273513525724411, + "learning_rate": 1.9218252985985864e-05, + "loss": 0.1583, + "step": 34584 + }, + { + "epoch": 0.6168622694681268, + "grad_norm": 0.2540189325809479, + "learning_rate": 1.9216738687094173e-05, + "loss": 0.1109, + "step": 34585 + }, + { + "epoch": 0.6168801055898405, + "grad_norm": 0.2843306362628937, + "learning_rate": 1.92152244106204e-05, + "loss": 0.1056, + "step": 34586 + }, + { + "epoch": 0.6168979417115542, + "grad_norm": 0.3077537715435028, + "learning_rate": 1.921371015657041e-05, + "loss": 0.2278, + "step": 34587 + }, + { + "epoch": 0.6169157778332679, + "grad_norm": 0.3433613181114197, + "learning_rate": 1.9212195924950073e-05, + "loss": 0.1423, + "step": 34588 + }, + { + "epoch": 0.6169336139549816, + "grad_norm": 0.21594177186489105, + "learning_rate": 1.9210681715765277e-05, + "loss": 0.0986, + "step": 34589 + }, + { + "epoch": 0.6169514500766953, + "grad_norm": 0.33674225211143494, + "learning_rate": 1.9209167529021873e-05, + "loss": 0.1237, + "step": 34590 + }, + { + "epoch": 0.6169692861984091, + "grad_norm": 0.22517625987529755, + "learning_rate": 1.920765336472574e-05, + "loss": 0.1437, + "step": 34591 + }, + { + "epoch": 0.6169871223201228, + "grad_norm": 0.21776363253593445, + "learning_rate": 1.9206139222882733e-05, + "loss": 0.1623, + "step": 34592 + }, + { + "epoch": 0.6170049584418364, + "grad_norm": 0.31965121626853943, + "learning_rate": 1.920462510349874e-05, + "loss": 0.1432, + "step": 34593 + }, + { + "epoch": 0.6170227945635501, + "grad_norm": 0.2996142506599426, + "learning_rate": 1.9203111006579616e-05, + "loss": 0.1275, + "step": 34594 + }, + { + "epoch": 0.6170406306852638, + "grad_norm": 0.22301287949085236, + "learning_rate": 1.9201596932131242e-05, + "loss": 0.1386, + "step": 34595 + }, + { + "epoch": 0.6170584668069775, + "grad_norm": 0.28665006160736084, + "learning_rate": 1.920008288015947e-05, + "loss": 0.1695, + "step": 34596 + }, + { + "epoch": 0.6170763029286912, + "grad_norm": 0.3356695771217346, + "learning_rate": 1.919856885067019e-05, + "loss": 0.1569, + "step": 34597 + }, + { + "epoch": 0.6170941390504049, + "grad_norm": 0.29975688457489014, + "learning_rate": 1.919705484366926e-05, + "loss": 0.1258, + "step": 34598 + }, + { + "epoch": 0.6171119751721186, + "grad_norm": 0.23956458270549774, + "learning_rate": 1.9195540859162553e-05, + "loss": 0.1217, + "step": 34599 + }, + { + "epoch": 0.6171298112938323, + "grad_norm": 0.19150055944919586, + "learning_rate": 1.919402689715593e-05, + "loss": 0.1083, + "step": 34600 + }, + { + "epoch": 0.617147647415546, + "grad_norm": 0.22420692443847656, + "learning_rate": 1.9192512957655263e-05, + "loss": 0.1238, + "step": 34601 + }, + { + "epoch": 0.6171654835372596, + "grad_norm": 0.27167558670043945, + "learning_rate": 1.9190999040666418e-05, + "loss": 0.2033, + "step": 34602 + }, + { + "epoch": 0.6171833196589733, + "grad_norm": 0.24470174312591553, + "learning_rate": 1.9189485146195273e-05, + "loss": 0.1348, + "step": 34603 + }, + { + "epoch": 0.617201155780687, + "grad_norm": 0.27418550848960876, + "learning_rate": 1.9187971274247697e-05, + "loss": 0.1132, + "step": 34604 + }, + { + "epoch": 0.6172189919024007, + "grad_norm": 0.48278388381004333, + "learning_rate": 1.9186457424829536e-05, + "loss": 0.1301, + "step": 34605 + }, + { + "epoch": 0.6172368280241144, + "grad_norm": 0.22970767319202423, + "learning_rate": 1.918494359794668e-05, + "loss": 0.1658, + "step": 34606 + }, + { + "epoch": 0.6172546641458281, + "grad_norm": 0.2338697463274002, + "learning_rate": 1.9183429793605e-05, + "loss": 0.1401, + "step": 34607 + }, + { + "epoch": 0.6172725002675419, + "grad_norm": 0.32704606652259827, + "learning_rate": 1.9181916011810344e-05, + "loss": 0.2, + "step": 34608 + }, + { + "epoch": 0.6172903363892556, + "grad_norm": 0.27111607789993286, + "learning_rate": 1.9180402252568595e-05, + "loss": 0.1646, + "step": 34609 + }, + { + "epoch": 0.6173081725109693, + "grad_norm": 0.24766652286052704, + "learning_rate": 1.9178888515885614e-05, + "loss": 0.1522, + "step": 34610 + }, + { + "epoch": 0.6173260086326829, + "grad_norm": 0.279319167137146, + "learning_rate": 1.9177374801767275e-05, + "loss": 0.1451, + "step": 34611 + }, + { + "epoch": 0.6173438447543966, + "grad_norm": 0.19560639560222626, + "learning_rate": 1.9175861110219444e-05, + "loss": 0.1347, + "step": 34612 + }, + { + "epoch": 0.6173616808761103, + "grad_norm": 0.20941701531410217, + "learning_rate": 1.9174347441247993e-05, + "loss": 0.1462, + "step": 34613 + }, + { + "epoch": 0.617379516997824, + "grad_norm": 0.24162720143795013, + "learning_rate": 1.9172833794858764e-05, + "loss": 0.0993, + "step": 34614 + }, + { + "epoch": 0.6173973531195377, + "grad_norm": 0.2666206359863281, + "learning_rate": 1.9171320171057655e-05, + "loss": 0.1176, + "step": 34615 + }, + { + "epoch": 0.6174151892412514, + "grad_norm": 0.2780097424983978, + "learning_rate": 1.9169806569850523e-05, + "loss": 0.1736, + "step": 34616 + }, + { + "epoch": 0.6174330253629651, + "grad_norm": 0.258361279964447, + "learning_rate": 1.916829299124324e-05, + "loss": 0.0983, + "step": 34617 + }, + { + "epoch": 0.6174508614846788, + "grad_norm": 0.32893630862236023, + "learning_rate": 1.9166779435241653e-05, + "loss": 0.1402, + "step": 34618 + }, + { + "epoch": 0.6174686976063924, + "grad_norm": 0.46025940775871277, + "learning_rate": 1.9165265901851654e-05, + "loss": 0.1776, + "step": 34619 + }, + { + "epoch": 0.6174865337281061, + "grad_norm": 0.35117071866989136, + "learning_rate": 1.9163752391079103e-05, + "loss": 0.1739, + "step": 34620 + }, + { + "epoch": 0.6175043698498198, + "grad_norm": 0.24502874910831451, + "learning_rate": 1.9162238902929852e-05, + "loss": 0.1037, + "step": 34621 + }, + { + "epoch": 0.6175222059715335, + "grad_norm": 0.22721615433692932, + "learning_rate": 1.9160725437409785e-05, + "loss": 0.0963, + "step": 34622 + }, + { + "epoch": 0.6175400420932472, + "grad_norm": 0.24761302769184113, + "learning_rate": 1.9159211994524763e-05, + "loss": 0.1273, + "step": 34623 + }, + { + "epoch": 0.6175578782149609, + "grad_norm": 0.3963606059551239, + "learning_rate": 1.9157698574280656e-05, + "loss": 0.1661, + "step": 34624 + }, + { + "epoch": 0.6175757143366747, + "grad_norm": 0.2536150813102722, + "learning_rate": 1.9156185176683324e-05, + "loss": 0.0821, + "step": 34625 + }, + { + "epoch": 0.6175935504583884, + "grad_norm": 0.2644543945789337, + "learning_rate": 1.9154671801738642e-05, + "loss": 0.1194, + "step": 34626 + }, + { + "epoch": 0.6176113865801021, + "grad_norm": 0.2237444370985031, + "learning_rate": 1.9153158449452456e-05, + "loss": 0.147, + "step": 34627 + }, + { + "epoch": 0.6176292227018157, + "grad_norm": 0.31942877173423767, + "learning_rate": 1.9151645119830664e-05, + "loss": 0.1234, + "step": 34628 + }, + { + "epoch": 0.6176470588235294, + "grad_norm": 0.26018068194389343, + "learning_rate": 1.9150131812879105e-05, + "loss": 0.15, + "step": 34629 + }, + { + "epoch": 0.6176648949452431, + "grad_norm": 0.24610719084739685, + "learning_rate": 1.914861852860366e-05, + "loss": 0.1406, + "step": 34630 + }, + { + "epoch": 0.6176827310669568, + "grad_norm": 0.24228277802467346, + "learning_rate": 1.9147105267010184e-05, + "loss": 0.1111, + "step": 34631 + }, + { + "epoch": 0.6177005671886705, + "grad_norm": 0.24873076379299164, + "learning_rate": 1.914559202810456e-05, + "loss": 0.0777, + "step": 34632 + }, + { + "epoch": 0.6177184033103842, + "grad_norm": 0.3159685432910919, + "learning_rate": 1.9144078811892642e-05, + "loss": 0.1694, + "step": 34633 + }, + { + "epoch": 0.6177362394320979, + "grad_norm": 0.26278772950172424, + "learning_rate": 1.9142565618380296e-05, + "loss": 0.1365, + "step": 34634 + }, + { + "epoch": 0.6177540755538116, + "grad_norm": 0.2747803330421448, + "learning_rate": 1.914105244757338e-05, + "loss": 0.103, + "step": 34635 + }, + { + "epoch": 0.6177719116755253, + "grad_norm": 0.24300013482570648, + "learning_rate": 1.9139539299477777e-05, + "loss": 0.1233, + "step": 34636 + }, + { + "epoch": 0.6177897477972389, + "grad_norm": 0.2568221092224121, + "learning_rate": 1.9138026174099338e-05, + "loss": 0.1406, + "step": 34637 + }, + { + "epoch": 0.6178075839189526, + "grad_norm": 0.3869577646255493, + "learning_rate": 1.9136513071443944e-05, + "loss": 0.1404, + "step": 34638 + }, + { + "epoch": 0.6178254200406663, + "grad_norm": 0.2926936447620392, + "learning_rate": 1.9134999991517445e-05, + "loss": 0.1289, + "step": 34639 + }, + { + "epoch": 0.61784325616238, + "grad_norm": 0.2502843737602234, + "learning_rate": 1.9133486934325704e-05, + "loss": 0.1135, + "step": 34640 + }, + { + "epoch": 0.6178610922840937, + "grad_norm": 0.2720404863357544, + "learning_rate": 1.9131973899874605e-05, + "loss": 0.1298, + "step": 34641 + }, + { + "epoch": 0.6178789284058075, + "grad_norm": 0.3430882394313812, + "learning_rate": 1.913046088817e-05, + "loss": 0.1905, + "step": 34642 + }, + { + "epoch": 0.6178967645275212, + "grad_norm": 0.20249223709106445, + "learning_rate": 1.9128947899217754e-05, + "loss": 0.0725, + "step": 34643 + }, + { + "epoch": 0.6179146006492349, + "grad_norm": 0.23356664180755615, + "learning_rate": 1.9127434933023724e-05, + "loss": 0.1566, + "step": 34644 + }, + { + "epoch": 0.6179324367709486, + "grad_norm": 0.2784213125705719, + "learning_rate": 1.9125921989593797e-05, + "loss": 0.1495, + "step": 34645 + }, + { + "epoch": 0.6179502728926622, + "grad_norm": 0.2882097661495209, + "learning_rate": 1.9124409068933826e-05, + "loss": 0.1132, + "step": 34646 + }, + { + "epoch": 0.6179681090143759, + "grad_norm": 0.2580234706401825, + "learning_rate": 1.9122896171049673e-05, + "loss": 0.1635, + "step": 34647 + }, + { + "epoch": 0.6179859451360896, + "grad_norm": 0.3438928723335266, + "learning_rate": 1.9121383295947196e-05, + "loss": 0.2015, + "step": 34648 + }, + { + "epoch": 0.6180037812578033, + "grad_norm": 0.3683916926383972, + "learning_rate": 1.9119870443632277e-05, + "loss": 0.164, + "step": 34649 + }, + { + "epoch": 0.618021617379517, + "grad_norm": 0.4905417561531067, + "learning_rate": 1.9118357614110764e-05, + "loss": 0.15, + "step": 34650 + }, + { + "epoch": 0.6180394535012307, + "grad_norm": 0.22359319031238556, + "learning_rate": 1.9116844807388536e-05, + "loss": 0.117, + "step": 34651 + }, + { + "epoch": 0.6180572896229444, + "grad_norm": 0.29820236563682556, + "learning_rate": 1.911533202347145e-05, + "loss": 0.1605, + "step": 34652 + }, + { + "epoch": 0.6180751257446581, + "grad_norm": 0.35331961512565613, + "learning_rate": 1.9113819262365357e-05, + "loss": 0.101, + "step": 34653 + }, + { + "epoch": 0.6180929618663717, + "grad_norm": 0.29426342248916626, + "learning_rate": 1.9112306524076147e-05, + "loss": 0.122, + "step": 34654 + }, + { + "epoch": 0.6181107979880854, + "grad_norm": 0.3921533524990082, + "learning_rate": 1.9110793808609667e-05, + "loss": 0.1474, + "step": 34655 + }, + { + "epoch": 0.6181286341097991, + "grad_norm": 0.30939364433288574, + "learning_rate": 1.9109281115971782e-05, + "loss": 0.1759, + "step": 34656 + }, + { + "epoch": 0.6181464702315128, + "grad_norm": 0.34440159797668457, + "learning_rate": 1.9107768446168357e-05, + "loss": 0.1547, + "step": 34657 + }, + { + "epoch": 0.6181643063532265, + "grad_norm": 0.2829227149486542, + "learning_rate": 1.9106255799205256e-05, + "loss": 0.1298, + "step": 34658 + }, + { + "epoch": 0.6181821424749403, + "grad_norm": 0.29732856154441833, + "learning_rate": 1.910474317508835e-05, + "loss": 0.205, + "step": 34659 + }, + { + "epoch": 0.618199978596654, + "grad_norm": 0.2636476755142212, + "learning_rate": 1.9103230573823492e-05, + "loss": 0.122, + "step": 34660 + }, + { + "epoch": 0.6182178147183677, + "grad_norm": 0.2381862848997116, + "learning_rate": 1.9101717995416542e-05, + "loss": 0.1476, + "step": 34661 + }, + { + "epoch": 0.6182356508400814, + "grad_norm": 0.23479658365249634, + "learning_rate": 1.9100205439873382e-05, + "loss": 0.0886, + "step": 34662 + }, + { + "epoch": 0.618253486961795, + "grad_norm": 0.22317281365394592, + "learning_rate": 1.9098692907199858e-05, + "loss": 0.1242, + "step": 34663 + }, + { + "epoch": 0.6182713230835087, + "grad_norm": 0.3403395712375641, + "learning_rate": 1.9097180397401837e-05, + "loss": 0.1386, + "step": 34664 + }, + { + "epoch": 0.6182891592052224, + "grad_norm": 0.5015697479248047, + "learning_rate": 1.909566791048519e-05, + "loss": 0.1562, + "step": 34665 + }, + { + "epoch": 0.6183069953269361, + "grad_norm": 0.3411105275154114, + "learning_rate": 1.9094155446455757e-05, + "loss": 0.1174, + "step": 34666 + }, + { + "epoch": 0.6183248314486498, + "grad_norm": 0.27326858043670654, + "learning_rate": 1.9092643005319433e-05, + "loss": 0.1115, + "step": 34667 + }, + { + "epoch": 0.6183426675703635, + "grad_norm": 0.3388707637786865, + "learning_rate": 1.9091130587082066e-05, + "loss": 0.1288, + "step": 34668 + }, + { + "epoch": 0.6183605036920772, + "grad_norm": 0.27115598320961, + "learning_rate": 1.9089618191749513e-05, + "loss": 0.1243, + "step": 34669 + }, + { + "epoch": 0.6183783398137909, + "grad_norm": 0.22266532480716705, + "learning_rate": 1.9088105819327633e-05, + "loss": 0.1227, + "step": 34670 + }, + { + "epoch": 0.6183961759355046, + "grad_norm": 0.24149054288864136, + "learning_rate": 1.9086593469822307e-05, + "loss": 0.1492, + "step": 34671 + }, + { + "epoch": 0.6184140120572182, + "grad_norm": 0.20407485961914062, + "learning_rate": 1.9085081143239386e-05, + "loss": 0.1081, + "step": 34672 + }, + { + "epoch": 0.6184318481789319, + "grad_norm": 0.1755708009004593, + "learning_rate": 1.9083568839584733e-05, + "loss": 0.0737, + "step": 34673 + }, + { + "epoch": 0.6184496843006456, + "grad_norm": 0.19151778519153595, + "learning_rate": 1.90820565588642e-05, + "loss": 0.1032, + "step": 34674 + }, + { + "epoch": 0.6184675204223593, + "grad_norm": 0.25832992792129517, + "learning_rate": 1.9080544301083677e-05, + "loss": 0.1379, + "step": 34675 + }, + { + "epoch": 0.6184853565440731, + "grad_norm": 0.3052057921886444, + "learning_rate": 1.9079032066249004e-05, + "loss": 0.1275, + "step": 34676 + }, + { + "epoch": 0.6185031926657868, + "grad_norm": 0.2708165943622589, + "learning_rate": 1.9077519854366044e-05, + "loss": 0.1224, + "step": 34677 + }, + { + "epoch": 0.6185210287875005, + "grad_norm": 0.26803216338157654, + "learning_rate": 1.9076007665440665e-05, + "loss": 0.1075, + "step": 34678 + }, + { + "epoch": 0.6185388649092142, + "grad_norm": 0.2422078251838684, + "learning_rate": 1.907449549947872e-05, + "loss": 0.107, + "step": 34679 + }, + { + "epoch": 0.6185567010309279, + "grad_norm": 0.2929508686065674, + "learning_rate": 1.9072983356486086e-05, + "loss": 0.0941, + "step": 34680 + }, + { + "epoch": 0.6185745371526415, + "grad_norm": 0.2869585156440735, + "learning_rate": 1.9071471236468613e-05, + "loss": 0.116, + "step": 34681 + }, + { + "epoch": 0.6185923732743552, + "grad_norm": 0.3429265022277832, + "learning_rate": 1.9069959139432167e-05, + "loss": 0.1027, + "step": 34682 + }, + { + "epoch": 0.6186102093960689, + "grad_norm": 0.2660747766494751, + "learning_rate": 1.9068447065382596e-05, + "loss": 0.1154, + "step": 34683 + }, + { + "epoch": 0.6186280455177826, + "grad_norm": 0.23053212463855743, + "learning_rate": 1.9066935014325785e-05, + "loss": 0.1236, + "step": 34684 + }, + { + "epoch": 0.6186458816394963, + "grad_norm": 0.3426898419857025, + "learning_rate": 1.906542298626758e-05, + "loss": 0.1502, + "step": 34685 + }, + { + "epoch": 0.61866371776121, + "grad_norm": 0.19473044574260712, + "learning_rate": 1.9063910981213845e-05, + "loss": 0.0907, + "step": 34686 + }, + { + "epoch": 0.6186815538829237, + "grad_norm": 0.534480631351471, + "learning_rate": 1.9062398999170432e-05, + "loss": 0.174, + "step": 34687 + }, + { + "epoch": 0.6186993900046374, + "grad_norm": 0.2357155680656433, + "learning_rate": 1.9060887040143226e-05, + "loss": 0.1068, + "step": 34688 + }, + { + "epoch": 0.618717226126351, + "grad_norm": 0.238226518034935, + "learning_rate": 1.905937510413807e-05, + "loss": 0.1192, + "step": 34689 + }, + { + "epoch": 0.6187350622480647, + "grad_norm": 0.4326547086238861, + "learning_rate": 1.905786319116083e-05, + "loss": 0.1522, + "step": 34690 + }, + { + "epoch": 0.6187528983697784, + "grad_norm": 0.5117208361625671, + "learning_rate": 1.9056351301217356e-05, + "loss": 0.1849, + "step": 34691 + }, + { + "epoch": 0.6187707344914922, + "grad_norm": 0.39278343319892883, + "learning_rate": 1.9054839434313514e-05, + "loss": 0.1761, + "step": 34692 + }, + { + "epoch": 0.6187885706132059, + "grad_norm": 0.20553728938102722, + "learning_rate": 1.9053327590455178e-05, + "loss": 0.1125, + "step": 34693 + }, + { + "epoch": 0.6188064067349196, + "grad_norm": 0.2404250055551529, + "learning_rate": 1.9051815769648197e-05, + "loss": 0.1286, + "step": 34694 + }, + { + "epoch": 0.6188242428566333, + "grad_norm": 0.24899372458457947, + "learning_rate": 1.9050303971898432e-05, + "loss": 0.1014, + "step": 34695 + }, + { + "epoch": 0.618842078978347, + "grad_norm": 0.2878899872303009, + "learning_rate": 1.9048792197211738e-05, + "loss": 0.1042, + "step": 34696 + }, + { + "epoch": 0.6188599151000607, + "grad_norm": 0.21250182390213013, + "learning_rate": 1.9047280445593986e-05, + "loss": 0.1317, + "step": 34697 + }, + { + "epoch": 0.6188777512217744, + "grad_norm": 0.2949186861515045, + "learning_rate": 1.904576871705103e-05, + "loss": 0.126, + "step": 34698 + }, + { + "epoch": 0.618895587343488, + "grad_norm": 0.28523415327072144, + "learning_rate": 1.9044257011588734e-05, + "loss": 0.0839, + "step": 34699 + }, + { + "epoch": 0.6189134234652017, + "grad_norm": 0.24234086275100708, + "learning_rate": 1.9042745329212944e-05, + "loss": 0.1051, + "step": 34700 + }, + { + "epoch": 0.6189312595869154, + "grad_norm": 0.29429903626441956, + "learning_rate": 1.9041233669929542e-05, + "loss": 0.1268, + "step": 34701 + }, + { + "epoch": 0.6189490957086291, + "grad_norm": 0.24774394929409027, + "learning_rate": 1.9039722033744377e-05, + "loss": 0.1529, + "step": 34702 + }, + { + "epoch": 0.6189669318303428, + "grad_norm": 0.303415447473526, + "learning_rate": 1.903821042066331e-05, + "loss": 0.0951, + "step": 34703 + }, + { + "epoch": 0.6189847679520565, + "grad_norm": 0.36782971024513245, + "learning_rate": 1.903669883069219e-05, + "loss": 0.1009, + "step": 34704 + }, + { + "epoch": 0.6190026040737702, + "grad_norm": 0.539315402507782, + "learning_rate": 1.9035187263836883e-05, + "loss": 0.2632, + "step": 34705 + }, + { + "epoch": 0.6190204401954839, + "grad_norm": 0.28242090344429016, + "learning_rate": 1.9033675720103253e-05, + "loss": 0.149, + "step": 34706 + }, + { + "epoch": 0.6190382763171975, + "grad_norm": 0.2896815836429596, + "learning_rate": 1.903216419949716e-05, + "loss": 0.1433, + "step": 34707 + }, + { + "epoch": 0.6190561124389112, + "grad_norm": 0.22880099713802338, + "learning_rate": 1.9030652702024463e-05, + "loss": 0.0924, + "step": 34708 + }, + { + "epoch": 0.619073948560625, + "grad_norm": 0.23233024775981903, + "learning_rate": 1.9029141227691004e-05, + "loss": 0.108, + "step": 34709 + }, + { + "epoch": 0.6190917846823387, + "grad_norm": 0.25911736488342285, + "learning_rate": 1.902762977650267e-05, + "loss": 0.1474, + "step": 34710 + }, + { + "epoch": 0.6191096208040524, + "grad_norm": 0.33848893642425537, + "learning_rate": 1.9026118348465306e-05, + "loss": 0.1734, + "step": 34711 + }, + { + "epoch": 0.6191274569257661, + "grad_norm": 0.23676836490631104, + "learning_rate": 1.902460694358476e-05, + "loss": 0.0838, + "step": 34712 + }, + { + "epoch": 0.6191452930474798, + "grad_norm": 0.34058821201324463, + "learning_rate": 1.9023095561866906e-05, + "loss": 0.1424, + "step": 34713 + }, + { + "epoch": 0.6191631291691935, + "grad_norm": 0.32348740100860596, + "learning_rate": 1.9021584203317598e-05, + "loss": 0.1966, + "step": 34714 + }, + { + "epoch": 0.6191809652909072, + "grad_norm": 0.27467042207717896, + "learning_rate": 1.90200728679427e-05, + "loss": 0.1908, + "step": 34715 + }, + { + "epoch": 0.6191988014126208, + "grad_norm": 0.2522459030151367, + "learning_rate": 1.901856155574806e-05, + "loss": 0.1498, + "step": 34716 + }, + { + "epoch": 0.6192166375343345, + "grad_norm": 0.23489893972873688, + "learning_rate": 1.9017050266739543e-05, + "loss": 0.1081, + "step": 34717 + }, + { + "epoch": 0.6192344736560482, + "grad_norm": 0.2017030119895935, + "learning_rate": 1.9015539000923e-05, + "loss": 0.1348, + "step": 34718 + }, + { + "epoch": 0.6192523097777619, + "grad_norm": 0.2718386650085449, + "learning_rate": 1.90140277583043e-05, + "loss": 0.1295, + "step": 34719 + }, + { + "epoch": 0.6192701458994756, + "grad_norm": 0.39019495248794556, + "learning_rate": 1.9012516538889295e-05, + "loss": 0.2074, + "step": 34720 + }, + { + "epoch": 0.6192879820211893, + "grad_norm": 0.21726562082767487, + "learning_rate": 1.9011005342683847e-05, + "loss": 0.1016, + "step": 34721 + }, + { + "epoch": 0.619305818142903, + "grad_norm": 0.25415462255477905, + "learning_rate": 1.9009494169693796e-05, + "loss": 0.1318, + "step": 34722 + }, + { + "epoch": 0.6193236542646167, + "grad_norm": 0.2440441995859146, + "learning_rate": 1.9007983019925032e-05, + "loss": 0.0837, + "step": 34723 + }, + { + "epoch": 0.6193414903863304, + "grad_norm": 0.30083510279655457, + "learning_rate": 1.9006471893383392e-05, + "loss": 0.1415, + "step": 34724 + }, + { + "epoch": 0.619359326508044, + "grad_norm": 0.266088604927063, + "learning_rate": 1.9004960790074734e-05, + "loss": 0.1192, + "step": 34725 + }, + { + "epoch": 0.6193771626297578, + "grad_norm": 0.25652962923049927, + "learning_rate": 1.9003449710004916e-05, + "loss": 0.1444, + "step": 34726 + }, + { + "epoch": 0.6193949987514715, + "grad_norm": 0.26292935013771057, + "learning_rate": 1.90019386531798e-05, + "loss": 0.1206, + "step": 34727 + }, + { + "epoch": 0.6194128348731852, + "grad_norm": 0.31460875272750854, + "learning_rate": 1.9000427619605247e-05, + "loss": 0.1319, + "step": 34728 + }, + { + "epoch": 0.6194306709948989, + "grad_norm": 0.26876816153526306, + "learning_rate": 1.899891660928711e-05, + "loss": 0.149, + "step": 34729 + }, + { + "epoch": 0.6194485071166126, + "grad_norm": 0.3208983540534973, + "learning_rate": 1.899740562223124e-05, + "loss": 0.1117, + "step": 34730 + }, + { + "epoch": 0.6194663432383263, + "grad_norm": 0.2541704475879669, + "learning_rate": 1.899589465844349e-05, + "loss": 0.1593, + "step": 34731 + }, + { + "epoch": 0.61948417936004, + "grad_norm": 0.3421974778175354, + "learning_rate": 1.8994383717929736e-05, + "loss": 0.1915, + "step": 34732 + }, + { + "epoch": 0.6195020154817537, + "grad_norm": 0.3019033670425415, + "learning_rate": 1.8992872800695823e-05, + "loss": 0.15, + "step": 34733 + }, + { + "epoch": 0.6195198516034673, + "grad_norm": 0.26344621181488037, + "learning_rate": 1.8991361906747614e-05, + "loss": 0.1006, + "step": 34734 + }, + { + "epoch": 0.619537687725181, + "grad_norm": 0.35831519961357117, + "learning_rate": 1.898985103609095e-05, + "loss": 0.1487, + "step": 34735 + }, + { + "epoch": 0.6195555238468947, + "grad_norm": 0.28683534264564514, + "learning_rate": 1.8988340188731712e-05, + "loss": 0.1165, + "step": 34736 + }, + { + "epoch": 0.6195733599686084, + "grad_norm": 0.26843005418777466, + "learning_rate": 1.898682936467574e-05, + "loss": 0.1318, + "step": 34737 + }, + { + "epoch": 0.6195911960903221, + "grad_norm": 0.31570354104042053, + "learning_rate": 1.8985318563928897e-05, + "loss": 0.1489, + "step": 34738 + }, + { + "epoch": 0.6196090322120358, + "grad_norm": 0.3438796103000641, + "learning_rate": 1.8983807786497026e-05, + "loss": 0.1762, + "step": 34739 + }, + { + "epoch": 0.6196268683337495, + "grad_norm": 0.32528194785118103, + "learning_rate": 1.8982297032386005e-05, + "loss": 0.1045, + "step": 34740 + }, + { + "epoch": 0.6196447044554632, + "grad_norm": 0.3089931905269623, + "learning_rate": 1.898078630160167e-05, + "loss": 0.1323, + "step": 34741 + }, + { + "epoch": 0.6196625405771768, + "grad_norm": 0.4437181055545807, + "learning_rate": 1.8979275594149897e-05, + "loss": 0.1738, + "step": 34742 + }, + { + "epoch": 0.6196803766988906, + "grad_norm": 0.2528478801250458, + "learning_rate": 1.8977764910036526e-05, + "loss": 0.1181, + "step": 34743 + }, + { + "epoch": 0.6196982128206043, + "grad_norm": 0.23057478666305542, + "learning_rate": 1.897625424926741e-05, + "loss": 0.1161, + "step": 34744 + }, + { + "epoch": 0.619716048942318, + "grad_norm": 0.19571517407894135, + "learning_rate": 1.897474361184843e-05, + "loss": 0.1311, + "step": 34745 + }, + { + "epoch": 0.6197338850640317, + "grad_norm": 0.33104103803634644, + "learning_rate": 1.8973232997785416e-05, + "loss": 0.1542, + "step": 34746 + }, + { + "epoch": 0.6197517211857454, + "grad_norm": 0.20647558569908142, + "learning_rate": 1.897172240708423e-05, + "loss": 0.1247, + "step": 34747 + }, + { + "epoch": 0.6197695573074591, + "grad_norm": 0.23894667625427246, + "learning_rate": 1.8970211839750724e-05, + "loss": 0.1833, + "step": 34748 + }, + { + "epoch": 0.6197873934291728, + "grad_norm": 0.26335445046424866, + "learning_rate": 1.896870129579077e-05, + "loss": 0.1437, + "step": 34749 + }, + { + "epoch": 0.6198052295508865, + "grad_norm": 0.2467309683561325, + "learning_rate": 1.8967190775210214e-05, + "loss": 0.0854, + "step": 34750 + }, + { + "epoch": 0.6198230656726001, + "grad_norm": 0.2242477834224701, + "learning_rate": 1.896568027801491e-05, + "loss": 0.1147, + "step": 34751 + }, + { + "epoch": 0.6198409017943138, + "grad_norm": 0.22373074293136597, + "learning_rate": 1.89641698042107e-05, + "loss": 0.1186, + "step": 34752 + }, + { + "epoch": 0.6198587379160275, + "grad_norm": 0.2690976560115814, + "learning_rate": 1.896265935380347e-05, + "loss": 0.1225, + "step": 34753 + }, + { + "epoch": 0.6198765740377412, + "grad_norm": 0.35138142108917236, + "learning_rate": 1.8961148926799048e-05, + "loss": 0.1265, + "step": 34754 + }, + { + "epoch": 0.6198944101594549, + "grad_norm": 0.2890932261943817, + "learning_rate": 1.8959638523203305e-05, + "loss": 0.138, + "step": 34755 + }, + { + "epoch": 0.6199122462811686, + "grad_norm": 0.22307175397872925, + "learning_rate": 1.8958128143022086e-05, + "loss": 0.1057, + "step": 34756 + }, + { + "epoch": 0.6199300824028823, + "grad_norm": 0.35873791575431824, + "learning_rate": 1.895661778626124e-05, + "loss": 0.1605, + "step": 34757 + }, + { + "epoch": 0.619947918524596, + "grad_norm": 0.2750474512577057, + "learning_rate": 1.8955107452926643e-05, + "loss": 0.1413, + "step": 34758 + }, + { + "epoch": 0.6199657546463097, + "grad_norm": 0.22222664952278137, + "learning_rate": 1.895359714302414e-05, + "loss": 0.0822, + "step": 34759 + }, + { + "epoch": 0.6199835907680235, + "grad_norm": 0.35030174255371094, + "learning_rate": 1.8952086856559574e-05, + "loss": 0.119, + "step": 34760 + }, + { + "epoch": 0.6200014268897371, + "grad_norm": 0.24885420501232147, + "learning_rate": 1.8950576593538804e-05, + "loss": 0.0902, + "step": 34761 + }, + { + "epoch": 0.6200192630114508, + "grad_norm": 0.2903689444065094, + "learning_rate": 1.8949066353967694e-05, + "loss": 0.1727, + "step": 34762 + }, + { + "epoch": 0.6200370991331645, + "grad_norm": 0.17736518383026123, + "learning_rate": 1.89475561378521e-05, + "loss": 0.0853, + "step": 34763 + }, + { + "epoch": 0.6200549352548782, + "grad_norm": 0.22412770986557007, + "learning_rate": 1.8946045945197867e-05, + "loss": 0.1222, + "step": 34764 + }, + { + "epoch": 0.6200727713765919, + "grad_norm": 0.2723255157470703, + "learning_rate": 1.8944535776010834e-05, + "loss": 0.1546, + "step": 34765 + }, + { + "epoch": 0.6200906074983056, + "grad_norm": 0.2728655934333801, + "learning_rate": 1.894302563029689e-05, + "loss": 0.1712, + "step": 34766 + }, + { + "epoch": 0.6201084436200193, + "grad_norm": 0.267630398273468, + "learning_rate": 1.894151550806187e-05, + "loss": 0.1003, + "step": 34767 + }, + { + "epoch": 0.620126279741733, + "grad_norm": 0.30812835693359375, + "learning_rate": 1.8940005409311618e-05, + "loss": 0.1416, + "step": 34768 + }, + { + "epoch": 0.6201441158634466, + "grad_norm": 0.24239419400691986, + "learning_rate": 1.8938495334052008e-05, + "loss": 0.1337, + "step": 34769 + }, + { + "epoch": 0.6201619519851603, + "grad_norm": 0.21828460693359375, + "learning_rate": 1.8936985282288872e-05, + "loss": 0.0862, + "step": 34770 + }, + { + "epoch": 0.620179788106874, + "grad_norm": 0.2725733816623688, + "learning_rate": 1.8935475254028085e-05, + "loss": 0.1338, + "step": 34771 + }, + { + "epoch": 0.6201976242285877, + "grad_norm": 0.25624027848243713, + "learning_rate": 1.8933965249275494e-05, + "loss": 0.181, + "step": 34772 + }, + { + "epoch": 0.6202154603503014, + "grad_norm": 0.19563564658164978, + "learning_rate": 1.893245526803694e-05, + "loss": 0.1149, + "step": 34773 + }, + { + "epoch": 0.6202332964720151, + "grad_norm": 0.22358255088329315, + "learning_rate": 1.893094531031828e-05, + "loss": 0.117, + "step": 34774 + }, + { + "epoch": 0.6202511325937288, + "grad_norm": 0.212210550904274, + "learning_rate": 1.892943537612538e-05, + "loss": 0.149, + "step": 34775 + }, + { + "epoch": 0.6202689687154425, + "grad_norm": 0.2765887975692749, + "learning_rate": 1.892792546546409e-05, + "loss": 0.1296, + "step": 34776 + }, + { + "epoch": 0.6202868048371563, + "grad_norm": 0.3130955100059509, + "learning_rate": 1.8926415578340256e-05, + "loss": 0.1666, + "step": 34777 + }, + { + "epoch": 0.62030464095887, + "grad_norm": 0.3359980881214142, + "learning_rate": 1.8924905714759723e-05, + "loss": 0.1215, + "step": 34778 + }, + { + "epoch": 0.6203224770805836, + "grad_norm": 0.2572863698005676, + "learning_rate": 1.8923395874728365e-05, + "loss": 0.0559, + "step": 34779 + }, + { + "epoch": 0.6203403132022973, + "grad_norm": 0.32762399315834045, + "learning_rate": 1.892188605825202e-05, + "loss": 0.1947, + "step": 34780 + }, + { + "epoch": 0.620358149324011, + "grad_norm": 0.23259948194026947, + "learning_rate": 1.8920376265336544e-05, + "loss": 0.103, + "step": 34781 + }, + { + "epoch": 0.6203759854457247, + "grad_norm": 0.20898884534835815, + "learning_rate": 1.8918866495987792e-05, + "loss": 0.0944, + "step": 34782 + }, + { + "epoch": 0.6203938215674384, + "grad_norm": 0.18224264681339264, + "learning_rate": 1.8917356750211602e-05, + "loss": 0.0847, + "step": 34783 + }, + { + "epoch": 0.6204116576891521, + "grad_norm": 0.3516651391983032, + "learning_rate": 1.891584702801385e-05, + "loss": 0.143, + "step": 34784 + }, + { + "epoch": 0.6204294938108658, + "grad_norm": 0.5660027861595154, + "learning_rate": 1.8914337329400374e-05, + "loss": 0.169, + "step": 34785 + }, + { + "epoch": 0.6204473299325794, + "grad_norm": 0.23294174671173096, + "learning_rate": 1.891282765437703e-05, + "loss": 0.1123, + "step": 34786 + }, + { + "epoch": 0.6204651660542931, + "grad_norm": 0.22643277049064636, + "learning_rate": 1.891131800294966e-05, + "loss": 0.1207, + "step": 34787 + }, + { + "epoch": 0.6204830021760068, + "grad_norm": 0.2500450015068054, + "learning_rate": 1.890980837512413e-05, + "loss": 0.1193, + "step": 34788 + }, + { + "epoch": 0.6205008382977205, + "grad_norm": 0.24870911240577698, + "learning_rate": 1.890829877090629e-05, + "loss": 0.1241, + "step": 34789 + }, + { + "epoch": 0.6205186744194342, + "grad_norm": 0.27819690108299255, + "learning_rate": 1.8906789190301984e-05, + "loss": 0.1088, + "step": 34790 + }, + { + "epoch": 0.6205365105411479, + "grad_norm": 0.29976966977119446, + "learning_rate": 1.890527963331706e-05, + "loss": 0.1404, + "step": 34791 + }, + { + "epoch": 0.6205543466628616, + "grad_norm": 0.26283878087997437, + "learning_rate": 1.890377009995739e-05, + "loss": 0.1238, + "step": 34792 + }, + { + "epoch": 0.6205721827845754, + "grad_norm": 0.23070664703845978, + "learning_rate": 1.890226059022881e-05, + "loss": 0.1428, + "step": 34793 + }, + { + "epoch": 0.6205900189062891, + "grad_norm": 0.31828173995018005, + "learning_rate": 1.8900751104137178e-05, + "loss": 0.0978, + "step": 34794 + }, + { + "epoch": 0.6206078550280028, + "grad_norm": 0.269256055355072, + "learning_rate": 1.889924164168833e-05, + "loss": 0.101, + "step": 34795 + }, + { + "epoch": 0.6206256911497164, + "grad_norm": 0.2324729859828949, + "learning_rate": 1.8897732202888133e-05, + "loss": 0.1637, + "step": 34796 + }, + { + "epoch": 0.6206435272714301, + "grad_norm": 0.28690508008003235, + "learning_rate": 1.889622278774243e-05, + "loss": 0.1419, + "step": 34797 + }, + { + "epoch": 0.6206613633931438, + "grad_norm": 0.31229984760284424, + "learning_rate": 1.8894713396257086e-05, + "loss": 0.1487, + "step": 34798 + }, + { + "epoch": 0.6206791995148575, + "grad_norm": 0.309701532125473, + "learning_rate": 1.889320402843794e-05, + "loss": 0.1304, + "step": 34799 + }, + { + "epoch": 0.6206970356365712, + "grad_norm": 0.28423207998275757, + "learning_rate": 1.889169468429083e-05, + "loss": 0.1582, + "step": 34800 + }, + { + "epoch": 0.6207148717582849, + "grad_norm": 0.2632710337638855, + "learning_rate": 1.8890185363821637e-05, + "loss": 0.1378, + "step": 34801 + }, + { + "epoch": 0.6207327078799986, + "grad_norm": 0.2610422372817993, + "learning_rate": 1.888867606703619e-05, + "loss": 0.149, + "step": 34802 + }, + { + "epoch": 0.6207505440017123, + "grad_norm": 0.28421416878700256, + "learning_rate": 1.8887166793940344e-05, + "loss": 0.1641, + "step": 34803 + }, + { + "epoch": 0.620768380123426, + "grad_norm": 0.22791031002998352, + "learning_rate": 1.8885657544539945e-05, + "loss": 0.1239, + "step": 34804 + }, + { + "epoch": 0.6207862162451396, + "grad_norm": 0.273660272359848, + "learning_rate": 1.888414831884086e-05, + "loss": 0.1385, + "step": 34805 + }, + { + "epoch": 0.6208040523668533, + "grad_norm": 0.19438989460468292, + "learning_rate": 1.8882639116848928e-05, + "loss": 0.1135, + "step": 34806 + }, + { + "epoch": 0.620821888488567, + "grad_norm": 0.19421444833278656, + "learning_rate": 1.888112993857e-05, + "loss": 0.1068, + "step": 34807 + }, + { + "epoch": 0.6208397246102807, + "grad_norm": 0.21850250661373138, + "learning_rate": 1.8879620784009918e-05, + "loss": 0.0892, + "step": 34808 + }, + { + "epoch": 0.6208575607319944, + "grad_norm": 0.27259522676467896, + "learning_rate": 1.8878111653174544e-05, + "loss": 0.1675, + "step": 34809 + }, + { + "epoch": 0.6208753968537082, + "grad_norm": 0.23691260814666748, + "learning_rate": 1.887660254606972e-05, + "loss": 0.1078, + "step": 34810 + }, + { + "epoch": 0.6208932329754219, + "grad_norm": 0.2843174636363983, + "learning_rate": 1.8875093462701308e-05, + "loss": 0.1262, + "step": 34811 + }, + { + "epoch": 0.6209110690971356, + "grad_norm": 0.3329859972000122, + "learning_rate": 1.8873584403075144e-05, + "loss": 0.1885, + "step": 34812 + }, + { + "epoch": 0.6209289052188492, + "grad_norm": 0.25744545459747314, + "learning_rate": 1.8872075367197077e-05, + "loss": 0.1447, + "step": 34813 + }, + { + "epoch": 0.6209467413405629, + "grad_norm": 0.24773626029491425, + "learning_rate": 1.887056635507297e-05, + "loss": 0.1418, + "step": 34814 + }, + { + "epoch": 0.6209645774622766, + "grad_norm": 0.25728708505630493, + "learning_rate": 1.886905736670867e-05, + "loss": 0.1077, + "step": 34815 + }, + { + "epoch": 0.6209824135839903, + "grad_norm": 0.33350130915641785, + "learning_rate": 1.8867548402110013e-05, + "loss": 0.1396, + "step": 34816 + }, + { + "epoch": 0.621000249705704, + "grad_norm": 0.26106029748916626, + "learning_rate": 1.8866039461282856e-05, + "loss": 0.155, + "step": 34817 + }, + { + "epoch": 0.6210180858274177, + "grad_norm": 0.4445750415325165, + "learning_rate": 1.886453054423305e-05, + "loss": 0.1108, + "step": 34818 + }, + { + "epoch": 0.6210359219491314, + "grad_norm": 0.25004708766937256, + "learning_rate": 1.8863021650966446e-05, + "loss": 0.1409, + "step": 34819 + }, + { + "epoch": 0.6210537580708451, + "grad_norm": 0.1734635978937149, + "learning_rate": 1.886151278148889e-05, + "loss": 0.0696, + "step": 34820 + }, + { + "epoch": 0.6210715941925588, + "grad_norm": 0.28979402780532837, + "learning_rate": 1.8860003935806232e-05, + "loss": 0.1816, + "step": 34821 + }, + { + "epoch": 0.6210894303142724, + "grad_norm": 0.29880282282829285, + "learning_rate": 1.8858495113924304e-05, + "loss": 0.0975, + "step": 34822 + }, + { + "epoch": 0.6211072664359861, + "grad_norm": 0.29395395517349243, + "learning_rate": 1.8856986315848985e-05, + "loss": 0.1123, + "step": 34823 + }, + { + "epoch": 0.6211251025576998, + "grad_norm": 0.3052173852920532, + "learning_rate": 1.8855477541586103e-05, + "loss": 0.1338, + "step": 34824 + }, + { + "epoch": 0.6211429386794135, + "grad_norm": 0.23753082752227783, + "learning_rate": 1.8853968791141517e-05, + "loss": 0.1175, + "step": 34825 + }, + { + "epoch": 0.6211607748011272, + "grad_norm": 0.24740256369113922, + "learning_rate": 1.885246006452106e-05, + "loss": 0.1373, + "step": 34826 + }, + { + "epoch": 0.621178610922841, + "grad_norm": 0.2468625158071518, + "learning_rate": 1.8850951361730603e-05, + "loss": 0.1125, + "step": 34827 + }, + { + "epoch": 0.6211964470445547, + "grad_norm": 0.2553881108760834, + "learning_rate": 1.8849442682775984e-05, + "loss": 0.0977, + "step": 34828 + }, + { + "epoch": 0.6212142831662684, + "grad_norm": 0.1776779592037201, + "learning_rate": 1.884793402766304e-05, + "loss": 0.0435, + "step": 34829 + }, + { + "epoch": 0.621232119287982, + "grad_norm": 0.2639588713645935, + "learning_rate": 1.884642539639763e-05, + "loss": 0.1508, + "step": 34830 + }, + { + "epoch": 0.6212499554096957, + "grad_norm": 0.340195894241333, + "learning_rate": 1.8844916788985603e-05, + "loss": 0.1287, + "step": 34831 + }, + { + "epoch": 0.6212677915314094, + "grad_norm": 0.2531953752040863, + "learning_rate": 1.8843408205432807e-05, + "loss": 0.1626, + "step": 34832 + }, + { + "epoch": 0.6212856276531231, + "grad_norm": 0.25456300377845764, + "learning_rate": 1.884189964574509e-05, + "loss": 0.0749, + "step": 34833 + }, + { + "epoch": 0.6213034637748368, + "grad_norm": 0.26447778940200806, + "learning_rate": 1.8840391109928294e-05, + "loss": 0.0955, + "step": 34834 + }, + { + "epoch": 0.6213212998965505, + "grad_norm": 0.2910635471343994, + "learning_rate": 1.883888259798826e-05, + "loss": 0.1499, + "step": 34835 + }, + { + "epoch": 0.6213391360182642, + "grad_norm": 0.24692995846271515, + "learning_rate": 1.8837374109930856e-05, + "loss": 0.1165, + "step": 34836 + }, + { + "epoch": 0.6213569721399779, + "grad_norm": 0.28305307030677795, + "learning_rate": 1.8835865645761915e-05, + "loss": 0.1413, + "step": 34837 + }, + { + "epoch": 0.6213748082616916, + "grad_norm": 0.3302740752696991, + "learning_rate": 1.883435720548729e-05, + "loss": 0.1073, + "step": 34838 + }, + { + "epoch": 0.6213926443834052, + "grad_norm": 0.23480768501758575, + "learning_rate": 1.8832848789112816e-05, + "loss": 0.091, + "step": 34839 + }, + { + "epoch": 0.6214104805051189, + "grad_norm": 0.2914566397666931, + "learning_rate": 1.883134039664436e-05, + "loss": 0.1474, + "step": 34840 + }, + { + "epoch": 0.6214283166268326, + "grad_norm": 0.3203480839729309, + "learning_rate": 1.8829832028087756e-05, + "loss": 0.1548, + "step": 34841 + }, + { + "epoch": 0.6214461527485463, + "grad_norm": 0.2751144766807556, + "learning_rate": 1.8828323683448862e-05, + "loss": 0.1295, + "step": 34842 + }, + { + "epoch": 0.62146398887026, + "grad_norm": 0.25555625557899475, + "learning_rate": 1.8826815362733503e-05, + "loss": 0.1739, + "step": 34843 + }, + { + "epoch": 0.6214818249919738, + "grad_norm": 0.250133216381073, + "learning_rate": 1.882530706594755e-05, + "loss": 0.0985, + "step": 34844 + }, + { + "epoch": 0.6214996611136875, + "grad_norm": 0.30852439999580383, + "learning_rate": 1.8823798793096832e-05, + "loss": 0.2224, + "step": 34845 + }, + { + "epoch": 0.6215174972354012, + "grad_norm": 0.23542943596839905, + "learning_rate": 1.882229054418721e-05, + "loss": 0.0653, + "step": 34846 + }, + { + "epoch": 0.6215353333571149, + "grad_norm": 0.30950629711151123, + "learning_rate": 1.8820782319224524e-05, + "loss": 0.1665, + "step": 34847 + }, + { + "epoch": 0.6215531694788285, + "grad_norm": 0.3568834066390991, + "learning_rate": 1.8819274118214612e-05, + "loss": 0.0878, + "step": 34848 + }, + { + "epoch": 0.6215710056005422, + "grad_norm": 0.4003061056137085, + "learning_rate": 1.8817765941163335e-05, + "loss": 0.1161, + "step": 34849 + }, + { + "epoch": 0.6215888417222559, + "grad_norm": 0.2607308030128479, + "learning_rate": 1.8816257788076534e-05, + "loss": 0.1639, + "step": 34850 + }, + { + "epoch": 0.6216066778439696, + "grad_norm": 0.2799322307109833, + "learning_rate": 1.8814749658960047e-05, + "loss": 0.1577, + "step": 34851 + }, + { + "epoch": 0.6216245139656833, + "grad_norm": 0.22770659625530243, + "learning_rate": 1.8813241553819723e-05, + "loss": 0.1039, + "step": 34852 + }, + { + "epoch": 0.621642350087397, + "grad_norm": 0.26042523980140686, + "learning_rate": 1.8811733472661422e-05, + "loss": 0.1352, + "step": 34853 + }, + { + "epoch": 0.6216601862091107, + "grad_norm": 0.223334401845932, + "learning_rate": 1.881022541549098e-05, + "loss": 0.1454, + "step": 34854 + }, + { + "epoch": 0.6216780223308244, + "grad_norm": 0.30797743797302246, + "learning_rate": 1.880871738231424e-05, + "loss": 0.1655, + "step": 34855 + }, + { + "epoch": 0.621695858452538, + "grad_norm": 0.16385234892368317, + "learning_rate": 1.880720937313704e-05, + "loss": 0.0549, + "step": 34856 + }, + { + "epoch": 0.6217136945742517, + "grad_norm": 0.3494318425655365, + "learning_rate": 1.880570138796525e-05, + "loss": 0.166, + "step": 34857 + }, + { + "epoch": 0.6217315306959654, + "grad_norm": 0.18118543922901154, + "learning_rate": 1.880419342680469e-05, + "loss": 0.1032, + "step": 34858 + }, + { + "epoch": 0.6217493668176791, + "grad_norm": 0.3267197608947754, + "learning_rate": 1.8802685489661227e-05, + "loss": 0.1634, + "step": 34859 + }, + { + "epoch": 0.6217672029393928, + "grad_norm": 0.31090041995048523, + "learning_rate": 1.880117757654069e-05, + "loss": 0.1309, + "step": 34860 + }, + { + "epoch": 0.6217850390611066, + "grad_norm": 0.3254074454307556, + "learning_rate": 1.8799669687448922e-05, + "loss": 0.1998, + "step": 34861 + }, + { + "epoch": 0.6218028751828203, + "grad_norm": 0.25550737977027893, + "learning_rate": 1.8798161822391785e-05, + "loss": 0.1348, + "step": 34862 + }, + { + "epoch": 0.621820711304534, + "grad_norm": 0.25607091188430786, + "learning_rate": 1.8796653981375114e-05, + "loss": 0.115, + "step": 34863 + }, + { + "epoch": 0.6218385474262477, + "grad_norm": 0.19000785052776337, + "learning_rate": 1.8795146164404753e-05, + "loss": 0.0887, + "step": 34864 + }, + { + "epoch": 0.6218563835479614, + "grad_norm": 0.23918886482715607, + "learning_rate": 1.8793638371486546e-05, + "loss": 0.1412, + "step": 34865 + }, + { + "epoch": 0.621874219669675, + "grad_norm": 0.30615827441215515, + "learning_rate": 1.879213060262634e-05, + "loss": 0.118, + "step": 34866 + }, + { + "epoch": 0.6218920557913887, + "grad_norm": 0.22328411042690277, + "learning_rate": 1.8790622857829988e-05, + "loss": 0.0934, + "step": 34867 + }, + { + "epoch": 0.6219098919131024, + "grad_norm": 0.24728624522686005, + "learning_rate": 1.878911513710332e-05, + "loss": 0.1477, + "step": 34868 + }, + { + "epoch": 0.6219277280348161, + "grad_norm": 0.27808111906051636, + "learning_rate": 1.8787607440452185e-05, + "loss": 0.128, + "step": 34869 + }, + { + "epoch": 0.6219455641565298, + "grad_norm": 0.2123926728963852, + "learning_rate": 1.8786099767882435e-05, + "loss": 0.122, + "step": 34870 + }, + { + "epoch": 0.6219634002782435, + "grad_norm": 0.3079756796360016, + "learning_rate": 1.8784592119399907e-05, + "loss": 0.1453, + "step": 34871 + }, + { + "epoch": 0.6219812363999572, + "grad_norm": 0.2823425233364105, + "learning_rate": 1.8783084495010445e-05, + "loss": 0.1041, + "step": 34872 + }, + { + "epoch": 0.6219990725216709, + "grad_norm": 0.2872985303401947, + "learning_rate": 1.8781576894719893e-05, + "loss": 0.1634, + "step": 34873 + }, + { + "epoch": 0.6220169086433845, + "grad_norm": 0.27197402715682983, + "learning_rate": 1.8780069318534097e-05, + "loss": 0.1363, + "step": 34874 + }, + { + "epoch": 0.6220347447650982, + "grad_norm": 0.2895960509777069, + "learning_rate": 1.8778561766458908e-05, + "loss": 0.0938, + "step": 34875 + }, + { + "epoch": 0.6220525808868119, + "grad_norm": 0.2363155335187912, + "learning_rate": 1.877705423850016e-05, + "loss": 0.1107, + "step": 34876 + }, + { + "epoch": 0.6220704170085256, + "grad_norm": 0.31056854128837585, + "learning_rate": 1.87755467346637e-05, + "loss": 0.0787, + "step": 34877 + }, + { + "epoch": 0.6220882531302394, + "grad_norm": 0.22825270891189575, + "learning_rate": 1.877403925495536e-05, + "loss": 0.0931, + "step": 34878 + }, + { + "epoch": 0.6221060892519531, + "grad_norm": 0.29031768441200256, + "learning_rate": 1.8772531799380996e-05, + "loss": 0.0996, + "step": 34879 + }, + { + "epoch": 0.6221239253736668, + "grad_norm": 0.20994466543197632, + "learning_rate": 1.8771024367946456e-05, + "loss": 0.1589, + "step": 34880 + }, + { + "epoch": 0.6221417614953805, + "grad_norm": 0.2341243028640747, + "learning_rate": 1.8769516960657583e-05, + "loss": 0.1213, + "step": 34881 + }, + { + "epoch": 0.6221595976170942, + "grad_norm": 0.2786446213722229, + "learning_rate": 1.8768009577520198e-05, + "loss": 0.156, + "step": 34882 + }, + { + "epoch": 0.6221774337388079, + "grad_norm": 0.2707771062850952, + "learning_rate": 1.876650221854017e-05, + "loss": 0.1383, + "step": 34883 + }, + { + "epoch": 0.6221952698605215, + "grad_norm": 0.2270188182592392, + "learning_rate": 1.8764994883723336e-05, + "loss": 0.1229, + "step": 34884 + }, + { + "epoch": 0.6222131059822352, + "grad_norm": 0.35651758313179016, + "learning_rate": 1.876348757307553e-05, + "loss": 0.1828, + "step": 34885 + }, + { + "epoch": 0.6222309421039489, + "grad_norm": 0.3626936078071594, + "learning_rate": 1.87619802866026e-05, + "loss": 0.1239, + "step": 34886 + }, + { + "epoch": 0.6222487782256626, + "grad_norm": 0.20958806574344635, + "learning_rate": 1.8760473024310388e-05, + "loss": 0.1031, + "step": 34887 + }, + { + "epoch": 0.6222666143473763, + "grad_norm": 0.23583637177944183, + "learning_rate": 1.8758965786204742e-05, + "loss": 0.1431, + "step": 34888 + }, + { + "epoch": 0.62228445046909, + "grad_norm": 0.2802148163318634, + "learning_rate": 1.8757458572291502e-05, + "loss": 0.1099, + "step": 34889 + }, + { + "epoch": 0.6223022865908037, + "grad_norm": 0.2654156982898712, + "learning_rate": 1.875595138257651e-05, + "loss": 0.1322, + "step": 34890 + }, + { + "epoch": 0.6223201227125174, + "grad_norm": 0.24719032645225525, + "learning_rate": 1.875444421706559e-05, + "loss": 0.1392, + "step": 34891 + }, + { + "epoch": 0.622337958834231, + "grad_norm": 0.39449170231819153, + "learning_rate": 1.8752937075764616e-05, + "loss": 0.1808, + "step": 34892 + }, + { + "epoch": 0.6223557949559447, + "grad_norm": 0.19806192815303802, + "learning_rate": 1.8751429958679412e-05, + "loss": 0.1355, + "step": 34893 + }, + { + "epoch": 0.6223736310776585, + "grad_norm": 0.3297383189201355, + "learning_rate": 1.8749922865815827e-05, + "loss": 0.0947, + "step": 34894 + }, + { + "epoch": 0.6223914671993722, + "grad_norm": 0.29932481050491333, + "learning_rate": 1.874841579717969e-05, + "loss": 0.1673, + "step": 34895 + }, + { + "epoch": 0.6224093033210859, + "grad_norm": 0.2314309924840927, + "learning_rate": 1.8746908752776866e-05, + "loss": 0.1324, + "step": 34896 + }, + { + "epoch": 0.6224271394427996, + "grad_norm": 0.3002726435661316, + "learning_rate": 1.8745401732613177e-05, + "loss": 0.1588, + "step": 34897 + }, + { + "epoch": 0.6224449755645133, + "grad_norm": 0.2145930677652359, + "learning_rate": 1.8743894736694477e-05, + "loss": 0.1071, + "step": 34898 + }, + { + "epoch": 0.622462811686227, + "grad_norm": 0.259424090385437, + "learning_rate": 1.8742387765026588e-05, + "loss": 0.1705, + "step": 34899 + }, + { + "epoch": 0.6224806478079407, + "grad_norm": 0.2604461908340454, + "learning_rate": 1.8740880817615375e-05, + "loss": 0.1346, + "step": 34900 + }, + { + "epoch": 0.6224984839296543, + "grad_norm": 0.34902670979499817, + "learning_rate": 1.8739373894466665e-05, + "loss": 0.1103, + "step": 34901 + }, + { + "epoch": 0.622516320051368, + "grad_norm": 0.3610472083091736, + "learning_rate": 1.8737866995586313e-05, + "loss": 0.1597, + "step": 34902 + }, + { + "epoch": 0.6225341561730817, + "grad_norm": 0.2789466083049774, + "learning_rate": 1.8736360120980147e-05, + "loss": 0.1109, + "step": 34903 + }, + { + "epoch": 0.6225519922947954, + "grad_norm": 0.26244744658470154, + "learning_rate": 1.8734853270654004e-05, + "loss": 0.1084, + "step": 34904 + }, + { + "epoch": 0.6225698284165091, + "grad_norm": 0.22947083413600922, + "learning_rate": 1.8733346444613745e-05, + "loss": 0.1513, + "step": 34905 + }, + { + "epoch": 0.6225876645382228, + "grad_norm": 0.24667462706565857, + "learning_rate": 1.8731839642865192e-05, + "loss": 0.12, + "step": 34906 + }, + { + "epoch": 0.6226055006599365, + "grad_norm": 0.26830005645751953, + "learning_rate": 1.8730332865414203e-05, + "loss": 0.1923, + "step": 34907 + }, + { + "epoch": 0.6226233367816502, + "grad_norm": 0.3659855127334595, + "learning_rate": 1.8728826112266594e-05, + "loss": 0.113, + "step": 34908 + }, + { + "epoch": 0.6226411729033638, + "grad_norm": 0.2905159294605255, + "learning_rate": 1.8727319383428232e-05, + "loss": 0.1378, + "step": 34909 + }, + { + "epoch": 0.6226590090250775, + "grad_norm": 0.2375769168138504, + "learning_rate": 1.8725812678904946e-05, + "loss": 0.1017, + "step": 34910 + }, + { + "epoch": 0.6226768451467913, + "grad_norm": 0.23519118130207062, + "learning_rate": 1.8724305998702582e-05, + "loss": 0.1031, + "step": 34911 + }, + { + "epoch": 0.622694681268505, + "grad_norm": 0.2780896723270416, + "learning_rate": 1.8722799342826963e-05, + "loss": 0.1167, + "step": 34912 + }, + { + "epoch": 0.6227125173902187, + "grad_norm": 0.2808263599872589, + "learning_rate": 1.872129271128395e-05, + "loss": 0.1177, + "step": 34913 + }, + { + "epoch": 0.6227303535119324, + "grad_norm": 0.23439735174179077, + "learning_rate": 1.871978610407937e-05, + "loss": 0.095, + "step": 34914 + }, + { + "epoch": 0.6227481896336461, + "grad_norm": 0.3182724118232727, + "learning_rate": 1.8718279521219077e-05, + "loss": 0.1321, + "step": 34915 + }, + { + "epoch": 0.6227660257553598, + "grad_norm": 0.272592693567276, + "learning_rate": 1.87167729627089e-05, + "loss": 0.1439, + "step": 34916 + }, + { + "epoch": 0.6227838618770735, + "grad_norm": 0.29960349202156067, + "learning_rate": 1.8715266428554667e-05, + "loss": 0.0881, + "step": 34917 + }, + { + "epoch": 0.6228016979987872, + "grad_norm": 0.22677046060562134, + "learning_rate": 1.8713759918762247e-05, + "loss": 0.1142, + "step": 34918 + }, + { + "epoch": 0.6228195341205008, + "grad_norm": 0.39995241165161133, + "learning_rate": 1.8712253433337463e-05, + "loss": 0.1468, + "step": 34919 + }, + { + "epoch": 0.6228373702422145, + "grad_norm": 0.3321211636066437, + "learning_rate": 1.871074697228615e-05, + "loss": 0.1585, + "step": 34920 + }, + { + "epoch": 0.6228552063639282, + "grad_norm": 0.3118199110031128, + "learning_rate": 1.870924053561416e-05, + "loss": 0.1265, + "step": 34921 + }, + { + "epoch": 0.6228730424856419, + "grad_norm": 0.22003091871738434, + "learning_rate": 1.8707734123327323e-05, + "loss": 0.1207, + "step": 34922 + }, + { + "epoch": 0.6228908786073556, + "grad_norm": 0.28756463527679443, + "learning_rate": 1.8706227735431485e-05, + "loss": 0.1358, + "step": 34923 + }, + { + "epoch": 0.6229087147290693, + "grad_norm": 0.27470022439956665, + "learning_rate": 1.8704721371932484e-05, + "loss": 0.1706, + "step": 34924 + }, + { + "epoch": 0.622926550850783, + "grad_norm": 0.24368305504322052, + "learning_rate": 1.8703215032836145e-05, + "loss": 0.1467, + "step": 34925 + }, + { + "epoch": 0.6229443869724967, + "grad_norm": 0.40861910581588745, + "learning_rate": 1.8701708718148332e-05, + "loss": 0.2057, + "step": 34926 + }, + { + "epoch": 0.6229622230942103, + "grad_norm": 0.2805299460887909, + "learning_rate": 1.8700202427874868e-05, + "loss": 0.1248, + "step": 34927 + }, + { + "epoch": 0.6229800592159241, + "grad_norm": 0.2974446415901184, + "learning_rate": 1.8698696162021594e-05, + "loss": 0.1369, + "step": 34928 + }, + { + "epoch": 0.6229978953376378, + "grad_norm": 0.22959685325622559, + "learning_rate": 1.8697189920594355e-05, + "loss": 0.1481, + "step": 34929 + }, + { + "epoch": 0.6230157314593515, + "grad_norm": 0.21923013031482697, + "learning_rate": 1.8695683703598975e-05, + "loss": 0.09, + "step": 34930 + }, + { + "epoch": 0.6230335675810652, + "grad_norm": 0.30453383922576904, + "learning_rate": 1.869417751104131e-05, + "loss": 0.1414, + "step": 34931 + }, + { + "epoch": 0.6230514037027789, + "grad_norm": 0.20010130107402802, + "learning_rate": 1.8692671342927193e-05, + "loss": 0.1095, + "step": 34932 + }, + { + "epoch": 0.6230692398244926, + "grad_norm": 0.35191333293914795, + "learning_rate": 1.869116519926245e-05, + "loss": 0.1498, + "step": 34933 + }, + { + "epoch": 0.6230870759462063, + "grad_norm": 0.23263229429721832, + "learning_rate": 1.8689659080052934e-05, + "loss": 0.122, + "step": 34934 + }, + { + "epoch": 0.62310491206792, + "grad_norm": 0.2575501501560211, + "learning_rate": 1.868815298530448e-05, + "loss": 0.1763, + "step": 34935 + }, + { + "epoch": 0.6231227481896336, + "grad_norm": 0.31331855058670044, + "learning_rate": 1.8686646915022927e-05, + "loss": 0.1784, + "step": 34936 + }, + { + "epoch": 0.6231405843113473, + "grad_norm": 0.19131147861480713, + "learning_rate": 1.8685140869214115e-05, + "loss": 0.0875, + "step": 34937 + }, + { + "epoch": 0.623158420433061, + "grad_norm": 0.2755252420902252, + "learning_rate": 1.8683634847883865e-05, + "loss": 0.1507, + "step": 34938 + }, + { + "epoch": 0.6231762565547747, + "grad_norm": 0.22289063036441803, + "learning_rate": 1.868212885103804e-05, + "loss": 0.1119, + "step": 34939 + }, + { + "epoch": 0.6231940926764884, + "grad_norm": 0.26629912853240967, + "learning_rate": 1.8680622878682464e-05, + "loss": 0.1095, + "step": 34940 + }, + { + "epoch": 0.6232119287982021, + "grad_norm": 0.3054090142250061, + "learning_rate": 1.8679116930822974e-05, + "loss": 0.1474, + "step": 34941 + }, + { + "epoch": 0.6232297649199158, + "grad_norm": 0.2053171694278717, + "learning_rate": 1.8677611007465412e-05, + "loss": 0.1046, + "step": 34942 + }, + { + "epoch": 0.6232476010416295, + "grad_norm": 0.235662043094635, + "learning_rate": 1.8676105108615603e-05, + "loss": 0.1348, + "step": 34943 + }, + { + "epoch": 0.6232654371633432, + "grad_norm": 0.3221290409564972, + "learning_rate": 1.867459923427941e-05, + "loss": 0.1563, + "step": 34944 + }, + { + "epoch": 0.623283273285057, + "grad_norm": 0.28004884719848633, + "learning_rate": 1.8673093384462647e-05, + "loss": 0.1237, + "step": 34945 + }, + { + "epoch": 0.6233011094067706, + "grad_norm": 0.3150613307952881, + "learning_rate": 1.8671587559171165e-05, + "loss": 0.1434, + "step": 34946 + }, + { + "epoch": 0.6233189455284843, + "grad_norm": 0.22993752360343933, + "learning_rate": 1.8670081758410784e-05, + "loss": 0.111, + "step": 34947 + }, + { + "epoch": 0.623336781650198, + "grad_norm": 0.2804214358329773, + "learning_rate": 1.8668575982187365e-05, + "loss": 0.1196, + "step": 34948 + }, + { + "epoch": 0.6233546177719117, + "grad_norm": 0.21633709967136383, + "learning_rate": 1.8667070230506722e-05, + "loss": 0.1325, + "step": 34949 + }, + { + "epoch": 0.6233724538936254, + "grad_norm": 0.28193071484565735, + "learning_rate": 1.866556450337471e-05, + "loss": 0.1273, + "step": 34950 + }, + { + "epoch": 0.6233902900153391, + "grad_norm": 0.19965648651123047, + "learning_rate": 1.8664058800797147e-05, + "loss": 0.1113, + "step": 34951 + }, + { + "epoch": 0.6234081261370528, + "grad_norm": 0.25982069969177246, + "learning_rate": 1.866255312277989e-05, + "loss": 0.1717, + "step": 34952 + }, + { + "epoch": 0.6234259622587665, + "grad_norm": 0.307998389005661, + "learning_rate": 1.8661047469328767e-05, + "loss": 0.1556, + "step": 34953 + }, + { + "epoch": 0.6234437983804801, + "grad_norm": 0.2638751268386841, + "learning_rate": 1.8659541840449616e-05, + "loss": 0.0961, + "step": 34954 + }, + { + "epoch": 0.6234616345021938, + "grad_norm": 0.2946605682373047, + "learning_rate": 1.8658036236148264e-05, + "loss": 0.1414, + "step": 34955 + }, + { + "epoch": 0.6234794706239075, + "grad_norm": 0.27083146572113037, + "learning_rate": 1.8656530656430546e-05, + "loss": 0.1483, + "step": 34956 + }, + { + "epoch": 0.6234973067456212, + "grad_norm": 0.26402199268341064, + "learning_rate": 1.865502510130232e-05, + "loss": 0.1093, + "step": 34957 + }, + { + "epoch": 0.6235151428673349, + "grad_norm": 0.29530850052833557, + "learning_rate": 1.8653519570769406e-05, + "loss": 0.1031, + "step": 34958 + }, + { + "epoch": 0.6235329789890486, + "grad_norm": 0.21409687399864197, + "learning_rate": 1.8652014064837643e-05, + "loss": 0.0773, + "step": 34959 + }, + { + "epoch": 0.6235508151107623, + "grad_norm": 0.29825323820114136, + "learning_rate": 1.8650508583512855e-05, + "loss": 0.1405, + "step": 34960 + }, + { + "epoch": 0.623568651232476, + "grad_norm": 0.2612580955028534, + "learning_rate": 1.86490031268009e-05, + "loss": 0.1301, + "step": 34961 + }, + { + "epoch": 0.6235864873541898, + "grad_norm": 0.280009925365448, + "learning_rate": 1.8647497694707593e-05, + "loss": 0.1037, + "step": 34962 + }, + { + "epoch": 0.6236043234759034, + "grad_norm": 0.2956874966621399, + "learning_rate": 1.8645992287238788e-05, + "loss": 0.1327, + "step": 34963 + }, + { + "epoch": 0.6236221595976171, + "grad_norm": 0.29175421595573425, + "learning_rate": 1.8644486904400306e-05, + "loss": 0.1531, + "step": 34964 + }, + { + "epoch": 0.6236399957193308, + "grad_norm": 0.24473018944263458, + "learning_rate": 1.864298154619799e-05, + "loss": 0.1146, + "step": 34965 + }, + { + "epoch": 0.6236578318410445, + "grad_norm": 0.22427351772785187, + "learning_rate": 1.8641476212637676e-05, + "loss": 0.1708, + "step": 34966 + }, + { + "epoch": 0.6236756679627582, + "grad_norm": 0.26586952805519104, + "learning_rate": 1.8639970903725197e-05, + "loss": 0.0959, + "step": 34967 + }, + { + "epoch": 0.6236935040844719, + "grad_norm": 0.24371206760406494, + "learning_rate": 1.8638465619466384e-05, + "loss": 0.0958, + "step": 34968 + }, + { + "epoch": 0.6237113402061856, + "grad_norm": 0.2460588812828064, + "learning_rate": 1.8636960359867072e-05, + "loss": 0.1325, + "step": 34969 + }, + { + "epoch": 0.6237291763278993, + "grad_norm": 0.22553405165672302, + "learning_rate": 1.8635455124933102e-05, + "loss": 0.0797, + "step": 34970 + }, + { + "epoch": 0.623747012449613, + "grad_norm": 0.36743104457855225, + "learning_rate": 1.8633949914670312e-05, + "loss": 0.2188, + "step": 34971 + }, + { + "epoch": 0.6237648485713266, + "grad_norm": 0.23060420155525208, + "learning_rate": 1.863244472908453e-05, + "loss": 0.102, + "step": 34972 + }, + { + "epoch": 0.6237826846930403, + "grad_norm": 0.249001607298851, + "learning_rate": 1.8630939568181578e-05, + "loss": 0.1214, + "step": 34973 + }, + { + "epoch": 0.623800520814754, + "grad_norm": 0.23359011113643646, + "learning_rate": 1.862943443196732e-05, + "loss": 0.1445, + "step": 34974 + }, + { + "epoch": 0.6238183569364677, + "grad_norm": 0.24766728281974792, + "learning_rate": 1.8627929320447568e-05, + "loss": 0.1357, + "step": 34975 + }, + { + "epoch": 0.6238361930581814, + "grad_norm": 0.20163387060165405, + "learning_rate": 1.862642423362816e-05, + "loss": 0.1431, + "step": 34976 + }, + { + "epoch": 0.6238540291798951, + "grad_norm": 0.250692218542099, + "learning_rate": 1.8624919171514936e-05, + "loss": 0.091, + "step": 34977 + }, + { + "epoch": 0.6238718653016088, + "grad_norm": 0.2954672574996948, + "learning_rate": 1.8623414134113725e-05, + "loss": 0.0856, + "step": 34978 + }, + { + "epoch": 0.6238897014233226, + "grad_norm": 0.24775518476963043, + "learning_rate": 1.8621909121430365e-05, + "loss": 0.1393, + "step": 34979 + }, + { + "epoch": 0.6239075375450363, + "grad_norm": 0.25005194544792175, + "learning_rate": 1.862040413347069e-05, + "loss": 0.0765, + "step": 34980 + }, + { + "epoch": 0.6239253736667499, + "grad_norm": 0.29128676652908325, + "learning_rate": 1.861889917024054e-05, + "loss": 0.1363, + "step": 34981 + }, + { + "epoch": 0.6239432097884636, + "grad_norm": 0.2110409438610077, + "learning_rate": 1.8617394231745723e-05, + "loss": 0.0768, + "step": 34982 + }, + { + "epoch": 0.6239610459101773, + "grad_norm": 0.21204902231693268, + "learning_rate": 1.86158893179921e-05, + "loss": 0.0861, + "step": 34983 + }, + { + "epoch": 0.623978882031891, + "grad_norm": 0.30694273114204407, + "learning_rate": 1.8614384428985493e-05, + "loss": 0.1528, + "step": 34984 + }, + { + "epoch": 0.6239967181536047, + "grad_norm": 0.28929030895233154, + "learning_rate": 1.8612879564731742e-05, + "loss": 0.1294, + "step": 34985 + }, + { + "epoch": 0.6240145542753184, + "grad_norm": 0.39789798855781555, + "learning_rate": 1.8611374725236662e-05, + "loss": 0.1603, + "step": 34986 + }, + { + "epoch": 0.6240323903970321, + "grad_norm": 0.35863196849823, + "learning_rate": 1.8609869910506118e-05, + "loss": 0.1511, + "step": 34987 + }, + { + "epoch": 0.6240502265187458, + "grad_norm": 0.2548728883266449, + "learning_rate": 1.8608365120545918e-05, + "loss": 0.1334, + "step": 34988 + }, + { + "epoch": 0.6240680626404594, + "grad_norm": 0.20638984441757202, + "learning_rate": 1.86068603553619e-05, + "loss": 0.1348, + "step": 34989 + }, + { + "epoch": 0.6240858987621731, + "grad_norm": 0.31494128704071045, + "learning_rate": 1.86053556149599e-05, + "loss": 0.11, + "step": 34990 + }, + { + "epoch": 0.6241037348838868, + "grad_norm": 0.25856077671051025, + "learning_rate": 1.860385089934575e-05, + "loss": 0.1582, + "step": 34991 + }, + { + "epoch": 0.6241215710056005, + "grad_norm": 0.27843621373176575, + "learning_rate": 1.8602346208525284e-05, + "loss": 0.1154, + "step": 34992 + }, + { + "epoch": 0.6241394071273142, + "grad_norm": 0.34963467717170715, + "learning_rate": 1.8600841542504337e-05, + "loss": 0.1559, + "step": 34993 + }, + { + "epoch": 0.6241572432490279, + "grad_norm": 0.2968690097332001, + "learning_rate": 1.859933690128874e-05, + "loss": 0.1018, + "step": 34994 + }, + { + "epoch": 0.6241750793707417, + "grad_norm": 0.28566789627075195, + "learning_rate": 1.8597832284884313e-05, + "loss": 0.0909, + "step": 34995 + }, + { + "epoch": 0.6241929154924554, + "grad_norm": 0.3561452031135559, + "learning_rate": 1.8596327693296912e-05, + "loss": 0.1413, + "step": 34996 + }, + { + "epoch": 0.6242107516141691, + "grad_norm": 0.25409865379333496, + "learning_rate": 1.8594823126532346e-05, + "loss": 0.106, + "step": 34997 + }, + { + "epoch": 0.6242285877358827, + "grad_norm": 0.25925424695014954, + "learning_rate": 1.8593318584596468e-05, + "loss": 0.1082, + "step": 34998 + }, + { + "epoch": 0.6242464238575964, + "grad_norm": 0.2867615520954132, + "learning_rate": 1.8591814067495084e-05, + "loss": 0.137, + "step": 34999 + }, + { + "epoch": 0.6242642599793101, + "grad_norm": 0.19471020996570587, + "learning_rate": 1.8590309575234056e-05, + "loss": 0.1151, + "step": 35000 + }, + { + "epoch": 0.6242642599793101, + "eval_loss": 0.12341565638780594, + "eval_runtime": 106.8107, + "eval_samples_per_second": 9.587, + "eval_steps_per_second": 1.601, + "step": 35000 + }, + { + "epoch": 0.6242820961010238, + "grad_norm": 0.26165613532066345, + "learning_rate": 1.85888051078192e-05, + "loss": 0.1512, + "step": 35001 + }, + { + "epoch": 0.6242999322227375, + "grad_norm": 0.3111760914325714, + "learning_rate": 1.858730066525635e-05, + "loss": 0.1654, + "step": 35002 + }, + { + "epoch": 0.6243177683444512, + "grad_norm": 0.28350284695625305, + "learning_rate": 1.858579624755133e-05, + "loss": 0.1646, + "step": 35003 + }, + { + "epoch": 0.6243356044661649, + "grad_norm": 0.292869508266449, + "learning_rate": 1.858429185470999e-05, + "loss": 0.1053, + "step": 35004 + }, + { + "epoch": 0.6243534405878786, + "grad_norm": 0.2753280997276306, + "learning_rate": 1.8582787486738144e-05, + "loss": 0.1348, + "step": 35005 + }, + { + "epoch": 0.6243712767095922, + "grad_norm": 0.2785457968711853, + "learning_rate": 1.8581283143641634e-05, + "loss": 0.1168, + "step": 35006 + }, + { + "epoch": 0.6243891128313059, + "grad_norm": 0.23466309905052185, + "learning_rate": 1.8579778825426286e-05, + "loss": 0.1285, + "step": 35007 + }, + { + "epoch": 0.6244069489530196, + "grad_norm": 0.26247933506965637, + "learning_rate": 1.857827453209793e-05, + "loss": 0.1088, + "step": 35008 + }, + { + "epoch": 0.6244247850747333, + "grad_norm": 0.24538198113441467, + "learning_rate": 1.8576770263662403e-05, + "loss": 0.0781, + "step": 35009 + }, + { + "epoch": 0.624442621196447, + "grad_norm": 0.2285107523202896, + "learning_rate": 1.857526602012553e-05, + "loss": 0.0909, + "step": 35010 + }, + { + "epoch": 0.6244604573181607, + "grad_norm": 0.3254603445529938, + "learning_rate": 1.8573761801493147e-05, + "loss": 0.132, + "step": 35011 + }, + { + "epoch": 0.6244782934398745, + "grad_norm": 0.26341378688812256, + "learning_rate": 1.8572257607771077e-05, + "loss": 0.0584, + "step": 35012 + }, + { + "epoch": 0.6244961295615882, + "grad_norm": 0.35393351316452026, + "learning_rate": 1.8570753438965162e-05, + "loss": 0.2023, + "step": 35013 + }, + { + "epoch": 0.6245139656833019, + "grad_norm": 0.22882148623466492, + "learning_rate": 1.8569249295081233e-05, + "loss": 0.0997, + "step": 35014 + }, + { + "epoch": 0.6245318018050156, + "grad_norm": 0.2795863747596741, + "learning_rate": 1.856774517612511e-05, + "loss": 0.1317, + "step": 35015 + }, + { + "epoch": 0.6245496379267292, + "grad_norm": 0.2590351998806, + "learning_rate": 1.856624108210262e-05, + "loss": 0.0999, + "step": 35016 + }, + { + "epoch": 0.6245674740484429, + "grad_norm": 0.2701863944530487, + "learning_rate": 1.856473701301961e-05, + "loss": 0.1984, + "step": 35017 + }, + { + "epoch": 0.6245853101701566, + "grad_norm": 0.279201477766037, + "learning_rate": 1.8563232968881902e-05, + "loss": 0.1296, + "step": 35018 + }, + { + "epoch": 0.6246031462918703, + "grad_norm": 0.3249446153640747, + "learning_rate": 1.8561728949695328e-05, + "loss": 0.1588, + "step": 35019 + }, + { + "epoch": 0.624620982413584, + "grad_norm": 0.24219025671482086, + "learning_rate": 1.856022495546572e-05, + "loss": 0.0987, + "step": 35020 + }, + { + "epoch": 0.6246388185352977, + "grad_norm": 0.2391340583562851, + "learning_rate": 1.855872098619889e-05, + "loss": 0.1008, + "step": 35021 + }, + { + "epoch": 0.6246566546570114, + "grad_norm": 0.352358877658844, + "learning_rate": 1.8557217041900693e-05, + "loss": 0.161, + "step": 35022 + }, + { + "epoch": 0.6246744907787251, + "grad_norm": 0.27890506386756897, + "learning_rate": 1.8555713122576945e-05, + "loss": 0.1332, + "step": 35023 + }, + { + "epoch": 0.6246923269004387, + "grad_norm": 0.26933708786964417, + "learning_rate": 1.855420922823348e-05, + "loss": 0.0986, + "step": 35024 + }, + { + "epoch": 0.6247101630221524, + "grad_norm": 0.2250606268644333, + "learning_rate": 1.855270535887612e-05, + "loss": 0.1238, + "step": 35025 + }, + { + "epoch": 0.6247279991438661, + "grad_norm": 0.27833935618400574, + "learning_rate": 1.8551201514510708e-05, + "loss": 0.1077, + "step": 35026 + }, + { + "epoch": 0.6247458352655798, + "grad_norm": 0.23978637158870697, + "learning_rate": 1.8549697695143065e-05, + "loss": 0.1192, + "step": 35027 + }, + { + "epoch": 0.6247636713872935, + "grad_norm": 0.283624529838562, + "learning_rate": 1.8548193900779025e-05, + "loss": 0.1359, + "step": 35028 + }, + { + "epoch": 0.6247815075090073, + "grad_norm": 0.3338007628917694, + "learning_rate": 1.85466901314244e-05, + "loss": 0.1951, + "step": 35029 + }, + { + "epoch": 0.624799343630721, + "grad_norm": 0.22621211409568787, + "learning_rate": 1.854518638708505e-05, + "loss": 0.1115, + "step": 35030 + }, + { + "epoch": 0.6248171797524347, + "grad_norm": 0.31063011288642883, + "learning_rate": 1.8543682667766783e-05, + "loss": 0.1844, + "step": 35031 + }, + { + "epoch": 0.6248350158741484, + "grad_norm": 0.34097880125045776, + "learning_rate": 1.8542178973475423e-05, + "loss": 0.2192, + "step": 35032 + }, + { + "epoch": 0.624852851995862, + "grad_norm": 0.3390691876411438, + "learning_rate": 1.8540675304216818e-05, + "loss": 0.1911, + "step": 35033 + }, + { + "epoch": 0.6248706881175757, + "grad_norm": 0.25483274459838867, + "learning_rate": 1.8539171659996774e-05, + "loss": 0.1235, + "step": 35034 + }, + { + "epoch": 0.6248885242392894, + "grad_norm": 0.2510347366333008, + "learning_rate": 1.8537668040821143e-05, + "loss": 0.166, + "step": 35035 + }, + { + "epoch": 0.6249063603610031, + "grad_norm": 0.21529719233512878, + "learning_rate": 1.8536164446695742e-05, + "loss": 0.0877, + "step": 35036 + }, + { + "epoch": 0.6249241964827168, + "grad_norm": 0.3016282320022583, + "learning_rate": 1.8534660877626396e-05, + "loss": 0.1286, + "step": 35037 + }, + { + "epoch": 0.6249420326044305, + "grad_norm": 0.3185059130191803, + "learning_rate": 1.853315733361894e-05, + "loss": 0.211, + "step": 35038 + }, + { + "epoch": 0.6249598687261442, + "grad_norm": 0.25188443064689636, + "learning_rate": 1.8531653814679195e-05, + "loss": 0.138, + "step": 35039 + }, + { + "epoch": 0.6249777048478579, + "grad_norm": 0.35887062549591064, + "learning_rate": 1.8530150320813e-05, + "loss": 0.0978, + "step": 35040 + }, + { + "epoch": 0.6249955409695716, + "grad_norm": 0.24346831440925598, + "learning_rate": 1.852864685202618e-05, + "loss": 0.1391, + "step": 35041 + }, + { + "epoch": 0.6250133770912852, + "grad_norm": 0.22355423867702484, + "learning_rate": 1.852714340832455e-05, + "loss": 0.1261, + "step": 35042 + }, + { + "epoch": 0.6250312132129989, + "grad_norm": 0.21421067416667938, + "learning_rate": 1.8525639989713954e-05, + "loss": 0.1313, + "step": 35043 + }, + { + "epoch": 0.6250490493347126, + "grad_norm": 0.3358588218688965, + "learning_rate": 1.8524136596200216e-05, + "loss": 0.1281, + "step": 35044 + }, + { + "epoch": 0.6250668854564263, + "grad_norm": 0.17502638697624207, + "learning_rate": 1.8522633227789153e-05, + "loss": 0.1153, + "step": 35045 + }, + { + "epoch": 0.6250847215781401, + "grad_norm": 0.36354413628578186, + "learning_rate": 1.852112988448661e-05, + "loss": 0.1676, + "step": 35046 + }, + { + "epoch": 0.6251025576998538, + "grad_norm": 0.2767677903175354, + "learning_rate": 1.8519626566298394e-05, + "loss": 0.1234, + "step": 35047 + }, + { + "epoch": 0.6251203938215675, + "grad_norm": 0.3142389953136444, + "learning_rate": 1.8518123273230353e-05, + "loss": 0.0912, + "step": 35048 + }, + { + "epoch": 0.6251382299432812, + "grad_norm": 0.27851995825767517, + "learning_rate": 1.8516620005288304e-05, + "loss": 0.1101, + "step": 35049 + }, + { + "epoch": 0.6251560660649949, + "grad_norm": 0.2035122662782669, + "learning_rate": 1.851511676247808e-05, + "loss": 0.0998, + "step": 35050 + }, + { + "epoch": 0.6251739021867085, + "grad_norm": 0.19163979589939117, + "learning_rate": 1.8513613544805487e-05, + "loss": 0.0824, + "step": 35051 + }, + { + "epoch": 0.6251917383084222, + "grad_norm": 0.236705482006073, + "learning_rate": 1.851211035227638e-05, + "loss": 0.1225, + "step": 35052 + }, + { + "epoch": 0.6252095744301359, + "grad_norm": 0.2939296066761017, + "learning_rate": 1.8510607184896573e-05, + "loss": 0.1062, + "step": 35053 + }, + { + "epoch": 0.6252274105518496, + "grad_norm": 0.2122703641653061, + "learning_rate": 1.8509104042671893e-05, + "loss": 0.1207, + "step": 35054 + }, + { + "epoch": 0.6252452466735633, + "grad_norm": 0.2184041440486908, + "learning_rate": 1.850760092560816e-05, + "loss": 0.1363, + "step": 35055 + }, + { + "epoch": 0.625263082795277, + "grad_norm": 0.209747314453125, + "learning_rate": 1.850609783371122e-05, + "loss": 0.1433, + "step": 35056 + }, + { + "epoch": 0.6252809189169907, + "grad_norm": 0.38966289162635803, + "learning_rate": 1.850459476698689e-05, + "loss": 0.1179, + "step": 35057 + }, + { + "epoch": 0.6252987550387044, + "grad_norm": 0.3571484088897705, + "learning_rate": 1.850309172544099e-05, + "loss": 0.1584, + "step": 35058 + }, + { + "epoch": 0.625316591160418, + "grad_norm": 0.28210461139678955, + "learning_rate": 1.8501588709079344e-05, + "loss": 0.1162, + "step": 35059 + }, + { + "epoch": 0.6253344272821317, + "grad_norm": 0.27694180607795715, + "learning_rate": 1.850008571790778e-05, + "loss": 0.0815, + "step": 35060 + }, + { + "epoch": 0.6253522634038454, + "grad_norm": 0.24310360848903656, + "learning_rate": 1.849858275193214e-05, + "loss": 0.1257, + "step": 35061 + }, + { + "epoch": 0.6253700995255591, + "grad_norm": 0.3024975061416626, + "learning_rate": 1.849707981115824e-05, + "loss": 0.1472, + "step": 35062 + }, + { + "epoch": 0.6253879356472729, + "grad_norm": 0.2642667591571808, + "learning_rate": 1.8495576895591903e-05, + "loss": 0.1404, + "step": 35063 + }, + { + "epoch": 0.6254057717689866, + "grad_norm": 0.3894912600517273, + "learning_rate": 1.8494074005238948e-05, + "loss": 0.0994, + "step": 35064 + }, + { + "epoch": 0.6254236078907003, + "grad_norm": 0.2190268486738205, + "learning_rate": 1.849257114010522e-05, + "loss": 0.1276, + "step": 35065 + }, + { + "epoch": 0.625441444012414, + "grad_norm": 0.5063741207122803, + "learning_rate": 1.8491068300196526e-05, + "loss": 0.1299, + "step": 35066 + }, + { + "epoch": 0.6254592801341277, + "grad_norm": 0.28464534878730774, + "learning_rate": 1.8489565485518707e-05, + "loss": 0.1105, + "step": 35067 + }, + { + "epoch": 0.6254771162558413, + "grad_norm": 0.37525391578674316, + "learning_rate": 1.8488062696077567e-05, + "loss": 0.0932, + "step": 35068 + }, + { + "epoch": 0.625494952377555, + "grad_norm": 0.23884116113185883, + "learning_rate": 1.848655993187896e-05, + "loss": 0.1127, + "step": 35069 + }, + { + "epoch": 0.6255127884992687, + "grad_norm": 0.27507731318473816, + "learning_rate": 1.8485057192928694e-05, + "loss": 0.1407, + "step": 35070 + }, + { + "epoch": 0.6255306246209824, + "grad_norm": 0.2676922082901001, + "learning_rate": 1.8483554479232594e-05, + "loss": 0.1569, + "step": 35071 + }, + { + "epoch": 0.6255484607426961, + "grad_norm": 0.24304607510566711, + "learning_rate": 1.8482051790796488e-05, + "loss": 0.1246, + "step": 35072 + }, + { + "epoch": 0.6255662968644098, + "grad_norm": 0.21580667793750763, + "learning_rate": 1.84805491276262e-05, + "loss": 0.1153, + "step": 35073 + }, + { + "epoch": 0.6255841329861235, + "grad_norm": 0.257205605506897, + "learning_rate": 1.847904648972755e-05, + "loss": 0.1021, + "step": 35074 + }, + { + "epoch": 0.6256019691078372, + "grad_norm": 0.2752120792865753, + "learning_rate": 1.847754387710638e-05, + "loss": 0.1256, + "step": 35075 + }, + { + "epoch": 0.6256198052295509, + "grad_norm": 0.2943366467952728, + "learning_rate": 1.8476041289768497e-05, + "loss": 0.1537, + "step": 35076 + }, + { + "epoch": 0.6256376413512645, + "grad_norm": 0.2767156660556793, + "learning_rate": 1.847453872771972e-05, + "loss": 0.1244, + "step": 35077 + }, + { + "epoch": 0.6256554774729782, + "grad_norm": 0.29404208064079285, + "learning_rate": 1.84730361909659e-05, + "loss": 0.1562, + "step": 35078 + }, + { + "epoch": 0.6256733135946919, + "grad_norm": 0.21938000619411469, + "learning_rate": 1.8471533679512844e-05, + "loss": 0.1348, + "step": 35079 + }, + { + "epoch": 0.6256911497164057, + "grad_norm": 0.25450754165649414, + "learning_rate": 1.8470031193366372e-05, + "loss": 0.0866, + "step": 35080 + }, + { + "epoch": 0.6257089858381194, + "grad_norm": 0.2514766454696655, + "learning_rate": 1.846852873253232e-05, + "loss": 0.1102, + "step": 35081 + }, + { + "epoch": 0.6257268219598331, + "grad_norm": 0.24066147208213806, + "learning_rate": 1.84670262970165e-05, + "loss": 0.1372, + "step": 35082 + }, + { + "epoch": 0.6257446580815468, + "grad_norm": 0.26582470536231995, + "learning_rate": 1.8465523886824747e-05, + "loss": 0.1094, + "step": 35083 + }, + { + "epoch": 0.6257624942032605, + "grad_norm": 0.2871421277523041, + "learning_rate": 1.8464021501962887e-05, + "loss": 0.1321, + "step": 35084 + }, + { + "epoch": 0.6257803303249742, + "grad_norm": 0.28860166668891907, + "learning_rate": 1.846251914243673e-05, + "loss": 0.1721, + "step": 35085 + }, + { + "epoch": 0.6257981664466878, + "grad_norm": 0.23838277161121368, + "learning_rate": 1.84610168082521e-05, + "loss": 0.1825, + "step": 35086 + }, + { + "epoch": 0.6258160025684015, + "grad_norm": 0.32682380080223083, + "learning_rate": 1.845951449941483e-05, + "loss": 0.1362, + "step": 35087 + }, + { + "epoch": 0.6258338386901152, + "grad_norm": 0.2525024712085724, + "learning_rate": 1.845801221593075e-05, + "loss": 0.1102, + "step": 35088 + }, + { + "epoch": 0.6258516748118289, + "grad_norm": 0.4192144274711609, + "learning_rate": 1.8456509957805673e-05, + "loss": 0.1279, + "step": 35089 + }, + { + "epoch": 0.6258695109335426, + "grad_norm": 0.23294328153133392, + "learning_rate": 1.8455007725045415e-05, + "loss": 0.1077, + "step": 35090 + }, + { + "epoch": 0.6258873470552563, + "grad_norm": 0.3153073787689209, + "learning_rate": 1.8453505517655813e-05, + "loss": 0.178, + "step": 35091 + }, + { + "epoch": 0.62590518317697, + "grad_norm": 0.2681513726711273, + "learning_rate": 1.8452003335642688e-05, + "loss": 0.1217, + "step": 35092 + }, + { + "epoch": 0.6259230192986837, + "grad_norm": 0.3369585871696472, + "learning_rate": 1.8450501179011853e-05, + "loss": 0.1649, + "step": 35093 + }, + { + "epoch": 0.6259408554203973, + "grad_norm": 0.2894989550113678, + "learning_rate": 1.8448999047769138e-05, + "loss": 0.1607, + "step": 35094 + }, + { + "epoch": 0.625958691542111, + "grad_norm": 0.28532910346984863, + "learning_rate": 1.8447496941920368e-05, + "loss": 0.1092, + "step": 35095 + }, + { + "epoch": 0.6259765276638248, + "grad_norm": 0.32344838976860046, + "learning_rate": 1.8445994861471362e-05, + "loss": 0.1416, + "step": 35096 + }, + { + "epoch": 0.6259943637855385, + "grad_norm": 0.30115872621536255, + "learning_rate": 1.844449280642795e-05, + "loss": 0.1702, + "step": 35097 + }, + { + "epoch": 0.6260121999072522, + "grad_norm": 0.21956495940685272, + "learning_rate": 1.8442990776795944e-05, + "loss": 0.1341, + "step": 35098 + }, + { + "epoch": 0.6260300360289659, + "grad_norm": 0.32527607679367065, + "learning_rate": 1.844148877258116e-05, + "loss": 0.147, + "step": 35099 + }, + { + "epoch": 0.6260478721506796, + "grad_norm": 0.2595917880535126, + "learning_rate": 1.8439986793789443e-05, + "loss": 0.179, + "step": 35100 + }, + { + "epoch": 0.6260657082723933, + "grad_norm": 0.34333574771881104, + "learning_rate": 1.84384848404266e-05, + "loss": 0.1131, + "step": 35101 + }, + { + "epoch": 0.626083544394107, + "grad_norm": 0.24040868878364563, + "learning_rate": 1.8436982912498457e-05, + "loss": 0.113, + "step": 35102 + }, + { + "epoch": 0.6261013805158206, + "grad_norm": 0.23016753792762756, + "learning_rate": 1.8435481010010826e-05, + "loss": 0.1142, + "step": 35103 + }, + { + "epoch": 0.6261192166375343, + "grad_norm": 0.2544845938682556, + "learning_rate": 1.843397913296955e-05, + "loss": 0.1258, + "step": 35104 + }, + { + "epoch": 0.626137052759248, + "grad_norm": 0.19867968559265137, + "learning_rate": 1.8432477281380436e-05, + "loss": 0.0618, + "step": 35105 + }, + { + "epoch": 0.6261548888809617, + "grad_norm": 0.2854829728603363, + "learning_rate": 1.843097545524931e-05, + "loss": 0.0984, + "step": 35106 + }, + { + "epoch": 0.6261727250026754, + "grad_norm": 0.23106160759925842, + "learning_rate": 1.842947365458198e-05, + "loss": 0.0763, + "step": 35107 + }, + { + "epoch": 0.6261905611243891, + "grad_norm": 0.2902849614620209, + "learning_rate": 1.842797187938429e-05, + "loss": 0.1153, + "step": 35108 + }, + { + "epoch": 0.6262083972461028, + "grad_norm": 0.3303748071193695, + "learning_rate": 1.842647012966205e-05, + "loss": 0.1457, + "step": 35109 + }, + { + "epoch": 0.6262262333678165, + "grad_norm": 0.29859596490859985, + "learning_rate": 1.8424968405421085e-05, + "loss": 0.1482, + "step": 35110 + }, + { + "epoch": 0.6262440694895302, + "grad_norm": 0.27811217308044434, + "learning_rate": 1.8423466706667215e-05, + "loss": 0.1441, + "step": 35111 + }, + { + "epoch": 0.6262619056112438, + "grad_norm": 0.22279682755470276, + "learning_rate": 1.8421965033406243e-05, + "loss": 0.1125, + "step": 35112 + }, + { + "epoch": 0.6262797417329576, + "grad_norm": 0.24229790270328522, + "learning_rate": 1.842046338564402e-05, + "loss": 0.1204, + "step": 35113 + }, + { + "epoch": 0.6262975778546713, + "grad_norm": 0.2506665289402008, + "learning_rate": 1.841896176338635e-05, + "loss": 0.085, + "step": 35114 + }, + { + "epoch": 0.626315413976385, + "grad_norm": 0.2284838855266571, + "learning_rate": 1.841746016663906e-05, + "loss": 0.0778, + "step": 35115 + }, + { + "epoch": 0.6263332500980987, + "grad_norm": 0.26494085788726807, + "learning_rate": 1.8415958595407963e-05, + "loss": 0.1126, + "step": 35116 + }, + { + "epoch": 0.6263510862198124, + "grad_norm": 0.25604185461997986, + "learning_rate": 1.841445704969889e-05, + "loss": 0.1066, + "step": 35117 + }, + { + "epoch": 0.6263689223415261, + "grad_norm": 0.21456775069236755, + "learning_rate": 1.8412955529517655e-05, + "loss": 0.0748, + "step": 35118 + }, + { + "epoch": 0.6263867584632398, + "grad_norm": 0.2448813021183014, + "learning_rate": 1.8411454034870082e-05, + "loss": 0.1535, + "step": 35119 + }, + { + "epoch": 0.6264045945849535, + "grad_norm": 0.4279812276363373, + "learning_rate": 1.840995256576198e-05, + "loss": 0.15, + "step": 35120 + }, + { + "epoch": 0.6264224307066671, + "grad_norm": 0.311682790517807, + "learning_rate": 1.8408451122199184e-05, + "loss": 0.1327, + "step": 35121 + }, + { + "epoch": 0.6264402668283808, + "grad_norm": 0.25895968079566956, + "learning_rate": 1.8406949704187504e-05, + "loss": 0.108, + "step": 35122 + }, + { + "epoch": 0.6264581029500945, + "grad_norm": 0.23475706577301025, + "learning_rate": 1.840544831173277e-05, + "loss": 0.1099, + "step": 35123 + }, + { + "epoch": 0.6264759390718082, + "grad_norm": 0.33879831433296204, + "learning_rate": 1.8403946944840798e-05, + "loss": 0.0652, + "step": 35124 + }, + { + "epoch": 0.6264937751935219, + "grad_norm": 0.2367793768644333, + "learning_rate": 1.8402445603517394e-05, + "loss": 0.1329, + "step": 35125 + }, + { + "epoch": 0.6265116113152356, + "grad_norm": 0.22501425445079803, + "learning_rate": 1.8400944287768397e-05, + "loss": 0.0997, + "step": 35126 + }, + { + "epoch": 0.6265294474369493, + "grad_norm": 0.2000752091407776, + "learning_rate": 1.8399442997599627e-05, + "loss": 0.0914, + "step": 35127 + }, + { + "epoch": 0.626547283558663, + "grad_norm": 0.22648796439170837, + "learning_rate": 1.8397941733016882e-05, + "loss": 0.0873, + "step": 35128 + }, + { + "epoch": 0.6265651196803766, + "grad_norm": 0.2413891702890396, + "learning_rate": 1.8396440494025998e-05, + "loss": 0.1018, + "step": 35129 + }, + { + "epoch": 0.6265829558020904, + "grad_norm": 0.2727953791618347, + "learning_rate": 1.8394939280632792e-05, + "loss": 0.0895, + "step": 35130 + }, + { + "epoch": 0.6266007919238041, + "grad_norm": 0.2796498239040375, + "learning_rate": 1.8393438092843088e-05, + "loss": 0.1154, + "step": 35131 + }, + { + "epoch": 0.6266186280455178, + "grad_norm": 0.32536715269088745, + "learning_rate": 1.83919369306627e-05, + "loss": 0.1517, + "step": 35132 + }, + { + "epoch": 0.6266364641672315, + "grad_norm": 0.26453447341918945, + "learning_rate": 1.8390435794097435e-05, + "loss": 0.1437, + "step": 35133 + }, + { + "epoch": 0.6266543002889452, + "grad_norm": 0.3098036050796509, + "learning_rate": 1.8388934683153135e-05, + "loss": 0.1176, + "step": 35134 + }, + { + "epoch": 0.6266721364106589, + "grad_norm": 0.29212474822998047, + "learning_rate": 1.8387433597835607e-05, + "loss": 0.119, + "step": 35135 + }, + { + "epoch": 0.6266899725323726, + "grad_norm": 0.30823978781700134, + "learning_rate": 1.8385932538150667e-05, + "loss": 0.0657, + "step": 35136 + }, + { + "epoch": 0.6267078086540863, + "grad_norm": 0.2584109902381897, + "learning_rate": 1.838443150410414e-05, + "loss": 0.1288, + "step": 35137 + }, + { + "epoch": 0.6267256447758, + "grad_norm": 0.24601887166500092, + "learning_rate": 1.8382930495701833e-05, + "loss": 0.1153, + "step": 35138 + }, + { + "epoch": 0.6267434808975136, + "grad_norm": 0.3031660318374634, + "learning_rate": 1.8381429512949583e-05, + "loss": 0.1733, + "step": 35139 + }, + { + "epoch": 0.6267613170192273, + "grad_norm": 0.3300594687461853, + "learning_rate": 1.8379928555853198e-05, + "loss": 0.0857, + "step": 35140 + }, + { + "epoch": 0.626779153140941, + "grad_norm": 0.2779475450515747, + "learning_rate": 1.8378427624418496e-05, + "loss": 0.1255, + "step": 35141 + }, + { + "epoch": 0.6267969892626547, + "grad_norm": 0.22482886910438538, + "learning_rate": 1.8376926718651282e-05, + "loss": 0.1368, + "step": 35142 + }, + { + "epoch": 0.6268148253843684, + "grad_norm": 0.25578710436820984, + "learning_rate": 1.83754258385574e-05, + "loss": 0.1461, + "step": 35143 + }, + { + "epoch": 0.6268326615060821, + "grad_norm": 0.2601737678050995, + "learning_rate": 1.8373924984142654e-05, + "loss": 0.082, + "step": 35144 + }, + { + "epoch": 0.6268504976277958, + "grad_norm": 0.2680853605270386, + "learning_rate": 1.8372424155412866e-05, + "loss": 0.1383, + "step": 35145 + }, + { + "epoch": 0.6268683337495095, + "grad_norm": 0.38748306035995483, + "learning_rate": 1.837092335237384e-05, + "loss": 0.1416, + "step": 35146 + }, + { + "epoch": 0.6268861698712233, + "grad_norm": 0.2574920356273651, + "learning_rate": 1.8369422575031414e-05, + "loss": 0.1677, + "step": 35147 + }, + { + "epoch": 0.6269040059929369, + "grad_norm": 0.235775426030159, + "learning_rate": 1.83679218233914e-05, + "loss": 0.1069, + "step": 35148 + }, + { + "epoch": 0.6269218421146506, + "grad_norm": 0.2731079161167145, + "learning_rate": 1.8366421097459602e-05, + "loss": 0.1503, + "step": 35149 + }, + { + "epoch": 0.6269396782363643, + "grad_norm": 0.2916219234466553, + "learning_rate": 1.8364920397241856e-05, + "loss": 0.157, + "step": 35150 + }, + { + "epoch": 0.626957514358078, + "grad_norm": 0.3127744197845459, + "learning_rate": 1.8363419722743957e-05, + "loss": 0.2193, + "step": 35151 + }, + { + "epoch": 0.6269753504797917, + "grad_norm": 0.24434545636177063, + "learning_rate": 1.8361919073971746e-05, + "loss": 0.0999, + "step": 35152 + }, + { + "epoch": 0.6269931866015054, + "grad_norm": 0.22819340229034424, + "learning_rate": 1.8360418450931034e-05, + "loss": 0.1478, + "step": 35153 + }, + { + "epoch": 0.6270110227232191, + "grad_norm": 0.297372967004776, + "learning_rate": 1.835891785362763e-05, + "loss": 0.0978, + "step": 35154 + }, + { + "epoch": 0.6270288588449328, + "grad_norm": 0.2813517451286316, + "learning_rate": 1.835741728206734e-05, + "loss": 0.0918, + "step": 35155 + }, + { + "epoch": 0.6270466949666464, + "grad_norm": 0.2257905900478363, + "learning_rate": 1.8355916736256012e-05, + "loss": 0.0963, + "step": 35156 + }, + { + "epoch": 0.6270645310883601, + "grad_norm": 1.021179437637329, + "learning_rate": 1.8354416216199436e-05, + "loss": 0.1598, + "step": 35157 + }, + { + "epoch": 0.6270823672100738, + "grad_norm": 0.20912602543830872, + "learning_rate": 1.8352915721903443e-05, + "loss": 0.1447, + "step": 35158 + }, + { + "epoch": 0.6271002033317875, + "grad_norm": 0.2968735992908478, + "learning_rate": 1.835141525337384e-05, + "loss": 0.1269, + "step": 35159 + }, + { + "epoch": 0.6271180394535012, + "grad_norm": 0.29712948203086853, + "learning_rate": 1.834991481061645e-05, + "loss": 0.108, + "step": 35160 + }, + { + "epoch": 0.6271358755752149, + "grad_norm": 0.2570733428001404, + "learning_rate": 1.8348414393637092e-05, + "loss": 0.1159, + "step": 35161 + }, + { + "epoch": 0.6271537116969286, + "grad_norm": 0.15819190442562103, + "learning_rate": 1.8346914002441573e-05, + "loss": 0.0953, + "step": 35162 + }, + { + "epoch": 0.6271715478186423, + "grad_norm": 0.24046814441680908, + "learning_rate": 1.8345413637035713e-05, + "loss": 0.153, + "step": 35163 + }, + { + "epoch": 0.6271893839403561, + "grad_norm": 0.24306350946426392, + "learning_rate": 1.834391329742532e-05, + "loss": 0.1311, + "step": 35164 + }, + { + "epoch": 0.6272072200620697, + "grad_norm": 0.2626388370990753, + "learning_rate": 1.834241298361623e-05, + "loss": 0.1239, + "step": 35165 + }, + { + "epoch": 0.6272250561837834, + "grad_norm": 0.24841630458831787, + "learning_rate": 1.8340912695614246e-05, + "loss": 0.1426, + "step": 35166 + }, + { + "epoch": 0.6272428923054971, + "grad_norm": 0.27767080068588257, + "learning_rate": 1.8339412433425186e-05, + "loss": 0.1054, + "step": 35167 + }, + { + "epoch": 0.6272607284272108, + "grad_norm": 0.37322691082954407, + "learning_rate": 1.833791219705485e-05, + "loss": 0.1271, + "step": 35168 + }, + { + "epoch": 0.6272785645489245, + "grad_norm": 0.27964141964912415, + "learning_rate": 1.8336411986509078e-05, + "loss": 0.1114, + "step": 35169 + }, + { + "epoch": 0.6272964006706382, + "grad_norm": 0.2561666667461395, + "learning_rate": 1.8334911801793673e-05, + "loss": 0.1249, + "step": 35170 + }, + { + "epoch": 0.6273142367923519, + "grad_norm": 0.2643705904483795, + "learning_rate": 1.8333411642914456e-05, + "loss": 0.082, + "step": 35171 + }, + { + "epoch": 0.6273320729140656, + "grad_norm": 0.29310816526412964, + "learning_rate": 1.8331911509877225e-05, + "loss": 0.1416, + "step": 35172 + }, + { + "epoch": 0.6273499090357793, + "grad_norm": 0.2245894819498062, + "learning_rate": 1.8330411402687818e-05, + "loss": 0.1276, + "step": 35173 + }, + { + "epoch": 0.6273677451574929, + "grad_norm": 0.32910066843032837, + "learning_rate": 1.8328911321352042e-05, + "loss": 0.1636, + "step": 35174 + }, + { + "epoch": 0.6273855812792066, + "grad_norm": 0.2751857042312622, + "learning_rate": 1.8327411265875714e-05, + "loss": 0.1428, + "step": 35175 + }, + { + "epoch": 0.6274034174009203, + "grad_norm": 0.2538220286369324, + "learning_rate": 1.832591123626464e-05, + "loss": 0.1035, + "step": 35176 + }, + { + "epoch": 0.627421253522634, + "grad_norm": 0.25720953941345215, + "learning_rate": 1.832441123252463e-05, + "loss": 0.1662, + "step": 35177 + }, + { + "epoch": 0.6274390896443477, + "grad_norm": 0.2071014940738678, + "learning_rate": 1.8322911254661513e-05, + "loss": 0.1321, + "step": 35178 + }, + { + "epoch": 0.6274569257660614, + "grad_norm": 0.33361488580703735, + "learning_rate": 1.8321411302681102e-05, + "loss": 0.1273, + "step": 35179 + }, + { + "epoch": 0.6274747618877751, + "grad_norm": 0.3598553240299225, + "learning_rate": 1.831991137658921e-05, + "loss": 0.142, + "step": 35180 + }, + { + "epoch": 0.6274925980094889, + "grad_norm": 0.22990982234477997, + "learning_rate": 1.8318411476391635e-05, + "loss": 0.1074, + "step": 35181 + }, + { + "epoch": 0.6275104341312026, + "grad_norm": 0.257588267326355, + "learning_rate": 1.8316911602094218e-05, + "loss": 0.1136, + "step": 35182 + }, + { + "epoch": 0.6275282702529162, + "grad_norm": 0.28625985980033875, + "learning_rate": 1.831541175370276e-05, + "loss": 0.1325, + "step": 35183 + }, + { + "epoch": 0.6275461063746299, + "grad_norm": 0.3204520642757416, + "learning_rate": 1.8313911931223066e-05, + "loss": 0.1374, + "step": 35184 + }, + { + "epoch": 0.6275639424963436, + "grad_norm": 0.29593488574028015, + "learning_rate": 1.831241213466096e-05, + "loss": 0.1299, + "step": 35185 + }, + { + "epoch": 0.6275817786180573, + "grad_norm": 0.28135180473327637, + "learning_rate": 1.8310912364022256e-05, + "loss": 0.1263, + "step": 35186 + }, + { + "epoch": 0.627599614739771, + "grad_norm": 0.22319857776165009, + "learning_rate": 1.8309412619312772e-05, + "loss": 0.0832, + "step": 35187 + }, + { + "epoch": 0.6276174508614847, + "grad_norm": 0.2607048749923706, + "learning_rate": 1.8307912900538315e-05, + "loss": 0.1273, + "step": 35188 + }, + { + "epoch": 0.6276352869831984, + "grad_norm": 0.1879798322916031, + "learning_rate": 1.8306413207704697e-05, + "loss": 0.1391, + "step": 35189 + }, + { + "epoch": 0.6276531231049121, + "grad_norm": 0.2310120314359665, + "learning_rate": 1.8304913540817727e-05, + "loss": 0.1136, + "step": 35190 + }, + { + "epoch": 0.6276709592266257, + "grad_norm": 0.274537056684494, + "learning_rate": 1.8303413899883223e-05, + "loss": 0.1247, + "step": 35191 + }, + { + "epoch": 0.6276887953483394, + "grad_norm": 0.2561551630496979, + "learning_rate": 1.830191428490701e-05, + "loss": 0.1562, + "step": 35192 + }, + { + "epoch": 0.6277066314700531, + "grad_norm": 0.23715558648109436, + "learning_rate": 1.830041469589489e-05, + "loss": 0.1284, + "step": 35193 + }, + { + "epoch": 0.6277244675917668, + "grad_norm": 0.32786503434181213, + "learning_rate": 1.8298915132852662e-05, + "loss": 0.215, + "step": 35194 + }, + { + "epoch": 0.6277423037134805, + "grad_norm": 0.28258782625198364, + "learning_rate": 1.8297415595786173e-05, + "loss": 0.1339, + "step": 35195 + }, + { + "epoch": 0.6277601398351942, + "grad_norm": 0.1871049851179123, + "learning_rate": 1.829591608470121e-05, + "loss": 0.0548, + "step": 35196 + }, + { + "epoch": 0.627777975956908, + "grad_norm": 0.19045434892177582, + "learning_rate": 1.8294416599603584e-05, + "loss": 0.0911, + "step": 35197 + }, + { + "epoch": 0.6277958120786217, + "grad_norm": 0.40389135479927063, + "learning_rate": 1.829291714049912e-05, + "loss": 0.1439, + "step": 35198 + }, + { + "epoch": 0.6278136482003354, + "grad_norm": 0.37807926535606384, + "learning_rate": 1.8291417707393622e-05, + "loss": 0.1596, + "step": 35199 + }, + { + "epoch": 0.627831484322049, + "grad_norm": 0.4714040756225586, + "learning_rate": 1.8289918300292914e-05, + "loss": 0.1336, + "step": 35200 + }, + { + "epoch": 0.6278493204437627, + "grad_norm": 0.32570865750312805, + "learning_rate": 1.82884189192028e-05, + "loss": 0.1734, + "step": 35201 + }, + { + "epoch": 0.6278671565654764, + "grad_norm": 0.28308457136154175, + "learning_rate": 1.828691956412909e-05, + "loss": 0.0957, + "step": 35202 + }, + { + "epoch": 0.6278849926871901, + "grad_norm": 0.27648767828941345, + "learning_rate": 1.828542023507759e-05, + "loss": 0.2208, + "step": 35203 + }, + { + "epoch": 0.6279028288089038, + "grad_norm": 0.2610180974006653, + "learning_rate": 1.8283920932054134e-05, + "loss": 0.1137, + "step": 35204 + }, + { + "epoch": 0.6279206649306175, + "grad_norm": 0.20775818824768066, + "learning_rate": 1.828242165506451e-05, + "loss": 0.1585, + "step": 35205 + }, + { + "epoch": 0.6279385010523312, + "grad_norm": 0.20920971035957336, + "learning_rate": 1.8280922404114547e-05, + "loss": 0.1657, + "step": 35206 + }, + { + "epoch": 0.6279563371740449, + "grad_norm": 0.23494596779346466, + "learning_rate": 1.8279423179210036e-05, + "loss": 0.1188, + "step": 35207 + }, + { + "epoch": 0.6279741732957586, + "grad_norm": 0.3598296642303467, + "learning_rate": 1.8277923980356817e-05, + "loss": 0.1417, + "step": 35208 + }, + { + "epoch": 0.6279920094174722, + "grad_norm": 0.25582340359687805, + "learning_rate": 1.8276424807560686e-05, + "loss": 0.1313, + "step": 35209 + }, + { + "epoch": 0.6280098455391859, + "grad_norm": 0.27216625213623047, + "learning_rate": 1.8274925660827453e-05, + "loss": 0.1568, + "step": 35210 + }, + { + "epoch": 0.6280276816608996, + "grad_norm": 0.2343744933605194, + "learning_rate": 1.827342654016292e-05, + "loss": 0.1007, + "step": 35211 + }, + { + "epoch": 0.6280455177826133, + "grad_norm": 0.2554820775985718, + "learning_rate": 1.827192744557292e-05, + "loss": 0.1278, + "step": 35212 + }, + { + "epoch": 0.628063353904327, + "grad_norm": 0.24556128680706024, + "learning_rate": 1.827042837706325e-05, + "loss": 0.1454, + "step": 35213 + }, + { + "epoch": 0.6280811900260408, + "grad_norm": 0.2335948795080185, + "learning_rate": 1.8268929334639722e-05, + "loss": 0.152, + "step": 35214 + }, + { + "epoch": 0.6280990261477545, + "grad_norm": 0.20663274824619293, + "learning_rate": 1.8267430318308157e-05, + "loss": 0.115, + "step": 35215 + }, + { + "epoch": 0.6281168622694682, + "grad_norm": 0.2269904911518097, + "learning_rate": 1.826593132807434e-05, + "loss": 0.1364, + "step": 35216 + }, + { + "epoch": 0.6281346983911819, + "grad_norm": 0.4112590551376343, + "learning_rate": 1.8264432363944116e-05, + "loss": 0.1458, + "step": 35217 + }, + { + "epoch": 0.6281525345128955, + "grad_norm": 0.2837974429130554, + "learning_rate": 1.8262933425923275e-05, + "loss": 0.1379, + "step": 35218 + }, + { + "epoch": 0.6281703706346092, + "grad_norm": 0.2636870741844177, + "learning_rate": 1.8261434514017628e-05, + "loss": 0.0979, + "step": 35219 + }, + { + "epoch": 0.6281882067563229, + "grad_norm": 0.2792653441429138, + "learning_rate": 1.8259935628232984e-05, + "loss": 0.1953, + "step": 35220 + }, + { + "epoch": 0.6282060428780366, + "grad_norm": 0.21954795718193054, + "learning_rate": 1.825843676857516e-05, + "loss": 0.117, + "step": 35221 + }, + { + "epoch": 0.6282238789997503, + "grad_norm": 0.21932348608970642, + "learning_rate": 1.8256937935049973e-05, + "loss": 0.0848, + "step": 35222 + }, + { + "epoch": 0.628241715121464, + "grad_norm": 0.2366136759519577, + "learning_rate": 1.8255439127663218e-05, + "loss": 0.0794, + "step": 35223 + }, + { + "epoch": 0.6282595512431777, + "grad_norm": 0.23892684280872345, + "learning_rate": 1.82539403464207e-05, + "loss": 0.1127, + "step": 35224 + }, + { + "epoch": 0.6282773873648914, + "grad_norm": 0.3476484417915344, + "learning_rate": 1.825244159132825e-05, + "loss": 0.1588, + "step": 35225 + }, + { + "epoch": 0.628295223486605, + "grad_norm": 0.34581294655799866, + "learning_rate": 1.8250942862391667e-05, + "loss": 0.1292, + "step": 35226 + }, + { + "epoch": 0.6283130596083187, + "grad_norm": 0.2861466407775879, + "learning_rate": 1.824944415961676e-05, + "loss": 0.1482, + "step": 35227 + }, + { + "epoch": 0.6283308957300324, + "grad_norm": 0.21293097734451294, + "learning_rate": 1.8247945483009342e-05, + "loss": 0.1391, + "step": 35228 + }, + { + "epoch": 0.6283487318517461, + "grad_norm": 0.19983923435211182, + "learning_rate": 1.8246446832575207e-05, + "loss": 0.1055, + "step": 35229 + }, + { + "epoch": 0.6283665679734598, + "grad_norm": 0.32493898272514343, + "learning_rate": 1.824494820832019e-05, + "loss": 0.1763, + "step": 35230 + }, + { + "epoch": 0.6283844040951736, + "grad_norm": 0.30365660786628723, + "learning_rate": 1.8243449610250084e-05, + "loss": 0.1586, + "step": 35231 + }, + { + "epoch": 0.6284022402168873, + "grad_norm": 0.22662179172039032, + "learning_rate": 1.8241951038370696e-05, + "loss": 0.1024, + "step": 35232 + }, + { + "epoch": 0.628420076338601, + "grad_norm": 0.2568208873271942, + "learning_rate": 1.824045249268784e-05, + "loss": 0.1387, + "step": 35233 + }, + { + "epoch": 0.6284379124603147, + "grad_norm": 0.2992926239967346, + "learning_rate": 1.8238953973207325e-05, + "loss": 0.1134, + "step": 35234 + }, + { + "epoch": 0.6284557485820284, + "grad_norm": 0.23189367353916168, + "learning_rate": 1.8237455479934963e-05, + "loss": 0.131, + "step": 35235 + }, + { + "epoch": 0.628473584703742, + "grad_norm": 0.16864114999771118, + "learning_rate": 1.8235957012876563e-05, + "loss": 0.0858, + "step": 35236 + }, + { + "epoch": 0.6284914208254557, + "grad_norm": 0.28745537996292114, + "learning_rate": 1.8234458572037915e-05, + "loss": 0.136, + "step": 35237 + }, + { + "epoch": 0.6285092569471694, + "grad_norm": 0.3122803866863251, + "learning_rate": 1.8232960157424855e-05, + "loss": 0.1512, + "step": 35238 + }, + { + "epoch": 0.6285270930688831, + "grad_norm": 0.28759995102882385, + "learning_rate": 1.8231461769043178e-05, + "loss": 0.1615, + "step": 35239 + }, + { + "epoch": 0.6285449291905968, + "grad_norm": 0.2791527211666107, + "learning_rate": 1.8229963406898692e-05, + "loss": 0.1623, + "step": 35240 + }, + { + "epoch": 0.6285627653123105, + "grad_norm": 0.288387268781662, + "learning_rate": 1.8228465070997208e-05, + "loss": 0.1648, + "step": 35241 + }, + { + "epoch": 0.6285806014340242, + "grad_norm": 0.3625470995903015, + "learning_rate": 1.8226966761344523e-05, + "loss": 0.1474, + "step": 35242 + }, + { + "epoch": 0.6285984375557379, + "grad_norm": 0.2694432735443115, + "learning_rate": 1.822546847794646e-05, + "loss": 0.1595, + "step": 35243 + }, + { + "epoch": 0.6286162736774515, + "grad_norm": 0.24849697947502136, + "learning_rate": 1.822397022080883e-05, + "loss": 0.1179, + "step": 35244 + }, + { + "epoch": 0.6286341097991652, + "grad_norm": 0.3011578619480133, + "learning_rate": 1.8222471989937422e-05, + "loss": 0.0834, + "step": 35245 + }, + { + "epoch": 0.6286519459208789, + "grad_norm": 0.24738922715187073, + "learning_rate": 1.8220973785338046e-05, + "loss": 0.1341, + "step": 35246 + }, + { + "epoch": 0.6286697820425926, + "grad_norm": 0.28066784143447876, + "learning_rate": 1.8219475607016525e-05, + "loss": 0.1012, + "step": 35247 + }, + { + "epoch": 0.6286876181643064, + "grad_norm": 0.2238895446062088, + "learning_rate": 1.8217977454978663e-05, + "loss": 0.0762, + "step": 35248 + }, + { + "epoch": 0.6287054542860201, + "grad_norm": 0.2647109925746918, + "learning_rate": 1.821647932923026e-05, + "loss": 0.1374, + "step": 35249 + }, + { + "epoch": 0.6287232904077338, + "grad_norm": 0.2317148596048355, + "learning_rate": 1.8214981229777117e-05, + "loss": 0.0958, + "step": 35250 + }, + { + "epoch": 0.6287411265294475, + "grad_norm": 0.3785940706729889, + "learning_rate": 1.8213483156625063e-05, + "loss": 0.1834, + "step": 35251 + }, + { + "epoch": 0.6287589626511612, + "grad_norm": 0.2935950458049774, + "learning_rate": 1.8211985109779888e-05, + "loss": 0.1028, + "step": 35252 + }, + { + "epoch": 0.6287767987728748, + "grad_norm": 0.22883553802967072, + "learning_rate": 1.8210487089247402e-05, + "loss": 0.111, + "step": 35253 + }, + { + "epoch": 0.6287946348945885, + "grad_norm": 0.2786867022514343, + "learning_rate": 1.820898909503342e-05, + "loss": 0.1429, + "step": 35254 + }, + { + "epoch": 0.6288124710163022, + "grad_norm": 0.28457269072532654, + "learning_rate": 1.8207491127143728e-05, + "loss": 0.1401, + "step": 35255 + }, + { + "epoch": 0.6288303071380159, + "grad_norm": 0.28889283537864685, + "learning_rate": 1.8205993185584155e-05, + "loss": 0.133, + "step": 35256 + }, + { + "epoch": 0.6288481432597296, + "grad_norm": 0.2618843615055084, + "learning_rate": 1.82044952703605e-05, + "loss": 0.1173, + "step": 35257 + }, + { + "epoch": 0.6288659793814433, + "grad_norm": 0.27908554673194885, + "learning_rate": 1.820299738147857e-05, + "loss": 0.1123, + "step": 35258 + }, + { + "epoch": 0.628883815503157, + "grad_norm": 0.26213741302490234, + "learning_rate": 1.820149951894416e-05, + "loss": 0.1414, + "step": 35259 + }, + { + "epoch": 0.6289016516248707, + "grad_norm": 0.44762203097343445, + "learning_rate": 1.8200001682763096e-05, + "loss": 0.1221, + "step": 35260 + }, + { + "epoch": 0.6289194877465843, + "grad_norm": 0.21335327625274658, + "learning_rate": 1.8198503872941168e-05, + "loss": 0.0894, + "step": 35261 + }, + { + "epoch": 0.628937323868298, + "grad_norm": 0.2803504765033722, + "learning_rate": 1.819700608948419e-05, + "loss": 0.165, + "step": 35262 + }, + { + "epoch": 0.6289551599900117, + "grad_norm": 0.4563461244106293, + "learning_rate": 1.8195508332397962e-05, + "loss": 0.1187, + "step": 35263 + }, + { + "epoch": 0.6289729961117254, + "grad_norm": 0.3154296875, + "learning_rate": 1.8194010601688302e-05, + "loss": 0.1545, + "step": 35264 + }, + { + "epoch": 0.6289908322334392, + "grad_norm": 0.2549230754375458, + "learning_rate": 1.8192512897361008e-05, + "loss": 0.1886, + "step": 35265 + }, + { + "epoch": 0.6290086683551529, + "grad_norm": 0.4323887228965759, + "learning_rate": 1.8191015219421883e-05, + "loss": 0.1681, + "step": 35266 + }, + { + "epoch": 0.6290265044768666, + "grad_norm": 0.23212343454360962, + "learning_rate": 1.8189517567876728e-05, + "loss": 0.1064, + "step": 35267 + }, + { + "epoch": 0.6290443405985803, + "grad_norm": 0.23868195712566376, + "learning_rate": 1.8188019942731354e-05, + "loss": 0.1134, + "step": 35268 + }, + { + "epoch": 0.629062176720294, + "grad_norm": 0.2580852806568146, + "learning_rate": 1.818652234399158e-05, + "loss": 0.1795, + "step": 35269 + }, + { + "epoch": 0.6290800128420077, + "grad_norm": 0.24365024268627167, + "learning_rate": 1.8185024771663195e-05, + "loss": 0.1416, + "step": 35270 + }, + { + "epoch": 0.6290978489637213, + "grad_norm": 0.27210095524787903, + "learning_rate": 1.8183527225752007e-05, + "loss": 0.0799, + "step": 35271 + }, + { + "epoch": 0.629115685085435, + "grad_norm": 0.2471434324979782, + "learning_rate": 1.8182029706263816e-05, + "loss": 0.0827, + "step": 35272 + }, + { + "epoch": 0.6291335212071487, + "grad_norm": 0.24963009357452393, + "learning_rate": 1.8180532213204438e-05, + "loss": 0.1304, + "step": 35273 + }, + { + "epoch": 0.6291513573288624, + "grad_norm": 0.4840315580368042, + "learning_rate": 1.8179034746579667e-05, + "loss": 0.1214, + "step": 35274 + }, + { + "epoch": 0.6291691934505761, + "grad_norm": 0.3027991056442261, + "learning_rate": 1.8177537306395322e-05, + "loss": 0.0933, + "step": 35275 + }, + { + "epoch": 0.6291870295722898, + "grad_norm": 0.2912178635597229, + "learning_rate": 1.8176039892657188e-05, + "loss": 0.1435, + "step": 35276 + }, + { + "epoch": 0.6292048656940035, + "grad_norm": 0.2589821517467499, + "learning_rate": 1.817454250537109e-05, + "loss": 0.1004, + "step": 35277 + }, + { + "epoch": 0.6292227018157172, + "grad_norm": 0.18125738203525543, + "learning_rate": 1.817304514454282e-05, + "loss": 0.1352, + "step": 35278 + }, + { + "epoch": 0.6292405379374308, + "grad_norm": 0.2323274314403534, + "learning_rate": 1.8171547810178187e-05, + "loss": 0.1082, + "step": 35279 + }, + { + "epoch": 0.6292583740591445, + "grad_norm": 0.44053277373313904, + "learning_rate": 1.8170050502282983e-05, + "loss": 0.1753, + "step": 35280 + }, + { + "epoch": 0.6292762101808582, + "grad_norm": 0.24591851234436035, + "learning_rate": 1.8168553220863034e-05, + "loss": 0.1682, + "step": 35281 + }, + { + "epoch": 0.629294046302572, + "grad_norm": 0.32596495747566223, + "learning_rate": 1.8167055965924123e-05, + "loss": 0.1467, + "step": 35282 + }, + { + "epoch": 0.6293118824242857, + "grad_norm": 0.2672705352306366, + "learning_rate": 1.816555873747207e-05, + "loss": 0.1472, + "step": 35283 + }, + { + "epoch": 0.6293297185459994, + "grad_norm": 0.19331452250480652, + "learning_rate": 1.816406153551268e-05, + "loss": 0.0885, + "step": 35284 + }, + { + "epoch": 0.6293475546677131, + "grad_norm": 0.2907578647136688, + "learning_rate": 1.8162564360051726e-05, + "loss": 0.1332, + "step": 35285 + }, + { + "epoch": 0.6293653907894268, + "grad_norm": 0.42572343349456787, + "learning_rate": 1.8161067211095052e-05, + "loss": 0.114, + "step": 35286 + }, + { + "epoch": 0.6293832269111405, + "grad_norm": 0.26793837547302246, + "learning_rate": 1.8159570088648438e-05, + "loss": 0.1455, + "step": 35287 + }, + { + "epoch": 0.6294010630328541, + "grad_norm": 0.2772933542728424, + "learning_rate": 1.8158072992717693e-05, + "loss": 0.1524, + "step": 35288 + }, + { + "epoch": 0.6294188991545678, + "grad_norm": 0.27731627225875854, + "learning_rate": 1.8156575923308616e-05, + "loss": 0.1234, + "step": 35289 + }, + { + "epoch": 0.6294367352762815, + "grad_norm": 0.27789533138275146, + "learning_rate": 1.8155078880427016e-05, + "loss": 0.1593, + "step": 35290 + }, + { + "epoch": 0.6294545713979952, + "grad_norm": 0.28437814116477966, + "learning_rate": 1.81535818640787e-05, + "loss": 0.1187, + "step": 35291 + }, + { + "epoch": 0.6294724075197089, + "grad_norm": 0.3501490652561188, + "learning_rate": 1.815208487426947e-05, + "loss": 0.1174, + "step": 35292 + }, + { + "epoch": 0.6294902436414226, + "grad_norm": 0.26043999195098877, + "learning_rate": 1.8150587911005107e-05, + "loss": 0.1128, + "step": 35293 + }, + { + "epoch": 0.6295080797631363, + "grad_norm": 0.2151462882757187, + "learning_rate": 1.8149090974291443e-05, + "loss": 0.1097, + "step": 35294 + }, + { + "epoch": 0.62952591588485, + "grad_norm": 0.2857922911643982, + "learning_rate": 1.814759406413427e-05, + "loss": 0.1687, + "step": 35295 + }, + { + "epoch": 0.6295437520065637, + "grad_norm": 0.2592170834541321, + "learning_rate": 1.8146097180539385e-05, + "loss": 0.1326, + "step": 35296 + }, + { + "epoch": 0.6295615881282773, + "grad_norm": 0.24273072183132172, + "learning_rate": 1.8144600323512595e-05, + "loss": 0.1376, + "step": 35297 + }, + { + "epoch": 0.629579424249991, + "grad_norm": 0.2877618670463562, + "learning_rate": 1.8143103493059692e-05, + "loss": 0.1389, + "step": 35298 + }, + { + "epoch": 0.6295972603717048, + "grad_norm": 0.3057037591934204, + "learning_rate": 1.8141606689186503e-05, + "loss": 0.2019, + "step": 35299 + }, + { + "epoch": 0.6296150964934185, + "grad_norm": 0.2851075828075409, + "learning_rate": 1.8140109911898816e-05, + "loss": 0.1354, + "step": 35300 + }, + { + "epoch": 0.6296329326151322, + "grad_norm": 0.29026395082473755, + "learning_rate": 1.8138613161202423e-05, + "loss": 0.1338, + "step": 35301 + }, + { + "epoch": 0.6296507687368459, + "grad_norm": 0.3381284177303314, + "learning_rate": 1.8137116437103136e-05, + "loss": 0.1433, + "step": 35302 + }, + { + "epoch": 0.6296686048585596, + "grad_norm": 0.27081307768821716, + "learning_rate": 1.813561973960676e-05, + "loss": 0.1556, + "step": 35303 + }, + { + "epoch": 0.6296864409802733, + "grad_norm": 0.2977769672870636, + "learning_rate": 1.813412306871909e-05, + "loss": 0.1449, + "step": 35304 + }, + { + "epoch": 0.629704277101987, + "grad_norm": 0.33787962794303894, + "learning_rate": 1.8132626424445937e-05, + "loss": 0.1488, + "step": 35305 + }, + { + "epoch": 0.6297221132237006, + "grad_norm": 0.24437111616134644, + "learning_rate": 1.8131129806793084e-05, + "loss": 0.0835, + "step": 35306 + }, + { + "epoch": 0.6297399493454143, + "grad_norm": 0.34457412362098694, + "learning_rate": 1.8129633215766353e-05, + "loss": 0.1573, + "step": 35307 + }, + { + "epoch": 0.629757785467128, + "grad_norm": 0.34267064929008484, + "learning_rate": 1.8128136651371537e-05, + "loss": 0.1436, + "step": 35308 + }, + { + "epoch": 0.6297756215888417, + "grad_norm": 0.30829447507858276, + "learning_rate": 1.8126640113614436e-05, + "loss": 0.1282, + "step": 35309 + }, + { + "epoch": 0.6297934577105554, + "grad_norm": 0.24545936286449432, + "learning_rate": 1.8125143602500852e-05, + "loss": 0.12, + "step": 35310 + }, + { + "epoch": 0.6298112938322691, + "grad_norm": 0.22269515693187714, + "learning_rate": 1.8123647118036578e-05, + "loss": 0.1013, + "step": 35311 + }, + { + "epoch": 0.6298291299539828, + "grad_norm": 0.2587223947048187, + "learning_rate": 1.8122150660227434e-05, + "loss": 0.1099, + "step": 35312 + }, + { + "epoch": 0.6298469660756965, + "grad_norm": 0.22916986048221588, + "learning_rate": 1.8120654229079205e-05, + "loss": 0.1339, + "step": 35313 + }, + { + "epoch": 0.6298648021974101, + "grad_norm": 0.2595296800136566, + "learning_rate": 1.8119157824597697e-05, + "loss": 0.1546, + "step": 35314 + }, + { + "epoch": 0.6298826383191239, + "grad_norm": 0.25523489713668823, + "learning_rate": 1.81176614467887e-05, + "loss": 0.1295, + "step": 35315 + }, + { + "epoch": 0.6299004744408376, + "grad_norm": 0.2520177960395813, + "learning_rate": 1.8116165095658038e-05, + "loss": 0.1121, + "step": 35316 + }, + { + "epoch": 0.6299183105625513, + "grad_norm": 0.27788999676704407, + "learning_rate": 1.811466877121149e-05, + "loss": 0.1529, + "step": 35317 + }, + { + "epoch": 0.629936146684265, + "grad_norm": 0.2944776713848114, + "learning_rate": 1.811317247345487e-05, + "loss": 0.1352, + "step": 35318 + }, + { + "epoch": 0.6299539828059787, + "grad_norm": 0.2595115005970001, + "learning_rate": 1.811167620239396e-05, + "loss": 0.1131, + "step": 35319 + }, + { + "epoch": 0.6299718189276924, + "grad_norm": 0.32978975772857666, + "learning_rate": 1.8110179958034586e-05, + "loss": 0.1247, + "step": 35320 + }, + { + "epoch": 0.6299896550494061, + "grad_norm": 0.21254689991474152, + "learning_rate": 1.8108683740382536e-05, + "loss": 0.1234, + "step": 35321 + }, + { + "epoch": 0.6300074911711198, + "grad_norm": 0.27821600437164307, + "learning_rate": 1.81071875494436e-05, + "loss": 0.0912, + "step": 35322 + }, + { + "epoch": 0.6300253272928334, + "grad_norm": 0.32335394620895386, + "learning_rate": 1.810569138522359e-05, + "loss": 0.1322, + "step": 35323 + }, + { + "epoch": 0.6300431634145471, + "grad_norm": 0.26081162691116333, + "learning_rate": 1.810419524772829e-05, + "loss": 0.1196, + "step": 35324 + }, + { + "epoch": 0.6300609995362608, + "grad_norm": 0.255611389875412, + "learning_rate": 1.810269913696352e-05, + "loss": 0.0957, + "step": 35325 + }, + { + "epoch": 0.6300788356579745, + "grad_norm": 0.32435518503189087, + "learning_rate": 1.8101203052935074e-05, + "loss": 0.1254, + "step": 35326 + }, + { + "epoch": 0.6300966717796882, + "grad_norm": 0.3529960513114929, + "learning_rate": 1.8099706995648746e-05, + "loss": 0.1348, + "step": 35327 + }, + { + "epoch": 0.6301145079014019, + "grad_norm": 0.3728181719779968, + "learning_rate": 1.809821096511033e-05, + "loss": 0.1127, + "step": 35328 + }, + { + "epoch": 0.6301323440231156, + "grad_norm": 0.2500733733177185, + "learning_rate": 1.8096714961325638e-05, + "loss": 0.1084, + "step": 35329 + }, + { + "epoch": 0.6301501801448293, + "grad_norm": 0.27013060450553894, + "learning_rate": 1.809521898430046e-05, + "loss": 0.1493, + "step": 35330 + }, + { + "epoch": 0.630168016266543, + "grad_norm": 0.44017860293388367, + "learning_rate": 1.8093723034040603e-05, + "loss": 0.1217, + "step": 35331 + }, + { + "epoch": 0.6301858523882568, + "grad_norm": 0.2977251708507538, + "learning_rate": 1.809222711055185e-05, + "loss": 0.1278, + "step": 35332 + }, + { + "epoch": 0.6302036885099704, + "grad_norm": 0.3162805140018463, + "learning_rate": 1.809073121384002e-05, + "loss": 0.1953, + "step": 35333 + }, + { + "epoch": 0.6302215246316841, + "grad_norm": 0.24452361464500427, + "learning_rate": 1.8089235343910903e-05, + "loss": 0.1103, + "step": 35334 + }, + { + "epoch": 0.6302393607533978, + "grad_norm": 0.24200385808944702, + "learning_rate": 1.80877395007703e-05, + "loss": 0.137, + "step": 35335 + }, + { + "epoch": 0.6302571968751115, + "grad_norm": 0.24446938931941986, + "learning_rate": 1.8086243684423998e-05, + "loss": 0.1028, + "step": 35336 + }, + { + "epoch": 0.6302750329968252, + "grad_norm": 0.27976661920547485, + "learning_rate": 1.80847478948778e-05, + "loss": 0.1646, + "step": 35337 + }, + { + "epoch": 0.6302928691185389, + "grad_norm": 0.4105447828769684, + "learning_rate": 1.808325213213751e-05, + "loss": 0.1822, + "step": 35338 + }, + { + "epoch": 0.6303107052402526, + "grad_norm": 0.29472458362579346, + "learning_rate": 1.808175639620893e-05, + "loss": 0.1054, + "step": 35339 + }, + { + "epoch": 0.6303285413619663, + "grad_norm": 0.25440189242362976, + "learning_rate": 1.8080260687097854e-05, + "loss": 0.1508, + "step": 35340 + }, + { + "epoch": 0.6303463774836799, + "grad_norm": 0.22787262499332428, + "learning_rate": 1.8078765004810068e-05, + "loss": 0.1392, + "step": 35341 + }, + { + "epoch": 0.6303642136053936, + "grad_norm": 0.29044678807258606, + "learning_rate": 1.8077269349351383e-05, + "loss": 0.1189, + "step": 35342 + }, + { + "epoch": 0.6303820497271073, + "grad_norm": 0.25068604946136475, + "learning_rate": 1.80757737207276e-05, + "loss": 0.1745, + "step": 35343 + }, + { + "epoch": 0.630399885848821, + "grad_norm": 0.3342738747596741, + "learning_rate": 1.8074278118944497e-05, + "loss": 0.1493, + "step": 35344 + }, + { + "epoch": 0.6304177219705347, + "grad_norm": 0.26094210147857666, + "learning_rate": 1.8072782544007884e-05, + "loss": 0.1191, + "step": 35345 + }, + { + "epoch": 0.6304355580922484, + "grad_norm": 0.2362978756427765, + "learning_rate": 1.807128699592357e-05, + "loss": 0.1196, + "step": 35346 + }, + { + "epoch": 0.6304533942139621, + "grad_norm": 0.2806144058704376, + "learning_rate": 1.8069791474697338e-05, + "loss": 0.1461, + "step": 35347 + }, + { + "epoch": 0.6304712303356758, + "grad_norm": 0.3263757526874542, + "learning_rate": 1.806829598033499e-05, + "loss": 0.1657, + "step": 35348 + }, + { + "epoch": 0.6304890664573896, + "grad_norm": 0.36512333154678345, + "learning_rate": 1.806680051284232e-05, + "loss": 0.1278, + "step": 35349 + }, + { + "epoch": 0.6305069025791032, + "grad_norm": 0.22032690048217773, + "learning_rate": 1.8065305072225115e-05, + "loss": 0.1253, + "step": 35350 + }, + { + "epoch": 0.6305247387008169, + "grad_norm": 0.42624059319496155, + "learning_rate": 1.8063809658489183e-05, + "loss": 0.1633, + "step": 35351 + }, + { + "epoch": 0.6305425748225306, + "grad_norm": 0.24347251653671265, + "learning_rate": 1.8062314271640335e-05, + "loss": 0.121, + "step": 35352 + }, + { + "epoch": 0.6305604109442443, + "grad_norm": 0.2626495957374573, + "learning_rate": 1.8060818911684347e-05, + "loss": 0.2003, + "step": 35353 + }, + { + "epoch": 0.630578247065958, + "grad_norm": 0.18093395233154297, + "learning_rate": 1.8059323578627012e-05, + "loss": 0.1031, + "step": 35354 + }, + { + "epoch": 0.6305960831876717, + "grad_norm": 0.23422425985336304, + "learning_rate": 1.8057828272474146e-05, + "loss": 0.1394, + "step": 35355 + }, + { + "epoch": 0.6306139193093854, + "grad_norm": 0.2314637303352356, + "learning_rate": 1.8056332993231538e-05, + "loss": 0.1144, + "step": 35356 + }, + { + "epoch": 0.6306317554310991, + "grad_norm": 0.2514317035675049, + "learning_rate": 1.8054837740904974e-05, + "loss": 0.1, + "step": 35357 + }, + { + "epoch": 0.6306495915528127, + "grad_norm": 0.2635010778903961, + "learning_rate": 1.805334251550026e-05, + "loss": 0.1233, + "step": 35358 + }, + { + "epoch": 0.6306674276745264, + "grad_norm": 0.31342944502830505, + "learning_rate": 1.8051847317023186e-05, + "loss": 0.1216, + "step": 35359 + }, + { + "epoch": 0.6306852637962401, + "grad_norm": 0.29472488164901733, + "learning_rate": 1.805035214547956e-05, + "loss": 0.1309, + "step": 35360 + }, + { + "epoch": 0.6307030999179538, + "grad_norm": 0.28208810091018677, + "learning_rate": 1.804885700087517e-05, + "loss": 0.2039, + "step": 35361 + }, + { + "epoch": 0.6307209360396675, + "grad_norm": 0.22973541915416718, + "learning_rate": 1.804736188321581e-05, + "loss": 0.1171, + "step": 35362 + }, + { + "epoch": 0.6307387721613812, + "grad_norm": 0.2388831228017807, + "learning_rate": 1.8045866792507264e-05, + "loss": 0.0958, + "step": 35363 + }, + { + "epoch": 0.6307566082830949, + "grad_norm": 0.36623063683509827, + "learning_rate": 1.804437172875535e-05, + "loss": 0.179, + "step": 35364 + }, + { + "epoch": 0.6307744444048086, + "grad_norm": 0.3038334846496582, + "learning_rate": 1.8042876691965854e-05, + "loss": 0.1082, + "step": 35365 + }, + { + "epoch": 0.6307922805265224, + "grad_norm": 0.4096284508705139, + "learning_rate": 1.804138168214457e-05, + "loss": 0.1713, + "step": 35366 + }, + { + "epoch": 0.630810116648236, + "grad_norm": 0.23774613440036774, + "learning_rate": 1.8039886699297287e-05, + "loss": 0.1382, + "step": 35367 + }, + { + "epoch": 0.6308279527699497, + "grad_norm": 0.22512365877628326, + "learning_rate": 1.803839174342982e-05, + "loss": 0.1226, + "step": 35368 + }, + { + "epoch": 0.6308457888916634, + "grad_norm": 0.29915398359298706, + "learning_rate": 1.803689681454795e-05, + "loss": 0.153, + "step": 35369 + }, + { + "epoch": 0.6308636250133771, + "grad_norm": 0.26841655373573303, + "learning_rate": 1.8035401912657468e-05, + "loss": 0.0868, + "step": 35370 + }, + { + "epoch": 0.6308814611350908, + "grad_norm": 0.24068579077720642, + "learning_rate": 1.8033907037764167e-05, + "loss": 0.1445, + "step": 35371 + }, + { + "epoch": 0.6308992972568045, + "grad_norm": 0.32395458221435547, + "learning_rate": 1.803241218987386e-05, + "loss": 0.1177, + "step": 35372 + }, + { + "epoch": 0.6309171333785182, + "grad_norm": 0.23811013996601105, + "learning_rate": 1.803091736899232e-05, + "loss": 0.1279, + "step": 35373 + }, + { + "epoch": 0.6309349695002319, + "grad_norm": 0.22637274861335754, + "learning_rate": 1.802942257512536e-05, + "loss": 0.1313, + "step": 35374 + }, + { + "epoch": 0.6309528056219456, + "grad_norm": 0.3097386956214905, + "learning_rate": 1.8027927808278765e-05, + "loss": 0.1434, + "step": 35375 + }, + { + "epoch": 0.6309706417436592, + "grad_norm": 0.26544082164764404, + "learning_rate": 1.8026433068458322e-05, + "loss": 0.1145, + "step": 35376 + }, + { + "epoch": 0.6309884778653729, + "grad_norm": 0.2120247781276703, + "learning_rate": 1.8024938355669837e-05, + "loss": 0.0861, + "step": 35377 + }, + { + "epoch": 0.6310063139870866, + "grad_norm": 0.33098259568214417, + "learning_rate": 1.80234436699191e-05, + "loss": 0.2105, + "step": 35378 + }, + { + "epoch": 0.6310241501088003, + "grad_norm": 0.2544548511505127, + "learning_rate": 1.802194901121191e-05, + "loss": 0.1287, + "step": 35379 + }, + { + "epoch": 0.631041986230514, + "grad_norm": 0.3424955904483795, + "learning_rate": 1.8020454379554043e-05, + "loss": 0.1076, + "step": 35380 + }, + { + "epoch": 0.6310598223522277, + "grad_norm": 0.34930822253227234, + "learning_rate": 1.8018959774951317e-05, + "loss": 0.1959, + "step": 35381 + }, + { + "epoch": 0.6310776584739414, + "grad_norm": 0.2676719129085541, + "learning_rate": 1.8017465197409515e-05, + "loss": 0.1169, + "step": 35382 + }, + { + "epoch": 0.6310954945956552, + "grad_norm": 0.2419026792049408, + "learning_rate": 1.8015970646934427e-05, + "loss": 0.1098, + "step": 35383 + }, + { + "epoch": 0.6311133307173689, + "grad_norm": 0.2810690701007843, + "learning_rate": 1.8014476123531843e-05, + "loss": 0.1476, + "step": 35384 + }, + { + "epoch": 0.6311311668390825, + "grad_norm": 0.27666938304901123, + "learning_rate": 1.801298162720757e-05, + "loss": 0.1369, + "step": 35385 + }, + { + "epoch": 0.6311490029607962, + "grad_norm": 0.29098209738731384, + "learning_rate": 1.8011487157967385e-05, + "loss": 0.1528, + "step": 35386 + }, + { + "epoch": 0.6311668390825099, + "grad_norm": 0.3187029957771301, + "learning_rate": 1.8009992715817098e-05, + "loss": 0.1557, + "step": 35387 + }, + { + "epoch": 0.6311846752042236, + "grad_norm": 0.26841095089912415, + "learning_rate": 1.8008498300762494e-05, + "loss": 0.1369, + "step": 35388 + }, + { + "epoch": 0.6312025113259373, + "grad_norm": 0.19001835584640503, + "learning_rate": 1.8007003912809356e-05, + "loss": 0.1049, + "step": 35389 + }, + { + "epoch": 0.631220347447651, + "grad_norm": 0.24521635472774506, + "learning_rate": 1.8005509551963494e-05, + "loss": 0.1163, + "step": 35390 + }, + { + "epoch": 0.6312381835693647, + "grad_norm": 0.27687886357307434, + "learning_rate": 1.8004015218230694e-05, + "loss": 0.1474, + "step": 35391 + }, + { + "epoch": 0.6312560196910784, + "grad_norm": 0.23672008514404297, + "learning_rate": 1.8002520911616743e-05, + "loss": 0.1028, + "step": 35392 + }, + { + "epoch": 0.631273855812792, + "grad_norm": 0.24028043448925018, + "learning_rate": 1.8001026632127435e-05, + "loss": 0.1088, + "step": 35393 + }, + { + "epoch": 0.6312916919345057, + "grad_norm": 0.27435794472694397, + "learning_rate": 1.7999532379768567e-05, + "loss": 0.0855, + "step": 35394 + }, + { + "epoch": 0.6313095280562194, + "grad_norm": 0.26046356558799744, + "learning_rate": 1.7998038154545935e-05, + "loss": 0.0864, + "step": 35395 + }, + { + "epoch": 0.6313273641779331, + "grad_norm": 0.29235291481018066, + "learning_rate": 1.7996543956465325e-05, + "loss": 0.1659, + "step": 35396 + }, + { + "epoch": 0.6313452002996468, + "grad_norm": 0.274905264377594, + "learning_rate": 1.7995049785532518e-05, + "loss": 0.1183, + "step": 35397 + }, + { + "epoch": 0.6313630364213605, + "grad_norm": 0.31036612391471863, + "learning_rate": 1.799355564175333e-05, + "loss": 0.1428, + "step": 35398 + }, + { + "epoch": 0.6313808725430742, + "grad_norm": 0.27010050415992737, + "learning_rate": 1.7992061525133543e-05, + "loss": 0.1515, + "step": 35399 + }, + { + "epoch": 0.631398708664788, + "grad_norm": 0.3791591227054596, + "learning_rate": 1.7990567435678935e-05, + "loss": 0.1558, + "step": 35400 + }, + { + "epoch": 0.6314165447865017, + "grad_norm": 0.25948670506477356, + "learning_rate": 1.7989073373395315e-05, + "loss": 0.0875, + "step": 35401 + }, + { + "epoch": 0.6314343809082154, + "grad_norm": 0.21399211883544922, + "learning_rate": 1.798757933828846e-05, + "loss": 0.0835, + "step": 35402 + }, + { + "epoch": 0.631452217029929, + "grad_norm": 0.22009773552417755, + "learning_rate": 1.7986085330364177e-05, + "loss": 0.1328, + "step": 35403 + }, + { + "epoch": 0.6314700531516427, + "grad_norm": 0.3885743319988251, + "learning_rate": 1.798459134962825e-05, + "loss": 0.1383, + "step": 35404 + }, + { + "epoch": 0.6314878892733564, + "grad_norm": 0.20693336427211761, + "learning_rate": 1.798309739608647e-05, + "loss": 0.1289, + "step": 35405 + }, + { + "epoch": 0.6315057253950701, + "grad_norm": 0.36580348014831543, + "learning_rate": 1.798160346974462e-05, + "loss": 0.1375, + "step": 35406 + }, + { + "epoch": 0.6315235615167838, + "grad_norm": 0.2585957944393158, + "learning_rate": 1.7980109570608504e-05, + "loss": 0.1375, + "step": 35407 + }, + { + "epoch": 0.6315413976384975, + "grad_norm": 0.2605012357234955, + "learning_rate": 1.797861569868391e-05, + "loss": 0.0938, + "step": 35408 + }, + { + "epoch": 0.6315592337602112, + "grad_norm": 0.21506501734256744, + "learning_rate": 1.797712185397663e-05, + "loss": 0.1124, + "step": 35409 + }, + { + "epoch": 0.6315770698819249, + "grad_norm": 0.3258536756038666, + "learning_rate": 1.7975628036492444e-05, + "loss": 0.1323, + "step": 35410 + }, + { + "epoch": 0.6315949060036385, + "grad_norm": 0.22610147297382355, + "learning_rate": 1.797413424623715e-05, + "loss": 0.1152, + "step": 35411 + }, + { + "epoch": 0.6316127421253522, + "grad_norm": 0.22787535190582275, + "learning_rate": 1.7972640483216546e-05, + "loss": 0.1298, + "step": 35412 + }, + { + "epoch": 0.6316305782470659, + "grad_norm": 0.32898852229118347, + "learning_rate": 1.797114674743641e-05, + "loss": 0.1244, + "step": 35413 + }, + { + "epoch": 0.6316484143687796, + "grad_norm": 0.2604621350765228, + "learning_rate": 1.796965303890254e-05, + "loss": 0.172, + "step": 35414 + }, + { + "epoch": 0.6316662504904933, + "grad_norm": 0.31942859292030334, + "learning_rate": 1.7968159357620712e-05, + "loss": 0.1063, + "step": 35415 + }, + { + "epoch": 0.6316840866122071, + "grad_norm": 0.34835657477378845, + "learning_rate": 1.7966665703596736e-05, + "loss": 0.1041, + "step": 35416 + }, + { + "epoch": 0.6317019227339208, + "grad_norm": 0.26756277680397034, + "learning_rate": 1.7965172076836394e-05, + "loss": 0.1313, + "step": 35417 + }, + { + "epoch": 0.6317197588556345, + "grad_norm": 0.3831746578216553, + "learning_rate": 1.7963678477345477e-05, + "loss": 0.168, + "step": 35418 + }, + { + "epoch": 0.6317375949773482, + "grad_norm": 0.28253981471061707, + "learning_rate": 1.796218490512976e-05, + "loss": 0.1293, + "step": 35419 + }, + { + "epoch": 0.6317554310990618, + "grad_norm": 0.28203126788139343, + "learning_rate": 1.796069136019506e-05, + "loss": 0.1108, + "step": 35420 + }, + { + "epoch": 0.6317732672207755, + "grad_norm": 0.29136377573013306, + "learning_rate": 1.795919784254714e-05, + "loss": 0.139, + "step": 35421 + }, + { + "epoch": 0.6317911033424892, + "grad_norm": 0.2343776375055313, + "learning_rate": 1.795770435219181e-05, + "loss": 0.0724, + "step": 35422 + }, + { + "epoch": 0.6318089394642029, + "grad_norm": 0.2761262059211731, + "learning_rate": 1.795621088913484e-05, + "loss": 0.1395, + "step": 35423 + }, + { + "epoch": 0.6318267755859166, + "grad_norm": 0.263091117143631, + "learning_rate": 1.7954717453382035e-05, + "loss": 0.0654, + "step": 35424 + }, + { + "epoch": 0.6318446117076303, + "grad_norm": 0.22587265074253082, + "learning_rate": 1.7953224044939186e-05, + "loss": 0.1215, + "step": 35425 + }, + { + "epoch": 0.631862447829344, + "grad_norm": 0.2848919928073883, + "learning_rate": 1.795173066381207e-05, + "loss": 0.1314, + "step": 35426 + }, + { + "epoch": 0.6318802839510577, + "grad_norm": 0.28789907693862915, + "learning_rate": 1.7950237310006474e-05, + "loss": 0.1281, + "step": 35427 + }, + { + "epoch": 0.6318981200727714, + "grad_norm": 0.2507967948913574, + "learning_rate": 1.7948743983528187e-05, + "loss": 0.093, + "step": 35428 + }, + { + "epoch": 0.631915956194485, + "grad_norm": 0.24011845886707306, + "learning_rate": 1.794725068438302e-05, + "loss": 0.1097, + "step": 35429 + }, + { + "epoch": 0.6319337923161987, + "grad_norm": 0.21063701808452606, + "learning_rate": 1.794575741257674e-05, + "loss": 0.0642, + "step": 35430 + }, + { + "epoch": 0.6319516284379124, + "grad_norm": 0.28884100914001465, + "learning_rate": 1.7944264168115144e-05, + "loss": 0.1426, + "step": 35431 + }, + { + "epoch": 0.6319694645596261, + "grad_norm": 0.2138759195804596, + "learning_rate": 1.7942770951004007e-05, + "loss": 0.1489, + "step": 35432 + }, + { + "epoch": 0.6319873006813399, + "grad_norm": 0.2516428232192993, + "learning_rate": 1.7941277761249138e-05, + "loss": 0.1459, + "step": 35433 + }, + { + "epoch": 0.6320051368030536, + "grad_norm": 0.30709967017173767, + "learning_rate": 1.7939784598856305e-05, + "loss": 0.139, + "step": 35434 + }, + { + "epoch": 0.6320229729247673, + "grad_norm": 0.2141241729259491, + "learning_rate": 1.7938291463831314e-05, + "loss": 0.0964, + "step": 35435 + }, + { + "epoch": 0.632040809046481, + "grad_norm": 0.3225373327732086, + "learning_rate": 1.7936798356179928e-05, + "loss": 0.142, + "step": 35436 + }, + { + "epoch": 0.6320586451681947, + "grad_norm": 0.5683669447898865, + "learning_rate": 1.793530527590797e-05, + "loss": 0.1654, + "step": 35437 + }, + { + "epoch": 0.6320764812899083, + "grad_norm": 0.20786359906196594, + "learning_rate": 1.7933812223021207e-05, + "loss": 0.0944, + "step": 35438 + }, + { + "epoch": 0.632094317411622, + "grad_norm": 0.23722374439239502, + "learning_rate": 1.7932319197525423e-05, + "loss": 0.117, + "step": 35439 + }, + { + "epoch": 0.6321121535333357, + "grad_norm": 0.22797037661075592, + "learning_rate": 1.793082619942641e-05, + "loss": 0.1134, + "step": 35440 + }, + { + "epoch": 0.6321299896550494, + "grad_norm": 0.27003049850463867, + "learning_rate": 1.7929333228729956e-05, + "loss": 0.0831, + "step": 35441 + }, + { + "epoch": 0.6321478257767631, + "grad_norm": 0.264720618724823, + "learning_rate": 1.792784028544185e-05, + "loss": 0.138, + "step": 35442 + }, + { + "epoch": 0.6321656618984768, + "grad_norm": 0.2569016218185425, + "learning_rate": 1.792634736956788e-05, + "loss": 0.1218, + "step": 35443 + }, + { + "epoch": 0.6321834980201905, + "grad_norm": 0.229520782828331, + "learning_rate": 1.7924854481113832e-05, + "loss": 0.1467, + "step": 35444 + }, + { + "epoch": 0.6322013341419042, + "grad_norm": 0.3214118778705597, + "learning_rate": 1.792336162008548e-05, + "loss": 0.2048, + "step": 35445 + }, + { + "epoch": 0.6322191702636178, + "grad_norm": 0.2533576488494873, + "learning_rate": 1.7921868786488632e-05, + "loss": 0.1219, + "step": 35446 + }, + { + "epoch": 0.6322370063853315, + "grad_norm": 0.2557198405265808, + "learning_rate": 1.792037598032907e-05, + "loss": 0.1063, + "step": 35447 + }, + { + "epoch": 0.6322548425070452, + "grad_norm": 0.27354636788368225, + "learning_rate": 1.791888320161257e-05, + "loss": 0.1562, + "step": 35448 + }, + { + "epoch": 0.6322726786287589, + "grad_norm": 0.22823558747768402, + "learning_rate": 1.7917390450344916e-05, + "loss": 0.1284, + "step": 35449 + }, + { + "epoch": 0.6322905147504727, + "grad_norm": 0.23739616572856903, + "learning_rate": 1.7915897726531913e-05, + "loss": 0.1166, + "step": 35450 + }, + { + "epoch": 0.6323083508721864, + "grad_norm": 0.21627265214920044, + "learning_rate": 1.7914405030179336e-05, + "loss": 0.0861, + "step": 35451 + }, + { + "epoch": 0.6323261869939001, + "grad_norm": 0.23258353769779205, + "learning_rate": 1.7912912361292978e-05, + "loss": 0.125, + "step": 35452 + }, + { + "epoch": 0.6323440231156138, + "grad_norm": 0.23154449462890625, + "learning_rate": 1.7911419719878616e-05, + "loss": 0.1145, + "step": 35453 + }, + { + "epoch": 0.6323618592373275, + "grad_norm": 0.21744675934314728, + "learning_rate": 1.7909927105942033e-05, + "loss": 0.1045, + "step": 35454 + }, + { + "epoch": 0.6323796953590411, + "grad_norm": 0.38329291343688965, + "learning_rate": 1.790843451948902e-05, + "loss": 0.1396, + "step": 35455 + }, + { + "epoch": 0.6323975314807548, + "grad_norm": 0.22451618313789368, + "learning_rate": 1.7906941960525376e-05, + "loss": 0.1263, + "step": 35456 + }, + { + "epoch": 0.6324153676024685, + "grad_norm": 0.27160733938217163, + "learning_rate": 1.790544942905687e-05, + "loss": 0.127, + "step": 35457 + }, + { + "epoch": 0.6324332037241822, + "grad_norm": 0.39817431569099426, + "learning_rate": 1.7903956925089283e-05, + "loss": 0.1451, + "step": 35458 + }, + { + "epoch": 0.6324510398458959, + "grad_norm": 0.34811851382255554, + "learning_rate": 1.790246444862842e-05, + "loss": 0.2113, + "step": 35459 + }, + { + "epoch": 0.6324688759676096, + "grad_norm": 0.2817688584327698, + "learning_rate": 1.7900971999680056e-05, + "loss": 0.1934, + "step": 35460 + }, + { + "epoch": 0.6324867120893233, + "grad_norm": 0.2951194643974304, + "learning_rate": 1.7899479578249972e-05, + "loss": 0.1118, + "step": 35461 + }, + { + "epoch": 0.632504548211037, + "grad_norm": 0.22974111139774323, + "learning_rate": 1.789798718434396e-05, + "loss": 0.1024, + "step": 35462 + }, + { + "epoch": 0.6325223843327507, + "grad_norm": 0.2822018265724182, + "learning_rate": 1.78964948179678e-05, + "loss": 0.1206, + "step": 35463 + }, + { + "epoch": 0.6325402204544643, + "grad_norm": 0.2894124686717987, + "learning_rate": 1.7895002479127283e-05, + "loss": 0.1408, + "step": 35464 + }, + { + "epoch": 0.632558056576178, + "grad_norm": 0.3011755347251892, + "learning_rate": 1.7893510167828192e-05, + "loss": 0.0877, + "step": 35465 + }, + { + "epoch": 0.6325758926978917, + "grad_norm": 0.2765551209449768, + "learning_rate": 1.789201788407631e-05, + "loss": 0.1284, + "step": 35466 + }, + { + "epoch": 0.6325937288196055, + "grad_norm": 0.16731123626232147, + "learning_rate": 1.7890525627877415e-05, + "loss": 0.0566, + "step": 35467 + }, + { + "epoch": 0.6326115649413192, + "grad_norm": 0.23484602570533752, + "learning_rate": 1.7889033399237305e-05, + "loss": 0.0872, + "step": 35468 + }, + { + "epoch": 0.6326294010630329, + "grad_norm": 0.26866719126701355, + "learning_rate": 1.788754119816175e-05, + "loss": 0.0844, + "step": 35469 + }, + { + "epoch": 0.6326472371847466, + "grad_norm": 0.2254677414894104, + "learning_rate": 1.7886049024656555e-05, + "loss": 0.1125, + "step": 35470 + }, + { + "epoch": 0.6326650733064603, + "grad_norm": 0.23129501938819885, + "learning_rate": 1.788455687872747e-05, + "loss": 0.0864, + "step": 35471 + }, + { + "epoch": 0.632682909428174, + "grad_norm": 0.25478705763816833, + "learning_rate": 1.7883064760380318e-05, + "loss": 0.1352, + "step": 35472 + }, + { + "epoch": 0.6327007455498876, + "grad_norm": 0.2677682340145111, + "learning_rate": 1.7881572669620865e-05, + "loss": 0.1136, + "step": 35473 + }, + { + "epoch": 0.6327185816716013, + "grad_norm": 0.25433531403541565, + "learning_rate": 1.7880080606454893e-05, + "loss": 0.1532, + "step": 35474 + }, + { + "epoch": 0.632736417793315, + "grad_norm": 0.2669883072376251, + "learning_rate": 1.7878588570888178e-05, + "loss": 0.1163, + "step": 35475 + }, + { + "epoch": 0.6327542539150287, + "grad_norm": 0.2778422236442566, + "learning_rate": 1.787709656292652e-05, + "loss": 0.1339, + "step": 35476 + }, + { + "epoch": 0.6327720900367424, + "grad_norm": 0.1996283084154129, + "learning_rate": 1.7875604582575695e-05, + "loss": 0.0714, + "step": 35477 + }, + { + "epoch": 0.6327899261584561, + "grad_norm": 0.27443283796310425, + "learning_rate": 1.7874112629841494e-05, + "loss": 0.0842, + "step": 35478 + }, + { + "epoch": 0.6328077622801698, + "grad_norm": 0.34911268949508667, + "learning_rate": 1.7872620704729688e-05, + "loss": 0.0948, + "step": 35479 + }, + { + "epoch": 0.6328255984018835, + "grad_norm": 0.21580761671066284, + "learning_rate": 1.787112880724606e-05, + "loss": 0.1, + "step": 35480 + }, + { + "epoch": 0.6328434345235971, + "grad_norm": 0.23352940380573273, + "learning_rate": 1.786963693739641e-05, + "loss": 0.1175, + "step": 35481 + }, + { + "epoch": 0.6328612706453108, + "grad_norm": 0.23411938548088074, + "learning_rate": 1.78681450951865e-05, + "loss": 0.1555, + "step": 35482 + }, + { + "epoch": 0.6328791067670245, + "grad_norm": 0.2596302330493927, + "learning_rate": 1.786665328062213e-05, + "loss": 0.1349, + "step": 35483 + }, + { + "epoch": 0.6328969428887383, + "grad_norm": 0.3417161703109741, + "learning_rate": 1.7865161493709067e-05, + "loss": 0.1198, + "step": 35484 + }, + { + "epoch": 0.632914779010452, + "grad_norm": 0.31121987104415894, + "learning_rate": 1.7863669734453115e-05, + "loss": 0.1268, + "step": 35485 + }, + { + "epoch": 0.6329326151321657, + "grad_norm": 0.22938700020313263, + "learning_rate": 1.786217800286004e-05, + "loss": 0.1227, + "step": 35486 + }, + { + "epoch": 0.6329504512538794, + "grad_norm": 0.25341150164604187, + "learning_rate": 1.786068629893563e-05, + "loss": 0.1138, + "step": 35487 + }, + { + "epoch": 0.6329682873755931, + "grad_norm": 0.3317761719226837, + "learning_rate": 1.7859194622685653e-05, + "loss": 0.0888, + "step": 35488 + }, + { + "epoch": 0.6329861234973068, + "grad_norm": 0.2983003854751587, + "learning_rate": 1.7857702974115915e-05, + "loss": 0.1612, + "step": 35489 + }, + { + "epoch": 0.6330039596190205, + "grad_norm": 0.23485371470451355, + "learning_rate": 1.7856211353232184e-05, + "loss": 0.0575, + "step": 35490 + }, + { + "epoch": 0.6330217957407341, + "grad_norm": 0.25275713205337524, + "learning_rate": 1.7854719760040247e-05, + "loss": 0.1278, + "step": 35491 + }, + { + "epoch": 0.6330396318624478, + "grad_norm": 0.25497502088546753, + "learning_rate": 1.7853228194545887e-05, + "loss": 0.1361, + "step": 35492 + }, + { + "epoch": 0.6330574679841615, + "grad_norm": 0.32100817561149597, + "learning_rate": 1.7851736656754872e-05, + "loss": 0.1927, + "step": 35493 + }, + { + "epoch": 0.6330753041058752, + "grad_norm": 0.3167615830898285, + "learning_rate": 1.7850245146673005e-05, + "loss": 0.1235, + "step": 35494 + }, + { + "epoch": 0.6330931402275889, + "grad_norm": 0.27259528636932373, + "learning_rate": 1.7848753664306056e-05, + "loss": 0.106, + "step": 35495 + }, + { + "epoch": 0.6331109763493026, + "grad_norm": 0.24194732308387756, + "learning_rate": 1.7847262209659805e-05, + "loss": 0.1338, + "step": 35496 + }, + { + "epoch": 0.6331288124710163, + "grad_norm": 0.20803293585777283, + "learning_rate": 1.7845770782740035e-05, + "loss": 0.0623, + "step": 35497 + }, + { + "epoch": 0.63314664859273, + "grad_norm": 0.2798668146133423, + "learning_rate": 1.7844279383552528e-05, + "loss": 0.1364, + "step": 35498 + }, + { + "epoch": 0.6331644847144436, + "grad_norm": 0.26660528779029846, + "learning_rate": 1.7842788012103073e-05, + "loss": 0.1407, + "step": 35499 + }, + { + "epoch": 0.6331823208361573, + "grad_norm": 0.28037959337234497, + "learning_rate": 1.784129666839744e-05, + "loss": 0.1617, + "step": 35500 + }, + { + "epoch": 0.6332001569578711, + "grad_norm": 0.26895105838775635, + "learning_rate": 1.7839805352441406e-05, + "loss": 0.1269, + "step": 35501 + }, + { + "epoch": 0.6332179930795848, + "grad_norm": 0.39157748222351074, + "learning_rate": 1.7838314064240768e-05, + "loss": 0.1529, + "step": 35502 + }, + { + "epoch": 0.6332358292012985, + "grad_norm": 0.2692275047302246, + "learning_rate": 1.7836822803801302e-05, + "loss": 0.1558, + "step": 35503 + }, + { + "epoch": 0.6332536653230122, + "grad_norm": 0.3016781508922577, + "learning_rate": 1.7835331571128773e-05, + "loss": 0.1249, + "step": 35504 + }, + { + "epoch": 0.6332715014447259, + "grad_norm": 0.22750602662563324, + "learning_rate": 1.7833840366228984e-05, + "loss": 0.1004, + "step": 35505 + }, + { + "epoch": 0.6332893375664396, + "grad_norm": 0.24716240167617798, + "learning_rate": 1.7832349189107694e-05, + "loss": 0.1071, + "step": 35506 + }, + { + "epoch": 0.6333071736881533, + "grad_norm": 0.29719221591949463, + "learning_rate": 1.7830858039770705e-05, + "loss": 0.0892, + "step": 35507 + }, + { + "epoch": 0.633325009809867, + "grad_norm": 0.2533837854862213, + "learning_rate": 1.7829366918223782e-05, + "loss": 0.1297, + "step": 35508 + }, + { + "epoch": 0.6333428459315806, + "grad_norm": 0.22649559378623962, + "learning_rate": 1.7827875824472707e-05, + "loss": 0.0712, + "step": 35509 + }, + { + "epoch": 0.6333606820532943, + "grad_norm": 0.3142421245574951, + "learning_rate": 1.782638475852326e-05, + "loss": 0.1693, + "step": 35510 + }, + { + "epoch": 0.633378518175008, + "grad_norm": 0.3048451840877533, + "learning_rate": 1.782489372038123e-05, + "loss": 0.1504, + "step": 35511 + }, + { + "epoch": 0.6333963542967217, + "grad_norm": 0.22719921171665192, + "learning_rate": 1.782340271005239e-05, + "loss": 0.1455, + "step": 35512 + }, + { + "epoch": 0.6334141904184354, + "grad_norm": 0.26450788974761963, + "learning_rate": 1.7821911727542524e-05, + "loss": 0.1376, + "step": 35513 + }, + { + "epoch": 0.6334320265401491, + "grad_norm": 0.22958171367645264, + "learning_rate": 1.7820420772857392e-05, + "loss": 0.1471, + "step": 35514 + }, + { + "epoch": 0.6334498626618628, + "grad_norm": 0.24663661420345306, + "learning_rate": 1.7818929846002802e-05, + "loss": 0.0869, + "step": 35515 + }, + { + "epoch": 0.6334676987835764, + "grad_norm": 0.2804535925388336, + "learning_rate": 1.7817438946984523e-05, + "loss": 0.1257, + "step": 35516 + }, + { + "epoch": 0.6334855349052902, + "grad_norm": 0.23646791279315948, + "learning_rate": 1.781594807580832e-05, + "loss": 0.1403, + "step": 35517 + }, + { + "epoch": 0.6335033710270039, + "grad_norm": 0.24591633677482605, + "learning_rate": 1.7814457232479993e-05, + "loss": 0.1691, + "step": 35518 + }, + { + "epoch": 0.6335212071487176, + "grad_norm": 0.35216137766838074, + "learning_rate": 1.7812966417005298e-05, + "loss": 0.1352, + "step": 35519 + }, + { + "epoch": 0.6335390432704313, + "grad_norm": 0.21545404195785522, + "learning_rate": 1.781147562939004e-05, + "loss": 0.1526, + "step": 35520 + }, + { + "epoch": 0.633556879392145, + "grad_norm": 0.27540323138237, + "learning_rate": 1.7809984869639986e-05, + "loss": 0.0944, + "step": 35521 + }, + { + "epoch": 0.6335747155138587, + "grad_norm": 0.285632461309433, + "learning_rate": 1.780849413776091e-05, + "loss": 0.1222, + "step": 35522 + }, + { + "epoch": 0.6335925516355724, + "grad_norm": 0.3289864659309387, + "learning_rate": 1.780700343375859e-05, + "loss": 0.0917, + "step": 35523 + }, + { + "epoch": 0.6336103877572861, + "grad_norm": 0.27732864022254944, + "learning_rate": 1.7805512757638815e-05, + "loss": 0.1528, + "step": 35524 + }, + { + "epoch": 0.6336282238789998, + "grad_norm": 0.3110494017601013, + "learning_rate": 1.7804022109407354e-05, + "loss": 0.1446, + "step": 35525 + }, + { + "epoch": 0.6336460600007134, + "grad_norm": 0.24167461693286896, + "learning_rate": 1.7802531489069996e-05, + "loss": 0.1208, + "step": 35526 + }, + { + "epoch": 0.6336638961224271, + "grad_norm": 0.28206831216812134, + "learning_rate": 1.7801040896632497e-05, + "loss": 0.152, + "step": 35527 + }, + { + "epoch": 0.6336817322441408, + "grad_norm": 0.3009721040725708, + "learning_rate": 1.779955033210066e-05, + "loss": 0.1823, + "step": 35528 + }, + { + "epoch": 0.6336995683658545, + "grad_norm": 0.20980344712734222, + "learning_rate": 1.7798059795480258e-05, + "loss": 0.1058, + "step": 35529 + }, + { + "epoch": 0.6337174044875682, + "grad_norm": 0.235727921128273, + "learning_rate": 1.779656928677706e-05, + "loss": 0.1177, + "step": 35530 + }, + { + "epoch": 0.6337352406092819, + "grad_norm": 0.32229310274124146, + "learning_rate": 1.7795078805996845e-05, + "loss": 0.1283, + "step": 35531 + }, + { + "epoch": 0.6337530767309956, + "grad_norm": 0.2604036033153534, + "learning_rate": 1.7793588353145385e-05, + "loss": 0.1021, + "step": 35532 + }, + { + "epoch": 0.6337709128527093, + "grad_norm": 0.256980836391449, + "learning_rate": 1.779209792822848e-05, + "loss": 0.116, + "step": 35533 + }, + { + "epoch": 0.633788748974423, + "grad_norm": 0.2389703392982483, + "learning_rate": 1.7790607531251886e-05, + "loss": 0.121, + "step": 35534 + }, + { + "epoch": 0.6338065850961367, + "grad_norm": 0.24550354480743408, + "learning_rate": 1.778911716222139e-05, + "loss": 0.0616, + "step": 35535 + }, + { + "epoch": 0.6338244212178504, + "grad_norm": 0.28213346004486084, + "learning_rate": 1.7787626821142762e-05, + "loss": 0.1214, + "step": 35536 + }, + { + "epoch": 0.6338422573395641, + "grad_norm": 0.21905206143856049, + "learning_rate": 1.778613650802179e-05, + "loss": 0.1474, + "step": 35537 + }, + { + "epoch": 0.6338600934612778, + "grad_norm": 0.3287425637245178, + "learning_rate": 1.7784646222864236e-05, + "loss": 0.1766, + "step": 35538 + }, + { + "epoch": 0.6338779295829915, + "grad_norm": 0.18994520604610443, + "learning_rate": 1.7783155965675893e-05, + "loss": 0.1187, + "step": 35539 + }, + { + "epoch": 0.6338957657047052, + "grad_norm": 0.29004308581352234, + "learning_rate": 1.778166573646252e-05, + "loss": 0.1474, + "step": 35540 + }, + { + "epoch": 0.6339136018264189, + "grad_norm": 0.1801350712776184, + "learning_rate": 1.7780175535229916e-05, + "loss": 0.0587, + "step": 35541 + }, + { + "epoch": 0.6339314379481326, + "grad_norm": 0.3327726721763611, + "learning_rate": 1.7778685361983842e-05, + "loss": 0.2133, + "step": 35542 + }, + { + "epoch": 0.6339492740698462, + "grad_norm": 0.314936101436615, + "learning_rate": 1.777719521673008e-05, + "loss": 0.1558, + "step": 35543 + }, + { + "epoch": 0.6339671101915599, + "grad_norm": 0.2670227885246277, + "learning_rate": 1.7775705099474398e-05, + "loss": 0.1464, + "step": 35544 + }, + { + "epoch": 0.6339849463132736, + "grad_norm": 0.3188895285129547, + "learning_rate": 1.7774215010222577e-05, + "loss": 0.103, + "step": 35545 + }, + { + "epoch": 0.6340027824349873, + "grad_norm": 0.27550625801086426, + "learning_rate": 1.7772724948980395e-05, + "loss": 0.1175, + "step": 35546 + }, + { + "epoch": 0.634020618556701, + "grad_norm": 0.20753541588783264, + "learning_rate": 1.777123491575363e-05, + "loss": 0.0777, + "step": 35547 + }, + { + "epoch": 0.6340384546784147, + "grad_norm": 0.2836662828922272, + "learning_rate": 1.7769744910548062e-05, + "loss": 0.0908, + "step": 35548 + }, + { + "epoch": 0.6340562908001284, + "grad_norm": 0.23503363132476807, + "learning_rate": 1.7768254933369445e-05, + "loss": 0.1358, + "step": 35549 + }, + { + "epoch": 0.6340741269218421, + "grad_norm": 0.2181215137243271, + "learning_rate": 1.776676498422358e-05, + "loss": 0.1199, + "step": 35550 + }, + { + "epoch": 0.6340919630435559, + "grad_norm": 0.3643927574157715, + "learning_rate": 1.7765275063116232e-05, + "loss": 0.0994, + "step": 35551 + }, + { + "epoch": 0.6341097991652695, + "grad_norm": 0.27812886238098145, + "learning_rate": 1.7763785170053174e-05, + "loss": 0.1238, + "step": 35552 + }, + { + "epoch": 0.6341276352869832, + "grad_norm": 0.25353866815567017, + "learning_rate": 1.7762295305040184e-05, + "loss": 0.1569, + "step": 35553 + }, + { + "epoch": 0.6341454714086969, + "grad_norm": 0.28846225142478943, + "learning_rate": 1.7760805468083032e-05, + "loss": 0.1461, + "step": 35554 + }, + { + "epoch": 0.6341633075304106, + "grad_norm": 0.33720266819000244, + "learning_rate": 1.775931565918751e-05, + "loss": 0.1077, + "step": 35555 + }, + { + "epoch": 0.6341811436521243, + "grad_norm": 0.32191047072410583, + "learning_rate": 1.7757825878359376e-05, + "loss": 0.1594, + "step": 35556 + }, + { + "epoch": 0.634198979773838, + "grad_norm": 0.18688996136188507, + "learning_rate": 1.775633612560441e-05, + "loss": 0.0838, + "step": 35557 + }, + { + "epoch": 0.6342168158955517, + "grad_norm": 0.34050777554512024, + "learning_rate": 1.7754846400928382e-05, + "loss": 0.1339, + "step": 35558 + }, + { + "epoch": 0.6342346520172654, + "grad_norm": 0.2503282427787781, + "learning_rate": 1.7753356704337076e-05, + "loss": 0.1137, + "step": 35559 + }, + { + "epoch": 0.634252488138979, + "grad_norm": 0.3249073624610901, + "learning_rate": 1.7751867035836265e-05, + "loss": 0.1313, + "step": 35560 + }, + { + "epoch": 0.6342703242606927, + "grad_norm": 0.3234670162200928, + "learning_rate": 1.7750377395431723e-05, + "loss": 0.1454, + "step": 35561 + }, + { + "epoch": 0.6342881603824064, + "grad_norm": 0.33902645111083984, + "learning_rate": 1.774888778312921e-05, + "loss": 0.1158, + "step": 35562 + }, + { + "epoch": 0.6343059965041201, + "grad_norm": 0.28533488512039185, + "learning_rate": 1.7747398198934524e-05, + "loss": 0.1087, + "step": 35563 + }, + { + "epoch": 0.6343238326258338, + "grad_norm": 0.24593883752822876, + "learning_rate": 1.7745908642853427e-05, + "loss": 0.1102, + "step": 35564 + }, + { + "epoch": 0.6343416687475475, + "grad_norm": 0.23370836675167084, + "learning_rate": 1.7744419114891686e-05, + "loss": 0.1593, + "step": 35565 + }, + { + "epoch": 0.6343595048692612, + "grad_norm": 0.22786179184913635, + "learning_rate": 1.7742929615055084e-05, + "loss": 0.1068, + "step": 35566 + }, + { + "epoch": 0.6343773409909749, + "grad_norm": 0.28017571568489075, + "learning_rate": 1.7741440143349395e-05, + "loss": 0.0901, + "step": 35567 + }, + { + "epoch": 0.6343951771126887, + "grad_norm": 0.2812330722808838, + "learning_rate": 1.7739950699780396e-05, + "loss": 0.1301, + "step": 35568 + }, + { + "epoch": 0.6344130132344024, + "grad_norm": 0.3041155934333801, + "learning_rate": 1.7738461284353852e-05, + "loss": 0.1343, + "step": 35569 + }, + { + "epoch": 0.634430849356116, + "grad_norm": 0.26265212893486023, + "learning_rate": 1.7736971897075542e-05, + "loss": 0.1526, + "step": 35570 + }, + { + "epoch": 0.6344486854778297, + "grad_norm": 0.2535557448863983, + "learning_rate": 1.7735482537951232e-05, + "loss": 0.1418, + "step": 35571 + }, + { + "epoch": 0.6344665215995434, + "grad_norm": 0.2951648533344269, + "learning_rate": 1.7733993206986705e-05, + "loss": 0.1338, + "step": 35572 + }, + { + "epoch": 0.6344843577212571, + "grad_norm": 0.2221120446920395, + "learning_rate": 1.7732503904187723e-05, + "loss": 0.1289, + "step": 35573 + }, + { + "epoch": 0.6345021938429708, + "grad_norm": 0.2539917826652527, + "learning_rate": 1.773101462956008e-05, + "loss": 0.1191, + "step": 35574 + }, + { + "epoch": 0.6345200299646845, + "grad_norm": 0.27111244201660156, + "learning_rate": 1.7729525383109518e-05, + "loss": 0.0865, + "step": 35575 + }, + { + "epoch": 0.6345378660863982, + "grad_norm": 0.27790161967277527, + "learning_rate": 1.772803616484184e-05, + "loss": 0.1288, + "step": 35576 + }, + { + "epoch": 0.6345557022081119, + "grad_norm": 0.2927458882331848, + "learning_rate": 1.7726546974762805e-05, + "loss": 0.1265, + "step": 35577 + }, + { + "epoch": 0.6345735383298255, + "grad_norm": 0.26739680767059326, + "learning_rate": 1.7725057812878183e-05, + "loss": 0.145, + "step": 35578 + }, + { + "epoch": 0.6345913744515392, + "grad_norm": 0.27402496337890625, + "learning_rate": 1.772356867919374e-05, + "loss": 0.0985, + "step": 35579 + }, + { + "epoch": 0.6346092105732529, + "grad_norm": 0.24193179607391357, + "learning_rate": 1.7722079573715273e-05, + "loss": 0.1293, + "step": 35580 + }, + { + "epoch": 0.6346270466949666, + "grad_norm": 0.2774505615234375, + "learning_rate": 1.772059049644853e-05, + "loss": 0.1419, + "step": 35581 + }, + { + "epoch": 0.6346448828166803, + "grad_norm": 0.22570741176605225, + "learning_rate": 1.77191014473993e-05, + "loss": 0.1248, + "step": 35582 + }, + { + "epoch": 0.634662718938394, + "grad_norm": 0.28375673294067383, + "learning_rate": 1.7717612426573342e-05, + "loss": 0.1305, + "step": 35583 + }, + { + "epoch": 0.6346805550601077, + "grad_norm": 0.19663527607917786, + "learning_rate": 1.771612343397643e-05, + "loss": 0.0762, + "step": 35584 + }, + { + "epoch": 0.6346983911818215, + "grad_norm": 0.3376742899417877, + "learning_rate": 1.771463446961435e-05, + "loss": 0.1256, + "step": 35585 + }, + { + "epoch": 0.6347162273035352, + "grad_norm": 0.29334262013435364, + "learning_rate": 1.7713145533492854e-05, + "loss": 0.1324, + "step": 35586 + }, + { + "epoch": 0.6347340634252489, + "grad_norm": 0.2523557245731354, + "learning_rate": 1.771165662561773e-05, + "loss": 0.1556, + "step": 35587 + }, + { + "epoch": 0.6347518995469625, + "grad_norm": 0.25364235043525696, + "learning_rate": 1.7710167745994725e-05, + "loss": 0.1171, + "step": 35588 + }, + { + "epoch": 0.6347697356686762, + "grad_norm": 0.21740026772022247, + "learning_rate": 1.7708678894629645e-05, + "loss": 0.0898, + "step": 35589 + }, + { + "epoch": 0.6347875717903899, + "grad_norm": 0.296003133058548, + "learning_rate": 1.7707190071528244e-05, + "loss": 0.1287, + "step": 35590 + }, + { + "epoch": 0.6348054079121036, + "grad_norm": 0.384896457195282, + "learning_rate": 1.770570127669629e-05, + "loss": 0.1239, + "step": 35591 + }, + { + "epoch": 0.6348232440338173, + "grad_norm": 0.39708447456359863, + "learning_rate": 1.770421251013955e-05, + "loss": 0.1216, + "step": 35592 + }, + { + "epoch": 0.634841080155531, + "grad_norm": 0.27358177304267883, + "learning_rate": 1.770272377186381e-05, + "loss": 0.1401, + "step": 35593 + }, + { + "epoch": 0.6348589162772447, + "grad_norm": 0.23284989595413208, + "learning_rate": 1.7701235061874833e-05, + "loss": 0.1073, + "step": 35594 + }, + { + "epoch": 0.6348767523989584, + "grad_norm": 0.20216771960258484, + "learning_rate": 1.769974638017839e-05, + "loss": 0.0887, + "step": 35595 + }, + { + "epoch": 0.634894588520672, + "grad_norm": 0.4100785255432129, + "learning_rate": 1.769825772678025e-05, + "loss": 0.129, + "step": 35596 + }, + { + "epoch": 0.6349124246423857, + "grad_norm": 0.3061642050743103, + "learning_rate": 1.7696769101686178e-05, + "loss": 0.1144, + "step": 35597 + }, + { + "epoch": 0.6349302607640994, + "grad_norm": 0.2426021248102188, + "learning_rate": 1.7695280504901962e-05, + "loss": 0.1205, + "step": 35598 + }, + { + "epoch": 0.6349480968858131, + "grad_norm": 0.25247666239738464, + "learning_rate": 1.7693791936433364e-05, + "loss": 0.0971, + "step": 35599 + }, + { + "epoch": 0.6349659330075268, + "grad_norm": 0.248277485370636, + "learning_rate": 1.769230339628614e-05, + "loss": 0.0957, + "step": 35600 + }, + { + "epoch": 0.6349837691292405, + "grad_norm": 0.29131919145584106, + "learning_rate": 1.7690814884466076e-05, + "loss": 0.1475, + "step": 35601 + }, + { + "epoch": 0.6350016052509543, + "grad_norm": 0.23761622607707977, + "learning_rate": 1.7689326400978937e-05, + "loss": 0.1116, + "step": 35602 + }, + { + "epoch": 0.635019441372668, + "grad_norm": 0.2253965586423874, + "learning_rate": 1.76878379458305e-05, + "loss": 0.109, + "step": 35603 + }, + { + "epoch": 0.6350372774943817, + "grad_norm": 0.20029105246067047, + "learning_rate": 1.768634951902653e-05, + "loss": 0.1178, + "step": 35604 + }, + { + "epoch": 0.6350551136160953, + "grad_norm": 0.2966753840446472, + "learning_rate": 1.7684861120572783e-05, + "loss": 0.1764, + "step": 35605 + }, + { + "epoch": 0.635072949737809, + "grad_norm": 0.3561903238296509, + "learning_rate": 1.768337275047505e-05, + "loss": 0.1306, + "step": 35606 + }, + { + "epoch": 0.6350907858595227, + "grad_norm": 0.3040234446525574, + "learning_rate": 1.768188440873909e-05, + "loss": 0.1564, + "step": 35607 + }, + { + "epoch": 0.6351086219812364, + "grad_norm": 0.33845001459121704, + "learning_rate": 1.7680396095370672e-05, + "loss": 0.1244, + "step": 35608 + }, + { + "epoch": 0.6351264581029501, + "grad_norm": 0.30711135268211365, + "learning_rate": 1.7678907810375574e-05, + "loss": 0.1715, + "step": 35609 + }, + { + "epoch": 0.6351442942246638, + "grad_norm": 0.21534603834152222, + "learning_rate": 1.7677419553759546e-05, + "loss": 0.0841, + "step": 35610 + }, + { + "epoch": 0.6351621303463775, + "grad_norm": 0.25893518328666687, + "learning_rate": 1.7675931325528375e-05, + "loss": 0.0913, + "step": 35611 + }, + { + "epoch": 0.6351799664680912, + "grad_norm": 0.29681482911109924, + "learning_rate": 1.767444312568783e-05, + "loss": 0.0675, + "step": 35612 + }, + { + "epoch": 0.6351978025898048, + "grad_norm": 0.21716804802417755, + "learning_rate": 1.767295495424366e-05, + "loss": 0.1549, + "step": 35613 + }, + { + "epoch": 0.6352156387115185, + "grad_norm": 0.28385019302368164, + "learning_rate": 1.7671466811201653e-05, + "loss": 0.1418, + "step": 35614 + }, + { + "epoch": 0.6352334748332322, + "grad_norm": 0.4200933277606964, + "learning_rate": 1.7669978696567575e-05, + "loss": 0.1026, + "step": 35615 + }, + { + "epoch": 0.6352513109549459, + "grad_norm": 0.1612042933702469, + "learning_rate": 1.766849061034719e-05, + "loss": 0.1134, + "step": 35616 + }, + { + "epoch": 0.6352691470766596, + "grad_norm": 0.21531610190868378, + "learning_rate": 1.766700255254627e-05, + "loss": 0.13, + "step": 35617 + }, + { + "epoch": 0.6352869831983734, + "grad_norm": 0.2569911479949951, + "learning_rate": 1.7665514523170575e-05, + "loss": 0.117, + "step": 35618 + }, + { + "epoch": 0.6353048193200871, + "grad_norm": 0.3161217272281647, + "learning_rate": 1.7664026522225885e-05, + "loss": 0.1056, + "step": 35619 + }, + { + "epoch": 0.6353226554418008, + "grad_norm": 0.2552807331085205, + "learning_rate": 1.7662538549717962e-05, + "loss": 0.1043, + "step": 35620 + }, + { + "epoch": 0.6353404915635145, + "grad_norm": 0.2670985758304596, + "learning_rate": 1.7661050605652568e-05, + "loss": 0.1343, + "step": 35621 + }, + { + "epoch": 0.6353583276852282, + "grad_norm": 0.2933639585971832, + "learning_rate": 1.765956269003548e-05, + "loss": 0.157, + "step": 35622 + }, + { + "epoch": 0.6353761638069418, + "grad_norm": 0.3414352536201477, + "learning_rate": 1.7658074802872456e-05, + "loss": 0.166, + "step": 35623 + }, + { + "epoch": 0.6353939999286555, + "grad_norm": 0.22835437953472137, + "learning_rate": 1.7656586944169278e-05, + "loss": 0.0802, + "step": 35624 + }, + { + "epoch": 0.6354118360503692, + "grad_norm": 0.29871606826782227, + "learning_rate": 1.765509911393171e-05, + "loss": 0.145, + "step": 35625 + }, + { + "epoch": 0.6354296721720829, + "grad_norm": 0.3048114478588104, + "learning_rate": 1.765361131216551e-05, + "loss": 0.1526, + "step": 35626 + }, + { + "epoch": 0.6354475082937966, + "grad_norm": 0.24475961923599243, + "learning_rate": 1.765212353887644e-05, + "loss": 0.1202, + "step": 35627 + }, + { + "epoch": 0.6354653444155103, + "grad_norm": 0.2493455857038498, + "learning_rate": 1.7650635794070287e-05, + "loss": 0.1356, + "step": 35628 + }, + { + "epoch": 0.635483180537224, + "grad_norm": 0.23936395347118378, + "learning_rate": 1.76491480777528e-05, + "loss": 0.1072, + "step": 35629 + }, + { + "epoch": 0.6355010166589377, + "grad_norm": 0.2471265345811844, + "learning_rate": 1.7647660389929765e-05, + "loss": 0.1469, + "step": 35630 + }, + { + "epoch": 0.6355188527806513, + "grad_norm": 0.25238221883773804, + "learning_rate": 1.7646172730606928e-05, + "loss": 0.1307, + "step": 35631 + }, + { + "epoch": 0.635536688902365, + "grad_norm": 0.2571694552898407, + "learning_rate": 1.7644685099790073e-05, + "loss": 0.1512, + "step": 35632 + }, + { + "epoch": 0.6355545250240787, + "grad_norm": 0.24108360707759857, + "learning_rate": 1.764319749748496e-05, + "loss": 0.1126, + "step": 35633 + }, + { + "epoch": 0.6355723611457924, + "grad_norm": 0.2176097333431244, + "learning_rate": 1.7641709923697353e-05, + "loss": 0.106, + "step": 35634 + }, + { + "epoch": 0.6355901972675062, + "grad_norm": 0.32649004459381104, + "learning_rate": 1.7640222378433014e-05, + "loss": 0.1245, + "step": 35635 + }, + { + "epoch": 0.6356080333892199, + "grad_norm": 0.28774306178092957, + "learning_rate": 1.763873486169771e-05, + "loss": 0.1153, + "step": 35636 + }, + { + "epoch": 0.6356258695109336, + "grad_norm": 0.3249153196811676, + "learning_rate": 1.7637247373497226e-05, + "loss": 0.142, + "step": 35637 + }, + { + "epoch": 0.6356437056326473, + "grad_norm": 0.2671641409397125, + "learning_rate": 1.7635759913837314e-05, + "loss": 0.15, + "step": 35638 + }, + { + "epoch": 0.635661541754361, + "grad_norm": 0.3016379475593567, + "learning_rate": 1.7634272482723736e-05, + "loss": 0.1488, + "step": 35639 + }, + { + "epoch": 0.6356793778760746, + "grad_norm": 0.4725029468536377, + "learning_rate": 1.763278508016225e-05, + "loss": 0.1868, + "step": 35640 + }, + { + "epoch": 0.6356972139977883, + "grad_norm": 0.3273630738258362, + "learning_rate": 1.763129770615865e-05, + "loss": 0.1387, + "step": 35641 + }, + { + "epoch": 0.635715050119502, + "grad_norm": 0.2814585566520691, + "learning_rate": 1.762981036071868e-05, + "loss": 0.1179, + "step": 35642 + }, + { + "epoch": 0.6357328862412157, + "grad_norm": 0.3014568090438843, + "learning_rate": 1.7628323043848112e-05, + "loss": 0.1189, + "step": 35643 + }, + { + "epoch": 0.6357507223629294, + "grad_norm": 0.26144954562187195, + "learning_rate": 1.7626835755552702e-05, + "loss": 0.1371, + "step": 35644 + }, + { + "epoch": 0.6357685584846431, + "grad_norm": 0.2884332835674286, + "learning_rate": 1.7625348495838227e-05, + "loss": 0.2149, + "step": 35645 + }, + { + "epoch": 0.6357863946063568, + "grad_norm": 0.2273387610912323, + "learning_rate": 1.7623861264710457e-05, + "loss": 0.1397, + "step": 35646 + }, + { + "epoch": 0.6358042307280705, + "grad_norm": 0.27078571915626526, + "learning_rate": 1.7622374062175144e-05, + "loss": 0.1181, + "step": 35647 + }, + { + "epoch": 0.6358220668497842, + "grad_norm": 0.2247803956270218, + "learning_rate": 1.7620886888238052e-05, + "loss": 0.1117, + "step": 35648 + }, + { + "epoch": 0.6358399029714978, + "grad_norm": 0.21677479147911072, + "learning_rate": 1.7619399742904954e-05, + "loss": 0.1627, + "step": 35649 + }, + { + "epoch": 0.6358577390932115, + "grad_norm": 0.26142430305480957, + "learning_rate": 1.761791262618161e-05, + "loss": 0.0893, + "step": 35650 + }, + { + "epoch": 0.6358755752149252, + "grad_norm": 0.3166951835155487, + "learning_rate": 1.761642553807379e-05, + "loss": 0.1391, + "step": 35651 + }, + { + "epoch": 0.635893411336639, + "grad_norm": 0.2555030286312103, + "learning_rate": 1.7614938478587256e-05, + "loss": 0.1149, + "step": 35652 + }, + { + "epoch": 0.6359112474583527, + "grad_norm": 0.26169532537460327, + "learning_rate": 1.7613451447727765e-05, + "loss": 0.1681, + "step": 35653 + }, + { + "epoch": 0.6359290835800664, + "grad_norm": 0.28741729259490967, + "learning_rate": 1.7611964445501093e-05, + "loss": 0.1178, + "step": 35654 + }, + { + "epoch": 0.6359469197017801, + "grad_norm": 0.33196428418159485, + "learning_rate": 1.7610477471913e-05, + "loss": 0.0873, + "step": 35655 + }, + { + "epoch": 0.6359647558234938, + "grad_norm": 0.2961922883987427, + "learning_rate": 1.7608990526969242e-05, + "loss": 0.1112, + "step": 35656 + }, + { + "epoch": 0.6359825919452075, + "grad_norm": 0.27429425716400146, + "learning_rate": 1.760750361067559e-05, + "loss": 0.1246, + "step": 35657 + }, + { + "epoch": 0.6360004280669211, + "grad_norm": 0.23447948694229126, + "learning_rate": 1.7606016723037808e-05, + "loss": 0.1238, + "step": 35658 + }, + { + "epoch": 0.6360182641886348, + "grad_norm": 0.2777652442455292, + "learning_rate": 1.7604529864061664e-05, + "loss": 0.1365, + "step": 35659 + }, + { + "epoch": 0.6360361003103485, + "grad_norm": 0.23477955162525177, + "learning_rate": 1.7603043033752916e-05, + "loss": 0.1012, + "step": 35660 + }, + { + "epoch": 0.6360539364320622, + "grad_norm": 0.2730066776275635, + "learning_rate": 1.7601556232117328e-05, + "loss": 0.1826, + "step": 35661 + }, + { + "epoch": 0.6360717725537759, + "grad_norm": 0.34212398529052734, + "learning_rate": 1.7600069459160656e-05, + "loss": 0.128, + "step": 35662 + }, + { + "epoch": 0.6360896086754896, + "grad_norm": 0.22933344542980194, + "learning_rate": 1.7598582714888674e-05, + "loss": 0.1017, + "step": 35663 + }, + { + "epoch": 0.6361074447972033, + "grad_norm": 0.35666170716285706, + "learning_rate": 1.7597095999307144e-05, + "loss": 0.156, + "step": 35664 + }, + { + "epoch": 0.636125280918917, + "grad_norm": 0.26355376839637756, + "learning_rate": 1.759560931242183e-05, + "loss": 0.094, + "step": 35665 + }, + { + "epoch": 0.6361431170406306, + "grad_norm": 0.24461154639720917, + "learning_rate": 1.759412265423848e-05, + "loss": 0.1304, + "step": 35666 + }, + { + "epoch": 0.6361609531623443, + "grad_norm": 0.31336119771003723, + "learning_rate": 1.759263602476288e-05, + "loss": 0.171, + "step": 35667 + }, + { + "epoch": 0.636178789284058, + "grad_norm": 0.24168451130390167, + "learning_rate": 1.759114942400078e-05, + "loss": 0.1055, + "step": 35668 + }, + { + "epoch": 0.6361966254057718, + "grad_norm": 0.3456285893917084, + "learning_rate": 1.758966285195794e-05, + "loss": 0.1505, + "step": 35669 + }, + { + "epoch": 0.6362144615274855, + "grad_norm": 0.3267318606376648, + "learning_rate": 1.7588176308640124e-05, + "loss": 0.1658, + "step": 35670 + }, + { + "epoch": 0.6362322976491992, + "grad_norm": 0.2556320130825043, + "learning_rate": 1.7586689794053095e-05, + "loss": 0.1198, + "step": 35671 + }, + { + "epoch": 0.6362501337709129, + "grad_norm": 0.29885348677635193, + "learning_rate": 1.758520330820263e-05, + "loss": 0.1149, + "step": 35672 + }, + { + "epoch": 0.6362679698926266, + "grad_norm": 0.37620025873184204, + "learning_rate": 1.758371685109447e-05, + "loss": 0.1615, + "step": 35673 + }, + { + "epoch": 0.6362858060143403, + "grad_norm": 0.2157006710767746, + "learning_rate": 1.758223042273439e-05, + "loss": 0.1186, + "step": 35674 + }, + { + "epoch": 0.636303642136054, + "grad_norm": 0.25466403365135193, + "learning_rate": 1.7580744023128133e-05, + "loss": 0.0967, + "step": 35675 + }, + { + "epoch": 0.6363214782577676, + "grad_norm": 0.2770139276981354, + "learning_rate": 1.7579257652281487e-05, + "loss": 0.0927, + "step": 35676 + }, + { + "epoch": 0.6363393143794813, + "grad_norm": 0.20351476967334747, + "learning_rate": 1.7577771310200193e-05, + "loss": 0.0912, + "step": 35677 + }, + { + "epoch": 0.636357150501195, + "grad_norm": 0.27072563767433167, + "learning_rate": 1.757628499689003e-05, + "loss": 0.1248, + "step": 35678 + }, + { + "epoch": 0.6363749866229087, + "grad_norm": 0.3443397283554077, + "learning_rate": 1.757479871235674e-05, + "loss": 0.1182, + "step": 35679 + }, + { + "epoch": 0.6363928227446224, + "grad_norm": 0.30466386675834656, + "learning_rate": 1.7573312456606103e-05, + "loss": 0.1494, + "step": 35680 + }, + { + "epoch": 0.6364106588663361, + "grad_norm": 0.3863126337528229, + "learning_rate": 1.7571826229643874e-05, + "loss": 0.1812, + "step": 35681 + }, + { + "epoch": 0.6364284949880498, + "grad_norm": 0.3277699649333954, + "learning_rate": 1.7570340031475808e-05, + "loss": 0.1362, + "step": 35682 + }, + { + "epoch": 0.6364463311097635, + "grad_norm": 0.25987011194229126, + "learning_rate": 1.7568853862107666e-05, + "loss": 0.1372, + "step": 35683 + }, + { + "epoch": 0.6364641672314771, + "grad_norm": 0.3657575249671936, + "learning_rate": 1.756736772154522e-05, + "loss": 0.1324, + "step": 35684 + }, + { + "epoch": 0.6364820033531908, + "grad_norm": 0.3056054413318634, + "learning_rate": 1.756588160979422e-05, + "loss": 0.1334, + "step": 35685 + }, + { + "epoch": 0.6364998394749046, + "grad_norm": 0.2705551087856293, + "learning_rate": 1.7564395526860433e-05, + "loss": 0.1188, + "step": 35686 + }, + { + "epoch": 0.6365176755966183, + "grad_norm": 0.28838902711868286, + "learning_rate": 1.756290947274961e-05, + "loss": 0.1533, + "step": 35687 + }, + { + "epoch": 0.636535511718332, + "grad_norm": 0.3430512845516205, + "learning_rate": 1.7561423447467527e-05, + "loss": 0.1664, + "step": 35688 + }, + { + "epoch": 0.6365533478400457, + "grad_norm": 0.3388262391090393, + "learning_rate": 1.7559937451019937e-05, + "loss": 0.1484, + "step": 35689 + }, + { + "epoch": 0.6365711839617594, + "grad_norm": 0.28996366262435913, + "learning_rate": 1.755845148341259e-05, + "loss": 0.1809, + "step": 35690 + }, + { + "epoch": 0.6365890200834731, + "grad_norm": 0.23881135880947113, + "learning_rate": 1.7556965544651264e-05, + "loss": 0.1142, + "step": 35691 + }, + { + "epoch": 0.6366068562051868, + "grad_norm": 0.4434231221675873, + "learning_rate": 1.75554796347417e-05, + "loss": 0.1873, + "step": 35692 + }, + { + "epoch": 0.6366246923269004, + "grad_norm": 0.267465740442276, + "learning_rate": 1.755399375368968e-05, + "loss": 0.1638, + "step": 35693 + }, + { + "epoch": 0.6366425284486141, + "grad_norm": 0.2562426030635834, + "learning_rate": 1.7552507901500948e-05, + "loss": 0.1707, + "step": 35694 + }, + { + "epoch": 0.6366603645703278, + "grad_norm": 0.3207739293575287, + "learning_rate": 1.7551022078181272e-05, + "loss": 0.1342, + "step": 35695 + }, + { + "epoch": 0.6366782006920415, + "grad_norm": 0.25512486696243286, + "learning_rate": 1.7549536283736397e-05, + "loss": 0.0666, + "step": 35696 + }, + { + "epoch": 0.6366960368137552, + "grad_norm": 0.3103644847869873, + "learning_rate": 1.75480505181721e-05, + "loss": 0.0924, + "step": 35697 + }, + { + "epoch": 0.6367138729354689, + "grad_norm": 0.2736837565898895, + "learning_rate": 1.7546564781494126e-05, + "loss": 0.1115, + "step": 35698 + }, + { + "epoch": 0.6367317090571826, + "grad_norm": 0.2774411737918854, + "learning_rate": 1.754507907370825e-05, + "loss": 0.1225, + "step": 35699 + }, + { + "epoch": 0.6367495451788963, + "grad_norm": 0.33015555143356323, + "learning_rate": 1.754359339482021e-05, + "loss": 0.1194, + "step": 35700 + }, + { + "epoch": 0.63676738130061, + "grad_norm": 0.2377718836069107, + "learning_rate": 1.7542107744835788e-05, + "loss": 0.1177, + "step": 35701 + }, + { + "epoch": 0.6367852174223236, + "grad_norm": 0.2173222303390503, + "learning_rate": 1.7540622123760736e-05, + "loss": 0.1022, + "step": 35702 + }, + { + "epoch": 0.6368030535440374, + "grad_norm": 0.27858468890190125, + "learning_rate": 1.7539136531600803e-05, + "loss": 0.1105, + "step": 35703 + }, + { + "epoch": 0.6368208896657511, + "grad_norm": 0.3620661497116089, + "learning_rate": 1.7537650968361753e-05, + "loss": 0.1499, + "step": 35704 + }, + { + "epoch": 0.6368387257874648, + "grad_norm": 0.2760492265224457, + "learning_rate": 1.7536165434049346e-05, + "loss": 0.1991, + "step": 35705 + }, + { + "epoch": 0.6368565619091785, + "grad_norm": 0.2980818450450897, + "learning_rate": 1.7534679928669335e-05, + "loss": 0.1687, + "step": 35706 + }, + { + "epoch": 0.6368743980308922, + "grad_norm": 0.3211263120174408, + "learning_rate": 1.7533194452227493e-05, + "loss": 0.147, + "step": 35707 + }, + { + "epoch": 0.6368922341526059, + "grad_norm": 0.2986134886741638, + "learning_rate": 1.7531709004729562e-05, + "loss": 0.1362, + "step": 35708 + }, + { + "epoch": 0.6369100702743196, + "grad_norm": 0.30289921164512634, + "learning_rate": 1.75302235861813e-05, + "loss": 0.1772, + "step": 35709 + }, + { + "epoch": 0.6369279063960332, + "grad_norm": 0.2835098206996918, + "learning_rate": 1.7528738196588484e-05, + "loss": 0.1301, + "step": 35710 + }, + { + "epoch": 0.6369457425177469, + "grad_norm": 0.2816462814807892, + "learning_rate": 1.7527252835956856e-05, + "loss": 0.1081, + "step": 35711 + }, + { + "epoch": 0.6369635786394606, + "grad_norm": 0.2283925712108612, + "learning_rate": 1.7525767504292172e-05, + "loss": 0.1442, + "step": 35712 + }, + { + "epoch": 0.6369814147611743, + "grad_norm": 0.22793836891651154, + "learning_rate": 1.752428220160019e-05, + "loss": 0.147, + "step": 35713 + }, + { + "epoch": 0.636999250882888, + "grad_norm": 0.4044966399669647, + "learning_rate": 1.7522796927886684e-05, + "loss": 0.1251, + "step": 35714 + }, + { + "epoch": 0.6370170870046017, + "grad_norm": 0.24216333031654358, + "learning_rate": 1.75213116831574e-05, + "loss": 0.1262, + "step": 35715 + }, + { + "epoch": 0.6370349231263154, + "grad_norm": 0.24650423228740692, + "learning_rate": 1.7519826467418092e-05, + "loss": 0.1202, + "step": 35716 + }, + { + "epoch": 0.6370527592480291, + "grad_norm": 0.40519607067108154, + "learning_rate": 1.751834128067452e-05, + "loss": 0.073, + "step": 35717 + }, + { + "epoch": 0.6370705953697428, + "grad_norm": 0.43996450304985046, + "learning_rate": 1.7516856122932434e-05, + "loss": 0.1438, + "step": 35718 + }, + { + "epoch": 0.6370884314914566, + "grad_norm": 0.2021629363298416, + "learning_rate": 1.7515370994197603e-05, + "loss": 0.1516, + "step": 35719 + }, + { + "epoch": 0.6371062676131702, + "grad_norm": 0.20187199115753174, + "learning_rate": 1.7513885894475785e-05, + "loss": 0.0818, + "step": 35720 + }, + { + "epoch": 0.6371241037348839, + "grad_norm": 0.3389519453048706, + "learning_rate": 1.751240082377273e-05, + "loss": 0.1703, + "step": 35721 + }, + { + "epoch": 0.6371419398565976, + "grad_norm": 0.2622396945953369, + "learning_rate": 1.751091578209418e-05, + "loss": 0.1317, + "step": 35722 + }, + { + "epoch": 0.6371597759783113, + "grad_norm": 0.30014288425445557, + "learning_rate": 1.7509430769445923e-05, + "loss": 0.1546, + "step": 35723 + }, + { + "epoch": 0.637177612100025, + "grad_norm": 0.22925706207752228, + "learning_rate": 1.7507945785833697e-05, + "loss": 0.1139, + "step": 35724 + }, + { + "epoch": 0.6371954482217387, + "grad_norm": 0.2455938458442688, + "learning_rate": 1.750646083126326e-05, + "loss": 0.1087, + "step": 35725 + }, + { + "epoch": 0.6372132843434524, + "grad_norm": 0.25852280855178833, + "learning_rate": 1.7504975905740365e-05, + "loss": 0.1463, + "step": 35726 + }, + { + "epoch": 0.6372311204651661, + "grad_norm": 0.24533499777317047, + "learning_rate": 1.7503491009270774e-05, + "loss": 0.1449, + "step": 35727 + }, + { + "epoch": 0.6372489565868797, + "grad_norm": 0.2822505831718445, + "learning_rate": 1.7502006141860244e-05, + "loss": 0.1117, + "step": 35728 + }, + { + "epoch": 0.6372667927085934, + "grad_norm": 0.25432395935058594, + "learning_rate": 1.7500521303514528e-05, + "loss": 0.1003, + "step": 35729 + }, + { + "epoch": 0.6372846288303071, + "grad_norm": 0.23213037848472595, + "learning_rate": 1.7499036494239386e-05, + "loss": 0.1525, + "step": 35730 + }, + { + "epoch": 0.6373024649520208, + "grad_norm": 0.2061501294374466, + "learning_rate": 1.7497551714040554e-05, + "loss": 0.1212, + "step": 35731 + }, + { + "epoch": 0.6373203010737345, + "grad_norm": 0.1983170211315155, + "learning_rate": 1.7496066962923816e-05, + "loss": 0.0937, + "step": 35732 + }, + { + "epoch": 0.6373381371954482, + "grad_norm": 0.26010000705718994, + "learning_rate": 1.7494582240894907e-05, + "loss": 0.1626, + "step": 35733 + }, + { + "epoch": 0.6373559733171619, + "grad_norm": 0.24518905580043793, + "learning_rate": 1.74930975479596e-05, + "loss": 0.1243, + "step": 35734 + }, + { + "epoch": 0.6373738094388756, + "grad_norm": 0.24404463171958923, + "learning_rate": 1.7491612884123622e-05, + "loss": 0.1527, + "step": 35735 + }, + { + "epoch": 0.6373916455605894, + "grad_norm": 0.22220630943775177, + "learning_rate": 1.7490128249392758e-05, + "loss": 0.1117, + "step": 35736 + }, + { + "epoch": 0.637409481682303, + "grad_norm": 0.2968965470790863, + "learning_rate": 1.748864364377275e-05, + "loss": 0.1062, + "step": 35737 + }, + { + "epoch": 0.6374273178040167, + "grad_norm": 0.2716768980026245, + "learning_rate": 1.7487159067269356e-05, + "loss": 0.1884, + "step": 35738 + }, + { + "epoch": 0.6374451539257304, + "grad_norm": 0.29261428117752075, + "learning_rate": 1.7485674519888318e-05, + "loss": 0.1199, + "step": 35739 + }, + { + "epoch": 0.6374629900474441, + "grad_norm": 0.30178436636924744, + "learning_rate": 1.7484190001635408e-05, + "loss": 0.1596, + "step": 35740 + }, + { + "epoch": 0.6374808261691578, + "grad_norm": 0.22892168164253235, + "learning_rate": 1.748270551251637e-05, + "loss": 0.1141, + "step": 35741 + }, + { + "epoch": 0.6374986622908715, + "grad_norm": 0.23763489723205566, + "learning_rate": 1.7481221052536973e-05, + "loss": 0.1752, + "step": 35742 + }, + { + "epoch": 0.6375164984125852, + "grad_norm": 0.2826126515865326, + "learning_rate": 1.7479736621702955e-05, + "loss": 0.1254, + "step": 35743 + }, + { + "epoch": 0.6375343345342989, + "grad_norm": 0.2782338559627533, + "learning_rate": 1.7478252220020064e-05, + "loss": 0.1395, + "step": 35744 + }, + { + "epoch": 0.6375521706560126, + "grad_norm": 0.2369375079870224, + "learning_rate": 1.7476767847494077e-05, + "loss": 0.1246, + "step": 35745 + }, + { + "epoch": 0.6375700067777262, + "grad_norm": 0.20960021018981934, + "learning_rate": 1.747528350413073e-05, + "loss": 0.0993, + "step": 35746 + }, + { + "epoch": 0.6375878428994399, + "grad_norm": 0.2015584111213684, + "learning_rate": 1.747379918993579e-05, + "loss": 0.0951, + "step": 35747 + }, + { + "epoch": 0.6376056790211536, + "grad_norm": 0.20795419812202454, + "learning_rate": 1.7472314904914995e-05, + "loss": 0.1034, + "step": 35748 + }, + { + "epoch": 0.6376235151428673, + "grad_norm": 0.2713645100593567, + "learning_rate": 1.7470830649074115e-05, + "loss": 0.107, + "step": 35749 + }, + { + "epoch": 0.637641351264581, + "grad_norm": 0.2632594704627991, + "learning_rate": 1.7469346422418893e-05, + "loss": 0.1032, + "step": 35750 + }, + { + "epoch": 0.6376591873862947, + "grad_norm": 0.2530999481678009, + "learning_rate": 1.746786222495509e-05, + "loss": 0.1229, + "step": 35751 + }, + { + "epoch": 0.6376770235080084, + "grad_norm": 0.20867447555065155, + "learning_rate": 1.746637805668844e-05, + "loss": 0.1303, + "step": 35752 + }, + { + "epoch": 0.6376948596297222, + "grad_norm": 0.23367738723754883, + "learning_rate": 1.7464893917624724e-05, + "loss": 0.1385, + "step": 35753 + }, + { + "epoch": 0.6377126957514359, + "grad_norm": 0.2347860336303711, + "learning_rate": 1.7463409807769678e-05, + "loss": 0.1495, + "step": 35754 + }, + { + "epoch": 0.6377305318731495, + "grad_norm": 0.2776419222354889, + "learning_rate": 1.7461925727129065e-05, + "loss": 0.1457, + "step": 35755 + }, + { + "epoch": 0.6377483679948632, + "grad_norm": 0.23866885900497437, + "learning_rate": 1.7460441675708626e-05, + "loss": 0.113, + "step": 35756 + }, + { + "epoch": 0.6377662041165769, + "grad_norm": 0.28532931208610535, + "learning_rate": 1.745895765351411e-05, + "loss": 0.1442, + "step": 35757 + }, + { + "epoch": 0.6377840402382906, + "grad_norm": 0.2766592800617218, + "learning_rate": 1.7457473660551295e-05, + "loss": 0.1352, + "step": 35758 + }, + { + "epoch": 0.6378018763600043, + "grad_norm": 0.299600750207901, + "learning_rate": 1.7455989696825908e-05, + "loss": 0.1271, + "step": 35759 + }, + { + "epoch": 0.637819712481718, + "grad_norm": 0.27411141991615295, + "learning_rate": 1.745450576234371e-05, + "loss": 0.1173, + "step": 35760 + }, + { + "epoch": 0.6378375486034317, + "grad_norm": 0.2585819959640503, + "learning_rate": 1.7453021857110455e-05, + "loss": 0.082, + "step": 35761 + }, + { + "epoch": 0.6378553847251454, + "grad_norm": 0.219051331281662, + "learning_rate": 1.7451537981131895e-05, + "loss": 0.1238, + "step": 35762 + }, + { + "epoch": 0.637873220846859, + "grad_norm": 0.30432379245758057, + "learning_rate": 1.745005413441378e-05, + "loss": 0.1202, + "step": 35763 + }, + { + "epoch": 0.6378910569685727, + "grad_norm": 0.22092677652835846, + "learning_rate": 1.744857031696187e-05, + "loss": 0.144, + "step": 35764 + }, + { + "epoch": 0.6379088930902864, + "grad_norm": 0.3004605770111084, + "learning_rate": 1.7447086528781892e-05, + "loss": 0.1, + "step": 35765 + }, + { + "epoch": 0.6379267292120001, + "grad_norm": 0.26434415578842163, + "learning_rate": 1.7445602769879633e-05, + "loss": 0.1327, + "step": 35766 + }, + { + "epoch": 0.6379445653337138, + "grad_norm": 0.21752548217773438, + "learning_rate": 1.7444119040260814e-05, + "loss": 0.0713, + "step": 35767 + }, + { + "epoch": 0.6379624014554275, + "grad_norm": 0.24605584144592285, + "learning_rate": 1.7442635339931212e-05, + "loss": 0.1117, + "step": 35768 + }, + { + "epoch": 0.6379802375771412, + "grad_norm": 0.2782864570617676, + "learning_rate": 1.744115166889656e-05, + "loss": 0.1311, + "step": 35769 + }, + { + "epoch": 0.637998073698855, + "grad_norm": 0.4238375425338745, + "learning_rate": 1.743966802716261e-05, + "loss": 0.1304, + "step": 35770 + }, + { + "epoch": 0.6380159098205687, + "grad_norm": 0.26671743392944336, + "learning_rate": 1.7438184414735122e-05, + "loss": 0.1276, + "step": 35771 + }, + { + "epoch": 0.6380337459422823, + "grad_norm": 0.21730798482894897, + "learning_rate": 1.7436700831619846e-05, + "loss": 0.1356, + "step": 35772 + }, + { + "epoch": 0.638051582063996, + "grad_norm": 0.23240308463573456, + "learning_rate": 1.7435217277822523e-05, + "loss": 0.1407, + "step": 35773 + }, + { + "epoch": 0.6380694181857097, + "grad_norm": 0.2365575134754181, + "learning_rate": 1.7433733753348917e-05, + "loss": 0.161, + "step": 35774 + }, + { + "epoch": 0.6380872543074234, + "grad_norm": 0.25101301074028015, + "learning_rate": 1.7432250258204766e-05, + "loss": 0.1315, + "step": 35775 + }, + { + "epoch": 0.6381050904291371, + "grad_norm": 0.29103437066078186, + "learning_rate": 1.7430766792395835e-05, + "loss": 0.1221, + "step": 35776 + }, + { + "epoch": 0.6381229265508508, + "grad_norm": 0.2293984591960907, + "learning_rate": 1.7429283355927865e-05, + "loss": 0.1146, + "step": 35777 + }, + { + "epoch": 0.6381407626725645, + "grad_norm": 0.2701701819896698, + "learning_rate": 1.7427799948806595e-05, + "loss": 0.0909, + "step": 35778 + }, + { + "epoch": 0.6381585987942782, + "grad_norm": 0.3107643723487854, + "learning_rate": 1.7426316571037802e-05, + "loss": 0.1791, + "step": 35779 + }, + { + "epoch": 0.6381764349159919, + "grad_norm": 0.39670756459236145, + "learning_rate": 1.7424833222627218e-05, + "loss": 0.1587, + "step": 35780 + }, + { + "epoch": 0.6381942710377055, + "grad_norm": 0.21293073892593384, + "learning_rate": 1.7423349903580596e-05, + "loss": 0.1267, + "step": 35781 + }, + { + "epoch": 0.6382121071594192, + "grad_norm": 0.2524206042289734, + "learning_rate": 1.742186661390369e-05, + "loss": 0.1107, + "step": 35782 + }, + { + "epoch": 0.6382299432811329, + "grad_norm": 0.21061506867408752, + "learning_rate": 1.7420383353602236e-05, + "loss": 0.1124, + "step": 35783 + }, + { + "epoch": 0.6382477794028466, + "grad_norm": 0.2210262268781662, + "learning_rate": 1.7418900122682003e-05, + "loss": 0.1003, + "step": 35784 + }, + { + "epoch": 0.6382656155245603, + "grad_norm": 0.2678980827331543, + "learning_rate": 1.7417416921148734e-05, + "loss": 0.1138, + "step": 35785 + }, + { + "epoch": 0.638283451646274, + "grad_norm": 0.2675687074661255, + "learning_rate": 1.7415933749008173e-05, + "loss": 0.1272, + "step": 35786 + }, + { + "epoch": 0.6383012877679878, + "grad_norm": 0.28626006841659546, + "learning_rate": 1.7414450606266065e-05, + "loss": 0.1133, + "step": 35787 + }, + { + "epoch": 0.6383191238897015, + "grad_norm": 0.3768504858016968, + "learning_rate": 1.741296749292817e-05, + "loss": 0.2613, + "step": 35788 + }, + { + "epoch": 0.6383369600114152, + "grad_norm": 0.2312556803226471, + "learning_rate": 1.741148440900024e-05, + "loss": 0.1033, + "step": 35789 + }, + { + "epoch": 0.6383547961331288, + "grad_norm": 0.27568963170051575, + "learning_rate": 1.7410001354488015e-05, + "loss": 0.1631, + "step": 35790 + }, + { + "epoch": 0.6383726322548425, + "grad_norm": 0.34590527415275574, + "learning_rate": 1.7408518329397236e-05, + "loss": 0.1898, + "step": 35791 + }, + { + "epoch": 0.6383904683765562, + "grad_norm": 0.3124268651008606, + "learning_rate": 1.740703533373367e-05, + "loss": 0.1312, + "step": 35792 + }, + { + "epoch": 0.6384083044982699, + "grad_norm": 0.2380906641483307, + "learning_rate": 1.7405552367503063e-05, + "loss": 0.1432, + "step": 35793 + }, + { + "epoch": 0.6384261406199836, + "grad_norm": 0.3375542461872101, + "learning_rate": 1.7404069430711155e-05, + "loss": 0.1361, + "step": 35794 + }, + { + "epoch": 0.6384439767416973, + "grad_norm": 0.26807522773742676, + "learning_rate": 1.7402586523363694e-05, + "loss": 0.1456, + "step": 35795 + }, + { + "epoch": 0.638461812863411, + "grad_norm": 0.2570466697216034, + "learning_rate": 1.7401103645466428e-05, + "loss": 0.1404, + "step": 35796 + }, + { + "epoch": 0.6384796489851247, + "grad_norm": 0.2428160011768341, + "learning_rate": 1.7399620797025115e-05, + "loss": 0.1375, + "step": 35797 + }, + { + "epoch": 0.6384974851068383, + "grad_norm": 0.36094430088996887, + "learning_rate": 1.7398137978045498e-05, + "loss": 0.1131, + "step": 35798 + }, + { + "epoch": 0.638515321228552, + "grad_norm": 0.18215768039226532, + "learning_rate": 1.7396655188533324e-05, + "loss": 0.1321, + "step": 35799 + }, + { + "epoch": 0.6385331573502657, + "grad_norm": 0.32127895951271057, + "learning_rate": 1.7395172428494333e-05, + "loss": 0.1182, + "step": 35800 + }, + { + "epoch": 0.6385509934719794, + "grad_norm": 0.2995901107788086, + "learning_rate": 1.739368969793429e-05, + "loss": 0.087, + "step": 35801 + }, + { + "epoch": 0.6385688295936931, + "grad_norm": 0.16200973093509674, + "learning_rate": 1.7392206996858922e-05, + "loss": 0.106, + "step": 35802 + }, + { + "epoch": 0.6385866657154068, + "grad_norm": 0.21286509931087494, + "learning_rate": 1.7390724325274e-05, + "loss": 0.0891, + "step": 35803 + }, + { + "epoch": 0.6386045018371206, + "grad_norm": 0.18890058994293213, + "learning_rate": 1.7389241683185244e-05, + "loss": 0.0962, + "step": 35804 + }, + { + "epoch": 0.6386223379588343, + "grad_norm": 0.2941281199455261, + "learning_rate": 1.738775907059843e-05, + "loss": 0.136, + "step": 35805 + }, + { + "epoch": 0.638640174080548, + "grad_norm": 0.23328498005867004, + "learning_rate": 1.7386276487519287e-05, + "loss": 0.1051, + "step": 35806 + }, + { + "epoch": 0.6386580102022617, + "grad_norm": 0.2769111096858978, + "learning_rate": 1.7384793933953565e-05, + "loss": 0.1724, + "step": 35807 + }, + { + "epoch": 0.6386758463239753, + "grad_norm": 0.22209565341472626, + "learning_rate": 1.738331140990701e-05, + "loss": 0.1558, + "step": 35808 + }, + { + "epoch": 0.638693682445689, + "grad_norm": 0.2373506873846054, + "learning_rate": 1.738182891538537e-05, + "loss": 0.1084, + "step": 35809 + }, + { + "epoch": 0.6387115185674027, + "grad_norm": 0.2676689624786377, + "learning_rate": 1.738034645039439e-05, + "loss": 0.1241, + "step": 35810 + }, + { + "epoch": 0.6387293546891164, + "grad_norm": 0.30411019921302795, + "learning_rate": 1.7378864014939827e-05, + "loss": 0.1231, + "step": 35811 + }, + { + "epoch": 0.6387471908108301, + "grad_norm": 0.293819785118103, + "learning_rate": 1.737738160902742e-05, + "loss": 0.1235, + "step": 35812 + }, + { + "epoch": 0.6387650269325438, + "grad_norm": 0.2943223714828491, + "learning_rate": 1.73758992326629e-05, + "loss": 0.1163, + "step": 35813 + }, + { + "epoch": 0.6387828630542575, + "grad_norm": 0.22722260653972626, + "learning_rate": 1.7374416885852045e-05, + "loss": 0.081, + "step": 35814 + }, + { + "epoch": 0.6388006991759712, + "grad_norm": 0.30290743708610535, + "learning_rate": 1.737293456860058e-05, + "loss": 0.1423, + "step": 35815 + }, + { + "epoch": 0.6388185352976848, + "grad_norm": 0.28768107295036316, + "learning_rate": 1.737145228091425e-05, + "loss": 0.1017, + "step": 35816 + }, + { + "epoch": 0.6388363714193985, + "grad_norm": 0.28279608488082886, + "learning_rate": 1.73699700227988e-05, + "loss": 0.058, + "step": 35817 + }, + { + "epoch": 0.6388542075411122, + "grad_norm": 0.2586454451084137, + "learning_rate": 1.7368487794259992e-05, + "loss": 0.1027, + "step": 35818 + }, + { + "epoch": 0.6388720436628259, + "grad_norm": 0.3797576129436493, + "learning_rate": 1.736700559530356e-05, + "loss": 0.1213, + "step": 35819 + }, + { + "epoch": 0.6388898797845397, + "grad_norm": 0.27608340978622437, + "learning_rate": 1.7365523425935255e-05, + "loss": 0.1443, + "step": 35820 + }, + { + "epoch": 0.6389077159062534, + "grad_norm": 0.33411461114883423, + "learning_rate": 1.7364041286160815e-05, + "loss": 0.073, + "step": 35821 + }, + { + "epoch": 0.6389255520279671, + "grad_norm": 0.22558556497097015, + "learning_rate": 1.7362559175985975e-05, + "loss": 0.1257, + "step": 35822 + }, + { + "epoch": 0.6389433881496808, + "grad_norm": 0.2996184527873993, + "learning_rate": 1.7361077095416502e-05, + "loss": 0.1621, + "step": 35823 + }, + { + "epoch": 0.6389612242713945, + "grad_norm": 0.2431306093931198, + "learning_rate": 1.735959504445814e-05, + "loss": 0.1192, + "step": 35824 + }, + { + "epoch": 0.6389790603931081, + "grad_norm": 0.34502482414245605, + "learning_rate": 1.7358113023116625e-05, + "loss": 0.134, + "step": 35825 + }, + { + "epoch": 0.6389968965148218, + "grad_norm": 0.2907029390335083, + "learning_rate": 1.7356631031397693e-05, + "loss": 0.1002, + "step": 35826 + }, + { + "epoch": 0.6390147326365355, + "grad_norm": 0.19775918126106262, + "learning_rate": 1.7355149069307102e-05, + "loss": 0.1065, + "step": 35827 + }, + { + "epoch": 0.6390325687582492, + "grad_norm": 0.3696146011352539, + "learning_rate": 1.7353667136850603e-05, + "loss": 0.1859, + "step": 35828 + }, + { + "epoch": 0.6390504048799629, + "grad_norm": 0.308233380317688, + "learning_rate": 1.7352185234033925e-05, + "loss": 0.1451, + "step": 35829 + }, + { + "epoch": 0.6390682410016766, + "grad_norm": 0.2556229829788208, + "learning_rate": 1.7350703360862813e-05, + "loss": 0.093, + "step": 35830 + }, + { + "epoch": 0.6390860771233903, + "grad_norm": 0.27065831422805786, + "learning_rate": 1.7349221517343022e-05, + "loss": 0.1386, + "step": 35831 + }, + { + "epoch": 0.639103913245104, + "grad_norm": 0.1999167948961258, + "learning_rate": 1.7347739703480294e-05, + "loss": 0.0911, + "step": 35832 + }, + { + "epoch": 0.6391217493668176, + "grad_norm": 0.2251693159341812, + "learning_rate": 1.734625791928037e-05, + "loss": 0.1396, + "step": 35833 + }, + { + "epoch": 0.6391395854885313, + "grad_norm": 0.279585063457489, + "learning_rate": 1.7344776164748994e-05, + "loss": 0.1306, + "step": 35834 + }, + { + "epoch": 0.639157421610245, + "grad_norm": 0.22249330580234528, + "learning_rate": 1.7343294439891898e-05, + "loss": 0.1124, + "step": 35835 + }, + { + "epoch": 0.6391752577319587, + "grad_norm": 0.2314291000366211, + "learning_rate": 1.7341812744714854e-05, + "loss": 0.0846, + "step": 35836 + }, + { + "epoch": 0.6391930938536725, + "grad_norm": 0.29955893754959106, + "learning_rate": 1.7340331079223576e-05, + "loss": 0.159, + "step": 35837 + }, + { + "epoch": 0.6392109299753862, + "grad_norm": 0.24937936663627625, + "learning_rate": 1.7338849443423826e-05, + "loss": 0.1335, + "step": 35838 + }, + { + "epoch": 0.6392287660970999, + "grad_norm": 0.2849934697151184, + "learning_rate": 1.7337367837321336e-05, + "loss": 0.1838, + "step": 35839 + }, + { + "epoch": 0.6392466022188136, + "grad_norm": 0.25680679082870483, + "learning_rate": 1.733588626092186e-05, + "loss": 0.1062, + "step": 35840 + }, + { + "epoch": 0.6392644383405273, + "grad_norm": 0.26158425211906433, + "learning_rate": 1.733440471423114e-05, + "loss": 0.098, + "step": 35841 + }, + { + "epoch": 0.639282274462241, + "grad_norm": 0.21850423514842987, + "learning_rate": 1.7332923197254913e-05, + "loss": 0.0955, + "step": 35842 + }, + { + "epoch": 0.6393001105839546, + "grad_norm": 0.2790837585926056, + "learning_rate": 1.7331441709998914e-05, + "loss": 0.1351, + "step": 35843 + }, + { + "epoch": 0.6393179467056683, + "grad_norm": 0.24826689064502716, + "learning_rate": 1.7329960252468903e-05, + "loss": 0.1728, + "step": 35844 + }, + { + "epoch": 0.639335782827382, + "grad_norm": 0.223775714635849, + "learning_rate": 1.732847882467062e-05, + "loss": 0.1092, + "step": 35845 + }, + { + "epoch": 0.6393536189490957, + "grad_norm": 0.21161296963691711, + "learning_rate": 1.7326997426609798e-05, + "loss": 0.0874, + "step": 35846 + }, + { + "epoch": 0.6393714550708094, + "grad_norm": 0.30553925037384033, + "learning_rate": 1.7325516058292187e-05, + "loss": 0.1292, + "step": 35847 + }, + { + "epoch": 0.6393892911925231, + "grad_norm": 0.28877514600753784, + "learning_rate": 1.7324034719723518e-05, + "loss": 0.1197, + "step": 35848 + }, + { + "epoch": 0.6394071273142368, + "grad_norm": 0.2557810842990875, + "learning_rate": 1.7322553410909548e-05, + "loss": 0.1078, + "step": 35849 + }, + { + "epoch": 0.6394249634359505, + "grad_norm": 0.2571440637111664, + "learning_rate": 1.732107213185601e-05, + "loss": 0.1314, + "step": 35850 + }, + { + "epoch": 0.6394427995576641, + "grad_norm": 0.2803826928138733, + "learning_rate": 1.7319590882568655e-05, + "loss": 0.1511, + "step": 35851 + }, + { + "epoch": 0.6394606356793778, + "grad_norm": 0.35693272948265076, + "learning_rate": 1.7318109663053207e-05, + "loss": 0.0879, + "step": 35852 + }, + { + "epoch": 0.6394784718010915, + "grad_norm": 0.2565140426158905, + "learning_rate": 1.7316628473315427e-05, + "loss": 0.1003, + "step": 35853 + }, + { + "epoch": 0.6394963079228053, + "grad_norm": 0.3079506754875183, + "learning_rate": 1.7315147313361053e-05, + "loss": 0.1525, + "step": 35854 + }, + { + "epoch": 0.639514144044519, + "grad_norm": 0.21569791436195374, + "learning_rate": 1.7313666183195822e-05, + "loss": 0.1102, + "step": 35855 + }, + { + "epoch": 0.6395319801662327, + "grad_norm": 0.2801554203033447, + "learning_rate": 1.731218508282546e-05, + "loss": 0.1321, + "step": 35856 + }, + { + "epoch": 0.6395498162879464, + "grad_norm": 0.20545944571495056, + "learning_rate": 1.7310704012255738e-05, + "loss": 0.1243, + "step": 35857 + }, + { + "epoch": 0.6395676524096601, + "grad_norm": 0.23091839253902435, + "learning_rate": 1.7309222971492378e-05, + "loss": 0.138, + "step": 35858 + }, + { + "epoch": 0.6395854885313738, + "grad_norm": 0.22200728952884674, + "learning_rate": 1.7307741960541128e-05, + "loss": 0.125, + "step": 35859 + }, + { + "epoch": 0.6396033246530874, + "grad_norm": 0.40865370631217957, + "learning_rate": 1.7306260979407732e-05, + "loss": 0.1897, + "step": 35860 + }, + { + "epoch": 0.6396211607748011, + "grad_norm": 0.22435827553272247, + "learning_rate": 1.7304780028097912e-05, + "loss": 0.1281, + "step": 35861 + }, + { + "epoch": 0.6396389968965148, + "grad_norm": 0.2696791887283325, + "learning_rate": 1.7303299106617434e-05, + "loss": 0.1322, + "step": 35862 + }, + { + "epoch": 0.6396568330182285, + "grad_norm": 0.2418757528066635, + "learning_rate": 1.7301818214972026e-05, + "loss": 0.0813, + "step": 35863 + }, + { + "epoch": 0.6396746691399422, + "grad_norm": 0.21032550930976868, + "learning_rate": 1.730033735316743e-05, + "loss": 0.1523, + "step": 35864 + }, + { + "epoch": 0.6396925052616559, + "grad_norm": 0.24804969131946564, + "learning_rate": 1.729885652120938e-05, + "loss": 0.18, + "step": 35865 + }, + { + "epoch": 0.6397103413833696, + "grad_norm": 0.2403768002986908, + "learning_rate": 1.729737571910362e-05, + "loss": 0.1308, + "step": 35866 + }, + { + "epoch": 0.6397281775050833, + "grad_norm": 0.2027972787618637, + "learning_rate": 1.72958949468559e-05, + "loss": 0.1368, + "step": 35867 + }, + { + "epoch": 0.639746013626797, + "grad_norm": 0.28435927629470825, + "learning_rate": 1.7294414204471954e-05, + "loss": 0.1185, + "step": 35868 + }, + { + "epoch": 0.6397638497485106, + "grad_norm": 0.29404228925704956, + "learning_rate": 1.7292933491957507e-05, + "loss": 0.1448, + "step": 35869 + }, + { + "epoch": 0.6397816858702243, + "grad_norm": 0.28542855381965637, + "learning_rate": 1.7291452809318325e-05, + "loss": 0.1225, + "step": 35870 + }, + { + "epoch": 0.6397995219919381, + "grad_norm": 0.22775453329086304, + "learning_rate": 1.7289972156560127e-05, + "loss": 0.1263, + "step": 35871 + }, + { + "epoch": 0.6398173581136518, + "grad_norm": 0.2884061336517334, + "learning_rate": 1.7288491533688668e-05, + "loss": 0.1744, + "step": 35872 + }, + { + "epoch": 0.6398351942353655, + "grad_norm": 0.8027663230895996, + "learning_rate": 1.7287010940709675e-05, + "loss": 0.1256, + "step": 35873 + }, + { + "epoch": 0.6398530303570792, + "grad_norm": 0.4754054844379425, + "learning_rate": 1.7285530377628885e-05, + "loss": 0.1185, + "step": 35874 + }, + { + "epoch": 0.6398708664787929, + "grad_norm": 0.18994231522083282, + "learning_rate": 1.7284049844452048e-05, + "loss": 0.1278, + "step": 35875 + }, + { + "epoch": 0.6398887026005066, + "grad_norm": 0.3504869043827057, + "learning_rate": 1.7282569341184903e-05, + "loss": 0.1462, + "step": 35876 + }, + { + "epoch": 0.6399065387222203, + "grad_norm": 0.30787503719329834, + "learning_rate": 1.728108886783318e-05, + "loss": 0.1897, + "step": 35877 + }, + { + "epoch": 0.6399243748439339, + "grad_norm": 0.24686484038829803, + "learning_rate": 1.727960842440262e-05, + "loss": 0.1098, + "step": 35878 + }, + { + "epoch": 0.6399422109656476, + "grad_norm": 0.2583046853542328, + "learning_rate": 1.727812801089897e-05, + "loss": 0.145, + "step": 35879 + }, + { + "epoch": 0.6399600470873613, + "grad_norm": 0.2936982810497284, + "learning_rate": 1.7276647627327964e-05, + "loss": 0.1537, + "step": 35880 + }, + { + "epoch": 0.639977883209075, + "grad_norm": 0.23829172551631927, + "learning_rate": 1.727516727369534e-05, + "loss": 0.0931, + "step": 35881 + }, + { + "epoch": 0.6399957193307887, + "grad_norm": 0.30978360772132874, + "learning_rate": 1.7273686950006823e-05, + "loss": 0.12, + "step": 35882 + }, + { + "epoch": 0.6400135554525024, + "grad_norm": 0.2778049111366272, + "learning_rate": 1.7272206656268175e-05, + "loss": 0.1404, + "step": 35883 + }, + { + "epoch": 0.6400313915742161, + "grad_norm": 0.2594253420829773, + "learning_rate": 1.7270726392485125e-05, + "loss": 0.1094, + "step": 35884 + }, + { + "epoch": 0.6400492276959298, + "grad_norm": 0.3210033178329468, + "learning_rate": 1.7269246158663406e-05, + "loss": 0.1031, + "step": 35885 + }, + { + "epoch": 0.6400670638176434, + "grad_norm": 0.31004631519317627, + "learning_rate": 1.726776595480876e-05, + "loss": 0.1422, + "step": 35886 + }, + { + "epoch": 0.6400848999393571, + "grad_norm": 0.2223798930644989, + "learning_rate": 1.7266285780926915e-05, + "loss": 0.1158, + "step": 35887 + }, + { + "epoch": 0.6401027360610709, + "grad_norm": 0.34921783208847046, + "learning_rate": 1.7264805637023628e-05, + "loss": 0.1322, + "step": 35888 + }, + { + "epoch": 0.6401205721827846, + "grad_norm": 0.25858503580093384, + "learning_rate": 1.7263325523104627e-05, + "loss": 0.1418, + "step": 35889 + }, + { + "epoch": 0.6401384083044983, + "grad_norm": 0.2730313241481781, + "learning_rate": 1.7261845439175644e-05, + "loss": 0.1002, + "step": 35890 + }, + { + "epoch": 0.640156244426212, + "grad_norm": 0.44230887293815613, + "learning_rate": 1.7260365385242415e-05, + "loss": 0.1291, + "step": 35891 + }, + { + "epoch": 0.6401740805479257, + "grad_norm": 0.2491108626127243, + "learning_rate": 1.7258885361310697e-05, + "loss": 0.1867, + "step": 35892 + }, + { + "epoch": 0.6401919166696394, + "grad_norm": 0.25240418314933777, + "learning_rate": 1.7257405367386198e-05, + "loss": 0.1376, + "step": 35893 + }, + { + "epoch": 0.6402097527913531, + "grad_norm": 0.2325664460659027, + "learning_rate": 1.725592540347468e-05, + "loss": 0.1374, + "step": 35894 + }, + { + "epoch": 0.6402275889130667, + "grad_norm": 0.3108990490436554, + "learning_rate": 1.725444546958186e-05, + "loss": 0.0994, + "step": 35895 + }, + { + "epoch": 0.6402454250347804, + "grad_norm": 0.23212184011936188, + "learning_rate": 1.7252965565713495e-05, + "loss": 0.1406, + "step": 35896 + }, + { + "epoch": 0.6402632611564941, + "grad_norm": 0.26845085620880127, + "learning_rate": 1.725148569187531e-05, + "loss": 0.1388, + "step": 35897 + }, + { + "epoch": 0.6402810972782078, + "grad_norm": 0.1796015053987503, + "learning_rate": 1.7250005848073042e-05, + "loss": 0.0844, + "step": 35898 + }, + { + "epoch": 0.6402989333999215, + "grad_norm": 0.3172871470451355, + "learning_rate": 1.7248526034312428e-05, + "loss": 0.133, + "step": 35899 + }, + { + "epoch": 0.6403167695216352, + "grad_norm": 0.2341238111257553, + "learning_rate": 1.7247046250599195e-05, + "loss": 0.1373, + "step": 35900 + }, + { + "epoch": 0.6403346056433489, + "grad_norm": 0.225291907787323, + "learning_rate": 1.72455664969391e-05, + "loss": 0.1368, + "step": 35901 + }, + { + "epoch": 0.6403524417650626, + "grad_norm": 0.3168637752532959, + "learning_rate": 1.7244086773337864e-05, + "loss": 0.1828, + "step": 35902 + }, + { + "epoch": 0.6403702778867763, + "grad_norm": 0.31857556104660034, + "learning_rate": 1.7242607079801233e-05, + "loss": 0.1322, + "step": 35903 + }, + { + "epoch": 0.6403881140084899, + "grad_norm": 0.30921950936317444, + "learning_rate": 1.724112741633492e-05, + "loss": 0.1479, + "step": 35904 + }, + { + "epoch": 0.6404059501302037, + "grad_norm": 0.30880025029182434, + "learning_rate": 1.723964778294469e-05, + "loss": 0.1164, + "step": 35905 + }, + { + "epoch": 0.6404237862519174, + "grad_norm": 0.2310551404953003, + "learning_rate": 1.723816817963626e-05, + "loss": 0.1296, + "step": 35906 + }, + { + "epoch": 0.6404416223736311, + "grad_norm": 0.3688663840293884, + "learning_rate": 1.7236688606415374e-05, + "loss": 0.1532, + "step": 35907 + }, + { + "epoch": 0.6404594584953448, + "grad_norm": 0.3194214701652527, + "learning_rate": 1.723520906328776e-05, + "loss": 0.1362, + "step": 35908 + }, + { + "epoch": 0.6404772946170585, + "grad_norm": 0.28247639536857605, + "learning_rate": 1.7233729550259162e-05, + "loss": 0.1258, + "step": 35909 + }, + { + "epoch": 0.6404951307387722, + "grad_norm": 0.32303890585899353, + "learning_rate": 1.7232250067335312e-05, + "loss": 0.1689, + "step": 35910 + }, + { + "epoch": 0.6405129668604859, + "grad_norm": 0.24534259736537933, + "learning_rate": 1.7230770614521945e-05, + "loss": 0.099, + "step": 35911 + }, + { + "epoch": 0.6405308029821996, + "grad_norm": 0.21690906584262848, + "learning_rate": 1.7229291191824787e-05, + "loss": 0.0687, + "step": 35912 + }, + { + "epoch": 0.6405486391039132, + "grad_norm": 0.25254303216934204, + "learning_rate": 1.7227811799249584e-05, + "loss": 0.1292, + "step": 35913 + }, + { + "epoch": 0.6405664752256269, + "grad_norm": 0.3543197512626648, + "learning_rate": 1.7226332436802062e-05, + "loss": 0.1527, + "step": 35914 + }, + { + "epoch": 0.6405843113473406, + "grad_norm": 0.2187603861093521, + "learning_rate": 1.722485310448797e-05, + "loss": 0.1019, + "step": 35915 + }, + { + "epoch": 0.6406021474690543, + "grad_norm": 0.20415639877319336, + "learning_rate": 1.722337380231303e-05, + "loss": 0.1366, + "step": 35916 + }, + { + "epoch": 0.640619983590768, + "grad_norm": 0.24557319283485413, + "learning_rate": 1.7221894530282973e-05, + "loss": 0.1378, + "step": 35917 + }, + { + "epoch": 0.6406378197124817, + "grad_norm": 0.30781519412994385, + "learning_rate": 1.7220415288403544e-05, + "loss": 0.1122, + "step": 35918 + }, + { + "epoch": 0.6406556558341954, + "grad_norm": 0.19187171757221222, + "learning_rate": 1.7218936076680474e-05, + "loss": 0.0965, + "step": 35919 + }, + { + "epoch": 0.6406734919559091, + "grad_norm": 0.297652006149292, + "learning_rate": 1.7217456895119494e-05, + "loss": 0.152, + "step": 35920 + }, + { + "epoch": 0.6406913280776229, + "grad_norm": 0.26912230253219604, + "learning_rate": 1.7215977743726332e-05, + "loss": 0.1321, + "step": 35921 + }, + { + "epoch": 0.6407091641993365, + "grad_norm": 0.23096053302288055, + "learning_rate": 1.7214498622506736e-05, + "loss": 0.1214, + "step": 35922 + }, + { + "epoch": 0.6407270003210502, + "grad_norm": 0.27023589611053467, + "learning_rate": 1.7213019531466433e-05, + "loss": 0.1138, + "step": 35923 + }, + { + "epoch": 0.6407448364427639, + "grad_norm": 0.2834970951080322, + "learning_rate": 1.721154047061116e-05, + "loss": 0.1096, + "step": 35924 + }, + { + "epoch": 0.6407626725644776, + "grad_norm": 0.20260052382946014, + "learning_rate": 1.721006143994664e-05, + "loss": 0.1463, + "step": 35925 + }, + { + "epoch": 0.6407805086861913, + "grad_norm": 0.3352355360984802, + "learning_rate": 1.7208582439478603e-05, + "loss": 0.0754, + "step": 35926 + }, + { + "epoch": 0.640798344807905, + "grad_norm": 0.32622987031936646, + "learning_rate": 1.7207103469212805e-05, + "loss": 0.0635, + "step": 35927 + }, + { + "epoch": 0.6408161809296187, + "grad_norm": 0.30420923233032227, + "learning_rate": 1.7205624529154963e-05, + "loss": 0.0951, + "step": 35928 + }, + { + "epoch": 0.6408340170513324, + "grad_norm": 0.2683747112751007, + "learning_rate": 1.7204145619310813e-05, + "loss": 0.1618, + "step": 35929 + }, + { + "epoch": 0.640851853173046, + "grad_norm": 0.2540469467639923, + "learning_rate": 1.720266673968608e-05, + "loss": 0.1564, + "step": 35930 + }, + { + "epoch": 0.6408696892947597, + "grad_norm": 0.2046431452035904, + "learning_rate": 1.720118789028651e-05, + "loss": 0.1448, + "step": 35931 + }, + { + "epoch": 0.6408875254164734, + "grad_norm": 0.32680729031562805, + "learning_rate": 1.719970907111783e-05, + "loss": 0.1689, + "step": 35932 + }, + { + "epoch": 0.6409053615381871, + "grad_norm": 0.21240676939487457, + "learning_rate": 1.719823028218577e-05, + "loss": 0.1214, + "step": 35933 + }, + { + "epoch": 0.6409231976599008, + "grad_norm": 0.2598065435886383, + "learning_rate": 1.7196751523496062e-05, + "loss": 0.1097, + "step": 35934 + }, + { + "epoch": 0.6409410337816145, + "grad_norm": 0.23117214441299438, + "learning_rate": 1.719527279505444e-05, + "loss": 0.1105, + "step": 35935 + }, + { + "epoch": 0.6409588699033282, + "grad_norm": 0.24915702641010284, + "learning_rate": 1.7193794096866645e-05, + "loss": 0.1353, + "step": 35936 + }, + { + "epoch": 0.6409767060250419, + "grad_norm": 0.19787028431892395, + "learning_rate": 1.7192315428938395e-05, + "loss": 0.1213, + "step": 35937 + }, + { + "epoch": 0.6409945421467557, + "grad_norm": 0.26631495356559753, + "learning_rate": 1.719083679127543e-05, + "loss": 0.1095, + "step": 35938 + }, + { + "epoch": 0.6410123782684694, + "grad_norm": 0.2686077952384949, + "learning_rate": 1.718935818388347e-05, + "loss": 0.1318, + "step": 35939 + }, + { + "epoch": 0.641030214390183, + "grad_norm": 0.2649911642074585, + "learning_rate": 1.718787960676826e-05, + "loss": 0.1292, + "step": 35940 + }, + { + "epoch": 0.6410480505118967, + "grad_norm": 0.2614991366863251, + "learning_rate": 1.7186401059935524e-05, + "loss": 0.2089, + "step": 35941 + }, + { + "epoch": 0.6410658866336104, + "grad_norm": 0.2648104429244995, + "learning_rate": 1.7184922543391005e-05, + "loss": 0.1365, + "step": 35942 + }, + { + "epoch": 0.6410837227553241, + "grad_norm": 0.3426940441131592, + "learning_rate": 1.718344405714041e-05, + "loss": 0.1253, + "step": 35943 + }, + { + "epoch": 0.6411015588770378, + "grad_norm": 0.30241209268569946, + "learning_rate": 1.71819656011895e-05, + "loss": 0.1787, + "step": 35944 + }, + { + "epoch": 0.6411193949987515, + "grad_norm": 0.32440218329429626, + "learning_rate": 1.718048717554399e-05, + "loss": 0.1227, + "step": 35945 + }, + { + "epoch": 0.6411372311204652, + "grad_norm": 0.2700096070766449, + "learning_rate": 1.7179008780209615e-05, + "loss": 0.1234, + "step": 35946 + }, + { + "epoch": 0.6411550672421789, + "grad_norm": 0.2275232970714569, + "learning_rate": 1.717753041519209e-05, + "loss": 0.1144, + "step": 35947 + }, + { + "epoch": 0.6411729033638925, + "grad_norm": 0.19442588090896606, + "learning_rate": 1.7176052080497165e-05, + "loss": 0.1141, + "step": 35948 + }, + { + "epoch": 0.6411907394856062, + "grad_norm": 0.19584928452968597, + "learning_rate": 1.717457377613057e-05, + "loss": 0.108, + "step": 35949 + }, + { + "epoch": 0.6412085756073199, + "grad_norm": 0.32251083850860596, + "learning_rate": 1.717309550209803e-05, + "loss": 0.0717, + "step": 35950 + }, + { + "epoch": 0.6412264117290336, + "grad_norm": 0.25962111353874207, + "learning_rate": 1.7171617258405273e-05, + "loss": 0.1259, + "step": 35951 + }, + { + "epoch": 0.6412442478507473, + "grad_norm": 0.31683894991874695, + "learning_rate": 1.717013904505802e-05, + "loss": 0.1283, + "step": 35952 + }, + { + "epoch": 0.641262083972461, + "grad_norm": 0.34520360827445984, + "learning_rate": 1.7168660862062027e-05, + "loss": 0.1306, + "step": 35953 + }, + { + "epoch": 0.6412799200941747, + "grad_norm": 0.20821036398410797, + "learning_rate": 1.7167182709423004e-05, + "loss": 0.1357, + "step": 35954 + }, + { + "epoch": 0.6412977562158885, + "grad_norm": 0.30845940113067627, + "learning_rate": 1.716570458714669e-05, + "loss": 0.0505, + "step": 35955 + }, + { + "epoch": 0.6413155923376022, + "grad_norm": 0.323530375957489, + "learning_rate": 1.71642264952388e-05, + "loss": 0.1336, + "step": 35956 + }, + { + "epoch": 0.6413334284593158, + "grad_norm": 0.23813045024871826, + "learning_rate": 1.7162748433705083e-05, + "loss": 0.0763, + "step": 35957 + }, + { + "epoch": 0.6413512645810295, + "grad_norm": 0.2854834198951721, + "learning_rate": 1.7161270402551262e-05, + "loss": 0.1141, + "step": 35958 + }, + { + "epoch": 0.6413691007027432, + "grad_norm": 0.2865433394908905, + "learning_rate": 1.7159792401783064e-05, + "loss": 0.1144, + "step": 35959 + }, + { + "epoch": 0.6413869368244569, + "grad_norm": 0.2748376727104187, + "learning_rate": 1.715831443140621e-05, + "loss": 0.082, + "step": 35960 + }, + { + "epoch": 0.6414047729461706, + "grad_norm": 0.2565107047557831, + "learning_rate": 1.715683649142645e-05, + "loss": 0.1583, + "step": 35961 + }, + { + "epoch": 0.6414226090678843, + "grad_norm": 0.21768087148666382, + "learning_rate": 1.715535858184949e-05, + "loss": 0.1128, + "step": 35962 + }, + { + "epoch": 0.641440445189598, + "grad_norm": 0.25623881816864014, + "learning_rate": 1.7153880702681074e-05, + "loss": 0.1445, + "step": 35963 + }, + { + "epoch": 0.6414582813113117, + "grad_norm": 0.22707068920135498, + "learning_rate": 1.715240285392693e-05, + "loss": 0.0627, + "step": 35964 + }, + { + "epoch": 0.6414761174330254, + "grad_norm": 0.26484373211860657, + "learning_rate": 1.7150925035592776e-05, + "loss": 0.0878, + "step": 35965 + }, + { + "epoch": 0.641493953554739, + "grad_norm": 0.2964410185813904, + "learning_rate": 1.714944724768435e-05, + "loss": 0.1623, + "step": 35966 + }, + { + "epoch": 0.6415117896764527, + "grad_norm": 0.2748417258262634, + "learning_rate": 1.7147969490207383e-05, + "loss": 0.1453, + "step": 35967 + }, + { + "epoch": 0.6415296257981664, + "grad_norm": 0.21738409996032715, + "learning_rate": 1.714649176316759e-05, + "loss": 0.0973, + "step": 35968 + }, + { + "epoch": 0.6415474619198801, + "grad_norm": 0.3531220257282257, + "learning_rate": 1.714501406657071e-05, + "loss": 0.0611, + "step": 35969 + }, + { + "epoch": 0.6415652980415938, + "grad_norm": 0.277566134929657, + "learning_rate": 1.7143536400422467e-05, + "loss": 0.1347, + "step": 35970 + }, + { + "epoch": 0.6415831341633075, + "grad_norm": 0.20635554194450378, + "learning_rate": 1.7142058764728597e-05, + "loss": 0.0615, + "step": 35971 + }, + { + "epoch": 0.6416009702850213, + "grad_norm": 0.31696733832359314, + "learning_rate": 1.7140581159494825e-05, + "loss": 0.1406, + "step": 35972 + }, + { + "epoch": 0.641618806406735, + "grad_norm": 0.35350048542022705, + "learning_rate": 1.7139103584726855e-05, + "loss": 0.1526, + "step": 35973 + }, + { + "epoch": 0.6416366425284487, + "grad_norm": 0.24697697162628174, + "learning_rate": 1.7137626040430452e-05, + "loss": 0.1129, + "step": 35974 + }, + { + "epoch": 0.6416544786501623, + "grad_norm": 0.21649472415447235, + "learning_rate": 1.713614852661132e-05, + "loss": 0.1166, + "step": 35975 + }, + { + "epoch": 0.641672314771876, + "grad_norm": 0.23622985184192657, + "learning_rate": 1.7134671043275196e-05, + "loss": 0.0932, + "step": 35976 + }, + { + "epoch": 0.6416901508935897, + "grad_norm": 0.3605780303478241, + "learning_rate": 1.7133193590427804e-05, + "loss": 0.1286, + "step": 35977 + }, + { + "epoch": 0.6417079870153034, + "grad_norm": 0.3458549380302429, + "learning_rate": 1.713171616807486e-05, + "loss": 0.1667, + "step": 35978 + }, + { + "epoch": 0.6417258231370171, + "grad_norm": 0.24873648583889008, + "learning_rate": 1.7130238776222112e-05, + "loss": 0.14, + "step": 35979 + }, + { + "epoch": 0.6417436592587308, + "grad_norm": 0.26050683856010437, + "learning_rate": 1.712876141487528e-05, + "loss": 0.1237, + "step": 35980 + }, + { + "epoch": 0.6417614953804445, + "grad_norm": 0.22043856978416443, + "learning_rate": 1.712728408404008e-05, + "loss": 0.1453, + "step": 35981 + }, + { + "epoch": 0.6417793315021582, + "grad_norm": 0.24657335877418518, + "learning_rate": 1.7125806783722243e-05, + "loss": 0.1363, + "step": 35982 + }, + { + "epoch": 0.6417971676238718, + "grad_norm": 0.447663813829422, + "learning_rate": 1.7124329513927504e-05, + "loss": 0.1672, + "step": 35983 + }, + { + "epoch": 0.6418150037455855, + "grad_norm": 0.24941036105155945, + "learning_rate": 1.7122852274661584e-05, + "loss": 0.1213, + "step": 35984 + }, + { + "epoch": 0.6418328398672992, + "grad_norm": 0.2186688333749771, + "learning_rate": 1.7121375065930212e-05, + "loss": 0.0804, + "step": 35985 + }, + { + "epoch": 0.6418506759890129, + "grad_norm": 0.2392829954624176, + "learning_rate": 1.71198978877391e-05, + "loss": 0.1258, + "step": 35986 + }, + { + "epoch": 0.6418685121107266, + "grad_norm": 0.26403605937957764, + "learning_rate": 1.7118420740093998e-05, + "loss": 0.1426, + "step": 35987 + }, + { + "epoch": 0.6418863482324403, + "grad_norm": 0.3009972870349884, + "learning_rate": 1.711694362300062e-05, + "loss": 0.1327, + "step": 35988 + }, + { + "epoch": 0.6419041843541541, + "grad_norm": 0.23631148040294647, + "learning_rate": 1.7115466536464684e-05, + "loss": 0.1211, + "step": 35989 + }, + { + "epoch": 0.6419220204758678, + "grad_norm": 0.3221214711666107, + "learning_rate": 1.7113989480491927e-05, + "loss": 0.1422, + "step": 35990 + }, + { + "epoch": 0.6419398565975815, + "grad_norm": 0.26724332571029663, + "learning_rate": 1.711251245508806e-05, + "loss": 0.0988, + "step": 35991 + }, + { + "epoch": 0.6419576927192951, + "grad_norm": 0.29655665159225464, + "learning_rate": 1.711103546025883e-05, + "loss": 0.1107, + "step": 35992 + }, + { + "epoch": 0.6419755288410088, + "grad_norm": 0.2704985439777374, + "learning_rate": 1.7109558496009952e-05, + "loss": 0.14, + "step": 35993 + }, + { + "epoch": 0.6419933649627225, + "grad_norm": 0.32120829820632935, + "learning_rate": 1.7108081562347153e-05, + "loss": 0.1737, + "step": 35994 + }, + { + "epoch": 0.6420112010844362, + "grad_norm": 0.3925207555294037, + "learning_rate": 1.7106604659276142e-05, + "loss": 0.1918, + "step": 35995 + }, + { + "epoch": 0.6420290372061499, + "grad_norm": 0.31661248207092285, + "learning_rate": 1.7105127786802668e-05, + "loss": 0.1601, + "step": 35996 + }, + { + "epoch": 0.6420468733278636, + "grad_norm": 0.391795814037323, + "learning_rate": 1.7103650944932443e-05, + "loss": 0.0995, + "step": 35997 + }, + { + "epoch": 0.6420647094495773, + "grad_norm": 0.28095105290412903, + "learning_rate": 1.7102174133671197e-05, + "loss": 0.1039, + "step": 35998 + }, + { + "epoch": 0.642082545571291, + "grad_norm": 0.26800575852394104, + "learning_rate": 1.7100697353024642e-05, + "loss": 0.1336, + "step": 35999 + }, + { + "epoch": 0.6421003816930047, + "grad_norm": 0.2607799172401428, + "learning_rate": 1.7099220602998522e-05, + "loss": 0.132, + "step": 36000 + }, + { + "epoch": 0.6421003816930047, + "eval_loss": 0.12125831842422485, + "eval_runtime": 106.6956, + "eval_samples_per_second": 9.597, + "eval_steps_per_second": 1.603, + "step": 36000 + }, + { + "epoch": 0.6421182178147183, + "grad_norm": 0.2816165089607239, + "learning_rate": 1.7097743883598555e-05, + "loss": 0.1578, + "step": 36001 + }, + { + "epoch": 0.642136053936432, + "grad_norm": 0.22245261073112488, + "learning_rate": 1.7096267194830457e-05, + "loss": 0.118, + "step": 36002 + }, + { + "epoch": 0.6421538900581457, + "grad_norm": 0.307090163230896, + "learning_rate": 1.7094790536699957e-05, + "loss": 0.1103, + "step": 36003 + }, + { + "epoch": 0.6421717261798594, + "grad_norm": 0.2222188264131546, + "learning_rate": 1.7093313909212772e-05, + "loss": 0.0873, + "step": 36004 + }, + { + "epoch": 0.6421895623015731, + "grad_norm": 0.27537801861763, + "learning_rate": 1.7091837312374644e-05, + "loss": 0.1772, + "step": 36005 + }, + { + "epoch": 0.6422073984232869, + "grad_norm": 0.32166436314582825, + "learning_rate": 1.7090360746191285e-05, + "loss": 0.0986, + "step": 36006 + }, + { + "epoch": 0.6422252345450006, + "grad_norm": 0.20470120012760162, + "learning_rate": 1.708888421066842e-05, + "loss": 0.0992, + "step": 36007 + }, + { + "epoch": 0.6422430706667143, + "grad_norm": 0.20956771075725555, + "learning_rate": 1.708740770581176e-05, + "loss": 0.096, + "step": 36008 + }, + { + "epoch": 0.642260906788428, + "grad_norm": 0.2966596782207489, + "learning_rate": 1.7085931231627055e-05, + "loss": 0.1703, + "step": 36009 + }, + { + "epoch": 0.6422787429101416, + "grad_norm": 0.3364904224872589, + "learning_rate": 1.7084454788120006e-05, + "loss": 0.1068, + "step": 36010 + }, + { + "epoch": 0.6422965790318553, + "grad_norm": 0.23444807529449463, + "learning_rate": 1.708297837529635e-05, + "loss": 0.0957, + "step": 36011 + }, + { + "epoch": 0.642314415153569, + "grad_norm": 0.3281082808971405, + "learning_rate": 1.7081501993161792e-05, + "loss": 0.1165, + "step": 36012 + }, + { + "epoch": 0.6423322512752827, + "grad_norm": 0.20914587378501892, + "learning_rate": 1.7080025641722082e-05, + "loss": 0.1079, + "step": 36013 + }, + { + "epoch": 0.6423500873969964, + "grad_norm": 0.27111852169036865, + "learning_rate": 1.7078549320982922e-05, + "loss": 0.1245, + "step": 36014 + }, + { + "epoch": 0.6423679235187101, + "grad_norm": 0.35877928137779236, + "learning_rate": 1.7077073030950048e-05, + "loss": 0.0839, + "step": 36015 + }, + { + "epoch": 0.6423857596404238, + "grad_norm": 0.22931590676307678, + "learning_rate": 1.7075596771629165e-05, + "loss": 0.1019, + "step": 36016 + }, + { + "epoch": 0.6424035957621375, + "grad_norm": 0.24991843104362488, + "learning_rate": 1.7074120543026007e-05, + "loss": 0.113, + "step": 36017 + }, + { + "epoch": 0.6424214318838511, + "grad_norm": 0.22310291230678558, + "learning_rate": 1.7072644345146295e-05, + "loss": 0.1298, + "step": 36018 + }, + { + "epoch": 0.6424392680055648, + "grad_norm": 0.286344975233078, + "learning_rate": 1.7071168177995757e-05, + "loss": 0.1552, + "step": 36019 + }, + { + "epoch": 0.6424571041272785, + "grad_norm": 0.29034942388534546, + "learning_rate": 1.7069692041580114e-05, + "loss": 0.1067, + "step": 36020 + }, + { + "epoch": 0.6424749402489922, + "grad_norm": 0.23364169895648956, + "learning_rate": 1.706821593590507e-05, + "loss": 0.1106, + "step": 36021 + }, + { + "epoch": 0.642492776370706, + "grad_norm": 0.27880609035491943, + "learning_rate": 1.7066739860976367e-05, + "loss": 0.1342, + "step": 36022 + }, + { + "epoch": 0.6425106124924197, + "grad_norm": 0.2684519290924072, + "learning_rate": 1.7065263816799725e-05, + "loss": 0.0925, + "step": 36023 + }, + { + "epoch": 0.6425284486141334, + "grad_norm": 0.2521001398563385, + "learning_rate": 1.7063787803380856e-05, + "loss": 0.172, + "step": 36024 + }, + { + "epoch": 0.6425462847358471, + "grad_norm": 0.27383750677108765, + "learning_rate": 1.706231182072548e-05, + "loss": 0.1846, + "step": 36025 + }, + { + "epoch": 0.6425641208575608, + "grad_norm": 0.3476960062980652, + "learning_rate": 1.7060835868839333e-05, + "loss": 0.2084, + "step": 36026 + }, + { + "epoch": 0.6425819569792744, + "grad_norm": 0.2195775806903839, + "learning_rate": 1.7059359947728132e-05, + "loss": 0.128, + "step": 36027 + }, + { + "epoch": 0.6425997931009881, + "grad_norm": 0.2555517256259918, + "learning_rate": 1.7057884057397593e-05, + "loss": 0.1401, + "step": 36028 + }, + { + "epoch": 0.6426176292227018, + "grad_norm": 0.26960569620132446, + "learning_rate": 1.705640819785344e-05, + "loss": 0.1411, + "step": 36029 + }, + { + "epoch": 0.6426354653444155, + "grad_norm": 0.2766706347465515, + "learning_rate": 1.705493236910138e-05, + "loss": 0.1105, + "step": 36030 + }, + { + "epoch": 0.6426533014661292, + "grad_norm": 0.33902889490127563, + "learning_rate": 1.7053456571147152e-05, + "loss": 0.0995, + "step": 36031 + }, + { + "epoch": 0.6426711375878429, + "grad_norm": 0.3518093228340149, + "learning_rate": 1.7051980803996474e-05, + "loss": 0.1596, + "step": 36032 + }, + { + "epoch": 0.6426889737095566, + "grad_norm": 0.3192064166069031, + "learning_rate": 1.705050506765507e-05, + "loss": 0.1208, + "step": 36033 + }, + { + "epoch": 0.6427068098312703, + "grad_norm": 0.23662027716636658, + "learning_rate": 1.7049029362128637e-05, + "loss": 0.1695, + "step": 36034 + }, + { + "epoch": 0.642724645952984, + "grad_norm": 0.20568744838237762, + "learning_rate": 1.7047553687422925e-05, + "loss": 0.0996, + "step": 36035 + }, + { + "epoch": 0.6427424820746976, + "grad_norm": 0.2770882248878479, + "learning_rate": 1.7046078043543642e-05, + "loss": 0.1899, + "step": 36036 + }, + { + "epoch": 0.6427603181964113, + "grad_norm": 0.2567894458770752, + "learning_rate": 1.7044602430496504e-05, + "loss": 0.1019, + "step": 36037 + }, + { + "epoch": 0.642778154318125, + "grad_norm": 0.2708216905593872, + "learning_rate": 1.7043126848287234e-05, + "loss": 0.1059, + "step": 36038 + }, + { + "epoch": 0.6427959904398388, + "grad_norm": 0.3747189939022064, + "learning_rate": 1.704165129692155e-05, + "loss": 0.126, + "step": 36039 + }, + { + "epoch": 0.6428138265615525, + "grad_norm": 0.4661749303340912, + "learning_rate": 1.704017577640518e-05, + "loss": 0.1293, + "step": 36040 + }, + { + "epoch": 0.6428316626832662, + "grad_norm": 0.2464616745710373, + "learning_rate": 1.703870028674384e-05, + "loss": 0.1106, + "step": 36041 + }, + { + "epoch": 0.6428494988049799, + "grad_norm": 0.29713109135627747, + "learning_rate": 1.7037224827943246e-05, + "loss": 0.1056, + "step": 36042 + }, + { + "epoch": 0.6428673349266936, + "grad_norm": 0.3102748990058899, + "learning_rate": 1.703574940000911e-05, + "loss": 0.1749, + "step": 36043 + }, + { + "epoch": 0.6428851710484073, + "grad_norm": 0.22702239453792572, + "learning_rate": 1.703427400294717e-05, + "loss": 0.0937, + "step": 36044 + }, + { + "epoch": 0.6429030071701209, + "grad_norm": 0.2195417582988739, + "learning_rate": 1.703279863676313e-05, + "loss": 0.1045, + "step": 36045 + }, + { + "epoch": 0.6429208432918346, + "grad_norm": 0.28234267234802246, + "learning_rate": 1.7031323301462716e-05, + "loss": 0.1081, + "step": 36046 + }, + { + "epoch": 0.6429386794135483, + "grad_norm": 0.39787518978118896, + "learning_rate": 1.702984799705164e-05, + "loss": 0.1442, + "step": 36047 + }, + { + "epoch": 0.642956515535262, + "grad_norm": 0.2355199158191681, + "learning_rate": 1.7028372723535633e-05, + "loss": 0.1391, + "step": 36048 + }, + { + "epoch": 0.6429743516569757, + "grad_norm": 0.23079955577850342, + "learning_rate": 1.7026897480920408e-05, + "loss": 0.1013, + "step": 36049 + }, + { + "epoch": 0.6429921877786894, + "grad_norm": 0.24834102392196655, + "learning_rate": 1.7025422269211684e-05, + "loss": 0.1214, + "step": 36050 + }, + { + "epoch": 0.6430100239004031, + "grad_norm": 0.27061596512794495, + "learning_rate": 1.7023947088415163e-05, + "loss": 0.1409, + "step": 36051 + }, + { + "epoch": 0.6430278600221168, + "grad_norm": 0.22637474536895752, + "learning_rate": 1.7022471938536587e-05, + "loss": 0.1328, + "step": 36052 + }, + { + "epoch": 0.6430456961438304, + "grad_norm": 0.5987288951873779, + "learning_rate": 1.7020996819581664e-05, + "loss": 0.1467, + "step": 36053 + }, + { + "epoch": 0.6430635322655441, + "grad_norm": 0.2617497742176056, + "learning_rate": 1.701952173155612e-05, + "loss": 0.1279, + "step": 36054 + }, + { + "epoch": 0.6430813683872578, + "grad_norm": 0.23993206024169922, + "learning_rate": 1.7018046674465666e-05, + "loss": 0.1253, + "step": 36055 + }, + { + "epoch": 0.6430992045089716, + "grad_norm": 0.21473948657512665, + "learning_rate": 1.7016571648316005e-05, + "loss": 0.0846, + "step": 36056 + }, + { + "epoch": 0.6431170406306853, + "grad_norm": 0.19074192643165588, + "learning_rate": 1.7015096653112884e-05, + "loss": 0.0971, + "step": 36057 + }, + { + "epoch": 0.643134876752399, + "grad_norm": 0.27036252617836, + "learning_rate": 1.7013621688861996e-05, + "loss": 0.1113, + "step": 36058 + }, + { + "epoch": 0.6431527128741127, + "grad_norm": 0.30164045095443726, + "learning_rate": 1.701214675556908e-05, + "loss": 0.0903, + "step": 36059 + }, + { + "epoch": 0.6431705489958264, + "grad_norm": 0.2766153812408447, + "learning_rate": 1.701067185323983e-05, + "loss": 0.1285, + "step": 36060 + }, + { + "epoch": 0.6431883851175401, + "grad_norm": 0.17690503597259521, + "learning_rate": 1.700919698187998e-05, + "loss": 0.0917, + "step": 36061 + }, + { + "epoch": 0.6432062212392538, + "grad_norm": 0.23135077953338623, + "learning_rate": 1.700772214149525e-05, + "loss": 0.1242, + "step": 36062 + }, + { + "epoch": 0.6432240573609674, + "grad_norm": 0.26133644580841064, + "learning_rate": 1.7006247332091348e-05, + "loss": 0.1209, + "step": 36063 + }, + { + "epoch": 0.6432418934826811, + "grad_norm": 0.2696077823638916, + "learning_rate": 1.7004772553673983e-05, + "loss": 0.0748, + "step": 36064 + }, + { + "epoch": 0.6432597296043948, + "grad_norm": 0.25742682814598083, + "learning_rate": 1.7003297806248886e-05, + "loss": 0.1012, + "step": 36065 + }, + { + "epoch": 0.6432775657261085, + "grad_norm": 0.2589276134967804, + "learning_rate": 1.7001823089821766e-05, + "loss": 0.1216, + "step": 36066 + }, + { + "epoch": 0.6432954018478222, + "grad_norm": 0.25723347067832947, + "learning_rate": 1.700034840439835e-05, + "loss": 0.0764, + "step": 36067 + }, + { + "epoch": 0.6433132379695359, + "grad_norm": 0.22905592620372772, + "learning_rate": 1.6998873749984344e-05, + "loss": 0.1285, + "step": 36068 + }, + { + "epoch": 0.6433310740912496, + "grad_norm": 0.2404824048280716, + "learning_rate": 1.6997399126585457e-05, + "loss": 0.1047, + "step": 36069 + }, + { + "epoch": 0.6433489102129633, + "grad_norm": 0.24083207547664642, + "learning_rate": 1.6995924534207424e-05, + "loss": 0.0988, + "step": 36070 + }, + { + "epoch": 0.6433667463346769, + "grad_norm": 0.2020757645368576, + "learning_rate": 1.6994449972855953e-05, + "loss": 0.0889, + "step": 36071 + }, + { + "epoch": 0.6433845824563906, + "grad_norm": 0.3013845980167389, + "learning_rate": 1.6992975442536757e-05, + "loss": 0.133, + "step": 36072 + }, + { + "epoch": 0.6434024185781044, + "grad_norm": 0.2327493280172348, + "learning_rate": 1.699150094325555e-05, + "loss": 0.1048, + "step": 36073 + }, + { + "epoch": 0.6434202546998181, + "grad_norm": 0.24093088507652283, + "learning_rate": 1.699002647501805e-05, + "loss": 0.1029, + "step": 36074 + }, + { + "epoch": 0.6434380908215318, + "grad_norm": 0.28378912806510925, + "learning_rate": 1.6988552037829983e-05, + "loss": 0.1156, + "step": 36075 + }, + { + "epoch": 0.6434559269432455, + "grad_norm": 0.2900311052799225, + "learning_rate": 1.6987077631697056e-05, + "loss": 0.1336, + "step": 36076 + }, + { + "epoch": 0.6434737630649592, + "grad_norm": 0.2954043447971344, + "learning_rate": 1.698560325662497e-05, + "loss": 0.1185, + "step": 36077 + }, + { + "epoch": 0.6434915991866729, + "grad_norm": 0.24001437425613403, + "learning_rate": 1.6984128912619463e-05, + "loss": 0.1452, + "step": 36078 + }, + { + "epoch": 0.6435094353083866, + "grad_norm": 0.30434873700141907, + "learning_rate": 1.6982654599686242e-05, + "loss": 0.1331, + "step": 36079 + }, + { + "epoch": 0.6435272714301002, + "grad_norm": 0.3666851222515106, + "learning_rate": 1.698118031783102e-05, + "loss": 0.1589, + "step": 36080 + }, + { + "epoch": 0.6435451075518139, + "grad_norm": 0.20924252271652222, + "learning_rate": 1.6979706067059513e-05, + "loss": 0.0816, + "step": 36081 + }, + { + "epoch": 0.6435629436735276, + "grad_norm": 0.2992520034313202, + "learning_rate": 1.697823184737743e-05, + "loss": 0.1173, + "step": 36082 + }, + { + "epoch": 0.6435807797952413, + "grad_norm": 0.24502576887607574, + "learning_rate": 1.6976757658790495e-05, + "loss": 0.1061, + "step": 36083 + }, + { + "epoch": 0.643598615916955, + "grad_norm": 0.38678261637687683, + "learning_rate": 1.6975283501304422e-05, + "loss": 0.1387, + "step": 36084 + }, + { + "epoch": 0.6436164520386687, + "grad_norm": 0.24350497126579285, + "learning_rate": 1.6973809374924915e-05, + "loss": 0.1791, + "step": 36085 + }, + { + "epoch": 0.6436342881603824, + "grad_norm": 0.22026783227920532, + "learning_rate": 1.6972335279657698e-05, + "loss": 0.1187, + "step": 36086 + }, + { + "epoch": 0.6436521242820961, + "grad_norm": 0.3392118811607361, + "learning_rate": 1.6970861215508482e-05, + "loss": 0.1551, + "step": 36087 + }, + { + "epoch": 0.6436699604038097, + "grad_norm": 0.2958141267299652, + "learning_rate": 1.696938718248298e-05, + "loss": 0.1385, + "step": 36088 + }, + { + "epoch": 0.6436877965255234, + "grad_norm": 0.24164767563343048, + "learning_rate": 1.6967913180586916e-05, + "loss": 0.1198, + "step": 36089 + }, + { + "epoch": 0.6437056326472372, + "grad_norm": 0.22696180641651154, + "learning_rate": 1.696643920982598e-05, + "loss": 0.0825, + "step": 36090 + }, + { + "epoch": 0.6437234687689509, + "grad_norm": 0.2741759419441223, + "learning_rate": 1.696496527020591e-05, + "loss": 0.1192, + "step": 36091 + }, + { + "epoch": 0.6437413048906646, + "grad_norm": 0.19998280704021454, + "learning_rate": 1.696349136173241e-05, + "loss": 0.1368, + "step": 36092 + }, + { + "epoch": 0.6437591410123783, + "grad_norm": 0.24313251674175262, + "learning_rate": 1.6962017484411188e-05, + "loss": 0.1281, + "step": 36093 + }, + { + "epoch": 0.643776977134092, + "grad_norm": 0.2841765284538269, + "learning_rate": 1.6960543638247964e-05, + "loss": 0.1327, + "step": 36094 + }, + { + "epoch": 0.6437948132558057, + "grad_norm": 0.2545332908630371, + "learning_rate": 1.6959069823248445e-05, + "loss": 0.1306, + "step": 36095 + }, + { + "epoch": 0.6438126493775194, + "grad_norm": 0.30480217933654785, + "learning_rate": 1.695759603941836e-05, + "loss": 0.1272, + "step": 36096 + }, + { + "epoch": 0.643830485499233, + "grad_norm": 0.23049749433994293, + "learning_rate": 1.695612228676341e-05, + "loss": 0.0851, + "step": 36097 + }, + { + "epoch": 0.6438483216209467, + "grad_norm": 0.3085883557796478, + "learning_rate": 1.6954648565289304e-05, + "loss": 0.1292, + "step": 36098 + }, + { + "epoch": 0.6438661577426604, + "grad_norm": 0.23839989304542542, + "learning_rate": 1.6953174875001753e-05, + "loss": 0.1176, + "step": 36099 + }, + { + "epoch": 0.6438839938643741, + "grad_norm": 0.2832323908805847, + "learning_rate": 1.695170121590648e-05, + "loss": 0.1078, + "step": 36100 + }, + { + "epoch": 0.6439018299860878, + "grad_norm": 0.2851586639881134, + "learning_rate": 1.6950227588009194e-05, + "loss": 0.0741, + "step": 36101 + }, + { + "epoch": 0.6439196661078015, + "grad_norm": 0.2892163395881653, + "learning_rate": 1.694875399131561e-05, + "loss": 0.1528, + "step": 36102 + }, + { + "epoch": 0.6439375022295152, + "grad_norm": 0.2886578440666199, + "learning_rate": 1.6947280425831423e-05, + "loss": 0.1211, + "step": 36103 + }, + { + "epoch": 0.6439553383512289, + "grad_norm": 0.2814652919769287, + "learning_rate": 1.694580689156237e-05, + "loss": 0.0901, + "step": 36104 + }, + { + "epoch": 0.6439731744729426, + "grad_norm": 0.24924111366271973, + "learning_rate": 1.6944333388514148e-05, + "loss": 0.1286, + "step": 36105 + }, + { + "epoch": 0.6439910105946562, + "grad_norm": 0.3414084017276764, + "learning_rate": 1.6942859916692477e-05, + "loss": 0.0766, + "step": 36106 + }, + { + "epoch": 0.64400884671637, + "grad_norm": 0.23955604434013367, + "learning_rate": 1.6941386476103056e-05, + "loss": 0.1415, + "step": 36107 + }, + { + "epoch": 0.6440266828380837, + "grad_norm": 0.23224258422851562, + "learning_rate": 1.6939913066751606e-05, + "loss": 0.1281, + "step": 36108 + }, + { + "epoch": 0.6440445189597974, + "grad_norm": 0.2662828266620636, + "learning_rate": 1.693843968864384e-05, + "loss": 0.104, + "step": 36109 + }, + { + "epoch": 0.6440623550815111, + "grad_norm": 0.374972939491272, + "learning_rate": 1.693696634178547e-05, + "loss": 0.1304, + "step": 36110 + }, + { + "epoch": 0.6440801912032248, + "grad_norm": 0.42245882749557495, + "learning_rate": 1.6935493026182197e-05, + "loss": 0.1789, + "step": 36111 + }, + { + "epoch": 0.6440980273249385, + "grad_norm": 0.408315509557724, + "learning_rate": 1.6934019741839735e-05, + "loss": 0.1477, + "step": 36112 + }, + { + "epoch": 0.6441158634466522, + "grad_norm": 0.29706352949142456, + "learning_rate": 1.6932546488763808e-05, + "loss": 0.1218, + "step": 36113 + }, + { + "epoch": 0.6441336995683659, + "grad_norm": 0.3683321475982666, + "learning_rate": 1.6931073266960107e-05, + "loss": 0.1427, + "step": 36114 + }, + { + "epoch": 0.6441515356900795, + "grad_norm": 0.23992182314395905, + "learning_rate": 1.6929600076434364e-05, + "loss": 0.107, + "step": 36115 + }, + { + "epoch": 0.6441693718117932, + "grad_norm": 0.2670586407184601, + "learning_rate": 1.6928126917192262e-05, + "loss": 0.1607, + "step": 36116 + }, + { + "epoch": 0.6441872079335069, + "grad_norm": 0.3094174265861511, + "learning_rate": 1.6926653789239544e-05, + "loss": 0.1833, + "step": 36117 + }, + { + "epoch": 0.6442050440552206, + "grad_norm": 0.27157357335090637, + "learning_rate": 1.69251806925819e-05, + "loss": 0.1313, + "step": 36118 + }, + { + "epoch": 0.6442228801769343, + "grad_norm": 0.2662277817726135, + "learning_rate": 1.692370762722505e-05, + "loss": 0.1656, + "step": 36119 + }, + { + "epoch": 0.644240716298648, + "grad_norm": 0.24142740666866302, + "learning_rate": 1.692223459317468e-05, + "loss": 0.0742, + "step": 36120 + }, + { + "epoch": 0.6442585524203617, + "grad_norm": 0.23111611604690552, + "learning_rate": 1.6920761590436536e-05, + "loss": 0.0935, + "step": 36121 + }, + { + "epoch": 0.6442763885420754, + "grad_norm": 0.2675308287143707, + "learning_rate": 1.6919288619016306e-05, + "loss": 0.1086, + "step": 36122 + }, + { + "epoch": 0.644294224663789, + "grad_norm": 0.27754026651382446, + "learning_rate": 1.6917815678919706e-05, + "loss": 0.1297, + "step": 36123 + }, + { + "epoch": 0.6443120607855028, + "grad_norm": 0.25029289722442627, + "learning_rate": 1.6916342770152443e-05, + "loss": 0.1109, + "step": 36124 + }, + { + "epoch": 0.6443298969072165, + "grad_norm": 0.23299990594387054, + "learning_rate": 1.691486989272022e-05, + "loss": 0.1135, + "step": 36125 + }, + { + "epoch": 0.6443477330289302, + "grad_norm": 0.22696441411972046, + "learning_rate": 1.6913397046628765e-05, + "loss": 0.143, + "step": 36126 + }, + { + "epoch": 0.6443655691506439, + "grad_norm": 0.23908481001853943, + "learning_rate": 1.6911924231883776e-05, + "loss": 0.0963, + "step": 36127 + }, + { + "epoch": 0.6443834052723576, + "grad_norm": 0.43284371495246887, + "learning_rate": 1.691045144849095e-05, + "loss": 0.1693, + "step": 36128 + }, + { + "epoch": 0.6444012413940713, + "grad_norm": 0.2912270426750183, + "learning_rate": 1.6908978696456015e-05, + "loss": 0.1288, + "step": 36129 + }, + { + "epoch": 0.644419077515785, + "grad_norm": 0.42308008670806885, + "learning_rate": 1.690750597578467e-05, + "loss": 0.143, + "step": 36130 + }, + { + "epoch": 0.6444369136374987, + "grad_norm": 0.24147115647792816, + "learning_rate": 1.6906033286482637e-05, + "loss": 0.0954, + "step": 36131 + }, + { + "epoch": 0.6444547497592124, + "grad_norm": 0.27462226152420044, + "learning_rate": 1.690456062855561e-05, + "loss": 0.1192, + "step": 36132 + }, + { + "epoch": 0.644472585880926, + "grad_norm": 0.2730329632759094, + "learning_rate": 1.6903088002009292e-05, + "loss": 0.1269, + "step": 36133 + }, + { + "epoch": 0.6444904220026397, + "grad_norm": 0.29419299960136414, + "learning_rate": 1.6901615406849415e-05, + "loss": 0.1409, + "step": 36134 + }, + { + "epoch": 0.6445082581243534, + "grad_norm": 0.3213551640510559, + "learning_rate": 1.6900142843081667e-05, + "loss": 0.1047, + "step": 36135 + }, + { + "epoch": 0.6445260942460671, + "grad_norm": 0.24554812908172607, + "learning_rate": 1.6898670310711766e-05, + "loss": 0.135, + "step": 36136 + }, + { + "epoch": 0.6445439303677808, + "grad_norm": 0.32526692748069763, + "learning_rate": 1.689719780974542e-05, + "loss": 0.1031, + "step": 36137 + }, + { + "epoch": 0.6445617664894945, + "grad_norm": 0.23407816886901855, + "learning_rate": 1.6895725340188316e-05, + "loss": 0.1296, + "step": 36138 + }, + { + "epoch": 0.6445796026112082, + "grad_norm": 0.29700562357902527, + "learning_rate": 1.68942529020462e-05, + "loss": 0.1096, + "step": 36139 + }, + { + "epoch": 0.644597438732922, + "grad_norm": 0.3054434657096863, + "learning_rate": 1.6892780495324756e-05, + "loss": 0.0952, + "step": 36140 + }, + { + "epoch": 0.6446152748546357, + "grad_norm": 0.3107934594154358, + "learning_rate": 1.6891308120029685e-05, + "loss": 0.1653, + "step": 36141 + }, + { + "epoch": 0.6446331109763493, + "grad_norm": 0.21975281834602356, + "learning_rate": 1.6889835776166713e-05, + "loss": 0.1026, + "step": 36142 + }, + { + "epoch": 0.644650947098063, + "grad_norm": 0.22903771698474884, + "learning_rate": 1.6888363463741534e-05, + "loss": 0.0924, + "step": 36143 + }, + { + "epoch": 0.6446687832197767, + "grad_norm": 0.33508217334747314, + "learning_rate": 1.6886891182759865e-05, + "loss": 0.2171, + "step": 36144 + }, + { + "epoch": 0.6446866193414904, + "grad_norm": 0.3408792316913605, + "learning_rate": 1.6885418933227408e-05, + "loss": 0.1719, + "step": 36145 + }, + { + "epoch": 0.6447044554632041, + "grad_norm": 0.26822131872177124, + "learning_rate": 1.688394671514986e-05, + "loss": 0.0906, + "step": 36146 + }, + { + "epoch": 0.6447222915849178, + "grad_norm": 0.3770483732223511, + "learning_rate": 1.688247452853295e-05, + "loss": 0.1431, + "step": 36147 + }, + { + "epoch": 0.6447401277066315, + "grad_norm": 0.3141520023345947, + "learning_rate": 1.6881002373382367e-05, + "loss": 0.0999, + "step": 36148 + }, + { + "epoch": 0.6447579638283452, + "grad_norm": 0.201151043176651, + "learning_rate": 1.6879530249703824e-05, + "loss": 0.1254, + "step": 36149 + }, + { + "epoch": 0.6447757999500588, + "grad_norm": 0.24054493010044098, + "learning_rate": 1.6878058157503027e-05, + "loss": 0.1212, + "step": 36150 + }, + { + "epoch": 0.6447936360717725, + "grad_norm": 0.33309999108314514, + "learning_rate": 1.6876586096785673e-05, + "loss": 0.1762, + "step": 36151 + }, + { + "epoch": 0.6448114721934862, + "grad_norm": 0.33809059858322144, + "learning_rate": 1.6875114067557486e-05, + "loss": 0.1398, + "step": 36152 + }, + { + "epoch": 0.6448293083151999, + "grad_norm": 0.3542320728302002, + "learning_rate": 1.6873642069824167e-05, + "loss": 0.0625, + "step": 36153 + }, + { + "epoch": 0.6448471444369136, + "grad_norm": 0.25446248054504395, + "learning_rate": 1.6872170103591417e-05, + "loss": 0.1516, + "step": 36154 + }, + { + "epoch": 0.6448649805586273, + "grad_norm": 0.2221420258283615, + "learning_rate": 1.6870698168864928e-05, + "loss": 0.1365, + "step": 36155 + }, + { + "epoch": 0.644882816680341, + "grad_norm": 0.261918842792511, + "learning_rate": 1.6869226265650436e-05, + "loss": 0.1174, + "step": 36156 + }, + { + "epoch": 0.6449006528020548, + "grad_norm": 0.288920134305954, + "learning_rate": 1.6867754393953623e-05, + "loss": 0.1442, + "step": 36157 + }, + { + "epoch": 0.6449184889237685, + "grad_norm": 0.3040105998516083, + "learning_rate": 1.686628255378021e-05, + "loss": 0.1179, + "step": 36158 + }, + { + "epoch": 0.6449363250454822, + "grad_norm": 0.2427533119916916, + "learning_rate": 1.6864810745135885e-05, + "loss": 0.1212, + "step": 36159 + }, + { + "epoch": 0.6449541611671958, + "grad_norm": 0.257458359003067, + "learning_rate": 1.6863338968026375e-05, + "loss": 0.1302, + "step": 36160 + }, + { + "epoch": 0.6449719972889095, + "grad_norm": 0.2693089544773102, + "learning_rate": 1.686186722245737e-05, + "loss": 0.0975, + "step": 36161 + }, + { + "epoch": 0.6449898334106232, + "grad_norm": 0.3116805851459503, + "learning_rate": 1.6860395508434574e-05, + "loss": 0.1545, + "step": 36162 + }, + { + "epoch": 0.6450076695323369, + "grad_norm": 0.2290050983428955, + "learning_rate": 1.6858923825963702e-05, + "loss": 0.117, + "step": 36163 + }, + { + "epoch": 0.6450255056540506, + "grad_norm": 0.2532957196235657, + "learning_rate": 1.6857452175050446e-05, + "loss": 0.1196, + "step": 36164 + }, + { + "epoch": 0.6450433417757643, + "grad_norm": 0.2511919140815735, + "learning_rate": 1.6855980555700523e-05, + "loss": 0.1317, + "step": 36165 + }, + { + "epoch": 0.645061177897478, + "grad_norm": 0.25995415449142456, + "learning_rate": 1.6854508967919634e-05, + "loss": 0.1174, + "step": 36166 + }, + { + "epoch": 0.6450790140191917, + "grad_norm": 0.2380102425813675, + "learning_rate": 1.6853037411713484e-05, + "loss": 0.1216, + "step": 36167 + }, + { + "epoch": 0.6450968501409053, + "grad_norm": 0.2656167149543762, + "learning_rate": 1.685156588708776e-05, + "loss": 0.0825, + "step": 36168 + }, + { + "epoch": 0.645114686262619, + "grad_norm": 0.29976800084114075, + "learning_rate": 1.6850094394048194e-05, + "loss": 0.1186, + "step": 36169 + }, + { + "epoch": 0.6451325223843327, + "grad_norm": 0.27171820402145386, + "learning_rate": 1.6848622932600473e-05, + "loss": 0.1398, + "step": 36170 + }, + { + "epoch": 0.6451503585060464, + "grad_norm": 0.20812223851680756, + "learning_rate": 1.6847151502750307e-05, + "loss": 0.1009, + "step": 36171 + }, + { + "epoch": 0.6451681946277601, + "grad_norm": 0.24184690415859222, + "learning_rate": 1.684568010450339e-05, + "loss": 0.1297, + "step": 36172 + }, + { + "epoch": 0.6451860307494738, + "grad_norm": 0.31422850489616394, + "learning_rate": 1.6844208737865443e-05, + "loss": 0.0909, + "step": 36173 + }, + { + "epoch": 0.6452038668711876, + "grad_norm": 0.2704690992832184, + "learning_rate": 1.684273740284216e-05, + "loss": 0.0752, + "step": 36174 + }, + { + "epoch": 0.6452217029929013, + "grad_norm": 0.24138988554477692, + "learning_rate": 1.6841266099439243e-05, + "loss": 0.1509, + "step": 36175 + }, + { + "epoch": 0.645239539114615, + "grad_norm": 0.2506619393825531, + "learning_rate": 1.6839794827662393e-05, + "loss": 0.1149, + "step": 36176 + }, + { + "epoch": 0.6452573752363286, + "grad_norm": 0.23446524143218994, + "learning_rate": 1.6838323587517316e-05, + "loss": 0.1364, + "step": 36177 + }, + { + "epoch": 0.6452752113580423, + "grad_norm": 0.2749983072280884, + "learning_rate": 1.6836852379009717e-05, + "loss": 0.1307, + "step": 36178 + }, + { + "epoch": 0.645293047479756, + "grad_norm": 0.28372645378112793, + "learning_rate": 1.68353812021453e-05, + "loss": 0.198, + "step": 36179 + }, + { + "epoch": 0.6453108836014697, + "grad_norm": 0.28295403718948364, + "learning_rate": 1.6833910056929768e-05, + "loss": 0.1216, + "step": 36180 + }, + { + "epoch": 0.6453287197231834, + "grad_norm": 0.22426429390907288, + "learning_rate": 1.6832438943368805e-05, + "loss": 0.0969, + "step": 36181 + }, + { + "epoch": 0.6453465558448971, + "grad_norm": 0.19216197729110718, + "learning_rate": 1.6830967861468145e-05, + "loss": 0.1148, + "step": 36182 + }, + { + "epoch": 0.6453643919666108, + "grad_norm": 0.2421443909406662, + "learning_rate": 1.6829496811233474e-05, + "loss": 0.1171, + "step": 36183 + }, + { + "epoch": 0.6453822280883245, + "grad_norm": 0.25270938873291016, + "learning_rate": 1.6828025792670492e-05, + "loss": 0.1617, + "step": 36184 + }, + { + "epoch": 0.6454000642100381, + "grad_norm": 0.3278418779373169, + "learning_rate": 1.68265548057849e-05, + "loss": 0.1203, + "step": 36185 + }, + { + "epoch": 0.6454179003317518, + "grad_norm": 0.22865568101406097, + "learning_rate": 1.682508385058241e-05, + "loss": 0.1389, + "step": 36186 + }, + { + "epoch": 0.6454357364534655, + "grad_norm": 0.2596084177494049, + "learning_rate": 1.6823612927068723e-05, + "loss": 0.1136, + "step": 36187 + }, + { + "epoch": 0.6454535725751792, + "grad_norm": 0.34987136721611023, + "learning_rate": 1.6822142035249538e-05, + "loss": 0.1406, + "step": 36188 + }, + { + "epoch": 0.6454714086968929, + "grad_norm": 0.34618711471557617, + "learning_rate": 1.6820671175130544e-05, + "loss": 0.1388, + "step": 36189 + }, + { + "epoch": 0.6454892448186066, + "grad_norm": 0.30179232358932495, + "learning_rate": 1.6819200346717456e-05, + "loss": 0.0788, + "step": 36190 + }, + { + "epoch": 0.6455070809403204, + "grad_norm": 0.2815258502960205, + "learning_rate": 1.6817729550015975e-05, + "loss": 0.0901, + "step": 36191 + }, + { + "epoch": 0.6455249170620341, + "grad_norm": 0.2742551863193512, + "learning_rate": 1.68162587850318e-05, + "loss": 0.1336, + "step": 36192 + }, + { + "epoch": 0.6455427531837478, + "grad_norm": 0.20963624119758606, + "learning_rate": 1.681478805177064e-05, + "loss": 0.1182, + "step": 36193 + }, + { + "epoch": 0.6455605893054615, + "grad_norm": 0.2866366505622864, + "learning_rate": 1.6813317350238176e-05, + "loss": 0.1015, + "step": 36194 + }, + { + "epoch": 0.6455784254271751, + "grad_norm": 0.277170866727829, + "learning_rate": 1.681184668044013e-05, + "loss": 0.119, + "step": 36195 + }, + { + "epoch": 0.6455962615488888, + "grad_norm": 0.20249556005001068, + "learning_rate": 1.6810376042382193e-05, + "loss": 0.1248, + "step": 36196 + }, + { + "epoch": 0.6456140976706025, + "grad_norm": 0.2913576662540436, + "learning_rate": 1.6808905436070066e-05, + "loss": 0.1683, + "step": 36197 + }, + { + "epoch": 0.6456319337923162, + "grad_norm": 0.2869216203689575, + "learning_rate": 1.6807434861509446e-05, + "loss": 0.1525, + "step": 36198 + }, + { + "epoch": 0.6456497699140299, + "grad_norm": 0.19986768066883087, + "learning_rate": 1.6805964318706043e-05, + "loss": 0.1054, + "step": 36199 + }, + { + "epoch": 0.6456676060357436, + "grad_norm": 0.20991632342338562, + "learning_rate": 1.6804493807665555e-05, + "loss": 0.136, + "step": 36200 + }, + { + "epoch": 0.6456854421574573, + "grad_norm": 0.29451504349708557, + "learning_rate": 1.680302332839368e-05, + "loss": 0.1445, + "step": 36201 + }, + { + "epoch": 0.645703278279171, + "grad_norm": 0.3638874590396881, + "learning_rate": 1.6801552880896115e-05, + "loss": 0.1193, + "step": 36202 + }, + { + "epoch": 0.6457211144008846, + "grad_norm": 0.2540421187877655, + "learning_rate": 1.6800082465178558e-05, + "loss": 0.1338, + "step": 36203 + }, + { + "epoch": 0.6457389505225983, + "grad_norm": 0.45227232575416565, + "learning_rate": 1.6798612081246717e-05, + "loss": 0.1278, + "step": 36204 + }, + { + "epoch": 0.645756786644312, + "grad_norm": 0.24821460247039795, + "learning_rate": 1.6797141729106287e-05, + "loss": 0.0638, + "step": 36205 + }, + { + "epoch": 0.6457746227660257, + "grad_norm": 0.22063791751861572, + "learning_rate": 1.6795671408762976e-05, + "loss": 0.1703, + "step": 36206 + }, + { + "epoch": 0.6457924588877394, + "grad_norm": 0.34213772416114807, + "learning_rate": 1.6794201120222465e-05, + "loss": 0.1661, + "step": 36207 + }, + { + "epoch": 0.6458102950094532, + "grad_norm": 0.19845445454120636, + "learning_rate": 1.679273086349047e-05, + "loss": 0.1269, + "step": 36208 + }, + { + "epoch": 0.6458281311311669, + "grad_norm": 0.2927187979221344, + "learning_rate": 1.679126063857269e-05, + "loss": 0.1044, + "step": 36209 + }, + { + "epoch": 0.6458459672528806, + "grad_norm": 0.22445560991764069, + "learning_rate": 1.678979044547482e-05, + "loss": 0.1552, + "step": 36210 + }, + { + "epoch": 0.6458638033745943, + "grad_norm": 0.24601800739765167, + "learning_rate": 1.6788320284202544e-05, + "loss": 0.127, + "step": 36211 + }, + { + "epoch": 0.645881639496308, + "grad_norm": 0.36156129837036133, + "learning_rate": 1.678685015476158e-05, + "loss": 0.1691, + "step": 36212 + }, + { + "epoch": 0.6458994756180216, + "grad_norm": 0.20540089905261993, + "learning_rate": 1.678538005715763e-05, + "loss": 0.1188, + "step": 36213 + }, + { + "epoch": 0.6459173117397353, + "grad_norm": 0.243947371840477, + "learning_rate": 1.678390999139638e-05, + "loss": 0.1008, + "step": 36214 + }, + { + "epoch": 0.645935147861449, + "grad_norm": 0.21612975001335144, + "learning_rate": 1.6782439957483537e-05, + "loss": 0.1266, + "step": 36215 + }, + { + "epoch": 0.6459529839831627, + "grad_norm": 0.3539649546146393, + "learning_rate": 1.678096995542478e-05, + "loss": 0.1315, + "step": 36216 + }, + { + "epoch": 0.6459708201048764, + "grad_norm": 0.25346601009368896, + "learning_rate": 1.6779499985225835e-05, + "loss": 0.0822, + "step": 36217 + }, + { + "epoch": 0.6459886562265901, + "grad_norm": 0.3246941566467285, + "learning_rate": 1.677803004689238e-05, + "loss": 0.1154, + "step": 36218 + }, + { + "epoch": 0.6460064923483038, + "grad_norm": 0.2321978211402893, + "learning_rate": 1.677656014043013e-05, + "loss": 0.1207, + "step": 36219 + }, + { + "epoch": 0.6460243284700175, + "grad_norm": 0.23822705447673798, + "learning_rate": 1.677509026584476e-05, + "loss": 0.0964, + "step": 36220 + }, + { + "epoch": 0.6460421645917311, + "grad_norm": 0.2451213002204895, + "learning_rate": 1.6773620423141993e-05, + "loss": 0.1193, + "step": 36221 + }, + { + "epoch": 0.6460600007134448, + "grad_norm": 0.304071843624115, + "learning_rate": 1.6772150612327512e-05, + "loss": 0.1353, + "step": 36222 + }, + { + "epoch": 0.6460778368351585, + "grad_norm": 0.26478129625320435, + "learning_rate": 1.6770680833407017e-05, + "loss": 0.1468, + "step": 36223 + }, + { + "epoch": 0.6460956729568722, + "grad_norm": 0.2717961370944977, + "learning_rate": 1.6769211086386195e-05, + "loss": 0.1653, + "step": 36224 + }, + { + "epoch": 0.646113509078586, + "grad_norm": 0.2218887060880661, + "learning_rate": 1.6767741371270767e-05, + "loss": 0.1295, + "step": 36225 + }, + { + "epoch": 0.6461313452002997, + "grad_norm": 0.20231322944164276, + "learning_rate": 1.6766271688066408e-05, + "loss": 0.1412, + "step": 36226 + }, + { + "epoch": 0.6461491813220134, + "grad_norm": 0.2450684905052185, + "learning_rate": 1.6764802036778833e-05, + "loss": 0.0997, + "step": 36227 + }, + { + "epoch": 0.6461670174437271, + "grad_norm": 0.24115802347660065, + "learning_rate": 1.6763332417413727e-05, + "loss": 0.1251, + "step": 36228 + }, + { + "epoch": 0.6461848535654408, + "grad_norm": 0.38725319504737854, + "learning_rate": 1.676186282997678e-05, + "loss": 0.1815, + "step": 36229 + }, + { + "epoch": 0.6462026896871544, + "grad_norm": 0.4499475359916687, + "learning_rate": 1.6760393274473705e-05, + "loss": 0.1293, + "step": 36230 + }, + { + "epoch": 0.6462205258088681, + "grad_norm": 0.26100361347198486, + "learning_rate": 1.6758923750910198e-05, + "loss": 0.0841, + "step": 36231 + }, + { + "epoch": 0.6462383619305818, + "grad_norm": 0.32052701711654663, + "learning_rate": 1.6757454259291938e-05, + "loss": 0.1381, + "step": 36232 + }, + { + "epoch": 0.6462561980522955, + "grad_norm": 0.3102074861526489, + "learning_rate": 1.6755984799624635e-05, + "loss": 0.1462, + "step": 36233 + }, + { + "epoch": 0.6462740341740092, + "grad_norm": 0.2903602719306946, + "learning_rate": 1.675451537191398e-05, + "loss": 0.1492, + "step": 36234 + }, + { + "epoch": 0.6462918702957229, + "grad_norm": 0.2678000032901764, + "learning_rate": 1.6753045976165678e-05, + "loss": 0.1484, + "step": 36235 + }, + { + "epoch": 0.6463097064174366, + "grad_norm": 0.2865830659866333, + "learning_rate": 1.6751576612385422e-05, + "loss": 0.1307, + "step": 36236 + }, + { + "epoch": 0.6463275425391503, + "grad_norm": 0.3205627202987671, + "learning_rate": 1.6750107280578884e-05, + "loss": 0.1273, + "step": 36237 + }, + { + "epoch": 0.646345378660864, + "grad_norm": 0.351855993270874, + "learning_rate": 1.67486379807518e-05, + "loss": 0.1811, + "step": 36238 + }, + { + "epoch": 0.6463632147825776, + "grad_norm": 0.33485251665115356, + "learning_rate": 1.6747168712909837e-05, + "loss": 0.1789, + "step": 36239 + }, + { + "epoch": 0.6463810509042913, + "grad_norm": 0.3263581395149231, + "learning_rate": 1.6745699477058702e-05, + "loss": 0.1758, + "step": 36240 + }, + { + "epoch": 0.6463988870260051, + "grad_norm": 0.3114212453365326, + "learning_rate": 1.6744230273204087e-05, + "loss": 0.1226, + "step": 36241 + }, + { + "epoch": 0.6464167231477188, + "grad_norm": 0.5191286206245422, + "learning_rate": 1.6742761101351678e-05, + "loss": 0.1303, + "step": 36242 + }, + { + "epoch": 0.6464345592694325, + "grad_norm": 0.306167870759964, + "learning_rate": 1.6741291961507187e-05, + "loss": 0.1729, + "step": 36243 + }, + { + "epoch": 0.6464523953911462, + "grad_norm": 0.25029274821281433, + "learning_rate": 1.67398228536763e-05, + "loss": 0.1257, + "step": 36244 + }, + { + "epoch": 0.6464702315128599, + "grad_norm": 0.3108735680580139, + "learning_rate": 1.673835377786471e-05, + "loss": 0.1558, + "step": 36245 + }, + { + "epoch": 0.6464880676345736, + "grad_norm": 0.3183059096336365, + "learning_rate": 1.6736884734078114e-05, + "loss": 0.114, + "step": 36246 + }, + { + "epoch": 0.6465059037562872, + "grad_norm": 0.24241836369037628, + "learning_rate": 1.6735415722322206e-05, + "loss": 0.1219, + "step": 36247 + }, + { + "epoch": 0.6465237398780009, + "grad_norm": 0.2833211123943329, + "learning_rate": 1.6733946742602684e-05, + "loss": 0.1154, + "step": 36248 + }, + { + "epoch": 0.6465415759997146, + "grad_norm": 0.23954704403877258, + "learning_rate": 1.6732477794925245e-05, + "loss": 0.1473, + "step": 36249 + }, + { + "epoch": 0.6465594121214283, + "grad_norm": 0.44087162613868713, + "learning_rate": 1.6731008879295566e-05, + "loss": 0.1261, + "step": 36250 + }, + { + "epoch": 0.646577248243142, + "grad_norm": 0.25401169061660767, + "learning_rate": 1.672953999571936e-05, + "loss": 0.1271, + "step": 36251 + }, + { + "epoch": 0.6465950843648557, + "grad_norm": 0.27473825216293335, + "learning_rate": 1.6728071144202315e-05, + "loss": 0.1104, + "step": 36252 + }, + { + "epoch": 0.6466129204865694, + "grad_norm": 0.46097078919410706, + "learning_rate": 1.6726602324750117e-05, + "loss": 0.1473, + "step": 36253 + }, + { + "epoch": 0.6466307566082831, + "grad_norm": 0.28831246495246887, + "learning_rate": 1.6725133537368473e-05, + "loss": 0.1507, + "step": 36254 + }, + { + "epoch": 0.6466485927299968, + "grad_norm": 0.3747731149196625, + "learning_rate": 1.672366478206306e-05, + "loss": 0.1165, + "step": 36255 + }, + { + "epoch": 0.6466664288517104, + "grad_norm": 0.2572822868824005, + "learning_rate": 1.6722196058839587e-05, + "loss": 0.0803, + "step": 36256 + }, + { + "epoch": 0.6466842649734241, + "grad_norm": 0.2616421580314636, + "learning_rate": 1.6720727367703743e-05, + "loss": 0.1261, + "step": 36257 + }, + { + "epoch": 0.6467021010951379, + "grad_norm": 0.23427090048789978, + "learning_rate": 1.671925870866122e-05, + "loss": 0.1029, + "step": 36258 + }, + { + "epoch": 0.6467199372168516, + "grad_norm": 0.20507794618606567, + "learning_rate": 1.67177900817177e-05, + "loss": 0.1415, + "step": 36259 + }, + { + "epoch": 0.6467377733385653, + "grad_norm": 0.2586061954498291, + "learning_rate": 1.6716321486878894e-05, + "loss": 0.1593, + "step": 36260 + }, + { + "epoch": 0.646755609460279, + "grad_norm": 0.24695369601249695, + "learning_rate": 1.6714852924150486e-05, + "loss": 0.1734, + "step": 36261 + }, + { + "epoch": 0.6467734455819927, + "grad_norm": 0.42236751317977905, + "learning_rate": 1.6713384393538173e-05, + "loss": 0.1379, + "step": 36262 + }, + { + "epoch": 0.6467912817037064, + "grad_norm": 0.2234976887702942, + "learning_rate": 1.6711915895047633e-05, + "loss": 0.1261, + "step": 36263 + }, + { + "epoch": 0.64680911782542, + "grad_norm": 0.28489330410957336, + "learning_rate": 1.671044742868458e-05, + "loss": 0.1322, + "step": 36264 + }, + { + "epoch": 0.6468269539471337, + "grad_norm": 0.2886865437030792, + "learning_rate": 1.67089789944547e-05, + "loss": 0.1295, + "step": 36265 + }, + { + "epoch": 0.6468447900688474, + "grad_norm": 0.25815892219543457, + "learning_rate": 1.6707510592363672e-05, + "loss": 0.1273, + "step": 36266 + }, + { + "epoch": 0.6468626261905611, + "grad_norm": 0.2884007394313812, + "learning_rate": 1.6706042222417202e-05, + "loss": 0.1237, + "step": 36267 + }, + { + "epoch": 0.6468804623122748, + "grad_norm": 0.46964511275291443, + "learning_rate": 1.6704573884620968e-05, + "loss": 0.1564, + "step": 36268 + }, + { + "epoch": 0.6468982984339885, + "grad_norm": 0.2360130399465561, + "learning_rate": 1.670310557898068e-05, + "loss": 0.1149, + "step": 36269 + }, + { + "epoch": 0.6469161345557022, + "grad_norm": 0.2545284628868103, + "learning_rate": 1.670163730550202e-05, + "loss": 0.1091, + "step": 36270 + }, + { + "epoch": 0.6469339706774159, + "grad_norm": 0.2422153204679489, + "learning_rate": 1.670016906419068e-05, + "loss": 0.11, + "step": 36271 + }, + { + "epoch": 0.6469518067991296, + "grad_norm": 0.3159201145172119, + "learning_rate": 1.6698700855052343e-05, + "loss": 0.1337, + "step": 36272 + }, + { + "epoch": 0.6469696429208432, + "grad_norm": 0.24787463247776031, + "learning_rate": 1.6697232678092718e-05, + "loss": 0.0966, + "step": 36273 + }, + { + "epoch": 0.6469874790425569, + "grad_norm": 0.2403479665517807, + "learning_rate": 1.6695764533317482e-05, + "loss": 0.0861, + "step": 36274 + }, + { + "epoch": 0.6470053151642707, + "grad_norm": 0.26724961400032043, + "learning_rate": 1.6694296420732337e-05, + "loss": 0.1447, + "step": 36275 + }, + { + "epoch": 0.6470231512859844, + "grad_norm": 0.24436023831367493, + "learning_rate": 1.6692828340342955e-05, + "loss": 0.1088, + "step": 36276 + }, + { + "epoch": 0.6470409874076981, + "grad_norm": 0.35987207293510437, + "learning_rate": 1.669136029215505e-05, + "loss": 0.0616, + "step": 36277 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 0.18692877888679504, + "learning_rate": 1.6689892276174302e-05, + "loss": 0.0947, + "step": 36278 + }, + { + "epoch": 0.6470766596511255, + "grad_norm": 0.3369114100933075, + "learning_rate": 1.6688424292406405e-05, + "loss": 0.1396, + "step": 36279 + }, + { + "epoch": 0.6470944957728392, + "grad_norm": 0.287370890378952, + "learning_rate": 1.6686956340857036e-05, + "loss": 0.121, + "step": 36280 + }, + { + "epoch": 0.6471123318945529, + "grad_norm": 0.2611466944217682, + "learning_rate": 1.66854884215319e-05, + "loss": 0.0755, + "step": 36281 + }, + { + "epoch": 0.6471301680162665, + "grad_norm": 0.3269018232822418, + "learning_rate": 1.668402053443668e-05, + "loss": 0.1188, + "step": 36282 + }, + { + "epoch": 0.6471480041379802, + "grad_norm": 0.2913071811199188, + "learning_rate": 1.6682552679577075e-05, + "loss": 0.1184, + "step": 36283 + }, + { + "epoch": 0.6471658402596939, + "grad_norm": 0.31349894404411316, + "learning_rate": 1.668108485695877e-05, + "loss": 0.0667, + "step": 36284 + }, + { + "epoch": 0.6471836763814076, + "grad_norm": 0.31691157817840576, + "learning_rate": 1.667961706658744e-05, + "loss": 0.1177, + "step": 36285 + }, + { + "epoch": 0.6472015125031213, + "grad_norm": 0.21011942625045776, + "learning_rate": 1.66781493084688e-05, + "loss": 0.0976, + "step": 36286 + }, + { + "epoch": 0.647219348624835, + "grad_norm": 0.24647122621536255, + "learning_rate": 1.6676681582608527e-05, + "loss": 0.0921, + "step": 36287 + }, + { + "epoch": 0.6472371847465487, + "grad_norm": 0.3006668984889984, + "learning_rate": 1.6675213889012308e-05, + "loss": 0.1108, + "step": 36288 + }, + { + "epoch": 0.6472550208682624, + "grad_norm": 0.35260167717933655, + "learning_rate": 1.6673746227685826e-05, + "loss": 0.0982, + "step": 36289 + }, + { + "epoch": 0.647272856989976, + "grad_norm": 0.24643471837043762, + "learning_rate": 1.6672278598634795e-05, + "loss": 0.1282, + "step": 36290 + }, + { + "epoch": 0.6472906931116897, + "grad_norm": 0.24415390193462372, + "learning_rate": 1.6670811001864882e-05, + "loss": 0.1131, + "step": 36291 + }, + { + "epoch": 0.6473085292334035, + "grad_norm": 0.2958952784538269, + "learning_rate": 1.6669343437381787e-05, + "loss": 0.143, + "step": 36292 + }, + { + "epoch": 0.6473263653551172, + "grad_norm": 0.38828209042549133, + "learning_rate": 1.6667875905191187e-05, + "loss": 0.1421, + "step": 36293 + }, + { + "epoch": 0.6473442014768309, + "grad_norm": 0.31823015213012695, + "learning_rate": 1.666640840529878e-05, + "loss": 0.1072, + "step": 36294 + }, + { + "epoch": 0.6473620375985446, + "grad_norm": 0.32994458079338074, + "learning_rate": 1.666494093771025e-05, + "loss": 0.1531, + "step": 36295 + }, + { + "epoch": 0.6473798737202583, + "grad_norm": 0.39449965953826904, + "learning_rate": 1.6663473502431294e-05, + "loss": 0.2115, + "step": 36296 + }, + { + "epoch": 0.647397709841972, + "grad_norm": 0.2324059009552002, + "learning_rate": 1.6662006099467594e-05, + "loss": 0.1574, + "step": 36297 + }, + { + "epoch": 0.6474155459636857, + "grad_norm": 0.3697079122066498, + "learning_rate": 1.666053872882483e-05, + "loss": 0.1455, + "step": 36298 + }, + { + "epoch": 0.6474333820853994, + "grad_norm": 0.2839532792568207, + "learning_rate": 1.6659071390508703e-05, + "loss": 0.1038, + "step": 36299 + }, + { + "epoch": 0.647451218207113, + "grad_norm": 0.20604731142520905, + "learning_rate": 1.66576040845249e-05, + "loss": 0.0891, + "step": 36300 + }, + { + "epoch": 0.6474690543288267, + "grad_norm": 0.20650988817214966, + "learning_rate": 1.66561368108791e-05, + "loss": 0.1208, + "step": 36301 + }, + { + "epoch": 0.6474868904505404, + "grad_norm": 0.19601276516914368, + "learning_rate": 1.6654669569577e-05, + "loss": 0.1408, + "step": 36302 + }, + { + "epoch": 0.6475047265722541, + "grad_norm": 0.1895613968372345, + "learning_rate": 1.6653202360624274e-05, + "loss": 0.0602, + "step": 36303 + }, + { + "epoch": 0.6475225626939678, + "grad_norm": 0.30013152956962585, + "learning_rate": 1.665173518402663e-05, + "loss": 0.1001, + "step": 36304 + }, + { + "epoch": 0.6475403988156815, + "grad_norm": 0.24785785377025604, + "learning_rate": 1.665026803978974e-05, + "loss": 0.1042, + "step": 36305 + }, + { + "epoch": 0.6475582349373952, + "grad_norm": 0.28571587800979614, + "learning_rate": 1.6648800927919294e-05, + "loss": 0.0973, + "step": 36306 + }, + { + "epoch": 0.6475760710591089, + "grad_norm": 0.21622537076473236, + "learning_rate": 1.6647333848420973e-05, + "loss": 0.1153, + "step": 36307 + }, + { + "epoch": 0.6475939071808225, + "grad_norm": 0.38675451278686523, + "learning_rate": 1.664586680130048e-05, + "loss": 0.1655, + "step": 36308 + }, + { + "epoch": 0.6476117433025363, + "grad_norm": 0.31630417704582214, + "learning_rate": 1.664439978656349e-05, + "loss": 0.1549, + "step": 36309 + }, + { + "epoch": 0.64762957942425, + "grad_norm": 0.37522152066230774, + "learning_rate": 1.6642932804215693e-05, + "loss": 0.1301, + "step": 36310 + }, + { + "epoch": 0.6476474155459637, + "grad_norm": 0.22811684012413025, + "learning_rate": 1.6641465854262767e-05, + "loss": 0.1337, + "step": 36311 + }, + { + "epoch": 0.6476652516676774, + "grad_norm": 0.2267579585313797, + "learning_rate": 1.6639998936710417e-05, + "loss": 0.1339, + "step": 36312 + }, + { + "epoch": 0.6476830877893911, + "grad_norm": 0.2597489058971405, + "learning_rate": 1.663853205156432e-05, + "loss": 0.1177, + "step": 36313 + }, + { + "epoch": 0.6477009239111048, + "grad_norm": 0.27039024233818054, + "learning_rate": 1.663706519883016e-05, + "loss": 0.107, + "step": 36314 + }, + { + "epoch": 0.6477187600328185, + "grad_norm": 0.23155201971530914, + "learning_rate": 1.663559837851361e-05, + "loss": 0.1174, + "step": 36315 + }, + { + "epoch": 0.6477365961545322, + "grad_norm": 0.26490023732185364, + "learning_rate": 1.663413159062038e-05, + "loss": 0.1378, + "step": 36316 + }, + { + "epoch": 0.6477544322762459, + "grad_norm": 0.27218809723854065, + "learning_rate": 1.663266483515615e-05, + "loss": 0.1204, + "step": 36317 + }, + { + "epoch": 0.6477722683979595, + "grad_norm": 0.2294764369726181, + "learning_rate": 1.6631198112126595e-05, + "loss": 0.0989, + "step": 36318 + }, + { + "epoch": 0.6477901045196732, + "grad_norm": 0.21647778153419495, + "learning_rate": 1.6629731421537407e-05, + "loss": 0.0577, + "step": 36319 + }, + { + "epoch": 0.6478079406413869, + "grad_norm": 0.3042411506175995, + "learning_rate": 1.6628264763394267e-05, + "loss": 0.1286, + "step": 36320 + }, + { + "epoch": 0.6478257767631006, + "grad_norm": 0.20827218890190125, + "learning_rate": 1.6626798137702874e-05, + "loss": 0.0902, + "step": 36321 + }, + { + "epoch": 0.6478436128848143, + "grad_norm": 0.310955673456192, + "learning_rate": 1.6625331544468895e-05, + "loss": 0.1566, + "step": 36322 + }, + { + "epoch": 0.647861449006528, + "grad_norm": 0.29786819219589233, + "learning_rate": 1.6623864983698027e-05, + "loss": 0.1314, + "step": 36323 + }, + { + "epoch": 0.6478792851282417, + "grad_norm": 0.32501375675201416, + "learning_rate": 1.6622398455395948e-05, + "loss": 0.1523, + "step": 36324 + }, + { + "epoch": 0.6478971212499554, + "grad_norm": 0.3099982738494873, + "learning_rate": 1.662093195956835e-05, + "loss": 0.1397, + "step": 36325 + }, + { + "epoch": 0.6479149573716692, + "grad_norm": 0.3020944893360138, + "learning_rate": 1.6619465496220916e-05, + "loss": 0.1175, + "step": 36326 + }, + { + "epoch": 0.6479327934933828, + "grad_norm": 0.1899459809064865, + "learning_rate": 1.661799906535933e-05, + "loss": 0.0725, + "step": 36327 + }, + { + "epoch": 0.6479506296150965, + "grad_norm": 0.2732452154159546, + "learning_rate": 1.661653266698926e-05, + "loss": 0.131, + "step": 36328 + }, + { + "epoch": 0.6479684657368102, + "grad_norm": 0.30049529671669006, + "learning_rate": 1.6615066301116418e-05, + "loss": 0.1241, + "step": 36329 + }, + { + "epoch": 0.6479863018585239, + "grad_norm": 0.23166239261627197, + "learning_rate": 1.6613599967746472e-05, + "loss": 0.131, + "step": 36330 + }, + { + "epoch": 0.6480041379802376, + "grad_norm": 0.28806835412979126, + "learning_rate": 1.661213366688511e-05, + "loss": 0.1829, + "step": 36331 + }, + { + "epoch": 0.6480219741019513, + "grad_norm": 0.31063300371170044, + "learning_rate": 1.6610667398538017e-05, + "loss": 0.1555, + "step": 36332 + }, + { + "epoch": 0.648039810223665, + "grad_norm": 0.24637821316719055, + "learning_rate": 1.6609201162710865e-05, + "loss": 0.0964, + "step": 36333 + }, + { + "epoch": 0.6480576463453787, + "grad_norm": 0.2434396594762802, + "learning_rate": 1.6607734959409357e-05, + "loss": 0.1249, + "step": 36334 + }, + { + "epoch": 0.6480754824670923, + "grad_norm": 0.2530849874019623, + "learning_rate": 1.6606268788639166e-05, + "loss": 0.1377, + "step": 36335 + }, + { + "epoch": 0.648093318588806, + "grad_norm": 0.27908849716186523, + "learning_rate": 1.6604802650405974e-05, + "loss": 0.0817, + "step": 36336 + }, + { + "epoch": 0.6481111547105197, + "grad_norm": 0.23288792371749878, + "learning_rate": 1.6603336544715463e-05, + "loss": 0.1121, + "step": 36337 + }, + { + "epoch": 0.6481289908322334, + "grad_norm": 0.3089233636856079, + "learning_rate": 1.660187047157332e-05, + "loss": 0.1723, + "step": 36338 + }, + { + "epoch": 0.6481468269539471, + "grad_norm": 0.2522965967655182, + "learning_rate": 1.6600404430985236e-05, + "loss": 0.1109, + "step": 36339 + }, + { + "epoch": 0.6481646630756608, + "grad_norm": 0.279598593711853, + "learning_rate": 1.6598938422956885e-05, + "loss": 0.1199, + "step": 36340 + }, + { + "epoch": 0.6481824991973745, + "grad_norm": 0.252302348613739, + "learning_rate": 1.659747244749394e-05, + "loss": 0.1184, + "step": 36341 + }, + { + "epoch": 0.6482003353190883, + "grad_norm": 0.24944572150707245, + "learning_rate": 1.6596006504602102e-05, + "loss": 0.0956, + "step": 36342 + }, + { + "epoch": 0.648218171440802, + "grad_norm": 0.28131645917892456, + "learning_rate": 1.6594540594287038e-05, + "loss": 0.1104, + "step": 36343 + }, + { + "epoch": 0.6482360075625156, + "grad_norm": 0.1979651302099228, + "learning_rate": 1.6593074716554448e-05, + "loss": 0.1144, + "step": 36344 + }, + { + "epoch": 0.6482538436842293, + "grad_norm": 0.35066723823547363, + "learning_rate": 1.6591608871410004e-05, + "loss": 0.1359, + "step": 36345 + }, + { + "epoch": 0.648271679805943, + "grad_norm": 0.2735930383205414, + "learning_rate": 1.6590143058859374e-05, + "loss": 0.1025, + "step": 36346 + }, + { + "epoch": 0.6482895159276567, + "grad_norm": 0.25720998644828796, + "learning_rate": 1.6588677278908264e-05, + "loss": 0.1246, + "step": 36347 + }, + { + "epoch": 0.6483073520493704, + "grad_norm": 0.24710716307163239, + "learning_rate": 1.6587211531562353e-05, + "loss": 0.1518, + "step": 36348 + }, + { + "epoch": 0.6483251881710841, + "grad_norm": 0.2664247751235962, + "learning_rate": 1.6585745816827304e-05, + "loss": 0.1272, + "step": 36349 + }, + { + "epoch": 0.6483430242927978, + "grad_norm": 0.20622743666172028, + "learning_rate": 1.658428013470881e-05, + "loss": 0.0709, + "step": 36350 + }, + { + "epoch": 0.6483608604145115, + "grad_norm": 0.24626660346984863, + "learning_rate": 1.6582814485212563e-05, + "loss": 0.1329, + "step": 36351 + }, + { + "epoch": 0.6483786965362252, + "grad_norm": 0.2569271922111511, + "learning_rate": 1.6581348868344226e-05, + "loss": 0.1372, + "step": 36352 + }, + { + "epoch": 0.6483965326579388, + "grad_norm": 0.37782010436058044, + "learning_rate": 1.6579883284109498e-05, + "loss": 0.1116, + "step": 36353 + }, + { + "epoch": 0.6484143687796525, + "grad_norm": 0.2427438497543335, + "learning_rate": 1.6578417732514035e-05, + "loss": 0.1578, + "step": 36354 + }, + { + "epoch": 0.6484322049013662, + "grad_norm": 0.3771073520183563, + "learning_rate": 1.6576952213563545e-05, + "loss": 0.1861, + "step": 36355 + }, + { + "epoch": 0.6484500410230799, + "grad_norm": 0.22933746874332428, + "learning_rate": 1.65754867272637e-05, + "loss": 0.1156, + "step": 36356 + }, + { + "epoch": 0.6484678771447936, + "grad_norm": 0.22353418171405792, + "learning_rate": 1.6574021273620172e-05, + "loss": 0.1076, + "step": 36357 + }, + { + "epoch": 0.6484857132665073, + "grad_norm": 0.23841258883476257, + "learning_rate": 1.657255585263865e-05, + "loss": 0.1272, + "step": 36358 + }, + { + "epoch": 0.6485035493882211, + "grad_norm": 0.23630429804325104, + "learning_rate": 1.6571090464324804e-05, + "loss": 0.0882, + "step": 36359 + }, + { + "epoch": 0.6485213855099348, + "grad_norm": 0.4377550482749939, + "learning_rate": 1.6569625108684332e-05, + "loss": 0.1623, + "step": 36360 + }, + { + "epoch": 0.6485392216316485, + "grad_norm": 0.25166547298431396, + "learning_rate": 1.656815978572291e-05, + "loss": 0.1292, + "step": 36361 + }, + { + "epoch": 0.6485570577533621, + "grad_norm": 0.2722752094268799, + "learning_rate": 1.6566694495446205e-05, + "loss": 0.1115, + "step": 36362 + }, + { + "epoch": 0.6485748938750758, + "grad_norm": 0.24089182913303375, + "learning_rate": 1.6565229237859897e-05, + "loss": 0.1222, + "step": 36363 + }, + { + "epoch": 0.6485927299967895, + "grad_norm": 0.24389807879924774, + "learning_rate": 1.6563764012969688e-05, + "loss": 0.1484, + "step": 36364 + }, + { + "epoch": 0.6486105661185032, + "grad_norm": 0.3104889690876007, + "learning_rate": 1.6562298820781235e-05, + "loss": 0.1021, + "step": 36365 + }, + { + "epoch": 0.6486284022402169, + "grad_norm": 0.3075515925884247, + "learning_rate": 1.656083366130023e-05, + "loss": 0.118, + "step": 36366 + }, + { + "epoch": 0.6486462383619306, + "grad_norm": 0.3432249128818512, + "learning_rate": 1.6559368534532344e-05, + "loss": 0.2021, + "step": 36367 + }, + { + "epoch": 0.6486640744836443, + "grad_norm": 0.29838305711746216, + "learning_rate": 1.6557903440483266e-05, + "loss": 0.1279, + "step": 36368 + }, + { + "epoch": 0.648681910605358, + "grad_norm": 0.23626558482646942, + "learning_rate": 1.6556438379158673e-05, + "loss": 0.1308, + "step": 36369 + }, + { + "epoch": 0.6486997467270716, + "grad_norm": 0.20485560595989227, + "learning_rate": 1.6554973350564232e-05, + "loss": 0.0889, + "step": 36370 + }, + { + "epoch": 0.6487175828487853, + "grad_norm": 0.2429179847240448, + "learning_rate": 1.6553508354705636e-05, + "loss": 0.1302, + "step": 36371 + }, + { + "epoch": 0.648735418970499, + "grad_norm": 0.320940226316452, + "learning_rate": 1.6552043391588554e-05, + "loss": 0.1045, + "step": 36372 + }, + { + "epoch": 0.6487532550922127, + "grad_norm": 0.2593878209590912, + "learning_rate": 1.6550578461218676e-05, + "loss": 0.1133, + "step": 36373 + }, + { + "epoch": 0.6487710912139264, + "grad_norm": 0.20208634436130524, + "learning_rate": 1.6549113563601675e-05, + "loss": 0.0769, + "step": 36374 + }, + { + "epoch": 0.6487889273356401, + "grad_norm": 0.25661903619766235, + "learning_rate": 1.6547648698743233e-05, + "loss": 0.1344, + "step": 36375 + }, + { + "epoch": 0.6488067634573539, + "grad_norm": 0.2480912059545517, + "learning_rate": 1.6546183866649005e-05, + "loss": 0.1334, + "step": 36376 + }, + { + "epoch": 0.6488245995790676, + "grad_norm": 0.22495205700397491, + "learning_rate": 1.6544719067324706e-05, + "loss": 0.106, + "step": 36377 + }, + { + "epoch": 0.6488424357007813, + "grad_norm": 0.29496297240257263, + "learning_rate": 1.654325430077599e-05, + "loss": 0.1541, + "step": 36378 + }, + { + "epoch": 0.648860271822495, + "grad_norm": 0.3045872747898102, + "learning_rate": 1.6541789567008542e-05, + "loss": 0.174, + "step": 36379 + }, + { + "epoch": 0.6488781079442086, + "grad_norm": 0.20767450332641602, + "learning_rate": 1.6540324866028034e-05, + "loss": 0.0715, + "step": 36380 + }, + { + "epoch": 0.6488959440659223, + "grad_norm": 0.308723509311676, + "learning_rate": 1.6538860197840156e-05, + "loss": 0.1058, + "step": 36381 + }, + { + "epoch": 0.648913780187636, + "grad_norm": 0.2800751030445099, + "learning_rate": 1.6537395562450577e-05, + "loss": 0.1242, + "step": 36382 + }, + { + "epoch": 0.6489316163093497, + "grad_norm": 0.2784283757209778, + "learning_rate": 1.6535930959864977e-05, + "loss": 0.2043, + "step": 36383 + }, + { + "epoch": 0.6489494524310634, + "grad_norm": 0.23597945272922516, + "learning_rate": 1.653446639008903e-05, + "loss": 0.1005, + "step": 36384 + }, + { + "epoch": 0.6489672885527771, + "grad_norm": 0.2835776209831238, + "learning_rate": 1.653300185312841e-05, + "loss": 0.1554, + "step": 36385 + }, + { + "epoch": 0.6489851246744908, + "grad_norm": 0.2600294053554535, + "learning_rate": 1.6531537348988803e-05, + "loss": 0.1418, + "step": 36386 + }, + { + "epoch": 0.6490029607962045, + "grad_norm": 0.28589382767677307, + "learning_rate": 1.6530072877675885e-05, + "loss": 0.1042, + "step": 36387 + }, + { + "epoch": 0.6490207969179181, + "grad_norm": 0.2674868702888489, + "learning_rate": 1.652860843919533e-05, + "loss": 0.1154, + "step": 36388 + }, + { + "epoch": 0.6490386330396318, + "grad_norm": 0.3101354241371155, + "learning_rate": 1.6527144033552805e-05, + "loss": 0.1401, + "step": 36389 + }, + { + "epoch": 0.6490564691613455, + "grad_norm": 0.2235003560781479, + "learning_rate": 1.6525679660754006e-05, + "loss": 0.0861, + "step": 36390 + }, + { + "epoch": 0.6490743052830592, + "grad_norm": 0.27728989720344543, + "learning_rate": 1.6524215320804602e-05, + "loss": 0.1419, + "step": 36391 + }, + { + "epoch": 0.6490921414047729, + "grad_norm": 0.20885300636291504, + "learning_rate": 1.652275101371026e-05, + "loss": 0.1097, + "step": 36392 + }, + { + "epoch": 0.6491099775264867, + "grad_norm": 0.16699521243572235, + "learning_rate": 1.652128673947666e-05, + "loss": 0.0667, + "step": 36393 + }, + { + "epoch": 0.6491278136482004, + "grad_norm": 0.25783777236938477, + "learning_rate": 1.651982249810949e-05, + "loss": 0.1369, + "step": 36394 + }, + { + "epoch": 0.6491456497699141, + "grad_norm": 0.31035658717155457, + "learning_rate": 1.6518358289614415e-05, + "loss": 0.1015, + "step": 36395 + }, + { + "epoch": 0.6491634858916278, + "grad_norm": 0.21649883687496185, + "learning_rate": 1.651689411399711e-05, + "loss": 0.1099, + "step": 36396 + }, + { + "epoch": 0.6491813220133414, + "grad_norm": 0.4408283233642578, + "learning_rate": 1.651542997126326e-05, + "loss": 0.1819, + "step": 36397 + }, + { + "epoch": 0.6491991581350551, + "grad_norm": 0.22239850461483002, + "learning_rate": 1.651396586141852e-05, + "loss": 0.0948, + "step": 36398 + }, + { + "epoch": 0.6492169942567688, + "grad_norm": 0.2303265631198883, + "learning_rate": 1.6512501784468588e-05, + "loss": 0.1221, + "step": 36399 + }, + { + "epoch": 0.6492348303784825, + "grad_norm": 0.3153400123119354, + "learning_rate": 1.651103774041913e-05, + "loss": 0.1081, + "step": 36400 + }, + { + "epoch": 0.6492526665001962, + "grad_norm": 0.25307878851890564, + "learning_rate": 1.6509573729275823e-05, + "loss": 0.1101, + "step": 36401 + }, + { + "epoch": 0.6492705026219099, + "grad_norm": 0.23737239837646484, + "learning_rate": 1.6508109751044334e-05, + "loss": 0.1311, + "step": 36402 + }, + { + "epoch": 0.6492883387436236, + "grad_norm": 0.3012915551662445, + "learning_rate": 1.6506645805730352e-05, + "loss": 0.1411, + "step": 36403 + }, + { + "epoch": 0.6493061748653373, + "grad_norm": 0.21931597590446472, + "learning_rate": 1.6505181893339548e-05, + "loss": 0.1417, + "step": 36404 + }, + { + "epoch": 0.649324010987051, + "grad_norm": 0.27210038900375366, + "learning_rate": 1.6503718013877583e-05, + "loss": 0.0862, + "step": 36405 + }, + { + "epoch": 0.6493418471087646, + "grad_norm": 0.23444069921970367, + "learning_rate": 1.6502254167350146e-05, + "loss": 0.1135, + "step": 36406 + }, + { + "epoch": 0.6493596832304783, + "grad_norm": 0.23188433051109314, + "learning_rate": 1.6500790353762903e-05, + "loss": 0.1388, + "step": 36407 + }, + { + "epoch": 0.649377519352192, + "grad_norm": 0.23815785348415375, + "learning_rate": 1.6499326573121538e-05, + "loss": 0.1237, + "step": 36408 + }, + { + "epoch": 0.6493953554739057, + "grad_norm": 0.24979661405086517, + "learning_rate": 1.649786282543172e-05, + "loss": 0.1382, + "step": 36409 + }, + { + "epoch": 0.6494131915956195, + "grad_norm": 0.21794262528419495, + "learning_rate": 1.6496399110699123e-05, + "loss": 0.1123, + "step": 36410 + }, + { + "epoch": 0.6494310277173332, + "grad_norm": 0.31610429286956787, + "learning_rate": 1.649493542892941e-05, + "loss": 0.0961, + "step": 36411 + }, + { + "epoch": 0.6494488638390469, + "grad_norm": 0.2783012092113495, + "learning_rate": 1.6493471780128272e-05, + "loss": 0.0985, + "step": 36412 + }, + { + "epoch": 0.6494666999607606, + "grad_norm": 0.35128551721572876, + "learning_rate": 1.6492008164301375e-05, + "loss": 0.1644, + "step": 36413 + }, + { + "epoch": 0.6494845360824743, + "grad_norm": 0.2892989218235016, + "learning_rate": 1.6490544581454394e-05, + "loss": 0.122, + "step": 36414 + }, + { + "epoch": 0.6495023722041879, + "grad_norm": 0.32591888308525085, + "learning_rate": 1.6489081031592996e-05, + "loss": 0.1734, + "step": 36415 + }, + { + "epoch": 0.6495202083259016, + "grad_norm": 0.2848375737667084, + "learning_rate": 1.6487617514722865e-05, + "loss": 0.1781, + "step": 36416 + }, + { + "epoch": 0.6495380444476153, + "grad_norm": 0.3675004541873932, + "learning_rate": 1.648615403084967e-05, + "loss": 0.0993, + "step": 36417 + }, + { + "epoch": 0.649555880569329, + "grad_norm": 0.2759046256542206, + "learning_rate": 1.6484690579979083e-05, + "loss": 0.0879, + "step": 36418 + }, + { + "epoch": 0.6495737166910427, + "grad_norm": 0.23948530852794647, + "learning_rate": 1.6483227162116765e-05, + "loss": 0.1219, + "step": 36419 + }, + { + "epoch": 0.6495915528127564, + "grad_norm": 0.240462526679039, + "learning_rate": 1.6481763777268404e-05, + "loss": 0.1548, + "step": 36420 + }, + { + "epoch": 0.6496093889344701, + "grad_norm": 0.255749374628067, + "learning_rate": 1.6480300425439678e-05, + "loss": 0.1049, + "step": 36421 + }, + { + "epoch": 0.6496272250561838, + "grad_norm": 0.30505040287971497, + "learning_rate": 1.6478837106636247e-05, + "loss": 0.1195, + "step": 36422 + }, + { + "epoch": 0.6496450611778974, + "grad_norm": 0.2784702777862549, + "learning_rate": 1.647737382086379e-05, + "loss": 0.0709, + "step": 36423 + }, + { + "epoch": 0.6496628972996111, + "grad_norm": 0.23502427339553833, + "learning_rate": 1.6475910568127962e-05, + "loss": 0.1001, + "step": 36424 + }, + { + "epoch": 0.6496807334213248, + "grad_norm": 0.341061532497406, + "learning_rate": 1.6474447348434458e-05, + "loss": 0.1271, + "step": 36425 + }, + { + "epoch": 0.6496985695430385, + "grad_norm": 0.29591915011405945, + "learning_rate": 1.647298416178894e-05, + "loss": 0.1814, + "step": 36426 + }, + { + "epoch": 0.6497164056647523, + "grad_norm": 0.26693111658096313, + "learning_rate": 1.6471521008197084e-05, + "loss": 0.11, + "step": 36427 + }, + { + "epoch": 0.649734241786466, + "grad_norm": 0.24729366600513458, + "learning_rate": 1.6470057887664544e-05, + "loss": 0.1333, + "step": 36428 + }, + { + "epoch": 0.6497520779081797, + "grad_norm": 0.4628967046737671, + "learning_rate": 1.6468594800197016e-05, + "loss": 0.1053, + "step": 36429 + }, + { + "epoch": 0.6497699140298934, + "grad_norm": 0.22024314105510712, + "learning_rate": 1.646713174580017e-05, + "loss": 0.1091, + "step": 36430 + }, + { + "epoch": 0.6497877501516071, + "grad_norm": 0.223907470703125, + "learning_rate": 1.646566872447966e-05, + "loss": 0.0935, + "step": 36431 + }, + { + "epoch": 0.6498055862733207, + "grad_norm": 0.2727627456188202, + "learning_rate": 1.6464205736241157e-05, + "loss": 0.128, + "step": 36432 + }, + { + "epoch": 0.6498234223950344, + "grad_norm": 0.3021184504032135, + "learning_rate": 1.6462742781090357e-05, + "loss": 0.1222, + "step": 36433 + }, + { + "epoch": 0.6498412585167481, + "grad_norm": 0.29952171444892883, + "learning_rate": 1.64612798590329e-05, + "loss": 0.1432, + "step": 36434 + }, + { + "epoch": 0.6498590946384618, + "grad_norm": 0.24076001346111298, + "learning_rate": 1.6459816970074483e-05, + "loss": 0.0952, + "step": 36435 + }, + { + "epoch": 0.6498769307601755, + "grad_norm": 0.2131306529045105, + "learning_rate": 1.6458354114220766e-05, + "loss": 0.1173, + "step": 36436 + }, + { + "epoch": 0.6498947668818892, + "grad_norm": 0.2728300094604492, + "learning_rate": 1.6456891291477407e-05, + "loss": 0.1367, + "step": 36437 + }, + { + "epoch": 0.6499126030036029, + "grad_norm": 0.1958208680152893, + "learning_rate": 1.6455428501850096e-05, + "loss": 0.1282, + "step": 36438 + }, + { + "epoch": 0.6499304391253166, + "grad_norm": 0.343657523393631, + "learning_rate": 1.6453965745344497e-05, + "loss": 0.1647, + "step": 36439 + }, + { + "epoch": 0.6499482752470302, + "grad_norm": 0.2814064919948578, + "learning_rate": 1.6452503021966272e-05, + "loss": 0.1054, + "step": 36440 + }, + { + "epoch": 0.6499661113687439, + "grad_norm": 0.2679210305213928, + "learning_rate": 1.6451040331721096e-05, + "loss": 0.101, + "step": 36441 + }, + { + "epoch": 0.6499839474904576, + "grad_norm": 0.23407524824142456, + "learning_rate": 1.6449577674614644e-05, + "loss": 0.0737, + "step": 36442 + }, + { + "epoch": 0.6500017836121714, + "grad_norm": 0.22013986110687256, + "learning_rate": 1.6448115050652584e-05, + "loss": 0.1094, + "step": 36443 + }, + { + "epoch": 0.6500196197338851, + "grad_norm": 0.2233494222164154, + "learning_rate": 1.644665245984059e-05, + "loss": 0.12, + "step": 36444 + }, + { + "epoch": 0.6500374558555988, + "grad_norm": 0.29774734377861023, + "learning_rate": 1.644518990218431e-05, + "loss": 0.135, + "step": 36445 + }, + { + "epoch": 0.6500552919773125, + "grad_norm": 0.2491357922554016, + "learning_rate": 1.6443727377689437e-05, + "loss": 0.1274, + "step": 36446 + }, + { + "epoch": 0.6500731280990262, + "grad_norm": 0.26864978671073914, + "learning_rate": 1.644226488636163e-05, + "loss": 0.1243, + "step": 36447 + }, + { + "epoch": 0.6500909642207399, + "grad_norm": 0.3040514588356018, + "learning_rate": 1.6440802428206565e-05, + "loss": 0.1074, + "step": 36448 + }, + { + "epoch": 0.6501088003424536, + "grad_norm": 0.301169753074646, + "learning_rate": 1.643934000322991e-05, + "loss": 0.0944, + "step": 36449 + }, + { + "epoch": 0.6501266364641672, + "grad_norm": 0.37722280621528625, + "learning_rate": 1.6437877611437314e-05, + "loss": 0.2293, + "step": 36450 + }, + { + "epoch": 0.6501444725858809, + "grad_norm": 0.26749464869499207, + "learning_rate": 1.643641525283447e-05, + "loss": 0.1476, + "step": 36451 + }, + { + "epoch": 0.6501623087075946, + "grad_norm": 0.25366055965423584, + "learning_rate": 1.6434952927427043e-05, + "loss": 0.1162, + "step": 36452 + }, + { + "epoch": 0.6501801448293083, + "grad_norm": 0.2756160497665405, + "learning_rate": 1.6433490635220687e-05, + "loss": 0.0932, + "step": 36453 + }, + { + "epoch": 0.650197980951022, + "grad_norm": 0.23393933475017548, + "learning_rate": 1.6432028376221086e-05, + "loss": 0.1422, + "step": 36454 + }, + { + "epoch": 0.6502158170727357, + "grad_norm": 0.3221278488636017, + "learning_rate": 1.64305661504339e-05, + "loss": 0.1636, + "step": 36455 + }, + { + "epoch": 0.6502336531944494, + "grad_norm": 0.3012202978134155, + "learning_rate": 1.6429103957864806e-05, + "loss": 0.1538, + "step": 36456 + }, + { + "epoch": 0.6502514893161631, + "grad_norm": 0.2315862476825714, + "learning_rate": 1.642764179851946e-05, + "loss": 0.1317, + "step": 36457 + }, + { + "epoch": 0.6502693254378767, + "grad_norm": 0.409334272146225, + "learning_rate": 1.642617967240353e-05, + "loss": 0.1573, + "step": 36458 + }, + { + "epoch": 0.6502871615595904, + "grad_norm": 0.31774845719337463, + "learning_rate": 1.6424717579522696e-05, + "loss": 0.1466, + "step": 36459 + }, + { + "epoch": 0.6503049976813042, + "grad_norm": 0.2690187990665436, + "learning_rate": 1.6423255519882622e-05, + "loss": 0.1109, + "step": 36460 + }, + { + "epoch": 0.6503228338030179, + "grad_norm": 0.30237069725990295, + "learning_rate": 1.6421793493488964e-05, + "loss": 0.1759, + "step": 36461 + }, + { + "epoch": 0.6503406699247316, + "grad_norm": 0.2925209403038025, + "learning_rate": 1.64203315003474e-05, + "loss": 0.1144, + "step": 36462 + }, + { + "epoch": 0.6503585060464453, + "grad_norm": 0.2588273882865906, + "learning_rate": 1.641886954046359e-05, + "loss": 0.1746, + "step": 36463 + }, + { + "epoch": 0.650376342168159, + "grad_norm": 0.30670827627182007, + "learning_rate": 1.6417407613843214e-05, + "loss": 0.1273, + "step": 36464 + }, + { + "epoch": 0.6503941782898727, + "grad_norm": 0.29957830905914307, + "learning_rate": 1.641594572049193e-05, + "loss": 0.1078, + "step": 36465 + }, + { + "epoch": 0.6504120144115864, + "grad_norm": 0.22714495658874512, + "learning_rate": 1.6414483860415404e-05, + "loss": 0.1129, + "step": 36466 + }, + { + "epoch": 0.6504298505333, + "grad_norm": 0.26168763637542725, + "learning_rate": 1.6413022033619294e-05, + "loss": 0.1372, + "step": 36467 + }, + { + "epoch": 0.6504476866550137, + "grad_norm": 0.2847360670566559, + "learning_rate": 1.6411560240109285e-05, + "loss": 0.0949, + "step": 36468 + }, + { + "epoch": 0.6504655227767274, + "grad_norm": 0.2242434173822403, + "learning_rate": 1.6410098479891035e-05, + "loss": 0.1013, + "step": 36469 + }, + { + "epoch": 0.6504833588984411, + "grad_norm": 0.21255548298358917, + "learning_rate": 1.6408636752970212e-05, + "loss": 0.1244, + "step": 36470 + }, + { + "epoch": 0.6505011950201548, + "grad_norm": 0.27606844902038574, + "learning_rate": 1.640717505935247e-05, + "loss": 0.1584, + "step": 36471 + }, + { + "epoch": 0.6505190311418685, + "grad_norm": 0.21023155748844147, + "learning_rate": 1.6405713399043493e-05, + "loss": 0.1066, + "step": 36472 + }, + { + "epoch": 0.6505368672635822, + "grad_norm": 0.3528434932231903, + "learning_rate": 1.6404251772048946e-05, + "loss": 0.1644, + "step": 36473 + }, + { + "epoch": 0.6505547033852959, + "grad_norm": 0.27668148279190063, + "learning_rate": 1.6402790178374484e-05, + "loss": 0.1425, + "step": 36474 + }, + { + "epoch": 0.6505725395070096, + "grad_norm": 0.26946985721588135, + "learning_rate": 1.6401328618025767e-05, + "loss": 0.0557, + "step": 36475 + }, + { + "epoch": 0.6505903756287232, + "grad_norm": 0.2576809227466583, + "learning_rate": 1.639986709100847e-05, + "loss": 0.0878, + "step": 36476 + }, + { + "epoch": 0.650608211750437, + "grad_norm": 0.285159170627594, + "learning_rate": 1.639840559732827e-05, + "loss": 0.0972, + "step": 36477 + }, + { + "epoch": 0.6506260478721507, + "grad_norm": 0.25818583369255066, + "learning_rate": 1.6396944136990817e-05, + "loss": 0.1072, + "step": 36478 + }, + { + "epoch": 0.6506438839938644, + "grad_norm": 0.24948260188102722, + "learning_rate": 1.6395482710001787e-05, + "loss": 0.1252, + "step": 36479 + }, + { + "epoch": 0.6506617201155781, + "grad_norm": 0.2777738869190216, + "learning_rate": 1.639402131636682e-05, + "loss": 0.115, + "step": 36480 + }, + { + "epoch": 0.6506795562372918, + "grad_norm": 0.288896769285202, + "learning_rate": 1.6392559956091615e-05, + "loss": 0.1548, + "step": 36481 + }, + { + "epoch": 0.6506973923590055, + "grad_norm": 0.23145286738872528, + "learning_rate": 1.639109862918181e-05, + "loss": 0.1228, + "step": 36482 + }, + { + "epoch": 0.6507152284807192, + "grad_norm": 0.2289765328168869, + "learning_rate": 1.6389637335643088e-05, + "loss": 0.143, + "step": 36483 + }, + { + "epoch": 0.6507330646024329, + "grad_norm": 0.31107097864151, + "learning_rate": 1.6388176075481092e-05, + "loss": 0.1567, + "step": 36484 + }, + { + "epoch": 0.6507509007241465, + "grad_norm": 0.29385697841644287, + "learning_rate": 1.6386714848701513e-05, + "loss": 0.1277, + "step": 36485 + }, + { + "epoch": 0.6507687368458602, + "grad_norm": 0.32878878712654114, + "learning_rate": 1.6385253655310003e-05, + "loss": 0.1413, + "step": 36486 + }, + { + "epoch": 0.6507865729675739, + "grad_norm": 0.30792805552482605, + "learning_rate": 1.638379249531222e-05, + "loss": 0.1025, + "step": 36487 + }, + { + "epoch": 0.6508044090892876, + "grad_norm": 0.3447185754776001, + "learning_rate": 1.6382331368713834e-05, + "loss": 0.103, + "step": 36488 + }, + { + "epoch": 0.6508222452110013, + "grad_norm": 0.26704999804496765, + "learning_rate": 1.638087027552051e-05, + "loss": 0.1207, + "step": 36489 + }, + { + "epoch": 0.650840081332715, + "grad_norm": 0.23830831050872803, + "learning_rate": 1.6379409215737905e-05, + "loss": 0.0776, + "step": 36490 + }, + { + "epoch": 0.6508579174544287, + "grad_norm": 0.22711147367954254, + "learning_rate": 1.6377948189371694e-05, + "loss": 0.1273, + "step": 36491 + }, + { + "epoch": 0.6508757535761424, + "grad_norm": 0.2503523528575897, + "learning_rate": 1.6376487196427533e-05, + "loss": 0.0811, + "step": 36492 + }, + { + "epoch": 0.650893589697856, + "grad_norm": 0.2805105745792389, + "learning_rate": 1.637502623691108e-05, + "loss": 0.0857, + "step": 36493 + }, + { + "epoch": 0.6509114258195698, + "grad_norm": 0.296419620513916, + "learning_rate": 1.637356531082801e-05, + "loss": 0.1356, + "step": 36494 + }, + { + "epoch": 0.6509292619412835, + "grad_norm": 0.376533180475235, + "learning_rate": 1.637210441818398e-05, + "loss": 0.1345, + "step": 36495 + }, + { + "epoch": 0.6509470980629972, + "grad_norm": 0.3167206048965454, + "learning_rate": 1.6370643558984648e-05, + "loss": 0.1859, + "step": 36496 + }, + { + "epoch": 0.6509649341847109, + "grad_norm": 0.24843305349349976, + "learning_rate": 1.6369182733235677e-05, + "loss": 0.126, + "step": 36497 + }, + { + "epoch": 0.6509827703064246, + "grad_norm": 0.24652425944805145, + "learning_rate": 1.6367721940942748e-05, + "loss": 0.1316, + "step": 36498 + }, + { + "epoch": 0.6510006064281383, + "grad_norm": 0.3290373980998993, + "learning_rate": 1.6366261182111507e-05, + "loss": 0.1064, + "step": 36499 + }, + { + "epoch": 0.651018442549852, + "grad_norm": 0.3015643060207367, + "learning_rate": 1.6364800456747622e-05, + "loss": 0.1616, + "step": 36500 + }, + { + "epoch": 0.6510362786715657, + "grad_norm": 0.32982712984085083, + "learning_rate": 1.6363339764856745e-05, + "loss": 0.201, + "step": 36501 + }, + { + "epoch": 0.6510541147932793, + "grad_norm": 0.26367032527923584, + "learning_rate": 1.636187910644455e-05, + "loss": 0.1098, + "step": 36502 + }, + { + "epoch": 0.651071950914993, + "grad_norm": 0.1986067146062851, + "learning_rate": 1.636041848151669e-05, + "loss": 0.0999, + "step": 36503 + }, + { + "epoch": 0.6510897870367067, + "grad_norm": 0.28099775314331055, + "learning_rate": 1.6358957890078846e-05, + "loss": 0.0872, + "step": 36504 + }, + { + "epoch": 0.6511076231584204, + "grad_norm": 0.2609463036060333, + "learning_rate": 1.635749733213666e-05, + "loss": 0.1042, + "step": 36505 + }, + { + "epoch": 0.6511254592801341, + "grad_norm": 0.3888946771621704, + "learning_rate": 1.6356036807695788e-05, + "loss": 0.1265, + "step": 36506 + }, + { + "epoch": 0.6511432954018478, + "grad_norm": 0.4863142967224121, + "learning_rate": 1.6354576316761915e-05, + "loss": 0.1389, + "step": 36507 + }, + { + "epoch": 0.6511611315235615, + "grad_norm": 0.23898905515670776, + "learning_rate": 1.635311585934069e-05, + "loss": 0.1282, + "step": 36508 + }, + { + "epoch": 0.6511789676452752, + "grad_norm": 0.2688154876232147, + "learning_rate": 1.635165543543777e-05, + "loss": 0.127, + "step": 36509 + }, + { + "epoch": 0.6511968037669889, + "grad_norm": 0.3582136332988739, + "learning_rate": 1.6350195045058817e-05, + "loss": 0.1177, + "step": 36510 + }, + { + "epoch": 0.6512146398887027, + "grad_norm": 0.30040034651756287, + "learning_rate": 1.63487346882095e-05, + "loss": 0.1222, + "step": 36511 + }, + { + "epoch": 0.6512324760104163, + "grad_norm": 0.27914533019065857, + "learning_rate": 1.6347274364895476e-05, + "loss": 0.1124, + "step": 36512 + }, + { + "epoch": 0.65125031213213, + "grad_norm": 0.3377338647842407, + "learning_rate": 1.6345814075122407e-05, + "loss": 0.1273, + "step": 36513 + }, + { + "epoch": 0.6512681482538437, + "grad_norm": 0.27192583680152893, + "learning_rate": 1.634435381889594e-05, + "loss": 0.1565, + "step": 36514 + }, + { + "epoch": 0.6512859843755574, + "grad_norm": 0.30426520109176636, + "learning_rate": 1.634289359622176e-05, + "loss": 0.1439, + "step": 36515 + }, + { + "epoch": 0.6513038204972711, + "grad_norm": 0.24232463538646698, + "learning_rate": 1.6341433407105518e-05, + "loss": 0.1534, + "step": 36516 + }, + { + "epoch": 0.6513216566189848, + "grad_norm": 0.26685845851898193, + "learning_rate": 1.633997325155286e-05, + "loss": 0.1123, + "step": 36517 + }, + { + "epoch": 0.6513394927406985, + "grad_norm": 0.21063801646232605, + "learning_rate": 1.6338513129569463e-05, + "loss": 0.1038, + "step": 36518 + }, + { + "epoch": 0.6513573288624122, + "grad_norm": 0.3503827154636383, + "learning_rate": 1.633705304116097e-05, + "loss": 0.1289, + "step": 36519 + }, + { + "epoch": 0.6513751649841258, + "grad_norm": 0.2744157016277313, + "learning_rate": 1.6335592986333063e-05, + "loss": 0.148, + "step": 36520 + }, + { + "epoch": 0.6513930011058395, + "grad_norm": 0.2273973673582077, + "learning_rate": 1.633413296509139e-05, + "loss": 0.1122, + "step": 36521 + }, + { + "epoch": 0.6514108372275532, + "grad_norm": 0.32366955280303955, + "learning_rate": 1.6332672977441608e-05, + "loss": 0.1019, + "step": 36522 + }, + { + "epoch": 0.6514286733492669, + "grad_norm": 0.4621117413043976, + "learning_rate": 1.6331213023389374e-05, + "loss": 0.1288, + "step": 36523 + }, + { + "epoch": 0.6514465094709806, + "grad_norm": 0.2193380892276764, + "learning_rate": 1.6329753102940353e-05, + "loss": 0.1161, + "step": 36524 + }, + { + "epoch": 0.6514643455926943, + "grad_norm": 0.3388438820838928, + "learning_rate": 1.6328293216100214e-05, + "loss": 0.1447, + "step": 36525 + }, + { + "epoch": 0.651482181714408, + "grad_norm": 0.24322980642318726, + "learning_rate": 1.63268333628746e-05, + "loss": 0.1169, + "step": 36526 + }, + { + "epoch": 0.6515000178361217, + "grad_norm": 0.3384764790534973, + "learning_rate": 1.632537354326917e-05, + "loss": 0.1764, + "step": 36527 + }, + { + "epoch": 0.6515178539578355, + "grad_norm": 0.3127395212650299, + "learning_rate": 1.6323913757289593e-05, + "loss": 0.118, + "step": 36528 + }, + { + "epoch": 0.6515356900795491, + "grad_norm": 0.22918832302093506, + "learning_rate": 1.632245400494153e-05, + "loss": 0.0829, + "step": 36529 + }, + { + "epoch": 0.6515535262012628, + "grad_norm": 0.19899854063987732, + "learning_rate": 1.632099428623062e-05, + "loss": 0.0791, + "step": 36530 + }, + { + "epoch": 0.6515713623229765, + "grad_norm": 0.2879485785961151, + "learning_rate": 1.6319534601162544e-05, + "loss": 0.1024, + "step": 36531 + }, + { + "epoch": 0.6515891984446902, + "grad_norm": 0.22283729910850525, + "learning_rate": 1.6318074949742934e-05, + "loss": 0.0893, + "step": 36532 + }, + { + "epoch": 0.6516070345664039, + "grad_norm": 0.24246029555797577, + "learning_rate": 1.631661533197748e-05, + "loss": 0.1398, + "step": 36533 + }, + { + "epoch": 0.6516248706881176, + "grad_norm": 0.35749685764312744, + "learning_rate": 1.6315155747871823e-05, + "loss": 0.1366, + "step": 36534 + }, + { + "epoch": 0.6516427068098313, + "grad_norm": 0.3563341796398163, + "learning_rate": 1.631369619743162e-05, + "loss": 0.1227, + "step": 36535 + }, + { + "epoch": 0.651660542931545, + "grad_norm": 0.208069309592247, + "learning_rate": 1.6312236680662525e-05, + "loss": 0.1073, + "step": 36536 + }, + { + "epoch": 0.6516783790532586, + "grad_norm": 0.25359785556793213, + "learning_rate": 1.6310777197570205e-05, + "loss": 0.1136, + "step": 36537 + }, + { + "epoch": 0.6516962151749723, + "grad_norm": 0.273954838514328, + "learning_rate": 1.6309317748160312e-05, + "loss": 0.1347, + "step": 36538 + }, + { + "epoch": 0.651714051296686, + "grad_norm": 0.3235228955745697, + "learning_rate": 1.6307858332438513e-05, + "loss": 0.1697, + "step": 36539 + }, + { + "epoch": 0.6517318874183997, + "grad_norm": 0.22341467440128326, + "learning_rate": 1.6306398950410443e-05, + "loss": 0.0636, + "step": 36540 + }, + { + "epoch": 0.6517497235401134, + "grad_norm": 0.24738852679729462, + "learning_rate": 1.6304939602081783e-05, + "loss": 0.1257, + "step": 36541 + }, + { + "epoch": 0.6517675596618271, + "grad_norm": 0.6136704683303833, + "learning_rate": 1.6303480287458184e-05, + "loss": 0.1652, + "step": 36542 + }, + { + "epoch": 0.6517853957835408, + "grad_norm": 0.2681564390659332, + "learning_rate": 1.6302021006545296e-05, + "loss": 0.1095, + "step": 36543 + }, + { + "epoch": 0.6518032319052546, + "grad_norm": 0.30172199010849, + "learning_rate": 1.6300561759348775e-05, + "loss": 0.1739, + "step": 36544 + }, + { + "epoch": 0.6518210680269683, + "grad_norm": 0.22530655562877655, + "learning_rate": 1.629910254587428e-05, + "loss": 0.1269, + "step": 36545 + }, + { + "epoch": 0.651838904148682, + "grad_norm": 0.18717870116233826, + "learning_rate": 1.629764336612747e-05, + "loss": 0.0847, + "step": 36546 + }, + { + "epoch": 0.6518567402703956, + "grad_norm": 0.3527531623840332, + "learning_rate": 1.6296184220114005e-05, + "loss": 0.1493, + "step": 36547 + }, + { + "epoch": 0.6518745763921093, + "grad_norm": 0.26330482959747314, + "learning_rate": 1.6294725107839536e-05, + "loss": 0.119, + "step": 36548 + }, + { + "epoch": 0.651892412513823, + "grad_norm": 0.26929330825805664, + "learning_rate": 1.6293266029309707e-05, + "loss": 0.0991, + "step": 36549 + }, + { + "epoch": 0.6519102486355367, + "grad_norm": 0.28058624267578125, + "learning_rate": 1.62918069845302e-05, + "loss": 0.1234, + "step": 36550 + }, + { + "epoch": 0.6519280847572504, + "grad_norm": 0.2653823792934418, + "learning_rate": 1.6290347973506656e-05, + "loss": 0.0905, + "step": 36551 + }, + { + "epoch": 0.6519459208789641, + "grad_norm": 0.24755524098873138, + "learning_rate": 1.6288888996244726e-05, + "loss": 0.1466, + "step": 36552 + }, + { + "epoch": 0.6519637570006778, + "grad_norm": 0.23279769718647003, + "learning_rate": 1.6287430052750062e-05, + "loss": 0.1251, + "step": 36553 + }, + { + "epoch": 0.6519815931223915, + "grad_norm": 0.2427375167608261, + "learning_rate": 1.6285971143028344e-05, + "loss": 0.066, + "step": 36554 + }, + { + "epoch": 0.6519994292441051, + "grad_norm": 0.27041172981262207, + "learning_rate": 1.6284512267085212e-05, + "loss": 0.1446, + "step": 36555 + }, + { + "epoch": 0.6520172653658188, + "grad_norm": 0.22511109709739685, + "learning_rate": 1.6283053424926314e-05, + "loss": 0.1106, + "step": 36556 + }, + { + "epoch": 0.6520351014875325, + "grad_norm": 0.27532774209976196, + "learning_rate": 1.6281594616557312e-05, + "loss": 0.1333, + "step": 36557 + }, + { + "epoch": 0.6520529376092462, + "grad_norm": 0.2043699026107788, + "learning_rate": 1.628013584198386e-05, + "loss": 0.0577, + "step": 36558 + }, + { + "epoch": 0.6520707737309599, + "grad_norm": 0.3530641794204712, + "learning_rate": 1.6278677101211615e-05, + "loss": 0.1114, + "step": 36559 + }, + { + "epoch": 0.6520886098526736, + "grad_norm": 0.22718781232833862, + "learning_rate": 1.627721839424623e-05, + "loss": 0.099, + "step": 36560 + }, + { + "epoch": 0.6521064459743874, + "grad_norm": 0.2519572079181671, + "learning_rate": 1.627575972109336e-05, + "loss": 0.1555, + "step": 36561 + }, + { + "epoch": 0.6521242820961011, + "grad_norm": 0.2823973298072815, + "learning_rate": 1.6274301081758653e-05, + "loss": 0.1236, + "step": 36562 + }, + { + "epoch": 0.6521421182178148, + "grad_norm": 0.25831878185272217, + "learning_rate": 1.6272842476247774e-05, + "loss": 0.1324, + "step": 36563 + }, + { + "epoch": 0.6521599543395284, + "grad_norm": 0.2902027666568756, + "learning_rate": 1.6271383904566374e-05, + "loss": 0.0829, + "step": 36564 + }, + { + "epoch": 0.6521777904612421, + "grad_norm": 0.2948017120361328, + "learning_rate": 1.6269925366720102e-05, + "loss": 0.1481, + "step": 36565 + }, + { + "epoch": 0.6521956265829558, + "grad_norm": 0.2625763416290283, + "learning_rate": 1.626846686271461e-05, + "loss": 0.0787, + "step": 36566 + }, + { + "epoch": 0.6522134627046695, + "grad_norm": 0.3372184932231903, + "learning_rate": 1.6267008392555556e-05, + "loss": 0.1731, + "step": 36567 + }, + { + "epoch": 0.6522312988263832, + "grad_norm": 0.3237401247024536, + "learning_rate": 1.6265549956248606e-05, + "loss": 0.1827, + "step": 36568 + }, + { + "epoch": 0.6522491349480969, + "grad_norm": 0.2846832871437073, + "learning_rate": 1.626409155379939e-05, + "loss": 0.1373, + "step": 36569 + }, + { + "epoch": 0.6522669710698106, + "grad_norm": 0.30843910574913025, + "learning_rate": 1.6262633185213582e-05, + "loss": 0.12, + "step": 36570 + }, + { + "epoch": 0.6522848071915243, + "grad_norm": 0.24406075477600098, + "learning_rate": 1.6261174850496812e-05, + "loss": 0.0752, + "step": 36571 + }, + { + "epoch": 0.652302643313238, + "grad_norm": 0.2792684733867645, + "learning_rate": 1.6259716549654757e-05, + "loss": 0.1378, + "step": 36572 + }, + { + "epoch": 0.6523204794349516, + "grad_norm": 0.240322083234787, + "learning_rate": 1.6258258282693053e-05, + "loss": 0.1248, + "step": 36573 + }, + { + "epoch": 0.6523383155566653, + "grad_norm": 0.29805871844291687, + "learning_rate": 1.6256800049617367e-05, + "loss": 0.151, + "step": 36574 + }, + { + "epoch": 0.652356151678379, + "grad_norm": 0.19931620359420776, + "learning_rate": 1.625534185043333e-05, + "loss": 0.1365, + "step": 36575 + }, + { + "epoch": 0.6523739878000927, + "grad_norm": 0.24127689003944397, + "learning_rate": 1.6253883685146622e-05, + "loss": 0.1463, + "step": 36576 + }, + { + "epoch": 0.6523918239218064, + "grad_norm": 0.29494351148605347, + "learning_rate": 1.625242555376288e-05, + "loss": 0.1675, + "step": 36577 + }, + { + "epoch": 0.6524096600435202, + "grad_norm": 0.296726256608963, + "learning_rate": 1.625096745628776e-05, + "loss": 0.0924, + "step": 36578 + }, + { + "epoch": 0.6524274961652339, + "grad_norm": 0.2584410011768341, + "learning_rate": 1.6249509392726896e-05, + "loss": 0.1329, + "step": 36579 + }, + { + "epoch": 0.6524453322869476, + "grad_norm": 0.30448561906814575, + "learning_rate": 1.6248051363085966e-05, + "loss": 0.1142, + "step": 36580 + }, + { + "epoch": 0.6524631684086613, + "grad_norm": 0.29676708579063416, + "learning_rate": 1.6246593367370616e-05, + "loss": 0.081, + "step": 36581 + }, + { + "epoch": 0.6524810045303749, + "grad_norm": 0.28955918550491333, + "learning_rate": 1.624513540558649e-05, + "loss": 0.1582, + "step": 36582 + }, + { + "epoch": 0.6524988406520886, + "grad_norm": 0.3830130398273468, + "learning_rate": 1.624367747773925e-05, + "loss": 0.1056, + "step": 36583 + }, + { + "epoch": 0.6525166767738023, + "grad_norm": 0.2280164361000061, + "learning_rate": 1.624221958383453e-05, + "loss": 0.0884, + "step": 36584 + }, + { + "epoch": 0.652534512895516, + "grad_norm": 0.16443973779678345, + "learning_rate": 1.6240761723877994e-05, + "loss": 0.0975, + "step": 36585 + }, + { + "epoch": 0.6525523490172297, + "grad_norm": 0.2433801144361496, + "learning_rate": 1.623930389787529e-05, + "loss": 0.1158, + "step": 36586 + }, + { + "epoch": 0.6525701851389434, + "grad_norm": 0.33221641182899475, + "learning_rate": 1.623784610583208e-05, + "loss": 0.1469, + "step": 36587 + }, + { + "epoch": 0.6525880212606571, + "grad_norm": 0.6027882695198059, + "learning_rate": 1.6236388347753984e-05, + "loss": 0.1325, + "step": 36588 + }, + { + "epoch": 0.6526058573823708, + "grad_norm": 0.23628756403923035, + "learning_rate": 1.623493062364669e-05, + "loss": 0.165, + "step": 36589 + }, + { + "epoch": 0.6526236935040844, + "grad_norm": 0.29483330249786377, + "learning_rate": 1.623347293351583e-05, + "loss": 0.0888, + "step": 36590 + }, + { + "epoch": 0.6526415296257981, + "grad_norm": 0.2532746493816376, + "learning_rate": 1.6232015277367062e-05, + "loss": 0.1564, + "step": 36591 + }, + { + "epoch": 0.6526593657475118, + "grad_norm": 0.3249821066856384, + "learning_rate": 1.6230557655206015e-05, + "loss": 0.1461, + "step": 36592 + }, + { + "epoch": 0.6526772018692255, + "grad_norm": 0.2694990336894989, + "learning_rate": 1.6229100067038372e-05, + "loss": 0.1127, + "step": 36593 + }, + { + "epoch": 0.6526950379909392, + "grad_norm": 0.32716861367225647, + "learning_rate": 1.6227642512869757e-05, + "loss": 0.151, + "step": 36594 + }, + { + "epoch": 0.652712874112653, + "grad_norm": 0.29200080037117004, + "learning_rate": 1.6226184992705833e-05, + "loss": 0.1053, + "step": 36595 + }, + { + "epoch": 0.6527307102343667, + "grad_norm": 0.32692429423332214, + "learning_rate": 1.622472750655225e-05, + "loss": 0.1206, + "step": 36596 + }, + { + "epoch": 0.6527485463560804, + "grad_norm": 0.258450984954834, + "learning_rate": 1.6223270054414642e-05, + "loss": 0.1286, + "step": 36597 + }, + { + "epoch": 0.6527663824777941, + "grad_norm": 0.3511247932910919, + "learning_rate": 1.622181263629868e-05, + "loss": 0.1628, + "step": 36598 + }, + { + "epoch": 0.6527842185995077, + "grad_norm": 0.25084805488586426, + "learning_rate": 1.6220355252210008e-05, + "loss": 0.0885, + "step": 36599 + }, + { + "epoch": 0.6528020547212214, + "grad_norm": 0.2578677535057068, + "learning_rate": 1.6218897902154263e-05, + "loss": 0.1078, + "step": 36600 + }, + { + "epoch": 0.6528198908429351, + "grad_norm": 0.39849749207496643, + "learning_rate": 1.6217440586137097e-05, + "loss": 0.2284, + "step": 36601 + }, + { + "epoch": 0.6528377269646488, + "grad_norm": 0.1891050636768341, + "learning_rate": 1.6215983304164177e-05, + "loss": 0.0667, + "step": 36602 + }, + { + "epoch": 0.6528555630863625, + "grad_norm": 0.260824054479599, + "learning_rate": 1.6214526056241138e-05, + "loss": 0.1826, + "step": 36603 + }, + { + "epoch": 0.6528733992080762, + "grad_norm": 0.3125922977924347, + "learning_rate": 1.621306884237363e-05, + "loss": 0.1529, + "step": 36604 + }, + { + "epoch": 0.6528912353297899, + "grad_norm": 0.28257906436920166, + "learning_rate": 1.6211611662567294e-05, + "loss": 0.1198, + "step": 36605 + }, + { + "epoch": 0.6529090714515036, + "grad_norm": 0.3025089204311371, + "learning_rate": 1.62101545168278e-05, + "loss": 0.089, + "step": 36606 + }, + { + "epoch": 0.6529269075732173, + "grad_norm": 0.25927066802978516, + "learning_rate": 1.620869740516077e-05, + "loss": 0.1249, + "step": 36607 + }, + { + "epoch": 0.6529447436949309, + "grad_norm": 0.2381625920534134, + "learning_rate": 1.6207240327571875e-05, + "loss": 0.1176, + "step": 36608 + }, + { + "epoch": 0.6529625798166446, + "grad_norm": 0.29237988591194153, + "learning_rate": 1.620578328406675e-05, + "loss": 0.1321, + "step": 36609 + }, + { + "epoch": 0.6529804159383583, + "grad_norm": 0.3511929512023926, + "learning_rate": 1.6204326274651037e-05, + "loss": 0.1419, + "step": 36610 + }, + { + "epoch": 0.652998252060072, + "grad_norm": 0.23130159080028534, + "learning_rate": 1.6202869299330407e-05, + "loss": 0.101, + "step": 36611 + }, + { + "epoch": 0.6530160881817858, + "grad_norm": 0.3628959655761719, + "learning_rate": 1.6201412358110493e-05, + "loss": 0.1716, + "step": 36612 + }, + { + "epoch": 0.6530339243034995, + "grad_norm": 0.286790132522583, + "learning_rate": 1.619995545099694e-05, + "loss": 0.1554, + "step": 36613 + }, + { + "epoch": 0.6530517604252132, + "grad_norm": 0.33376434445381165, + "learning_rate": 1.6198498577995397e-05, + "loss": 0.1423, + "step": 36614 + }, + { + "epoch": 0.6530695965469269, + "grad_norm": 0.20308566093444824, + "learning_rate": 1.6197041739111514e-05, + "loss": 0.1381, + "step": 36615 + }, + { + "epoch": 0.6530874326686406, + "grad_norm": 0.21876616775989532, + "learning_rate": 1.619558493435094e-05, + "loss": 0.1319, + "step": 36616 + }, + { + "epoch": 0.6531052687903542, + "grad_norm": 0.2928486764431, + "learning_rate": 1.619412816371932e-05, + "loss": 0.132, + "step": 36617 + }, + { + "epoch": 0.6531231049120679, + "grad_norm": 0.3087984621524811, + "learning_rate": 1.6192671427222296e-05, + "loss": 0.1788, + "step": 36618 + }, + { + "epoch": 0.6531409410337816, + "grad_norm": 0.23868723213672638, + "learning_rate": 1.619121472486552e-05, + "loss": 0.1063, + "step": 36619 + }, + { + "epoch": 0.6531587771554953, + "grad_norm": 0.25535544753074646, + "learning_rate": 1.6189758056654646e-05, + "loss": 0.1242, + "step": 36620 + }, + { + "epoch": 0.653176613277209, + "grad_norm": 0.26487067341804504, + "learning_rate": 1.618830142259531e-05, + "loss": 0.1234, + "step": 36621 + }, + { + "epoch": 0.6531944493989227, + "grad_norm": 0.20190942287445068, + "learning_rate": 1.618684482269316e-05, + "loss": 0.1064, + "step": 36622 + }, + { + "epoch": 0.6532122855206364, + "grad_norm": 0.29771608114242554, + "learning_rate": 1.6185388256953834e-05, + "loss": 0.1384, + "step": 36623 + }, + { + "epoch": 0.6532301216423501, + "grad_norm": 0.2840154767036438, + "learning_rate": 1.6183931725383e-05, + "loss": 0.1603, + "step": 36624 + }, + { + "epoch": 0.6532479577640637, + "grad_norm": 0.31259262561798096, + "learning_rate": 1.618247522798629e-05, + "loss": 0.1127, + "step": 36625 + }, + { + "epoch": 0.6532657938857774, + "grad_norm": 0.2954271733760834, + "learning_rate": 1.618101876476935e-05, + "loss": 0.0857, + "step": 36626 + }, + { + "epoch": 0.6532836300074911, + "grad_norm": 0.2797059118747711, + "learning_rate": 1.6179562335737822e-05, + "loss": 0.1001, + "step": 36627 + }, + { + "epoch": 0.6533014661292048, + "grad_norm": 0.20581994950771332, + "learning_rate": 1.6178105940897365e-05, + "loss": 0.109, + "step": 36628 + }, + { + "epoch": 0.6533193022509186, + "grad_norm": 0.24776767194271088, + "learning_rate": 1.6176649580253613e-05, + "loss": 0.1214, + "step": 36629 + }, + { + "epoch": 0.6533371383726323, + "grad_norm": 0.29020121693611145, + "learning_rate": 1.6175193253812215e-05, + "loss": 0.098, + "step": 36630 + }, + { + "epoch": 0.653354974494346, + "grad_norm": 0.21446137130260468, + "learning_rate": 1.6173736961578804e-05, + "loss": 0.1118, + "step": 36631 + }, + { + "epoch": 0.6533728106160597, + "grad_norm": 0.3136409819126129, + "learning_rate": 1.6172280703559057e-05, + "loss": 0.1483, + "step": 36632 + }, + { + "epoch": 0.6533906467377734, + "grad_norm": 0.3030506670475006, + "learning_rate": 1.617082447975859e-05, + "loss": 0.14, + "step": 36633 + }, + { + "epoch": 0.653408482859487, + "grad_norm": 0.2149314284324646, + "learning_rate": 1.6169368290183056e-05, + "loss": 0.1223, + "step": 36634 + }, + { + "epoch": 0.6534263189812007, + "grad_norm": 0.29998061060905457, + "learning_rate": 1.6167912134838102e-05, + "loss": 0.1749, + "step": 36635 + }, + { + "epoch": 0.6534441551029144, + "grad_norm": 0.24123790860176086, + "learning_rate": 1.6166456013729365e-05, + "loss": 0.1209, + "step": 36636 + }, + { + "epoch": 0.6534619912246281, + "grad_norm": 0.3291873335838318, + "learning_rate": 1.6164999926862505e-05, + "loss": 0.1397, + "step": 36637 + }, + { + "epoch": 0.6534798273463418, + "grad_norm": 0.2772262990474701, + "learning_rate": 1.6163543874243155e-05, + "loss": 0.1123, + "step": 36638 + }, + { + "epoch": 0.6534976634680555, + "grad_norm": 0.2379869520664215, + "learning_rate": 1.616208785587696e-05, + "loss": 0.0894, + "step": 36639 + }, + { + "epoch": 0.6535154995897692, + "grad_norm": 0.21421286463737488, + "learning_rate": 1.616063187176956e-05, + "loss": 0.1177, + "step": 36640 + }, + { + "epoch": 0.6535333357114829, + "grad_norm": 0.324499249458313, + "learning_rate": 1.6159175921926612e-05, + "loss": 0.1484, + "step": 36641 + }, + { + "epoch": 0.6535511718331966, + "grad_norm": 0.2708475589752197, + "learning_rate": 1.6157720006353744e-05, + "loss": 0.1897, + "step": 36642 + }, + { + "epoch": 0.6535690079549102, + "grad_norm": 0.19356797635555267, + "learning_rate": 1.6156264125056613e-05, + "loss": 0.1316, + "step": 36643 + }, + { + "epoch": 0.6535868440766239, + "grad_norm": 0.21283204853534698, + "learning_rate": 1.615480827804085e-05, + "loss": 0.1091, + "step": 36644 + }, + { + "epoch": 0.6536046801983377, + "grad_norm": 0.24207979440689087, + "learning_rate": 1.615335246531211e-05, + "loss": 0.0879, + "step": 36645 + }, + { + "epoch": 0.6536225163200514, + "grad_norm": 0.249407559633255, + "learning_rate": 1.6151896686876035e-05, + "loss": 0.1135, + "step": 36646 + }, + { + "epoch": 0.6536403524417651, + "grad_norm": 0.20008577406406403, + "learning_rate": 1.6150440942738265e-05, + "loss": 0.0819, + "step": 36647 + }, + { + "epoch": 0.6536581885634788, + "grad_norm": 0.2643984258174896, + "learning_rate": 1.6148985232904434e-05, + "loss": 0.1165, + "step": 36648 + }, + { + "epoch": 0.6536760246851925, + "grad_norm": 0.24120087921619415, + "learning_rate": 1.61475295573802e-05, + "loss": 0.1659, + "step": 36649 + }, + { + "epoch": 0.6536938608069062, + "grad_norm": 0.25766047835350037, + "learning_rate": 1.614607391617119e-05, + "loss": 0.2027, + "step": 36650 + }, + { + "epoch": 0.6537116969286199, + "grad_norm": 0.2485821694135666, + "learning_rate": 1.6144618309283066e-05, + "loss": 0.155, + "step": 36651 + }, + { + "epoch": 0.6537295330503335, + "grad_norm": 0.26759710907936096, + "learning_rate": 1.6143162736721455e-05, + "loss": 0.1197, + "step": 36652 + }, + { + "epoch": 0.6537473691720472, + "grad_norm": 0.256425678730011, + "learning_rate": 1.6141707198492e-05, + "loss": 0.0932, + "step": 36653 + }, + { + "epoch": 0.6537652052937609, + "grad_norm": 0.26338186860084534, + "learning_rate": 1.6140251694600357e-05, + "loss": 0.1373, + "step": 36654 + }, + { + "epoch": 0.6537830414154746, + "grad_norm": 0.3361250162124634, + "learning_rate": 1.6138796225052156e-05, + "loss": 0.1741, + "step": 36655 + }, + { + "epoch": 0.6538008775371883, + "grad_norm": 0.24860045313835144, + "learning_rate": 1.6137340789853038e-05, + "loss": 0.1433, + "step": 36656 + }, + { + "epoch": 0.653818713658902, + "grad_norm": 0.2154330611228943, + "learning_rate": 1.6135885389008644e-05, + "loss": 0.0919, + "step": 36657 + }, + { + "epoch": 0.6538365497806157, + "grad_norm": 0.23145145177841187, + "learning_rate": 1.6134430022524627e-05, + "loss": 0.1149, + "step": 36658 + }, + { + "epoch": 0.6538543859023294, + "grad_norm": 0.2496713399887085, + "learning_rate": 1.6132974690406623e-05, + "loss": 0.1318, + "step": 36659 + }, + { + "epoch": 0.653872222024043, + "grad_norm": 0.26362666487693787, + "learning_rate": 1.613151939266027e-05, + "loss": 0.1569, + "step": 36660 + }, + { + "epoch": 0.6538900581457567, + "grad_norm": 0.3588272035121918, + "learning_rate": 1.6130064129291205e-05, + "loss": 0.1902, + "step": 36661 + }, + { + "epoch": 0.6539078942674705, + "grad_norm": 0.21589115262031555, + "learning_rate": 1.612860890030508e-05, + "loss": 0.1523, + "step": 36662 + }, + { + "epoch": 0.6539257303891842, + "grad_norm": 0.3719049394130707, + "learning_rate": 1.6127153705707525e-05, + "loss": 0.1664, + "step": 36663 + }, + { + "epoch": 0.6539435665108979, + "grad_norm": 0.36984556913375854, + "learning_rate": 1.61256985455042e-05, + "loss": 0.1154, + "step": 36664 + }, + { + "epoch": 0.6539614026326116, + "grad_norm": 0.19224058091640472, + "learning_rate": 1.6124243419700724e-05, + "loss": 0.1032, + "step": 36665 + }, + { + "epoch": 0.6539792387543253, + "grad_norm": 0.30286917090415955, + "learning_rate": 1.612278832830274e-05, + "loss": 0.1323, + "step": 36666 + }, + { + "epoch": 0.653997074876039, + "grad_norm": 0.2640041410923004, + "learning_rate": 1.6121333271315906e-05, + "loss": 0.1574, + "step": 36667 + }, + { + "epoch": 0.6540149109977527, + "grad_norm": 0.235600546002388, + "learning_rate": 1.611987824874585e-05, + "loss": 0.1564, + "step": 36668 + }, + { + "epoch": 0.6540327471194664, + "grad_norm": 0.2564827501773834, + "learning_rate": 1.6118423260598205e-05, + "loss": 0.1209, + "step": 36669 + }, + { + "epoch": 0.65405058324118, + "grad_norm": 0.24721381068229675, + "learning_rate": 1.611696830687862e-05, + "loss": 0.1377, + "step": 36670 + }, + { + "epoch": 0.6540684193628937, + "grad_norm": 0.23483727872371674, + "learning_rate": 1.6115513387592733e-05, + "loss": 0.1321, + "step": 36671 + }, + { + "epoch": 0.6540862554846074, + "grad_norm": 0.29178568720817566, + "learning_rate": 1.6114058502746195e-05, + "loss": 0.1682, + "step": 36672 + }, + { + "epoch": 0.6541040916063211, + "grad_norm": 0.2708684206008911, + "learning_rate": 1.6112603652344634e-05, + "loss": 0.1284, + "step": 36673 + }, + { + "epoch": 0.6541219277280348, + "grad_norm": 0.2366989552974701, + "learning_rate": 1.6111148836393685e-05, + "loss": 0.1639, + "step": 36674 + }, + { + "epoch": 0.6541397638497485, + "grad_norm": 0.3067340552806854, + "learning_rate": 1.610969405489899e-05, + "loss": 0.1139, + "step": 36675 + }, + { + "epoch": 0.6541575999714622, + "grad_norm": 0.24117280542850494, + "learning_rate": 1.6108239307866196e-05, + "loss": 0.1808, + "step": 36676 + }, + { + "epoch": 0.6541754360931759, + "grad_norm": 0.2600924074649811, + "learning_rate": 1.6106784595300938e-05, + "loss": 0.1434, + "step": 36677 + }, + { + "epoch": 0.6541932722148895, + "grad_norm": 0.3785773813724518, + "learning_rate": 1.6105329917208857e-05, + "loss": 0.1626, + "step": 36678 + }, + { + "epoch": 0.6542111083366033, + "grad_norm": 0.23973430693149567, + "learning_rate": 1.6103875273595574e-05, + "loss": 0.1316, + "step": 36679 + }, + { + "epoch": 0.654228944458317, + "grad_norm": 0.2361200600862503, + "learning_rate": 1.610242066446676e-05, + "loss": 0.1131, + "step": 36680 + }, + { + "epoch": 0.6542467805800307, + "grad_norm": 0.415348619222641, + "learning_rate": 1.6100966089828035e-05, + "loss": 0.1164, + "step": 36681 + }, + { + "epoch": 0.6542646167017444, + "grad_norm": 0.21911504864692688, + "learning_rate": 1.6099511549685044e-05, + "loss": 0.1025, + "step": 36682 + }, + { + "epoch": 0.6542824528234581, + "grad_norm": 0.2965392470359802, + "learning_rate": 1.6098057044043403e-05, + "loss": 0.1013, + "step": 36683 + }, + { + "epoch": 0.6543002889451718, + "grad_norm": 0.2654988467693329, + "learning_rate": 1.6096602572908772e-05, + "loss": 0.1355, + "step": 36684 + }, + { + "epoch": 0.6543181250668855, + "grad_norm": 0.24729987978935242, + "learning_rate": 1.609514813628679e-05, + "loss": 0.1608, + "step": 36685 + }, + { + "epoch": 0.6543359611885992, + "grad_norm": 0.2575724422931671, + "learning_rate": 1.6093693734183096e-05, + "loss": 0.0807, + "step": 36686 + }, + { + "epoch": 0.6543537973103128, + "grad_norm": 0.30059322714805603, + "learning_rate": 1.609223936660332e-05, + "loss": 0.0856, + "step": 36687 + }, + { + "epoch": 0.6543716334320265, + "grad_norm": 0.3593789041042328, + "learning_rate": 1.6090785033553087e-05, + "loss": 0.0964, + "step": 36688 + }, + { + "epoch": 0.6543894695537402, + "grad_norm": 0.23953427374362946, + "learning_rate": 1.6089330735038055e-05, + "loss": 0.0602, + "step": 36689 + }, + { + "epoch": 0.6544073056754539, + "grad_norm": 0.23126541078090668, + "learning_rate": 1.6087876471063855e-05, + "loss": 0.1121, + "step": 36690 + }, + { + "epoch": 0.6544251417971676, + "grad_norm": 0.28415441513061523, + "learning_rate": 1.6086422241636127e-05, + "loss": 0.1136, + "step": 36691 + }, + { + "epoch": 0.6544429779188813, + "grad_norm": 0.3295811116695404, + "learning_rate": 1.6084968046760496e-05, + "loss": 0.1321, + "step": 36692 + }, + { + "epoch": 0.654460814040595, + "grad_norm": 0.2556428015232086, + "learning_rate": 1.608351388644262e-05, + "loss": 0.0838, + "step": 36693 + }, + { + "epoch": 0.6544786501623087, + "grad_norm": 0.202883780002594, + "learning_rate": 1.608205976068812e-05, + "loss": 0.1283, + "step": 36694 + }, + { + "epoch": 0.6544964862840223, + "grad_norm": 0.21548840403556824, + "learning_rate": 1.6080605669502638e-05, + "loss": 0.0958, + "step": 36695 + }, + { + "epoch": 0.6545143224057361, + "grad_norm": 0.3393465578556061, + "learning_rate": 1.60791516128918e-05, + "loss": 0.1186, + "step": 36696 + }, + { + "epoch": 0.6545321585274498, + "grad_norm": 0.26381170749664307, + "learning_rate": 1.6077697590861258e-05, + "loss": 0.1762, + "step": 36697 + }, + { + "epoch": 0.6545499946491635, + "grad_norm": 0.3036540448665619, + "learning_rate": 1.607624360341664e-05, + "loss": 0.1498, + "step": 36698 + }, + { + "epoch": 0.6545678307708772, + "grad_norm": 0.3125988245010376, + "learning_rate": 1.607478965056359e-05, + "loss": 0.2193, + "step": 36699 + }, + { + "epoch": 0.6545856668925909, + "grad_norm": 0.21715353429317474, + "learning_rate": 1.607333573230774e-05, + "loss": 0.1198, + "step": 36700 + }, + { + "epoch": 0.6546035030143046, + "grad_norm": 0.2881722152233124, + "learning_rate": 1.6071881848654706e-05, + "loss": 0.1284, + "step": 36701 + }, + { + "epoch": 0.6546213391360183, + "grad_norm": 0.28733929991722107, + "learning_rate": 1.6070427999610156e-05, + "loss": 0.1008, + "step": 36702 + }, + { + "epoch": 0.654639175257732, + "grad_norm": 0.2897832691669464, + "learning_rate": 1.6068974185179712e-05, + "loss": 0.1241, + "step": 36703 + }, + { + "epoch": 0.6546570113794457, + "grad_norm": 0.23576177656650543, + "learning_rate": 1.6067520405369002e-05, + "loss": 0.1125, + "step": 36704 + }, + { + "epoch": 0.6546748475011593, + "grad_norm": 0.26221007108688354, + "learning_rate": 1.606606666018367e-05, + "loss": 0.0948, + "step": 36705 + }, + { + "epoch": 0.654692683622873, + "grad_norm": 0.2798518240451813, + "learning_rate": 1.6064612949629347e-05, + "loss": 0.0842, + "step": 36706 + }, + { + "epoch": 0.6547105197445867, + "grad_norm": 0.30837762355804443, + "learning_rate": 1.6063159273711677e-05, + "loss": 0.144, + "step": 36707 + }, + { + "epoch": 0.6547283558663004, + "grad_norm": 0.32586193084716797, + "learning_rate": 1.6061705632436292e-05, + "loss": 0.158, + "step": 36708 + }, + { + "epoch": 0.6547461919880141, + "grad_norm": 0.2606954574584961, + "learning_rate": 1.6060252025808807e-05, + "loss": 0.1452, + "step": 36709 + }, + { + "epoch": 0.6547640281097278, + "grad_norm": 0.22420062124729156, + "learning_rate": 1.6058798453834884e-05, + "loss": 0.1069, + "step": 36710 + }, + { + "epoch": 0.6547818642314415, + "grad_norm": 0.28502655029296875, + "learning_rate": 1.6057344916520145e-05, + "loss": 0.1007, + "step": 36711 + }, + { + "epoch": 0.6547997003531552, + "grad_norm": 0.2172628939151764, + "learning_rate": 1.6055891413870227e-05, + "loss": 0.1203, + "step": 36712 + }, + { + "epoch": 0.654817536474869, + "grad_norm": 0.22606569528579712, + "learning_rate": 1.6054437945890767e-05, + "loss": 0.1122, + "step": 36713 + }, + { + "epoch": 0.6548353725965826, + "grad_norm": 0.21293863654136658, + "learning_rate": 1.605298451258738e-05, + "loss": 0.1122, + "step": 36714 + }, + { + "epoch": 0.6548532087182963, + "grad_norm": 0.2248375415802002, + "learning_rate": 1.605153111396573e-05, + "loss": 0.1657, + "step": 36715 + }, + { + "epoch": 0.65487104484001, + "grad_norm": 0.4894512891769409, + "learning_rate": 1.6050077750031433e-05, + "loss": 0.1289, + "step": 36716 + }, + { + "epoch": 0.6548888809617237, + "grad_norm": 0.27615973353385925, + "learning_rate": 1.604862442079012e-05, + "loss": 0.2331, + "step": 36717 + }, + { + "epoch": 0.6549067170834374, + "grad_norm": 0.23804430663585663, + "learning_rate": 1.6047171126247434e-05, + "loss": 0.1251, + "step": 36718 + }, + { + "epoch": 0.6549245532051511, + "grad_norm": 0.24958065152168274, + "learning_rate": 1.6045717866409003e-05, + "loss": 0.1315, + "step": 36719 + }, + { + "epoch": 0.6549423893268648, + "grad_norm": 0.231605663895607, + "learning_rate": 1.6044264641280465e-05, + "loss": 0.1054, + "step": 36720 + }, + { + "epoch": 0.6549602254485785, + "grad_norm": 0.23613178730010986, + "learning_rate": 1.6042811450867457e-05, + "loss": 0.0794, + "step": 36721 + }, + { + "epoch": 0.6549780615702921, + "grad_norm": 0.2802891731262207, + "learning_rate": 1.604135829517559e-05, + "loss": 0.1407, + "step": 36722 + }, + { + "epoch": 0.6549958976920058, + "grad_norm": 0.2869025766849518, + "learning_rate": 1.6039905174210522e-05, + "loss": 0.1379, + "step": 36723 + }, + { + "epoch": 0.6550137338137195, + "grad_norm": 0.3316204249858856, + "learning_rate": 1.6038452087977878e-05, + "loss": 0.1316, + "step": 36724 + }, + { + "epoch": 0.6550315699354332, + "grad_norm": 0.3360423743724823, + "learning_rate": 1.6036999036483284e-05, + "loss": 0.1494, + "step": 36725 + }, + { + "epoch": 0.6550494060571469, + "grad_norm": 0.25246569514274597, + "learning_rate": 1.6035546019732383e-05, + "loss": 0.1211, + "step": 36726 + }, + { + "epoch": 0.6550672421788606, + "grad_norm": 0.15617601573467255, + "learning_rate": 1.603409303773079e-05, + "loss": 0.0671, + "step": 36727 + }, + { + "epoch": 0.6550850783005743, + "grad_norm": 0.22333931922912598, + "learning_rate": 1.6032640090484165e-05, + "loss": 0.1157, + "step": 36728 + }, + { + "epoch": 0.655102914422288, + "grad_norm": 0.2514164447784424, + "learning_rate": 1.6031187177998117e-05, + "loss": 0.1067, + "step": 36729 + }, + { + "epoch": 0.6551207505440018, + "grad_norm": 0.19420118629932404, + "learning_rate": 1.602973430027829e-05, + "loss": 0.1108, + "step": 36730 + }, + { + "epoch": 0.6551385866657154, + "grad_norm": 0.3844279944896698, + "learning_rate": 1.6028281457330295e-05, + "loss": 0.1365, + "step": 36731 + }, + { + "epoch": 0.6551564227874291, + "grad_norm": 0.20976576209068298, + "learning_rate": 1.6026828649159797e-05, + "loss": 0.0951, + "step": 36732 + }, + { + "epoch": 0.6551742589091428, + "grad_norm": 0.25581127405166626, + "learning_rate": 1.60253758757724e-05, + "loss": 0.1242, + "step": 36733 + }, + { + "epoch": 0.6551920950308565, + "grad_norm": 0.23793557286262512, + "learning_rate": 1.6023923137173754e-05, + "loss": 0.1358, + "step": 36734 + }, + { + "epoch": 0.6552099311525702, + "grad_norm": 0.3044520318508148, + "learning_rate": 1.602247043336947e-05, + "loss": 0.1253, + "step": 36735 + }, + { + "epoch": 0.6552277672742839, + "grad_norm": 0.2654673159122467, + "learning_rate": 1.6021017764365203e-05, + "loss": 0.1191, + "step": 36736 + }, + { + "epoch": 0.6552456033959976, + "grad_norm": 0.2546164393424988, + "learning_rate": 1.601956513016657e-05, + "loss": 0.2122, + "step": 36737 + }, + { + "epoch": 0.6552634395177113, + "grad_norm": 0.26594099402427673, + "learning_rate": 1.6018112530779205e-05, + "loss": 0.0932, + "step": 36738 + }, + { + "epoch": 0.655281275639425, + "grad_norm": 0.18992877006530762, + "learning_rate": 1.6016659966208738e-05, + "loss": 0.103, + "step": 36739 + }, + { + "epoch": 0.6552991117611386, + "grad_norm": 0.301959753036499, + "learning_rate": 1.6015207436460792e-05, + "loss": 0.1153, + "step": 36740 + }, + { + "epoch": 0.6553169478828523, + "grad_norm": 0.25648316740989685, + "learning_rate": 1.6013754941541014e-05, + "loss": 0.0859, + "step": 36741 + }, + { + "epoch": 0.655334784004566, + "grad_norm": 0.21612359583377838, + "learning_rate": 1.6012302481455026e-05, + "loss": 0.096, + "step": 36742 + }, + { + "epoch": 0.6553526201262797, + "grad_norm": 0.2982625663280487, + "learning_rate": 1.6010850056208455e-05, + "loss": 0.1573, + "step": 36743 + }, + { + "epoch": 0.6553704562479934, + "grad_norm": 0.23175761103630066, + "learning_rate": 1.600939766580693e-05, + "loss": 0.1157, + "step": 36744 + }, + { + "epoch": 0.6553882923697071, + "grad_norm": 0.28712818026542664, + "learning_rate": 1.600794531025609e-05, + "loss": 0.1464, + "step": 36745 + }, + { + "epoch": 0.6554061284914209, + "grad_norm": 0.19944629073143005, + "learning_rate": 1.6006492989561556e-05, + "loss": 0.0913, + "step": 36746 + }, + { + "epoch": 0.6554239646131346, + "grad_norm": 0.19325481355190277, + "learning_rate": 1.600504070372897e-05, + "loss": 0.0788, + "step": 36747 + }, + { + "epoch": 0.6554418007348483, + "grad_norm": 0.24068132042884827, + "learning_rate": 1.6003588452763936e-05, + "loss": 0.1054, + "step": 36748 + }, + { + "epoch": 0.655459636856562, + "grad_norm": 0.2822006642818451, + "learning_rate": 1.6002136236672117e-05, + "loss": 0.0884, + "step": 36749 + }, + { + "epoch": 0.6554774729782756, + "grad_norm": 0.41239863634109497, + "learning_rate": 1.600068405545912e-05, + "loss": 0.1465, + "step": 36750 + }, + { + "epoch": 0.6554953090999893, + "grad_norm": 0.27067211270332336, + "learning_rate": 1.5999231909130585e-05, + "loss": 0.1513, + "step": 36751 + }, + { + "epoch": 0.655513145221703, + "grad_norm": 0.23247037827968597, + "learning_rate": 1.5997779797692127e-05, + "loss": 0.1117, + "step": 36752 + }, + { + "epoch": 0.6555309813434167, + "grad_norm": 0.276947021484375, + "learning_rate": 1.5996327721149386e-05, + "loss": 0.0959, + "step": 36753 + }, + { + "epoch": 0.6555488174651304, + "grad_norm": 0.3171471059322357, + "learning_rate": 1.5994875679507987e-05, + "loss": 0.1664, + "step": 36754 + }, + { + "epoch": 0.6555666535868441, + "grad_norm": 0.2898976504802704, + "learning_rate": 1.5993423672773565e-05, + "loss": 0.0861, + "step": 36755 + }, + { + "epoch": 0.6555844897085578, + "grad_norm": 0.26713642477989197, + "learning_rate": 1.5991971700951747e-05, + "loss": 0.0912, + "step": 36756 + }, + { + "epoch": 0.6556023258302714, + "grad_norm": 0.23357582092285156, + "learning_rate": 1.5990519764048144e-05, + "loss": 0.0912, + "step": 36757 + }, + { + "epoch": 0.6556201619519851, + "grad_norm": 0.2820734977722168, + "learning_rate": 1.598906786206841e-05, + "loss": 0.1412, + "step": 36758 + }, + { + "epoch": 0.6556379980736988, + "grad_norm": 0.222149059176445, + "learning_rate": 1.5987615995018162e-05, + "loss": 0.0783, + "step": 36759 + }, + { + "epoch": 0.6556558341954125, + "grad_norm": 0.3330352306365967, + "learning_rate": 1.598616416290302e-05, + "loss": 0.2134, + "step": 36760 + }, + { + "epoch": 0.6556736703171262, + "grad_norm": 0.21069347858428955, + "learning_rate": 1.5984712365728616e-05, + "loss": 0.1493, + "step": 36761 + }, + { + "epoch": 0.6556915064388399, + "grad_norm": 0.25062334537506104, + "learning_rate": 1.5983260603500587e-05, + "loss": 0.1475, + "step": 36762 + }, + { + "epoch": 0.6557093425605537, + "grad_norm": 0.29943299293518066, + "learning_rate": 1.5981808876224557e-05, + "loss": 0.0934, + "step": 36763 + }, + { + "epoch": 0.6557271786822674, + "grad_norm": 0.26778480410575867, + "learning_rate": 1.5980357183906153e-05, + "loss": 0.1454, + "step": 36764 + }, + { + "epoch": 0.6557450148039811, + "grad_norm": 0.3308759927749634, + "learning_rate": 1.5978905526550992e-05, + "loss": 0.1256, + "step": 36765 + }, + { + "epoch": 0.6557628509256948, + "grad_norm": 0.2862666845321655, + "learning_rate": 1.597745390416471e-05, + "loss": 0.1056, + "step": 36766 + }, + { + "epoch": 0.6557806870474084, + "grad_norm": 0.23114730417728424, + "learning_rate": 1.597600231675293e-05, + "loss": 0.0913, + "step": 36767 + }, + { + "epoch": 0.6557985231691221, + "grad_norm": 0.2226518839597702, + "learning_rate": 1.597455076432129e-05, + "loss": 0.1028, + "step": 36768 + }, + { + "epoch": 0.6558163592908358, + "grad_norm": 0.29544445872306824, + "learning_rate": 1.5973099246875408e-05, + "loss": 0.1827, + "step": 36769 + }, + { + "epoch": 0.6558341954125495, + "grad_norm": 0.329287052154541, + "learning_rate": 1.59716477644209e-05, + "loss": 0.1857, + "step": 36770 + }, + { + "epoch": 0.6558520315342632, + "grad_norm": 0.28796297311782837, + "learning_rate": 1.5970196316963416e-05, + "loss": 0.1466, + "step": 36771 + }, + { + "epoch": 0.6558698676559769, + "grad_norm": 0.2649242579936981, + "learning_rate": 1.5968744904508567e-05, + "loss": 0.1768, + "step": 36772 + }, + { + "epoch": 0.6558877037776906, + "grad_norm": 0.2383357435464859, + "learning_rate": 1.596729352706198e-05, + "loss": 0.1196, + "step": 36773 + }, + { + "epoch": 0.6559055398994043, + "grad_norm": 0.24642813205718994, + "learning_rate": 1.5965842184629283e-05, + "loss": 0.1664, + "step": 36774 + }, + { + "epoch": 0.6559233760211179, + "grad_norm": 0.24704863131046295, + "learning_rate": 1.5964390877216094e-05, + "loss": 0.1128, + "step": 36775 + }, + { + "epoch": 0.6559412121428316, + "grad_norm": 0.2743159532546997, + "learning_rate": 1.5962939604828063e-05, + "loss": 0.1505, + "step": 36776 + }, + { + "epoch": 0.6559590482645453, + "grad_norm": 0.46685105562210083, + "learning_rate": 1.5961488367470794e-05, + "loss": 0.1633, + "step": 36777 + }, + { + "epoch": 0.655976884386259, + "grad_norm": 0.20926645398139954, + "learning_rate": 1.5960037165149914e-05, + "loss": 0.1405, + "step": 36778 + }, + { + "epoch": 0.6559947205079727, + "grad_norm": 0.20892958343029022, + "learning_rate": 1.5958585997871052e-05, + "loss": 0.1442, + "step": 36779 + }, + { + "epoch": 0.6560125566296865, + "grad_norm": 0.21241679787635803, + "learning_rate": 1.5957134865639835e-05, + "loss": 0.1255, + "step": 36780 + }, + { + "epoch": 0.6560303927514002, + "grad_norm": 0.2082797735929489, + "learning_rate": 1.5955683768461884e-05, + "loss": 0.0813, + "step": 36781 + }, + { + "epoch": 0.6560482288731139, + "grad_norm": 0.1898459792137146, + "learning_rate": 1.5954232706342833e-05, + "loss": 0.0725, + "step": 36782 + }, + { + "epoch": 0.6560660649948276, + "grad_norm": 0.2718604803085327, + "learning_rate": 1.595278167928829e-05, + "loss": 0.1515, + "step": 36783 + }, + { + "epoch": 0.6560839011165412, + "grad_norm": 0.23570284247398376, + "learning_rate": 1.5951330687303902e-05, + "loss": 0.1541, + "step": 36784 + }, + { + "epoch": 0.6561017372382549, + "grad_norm": 0.3093301057815552, + "learning_rate": 1.5949879730395278e-05, + "loss": 0.1286, + "step": 36785 + }, + { + "epoch": 0.6561195733599686, + "grad_norm": 0.3159850239753723, + "learning_rate": 1.5948428808568044e-05, + "loss": 0.1027, + "step": 36786 + }, + { + "epoch": 0.6561374094816823, + "grad_norm": 0.253013014793396, + "learning_rate": 1.5946977921827822e-05, + "loss": 0.1438, + "step": 36787 + }, + { + "epoch": 0.656155245603396, + "grad_norm": 0.2947070598602295, + "learning_rate": 1.594552707018024e-05, + "loss": 0.1374, + "step": 36788 + }, + { + "epoch": 0.6561730817251097, + "grad_norm": 0.31292644143104553, + "learning_rate": 1.594407625363093e-05, + "loss": 0.1542, + "step": 36789 + }, + { + "epoch": 0.6561909178468234, + "grad_norm": 0.24879132211208344, + "learning_rate": 1.594262547218551e-05, + "loss": 0.1792, + "step": 36790 + }, + { + "epoch": 0.6562087539685371, + "grad_norm": 0.21500030159950256, + "learning_rate": 1.59411747258496e-05, + "loss": 0.1426, + "step": 36791 + }, + { + "epoch": 0.6562265900902507, + "grad_norm": 0.2210945338010788, + "learning_rate": 1.5939724014628816e-05, + "loss": 0.1066, + "step": 36792 + }, + { + "epoch": 0.6562444262119644, + "grad_norm": 0.26017841696739197, + "learning_rate": 1.5938273338528802e-05, + "loss": 0.1624, + "step": 36793 + }, + { + "epoch": 0.6562622623336781, + "grad_norm": 0.21701890230178833, + "learning_rate": 1.5936822697555166e-05, + "loss": 0.1479, + "step": 36794 + }, + { + "epoch": 0.6562800984553918, + "grad_norm": 0.2686643898487091, + "learning_rate": 1.5935372091713536e-05, + "loss": 0.1422, + "step": 36795 + }, + { + "epoch": 0.6562979345771055, + "grad_norm": 0.2434459924697876, + "learning_rate": 1.5933921521009526e-05, + "loss": 0.0992, + "step": 36796 + }, + { + "epoch": 0.6563157706988193, + "grad_norm": 0.26744312047958374, + "learning_rate": 1.593247098544878e-05, + "loss": 0.126, + "step": 36797 + }, + { + "epoch": 0.656333606820533, + "grad_norm": 0.28083112835884094, + "learning_rate": 1.593102048503691e-05, + "loss": 0.1702, + "step": 36798 + }, + { + "epoch": 0.6563514429422467, + "grad_norm": 0.26350125670433044, + "learning_rate": 1.5929570019779533e-05, + "loss": 0.1566, + "step": 36799 + }, + { + "epoch": 0.6563692790639604, + "grad_norm": 0.3550322949886322, + "learning_rate": 1.5928119589682267e-05, + "loss": 0.0997, + "step": 36800 + }, + { + "epoch": 0.656387115185674, + "grad_norm": 0.24596041440963745, + "learning_rate": 1.592666919475075e-05, + "loss": 0.1622, + "step": 36801 + }, + { + "epoch": 0.6564049513073877, + "grad_norm": 0.28364241123199463, + "learning_rate": 1.5925218834990596e-05, + "loss": 0.1241, + "step": 36802 + }, + { + "epoch": 0.6564227874291014, + "grad_norm": 0.20477141439914703, + "learning_rate": 1.592376851040743e-05, + "loss": 0.1216, + "step": 36803 + }, + { + "epoch": 0.6564406235508151, + "grad_norm": 0.2321634739637375, + "learning_rate": 1.5922318221006875e-05, + "loss": 0.0839, + "step": 36804 + }, + { + "epoch": 0.6564584596725288, + "grad_norm": 0.27941617369651794, + "learning_rate": 1.5920867966794538e-05, + "loss": 0.0998, + "step": 36805 + }, + { + "epoch": 0.6564762957942425, + "grad_norm": 0.19286619126796722, + "learning_rate": 1.5919417747776063e-05, + "loss": 0.0718, + "step": 36806 + }, + { + "epoch": 0.6564941319159562, + "grad_norm": 0.21201549470424652, + "learning_rate": 1.5917967563957063e-05, + "loss": 0.114, + "step": 36807 + }, + { + "epoch": 0.6565119680376699, + "grad_norm": 0.22896206378936768, + "learning_rate": 1.591651741534315e-05, + "loss": 0.0786, + "step": 36808 + }, + { + "epoch": 0.6565298041593836, + "grad_norm": 0.2870287001132965, + "learning_rate": 1.5915067301939953e-05, + "loss": 0.1288, + "step": 36809 + }, + { + "epoch": 0.6565476402810972, + "grad_norm": 0.23888632655143738, + "learning_rate": 1.5913617223753093e-05, + "loss": 0.1108, + "step": 36810 + }, + { + "epoch": 0.6565654764028109, + "grad_norm": 0.28307342529296875, + "learning_rate": 1.5912167180788194e-05, + "loss": 0.1156, + "step": 36811 + }, + { + "epoch": 0.6565833125245246, + "grad_norm": 0.4067381024360657, + "learning_rate": 1.591071717305088e-05, + "loss": 0.1558, + "step": 36812 + }, + { + "epoch": 0.6566011486462383, + "grad_norm": 0.2559942305088043, + "learning_rate": 1.5909267200546753e-05, + "loss": 0.1419, + "step": 36813 + }, + { + "epoch": 0.6566189847679521, + "grad_norm": 0.25679758191108704, + "learning_rate": 1.5907817263281456e-05, + "loss": 0.1075, + "step": 36814 + }, + { + "epoch": 0.6566368208896658, + "grad_norm": 0.25280389189720154, + "learning_rate": 1.5906367361260593e-05, + "loss": 0.1116, + "step": 36815 + }, + { + "epoch": 0.6566546570113795, + "grad_norm": 0.2718549072742462, + "learning_rate": 1.5904917494489795e-05, + "loss": 0.1559, + "step": 36816 + }, + { + "epoch": 0.6566724931330932, + "grad_norm": 0.2740013599395752, + "learning_rate": 1.590346766297468e-05, + "loss": 0.1422, + "step": 36817 + }, + { + "epoch": 0.6566903292548069, + "grad_norm": 0.2877633571624756, + "learning_rate": 1.5902017866720857e-05, + "loss": 0.139, + "step": 36818 + }, + { + "epoch": 0.6567081653765205, + "grad_norm": 0.26468172669410706, + "learning_rate": 1.5900568105733965e-05, + "loss": 0.1057, + "step": 36819 + }, + { + "epoch": 0.6567260014982342, + "grad_norm": 0.3145536780357361, + "learning_rate": 1.589911838001961e-05, + "loss": 0.1154, + "step": 36820 + }, + { + "epoch": 0.6567438376199479, + "grad_norm": 0.24582944810390472, + "learning_rate": 1.5897668689583418e-05, + "loss": 0.1042, + "step": 36821 + }, + { + "epoch": 0.6567616737416616, + "grad_norm": 0.30437421798706055, + "learning_rate": 1.5896219034431e-05, + "loss": 0.1595, + "step": 36822 + }, + { + "epoch": 0.6567795098633753, + "grad_norm": 0.22705450654029846, + "learning_rate": 1.589476941456798e-05, + "loss": 0.1052, + "step": 36823 + }, + { + "epoch": 0.656797345985089, + "grad_norm": 0.23533622920513153, + "learning_rate": 1.5893319829999987e-05, + "loss": 0.0945, + "step": 36824 + }, + { + "epoch": 0.6568151821068027, + "grad_norm": 0.24495822191238403, + "learning_rate": 1.5891870280732632e-05, + "loss": 0.1221, + "step": 36825 + }, + { + "epoch": 0.6568330182285164, + "grad_norm": 0.2742573022842407, + "learning_rate": 1.5890420766771525e-05, + "loss": 0.1, + "step": 36826 + }, + { + "epoch": 0.65685085435023, + "grad_norm": 0.4061833620071411, + "learning_rate": 1.58889712881223e-05, + "loss": 0.1925, + "step": 36827 + }, + { + "epoch": 0.6568686904719437, + "grad_norm": 0.2576970160007477, + "learning_rate": 1.588752184479057e-05, + "loss": 0.1603, + "step": 36828 + }, + { + "epoch": 0.6568865265936574, + "grad_norm": 0.25764453411102295, + "learning_rate": 1.5886072436781945e-05, + "loss": 0.1339, + "step": 36829 + }, + { + "epoch": 0.6569043627153711, + "grad_norm": 0.2629469037055969, + "learning_rate": 1.588462306410206e-05, + "loss": 0.1046, + "step": 36830 + }, + { + "epoch": 0.6569221988370849, + "grad_norm": 0.2936239540576935, + "learning_rate": 1.588317372675652e-05, + "loss": 0.1129, + "step": 36831 + }, + { + "epoch": 0.6569400349587986, + "grad_norm": 0.2374291718006134, + "learning_rate": 1.5881724424750946e-05, + "loss": 0.1002, + "step": 36832 + }, + { + "epoch": 0.6569578710805123, + "grad_norm": 0.30064189434051514, + "learning_rate": 1.5880275158090964e-05, + "loss": 0.11, + "step": 36833 + }, + { + "epoch": 0.656975707202226, + "grad_norm": 0.2121768444776535, + "learning_rate": 1.5878825926782183e-05, + "loss": 0.1132, + "step": 36834 + }, + { + "epoch": 0.6569935433239397, + "grad_norm": 0.1883653700351715, + "learning_rate": 1.587737673083021e-05, + "loss": 0.1404, + "step": 36835 + }, + { + "epoch": 0.6570113794456534, + "grad_norm": 0.41294899582862854, + "learning_rate": 1.5875927570240695e-05, + "loss": 0.1462, + "step": 36836 + }, + { + "epoch": 0.657029215567367, + "grad_norm": 0.26511743664741516, + "learning_rate": 1.5874478445019227e-05, + "loss": 0.1305, + "step": 36837 + }, + { + "epoch": 0.6570470516890807, + "grad_norm": 0.4264880418777466, + "learning_rate": 1.5873029355171433e-05, + "loss": 0.1112, + "step": 36838 + }, + { + "epoch": 0.6570648878107944, + "grad_norm": 0.3528462052345276, + "learning_rate": 1.587158030070292e-05, + "loss": 0.1159, + "step": 36839 + }, + { + "epoch": 0.6570827239325081, + "grad_norm": 0.2888167202472687, + "learning_rate": 1.587013128161933e-05, + "loss": 0.121, + "step": 36840 + }, + { + "epoch": 0.6571005600542218, + "grad_norm": 0.21214784681797028, + "learning_rate": 1.5868682297926262e-05, + "loss": 0.087, + "step": 36841 + }, + { + "epoch": 0.6571183961759355, + "grad_norm": 0.17510448396205902, + "learning_rate": 1.586723334962933e-05, + "loss": 0.1075, + "step": 36842 + }, + { + "epoch": 0.6571362322976492, + "grad_norm": 0.23990722000598907, + "learning_rate": 1.586578443673416e-05, + "loss": 0.1168, + "step": 36843 + }, + { + "epoch": 0.6571540684193629, + "grad_norm": 0.2533864974975586, + "learning_rate": 1.5864335559246357e-05, + "loss": 0.1539, + "step": 36844 + }, + { + "epoch": 0.6571719045410765, + "grad_norm": 0.4161410927772522, + "learning_rate": 1.5862886717171553e-05, + "loss": 0.1141, + "step": 36845 + }, + { + "epoch": 0.6571897406627902, + "grad_norm": 0.3502938449382782, + "learning_rate": 1.5861437910515354e-05, + "loss": 0.1548, + "step": 36846 + }, + { + "epoch": 0.657207576784504, + "grad_norm": 0.3132579028606415, + "learning_rate": 1.585998913928338e-05, + "loss": 0.053, + "step": 36847 + }, + { + "epoch": 0.6572254129062177, + "grad_norm": 0.2882586419582367, + "learning_rate": 1.5858540403481238e-05, + "loss": 0.1141, + "step": 36848 + }, + { + "epoch": 0.6572432490279314, + "grad_norm": 0.24397030472755432, + "learning_rate": 1.5857091703114555e-05, + "loss": 0.0764, + "step": 36849 + }, + { + "epoch": 0.6572610851496451, + "grad_norm": 0.37102025747299194, + "learning_rate": 1.585564303818894e-05, + "loss": 0.1525, + "step": 36850 + }, + { + "epoch": 0.6572789212713588, + "grad_norm": 0.2623554468154907, + "learning_rate": 1.5854194408710014e-05, + "loss": 0.1365, + "step": 36851 + }, + { + "epoch": 0.6572967573930725, + "grad_norm": 0.26536741852760315, + "learning_rate": 1.5852745814683385e-05, + "loss": 0.1322, + "step": 36852 + }, + { + "epoch": 0.6573145935147862, + "grad_norm": 0.2044999748468399, + "learning_rate": 1.585129725611468e-05, + "loss": 0.0934, + "step": 36853 + }, + { + "epoch": 0.6573324296364998, + "grad_norm": 0.2445453554391861, + "learning_rate": 1.5849848733009502e-05, + "loss": 0.1103, + "step": 36854 + }, + { + "epoch": 0.6573502657582135, + "grad_norm": 0.2444705069065094, + "learning_rate": 1.5848400245373473e-05, + "loss": 0.1231, + "step": 36855 + }, + { + "epoch": 0.6573681018799272, + "grad_norm": 0.21722236275672913, + "learning_rate": 1.5846951793212205e-05, + "loss": 0.1007, + "step": 36856 + }, + { + "epoch": 0.6573859380016409, + "grad_norm": 0.27680715918540955, + "learning_rate": 1.584550337653131e-05, + "loss": 0.1166, + "step": 36857 + }, + { + "epoch": 0.6574037741233546, + "grad_norm": 0.24962249398231506, + "learning_rate": 1.5844054995336406e-05, + "loss": 0.1058, + "step": 36858 + }, + { + "epoch": 0.6574216102450683, + "grad_norm": 0.2826508581638336, + "learning_rate": 1.5842606649633114e-05, + "loss": 0.1358, + "step": 36859 + }, + { + "epoch": 0.657439446366782, + "grad_norm": 0.40490302443504333, + "learning_rate": 1.5841158339427036e-05, + "loss": 0.2105, + "step": 36860 + }, + { + "epoch": 0.6574572824884957, + "grad_norm": 0.27626582980155945, + "learning_rate": 1.5839710064723784e-05, + "loss": 0.0891, + "step": 36861 + }, + { + "epoch": 0.6574751186102094, + "grad_norm": 0.27266401052474976, + "learning_rate": 1.5838261825528994e-05, + "loss": 0.0898, + "step": 36862 + }, + { + "epoch": 0.657492954731923, + "grad_norm": 0.23167169094085693, + "learning_rate": 1.5836813621848262e-05, + "loss": 0.1127, + "step": 36863 + }, + { + "epoch": 0.6575107908536368, + "grad_norm": 0.2848247289657593, + "learning_rate": 1.58353654536872e-05, + "loss": 0.1361, + "step": 36864 + }, + { + "epoch": 0.6575286269753505, + "grad_norm": 0.2954593896865845, + "learning_rate": 1.5833917321051424e-05, + "loss": 0.1532, + "step": 36865 + }, + { + "epoch": 0.6575464630970642, + "grad_norm": 0.2845885753631592, + "learning_rate": 1.5832469223946556e-05, + "loss": 0.1205, + "step": 36866 + }, + { + "epoch": 0.6575642992187779, + "grad_norm": 0.24876411259174347, + "learning_rate": 1.5831021162378207e-05, + "loss": 0.0983, + "step": 36867 + }, + { + "epoch": 0.6575821353404916, + "grad_norm": 0.23809607326984406, + "learning_rate": 1.5829573136351987e-05, + "loss": 0.1451, + "step": 36868 + }, + { + "epoch": 0.6575999714622053, + "grad_norm": 0.2451048493385315, + "learning_rate": 1.58281251458735e-05, + "loss": 0.1205, + "step": 36869 + }, + { + "epoch": 0.657617807583919, + "grad_norm": 0.2045835256576538, + "learning_rate": 1.5826677190948375e-05, + "loss": 0.1304, + "step": 36870 + }, + { + "epoch": 0.6576356437056327, + "grad_norm": 0.3112563192844391, + "learning_rate": 1.582522927158221e-05, + "loss": 0.1472, + "step": 36871 + }, + { + "epoch": 0.6576534798273463, + "grad_norm": 0.21952079236507416, + "learning_rate": 1.5823781387780635e-05, + "loss": 0.0855, + "step": 36872 + }, + { + "epoch": 0.65767131594906, + "grad_norm": 0.27830639481544495, + "learning_rate": 1.582233353954925e-05, + "loss": 0.1736, + "step": 36873 + }, + { + "epoch": 0.6576891520707737, + "grad_norm": 0.2714455723762512, + "learning_rate": 1.5820885726893665e-05, + "loss": 0.1422, + "step": 36874 + }, + { + "epoch": 0.6577069881924874, + "grad_norm": 0.3178907036781311, + "learning_rate": 1.5819437949819502e-05, + "loss": 0.1352, + "step": 36875 + }, + { + "epoch": 0.6577248243142011, + "grad_norm": 0.23411467671394348, + "learning_rate": 1.581799020833237e-05, + "loss": 0.135, + "step": 36876 + }, + { + "epoch": 0.6577426604359148, + "grad_norm": 0.2939034700393677, + "learning_rate": 1.5816542502437876e-05, + "loss": 0.1608, + "step": 36877 + }, + { + "epoch": 0.6577604965576285, + "grad_norm": 0.29110389947891235, + "learning_rate": 1.581509483214163e-05, + "loss": 0.135, + "step": 36878 + }, + { + "epoch": 0.6577783326793422, + "grad_norm": 0.3218103051185608, + "learning_rate": 1.5813647197449254e-05, + "loss": 0.1176, + "step": 36879 + }, + { + "epoch": 0.6577961688010558, + "grad_norm": 0.2967422306537628, + "learning_rate": 1.5812199598366356e-05, + "loss": 0.1625, + "step": 36880 + }, + { + "epoch": 0.6578140049227696, + "grad_norm": 0.19686435163021088, + "learning_rate": 1.581075203489855e-05, + "loss": 0.1245, + "step": 36881 + }, + { + "epoch": 0.6578318410444833, + "grad_norm": 0.2903866171836853, + "learning_rate": 1.580930450705144e-05, + "loss": 0.0989, + "step": 36882 + }, + { + "epoch": 0.657849677166197, + "grad_norm": 0.24043291807174683, + "learning_rate": 1.580785701483063e-05, + "loss": 0.1727, + "step": 36883 + }, + { + "epoch": 0.6578675132879107, + "grad_norm": 0.3193179666996002, + "learning_rate": 1.5806409558241753e-05, + "loss": 0.1781, + "step": 36884 + }, + { + "epoch": 0.6578853494096244, + "grad_norm": 0.2343636006116867, + "learning_rate": 1.5804962137290398e-05, + "loss": 0.1001, + "step": 36885 + }, + { + "epoch": 0.6579031855313381, + "grad_norm": 0.2991853952407837, + "learning_rate": 1.5803514751982197e-05, + "loss": 0.1548, + "step": 36886 + }, + { + "epoch": 0.6579210216530518, + "grad_norm": 0.29604876041412354, + "learning_rate": 1.5802067402322736e-05, + "loss": 0.1555, + "step": 36887 + }, + { + "epoch": 0.6579388577747655, + "grad_norm": 0.38010314106941223, + "learning_rate": 1.5800620088317646e-05, + "loss": 0.141, + "step": 36888 + }, + { + "epoch": 0.6579566938964792, + "grad_norm": 0.2148333191871643, + "learning_rate": 1.5799172809972535e-05, + "loss": 0.1223, + "step": 36889 + }, + { + "epoch": 0.6579745300181928, + "grad_norm": 0.1873437762260437, + "learning_rate": 1.5797725567293006e-05, + "loss": 0.1185, + "step": 36890 + }, + { + "epoch": 0.6579923661399065, + "grad_norm": 0.21285198628902435, + "learning_rate": 1.579627836028466e-05, + "loss": 0.1194, + "step": 36891 + }, + { + "epoch": 0.6580102022616202, + "grad_norm": 0.26031050086021423, + "learning_rate": 1.5794831188953123e-05, + "loss": 0.1397, + "step": 36892 + }, + { + "epoch": 0.6580280383833339, + "grad_norm": 0.23620277643203735, + "learning_rate": 1.5793384053304007e-05, + "loss": 0.124, + "step": 36893 + }, + { + "epoch": 0.6580458745050476, + "grad_norm": 0.2794645428657532, + "learning_rate": 1.5791936953342913e-05, + "loss": 0.1188, + "step": 36894 + }, + { + "epoch": 0.6580637106267613, + "grad_norm": 0.20696409046649933, + "learning_rate": 1.5790489889075442e-05, + "loss": 0.0847, + "step": 36895 + }, + { + "epoch": 0.658081546748475, + "grad_norm": 0.256647527217865, + "learning_rate": 1.578904286050722e-05, + "loss": 0.1642, + "step": 36896 + }, + { + "epoch": 0.6580993828701887, + "grad_norm": 0.2705405652523041, + "learning_rate": 1.5787595867643855e-05, + "loss": 0.1344, + "step": 36897 + }, + { + "epoch": 0.6581172189919025, + "grad_norm": 0.2152704894542694, + "learning_rate": 1.578614891049094e-05, + "loss": 0.1309, + "step": 36898 + }, + { + "epoch": 0.6581350551136161, + "grad_norm": 0.31815510988235474, + "learning_rate": 1.5784701989054102e-05, + "loss": 0.1236, + "step": 36899 + }, + { + "epoch": 0.6581528912353298, + "grad_norm": 0.21713735163211823, + "learning_rate": 1.5783255103338933e-05, + "loss": 0.1254, + "step": 36900 + }, + { + "epoch": 0.6581707273570435, + "grad_norm": 0.2968268394470215, + "learning_rate": 1.578180825335106e-05, + "loss": 0.103, + "step": 36901 + }, + { + "epoch": 0.6581885634787572, + "grad_norm": 0.27430063486099243, + "learning_rate": 1.5780361439096078e-05, + "loss": 0.1088, + "step": 36902 + }, + { + "epoch": 0.6582063996004709, + "grad_norm": 0.2858230769634247, + "learning_rate": 1.5778914660579604e-05, + "loss": 0.1356, + "step": 36903 + }, + { + "epoch": 0.6582242357221846, + "grad_norm": 0.2608380615711212, + "learning_rate": 1.5777467917807228e-05, + "loss": 0.1196, + "step": 36904 + }, + { + "epoch": 0.6582420718438983, + "grad_norm": 0.21127164363861084, + "learning_rate": 1.5776021210784586e-05, + "loss": 0.1055, + "step": 36905 + }, + { + "epoch": 0.658259907965612, + "grad_norm": 0.2470557540655136, + "learning_rate": 1.577457453951726e-05, + "loss": 0.1363, + "step": 36906 + }, + { + "epoch": 0.6582777440873256, + "grad_norm": 0.3971419334411621, + "learning_rate": 1.5773127904010882e-05, + "loss": 0.1071, + "step": 36907 + }, + { + "epoch": 0.6582955802090393, + "grad_norm": 0.28614407777786255, + "learning_rate": 1.577168130427103e-05, + "loss": 0.1279, + "step": 36908 + }, + { + "epoch": 0.658313416330753, + "grad_norm": 0.242633655667305, + "learning_rate": 1.5770234740303347e-05, + "loss": 0.1204, + "step": 36909 + }, + { + "epoch": 0.6583312524524667, + "grad_norm": 0.25513705611228943, + "learning_rate": 1.5768788212113416e-05, + "loss": 0.0872, + "step": 36910 + }, + { + "epoch": 0.6583490885741804, + "grad_norm": 0.3504336178302765, + "learning_rate": 1.5767341719706852e-05, + "loss": 0.1624, + "step": 36911 + }, + { + "epoch": 0.6583669246958941, + "grad_norm": 0.31027668714523315, + "learning_rate": 1.5765895263089254e-05, + "loss": 0.1621, + "step": 36912 + }, + { + "epoch": 0.6583847608176078, + "grad_norm": 0.2630026042461395, + "learning_rate": 1.5764448842266238e-05, + "loss": 0.1365, + "step": 36913 + }, + { + "epoch": 0.6584025969393215, + "grad_norm": 0.18853366374969482, + "learning_rate": 1.5763002457243405e-05, + "loss": 0.085, + "step": 36914 + }, + { + "epoch": 0.6584204330610353, + "grad_norm": 0.3125610947608948, + "learning_rate": 1.5761556108026372e-05, + "loss": 0.0983, + "step": 36915 + }, + { + "epoch": 0.658438269182749, + "grad_norm": 0.28227341175079346, + "learning_rate": 1.5760109794620737e-05, + "loss": 0.0968, + "step": 36916 + }, + { + "epoch": 0.6584561053044626, + "grad_norm": 0.2525133788585663, + "learning_rate": 1.57586635170321e-05, + "loss": 0.1481, + "step": 36917 + }, + { + "epoch": 0.6584739414261763, + "grad_norm": 0.2040320485830307, + "learning_rate": 1.5757217275266082e-05, + "loss": 0.0962, + "step": 36918 + }, + { + "epoch": 0.65849177754789, + "grad_norm": 0.36210864782333374, + "learning_rate": 1.5755771069328278e-05, + "loss": 0.1517, + "step": 36919 + }, + { + "epoch": 0.6585096136696037, + "grad_norm": 0.3569958209991455, + "learning_rate": 1.5754324899224304e-05, + "loss": 0.1592, + "step": 36920 + }, + { + "epoch": 0.6585274497913174, + "grad_norm": 0.2147177755832672, + "learning_rate": 1.575287876495975e-05, + "loss": 0.0895, + "step": 36921 + }, + { + "epoch": 0.6585452859130311, + "grad_norm": 0.2651868462562561, + "learning_rate": 1.5751432666540245e-05, + "loss": 0.1138, + "step": 36922 + }, + { + "epoch": 0.6585631220347448, + "grad_norm": 0.30353134870529175, + "learning_rate": 1.574998660397138e-05, + "loss": 0.1335, + "step": 36923 + }, + { + "epoch": 0.6585809581564585, + "grad_norm": 0.20026427507400513, + "learning_rate": 1.5748540577258757e-05, + "loss": 0.1472, + "step": 36924 + }, + { + "epoch": 0.6585987942781721, + "grad_norm": 0.21521052718162537, + "learning_rate": 1.5747094586407984e-05, + "loss": 0.1536, + "step": 36925 + }, + { + "epoch": 0.6586166303998858, + "grad_norm": 0.3330404460430145, + "learning_rate": 1.5745648631424666e-05, + "loss": 0.1049, + "step": 36926 + }, + { + "epoch": 0.6586344665215995, + "grad_norm": 0.24024979770183563, + "learning_rate": 1.5744202712314417e-05, + "loss": 0.1637, + "step": 36927 + }, + { + "epoch": 0.6586523026433132, + "grad_norm": 0.24170449376106262, + "learning_rate": 1.5742756829082832e-05, + "loss": 0.1349, + "step": 36928 + }, + { + "epoch": 0.6586701387650269, + "grad_norm": 0.2711646556854248, + "learning_rate": 1.5741310981735525e-05, + "loss": 0.111, + "step": 36929 + }, + { + "epoch": 0.6586879748867406, + "grad_norm": 0.19986176490783691, + "learning_rate": 1.5739865170278077e-05, + "loss": 0.1212, + "step": 36930 + }, + { + "epoch": 0.6587058110084543, + "grad_norm": 0.28152593970298767, + "learning_rate": 1.5738419394716127e-05, + "loss": 0.1741, + "step": 36931 + }, + { + "epoch": 0.6587236471301681, + "grad_norm": 0.26788750290870667, + "learning_rate": 1.5736973655055262e-05, + "loss": 0.0571, + "step": 36932 + }, + { + "epoch": 0.6587414832518818, + "grad_norm": 0.23063886165618896, + "learning_rate": 1.573552795130108e-05, + "loss": 0.0957, + "step": 36933 + }, + { + "epoch": 0.6587593193735954, + "grad_norm": 0.24991591274738312, + "learning_rate": 1.573408228345919e-05, + "loss": 0.1202, + "step": 36934 + }, + { + "epoch": 0.6587771554953091, + "grad_norm": 0.21629846096038818, + "learning_rate": 1.57326366515352e-05, + "loss": 0.0837, + "step": 36935 + }, + { + "epoch": 0.6587949916170228, + "grad_norm": 0.2798084616661072, + "learning_rate": 1.5731191055534714e-05, + "loss": 0.1531, + "step": 36936 + }, + { + "epoch": 0.6588128277387365, + "grad_norm": 0.23518231511116028, + "learning_rate": 1.5729745495463333e-05, + "loss": 0.1435, + "step": 36937 + }, + { + "epoch": 0.6588306638604502, + "grad_norm": 0.3445497453212738, + "learning_rate": 1.572829997132666e-05, + "loss": 0.1299, + "step": 36938 + }, + { + "epoch": 0.6588484999821639, + "grad_norm": 0.3493834137916565, + "learning_rate": 1.572685448313029e-05, + "loss": 0.1336, + "step": 36939 + }, + { + "epoch": 0.6588663361038776, + "grad_norm": 0.44469985365867615, + "learning_rate": 1.5725409030879845e-05, + "loss": 0.1948, + "step": 36940 + }, + { + "epoch": 0.6588841722255913, + "grad_norm": 0.27621257305145264, + "learning_rate": 1.5723963614580912e-05, + "loss": 0.0811, + "step": 36941 + }, + { + "epoch": 0.658902008347305, + "grad_norm": 0.23101843893527985, + "learning_rate": 1.5722518234239104e-05, + "loss": 0.1048, + "step": 36942 + }, + { + "epoch": 0.6589198444690186, + "grad_norm": 0.2903430759906769, + "learning_rate": 1.572107288986001e-05, + "loss": 0.1957, + "step": 36943 + }, + { + "epoch": 0.6589376805907323, + "grad_norm": 0.30021992325782776, + "learning_rate": 1.5719627581449255e-05, + "loss": 0.1363, + "step": 36944 + }, + { + "epoch": 0.658955516712446, + "grad_norm": 0.2059231847524643, + "learning_rate": 1.5718182309012427e-05, + "loss": 0.0921, + "step": 36945 + }, + { + "epoch": 0.6589733528341597, + "grad_norm": 0.24417108297348022, + "learning_rate": 1.5716737072555127e-05, + "loss": 0.0898, + "step": 36946 + }, + { + "epoch": 0.6589911889558734, + "grad_norm": 0.2771771550178528, + "learning_rate": 1.5715291872082958e-05, + "loss": 0.1144, + "step": 36947 + }, + { + "epoch": 0.6590090250775872, + "grad_norm": 0.23978090286254883, + "learning_rate": 1.5713846707601527e-05, + "loss": 0.0886, + "step": 36948 + }, + { + "epoch": 0.6590268611993009, + "grad_norm": 0.22668437659740448, + "learning_rate": 1.5712401579116434e-05, + "loss": 0.1347, + "step": 36949 + }, + { + "epoch": 0.6590446973210146, + "grad_norm": 0.3065158724784851, + "learning_rate": 1.571095648663328e-05, + "loss": 0.1225, + "step": 36950 + }, + { + "epoch": 0.6590625334427282, + "grad_norm": 0.34733688831329346, + "learning_rate": 1.5709511430157675e-05, + "loss": 0.1057, + "step": 36951 + }, + { + "epoch": 0.6590803695644419, + "grad_norm": 0.33316707611083984, + "learning_rate": 1.5708066409695193e-05, + "loss": 0.1937, + "step": 36952 + }, + { + "epoch": 0.6590982056861556, + "grad_norm": 0.2104162871837616, + "learning_rate": 1.5706621425251472e-05, + "loss": 0.1233, + "step": 36953 + }, + { + "epoch": 0.6591160418078693, + "grad_norm": 0.2924996614456177, + "learning_rate": 1.570517647683209e-05, + "loss": 0.1209, + "step": 36954 + }, + { + "epoch": 0.659133877929583, + "grad_norm": 0.302206814289093, + "learning_rate": 1.570373156444266e-05, + "loss": 0.1504, + "step": 36955 + }, + { + "epoch": 0.6591517140512967, + "grad_norm": 0.2522921860218048, + "learning_rate": 1.5702286688088762e-05, + "loss": 0.1255, + "step": 36956 + }, + { + "epoch": 0.6591695501730104, + "grad_norm": 0.24559594690799713, + "learning_rate": 1.5700841847776028e-05, + "loss": 0.0931, + "step": 36957 + }, + { + "epoch": 0.6591873862947241, + "grad_norm": 0.31043174862861633, + "learning_rate": 1.569939704351004e-05, + "loss": 0.1095, + "step": 36958 + }, + { + "epoch": 0.6592052224164378, + "grad_norm": 0.3222244381904602, + "learning_rate": 1.5697952275296397e-05, + "loss": 0.1059, + "step": 36959 + }, + { + "epoch": 0.6592230585381514, + "grad_norm": 0.2710152566432953, + "learning_rate": 1.5696507543140698e-05, + "loss": 0.1253, + "step": 36960 + }, + { + "epoch": 0.6592408946598651, + "grad_norm": 0.2671167552471161, + "learning_rate": 1.569506284704856e-05, + "loss": 0.1323, + "step": 36961 + }, + { + "epoch": 0.6592587307815788, + "grad_norm": 0.1917286366224289, + "learning_rate": 1.569361818702557e-05, + "loss": 0.111, + "step": 36962 + }, + { + "epoch": 0.6592765669032925, + "grad_norm": 0.2842090427875519, + "learning_rate": 1.569217356307733e-05, + "loss": 0.1303, + "step": 36963 + }, + { + "epoch": 0.6592944030250062, + "grad_norm": 0.2837993800640106, + "learning_rate": 1.5690728975209444e-05, + "loss": 0.1533, + "step": 36964 + }, + { + "epoch": 0.65931223914672, + "grad_norm": 0.3480733633041382, + "learning_rate": 1.5689284423427493e-05, + "loss": 0.1632, + "step": 36965 + }, + { + "epoch": 0.6593300752684337, + "grad_norm": 0.2921847999095917, + "learning_rate": 1.5687839907737108e-05, + "loss": 0.1025, + "step": 36966 + }, + { + "epoch": 0.6593479113901474, + "grad_norm": 0.31307971477508545, + "learning_rate": 1.5686395428143865e-05, + "loss": 0.1128, + "step": 36967 + }, + { + "epoch": 0.6593657475118611, + "grad_norm": 0.22050459682941437, + "learning_rate": 1.5684950984653368e-05, + "loss": 0.1043, + "step": 36968 + }, + { + "epoch": 0.6593835836335747, + "grad_norm": 0.19954946637153625, + "learning_rate": 1.5683506577271212e-05, + "loss": 0.1127, + "step": 36969 + }, + { + "epoch": 0.6594014197552884, + "grad_norm": 0.2509217858314514, + "learning_rate": 1.5682062206003017e-05, + "loss": 0.0912, + "step": 36970 + }, + { + "epoch": 0.6594192558770021, + "grad_norm": 0.18361979722976685, + "learning_rate": 1.5680617870854363e-05, + "loss": 0.0719, + "step": 36971 + }, + { + "epoch": 0.6594370919987158, + "grad_norm": 0.35508817434310913, + "learning_rate": 1.5679173571830852e-05, + "loss": 0.1202, + "step": 36972 + }, + { + "epoch": 0.6594549281204295, + "grad_norm": 0.28412824869155884, + "learning_rate": 1.5677729308938076e-05, + "loss": 0.1375, + "step": 36973 + }, + { + "epoch": 0.6594727642421432, + "grad_norm": 0.3273811340332031, + "learning_rate": 1.567628508218165e-05, + "loss": 0.1225, + "step": 36974 + }, + { + "epoch": 0.6594906003638569, + "grad_norm": 0.47485023736953735, + "learning_rate": 1.5674840891567162e-05, + "loss": 0.126, + "step": 36975 + }, + { + "epoch": 0.6595084364855706, + "grad_norm": 0.22489425539970398, + "learning_rate": 1.567339673710021e-05, + "loss": 0.1145, + "step": 36976 + }, + { + "epoch": 0.6595262726072842, + "grad_norm": 0.31754279136657715, + "learning_rate": 1.5671952618786398e-05, + "loss": 0.1423, + "step": 36977 + }, + { + "epoch": 0.6595441087289979, + "grad_norm": 0.2403276413679123, + "learning_rate": 1.567050853663131e-05, + "loss": 0.1292, + "step": 36978 + }, + { + "epoch": 0.6595619448507116, + "grad_norm": 0.46837303042411804, + "learning_rate": 1.5669064490640562e-05, + "loss": 0.1654, + "step": 36979 + }, + { + "epoch": 0.6595797809724253, + "grad_norm": 0.24127860367298126, + "learning_rate": 1.566762048081974e-05, + "loss": 0.1551, + "step": 36980 + }, + { + "epoch": 0.659597617094139, + "grad_norm": 0.28468719124794006, + "learning_rate": 1.5666176507174442e-05, + "loss": 0.1218, + "step": 36981 + }, + { + "epoch": 0.6596154532158528, + "grad_norm": 0.2550612986087799, + "learning_rate": 1.5664732569710268e-05, + "loss": 0.1352, + "step": 36982 + }, + { + "epoch": 0.6596332893375665, + "grad_norm": 0.23893840610980988, + "learning_rate": 1.566328866843281e-05, + "loss": 0.1138, + "step": 36983 + }, + { + "epoch": 0.6596511254592802, + "grad_norm": 0.2854664921760559, + "learning_rate": 1.5661844803347684e-05, + "loss": 0.1472, + "step": 36984 + }, + { + "epoch": 0.6596689615809939, + "grad_norm": 0.3304499089717865, + "learning_rate": 1.5660400974460465e-05, + "loss": 0.1413, + "step": 36985 + }, + { + "epoch": 0.6596867977027076, + "grad_norm": 0.2861810326576233, + "learning_rate": 1.565895718177675e-05, + "loss": 0.123, + "step": 36986 + }, + { + "epoch": 0.6597046338244212, + "grad_norm": 0.1872393637895584, + "learning_rate": 1.5657513425302157e-05, + "loss": 0.1225, + "step": 36987 + }, + { + "epoch": 0.6597224699461349, + "grad_norm": 0.2831312119960785, + "learning_rate": 1.5656069705042266e-05, + "loss": 0.1424, + "step": 36988 + }, + { + "epoch": 0.6597403060678486, + "grad_norm": 0.27119460701942444, + "learning_rate": 1.565462602100267e-05, + "loss": 0.1293, + "step": 36989 + }, + { + "epoch": 0.6597581421895623, + "grad_norm": 0.23875048756599426, + "learning_rate": 1.5653182373188976e-05, + "loss": 0.1001, + "step": 36990 + }, + { + "epoch": 0.659775978311276, + "grad_norm": 0.2693100869655609, + "learning_rate": 1.5651738761606766e-05, + "loss": 0.1451, + "step": 36991 + }, + { + "epoch": 0.6597938144329897, + "grad_norm": 0.24230840802192688, + "learning_rate": 1.565029518626166e-05, + "loss": 0.1177, + "step": 36992 + }, + { + "epoch": 0.6598116505547034, + "grad_norm": 0.43695560097694397, + "learning_rate": 1.564885164715923e-05, + "loss": 0.1176, + "step": 36993 + }, + { + "epoch": 0.659829486676417, + "grad_norm": 0.26083457469940186, + "learning_rate": 1.5647408144305087e-05, + "loss": 0.1224, + "step": 36994 + }, + { + "epoch": 0.6598473227981307, + "grad_norm": 0.37520378828048706, + "learning_rate": 1.564596467770481e-05, + "loss": 0.0835, + "step": 36995 + }, + { + "epoch": 0.6598651589198444, + "grad_norm": 0.2315322607755661, + "learning_rate": 1.564452124736401e-05, + "loss": 0.1009, + "step": 36996 + }, + { + "epoch": 0.6598829950415581, + "grad_norm": 0.2620389461517334, + "learning_rate": 1.564307785328828e-05, + "loss": 0.1107, + "step": 36997 + }, + { + "epoch": 0.6599008311632718, + "grad_norm": 0.28540104627609253, + "learning_rate": 1.564163449548321e-05, + "loss": 0.1022, + "step": 36998 + }, + { + "epoch": 0.6599186672849856, + "grad_norm": 0.3569800555706024, + "learning_rate": 1.5640191173954393e-05, + "loss": 0.1158, + "step": 36999 + }, + { + "epoch": 0.6599365034066993, + "grad_norm": 0.15573696792125702, + "learning_rate": 1.5638747888707434e-05, + "loss": 0.0525, + "step": 37000 + }, + { + "epoch": 0.6599365034066993, + "eval_loss": 0.11902497708797455, + "eval_runtime": 106.7536, + "eval_samples_per_second": 9.592, + "eval_steps_per_second": 1.602, + "step": 37000 + }, + { + "epoch": 0.659954339528413, + "grad_norm": 0.25685834884643555, + "learning_rate": 1.563730463974792e-05, + "loss": 0.0974, + "step": 37001 + }, + { + "epoch": 0.6599721756501267, + "grad_norm": 0.20308485627174377, + "learning_rate": 1.5635861427081443e-05, + "loss": 0.1196, + "step": 37002 + }, + { + "epoch": 0.6599900117718404, + "grad_norm": 0.26290422677993774, + "learning_rate": 1.563441825071361e-05, + "loss": 0.0959, + "step": 37003 + }, + { + "epoch": 0.660007847893554, + "grad_norm": 0.28677183389663696, + "learning_rate": 1.563297511064999e-05, + "loss": 0.1042, + "step": 37004 + }, + { + "epoch": 0.6600256840152677, + "grad_norm": 0.30590012669563293, + "learning_rate": 1.563153200689621e-05, + "loss": 0.1691, + "step": 37005 + }, + { + "epoch": 0.6600435201369814, + "grad_norm": 0.29537612199783325, + "learning_rate": 1.5630088939457847e-05, + "loss": 0.1578, + "step": 37006 + }, + { + "epoch": 0.6600613562586951, + "grad_norm": 0.24706660211086273, + "learning_rate": 1.5628645908340492e-05, + "loss": 0.1418, + "step": 37007 + }, + { + "epoch": 0.6600791923804088, + "grad_norm": 0.299190878868103, + "learning_rate": 1.5627202913549734e-05, + "loss": 0.1556, + "step": 37008 + }, + { + "epoch": 0.6600970285021225, + "grad_norm": 0.23550699651241302, + "learning_rate": 1.5625759955091183e-05, + "loss": 0.119, + "step": 37009 + }, + { + "epoch": 0.6601148646238362, + "grad_norm": 0.27453306317329407, + "learning_rate": 1.5624317032970424e-05, + "loss": 0.1393, + "step": 37010 + }, + { + "epoch": 0.6601327007455499, + "grad_norm": 0.39263662695884705, + "learning_rate": 1.562287414719305e-05, + "loss": 0.1517, + "step": 37011 + }, + { + "epoch": 0.6601505368672635, + "grad_norm": 0.2503941059112549, + "learning_rate": 1.562143129776465e-05, + "loss": 0.1196, + "step": 37012 + }, + { + "epoch": 0.6601683729889772, + "grad_norm": 0.26782235503196716, + "learning_rate": 1.561998848469083e-05, + "loss": 0.1284, + "step": 37013 + }, + { + "epoch": 0.6601862091106909, + "grad_norm": 0.36488696932792664, + "learning_rate": 1.561854570797717e-05, + "loss": 0.1557, + "step": 37014 + }, + { + "epoch": 0.6602040452324046, + "grad_norm": 0.21011121571063995, + "learning_rate": 1.5617102967629272e-05, + "loss": 0.0765, + "step": 37015 + }, + { + "epoch": 0.6602218813541184, + "grad_norm": 0.36573705077171326, + "learning_rate": 1.5615660263652714e-05, + "loss": 0.1066, + "step": 37016 + }, + { + "epoch": 0.6602397174758321, + "grad_norm": 0.2442290037870407, + "learning_rate": 1.56142175960531e-05, + "loss": 0.1307, + "step": 37017 + }, + { + "epoch": 0.6602575535975458, + "grad_norm": 0.23664802312850952, + "learning_rate": 1.5612774964836026e-05, + "loss": 0.1017, + "step": 37018 + }, + { + "epoch": 0.6602753897192595, + "grad_norm": 0.18912160396575928, + "learning_rate": 1.5611332370007076e-05, + "loss": 0.0713, + "step": 37019 + }, + { + "epoch": 0.6602932258409732, + "grad_norm": 0.2211618274450302, + "learning_rate": 1.5609889811571846e-05, + "loss": 0.1131, + "step": 37020 + }, + { + "epoch": 0.6603110619626869, + "grad_norm": 0.31006449460983276, + "learning_rate": 1.5608447289535915e-05, + "loss": 0.1202, + "step": 37021 + }, + { + "epoch": 0.6603288980844005, + "grad_norm": 0.2946932315826416, + "learning_rate": 1.5607004803904902e-05, + "loss": 0.1158, + "step": 37022 + }, + { + "epoch": 0.6603467342061142, + "grad_norm": 0.3183962106704712, + "learning_rate": 1.5605562354684372e-05, + "loss": 0.1113, + "step": 37023 + }, + { + "epoch": 0.6603645703278279, + "grad_norm": 0.3085103929042816, + "learning_rate": 1.5604119941879936e-05, + "loss": 0.0918, + "step": 37024 + }, + { + "epoch": 0.6603824064495416, + "grad_norm": 0.22984778881072998, + "learning_rate": 1.5602677565497166e-05, + "loss": 0.0977, + "step": 37025 + }, + { + "epoch": 0.6604002425712553, + "grad_norm": 0.2717246413230896, + "learning_rate": 1.560123522554167e-05, + "loss": 0.1374, + "step": 37026 + }, + { + "epoch": 0.660418078692969, + "grad_norm": 0.21724514663219452, + "learning_rate": 1.5599792922019037e-05, + "loss": 0.1205, + "step": 37027 + }, + { + "epoch": 0.6604359148146827, + "grad_norm": 0.25974664092063904, + "learning_rate": 1.5598350654934846e-05, + "loss": 0.1527, + "step": 37028 + }, + { + "epoch": 0.6604537509363964, + "grad_norm": 0.3335629105567932, + "learning_rate": 1.5596908424294695e-05, + "loss": 0.1591, + "step": 37029 + }, + { + "epoch": 0.66047158705811, + "grad_norm": 0.31999802589416504, + "learning_rate": 1.5595466230104178e-05, + "loss": 0.1653, + "step": 37030 + }, + { + "epoch": 0.6604894231798237, + "grad_norm": 0.23707224428653717, + "learning_rate": 1.5594024072368878e-05, + "loss": 0.1449, + "step": 37031 + }, + { + "epoch": 0.6605072593015374, + "grad_norm": 0.3162710666656494, + "learning_rate": 1.5592581951094392e-05, + "loss": 0.1135, + "step": 37032 + }, + { + "epoch": 0.6605250954232512, + "grad_norm": 0.2938390076160431, + "learning_rate": 1.5591139866286315e-05, + "loss": 0.106, + "step": 37033 + }, + { + "epoch": 0.6605429315449649, + "grad_norm": 0.28506210446357727, + "learning_rate": 1.558969781795021e-05, + "loss": 0.1314, + "step": 37034 + }, + { + "epoch": 0.6605607676666786, + "grad_norm": 0.22351442277431488, + "learning_rate": 1.5588255806091703e-05, + "loss": 0.1647, + "step": 37035 + }, + { + "epoch": 0.6605786037883923, + "grad_norm": 0.25973811745643616, + "learning_rate": 1.558681383071637e-05, + "loss": 0.1155, + "step": 37036 + }, + { + "epoch": 0.660596439910106, + "grad_norm": 0.24580179154872894, + "learning_rate": 1.5585371891829788e-05, + "loss": 0.1224, + "step": 37037 + }, + { + "epoch": 0.6606142760318197, + "grad_norm": 0.24619829654693604, + "learning_rate": 1.5583929989437554e-05, + "loss": 0.113, + "step": 37038 + }, + { + "epoch": 0.6606321121535333, + "grad_norm": 0.22422625124454498, + "learning_rate": 1.5582488123545264e-05, + "loss": 0.1338, + "step": 37039 + }, + { + "epoch": 0.660649948275247, + "grad_norm": 0.2383342683315277, + "learning_rate": 1.5581046294158504e-05, + "loss": 0.0967, + "step": 37040 + }, + { + "epoch": 0.6606677843969607, + "grad_norm": 0.2628973424434662, + "learning_rate": 1.5579604501282867e-05, + "loss": 0.1096, + "step": 37041 + }, + { + "epoch": 0.6606856205186744, + "grad_norm": 0.29809632897377014, + "learning_rate": 1.5578162744923934e-05, + "loss": 0.1553, + "step": 37042 + }, + { + "epoch": 0.6607034566403881, + "grad_norm": 0.3029145300388336, + "learning_rate": 1.5576721025087283e-05, + "loss": 0.1229, + "step": 37043 + }, + { + "epoch": 0.6607212927621018, + "grad_norm": 0.27769985795021057, + "learning_rate": 1.5575279341778532e-05, + "loss": 0.1145, + "step": 37044 + }, + { + "epoch": 0.6607391288838155, + "grad_norm": 0.2558722198009491, + "learning_rate": 1.5573837695003243e-05, + "loss": 0.1277, + "step": 37045 + }, + { + "epoch": 0.6607569650055292, + "grad_norm": 0.28375545144081116, + "learning_rate": 1.557239608476702e-05, + "loss": 0.1434, + "step": 37046 + }, + { + "epoch": 0.6607748011272429, + "grad_norm": 0.2731107175350189, + "learning_rate": 1.5570954511075443e-05, + "loss": 0.1328, + "step": 37047 + }, + { + "epoch": 0.6607926372489565, + "grad_norm": 0.2662769854068756, + "learning_rate": 1.5569512973934106e-05, + "loss": 0.0956, + "step": 37048 + }, + { + "epoch": 0.6608104733706702, + "grad_norm": 0.34471702575683594, + "learning_rate": 1.5568071473348596e-05, + "loss": 0.1335, + "step": 37049 + }, + { + "epoch": 0.660828309492384, + "grad_norm": 0.22524121403694153, + "learning_rate": 1.55666300093245e-05, + "loss": 0.071, + "step": 37050 + }, + { + "epoch": 0.6608461456140977, + "grad_norm": 0.22285377979278564, + "learning_rate": 1.556518858186739e-05, + "loss": 0.1395, + "step": 37051 + }, + { + "epoch": 0.6608639817358114, + "grad_norm": 0.27272310853004456, + "learning_rate": 1.5563747190982877e-05, + "loss": 0.1536, + "step": 37052 + }, + { + "epoch": 0.6608818178575251, + "grad_norm": 0.2512317895889282, + "learning_rate": 1.5562305836676545e-05, + "loss": 0.1238, + "step": 37053 + }, + { + "epoch": 0.6608996539792388, + "grad_norm": 0.20758996903896332, + "learning_rate": 1.556086451895397e-05, + "loss": 0.1195, + "step": 37054 + }, + { + "epoch": 0.6609174901009525, + "grad_norm": 0.2207065373659134, + "learning_rate": 1.555942323782075e-05, + "loss": 0.0982, + "step": 37055 + }, + { + "epoch": 0.6609353262226662, + "grad_norm": 0.22319954633712769, + "learning_rate": 1.5557981993282453e-05, + "loss": 0.1404, + "step": 37056 + }, + { + "epoch": 0.6609531623443798, + "grad_norm": 0.2827005982398987, + "learning_rate": 1.555654078534469e-05, + "loss": 0.15, + "step": 37057 + }, + { + "epoch": 0.6609709984660935, + "grad_norm": 0.2364160716533661, + "learning_rate": 1.555509961401303e-05, + "loss": 0.1656, + "step": 37058 + }, + { + "epoch": 0.6609888345878072, + "grad_norm": 0.2683470845222473, + "learning_rate": 1.5553658479293073e-05, + "loss": 0.096, + "step": 37059 + }, + { + "epoch": 0.6610066707095209, + "grad_norm": 0.23723892867565155, + "learning_rate": 1.5552217381190386e-05, + "loss": 0.1434, + "step": 37060 + }, + { + "epoch": 0.6610245068312346, + "grad_norm": 0.30018407106399536, + "learning_rate": 1.555077631971058e-05, + "loss": 0.1035, + "step": 37061 + }, + { + "epoch": 0.6610423429529483, + "grad_norm": 0.2841120660305023, + "learning_rate": 1.554933529485923e-05, + "loss": 0.0937, + "step": 37062 + }, + { + "epoch": 0.661060179074662, + "grad_norm": 0.2536258101463318, + "learning_rate": 1.554789430664192e-05, + "loss": 0.1138, + "step": 37063 + }, + { + "epoch": 0.6610780151963757, + "grad_norm": 0.24432630836963654, + "learning_rate": 1.5546453355064223e-05, + "loss": 0.0986, + "step": 37064 + }, + { + "epoch": 0.6610958513180893, + "grad_norm": 0.315021276473999, + "learning_rate": 1.554501244013175e-05, + "loss": 0.1183, + "step": 37065 + }, + { + "epoch": 0.6611136874398031, + "grad_norm": 0.25056859850883484, + "learning_rate": 1.5543571561850068e-05, + "loss": 0.1317, + "step": 37066 + }, + { + "epoch": 0.6611315235615168, + "grad_norm": 0.2400984764099121, + "learning_rate": 1.5542130720224773e-05, + "loss": 0.1257, + "step": 37067 + }, + { + "epoch": 0.6611493596832305, + "grad_norm": 0.197922483086586, + "learning_rate": 1.554068991526145e-05, + "loss": 0.0692, + "step": 37068 + }, + { + "epoch": 0.6611671958049442, + "grad_norm": 0.26362791657447815, + "learning_rate": 1.5539249146965667e-05, + "loss": 0.0845, + "step": 37069 + }, + { + "epoch": 0.6611850319266579, + "grad_norm": 0.3231179714202881, + "learning_rate": 1.5537808415343033e-05, + "loss": 0.1395, + "step": 37070 + }, + { + "epoch": 0.6612028680483716, + "grad_norm": 0.36305803060531616, + "learning_rate": 1.5536367720399122e-05, + "loss": 0.0883, + "step": 37071 + }, + { + "epoch": 0.6612207041700853, + "grad_norm": 0.2532045543193817, + "learning_rate": 1.5534927062139516e-05, + "loss": 0.1132, + "step": 37072 + }, + { + "epoch": 0.661238540291799, + "grad_norm": 0.2202557772397995, + "learning_rate": 1.553348644056979e-05, + "loss": 0.0853, + "step": 37073 + }, + { + "epoch": 0.6612563764135126, + "grad_norm": 0.27331575751304626, + "learning_rate": 1.5532045855695553e-05, + "loss": 0.0891, + "step": 37074 + }, + { + "epoch": 0.6612742125352263, + "grad_norm": 0.2807004749774933, + "learning_rate": 1.553060530752238e-05, + "loss": 0.1072, + "step": 37075 + }, + { + "epoch": 0.66129204865694, + "grad_norm": 0.2401042878627777, + "learning_rate": 1.5529164796055847e-05, + "loss": 0.1217, + "step": 37076 + }, + { + "epoch": 0.6613098847786537, + "grad_norm": 0.24894565343856812, + "learning_rate": 1.5527724321301534e-05, + "loss": 0.114, + "step": 37077 + }, + { + "epoch": 0.6613277209003674, + "grad_norm": 0.2939464747905731, + "learning_rate": 1.5526283883265043e-05, + "loss": 0.1005, + "step": 37078 + }, + { + "epoch": 0.6613455570220811, + "grad_norm": 0.3336678147315979, + "learning_rate": 1.5524843481951944e-05, + "loss": 0.2024, + "step": 37079 + }, + { + "epoch": 0.6613633931437948, + "grad_norm": 0.31239572167396545, + "learning_rate": 1.552340311736783e-05, + "loss": 0.1907, + "step": 37080 + }, + { + "epoch": 0.6613812292655085, + "grad_norm": 0.32804518938064575, + "learning_rate": 1.5521962789518273e-05, + "loss": 0.1463, + "step": 37081 + }, + { + "epoch": 0.6613990653872222, + "grad_norm": 0.24787935614585876, + "learning_rate": 1.552052249840886e-05, + "loss": 0.1263, + "step": 37082 + }, + { + "epoch": 0.661416901508936, + "grad_norm": 0.2364760786294937, + "learning_rate": 1.551908224404518e-05, + "loss": 0.1311, + "step": 37083 + }, + { + "epoch": 0.6614347376306496, + "grad_norm": 0.1966044157743454, + "learning_rate": 1.5517642026432815e-05, + "loss": 0.0789, + "step": 37084 + }, + { + "epoch": 0.6614525737523633, + "grad_norm": 0.28517889976501465, + "learning_rate": 1.551620184557734e-05, + "loss": 0.0986, + "step": 37085 + }, + { + "epoch": 0.661470409874077, + "grad_norm": 0.2824646830558777, + "learning_rate": 1.5514761701484338e-05, + "loss": 0.1125, + "step": 37086 + }, + { + "epoch": 0.6614882459957907, + "grad_norm": 0.2853250801563263, + "learning_rate": 1.5513321594159396e-05, + "loss": 0.111, + "step": 37087 + }, + { + "epoch": 0.6615060821175044, + "grad_norm": 0.2811721861362457, + "learning_rate": 1.5511881523608105e-05, + "loss": 0.1776, + "step": 37088 + }, + { + "epoch": 0.6615239182392181, + "grad_norm": 0.2946011424064636, + "learning_rate": 1.5510441489836035e-05, + "loss": 0.1037, + "step": 37089 + }, + { + "epoch": 0.6615417543609318, + "grad_norm": 0.42056822776794434, + "learning_rate": 1.5509001492848763e-05, + "loss": 0.1161, + "step": 37090 + }, + { + "epoch": 0.6615595904826455, + "grad_norm": 0.19736723601818085, + "learning_rate": 1.550756153265189e-05, + "loss": 0.0645, + "step": 37091 + }, + { + "epoch": 0.6615774266043591, + "grad_norm": 0.22733968496322632, + "learning_rate": 1.5506121609250986e-05, + "loss": 0.0844, + "step": 37092 + }, + { + "epoch": 0.6615952627260728, + "grad_norm": 0.30161646008491516, + "learning_rate": 1.5504681722651633e-05, + "loss": 0.1301, + "step": 37093 + }, + { + "epoch": 0.6616130988477865, + "grad_norm": 0.2754213511943817, + "learning_rate": 1.5503241872859412e-05, + "loss": 0.1739, + "step": 37094 + }, + { + "epoch": 0.6616309349695002, + "grad_norm": 0.33259668946266174, + "learning_rate": 1.5501802059879896e-05, + "loss": 0.1814, + "step": 37095 + }, + { + "epoch": 0.6616487710912139, + "grad_norm": 0.21730954945087433, + "learning_rate": 1.5500362283718687e-05, + "loss": 0.1063, + "step": 37096 + }, + { + "epoch": 0.6616666072129276, + "grad_norm": 0.2618711590766907, + "learning_rate": 1.5498922544381355e-05, + "loss": 0.0734, + "step": 37097 + }, + { + "epoch": 0.6616844433346413, + "grad_norm": 0.2211068868637085, + "learning_rate": 1.5497482841873483e-05, + "loss": 0.1394, + "step": 37098 + }, + { + "epoch": 0.661702279456355, + "grad_norm": 0.26202213764190674, + "learning_rate": 1.5496043176200637e-05, + "loss": 0.1552, + "step": 37099 + }, + { + "epoch": 0.6617201155780688, + "grad_norm": 0.33530083298683167, + "learning_rate": 1.5494603547368415e-05, + "loss": 0.1059, + "step": 37100 + }, + { + "epoch": 0.6617379516997824, + "grad_norm": 0.28246310353279114, + "learning_rate": 1.54931639553824e-05, + "loss": 0.1759, + "step": 37101 + }, + { + "epoch": 0.6617557878214961, + "grad_norm": 0.3158358335494995, + "learning_rate": 1.5491724400248165e-05, + "loss": 0.1016, + "step": 37102 + }, + { + "epoch": 0.6617736239432098, + "grad_norm": 0.26427677273750305, + "learning_rate": 1.549028488197128e-05, + "loss": 0.0545, + "step": 37103 + }, + { + "epoch": 0.6617914600649235, + "grad_norm": 0.2480769008398056, + "learning_rate": 1.5488845400557344e-05, + "loss": 0.1519, + "step": 37104 + }, + { + "epoch": 0.6618092961866372, + "grad_norm": 0.2479245662689209, + "learning_rate": 1.548740595601193e-05, + "loss": 0.1457, + "step": 37105 + }, + { + "epoch": 0.6618271323083509, + "grad_norm": 0.2954351007938385, + "learning_rate": 1.548596654834061e-05, + "loss": 0.1114, + "step": 37106 + }, + { + "epoch": 0.6618449684300646, + "grad_norm": 0.20915383100509644, + "learning_rate": 1.5484527177548973e-05, + "loss": 0.0931, + "step": 37107 + }, + { + "epoch": 0.6618628045517783, + "grad_norm": 0.24162811040878296, + "learning_rate": 1.5483087843642585e-05, + "loss": 0.133, + "step": 37108 + }, + { + "epoch": 0.661880640673492, + "grad_norm": 0.23137404024600983, + "learning_rate": 1.5481648546627046e-05, + "loss": 0.0609, + "step": 37109 + }, + { + "epoch": 0.6618984767952056, + "grad_norm": 0.3720117509365082, + "learning_rate": 1.5480209286507928e-05, + "loss": 0.1066, + "step": 37110 + }, + { + "epoch": 0.6619163129169193, + "grad_norm": 0.242386132478714, + "learning_rate": 1.5478770063290803e-05, + "loss": 0.1062, + "step": 37111 + }, + { + "epoch": 0.661934149038633, + "grad_norm": 0.28530818223953247, + "learning_rate": 1.5477330876981248e-05, + "loss": 0.1329, + "step": 37112 + }, + { + "epoch": 0.6619519851603467, + "grad_norm": 0.32816416025161743, + "learning_rate": 1.5475891727584853e-05, + "loss": 0.1362, + "step": 37113 + }, + { + "epoch": 0.6619698212820604, + "grad_norm": 0.32355543971061707, + "learning_rate": 1.547445261510719e-05, + "loss": 0.1703, + "step": 37114 + }, + { + "epoch": 0.6619876574037741, + "grad_norm": 0.2863290011882782, + "learning_rate": 1.5473013539553843e-05, + "loss": 0.0849, + "step": 37115 + }, + { + "epoch": 0.6620054935254878, + "grad_norm": 0.2806033194065094, + "learning_rate": 1.5471574500930374e-05, + "loss": 0.0936, + "step": 37116 + }, + { + "epoch": 0.6620233296472016, + "grad_norm": 0.2882058918476105, + "learning_rate": 1.5470135499242383e-05, + "loss": 0.1492, + "step": 37117 + }, + { + "epoch": 0.6620411657689153, + "grad_norm": 0.251399964094162, + "learning_rate": 1.546869653449544e-05, + "loss": 0.0901, + "step": 37118 + }, + { + "epoch": 0.6620590018906289, + "grad_norm": 0.26312190294265747, + "learning_rate": 1.546725760669512e-05, + "loss": 0.1041, + "step": 37119 + }, + { + "epoch": 0.6620768380123426, + "grad_norm": 0.25243645906448364, + "learning_rate": 1.5465818715847e-05, + "loss": 0.1312, + "step": 37120 + }, + { + "epoch": 0.6620946741340563, + "grad_norm": 0.23666419088840485, + "learning_rate": 1.5464379861956658e-05, + "loss": 0.1287, + "step": 37121 + }, + { + "epoch": 0.66211251025577, + "grad_norm": 0.2283872812986374, + "learning_rate": 1.546294104502967e-05, + "loss": 0.1346, + "step": 37122 + }, + { + "epoch": 0.6621303463774837, + "grad_norm": 0.21539998054504395, + "learning_rate": 1.5461502265071625e-05, + "loss": 0.1354, + "step": 37123 + }, + { + "epoch": 0.6621481824991974, + "grad_norm": 0.2578134536743164, + "learning_rate": 1.5460063522088093e-05, + "loss": 0.1418, + "step": 37124 + }, + { + "epoch": 0.6621660186209111, + "grad_norm": 0.32014885544776917, + "learning_rate": 1.5458624816084634e-05, + "loss": 0.1148, + "step": 37125 + }, + { + "epoch": 0.6621838547426248, + "grad_norm": 0.23078584671020508, + "learning_rate": 1.5457186147066854e-05, + "loss": 0.1244, + "step": 37126 + }, + { + "epoch": 0.6622016908643384, + "grad_norm": 0.3438572287559509, + "learning_rate": 1.545574751504032e-05, + "loss": 0.1541, + "step": 37127 + }, + { + "epoch": 0.6622195269860521, + "grad_norm": 0.2518502175807953, + "learning_rate": 1.5454308920010596e-05, + "loss": 0.1101, + "step": 37128 + }, + { + "epoch": 0.6622373631077658, + "grad_norm": 0.388161838054657, + "learning_rate": 1.5452870361983264e-05, + "loss": 0.1676, + "step": 37129 + }, + { + "epoch": 0.6622551992294795, + "grad_norm": 0.2901150584220886, + "learning_rate": 1.5451431840963916e-05, + "loss": 0.1485, + "step": 37130 + }, + { + "epoch": 0.6622730353511932, + "grad_norm": 0.23931673169136047, + "learning_rate": 1.544999335695811e-05, + "loss": 0.071, + "step": 37131 + }, + { + "epoch": 0.6622908714729069, + "grad_norm": 0.301135390996933, + "learning_rate": 1.5448554909971433e-05, + "loss": 0.1609, + "step": 37132 + }, + { + "epoch": 0.6623087075946206, + "grad_norm": 0.266620397567749, + "learning_rate": 1.544711650000945e-05, + "loss": 0.1494, + "step": 37133 + }, + { + "epoch": 0.6623265437163344, + "grad_norm": 0.18081001937389374, + "learning_rate": 1.544567812707774e-05, + "loss": 0.1484, + "step": 37134 + }, + { + "epoch": 0.6623443798380481, + "grad_norm": 0.22067302465438843, + "learning_rate": 1.5444239791181885e-05, + "loss": 0.0966, + "step": 37135 + }, + { + "epoch": 0.6623622159597617, + "grad_norm": 0.2599610388278961, + "learning_rate": 1.5442801492327462e-05, + "loss": 0.1028, + "step": 37136 + }, + { + "epoch": 0.6623800520814754, + "grad_norm": 0.20911681652069092, + "learning_rate": 1.5441363230520043e-05, + "loss": 0.0843, + "step": 37137 + }, + { + "epoch": 0.6623978882031891, + "grad_norm": 0.306350976228714, + "learning_rate": 1.5439925005765187e-05, + "loss": 0.1257, + "step": 37138 + }, + { + "epoch": 0.6624157243249028, + "grad_norm": 0.24656684696674347, + "learning_rate": 1.54384868180685e-05, + "loss": 0.1086, + "step": 37139 + }, + { + "epoch": 0.6624335604466165, + "grad_norm": 0.32594624161720276, + "learning_rate": 1.5437048667435534e-05, + "loss": 0.134, + "step": 37140 + }, + { + "epoch": 0.6624513965683302, + "grad_norm": 0.5861173272132874, + "learning_rate": 1.543561055387187e-05, + "loss": 0.1653, + "step": 37141 + }, + { + "epoch": 0.6624692326900439, + "grad_norm": 0.2312593162059784, + "learning_rate": 1.543417247738308e-05, + "loss": 0.0929, + "step": 37142 + }, + { + "epoch": 0.6624870688117576, + "grad_norm": 0.3028632700443268, + "learning_rate": 1.543273443797474e-05, + "loss": 0.0984, + "step": 37143 + }, + { + "epoch": 0.6625049049334713, + "grad_norm": 0.27155882120132446, + "learning_rate": 1.543129643565243e-05, + "loss": 0.1334, + "step": 37144 + }, + { + "epoch": 0.6625227410551849, + "grad_norm": 0.29891037940979004, + "learning_rate": 1.5429858470421727e-05, + "loss": 0.132, + "step": 37145 + }, + { + "epoch": 0.6625405771768986, + "grad_norm": 0.2779270112514496, + "learning_rate": 1.5428420542288193e-05, + "loss": 0.1685, + "step": 37146 + }, + { + "epoch": 0.6625584132986123, + "grad_norm": 0.23913122713565826, + "learning_rate": 1.54269826512574e-05, + "loss": 0.0875, + "step": 37147 + }, + { + "epoch": 0.662576249420326, + "grad_norm": 0.43862831592559814, + "learning_rate": 1.5425544797334933e-05, + "loss": 0.1914, + "step": 37148 + }, + { + "epoch": 0.6625940855420397, + "grad_norm": 0.24144157767295837, + "learning_rate": 1.542410698052636e-05, + "loss": 0.1196, + "step": 37149 + }, + { + "epoch": 0.6626119216637534, + "grad_norm": 0.2764016389846802, + "learning_rate": 1.542266920083726e-05, + "loss": 0.1084, + "step": 37150 + }, + { + "epoch": 0.6626297577854672, + "grad_norm": 0.2363121211528778, + "learning_rate": 1.5421231458273195e-05, + "loss": 0.0701, + "step": 37151 + }, + { + "epoch": 0.6626475939071809, + "grad_norm": 0.2862124443054199, + "learning_rate": 1.541979375283975e-05, + "loss": 0.1192, + "step": 37152 + }, + { + "epoch": 0.6626654300288946, + "grad_norm": 0.19498895108699799, + "learning_rate": 1.54183560845425e-05, + "loss": 0.104, + "step": 37153 + }, + { + "epoch": 0.6626832661506082, + "grad_norm": 0.2652069628238678, + "learning_rate": 1.5416918453387006e-05, + "loss": 0.1861, + "step": 37154 + }, + { + "epoch": 0.6627011022723219, + "grad_norm": 0.22711940109729767, + "learning_rate": 1.5415480859378836e-05, + "loss": 0.131, + "step": 37155 + }, + { + "epoch": 0.6627189383940356, + "grad_norm": 0.19824570417404175, + "learning_rate": 1.541404330252358e-05, + "loss": 0.1005, + "step": 37156 + }, + { + "epoch": 0.6627367745157493, + "grad_norm": 0.1812756061553955, + "learning_rate": 1.5412605782826805e-05, + "loss": 0.1215, + "step": 37157 + }, + { + "epoch": 0.662754610637463, + "grad_norm": 0.2489568591117859, + "learning_rate": 1.5411168300294086e-05, + "loss": 0.0906, + "step": 37158 + }, + { + "epoch": 0.6627724467591767, + "grad_norm": 0.2969771921634674, + "learning_rate": 1.540973085493099e-05, + "loss": 0.191, + "step": 37159 + }, + { + "epoch": 0.6627902828808904, + "grad_norm": 0.21314893662929535, + "learning_rate": 1.5408293446743073e-05, + "loss": 0.1119, + "step": 37160 + }, + { + "epoch": 0.6628081190026041, + "grad_norm": 0.27313944697380066, + "learning_rate": 1.5406856075735936e-05, + "loss": 0.1526, + "step": 37161 + }, + { + "epoch": 0.6628259551243177, + "grad_norm": 0.2530764043331146, + "learning_rate": 1.5405418741915138e-05, + "loss": 0.1398, + "step": 37162 + }, + { + "epoch": 0.6628437912460314, + "grad_norm": 0.2006148099899292, + "learning_rate": 1.5403981445286252e-05, + "loss": 0.1064, + "step": 37163 + }, + { + "epoch": 0.6628616273677451, + "grad_norm": 0.27970704436302185, + "learning_rate": 1.540254418585484e-05, + "loss": 0.1132, + "step": 37164 + }, + { + "epoch": 0.6628794634894588, + "grad_norm": 0.27723613381385803, + "learning_rate": 1.5401106963626487e-05, + "loss": 0.1354, + "step": 37165 + }, + { + "epoch": 0.6628972996111725, + "grad_norm": 0.24947862327098846, + "learning_rate": 1.5399669778606766e-05, + "loss": 0.0935, + "step": 37166 + }, + { + "epoch": 0.6629151357328863, + "grad_norm": 0.3269851803779602, + "learning_rate": 1.5398232630801233e-05, + "loss": 0.1329, + "step": 37167 + }, + { + "epoch": 0.6629329718546, + "grad_norm": 0.23876972496509552, + "learning_rate": 1.5396795520215464e-05, + "loss": 0.1363, + "step": 37168 + }, + { + "epoch": 0.6629508079763137, + "grad_norm": 0.3513764441013336, + "learning_rate": 1.539535844685504e-05, + "loss": 0.1736, + "step": 37169 + }, + { + "epoch": 0.6629686440980274, + "grad_norm": 0.28996118903160095, + "learning_rate": 1.539392141072552e-05, + "loss": 0.1251, + "step": 37170 + }, + { + "epoch": 0.662986480219741, + "grad_norm": 0.2390926033258438, + "learning_rate": 1.539248441183248e-05, + "loss": 0.134, + "step": 37171 + }, + { + "epoch": 0.6630043163414547, + "grad_norm": 0.30860915780067444, + "learning_rate": 1.539104745018149e-05, + "loss": 0.1434, + "step": 37172 + }, + { + "epoch": 0.6630221524631684, + "grad_norm": 0.2948002815246582, + "learning_rate": 1.5389610525778107e-05, + "loss": 0.1352, + "step": 37173 + }, + { + "epoch": 0.6630399885848821, + "grad_norm": 0.25885844230651855, + "learning_rate": 1.538817363862793e-05, + "loss": 0.0972, + "step": 37174 + }, + { + "epoch": 0.6630578247065958, + "grad_norm": 0.1641605794429779, + "learning_rate": 1.5386736788736506e-05, + "loss": 0.0861, + "step": 37175 + }, + { + "epoch": 0.6630756608283095, + "grad_norm": 0.2730233371257782, + "learning_rate": 1.538529997610941e-05, + "loss": 0.0921, + "step": 37176 + }, + { + "epoch": 0.6630934969500232, + "grad_norm": 0.20238915085792542, + "learning_rate": 1.5383863200752198e-05, + "loss": 0.1117, + "step": 37177 + }, + { + "epoch": 0.6631113330717369, + "grad_norm": 0.3468170762062073, + "learning_rate": 1.538242646267047e-05, + "loss": 0.1084, + "step": 37178 + }, + { + "epoch": 0.6631291691934506, + "grad_norm": 0.1766020953655243, + "learning_rate": 1.538098976186978e-05, + "loss": 0.0938, + "step": 37179 + }, + { + "epoch": 0.6631470053151642, + "grad_norm": 0.3839959502220154, + "learning_rate": 1.5379553098355693e-05, + "loss": 0.1266, + "step": 37180 + }, + { + "epoch": 0.6631648414368779, + "grad_norm": 0.23723424971103668, + "learning_rate": 1.5378116472133774e-05, + "loss": 0.1335, + "step": 37181 + }, + { + "epoch": 0.6631826775585916, + "grad_norm": 0.2310997098684311, + "learning_rate": 1.5376679883209606e-05, + "loss": 0.1107, + "step": 37182 + }, + { + "epoch": 0.6632005136803053, + "grad_norm": 0.2645007073879242, + "learning_rate": 1.5375243331588746e-05, + "loss": 0.0872, + "step": 37183 + }, + { + "epoch": 0.6632183498020191, + "grad_norm": 0.2963446378707886, + "learning_rate": 1.5373806817276773e-05, + "loss": 0.1009, + "step": 37184 + }, + { + "epoch": 0.6632361859237328, + "grad_norm": 0.24584689736366272, + "learning_rate": 1.537237034027925e-05, + "loss": 0.1025, + "step": 37185 + }, + { + "epoch": 0.6632540220454465, + "grad_norm": 0.24448126554489136, + "learning_rate": 1.5370933900601732e-05, + "loss": 0.1009, + "step": 37186 + }, + { + "epoch": 0.6632718581671602, + "grad_norm": 0.23954446613788605, + "learning_rate": 1.5369497498249812e-05, + "loss": 0.1155, + "step": 37187 + }, + { + "epoch": 0.6632896942888739, + "grad_norm": 0.23547305166721344, + "learning_rate": 1.5368061133229046e-05, + "loss": 0.1507, + "step": 37188 + }, + { + "epoch": 0.6633075304105875, + "grad_norm": 0.25817275047302246, + "learning_rate": 1.5366624805544998e-05, + "loss": 0.0991, + "step": 37189 + }, + { + "epoch": 0.6633253665323012, + "grad_norm": 0.24378524720668793, + "learning_rate": 1.5365188515203237e-05, + "loss": 0.1229, + "step": 37190 + }, + { + "epoch": 0.6633432026540149, + "grad_norm": 0.2722286880016327, + "learning_rate": 1.5363752262209334e-05, + "loss": 0.0969, + "step": 37191 + }, + { + "epoch": 0.6633610387757286, + "grad_norm": 0.2856522500514984, + "learning_rate": 1.5362316046568866e-05, + "loss": 0.1521, + "step": 37192 + }, + { + "epoch": 0.6633788748974423, + "grad_norm": 0.28451889753341675, + "learning_rate": 1.5360879868287382e-05, + "loss": 0.1366, + "step": 37193 + }, + { + "epoch": 0.663396711019156, + "grad_norm": 0.34682589769363403, + "learning_rate": 1.535944372737045e-05, + "loss": 0.134, + "step": 37194 + }, + { + "epoch": 0.6634145471408697, + "grad_norm": 0.44659557938575745, + "learning_rate": 1.5358007623823652e-05, + "loss": 0.1414, + "step": 37195 + }, + { + "epoch": 0.6634323832625834, + "grad_norm": 0.2704588770866394, + "learning_rate": 1.535657155765255e-05, + "loss": 0.1479, + "step": 37196 + }, + { + "epoch": 0.663450219384297, + "grad_norm": 0.2523496150970459, + "learning_rate": 1.53551355288627e-05, + "loss": 0.1383, + "step": 37197 + }, + { + "epoch": 0.6634680555060107, + "grad_norm": 0.2615732252597809, + "learning_rate": 1.535369953745968e-05, + "loss": 0.1094, + "step": 37198 + }, + { + "epoch": 0.6634858916277244, + "grad_norm": 0.2392926961183548, + "learning_rate": 1.5352263583449044e-05, + "loss": 0.1184, + "step": 37199 + }, + { + "epoch": 0.6635037277494381, + "grad_norm": 0.23568804562091827, + "learning_rate": 1.5350827666836377e-05, + "loss": 0.1109, + "step": 37200 + }, + { + "epoch": 0.6635215638711519, + "grad_norm": 0.22187836468219757, + "learning_rate": 1.534939178762723e-05, + "loss": 0.1619, + "step": 37201 + }, + { + "epoch": 0.6635393999928656, + "grad_norm": 0.26541996002197266, + "learning_rate": 1.534795594582718e-05, + "loss": 0.1248, + "step": 37202 + }, + { + "epoch": 0.6635572361145793, + "grad_norm": 0.2507448196411133, + "learning_rate": 1.5346520141441774e-05, + "loss": 0.1064, + "step": 37203 + }, + { + "epoch": 0.663575072236293, + "grad_norm": 0.45367196202278137, + "learning_rate": 1.53450843744766e-05, + "loss": 0.1273, + "step": 37204 + }, + { + "epoch": 0.6635929083580067, + "grad_norm": 0.28770527243614197, + "learning_rate": 1.534364864493721e-05, + "loss": 0.0901, + "step": 37205 + }, + { + "epoch": 0.6636107444797203, + "grad_norm": 0.2517922520637512, + "learning_rate": 1.5342212952829178e-05, + "loss": 0.1215, + "step": 37206 + }, + { + "epoch": 0.663628580601434, + "grad_norm": 0.23865675926208496, + "learning_rate": 1.534077729815805e-05, + "loss": 0.1325, + "step": 37207 + }, + { + "epoch": 0.6636464167231477, + "grad_norm": 0.27530112862586975, + "learning_rate": 1.533934168092942e-05, + "loss": 0.1339, + "step": 37208 + }, + { + "epoch": 0.6636642528448614, + "grad_norm": 0.212025985121727, + "learning_rate": 1.5337906101148837e-05, + "loss": 0.1161, + "step": 37209 + }, + { + "epoch": 0.6636820889665751, + "grad_norm": 0.2298341542482376, + "learning_rate": 1.5336470558821865e-05, + "loss": 0.1057, + "step": 37210 + }, + { + "epoch": 0.6636999250882888, + "grad_norm": 0.20415493845939636, + "learning_rate": 1.5335035053954068e-05, + "loss": 0.1196, + "step": 37211 + }, + { + "epoch": 0.6637177612100025, + "grad_norm": 0.24964316189289093, + "learning_rate": 1.533359958655101e-05, + "loss": 0.1169, + "step": 37212 + }, + { + "epoch": 0.6637355973317162, + "grad_norm": 0.2127217799425125, + "learning_rate": 1.5332164156618266e-05, + "loss": 0.1157, + "step": 37213 + }, + { + "epoch": 0.6637534334534299, + "grad_norm": 0.31767717003822327, + "learning_rate": 1.5330728764161394e-05, + "loss": 0.0895, + "step": 37214 + }, + { + "epoch": 0.6637712695751435, + "grad_norm": 0.22804713249206543, + "learning_rate": 1.5329293409185957e-05, + "loss": 0.1074, + "step": 37215 + }, + { + "epoch": 0.6637891056968572, + "grad_norm": 0.350273996591568, + "learning_rate": 1.5327858091697508e-05, + "loss": 0.1315, + "step": 37216 + }, + { + "epoch": 0.6638069418185709, + "grad_norm": 0.26878586411476135, + "learning_rate": 1.532642281170163e-05, + "loss": 0.1079, + "step": 37217 + }, + { + "epoch": 0.6638247779402847, + "grad_norm": 0.19158531725406647, + "learning_rate": 1.5324987569203878e-05, + "loss": 0.0865, + "step": 37218 + }, + { + "epoch": 0.6638426140619984, + "grad_norm": 0.260093629360199, + "learning_rate": 1.532355236420982e-05, + "loss": 0.1128, + "step": 37219 + }, + { + "epoch": 0.6638604501837121, + "grad_norm": 0.31091323494911194, + "learning_rate": 1.5322117196725e-05, + "loss": 0.1787, + "step": 37220 + }, + { + "epoch": 0.6638782863054258, + "grad_norm": 0.2746206521987915, + "learning_rate": 1.532068206675501e-05, + "loss": 0.1502, + "step": 37221 + }, + { + "epoch": 0.6638961224271395, + "grad_norm": 0.2346046268939972, + "learning_rate": 1.53192469743054e-05, + "loss": 0.1166, + "step": 37222 + }, + { + "epoch": 0.6639139585488532, + "grad_norm": 0.3615010976791382, + "learning_rate": 1.5317811919381732e-05, + "loss": 0.1681, + "step": 37223 + }, + { + "epoch": 0.6639317946705668, + "grad_norm": 0.22678299248218536, + "learning_rate": 1.5316376901989565e-05, + "loss": 0.0922, + "step": 37224 + }, + { + "epoch": 0.6639496307922805, + "grad_norm": 0.22100037336349487, + "learning_rate": 1.5314941922134463e-05, + "loss": 0.1232, + "step": 37225 + }, + { + "epoch": 0.6639674669139942, + "grad_norm": 0.3014891445636749, + "learning_rate": 1.5313506979821996e-05, + "loss": 0.0988, + "step": 37226 + }, + { + "epoch": 0.6639853030357079, + "grad_norm": 0.31242236495018005, + "learning_rate": 1.531207207505772e-05, + "loss": 0.1717, + "step": 37227 + }, + { + "epoch": 0.6640031391574216, + "grad_norm": 0.2907034754753113, + "learning_rate": 1.5310637207847204e-05, + "loss": 0.1076, + "step": 37228 + }, + { + "epoch": 0.6640209752791353, + "grad_norm": 0.18815724551677704, + "learning_rate": 1.530920237819599e-05, + "loss": 0.1239, + "step": 37229 + }, + { + "epoch": 0.664038811400849, + "grad_norm": 0.22786502540111542, + "learning_rate": 1.530776758610967e-05, + "loss": 0.0936, + "step": 37230 + }, + { + "epoch": 0.6640566475225627, + "grad_norm": 0.20799139142036438, + "learning_rate": 1.530633283159379e-05, + "loss": 0.091, + "step": 37231 + }, + { + "epoch": 0.6640744836442763, + "grad_norm": 0.3690548837184906, + "learning_rate": 1.5304898114653903e-05, + "loss": 0.1083, + "step": 37232 + }, + { + "epoch": 0.66409231976599, + "grad_norm": 0.39600130915641785, + "learning_rate": 1.5303463435295577e-05, + "loss": 0.1529, + "step": 37233 + }, + { + "epoch": 0.6641101558877037, + "grad_norm": 0.23733268678188324, + "learning_rate": 1.5302028793524385e-05, + "loss": 0.1042, + "step": 37234 + }, + { + "epoch": 0.6641279920094175, + "grad_norm": 0.24712589383125305, + "learning_rate": 1.5300594189345878e-05, + "loss": 0.1648, + "step": 37235 + }, + { + "epoch": 0.6641458281311312, + "grad_norm": 0.2119656801223755, + "learning_rate": 1.5299159622765618e-05, + "loss": 0.1097, + "step": 37236 + }, + { + "epoch": 0.6641636642528449, + "grad_norm": 0.2934284806251526, + "learning_rate": 1.5297725093789166e-05, + "loss": 0.1696, + "step": 37237 + }, + { + "epoch": 0.6641815003745586, + "grad_norm": 0.22951065003871918, + "learning_rate": 1.5296290602422074e-05, + "loss": 0.1099, + "step": 37238 + }, + { + "epoch": 0.6641993364962723, + "grad_norm": 0.23003602027893066, + "learning_rate": 1.5294856148669916e-05, + "loss": 0.1209, + "step": 37239 + }, + { + "epoch": 0.664217172617986, + "grad_norm": 0.2253616452217102, + "learning_rate": 1.5293421732538256e-05, + "loss": 0.1133, + "step": 37240 + }, + { + "epoch": 0.6642350087396997, + "grad_norm": 0.32475483417510986, + "learning_rate": 1.5291987354032643e-05, + "loss": 0.1652, + "step": 37241 + }, + { + "epoch": 0.6642528448614133, + "grad_norm": 0.3503333628177643, + "learning_rate": 1.529055301315863e-05, + "loss": 0.1639, + "step": 37242 + }, + { + "epoch": 0.664270680983127, + "grad_norm": 0.20953485369682312, + "learning_rate": 1.5289118709921794e-05, + "loss": 0.0969, + "step": 37243 + }, + { + "epoch": 0.6642885171048407, + "grad_norm": 0.22294431924819946, + "learning_rate": 1.5287684444327693e-05, + "loss": 0.1151, + "step": 37244 + }, + { + "epoch": 0.6643063532265544, + "grad_norm": 0.28788459300994873, + "learning_rate": 1.5286250216381874e-05, + "loss": 0.1151, + "step": 37245 + }, + { + "epoch": 0.6643241893482681, + "grad_norm": 0.37174153327941895, + "learning_rate": 1.5284816026089906e-05, + "loss": 0.1762, + "step": 37246 + }, + { + "epoch": 0.6643420254699818, + "grad_norm": 0.3149736225605011, + "learning_rate": 1.528338187345734e-05, + "loss": 0.1407, + "step": 37247 + }, + { + "epoch": 0.6643598615916955, + "grad_norm": 0.31120601296424866, + "learning_rate": 1.5281947758489753e-05, + "loss": 0.1104, + "step": 37248 + }, + { + "epoch": 0.6643776977134092, + "grad_norm": 0.20940278470516205, + "learning_rate": 1.5280513681192695e-05, + "loss": 0.1242, + "step": 37249 + }, + { + "epoch": 0.6643955338351228, + "grad_norm": 0.27586403489112854, + "learning_rate": 1.5279079641571716e-05, + "loss": 0.1004, + "step": 37250 + }, + { + "epoch": 0.6644133699568365, + "grad_norm": 0.21651116013526917, + "learning_rate": 1.5277645639632377e-05, + "loss": 0.1126, + "step": 37251 + }, + { + "epoch": 0.6644312060785503, + "grad_norm": 0.25119549036026, + "learning_rate": 1.5276211675380248e-05, + "loss": 0.1385, + "step": 37252 + }, + { + "epoch": 0.664449042200264, + "grad_norm": 0.29774191975593567, + "learning_rate": 1.5274777748820878e-05, + "loss": 0.1543, + "step": 37253 + }, + { + "epoch": 0.6644668783219777, + "grad_norm": 0.26036617159843445, + "learning_rate": 1.5273343859959835e-05, + "loss": 0.1102, + "step": 37254 + }, + { + "epoch": 0.6644847144436914, + "grad_norm": 0.21600110828876495, + "learning_rate": 1.527191000880266e-05, + "loss": 0.086, + "step": 37255 + }, + { + "epoch": 0.6645025505654051, + "grad_norm": 0.29409074783325195, + "learning_rate": 1.527047619535493e-05, + "loss": 0.1457, + "step": 37256 + }, + { + "epoch": 0.6645203866871188, + "grad_norm": 0.22073373198509216, + "learning_rate": 1.5269042419622196e-05, + "loss": 0.0739, + "step": 37257 + }, + { + "epoch": 0.6645382228088325, + "grad_norm": 0.25438931584358215, + "learning_rate": 1.5267608681610013e-05, + "loss": 0.0983, + "step": 37258 + }, + { + "epoch": 0.6645560589305461, + "grad_norm": 0.2637360990047455, + "learning_rate": 1.5266174981323934e-05, + "loss": 0.1175, + "step": 37259 + }, + { + "epoch": 0.6645738950522598, + "grad_norm": 0.2647033631801605, + "learning_rate": 1.5264741318769522e-05, + "loss": 0.1314, + "step": 37260 + }, + { + "epoch": 0.6645917311739735, + "grad_norm": 0.39525842666625977, + "learning_rate": 1.5263307693952344e-05, + "loss": 0.1175, + "step": 37261 + }, + { + "epoch": 0.6646095672956872, + "grad_norm": 0.35060861706733704, + "learning_rate": 1.526187410687795e-05, + "loss": 0.1691, + "step": 37262 + }, + { + "epoch": 0.6646274034174009, + "grad_norm": 0.31796735525131226, + "learning_rate": 1.5260440557551892e-05, + "loss": 0.1187, + "step": 37263 + }, + { + "epoch": 0.6646452395391146, + "grad_norm": 0.255008727312088, + "learning_rate": 1.525900704597972e-05, + "loss": 0.1312, + "step": 37264 + }, + { + "epoch": 0.6646630756608283, + "grad_norm": 0.25454312562942505, + "learning_rate": 1.5257573572167011e-05, + "loss": 0.1523, + "step": 37265 + }, + { + "epoch": 0.664680911782542, + "grad_norm": 0.23359286785125732, + "learning_rate": 1.525614013611931e-05, + "loss": 0.1236, + "step": 37266 + }, + { + "epoch": 0.6646987479042556, + "grad_norm": 0.32658764719963074, + "learning_rate": 1.5254706737842179e-05, + "loss": 0.1456, + "step": 37267 + }, + { + "epoch": 0.6647165840259694, + "grad_norm": 0.1909789741039276, + "learning_rate": 1.525327337734116e-05, + "loss": 0.0754, + "step": 37268 + }, + { + "epoch": 0.6647344201476831, + "grad_norm": 0.23027926683425903, + "learning_rate": 1.525184005462183e-05, + "loss": 0.1032, + "step": 37269 + }, + { + "epoch": 0.6647522562693968, + "grad_norm": 0.21528609097003937, + "learning_rate": 1.5250406769689734e-05, + "loss": 0.1168, + "step": 37270 + }, + { + "epoch": 0.6647700923911105, + "grad_norm": 0.2575491666793823, + "learning_rate": 1.5248973522550431e-05, + "loss": 0.1291, + "step": 37271 + }, + { + "epoch": 0.6647879285128242, + "grad_norm": 0.29947635531425476, + "learning_rate": 1.524754031320946e-05, + "loss": 0.1296, + "step": 37272 + }, + { + "epoch": 0.6648057646345379, + "grad_norm": 0.2557018995285034, + "learning_rate": 1.5246107141672405e-05, + "loss": 0.146, + "step": 37273 + }, + { + "epoch": 0.6648236007562516, + "grad_norm": 0.19982996582984924, + "learning_rate": 1.52446740079448e-05, + "loss": 0.1089, + "step": 37274 + }, + { + "epoch": 0.6648414368779653, + "grad_norm": 0.3643823564052582, + "learning_rate": 1.5243240912032214e-05, + "loss": 0.1353, + "step": 37275 + }, + { + "epoch": 0.664859272999679, + "grad_norm": 0.26143956184387207, + "learning_rate": 1.5241807853940199e-05, + "loss": 0.1463, + "step": 37276 + }, + { + "epoch": 0.6648771091213926, + "grad_norm": 0.3711717128753662, + "learning_rate": 1.5240374833674292e-05, + "loss": 0.1471, + "step": 37277 + }, + { + "epoch": 0.6648949452431063, + "grad_norm": 0.23246586322784424, + "learning_rate": 1.523894185124008e-05, + "loss": 0.0898, + "step": 37278 + }, + { + "epoch": 0.66491278136482, + "grad_norm": 0.27794933319091797, + "learning_rate": 1.5237508906643094e-05, + "loss": 0.1653, + "step": 37279 + }, + { + "epoch": 0.6649306174865337, + "grad_norm": 0.28819993138313293, + "learning_rate": 1.5236075999888894e-05, + "loss": 0.1708, + "step": 37280 + }, + { + "epoch": 0.6649484536082474, + "grad_norm": 0.2146557718515396, + "learning_rate": 1.5234643130983028e-05, + "loss": 0.1228, + "step": 37281 + }, + { + "epoch": 0.6649662897299611, + "grad_norm": 0.24644805490970612, + "learning_rate": 1.5233210299931072e-05, + "loss": 0.1597, + "step": 37282 + }, + { + "epoch": 0.6649841258516748, + "grad_norm": 0.2855125069618225, + "learning_rate": 1.5231777506738564e-05, + "loss": 0.1579, + "step": 37283 + }, + { + "epoch": 0.6650019619733885, + "grad_norm": 0.22150960564613342, + "learning_rate": 1.5230344751411058e-05, + "loss": 0.077, + "step": 37284 + }, + { + "epoch": 0.6650197980951023, + "grad_norm": 0.25853249430656433, + "learning_rate": 1.5228912033954104e-05, + "loss": 0.1007, + "step": 37285 + }, + { + "epoch": 0.6650376342168159, + "grad_norm": 0.19901502132415771, + "learning_rate": 1.522747935437327e-05, + "loss": 0.1035, + "step": 37286 + }, + { + "epoch": 0.6650554703385296, + "grad_norm": 0.3312702775001526, + "learning_rate": 1.5226046712674096e-05, + "loss": 0.1569, + "step": 37287 + }, + { + "epoch": 0.6650733064602433, + "grad_norm": 0.2932724952697754, + "learning_rate": 1.5224614108862145e-05, + "loss": 0.1242, + "step": 37288 + }, + { + "epoch": 0.665091142581957, + "grad_norm": 0.34969285130500793, + "learning_rate": 1.5223181542942969e-05, + "loss": 0.0715, + "step": 37289 + }, + { + "epoch": 0.6651089787036707, + "grad_norm": 0.3771221935749054, + "learning_rate": 1.5221749014922104e-05, + "loss": 0.1118, + "step": 37290 + }, + { + "epoch": 0.6651268148253844, + "grad_norm": 0.26814377307891846, + "learning_rate": 1.522031652480513e-05, + "loss": 0.1423, + "step": 37291 + }, + { + "epoch": 0.6651446509470981, + "grad_norm": 0.25429239869117737, + "learning_rate": 1.5218884072597588e-05, + "loss": 0.1601, + "step": 37292 + }, + { + "epoch": 0.6651624870688118, + "grad_norm": 0.29808053374290466, + "learning_rate": 1.521745165830502e-05, + "loss": 0.1251, + "step": 37293 + }, + { + "epoch": 0.6651803231905254, + "grad_norm": 0.19373822212219238, + "learning_rate": 1.5216019281932994e-05, + "loss": 0.1011, + "step": 37294 + }, + { + "epoch": 0.6651981593122391, + "grad_norm": 0.22746190428733826, + "learning_rate": 1.5214586943487052e-05, + "loss": 0.1144, + "step": 37295 + }, + { + "epoch": 0.6652159954339528, + "grad_norm": 0.3558097779750824, + "learning_rate": 1.521315464297276e-05, + "loss": 0.1155, + "step": 37296 + }, + { + "epoch": 0.6652338315556665, + "grad_norm": 0.24452945590019226, + "learning_rate": 1.5211722380395657e-05, + "loss": 0.127, + "step": 37297 + }, + { + "epoch": 0.6652516676773802, + "grad_norm": 0.3131833076477051, + "learning_rate": 1.5210290155761292e-05, + "loss": 0.128, + "step": 37298 + }, + { + "epoch": 0.6652695037990939, + "grad_norm": 0.1862787902355194, + "learning_rate": 1.520885796907523e-05, + "loss": 0.105, + "step": 37299 + }, + { + "epoch": 0.6652873399208076, + "grad_norm": 0.22851228713989258, + "learning_rate": 1.5207425820343018e-05, + "loss": 0.0992, + "step": 37300 + }, + { + "epoch": 0.6653051760425213, + "grad_norm": 0.27868667244911194, + "learning_rate": 1.5205993709570202e-05, + "loss": 0.1431, + "step": 37301 + }, + { + "epoch": 0.6653230121642351, + "grad_norm": 0.22867079079151154, + "learning_rate": 1.5204561636762336e-05, + "loss": 0.1396, + "step": 37302 + }, + { + "epoch": 0.6653408482859487, + "grad_norm": 0.2716962695121765, + "learning_rate": 1.520312960192497e-05, + "loss": 0.1526, + "step": 37303 + }, + { + "epoch": 0.6653586844076624, + "grad_norm": 0.23462536931037903, + "learning_rate": 1.5201697605063667e-05, + "loss": 0.1459, + "step": 37304 + }, + { + "epoch": 0.6653765205293761, + "grad_norm": 0.3461110591888428, + "learning_rate": 1.5200265646183965e-05, + "loss": 0.1721, + "step": 37305 + }, + { + "epoch": 0.6653943566510898, + "grad_norm": 0.20824572443962097, + "learning_rate": 1.519883372529142e-05, + "loss": 0.0859, + "step": 37306 + }, + { + "epoch": 0.6654121927728035, + "grad_norm": 0.23570944368839264, + "learning_rate": 1.5197401842391568e-05, + "loss": 0.1198, + "step": 37307 + }, + { + "epoch": 0.6654300288945172, + "grad_norm": 0.24916639924049377, + "learning_rate": 1.5195969997489983e-05, + "loss": 0.1244, + "step": 37308 + }, + { + "epoch": 0.6654478650162309, + "grad_norm": 0.3105781078338623, + "learning_rate": 1.5194538190592204e-05, + "loss": 0.1539, + "step": 37309 + }, + { + "epoch": 0.6654657011379446, + "grad_norm": 0.3340488076210022, + "learning_rate": 1.519310642170378e-05, + "loss": 0.1397, + "step": 37310 + }, + { + "epoch": 0.6654835372596583, + "grad_norm": 0.27205079793930054, + "learning_rate": 1.5191674690830258e-05, + "loss": 0.1242, + "step": 37311 + }, + { + "epoch": 0.6655013733813719, + "grad_norm": 0.2699366807937622, + "learning_rate": 1.51902429979772e-05, + "loss": 0.1558, + "step": 37312 + }, + { + "epoch": 0.6655192095030856, + "grad_norm": 0.2082771509885788, + "learning_rate": 1.518881134315015e-05, + "loss": 0.0981, + "step": 37313 + }, + { + "epoch": 0.6655370456247993, + "grad_norm": 0.39234793186187744, + "learning_rate": 1.5187379726354651e-05, + "loss": 0.1204, + "step": 37314 + }, + { + "epoch": 0.665554881746513, + "grad_norm": 0.2821088135242462, + "learning_rate": 1.5185948147596251e-05, + "loss": 0.1681, + "step": 37315 + }, + { + "epoch": 0.6655727178682267, + "grad_norm": 0.33678463101387024, + "learning_rate": 1.5184516606880511e-05, + "loss": 0.1437, + "step": 37316 + }, + { + "epoch": 0.6655905539899404, + "grad_norm": 0.2640470266342163, + "learning_rate": 1.518308510421298e-05, + "loss": 0.1276, + "step": 37317 + }, + { + "epoch": 0.6656083901116541, + "grad_norm": 0.2837303578853607, + "learning_rate": 1.5181653639599202e-05, + "loss": 0.1774, + "step": 37318 + }, + { + "epoch": 0.6656262262333679, + "grad_norm": 0.2199104279279709, + "learning_rate": 1.5180222213044725e-05, + "loss": 0.1041, + "step": 37319 + }, + { + "epoch": 0.6656440623550816, + "grad_norm": 0.231767475605011, + "learning_rate": 1.5178790824555089e-05, + "loss": 0.0982, + "step": 37320 + }, + { + "epoch": 0.6656618984767952, + "grad_norm": 0.25228768587112427, + "learning_rate": 1.517735947413586e-05, + "loss": 0.1254, + "step": 37321 + }, + { + "epoch": 0.6656797345985089, + "grad_norm": 0.173648864030838, + "learning_rate": 1.5175928161792574e-05, + "loss": 0.1081, + "step": 37322 + }, + { + "epoch": 0.6656975707202226, + "grad_norm": 0.18578499555587769, + "learning_rate": 1.517449688753079e-05, + "loss": 0.0601, + "step": 37323 + }, + { + "epoch": 0.6657154068419363, + "grad_norm": 0.27416977286338806, + "learning_rate": 1.5173065651356039e-05, + "loss": 0.1344, + "step": 37324 + }, + { + "epoch": 0.66573324296365, + "grad_norm": 0.28001996874809265, + "learning_rate": 1.517163445327389e-05, + "loss": 0.0907, + "step": 37325 + }, + { + "epoch": 0.6657510790853637, + "grad_norm": 0.28628695011138916, + "learning_rate": 1.5170203293289878e-05, + "loss": 0.1269, + "step": 37326 + }, + { + "epoch": 0.6657689152070774, + "grad_norm": 0.3076476454734802, + "learning_rate": 1.5168772171409556e-05, + "loss": 0.1108, + "step": 37327 + }, + { + "epoch": 0.6657867513287911, + "grad_norm": 0.17745445668697357, + "learning_rate": 1.5167341087638457e-05, + "loss": 0.1041, + "step": 37328 + }, + { + "epoch": 0.6658045874505047, + "grad_norm": 0.4940611720085144, + "learning_rate": 1.516591004198215e-05, + "loss": 0.1069, + "step": 37329 + }, + { + "epoch": 0.6658224235722184, + "grad_norm": 0.342750608921051, + "learning_rate": 1.5164479034446166e-05, + "loss": 0.1252, + "step": 37330 + }, + { + "epoch": 0.6658402596939321, + "grad_norm": 0.3039979040622711, + "learning_rate": 1.5163048065036067e-05, + "loss": 0.1551, + "step": 37331 + }, + { + "epoch": 0.6658580958156458, + "grad_norm": 0.2881886065006256, + "learning_rate": 1.5161617133757389e-05, + "loss": 0.111, + "step": 37332 + }, + { + "epoch": 0.6658759319373595, + "grad_norm": 0.3448079824447632, + "learning_rate": 1.516018624061567e-05, + "loss": 0.1385, + "step": 37333 + }, + { + "epoch": 0.6658937680590732, + "grad_norm": 0.2973722815513611, + "learning_rate": 1.515875538561648e-05, + "loss": 0.1542, + "step": 37334 + }, + { + "epoch": 0.6659116041807869, + "grad_norm": 0.2840515077114105, + "learning_rate": 1.5157324568765352e-05, + "loss": 0.1621, + "step": 37335 + }, + { + "epoch": 0.6659294403025007, + "grad_norm": 0.2529444694519043, + "learning_rate": 1.5155893790067833e-05, + "loss": 0.1333, + "step": 37336 + }, + { + "epoch": 0.6659472764242144, + "grad_norm": 0.30991122126579285, + "learning_rate": 1.515446304952946e-05, + "loss": 0.1544, + "step": 37337 + }, + { + "epoch": 0.665965112545928, + "grad_norm": 0.283700555562973, + "learning_rate": 1.5153032347155799e-05, + "loss": 0.143, + "step": 37338 + }, + { + "epoch": 0.6659829486676417, + "grad_norm": 0.3248513638973236, + "learning_rate": 1.5151601682952384e-05, + "loss": 0.1698, + "step": 37339 + }, + { + "epoch": 0.6660007847893554, + "grad_norm": 0.1941169798374176, + "learning_rate": 1.5150171056924766e-05, + "loss": 0.1146, + "step": 37340 + }, + { + "epoch": 0.6660186209110691, + "grad_norm": 0.2569960355758667, + "learning_rate": 1.5148740469078476e-05, + "loss": 0.1083, + "step": 37341 + }, + { + "epoch": 0.6660364570327828, + "grad_norm": 0.29788339138031006, + "learning_rate": 1.5147309919419078e-05, + "loss": 0.1055, + "step": 37342 + }, + { + "epoch": 0.6660542931544965, + "grad_norm": 0.24948754906654358, + "learning_rate": 1.5145879407952107e-05, + "loss": 0.1125, + "step": 37343 + }, + { + "epoch": 0.6660721292762102, + "grad_norm": 0.35152512788772583, + "learning_rate": 1.5144448934683113e-05, + "loss": 0.1285, + "step": 37344 + }, + { + "epoch": 0.6660899653979239, + "grad_norm": 0.2097112536430359, + "learning_rate": 1.5143018499617645e-05, + "loss": 0.129, + "step": 37345 + }, + { + "epoch": 0.6661078015196376, + "grad_norm": 0.24409887194633484, + "learning_rate": 1.5141588102761229e-05, + "loss": 0.1279, + "step": 37346 + }, + { + "epoch": 0.6661256376413512, + "grad_norm": 0.2049250304698944, + "learning_rate": 1.5140157744119432e-05, + "loss": 0.0879, + "step": 37347 + }, + { + "epoch": 0.6661434737630649, + "grad_norm": 0.2546696364879608, + "learning_rate": 1.5138727423697788e-05, + "loss": 0.1083, + "step": 37348 + }, + { + "epoch": 0.6661613098847786, + "grad_norm": 0.3335992693901062, + "learning_rate": 1.5137297141501838e-05, + "loss": 0.1343, + "step": 37349 + }, + { + "epoch": 0.6661791460064923, + "grad_norm": 0.25380873680114746, + "learning_rate": 1.5135866897537131e-05, + "loss": 0.1332, + "step": 37350 + }, + { + "epoch": 0.666196982128206, + "grad_norm": 0.2732788026332855, + "learning_rate": 1.5134436691809212e-05, + "loss": 0.0794, + "step": 37351 + }, + { + "epoch": 0.6662148182499197, + "grad_norm": 0.33003032207489014, + "learning_rate": 1.5133006524323629e-05, + "loss": 0.1209, + "step": 37352 + }, + { + "epoch": 0.6662326543716335, + "grad_norm": 0.256030797958374, + "learning_rate": 1.5131576395085916e-05, + "loss": 0.1542, + "step": 37353 + }, + { + "epoch": 0.6662504904933472, + "grad_norm": 0.23467621207237244, + "learning_rate": 1.5130146304101616e-05, + "loss": 0.1185, + "step": 37354 + }, + { + "epoch": 0.6662683266150609, + "grad_norm": 0.22649657726287842, + "learning_rate": 1.5128716251376285e-05, + "loss": 0.0935, + "step": 37355 + }, + { + "epoch": 0.6662861627367745, + "grad_norm": 0.24411751329898834, + "learning_rate": 1.5127286236915461e-05, + "loss": 0.127, + "step": 37356 + }, + { + "epoch": 0.6663039988584882, + "grad_norm": 0.27348047494888306, + "learning_rate": 1.512585626072468e-05, + "loss": 0.1117, + "step": 37357 + }, + { + "epoch": 0.6663218349802019, + "grad_norm": 0.2979251444339752, + "learning_rate": 1.51244263228095e-05, + "loss": 0.1199, + "step": 37358 + }, + { + "epoch": 0.6663396711019156, + "grad_norm": 0.22357137501239777, + "learning_rate": 1.512299642317544e-05, + "loss": 0.0629, + "step": 37359 + }, + { + "epoch": 0.6663575072236293, + "grad_norm": 0.299640029668808, + "learning_rate": 1.5121566561828065e-05, + "loss": 0.1337, + "step": 37360 + }, + { + "epoch": 0.666375343345343, + "grad_norm": 0.2895599901676178, + "learning_rate": 1.5120136738772914e-05, + "loss": 0.171, + "step": 37361 + }, + { + "epoch": 0.6663931794670567, + "grad_norm": 0.29385510087013245, + "learning_rate": 1.5118706954015524e-05, + "loss": 0.098, + "step": 37362 + }, + { + "epoch": 0.6664110155887704, + "grad_norm": 0.2517750561237335, + "learning_rate": 1.5117277207561428e-05, + "loss": 0.1021, + "step": 37363 + }, + { + "epoch": 0.666428851710484, + "grad_norm": 0.2462937831878662, + "learning_rate": 1.5115847499416186e-05, + "loss": 0.0917, + "step": 37364 + }, + { + "epoch": 0.6664466878321977, + "grad_norm": 0.2444039285182953, + "learning_rate": 1.5114417829585336e-05, + "loss": 0.1111, + "step": 37365 + }, + { + "epoch": 0.6664645239539114, + "grad_norm": 0.27546748518943787, + "learning_rate": 1.5112988198074418e-05, + "loss": 0.1172, + "step": 37366 + }, + { + "epoch": 0.6664823600756251, + "grad_norm": 0.2604631781578064, + "learning_rate": 1.511155860488896e-05, + "loss": 0.1736, + "step": 37367 + }, + { + "epoch": 0.6665001961973388, + "grad_norm": 0.45146656036376953, + "learning_rate": 1.511012905003453e-05, + "loss": 0.1038, + "step": 37368 + }, + { + "epoch": 0.6665180323190526, + "grad_norm": 0.2320471704006195, + "learning_rate": 1.5108699533516652e-05, + "loss": 0.1395, + "step": 37369 + }, + { + "epoch": 0.6665358684407663, + "grad_norm": 0.17088526487350464, + "learning_rate": 1.5107270055340869e-05, + "loss": 0.0862, + "step": 37370 + }, + { + "epoch": 0.66655370456248, + "grad_norm": 0.18300753831863403, + "learning_rate": 1.5105840615512728e-05, + "loss": 0.1143, + "step": 37371 + }, + { + "epoch": 0.6665715406841937, + "grad_norm": 0.3051344156265259, + "learning_rate": 1.5104411214037756e-05, + "loss": 0.1457, + "step": 37372 + }, + { + "epoch": 0.6665893768059074, + "grad_norm": 0.21027489006519318, + "learning_rate": 1.5102981850921515e-05, + "loss": 0.1067, + "step": 37373 + }, + { + "epoch": 0.666607212927621, + "grad_norm": 0.2476443201303482, + "learning_rate": 1.5101552526169533e-05, + "loss": 0.125, + "step": 37374 + }, + { + "epoch": 0.6666250490493347, + "grad_norm": 0.2863208055496216, + "learning_rate": 1.5100123239787352e-05, + "loss": 0.1456, + "step": 37375 + }, + { + "epoch": 0.6666428851710484, + "grad_norm": 0.19317860901355743, + "learning_rate": 1.5098693991780504e-05, + "loss": 0.0869, + "step": 37376 + }, + { + "epoch": 0.6666607212927621, + "grad_norm": 0.2885204553604126, + "learning_rate": 1.5097264782154547e-05, + "loss": 0.1105, + "step": 37377 + }, + { + "epoch": 0.6666785574144758, + "grad_norm": 0.26823538541793823, + "learning_rate": 1.509583561091501e-05, + "loss": 0.1138, + "step": 37378 + }, + { + "epoch": 0.6666963935361895, + "grad_norm": 0.23470832407474518, + "learning_rate": 1.5094406478067436e-05, + "loss": 0.1386, + "step": 37379 + }, + { + "epoch": 0.6667142296579032, + "grad_norm": 0.2544650137424469, + "learning_rate": 1.5092977383617357e-05, + "loss": 0.0663, + "step": 37380 + }, + { + "epoch": 0.6667320657796169, + "grad_norm": 0.27774569392204285, + "learning_rate": 1.5091548327570327e-05, + "loss": 0.1469, + "step": 37381 + }, + { + "epoch": 0.6667499019013305, + "grad_norm": 0.23503939807415009, + "learning_rate": 1.5090119309931878e-05, + "loss": 0.1154, + "step": 37382 + }, + { + "epoch": 0.6667677380230442, + "grad_norm": 0.2723938226699829, + "learning_rate": 1.508869033070755e-05, + "loss": 0.1223, + "step": 37383 + }, + { + "epoch": 0.6667855741447579, + "grad_norm": 0.2845105528831482, + "learning_rate": 1.5087261389902876e-05, + "loss": 0.1532, + "step": 37384 + }, + { + "epoch": 0.6668034102664716, + "grad_norm": 0.24683833122253418, + "learning_rate": 1.5085832487523399e-05, + "loss": 0.1496, + "step": 37385 + }, + { + "epoch": 0.6668212463881854, + "grad_norm": 0.28045058250427246, + "learning_rate": 1.5084403623574664e-05, + "loss": 0.1389, + "step": 37386 + }, + { + "epoch": 0.6668390825098991, + "grad_norm": 0.3507380187511444, + "learning_rate": 1.5082974798062208e-05, + "loss": 0.1226, + "step": 37387 + }, + { + "epoch": 0.6668569186316128, + "grad_norm": 0.2572086751461029, + "learning_rate": 1.5081546010991566e-05, + "loss": 0.1087, + "step": 37388 + }, + { + "epoch": 0.6668747547533265, + "grad_norm": 0.2823340892791748, + "learning_rate": 1.5080117262368269e-05, + "loss": 0.154, + "step": 37389 + }, + { + "epoch": 0.6668925908750402, + "grad_norm": 0.21687743067741394, + "learning_rate": 1.5078688552197872e-05, + "loss": 0.1105, + "step": 37390 + }, + { + "epoch": 0.6669104269967538, + "grad_norm": 0.324459433555603, + "learning_rate": 1.50772598804859e-05, + "loss": 0.1171, + "step": 37391 + }, + { + "epoch": 0.6669282631184675, + "grad_norm": 0.29453200101852417, + "learning_rate": 1.5075831247237905e-05, + "loss": 0.1638, + "step": 37392 + }, + { + "epoch": 0.6669460992401812, + "grad_norm": 0.24993082880973816, + "learning_rate": 1.5074402652459401e-05, + "loss": 0.0735, + "step": 37393 + }, + { + "epoch": 0.6669639353618949, + "grad_norm": 0.23562286794185638, + "learning_rate": 1.5072974096155951e-05, + "loss": 0.1473, + "step": 37394 + }, + { + "epoch": 0.6669817714836086, + "grad_norm": 0.3867155611515045, + "learning_rate": 1.5071545578333084e-05, + "loss": 0.1459, + "step": 37395 + }, + { + "epoch": 0.6669996076053223, + "grad_norm": 0.2661600708961487, + "learning_rate": 1.5070117098996333e-05, + "loss": 0.1373, + "step": 37396 + }, + { + "epoch": 0.667017443727036, + "grad_norm": 0.24916689097881317, + "learning_rate": 1.5068688658151236e-05, + "loss": 0.113, + "step": 37397 + }, + { + "epoch": 0.6670352798487497, + "grad_norm": 0.29195889830589294, + "learning_rate": 1.5067260255803328e-05, + "loss": 0.1025, + "step": 37398 + }, + { + "epoch": 0.6670531159704634, + "grad_norm": 0.31373482942581177, + "learning_rate": 1.5065831891958154e-05, + "loss": 0.1264, + "step": 37399 + }, + { + "epoch": 0.667070952092177, + "grad_norm": 0.3319110870361328, + "learning_rate": 1.506440356662125e-05, + "loss": 0.1112, + "step": 37400 + }, + { + "epoch": 0.6670887882138907, + "grad_norm": 0.2719859480857849, + "learning_rate": 1.5062975279798147e-05, + "loss": 0.1612, + "step": 37401 + }, + { + "epoch": 0.6671066243356044, + "grad_norm": 0.28437674045562744, + "learning_rate": 1.5061547031494377e-05, + "loss": 0.1255, + "step": 37402 + }, + { + "epoch": 0.6671244604573182, + "grad_norm": 0.33990347385406494, + "learning_rate": 1.5060118821715494e-05, + "loss": 0.1567, + "step": 37403 + }, + { + "epoch": 0.6671422965790319, + "grad_norm": 0.2851102352142334, + "learning_rate": 1.5058690650467022e-05, + "loss": 0.1356, + "step": 37404 + }, + { + "epoch": 0.6671601327007456, + "grad_norm": 0.2860081195831299, + "learning_rate": 1.5057262517754495e-05, + "loss": 0.1412, + "step": 37405 + }, + { + "epoch": 0.6671779688224593, + "grad_norm": 0.2555182874202728, + "learning_rate": 1.505583442358345e-05, + "loss": 0.1107, + "step": 37406 + }, + { + "epoch": 0.667195804944173, + "grad_norm": 0.35433822870254517, + "learning_rate": 1.5054406367959428e-05, + "loss": 0.1064, + "step": 37407 + }, + { + "epoch": 0.6672136410658867, + "grad_norm": 0.27691397070884705, + "learning_rate": 1.5052978350887963e-05, + "loss": 0.1711, + "step": 37408 + }, + { + "epoch": 0.6672314771876003, + "grad_norm": 0.22487658262252808, + "learning_rate": 1.5051550372374596e-05, + "loss": 0.1222, + "step": 37409 + }, + { + "epoch": 0.667249313309314, + "grad_norm": 0.23512691259384155, + "learning_rate": 1.5050122432424852e-05, + "loss": 0.162, + "step": 37410 + }, + { + "epoch": 0.6672671494310277, + "grad_norm": 0.2749473452568054, + "learning_rate": 1.5048694531044261e-05, + "loss": 0.1995, + "step": 37411 + }, + { + "epoch": 0.6672849855527414, + "grad_norm": 0.22493837773799896, + "learning_rate": 1.5047266668238374e-05, + "loss": 0.1129, + "step": 37412 + }, + { + "epoch": 0.6673028216744551, + "grad_norm": 0.23057669401168823, + "learning_rate": 1.5045838844012716e-05, + "loss": 0.0628, + "step": 37413 + }, + { + "epoch": 0.6673206577961688, + "grad_norm": 0.29819393157958984, + "learning_rate": 1.5044411058372831e-05, + "loss": 0.1287, + "step": 37414 + }, + { + "epoch": 0.6673384939178825, + "grad_norm": 0.21266312897205353, + "learning_rate": 1.5042983311324236e-05, + "loss": 0.1094, + "step": 37415 + }, + { + "epoch": 0.6673563300395962, + "grad_norm": 0.286541223526001, + "learning_rate": 1.5041555602872487e-05, + "loss": 0.1372, + "step": 37416 + }, + { + "epoch": 0.6673741661613098, + "grad_norm": 0.25222474336624146, + "learning_rate": 1.5040127933023108e-05, + "loss": 0.1015, + "step": 37417 + }, + { + "epoch": 0.6673920022830235, + "grad_norm": 0.24851714074611664, + "learning_rate": 1.5038700301781627e-05, + "loss": 0.1526, + "step": 37418 + }, + { + "epoch": 0.6674098384047372, + "grad_norm": 0.26983392238616943, + "learning_rate": 1.5037272709153587e-05, + "loss": 0.1368, + "step": 37419 + }, + { + "epoch": 0.667427674526451, + "grad_norm": 0.2393546849489212, + "learning_rate": 1.5035845155144517e-05, + "loss": 0.1037, + "step": 37420 + }, + { + "epoch": 0.6674455106481647, + "grad_norm": 0.2689007520675659, + "learning_rate": 1.5034417639759956e-05, + "loss": 0.1096, + "step": 37421 + }, + { + "epoch": 0.6674633467698784, + "grad_norm": 0.20499089360237122, + "learning_rate": 1.5032990163005439e-05, + "loss": 0.1121, + "step": 37422 + }, + { + "epoch": 0.6674811828915921, + "grad_norm": 0.23303039371967316, + "learning_rate": 1.503156272488649e-05, + "loss": 0.1179, + "step": 37423 + }, + { + "epoch": 0.6674990190133058, + "grad_norm": 0.24827894568443298, + "learning_rate": 1.503013532540864e-05, + "loss": 0.0918, + "step": 37424 + }, + { + "epoch": 0.6675168551350195, + "grad_norm": 0.25609734654426575, + "learning_rate": 1.5028707964577437e-05, + "loss": 0.0972, + "step": 37425 + }, + { + "epoch": 0.6675346912567331, + "grad_norm": 0.30394768714904785, + "learning_rate": 1.5027280642398401e-05, + "loss": 0.1581, + "step": 37426 + }, + { + "epoch": 0.6675525273784468, + "grad_norm": 0.20294636487960815, + "learning_rate": 1.5025853358877076e-05, + "loss": 0.1018, + "step": 37427 + }, + { + "epoch": 0.6675703635001605, + "grad_norm": 0.274728000164032, + "learning_rate": 1.5024426114018977e-05, + "loss": 0.126, + "step": 37428 + }, + { + "epoch": 0.6675881996218742, + "grad_norm": 0.2742089331150055, + "learning_rate": 1.5022998907829658e-05, + "loss": 0.1272, + "step": 37429 + }, + { + "epoch": 0.6676060357435879, + "grad_norm": 0.23590481281280518, + "learning_rate": 1.5021571740314644e-05, + "loss": 0.119, + "step": 37430 + }, + { + "epoch": 0.6676238718653016, + "grad_norm": 0.3870883285999298, + "learning_rate": 1.5020144611479458e-05, + "loss": 0.1061, + "step": 37431 + }, + { + "epoch": 0.6676417079870153, + "grad_norm": 0.21080902218818665, + "learning_rate": 1.5018717521329633e-05, + "loss": 0.1259, + "step": 37432 + }, + { + "epoch": 0.667659544108729, + "grad_norm": 0.28261756896972656, + "learning_rate": 1.5017290469870718e-05, + "loss": 0.1083, + "step": 37433 + }, + { + "epoch": 0.6676773802304427, + "grad_norm": 0.20457933843135834, + "learning_rate": 1.5015863457108225e-05, + "loss": 0.0788, + "step": 37434 + }, + { + "epoch": 0.6676952163521563, + "grad_norm": 0.2747241258621216, + "learning_rate": 1.50144364830477e-05, + "loss": 0.1056, + "step": 37435 + }, + { + "epoch": 0.66771305247387, + "grad_norm": 0.32505983114242554, + "learning_rate": 1.5013009547694668e-05, + "loss": 0.1596, + "step": 37436 + }, + { + "epoch": 0.6677308885955838, + "grad_norm": 0.3339903652667999, + "learning_rate": 1.501158265105465e-05, + "loss": 0.1605, + "step": 37437 + }, + { + "epoch": 0.6677487247172975, + "grad_norm": 0.23056331276893616, + "learning_rate": 1.5010155793133196e-05, + "loss": 0.1439, + "step": 37438 + }, + { + "epoch": 0.6677665608390112, + "grad_norm": 0.3030514419078827, + "learning_rate": 1.5008728973935832e-05, + "loss": 0.1267, + "step": 37439 + }, + { + "epoch": 0.6677843969607249, + "grad_norm": 0.2898868918418884, + "learning_rate": 1.5007302193468076e-05, + "loss": 0.1573, + "step": 37440 + }, + { + "epoch": 0.6678022330824386, + "grad_norm": 0.24350489675998688, + "learning_rate": 1.5005875451735466e-05, + "loss": 0.0958, + "step": 37441 + }, + { + "epoch": 0.6678200692041523, + "grad_norm": 0.3009876310825348, + "learning_rate": 1.5004448748743543e-05, + "loss": 0.1374, + "step": 37442 + }, + { + "epoch": 0.667837905325866, + "grad_norm": 0.2699940800666809, + "learning_rate": 1.500302208449783e-05, + "loss": 0.1459, + "step": 37443 + }, + { + "epoch": 0.6678557414475796, + "grad_norm": 0.3586570918560028, + "learning_rate": 1.500159545900386e-05, + "loss": 0.126, + "step": 37444 + }, + { + "epoch": 0.6678735775692933, + "grad_norm": 0.2170565277338028, + "learning_rate": 1.5000168872267143e-05, + "loss": 0.0849, + "step": 37445 + }, + { + "epoch": 0.667891413691007, + "grad_norm": 0.24756716191768646, + "learning_rate": 1.499874232429324e-05, + "loss": 0.1256, + "step": 37446 + }, + { + "epoch": 0.6679092498127207, + "grad_norm": 0.263731449842453, + "learning_rate": 1.4997315815087656e-05, + "loss": 0.1356, + "step": 37447 + }, + { + "epoch": 0.6679270859344344, + "grad_norm": 0.32832127809524536, + "learning_rate": 1.4995889344655939e-05, + "loss": 0.1379, + "step": 37448 + }, + { + "epoch": 0.6679449220561481, + "grad_norm": 0.1897130012512207, + "learning_rate": 1.499446291300361e-05, + "loss": 0.0969, + "step": 37449 + }, + { + "epoch": 0.6679627581778618, + "grad_norm": 0.25627386569976807, + "learning_rate": 1.4993036520136189e-05, + "loss": 0.1066, + "step": 37450 + }, + { + "epoch": 0.6679805942995755, + "grad_norm": 0.29881152510643005, + "learning_rate": 1.4991610166059222e-05, + "loss": 0.0961, + "step": 37451 + }, + { + "epoch": 0.6679984304212891, + "grad_norm": 0.2977713942527771, + "learning_rate": 1.4990183850778233e-05, + "loss": 0.149, + "step": 37452 + }, + { + "epoch": 0.6680162665430028, + "grad_norm": 0.2646772563457489, + "learning_rate": 1.4988757574298745e-05, + "loss": 0.0877, + "step": 37453 + }, + { + "epoch": 0.6680341026647166, + "grad_norm": 0.24207238852977753, + "learning_rate": 1.498733133662629e-05, + "loss": 0.1714, + "step": 37454 + }, + { + "epoch": 0.6680519387864303, + "grad_norm": 0.17280913889408112, + "learning_rate": 1.4985905137766396e-05, + "loss": 0.071, + "step": 37455 + }, + { + "epoch": 0.668069774908144, + "grad_norm": 0.21218106150627136, + "learning_rate": 1.4984478977724598e-05, + "loss": 0.1087, + "step": 37456 + }, + { + "epoch": 0.6680876110298577, + "grad_norm": 0.32297825813293457, + "learning_rate": 1.4983052856506419e-05, + "loss": 0.1698, + "step": 37457 + }, + { + "epoch": 0.6681054471515714, + "grad_norm": 0.2423536479473114, + "learning_rate": 1.4981626774117375e-05, + "loss": 0.1154, + "step": 37458 + }, + { + "epoch": 0.6681232832732851, + "grad_norm": 0.26053690910339355, + "learning_rate": 1.4980200730563016e-05, + "loss": 0.1514, + "step": 37459 + }, + { + "epoch": 0.6681411193949988, + "grad_norm": 0.29639753699302673, + "learning_rate": 1.497877472584886e-05, + "loss": 0.1771, + "step": 37460 + }, + { + "epoch": 0.6681589555167124, + "grad_norm": 0.28842461109161377, + "learning_rate": 1.4977348759980431e-05, + "loss": 0.1221, + "step": 37461 + }, + { + "epoch": 0.6681767916384261, + "grad_norm": 0.29308587312698364, + "learning_rate": 1.4975922832963263e-05, + "loss": 0.1149, + "step": 37462 + }, + { + "epoch": 0.6681946277601398, + "grad_norm": 0.2799105644226074, + "learning_rate": 1.4974496944802873e-05, + "loss": 0.1297, + "step": 37463 + }, + { + "epoch": 0.6682124638818535, + "grad_norm": 0.3396545350551605, + "learning_rate": 1.4973071095504804e-05, + "loss": 0.0983, + "step": 37464 + }, + { + "epoch": 0.6682303000035672, + "grad_norm": 0.3781348168849945, + "learning_rate": 1.4971645285074573e-05, + "loss": 0.1468, + "step": 37465 + }, + { + "epoch": 0.6682481361252809, + "grad_norm": 0.32027482986450195, + "learning_rate": 1.497021951351771e-05, + "loss": 0.1794, + "step": 37466 + }, + { + "epoch": 0.6682659722469946, + "grad_norm": 0.2661987543106079, + "learning_rate": 1.4968793780839729e-05, + "loss": 0.127, + "step": 37467 + }, + { + "epoch": 0.6682838083687083, + "grad_norm": 0.26616767048835754, + "learning_rate": 1.4967368087046173e-05, + "loss": 0.131, + "step": 37468 + }, + { + "epoch": 0.668301644490422, + "grad_norm": 0.24877332150936127, + "learning_rate": 1.496594243214257e-05, + "loss": 0.1112, + "step": 37469 + }, + { + "epoch": 0.6683194806121358, + "grad_norm": 0.4432615339756012, + "learning_rate": 1.4964516816134443e-05, + "loss": 0.1462, + "step": 37470 + }, + { + "epoch": 0.6683373167338494, + "grad_norm": 0.2551150918006897, + "learning_rate": 1.49630912390273e-05, + "loss": 0.1285, + "step": 37471 + }, + { + "epoch": 0.6683551528555631, + "grad_norm": 0.24343405663967133, + "learning_rate": 1.4961665700826694e-05, + "loss": 0.1049, + "step": 37472 + }, + { + "epoch": 0.6683729889772768, + "grad_norm": 0.3006855547428131, + "learning_rate": 1.496024020153814e-05, + "loss": 0.1246, + "step": 37473 + }, + { + "epoch": 0.6683908250989905, + "grad_norm": 0.2872680723667145, + "learning_rate": 1.4958814741167154e-05, + "loss": 0.1437, + "step": 37474 + }, + { + "epoch": 0.6684086612207042, + "grad_norm": 0.24515847861766815, + "learning_rate": 1.4957389319719278e-05, + "loss": 0.1271, + "step": 37475 + }, + { + "epoch": 0.6684264973424179, + "grad_norm": 0.2896585464477539, + "learning_rate": 1.4955963937200019e-05, + "loss": 0.1064, + "step": 37476 + }, + { + "epoch": 0.6684443334641316, + "grad_norm": 0.2942967116832733, + "learning_rate": 1.4954538593614922e-05, + "loss": 0.1402, + "step": 37477 + }, + { + "epoch": 0.6684621695858453, + "grad_norm": 0.264669805765152, + "learning_rate": 1.4953113288969503e-05, + "loss": 0.1538, + "step": 37478 + }, + { + "epoch": 0.6684800057075589, + "grad_norm": 0.2579793632030487, + "learning_rate": 1.495168802326929e-05, + "loss": 0.1263, + "step": 37479 + }, + { + "epoch": 0.6684978418292726, + "grad_norm": 0.23053547739982605, + "learning_rate": 1.495026279651979e-05, + "loss": 0.088, + "step": 37480 + }, + { + "epoch": 0.6685156779509863, + "grad_norm": 0.2720099687576294, + "learning_rate": 1.4948837608726558e-05, + "loss": 0.1734, + "step": 37481 + }, + { + "epoch": 0.6685335140727, + "grad_norm": 0.21285288035869598, + "learning_rate": 1.4947412459895093e-05, + "loss": 0.126, + "step": 37482 + }, + { + "epoch": 0.6685513501944137, + "grad_norm": 0.23870287835597992, + "learning_rate": 1.4945987350030934e-05, + "loss": 0.0877, + "step": 37483 + }, + { + "epoch": 0.6685691863161274, + "grad_norm": 0.2172452062368393, + "learning_rate": 1.4944562279139596e-05, + "loss": 0.0836, + "step": 37484 + }, + { + "epoch": 0.6685870224378411, + "grad_norm": 0.27493658661842346, + "learning_rate": 1.4943137247226614e-05, + "loss": 0.1008, + "step": 37485 + }, + { + "epoch": 0.6686048585595548, + "grad_norm": 0.2739734947681427, + "learning_rate": 1.4941712254297504e-05, + "loss": 0.1229, + "step": 37486 + }, + { + "epoch": 0.6686226946812686, + "grad_norm": 0.21329782903194427, + "learning_rate": 1.4940287300357794e-05, + "loss": 0.0987, + "step": 37487 + }, + { + "epoch": 0.6686405308029822, + "grad_norm": 0.34743237495422363, + "learning_rate": 1.4938862385413e-05, + "loss": 0.1747, + "step": 37488 + }, + { + "epoch": 0.6686583669246959, + "grad_norm": 0.3650725185871124, + "learning_rate": 1.4937437509468649e-05, + "loss": 0.0952, + "step": 37489 + }, + { + "epoch": 0.6686762030464096, + "grad_norm": 0.3573608100414276, + "learning_rate": 1.4936012672530264e-05, + "loss": 0.1652, + "step": 37490 + }, + { + "epoch": 0.6686940391681233, + "grad_norm": 0.28686070442199707, + "learning_rate": 1.4934587874603379e-05, + "loss": 0.1206, + "step": 37491 + }, + { + "epoch": 0.668711875289837, + "grad_norm": 0.2425878793001175, + "learning_rate": 1.4933163115693505e-05, + "loss": 0.0896, + "step": 37492 + }, + { + "epoch": 0.6687297114115507, + "grad_norm": 0.2659091055393219, + "learning_rate": 1.493173839580616e-05, + "loss": 0.1164, + "step": 37493 + }, + { + "epoch": 0.6687475475332644, + "grad_norm": 0.3583313822746277, + "learning_rate": 1.4930313714946884e-05, + "loss": 0.168, + "step": 37494 + }, + { + "epoch": 0.6687653836549781, + "grad_norm": 0.2514447271823883, + "learning_rate": 1.4928889073121183e-05, + "loss": 0.0971, + "step": 37495 + }, + { + "epoch": 0.6687832197766918, + "grad_norm": 0.25375911593437195, + "learning_rate": 1.4927464470334592e-05, + "loss": 0.1044, + "step": 37496 + }, + { + "epoch": 0.6688010558984054, + "grad_norm": 0.21628263592720032, + "learning_rate": 1.492603990659262e-05, + "loss": 0.1277, + "step": 37497 + }, + { + "epoch": 0.6688188920201191, + "grad_norm": 0.2946949005126953, + "learning_rate": 1.4924615381900803e-05, + "loss": 0.1275, + "step": 37498 + }, + { + "epoch": 0.6688367281418328, + "grad_norm": 0.2594294250011444, + "learning_rate": 1.4923190896264661e-05, + "loss": 0.1148, + "step": 37499 + }, + { + "epoch": 0.6688545642635465, + "grad_norm": 0.25827738642692566, + "learning_rate": 1.4921766449689709e-05, + "loss": 0.1141, + "step": 37500 + }, + { + "epoch": 0.6688724003852602, + "grad_norm": 0.27559417486190796, + "learning_rate": 1.4920342042181468e-05, + "loss": 0.1147, + "step": 37501 + }, + { + "epoch": 0.6688902365069739, + "grad_norm": 0.17399421334266663, + "learning_rate": 1.4918917673745458e-05, + "loss": 0.1275, + "step": 37502 + }, + { + "epoch": 0.6689080726286876, + "grad_norm": 0.2777664065361023, + "learning_rate": 1.491749334438721e-05, + "loss": 0.1374, + "step": 37503 + }, + { + "epoch": 0.6689259087504014, + "grad_norm": 0.2537831962108612, + "learning_rate": 1.4916069054112242e-05, + "loss": 0.1382, + "step": 37504 + }, + { + "epoch": 0.668943744872115, + "grad_norm": 0.23989103734493256, + "learning_rate": 1.4914644802926075e-05, + "loss": 0.1101, + "step": 37505 + }, + { + "epoch": 0.6689615809938287, + "grad_norm": 0.27971115708351135, + "learning_rate": 1.4913220590834218e-05, + "loss": 0.1088, + "step": 37506 + }, + { + "epoch": 0.6689794171155424, + "grad_norm": 0.27331462502479553, + "learning_rate": 1.4911796417842211e-05, + "loss": 0.1273, + "step": 37507 + }, + { + "epoch": 0.6689972532372561, + "grad_norm": 0.2122882902622223, + "learning_rate": 1.4910372283955565e-05, + "loss": 0.1375, + "step": 37508 + }, + { + "epoch": 0.6690150893589698, + "grad_norm": 0.2488301545381546, + "learning_rate": 1.49089481891798e-05, + "loss": 0.135, + "step": 37509 + }, + { + "epoch": 0.6690329254806835, + "grad_norm": 0.30871787667274475, + "learning_rate": 1.490752413352043e-05, + "loss": 0.1519, + "step": 37510 + }, + { + "epoch": 0.6690507616023972, + "grad_norm": 0.276216596364975, + "learning_rate": 1.4906100116982988e-05, + "loss": 0.1311, + "step": 37511 + }, + { + "epoch": 0.6690685977241109, + "grad_norm": 0.315306156873703, + "learning_rate": 1.4904676139572992e-05, + "loss": 0.1344, + "step": 37512 + }, + { + "epoch": 0.6690864338458246, + "grad_norm": 0.29117345809936523, + "learning_rate": 1.4903252201295959e-05, + "loss": 0.1151, + "step": 37513 + }, + { + "epoch": 0.6691042699675382, + "grad_norm": 0.3025839328765869, + "learning_rate": 1.4901828302157406e-05, + "loss": 0.1338, + "step": 37514 + }, + { + "epoch": 0.6691221060892519, + "grad_norm": 0.3293962776660919, + "learning_rate": 1.4900404442162843e-05, + "loss": 0.1179, + "step": 37515 + }, + { + "epoch": 0.6691399422109656, + "grad_norm": 0.3022843301296234, + "learning_rate": 1.4898980621317813e-05, + "loss": 0.0851, + "step": 37516 + }, + { + "epoch": 0.6691577783326793, + "grad_norm": 0.20449146628379822, + "learning_rate": 1.4897556839627818e-05, + "loss": 0.0773, + "step": 37517 + }, + { + "epoch": 0.669175614454393, + "grad_norm": 0.26335957646369934, + "learning_rate": 1.4896133097098385e-05, + "loss": 0.1209, + "step": 37518 + }, + { + "epoch": 0.6691934505761067, + "grad_norm": 0.37627536058425903, + "learning_rate": 1.489470939373502e-05, + "loss": 0.1152, + "step": 37519 + }, + { + "epoch": 0.6692112866978204, + "grad_norm": 0.21723400056362152, + "learning_rate": 1.4893285729543263e-05, + "loss": 0.0907, + "step": 37520 + }, + { + "epoch": 0.6692291228195342, + "grad_norm": 0.25401490926742554, + "learning_rate": 1.4891862104528619e-05, + "loss": 0.0872, + "step": 37521 + }, + { + "epoch": 0.6692469589412479, + "grad_norm": 0.2620532512664795, + "learning_rate": 1.4890438518696608e-05, + "loss": 0.127, + "step": 37522 + }, + { + "epoch": 0.6692647950629615, + "grad_norm": 0.3761610984802246, + "learning_rate": 1.4889014972052745e-05, + "loss": 0.1748, + "step": 37523 + }, + { + "epoch": 0.6692826311846752, + "grad_norm": 0.27166199684143066, + "learning_rate": 1.4887591464602557e-05, + "loss": 0.1638, + "step": 37524 + }, + { + "epoch": 0.6693004673063889, + "grad_norm": 0.27220532298088074, + "learning_rate": 1.4886167996351557e-05, + "loss": 0.0843, + "step": 37525 + }, + { + "epoch": 0.6693183034281026, + "grad_norm": 0.4786548614501953, + "learning_rate": 1.4884744567305265e-05, + "loss": 0.0943, + "step": 37526 + }, + { + "epoch": 0.6693361395498163, + "grad_norm": 0.46628257632255554, + "learning_rate": 1.4883321177469197e-05, + "loss": 0.1853, + "step": 37527 + }, + { + "epoch": 0.66935397567153, + "grad_norm": 0.2542572617530823, + "learning_rate": 1.4881897826848861e-05, + "loss": 0.0616, + "step": 37528 + }, + { + "epoch": 0.6693718117932437, + "grad_norm": 0.21073196828365326, + "learning_rate": 1.4880474515449794e-05, + "loss": 0.0873, + "step": 37529 + }, + { + "epoch": 0.6693896479149574, + "grad_norm": 0.2993996739387512, + "learning_rate": 1.4879051243277497e-05, + "loss": 0.1544, + "step": 37530 + }, + { + "epoch": 0.669407484036671, + "grad_norm": 0.26914775371551514, + "learning_rate": 1.4877628010337496e-05, + "loss": 0.1671, + "step": 37531 + }, + { + "epoch": 0.6694253201583847, + "grad_norm": 0.2967449426651001, + "learning_rate": 1.4876204816635297e-05, + "loss": 0.1226, + "step": 37532 + }, + { + "epoch": 0.6694431562800984, + "grad_norm": 0.3515028655529022, + "learning_rate": 1.4874781662176434e-05, + "loss": 0.066, + "step": 37533 + }, + { + "epoch": 0.6694609924018121, + "grad_norm": 0.3658161759376526, + "learning_rate": 1.4873358546966415e-05, + "loss": 0.164, + "step": 37534 + }, + { + "epoch": 0.6694788285235258, + "grad_norm": 0.26576781272888184, + "learning_rate": 1.4871935471010756e-05, + "loss": 0.1076, + "step": 37535 + }, + { + "epoch": 0.6694966646452395, + "grad_norm": 0.366319477558136, + "learning_rate": 1.4870512434314965e-05, + "loss": 0.1495, + "step": 37536 + }, + { + "epoch": 0.6695145007669532, + "grad_norm": 0.2648712992668152, + "learning_rate": 1.4869089436884576e-05, + "loss": 0.0945, + "step": 37537 + }, + { + "epoch": 0.669532336888667, + "grad_norm": 0.227285236120224, + "learning_rate": 1.4867666478725087e-05, + "loss": 0.129, + "step": 37538 + }, + { + "epoch": 0.6695501730103807, + "grad_norm": 0.24719049036502838, + "learning_rate": 1.486624355984203e-05, + "loss": 0.1423, + "step": 37539 + }, + { + "epoch": 0.6695680091320944, + "grad_norm": 0.3355978727340698, + "learning_rate": 1.4864820680240913e-05, + "loss": 0.1006, + "step": 37540 + }, + { + "epoch": 0.669585845253808, + "grad_norm": 0.2794986069202423, + "learning_rate": 1.486339783992724e-05, + "loss": 0.0564, + "step": 37541 + }, + { + "epoch": 0.6696036813755217, + "grad_norm": 0.3182423412799835, + "learning_rate": 1.486197503890655e-05, + "loss": 0.1518, + "step": 37542 + }, + { + "epoch": 0.6696215174972354, + "grad_norm": 0.2631544768810272, + "learning_rate": 1.4860552277184344e-05, + "loss": 0.1127, + "step": 37543 + }, + { + "epoch": 0.6696393536189491, + "grad_norm": 0.2074270248413086, + "learning_rate": 1.4859129554766139e-05, + "loss": 0.1505, + "step": 37544 + }, + { + "epoch": 0.6696571897406628, + "grad_norm": 0.2757417857646942, + "learning_rate": 1.485770687165744e-05, + "loss": 0.0953, + "step": 37545 + }, + { + "epoch": 0.6696750258623765, + "grad_norm": 0.3053838908672333, + "learning_rate": 1.4856284227863784e-05, + "loss": 0.1253, + "step": 37546 + }, + { + "epoch": 0.6696928619840902, + "grad_norm": 0.32925012707710266, + "learning_rate": 1.4854861623390676e-05, + "loss": 0.0962, + "step": 37547 + }, + { + "epoch": 0.6697106981058039, + "grad_norm": 0.23687401413917542, + "learning_rate": 1.4853439058243626e-05, + "loss": 0.1458, + "step": 37548 + }, + { + "epoch": 0.6697285342275175, + "grad_norm": 0.2599889636039734, + "learning_rate": 1.4852016532428143e-05, + "loss": 0.1104, + "step": 37549 + }, + { + "epoch": 0.6697463703492312, + "grad_norm": 0.35102471709251404, + "learning_rate": 1.4850594045949757e-05, + "loss": 0.1561, + "step": 37550 + }, + { + "epoch": 0.6697642064709449, + "grad_norm": 0.24412554502487183, + "learning_rate": 1.4849171598813966e-05, + "loss": 0.1223, + "step": 37551 + }, + { + "epoch": 0.6697820425926586, + "grad_norm": 0.228230282664299, + "learning_rate": 1.48477491910263e-05, + "loss": 0.1014, + "step": 37552 + }, + { + "epoch": 0.6697998787143723, + "grad_norm": 0.3021867275238037, + "learning_rate": 1.4846326822592265e-05, + "loss": 0.091, + "step": 37553 + }, + { + "epoch": 0.669817714836086, + "grad_norm": 0.2875548303127289, + "learning_rate": 1.4844904493517364e-05, + "loss": 0.1095, + "step": 37554 + }, + { + "epoch": 0.6698355509577998, + "grad_norm": 0.29142051935195923, + "learning_rate": 1.4843482203807132e-05, + "loss": 0.0867, + "step": 37555 + }, + { + "epoch": 0.6698533870795135, + "grad_norm": 0.4533658027648926, + "learning_rate": 1.484205995346707e-05, + "loss": 0.1635, + "step": 37556 + }, + { + "epoch": 0.6698712232012272, + "grad_norm": 0.2554011344909668, + "learning_rate": 1.4840637742502686e-05, + "loss": 0.1144, + "step": 37557 + }, + { + "epoch": 0.6698890593229408, + "grad_norm": 0.3522297143936157, + "learning_rate": 1.4839215570919501e-05, + "loss": 0.1675, + "step": 37558 + }, + { + "epoch": 0.6699068954446545, + "grad_norm": 0.22275668382644653, + "learning_rate": 1.4837793438723024e-05, + "loss": 0.1349, + "step": 37559 + }, + { + "epoch": 0.6699247315663682, + "grad_norm": 0.2554265558719635, + "learning_rate": 1.4836371345918775e-05, + "loss": 0.1304, + "step": 37560 + }, + { + "epoch": 0.6699425676880819, + "grad_norm": 0.3709714710712433, + "learning_rate": 1.4834949292512259e-05, + "loss": 0.1173, + "step": 37561 + }, + { + "epoch": 0.6699604038097956, + "grad_norm": 0.30835068225860596, + "learning_rate": 1.4833527278508985e-05, + "loss": 0.2022, + "step": 37562 + }, + { + "epoch": 0.6699782399315093, + "grad_norm": 0.30311504006385803, + "learning_rate": 1.4832105303914478e-05, + "loss": 0.1439, + "step": 37563 + }, + { + "epoch": 0.669996076053223, + "grad_norm": 0.2063443958759308, + "learning_rate": 1.4830683368734244e-05, + "loss": 0.1062, + "step": 37564 + }, + { + "epoch": 0.6700139121749367, + "grad_norm": 0.43419355154037476, + "learning_rate": 1.4829261472973784e-05, + "loss": 0.1415, + "step": 37565 + }, + { + "epoch": 0.6700317482966504, + "grad_norm": 0.28292039036750793, + "learning_rate": 1.4827839616638628e-05, + "loss": 0.0943, + "step": 37566 + }, + { + "epoch": 0.670049584418364, + "grad_norm": 0.2872313857078552, + "learning_rate": 1.482641779973427e-05, + "loss": 0.1559, + "step": 37567 + }, + { + "epoch": 0.6700674205400777, + "grad_norm": 0.2834457457065582, + "learning_rate": 1.4824996022266242e-05, + "loss": 0.1601, + "step": 37568 + }, + { + "epoch": 0.6700852566617914, + "grad_norm": 0.21420888602733612, + "learning_rate": 1.4823574284240037e-05, + "loss": 0.0839, + "step": 37569 + }, + { + "epoch": 0.6701030927835051, + "grad_norm": 0.3201926052570343, + "learning_rate": 1.482215258566118e-05, + "loss": 0.2149, + "step": 37570 + }, + { + "epoch": 0.6701209289052189, + "grad_norm": 0.2816302180290222, + "learning_rate": 1.4820730926535159e-05, + "loss": 0.1251, + "step": 37571 + }, + { + "epoch": 0.6701387650269326, + "grad_norm": 0.4552721679210663, + "learning_rate": 1.481930930686751e-05, + "loss": 0.1166, + "step": 37572 + }, + { + "epoch": 0.6701566011486463, + "grad_norm": 0.2416018694639206, + "learning_rate": 1.4817887726663736e-05, + "loss": 0.112, + "step": 37573 + }, + { + "epoch": 0.67017443727036, + "grad_norm": 0.30778974294662476, + "learning_rate": 1.4816466185929351e-05, + "loss": 0.1872, + "step": 37574 + }, + { + "epoch": 0.6701922733920737, + "grad_norm": 0.2449854463338852, + "learning_rate": 1.4815044684669848e-05, + "loss": 0.0933, + "step": 37575 + }, + { + "epoch": 0.6702101095137873, + "grad_norm": 0.21616952121257782, + "learning_rate": 1.4813623222890758e-05, + "loss": 0.12, + "step": 37576 + }, + { + "epoch": 0.670227945635501, + "grad_norm": 0.24348503351211548, + "learning_rate": 1.4812201800597583e-05, + "loss": 0.1092, + "step": 37577 + }, + { + "epoch": 0.6702457817572147, + "grad_norm": 0.37442681193351746, + "learning_rate": 1.4810780417795827e-05, + "loss": 0.1085, + "step": 37578 + }, + { + "epoch": 0.6702636178789284, + "grad_norm": 0.28844305872917175, + "learning_rate": 1.4809359074491014e-05, + "loss": 0.1168, + "step": 37579 + }, + { + "epoch": 0.6702814540006421, + "grad_norm": 0.29024919867515564, + "learning_rate": 1.4807937770688632e-05, + "loss": 0.1468, + "step": 37580 + }, + { + "epoch": 0.6702992901223558, + "grad_norm": 0.34914064407348633, + "learning_rate": 1.4806516506394214e-05, + "loss": 0.1353, + "step": 37581 + }, + { + "epoch": 0.6703171262440695, + "grad_norm": 0.232282817363739, + "learning_rate": 1.4805095281613257e-05, + "loss": 0.1102, + "step": 37582 + }, + { + "epoch": 0.6703349623657832, + "grad_norm": 0.28244367241859436, + "learning_rate": 1.4803674096351274e-05, + "loss": 0.1173, + "step": 37583 + }, + { + "epoch": 0.6703527984874968, + "grad_norm": 0.3104051947593689, + "learning_rate": 1.4802252950613763e-05, + "loss": 0.1219, + "step": 37584 + }, + { + "epoch": 0.6703706346092105, + "grad_norm": 0.3874141573905945, + "learning_rate": 1.480083184440625e-05, + "loss": 0.1066, + "step": 37585 + }, + { + "epoch": 0.6703884707309242, + "grad_norm": 0.26109880208969116, + "learning_rate": 1.4799410777734229e-05, + "loss": 0.1437, + "step": 37586 + }, + { + "epoch": 0.6704063068526379, + "grad_norm": 0.33147236704826355, + "learning_rate": 1.4797989750603223e-05, + "loss": 0.1077, + "step": 37587 + }, + { + "epoch": 0.6704241429743517, + "grad_norm": 0.28374454379081726, + "learning_rate": 1.479656876301872e-05, + "loss": 0.1403, + "step": 37588 + }, + { + "epoch": 0.6704419790960654, + "grad_norm": 0.26651573181152344, + "learning_rate": 1.4795147814986254e-05, + "loss": 0.1233, + "step": 37589 + }, + { + "epoch": 0.6704598152177791, + "grad_norm": 0.24789181351661682, + "learning_rate": 1.4793726906511316e-05, + "loss": 0.1185, + "step": 37590 + }, + { + "epoch": 0.6704776513394928, + "grad_norm": 0.26695364713668823, + "learning_rate": 1.4792306037599419e-05, + "loss": 0.1285, + "step": 37591 + }, + { + "epoch": 0.6704954874612065, + "grad_norm": 0.2959669828414917, + "learning_rate": 1.4790885208256064e-05, + "loss": 0.0921, + "step": 37592 + }, + { + "epoch": 0.6705133235829202, + "grad_norm": 0.26789140701293945, + "learning_rate": 1.4789464418486764e-05, + "loss": 0.1107, + "step": 37593 + }, + { + "epoch": 0.6705311597046338, + "grad_norm": 0.2850610017776489, + "learning_rate": 1.4788043668297027e-05, + "loss": 0.1426, + "step": 37594 + }, + { + "epoch": 0.6705489958263475, + "grad_norm": 0.3007853925228119, + "learning_rate": 1.4786622957692364e-05, + "loss": 0.12, + "step": 37595 + }, + { + "epoch": 0.6705668319480612, + "grad_norm": 0.24679973721504211, + "learning_rate": 1.478520228667828e-05, + "loss": 0.1015, + "step": 37596 + }, + { + "epoch": 0.6705846680697749, + "grad_norm": 0.17484255135059357, + "learning_rate": 1.478378165526027e-05, + "loss": 0.0678, + "step": 37597 + }, + { + "epoch": 0.6706025041914886, + "grad_norm": 0.3607724905014038, + "learning_rate": 1.4782361063443858e-05, + "loss": 0.1119, + "step": 37598 + }, + { + "epoch": 0.6706203403132023, + "grad_norm": 0.25202685594558716, + "learning_rate": 1.4780940511234542e-05, + "loss": 0.1121, + "step": 37599 + }, + { + "epoch": 0.670638176434916, + "grad_norm": 0.25669777393341064, + "learning_rate": 1.4779519998637833e-05, + "loss": 0.1373, + "step": 37600 + }, + { + "epoch": 0.6706560125566297, + "grad_norm": 0.29675689339637756, + "learning_rate": 1.4778099525659225e-05, + "loss": 0.1726, + "step": 37601 + }, + { + "epoch": 0.6706738486783433, + "grad_norm": 0.2819582223892212, + "learning_rate": 1.4776679092304244e-05, + "loss": 0.1096, + "step": 37602 + }, + { + "epoch": 0.670691684800057, + "grad_norm": 0.26130354404449463, + "learning_rate": 1.4775258698578388e-05, + "loss": 0.1066, + "step": 37603 + }, + { + "epoch": 0.6707095209217707, + "grad_norm": 0.28545647859573364, + "learning_rate": 1.4773838344487156e-05, + "loss": 0.1283, + "step": 37604 + }, + { + "epoch": 0.6707273570434845, + "grad_norm": 0.24724625051021576, + "learning_rate": 1.4772418030036056e-05, + "loss": 0.1517, + "step": 37605 + }, + { + "epoch": 0.6707451931651982, + "grad_norm": 0.2551341950893402, + "learning_rate": 1.4770997755230598e-05, + "loss": 0.1013, + "step": 37606 + }, + { + "epoch": 0.6707630292869119, + "grad_norm": 0.3370712697505951, + "learning_rate": 1.4769577520076284e-05, + "loss": 0.1375, + "step": 37607 + }, + { + "epoch": 0.6707808654086256, + "grad_norm": 0.35645297169685364, + "learning_rate": 1.4768157324578624e-05, + "loss": 0.1083, + "step": 37608 + }, + { + "epoch": 0.6707987015303393, + "grad_norm": 0.36033329367637634, + "learning_rate": 1.4766737168743122e-05, + "loss": 0.1439, + "step": 37609 + }, + { + "epoch": 0.670816537652053, + "grad_norm": 0.21240262687206268, + "learning_rate": 1.4765317052575273e-05, + "loss": 0.1099, + "step": 37610 + }, + { + "epoch": 0.6708343737737666, + "grad_norm": 0.2624545395374298, + "learning_rate": 1.4763896976080595e-05, + "loss": 0.127, + "step": 37611 + }, + { + "epoch": 0.6708522098954803, + "grad_norm": 0.30583328008651733, + "learning_rate": 1.4762476939264592e-05, + "loss": 0.1148, + "step": 37612 + }, + { + "epoch": 0.670870046017194, + "grad_norm": 0.2538442611694336, + "learning_rate": 1.476105694213276e-05, + "loss": 0.1191, + "step": 37613 + }, + { + "epoch": 0.6708878821389077, + "grad_norm": 0.26179274916648865, + "learning_rate": 1.4759636984690606e-05, + "loss": 0.1357, + "step": 37614 + }, + { + "epoch": 0.6709057182606214, + "grad_norm": 0.22708620131015778, + "learning_rate": 1.4758217066943636e-05, + "loss": 0.1486, + "step": 37615 + }, + { + "epoch": 0.6709235543823351, + "grad_norm": 0.22707833349704742, + "learning_rate": 1.4756797188897359e-05, + "loss": 0.133, + "step": 37616 + }, + { + "epoch": 0.6709413905040488, + "grad_norm": 0.3958587348461151, + "learning_rate": 1.4755377350557274e-05, + "loss": 0.2082, + "step": 37617 + }, + { + "epoch": 0.6709592266257625, + "grad_norm": 0.2120359241962433, + "learning_rate": 1.4753957551928881e-05, + "loss": 0.117, + "step": 37618 + }, + { + "epoch": 0.6709770627474761, + "grad_norm": 0.28987663984298706, + "learning_rate": 1.4752537793017684e-05, + "loss": 0.1308, + "step": 37619 + }, + { + "epoch": 0.6709948988691898, + "grad_norm": 0.21637216210365295, + "learning_rate": 1.4751118073829195e-05, + "loss": 0.1628, + "step": 37620 + }, + { + "epoch": 0.6710127349909035, + "grad_norm": 0.30588361620903015, + "learning_rate": 1.4749698394368911e-05, + "loss": 0.1438, + "step": 37621 + }, + { + "epoch": 0.6710305711126173, + "grad_norm": 0.24781139194965363, + "learning_rate": 1.4748278754642341e-05, + "loss": 0.1043, + "step": 37622 + }, + { + "epoch": 0.671048407234331, + "grad_norm": 0.27530962228775024, + "learning_rate": 1.474685915465497e-05, + "loss": 0.1183, + "step": 37623 + }, + { + "epoch": 0.6710662433560447, + "grad_norm": 0.315487265586853, + "learning_rate": 1.4745439594412328e-05, + "loss": 0.1818, + "step": 37624 + }, + { + "epoch": 0.6710840794777584, + "grad_norm": 0.3093649744987488, + "learning_rate": 1.47440200739199e-05, + "loss": 0.1105, + "step": 37625 + }, + { + "epoch": 0.6711019155994721, + "grad_norm": 0.21223153173923492, + "learning_rate": 1.4742600593183192e-05, + "loss": 0.1227, + "step": 37626 + }, + { + "epoch": 0.6711197517211858, + "grad_norm": 0.23524074256420135, + "learning_rate": 1.4741181152207702e-05, + "loss": 0.0975, + "step": 37627 + }, + { + "epoch": 0.6711375878428995, + "grad_norm": 0.20990169048309326, + "learning_rate": 1.4739761750998943e-05, + "loss": 0.1123, + "step": 37628 + }, + { + "epoch": 0.6711554239646131, + "grad_norm": 0.31458204984664917, + "learning_rate": 1.473834238956241e-05, + "loss": 0.1226, + "step": 37629 + }, + { + "epoch": 0.6711732600863268, + "grad_norm": 0.3979092538356781, + "learning_rate": 1.473692306790361e-05, + "loss": 0.18, + "step": 37630 + }, + { + "epoch": 0.6711910962080405, + "grad_norm": 0.32381775975227356, + "learning_rate": 1.473550378602804e-05, + "loss": 0.1098, + "step": 37631 + }, + { + "epoch": 0.6712089323297542, + "grad_norm": 0.2998827397823334, + "learning_rate": 1.4734084543941192e-05, + "loss": 0.139, + "step": 37632 + }, + { + "epoch": 0.6712267684514679, + "grad_norm": 0.24762079119682312, + "learning_rate": 1.4732665341648587e-05, + "loss": 0.1503, + "step": 37633 + }, + { + "epoch": 0.6712446045731816, + "grad_norm": 0.20519985258579254, + "learning_rate": 1.4731246179155716e-05, + "loss": 0.1049, + "step": 37634 + }, + { + "epoch": 0.6712624406948953, + "grad_norm": 0.24733471870422363, + "learning_rate": 1.4729827056468088e-05, + "loss": 0.1011, + "step": 37635 + }, + { + "epoch": 0.671280276816609, + "grad_norm": 0.3979268968105316, + "learning_rate": 1.472840797359118e-05, + "loss": 0.1097, + "step": 37636 + }, + { + "epoch": 0.6712981129383226, + "grad_norm": 0.35445818305015564, + "learning_rate": 1.4726988930530528e-05, + "loss": 0.1076, + "step": 37637 + }, + { + "epoch": 0.6713159490600363, + "grad_norm": 0.22412709891796112, + "learning_rate": 1.472556992729161e-05, + "loss": 0.1164, + "step": 37638 + }, + { + "epoch": 0.6713337851817501, + "grad_norm": 0.2835678458213806, + "learning_rate": 1.4724150963879935e-05, + "loss": 0.1383, + "step": 37639 + }, + { + "epoch": 0.6713516213034638, + "grad_norm": 0.26894861459732056, + "learning_rate": 1.4722732040300985e-05, + "loss": 0.1286, + "step": 37640 + }, + { + "epoch": 0.6713694574251775, + "grad_norm": 0.29006555676460266, + "learning_rate": 1.4721313156560291e-05, + "loss": 0.1593, + "step": 37641 + }, + { + "epoch": 0.6713872935468912, + "grad_norm": 0.2659665048122406, + "learning_rate": 1.471989431266333e-05, + "loss": 0.1596, + "step": 37642 + }, + { + "epoch": 0.6714051296686049, + "grad_norm": 0.22495977580547333, + "learning_rate": 1.4718475508615612e-05, + "loss": 0.1216, + "step": 37643 + }, + { + "epoch": 0.6714229657903186, + "grad_norm": 0.23820440471172333, + "learning_rate": 1.4717056744422635e-05, + "loss": 0.1599, + "step": 37644 + }, + { + "epoch": 0.6714408019120323, + "grad_norm": 0.29072341322898865, + "learning_rate": 1.471563802008989e-05, + "loss": 0.102, + "step": 37645 + }, + { + "epoch": 0.671458638033746, + "grad_norm": 0.2867191433906555, + "learning_rate": 1.4714219335622891e-05, + "loss": 0.1816, + "step": 37646 + }, + { + "epoch": 0.6714764741554596, + "grad_norm": 0.4542645514011383, + "learning_rate": 1.4712800691027135e-05, + "loss": 0.1742, + "step": 37647 + }, + { + "epoch": 0.6714943102771733, + "grad_norm": 0.33744609355926514, + "learning_rate": 1.4711382086308109e-05, + "loss": 0.0977, + "step": 37648 + }, + { + "epoch": 0.671512146398887, + "grad_norm": 0.16860638558864594, + "learning_rate": 1.4709963521471314e-05, + "loss": 0.095, + "step": 37649 + }, + { + "epoch": 0.6715299825206007, + "grad_norm": 0.244256392121315, + "learning_rate": 1.4708544996522267e-05, + "loss": 0.07, + "step": 37650 + }, + { + "epoch": 0.6715478186423144, + "grad_norm": 0.1727474480867386, + "learning_rate": 1.4707126511466452e-05, + "loss": 0.1388, + "step": 37651 + }, + { + "epoch": 0.6715656547640281, + "grad_norm": 0.27268147468566895, + "learning_rate": 1.470570806630937e-05, + "loss": 0.1342, + "step": 37652 + }, + { + "epoch": 0.6715834908857418, + "grad_norm": 0.23442377150058746, + "learning_rate": 1.4704289661056509e-05, + "loss": 0.1139, + "step": 37653 + }, + { + "epoch": 0.6716013270074555, + "grad_norm": 0.34102222323417664, + "learning_rate": 1.4702871295713386e-05, + "loss": 0.1338, + "step": 37654 + }, + { + "epoch": 0.6716191631291691, + "grad_norm": 0.35570889711380005, + "learning_rate": 1.4701452970285487e-05, + "loss": 0.1748, + "step": 37655 + }, + { + "epoch": 0.6716369992508829, + "grad_norm": 0.2313218116760254, + "learning_rate": 1.4700034684778319e-05, + "loss": 0.1355, + "step": 37656 + }, + { + "epoch": 0.6716548353725966, + "grad_norm": 0.24994917213916779, + "learning_rate": 1.4698616439197372e-05, + "loss": 0.0958, + "step": 37657 + }, + { + "epoch": 0.6716726714943103, + "grad_norm": 0.3116813600063324, + "learning_rate": 1.4697198233548137e-05, + "loss": 0.2071, + "step": 37658 + }, + { + "epoch": 0.671690507616024, + "grad_norm": 0.2499520629644394, + "learning_rate": 1.4695780067836129e-05, + "loss": 0.0825, + "step": 37659 + }, + { + "epoch": 0.6717083437377377, + "grad_norm": 0.24971351027488708, + "learning_rate": 1.4694361942066836e-05, + "loss": 0.0953, + "step": 37660 + }, + { + "epoch": 0.6717261798594514, + "grad_norm": 0.2825527489185333, + "learning_rate": 1.4692943856245752e-05, + "loss": 0.1155, + "step": 37661 + }, + { + "epoch": 0.6717440159811651, + "grad_norm": 0.26531845331192017, + "learning_rate": 1.4691525810378379e-05, + "loss": 0.1456, + "step": 37662 + }, + { + "epoch": 0.6717618521028788, + "grad_norm": 0.26592448353767395, + "learning_rate": 1.469010780447021e-05, + "loss": 0.1072, + "step": 37663 + }, + { + "epoch": 0.6717796882245924, + "grad_norm": 0.22369691729545593, + "learning_rate": 1.4688689838526751e-05, + "loss": 0.0864, + "step": 37664 + }, + { + "epoch": 0.6717975243463061, + "grad_norm": 0.19722050428390503, + "learning_rate": 1.468727191255349e-05, + "loss": 0.0697, + "step": 37665 + }, + { + "epoch": 0.6718153604680198, + "grad_norm": 0.2788572311401367, + "learning_rate": 1.4685854026555915e-05, + "loss": 0.2088, + "step": 37666 + }, + { + "epoch": 0.6718331965897335, + "grad_norm": 0.20969094336032867, + "learning_rate": 1.4684436180539543e-05, + "loss": 0.096, + "step": 37667 + }, + { + "epoch": 0.6718510327114472, + "grad_norm": 0.25735971331596375, + "learning_rate": 1.4683018374509857e-05, + "loss": 0.1556, + "step": 37668 + }, + { + "epoch": 0.6718688688331609, + "grad_norm": 0.2810255289077759, + "learning_rate": 1.4681600608472352e-05, + "loss": 0.1574, + "step": 37669 + }, + { + "epoch": 0.6718867049548746, + "grad_norm": 0.21014846861362457, + "learning_rate": 1.4680182882432534e-05, + "loss": 0.0914, + "step": 37670 + }, + { + "epoch": 0.6719045410765883, + "grad_norm": 0.22909945249557495, + "learning_rate": 1.4678765196395876e-05, + "loss": 0.0959, + "step": 37671 + }, + { + "epoch": 0.6719223771983021, + "grad_norm": 0.25569790601730347, + "learning_rate": 1.4677347550367904e-05, + "loss": 0.1828, + "step": 37672 + }, + { + "epoch": 0.6719402133200157, + "grad_norm": 0.22032766044139862, + "learning_rate": 1.4675929944354098e-05, + "loss": 0.12, + "step": 37673 + }, + { + "epoch": 0.6719580494417294, + "grad_norm": 0.3262254297733307, + "learning_rate": 1.467451237835995e-05, + "loss": 0.1647, + "step": 37674 + }, + { + "epoch": 0.6719758855634431, + "grad_norm": 0.24804438650608063, + "learning_rate": 1.4673094852390951e-05, + "loss": 0.1138, + "step": 37675 + }, + { + "epoch": 0.6719937216851568, + "grad_norm": 0.27605322003364563, + "learning_rate": 1.4671677366452607e-05, + "loss": 0.1046, + "step": 37676 + }, + { + "epoch": 0.6720115578068705, + "grad_norm": 0.23179854452610016, + "learning_rate": 1.4670259920550417e-05, + "loss": 0.079, + "step": 37677 + }, + { + "epoch": 0.6720293939285842, + "grad_norm": 0.23570826649665833, + "learning_rate": 1.4668842514689863e-05, + "loss": 0.1103, + "step": 37678 + }, + { + "epoch": 0.6720472300502979, + "grad_norm": 0.21014539897441864, + "learning_rate": 1.4667425148876435e-05, + "loss": 0.1246, + "step": 37679 + }, + { + "epoch": 0.6720650661720116, + "grad_norm": 0.2720472514629364, + "learning_rate": 1.4666007823115646e-05, + "loss": 0.1531, + "step": 37680 + }, + { + "epoch": 0.6720829022937252, + "grad_norm": 0.2559182941913605, + "learning_rate": 1.466459053741298e-05, + "loss": 0.1738, + "step": 37681 + }, + { + "epoch": 0.6721007384154389, + "grad_norm": 0.2380245476961136, + "learning_rate": 1.4663173291773924e-05, + "loss": 0.0951, + "step": 37682 + }, + { + "epoch": 0.6721185745371526, + "grad_norm": 0.3904139995574951, + "learning_rate": 1.4661756086203987e-05, + "loss": 0.1241, + "step": 37683 + }, + { + "epoch": 0.6721364106588663, + "grad_norm": 0.2362600564956665, + "learning_rate": 1.4660338920708644e-05, + "loss": 0.1296, + "step": 37684 + }, + { + "epoch": 0.67215424678058, + "grad_norm": 0.22976341843605042, + "learning_rate": 1.4658921795293407e-05, + "loss": 0.0897, + "step": 37685 + }, + { + "epoch": 0.6721720829022937, + "grad_norm": 0.3275347352027893, + "learning_rate": 1.4657504709963759e-05, + "loss": 0.1434, + "step": 37686 + }, + { + "epoch": 0.6721899190240074, + "grad_norm": 0.27674782276153564, + "learning_rate": 1.4656087664725199e-05, + "loss": 0.1039, + "step": 37687 + }, + { + "epoch": 0.6722077551457211, + "grad_norm": 0.2540251612663269, + "learning_rate": 1.4654670659583203e-05, + "loss": 0.1368, + "step": 37688 + }, + { + "epoch": 0.6722255912674349, + "grad_norm": 0.23808301985263824, + "learning_rate": 1.4653253694543283e-05, + "loss": 0.1324, + "step": 37689 + }, + { + "epoch": 0.6722434273891486, + "grad_norm": 0.32675492763519287, + "learning_rate": 1.4651836769610927e-05, + "loss": 0.1093, + "step": 37690 + }, + { + "epoch": 0.6722612635108622, + "grad_norm": 0.24685119092464447, + "learning_rate": 1.4650419884791628e-05, + "loss": 0.1304, + "step": 37691 + }, + { + "epoch": 0.6722790996325759, + "grad_norm": 0.2423398494720459, + "learning_rate": 1.4649003040090867e-05, + "loss": 0.1824, + "step": 37692 + }, + { + "epoch": 0.6722969357542896, + "grad_norm": 0.39113450050354004, + "learning_rate": 1.4647586235514155e-05, + "loss": 0.1382, + "step": 37693 + }, + { + "epoch": 0.6723147718760033, + "grad_norm": 0.27027618885040283, + "learning_rate": 1.4646169471066973e-05, + "loss": 0.0794, + "step": 37694 + }, + { + "epoch": 0.672332607997717, + "grad_norm": 0.2594882547855377, + "learning_rate": 1.4644752746754817e-05, + "loss": 0.1707, + "step": 37695 + }, + { + "epoch": 0.6723504441194307, + "grad_norm": 0.17667639255523682, + "learning_rate": 1.4643336062583168e-05, + "loss": 0.1104, + "step": 37696 + }, + { + "epoch": 0.6723682802411444, + "grad_norm": 0.21792756021022797, + "learning_rate": 1.464191941855753e-05, + "loss": 0.1102, + "step": 37697 + }, + { + "epoch": 0.672386116362858, + "grad_norm": 0.2616289556026459, + "learning_rate": 1.4640502814683385e-05, + "loss": 0.157, + "step": 37698 + }, + { + "epoch": 0.6724039524845717, + "grad_norm": 0.23419159650802612, + "learning_rate": 1.4639086250966238e-05, + "loss": 0.0988, + "step": 37699 + }, + { + "epoch": 0.6724217886062854, + "grad_norm": 0.2555391788482666, + "learning_rate": 1.4637669727411569e-05, + "loss": 0.167, + "step": 37700 + }, + { + "epoch": 0.6724396247279991, + "grad_norm": 0.23203125596046448, + "learning_rate": 1.4636253244024861e-05, + "loss": 0.1292, + "step": 37701 + }, + { + "epoch": 0.6724574608497128, + "grad_norm": 0.24973362684249878, + "learning_rate": 1.4634836800811624e-05, + "loss": 0.1655, + "step": 37702 + }, + { + "epoch": 0.6724752969714265, + "grad_norm": 0.2770857512950897, + "learning_rate": 1.4633420397777337e-05, + "loss": 0.116, + "step": 37703 + }, + { + "epoch": 0.6724931330931402, + "grad_norm": 0.28690508008003235, + "learning_rate": 1.4632004034927496e-05, + "loss": 0.0732, + "step": 37704 + }, + { + "epoch": 0.6725109692148539, + "grad_norm": 0.30182111263275146, + "learning_rate": 1.4630587712267579e-05, + "loss": 0.1007, + "step": 37705 + }, + { + "epoch": 0.6725288053365677, + "grad_norm": 0.20065154135227203, + "learning_rate": 1.4629171429803096e-05, + "loss": 0.1003, + "step": 37706 + }, + { + "epoch": 0.6725466414582814, + "grad_norm": 0.25812432169914246, + "learning_rate": 1.4627755187539526e-05, + "loss": 0.114, + "step": 37707 + }, + { + "epoch": 0.672564477579995, + "grad_norm": 0.32575470209121704, + "learning_rate": 1.4626338985482363e-05, + "loss": 0.0902, + "step": 37708 + }, + { + "epoch": 0.6725823137017087, + "grad_norm": 0.3000944256782532, + "learning_rate": 1.4624922823637077e-05, + "loss": 0.109, + "step": 37709 + }, + { + "epoch": 0.6726001498234224, + "grad_norm": 0.22485367953777313, + "learning_rate": 1.4623506702009188e-05, + "loss": 0.1388, + "step": 37710 + }, + { + "epoch": 0.6726179859451361, + "grad_norm": 0.22892051935195923, + "learning_rate": 1.4622090620604162e-05, + "loss": 0.1014, + "step": 37711 + }, + { + "epoch": 0.6726358220668498, + "grad_norm": 0.23906515538692474, + "learning_rate": 1.4620674579427507e-05, + "loss": 0.1079, + "step": 37712 + }, + { + "epoch": 0.6726536581885635, + "grad_norm": 0.32567185163497925, + "learning_rate": 1.4619258578484698e-05, + "loss": 0.1644, + "step": 37713 + }, + { + "epoch": 0.6726714943102772, + "grad_norm": 0.27001798152923584, + "learning_rate": 1.461784261778122e-05, + "loss": 0.0826, + "step": 37714 + }, + { + "epoch": 0.6726893304319909, + "grad_norm": 0.28006112575531006, + "learning_rate": 1.461642669732258e-05, + "loss": 0.082, + "step": 37715 + }, + { + "epoch": 0.6727071665537045, + "grad_norm": 0.2673995792865753, + "learning_rate": 1.4615010817114255e-05, + "loss": 0.1182, + "step": 37716 + }, + { + "epoch": 0.6727250026754182, + "grad_norm": 0.3754531145095825, + "learning_rate": 1.4613594977161732e-05, + "loss": 0.1327, + "step": 37717 + }, + { + "epoch": 0.6727428387971319, + "grad_norm": 0.24347876012325287, + "learning_rate": 1.4612179177470504e-05, + "loss": 0.1362, + "step": 37718 + }, + { + "epoch": 0.6727606749188456, + "grad_norm": 0.2931426167488098, + "learning_rate": 1.4610763418046053e-05, + "loss": 0.0847, + "step": 37719 + }, + { + "epoch": 0.6727785110405593, + "grad_norm": 0.26672571897506714, + "learning_rate": 1.4609347698893877e-05, + "loss": 0.1656, + "step": 37720 + }, + { + "epoch": 0.672796347162273, + "grad_norm": 0.24935072660446167, + "learning_rate": 1.460793202001946e-05, + "loss": 0.0904, + "step": 37721 + }, + { + "epoch": 0.6728141832839867, + "grad_norm": 0.25385987758636475, + "learning_rate": 1.4606516381428276e-05, + "loss": 0.1003, + "step": 37722 + }, + { + "epoch": 0.6728320194057005, + "grad_norm": 0.23812651634216309, + "learning_rate": 1.4605100783125836e-05, + "loss": 0.1066, + "step": 37723 + }, + { + "epoch": 0.6728498555274142, + "grad_norm": 0.2761055827140808, + "learning_rate": 1.4603685225117614e-05, + "loss": 0.0984, + "step": 37724 + }, + { + "epoch": 0.6728676916491279, + "grad_norm": 0.3269663453102112, + "learning_rate": 1.4602269707409094e-05, + "loss": 0.1232, + "step": 37725 + }, + { + "epoch": 0.6728855277708415, + "grad_norm": 0.2708067297935486, + "learning_rate": 1.4600854230005775e-05, + "loss": 0.0806, + "step": 37726 + }, + { + "epoch": 0.6729033638925552, + "grad_norm": 0.31648701429367065, + "learning_rate": 1.4599438792913126e-05, + "loss": 0.1025, + "step": 37727 + }, + { + "epoch": 0.6729212000142689, + "grad_norm": 0.3103322684764862, + "learning_rate": 1.4598023396136653e-05, + "loss": 0.1413, + "step": 37728 + }, + { + "epoch": 0.6729390361359826, + "grad_norm": 0.3118903934955597, + "learning_rate": 1.4596608039681835e-05, + "loss": 0.1422, + "step": 37729 + }, + { + "epoch": 0.6729568722576963, + "grad_norm": 0.28199806809425354, + "learning_rate": 1.4595192723554158e-05, + "loss": 0.183, + "step": 37730 + }, + { + "epoch": 0.67297470837941, + "grad_norm": 0.23366296291351318, + "learning_rate": 1.4593777447759096e-05, + "loss": 0.1082, + "step": 37731 + }, + { + "epoch": 0.6729925445011237, + "grad_norm": 0.19522027671337128, + "learning_rate": 1.4592362212302158e-05, + "loss": 0.1084, + "step": 37732 + }, + { + "epoch": 0.6730103806228374, + "grad_norm": 0.25478604435920715, + "learning_rate": 1.4590947017188819e-05, + "loss": 0.137, + "step": 37733 + }, + { + "epoch": 0.673028216744551, + "grad_norm": 0.31069138646125793, + "learning_rate": 1.4589531862424555e-05, + "loss": 0.1136, + "step": 37734 + }, + { + "epoch": 0.6730460528662647, + "grad_norm": 0.30225273966789246, + "learning_rate": 1.4588116748014869e-05, + "loss": 0.1278, + "step": 37735 + }, + { + "epoch": 0.6730638889879784, + "grad_norm": 0.4390554130077362, + "learning_rate": 1.458670167396523e-05, + "loss": 0.1379, + "step": 37736 + }, + { + "epoch": 0.6730817251096921, + "grad_norm": 0.2897048890590668, + "learning_rate": 1.4585286640281144e-05, + "loss": 0.1609, + "step": 37737 + }, + { + "epoch": 0.6730995612314058, + "grad_norm": 0.18407145142555237, + "learning_rate": 1.4583871646968082e-05, + "loss": 0.1144, + "step": 37738 + }, + { + "epoch": 0.6731173973531195, + "grad_norm": 0.22237777709960938, + "learning_rate": 1.4582456694031529e-05, + "loss": 0.1183, + "step": 37739 + }, + { + "epoch": 0.6731352334748333, + "grad_norm": 0.27603867650032043, + "learning_rate": 1.4581041781476965e-05, + "loss": 0.1045, + "step": 37740 + }, + { + "epoch": 0.673153069596547, + "grad_norm": 0.23941335082054138, + "learning_rate": 1.457962690930989e-05, + "loss": 0.1036, + "step": 37741 + }, + { + "epoch": 0.6731709057182607, + "grad_norm": 0.24371333420276642, + "learning_rate": 1.457821207753578e-05, + "loss": 0.1171, + "step": 37742 + }, + { + "epoch": 0.6731887418399743, + "grad_norm": 0.2937003970146179, + "learning_rate": 1.4576797286160121e-05, + "loss": 0.1585, + "step": 37743 + }, + { + "epoch": 0.673206577961688, + "grad_norm": 0.21572428941726685, + "learning_rate": 1.4575382535188386e-05, + "loss": 0.1314, + "step": 37744 + }, + { + "epoch": 0.6732244140834017, + "grad_norm": 0.25045835971832275, + "learning_rate": 1.4573967824626078e-05, + "loss": 0.1522, + "step": 37745 + }, + { + "epoch": 0.6732422502051154, + "grad_norm": 0.2256544828414917, + "learning_rate": 1.4572553154478672e-05, + "loss": 0.0705, + "step": 37746 + }, + { + "epoch": 0.6732600863268291, + "grad_norm": 0.1936129331588745, + "learning_rate": 1.4571138524751652e-05, + "loss": 0.1348, + "step": 37747 + }, + { + "epoch": 0.6732779224485428, + "grad_norm": 0.2671549916267395, + "learning_rate": 1.4569723935450492e-05, + "loss": 0.1277, + "step": 37748 + }, + { + "epoch": 0.6732957585702565, + "grad_norm": 0.23808681964874268, + "learning_rate": 1.4568309386580693e-05, + "loss": 0.1205, + "step": 37749 + }, + { + "epoch": 0.6733135946919702, + "grad_norm": 0.2457624226808548, + "learning_rate": 1.4566894878147718e-05, + "loss": 0.1246, + "step": 37750 + }, + { + "epoch": 0.6733314308136839, + "grad_norm": 0.32891029119491577, + "learning_rate": 1.4565480410157073e-05, + "loss": 0.0401, + "step": 37751 + }, + { + "epoch": 0.6733492669353975, + "grad_norm": 0.3560783565044403, + "learning_rate": 1.456406598261423e-05, + "loss": 0.1454, + "step": 37752 + }, + { + "epoch": 0.6733671030571112, + "grad_norm": 0.24829766154289246, + "learning_rate": 1.4562651595524662e-05, + "loss": 0.1293, + "step": 37753 + }, + { + "epoch": 0.6733849391788249, + "grad_norm": 0.348664790391922, + "learning_rate": 1.4561237248893872e-05, + "loss": 0.13, + "step": 37754 + }, + { + "epoch": 0.6734027753005386, + "grad_norm": 0.28492653369903564, + "learning_rate": 1.455982294272733e-05, + "loss": 0.1264, + "step": 37755 + }, + { + "epoch": 0.6734206114222523, + "grad_norm": 0.2991258502006531, + "learning_rate": 1.4558408677030522e-05, + "loss": 0.1346, + "step": 37756 + }, + { + "epoch": 0.6734384475439661, + "grad_norm": 0.29029542207717896, + "learning_rate": 1.4556994451808919e-05, + "loss": 0.1613, + "step": 37757 + }, + { + "epoch": 0.6734562836656798, + "grad_norm": 0.3601286709308624, + "learning_rate": 1.4555580267068023e-05, + "loss": 0.1574, + "step": 37758 + }, + { + "epoch": 0.6734741197873935, + "grad_norm": 0.2240452915430069, + "learning_rate": 1.4554166122813303e-05, + "loss": 0.1322, + "step": 37759 + }, + { + "epoch": 0.6734919559091072, + "grad_norm": 0.33050811290740967, + "learning_rate": 1.4552752019050241e-05, + "loss": 0.1286, + "step": 37760 + }, + { + "epoch": 0.6735097920308208, + "grad_norm": 0.25536200404167175, + "learning_rate": 1.4551337955784317e-05, + "loss": 0.1477, + "step": 37761 + }, + { + "epoch": 0.6735276281525345, + "grad_norm": 0.22343213856220245, + "learning_rate": 1.454992393302102e-05, + "loss": 0.0807, + "step": 37762 + }, + { + "epoch": 0.6735454642742482, + "grad_norm": 0.2785812020301819, + "learning_rate": 1.454850995076582e-05, + "loss": 0.1293, + "step": 37763 + }, + { + "epoch": 0.6735633003959619, + "grad_norm": 0.1965450942516327, + "learning_rate": 1.4547096009024214e-05, + "loss": 0.1142, + "step": 37764 + }, + { + "epoch": 0.6735811365176756, + "grad_norm": 0.25995826721191406, + "learning_rate": 1.4545682107801676e-05, + "loss": 0.081, + "step": 37765 + }, + { + "epoch": 0.6735989726393893, + "grad_norm": 0.27341073751449585, + "learning_rate": 1.4544268247103673e-05, + "loss": 0.1224, + "step": 37766 + }, + { + "epoch": 0.673616808761103, + "grad_norm": 0.3620646297931671, + "learning_rate": 1.4542854426935709e-05, + "loss": 0.0664, + "step": 37767 + }, + { + "epoch": 0.6736346448828167, + "grad_norm": 0.256929486989975, + "learning_rate": 1.4541440647303251e-05, + "loss": 0.1063, + "step": 37768 + }, + { + "epoch": 0.6736524810045303, + "grad_norm": 0.20610018074512482, + "learning_rate": 1.4540026908211785e-05, + "loss": 0.1225, + "step": 37769 + }, + { + "epoch": 0.673670317126244, + "grad_norm": 0.2695777416229248, + "learning_rate": 1.4538613209666774e-05, + "loss": 0.1223, + "step": 37770 + }, + { + "epoch": 0.6736881532479577, + "grad_norm": 0.3033485412597656, + "learning_rate": 1.4537199551673725e-05, + "loss": 0.1521, + "step": 37771 + }, + { + "epoch": 0.6737059893696714, + "grad_norm": 0.24626515805721283, + "learning_rate": 1.4535785934238099e-05, + "loss": 0.083, + "step": 37772 + }, + { + "epoch": 0.6737238254913852, + "grad_norm": 0.24730589985847473, + "learning_rate": 1.4534372357365383e-05, + "loss": 0.1731, + "step": 37773 + }, + { + "epoch": 0.6737416616130989, + "grad_norm": 0.22525490820407867, + "learning_rate": 1.4532958821061047e-05, + "loss": 0.0921, + "step": 37774 + }, + { + "epoch": 0.6737594977348126, + "grad_norm": 0.4234084486961365, + "learning_rate": 1.4531545325330587e-05, + "loss": 0.1158, + "step": 37775 + }, + { + "epoch": 0.6737773338565263, + "grad_norm": 0.2178460657596588, + "learning_rate": 1.4530131870179469e-05, + "loss": 0.0833, + "step": 37776 + }, + { + "epoch": 0.67379516997824, + "grad_norm": 0.3677380084991455, + "learning_rate": 1.4528718455613172e-05, + "loss": 0.1582, + "step": 37777 + }, + { + "epoch": 0.6738130060999536, + "grad_norm": 0.2249336689710617, + "learning_rate": 1.4527305081637186e-05, + "loss": 0.0623, + "step": 37778 + }, + { + "epoch": 0.6738308422216673, + "grad_norm": 0.2222917079925537, + "learning_rate": 1.452589174825697e-05, + "loss": 0.0818, + "step": 37779 + }, + { + "epoch": 0.673848678343381, + "grad_norm": 0.31255391240119934, + "learning_rate": 1.452447845547803e-05, + "loss": 0.0998, + "step": 37780 + }, + { + "epoch": 0.6738665144650947, + "grad_norm": 0.2845156788825989, + "learning_rate": 1.4523065203305828e-05, + "loss": 0.1106, + "step": 37781 + }, + { + "epoch": 0.6738843505868084, + "grad_norm": 0.2839960753917694, + "learning_rate": 1.4521651991745844e-05, + "loss": 0.1401, + "step": 37782 + }, + { + "epoch": 0.6739021867085221, + "grad_norm": 0.3479841947555542, + "learning_rate": 1.4520238820803545e-05, + "loss": 0.1033, + "step": 37783 + }, + { + "epoch": 0.6739200228302358, + "grad_norm": 0.2025499790906906, + "learning_rate": 1.4518825690484427e-05, + "loss": 0.1007, + "step": 37784 + }, + { + "epoch": 0.6739378589519495, + "grad_norm": 0.22952230274677277, + "learning_rate": 1.4517412600793966e-05, + "loss": 0.1191, + "step": 37785 + }, + { + "epoch": 0.6739556950736632, + "grad_norm": 0.24251607060432434, + "learning_rate": 1.4515999551737633e-05, + "loss": 0.1389, + "step": 37786 + }, + { + "epoch": 0.6739735311953768, + "grad_norm": 0.2477579116821289, + "learning_rate": 1.4514586543320897e-05, + "loss": 0.1142, + "step": 37787 + }, + { + "epoch": 0.6739913673170905, + "grad_norm": 0.34693047404289246, + "learning_rate": 1.4513173575549255e-05, + "loss": 0.1321, + "step": 37788 + }, + { + "epoch": 0.6740092034388042, + "grad_norm": 0.25677019357681274, + "learning_rate": 1.4511760648428171e-05, + "loss": 0.131, + "step": 37789 + }, + { + "epoch": 0.674027039560518, + "grad_norm": 0.2964639663696289, + "learning_rate": 1.451034776196312e-05, + "loss": 0.201, + "step": 37790 + }, + { + "epoch": 0.6740448756822317, + "grad_norm": 0.28781330585479736, + "learning_rate": 1.450893491615959e-05, + "loss": 0.1566, + "step": 37791 + }, + { + "epoch": 0.6740627118039454, + "grad_norm": 0.20691516995429993, + "learning_rate": 1.4507522111023044e-05, + "loss": 0.054, + "step": 37792 + }, + { + "epoch": 0.6740805479256591, + "grad_norm": 0.22368700802326202, + "learning_rate": 1.4506109346558976e-05, + "loss": 0.125, + "step": 37793 + }, + { + "epoch": 0.6740983840473728, + "grad_norm": 0.24548479914665222, + "learning_rate": 1.4504696622772854e-05, + "loss": 0.1542, + "step": 37794 + }, + { + "epoch": 0.6741162201690865, + "grad_norm": 0.21965786814689636, + "learning_rate": 1.4503283939670151e-05, + "loss": 0.0587, + "step": 37795 + }, + { + "epoch": 0.6741340562908001, + "grad_norm": 0.29775798320770264, + "learning_rate": 1.4501871297256336e-05, + "loss": 0.1096, + "step": 37796 + }, + { + "epoch": 0.6741518924125138, + "grad_norm": 0.2897217869758606, + "learning_rate": 1.45004586955369e-05, + "loss": 0.0896, + "step": 37797 + }, + { + "epoch": 0.6741697285342275, + "grad_norm": 0.2840767800807953, + "learning_rate": 1.449904613451732e-05, + "loss": 0.159, + "step": 37798 + }, + { + "epoch": 0.6741875646559412, + "grad_norm": 0.31331580877304077, + "learning_rate": 1.4497633614203057e-05, + "loss": 0.1007, + "step": 37799 + }, + { + "epoch": 0.6742054007776549, + "grad_norm": 0.2730228304862976, + "learning_rate": 1.4496221134599586e-05, + "loss": 0.1323, + "step": 37800 + }, + { + "epoch": 0.6742232368993686, + "grad_norm": 0.3634718954563141, + "learning_rate": 1.4494808695712401e-05, + "loss": 0.1249, + "step": 37801 + }, + { + "epoch": 0.6742410730210823, + "grad_norm": 0.2818793058395386, + "learning_rate": 1.4493396297546964e-05, + "loss": 0.1188, + "step": 37802 + }, + { + "epoch": 0.674258909142796, + "grad_norm": 0.3337445557117462, + "learning_rate": 1.4491983940108752e-05, + "loss": 0.1132, + "step": 37803 + }, + { + "epoch": 0.6742767452645096, + "grad_norm": 0.3687921464443207, + "learning_rate": 1.4490571623403232e-05, + "loss": 0.1138, + "step": 37804 + }, + { + "epoch": 0.6742945813862233, + "grad_norm": 0.31602203845977783, + "learning_rate": 1.4489159347435882e-05, + "loss": 0.1033, + "step": 37805 + }, + { + "epoch": 0.674312417507937, + "grad_norm": 0.26125404238700867, + "learning_rate": 1.4487747112212196e-05, + "loss": 0.1338, + "step": 37806 + }, + { + "epoch": 0.6743302536296508, + "grad_norm": 0.18155883252620697, + "learning_rate": 1.4486334917737629e-05, + "loss": 0.1115, + "step": 37807 + }, + { + "epoch": 0.6743480897513645, + "grad_norm": 0.1911650151014328, + "learning_rate": 1.4484922764017661e-05, + "loss": 0.1281, + "step": 37808 + }, + { + "epoch": 0.6743659258730782, + "grad_norm": 0.30961698293685913, + "learning_rate": 1.4483510651057752e-05, + "loss": 0.1569, + "step": 37809 + }, + { + "epoch": 0.6743837619947919, + "grad_norm": 0.23227068781852722, + "learning_rate": 1.4482098578863401e-05, + "loss": 0.1463, + "step": 37810 + }, + { + "epoch": 0.6744015981165056, + "grad_norm": 0.2824059724807739, + "learning_rate": 1.4480686547440067e-05, + "loss": 0.1092, + "step": 37811 + }, + { + "epoch": 0.6744194342382193, + "grad_norm": 0.27961429953575134, + "learning_rate": 1.4479274556793226e-05, + "loss": 0.1035, + "step": 37812 + }, + { + "epoch": 0.674437270359933, + "grad_norm": 0.21328777074813843, + "learning_rate": 1.4477862606928338e-05, + "loss": 0.0839, + "step": 37813 + }, + { + "epoch": 0.6744551064816466, + "grad_norm": 0.24824877083301544, + "learning_rate": 1.4476450697850902e-05, + "loss": 0.1318, + "step": 37814 + }, + { + "epoch": 0.6744729426033603, + "grad_norm": 0.2871030569076538, + "learning_rate": 1.447503882956638e-05, + "loss": 0.0963, + "step": 37815 + }, + { + "epoch": 0.674490778725074, + "grad_norm": 0.2195233702659607, + "learning_rate": 1.4473627002080237e-05, + "loss": 0.1216, + "step": 37816 + }, + { + "epoch": 0.6745086148467877, + "grad_norm": 0.40571096539497375, + "learning_rate": 1.4472215215397944e-05, + "loss": 0.123, + "step": 37817 + }, + { + "epoch": 0.6745264509685014, + "grad_norm": 0.3914256989955902, + "learning_rate": 1.4470803469524991e-05, + "loss": 0.0853, + "step": 37818 + }, + { + "epoch": 0.6745442870902151, + "grad_norm": 0.2930600941181183, + "learning_rate": 1.4469391764466828e-05, + "loss": 0.1433, + "step": 37819 + }, + { + "epoch": 0.6745621232119288, + "grad_norm": 0.3278384506702423, + "learning_rate": 1.4467980100228951e-05, + "loss": 0.1494, + "step": 37820 + }, + { + "epoch": 0.6745799593336425, + "grad_norm": 0.41129058599472046, + "learning_rate": 1.4466568476816822e-05, + "loss": 0.1588, + "step": 37821 + }, + { + "epoch": 0.6745977954553561, + "grad_norm": 0.2529033422470093, + "learning_rate": 1.4465156894235904e-05, + "loss": 0.1384, + "step": 37822 + }, + { + "epoch": 0.6746156315770698, + "grad_norm": 0.2949357032775879, + "learning_rate": 1.4463745352491682e-05, + "loss": 0.1182, + "step": 37823 + }, + { + "epoch": 0.6746334676987836, + "grad_norm": 0.25833311676979065, + "learning_rate": 1.4462333851589624e-05, + "loss": 0.13, + "step": 37824 + }, + { + "epoch": 0.6746513038204973, + "grad_norm": 0.33982226252555847, + "learning_rate": 1.4460922391535197e-05, + "loss": 0.1381, + "step": 37825 + }, + { + "epoch": 0.674669139942211, + "grad_norm": 0.32433032989501953, + "learning_rate": 1.445951097233387e-05, + "loss": 0.2395, + "step": 37826 + }, + { + "epoch": 0.6746869760639247, + "grad_norm": 0.24779917299747467, + "learning_rate": 1.4458099593991126e-05, + "loss": 0.1379, + "step": 37827 + }, + { + "epoch": 0.6747048121856384, + "grad_norm": 0.20598551630973816, + "learning_rate": 1.445668825651243e-05, + "loss": 0.0713, + "step": 37828 + }, + { + "epoch": 0.6747226483073521, + "grad_norm": 0.1937335729598999, + "learning_rate": 1.4455276959903252e-05, + "loss": 0.1075, + "step": 37829 + }, + { + "epoch": 0.6747404844290658, + "grad_norm": 0.28307947516441345, + "learning_rate": 1.4453865704169062e-05, + "loss": 0.1461, + "step": 37830 + }, + { + "epoch": 0.6747583205507794, + "grad_norm": 0.2247610092163086, + "learning_rate": 1.445245448931532e-05, + "loss": 0.1144, + "step": 37831 + }, + { + "epoch": 0.6747761566724931, + "grad_norm": 0.33952969312667847, + "learning_rate": 1.4451043315347517e-05, + "loss": 0.0988, + "step": 37832 + }, + { + "epoch": 0.6747939927942068, + "grad_norm": 0.253804087638855, + "learning_rate": 1.4449632182271106e-05, + "loss": 0.1209, + "step": 37833 + }, + { + "epoch": 0.6748118289159205, + "grad_norm": 0.253116637468338, + "learning_rate": 1.4448221090091574e-05, + "loss": 0.0978, + "step": 37834 + }, + { + "epoch": 0.6748296650376342, + "grad_norm": 0.20662766695022583, + "learning_rate": 1.4446810038814371e-05, + "loss": 0.1137, + "step": 37835 + }, + { + "epoch": 0.6748475011593479, + "grad_norm": 0.2772424817085266, + "learning_rate": 1.4445399028444987e-05, + "loss": 0.0877, + "step": 37836 + }, + { + "epoch": 0.6748653372810616, + "grad_norm": 0.23932193219661713, + "learning_rate": 1.4443988058988884e-05, + "loss": 0.0926, + "step": 37837 + }, + { + "epoch": 0.6748831734027753, + "grad_norm": 0.2180161029100418, + "learning_rate": 1.4442577130451524e-05, + "loss": 0.1122, + "step": 37838 + }, + { + "epoch": 0.674901009524489, + "grad_norm": 0.2553662061691284, + "learning_rate": 1.4441166242838378e-05, + "loss": 0.0767, + "step": 37839 + }, + { + "epoch": 0.6749188456462026, + "grad_norm": 0.390030175447464, + "learning_rate": 1.4439755396154925e-05, + "loss": 0.1513, + "step": 37840 + }, + { + "epoch": 0.6749366817679164, + "grad_norm": 0.2173352986574173, + "learning_rate": 1.4438344590406627e-05, + "loss": 0.0845, + "step": 37841 + }, + { + "epoch": 0.6749545178896301, + "grad_norm": 0.2620837688446045, + "learning_rate": 1.4436933825598952e-05, + "loss": 0.1291, + "step": 37842 + }, + { + "epoch": 0.6749723540113438, + "grad_norm": 0.28236278891563416, + "learning_rate": 1.4435523101737375e-05, + "loss": 0.1094, + "step": 37843 + }, + { + "epoch": 0.6749901901330575, + "grad_norm": 0.2069295197725296, + "learning_rate": 1.4434112418827344e-05, + "loss": 0.1052, + "step": 37844 + }, + { + "epoch": 0.6750080262547712, + "grad_norm": 0.3842110335826874, + "learning_rate": 1.4432701776874354e-05, + "loss": 0.1411, + "step": 37845 + }, + { + "epoch": 0.6750258623764849, + "grad_norm": 0.22237244248390198, + "learning_rate": 1.4431291175883854e-05, + "loss": 0.0957, + "step": 37846 + }, + { + "epoch": 0.6750436984981986, + "grad_norm": 0.26728424429893494, + "learning_rate": 1.442988061586133e-05, + "loss": 0.1002, + "step": 37847 + }, + { + "epoch": 0.6750615346199123, + "grad_norm": 0.22909174859523773, + "learning_rate": 1.4428470096812224e-05, + "loss": 0.0889, + "step": 37848 + }, + { + "epoch": 0.6750793707416259, + "grad_norm": 0.23479020595550537, + "learning_rate": 1.4427059618742034e-05, + "loss": 0.121, + "step": 37849 + }, + { + "epoch": 0.6750972068633396, + "grad_norm": 0.29722630977630615, + "learning_rate": 1.4425649181656213e-05, + "loss": 0.0984, + "step": 37850 + }, + { + "epoch": 0.6751150429850533, + "grad_norm": 0.27301540970802307, + "learning_rate": 1.4424238785560226e-05, + "loss": 0.1244, + "step": 37851 + }, + { + "epoch": 0.675132879106767, + "grad_norm": 0.28271132707595825, + "learning_rate": 1.4422828430459533e-05, + "loss": 0.1273, + "step": 37852 + }, + { + "epoch": 0.6751507152284807, + "grad_norm": 0.2761266231536865, + "learning_rate": 1.442141811635962e-05, + "loss": 0.1695, + "step": 37853 + }, + { + "epoch": 0.6751685513501944, + "grad_norm": 0.24770000576972961, + "learning_rate": 1.4420007843265943e-05, + "loss": 0.103, + "step": 37854 + }, + { + "epoch": 0.6751863874719081, + "grad_norm": 0.197562575340271, + "learning_rate": 1.4418597611183973e-05, + "loss": 0.083, + "step": 37855 + }, + { + "epoch": 0.6752042235936218, + "grad_norm": 0.4004027843475342, + "learning_rate": 1.4417187420119171e-05, + "loss": 0.1343, + "step": 37856 + }, + { + "epoch": 0.6752220597153354, + "grad_norm": 0.2654779851436615, + "learning_rate": 1.4415777270076996e-05, + "loss": 0.1447, + "step": 37857 + }, + { + "epoch": 0.6752398958370492, + "grad_norm": 0.21590454876422882, + "learning_rate": 1.4414367161062936e-05, + "loss": 0.0801, + "step": 37858 + }, + { + "epoch": 0.6752577319587629, + "grad_norm": 0.2510049343109131, + "learning_rate": 1.4412957093082441e-05, + "loss": 0.0771, + "step": 37859 + }, + { + "epoch": 0.6752755680804766, + "grad_norm": 0.34335780143737793, + "learning_rate": 1.4411547066140974e-05, + "loss": 0.1702, + "step": 37860 + }, + { + "epoch": 0.6752934042021903, + "grad_norm": 0.2221246212720871, + "learning_rate": 1.4410137080244007e-05, + "loss": 0.1331, + "step": 37861 + }, + { + "epoch": 0.675311240323904, + "grad_norm": 0.24388906359672546, + "learning_rate": 1.4408727135397016e-05, + "loss": 0.1557, + "step": 37862 + }, + { + "epoch": 0.6753290764456177, + "grad_norm": 0.2664140462875366, + "learning_rate": 1.4407317231605455e-05, + "loss": 0.0898, + "step": 37863 + }, + { + "epoch": 0.6753469125673314, + "grad_norm": 0.25646162033081055, + "learning_rate": 1.4405907368874793e-05, + "loss": 0.0945, + "step": 37864 + }, + { + "epoch": 0.6753647486890451, + "grad_norm": 0.26063966751098633, + "learning_rate": 1.4404497547210485e-05, + "loss": 0.1266, + "step": 37865 + }, + { + "epoch": 0.6753825848107587, + "grad_norm": 0.3244636356830597, + "learning_rate": 1.4403087766618011e-05, + "loss": 0.1215, + "step": 37866 + }, + { + "epoch": 0.6754004209324724, + "grad_norm": 0.304988831281662, + "learning_rate": 1.4401678027102833e-05, + "loss": 0.1505, + "step": 37867 + }, + { + "epoch": 0.6754182570541861, + "grad_norm": 0.28232863545417786, + "learning_rate": 1.4400268328670407e-05, + "loss": 0.1566, + "step": 37868 + }, + { + "epoch": 0.6754360931758998, + "grad_norm": 0.2966096103191376, + "learning_rate": 1.4398858671326203e-05, + "loss": 0.1068, + "step": 37869 + }, + { + "epoch": 0.6754539292976135, + "grad_norm": 0.24642762541770935, + "learning_rate": 1.4397449055075674e-05, + "loss": 0.1062, + "step": 37870 + }, + { + "epoch": 0.6754717654193272, + "grad_norm": 0.38957732915878296, + "learning_rate": 1.4396039479924307e-05, + "loss": 0.1656, + "step": 37871 + }, + { + "epoch": 0.6754896015410409, + "grad_norm": 0.24745139479637146, + "learning_rate": 1.4394629945877552e-05, + "loss": 0.1093, + "step": 37872 + }, + { + "epoch": 0.6755074376627546, + "grad_norm": 0.2550305426120758, + "learning_rate": 1.4393220452940864e-05, + "loss": 0.0631, + "step": 37873 + }, + { + "epoch": 0.6755252737844682, + "grad_norm": 0.22651255130767822, + "learning_rate": 1.4391811001119727e-05, + "loss": 0.067, + "step": 37874 + }, + { + "epoch": 0.675543109906182, + "grad_norm": 0.3496115803718567, + "learning_rate": 1.4390401590419584e-05, + "loss": 0.1319, + "step": 37875 + }, + { + "epoch": 0.6755609460278957, + "grad_norm": 0.23300904035568237, + "learning_rate": 1.438899222084592e-05, + "loss": 0.108, + "step": 37876 + }, + { + "epoch": 0.6755787821496094, + "grad_norm": 0.36536699533462524, + "learning_rate": 1.4387582892404184e-05, + "loss": 0.1628, + "step": 37877 + }, + { + "epoch": 0.6755966182713231, + "grad_norm": 0.2752687633037567, + "learning_rate": 1.4386173605099835e-05, + "loss": 0.1644, + "step": 37878 + }, + { + "epoch": 0.6756144543930368, + "grad_norm": 0.2863720953464508, + "learning_rate": 1.4384764358938351e-05, + "loss": 0.1266, + "step": 37879 + }, + { + "epoch": 0.6756322905147505, + "grad_norm": 0.25772207975387573, + "learning_rate": 1.4383355153925191e-05, + "loss": 0.1158, + "step": 37880 + }, + { + "epoch": 0.6756501266364642, + "grad_norm": 0.19938026368618011, + "learning_rate": 1.4381945990065809e-05, + "loss": 0.1282, + "step": 37881 + }, + { + "epoch": 0.6756679627581779, + "grad_norm": 0.2639893591403961, + "learning_rate": 1.4380536867365674e-05, + "loss": 0.116, + "step": 37882 + }, + { + "epoch": 0.6756857988798916, + "grad_norm": 0.2264513075351715, + "learning_rate": 1.4379127785830238e-05, + "loss": 0.1034, + "step": 37883 + }, + { + "epoch": 0.6757036350016052, + "grad_norm": 0.34920793771743774, + "learning_rate": 1.4377718745464975e-05, + "loss": 0.1295, + "step": 37884 + }, + { + "epoch": 0.6757214711233189, + "grad_norm": 0.20631419122219086, + "learning_rate": 1.4376309746275345e-05, + "loss": 0.1468, + "step": 37885 + }, + { + "epoch": 0.6757393072450326, + "grad_norm": 0.2549431622028351, + "learning_rate": 1.437490078826681e-05, + "loss": 0.0909, + "step": 37886 + }, + { + "epoch": 0.6757571433667463, + "grad_norm": 0.2594625651836395, + "learning_rate": 1.4373491871444822e-05, + "loss": 0.0856, + "step": 37887 + }, + { + "epoch": 0.67577497948846, + "grad_norm": 0.34404274821281433, + "learning_rate": 1.4372082995814845e-05, + "loss": 0.105, + "step": 37888 + }, + { + "epoch": 0.6757928156101737, + "grad_norm": 0.30066806077957153, + "learning_rate": 1.4370674161382355e-05, + "loss": 0.1412, + "step": 37889 + }, + { + "epoch": 0.6758106517318874, + "grad_norm": 0.3054562211036682, + "learning_rate": 1.4369265368152803e-05, + "loss": 0.1424, + "step": 37890 + }, + { + "epoch": 0.6758284878536012, + "grad_norm": 0.23939236998558044, + "learning_rate": 1.4367856616131642e-05, + "loss": 0.1216, + "step": 37891 + }, + { + "epoch": 0.6758463239753149, + "grad_norm": 0.23606866598129272, + "learning_rate": 1.436644790532435e-05, + "loss": 0.1296, + "step": 37892 + }, + { + "epoch": 0.6758641600970285, + "grad_norm": 0.23873786628246307, + "learning_rate": 1.4365039235736383e-05, + "loss": 0.1733, + "step": 37893 + }, + { + "epoch": 0.6758819962187422, + "grad_norm": 0.2883201837539673, + "learning_rate": 1.436363060737319e-05, + "loss": 0.1186, + "step": 37894 + }, + { + "epoch": 0.6758998323404559, + "grad_norm": 0.23939408361911774, + "learning_rate": 1.4362222020240243e-05, + "loss": 0.1131, + "step": 37895 + }, + { + "epoch": 0.6759176684621696, + "grad_norm": 0.2642723023891449, + "learning_rate": 1.4360813474342988e-05, + "loss": 0.0918, + "step": 37896 + }, + { + "epoch": 0.6759355045838833, + "grad_norm": 0.25423380732536316, + "learning_rate": 1.4359404969686902e-05, + "loss": 0.0975, + "step": 37897 + }, + { + "epoch": 0.675953340705597, + "grad_norm": 0.31318897008895874, + "learning_rate": 1.4357996506277438e-05, + "loss": 0.1159, + "step": 37898 + }, + { + "epoch": 0.6759711768273107, + "grad_norm": 0.20078317821025848, + "learning_rate": 1.4356588084120055e-05, + "loss": 0.1083, + "step": 37899 + }, + { + "epoch": 0.6759890129490244, + "grad_norm": 0.3042323887348175, + "learning_rate": 1.4355179703220204e-05, + "loss": 0.1653, + "step": 37900 + }, + { + "epoch": 0.676006849070738, + "grad_norm": 0.25910013914108276, + "learning_rate": 1.4353771363583362e-05, + "loss": 0.1568, + "step": 37901 + }, + { + "epoch": 0.6760246851924517, + "grad_norm": 0.31457364559173584, + "learning_rate": 1.435236306521497e-05, + "loss": 0.1314, + "step": 37902 + }, + { + "epoch": 0.6760425213141654, + "grad_norm": 0.2601189613342285, + "learning_rate": 1.4350954808120507e-05, + "loss": 0.1316, + "step": 37903 + }, + { + "epoch": 0.6760603574358791, + "grad_norm": 0.314890593290329, + "learning_rate": 1.4349546592305408e-05, + "loss": 0.1175, + "step": 37904 + }, + { + "epoch": 0.6760781935575928, + "grad_norm": 0.26978427171707153, + "learning_rate": 1.4348138417775159e-05, + "loss": 0.1, + "step": 37905 + }, + { + "epoch": 0.6760960296793065, + "grad_norm": 0.22783416509628296, + "learning_rate": 1.4346730284535204e-05, + "loss": 0.1043, + "step": 37906 + }, + { + "epoch": 0.6761138658010202, + "grad_norm": 0.2718282639980316, + "learning_rate": 1.4345322192591e-05, + "loss": 0.069, + "step": 37907 + }, + { + "epoch": 0.676131701922734, + "grad_norm": 0.33067601919174194, + "learning_rate": 1.4343914141948006e-05, + "loss": 0.1567, + "step": 37908 + }, + { + "epoch": 0.6761495380444477, + "grad_norm": 0.2838038206100464, + "learning_rate": 1.4342506132611675e-05, + "loss": 0.1721, + "step": 37909 + }, + { + "epoch": 0.6761673741661614, + "grad_norm": 0.2564777731895447, + "learning_rate": 1.434109816458748e-05, + "loss": 0.1374, + "step": 37910 + }, + { + "epoch": 0.676185210287875, + "grad_norm": 0.3623526692390442, + "learning_rate": 1.4339690237880869e-05, + "loss": 0.1656, + "step": 37911 + }, + { + "epoch": 0.6762030464095887, + "grad_norm": 0.22427743673324585, + "learning_rate": 1.4338282352497301e-05, + "loss": 0.1404, + "step": 37912 + }, + { + "epoch": 0.6762208825313024, + "grad_norm": 0.26960861682891846, + "learning_rate": 1.4336874508442222e-05, + "loss": 0.1439, + "step": 37913 + }, + { + "epoch": 0.6762387186530161, + "grad_norm": 0.2435963749885559, + "learning_rate": 1.4335466705721113e-05, + "loss": 0.1157, + "step": 37914 + }, + { + "epoch": 0.6762565547747298, + "grad_norm": 0.2895001769065857, + "learning_rate": 1.4334058944339403e-05, + "loss": 0.1116, + "step": 37915 + }, + { + "epoch": 0.6762743908964435, + "grad_norm": 0.2829170227050781, + "learning_rate": 1.433265122430258e-05, + "loss": 0.1549, + "step": 37916 + }, + { + "epoch": 0.6762922270181572, + "grad_norm": 0.169818714261055, + "learning_rate": 1.4331243545616075e-05, + "loss": 0.0932, + "step": 37917 + }, + { + "epoch": 0.6763100631398709, + "grad_norm": 0.30012017488479614, + "learning_rate": 1.4329835908285361e-05, + "loss": 0.1634, + "step": 37918 + }, + { + "epoch": 0.6763278992615845, + "grad_norm": 0.2552509903907776, + "learning_rate": 1.432842831231589e-05, + "loss": 0.0841, + "step": 37919 + }, + { + "epoch": 0.6763457353832982, + "grad_norm": 0.21990834176540375, + "learning_rate": 1.4327020757713116e-05, + "loss": 0.1197, + "step": 37920 + }, + { + "epoch": 0.6763635715050119, + "grad_norm": 0.27054712176322937, + "learning_rate": 1.43256132444825e-05, + "loss": 0.1602, + "step": 37921 + }, + { + "epoch": 0.6763814076267256, + "grad_norm": 0.27129170298576355, + "learning_rate": 1.4324205772629481e-05, + "loss": 0.1018, + "step": 37922 + }, + { + "epoch": 0.6763992437484393, + "grad_norm": 0.24261508882045746, + "learning_rate": 1.4322798342159537e-05, + "loss": 0.1258, + "step": 37923 + }, + { + "epoch": 0.676417079870153, + "grad_norm": 0.27131831645965576, + "learning_rate": 1.4321390953078117e-05, + "loss": 0.14, + "step": 37924 + }, + { + "epoch": 0.6764349159918668, + "grad_norm": 0.31937697529792786, + "learning_rate": 1.4319983605390672e-05, + "loss": 0.1184, + "step": 37925 + }, + { + "epoch": 0.6764527521135805, + "grad_norm": 0.27219003438949585, + "learning_rate": 1.4318576299102648e-05, + "loss": 0.0814, + "step": 37926 + }, + { + "epoch": 0.6764705882352942, + "grad_norm": 0.31953704357147217, + "learning_rate": 1.4317169034219524e-05, + "loss": 0.1688, + "step": 37927 + }, + { + "epoch": 0.6764884243570078, + "grad_norm": 0.2735213339328766, + "learning_rate": 1.4315761810746741e-05, + "loss": 0.1239, + "step": 37928 + }, + { + "epoch": 0.6765062604787215, + "grad_norm": 0.30123355984687805, + "learning_rate": 1.4314354628689746e-05, + "loss": 0.1326, + "step": 37929 + }, + { + "epoch": 0.6765240966004352, + "grad_norm": 0.22299256920814514, + "learning_rate": 1.4312947488054013e-05, + "loss": 0.112, + "step": 37930 + }, + { + "epoch": 0.6765419327221489, + "grad_norm": 0.24233022332191467, + "learning_rate": 1.431154038884498e-05, + "loss": 0.0892, + "step": 37931 + }, + { + "epoch": 0.6765597688438626, + "grad_norm": 0.29069089889526367, + "learning_rate": 1.4310133331068112e-05, + "loss": 0.1507, + "step": 37932 + }, + { + "epoch": 0.6765776049655763, + "grad_norm": 0.19256910681724548, + "learning_rate": 1.4308726314728863e-05, + "loss": 0.094, + "step": 37933 + }, + { + "epoch": 0.67659544108729, + "grad_norm": 0.3973245322704315, + "learning_rate": 1.4307319339832681e-05, + "loss": 0.0842, + "step": 37934 + }, + { + "epoch": 0.6766132772090037, + "grad_norm": 0.23836465179920197, + "learning_rate": 1.4305912406385016e-05, + "loss": 0.1159, + "step": 37935 + }, + { + "epoch": 0.6766311133307173, + "grad_norm": 0.23498216271400452, + "learning_rate": 1.4304505514391337e-05, + "loss": 0.1188, + "step": 37936 + }, + { + "epoch": 0.676648949452431, + "grad_norm": 0.3663347065448761, + "learning_rate": 1.430309866385709e-05, + "loss": 0.1617, + "step": 37937 + }, + { + "epoch": 0.6766667855741447, + "grad_norm": 0.22820499539375305, + "learning_rate": 1.4301691854787725e-05, + "loss": 0.1235, + "step": 37938 + }, + { + "epoch": 0.6766846216958584, + "grad_norm": 0.1825525313615799, + "learning_rate": 1.4300285087188686e-05, + "loss": 0.0908, + "step": 37939 + }, + { + "epoch": 0.6767024578175721, + "grad_norm": 0.22258968651294708, + "learning_rate": 1.4298878361065449e-05, + "loss": 0.1015, + "step": 37940 + }, + { + "epoch": 0.6767202939392858, + "grad_norm": 0.2557908892631531, + "learning_rate": 1.4297471676423458e-05, + "loss": 0.1315, + "step": 37941 + }, + { + "epoch": 0.6767381300609996, + "grad_norm": 0.31204521656036377, + "learning_rate": 1.429606503326815e-05, + "loss": 0.1506, + "step": 37942 + }, + { + "epoch": 0.6767559661827133, + "grad_norm": 0.24505315721035004, + "learning_rate": 1.4294658431605002e-05, + "loss": 0.1057, + "step": 37943 + }, + { + "epoch": 0.676773802304427, + "grad_norm": 0.26397737860679626, + "learning_rate": 1.4293251871439445e-05, + "loss": 0.1489, + "step": 37944 + }, + { + "epoch": 0.6767916384261407, + "grad_norm": 0.3072804808616638, + "learning_rate": 1.429184535277695e-05, + "loss": 0.0872, + "step": 37945 + }, + { + "epoch": 0.6768094745478543, + "grad_norm": 0.2764657139778137, + "learning_rate": 1.4290438875622964e-05, + "loss": 0.1518, + "step": 37946 + }, + { + "epoch": 0.676827310669568, + "grad_norm": 0.37035417556762695, + "learning_rate": 1.4289032439982936e-05, + "loss": 0.1557, + "step": 37947 + }, + { + "epoch": 0.6768451467912817, + "grad_norm": 0.26884639263153076, + "learning_rate": 1.4287626045862307e-05, + "loss": 0.1413, + "step": 37948 + }, + { + "epoch": 0.6768629829129954, + "grad_norm": 0.40568098425865173, + "learning_rate": 1.4286219693266551e-05, + "loss": 0.158, + "step": 37949 + }, + { + "epoch": 0.6768808190347091, + "grad_norm": 0.19726970791816711, + "learning_rate": 1.4284813382201103e-05, + "loss": 0.1037, + "step": 37950 + }, + { + "epoch": 0.6768986551564228, + "grad_norm": 0.2574869394302368, + "learning_rate": 1.4283407112671423e-05, + "loss": 0.1276, + "step": 37951 + }, + { + "epoch": 0.6769164912781365, + "grad_norm": 0.26717349886894226, + "learning_rate": 1.4282000884682947e-05, + "loss": 0.1491, + "step": 37952 + }, + { + "epoch": 0.6769343273998502, + "grad_norm": 0.22183527052402496, + "learning_rate": 1.4280594698241148e-05, + "loss": 0.1354, + "step": 37953 + }, + { + "epoch": 0.6769521635215638, + "grad_norm": 0.4951947331428528, + "learning_rate": 1.4279188553351469e-05, + "loss": 0.1491, + "step": 37954 + }, + { + "epoch": 0.6769699996432775, + "grad_norm": 0.24742573499679565, + "learning_rate": 1.4277782450019353e-05, + "loss": 0.0809, + "step": 37955 + }, + { + "epoch": 0.6769878357649912, + "grad_norm": 0.3553015887737274, + "learning_rate": 1.4276376388250249e-05, + "loss": 0.1096, + "step": 37956 + }, + { + "epoch": 0.6770056718867049, + "grad_norm": 0.2534266412258148, + "learning_rate": 1.4274970368049623e-05, + "loss": 0.131, + "step": 37957 + }, + { + "epoch": 0.6770235080084186, + "grad_norm": 0.3052847981452942, + "learning_rate": 1.4273564389422906e-05, + "loss": 0.1349, + "step": 37958 + }, + { + "epoch": 0.6770413441301324, + "grad_norm": 0.30340835452079773, + "learning_rate": 1.4272158452375568e-05, + "loss": 0.1651, + "step": 37959 + }, + { + "epoch": 0.6770591802518461, + "grad_norm": 0.24183939397335052, + "learning_rate": 1.4270752556913047e-05, + "loss": 0.1273, + "step": 37960 + }, + { + "epoch": 0.6770770163735598, + "grad_norm": 0.3222278654575348, + "learning_rate": 1.426934670304079e-05, + "loss": 0.1409, + "step": 37961 + }, + { + "epoch": 0.6770948524952735, + "grad_norm": 0.3161703050136566, + "learning_rate": 1.426794089076426e-05, + "loss": 0.1281, + "step": 37962 + }, + { + "epoch": 0.6771126886169871, + "grad_norm": 0.222113236784935, + "learning_rate": 1.4266535120088894e-05, + "loss": 0.1509, + "step": 37963 + }, + { + "epoch": 0.6771305247387008, + "grad_norm": 0.2669108510017395, + "learning_rate": 1.4265129391020149e-05, + "loss": 0.1033, + "step": 37964 + }, + { + "epoch": 0.6771483608604145, + "grad_norm": 0.26555803418159485, + "learning_rate": 1.4263723703563459e-05, + "loss": 0.1142, + "step": 37965 + }, + { + "epoch": 0.6771661969821282, + "grad_norm": 0.3308393955230713, + "learning_rate": 1.4262318057724294e-05, + "loss": 0.1154, + "step": 37966 + }, + { + "epoch": 0.6771840331038419, + "grad_norm": 0.24701416492462158, + "learning_rate": 1.4260912453508091e-05, + "loss": 0.1173, + "step": 37967 + }, + { + "epoch": 0.6772018692255556, + "grad_norm": 0.2714207172393799, + "learning_rate": 1.4259506890920304e-05, + "loss": 0.1447, + "step": 37968 + }, + { + "epoch": 0.6772197053472693, + "grad_norm": 0.2641317844390869, + "learning_rate": 1.4258101369966364e-05, + "loss": 0.1211, + "step": 37969 + }, + { + "epoch": 0.677237541468983, + "grad_norm": 0.2663409113883972, + "learning_rate": 1.4256695890651747e-05, + "loss": 0.1131, + "step": 37970 + }, + { + "epoch": 0.6772553775906967, + "grad_norm": 0.2812822759151459, + "learning_rate": 1.4255290452981877e-05, + "loss": 0.132, + "step": 37971 + }, + { + "epoch": 0.6772732137124103, + "grad_norm": 0.2187214493751526, + "learning_rate": 1.4253885056962218e-05, + "loss": 0.1357, + "step": 37972 + }, + { + "epoch": 0.677291049834124, + "grad_norm": 0.1870632767677307, + "learning_rate": 1.4252479702598218e-05, + "loss": 0.0774, + "step": 37973 + }, + { + "epoch": 0.6773088859558377, + "grad_norm": 0.27847856283187866, + "learning_rate": 1.4251074389895305e-05, + "loss": 0.1545, + "step": 37974 + }, + { + "epoch": 0.6773267220775514, + "grad_norm": 0.17982667684555054, + "learning_rate": 1.4249669118858952e-05, + "loss": 0.0852, + "step": 37975 + }, + { + "epoch": 0.6773445581992652, + "grad_norm": 0.4032396376132965, + "learning_rate": 1.4248263889494595e-05, + "loss": 0.1075, + "step": 37976 + }, + { + "epoch": 0.6773623943209789, + "grad_norm": 0.36110034584999084, + "learning_rate": 1.4246858701807677e-05, + "loss": 0.1064, + "step": 37977 + }, + { + "epoch": 0.6773802304426926, + "grad_norm": 0.2389470934867859, + "learning_rate": 1.4245453555803642e-05, + "loss": 0.1521, + "step": 37978 + }, + { + "epoch": 0.6773980665644063, + "grad_norm": 0.2208462357521057, + "learning_rate": 1.4244048451487952e-05, + "loss": 0.1247, + "step": 37979 + }, + { + "epoch": 0.67741590268612, + "grad_norm": 0.1915493905544281, + "learning_rate": 1.4242643388866046e-05, + "loss": 0.1028, + "step": 37980 + }, + { + "epoch": 0.6774337388078336, + "grad_norm": 0.23680944740772247, + "learning_rate": 1.4241238367943371e-05, + "loss": 0.1201, + "step": 37981 + }, + { + "epoch": 0.6774515749295473, + "grad_norm": 0.29377880692481995, + "learning_rate": 1.423983338872536e-05, + "loss": 0.1323, + "step": 37982 + }, + { + "epoch": 0.677469411051261, + "grad_norm": 0.26315832138061523, + "learning_rate": 1.423842845121748e-05, + "loss": 0.122, + "step": 37983 + }, + { + "epoch": 0.6774872471729747, + "grad_norm": 0.22639748454093933, + "learning_rate": 1.4237023555425173e-05, + "loss": 0.1052, + "step": 37984 + }, + { + "epoch": 0.6775050832946884, + "grad_norm": 0.21553198993206024, + "learning_rate": 1.4235618701353864e-05, + "loss": 0.0991, + "step": 37985 + }, + { + "epoch": 0.6775229194164021, + "grad_norm": 0.29160547256469727, + "learning_rate": 1.423421388900903e-05, + "loss": 0.1266, + "step": 37986 + }, + { + "epoch": 0.6775407555381158, + "grad_norm": 0.2924550473690033, + "learning_rate": 1.423280911839609e-05, + "loss": 0.1159, + "step": 37987 + }, + { + "epoch": 0.6775585916598295, + "grad_norm": 0.2519051730632782, + "learning_rate": 1.4231404389520508e-05, + "loss": 0.0973, + "step": 37988 + }, + { + "epoch": 0.6775764277815431, + "grad_norm": 0.32179516553878784, + "learning_rate": 1.4229999702387724e-05, + "loss": 0.1296, + "step": 37989 + }, + { + "epoch": 0.6775942639032568, + "grad_norm": 0.26298460364341736, + "learning_rate": 1.422859505700318e-05, + "loss": 0.1148, + "step": 37990 + }, + { + "epoch": 0.6776121000249705, + "grad_norm": 0.3153688311576843, + "learning_rate": 1.4227190453372314e-05, + "loss": 0.0666, + "step": 37991 + }, + { + "epoch": 0.6776299361466843, + "grad_norm": 0.19584353268146515, + "learning_rate": 1.4225785891500587e-05, + "loss": 0.093, + "step": 37992 + }, + { + "epoch": 0.677647772268398, + "grad_norm": 0.23734787106513977, + "learning_rate": 1.4224381371393436e-05, + "loss": 0.1743, + "step": 37993 + }, + { + "epoch": 0.6776656083901117, + "grad_norm": 0.24842730164527893, + "learning_rate": 1.4222976893056306e-05, + "loss": 0.1662, + "step": 37994 + }, + { + "epoch": 0.6776834445118254, + "grad_norm": 0.25893813371658325, + "learning_rate": 1.4221572456494629e-05, + "loss": 0.1278, + "step": 37995 + }, + { + "epoch": 0.6777012806335391, + "grad_norm": 0.2621501088142395, + "learning_rate": 1.4220168061713867e-05, + "loss": 0.1407, + "step": 37996 + }, + { + "epoch": 0.6777191167552528, + "grad_norm": 0.2936629056930542, + "learning_rate": 1.4218763708719457e-05, + "loss": 0.1716, + "step": 37997 + }, + { + "epoch": 0.6777369528769664, + "grad_norm": 0.28599438071250916, + "learning_rate": 1.4217359397516838e-05, + "loss": 0.0671, + "step": 37998 + }, + { + "epoch": 0.6777547889986801, + "grad_norm": 0.2325516939163208, + "learning_rate": 1.4215955128111463e-05, + "loss": 0.1094, + "step": 37999 + }, + { + "epoch": 0.6777726251203938, + "grad_norm": 0.43801024556159973, + "learning_rate": 1.421455090050876e-05, + "loss": 0.1363, + "step": 38000 + }, + { + "epoch": 0.6777726251203938, + "eval_loss": 0.11776335537433624, + "eval_runtime": 106.7526, + "eval_samples_per_second": 9.592, + "eval_steps_per_second": 1.602, + "step": 38000 + }, + { + "epoch": 0.6777904612421075, + "grad_norm": 0.3258973956108093, + "learning_rate": 1.4213146714714199e-05, + "loss": 0.1173, + "step": 38001 + }, + { + "epoch": 0.6778082973638212, + "grad_norm": 0.22279764711856842, + "learning_rate": 1.42117425707332e-05, + "loss": 0.0783, + "step": 38002 + }, + { + "epoch": 0.6778261334855349, + "grad_norm": 0.23170334100723267, + "learning_rate": 1.4210338468571216e-05, + "loss": 0.129, + "step": 38003 + }, + { + "epoch": 0.6778439696072486, + "grad_norm": 0.21238280832767487, + "learning_rate": 1.4208934408233677e-05, + "loss": 0.1474, + "step": 38004 + }, + { + "epoch": 0.6778618057289623, + "grad_norm": 0.22141441702842712, + "learning_rate": 1.4207530389726049e-05, + "loss": 0.0939, + "step": 38005 + }, + { + "epoch": 0.677879641850676, + "grad_norm": 0.276857852935791, + "learning_rate": 1.4206126413053755e-05, + "loss": 0.1432, + "step": 38006 + }, + { + "epoch": 0.6778974779723896, + "grad_norm": 0.2408115118741989, + "learning_rate": 1.4204722478222244e-05, + "loss": 0.0811, + "step": 38007 + }, + { + "epoch": 0.6779153140941033, + "grad_norm": 0.2695843279361725, + "learning_rate": 1.4203318585236946e-05, + "loss": 0.1055, + "step": 38008 + }, + { + "epoch": 0.6779331502158171, + "grad_norm": 0.24992306530475616, + "learning_rate": 1.4201914734103327e-05, + "loss": 0.1303, + "step": 38009 + }, + { + "epoch": 0.6779509863375308, + "grad_norm": 0.24770306050777435, + "learning_rate": 1.4200510924826816e-05, + "loss": 0.1166, + "step": 38010 + }, + { + "epoch": 0.6779688224592445, + "grad_norm": 0.2797127366065979, + "learning_rate": 1.4199107157412855e-05, + "loss": 0.1381, + "step": 38011 + }, + { + "epoch": 0.6779866585809582, + "grad_norm": 0.3100208044052124, + "learning_rate": 1.4197703431866875e-05, + "loss": 0.1612, + "step": 38012 + }, + { + "epoch": 0.6780044947026719, + "grad_norm": 0.3306817412376404, + "learning_rate": 1.4196299748194337e-05, + "loss": 0.1283, + "step": 38013 + }, + { + "epoch": 0.6780223308243856, + "grad_norm": 0.25073903799057007, + "learning_rate": 1.4194896106400663e-05, + "loss": 0.1237, + "step": 38014 + }, + { + "epoch": 0.6780401669460993, + "grad_norm": 0.26074302196502686, + "learning_rate": 1.4193492506491313e-05, + "loss": 0.1351, + "step": 38015 + }, + { + "epoch": 0.6780580030678129, + "grad_norm": 0.24974024295806885, + "learning_rate": 1.4192088948471715e-05, + "loss": 0.065, + "step": 38016 + }, + { + "epoch": 0.6780758391895266, + "grad_norm": 0.2402353435754776, + "learning_rate": 1.4190685432347309e-05, + "loss": 0.1375, + "step": 38017 + }, + { + "epoch": 0.6780936753112403, + "grad_norm": 0.2304908186197281, + "learning_rate": 1.4189281958123545e-05, + "loss": 0.0962, + "step": 38018 + }, + { + "epoch": 0.678111511432954, + "grad_norm": 0.294159471988678, + "learning_rate": 1.4187878525805865e-05, + "loss": 0.1553, + "step": 38019 + }, + { + "epoch": 0.6781293475546677, + "grad_norm": 0.22550486028194427, + "learning_rate": 1.4186475135399696e-05, + "loss": 0.1109, + "step": 38020 + }, + { + "epoch": 0.6781471836763814, + "grad_norm": 0.247028186917305, + "learning_rate": 1.4185071786910476e-05, + "loss": 0.0728, + "step": 38021 + }, + { + "epoch": 0.6781650197980951, + "grad_norm": 0.2688398063182831, + "learning_rate": 1.4183668480343665e-05, + "loss": 0.1187, + "step": 38022 + }, + { + "epoch": 0.6781828559198088, + "grad_norm": 0.23364390432834625, + "learning_rate": 1.4182265215704688e-05, + "loss": 0.1033, + "step": 38023 + }, + { + "epoch": 0.6782006920415224, + "grad_norm": 0.25726598501205444, + "learning_rate": 1.4180861992998988e-05, + "loss": 0.1493, + "step": 38024 + }, + { + "epoch": 0.6782185281632361, + "grad_norm": 0.23520156741142273, + "learning_rate": 1.4179458812231994e-05, + "loss": 0.1569, + "step": 38025 + }, + { + "epoch": 0.6782363642849499, + "grad_norm": 0.3169076144695282, + "learning_rate": 1.4178055673409163e-05, + "loss": 0.1064, + "step": 38026 + }, + { + "epoch": 0.6782542004066636, + "grad_norm": 0.2397921085357666, + "learning_rate": 1.4176652576535921e-05, + "loss": 0.1575, + "step": 38027 + }, + { + "epoch": 0.6782720365283773, + "grad_norm": 0.23026040196418762, + "learning_rate": 1.4175249521617717e-05, + "loss": 0.092, + "step": 38028 + }, + { + "epoch": 0.678289872650091, + "grad_norm": 0.2579258680343628, + "learning_rate": 1.4173846508659989e-05, + "loss": 0.1083, + "step": 38029 + }, + { + "epoch": 0.6783077087718047, + "grad_norm": 0.26648446917533875, + "learning_rate": 1.4172443537668156e-05, + "loss": 0.1199, + "step": 38030 + }, + { + "epoch": 0.6783255448935184, + "grad_norm": 0.27058619260787964, + "learning_rate": 1.4171040608647684e-05, + "loss": 0.1478, + "step": 38031 + }, + { + "epoch": 0.6783433810152321, + "grad_norm": 0.242412269115448, + "learning_rate": 1.4169637721603999e-05, + "loss": 0.0828, + "step": 38032 + }, + { + "epoch": 0.6783612171369457, + "grad_norm": 0.2512744069099426, + "learning_rate": 1.4168234876542539e-05, + "loss": 0.1462, + "step": 38033 + }, + { + "epoch": 0.6783790532586594, + "grad_norm": 0.28903698921203613, + "learning_rate": 1.416683207346873e-05, + "loss": 0.1101, + "step": 38034 + }, + { + "epoch": 0.6783968893803731, + "grad_norm": 0.25134021043777466, + "learning_rate": 1.4165429312388034e-05, + "loss": 0.1253, + "step": 38035 + }, + { + "epoch": 0.6784147255020868, + "grad_norm": 0.27152127027511597, + "learning_rate": 1.4164026593305874e-05, + "loss": 0.0893, + "step": 38036 + }, + { + "epoch": 0.6784325616238005, + "grad_norm": 0.22014084458351135, + "learning_rate": 1.416262391622769e-05, + "loss": 0.1169, + "step": 38037 + }, + { + "epoch": 0.6784503977455142, + "grad_norm": 0.23176322877407074, + "learning_rate": 1.416122128115892e-05, + "loss": 0.1251, + "step": 38038 + }, + { + "epoch": 0.6784682338672279, + "grad_norm": 0.32257190346717834, + "learning_rate": 1.4159818688104993e-05, + "loss": 0.1129, + "step": 38039 + }, + { + "epoch": 0.6784860699889416, + "grad_norm": 0.22879919409751892, + "learning_rate": 1.4158416137071356e-05, + "loss": 0.1629, + "step": 38040 + }, + { + "epoch": 0.6785039061106553, + "grad_norm": 0.22757573425769806, + "learning_rate": 1.4157013628063437e-05, + "loss": 0.1161, + "step": 38041 + }, + { + "epoch": 0.6785217422323689, + "grad_norm": 0.24358348548412323, + "learning_rate": 1.4155611161086687e-05, + "loss": 0.0933, + "step": 38042 + }, + { + "epoch": 0.6785395783540827, + "grad_norm": 0.26805564761161804, + "learning_rate": 1.4154208736146523e-05, + "loss": 0.1064, + "step": 38043 + }, + { + "epoch": 0.6785574144757964, + "grad_norm": 0.2571731209754944, + "learning_rate": 1.4152806353248405e-05, + "loss": 0.1105, + "step": 38044 + }, + { + "epoch": 0.6785752505975101, + "grad_norm": 0.20159876346588135, + "learning_rate": 1.4151404012397754e-05, + "loss": 0.1263, + "step": 38045 + }, + { + "epoch": 0.6785930867192238, + "grad_norm": 0.2972625195980072, + "learning_rate": 1.4150001713600009e-05, + "loss": 0.1471, + "step": 38046 + }, + { + "epoch": 0.6786109228409375, + "grad_norm": 0.2562756836414337, + "learning_rate": 1.4148599456860592e-05, + "loss": 0.0902, + "step": 38047 + }, + { + "epoch": 0.6786287589626512, + "grad_norm": 0.2638951241970062, + "learning_rate": 1.4147197242184962e-05, + "loss": 0.1221, + "step": 38048 + }, + { + "epoch": 0.6786465950843649, + "grad_norm": 0.34354645013809204, + "learning_rate": 1.4145795069578546e-05, + "loss": 0.152, + "step": 38049 + }, + { + "epoch": 0.6786644312060786, + "grad_norm": 0.2503903806209564, + "learning_rate": 1.4144392939046774e-05, + "loss": 0.1178, + "step": 38050 + }, + { + "epoch": 0.6786822673277922, + "grad_norm": 0.30100035667419434, + "learning_rate": 1.414299085059509e-05, + "loss": 0.0988, + "step": 38051 + }, + { + "epoch": 0.6787001034495059, + "grad_norm": 0.28491219878196716, + "learning_rate": 1.414158880422891e-05, + "loss": 0.1449, + "step": 38052 + }, + { + "epoch": 0.6787179395712196, + "grad_norm": 0.4232577383518219, + "learning_rate": 1.4140186799953691e-05, + "loss": 0.1605, + "step": 38053 + }, + { + "epoch": 0.6787357756929333, + "grad_norm": 0.31380370259284973, + "learning_rate": 1.4138784837774849e-05, + "loss": 0.1571, + "step": 38054 + }, + { + "epoch": 0.678753611814647, + "grad_norm": 0.21956247091293335, + "learning_rate": 1.4137382917697842e-05, + "loss": 0.074, + "step": 38055 + }, + { + "epoch": 0.6787714479363607, + "grad_norm": 0.30031442642211914, + "learning_rate": 1.4135981039728078e-05, + "loss": 0.1381, + "step": 38056 + }, + { + "epoch": 0.6787892840580744, + "grad_norm": 0.30037590861320496, + "learning_rate": 1.4134579203871013e-05, + "loss": 0.1949, + "step": 38057 + }, + { + "epoch": 0.6788071201797881, + "grad_norm": 0.24318231642246246, + "learning_rate": 1.4133177410132073e-05, + "loss": 0.1054, + "step": 38058 + }, + { + "epoch": 0.6788249563015017, + "grad_norm": 0.2966897189617157, + "learning_rate": 1.4131775658516689e-05, + "loss": 0.0859, + "step": 38059 + }, + { + "epoch": 0.6788427924232155, + "grad_norm": 0.26094427704811096, + "learning_rate": 1.413037394903029e-05, + "loss": 0.1251, + "step": 38060 + }, + { + "epoch": 0.6788606285449292, + "grad_norm": 0.22348086535930634, + "learning_rate": 1.4128972281678321e-05, + "loss": 0.0888, + "step": 38061 + }, + { + "epoch": 0.6788784646666429, + "grad_norm": 0.2923192083835602, + "learning_rate": 1.4127570656466212e-05, + "loss": 0.1512, + "step": 38062 + }, + { + "epoch": 0.6788963007883566, + "grad_norm": 0.26692765951156616, + "learning_rate": 1.4126169073399398e-05, + "loss": 0.0908, + "step": 38063 + }, + { + "epoch": 0.6789141369100703, + "grad_norm": 0.2778342664241791, + "learning_rate": 1.4124767532483302e-05, + "loss": 0.1414, + "step": 38064 + }, + { + "epoch": 0.678931973031784, + "grad_norm": 0.23958182334899902, + "learning_rate": 1.4123366033723356e-05, + "loss": 0.1542, + "step": 38065 + }, + { + "epoch": 0.6789498091534977, + "grad_norm": 0.2612308859825134, + "learning_rate": 1.4121964577125014e-05, + "loss": 0.0899, + "step": 38066 + }, + { + "epoch": 0.6789676452752114, + "grad_norm": 0.24842728674411774, + "learning_rate": 1.412056316269369e-05, + "loss": 0.0868, + "step": 38067 + }, + { + "epoch": 0.678985481396925, + "grad_norm": 0.25560522079467773, + "learning_rate": 1.4119161790434809e-05, + "loss": 0.1457, + "step": 38068 + }, + { + "epoch": 0.6790033175186387, + "grad_norm": 0.23789770901203156, + "learning_rate": 1.4117760460353819e-05, + "loss": 0.1012, + "step": 38069 + }, + { + "epoch": 0.6790211536403524, + "grad_norm": 0.21970634162425995, + "learning_rate": 1.4116359172456156e-05, + "loss": 0.1114, + "step": 38070 + }, + { + "epoch": 0.6790389897620661, + "grad_norm": 0.2765413820743561, + "learning_rate": 1.4114957926747247e-05, + "loss": 0.1291, + "step": 38071 + }, + { + "epoch": 0.6790568258837798, + "grad_norm": 0.21076099574565887, + "learning_rate": 1.4113556723232519e-05, + "loss": 0.0921, + "step": 38072 + }, + { + "epoch": 0.6790746620054935, + "grad_norm": 0.2992746829986572, + "learning_rate": 1.4112155561917395e-05, + "loss": 0.1709, + "step": 38073 + }, + { + "epoch": 0.6790924981272072, + "grad_norm": 0.2188761681318283, + "learning_rate": 1.4110754442807325e-05, + "loss": 0.1164, + "step": 38074 + }, + { + "epoch": 0.6791103342489209, + "grad_norm": 0.2258126586675644, + "learning_rate": 1.4109353365907732e-05, + "loss": 0.0964, + "step": 38075 + }, + { + "epoch": 0.6791281703706346, + "grad_norm": 0.2761717140674591, + "learning_rate": 1.410795233122405e-05, + "loss": 0.14, + "step": 38076 + }, + { + "epoch": 0.6791460064923484, + "grad_norm": 0.3077312707901001, + "learning_rate": 1.4106551338761704e-05, + "loss": 0.1028, + "step": 38077 + }, + { + "epoch": 0.679163842614062, + "grad_norm": 0.22233155369758606, + "learning_rate": 1.4105150388526117e-05, + "loss": 0.1004, + "step": 38078 + }, + { + "epoch": 0.6791816787357757, + "grad_norm": 0.24543443322181702, + "learning_rate": 1.410374948052274e-05, + "loss": 0.091, + "step": 38079 + }, + { + "epoch": 0.6791995148574894, + "grad_norm": 0.2750838100910187, + "learning_rate": 1.4102348614756994e-05, + "loss": 0.107, + "step": 38080 + }, + { + "epoch": 0.6792173509792031, + "grad_norm": 0.29360508918762207, + "learning_rate": 1.41009477912343e-05, + "loss": 0.1414, + "step": 38081 + }, + { + "epoch": 0.6792351871009168, + "grad_norm": 0.23823639750480652, + "learning_rate": 1.4099547009960108e-05, + "loss": 0.1546, + "step": 38082 + }, + { + "epoch": 0.6792530232226305, + "grad_norm": 0.29856178164482117, + "learning_rate": 1.4098146270939825e-05, + "loss": 0.1086, + "step": 38083 + }, + { + "epoch": 0.6792708593443442, + "grad_norm": 0.3999932110309601, + "learning_rate": 1.40967455741789e-05, + "loss": 0.1548, + "step": 38084 + }, + { + "epoch": 0.6792886954660579, + "grad_norm": 0.24247169494628906, + "learning_rate": 1.4095344919682757e-05, + "loss": 0.1244, + "step": 38085 + }, + { + "epoch": 0.6793065315877715, + "grad_norm": 0.2690492868423462, + "learning_rate": 1.4093944307456813e-05, + "loss": 0.1254, + "step": 38086 + }, + { + "epoch": 0.6793243677094852, + "grad_norm": 0.16117660701274872, + "learning_rate": 1.4092543737506519e-05, + "loss": 0.0491, + "step": 38087 + }, + { + "epoch": 0.6793422038311989, + "grad_norm": 0.23193974792957306, + "learning_rate": 1.4091143209837294e-05, + "loss": 0.1125, + "step": 38088 + }, + { + "epoch": 0.6793600399529126, + "grad_norm": 0.2804194390773773, + "learning_rate": 1.4089742724454564e-05, + "loss": 0.1043, + "step": 38089 + }, + { + "epoch": 0.6793778760746263, + "grad_norm": 0.25015395879745483, + "learning_rate": 1.4088342281363759e-05, + "loss": 0.0958, + "step": 38090 + }, + { + "epoch": 0.67939571219634, + "grad_norm": 0.5646789073944092, + "learning_rate": 1.4086941880570298e-05, + "loss": 0.1782, + "step": 38091 + }, + { + "epoch": 0.6794135483180537, + "grad_norm": 0.3682366907596588, + "learning_rate": 1.4085541522079631e-05, + "loss": 0.1553, + "step": 38092 + }, + { + "epoch": 0.6794313844397675, + "grad_norm": 0.19553418457508087, + "learning_rate": 1.4084141205897172e-05, + "loss": 0.1314, + "step": 38093 + }, + { + "epoch": 0.6794492205614812, + "grad_norm": 0.29568129777908325, + "learning_rate": 1.4082740932028354e-05, + "loss": 0.1239, + "step": 38094 + }, + { + "epoch": 0.6794670566831948, + "grad_norm": 0.3364834189414978, + "learning_rate": 1.4081340700478593e-05, + "loss": 0.0928, + "step": 38095 + }, + { + "epoch": 0.6794848928049085, + "grad_norm": 0.279713898897171, + "learning_rate": 1.4079940511253325e-05, + "loss": 0.0963, + "step": 38096 + }, + { + "epoch": 0.6795027289266222, + "grad_norm": 0.24845629930496216, + "learning_rate": 1.4078540364357989e-05, + "loss": 0.1478, + "step": 38097 + }, + { + "epoch": 0.6795205650483359, + "grad_norm": 0.3844321072101593, + "learning_rate": 1.4077140259798006e-05, + "loss": 0.1679, + "step": 38098 + }, + { + "epoch": 0.6795384011700496, + "grad_norm": 0.23167596757411957, + "learning_rate": 1.4075740197578788e-05, + "loss": 0.087, + "step": 38099 + }, + { + "epoch": 0.6795562372917633, + "grad_norm": 0.2533831298351288, + "learning_rate": 1.4074340177705786e-05, + "loss": 0.1426, + "step": 38100 + }, + { + "epoch": 0.679574073413477, + "grad_norm": 0.3005765974521637, + "learning_rate": 1.4072940200184412e-05, + "loss": 0.1383, + "step": 38101 + }, + { + "epoch": 0.6795919095351907, + "grad_norm": 0.2920003831386566, + "learning_rate": 1.4071540265020094e-05, + "loss": 0.1437, + "step": 38102 + }, + { + "epoch": 0.6796097456569044, + "grad_norm": 0.24694103002548218, + "learning_rate": 1.4070140372218255e-05, + "loss": 0.1024, + "step": 38103 + }, + { + "epoch": 0.679627581778618, + "grad_norm": 0.3437347114086151, + "learning_rate": 1.4068740521784334e-05, + "loss": 0.1831, + "step": 38104 + }, + { + "epoch": 0.6796454179003317, + "grad_norm": 0.20567935705184937, + "learning_rate": 1.4067340713723754e-05, + "loss": 0.1015, + "step": 38105 + }, + { + "epoch": 0.6796632540220454, + "grad_norm": 0.2873671352863312, + "learning_rate": 1.4065940948041934e-05, + "loss": 0.0762, + "step": 38106 + }, + { + "epoch": 0.6796810901437591, + "grad_norm": 0.31024056673049927, + "learning_rate": 1.4064541224744305e-05, + "loss": 0.1175, + "step": 38107 + }, + { + "epoch": 0.6796989262654728, + "grad_norm": 0.2765016555786133, + "learning_rate": 1.4063141543836284e-05, + "loss": 0.0913, + "step": 38108 + }, + { + "epoch": 0.6797167623871865, + "grad_norm": 0.27351024746894836, + "learning_rate": 1.4061741905323309e-05, + "loss": 0.118, + "step": 38109 + }, + { + "epoch": 0.6797345985089003, + "grad_norm": 0.29077038168907166, + "learning_rate": 1.4060342309210792e-05, + "loss": 0.1159, + "step": 38110 + }, + { + "epoch": 0.679752434630614, + "grad_norm": 0.23768246173858643, + "learning_rate": 1.4058942755504176e-05, + "loss": 0.1086, + "step": 38111 + }, + { + "epoch": 0.6797702707523277, + "grad_norm": 0.4070793688297272, + "learning_rate": 1.4057543244208868e-05, + "loss": 0.1351, + "step": 38112 + }, + { + "epoch": 0.6797881068740413, + "grad_norm": 0.23685990273952484, + "learning_rate": 1.405614377533031e-05, + "loss": 0.1793, + "step": 38113 + }, + { + "epoch": 0.679805942995755, + "grad_norm": 0.19624833762645721, + "learning_rate": 1.4054744348873921e-05, + "loss": 0.1395, + "step": 38114 + }, + { + "epoch": 0.6798237791174687, + "grad_norm": 0.2356448769569397, + "learning_rate": 1.4053344964845122e-05, + "loss": 0.1261, + "step": 38115 + }, + { + "epoch": 0.6798416152391824, + "grad_norm": 0.2942589819431305, + "learning_rate": 1.405194562324933e-05, + "loss": 0.0917, + "step": 38116 + }, + { + "epoch": 0.6798594513608961, + "grad_norm": 0.2452307939529419, + "learning_rate": 1.4050546324091985e-05, + "loss": 0.109, + "step": 38117 + }, + { + "epoch": 0.6798772874826098, + "grad_norm": 0.24454204738140106, + "learning_rate": 1.4049147067378507e-05, + "loss": 0.0803, + "step": 38118 + }, + { + "epoch": 0.6798951236043235, + "grad_norm": 0.3530655801296234, + "learning_rate": 1.4047747853114318e-05, + "loss": 0.1368, + "step": 38119 + }, + { + "epoch": 0.6799129597260372, + "grad_norm": 0.36142921447753906, + "learning_rate": 1.4046348681304838e-05, + "loss": 0.1726, + "step": 38120 + }, + { + "epoch": 0.6799307958477508, + "grad_norm": 0.23010893166065216, + "learning_rate": 1.4044949551955485e-05, + "loss": 0.094, + "step": 38121 + }, + { + "epoch": 0.6799486319694645, + "grad_norm": 0.24662913382053375, + "learning_rate": 1.4043550465071704e-05, + "loss": 0.1059, + "step": 38122 + }, + { + "epoch": 0.6799664680911782, + "grad_norm": 0.2754177451133728, + "learning_rate": 1.4042151420658891e-05, + "loss": 0.1217, + "step": 38123 + }, + { + "epoch": 0.6799843042128919, + "grad_norm": 0.3141807019710541, + "learning_rate": 1.4040752418722497e-05, + "loss": 0.149, + "step": 38124 + }, + { + "epoch": 0.6800021403346056, + "grad_norm": 0.2542325556278229, + "learning_rate": 1.4039353459267921e-05, + "loss": 0.1596, + "step": 38125 + }, + { + "epoch": 0.6800199764563193, + "grad_norm": 0.3023487329483032, + "learning_rate": 1.4037954542300607e-05, + "loss": 0.1192, + "step": 38126 + }, + { + "epoch": 0.6800378125780331, + "grad_norm": 0.2309402972459793, + "learning_rate": 1.4036555667825969e-05, + "loss": 0.1029, + "step": 38127 + }, + { + "epoch": 0.6800556486997468, + "grad_norm": 0.21685844659805298, + "learning_rate": 1.4035156835849423e-05, + "loss": 0.1038, + "step": 38128 + }, + { + "epoch": 0.6800734848214605, + "grad_norm": 0.2672184705734253, + "learning_rate": 1.4033758046376388e-05, + "loss": 0.1442, + "step": 38129 + }, + { + "epoch": 0.6800913209431741, + "grad_norm": 0.2531728446483612, + "learning_rate": 1.4032359299412307e-05, + "loss": 0.1109, + "step": 38130 + }, + { + "epoch": 0.6801091570648878, + "grad_norm": 0.26913121342658997, + "learning_rate": 1.4030960594962589e-05, + "loss": 0.199, + "step": 38131 + }, + { + "epoch": 0.6801269931866015, + "grad_norm": 0.2779064476490021, + "learning_rate": 1.4029561933032653e-05, + "loss": 0.1124, + "step": 38132 + }, + { + "epoch": 0.6801448293083152, + "grad_norm": 0.35877105593681335, + "learning_rate": 1.402816331362793e-05, + "loss": 0.1724, + "step": 38133 + }, + { + "epoch": 0.6801626654300289, + "grad_norm": 0.2864941656589508, + "learning_rate": 1.402676473675382e-05, + "loss": 0.1763, + "step": 38134 + }, + { + "epoch": 0.6801805015517426, + "grad_norm": 0.26372548937797546, + "learning_rate": 1.4025366202415772e-05, + "loss": 0.0856, + "step": 38135 + }, + { + "epoch": 0.6801983376734563, + "grad_norm": 0.2866208255290985, + "learning_rate": 1.4023967710619195e-05, + "loss": 0.1778, + "step": 38136 + }, + { + "epoch": 0.68021617379517, + "grad_norm": 0.304635226726532, + "learning_rate": 1.40225692613695e-05, + "loss": 0.1284, + "step": 38137 + }, + { + "epoch": 0.6802340099168837, + "grad_norm": 0.31587427854537964, + "learning_rate": 1.4021170854672127e-05, + "loss": 0.1373, + "step": 38138 + }, + { + "epoch": 0.6802518460385973, + "grad_norm": 0.26686543226242065, + "learning_rate": 1.401977249053248e-05, + "loss": 0.0964, + "step": 38139 + }, + { + "epoch": 0.680269682160311, + "grad_norm": 0.23689597845077515, + "learning_rate": 1.4018374168955995e-05, + "loss": 0.1093, + "step": 38140 + }, + { + "epoch": 0.6802875182820247, + "grad_norm": 0.34281957149505615, + "learning_rate": 1.4016975889948086e-05, + "loss": 0.1621, + "step": 38141 + }, + { + "epoch": 0.6803053544037384, + "grad_norm": 0.20335371792316437, + "learning_rate": 1.4015577653514161e-05, + "loss": 0.079, + "step": 38142 + }, + { + "epoch": 0.6803231905254521, + "grad_norm": 0.22631576657295227, + "learning_rate": 1.4014179459659662e-05, + "loss": 0.1195, + "step": 38143 + }, + { + "epoch": 0.6803410266471659, + "grad_norm": 0.28651463985443115, + "learning_rate": 1.4012781308389997e-05, + "loss": 0.1442, + "step": 38144 + }, + { + "epoch": 0.6803588627688796, + "grad_norm": 0.28875383734703064, + "learning_rate": 1.4011383199710587e-05, + "loss": 0.1502, + "step": 38145 + }, + { + "epoch": 0.6803766988905933, + "grad_norm": 0.23822379112243652, + "learning_rate": 1.4009985133626853e-05, + "loss": 0.1304, + "step": 38146 + }, + { + "epoch": 0.680394535012307, + "grad_norm": 0.2789832353591919, + "learning_rate": 1.40085871101442e-05, + "loss": 0.1374, + "step": 38147 + }, + { + "epoch": 0.6804123711340206, + "grad_norm": 0.30027708411216736, + "learning_rate": 1.4007189129268067e-05, + "loss": 0.1247, + "step": 38148 + }, + { + "epoch": 0.6804302072557343, + "grad_norm": 0.35845959186553955, + "learning_rate": 1.4005791191003869e-05, + "loss": 0.1331, + "step": 38149 + }, + { + "epoch": 0.680448043377448, + "grad_norm": 0.3312264084815979, + "learning_rate": 1.4004393295357013e-05, + "loss": 0.1669, + "step": 38150 + }, + { + "epoch": 0.6804658794991617, + "grad_norm": 0.29060032963752747, + "learning_rate": 1.4002995442332934e-05, + "loss": 0.098, + "step": 38151 + }, + { + "epoch": 0.6804837156208754, + "grad_norm": 0.18560989201068878, + "learning_rate": 1.4001597631937036e-05, + "loss": 0.0637, + "step": 38152 + }, + { + "epoch": 0.6805015517425891, + "grad_norm": 0.31550469994544983, + "learning_rate": 1.4000199864174752e-05, + "loss": 0.1687, + "step": 38153 + }, + { + "epoch": 0.6805193878643028, + "grad_norm": 0.3084012269973755, + "learning_rate": 1.3998802139051493e-05, + "loss": 0.1269, + "step": 38154 + }, + { + "epoch": 0.6805372239860165, + "grad_norm": 0.2431219518184662, + "learning_rate": 1.3997404456572663e-05, + "loss": 0.0841, + "step": 38155 + }, + { + "epoch": 0.6805550601077301, + "grad_norm": 0.2584455907344818, + "learning_rate": 1.3996006816743706e-05, + "loss": 0.079, + "step": 38156 + }, + { + "epoch": 0.6805728962294438, + "grad_norm": 0.23360052704811096, + "learning_rate": 1.399460921957003e-05, + "loss": 0.1094, + "step": 38157 + }, + { + "epoch": 0.6805907323511575, + "grad_norm": 0.2123311460018158, + "learning_rate": 1.3993211665057046e-05, + "loss": 0.1107, + "step": 38158 + }, + { + "epoch": 0.6806085684728712, + "grad_norm": 0.2764964997768402, + "learning_rate": 1.3991814153210175e-05, + "loss": 0.1147, + "step": 38159 + }, + { + "epoch": 0.6806264045945849, + "grad_norm": 0.1956997662782669, + "learning_rate": 1.3990416684034829e-05, + "loss": 0.1232, + "step": 38160 + }, + { + "epoch": 0.6806442407162987, + "grad_norm": 0.2130974978208542, + "learning_rate": 1.3989019257536437e-05, + "loss": 0.091, + "step": 38161 + }, + { + "epoch": 0.6806620768380124, + "grad_norm": 0.3136894106864929, + "learning_rate": 1.3987621873720411e-05, + "loss": 0.1488, + "step": 38162 + }, + { + "epoch": 0.6806799129597261, + "grad_norm": 0.32723692059516907, + "learning_rate": 1.3986224532592165e-05, + "loss": 0.121, + "step": 38163 + }, + { + "epoch": 0.6806977490814398, + "grad_norm": 0.21379972994327545, + "learning_rate": 1.3984827234157106e-05, + "loss": 0.1047, + "step": 38164 + }, + { + "epoch": 0.6807155852031535, + "grad_norm": 0.36420774459838867, + "learning_rate": 1.3983429978420673e-05, + "loss": 0.1565, + "step": 38165 + }, + { + "epoch": 0.6807334213248671, + "grad_norm": 0.24340316653251648, + "learning_rate": 1.3982032765388258e-05, + "loss": 0.081, + "step": 38166 + }, + { + "epoch": 0.6807512574465808, + "grad_norm": 0.24933785200119019, + "learning_rate": 1.3980635595065303e-05, + "loss": 0.1187, + "step": 38167 + }, + { + "epoch": 0.6807690935682945, + "grad_norm": 0.2732901871204376, + "learning_rate": 1.3979238467457201e-05, + "loss": 0.122, + "step": 38168 + }, + { + "epoch": 0.6807869296900082, + "grad_norm": 0.246701180934906, + "learning_rate": 1.3977841382569384e-05, + "loss": 0.1467, + "step": 38169 + }, + { + "epoch": 0.6808047658117219, + "grad_norm": 0.2657492458820343, + "learning_rate": 1.397644434040726e-05, + "loss": 0.0997, + "step": 38170 + }, + { + "epoch": 0.6808226019334356, + "grad_norm": 0.25838497281074524, + "learning_rate": 1.3975047340976246e-05, + "loss": 0.1105, + "step": 38171 + }, + { + "epoch": 0.6808404380551493, + "grad_norm": 0.3133302927017212, + "learning_rate": 1.3973650384281758e-05, + "loss": 0.1006, + "step": 38172 + }, + { + "epoch": 0.680858274176863, + "grad_norm": 0.2004750370979309, + "learning_rate": 1.39722534703292e-05, + "loss": 0.0739, + "step": 38173 + }, + { + "epoch": 0.6808761102985766, + "grad_norm": 0.29933902621269226, + "learning_rate": 1.3970856599124005e-05, + "loss": 0.0798, + "step": 38174 + }, + { + "epoch": 0.6808939464202903, + "grad_norm": 0.28791409730911255, + "learning_rate": 1.3969459770671579e-05, + "loss": 0.1313, + "step": 38175 + }, + { + "epoch": 0.680911782542004, + "grad_norm": 0.2405257672071457, + "learning_rate": 1.3968062984977337e-05, + "loss": 0.1042, + "step": 38176 + }, + { + "epoch": 0.6809296186637177, + "grad_norm": 0.30018922686576843, + "learning_rate": 1.3966666242046683e-05, + "loss": 0.0747, + "step": 38177 + }, + { + "epoch": 0.6809474547854315, + "grad_norm": 0.3320558965206146, + "learning_rate": 1.396526954188505e-05, + "loss": 0.11, + "step": 38178 + }, + { + "epoch": 0.6809652909071452, + "grad_norm": 0.28587329387664795, + "learning_rate": 1.3963872884497837e-05, + "loss": 0.106, + "step": 38179 + }, + { + "epoch": 0.6809831270288589, + "grad_norm": 0.3686194121837616, + "learning_rate": 1.396247626989047e-05, + "loss": 0.1475, + "step": 38180 + }, + { + "epoch": 0.6810009631505726, + "grad_norm": 0.2667557895183563, + "learning_rate": 1.396107969806835e-05, + "loss": 0.1421, + "step": 38181 + }, + { + "epoch": 0.6810187992722863, + "grad_norm": 0.3066580891609192, + "learning_rate": 1.3959683169036908e-05, + "loss": 0.1108, + "step": 38182 + }, + { + "epoch": 0.681036635394, + "grad_norm": 0.2698015570640564, + "learning_rate": 1.3958286682801546e-05, + "loss": 0.1744, + "step": 38183 + }, + { + "epoch": 0.6810544715157136, + "grad_norm": 0.2537696361541748, + "learning_rate": 1.3956890239367678e-05, + "loss": 0.0825, + "step": 38184 + }, + { + "epoch": 0.6810723076374273, + "grad_norm": 0.22206906974315643, + "learning_rate": 1.3955493838740718e-05, + "loss": 0.1499, + "step": 38185 + }, + { + "epoch": 0.681090143759141, + "grad_norm": 0.37641796469688416, + "learning_rate": 1.395409748092607e-05, + "loss": 0.1679, + "step": 38186 + }, + { + "epoch": 0.6811079798808547, + "grad_norm": 0.24186834692955017, + "learning_rate": 1.3952701165929166e-05, + "loss": 0.1495, + "step": 38187 + }, + { + "epoch": 0.6811258160025684, + "grad_norm": 0.2486027479171753, + "learning_rate": 1.3951304893755407e-05, + "loss": 0.109, + "step": 38188 + }, + { + "epoch": 0.6811436521242821, + "grad_norm": 0.2330179363489151, + "learning_rate": 1.3949908664410205e-05, + "loss": 0.092, + "step": 38189 + }, + { + "epoch": 0.6811614882459958, + "grad_norm": 0.5090229511260986, + "learning_rate": 1.3948512477898965e-05, + "loss": 0.1246, + "step": 38190 + }, + { + "epoch": 0.6811793243677094, + "grad_norm": 0.2555474638938904, + "learning_rate": 1.3947116334227117e-05, + "loss": 0.1096, + "step": 38191 + }, + { + "epoch": 0.6811971604894231, + "grad_norm": 0.3159734308719635, + "learning_rate": 1.3945720233400065e-05, + "loss": 0.1506, + "step": 38192 + }, + { + "epoch": 0.6812149966111368, + "grad_norm": 0.2880256474018097, + "learning_rate": 1.3944324175423207e-05, + "loss": 0.1244, + "step": 38193 + }, + { + "epoch": 0.6812328327328506, + "grad_norm": 0.27575504779815674, + "learning_rate": 1.3942928160301976e-05, + "loss": 0.1481, + "step": 38194 + }, + { + "epoch": 0.6812506688545643, + "grad_norm": 0.2570909857749939, + "learning_rate": 1.3941532188041768e-05, + "loss": 0.1741, + "step": 38195 + }, + { + "epoch": 0.681268504976278, + "grad_norm": 0.28382623195648193, + "learning_rate": 1.3940136258648013e-05, + "loss": 0.1365, + "step": 38196 + }, + { + "epoch": 0.6812863410979917, + "grad_norm": 0.2137383222579956, + "learning_rate": 1.3938740372126102e-05, + "loss": 0.0733, + "step": 38197 + }, + { + "epoch": 0.6813041772197054, + "grad_norm": 0.3670567274093628, + "learning_rate": 1.393734452848146e-05, + "loss": 0.1571, + "step": 38198 + }, + { + "epoch": 0.6813220133414191, + "grad_norm": 0.19791445136070251, + "learning_rate": 1.3935948727719478e-05, + "loss": 0.1032, + "step": 38199 + }, + { + "epoch": 0.6813398494631328, + "grad_norm": 0.2627556025981903, + "learning_rate": 1.3934552969845593e-05, + "loss": 0.079, + "step": 38200 + }, + { + "epoch": 0.6813576855848464, + "grad_norm": 0.30776503682136536, + "learning_rate": 1.39331572548652e-05, + "loss": 0.1639, + "step": 38201 + }, + { + "epoch": 0.6813755217065601, + "grad_norm": 0.23980633914470673, + "learning_rate": 1.3931761582783714e-05, + "loss": 0.1445, + "step": 38202 + }, + { + "epoch": 0.6813933578282738, + "grad_norm": 0.28446152806282043, + "learning_rate": 1.3930365953606533e-05, + "loss": 0.1294, + "step": 38203 + }, + { + "epoch": 0.6814111939499875, + "grad_norm": 0.2952460050582886, + "learning_rate": 1.3928970367339083e-05, + "loss": 0.118, + "step": 38204 + }, + { + "epoch": 0.6814290300717012, + "grad_norm": 0.24420614540576935, + "learning_rate": 1.3927574823986772e-05, + "loss": 0.1224, + "step": 38205 + }, + { + "epoch": 0.6814468661934149, + "grad_norm": 0.27015960216522217, + "learning_rate": 1.3926179323554995e-05, + "loss": 0.1035, + "step": 38206 + }, + { + "epoch": 0.6814647023151286, + "grad_norm": 0.2446109801530838, + "learning_rate": 1.3924783866049179e-05, + "loss": 0.1311, + "step": 38207 + }, + { + "epoch": 0.6814825384368423, + "grad_norm": 0.22301532328128815, + "learning_rate": 1.3923388451474717e-05, + "loss": 0.1199, + "step": 38208 + }, + { + "epoch": 0.6815003745585559, + "grad_norm": 0.2690506875514984, + "learning_rate": 1.3921993079837037e-05, + "loss": 0.1173, + "step": 38209 + }, + { + "epoch": 0.6815182106802696, + "grad_norm": 0.3371462821960449, + "learning_rate": 1.392059775114154e-05, + "loss": 0.1391, + "step": 38210 + }, + { + "epoch": 0.6815360468019834, + "grad_norm": 0.26927849650382996, + "learning_rate": 1.3919202465393633e-05, + "loss": 0.1066, + "step": 38211 + }, + { + "epoch": 0.6815538829236971, + "grad_norm": 0.2634313702583313, + "learning_rate": 1.3917807222598712e-05, + "loss": 0.097, + "step": 38212 + }, + { + "epoch": 0.6815717190454108, + "grad_norm": 0.6263816356658936, + "learning_rate": 1.391641202276221e-05, + "loss": 0.0811, + "step": 38213 + }, + { + "epoch": 0.6815895551671245, + "grad_norm": 0.2044709026813507, + "learning_rate": 1.3915016865889519e-05, + "loss": 0.1076, + "step": 38214 + }, + { + "epoch": 0.6816073912888382, + "grad_norm": 0.209680438041687, + "learning_rate": 1.3913621751986056e-05, + "loss": 0.1426, + "step": 38215 + }, + { + "epoch": 0.6816252274105519, + "grad_norm": 0.28347915410995483, + "learning_rate": 1.391222668105721e-05, + "loss": 0.1113, + "step": 38216 + }, + { + "epoch": 0.6816430635322656, + "grad_norm": 0.23850694298744202, + "learning_rate": 1.3910831653108417e-05, + "loss": 0.0663, + "step": 38217 + }, + { + "epoch": 0.6816608996539792, + "grad_norm": 0.3260178565979004, + "learning_rate": 1.3909436668145069e-05, + "loss": 0.1366, + "step": 38218 + }, + { + "epoch": 0.6816787357756929, + "grad_norm": 0.34857356548309326, + "learning_rate": 1.3908041726172574e-05, + "loss": 0.1337, + "step": 38219 + }, + { + "epoch": 0.6816965718974066, + "grad_norm": 0.25902673602104187, + "learning_rate": 1.3906646827196333e-05, + "loss": 0.0871, + "step": 38220 + }, + { + "epoch": 0.6817144080191203, + "grad_norm": 0.29883676767349243, + "learning_rate": 1.3905251971221767e-05, + "loss": 0.1153, + "step": 38221 + }, + { + "epoch": 0.681732244140834, + "grad_norm": 0.2750854194164276, + "learning_rate": 1.3903857158254269e-05, + "loss": 0.0991, + "step": 38222 + }, + { + "epoch": 0.6817500802625477, + "grad_norm": 0.29598814249038696, + "learning_rate": 1.3902462388299262e-05, + "loss": 0.1161, + "step": 38223 + }, + { + "epoch": 0.6817679163842614, + "grad_norm": 0.25487640500068665, + "learning_rate": 1.3901067661362144e-05, + "loss": 0.0974, + "step": 38224 + }, + { + "epoch": 0.6817857525059751, + "grad_norm": 0.28686612844467163, + "learning_rate": 1.3899672977448311e-05, + "loss": 0.094, + "step": 38225 + }, + { + "epoch": 0.6818035886276888, + "grad_norm": 0.2694266140460968, + "learning_rate": 1.3898278336563189e-05, + "loss": 0.1457, + "step": 38226 + }, + { + "epoch": 0.6818214247494024, + "grad_norm": 0.3161659240722656, + "learning_rate": 1.3896883738712174e-05, + "loss": 0.1045, + "step": 38227 + }, + { + "epoch": 0.6818392608711162, + "grad_norm": 0.2819994390010834, + "learning_rate": 1.3895489183900673e-05, + "loss": 0.1186, + "step": 38228 + }, + { + "epoch": 0.6818570969928299, + "grad_norm": 0.30145150423049927, + "learning_rate": 1.3894094672134084e-05, + "loss": 0.1715, + "step": 38229 + }, + { + "epoch": 0.6818749331145436, + "grad_norm": 0.2924537658691406, + "learning_rate": 1.3892700203417827e-05, + "loss": 0.1186, + "step": 38230 + }, + { + "epoch": 0.6818927692362573, + "grad_norm": 0.20499379932880402, + "learning_rate": 1.3891305777757301e-05, + "loss": 0.0833, + "step": 38231 + }, + { + "epoch": 0.681910605357971, + "grad_norm": 0.24573004245758057, + "learning_rate": 1.3889911395157911e-05, + "loss": 0.1002, + "step": 38232 + }, + { + "epoch": 0.6819284414796847, + "grad_norm": 0.3260311782360077, + "learning_rate": 1.3888517055625053e-05, + "loss": 0.1284, + "step": 38233 + }, + { + "epoch": 0.6819462776013984, + "grad_norm": 0.29326748847961426, + "learning_rate": 1.3887122759164151e-05, + "loss": 0.1006, + "step": 38234 + }, + { + "epoch": 0.681964113723112, + "grad_norm": 0.18743669986724854, + "learning_rate": 1.3885728505780588e-05, + "loss": 0.0675, + "step": 38235 + }, + { + "epoch": 0.6819819498448257, + "grad_norm": 0.3353661596775055, + "learning_rate": 1.3884334295479786e-05, + "loss": 0.14, + "step": 38236 + }, + { + "epoch": 0.6819997859665394, + "grad_norm": 0.4249752163887024, + "learning_rate": 1.3882940128267146e-05, + "loss": 0.107, + "step": 38237 + }, + { + "epoch": 0.6820176220882531, + "grad_norm": 0.22529543936252594, + "learning_rate": 1.3881546004148063e-05, + "loss": 0.124, + "step": 38238 + }, + { + "epoch": 0.6820354582099668, + "grad_norm": 0.27800649404525757, + "learning_rate": 1.3880151923127957e-05, + "loss": 0.0919, + "step": 38239 + }, + { + "epoch": 0.6820532943316805, + "grad_norm": 0.2507452964782715, + "learning_rate": 1.3878757885212221e-05, + "loss": 0.1155, + "step": 38240 + }, + { + "epoch": 0.6820711304533942, + "grad_norm": 0.25226354598999023, + "learning_rate": 1.3877363890406261e-05, + "loss": 0.1264, + "step": 38241 + }, + { + "epoch": 0.6820889665751079, + "grad_norm": 0.2639915645122528, + "learning_rate": 1.3875969938715472e-05, + "loss": 0.1114, + "step": 38242 + }, + { + "epoch": 0.6821068026968216, + "grad_norm": 0.27226176857948303, + "learning_rate": 1.3874576030145276e-05, + "loss": 0.1542, + "step": 38243 + }, + { + "epoch": 0.6821246388185352, + "grad_norm": 0.2571322023868561, + "learning_rate": 1.387318216470106e-05, + "loss": 0.1315, + "step": 38244 + }, + { + "epoch": 0.682142474940249, + "grad_norm": 0.3140402138233185, + "learning_rate": 1.3871788342388236e-05, + "loss": 0.1089, + "step": 38245 + }, + { + "epoch": 0.6821603110619627, + "grad_norm": 0.5863073468208313, + "learning_rate": 1.3870394563212197e-05, + "loss": 0.1168, + "step": 38246 + }, + { + "epoch": 0.6821781471836764, + "grad_norm": 0.3086020052433014, + "learning_rate": 1.3869000827178363e-05, + "loss": 0.157, + "step": 38247 + }, + { + "epoch": 0.6821959833053901, + "grad_norm": 0.27945834398269653, + "learning_rate": 1.386760713429212e-05, + "loss": 0.128, + "step": 38248 + }, + { + "epoch": 0.6822138194271038, + "grad_norm": 0.28308427333831787, + "learning_rate": 1.3866213484558874e-05, + "loss": 0.1361, + "step": 38249 + }, + { + "epoch": 0.6822316555488175, + "grad_norm": 0.2819865643978119, + "learning_rate": 1.3864819877984036e-05, + "loss": 0.1399, + "step": 38250 + }, + { + "epoch": 0.6822494916705312, + "grad_norm": 0.29427313804626465, + "learning_rate": 1.3863426314572991e-05, + "loss": 0.1528, + "step": 38251 + }, + { + "epoch": 0.6822673277922449, + "grad_norm": 0.2779558002948761, + "learning_rate": 1.3862032794331165e-05, + "loss": 0.125, + "step": 38252 + }, + { + "epoch": 0.6822851639139585, + "grad_norm": 0.24587909877300262, + "learning_rate": 1.3860639317263947e-05, + "loss": 0.1035, + "step": 38253 + }, + { + "epoch": 0.6823030000356722, + "grad_norm": 0.29528847336769104, + "learning_rate": 1.3859245883376736e-05, + "loss": 0.116, + "step": 38254 + }, + { + "epoch": 0.6823208361573859, + "grad_norm": 0.21087324619293213, + "learning_rate": 1.3857852492674927e-05, + "loss": 0.0948, + "step": 38255 + }, + { + "epoch": 0.6823386722790996, + "grad_norm": 0.31591928005218506, + "learning_rate": 1.3856459145163942e-05, + "loss": 0.1268, + "step": 38256 + }, + { + "epoch": 0.6823565084008133, + "grad_norm": 0.34696435928344727, + "learning_rate": 1.3855065840849168e-05, + "loss": 0.1533, + "step": 38257 + }, + { + "epoch": 0.682374344522527, + "grad_norm": 0.2870495617389679, + "learning_rate": 1.385367257973601e-05, + "loss": 0.1325, + "step": 38258 + }, + { + "epoch": 0.6823921806442407, + "grad_norm": 0.27288681268692017, + "learning_rate": 1.3852279361829853e-05, + "loss": 0.119, + "step": 38259 + }, + { + "epoch": 0.6824100167659544, + "grad_norm": 0.18147170543670654, + "learning_rate": 1.3850886187136126e-05, + "loss": 0.0966, + "step": 38260 + }, + { + "epoch": 0.682427852887668, + "grad_norm": 0.31475985050201416, + "learning_rate": 1.3849493055660211e-05, + "loss": 0.139, + "step": 38261 + }, + { + "epoch": 0.6824456890093819, + "grad_norm": 0.2707485556602478, + "learning_rate": 1.3848099967407504e-05, + "loss": 0.0828, + "step": 38262 + }, + { + "epoch": 0.6824635251310955, + "grad_norm": 0.249025359749794, + "learning_rate": 1.3846706922383423e-05, + "loss": 0.1096, + "step": 38263 + }, + { + "epoch": 0.6824813612528092, + "grad_norm": 0.20857711136341095, + "learning_rate": 1.384531392059335e-05, + "loss": 0.1103, + "step": 38264 + }, + { + "epoch": 0.6824991973745229, + "grad_norm": 0.28961077332496643, + "learning_rate": 1.38439209620427e-05, + "loss": 0.0723, + "step": 38265 + }, + { + "epoch": 0.6825170334962366, + "grad_norm": 0.3273436427116394, + "learning_rate": 1.3842528046736869e-05, + "loss": 0.1646, + "step": 38266 + }, + { + "epoch": 0.6825348696179503, + "grad_norm": 0.2047920525074005, + "learning_rate": 1.384113517468125e-05, + "loss": 0.114, + "step": 38267 + }, + { + "epoch": 0.682552705739664, + "grad_norm": 0.17825815081596375, + "learning_rate": 1.3839742345881234e-05, + "loss": 0.1186, + "step": 38268 + }, + { + "epoch": 0.6825705418613777, + "grad_norm": 0.23689785599708557, + "learning_rate": 1.3838349560342246e-05, + "loss": 0.0824, + "step": 38269 + }, + { + "epoch": 0.6825883779830914, + "grad_norm": 0.32322752475738525, + "learning_rate": 1.3836956818069668e-05, + "loss": 0.1087, + "step": 38270 + }, + { + "epoch": 0.682606214104805, + "grad_norm": 0.2611806094646454, + "learning_rate": 1.38355641190689e-05, + "loss": 0.0745, + "step": 38271 + }, + { + "epoch": 0.6826240502265187, + "grad_norm": 0.28862065076828003, + "learning_rate": 1.383417146334533e-05, + "loss": 0.1463, + "step": 38272 + }, + { + "epoch": 0.6826418863482324, + "grad_norm": 0.29576367139816284, + "learning_rate": 1.3832778850904381e-05, + "loss": 0.1372, + "step": 38273 + }, + { + "epoch": 0.6826597224699461, + "grad_norm": 0.4100019633769989, + "learning_rate": 1.3831386281751438e-05, + "loss": 0.1351, + "step": 38274 + }, + { + "epoch": 0.6826775585916598, + "grad_norm": 0.23411719501018524, + "learning_rate": 1.3829993755891898e-05, + "loss": 0.1239, + "step": 38275 + }, + { + "epoch": 0.6826953947133735, + "grad_norm": 0.244368314743042, + "learning_rate": 1.3828601273331152e-05, + "loss": 0.1082, + "step": 38276 + }, + { + "epoch": 0.6827132308350872, + "grad_norm": 0.26957738399505615, + "learning_rate": 1.3827208834074606e-05, + "loss": 0.1175, + "step": 38277 + }, + { + "epoch": 0.6827310669568009, + "grad_norm": 0.23769696056842804, + "learning_rate": 1.3825816438127664e-05, + "loss": 0.122, + "step": 38278 + }, + { + "epoch": 0.6827489030785147, + "grad_norm": 0.36357536911964417, + "learning_rate": 1.3824424085495718e-05, + "loss": 0.1434, + "step": 38279 + }, + { + "epoch": 0.6827667392002283, + "grad_norm": 0.31610673666000366, + "learning_rate": 1.3823031776184167e-05, + "loss": 0.105, + "step": 38280 + }, + { + "epoch": 0.682784575321942, + "grad_norm": 0.24689598381519318, + "learning_rate": 1.3821639510198395e-05, + "loss": 0.1164, + "step": 38281 + }, + { + "epoch": 0.6828024114436557, + "grad_norm": 0.24583686888217926, + "learning_rate": 1.3820247287543817e-05, + "loss": 0.0868, + "step": 38282 + }, + { + "epoch": 0.6828202475653694, + "grad_norm": 0.1959419846534729, + "learning_rate": 1.3818855108225823e-05, + "loss": 0.1041, + "step": 38283 + }, + { + "epoch": 0.6828380836870831, + "grad_norm": 0.2432994395494461, + "learning_rate": 1.3817462972249806e-05, + "loss": 0.1326, + "step": 38284 + }, + { + "epoch": 0.6828559198087968, + "grad_norm": 0.20067833364009857, + "learning_rate": 1.3816070879621157e-05, + "loss": 0.125, + "step": 38285 + }, + { + "epoch": 0.6828737559305105, + "grad_norm": 0.2468937337398529, + "learning_rate": 1.381467883034529e-05, + "loss": 0.0881, + "step": 38286 + }, + { + "epoch": 0.6828915920522242, + "grad_norm": 0.29089730978012085, + "learning_rate": 1.3813286824427593e-05, + "loss": 0.1175, + "step": 38287 + }, + { + "epoch": 0.6829094281739378, + "grad_norm": 0.20711497962474823, + "learning_rate": 1.3811894861873458e-05, + "loss": 0.1139, + "step": 38288 + }, + { + "epoch": 0.6829272642956515, + "grad_norm": 0.27331846952438354, + "learning_rate": 1.3810502942688274e-05, + "loss": 0.1299, + "step": 38289 + }, + { + "epoch": 0.6829451004173652, + "grad_norm": 0.31152036786079407, + "learning_rate": 1.3809111066877454e-05, + "loss": 0.1503, + "step": 38290 + }, + { + "epoch": 0.6829629365390789, + "grad_norm": 0.35022714734077454, + "learning_rate": 1.3807719234446375e-05, + "loss": 0.1113, + "step": 38291 + }, + { + "epoch": 0.6829807726607926, + "grad_norm": 0.26127803325653076, + "learning_rate": 1.3806327445400452e-05, + "loss": 0.1069, + "step": 38292 + }, + { + "epoch": 0.6829986087825063, + "grad_norm": 0.31824514269828796, + "learning_rate": 1.380493569974507e-05, + "loss": 0.1369, + "step": 38293 + }, + { + "epoch": 0.68301644490422, + "grad_norm": 0.2234506756067276, + "learning_rate": 1.3803543997485613e-05, + "loss": 0.1084, + "step": 38294 + }, + { + "epoch": 0.6830342810259338, + "grad_norm": 0.18394304811954498, + "learning_rate": 1.3802152338627498e-05, + "loss": 0.0654, + "step": 38295 + }, + { + "epoch": 0.6830521171476475, + "grad_norm": 0.2944035232067108, + "learning_rate": 1.3800760723176106e-05, + "loss": 0.1311, + "step": 38296 + }, + { + "epoch": 0.6830699532693612, + "grad_norm": 0.23413337767124176, + "learning_rate": 1.3799369151136836e-05, + "loss": 0.0966, + "step": 38297 + }, + { + "epoch": 0.6830877893910748, + "grad_norm": 0.23401190340518951, + "learning_rate": 1.3797977622515068e-05, + "loss": 0.1466, + "step": 38298 + }, + { + "epoch": 0.6831056255127885, + "grad_norm": 0.29377564787864685, + "learning_rate": 1.379658613731622e-05, + "loss": 0.142, + "step": 38299 + }, + { + "epoch": 0.6831234616345022, + "grad_norm": 0.2092708796262741, + "learning_rate": 1.3795194695545672e-05, + "loss": 0.0835, + "step": 38300 + }, + { + "epoch": 0.6831412977562159, + "grad_norm": 0.3363591730594635, + "learning_rate": 1.3793803297208818e-05, + "loss": 0.093, + "step": 38301 + }, + { + "epoch": 0.6831591338779296, + "grad_norm": 0.36219626665115356, + "learning_rate": 1.3792411942311056e-05, + "loss": 0.0986, + "step": 38302 + }, + { + "epoch": 0.6831769699996433, + "grad_norm": 0.22186565399169922, + "learning_rate": 1.3791020630857764e-05, + "loss": 0.1252, + "step": 38303 + }, + { + "epoch": 0.683194806121357, + "grad_norm": 0.3064078390598297, + "learning_rate": 1.3789629362854351e-05, + "loss": 0.1475, + "step": 38304 + }, + { + "epoch": 0.6832126422430707, + "grad_norm": 0.22408241033554077, + "learning_rate": 1.3788238138306211e-05, + "loss": 0.1164, + "step": 38305 + }, + { + "epoch": 0.6832304783647843, + "grad_norm": 0.3340977132320404, + "learning_rate": 1.3786846957218736e-05, + "loss": 0.1263, + "step": 38306 + }, + { + "epoch": 0.683248314486498, + "grad_norm": 0.2569003403186798, + "learning_rate": 1.3785455819597303e-05, + "loss": 0.0868, + "step": 38307 + }, + { + "epoch": 0.6832661506082117, + "grad_norm": 0.32586854696273804, + "learning_rate": 1.378406472544733e-05, + "loss": 0.1143, + "step": 38308 + }, + { + "epoch": 0.6832839867299254, + "grad_norm": 0.34712693095207214, + "learning_rate": 1.3782673674774193e-05, + "loss": 0.1112, + "step": 38309 + }, + { + "epoch": 0.6833018228516391, + "grad_norm": 0.23863783478736877, + "learning_rate": 1.378128266758329e-05, + "loss": 0.1336, + "step": 38310 + }, + { + "epoch": 0.6833196589733528, + "grad_norm": 0.25527501106262207, + "learning_rate": 1.3779891703879997e-05, + "loss": 0.1086, + "step": 38311 + }, + { + "epoch": 0.6833374950950666, + "grad_norm": 0.3521053194999695, + "learning_rate": 1.3778500783669728e-05, + "loss": 0.1005, + "step": 38312 + }, + { + "epoch": 0.6833553312167803, + "grad_norm": 0.2253001183271408, + "learning_rate": 1.3777109906957869e-05, + "loss": 0.1092, + "step": 38313 + }, + { + "epoch": 0.683373167338494, + "grad_norm": 0.3078317642211914, + "learning_rate": 1.377571907374981e-05, + "loss": 0.1387, + "step": 38314 + }, + { + "epoch": 0.6833910034602076, + "grad_norm": 0.20839102566242218, + "learning_rate": 1.3774328284050936e-05, + "loss": 0.1106, + "step": 38315 + }, + { + "epoch": 0.6834088395819213, + "grad_norm": 0.23857508599758148, + "learning_rate": 1.3772937537866634e-05, + "loss": 0.1405, + "step": 38316 + }, + { + "epoch": 0.683426675703635, + "grad_norm": 0.3254587948322296, + "learning_rate": 1.3771546835202315e-05, + "loss": 0.1468, + "step": 38317 + }, + { + "epoch": 0.6834445118253487, + "grad_norm": 0.31444141268730164, + "learning_rate": 1.3770156176063347e-05, + "loss": 0.1434, + "step": 38318 + }, + { + "epoch": 0.6834623479470624, + "grad_norm": 0.3288553059101105, + "learning_rate": 1.3768765560455144e-05, + "loss": 0.1592, + "step": 38319 + }, + { + "epoch": 0.6834801840687761, + "grad_norm": 0.2745189070701599, + "learning_rate": 1.3767374988383075e-05, + "loss": 0.1503, + "step": 38320 + }, + { + "epoch": 0.6834980201904898, + "grad_norm": 0.26712653040885925, + "learning_rate": 1.3765984459852548e-05, + "loss": 0.1955, + "step": 38321 + }, + { + "epoch": 0.6835158563122035, + "grad_norm": 0.38259992003440857, + "learning_rate": 1.376459397486895e-05, + "loss": 0.1046, + "step": 38322 + }, + { + "epoch": 0.6835336924339172, + "grad_norm": 0.24853625893592834, + "learning_rate": 1.3763203533437663e-05, + "loss": 0.1463, + "step": 38323 + }, + { + "epoch": 0.6835515285556308, + "grad_norm": 0.3358849585056305, + "learning_rate": 1.3761813135564069e-05, + "loss": 0.136, + "step": 38324 + }, + { + "epoch": 0.6835693646773445, + "grad_norm": 0.21480616927146912, + "learning_rate": 1.376042278125358e-05, + "loss": 0.1159, + "step": 38325 + }, + { + "epoch": 0.6835872007990582, + "grad_norm": 0.23901121318340302, + "learning_rate": 1.3759032470511573e-05, + "loss": 0.082, + "step": 38326 + }, + { + "epoch": 0.6836050369207719, + "grad_norm": 0.2030390352010727, + "learning_rate": 1.3757642203343444e-05, + "loss": 0.1314, + "step": 38327 + }, + { + "epoch": 0.6836228730424856, + "grad_norm": 0.27238669991493225, + "learning_rate": 1.375625197975457e-05, + "loss": 0.1052, + "step": 38328 + }, + { + "epoch": 0.6836407091641994, + "grad_norm": 0.2330402433872223, + "learning_rate": 1.3754861799750341e-05, + "loss": 0.1505, + "step": 38329 + }, + { + "epoch": 0.6836585452859131, + "grad_norm": 0.256360799074173, + "learning_rate": 1.3753471663336158e-05, + "loss": 0.0734, + "step": 38330 + }, + { + "epoch": 0.6836763814076268, + "grad_norm": 0.16624684631824493, + "learning_rate": 1.37520815705174e-05, + "loss": 0.0455, + "step": 38331 + }, + { + "epoch": 0.6836942175293405, + "grad_norm": 0.20241419970989227, + "learning_rate": 1.3750691521299464e-05, + "loss": 0.0987, + "step": 38332 + }, + { + "epoch": 0.6837120536510541, + "grad_norm": 0.2972196340560913, + "learning_rate": 1.3749301515687724e-05, + "loss": 0.1215, + "step": 38333 + }, + { + "epoch": 0.6837298897727678, + "grad_norm": 0.20276778936386108, + "learning_rate": 1.3747911553687587e-05, + "loss": 0.0566, + "step": 38334 + }, + { + "epoch": 0.6837477258944815, + "grad_norm": 0.20941853523254395, + "learning_rate": 1.374652163530443e-05, + "loss": 0.1144, + "step": 38335 + }, + { + "epoch": 0.6837655620161952, + "grad_norm": 0.3057221472263336, + "learning_rate": 1.3745131760543643e-05, + "loss": 0.1115, + "step": 38336 + }, + { + "epoch": 0.6837833981379089, + "grad_norm": 0.22856928408145905, + "learning_rate": 1.3743741929410604e-05, + "loss": 0.128, + "step": 38337 + }, + { + "epoch": 0.6838012342596226, + "grad_norm": 0.3734128773212433, + "learning_rate": 1.3742352141910714e-05, + "loss": 0.1469, + "step": 38338 + }, + { + "epoch": 0.6838190703813363, + "grad_norm": 0.20477478206157684, + "learning_rate": 1.374096239804936e-05, + "loss": 0.1003, + "step": 38339 + }, + { + "epoch": 0.68383690650305, + "grad_norm": 0.2997352182865143, + "learning_rate": 1.3739572697831924e-05, + "loss": 0.1249, + "step": 38340 + }, + { + "epoch": 0.6838547426247636, + "grad_norm": 0.20431646704673767, + "learning_rate": 1.3738183041263791e-05, + "loss": 0.0743, + "step": 38341 + }, + { + "epoch": 0.6838725787464773, + "grad_norm": 0.23744510114192963, + "learning_rate": 1.3736793428350341e-05, + "loss": 0.127, + "step": 38342 + }, + { + "epoch": 0.683890414868191, + "grad_norm": 0.29310956597328186, + "learning_rate": 1.3735403859096982e-05, + "loss": 0.1529, + "step": 38343 + }, + { + "epoch": 0.6839082509899047, + "grad_norm": 0.2078559547662735, + "learning_rate": 1.3734014333509088e-05, + "loss": 0.0802, + "step": 38344 + }, + { + "epoch": 0.6839260871116184, + "grad_norm": 0.2983554005622864, + "learning_rate": 1.3732624851592035e-05, + "loss": 0.1148, + "step": 38345 + }, + { + "epoch": 0.6839439232333322, + "grad_norm": 0.30653002858161926, + "learning_rate": 1.3731235413351229e-05, + "loss": 0.1195, + "step": 38346 + }, + { + "epoch": 0.6839617593550459, + "grad_norm": 0.4141486585140228, + "learning_rate": 1.3729846018792037e-05, + "loss": 0.1093, + "step": 38347 + }, + { + "epoch": 0.6839795954767596, + "grad_norm": 0.22888389229774475, + "learning_rate": 1.3728456667919864e-05, + "loss": 0.0718, + "step": 38348 + }, + { + "epoch": 0.6839974315984733, + "grad_norm": 0.19211259484291077, + "learning_rate": 1.3727067360740086e-05, + "loss": 0.0865, + "step": 38349 + }, + { + "epoch": 0.684015267720187, + "grad_norm": 0.26405784487724304, + "learning_rate": 1.372567809725808e-05, + "loss": 0.0814, + "step": 38350 + }, + { + "epoch": 0.6840331038419006, + "grad_norm": 0.3216015696525574, + "learning_rate": 1.3724288877479249e-05, + "loss": 0.0942, + "step": 38351 + }, + { + "epoch": 0.6840509399636143, + "grad_norm": 0.26392215490341187, + "learning_rate": 1.372289970140897e-05, + "loss": 0.1289, + "step": 38352 + }, + { + "epoch": 0.684068776085328, + "grad_norm": 0.2547518312931061, + "learning_rate": 1.3721510569052622e-05, + "loss": 0.1365, + "step": 38353 + }, + { + "epoch": 0.6840866122070417, + "grad_norm": 0.25171855092048645, + "learning_rate": 1.3720121480415599e-05, + "loss": 0.1182, + "step": 38354 + }, + { + "epoch": 0.6841044483287554, + "grad_norm": 0.2351503223180771, + "learning_rate": 1.3718732435503271e-05, + "loss": 0.0804, + "step": 38355 + }, + { + "epoch": 0.6841222844504691, + "grad_norm": 0.2107308954000473, + "learning_rate": 1.3717343434321039e-05, + "loss": 0.0991, + "step": 38356 + }, + { + "epoch": 0.6841401205721828, + "grad_norm": 0.2591079771518707, + "learning_rate": 1.3715954476874285e-05, + "loss": 0.134, + "step": 38357 + }, + { + "epoch": 0.6841579566938965, + "grad_norm": 0.23552291095256805, + "learning_rate": 1.3714565563168386e-05, + "loss": 0.139, + "step": 38358 + }, + { + "epoch": 0.6841757928156101, + "grad_norm": 0.2855210602283478, + "learning_rate": 1.371317669320872e-05, + "loss": 0.1081, + "step": 38359 + }, + { + "epoch": 0.6841936289373238, + "grad_norm": 0.17529477179050446, + "learning_rate": 1.3711787867000681e-05, + "loss": 0.073, + "step": 38360 + }, + { + "epoch": 0.6842114650590375, + "grad_norm": 0.2917133867740631, + "learning_rate": 1.3710399084549657e-05, + "loss": 0.1084, + "step": 38361 + }, + { + "epoch": 0.6842293011807512, + "grad_norm": 0.27219751477241516, + "learning_rate": 1.370901034586103e-05, + "loss": 0.1488, + "step": 38362 + }, + { + "epoch": 0.684247137302465, + "grad_norm": 0.2616942822933197, + "learning_rate": 1.3707621650940166e-05, + "loss": 0.1259, + "step": 38363 + }, + { + "epoch": 0.6842649734241787, + "grad_norm": 0.32580018043518066, + "learning_rate": 1.370623299979247e-05, + "loss": 0.1324, + "step": 38364 + }, + { + "epoch": 0.6842828095458924, + "grad_norm": 0.4417336881160736, + "learning_rate": 1.3704844392423315e-05, + "loss": 0.1389, + "step": 38365 + }, + { + "epoch": 0.6843006456676061, + "grad_norm": 0.3365018665790558, + "learning_rate": 1.3703455828838089e-05, + "loss": 0.1153, + "step": 38366 + }, + { + "epoch": 0.6843184817893198, + "grad_norm": 0.3291773796081543, + "learning_rate": 1.3702067309042167e-05, + "loss": 0.1064, + "step": 38367 + }, + { + "epoch": 0.6843363179110334, + "grad_norm": 0.33683764934539795, + "learning_rate": 1.3700678833040926e-05, + "loss": 0.1101, + "step": 38368 + }, + { + "epoch": 0.6843541540327471, + "grad_norm": 0.33964017033576965, + "learning_rate": 1.3699290400839762e-05, + "loss": 0.1104, + "step": 38369 + }, + { + "epoch": 0.6843719901544608, + "grad_norm": 0.2485402375459671, + "learning_rate": 1.3697902012444053e-05, + "loss": 0.1207, + "step": 38370 + }, + { + "epoch": 0.6843898262761745, + "grad_norm": 0.25063398480415344, + "learning_rate": 1.3696513667859181e-05, + "loss": 0.1631, + "step": 38371 + }, + { + "epoch": 0.6844076623978882, + "grad_norm": 0.2872093617916107, + "learning_rate": 1.3695125367090517e-05, + "loss": 0.1166, + "step": 38372 + }, + { + "epoch": 0.6844254985196019, + "grad_norm": 0.3052406907081604, + "learning_rate": 1.3693737110143462e-05, + "loss": 0.1515, + "step": 38373 + }, + { + "epoch": 0.6844433346413156, + "grad_norm": 0.2045532763004303, + "learning_rate": 1.3692348897023374e-05, + "loss": 0.1178, + "step": 38374 + }, + { + "epoch": 0.6844611707630293, + "grad_norm": 0.27117741107940674, + "learning_rate": 1.3690960727735659e-05, + "loss": 0.1222, + "step": 38375 + }, + { + "epoch": 0.684479006884743, + "grad_norm": 0.3235396444797516, + "learning_rate": 1.3689572602285678e-05, + "loss": 0.1283, + "step": 38376 + }, + { + "epoch": 0.6844968430064566, + "grad_norm": 0.31171759963035583, + "learning_rate": 1.3688184520678827e-05, + "loss": 0.145, + "step": 38377 + }, + { + "epoch": 0.6845146791281703, + "grad_norm": 0.27213242650032043, + "learning_rate": 1.368679648292048e-05, + "loss": 0.0772, + "step": 38378 + }, + { + "epoch": 0.684532515249884, + "grad_norm": 0.2412949800491333, + "learning_rate": 1.3685408489016017e-05, + "loss": 0.1064, + "step": 38379 + }, + { + "epoch": 0.6845503513715978, + "grad_norm": 0.21386411786079407, + "learning_rate": 1.3684020538970821e-05, + "loss": 0.1321, + "step": 38380 + }, + { + "epoch": 0.6845681874933115, + "grad_norm": 0.22135698795318604, + "learning_rate": 1.368263263279026e-05, + "loss": 0.1181, + "step": 38381 + }, + { + "epoch": 0.6845860236150252, + "grad_norm": 0.22843441367149353, + "learning_rate": 1.3681244770479731e-05, + "loss": 0.114, + "step": 38382 + }, + { + "epoch": 0.6846038597367389, + "grad_norm": 0.25926870107650757, + "learning_rate": 1.3679856952044606e-05, + "loss": 0.1416, + "step": 38383 + }, + { + "epoch": 0.6846216958584526, + "grad_norm": 0.24429389834403992, + "learning_rate": 1.3678469177490268e-05, + "loss": 0.0902, + "step": 38384 + }, + { + "epoch": 0.6846395319801662, + "grad_norm": 0.27275997400283813, + "learning_rate": 1.3677081446822085e-05, + "loss": 0.1087, + "step": 38385 + }, + { + "epoch": 0.6846573681018799, + "grad_norm": 0.23695170879364014, + "learning_rate": 1.3675693760045451e-05, + "loss": 0.1212, + "step": 38386 + }, + { + "epoch": 0.6846752042235936, + "grad_norm": 0.27595528960227966, + "learning_rate": 1.3674306117165733e-05, + "loss": 0.1028, + "step": 38387 + }, + { + "epoch": 0.6846930403453073, + "grad_norm": 0.2717142701148987, + "learning_rate": 1.3672918518188326e-05, + "loss": 0.1162, + "step": 38388 + }, + { + "epoch": 0.684710876467021, + "grad_norm": 0.3610185980796814, + "learning_rate": 1.3671530963118587e-05, + "loss": 0.0841, + "step": 38389 + }, + { + "epoch": 0.6847287125887347, + "grad_norm": 0.2965988218784332, + "learning_rate": 1.3670143451961918e-05, + "loss": 0.1139, + "step": 38390 + }, + { + "epoch": 0.6847465487104484, + "grad_norm": 0.30460888147354126, + "learning_rate": 1.3668755984723686e-05, + "loss": 0.1133, + "step": 38391 + }, + { + "epoch": 0.6847643848321621, + "grad_norm": 0.24197007715702057, + "learning_rate": 1.3667368561409268e-05, + "loss": 0.1507, + "step": 38392 + }, + { + "epoch": 0.6847822209538758, + "grad_norm": 0.24504749476909637, + "learning_rate": 1.3665981182024045e-05, + "loss": 0.128, + "step": 38393 + }, + { + "epoch": 0.6848000570755894, + "grad_norm": 0.24964097142219543, + "learning_rate": 1.3664593846573385e-05, + "loss": 0.0588, + "step": 38394 + }, + { + "epoch": 0.6848178931973031, + "grad_norm": 0.3125212490558624, + "learning_rate": 1.366320655506268e-05, + "loss": 0.1828, + "step": 38395 + }, + { + "epoch": 0.6848357293190169, + "grad_norm": 0.2686334550380707, + "learning_rate": 1.3661819307497306e-05, + "loss": 0.1087, + "step": 38396 + }, + { + "epoch": 0.6848535654407306, + "grad_norm": 0.24931973218917847, + "learning_rate": 1.3660432103882636e-05, + "loss": 0.0604, + "step": 38397 + }, + { + "epoch": 0.6848714015624443, + "grad_norm": 0.2861267328262329, + "learning_rate": 1.3659044944224036e-05, + "loss": 0.1116, + "step": 38398 + }, + { + "epoch": 0.684889237684158, + "grad_norm": 0.22131329774856567, + "learning_rate": 1.3657657828526907e-05, + "loss": 0.1297, + "step": 38399 + }, + { + "epoch": 0.6849070738058717, + "grad_norm": 0.25230592489242554, + "learning_rate": 1.3656270756796613e-05, + "loss": 0.1175, + "step": 38400 + }, + { + "epoch": 0.6849249099275854, + "grad_norm": 0.3339843451976776, + "learning_rate": 1.365488372903852e-05, + "loss": 0.1094, + "step": 38401 + }, + { + "epoch": 0.6849427460492991, + "grad_norm": 0.33061057329177856, + "learning_rate": 1.3653496745258027e-05, + "loss": 0.1825, + "step": 38402 + }, + { + "epoch": 0.6849605821710127, + "grad_norm": 0.21584612131118774, + "learning_rate": 1.365210980546049e-05, + "loss": 0.1065, + "step": 38403 + }, + { + "epoch": 0.6849784182927264, + "grad_norm": 0.296165406703949, + "learning_rate": 1.3650722909651303e-05, + "loss": 0.1768, + "step": 38404 + }, + { + "epoch": 0.6849962544144401, + "grad_norm": 0.2808534502983093, + "learning_rate": 1.3649336057835838e-05, + "loss": 0.1602, + "step": 38405 + }, + { + "epoch": 0.6850140905361538, + "grad_norm": 0.18003782629966736, + "learning_rate": 1.3647949250019465e-05, + "loss": 0.0663, + "step": 38406 + }, + { + "epoch": 0.6850319266578675, + "grad_norm": 0.252089262008667, + "learning_rate": 1.364656248620755e-05, + "loss": 0.0969, + "step": 38407 + }, + { + "epoch": 0.6850497627795812, + "grad_norm": 0.27056699991226196, + "learning_rate": 1.3645175766405493e-05, + "loss": 0.0966, + "step": 38408 + }, + { + "epoch": 0.6850675989012949, + "grad_norm": 0.27369362115859985, + "learning_rate": 1.3643789090618652e-05, + "loss": 0.1212, + "step": 38409 + }, + { + "epoch": 0.6850854350230086, + "grad_norm": 0.354015052318573, + "learning_rate": 1.364240245885241e-05, + "loss": 0.1306, + "step": 38410 + }, + { + "epoch": 0.6851032711447222, + "grad_norm": 0.20980007946491241, + "learning_rate": 1.3641015871112129e-05, + "loss": 0.1456, + "step": 38411 + }, + { + "epoch": 0.6851211072664359, + "grad_norm": 0.21975672245025635, + "learning_rate": 1.3639629327403203e-05, + "loss": 0.1267, + "step": 38412 + }, + { + "epoch": 0.6851389433881497, + "grad_norm": 0.2790791988372803, + "learning_rate": 1.3638242827730998e-05, + "loss": 0.1295, + "step": 38413 + }, + { + "epoch": 0.6851567795098634, + "grad_norm": 0.28701257705688477, + "learning_rate": 1.3636856372100875e-05, + "loss": 0.0835, + "step": 38414 + }, + { + "epoch": 0.6851746156315771, + "grad_norm": 0.41110920906066895, + "learning_rate": 1.3635469960518237e-05, + "loss": 0.1311, + "step": 38415 + }, + { + "epoch": 0.6851924517532908, + "grad_norm": 0.22192950546741486, + "learning_rate": 1.3634083592988428e-05, + "loss": 0.0983, + "step": 38416 + }, + { + "epoch": 0.6852102878750045, + "grad_norm": 0.3859463036060333, + "learning_rate": 1.363269726951685e-05, + "loss": 0.1042, + "step": 38417 + }, + { + "epoch": 0.6852281239967182, + "grad_norm": 0.26612839102745056, + "learning_rate": 1.3631310990108862e-05, + "loss": 0.1137, + "step": 38418 + }, + { + "epoch": 0.6852459601184319, + "grad_norm": 0.2530892491340637, + "learning_rate": 1.362992475476984e-05, + "loss": 0.1498, + "step": 38419 + }, + { + "epoch": 0.6852637962401456, + "grad_norm": 0.2104545384645462, + "learning_rate": 1.3628538563505144e-05, + "loss": 0.1032, + "step": 38420 + }, + { + "epoch": 0.6852816323618592, + "grad_norm": 0.18383771181106567, + "learning_rate": 1.362715241632017e-05, + "loss": 0.0644, + "step": 38421 + }, + { + "epoch": 0.6852994684835729, + "grad_norm": 0.27254554629325867, + "learning_rate": 1.3625766313220285e-05, + "loss": 0.1366, + "step": 38422 + }, + { + "epoch": 0.6853173046052866, + "grad_norm": 0.23755870759487152, + "learning_rate": 1.3624380254210855e-05, + "loss": 0.1389, + "step": 38423 + }, + { + "epoch": 0.6853351407270003, + "grad_norm": 0.22425946593284607, + "learning_rate": 1.3622994239297248e-05, + "loss": 0.0954, + "step": 38424 + }, + { + "epoch": 0.685352976848714, + "grad_norm": 0.25088125467300415, + "learning_rate": 1.3621608268484857e-05, + "loss": 0.0972, + "step": 38425 + }, + { + "epoch": 0.6853708129704277, + "grad_norm": 0.2909787893295288, + "learning_rate": 1.3620222341779038e-05, + "loss": 0.1341, + "step": 38426 + }, + { + "epoch": 0.6853886490921414, + "grad_norm": 0.32008832693099976, + "learning_rate": 1.361883645918517e-05, + "loss": 0.1082, + "step": 38427 + }, + { + "epoch": 0.685406485213855, + "grad_norm": 0.22382010519504547, + "learning_rate": 1.3617450620708613e-05, + "loss": 0.1136, + "step": 38428 + }, + { + "epoch": 0.6854243213355687, + "grad_norm": 0.23693999648094177, + "learning_rate": 1.3616064826354757e-05, + "loss": 0.1355, + "step": 38429 + }, + { + "epoch": 0.6854421574572825, + "grad_norm": 0.26569345593452454, + "learning_rate": 1.3614679076128958e-05, + "loss": 0.0922, + "step": 38430 + }, + { + "epoch": 0.6854599935789962, + "grad_norm": 0.2200886756181717, + "learning_rate": 1.3613293370036604e-05, + "loss": 0.105, + "step": 38431 + }, + { + "epoch": 0.6854778297007099, + "grad_norm": 0.43981683254241943, + "learning_rate": 1.3611907708083057e-05, + "loss": 0.0853, + "step": 38432 + }, + { + "epoch": 0.6854956658224236, + "grad_norm": 0.2803592383861542, + "learning_rate": 1.3610522090273681e-05, + "loss": 0.1255, + "step": 38433 + }, + { + "epoch": 0.6855135019441373, + "grad_norm": 0.3195376396179199, + "learning_rate": 1.3609136516613863e-05, + "loss": 0.1108, + "step": 38434 + }, + { + "epoch": 0.685531338065851, + "grad_norm": 0.24816769361495972, + "learning_rate": 1.3607750987108966e-05, + "loss": 0.1501, + "step": 38435 + }, + { + "epoch": 0.6855491741875647, + "grad_norm": 0.34660109877586365, + "learning_rate": 1.3606365501764363e-05, + "loss": 0.1284, + "step": 38436 + }, + { + "epoch": 0.6855670103092784, + "grad_norm": 0.29819396138191223, + "learning_rate": 1.3604980060585412e-05, + "loss": 0.1359, + "step": 38437 + }, + { + "epoch": 0.685584846430992, + "grad_norm": 0.2540644109249115, + "learning_rate": 1.3603594663577501e-05, + "loss": 0.0885, + "step": 38438 + }, + { + "epoch": 0.6856026825527057, + "grad_norm": 0.2896649241447449, + "learning_rate": 1.3602209310745998e-05, + "loss": 0.1182, + "step": 38439 + }, + { + "epoch": 0.6856205186744194, + "grad_norm": 0.2289072722196579, + "learning_rate": 1.3600824002096265e-05, + "loss": 0.1208, + "step": 38440 + }, + { + "epoch": 0.6856383547961331, + "grad_norm": 0.2511662244796753, + "learning_rate": 1.3599438737633668e-05, + "loss": 0.1296, + "step": 38441 + }, + { + "epoch": 0.6856561909178468, + "grad_norm": 0.20983365178108215, + "learning_rate": 1.359805351736359e-05, + "loss": 0.1108, + "step": 38442 + }, + { + "epoch": 0.6856740270395605, + "grad_norm": 0.22050683200359344, + "learning_rate": 1.3596668341291391e-05, + "loss": 0.1295, + "step": 38443 + }, + { + "epoch": 0.6856918631612742, + "grad_norm": 0.22735293209552765, + "learning_rate": 1.3595283209422449e-05, + "loss": 0.1064, + "step": 38444 + }, + { + "epoch": 0.6857096992829879, + "grad_norm": 0.23720484972000122, + "learning_rate": 1.3593898121762128e-05, + "loss": 0.0864, + "step": 38445 + }, + { + "epoch": 0.6857275354047015, + "grad_norm": 0.21472570300102234, + "learning_rate": 1.3592513078315792e-05, + "loss": 0.1094, + "step": 38446 + }, + { + "epoch": 0.6857453715264153, + "grad_norm": 0.20450402796268463, + "learning_rate": 1.3591128079088823e-05, + "loss": 0.1393, + "step": 38447 + }, + { + "epoch": 0.685763207648129, + "grad_norm": 0.22828178107738495, + "learning_rate": 1.3589743124086578e-05, + "loss": 0.0868, + "step": 38448 + }, + { + "epoch": 0.6857810437698427, + "grad_norm": 0.20493710041046143, + "learning_rate": 1.3588358213314433e-05, + "loss": 0.0888, + "step": 38449 + }, + { + "epoch": 0.6857988798915564, + "grad_norm": 0.27139800786972046, + "learning_rate": 1.3586973346777745e-05, + "loss": 0.1201, + "step": 38450 + }, + { + "epoch": 0.6858167160132701, + "grad_norm": 0.21251267194747925, + "learning_rate": 1.35855885244819e-05, + "loss": 0.0965, + "step": 38451 + }, + { + "epoch": 0.6858345521349838, + "grad_norm": 0.38066670298576355, + "learning_rate": 1.3584203746432253e-05, + "loss": 0.1166, + "step": 38452 + }, + { + "epoch": 0.6858523882566975, + "grad_norm": 0.19657441973686218, + "learning_rate": 1.3582819012634176e-05, + "loss": 0.0917, + "step": 38453 + }, + { + "epoch": 0.6858702243784112, + "grad_norm": 0.25900018215179443, + "learning_rate": 1.3581434323093028e-05, + "loss": 0.1246, + "step": 38454 + }, + { + "epoch": 0.6858880605001249, + "grad_norm": 0.23921701312065125, + "learning_rate": 1.3580049677814194e-05, + "loss": 0.1083, + "step": 38455 + }, + { + "epoch": 0.6859058966218385, + "grad_norm": 0.2900693118572235, + "learning_rate": 1.3578665076803029e-05, + "loss": 0.1344, + "step": 38456 + }, + { + "epoch": 0.6859237327435522, + "grad_norm": 0.3682699203491211, + "learning_rate": 1.3577280520064894e-05, + "loss": 0.1887, + "step": 38457 + }, + { + "epoch": 0.6859415688652659, + "grad_norm": 0.20448656380176544, + "learning_rate": 1.3575896007605177e-05, + "loss": 0.0877, + "step": 38458 + }, + { + "epoch": 0.6859594049869796, + "grad_norm": 0.31979233026504517, + "learning_rate": 1.3574511539429221e-05, + "loss": 0.1047, + "step": 38459 + }, + { + "epoch": 0.6859772411086933, + "grad_norm": 0.373167484998703, + "learning_rate": 1.3573127115542417e-05, + "loss": 0.1444, + "step": 38460 + }, + { + "epoch": 0.685995077230407, + "grad_norm": 0.22824648022651672, + "learning_rate": 1.3571742735950117e-05, + "loss": 0.0922, + "step": 38461 + }, + { + "epoch": 0.6860129133521207, + "grad_norm": 0.24930021166801453, + "learning_rate": 1.357035840065769e-05, + "loss": 0.1101, + "step": 38462 + }, + { + "epoch": 0.6860307494738344, + "grad_norm": 0.23855338990688324, + "learning_rate": 1.3568974109670491e-05, + "loss": 0.1545, + "step": 38463 + }, + { + "epoch": 0.6860485855955482, + "grad_norm": 0.2848866879940033, + "learning_rate": 1.3567589862993906e-05, + "loss": 0.1249, + "step": 38464 + }, + { + "epoch": 0.6860664217172618, + "grad_norm": 0.3375401198863983, + "learning_rate": 1.356620566063329e-05, + "loss": 0.1366, + "step": 38465 + }, + { + "epoch": 0.6860842578389755, + "grad_norm": 0.24678131937980652, + "learning_rate": 1.3564821502594013e-05, + "loss": 0.0966, + "step": 38466 + }, + { + "epoch": 0.6861020939606892, + "grad_norm": 0.303362101316452, + "learning_rate": 1.3563437388881428e-05, + "loss": 0.1356, + "step": 38467 + }, + { + "epoch": 0.6861199300824029, + "grad_norm": 0.28642475605010986, + "learning_rate": 1.3562053319500917e-05, + "loss": 0.1228, + "step": 38468 + }, + { + "epoch": 0.6861377662041166, + "grad_norm": 0.2539806067943573, + "learning_rate": 1.356066929445784e-05, + "loss": 0.0862, + "step": 38469 + }, + { + "epoch": 0.6861556023258303, + "grad_norm": 0.28071328997612, + "learning_rate": 1.3559285313757548e-05, + "loss": 0.1161, + "step": 38470 + }, + { + "epoch": 0.686173438447544, + "grad_norm": 0.28178900480270386, + "learning_rate": 1.355790137740543e-05, + "loss": 0.1385, + "step": 38471 + }, + { + "epoch": 0.6861912745692577, + "grad_norm": 0.28673335909843445, + "learning_rate": 1.3556517485406824e-05, + "loss": 0.1373, + "step": 38472 + }, + { + "epoch": 0.6862091106909713, + "grad_norm": 0.25109684467315674, + "learning_rate": 1.355513363776712e-05, + "loss": 0.1262, + "step": 38473 + }, + { + "epoch": 0.686226946812685, + "grad_norm": 0.20516128838062286, + "learning_rate": 1.3553749834491675e-05, + "loss": 0.0937, + "step": 38474 + }, + { + "epoch": 0.6862447829343987, + "grad_norm": 0.3105356693267822, + "learning_rate": 1.3552366075585845e-05, + "loss": 0.0899, + "step": 38475 + }, + { + "epoch": 0.6862626190561124, + "grad_norm": 0.4162426292896271, + "learning_rate": 1.355098236105499e-05, + "loss": 0.1084, + "step": 38476 + }, + { + "epoch": 0.6862804551778261, + "grad_norm": 0.25846436619758606, + "learning_rate": 1.3549598690904491e-05, + "loss": 0.1701, + "step": 38477 + }, + { + "epoch": 0.6862982912995398, + "grad_norm": 0.23674516379833221, + "learning_rate": 1.3548215065139703e-05, + "loss": 0.1402, + "step": 38478 + }, + { + "epoch": 0.6863161274212535, + "grad_norm": 0.2504197359085083, + "learning_rate": 1.3546831483765987e-05, + "loss": 0.2215, + "step": 38479 + }, + { + "epoch": 0.6863339635429672, + "grad_norm": 0.3130221962928772, + "learning_rate": 1.3545447946788698e-05, + "loss": 0.194, + "step": 38480 + }, + { + "epoch": 0.686351799664681, + "grad_norm": 0.36830708384513855, + "learning_rate": 1.354406445421322e-05, + "loss": 0.1372, + "step": 38481 + }, + { + "epoch": 0.6863696357863946, + "grad_norm": 0.2909860908985138, + "learning_rate": 1.3542681006044904e-05, + "loss": 0.1051, + "step": 38482 + }, + { + "epoch": 0.6863874719081083, + "grad_norm": 0.28509071469306946, + "learning_rate": 1.3541297602289115e-05, + "loss": 0.1637, + "step": 38483 + }, + { + "epoch": 0.686405308029822, + "grad_norm": 0.3719830811023712, + "learning_rate": 1.3539914242951207e-05, + "loss": 0.0584, + "step": 38484 + }, + { + "epoch": 0.6864231441515357, + "grad_norm": 0.23574262857437134, + "learning_rate": 1.3538530928036544e-05, + "loss": 0.1236, + "step": 38485 + }, + { + "epoch": 0.6864409802732494, + "grad_norm": 0.30539509654045105, + "learning_rate": 1.3537147657550508e-05, + "loss": 0.1274, + "step": 38486 + }, + { + "epoch": 0.6864588163949631, + "grad_norm": 0.2478194534778595, + "learning_rate": 1.3535764431498443e-05, + "loss": 0.1198, + "step": 38487 + }, + { + "epoch": 0.6864766525166768, + "grad_norm": 0.4017721712589264, + "learning_rate": 1.3534381249885719e-05, + "loss": 0.1046, + "step": 38488 + }, + { + "epoch": 0.6864944886383905, + "grad_norm": 0.4043966233730316, + "learning_rate": 1.353299811271768e-05, + "loss": 0.1052, + "step": 38489 + }, + { + "epoch": 0.6865123247601042, + "grad_norm": 0.19515439867973328, + "learning_rate": 1.353161501999971e-05, + "loss": 0.1162, + "step": 38490 + }, + { + "epoch": 0.6865301608818178, + "grad_norm": 0.27910029888153076, + "learning_rate": 1.3530231971737164e-05, + "loss": 0.1105, + "step": 38491 + }, + { + "epoch": 0.6865479970035315, + "grad_norm": 0.20086170732975006, + "learning_rate": 1.35288489679354e-05, + "loss": 0.0981, + "step": 38492 + }, + { + "epoch": 0.6865658331252452, + "grad_norm": 0.28241032361984253, + "learning_rate": 1.352746600859977e-05, + "loss": 0.1237, + "step": 38493 + }, + { + "epoch": 0.6865836692469589, + "grad_norm": 0.23580418527126312, + "learning_rate": 1.3526083093735654e-05, + "loss": 0.1228, + "step": 38494 + }, + { + "epoch": 0.6866015053686726, + "grad_norm": 0.220314159989357, + "learning_rate": 1.3524700223348402e-05, + "loss": 0.1004, + "step": 38495 + }, + { + "epoch": 0.6866193414903863, + "grad_norm": 0.22770333290100098, + "learning_rate": 1.3523317397443374e-05, + "loss": 0.1139, + "step": 38496 + }, + { + "epoch": 0.6866371776121001, + "grad_norm": 0.2761106491088867, + "learning_rate": 1.3521934616025922e-05, + "loss": 0.111, + "step": 38497 + }, + { + "epoch": 0.6866550137338138, + "grad_norm": 0.3761462867259979, + "learning_rate": 1.3520551879101428e-05, + "loss": 0.1025, + "step": 38498 + }, + { + "epoch": 0.6866728498555275, + "grad_norm": 0.33922600746154785, + "learning_rate": 1.351916918667523e-05, + "loss": 0.138, + "step": 38499 + }, + { + "epoch": 0.6866906859772411, + "grad_norm": 0.2886148691177368, + "learning_rate": 1.3517786538752705e-05, + "loss": 0.1147, + "step": 38500 + }, + { + "epoch": 0.6867085220989548, + "grad_norm": 0.3000772297382355, + "learning_rate": 1.3516403935339206e-05, + "loss": 0.1564, + "step": 38501 + }, + { + "epoch": 0.6867263582206685, + "grad_norm": 0.3180055022239685, + "learning_rate": 1.3515021376440084e-05, + "loss": 0.1327, + "step": 38502 + }, + { + "epoch": 0.6867441943423822, + "grad_norm": 0.33008190989494324, + "learning_rate": 1.3513638862060712e-05, + "loss": 0.1085, + "step": 38503 + }, + { + "epoch": 0.6867620304640959, + "grad_norm": 0.2843044698238373, + "learning_rate": 1.3512256392206445e-05, + "loss": 0.0941, + "step": 38504 + }, + { + "epoch": 0.6867798665858096, + "grad_norm": 0.26780855655670166, + "learning_rate": 1.3510873966882642e-05, + "loss": 0.1185, + "step": 38505 + }, + { + "epoch": 0.6867977027075233, + "grad_norm": 0.3334581255912781, + "learning_rate": 1.3509491586094646e-05, + "loss": 0.1329, + "step": 38506 + }, + { + "epoch": 0.686815538829237, + "grad_norm": 0.2519510090351105, + "learning_rate": 1.3508109249847845e-05, + "loss": 0.0916, + "step": 38507 + }, + { + "epoch": 0.6868333749509506, + "grad_norm": 0.28137853741645813, + "learning_rate": 1.3506726958147575e-05, + "loss": 0.1208, + "step": 38508 + }, + { + "epoch": 0.6868512110726643, + "grad_norm": 0.2665141522884369, + "learning_rate": 1.3505344710999205e-05, + "loss": 0.0902, + "step": 38509 + }, + { + "epoch": 0.686869047194378, + "grad_norm": 0.315267950296402, + "learning_rate": 1.3503962508408076e-05, + "loss": 0.1557, + "step": 38510 + }, + { + "epoch": 0.6868868833160917, + "grad_norm": 0.2413836121559143, + "learning_rate": 1.3502580350379573e-05, + "loss": 0.0974, + "step": 38511 + }, + { + "epoch": 0.6869047194378054, + "grad_norm": 0.31613633036613464, + "learning_rate": 1.3501198236919039e-05, + "loss": 0.1211, + "step": 38512 + }, + { + "epoch": 0.6869225555595191, + "grad_norm": 0.18966291844844818, + "learning_rate": 1.349981616803182e-05, + "loss": 0.1067, + "step": 38513 + }, + { + "epoch": 0.6869403916812329, + "grad_norm": 0.19223885238170624, + "learning_rate": 1.3498434143723293e-05, + "loss": 0.0995, + "step": 38514 + }, + { + "epoch": 0.6869582278029466, + "grad_norm": 0.2586239278316498, + "learning_rate": 1.3497052163998803e-05, + "loss": 0.132, + "step": 38515 + }, + { + "epoch": 0.6869760639246603, + "grad_norm": 0.27736833691596985, + "learning_rate": 1.3495670228863721e-05, + "loss": 0.1148, + "step": 38516 + }, + { + "epoch": 0.686993900046374, + "grad_norm": 0.33557626605033875, + "learning_rate": 1.3494288338323392e-05, + "loss": 0.1689, + "step": 38517 + }, + { + "epoch": 0.6870117361680876, + "grad_norm": 0.286602258682251, + "learning_rate": 1.3492906492383179e-05, + "loss": 0.168, + "step": 38518 + }, + { + "epoch": 0.6870295722898013, + "grad_norm": 0.2325119525194168, + "learning_rate": 1.3491524691048422e-05, + "loss": 0.1071, + "step": 38519 + }, + { + "epoch": 0.687047408411515, + "grad_norm": 0.3034343719482422, + "learning_rate": 1.3490142934324502e-05, + "loss": 0.1629, + "step": 38520 + }, + { + "epoch": 0.6870652445332287, + "grad_norm": 0.23229782283306122, + "learning_rate": 1.3488761222216761e-05, + "loss": 0.0948, + "step": 38521 + }, + { + "epoch": 0.6870830806549424, + "grad_norm": 0.26982471346855164, + "learning_rate": 1.3487379554730556e-05, + "loss": 0.1357, + "step": 38522 + }, + { + "epoch": 0.6871009167766561, + "grad_norm": 0.22699788212776184, + "learning_rate": 1.3485997931871236e-05, + "loss": 0.1045, + "step": 38523 + }, + { + "epoch": 0.6871187528983698, + "grad_norm": 0.28402313590049744, + "learning_rate": 1.3484616353644177e-05, + "loss": 0.1647, + "step": 38524 + }, + { + "epoch": 0.6871365890200835, + "grad_norm": 0.26465165615081787, + "learning_rate": 1.3483234820054722e-05, + "loss": 0.0935, + "step": 38525 + }, + { + "epoch": 0.6871544251417971, + "grad_norm": 0.31918010115623474, + "learning_rate": 1.3481853331108213e-05, + "loss": 0.1325, + "step": 38526 + }, + { + "epoch": 0.6871722612635108, + "grad_norm": 0.26944291591644287, + "learning_rate": 1.3480471886810031e-05, + "loss": 0.1303, + "step": 38527 + }, + { + "epoch": 0.6871900973852245, + "grad_norm": 0.21865399181842804, + "learning_rate": 1.3479090487165511e-05, + "loss": 0.0884, + "step": 38528 + }, + { + "epoch": 0.6872079335069382, + "grad_norm": 0.41744452714920044, + "learning_rate": 1.3477709132180023e-05, + "loss": 0.0937, + "step": 38529 + }, + { + "epoch": 0.6872257696286519, + "grad_norm": 0.30443131923675537, + "learning_rate": 1.3476327821858913e-05, + "loss": 0.1469, + "step": 38530 + }, + { + "epoch": 0.6872436057503657, + "grad_norm": 0.2349502444267273, + "learning_rate": 1.347494655620754e-05, + "loss": 0.1587, + "step": 38531 + }, + { + "epoch": 0.6872614418720794, + "grad_norm": 0.2678750157356262, + "learning_rate": 1.3473565335231241e-05, + "loss": 0.1344, + "step": 38532 + }, + { + "epoch": 0.6872792779937931, + "grad_norm": 0.2812378406524658, + "learning_rate": 1.3472184158935396e-05, + "loss": 0.0911, + "step": 38533 + }, + { + "epoch": 0.6872971141155068, + "grad_norm": 0.3774079978466034, + "learning_rate": 1.3470803027325345e-05, + "loss": 0.1354, + "step": 38534 + }, + { + "epoch": 0.6873149502372204, + "grad_norm": 0.23949551582336426, + "learning_rate": 1.3469421940406445e-05, + "loss": 0.124, + "step": 38535 + }, + { + "epoch": 0.6873327863589341, + "grad_norm": 0.26348742842674255, + "learning_rate": 1.3468040898184042e-05, + "loss": 0.0877, + "step": 38536 + }, + { + "epoch": 0.6873506224806478, + "grad_norm": 0.3536117374897003, + "learning_rate": 1.34666599006635e-05, + "loss": 0.149, + "step": 38537 + }, + { + "epoch": 0.6873684586023615, + "grad_norm": 0.20122745633125305, + "learning_rate": 1.3465278947850169e-05, + "loss": 0.1203, + "step": 38538 + }, + { + "epoch": 0.6873862947240752, + "grad_norm": 0.2623310387134552, + "learning_rate": 1.34638980397494e-05, + "loss": 0.1135, + "step": 38539 + }, + { + "epoch": 0.6874041308457889, + "grad_norm": 0.25378933548927307, + "learning_rate": 1.346251717636654e-05, + "loss": 0.1029, + "step": 38540 + }, + { + "epoch": 0.6874219669675026, + "grad_norm": 0.265180379152298, + "learning_rate": 1.3461136357706944e-05, + "loss": 0.1229, + "step": 38541 + }, + { + "epoch": 0.6874398030892163, + "grad_norm": 0.2195325642824173, + "learning_rate": 1.345975558377598e-05, + "loss": 0.1174, + "step": 38542 + }, + { + "epoch": 0.68745763921093, + "grad_norm": 0.18819034099578857, + "learning_rate": 1.3458374854578993e-05, + "loss": 0.0931, + "step": 38543 + }, + { + "epoch": 0.6874754753326436, + "grad_norm": 0.3780516982078552, + "learning_rate": 1.3456994170121326e-05, + "loss": 0.1421, + "step": 38544 + }, + { + "epoch": 0.6874933114543573, + "grad_norm": 0.2784189283847809, + "learning_rate": 1.345561353040833e-05, + "loss": 0.1254, + "step": 38545 + }, + { + "epoch": 0.687511147576071, + "grad_norm": 0.2901017963886261, + "learning_rate": 1.345423293544537e-05, + "loss": 0.088, + "step": 38546 + }, + { + "epoch": 0.6875289836977847, + "grad_norm": 0.23261678218841553, + "learning_rate": 1.3452852385237796e-05, + "loss": 0.0897, + "step": 38547 + }, + { + "epoch": 0.6875468198194985, + "grad_norm": 0.3297034204006195, + "learning_rate": 1.345147187979095e-05, + "loss": 0.1303, + "step": 38548 + }, + { + "epoch": 0.6875646559412122, + "grad_norm": 0.37196218967437744, + "learning_rate": 1.3450091419110178e-05, + "loss": 0.1222, + "step": 38549 + }, + { + "epoch": 0.6875824920629259, + "grad_norm": 0.27657291293144226, + "learning_rate": 1.3448711003200853e-05, + "loss": 0.1868, + "step": 38550 + }, + { + "epoch": 0.6876003281846396, + "grad_norm": 0.2765156924724579, + "learning_rate": 1.344733063206831e-05, + "loss": 0.1653, + "step": 38551 + }, + { + "epoch": 0.6876181643063533, + "grad_norm": 0.23933175206184387, + "learning_rate": 1.3445950305717909e-05, + "loss": 0.1293, + "step": 38552 + }, + { + "epoch": 0.6876360004280669, + "grad_norm": 0.21674111485481262, + "learning_rate": 1.3444570024154984e-05, + "loss": 0.0883, + "step": 38553 + }, + { + "epoch": 0.6876538365497806, + "grad_norm": 0.20923608541488647, + "learning_rate": 1.3443189787384904e-05, + "loss": 0.1312, + "step": 38554 + }, + { + "epoch": 0.6876716726714943, + "grad_norm": 0.371404767036438, + "learning_rate": 1.3441809595413005e-05, + "loss": 0.2429, + "step": 38555 + }, + { + "epoch": 0.687689508793208, + "grad_norm": 0.2733886241912842, + "learning_rate": 1.3440429448244652e-05, + "loss": 0.1407, + "step": 38556 + }, + { + "epoch": 0.6877073449149217, + "grad_norm": 0.35108649730682373, + "learning_rate": 1.3439049345885188e-05, + "loss": 0.0941, + "step": 38557 + }, + { + "epoch": 0.6877251810366354, + "grad_norm": 0.28271621465682983, + "learning_rate": 1.343766928833995e-05, + "loss": 0.0989, + "step": 38558 + }, + { + "epoch": 0.6877430171583491, + "grad_norm": 0.3597377836704254, + "learning_rate": 1.3436289275614311e-05, + "loss": 0.1225, + "step": 38559 + }, + { + "epoch": 0.6877608532800628, + "grad_norm": 0.22796548902988434, + "learning_rate": 1.3434909307713609e-05, + "loss": 0.1444, + "step": 38560 + }, + { + "epoch": 0.6877786894017764, + "grad_norm": 0.21248580515384674, + "learning_rate": 1.3433529384643193e-05, + "loss": 0.1422, + "step": 38561 + }, + { + "epoch": 0.6877965255234901, + "grad_norm": 0.21324306726455688, + "learning_rate": 1.34321495064084e-05, + "loss": 0.1153, + "step": 38562 + }, + { + "epoch": 0.6878143616452038, + "grad_norm": 0.259677529335022, + "learning_rate": 1.3430769673014604e-05, + "loss": 0.1398, + "step": 38563 + }, + { + "epoch": 0.6878321977669175, + "grad_norm": 0.18702569603919983, + "learning_rate": 1.3429389884467137e-05, + "loss": 0.117, + "step": 38564 + }, + { + "epoch": 0.6878500338886313, + "grad_norm": 0.30660855770111084, + "learning_rate": 1.3428010140771351e-05, + "loss": 0.105, + "step": 38565 + }, + { + "epoch": 0.687867870010345, + "grad_norm": 0.19275590777397156, + "learning_rate": 1.34266304419326e-05, + "loss": 0.1139, + "step": 38566 + }, + { + "epoch": 0.6878857061320587, + "grad_norm": 0.2929665744304657, + "learning_rate": 1.3425250787956212e-05, + "loss": 0.11, + "step": 38567 + }, + { + "epoch": 0.6879035422537724, + "grad_norm": 0.3055505156517029, + "learning_rate": 1.3423871178847552e-05, + "loss": 0.1565, + "step": 38568 + }, + { + "epoch": 0.6879213783754861, + "grad_norm": 0.2522900402545929, + "learning_rate": 1.3422491614611976e-05, + "loss": 0.2286, + "step": 38569 + }, + { + "epoch": 0.6879392144971997, + "grad_norm": 0.37942132353782654, + "learning_rate": 1.3421112095254817e-05, + "loss": 0.0962, + "step": 38570 + }, + { + "epoch": 0.6879570506189134, + "grad_norm": 0.26580050587654114, + "learning_rate": 1.3419732620781422e-05, + "loss": 0.1521, + "step": 38571 + }, + { + "epoch": 0.6879748867406271, + "grad_norm": 0.3553601801395416, + "learning_rate": 1.341835319119715e-05, + "loss": 0.1485, + "step": 38572 + }, + { + "epoch": 0.6879927228623408, + "grad_norm": 0.3606205880641937, + "learning_rate": 1.3416973806507344e-05, + "loss": 0.1924, + "step": 38573 + }, + { + "epoch": 0.6880105589840545, + "grad_norm": 0.2929568588733673, + "learning_rate": 1.3415594466717345e-05, + "loss": 0.1098, + "step": 38574 + }, + { + "epoch": 0.6880283951057682, + "grad_norm": 0.2457340657711029, + "learning_rate": 1.3414215171832496e-05, + "loss": 0.1448, + "step": 38575 + }, + { + "epoch": 0.6880462312274819, + "grad_norm": 0.29361286759376526, + "learning_rate": 1.3412835921858158e-05, + "loss": 0.1171, + "step": 38576 + }, + { + "epoch": 0.6880640673491956, + "grad_norm": 0.24348865449428558, + "learning_rate": 1.341145671679967e-05, + "loss": 0.0837, + "step": 38577 + }, + { + "epoch": 0.6880819034709093, + "grad_norm": 0.25722333788871765, + "learning_rate": 1.3410077556662376e-05, + "loss": 0.0769, + "step": 38578 + }, + { + "epoch": 0.6880997395926229, + "grad_norm": 0.3594392240047455, + "learning_rate": 1.3408698441451628e-05, + "loss": 0.0985, + "step": 38579 + }, + { + "epoch": 0.6881175757143366, + "grad_norm": 0.36963778734207153, + "learning_rate": 1.3407319371172761e-05, + "loss": 0.102, + "step": 38580 + }, + { + "epoch": 0.6881354118360503, + "grad_norm": 0.2212645709514618, + "learning_rate": 1.3405940345831134e-05, + "loss": 0.0988, + "step": 38581 + }, + { + "epoch": 0.6881532479577641, + "grad_norm": 0.2580206096172333, + "learning_rate": 1.3404561365432077e-05, + "loss": 0.0906, + "step": 38582 + }, + { + "epoch": 0.6881710840794778, + "grad_norm": 0.2727537453174591, + "learning_rate": 1.3403182429980954e-05, + "loss": 0.1276, + "step": 38583 + }, + { + "epoch": 0.6881889202011915, + "grad_norm": 0.21269245445728302, + "learning_rate": 1.3401803539483093e-05, + "loss": 0.1241, + "step": 38584 + }, + { + "epoch": 0.6882067563229052, + "grad_norm": 0.23468634486198425, + "learning_rate": 1.3400424693943858e-05, + "loss": 0.1134, + "step": 38585 + }, + { + "epoch": 0.6882245924446189, + "grad_norm": 0.24647195637226105, + "learning_rate": 1.3399045893368583e-05, + "loss": 0.0794, + "step": 38586 + }, + { + "epoch": 0.6882424285663326, + "grad_norm": 0.3625646233558655, + "learning_rate": 1.3397667137762613e-05, + "loss": 0.1101, + "step": 38587 + }, + { + "epoch": 0.6882602646880462, + "grad_norm": 0.23308062553405762, + "learning_rate": 1.3396288427131281e-05, + "loss": 0.1035, + "step": 38588 + }, + { + "epoch": 0.6882781008097599, + "grad_norm": 0.25125983357429504, + "learning_rate": 1.3394909761479954e-05, + "loss": 0.0882, + "step": 38589 + }, + { + "epoch": 0.6882959369314736, + "grad_norm": 0.342476487159729, + "learning_rate": 1.3393531140813964e-05, + "loss": 0.1397, + "step": 38590 + }, + { + "epoch": 0.6883137730531873, + "grad_norm": 0.2825044095516205, + "learning_rate": 1.3392152565138657e-05, + "loss": 0.1188, + "step": 38591 + }, + { + "epoch": 0.688331609174901, + "grad_norm": 0.22510160505771637, + "learning_rate": 1.3390774034459377e-05, + "loss": 0.0778, + "step": 38592 + }, + { + "epoch": 0.6883494452966147, + "grad_norm": 0.257340669631958, + "learning_rate": 1.3389395548781456e-05, + "loss": 0.0837, + "step": 38593 + }, + { + "epoch": 0.6883672814183284, + "grad_norm": 0.27798953652381897, + "learning_rate": 1.3388017108110257e-05, + "loss": 0.0816, + "step": 38594 + }, + { + "epoch": 0.6883851175400421, + "grad_norm": 0.1957148015499115, + "learning_rate": 1.3386638712451105e-05, + "loss": 0.0766, + "step": 38595 + }, + { + "epoch": 0.6884029536617557, + "grad_norm": 0.28676503896713257, + "learning_rate": 1.3385260361809362e-05, + "loss": 0.1078, + "step": 38596 + }, + { + "epoch": 0.6884207897834694, + "grad_norm": 0.3771875202655792, + "learning_rate": 1.338388205619035e-05, + "loss": 0.0817, + "step": 38597 + }, + { + "epoch": 0.6884386259051832, + "grad_norm": 0.36797305941581726, + "learning_rate": 1.3382503795599439e-05, + "loss": 0.1272, + "step": 38598 + }, + { + "epoch": 0.6884564620268969, + "grad_norm": 0.26945650577545166, + "learning_rate": 1.3381125580041948e-05, + "loss": 0.0722, + "step": 38599 + }, + { + "epoch": 0.6884742981486106, + "grad_norm": 0.27191072702407837, + "learning_rate": 1.3379747409523232e-05, + "loss": 0.0882, + "step": 38600 + }, + { + "epoch": 0.6884921342703243, + "grad_norm": 0.21759861707687378, + "learning_rate": 1.337836928404862e-05, + "loss": 0.1179, + "step": 38601 + }, + { + "epoch": 0.688509970392038, + "grad_norm": 0.2839098870754242, + "learning_rate": 1.3376991203623467e-05, + "loss": 0.1058, + "step": 38602 + }, + { + "epoch": 0.6885278065137517, + "grad_norm": 0.42385223507881165, + "learning_rate": 1.3375613168253115e-05, + "loss": 0.1014, + "step": 38603 + }, + { + "epoch": 0.6885456426354654, + "grad_norm": 0.2928169071674347, + "learning_rate": 1.3374235177942902e-05, + "loss": 0.1439, + "step": 38604 + }, + { + "epoch": 0.688563478757179, + "grad_norm": 0.22677674889564514, + "learning_rate": 1.3372857232698166e-05, + "loss": 0.1047, + "step": 38605 + }, + { + "epoch": 0.6885813148788927, + "grad_norm": 0.2896255850791931, + "learning_rate": 1.3371479332524239e-05, + "loss": 0.0797, + "step": 38606 + }, + { + "epoch": 0.6885991510006064, + "grad_norm": 0.24892179667949677, + "learning_rate": 1.3370101477426489e-05, + "loss": 0.0903, + "step": 38607 + }, + { + "epoch": 0.6886169871223201, + "grad_norm": 0.2192019820213318, + "learning_rate": 1.3368723667410243e-05, + "loss": 0.0762, + "step": 38608 + }, + { + "epoch": 0.6886348232440338, + "grad_norm": 0.27080243825912476, + "learning_rate": 1.3367345902480826e-05, + "loss": 0.1309, + "step": 38609 + }, + { + "epoch": 0.6886526593657475, + "grad_norm": 0.30355319380760193, + "learning_rate": 1.336596818264361e-05, + "loss": 0.0979, + "step": 38610 + }, + { + "epoch": 0.6886704954874612, + "grad_norm": 0.3359048664569855, + "learning_rate": 1.3364590507903906e-05, + "loss": 0.1014, + "step": 38611 + }, + { + "epoch": 0.6886883316091749, + "grad_norm": 0.2574026584625244, + "learning_rate": 1.3363212878267078e-05, + "loss": 0.1451, + "step": 38612 + }, + { + "epoch": 0.6887061677308886, + "grad_norm": 0.462470680475235, + "learning_rate": 1.3361835293738458e-05, + "loss": 0.1282, + "step": 38613 + }, + { + "epoch": 0.6887240038526022, + "grad_norm": 0.2831890881061554, + "learning_rate": 1.3360457754323374e-05, + "loss": 0.0837, + "step": 38614 + }, + { + "epoch": 0.688741839974316, + "grad_norm": 0.2723994851112366, + "learning_rate": 1.3359080260027184e-05, + "loss": 0.1498, + "step": 38615 + }, + { + "epoch": 0.6887596760960297, + "grad_norm": 0.2538544237613678, + "learning_rate": 1.3357702810855221e-05, + "loss": 0.1147, + "step": 38616 + }, + { + "epoch": 0.6887775122177434, + "grad_norm": 0.2999408543109894, + "learning_rate": 1.3356325406812826e-05, + "loss": 0.0972, + "step": 38617 + }, + { + "epoch": 0.6887953483394571, + "grad_norm": 0.24906311929225922, + "learning_rate": 1.335494804790533e-05, + "loss": 0.1305, + "step": 38618 + }, + { + "epoch": 0.6888131844611708, + "grad_norm": 0.24978883564472198, + "learning_rate": 1.3353570734138072e-05, + "loss": 0.0811, + "step": 38619 + }, + { + "epoch": 0.6888310205828845, + "grad_norm": 0.3234630525112152, + "learning_rate": 1.3352193465516402e-05, + "loss": 0.1275, + "step": 38620 + }, + { + "epoch": 0.6888488567045982, + "grad_norm": 0.3482932150363922, + "learning_rate": 1.3350816242045655e-05, + "loss": 0.1179, + "step": 38621 + }, + { + "epoch": 0.6888666928263119, + "grad_norm": 0.2004636973142624, + "learning_rate": 1.3349439063731157e-05, + "loss": 0.078, + "step": 38622 + }, + { + "epoch": 0.6888845289480255, + "grad_norm": 0.33838585019111633, + "learning_rate": 1.334806193057827e-05, + "loss": 0.1443, + "step": 38623 + }, + { + "epoch": 0.6889023650697392, + "grad_norm": 0.2522510886192322, + "learning_rate": 1.3346684842592306e-05, + "loss": 0.1129, + "step": 38624 + }, + { + "epoch": 0.6889202011914529, + "grad_norm": 0.27040895819664, + "learning_rate": 1.3345307799778627e-05, + "loss": 0.087, + "step": 38625 + }, + { + "epoch": 0.6889380373131666, + "grad_norm": 0.22355616092681885, + "learning_rate": 1.3343930802142562e-05, + "loss": 0.088, + "step": 38626 + }, + { + "epoch": 0.6889558734348803, + "grad_norm": 0.25743889808654785, + "learning_rate": 1.3342553849689437e-05, + "loss": 0.0766, + "step": 38627 + }, + { + "epoch": 0.688973709556594, + "grad_norm": 0.343999445438385, + "learning_rate": 1.334117694242461e-05, + "loss": 0.1553, + "step": 38628 + }, + { + "epoch": 0.6889915456783077, + "grad_norm": 0.3359681963920593, + "learning_rate": 1.3339800080353407e-05, + "loss": 0.1461, + "step": 38629 + }, + { + "epoch": 0.6890093818000214, + "grad_norm": 0.24476896226406097, + "learning_rate": 1.3338423263481164e-05, + "loss": 0.0896, + "step": 38630 + }, + { + "epoch": 0.689027217921735, + "grad_norm": 0.539818525314331, + "learning_rate": 1.3337046491813223e-05, + "loss": 0.0683, + "step": 38631 + }, + { + "epoch": 0.6890450540434488, + "grad_norm": 0.31362634897232056, + "learning_rate": 1.3335669765354907e-05, + "loss": 0.167, + "step": 38632 + }, + { + "epoch": 0.6890628901651625, + "grad_norm": 0.335553377866745, + "learning_rate": 1.3334293084111576e-05, + "loss": 0.1183, + "step": 38633 + }, + { + "epoch": 0.6890807262868762, + "grad_norm": 0.2639838755130768, + "learning_rate": 1.333291644808855e-05, + "loss": 0.0856, + "step": 38634 + }, + { + "epoch": 0.6890985624085899, + "grad_norm": 0.26502877473831177, + "learning_rate": 1.3331539857291175e-05, + "loss": 0.1418, + "step": 38635 + }, + { + "epoch": 0.6891163985303036, + "grad_norm": 0.21990294754505157, + "learning_rate": 1.3330163311724766e-05, + "loss": 0.0943, + "step": 38636 + }, + { + "epoch": 0.6891342346520173, + "grad_norm": 0.24346661567687988, + "learning_rate": 1.3328786811394688e-05, + "loss": 0.0991, + "step": 38637 + }, + { + "epoch": 0.689152070773731, + "grad_norm": 0.2155342549085617, + "learning_rate": 1.3327410356306252e-05, + "loss": 0.0941, + "step": 38638 + }, + { + "epoch": 0.6891699068954447, + "grad_norm": 0.2636861205101013, + "learning_rate": 1.3326033946464816e-05, + "loss": 0.1109, + "step": 38639 + }, + { + "epoch": 0.6891877430171583, + "grad_norm": 0.2761094868183136, + "learning_rate": 1.3324657581875694e-05, + "loss": 0.148, + "step": 38640 + }, + { + "epoch": 0.689205579138872, + "grad_norm": 0.41489556431770325, + "learning_rate": 1.3323281262544243e-05, + "loss": 0.1009, + "step": 38641 + }, + { + "epoch": 0.6892234152605857, + "grad_norm": 0.32335516810417175, + "learning_rate": 1.3321904988475787e-05, + "loss": 0.1108, + "step": 38642 + }, + { + "epoch": 0.6892412513822994, + "grad_norm": 0.3143722116947174, + "learning_rate": 1.3320528759675657e-05, + "loss": 0.1467, + "step": 38643 + }, + { + "epoch": 0.6892590875040131, + "grad_norm": 0.23518264293670654, + "learning_rate": 1.3319152576149197e-05, + "loss": 0.1462, + "step": 38644 + }, + { + "epoch": 0.6892769236257268, + "grad_norm": 0.2167765498161316, + "learning_rate": 1.331777643790172e-05, + "loss": 0.1344, + "step": 38645 + }, + { + "epoch": 0.6892947597474405, + "grad_norm": 0.30202847719192505, + "learning_rate": 1.331640034493859e-05, + "loss": 0.1713, + "step": 38646 + }, + { + "epoch": 0.6893125958691542, + "grad_norm": 0.31204459071159363, + "learning_rate": 1.3315024297265128e-05, + "loss": 0.1355, + "step": 38647 + }, + { + "epoch": 0.6893304319908679, + "grad_norm": 0.2837125062942505, + "learning_rate": 1.3313648294886666e-05, + "loss": 0.1919, + "step": 38648 + }, + { + "epoch": 0.6893482681125817, + "grad_norm": 0.2383567988872528, + "learning_rate": 1.331227233780853e-05, + "loss": 0.1573, + "step": 38649 + }, + { + "epoch": 0.6893661042342953, + "grad_norm": 0.30589544773101807, + "learning_rate": 1.3310896426036074e-05, + "loss": 0.1526, + "step": 38650 + }, + { + "epoch": 0.689383940356009, + "grad_norm": 0.2975884974002838, + "learning_rate": 1.3309520559574612e-05, + "loss": 0.1717, + "step": 38651 + }, + { + "epoch": 0.6894017764777227, + "grad_norm": 0.2432190328836441, + "learning_rate": 1.3308144738429492e-05, + "loss": 0.1002, + "step": 38652 + }, + { + "epoch": 0.6894196125994364, + "grad_norm": 0.21989594399929047, + "learning_rate": 1.3306768962606034e-05, + "loss": 0.0824, + "step": 38653 + }, + { + "epoch": 0.6894374487211501, + "grad_norm": 0.26220694184303284, + "learning_rate": 1.3305393232109586e-05, + "loss": 0.0981, + "step": 38654 + }, + { + "epoch": 0.6894552848428638, + "grad_norm": 0.3841293752193451, + "learning_rate": 1.3304017546945478e-05, + "loss": 0.2178, + "step": 38655 + }, + { + "epoch": 0.6894731209645775, + "grad_norm": 0.29004284739494324, + "learning_rate": 1.3302641907119034e-05, + "loss": 0.1911, + "step": 38656 + }, + { + "epoch": 0.6894909570862912, + "grad_norm": 0.40998777747154236, + "learning_rate": 1.3301266312635591e-05, + "loss": 0.0948, + "step": 38657 + }, + { + "epoch": 0.6895087932080048, + "grad_norm": 0.28436097502708435, + "learning_rate": 1.329989076350047e-05, + "loss": 0.1785, + "step": 38658 + }, + { + "epoch": 0.6895266293297185, + "grad_norm": 0.23973765969276428, + "learning_rate": 1.3298515259719024e-05, + "loss": 0.0837, + "step": 38659 + }, + { + "epoch": 0.6895444654514322, + "grad_norm": 0.24880464375019073, + "learning_rate": 1.3297139801296572e-05, + "loss": 0.1216, + "step": 38660 + }, + { + "epoch": 0.6895623015731459, + "grad_norm": 0.23176822066307068, + "learning_rate": 1.3295764388238451e-05, + "loss": 0.1508, + "step": 38661 + }, + { + "epoch": 0.6895801376948596, + "grad_norm": 0.26152318716049194, + "learning_rate": 1.329438902054998e-05, + "loss": 0.1593, + "step": 38662 + }, + { + "epoch": 0.6895979738165733, + "grad_norm": 0.2617315948009491, + "learning_rate": 1.3293013698236506e-05, + "loss": 0.072, + "step": 38663 + }, + { + "epoch": 0.689615809938287, + "grad_norm": 0.25507086515426636, + "learning_rate": 1.329163842130336e-05, + "loss": 0.1327, + "step": 38664 + }, + { + "epoch": 0.6896336460600007, + "grad_norm": 0.3612326979637146, + "learning_rate": 1.3290263189755852e-05, + "loss": 0.0904, + "step": 38665 + }, + { + "epoch": 0.6896514821817145, + "grad_norm": 0.2828935384750366, + "learning_rate": 1.3288888003599342e-05, + "loss": 0.1363, + "step": 38666 + }, + { + "epoch": 0.6896693183034281, + "grad_norm": 0.2619461715221405, + "learning_rate": 1.3287512862839135e-05, + "loss": 0.1204, + "step": 38667 + }, + { + "epoch": 0.6896871544251418, + "grad_norm": 0.2610267102718353, + "learning_rate": 1.3286137767480586e-05, + "loss": 0.164, + "step": 38668 + }, + { + "epoch": 0.6897049905468555, + "grad_norm": 0.2898586690425873, + "learning_rate": 1.3284762717529009e-05, + "loss": 0.1287, + "step": 38669 + }, + { + "epoch": 0.6897228266685692, + "grad_norm": 0.260958731174469, + "learning_rate": 1.3283387712989743e-05, + "loss": 0.1225, + "step": 38670 + }, + { + "epoch": 0.6897406627902829, + "grad_norm": 0.268326997756958, + "learning_rate": 1.32820127538681e-05, + "loss": 0.1388, + "step": 38671 + }, + { + "epoch": 0.6897584989119966, + "grad_norm": 0.29754361510276794, + "learning_rate": 1.3280637840169433e-05, + "loss": 0.1159, + "step": 38672 + }, + { + "epoch": 0.6897763350337103, + "grad_norm": 0.3323124051094055, + "learning_rate": 1.3279262971899062e-05, + "loss": 0.1482, + "step": 38673 + }, + { + "epoch": 0.689794171155424, + "grad_norm": 0.3237851560115814, + "learning_rate": 1.3277888149062314e-05, + "loss": 0.1397, + "step": 38674 + }, + { + "epoch": 0.6898120072771377, + "grad_norm": 0.3286314606666565, + "learning_rate": 1.3276513371664511e-05, + "loss": 0.1028, + "step": 38675 + }, + { + "epoch": 0.6898298433988513, + "grad_norm": 0.3324311673641205, + "learning_rate": 1.3275138639711005e-05, + "loss": 0.1802, + "step": 38676 + }, + { + "epoch": 0.689847679520565, + "grad_norm": 0.26701226830482483, + "learning_rate": 1.3273763953207108e-05, + "loss": 0.1286, + "step": 38677 + }, + { + "epoch": 0.6898655156422787, + "grad_norm": 0.32240742444992065, + "learning_rate": 1.3272389312158143e-05, + "loss": 0.1026, + "step": 38678 + }, + { + "epoch": 0.6898833517639924, + "grad_norm": 0.28120702505111694, + "learning_rate": 1.3271014716569457e-05, + "loss": 0.1434, + "step": 38679 + }, + { + "epoch": 0.6899011878857061, + "grad_norm": 0.26151809096336365, + "learning_rate": 1.3269640166446357e-05, + "loss": 0.0963, + "step": 38680 + }, + { + "epoch": 0.6899190240074198, + "grad_norm": 0.3740496039390564, + "learning_rate": 1.3268265661794196e-05, + "loss": 0.2064, + "step": 38681 + }, + { + "epoch": 0.6899368601291335, + "grad_norm": 0.2708732485771179, + "learning_rate": 1.326689120261829e-05, + "loss": 0.1053, + "step": 38682 + }, + { + "epoch": 0.6899546962508473, + "grad_norm": 0.3348347246646881, + "learning_rate": 1.3265516788923965e-05, + "loss": 0.1317, + "step": 38683 + }, + { + "epoch": 0.689972532372561, + "grad_norm": 0.24740757048130035, + "learning_rate": 1.326414242071654e-05, + "loss": 0.1029, + "step": 38684 + }, + { + "epoch": 0.6899903684942746, + "grad_norm": 0.23616833984851837, + "learning_rate": 1.326276809800136e-05, + "loss": 0.1084, + "step": 38685 + }, + { + "epoch": 0.6900082046159883, + "grad_norm": 0.3803771138191223, + "learning_rate": 1.3261393820783746e-05, + "loss": 0.1391, + "step": 38686 + }, + { + "epoch": 0.690026040737702, + "grad_norm": 0.28437551856040955, + "learning_rate": 1.3260019589069028e-05, + "loss": 0.1541, + "step": 38687 + }, + { + "epoch": 0.6900438768594157, + "grad_norm": 0.2313799113035202, + "learning_rate": 1.3258645402862512e-05, + "loss": 0.1137, + "step": 38688 + }, + { + "epoch": 0.6900617129811294, + "grad_norm": 0.28760936856269836, + "learning_rate": 1.3257271262169557e-05, + "loss": 0.1112, + "step": 38689 + }, + { + "epoch": 0.6900795491028431, + "grad_norm": 0.22957073152065277, + "learning_rate": 1.3255897166995474e-05, + "loss": 0.1069, + "step": 38690 + }, + { + "epoch": 0.6900973852245568, + "grad_norm": 0.3053368031978607, + "learning_rate": 1.3254523117345591e-05, + "loss": 0.0807, + "step": 38691 + }, + { + "epoch": 0.6901152213462705, + "grad_norm": 0.2762482762336731, + "learning_rate": 1.325314911322522e-05, + "loss": 0.1368, + "step": 38692 + }, + { + "epoch": 0.6901330574679841, + "grad_norm": 0.30173760652542114, + "learning_rate": 1.3251775154639713e-05, + "loss": 0.1591, + "step": 38693 + }, + { + "epoch": 0.6901508935896978, + "grad_norm": 0.23821094632148743, + "learning_rate": 1.3250401241594368e-05, + "loss": 0.1377, + "step": 38694 + }, + { + "epoch": 0.6901687297114115, + "grad_norm": 0.2922179698944092, + "learning_rate": 1.324902737409454e-05, + "loss": 0.1424, + "step": 38695 + }, + { + "epoch": 0.6901865658331252, + "grad_norm": 0.32731255888938904, + "learning_rate": 1.3247653552145538e-05, + "loss": 0.131, + "step": 38696 + }, + { + "epoch": 0.6902044019548389, + "grad_norm": 0.3112957179546356, + "learning_rate": 1.3246279775752685e-05, + "loss": 0.1418, + "step": 38697 + }, + { + "epoch": 0.6902222380765526, + "grad_norm": 0.24798676371574402, + "learning_rate": 1.3244906044921317e-05, + "loss": 0.1323, + "step": 38698 + }, + { + "epoch": 0.6902400741982663, + "grad_norm": 0.23981525003910065, + "learning_rate": 1.3243532359656754e-05, + "loss": 0.1388, + "step": 38699 + }, + { + "epoch": 0.6902579103199801, + "grad_norm": 0.28346219658851624, + "learning_rate": 1.324215871996432e-05, + "loss": 0.0753, + "step": 38700 + }, + { + "epoch": 0.6902757464416938, + "grad_norm": 0.37999963760375977, + "learning_rate": 1.324078512584933e-05, + "loss": 0.1345, + "step": 38701 + }, + { + "epoch": 0.6902935825634074, + "grad_norm": 0.2923862934112549, + "learning_rate": 1.3239411577317129e-05, + "loss": 0.0996, + "step": 38702 + }, + { + "epoch": 0.6903114186851211, + "grad_norm": 0.19239555299282074, + "learning_rate": 1.3238038074373033e-05, + "loss": 0.1096, + "step": 38703 + }, + { + "epoch": 0.6903292548068348, + "grad_norm": 0.29090577363967896, + "learning_rate": 1.323666461702236e-05, + "loss": 0.1518, + "step": 38704 + }, + { + "epoch": 0.6903470909285485, + "grad_norm": 0.23829719424247742, + "learning_rate": 1.3235291205270427e-05, + "loss": 0.1234, + "step": 38705 + }, + { + "epoch": 0.6903649270502622, + "grad_norm": 0.2725439667701721, + "learning_rate": 1.3233917839122583e-05, + "loss": 0.1262, + "step": 38706 + }, + { + "epoch": 0.6903827631719759, + "grad_norm": 0.3757858872413635, + "learning_rate": 1.3232544518584122e-05, + "loss": 0.1297, + "step": 38707 + }, + { + "epoch": 0.6904005992936896, + "grad_norm": 0.3936978876590729, + "learning_rate": 1.3231171243660398e-05, + "loss": 0.1558, + "step": 38708 + }, + { + "epoch": 0.6904184354154033, + "grad_norm": 0.2817973494529724, + "learning_rate": 1.3229798014356717e-05, + "loss": 0.1122, + "step": 38709 + }, + { + "epoch": 0.690436271537117, + "grad_norm": 0.30825358629226685, + "learning_rate": 1.3228424830678394e-05, + "loss": 0.1178, + "step": 38710 + }, + { + "epoch": 0.6904541076588306, + "grad_norm": 0.26283615827560425, + "learning_rate": 1.322705169263077e-05, + "loss": 0.0772, + "step": 38711 + }, + { + "epoch": 0.6904719437805443, + "grad_norm": 0.22746725380420685, + "learning_rate": 1.3225678600219165e-05, + "loss": 0.1399, + "step": 38712 + }, + { + "epoch": 0.690489779902258, + "grad_norm": 0.28221169114112854, + "learning_rate": 1.3224305553448893e-05, + "loss": 0.1152, + "step": 38713 + }, + { + "epoch": 0.6905076160239717, + "grad_norm": 0.2240988165140152, + "learning_rate": 1.3222932552325271e-05, + "loss": 0.1146, + "step": 38714 + }, + { + "epoch": 0.6905254521456854, + "grad_norm": 0.2932063341140747, + "learning_rate": 1.3221559596853638e-05, + "loss": 0.0592, + "step": 38715 + }, + { + "epoch": 0.6905432882673992, + "grad_norm": 0.48504117131233215, + "learning_rate": 1.322018668703931e-05, + "loss": 0.2323, + "step": 38716 + }, + { + "epoch": 0.6905611243891129, + "grad_norm": 0.20611225068569183, + "learning_rate": 1.3218813822887607e-05, + "loss": 0.0792, + "step": 38717 + }, + { + "epoch": 0.6905789605108266, + "grad_norm": 0.28257375955581665, + "learning_rate": 1.3217441004403842e-05, + "loss": 0.1441, + "step": 38718 + }, + { + "epoch": 0.6905967966325403, + "grad_norm": 0.2722945213317871, + "learning_rate": 1.3216068231593354e-05, + "loss": 0.1559, + "step": 38719 + }, + { + "epoch": 0.6906146327542539, + "grad_norm": 0.2578687071800232, + "learning_rate": 1.3214695504461455e-05, + "loss": 0.1135, + "step": 38720 + }, + { + "epoch": 0.6906324688759676, + "grad_norm": 0.2690184712409973, + "learning_rate": 1.3213322823013457e-05, + "loss": 0.1213, + "step": 38721 + }, + { + "epoch": 0.6906503049976813, + "grad_norm": 0.26990264654159546, + "learning_rate": 1.3211950187254702e-05, + "loss": 0.1799, + "step": 38722 + }, + { + "epoch": 0.690668141119395, + "grad_norm": 0.2215307056903839, + "learning_rate": 1.3210577597190489e-05, + "loss": 0.1521, + "step": 38723 + }, + { + "epoch": 0.6906859772411087, + "grad_norm": 0.2137995809316635, + "learning_rate": 1.3209205052826158e-05, + "loss": 0.1239, + "step": 38724 + }, + { + "epoch": 0.6907038133628224, + "grad_norm": 0.26220327615737915, + "learning_rate": 1.3207832554167021e-05, + "loss": 0.1089, + "step": 38725 + }, + { + "epoch": 0.6907216494845361, + "grad_norm": 0.3278980255126953, + "learning_rate": 1.3206460101218396e-05, + "loss": 0.1691, + "step": 38726 + }, + { + "epoch": 0.6907394856062498, + "grad_norm": 0.28485000133514404, + "learning_rate": 1.3205087693985596e-05, + "loss": 0.1744, + "step": 38727 + }, + { + "epoch": 0.6907573217279634, + "grad_norm": 0.2428922802209854, + "learning_rate": 1.3203715332473962e-05, + "loss": 0.1063, + "step": 38728 + }, + { + "epoch": 0.6907751578496771, + "grad_norm": 0.2448032945394516, + "learning_rate": 1.32023430166888e-05, + "loss": 0.0901, + "step": 38729 + }, + { + "epoch": 0.6907929939713908, + "grad_norm": 0.262446790933609, + "learning_rate": 1.3200970746635432e-05, + "loss": 0.1308, + "step": 38730 + }, + { + "epoch": 0.6908108300931045, + "grad_norm": 0.28618475794792175, + "learning_rate": 1.3199598522319168e-05, + "loss": 0.1172, + "step": 38731 + }, + { + "epoch": 0.6908286662148182, + "grad_norm": 0.16633619368076324, + "learning_rate": 1.3198226343745343e-05, + "loss": 0.1072, + "step": 38732 + }, + { + "epoch": 0.690846502336532, + "grad_norm": 0.3581560552120209, + "learning_rate": 1.319685421091927e-05, + "loss": 0.1796, + "step": 38733 + }, + { + "epoch": 0.6908643384582457, + "grad_norm": 0.30605944991111755, + "learning_rate": 1.319548212384626e-05, + "loss": 0.1467, + "step": 38734 + }, + { + "epoch": 0.6908821745799594, + "grad_norm": 0.32501140236854553, + "learning_rate": 1.3194110082531643e-05, + "loss": 0.1252, + "step": 38735 + }, + { + "epoch": 0.6909000107016731, + "grad_norm": 0.2701601982116699, + "learning_rate": 1.3192738086980726e-05, + "loss": 0.1618, + "step": 38736 + }, + { + "epoch": 0.6909178468233867, + "grad_norm": 0.22822974622249603, + "learning_rate": 1.3191366137198843e-05, + "loss": 0.1315, + "step": 38737 + }, + { + "epoch": 0.6909356829451004, + "grad_norm": 0.2660581171512604, + "learning_rate": 1.3189994233191305e-05, + "loss": 0.166, + "step": 38738 + }, + { + "epoch": 0.6909535190668141, + "grad_norm": 0.27075493335723877, + "learning_rate": 1.3188622374963428e-05, + "loss": 0.1236, + "step": 38739 + }, + { + "epoch": 0.6909713551885278, + "grad_norm": 0.2880050241947174, + "learning_rate": 1.318725056252052e-05, + "loss": 0.094, + "step": 38740 + }, + { + "epoch": 0.6909891913102415, + "grad_norm": 0.27749377489089966, + "learning_rate": 1.3185878795867917e-05, + "loss": 0.0964, + "step": 38741 + }, + { + "epoch": 0.6910070274319552, + "grad_norm": 0.3644750714302063, + "learning_rate": 1.318450707501093e-05, + "loss": 0.1608, + "step": 38742 + }, + { + "epoch": 0.6910248635536689, + "grad_norm": 0.3186701834201813, + "learning_rate": 1.3183135399954873e-05, + "loss": 0.1587, + "step": 38743 + }, + { + "epoch": 0.6910426996753826, + "grad_norm": 0.27826863527297974, + "learning_rate": 1.3181763770705059e-05, + "loss": 0.1583, + "step": 38744 + }, + { + "epoch": 0.6910605357970963, + "grad_norm": 0.26755356788635254, + "learning_rate": 1.3180392187266816e-05, + "loss": 0.1184, + "step": 38745 + }, + { + "epoch": 0.6910783719188099, + "grad_norm": 0.3084157705307007, + "learning_rate": 1.3179020649645458e-05, + "loss": 0.0934, + "step": 38746 + }, + { + "epoch": 0.6910962080405236, + "grad_norm": 0.28145831823349, + "learning_rate": 1.3177649157846295e-05, + "loss": 0.1106, + "step": 38747 + }, + { + "epoch": 0.6911140441622373, + "grad_norm": 0.2863115072250366, + "learning_rate": 1.317627771187464e-05, + "loss": 0.1371, + "step": 38748 + }, + { + "epoch": 0.691131880283951, + "grad_norm": 0.2876375913619995, + "learning_rate": 1.3174906311735815e-05, + "loss": 0.1029, + "step": 38749 + }, + { + "epoch": 0.6911497164056648, + "grad_norm": 0.4812483787536621, + "learning_rate": 1.3173534957435149e-05, + "loss": 0.1443, + "step": 38750 + }, + { + "epoch": 0.6911675525273785, + "grad_norm": 0.30926138162612915, + "learning_rate": 1.3172163648977948e-05, + "loss": 0.1617, + "step": 38751 + }, + { + "epoch": 0.6911853886490922, + "grad_norm": 0.32044175267219543, + "learning_rate": 1.3170792386369521e-05, + "loss": 0.1216, + "step": 38752 + }, + { + "epoch": 0.6912032247708059, + "grad_norm": 0.33879876136779785, + "learning_rate": 1.3169421169615182e-05, + "loss": 0.1066, + "step": 38753 + }, + { + "epoch": 0.6912210608925196, + "grad_norm": 0.36022135615348816, + "learning_rate": 1.316804999872026e-05, + "loss": 0.1167, + "step": 38754 + }, + { + "epoch": 0.6912388970142332, + "grad_norm": 0.2358321100473404, + "learning_rate": 1.3166678873690064e-05, + "loss": 0.133, + "step": 38755 + }, + { + "epoch": 0.6912567331359469, + "grad_norm": 0.20222005248069763, + "learning_rate": 1.3165307794529908e-05, + "loss": 0.0655, + "step": 38756 + }, + { + "epoch": 0.6912745692576606, + "grad_norm": 0.32699280977249146, + "learning_rate": 1.31639367612451e-05, + "loss": 0.1246, + "step": 38757 + }, + { + "epoch": 0.6912924053793743, + "grad_norm": 0.3404202461242676, + "learning_rate": 1.3162565773840968e-05, + "loss": 0.1455, + "step": 38758 + }, + { + "epoch": 0.691310241501088, + "grad_norm": 0.36428165435791016, + "learning_rate": 1.3161194832322818e-05, + "loss": 0.1672, + "step": 38759 + }, + { + "epoch": 0.6913280776228017, + "grad_norm": 0.19613413512706757, + "learning_rate": 1.3159823936695967e-05, + "loss": 0.1127, + "step": 38760 + }, + { + "epoch": 0.6913459137445154, + "grad_norm": 0.3601176142692566, + "learning_rate": 1.3158453086965716e-05, + "loss": 0.1582, + "step": 38761 + }, + { + "epoch": 0.6913637498662291, + "grad_norm": 0.2544485330581665, + "learning_rate": 1.3157082283137406e-05, + "loss": 0.1126, + "step": 38762 + }, + { + "epoch": 0.6913815859879427, + "grad_norm": 0.34593045711517334, + "learning_rate": 1.3155711525216321e-05, + "loss": 0.099, + "step": 38763 + }, + { + "epoch": 0.6913994221096564, + "grad_norm": 0.25740107893943787, + "learning_rate": 1.31543408132078e-05, + "loss": 0.1298, + "step": 38764 + }, + { + "epoch": 0.6914172582313701, + "grad_norm": 0.2705414593219757, + "learning_rate": 1.3152970147117147e-05, + "loss": 0.1046, + "step": 38765 + }, + { + "epoch": 0.6914350943530838, + "grad_norm": 0.3260417580604553, + "learning_rate": 1.3151599526949663e-05, + "loss": 0.1163, + "step": 38766 + }, + { + "epoch": 0.6914529304747976, + "grad_norm": 0.18829238414764404, + "learning_rate": 1.315022895271068e-05, + "loss": 0.0902, + "step": 38767 + }, + { + "epoch": 0.6914707665965113, + "grad_norm": 0.2498045563697815, + "learning_rate": 1.3148858424405503e-05, + "loss": 0.1008, + "step": 38768 + }, + { + "epoch": 0.691488602718225, + "grad_norm": 0.3177548944950104, + "learning_rate": 1.3147487942039444e-05, + "loss": 0.1328, + "step": 38769 + }, + { + "epoch": 0.6915064388399387, + "grad_norm": 0.2703242301940918, + "learning_rate": 1.3146117505617806e-05, + "loss": 0.1058, + "step": 38770 + }, + { + "epoch": 0.6915242749616524, + "grad_norm": 0.2493327409029007, + "learning_rate": 1.3144747115145923e-05, + "loss": 0.1358, + "step": 38771 + }, + { + "epoch": 0.691542111083366, + "grad_norm": 0.3020995259284973, + "learning_rate": 1.314337677062909e-05, + "loss": 0.1759, + "step": 38772 + }, + { + "epoch": 0.6915599472050797, + "grad_norm": 0.24847684800624847, + "learning_rate": 1.3142006472072626e-05, + "loss": 0.1063, + "step": 38773 + }, + { + "epoch": 0.6915777833267934, + "grad_norm": 0.23036056756973267, + "learning_rate": 1.3140636219481838e-05, + "loss": 0.0956, + "step": 38774 + }, + { + "epoch": 0.6915956194485071, + "grad_norm": 0.24159425497055054, + "learning_rate": 1.3139266012862034e-05, + "loss": 0.1446, + "step": 38775 + }, + { + "epoch": 0.6916134555702208, + "grad_norm": 0.21169061958789825, + "learning_rate": 1.3137895852218532e-05, + "loss": 0.0944, + "step": 38776 + }, + { + "epoch": 0.6916312916919345, + "grad_norm": 0.34019920229911804, + "learning_rate": 1.3136525737556648e-05, + "loss": 0.1528, + "step": 38777 + }, + { + "epoch": 0.6916491278136482, + "grad_norm": 0.2619572877883911, + "learning_rate": 1.3135155668881694e-05, + "loss": 0.1197, + "step": 38778 + }, + { + "epoch": 0.6916669639353619, + "grad_norm": 0.2956441640853882, + "learning_rate": 1.3133785646198959e-05, + "loss": 0.1284, + "step": 38779 + }, + { + "epoch": 0.6916848000570756, + "grad_norm": 0.1832607537508011, + "learning_rate": 1.3132415669513784e-05, + "loss": 0.0871, + "step": 38780 + }, + { + "epoch": 0.6917026361787892, + "grad_norm": 0.25110000371932983, + "learning_rate": 1.313104573883146e-05, + "loss": 0.1316, + "step": 38781 + }, + { + "epoch": 0.6917204723005029, + "grad_norm": 0.27382761240005493, + "learning_rate": 1.3129675854157306e-05, + "loss": 0.0686, + "step": 38782 + }, + { + "epoch": 0.6917383084222166, + "grad_norm": 0.2729042172431946, + "learning_rate": 1.3128306015496616e-05, + "loss": 0.1271, + "step": 38783 + }, + { + "epoch": 0.6917561445439304, + "grad_norm": 0.5074036717414856, + "learning_rate": 1.3126936222854724e-05, + "loss": 0.1655, + "step": 38784 + }, + { + "epoch": 0.6917739806656441, + "grad_norm": 0.34796684980392456, + "learning_rate": 1.3125566476236928e-05, + "loss": 0.1095, + "step": 38785 + }, + { + "epoch": 0.6917918167873578, + "grad_norm": 0.27179834246635437, + "learning_rate": 1.3124196775648534e-05, + "loss": 0.1074, + "step": 38786 + }, + { + "epoch": 0.6918096529090715, + "grad_norm": 0.26308971643447876, + "learning_rate": 1.312282712109486e-05, + "loss": 0.1336, + "step": 38787 + }, + { + "epoch": 0.6918274890307852, + "grad_norm": 0.229720339179039, + "learning_rate": 1.3121457512581197e-05, + "loss": 0.115, + "step": 38788 + }, + { + "epoch": 0.6918453251524989, + "grad_norm": 0.24698026478290558, + "learning_rate": 1.312008795011288e-05, + "loss": 0.1204, + "step": 38789 + }, + { + "epoch": 0.6918631612742125, + "grad_norm": 0.20926906168460846, + "learning_rate": 1.3118718433695194e-05, + "loss": 0.0895, + "step": 38790 + }, + { + "epoch": 0.6918809973959262, + "grad_norm": 0.266743004322052, + "learning_rate": 1.3117348963333468e-05, + "loss": 0.1147, + "step": 38791 + }, + { + "epoch": 0.6918988335176399, + "grad_norm": 0.32903027534484863, + "learning_rate": 1.3115979539032991e-05, + "loss": 0.1259, + "step": 38792 + }, + { + "epoch": 0.6919166696393536, + "grad_norm": 0.23266242444515228, + "learning_rate": 1.3114610160799095e-05, + "loss": 0.1716, + "step": 38793 + }, + { + "epoch": 0.6919345057610673, + "grad_norm": 0.23769375681877136, + "learning_rate": 1.3113240828637075e-05, + "loss": 0.1282, + "step": 38794 + }, + { + "epoch": 0.691952341882781, + "grad_norm": 0.2318408042192459, + "learning_rate": 1.3111871542552234e-05, + "loss": 0.1678, + "step": 38795 + }, + { + "epoch": 0.6919701780044947, + "grad_norm": 0.2351434975862503, + "learning_rate": 1.3110502302549882e-05, + "loss": 0.1349, + "step": 38796 + }, + { + "epoch": 0.6919880141262084, + "grad_norm": 0.27161964774131775, + "learning_rate": 1.3109133108635335e-05, + "loss": 0.1088, + "step": 38797 + }, + { + "epoch": 0.692005850247922, + "grad_norm": 0.29230543971061707, + "learning_rate": 1.3107763960813896e-05, + "loss": 0.0948, + "step": 38798 + }, + { + "epoch": 0.6920236863696357, + "grad_norm": 0.2311091125011444, + "learning_rate": 1.310639485909087e-05, + "loss": 0.0996, + "step": 38799 + }, + { + "epoch": 0.6920415224913494, + "grad_norm": 0.30151063203811646, + "learning_rate": 1.3105025803471565e-05, + "loss": 0.1031, + "step": 38800 + }, + { + "epoch": 0.6920593586130632, + "grad_norm": 0.265648752450943, + "learning_rate": 1.3103656793961282e-05, + "loss": 0.0795, + "step": 38801 + }, + { + "epoch": 0.6920771947347769, + "grad_norm": 0.24831724166870117, + "learning_rate": 1.310228783056534e-05, + "loss": 0.1713, + "step": 38802 + }, + { + "epoch": 0.6920950308564906, + "grad_norm": 0.31847429275512695, + "learning_rate": 1.3100918913289034e-05, + "loss": 0.1954, + "step": 38803 + }, + { + "epoch": 0.6921128669782043, + "grad_norm": 0.23262082040309906, + "learning_rate": 1.3099550042137684e-05, + "loss": 0.0996, + "step": 38804 + }, + { + "epoch": 0.692130703099918, + "grad_norm": 0.3489309251308441, + "learning_rate": 1.3098181217116578e-05, + "loss": 0.1453, + "step": 38805 + }, + { + "epoch": 0.6921485392216317, + "grad_norm": 0.26697972416877747, + "learning_rate": 1.3096812438231043e-05, + "loss": 0.128, + "step": 38806 + }, + { + "epoch": 0.6921663753433454, + "grad_norm": 0.22062751650810242, + "learning_rate": 1.3095443705486377e-05, + "loss": 0.0871, + "step": 38807 + }, + { + "epoch": 0.692184211465059, + "grad_norm": 0.21080462634563446, + "learning_rate": 1.3094075018887878e-05, + "loss": 0.1084, + "step": 38808 + }, + { + "epoch": 0.6922020475867727, + "grad_norm": 0.30563896894454956, + "learning_rate": 1.309270637844085e-05, + "loss": 0.1437, + "step": 38809 + }, + { + "epoch": 0.6922198837084864, + "grad_norm": 0.22260555624961853, + "learning_rate": 1.309133778415061e-05, + "loss": 0.1298, + "step": 38810 + }, + { + "epoch": 0.6922377198302001, + "grad_norm": 0.31000182032585144, + "learning_rate": 1.3089969236022461e-05, + "loss": 0.0621, + "step": 38811 + }, + { + "epoch": 0.6922555559519138, + "grad_norm": 0.28880545496940613, + "learning_rate": 1.3088600734061707e-05, + "loss": 0.1326, + "step": 38812 + }, + { + "epoch": 0.6922733920736275, + "grad_norm": 0.24089579284191132, + "learning_rate": 1.3087232278273649e-05, + "loss": 0.1515, + "step": 38813 + }, + { + "epoch": 0.6922912281953412, + "grad_norm": 0.31125473976135254, + "learning_rate": 1.308586386866358e-05, + "loss": 0.1443, + "step": 38814 + }, + { + "epoch": 0.6923090643170549, + "grad_norm": 0.3251326084136963, + "learning_rate": 1.3084495505236833e-05, + "loss": 0.1168, + "step": 38815 + }, + { + "epoch": 0.6923269004387685, + "grad_norm": 0.18446235358715057, + "learning_rate": 1.3083127187998692e-05, + "loss": 0.1071, + "step": 38816 + }, + { + "epoch": 0.6923447365604823, + "grad_norm": 0.20737780630588531, + "learning_rate": 1.3081758916954456e-05, + "loss": 0.0971, + "step": 38817 + }, + { + "epoch": 0.692362572682196, + "grad_norm": 0.1935952752828598, + "learning_rate": 1.3080390692109451e-05, + "loss": 0.1014, + "step": 38818 + }, + { + "epoch": 0.6923804088039097, + "grad_norm": 0.3421616554260254, + "learning_rate": 1.307902251346896e-05, + "loss": 0.1399, + "step": 38819 + }, + { + "epoch": 0.6923982449256234, + "grad_norm": 0.20943127572536469, + "learning_rate": 1.3077654381038304e-05, + "loss": 0.086, + "step": 38820 + }, + { + "epoch": 0.6924160810473371, + "grad_norm": 0.27631518244743347, + "learning_rate": 1.3076286294822776e-05, + "loss": 0.1575, + "step": 38821 + }, + { + "epoch": 0.6924339171690508, + "grad_norm": 0.22087550163269043, + "learning_rate": 1.3074918254827673e-05, + "loss": 0.1045, + "step": 38822 + }, + { + "epoch": 0.6924517532907645, + "grad_norm": 0.26400530338287354, + "learning_rate": 1.3073550261058315e-05, + "loss": 0.121, + "step": 38823 + }, + { + "epoch": 0.6924695894124782, + "grad_norm": 0.24591878056526184, + "learning_rate": 1.3072182313519993e-05, + "loss": 0.1487, + "step": 38824 + }, + { + "epoch": 0.6924874255341918, + "grad_norm": 0.32643479108810425, + "learning_rate": 1.3070814412218017e-05, + "loss": 0.1432, + "step": 38825 + }, + { + "epoch": 0.6925052616559055, + "grad_norm": 0.2526044547557831, + "learning_rate": 1.306944655715768e-05, + "loss": 0.0962, + "step": 38826 + }, + { + "epoch": 0.6925230977776192, + "grad_norm": 0.23792684078216553, + "learning_rate": 1.306807874834428e-05, + "loss": 0.0721, + "step": 38827 + }, + { + "epoch": 0.6925409338993329, + "grad_norm": 0.270885169506073, + "learning_rate": 1.3066710985783136e-05, + "loss": 0.1018, + "step": 38828 + }, + { + "epoch": 0.6925587700210466, + "grad_norm": 0.374444842338562, + "learning_rate": 1.3065343269479547e-05, + "loss": 0.1365, + "step": 38829 + }, + { + "epoch": 0.6925766061427603, + "grad_norm": 0.22914782166481018, + "learning_rate": 1.3063975599438797e-05, + "loss": 0.1283, + "step": 38830 + }, + { + "epoch": 0.692594442264474, + "grad_norm": 0.2915685772895813, + "learning_rate": 1.3062607975666208e-05, + "loss": 0.1056, + "step": 38831 + }, + { + "epoch": 0.6926122783861877, + "grad_norm": 0.21610741317272186, + "learning_rate": 1.3061240398167069e-05, + "loss": 0.1367, + "step": 38832 + }, + { + "epoch": 0.6926301145079014, + "grad_norm": 0.27557480335235596, + "learning_rate": 1.305987286694669e-05, + "loss": 0.1422, + "step": 38833 + }, + { + "epoch": 0.6926479506296152, + "grad_norm": 0.27477696537971497, + "learning_rate": 1.305850538201037e-05, + "loss": 0.1325, + "step": 38834 + }, + { + "epoch": 0.6926657867513288, + "grad_norm": 0.23252776265144348, + "learning_rate": 1.3057137943363396e-05, + "loss": 0.0904, + "step": 38835 + }, + { + "epoch": 0.6926836228730425, + "grad_norm": 0.2752423584461212, + "learning_rate": 1.3055770551011093e-05, + "loss": 0.1081, + "step": 38836 + }, + { + "epoch": 0.6927014589947562, + "grad_norm": 0.29582762718200684, + "learning_rate": 1.3054403204958749e-05, + "loss": 0.1395, + "step": 38837 + }, + { + "epoch": 0.6927192951164699, + "grad_norm": 0.2734878659248352, + "learning_rate": 1.305303590521166e-05, + "loss": 0.1415, + "step": 38838 + }, + { + "epoch": 0.6927371312381836, + "grad_norm": 0.285553902387619, + "learning_rate": 1.3051668651775134e-05, + "loss": 0.1734, + "step": 38839 + }, + { + "epoch": 0.6927549673598973, + "grad_norm": 0.24469208717346191, + "learning_rate": 1.3050301444654456e-05, + "loss": 0.1272, + "step": 38840 + }, + { + "epoch": 0.692772803481611, + "grad_norm": 0.2895817458629608, + "learning_rate": 1.3048934283854946e-05, + "loss": 0.108, + "step": 38841 + }, + { + "epoch": 0.6927906396033247, + "grad_norm": 0.3367748558521271, + "learning_rate": 1.3047567169381897e-05, + "loss": 0.1599, + "step": 38842 + }, + { + "epoch": 0.6928084757250383, + "grad_norm": 0.25952938199043274, + "learning_rate": 1.3046200101240602e-05, + "loss": 0.1119, + "step": 38843 + }, + { + "epoch": 0.692826311846752, + "grad_norm": 0.2831670045852661, + "learning_rate": 1.3044833079436359e-05, + "loss": 0.0843, + "step": 38844 + }, + { + "epoch": 0.6928441479684657, + "grad_norm": 0.37477907538414, + "learning_rate": 1.3043466103974478e-05, + "loss": 0.1078, + "step": 38845 + }, + { + "epoch": 0.6928619840901794, + "grad_norm": 0.3395490348339081, + "learning_rate": 1.3042099174860242e-05, + "loss": 0.0994, + "step": 38846 + }, + { + "epoch": 0.6928798202118931, + "grad_norm": 0.24265864491462708, + "learning_rate": 1.3040732292098973e-05, + "loss": 0.1704, + "step": 38847 + }, + { + "epoch": 0.6928976563336068, + "grad_norm": 0.2586183249950409, + "learning_rate": 1.3039365455695943e-05, + "loss": 0.1481, + "step": 38848 + }, + { + "epoch": 0.6929154924553205, + "grad_norm": 0.3862290382385254, + "learning_rate": 1.3037998665656475e-05, + "loss": 0.1619, + "step": 38849 + }, + { + "epoch": 0.6929333285770342, + "grad_norm": 0.2527387738227844, + "learning_rate": 1.3036631921985854e-05, + "loss": 0.1338, + "step": 38850 + }, + { + "epoch": 0.692951164698748, + "grad_norm": 0.26722294092178345, + "learning_rate": 1.303526522468938e-05, + "loss": 0.1484, + "step": 38851 + }, + { + "epoch": 0.6929690008204616, + "grad_norm": 0.26204484701156616, + "learning_rate": 1.3033898573772355e-05, + "loss": 0.139, + "step": 38852 + }, + { + "epoch": 0.6929868369421753, + "grad_norm": 0.2863484025001526, + "learning_rate": 1.3032531969240058e-05, + "loss": 0.108, + "step": 38853 + }, + { + "epoch": 0.693004673063889, + "grad_norm": 0.19015048444271088, + "learning_rate": 1.3031165411097813e-05, + "loss": 0.0938, + "step": 38854 + }, + { + "epoch": 0.6930225091856027, + "grad_norm": 0.35718753933906555, + "learning_rate": 1.3029798899350904e-05, + "loss": 0.1072, + "step": 38855 + }, + { + "epoch": 0.6930403453073164, + "grad_norm": 0.203440323472023, + "learning_rate": 1.3028432434004625e-05, + "loss": 0.0937, + "step": 38856 + }, + { + "epoch": 0.6930581814290301, + "grad_norm": 0.1964787393808365, + "learning_rate": 1.302706601506427e-05, + "loss": 0.1316, + "step": 38857 + }, + { + "epoch": 0.6930760175507438, + "grad_norm": 0.26423177123069763, + "learning_rate": 1.3025699642535152e-05, + "loss": 0.1303, + "step": 38858 + }, + { + "epoch": 0.6930938536724575, + "grad_norm": 0.3048575818538666, + "learning_rate": 1.302433331642255e-05, + "loss": 0.1029, + "step": 38859 + }, + { + "epoch": 0.6931116897941711, + "grad_norm": 0.2601907253265381, + "learning_rate": 1.3022967036731775e-05, + "loss": 0.0932, + "step": 38860 + }, + { + "epoch": 0.6931295259158848, + "grad_norm": 0.32533133029937744, + "learning_rate": 1.3021600803468109e-05, + "loss": 0.1552, + "step": 38861 + }, + { + "epoch": 0.6931473620375985, + "grad_norm": 0.25909945368766785, + "learning_rate": 1.3020234616636864e-05, + "loss": 0.1218, + "step": 38862 + }, + { + "epoch": 0.6931651981593122, + "grad_norm": 0.26075488328933716, + "learning_rate": 1.3018868476243328e-05, + "loss": 0.0956, + "step": 38863 + }, + { + "epoch": 0.6931830342810259, + "grad_norm": 0.2390093356370926, + "learning_rate": 1.3017502382292795e-05, + "loss": 0.1588, + "step": 38864 + }, + { + "epoch": 0.6932008704027396, + "grad_norm": 0.2447652667760849, + "learning_rate": 1.3016136334790563e-05, + "loss": 0.0589, + "step": 38865 + }, + { + "epoch": 0.6932187065244533, + "grad_norm": 0.3241710960865021, + "learning_rate": 1.3014770333741915e-05, + "loss": 0.1923, + "step": 38866 + }, + { + "epoch": 0.693236542646167, + "grad_norm": 0.25113940238952637, + "learning_rate": 1.3013404379152167e-05, + "loss": 0.0962, + "step": 38867 + }, + { + "epoch": 0.6932543787678808, + "grad_norm": 0.25811827182769775, + "learning_rate": 1.30120384710266e-05, + "loss": 0.1222, + "step": 38868 + }, + { + "epoch": 0.6932722148895945, + "grad_norm": 0.44056758284568787, + "learning_rate": 1.3010672609370517e-05, + "loss": 0.1556, + "step": 38869 + }, + { + "epoch": 0.6932900510113081, + "grad_norm": 0.3130090832710266, + "learning_rate": 1.3009306794189197e-05, + "loss": 0.1001, + "step": 38870 + }, + { + "epoch": 0.6933078871330218, + "grad_norm": 0.27695757150650024, + "learning_rate": 1.3007941025487955e-05, + "loss": 0.101, + "step": 38871 + }, + { + "epoch": 0.6933257232547355, + "grad_norm": 0.3238756060600281, + "learning_rate": 1.3006575303272076e-05, + "loss": 0.0817, + "step": 38872 + }, + { + "epoch": 0.6933435593764492, + "grad_norm": 0.33694007992744446, + "learning_rate": 1.3005209627546844e-05, + "loss": 0.1778, + "step": 38873 + }, + { + "epoch": 0.6933613954981629, + "grad_norm": 0.45675989985466003, + "learning_rate": 1.3003843998317568e-05, + "loss": 0.1893, + "step": 38874 + }, + { + "epoch": 0.6933792316198766, + "grad_norm": 0.19218678772449493, + "learning_rate": 1.300247841558953e-05, + "loss": 0.1107, + "step": 38875 + }, + { + "epoch": 0.6933970677415903, + "grad_norm": 0.29433581233024597, + "learning_rate": 1.3001112879368038e-05, + "loss": 0.1027, + "step": 38876 + }, + { + "epoch": 0.693414903863304, + "grad_norm": 0.2407170534133911, + "learning_rate": 1.2999747389658378e-05, + "loss": 0.0948, + "step": 38877 + }, + { + "epoch": 0.6934327399850176, + "grad_norm": 0.25088241696357727, + "learning_rate": 1.2998381946465842e-05, + "loss": 0.1291, + "step": 38878 + }, + { + "epoch": 0.6934505761067313, + "grad_norm": 0.2478271871805191, + "learning_rate": 1.2997016549795713e-05, + "loss": 0.0933, + "step": 38879 + }, + { + "epoch": 0.693468412228445, + "grad_norm": 0.321100115776062, + "learning_rate": 1.2995651199653302e-05, + "loss": 0.1406, + "step": 38880 + }, + { + "epoch": 0.6934862483501587, + "grad_norm": 0.314493864774704, + "learning_rate": 1.2994285896043896e-05, + "loss": 0.11, + "step": 38881 + }, + { + "epoch": 0.6935040844718724, + "grad_norm": 0.2880284786224365, + "learning_rate": 1.2992920638972777e-05, + "loss": 0.1704, + "step": 38882 + }, + { + "epoch": 0.6935219205935861, + "grad_norm": 0.3332328796386719, + "learning_rate": 1.2991555428445243e-05, + "loss": 0.1212, + "step": 38883 + }, + { + "epoch": 0.6935397567152998, + "grad_norm": 0.3110034167766571, + "learning_rate": 1.2990190264466596e-05, + "loss": 0.1397, + "step": 38884 + }, + { + "epoch": 0.6935575928370136, + "grad_norm": 0.24640539288520813, + "learning_rate": 1.2988825147042116e-05, + "loss": 0.0876, + "step": 38885 + }, + { + "epoch": 0.6935754289587273, + "grad_norm": 0.23988419771194458, + "learning_rate": 1.2987460076177091e-05, + "loss": 0.1482, + "step": 38886 + }, + { + "epoch": 0.693593265080441, + "grad_norm": 0.28728044033050537, + "learning_rate": 1.2986095051876828e-05, + "loss": 0.1273, + "step": 38887 + }, + { + "epoch": 0.6936111012021546, + "grad_norm": 0.2788718342781067, + "learning_rate": 1.2984730074146603e-05, + "loss": 0.1374, + "step": 38888 + }, + { + "epoch": 0.6936289373238683, + "grad_norm": 0.20722664892673492, + "learning_rate": 1.298336514299172e-05, + "loss": 0.1402, + "step": 38889 + }, + { + "epoch": 0.693646773445582, + "grad_norm": 0.23695799708366394, + "learning_rate": 1.298200025841747e-05, + "loss": 0.1399, + "step": 38890 + }, + { + "epoch": 0.6936646095672957, + "grad_norm": 0.32240235805511475, + "learning_rate": 1.2980635420429133e-05, + "loss": 0.1583, + "step": 38891 + }, + { + "epoch": 0.6936824456890094, + "grad_norm": 0.20412838459014893, + "learning_rate": 1.2979270629031997e-05, + "loss": 0.1178, + "step": 38892 + }, + { + "epoch": 0.6937002818107231, + "grad_norm": 0.25681132078170776, + "learning_rate": 1.2977905884231367e-05, + "loss": 0.0894, + "step": 38893 + }, + { + "epoch": 0.6937181179324368, + "grad_norm": 0.34713780879974365, + "learning_rate": 1.297654118603253e-05, + "loss": 0.1234, + "step": 38894 + }, + { + "epoch": 0.6937359540541504, + "grad_norm": 0.22974295914173126, + "learning_rate": 1.297517653444077e-05, + "loss": 0.1347, + "step": 38895 + }, + { + "epoch": 0.6937537901758641, + "grad_norm": 0.32635554671287537, + "learning_rate": 1.2973811929461372e-05, + "loss": 0.1592, + "step": 38896 + }, + { + "epoch": 0.6937716262975778, + "grad_norm": 0.21161885559558868, + "learning_rate": 1.2972447371099639e-05, + "loss": 0.1234, + "step": 38897 + }, + { + "epoch": 0.6937894624192915, + "grad_norm": 0.3161948323249817, + "learning_rate": 1.2971082859360854e-05, + "loss": 0.0781, + "step": 38898 + }, + { + "epoch": 0.6938072985410052, + "grad_norm": 0.26616141200065613, + "learning_rate": 1.2969718394250308e-05, + "loss": 0.0947, + "step": 38899 + }, + { + "epoch": 0.6938251346627189, + "grad_norm": 0.2649553120136261, + "learning_rate": 1.2968353975773279e-05, + "loss": 0.1061, + "step": 38900 + }, + { + "epoch": 0.6938429707844326, + "grad_norm": 0.291670560836792, + "learning_rate": 1.296698960393507e-05, + "loss": 0.0881, + "step": 38901 + }, + { + "epoch": 0.6938608069061464, + "grad_norm": 0.31786757707595825, + "learning_rate": 1.2965625278740962e-05, + "loss": 0.1248, + "step": 38902 + }, + { + "epoch": 0.6938786430278601, + "grad_norm": 0.29399222135543823, + "learning_rate": 1.2964261000196255e-05, + "loss": 0.1122, + "step": 38903 + }, + { + "epoch": 0.6938964791495738, + "grad_norm": 0.3356807231903076, + "learning_rate": 1.2962896768306229e-05, + "loss": 0.1648, + "step": 38904 + }, + { + "epoch": 0.6939143152712874, + "grad_norm": 0.3318850100040436, + "learning_rate": 1.2961532583076163e-05, + "loss": 0.1707, + "step": 38905 + }, + { + "epoch": 0.6939321513930011, + "grad_norm": 0.23528605699539185, + "learning_rate": 1.2960168444511367e-05, + "loss": 0.1054, + "step": 38906 + }, + { + "epoch": 0.6939499875147148, + "grad_norm": 0.18605659902095795, + "learning_rate": 1.2958804352617114e-05, + "loss": 0.0833, + "step": 38907 + }, + { + "epoch": 0.6939678236364285, + "grad_norm": 0.23467952013015747, + "learning_rate": 1.2957440307398694e-05, + "loss": 0.1071, + "step": 38908 + }, + { + "epoch": 0.6939856597581422, + "grad_norm": 0.39236992597579956, + "learning_rate": 1.2956076308861385e-05, + "loss": 0.1232, + "step": 38909 + }, + { + "epoch": 0.6940034958798559, + "grad_norm": 0.21431805193424225, + "learning_rate": 1.2954712357010494e-05, + "loss": 0.0985, + "step": 38910 + }, + { + "epoch": 0.6940213320015696, + "grad_norm": 0.3000572919845581, + "learning_rate": 1.29533484518513e-05, + "loss": 0.1362, + "step": 38911 + }, + { + "epoch": 0.6940391681232833, + "grad_norm": 0.29470011591911316, + "learning_rate": 1.2951984593389082e-05, + "loss": 0.1309, + "step": 38912 + }, + { + "epoch": 0.694057004244997, + "grad_norm": 0.22623272240161896, + "learning_rate": 1.295062078162913e-05, + "loss": 0.0798, + "step": 38913 + }, + { + "epoch": 0.6940748403667106, + "grad_norm": 0.2676866948604584, + "learning_rate": 1.294925701657674e-05, + "loss": 0.1128, + "step": 38914 + }, + { + "epoch": 0.6940926764884243, + "grad_norm": 0.2591826319694519, + "learning_rate": 1.2947893298237185e-05, + "loss": 0.1032, + "step": 38915 + }, + { + "epoch": 0.694110512610138, + "grad_norm": 0.23538632690906525, + "learning_rate": 1.2946529626615767e-05, + "loss": 0.1331, + "step": 38916 + }, + { + "epoch": 0.6941283487318517, + "grad_norm": 0.26029500365257263, + "learning_rate": 1.2945166001717752e-05, + "loss": 0.1133, + "step": 38917 + }, + { + "epoch": 0.6941461848535655, + "grad_norm": 0.2218397855758667, + "learning_rate": 1.2943802423548446e-05, + "loss": 0.1238, + "step": 38918 + }, + { + "epoch": 0.6941640209752792, + "grad_norm": 0.256335586309433, + "learning_rate": 1.2942438892113129e-05, + "loss": 0.1185, + "step": 38919 + }, + { + "epoch": 0.6941818570969929, + "grad_norm": 0.26803240180015564, + "learning_rate": 1.294107540741708e-05, + "loss": 0.132, + "step": 38920 + }, + { + "epoch": 0.6941996932187066, + "grad_norm": 0.3438301980495453, + "learning_rate": 1.2939711969465589e-05, + "loss": 0.1485, + "step": 38921 + }, + { + "epoch": 0.6942175293404202, + "grad_norm": 0.19693708419799805, + "learning_rate": 1.2938348578263934e-05, + "loss": 0.0992, + "step": 38922 + }, + { + "epoch": 0.6942353654621339, + "grad_norm": 0.46287044882774353, + "learning_rate": 1.2936985233817411e-05, + "loss": 0.1429, + "step": 38923 + }, + { + "epoch": 0.6942532015838476, + "grad_norm": 0.24437500536441803, + "learning_rate": 1.2935621936131304e-05, + "loss": 0.1195, + "step": 38924 + }, + { + "epoch": 0.6942710377055613, + "grad_norm": 0.3722871243953705, + "learning_rate": 1.2934258685210887e-05, + "loss": 0.0954, + "step": 38925 + }, + { + "epoch": 0.694288873827275, + "grad_norm": 0.4296792447566986, + "learning_rate": 1.2932895481061447e-05, + "loss": 0.153, + "step": 38926 + }, + { + "epoch": 0.6943067099489887, + "grad_norm": 0.21547015011310577, + "learning_rate": 1.2931532323688278e-05, + "loss": 0.0986, + "step": 38927 + }, + { + "epoch": 0.6943245460707024, + "grad_norm": 0.3268570899963379, + "learning_rate": 1.293016921309666e-05, + "loss": 0.138, + "step": 38928 + }, + { + "epoch": 0.6943423821924161, + "grad_norm": 0.30655694007873535, + "learning_rate": 1.2928806149291865e-05, + "loss": 0.1203, + "step": 38929 + }, + { + "epoch": 0.6943602183141298, + "grad_norm": 0.2059771716594696, + "learning_rate": 1.2927443132279186e-05, + "loss": 0.1134, + "step": 38930 + }, + { + "epoch": 0.6943780544358434, + "grad_norm": 0.20445942878723145, + "learning_rate": 1.2926080162063917e-05, + "loss": 0.0763, + "step": 38931 + }, + { + "epoch": 0.6943958905575571, + "grad_norm": 0.2293536514043808, + "learning_rate": 1.292471723865133e-05, + "loss": 0.1181, + "step": 38932 + }, + { + "epoch": 0.6944137266792708, + "grad_norm": 0.27545976638793945, + "learning_rate": 1.2923354362046711e-05, + "loss": 0.1101, + "step": 38933 + }, + { + "epoch": 0.6944315628009845, + "grad_norm": 0.258241206407547, + "learning_rate": 1.2921991532255343e-05, + "loss": 0.1236, + "step": 38934 + }, + { + "epoch": 0.6944493989226983, + "grad_norm": 0.2016286998987198, + "learning_rate": 1.2920628749282495e-05, + "loss": 0.0911, + "step": 38935 + }, + { + "epoch": 0.694467235044412, + "grad_norm": 0.29968076944351196, + "learning_rate": 1.2919266013133475e-05, + "loss": 0.0789, + "step": 38936 + }, + { + "epoch": 0.6944850711661257, + "grad_norm": 0.2846897840499878, + "learning_rate": 1.2917903323813549e-05, + "loss": 0.1421, + "step": 38937 + }, + { + "epoch": 0.6945029072878394, + "grad_norm": 0.2914271056652069, + "learning_rate": 1.2916540681328004e-05, + "loss": 0.1312, + "step": 38938 + }, + { + "epoch": 0.694520743409553, + "grad_norm": 0.2852572500705719, + "learning_rate": 1.2915178085682112e-05, + "loss": 0.096, + "step": 38939 + }, + { + "epoch": 0.6945385795312667, + "grad_norm": 0.2819148302078247, + "learning_rate": 1.2913815536881174e-05, + "loss": 0.0864, + "step": 38940 + }, + { + "epoch": 0.6945564156529804, + "grad_norm": 0.2402840554714203, + "learning_rate": 1.291245303493046e-05, + "loss": 0.1043, + "step": 38941 + }, + { + "epoch": 0.6945742517746941, + "grad_norm": 0.3452276587486267, + "learning_rate": 1.2911090579835244e-05, + "loss": 0.1598, + "step": 38942 + }, + { + "epoch": 0.6945920878964078, + "grad_norm": 0.17727108299732208, + "learning_rate": 1.2909728171600824e-05, + "loss": 0.0892, + "step": 38943 + }, + { + "epoch": 0.6946099240181215, + "grad_norm": 0.2905954122543335, + "learning_rate": 1.2908365810232465e-05, + "loss": 0.1601, + "step": 38944 + }, + { + "epoch": 0.6946277601398352, + "grad_norm": 0.32615017890930176, + "learning_rate": 1.2907003495735467e-05, + "loss": 0.1255, + "step": 38945 + }, + { + "epoch": 0.6946455962615489, + "grad_norm": 0.31594735383987427, + "learning_rate": 1.2905641228115101e-05, + "loss": 0.1319, + "step": 38946 + }, + { + "epoch": 0.6946634323832626, + "grad_norm": 0.3191177546977997, + "learning_rate": 1.2904279007376644e-05, + "loss": 0.1912, + "step": 38947 + }, + { + "epoch": 0.6946812685049762, + "grad_norm": 0.19566045701503754, + "learning_rate": 1.2902916833525369e-05, + "loss": 0.0796, + "step": 38948 + }, + { + "epoch": 0.6946991046266899, + "grad_norm": 0.27914050221443176, + "learning_rate": 1.2901554706566581e-05, + "loss": 0.1313, + "step": 38949 + }, + { + "epoch": 0.6947169407484036, + "grad_norm": 0.3291770815849304, + "learning_rate": 1.2900192626505541e-05, + "loss": 0.1646, + "step": 38950 + }, + { + "epoch": 0.6947347768701173, + "grad_norm": 0.2316390722990036, + "learning_rate": 1.2898830593347538e-05, + "loss": 0.1279, + "step": 38951 + }, + { + "epoch": 0.6947526129918311, + "grad_norm": 0.2263980209827423, + "learning_rate": 1.2897468607097835e-05, + "loss": 0.0637, + "step": 38952 + }, + { + "epoch": 0.6947704491135448, + "grad_norm": 0.2116321623325348, + "learning_rate": 1.2896106667761732e-05, + "loss": 0.1148, + "step": 38953 + }, + { + "epoch": 0.6947882852352585, + "grad_norm": 0.3965723514556885, + "learning_rate": 1.2894744775344503e-05, + "loss": 0.1399, + "step": 38954 + }, + { + "epoch": 0.6948061213569722, + "grad_norm": 0.2113034874200821, + "learning_rate": 1.2893382929851423e-05, + "loss": 0.1431, + "step": 38955 + }, + { + "epoch": 0.6948239574786859, + "grad_norm": 0.21018700301647186, + "learning_rate": 1.2892021131287763e-05, + "loss": 0.0907, + "step": 38956 + }, + { + "epoch": 0.6948417936003995, + "grad_norm": 0.35073137283325195, + "learning_rate": 1.289065937965881e-05, + "loss": 0.1327, + "step": 38957 + }, + { + "epoch": 0.6948596297221132, + "grad_norm": 0.2525736689567566, + "learning_rate": 1.2889297674969853e-05, + "loss": 0.1081, + "step": 38958 + }, + { + "epoch": 0.6948774658438269, + "grad_norm": 0.3054443895816803, + "learning_rate": 1.2887936017226159e-05, + "loss": 0.1325, + "step": 38959 + }, + { + "epoch": 0.6948953019655406, + "grad_norm": 0.22958900034427643, + "learning_rate": 1.2886574406433014e-05, + "loss": 0.0847, + "step": 38960 + }, + { + "epoch": 0.6949131380872543, + "grad_norm": 0.4076766073703766, + "learning_rate": 1.2885212842595678e-05, + "loss": 0.1571, + "step": 38961 + }, + { + "epoch": 0.694930974208968, + "grad_norm": 0.23542045056819916, + "learning_rate": 1.288385132571945e-05, + "loss": 0.1066, + "step": 38962 + }, + { + "epoch": 0.6949488103306817, + "grad_norm": 0.22955989837646484, + "learning_rate": 1.2882489855809602e-05, + "loss": 0.1229, + "step": 38963 + }, + { + "epoch": 0.6949666464523954, + "grad_norm": 0.21638444066047668, + "learning_rate": 1.2881128432871406e-05, + "loss": 0.1171, + "step": 38964 + }, + { + "epoch": 0.694984482574109, + "grad_norm": 0.21046389639377594, + "learning_rate": 1.2879767056910133e-05, + "loss": 0.083, + "step": 38965 + }, + { + "epoch": 0.6950023186958227, + "grad_norm": 0.22104981541633606, + "learning_rate": 1.2878405727931078e-05, + "loss": 0.1171, + "step": 38966 + }, + { + "epoch": 0.6950201548175364, + "grad_norm": 0.2039240151643753, + "learning_rate": 1.287704444593951e-05, + "loss": 0.071, + "step": 38967 + }, + { + "epoch": 0.6950379909392501, + "grad_norm": 0.25528669357299805, + "learning_rate": 1.2875683210940704e-05, + "loss": 0.1463, + "step": 38968 + }, + { + "epoch": 0.6950558270609639, + "grad_norm": 0.6063457727432251, + "learning_rate": 1.2874322022939927e-05, + "loss": 0.1915, + "step": 38969 + }, + { + "epoch": 0.6950736631826776, + "grad_norm": 0.27676084637641907, + "learning_rate": 1.2872960881942481e-05, + "loss": 0.1479, + "step": 38970 + }, + { + "epoch": 0.6950914993043913, + "grad_norm": 0.21087561547756195, + "learning_rate": 1.2871599787953612e-05, + "loss": 0.0934, + "step": 38971 + }, + { + "epoch": 0.695109335426105, + "grad_norm": 0.4068029820919037, + "learning_rate": 1.2870238740978623e-05, + "loss": 0.1315, + "step": 38972 + }, + { + "epoch": 0.6951271715478187, + "grad_norm": 0.28620705008506775, + "learning_rate": 1.2868877741022778e-05, + "loss": 0.1258, + "step": 38973 + }, + { + "epoch": 0.6951450076695324, + "grad_norm": 0.2266935408115387, + "learning_rate": 1.2867516788091342e-05, + "loss": 0.1244, + "step": 38974 + }, + { + "epoch": 0.695162843791246, + "grad_norm": 0.28696364164352417, + "learning_rate": 1.2866155882189613e-05, + "loss": 0.1277, + "step": 38975 + }, + { + "epoch": 0.6951806799129597, + "grad_norm": 0.24171146750450134, + "learning_rate": 1.2864795023322851e-05, + "loss": 0.0879, + "step": 38976 + }, + { + "epoch": 0.6951985160346734, + "grad_norm": 0.1903069168329239, + "learning_rate": 1.2863434211496339e-05, + "loss": 0.0619, + "step": 38977 + }, + { + "epoch": 0.6952163521563871, + "grad_norm": 0.3499768078327179, + "learning_rate": 1.2862073446715333e-05, + "loss": 0.1452, + "step": 38978 + }, + { + "epoch": 0.6952341882781008, + "grad_norm": 0.31532683968544006, + "learning_rate": 1.2860712728985137e-05, + "loss": 0.097, + "step": 38979 + }, + { + "epoch": 0.6952520243998145, + "grad_norm": 0.4439140856266022, + "learning_rate": 1.2859352058311011e-05, + "loss": 0.1453, + "step": 38980 + }, + { + "epoch": 0.6952698605215282, + "grad_norm": 0.21643291413784027, + "learning_rate": 1.285799143469823e-05, + "loss": 0.1435, + "step": 38981 + }, + { + "epoch": 0.6952876966432419, + "grad_norm": 0.2755575180053711, + "learning_rate": 1.2856630858152056e-05, + "loss": 0.1594, + "step": 38982 + }, + { + "epoch": 0.6953055327649555, + "grad_norm": 0.3381027579307556, + "learning_rate": 1.2855270328677784e-05, + "loss": 0.1605, + "step": 38983 + }, + { + "epoch": 0.6953233688866692, + "grad_norm": 0.2376745492219925, + "learning_rate": 1.285390984628067e-05, + "loss": 0.1269, + "step": 38984 + }, + { + "epoch": 0.6953412050083829, + "grad_norm": 0.2377690225839615, + "learning_rate": 1.2852549410966009e-05, + "loss": 0.1513, + "step": 38985 + }, + { + "epoch": 0.6953590411300967, + "grad_norm": 0.3392479419708252, + "learning_rate": 1.285118902273906e-05, + "loss": 0.1363, + "step": 38986 + }, + { + "epoch": 0.6953768772518104, + "grad_norm": 0.28568512201309204, + "learning_rate": 1.2849828681605089e-05, + "loss": 0.1166, + "step": 38987 + }, + { + "epoch": 0.6953947133735241, + "grad_norm": 0.27334463596343994, + "learning_rate": 1.2848468387569388e-05, + "loss": 0.1503, + "step": 38988 + }, + { + "epoch": 0.6954125494952378, + "grad_norm": 0.2065826803445816, + "learning_rate": 1.284710814063722e-05, + "loss": 0.1009, + "step": 38989 + }, + { + "epoch": 0.6954303856169515, + "grad_norm": 0.24931828677654266, + "learning_rate": 1.2845747940813857e-05, + "loss": 0.0686, + "step": 38990 + }, + { + "epoch": 0.6954482217386652, + "grad_norm": 0.3238942325115204, + "learning_rate": 1.2844387788104567e-05, + "loss": 0.1065, + "step": 38991 + }, + { + "epoch": 0.6954660578603789, + "grad_norm": 0.2807333469390869, + "learning_rate": 1.2843027682514635e-05, + "loss": 0.1324, + "step": 38992 + }, + { + "epoch": 0.6954838939820925, + "grad_norm": 0.3370315730571747, + "learning_rate": 1.2841667624049331e-05, + "loss": 0.1382, + "step": 38993 + }, + { + "epoch": 0.6955017301038062, + "grad_norm": 0.237172931432724, + "learning_rate": 1.2840307612713917e-05, + "loss": 0.1084, + "step": 38994 + }, + { + "epoch": 0.6955195662255199, + "grad_norm": 0.38796600699424744, + "learning_rate": 1.2838947648513663e-05, + "loss": 0.123, + "step": 38995 + }, + { + "epoch": 0.6955374023472336, + "grad_norm": 0.28400012850761414, + "learning_rate": 1.283758773145386e-05, + "loss": 0.1085, + "step": 38996 + }, + { + "epoch": 0.6955552384689473, + "grad_norm": 0.2858881652355194, + "learning_rate": 1.2836227861539762e-05, + "loss": 0.0826, + "step": 38997 + }, + { + "epoch": 0.695573074590661, + "grad_norm": 0.4085495173931122, + "learning_rate": 1.2834868038776644e-05, + "loss": 0.1512, + "step": 38998 + }, + { + "epoch": 0.6955909107123747, + "grad_norm": 0.3054777979850769, + "learning_rate": 1.2833508263169783e-05, + "loss": 0.179, + "step": 38999 + }, + { + "epoch": 0.6956087468340884, + "grad_norm": 0.277476042509079, + "learning_rate": 1.2832148534724439e-05, + "loss": 0.0871, + "step": 39000 + }, + { + "epoch": 0.6956087468340884, + "eval_loss": 0.11643356084823608, + "eval_runtime": 106.9618, + "eval_samples_per_second": 9.574, + "eval_steps_per_second": 1.599, + "step": 39000 + }, + { + "epoch": 0.695626582955802, + "grad_norm": 0.25121405720710754, + "learning_rate": 1.28307888534459e-05, + "loss": 0.1163, + "step": 39001 + }, + { + "epoch": 0.6956444190775157, + "grad_norm": 0.21921661496162415, + "learning_rate": 1.2829429219339423e-05, + "loss": 0.1413, + "step": 39002 + }, + { + "epoch": 0.6956622551992295, + "grad_norm": 0.20281681418418884, + "learning_rate": 1.2828069632410286e-05, + "loss": 0.0809, + "step": 39003 + }, + { + "epoch": 0.6956800913209432, + "grad_norm": 0.21603518724441528, + "learning_rate": 1.2826710092663747e-05, + "loss": 0.133, + "step": 39004 + }, + { + "epoch": 0.6956979274426569, + "grad_norm": 0.3089606463909149, + "learning_rate": 1.2825350600105088e-05, + "loss": 0.1543, + "step": 39005 + }, + { + "epoch": 0.6957157635643706, + "grad_norm": 0.2746087312698364, + "learning_rate": 1.2823991154739579e-05, + "loss": 0.1284, + "step": 39006 + }, + { + "epoch": 0.6957335996860843, + "grad_norm": 0.302846223115921, + "learning_rate": 1.2822631756572484e-05, + "loss": 0.0938, + "step": 39007 + }, + { + "epoch": 0.695751435807798, + "grad_norm": 0.23596924543380737, + "learning_rate": 1.2821272405609064e-05, + "loss": 0.0848, + "step": 39008 + }, + { + "epoch": 0.6957692719295117, + "grad_norm": 0.2683382034301758, + "learning_rate": 1.281991310185461e-05, + "loss": 0.0836, + "step": 39009 + }, + { + "epoch": 0.6957871080512253, + "grad_norm": 0.21829615533351898, + "learning_rate": 1.281855384531438e-05, + "loss": 0.102, + "step": 39010 + }, + { + "epoch": 0.695804944172939, + "grad_norm": 0.26865509152412415, + "learning_rate": 1.281719463599364e-05, + "loss": 0.1183, + "step": 39011 + }, + { + "epoch": 0.6958227802946527, + "grad_norm": 0.3482833504676819, + "learning_rate": 1.2815835473897655e-05, + "loss": 0.1298, + "step": 39012 + }, + { + "epoch": 0.6958406164163664, + "grad_norm": 0.27884677052497864, + "learning_rate": 1.28144763590317e-05, + "loss": 0.1173, + "step": 39013 + }, + { + "epoch": 0.6958584525380801, + "grad_norm": 0.21725775301456451, + "learning_rate": 1.281311729140105e-05, + "loss": 0.1192, + "step": 39014 + }, + { + "epoch": 0.6958762886597938, + "grad_norm": 0.27511894702911377, + "learning_rate": 1.2811758271010968e-05, + "loss": 0.1132, + "step": 39015 + }, + { + "epoch": 0.6958941247815075, + "grad_norm": 0.23363026976585388, + "learning_rate": 1.2810399297866721e-05, + "loss": 0.0774, + "step": 39016 + }, + { + "epoch": 0.6959119609032212, + "grad_norm": 0.25134822726249695, + "learning_rate": 1.2809040371973568e-05, + "loss": 0.1259, + "step": 39017 + }, + { + "epoch": 0.6959297970249348, + "grad_norm": 0.25535765290260315, + "learning_rate": 1.2807681493336792e-05, + "loss": 0.1113, + "step": 39018 + }, + { + "epoch": 0.6959476331466486, + "grad_norm": 0.2510998845100403, + "learning_rate": 1.2806322661961656e-05, + "loss": 0.0755, + "step": 39019 + }, + { + "epoch": 0.6959654692683623, + "grad_norm": 0.281025230884552, + "learning_rate": 1.280496387785342e-05, + "loss": 0.0903, + "step": 39020 + }, + { + "epoch": 0.695983305390076, + "grad_norm": 0.3614897131919861, + "learning_rate": 1.2803605141017354e-05, + "loss": 0.0657, + "step": 39021 + }, + { + "epoch": 0.6960011415117897, + "grad_norm": 0.2231587916612625, + "learning_rate": 1.2802246451458732e-05, + "loss": 0.1364, + "step": 39022 + }, + { + "epoch": 0.6960189776335034, + "grad_norm": 0.2773556709289551, + "learning_rate": 1.2800887809182815e-05, + "loss": 0.1211, + "step": 39023 + }, + { + "epoch": 0.6960368137552171, + "grad_norm": 0.21469847857952118, + "learning_rate": 1.2799529214194872e-05, + "loss": 0.1084, + "step": 39024 + }, + { + "epoch": 0.6960546498769308, + "grad_norm": 0.29153093695640564, + "learning_rate": 1.2798170666500158e-05, + "loss": 0.1268, + "step": 39025 + }, + { + "epoch": 0.6960724859986445, + "grad_norm": 0.24175620079040527, + "learning_rate": 1.2796812166103961e-05, + "loss": 0.1118, + "step": 39026 + }, + { + "epoch": 0.6960903221203582, + "grad_norm": 0.26647093892097473, + "learning_rate": 1.2795453713011525e-05, + "loss": 0.1282, + "step": 39027 + }, + { + "epoch": 0.6961081582420718, + "grad_norm": 0.23232296109199524, + "learning_rate": 1.279409530722813e-05, + "loss": 0.1094, + "step": 39028 + }, + { + "epoch": 0.6961259943637855, + "grad_norm": 0.2349260449409485, + "learning_rate": 1.279273694875904e-05, + "loss": 0.1065, + "step": 39029 + }, + { + "epoch": 0.6961438304854992, + "grad_norm": 0.2272673398256302, + "learning_rate": 1.2791378637609508e-05, + "loss": 0.1101, + "step": 39030 + }, + { + "epoch": 0.6961616666072129, + "grad_norm": 0.3893534243106842, + "learning_rate": 1.2790020373784822e-05, + "loss": 0.1434, + "step": 39031 + }, + { + "epoch": 0.6961795027289266, + "grad_norm": 0.24964436888694763, + "learning_rate": 1.2788662157290231e-05, + "loss": 0.1003, + "step": 39032 + }, + { + "epoch": 0.6961973388506403, + "grad_norm": 0.2861902713775635, + "learning_rate": 1.2787303988131003e-05, + "loss": 0.083, + "step": 39033 + }, + { + "epoch": 0.696215174972354, + "grad_norm": 0.17898684740066528, + "learning_rate": 1.2785945866312393e-05, + "loss": 0.0843, + "step": 39034 + }, + { + "epoch": 0.6962330110940677, + "grad_norm": 0.30016759037971497, + "learning_rate": 1.2784587791839686e-05, + "loss": 0.0726, + "step": 39035 + }, + { + "epoch": 0.6962508472157815, + "grad_norm": 0.3075457811355591, + "learning_rate": 1.2783229764718136e-05, + "loss": 0.091, + "step": 39036 + }, + { + "epoch": 0.6962686833374951, + "grad_norm": 0.4590596854686737, + "learning_rate": 1.278187178495301e-05, + "loss": 0.1639, + "step": 39037 + }, + { + "epoch": 0.6962865194592088, + "grad_norm": 0.3507436513900757, + "learning_rate": 1.2780513852549564e-05, + "loss": 0.1888, + "step": 39038 + }, + { + "epoch": 0.6963043555809225, + "grad_norm": 0.18821682035923004, + "learning_rate": 1.2779155967513057e-05, + "loss": 0.1158, + "step": 39039 + }, + { + "epoch": 0.6963221917026362, + "grad_norm": 0.25792309641838074, + "learning_rate": 1.2777798129848768e-05, + "loss": 0.1029, + "step": 39040 + }, + { + "epoch": 0.6963400278243499, + "grad_norm": 0.2089913934469223, + "learning_rate": 1.2776440339561957e-05, + "loss": 0.0993, + "step": 39041 + }, + { + "epoch": 0.6963578639460636, + "grad_norm": 0.2833325266838074, + "learning_rate": 1.2775082596657889e-05, + "loss": 0.114, + "step": 39042 + }, + { + "epoch": 0.6963757000677773, + "grad_norm": 0.27349862456321716, + "learning_rate": 1.2773724901141815e-05, + "loss": 0.109, + "step": 39043 + }, + { + "epoch": 0.696393536189491, + "grad_norm": 0.34258368611335754, + "learning_rate": 1.2772367253019014e-05, + "loss": 0.1112, + "step": 39044 + }, + { + "epoch": 0.6964113723112046, + "grad_norm": 0.2869550287723541, + "learning_rate": 1.2771009652294741e-05, + "loss": 0.0941, + "step": 39045 + }, + { + "epoch": 0.6964292084329183, + "grad_norm": 0.31704646348953247, + "learning_rate": 1.2769652098974261e-05, + "loss": 0.1056, + "step": 39046 + }, + { + "epoch": 0.696447044554632, + "grad_norm": 0.33068206906318665, + "learning_rate": 1.2768294593062818e-05, + "loss": 0.1395, + "step": 39047 + }, + { + "epoch": 0.6964648806763457, + "grad_norm": 0.29258713126182556, + "learning_rate": 1.2766937134565704e-05, + "loss": 0.1912, + "step": 39048 + }, + { + "epoch": 0.6964827167980594, + "grad_norm": 0.2919904887676239, + "learning_rate": 1.2765579723488167e-05, + "loss": 0.0935, + "step": 39049 + }, + { + "epoch": 0.6965005529197731, + "grad_norm": 0.31889808177948, + "learning_rate": 1.2764222359835468e-05, + "loss": 0.1509, + "step": 39050 + }, + { + "epoch": 0.6965183890414868, + "grad_norm": 0.2491060197353363, + "learning_rate": 1.2762865043612868e-05, + "loss": 0.0383, + "step": 39051 + }, + { + "epoch": 0.6965362251632005, + "grad_norm": 0.20252561569213867, + "learning_rate": 1.276150777482562e-05, + "loss": 0.0704, + "step": 39052 + }, + { + "epoch": 0.6965540612849143, + "grad_norm": 0.2772665321826935, + "learning_rate": 1.2760150553479006e-05, + "loss": 0.1465, + "step": 39053 + }, + { + "epoch": 0.696571897406628, + "grad_norm": 0.24536865949630737, + "learning_rate": 1.2758793379578266e-05, + "loss": 0.1314, + "step": 39054 + }, + { + "epoch": 0.6965897335283416, + "grad_norm": 0.26596805453300476, + "learning_rate": 1.2757436253128679e-05, + "loss": 0.1249, + "step": 39055 + }, + { + "epoch": 0.6966075696500553, + "grad_norm": 0.17847612500190735, + "learning_rate": 1.2756079174135489e-05, + "loss": 0.0683, + "step": 39056 + }, + { + "epoch": 0.696625405771769, + "grad_norm": 0.2210547775030136, + "learning_rate": 1.2754722142603975e-05, + "loss": 0.0879, + "step": 39057 + }, + { + "epoch": 0.6966432418934827, + "grad_norm": 0.35463643074035645, + "learning_rate": 1.2753365158539387e-05, + "loss": 0.0852, + "step": 39058 + }, + { + "epoch": 0.6966610780151964, + "grad_norm": 0.36180663108825684, + "learning_rate": 1.2752008221946987e-05, + "loss": 0.1528, + "step": 39059 + }, + { + "epoch": 0.6966789141369101, + "grad_norm": 0.22196823358535767, + "learning_rate": 1.2750651332832019e-05, + "loss": 0.1089, + "step": 39060 + }, + { + "epoch": 0.6966967502586238, + "grad_norm": 0.32768359780311584, + "learning_rate": 1.274929449119977e-05, + "loss": 0.1207, + "step": 39061 + }, + { + "epoch": 0.6967145863803375, + "grad_norm": 0.30993857979774475, + "learning_rate": 1.274793769705549e-05, + "loss": 0.1051, + "step": 39062 + }, + { + "epoch": 0.6967324225020511, + "grad_norm": 0.42698532342910767, + "learning_rate": 1.274658095040443e-05, + "loss": 0.0899, + "step": 39063 + }, + { + "epoch": 0.6967502586237648, + "grad_norm": 0.26853567361831665, + "learning_rate": 1.2745224251251858e-05, + "loss": 0.1185, + "step": 39064 + }, + { + "epoch": 0.6967680947454785, + "grad_norm": 0.24525515735149384, + "learning_rate": 1.2743867599603019e-05, + "loss": 0.1257, + "step": 39065 + }, + { + "epoch": 0.6967859308671922, + "grad_norm": 0.21994318068027496, + "learning_rate": 1.2742510995463192e-05, + "loss": 0.0964, + "step": 39066 + }, + { + "epoch": 0.6968037669889059, + "grad_norm": 0.3300603926181793, + "learning_rate": 1.2741154438837616e-05, + "loss": 0.1464, + "step": 39067 + }, + { + "epoch": 0.6968216031106196, + "grad_norm": 0.18020690977573395, + "learning_rate": 1.2739797929731567e-05, + "loss": 0.0966, + "step": 39068 + }, + { + "epoch": 0.6968394392323333, + "grad_norm": 0.2755752503871918, + "learning_rate": 1.2738441468150286e-05, + "loss": 0.1517, + "step": 39069 + }, + { + "epoch": 0.6968572753540471, + "grad_norm": 0.26277220249176025, + "learning_rate": 1.2737085054099055e-05, + "loss": 0.1213, + "step": 39070 + }, + { + "epoch": 0.6968751114757608, + "grad_norm": 0.2911570966243744, + "learning_rate": 1.2735728687583116e-05, + "loss": 0.1182, + "step": 39071 + }, + { + "epoch": 0.6968929475974744, + "grad_norm": 0.2323385626077652, + "learning_rate": 1.2734372368607728e-05, + "loss": 0.0992, + "step": 39072 + }, + { + "epoch": 0.6969107837191881, + "grad_norm": 0.30016231536865234, + "learning_rate": 1.2733016097178139e-05, + "loss": 0.147, + "step": 39073 + }, + { + "epoch": 0.6969286198409018, + "grad_norm": 0.27335840463638306, + "learning_rate": 1.2731659873299625e-05, + "loss": 0.0779, + "step": 39074 + }, + { + "epoch": 0.6969464559626155, + "grad_norm": 0.23359227180480957, + "learning_rate": 1.2730303696977436e-05, + "loss": 0.1245, + "step": 39075 + }, + { + "epoch": 0.6969642920843292, + "grad_norm": 0.4167681038379669, + "learning_rate": 1.2728947568216828e-05, + "loss": 0.1173, + "step": 39076 + }, + { + "epoch": 0.6969821282060429, + "grad_norm": 0.33337992429733276, + "learning_rate": 1.2727591487023057e-05, + "loss": 0.1451, + "step": 39077 + }, + { + "epoch": 0.6969999643277566, + "grad_norm": 0.2162632793188095, + "learning_rate": 1.2726235453401369e-05, + "loss": 0.1221, + "step": 39078 + }, + { + "epoch": 0.6970178004494703, + "grad_norm": 0.2172084003686905, + "learning_rate": 1.2724879467357046e-05, + "loss": 0.1473, + "step": 39079 + }, + { + "epoch": 0.697035636571184, + "grad_norm": 0.2695470154285431, + "learning_rate": 1.2723523528895327e-05, + "loss": 0.1108, + "step": 39080 + }, + { + "epoch": 0.6970534726928976, + "grad_norm": 0.260147362947464, + "learning_rate": 1.272216763802146e-05, + "loss": 0.1053, + "step": 39081 + }, + { + "epoch": 0.6970713088146113, + "grad_norm": 0.37548601627349854, + "learning_rate": 1.2720811794740723e-05, + "loss": 0.1014, + "step": 39082 + }, + { + "epoch": 0.697089144936325, + "grad_norm": 0.3185540437698364, + "learning_rate": 1.2719455999058349e-05, + "loss": 0.1172, + "step": 39083 + }, + { + "epoch": 0.6971069810580387, + "grad_norm": 0.23268212378025055, + "learning_rate": 1.2718100250979619e-05, + "loss": 0.1018, + "step": 39084 + }, + { + "epoch": 0.6971248171797524, + "grad_norm": 0.33350467681884766, + "learning_rate": 1.271674455050977e-05, + "loss": 0.1951, + "step": 39085 + }, + { + "epoch": 0.6971426533014661, + "grad_norm": 0.2379380464553833, + "learning_rate": 1.2715388897654058e-05, + "loss": 0.1579, + "step": 39086 + }, + { + "epoch": 0.6971604894231799, + "grad_norm": 0.22094188630580902, + "learning_rate": 1.2714033292417743e-05, + "loss": 0.0904, + "step": 39087 + }, + { + "epoch": 0.6971783255448936, + "grad_norm": 0.1776876002550125, + "learning_rate": 1.2712677734806082e-05, + "loss": 0.0871, + "step": 39088 + }, + { + "epoch": 0.6971961616666073, + "grad_norm": 0.2052459567785263, + "learning_rate": 1.271132222482433e-05, + "loss": 0.0219, + "step": 39089 + }, + { + "epoch": 0.6972139977883209, + "grad_norm": 0.3054584264755249, + "learning_rate": 1.2709966762477731e-05, + "loss": 0.1223, + "step": 39090 + }, + { + "epoch": 0.6972318339100346, + "grad_norm": 0.25725704431533813, + "learning_rate": 1.270861134777154e-05, + "loss": 0.1078, + "step": 39091 + }, + { + "epoch": 0.6972496700317483, + "grad_norm": 0.28233176469802856, + "learning_rate": 1.2707255980711024e-05, + "loss": 0.0899, + "step": 39092 + }, + { + "epoch": 0.697267506153462, + "grad_norm": 0.2727392911911011, + "learning_rate": 1.2705900661301434e-05, + "loss": 0.1366, + "step": 39093 + }, + { + "epoch": 0.6972853422751757, + "grad_norm": 0.28299516439437866, + "learning_rate": 1.2704545389548006e-05, + "loss": 0.1351, + "step": 39094 + }, + { + "epoch": 0.6973031783968894, + "grad_norm": 0.24756821990013123, + "learning_rate": 1.2703190165456016e-05, + "loss": 0.1183, + "step": 39095 + }, + { + "epoch": 0.6973210145186031, + "grad_norm": 0.2845204174518585, + "learning_rate": 1.27018349890307e-05, + "loss": 0.1456, + "step": 39096 + }, + { + "epoch": 0.6973388506403168, + "grad_norm": 0.2900078594684601, + "learning_rate": 1.2700479860277328e-05, + "loss": 0.1525, + "step": 39097 + }, + { + "epoch": 0.6973566867620304, + "grad_norm": 0.2275385856628418, + "learning_rate": 1.2699124779201144e-05, + "loss": 0.1277, + "step": 39098 + }, + { + "epoch": 0.6973745228837441, + "grad_norm": 0.25815024971961975, + "learning_rate": 1.2697769745807391e-05, + "loss": 0.118, + "step": 39099 + }, + { + "epoch": 0.6973923590054578, + "grad_norm": 0.23723527789115906, + "learning_rate": 1.2696414760101344e-05, + "loss": 0.1031, + "step": 39100 + }, + { + "epoch": 0.6974101951271715, + "grad_norm": 0.24545933306217194, + "learning_rate": 1.269505982208824e-05, + "loss": 0.1293, + "step": 39101 + }, + { + "epoch": 0.6974280312488852, + "grad_norm": 0.39338332414627075, + "learning_rate": 1.2693704931773336e-05, + "loss": 0.1102, + "step": 39102 + }, + { + "epoch": 0.6974458673705989, + "grad_norm": 0.2605094313621521, + "learning_rate": 1.269235008916188e-05, + "loss": 0.1322, + "step": 39103 + }, + { + "epoch": 0.6974637034923127, + "grad_norm": 0.28584545850753784, + "learning_rate": 1.2690995294259118e-05, + "loss": 0.1563, + "step": 39104 + }, + { + "epoch": 0.6974815396140264, + "grad_norm": 0.2810380160808563, + "learning_rate": 1.268964054707032e-05, + "loss": 0.0751, + "step": 39105 + }, + { + "epoch": 0.6974993757357401, + "grad_norm": 0.22144712507724762, + "learning_rate": 1.2688285847600726e-05, + "loss": 0.135, + "step": 39106 + }, + { + "epoch": 0.6975172118574537, + "grad_norm": 0.2636159360408783, + "learning_rate": 1.2686931195855587e-05, + "loss": 0.1024, + "step": 39107 + }, + { + "epoch": 0.6975350479791674, + "grad_norm": 0.20967800915241241, + "learning_rate": 1.2685576591840149e-05, + "loss": 0.0919, + "step": 39108 + }, + { + "epoch": 0.6975528841008811, + "grad_norm": 0.1997174620628357, + "learning_rate": 1.2684222035559676e-05, + "loss": 0.0903, + "step": 39109 + }, + { + "epoch": 0.6975707202225948, + "grad_norm": 0.35322344303131104, + "learning_rate": 1.2682867527019404e-05, + "loss": 0.1153, + "step": 39110 + }, + { + "epoch": 0.6975885563443085, + "grad_norm": 0.2915917634963989, + "learning_rate": 1.2681513066224598e-05, + "loss": 0.1418, + "step": 39111 + }, + { + "epoch": 0.6976063924660222, + "grad_norm": 0.3704468011856079, + "learning_rate": 1.2680158653180497e-05, + "loss": 0.1629, + "step": 39112 + }, + { + "epoch": 0.6976242285877359, + "grad_norm": 0.28715503215789795, + "learning_rate": 1.2678804287892366e-05, + "loss": 0.1047, + "step": 39113 + }, + { + "epoch": 0.6976420647094496, + "grad_norm": 0.2410965859889984, + "learning_rate": 1.2677449970365441e-05, + "loss": 0.1601, + "step": 39114 + }, + { + "epoch": 0.6976599008311632, + "grad_norm": 0.3165033459663391, + "learning_rate": 1.2676095700604978e-05, + "loss": 0.1081, + "step": 39115 + }, + { + "epoch": 0.6976777369528769, + "grad_norm": 0.21846450865268707, + "learning_rate": 1.2674741478616226e-05, + "loss": 0.1206, + "step": 39116 + }, + { + "epoch": 0.6976955730745906, + "grad_norm": 0.25390514731407166, + "learning_rate": 1.2673387304404421e-05, + "loss": 0.1171, + "step": 39117 + }, + { + "epoch": 0.6977134091963043, + "grad_norm": 0.2805967330932617, + "learning_rate": 1.2672033177974834e-05, + "loss": 0.1486, + "step": 39118 + }, + { + "epoch": 0.697731245318018, + "grad_norm": 0.25417935848236084, + "learning_rate": 1.2670679099332707e-05, + "loss": 0.0773, + "step": 39119 + }, + { + "epoch": 0.6977490814397318, + "grad_norm": 0.23024597764015198, + "learning_rate": 1.2669325068483284e-05, + "loss": 0.1247, + "step": 39120 + }, + { + "epoch": 0.6977669175614455, + "grad_norm": 0.25610220432281494, + "learning_rate": 1.266797108543181e-05, + "loss": 0.1756, + "step": 39121 + }, + { + "epoch": 0.6977847536831592, + "grad_norm": 0.27678313851356506, + "learning_rate": 1.2666617150183546e-05, + "loss": 0.0853, + "step": 39122 + }, + { + "epoch": 0.6978025898048729, + "grad_norm": 0.23821666836738586, + "learning_rate": 1.2665263262743724e-05, + "loss": 0.1092, + "step": 39123 + }, + { + "epoch": 0.6978204259265866, + "grad_norm": 0.20323744416236877, + "learning_rate": 1.266390942311761e-05, + "loss": 0.1208, + "step": 39124 + }, + { + "epoch": 0.6978382620483002, + "grad_norm": 0.23441927134990692, + "learning_rate": 1.2662555631310435e-05, + "loss": 0.1169, + "step": 39125 + }, + { + "epoch": 0.6978560981700139, + "grad_norm": 0.23083360493183136, + "learning_rate": 1.2661201887327468e-05, + "loss": 0.107, + "step": 39126 + }, + { + "epoch": 0.6978739342917276, + "grad_norm": 0.27246028184890747, + "learning_rate": 1.2659848191173942e-05, + "loss": 0.1234, + "step": 39127 + }, + { + "epoch": 0.6978917704134413, + "grad_norm": 0.3090742826461792, + "learning_rate": 1.265849454285511e-05, + "loss": 0.0892, + "step": 39128 + }, + { + "epoch": 0.697909606535155, + "grad_norm": 0.25338998436927795, + "learning_rate": 1.2657140942376211e-05, + "loss": 0.1229, + "step": 39129 + }, + { + "epoch": 0.6979274426568687, + "grad_norm": 0.28995487093925476, + "learning_rate": 1.2655787389742488e-05, + "loss": 0.134, + "step": 39130 + }, + { + "epoch": 0.6979452787785824, + "grad_norm": 0.34188976883888245, + "learning_rate": 1.2654433884959207e-05, + "loss": 0.0916, + "step": 39131 + }, + { + "epoch": 0.697963114900296, + "grad_norm": 0.30309998989105225, + "learning_rate": 1.2653080428031604e-05, + "loss": 0.0736, + "step": 39132 + }, + { + "epoch": 0.6979809510220097, + "grad_norm": 0.2233080267906189, + "learning_rate": 1.2651727018964925e-05, + "loss": 0.0697, + "step": 39133 + }, + { + "epoch": 0.6979987871437234, + "grad_norm": 0.2173866331577301, + "learning_rate": 1.2650373657764409e-05, + "loss": 0.1052, + "step": 39134 + }, + { + "epoch": 0.6980166232654371, + "grad_norm": 0.23665744066238403, + "learning_rate": 1.264902034443532e-05, + "loss": 0.1151, + "step": 39135 + }, + { + "epoch": 0.6980344593871508, + "grad_norm": 0.2823314964771271, + "learning_rate": 1.2647667078982894e-05, + "loss": 0.1351, + "step": 39136 + }, + { + "epoch": 0.6980522955088646, + "grad_norm": 0.2626916766166687, + "learning_rate": 1.2646313861412368e-05, + "loss": 0.1009, + "step": 39137 + }, + { + "epoch": 0.6980701316305783, + "grad_norm": 0.23058198392391205, + "learning_rate": 1.2644960691728991e-05, + "loss": 0.1272, + "step": 39138 + }, + { + "epoch": 0.698087967752292, + "grad_norm": 0.25708097219467163, + "learning_rate": 1.2643607569938029e-05, + "loss": 0.0992, + "step": 39139 + }, + { + "epoch": 0.6981058038740057, + "grad_norm": 0.40056949853897095, + "learning_rate": 1.2642254496044708e-05, + "loss": 0.0999, + "step": 39140 + }, + { + "epoch": 0.6981236399957194, + "grad_norm": 0.2038673758506775, + "learning_rate": 1.2640901470054279e-05, + "loss": 0.0929, + "step": 39141 + }, + { + "epoch": 0.698141476117433, + "grad_norm": 0.3673347532749176, + "learning_rate": 1.2639548491971987e-05, + "loss": 0.1172, + "step": 39142 + }, + { + "epoch": 0.6981593122391467, + "grad_norm": 0.4534304440021515, + "learning_rate": 1.2638195561803062e-05, + "loss": 0.2537, + "step": 39143 + }, + { + "epoch": 0.6981771483608604, + "grad_norm": 0.2370782345533371, + "learning_rate": 1.2636842679552769e-05, + "loss": 0.1275, + "step": 39144 + }, + { + "epoch": 0.6981949844825741, + "grad_norm": 0.3435683846473694, + "learning_rate": 1.2635489845226344e-05, + "loss": 0.1769, + "step": 39145 + }, + { + "epoch": 0.6982128206042878, + "grad_norm": 0.27369722723960876, + "learning_rate": 1.2634137058829031e-05, + "loss": 0.1079, + "step": 39146 + }, + { + "epoch": 0.6982306567260015, + "grad_norm": 0.27731940150260925, + "learning_rate": 1.2632784320366065e-05, + "loss": 0.1104, + "step": 39147 + }, + { + "epoch": 0.6982484928477152, + "grad_norm": 0.2214890867471695, + "learning_rate": 1.263143162984271e-05, + "loss": 0.1021, + "step": 39148 + }, + { + "epoch": 0.6982663289694289, + "grad_norm": 0.30670398473739624, + "learning_rate": 1.2630078987264196e-05, + "loss": 0.1756, + "step": 39149 + }, + { + "epoch": 0.6982841650911426, + "grad_norm": 0.29138460755348206, + "learning_rate": 1.2628726392635759e-05, + "loss": 0.1317, + "step": 39150 + }, + { + "epoch": 0.6983020012128562, + "grad_norm": 0.2378193736076355, + "learning_rate": 1.2627373845962658e-05, + "loss": 0.1328, + "step": 39151 + }, + { + "epoch": 0.6983198373345699, + "grad_norm": 0.17932955920696259, + "learning_rate": 1.2626021347250125e-05, + "loss": 0.1007, + "step": 39152 + }, + { + "epoch": 0.6983376734562836, + "grad_norm": 0.29061880707740784, + "learning_rate": 1.2624668896503414e-05, + "loss": 0.1483, + "step": 39153 + }, + { + "epoch": 0.6983555095779974, + "grad_norm": 0.31485599279403687, + "learning_rate": 1.262331649372776e-05, + "loss": 0.1508, + "step": 39154 + }, + { + "epoch": 0.6983733456997111, + "grad_norm": 0.22489584982395172, + "learning_rate": 1.2621964138928408e-05, + "loss": 0.1051, + "step": 39155 + }, + { + "epoch": 0.6983911818214248, + "grad_norm": 0.2558549642562866, + "learning_rate": 1.2620611832110587e-05, + "loss": 0.1241, + "step": 39156 + }, + { + "epoch": 0.6984090179431385, + "grad_norm": 0.25634926557540894, + "learning_rate": 1.261925957327956e-05, + "loss": 0.1136, + "step": 39157 + }, + { + "epoch": 0.6984268540648522, + "grad_norm": 0.29607006907463074, + "learning_rate": 1.2617907362440562e-05, + "loss": 0.127, + "step": 39158 + }, + { + "epoch": 0.6984446901865659, + "grad_norm": 0.2850937843322754, + "learning_rate": 1.2616555199598829e-05, + "loss": 0.1123, + "step": 39159 + }, + { + "epoch": 0.6984625263082795, + "grad_norm": 0.30544018745422363, + "learning_rate": 1.2615203084759594e-05, + "loss": 0.1169, + "step": 39160 + }, + { + "epoch": 0.6984803624299932, + "grad_norm": 0.3253500759601593, + "learning_rate": 1.2613851017928119e-05, + "loss": 0.1083, + "step": 39161 + }, + { + "epoch": 0.6984981985517069, + "grad_norm": 0.4971618056297302, + "learning_rate": 1.261249899910964e-05, + "loss": 0.1339, + "step": 39162 + }, + { + "epoch": 0.6985160346734206, + "grad_norm": 0.4263477623462677, + "learning_rate": 1.2611147028309388e-05, + "loss": 0.099, + "step": 39163 + }, + { + "epoch": 0.6985338707951343, + "grad_norm": 0.27318835258483887, + "learning_rate": 1.2609795105532604e-05, + "loss": 0.1003, + "step": 39164 + }, + { + "epoch": 0.698551706916848, + "grad_norm": 0.27858197689056396, + "learning_rate": 1.2608443230784539e-05, + "loss": 0.1171, + "step": 39165 + }, + { + "epoch": 0.6985695430385617, + "grad_norm": 0.247537761926651, + "learning_rate": 1.2607091404070424e-05, + "loss": 0.0919, + "step": 39166 + }, + { + "epoch": 0.6985873791602754, + "grad_norm": 0.22903065383434296, + "learning_rate": 1.2605739625395508e-05, + "loss": 0.0576, + "step": 39167 + }, + { + "epoch": 0.698605215281989, + "grad_norm": 0.29690563678741455, + "learning_rate": 1.2604387894765026e-05, + "loss": 0.1427, + "step": 39168 + }, + { + "epoch": 0.6986230514037027, + "grad_norm": 0.2974201440811157, + "learning_rate": 1.2603036212184211e-05, + "loss": 0.1356, + "step": 39169 + }, + { + "epoch": 0.6986408875254164, + "grad_norm": 0.21565239131450653, + "learning_rate": 1.2601684577658318e-05, + "loss": 0.1257, + "step": 39170 + }, + { + "epoch": 0.6986587236471302, + "grad_norm": 0.25304117798805237, + "learning_rate": 1.2600332991192576e-05, + "loss": 0.1522, + "step": 39171 + }, + { + "epoch": 0.6986765597688439, + "grad_norm": 0.1776355504989624, + "learning_rate": 1.259898145279223e-05, + "loss": 0.1016, + "step": 39172 + }, + { + "epoch": 0.6986943958905576, + "grad_norm": 0.19958031177520752, + "learning_rate": 1.2597629962462502e-05, + "loss": 0.1285, + "step": 39173 + }, + { + "epoch": 0.6987122320122713, + "grad_norm": 0.27183884382247925, + "learning_rate": 1.2596278520208654e-05, + "loss": 0.1274, + "step": 39174 + }, + { + "epoch": 0.698730068133985, + "grad_norm": 0.2655390202999115, + "learning_rate": 1.2594927126035917e-05, + "loss": 0.1208, + "step": 39175 + }, + { + "epoch": 0.6987479042556987, + "grad_norm": 0.2636301517486572, + "learning_rate": 1.2593575779949524e-05, + "loss": 0.1032, + "step": 39176 + }, + { + "epoch": 0.6987657403774123, + "grad_norm": 0.25221994519233704, + "learning_rate": 1.2592224481954707e-05, + "loss": 0.135, + "step": 39177 + }, + { + "epoch": 0.698783576499126, + "grad_norm": 0.18371526896953583, + "learning_rate": 1.2590873232056724e-05, + "loss": 0.1019, + "step": 39178 + }, + { + "epoch": 0.6988014126208397, + "grad_norm": 0.27536481618881226, + "learning_rate": 1.2589522030260791e-05, + "loss": 0.0696, + "step": 39179 + }, + { + "epoch": 0.6988192487425534, + "grad_norm": 0.2518484890460968, + "learning_rate": 1.258817087657217e-05, + "loss": 0.1101, + "step": 39180 + }, + { + "epoch": 0.6988370848642671, + "grad_norm": 0.29254433512687683, + "learning_rate": 1.2586819770996083e-05, + "loss": 0.114, + "step": 39181 + }, + { + "epoch": 0.6988549209859808, + "grad_norm": 0.29313719272613525, + "learning_rate": 1.2585468713537762e-05, + "loss": 0.1204, + "step": 39182 + }, + { + "epoch": 0.6988727571076945, + "grad_norm": 0.2611317038536072, + "learning_rate": 1.2584117704202459e-05, + "loss": 0.1307, + "step": 39183 + }, + { + "epoch": 0.6988905932294082, + "grad_norm": 0.28403565287590027, + "learning_rate": 1.2582766742995405e-05, + "loss": 0.1517, + "step": 39184 + }, + { + "epoch": 0.6989084293511219, + "grad_norm": 0.3277135193347931, + "learning_rate": 1.2581415829921839e-05, + "loss": 0.102, + "step": 39185 + }, + { + "epoch": 0.6989262654728355, + "grad_norm": 0.28685420751571655, + "learning_rate": 1.2580064964986981e-05, + "loss": 0.1076, + "step": 39186 + }, + { + "epoch": 0.6989441015945492, + "grad_norm": 0.27156829833984375, + "learning_rate": 1.2578714148196092e-05, + "loss": 0.1213, + "step": 39187 + }, + { + "epoch": 0.698961937716263, + "grad_norm": 0.26629018783569336, + "learning_rate": 1.2577363379554396e-05, + "loss": 0.151, + "step": 39188 + }, + { + "epoch": 0.6989797738379767, + "grad_norm": 0.24965910613536835, + "learning_rate": 1.2576012659067133e-05, + "loss": 0.1081, + "step": 39189 + }, + { + "epoch": 0.6989976099596904, + "grad_norm": 0.26446905732154846, + "learning_rate": 1.2574661986739528e-05, + "loss": 0.1183, + "step": 39190 + }, + { + "epoch": 0.6990154460814041, + "grad_norm": 0.26350221037864685, + "learning_rate": 1.2573311362576828e-05, + "loss": 0.1316, + "step": 39191 + }, + { + "epoch": 0.6990332822031178, + "grad_norm": 0.28374266624450684, + "learning_rate": 1.257196078658427e-05, + "loss": 0.1443, + "step": 39192 + }, + { + "epoch": 0.6990511183248315, + "grad_norm": 0.19795873761177063, + "learning_rate": 1.2570610258767073e-05, + "loss": 0.1089, + "step": 39193 + }, + { + "epoch": 0.6990689544465452, + "grad_norm": 0.20379579067230225, + "learning_rate": 1.2569259779130493e-05, + "loss": 0.0769, + "step": 39194 + }, + { + "epoch": 0.6990867905682588, + "grad_norm": 0.25196146965026855, + "learning_rate": 1.2567909347679745e-05, + "loss": 0.1179, + "step": 39195 + }, + { + "epoch": 0.6991046266899725, + "grad_norm": 0.23913459479808807, + "learning_rate": 1.2566558964420088e-05, + "loss": 0.1201, + "step": 39196 + }, + { + "epoch": 0.6991224628116862, + "grad_norm": 0.2760443687438965, + "learning_rate": 1.256520862935674e-05, + "loss": 0.1376, + "step": 39197 + }, + { + "epoch": 0.6991402989333999, + "grad_norm": 0.22437980771064758, + "learning_rate": 1.2563858342494938e-05, + "loss": 0.0982, + "step": 39198 + }, + { + "epoch": 0.6991581350551136, + "grad_norm": 0.3428071141242981, + "learning_rate": 1.2562508103839908e-05, + "loss": 0.2368, + "step": 39199 + }, + { + "epoch": 0.6991759711768273, + "grad_norm": 0.2954291105270386, + "learning_rate": 1.2561157913396898e-05, + "loss": 0.1451, + "step": 39200 + }, + { + "epoch": 0.699193807298541, + "grad_norm": 0.29232439398765564, + "learning_rate": 1.2559807771171139e-05, + "loss": 0.1573, + "step": 39201 + }, + { + "epoch": 0.6992116434202547, + "grad_norm": 0.26184868812561035, + "learning_rate": 1.2558457677167865e-05, + "loss": 0.1501, + "step": 39202 + }, + { + "epoch": 0.6992294795419683, + "grad_norm": 0.23924040794372559, + "learning_rate": 1.2557107631392292e-05, + "loss": 0.1196, + "step": 39203 + }, + { + "epoch": 0.699247315663682, + "grad_norm": 0.3905337452888489, + "learning_rate": 1.255575763384968e-05, + "loss": 0.1632, + "step": 39204 + }, + { + "epoch": 0.6992651517853958, + "grad_norm": 0.31102848052978516, + "learning_rate": 1.255440768454525e-05, + "loss": 0.1394, + "step": 39205 + }, + { + "epoch": 0.6992829879071095, + "grad_norm": 0.2870670557022095, + "learning_rate": 1.255305778348422e-05, + "loss": 0.078, + "step": 39206 + }, + { + "epoch": 0.6993008240288232, + "grad_norm": 0.20866712927818298, + "learning_rate": 1.2551707930671852e-05, + "loss": 0.0957, + "step": 39207 + }, + { + "epoch": 0.6993186601505369, + "grad_norm": 0.2362719178199768, + "learning_rate": 1.255035812611335e-05, + "loss": 0.1078, + "step": 39208 + }, + { + "epoch": 0.6993364962722506, + "grad_norm": 0.24133490025997162, + "learning_rate": 1.254900836981397e-05, + "loss": 0.1107, + "step": 39209 + }, + { + "epoch": 0.6993543323939643, + "grad_norm": 0.3059801757335663, + "learning_rate": 1.2547658661778937e-05, + "loss": 0.1035, + "step": 39210 + }, + { + "epoch": 0.699372168515678, + "grad_norm": 0.22538863122463226, + "learning_rate": 1.2546309002013479e-05, + "loss": 0.0685, + "step": 39211 + }, + { + "epoch": 0.6993900046373916, + "grad_norm": 0.2707521319389343, + "learning_rate": 1.2544959390522815e-05, + "loss": 0.1392, + "step": 39212 + }, + { + "epoch": 0.6994078407591053, + "grad_norm": 0.21140368282794952, + "learning_rate": 1.2543609827312203e-05, + "loss": 0.0977, + "step": 39213 + }, + { + "epoch": 0.699425676880819, + "grad_norm": 0.26786288619041443, + "learning_rate": 1.2542260312386861e-05, + "loss": 0.1379, + "step": 39214 + }, + { + "epoch": 0.6994435130025327, + "grad_norm": 0.2541566491127014, + "learning_rate": 1.2540910845752025e-05, + "loss": 0.1169, + "step": 39215 + }, + { + "epoch": 0.6994613491242464, + "grad_norm": 0.2691229581832886, + "learning_rate": 1.2539561427412904e-05, + "loss": 0.1108, + "step": 39216 + }, + { + "epoch": 0.6994791852459601, + "grad_norm": 0.298031747341156, + "learning_rate": 1.2538212057374765e-05, + "loss": 0.1111, + "step": 39217 + }, + { + "epoch": 0.6994970213676738, + "grad_norm": 0.21651975810527802, + "learning_rate": 1.2536862735642812e-05, + "loss": 0.1161, + "step": 39218 + }, + { + "epoch": 0.6995148574893875, + "grad_norm": 0.25400853157043457, + "learning_rate": 1.2535513462222289e-05, + "loss": 0.1181, + "step": 39219 + }, + { + "epoch": 0.6995326936111012, + "grad_norm": 0.3091680407524109, + "learning_rate": 1.2534164237118413e-05, + "loss": 0.1871, + "step": 39220 + }, + { + "epoch": 0.699550529732815, + "grad_norm": 0.2525642216205597, + "learning_rate": 1.2532815060336417e-05, + "loss": 0.1367, + "step": 39221 + }, + { + "epoch": 0.6995683658545286, + "grad_norm": 0.25376302003860474, + "learning_rate": 1.2531465931881548e-05, + "loss": 0.1557, + "step": 39222 + }, + { + "epoch": 0.6995862019762423, + "grad_norm": 0.22580528259277344, + "learning_rate": 1.2530116851759021e-05, + "loss": 0.1387, + "step": 39223 + }, + { + "epoch": 0.699604038097956, + "grad_norm": 0.22035124897956848, + "learning_rate": 1.2528767819974072e-05, + "loss": 0.1301, + "step": 39224 + }, + { + "epoch": 0.6996218742196697, + "grad_norm": 0.35134023427963257, + "learning_rate": 1.2527418836531913e-05, + "loss": 0.161, + "step": 39225 + }, + { + "epoch": 0.6996397103413834, + "grad_norm": 0.2618481516838074, + "learning_rate": 1.2526069901437798e-05, + "loss": 0.095, + "step": 39226 + }, + { + "epoch": 0.6996575464630971, + "grad_norm": 0.211517795920372, + "learning_rate": 1.2524721014696945e-05, + "loss": 0.0997, + "step": 39227 + }, + { + "epoch": 0.6996753825848108, + "grad_norm": 0.24371327459812164, + "learning_rate": 1.252337217631458e-05, + "loss": 0.1307, + "step": 39228 + }, + { + "epoch": 0.6996932187065245, + "grad_norm": 0.19230906665325165, + "learning_rate": 1.2522023386295928e-05, + "loss": 0.1067, + "step": 39229 + }, + { + "epoch": 0.6997110548282381, + "grad_norm": 0.2598012685775757, + "learning_rate": 1.252067464464623e-05, + "loss": 0.1581, + "step": 39230 + }, + { + "epoch": 0.6997288909499518, + "grad_norm": 0.25412869453430176, + "learning_rate": 1.2519325951370709e-05, + "loss": 0.1246, + "step": 39231 + }, + { + "epoch": 0.6997467270716655, + "grad_norm": 0.40220728516578674, + "learning_rate": 1.2517977306474587e-05, + "loss": 0.178, + "step": 39232 + }, + { + "epoch": 0.6997645631933792, + "grad_norm": 0.18672268092632294, + "learning_rate": 1.251662870996309e-05, + "loss": 0.0781, + "step": 39233 + }, + { + "epoch": 0.6997823993150929, + "grad_norm": 0.30113136768341064, + "learning_rate": 1.2515280161841461e-05, + "loss": 0.1077, + "step": 39234 + }, + { + "epoch": 0.6998002354368066, + "grad_norm": 0.2578449547290802, + "learning_rate": 1.2513931662114908e-05, + "loss": 0.0871, + "step": 39235 + }, + { + "epoch": 0.6998180715585203, + "grad_norm": 0.22928424179553986, + "learning_rate": 1.2512583210788678e-05, + "loss": 0.1166, + "step": 39236 + }, + { + "epoch": 0.699835907680234, + "grad_norm": 0.33365604281425476, + "learning_rate": 1.2511234807867988e-05, + "loss": 0.1636, + "step": 39237 + }, + { + "epoch": 0.6998537438019478, + "grad_norm": 0.19825081527233124, + "learning_rate": 1.2509886453358058e-05, + "loss": 0.1199, + "step": 39238 + }, + { + "epoch": 0.6998715799236614, + "grad_norm": 0.30040496587753296, + "learning_rate": 1.250853814726413e-05, + "loss": 0.1744, + "step": 39239 + }, + { + "epoch": 0.6998894160453751, + "grad_norm": 0.2596457004547119, + "learning_rate": 1.2507189889591422e-05, + "loss": 0.0891, + "step": 39240 + }, + { + "epoch": 0.6999072521670888, + "grad_norm": 0.3632284998893738, + "learning_rate": 1.2505841680345163e-05, + "loss": 0.1023, + "step": 39241 + }, + { + "epoch": 0.6999250882888025, + "grad_norm": 0.3388392925262451, + "learning_rate": 1.2504493519530563e-05, + "loss": 0.1145, + "step": 39242 + }, + { + "epoch": 0.6999429244105162, + "grad_norm": 0.21876616775989532, + "learning_rate": 1.2503145407152878e-05, + "loss": 0.0655, + "step": 39243 + }, + { + "epoch": 0.6999607605322299, + "grad_norm": 0.22531850636005402, + "learning_rate": 1.2501797343217314e-05, + "loss": 0.0841, + "step": 39244 + }, + { + "epoch": 0.6999785966539436, + "grad_norm": 0.24790534377098083, + "learning_rate": 1.25004493277291e-05, + "loss": 0.0704, + "step": 39245 + }, + { + "epoch": 0.6999964327756573, + "grad_norm": 0.2611728608608246, + "learning_rate": 1.2499101360693461e-05, + "loss": 0.1389, + "step": 39246 + }, + { + "epoch": 0.700014268897371, + "grad_norm": 0.26006975769996643, + "learning_rate": 1.2497753442115615e-05, + "loss": 0.1344, + "step": 39247 + }, + { + "epoch": 0.7000321050190846, + "grad_norm": 0.23054149746894836, + "learning_rate": 1.2496405572000793e-05, + "loss": 0.0992, + "step": 39248 + }, + { + "epoch": 0.7000499411407983, + "grad_norm": 0.24358099699020386, + "learning_rate": 1.2495057750354234e-05, + "loss": 0.136, + "step": 39249 + }, + { + "epoch": 0.700067777262512, + "grad_norm": 0.3306012749671936, + "learning_rate": 1.2493709977181149e-05, + "loss": 0.0956, + "step": 39250 + }, + { + "epoch": 0.7000856133842257, + "grad_norm": 0.27693748474121094, + "learning_rate": 1.2492362252486752e-05, + "loss": 0.1181, + "step": 39251 + }, + { + "epoch": 0.7001034495059394, + "grad_norm": 0.24711725115776062, + "learning_rate": 1.2491014576276288e-05, + "loss": 0.0543, + "step": 39252 + }, + { + "epoch": 0.7001212856276531, + "grad_norm": 0.3434850871562958, + "learning_rate": 1.2489666948554976e-05, + "loss": 0.1315, + "step": 39253 + }, + { + "epoch": 0.7001391217493668, + "grad_norm": 0.3840778172016144, + "learning_rate": 1.2488319369328033e-05, + "loss": 0.0924, + "step": 39254 + }, + { + "epoch": 0.7001569578710806, + "grad_norm": 0.37275633215904236, + "learning_rate": 1.2486971838600678e-05, + "loss": 0.208, + "step": 39255 + }, + { + "epoch": 0.7001747939927943, + "grad_norm": 0.24376462399959564, + "learning_rate": 1.248562435637815e-05, + "loss": 0.0912, + "step": 39256 + }, + { + "epoch": 0.7001926301145079, + "grad_norm": 0.24473705887794495, + "learning_rate": 1.2484276922665664e-05, + "loss": 0.1066, + "step": 39257 + }, + { + "epoch": 0.7002104662362216, + "grad_norm": 0.329258531332016, + "learning_rate": 1.2482929537468444e-05, + "loss": 0.1293, + "step": 39258 + }, + { + "epoch": 0.7002283023579353, + "grad_norm": 0.23452426493167877, + "learning_rate": 1.2481582200791713e-05, + "loss": 0.1427, + "step": 39259 + }, + { + "epoch": 0.700246138479649, + "grad_norm": 0.3388354182243347, + "learning_rate": 1.2480234912640685e-05, + "loss": 0.1479, + "step": 39260 + }, + { + "epoch": 0.7002639746013627, + "grad_norm": 0.2526503801345825, + "learning_rate": 1.2478887673020595e-05, + "loss": 0.1164, + "step": 39261 + }, + { + "epoch": 0.7002818107230764, + "grad_norm": 0.28935933113098145, + "learning_rate": 1.2477540481936656e-05, + "loss": 0.1432, + "step": 39262 + }, + { + "epoch": 0.7002996468447901, + "grad_norm": 0.24720443785190582, + "learning_rate": 1.2476193339394105e-05, + "loss": 0.1397, + "step": 39263 + }, + { + "epoch": 0.7003174829665038, + "grad_norm": 0.30163803696632385, + "learning_rate": 1.2474846245398142e-05, + "loss": 0.1022, + "step": 39264 + }, + { + "epoch": 0.7003353190882174, + "grad_norm": 0.2327345609664917, + "learning_rate": 1.2473499199954012e-05, + "loss": 0.0975, + "step": 39265 + }, + { + "epoch": 0.7003531552099311, + "grad_norm": 0.2702726125717163, + "learning_rate": 1.2472152203066926e-05, + "loss": 0.145, + "step": 39266 + }, + { + "epoch": 0.7003709913316448, + "grad_norm": 0.3468043804168701, + "learning_rate": 1.2470805254742104e-05, + "loss": 0.1001, + "step": 39267 + }, + { + "epoch": 0.7003888274533585, + "grad_norm": 0.25190499424934387, + "learning_rate": 1.2469458354984759e-05, + "loss": 0.1523, + "step": 39268 + }, + { + "epoch": 0.7004066635750722, + "grad_norm": 0.22685116529464722, + "learning_rate": 1.2468111503800132e-05, + "loss": 0.0873, + "step": 39269 + }, + { + "epoch": 0.7004244996967859, + "grad_norm": 0.32708802819252014, + "learning_rate": 1.246676470119343e-05, + "loss": 0.0693, + "step": 39270 + }, + { + "epoch": 0.7004423358184996, + "grad_norm": 0.2807897627353668, + "learning_rate": 1.246541794716988e-05, + "loss": 0.0866, + "step": 39271 + }, + { + "epoch": 0.7004601719402134, + "grad_norm": 0.23111185431480408, + "learning_rate": 1.2464071241734698e-05, + "loss": 0.0917, + "step": 39272 + }, + { + "epoch": 0.7004780080619271, + "grad_norm": 0.3054521083831787, + "learning_rate": 1.2462724584893095e-05, + "loss": 0.1099, + "step": 39273 + }, + { + "epoch": 0.7004958441836407, + "grad_norm": 0.3203429877758026, + "learning_rate": 1.2461377976650316e-05, + "loss": 0.159, + "step": 39274 + }, + { + "epoch": 0.7005136803053544, + "grad_norm": 0.319185733795166, + "learning_rate": 1.2460031417011553e-05, + "loss": 0.097, + "step": 39275 + }, + { + "epoch": 0.7005315164270681, + "grad_norm": 0.2993345260620117, + "learning_rate": 1.2458684905982049e-05, + "loss": 0.0811, + "step": 39276 + }, + { + "epoch": 0.7005493525487818, + "grad_norm": 0.2945885956287384, + "learning_rate": 1.2457338443567005e-05, + "loss": 0.1221, + "step": 39277 + }, + { + "epoch": 0.7005671886704955, + "grad_norm": 0.22599923610687256, + "learning_rate": 1.245599202977166e-05, + "loss": 0.1122, + "step": 39278 + }, + { + "epoch": 0.7005850247922092, + "grad_norm": 0.2429341822862625, + "learning_rate": 1.245464566460122e-05, + "loss": 0.1281, + "step": 39279 + }, + { + "epoch": 0.7006028609139229, + "grad_norm": 0.282914936542511, + "learning_rate": 1.2453299348060909e-05, + "loss": 0.1413, + "step": 39280 + }, + { + "epoch": 0.7006206970356366, + "grad_norm": 0.19204550981521606, + "learning_rate": 1.245195308015593e-05, + "loss": 0.1148, + "step": 39281 + }, + { + "epoch": 0.7006385331573503, + "grad_norm": 0.2714342176914215, + "learning_rate": 1.2450606860891528e-05, + "loss": 0.1295, + "step": 39282 + }, + { + "epoch": 0.7006563692790639, + "grad_norm": 0.30962443351745605, + "learning_rate": 1.2449260690272907e-05, + "loss": 0.1417, + "step": 39283 + }, + { + "epoch": 0.7006742054007776, + "grad_norm": 0.22967122495174408, + "learning_rate": 1.2447914568305289e-05, + "loss": 0.0838, + "step": 39284 + }, + { + "epoch": 0.7006920415224913, + "grad_norm": 0.3372592628002167, + "learning_rate": 1.2446568494993885e-05, + "loss": 0.1295, + "step": 39285 + }, + { + "epoch": 0.700709877644205, + "grad_norm": 0.21744464337825775, + "learning_rate": 1.2445222470343912e-05, + "loss": 0.1222, + "step": 39286 + }, + { + "epoch": 0.7007277137659187, + "grad_norm": 0.2446707785129547, + "learning_rate": 1.2443876494360599e-05, + "loss": 0.1053, + "step": 39287 + }, + { + "epoch": 0.7007455498876324, + "grad_norm": 0.3213288486003876, + "learning_rate": 1.244253056704916e-05, + "loss": 0.1265, + "step": 39288 + }, + { + "epoch": 0.7007633860093462, + "grad_norm": 0.20614628493785858, + "learning_rate": 1.2441184688414798e-05, + "loss": 0.1004, + "step": 39289 + }, + { + "epoch": 0.7007812221310599, + "grad_norm": 0.29540812969207764, + "learning_rate": 1.2439838858462752e-05, + "loss": 0.1, + "step": 39290 + }, + { + "epoch": 0.7007990582527736, + "grad_norm": 0.31829172372817993, + "learning_rate": 1.243849307719822e-05, + "loss": 0.2475, + "step": 39291 + }, + { + "epoch": 0.7008168943744872, + "grad_norm": 0.31675219535827637, + "learning_rate": 1.2437147344626437e-05, + "loss": 0.1735, + "step": 39292 + }, + { + "epoch": 0.7008347304962009, + "grad_norm": 0.36702772974967957, + "learning_rate": 1.2435801660752611e-05, + "loss": 0.1177, + "step": 39293 + }, + { + "epoch": 0.7008525666179146, + "grad_norm": 0.21705499291419983, + "learning_rate": 1.2434456025581944e-05, + "loss": 0.1356, + "step": 39294 + }, + { + "epoch": 0.7008704027396283, + "grad_norm": 0.23931188881397247, + "learning_rate": 1.2433110439119678e-05, + "loss": 0.1212, + "step": 39295 + }, + { + "epoch": 0.700888238861342, + "grad_norm": 0.2305675595998764, + "learning_rate": 1.2431764901371015e-05, + "loss": 0.1195, + "step": 39296 + }, + { + "epoch": 0.7009060749830557, + "grad_norm": 0.3326476812362671, + "learning_rate": 1.2430419412341174e-05, + "loss": 0.077, + "step": 39297 + }, + { + "epoch": 0.7009239111047694, + "grad_norm": 0.2987017333507538, + "learning_rate": 1.2429073972035368e-05, + "loss": 0.0823, + "step": 39298 + }, + { + "epoch": 0.7009417472264831, + "grad_norm": 0.2243385910987854, + "learning_rate": 1.2427728580458803e-05, + "loss": 0.142, + "step": 39299 + }, + { + "epoch": 0.7009595833481967, + "grad_norm": 0.21158157289028168, + "learning_rate": 1.2426383237616711e-05, + "loss": 0.0718, + "step": 39300 + }, + { + "epoch": 0.7009774194699104, + "grad_norm": 0.21682368218898773, + "learning_rate": 1.2425037943514306e-05, + "loss": 0.1136, + "step": 39301 + }, + { + "epoch": 0.7009952555916241, + "grad_norm": 0.2618319094181061, + "learning_rate": 1.2423692698156786e-05, + "loss": 0.0955, + "step": 39302 + }, + { + "epoch": 0.7010130917133378, + "grad_norm": 0.21428616344928741, + "learning_rate": 1.2422347501549384e-05, + "loss": 0.1243, + "step": 39303 + }, + { + "epoch": 0.7010309278350515, + "grad_norm": 0.2124176174402237, + "learning_rate": 1.24210023536973e-05, + "loss": 0.0585, + "step": 39304 + }, + { + "epoch": 0.7010487639567652, + "grad_norm": 0.22856761515140533, + "learning_rate": 1.2419657254605762e-05, + "loss": 0.1112, + "step": 39305 + }, + { + "epoch": 0.701066600078479, + "grad_norm": 0.28618964552879333, + "learning_rate": 1.2418312204279983e-05, + "loss": 0.1455, + "step": 39306 + }, + { + "epoch": 0.7010844362001927, + "grad_norm": 0.5303351283073425, + "learning_rate": 1.2416967202725157e-05, + "loss": 0.1534, + "step": 39307 + }, + { + "epoch": 0.7011022723219064, + "grad_norm": 0.2856936752796173, + "learning_rate": 1.2415622249946523e-05, + "loss": 0.1111, + "step": 39308 + }, + { + "epoch": 0.70112010844362, + "grad_norm": 0.24779370427131653, + "learning_rate": 1.2414277345949285e-05, + "loss": 0.1341, + "step": 39309 + }, + { + "epoch": 0.7011379445653337, + "grad_norm": 0.401278018951416, + "learning_rate": 1.2412932490738655e-05, + "loss": 0.185, + "step": 39310 + }, + { + "epoch": 0.7011557806870474, + "grad_norm": 0.2862904369831085, + "learning_rate": 1.2411587684319845e-05, + "loss": 0.1352, + "step": 39311 + }, + { + "epoch": 0.7011736168087611, + "grad_norm": 0.2385338842868805, + "learning_rate": 1.241024292669806e-05, + "loss": 0.1217, + "step": 39312 + }, + { + "epoch": 0.7011914529304748, + "grad_norm": 0.24269017577171326, + "learning_rate": 1.2408898217878531e-05, + "loss": 0.1621, + "step": 39313 + }, + { + "epoch": 0.7012092890521885, + "grad_norm": 0.23597021400928497, + "learning_rate": 1.2407553557866461e-05, + "loss": 0.0994, + "step": 39314 + }, + { + "epoch": 0.7012271251739022, + "grad_norm": 0.2812851667404175, + "learning_rate": 1.2406208946667065e-05, + "loss": 0.1587, + "step": 39315 + }, + { + "epoch": 0.7012449612956159, + "grad_norm": 0.32705435156822205, + "learning_rate": 1.2404864384285538e-05, + "loss": 0.1086, + "step": 39316 + }, + { + "epoch": 0.7012627974173296, + "grad_norm": 0.25162845849990845, + "learning_rate": 1.2403519870727121e-05, + "loss": 0.0738, + "step": 39317 + }, + { + "epoch": 0.7012806335390432, + "grad_norm": 0.17822778224945068, + "learning_rate": 1.2402175405997003e-05, + "loss": 0.0828, + "step": 39318 + }, + { + "epoch": 0.7012984696607569, + "grad_norm": 0.24295833706855774, + "learning_rate": 1.2400830990100409e-05, + "loss": 0.0965, + "step": 39319 + }, + { + "epoch": 0.7013163057824706, + "grad_norm": 0.2563144862651825, + "learning_rate": 1.2399486623042539e-05, + "loss": 0.1077, + "step": 39320 + }, + { + "epoch": 0.7013341419041843, + "grad_norm": 0.2714223861694336, + "learning_rate": 1.2398142304828622e-05, + "loss": 0.0835, + "step": 39321 + }, + { + "epoch": 0.7013519780258981, + "grad_norm": 0.24917560815811157, + "learning_rate": 1.2396798035463856e-05, + "loss": 0.0939, + "step": 39322 + }, + { + "epoch": 0.7013698141476118, + "grad_norm": 0.24004727602005005, + "learning_rate": 1.2395453814953453e-05, + "loss": 0.1343, + "step": 39323 + }, + { + "epoch": 0.7013876502693255, + "grad_norm": 0.24072277545928955, + "learning_rate": 1.2394109643302618e-05, + "loss": 0.0923, + "step": 39324 + }, + { + "epoch": 0.7014054863910392, + "grad_norm": 0.2789347767829895, + "learning_rate": 1.2392765520516575e-05, + "loss": 0.1566, + "step": 39325 + }, + { + "epoch": 0.7014233225127529, + "grad_norm": 0.2381190061569214, + "learning_rate": 1.2391421446600526e-05, + "loss": 0.1195, + "step": 39326 + }, + { + "epoch": 0.7014411586344665, + "grad_norm": 0.29626700282096863, + "learning_rate": 1.2390077421559684e-05, + "loss": 0.1393, + "step": 39327 + }, + { + "epoch": 0.7014589947561802, + "grad_norm": 0.3041497766971588, + "learning_rate": 1.2388733445399259e-05, + "loss": 0.115, + "step": 39328 + }, + { + "epoch": 0.7014768308778939, + "grad_norm": 0.1709960401058197, + "learning_rate": 1.2387389518124447e-05, + "loss": 0.0938, + "step": 39329 + }, + { + "epoch": 0.7014946669996076, + "grad_norm": 0.28282129764556885, + "learning_rate": 1.2386045639740481e-05, + "loss": 0.1308, + "step": 39330 + }, + { + "epoch": 0.7015125031213213, + "grad_norm": 0.2282441407442093, + "learning_rate": 1.2384701810252547e-05, + "loss": 0.1246, + "step": 39331 + }, + { + "epoch": 0.701530339243035, + "grad_norm": 0.2657516300678253, + "learning_rate": 1.2383358029665878e-05, + "loss": 0.1293, + "step": 39332 + }, + { + "epoch": 0.7015481753647487, + "grad_norm": 0.29586759209632874, + "learning_rate": 1.2382014297985658e-05, + "loss": 0.0941, + "step": 39333 + }, + { + "epoch": 0.7015660114864624, + "grad_norm": 0.3080667555332184, + "learning_rate": 1.2380670615217122e-05, + "loss": 0.0832, + "step": 39334 + }, + { + "epoch": 0.701583847608176, + "grad_norm": 0.3187499940395355, + "learning_rate": 1.2379326981365464e-05, + "loss": 0.101, + "step": 39335 + }, + { + "epoch": 0.7016016837298897, + "grad_norm": 0.21686674654483795, + "learning_rate": 1.2377983396435891e-05, + "loss": 0.1519, + "step": 39336 + }, + { + "epoch": 0.7016195198516034, + "grad_norm": 0.3235342502593994, + "learning_rate": 1.237663986043361e-05, + "loss": 0.1125, + "step": 39337 + }, + { + "epoch": 0.7016373559733171, + "grad_norm": 0.23311492800712585, + "learning_rate": 1.2375296373363836e-05, + "loss": 0.1109, + "step": 39338 + }, + { + "epoch": 0.7016551920950309, + "grad_norm": 0.2622036635875702, + "learning_rate": 1.2373952935231778e-05, + "loss": 0.0813, + "step": 39339 + }, + { + "epoch": 0.7016730282167446, + "grad_norm": 0.29204609990119934, + "learning_rate": 1.2372609546042638e-05, + "loss": 0.1833, + "step": 39340 + }, + { + "epoch": 0.7016908643384583, + "grad_norm": 0.26663222908973694, + "learning_rate": 1.2371266205801623e-05, + "loss": 0.114, + "step": 39341 + }, + { + "epoch": 0.701708700460172, + "grad_norm": 0.38094082474708557, + "learning_rate": 1.2369922914513935e-05, + "loss": 0.1232, + "step": 39342 + }, + { + "epoch": 0.7017265365818857, + "grad_norm": 0.19940249621868134, + "learning_rate": 1.2368579672184796e-05, + "loss": 0.082, + "step": 39343 + }, + { + "epoch": 0.7017443727035994, + "grad_norm": 0.23590822517871857, + "learning_rate": 1.2367236478819408e-05, + "loss": 0.0979, + "step": 39344 + }, + { + "epoch": 0.701762208825313, + "grad_norm": 0.3120937943458557, + "learning_rate": 1.2365893334422962e-05, + "loss": 0.1377, + "step": 39345 + }, + { + "epoch": 0.7017800449470267, + "grad_norm": 0.24797527492046356, + "learning_rate": 1.2364550239000688e-05, + "loss": 0.1149, + "step": 39346 + }, + { + "epoch": 0.7017978810687404, + "grad_norm": 0.3228990137577057, + "learning_rate": 1.2363207192557772e-05, + "loss": 0.127, + "step": 39347 + }, + { + "epoch": 0.7018157171904541, + "grad_norm": 0.25563138723373413, + "learning_rate": 1.2361864195099437e-05, + "loss": 0.1181, + "step": 39348 + }, + { + "epoch": 0.7018335533121678, + "grad_norm": 0.24893851578235626, + "learning_rate": 1.2360521246630882e-05, + "loss": 0.1232, + "step": 39349 + }, + { + "epoch": 0.7018513894338815, + "grad_norm": 0.22202596068382263, + "learning_rate": 1.2359178347157306e-05, + "loss": 0.1206, + "step": 39350 + }, + { + "epoch": 0.7018692255555952, + "grad_norm": 0.23783749341964722, + "learning_rate": 1.2357835496683926e-05, + "loss": 0.1204, + "step": 39351 + }, + { + "epoch": 0.7018870616773089, + "grad_norm": 0.36229702830314636, + "learning_rate": 1.2356492695215943e-05, + "loss": 0.1143, + "step": 39352 + }, + { + "epoch": 0.7019048977990225, + "grad_norm": 0.3061220645904541, + "learning_rate": 1.2355149942758562e-05, + "loss": 0.1282, + "step": 39353 + }, + { + "epoch": 0.7019227339207362, + "grad_norm": 0.3575322926044464, + "learning_rate": 1.2353807239316988e-05, + "loss": 0.1051, + "step": 39354 + }, + { + "epoch": 0.7019405700424499, + "grad_norm": 0.25187069177627563, + "learning_rate": 1.2352464584896415e-05, + "loss": 0.1323, + "step": 39355 + }, + { + "epoch": 0.7019584061641637, + "grad_norm": 0.20016524195671082, + "learning_rate": 1.2351121979502066e-05, + "loss": 0.0799, + "step": 39356 + }, + { + "epoch": 0.7019762422858774, + "grad_norm": 0.22806833684444427, + "learning_rate": 1.2349779423139138e-05, + "loss": 0.1682, + "step": 39357 + }, + { + "epoch": 0.7019940784075911, + "grad_norm": 0.28366509079933167, + "learning_rate": 1.2348436915812824e-05, + "loss": 0.133, + "step": 39358 + }, + { + "epoch": 0.7020119145293048, + "grad_norm": 0.2176230102777481, + "learning_rate": 1.2347094457528349e-05, + "loss": 0.0906, + "step": 39359 + }, + { + "epoch": 0.7020297506510185, + "grad_norm": 0.28068429231643677, + "learning_rate": 1.2345752048290895e-05, + "loss": 0.1347, + "step": 39360 + }, + { + "epoch": 0.7020475867727322, + "grad_norm": 0.2743104100227356, + "learning_rate": 1.2344409688105688e-05, + "loss": 0.1093, + "step": 39361 + }, + { + "epoch": 0.7020654228944458, + "grad_norm": 0.21048085391521454, + "learning_rate": 1.234306737697792e-05, + "loss": 0.0636, + "step": 39362 + }, + { + "epoch": 0.7020832590161595, + "grad_norm": 0.2708156406879425, + "learning_rate": 1.2341725114912783e-05, + "loss": 0.1382, + "step": 39363 + }, + { + "epoch": 0.7021010951378732, + "grad_norm": 0.2513867914676666, + "learning_rate": 1.2340382901915504e-05, + "loss": 0.1177, + "step": 39364 + }, + { + "epoch": 0.7021189312595869, + "grad_norm": 0.2737678289413452, + "learning_rate": 1.233904073799127e-05, + "loss": 0.1223, + "step": 39365 + }, + { + "epoch": 0.7021367673813006, + "grad_norm": 0.25893157720565796, + "learning_rate": 1.2337698623145288e-05, + "loss": 0.1446, + "step": 39366 + }, + { + "epoch": 0.7021546035030143, + "grad_norm": 0.2867906987667084, + "learning_rate": 1.233635655738276e-05, + "loss": 0.1371, + "step": 39367 + }, + { + "epoch": 0.702172439624728, + "grad_norm": 0.20060017704963684, + "learning_rate": 1.2335014540708879e-05, + "loss": 0.1195, + "step": 39368 + }, + { + "epoch": 0.7021902757464417, + "grad_norm": 0.23535068333148956, + "learning_rate": 1.2333672573128866e-05, + "loss": 0.1107, + "step": 39369 + }, + { + "epoch": 0.7022081118681553, + "grad_norm": 0.3587065041065216, + "learning_rate": 1.2332330654647912e-05, + "loss": 0.111, + "step": 39370 + }, + { + "epoch": 0.702225947989869, + "grad_norm": 0.38629475235939026, + "learning_rate": 1.233098878527122e-05, + "loss": 0.1376, + "step": 39371 + }, + { + "epoch": 0.7022437841115827, + "grad_norm": 0.26195722818374634, + "learning_rate": 1.232964696500398e-05, + "loss": 0.1428, + "step": 39372 + }, + { + "epoch": 0.7022616202332965, + "grad_norm": 0.20094266533851624, + "learning_rate": 1.2328305193851414e-05, + "loss": 0.0899, + "step": 39373 + }, + { + "epoch": 0.7022794563550102, + "grad_norm": 0.2695089876651764, + "learning_rate": 1.2326963471818707e-05, + "loss": 0.1036, + "step": 39374 + }, + { + "epoch": 0.7022972924767239, + "grad_norm": 0.3112042546272278, + "learning_rate": 1.2325621798911074e-05, + "loss": 0.0974, + "step": 39375 + }, + { + "epoch": 0.7023151285984376, + "grad_norm": 0.5171024203300476, + "learning_rate": 1.2324280175133701e-05, + "loss": 0.1584, + "step": 39376 + }, + { + "epoch": 0.7023329647201513, + "grad_norm": 0.23663710057735443, + "learning_rate": 1.2322938600491805e-05, + "loss": 0.1178, + "step": 39377 + }, + { + "epoch": 0.702350800841865, + "grad_norm": 0.22713212668895721, + "learning_rate": 1.2321597074990575e-05, + "loss": 0.078, + "step": 39378 + }, + { + "epoch": 0.7023686369635787, + "grad_norm": 0.30640026926994324, + "learning_rate": 1.2320255598635216e-05, + "loss": 0.1312, + "step": 39379 + }, + { + "epoch": 0.7023864730852923, + "grad_norm": 0.30451124906539917, + "learning_rate": 1.2318914171430925e-05, + "loss": 0.1218, + "step": 39380 + }, + { + "epoch": 0.702404309207006, + "grad_norm": 0.2545153796672821, + "learning_rate": 1.2317572793382892e-05, + "loss": 0.1276, + "step": 39381 + }, + { + "epoch": 0.7024221453287197, + "grad_norm": 0.25689148902893066, + "learning_rate": 1.2316231464496338e-05, + "loss": 0.0972, + "step": 39382 + }, + { + "epoch": 0.7024399814504334, + "grad_norm": 0.1817709505558014, + "learning_rate": 1.2314890184776454e-05, + "loss": 0.0812, + "step": 39383 + }, + { + "epoch": 0.7024578175721471, + "grad_norm": 0.2498323917388916, + "learning_rate": 1.2313548954228432e-05, + "loss": 0.1062, + "step": 39384 + }, + { + "epoch": 0.7024756536938608, + "grad_norm": 0.2677672505378723, + "learning_rate": 1.231220777285747e-05, + "loss": 0.1226, + "step": 39385 + }, + { + "epoch": 0.7024934898155745, + "grad_norm": 0.3736957907676697, + "learning_rate": 1.2310866640668784e-05, + "loss": 0.108, + "step": 39386 + }, + { + "epoch": 0.7025113259372882, + "grad_norm": 0.29187342524528503, + "learning_rate": 1.2309525557667547e-05, + "loss": 0.1136, + "step": 39387 + }, + { + "epoch": 0.7025291620590018, + "grad_norm": 0.2804716229438782, + "learning_rate": 1.2308184523858985e-05, + "loss": 0.1182, + "step": 39388 + }, + { + "epoch": 0.7025469981807155, + "grad_norm": 0.2445303499698639, + "learning_rate": 1.2306843539248272e-05, + "loss": 0.1022, + "step": 39389 + }, + { + "epoch": 0.7025648343024293, + "grad_norm": 0.2529180347919464, + "learning_rate": 1.230550260384063e-05, + "loss": 0.1109, + "step": 39390 + }, + { + "epoch": 0.702582670424143, + "grad_norm": 0.2855203151702881, + "learning_rate": 1.2304161717641241e-05, + "loss": 0.1209, + "step": 39391 + }, + { + "epoch": 0.7026005065458567, + "grad_norm": 0.39952555298805237, + "learning_rate": 1.2302820880655308e-05, + "loss": 0.0753, + "step": 39392 + }, + { + "epoch": 0.7026183426675704, + "grad_norm": 0.24199889600276947, + "learning_rate": 1.2301480092888026e-05, + "loss": 0.1087, + "step": 39393 + }, + { + "epoch": 0.7026361787892841, + "grad_norm": 0.24643494188785553, + "learning_rate": 1.2300139354344586e-05, + "loss": 0.1237, + "step": 39394 + }, + { + "epoch": 0.7026540149109978, + "grad_norm": 0.35643017292022705, + "learning_rate": 1.2298798665030198e-05, + "loss": 0.1379, + "step": 39395 + }, + { + "epoch": 0.7026718510327115, + "grad_norm": 0.2550109922885895, + "learning_rate": 1.2297458024950057e-05, + "loss": 0.1142, + "step": 39396 + }, + { + "epoch": 0.7026896871544251, + "grad_norm": 0.3141452372074127, + "learning_rate": 1.2296117434109353e-05, + "loss": 0.128, + "step": 39397 + }, + { + "epoch": 0.7027075232761388, + "grad_norm": 0.21773818135261536, + "learning_rate": 1.2294776892513277e-05, + "loss": 0.0962, + "step": 39398 + }, + { + "epoch": 0.7027253593978525, + "grad_norm": 0.3088330030441284, + "learning_rate": 1.2293436400167043e-05, + "loss": 0.1411, + "step": 39399 + }, + { + "epoch": 0.7027431955195662, + "grad_norm": 0.31708624958992004, + "learning_rate": 1.2292095957075841e-05, + "loss": 0.1083, + "step": 39400 + }, + { + "epoch": 0.7027610316412799, + "grad_norm": 0.25620579719543457, + "learning_rate": 1.2290755563244851e-05, + "loss": 0.1004, + "step": 39401 + }, + { + "epoch": 0.7027788677629936, + "grad_norm": 0.3027898371219635, + "learning_rate": 1.2289415218679284e-05, + "loss": 0.1421, + "step": 39402 + }, + { + "epoch": 0.7027967038847073, + "grad_norm": 0.26583635807037354, + "learning_rate": 1.2288074923384344e-05, + "loss": 0.135, + "step": 39403 + }, + { + "epoch": 0.702814540006421, + "grad_norm": 0.3299161195755005, + "learning_rate": 1.2286734677365214e-05, + "loss": 0.114, + "step": 39404 + }, + { + "epoch": 0.7028323761281347, + "grad_norm": 0.2883865237236023, + "learning_rate": 1.2285394480627094e-05, + "loss": 0.1144, + "step": 39405 + }, + { + "epoch": 0.7028502122498483, + "grad_norm": 0.26030004024505615, + "learning_rate": 1.2284054333175174e-05, + "loss": 0.1194, + "step": 39406 + }, + { + "epoch": 0.7028680483715621, + "grad_norm": 0.26997897028923035, + "learning_rate": 1.2282714235014641e-05, + "loss": 0.1487, + "step": 39407 + }, + { + "epoch": 0.7028858844932758, + "grad_norm": 0.33512449264526367, + "learning_rate": 1.2281374186150713e-05, + "loss": 0.1284, + "step": 39408 + }, + { + "epoch": 0.7029037206149895, + "grad_norm": 0.19745367765426636, + "learning_rate": 1.228003418658857e-05, + "loss": 0.11, + "step": 39409 + }, + { + "epoch": 0.7029215567367032, + "grad_norm": 0.314870148897171, + "learning_rate": 1.2278694236333407e-05, + "loss": 0.1632, + "step": 39410 + }, + { + "epoch": 0.7029393928584169, + "grad_norm": 0.2699345052242279, + "learning_rate": 1.2277354335390411e-05, + "loss": 0.1547, + "step": 39411 + }, + { + "epoch": 0.7029572289801306, + "grad_norm": 0.31500157713890076, + "learning_rate": 1.2276014483764791e-05, + "loss": 0.1133, + "step": 39412 + }, + { + "epoch": 0.7029750651018443, + "grad_norm": 0.2887321412563324, + "learning_rate": 1.2274674681461737e-05, + "loss": 0.1375, + "step": 39413 + }, + { + "epoch": 0.702992901223558, + "grad_norm": 0.3022030293941498, + "learning_rate": 1.2273334928486427e-05, + "loss": 0.1185, + "step": 39414 + }, + { + "epoch": 0.7030107373452716, + "grad_norm": 0.48569798469543457, + "learning_rate": 1.2271995224844076e-05, + "loss": 0.079, + "step": 39415 + }, + { + "epoch": 0.7030285734669853, + "grad_norm": 0.29759669303894043, + "learning_rate": 1.227065557053986e-05, + "loss": 0.0821, + "step": 39416 + }, + { + "epoch": 0.703046409588699, + "grad_norm": 0.21454553306102753, + "learning_rate": 1.2269315965578987e-05, + "loss": 0.0806, + "step": 39417 + }, + { + "epoch": 0.7030642457104127, + "grad_norm": 0.26078954339027405, + "learning_rate": 1.2267976409966645e-05, + "loss": 0.1735, + "step": 39418 + }, + { + "epoch": 0.7030820818321264, + "grad_norm": 0.2663392424583435, + "learning_rate": 1.226663690370802e-05, + "loss": 0.1109, + "step": 39419 + }, + { + "epoch": 0.7030999179538401, + "grad_norm": 0.35911014676094055, + "learning_rate": 1.2265297446808302e-05, + "loss": 0.1285, + "step": 39420 + }, + { + "epoch": 0.7031177540755538, + "grad_norm": 0.276168555021286, + "learning_rate": 1.22639580392727e-05, + "loss": 0.1165, + "step": 39421 + }, + { + "epoch": 0.7031355901972675, + "grad_norm": 0.2792278826236725, + "learning_rate": 1.2262618681106392e-05, + "loss": 0.1732, + "step": 39422 + }, + { + "epoch": 0.7031534263189813, + "grad_norm": 0.29941728711128235, + "learning_rate": 1.2261279372314574e-05, + "loss": 0.0968, + "step": 39423 + }, + { + "epoch": 0.7031712624406949, + "grad_norm": 0.2718508541584015, + "learning_rate": 1.225994011290243e-05, + "loss": 0.1376, + "step": 39424 + }, + { + "epoch": 0.7031890985624086, + "grad_norm": 0.2902662754058838, + "learning_rate": 1.2258600902875165e-05, + "loss": 0.0983, + "step": 39425 + }, + { + "epoch": 0.7032069346841223, + "grad_norm": 0.3752191364765167, + "learning_rate": 1.2257261742237965e-05, + "loss": 0.1252, + "step": 39426 + }, + { + "epoch": 0.703224770805836, + "grad_norm": 0.24731576442718506, + "learning_rate": 1.225592263099602e-05, + "loss": 0.1336, + "step": 39427 + }, + { + "epoch": 0.7032426069275497, + "grad_norm": 0.3346899449825287, + "learning_rate": 1.225458356915451e-05, + "loss": 0.1678, + "step": 39428 + }, + { + "epoch": 0.7032604430492634, + "grad_norm": 0.30468907952308655, + "learning_rate": 1.2253244556718637e-05, + "loss": 0.1603, + "step": 39429 + }, + { + "epoch": 0.7032782791709771, + "grad_norm": 0.27630361914634705, + "learning_rate": 1.22519055936936e-05, + "loss": 0.0937, + "step": 39430 + }, + { + "epoch": 0.7032961152926908, + "grad_norm": 0.25730952620506287, + "learning_rate": 1.2250566680084579e-05, + "loss": 0.1094, + "step": 39431 + }, + { + "epoch": 0.7033139514144044, + "grad_norm": 0.29234641790390015, + "learning_rate": 1.2249227815896767e-05, + "loss": 0.1349, + "step": 39432 + }, + { + "epoch": 0.7033317875361181, + "grad_norm": 0.22435252368450165, + "learning_rate": 1.2247889001135343e-05, + "loss": 0.1198, + "step": 39433 + }, + { + "epoch": 0.7033496236578318, + "grad_norm": 0.312931090593338, + "learning_rate": 1.2246550235805513e-05, + "loss": 0.1234, + "step": 39434 + }, + { + "epoch": 0.7033674597795455, + "grad_norm": 0.23169714212417603, + "learning_rate": 1.2245211519912458e-05, + "loss": 0.0715, + "step": 39435 + }, + { + "epoch": 0.7033852959012592, + "grad_norm": 0.37830817699432373, + "learning_rate": 1.2243872853461372e-05, + "loss": 0.1038, + "step": 39436 + }, + { + "epoch": 0.7034031320229729, + "grad_norm": 0.33053845167160034, + "learning_rate": 1.224253423645743e-05, + "loss": 0.1499, + "step": 39437 + }, + { + "epoch": 0.7034209681446866, + "grad_norm": 0.2584063708782196, + "learning_rate": 1.2241195668905839e-05, + "loss": 0.0891, + "step": 39438 + }, + { + "epoch": 0.7034388042664003, + "grad_norm": 0.2626304626464844, + "learning_rate": 1.2239857150811782e-05, + "loss": 0.0735, + "step": 39439 + }, + { + "epoch": 0.7034566403881141, + "grad_norm": 0.2601485848426819, + "learning_rate": 1.2238518682180446e-05, + "loss": 0.1255, + "step": 39440 + }, + { + "epoch": 0.7034744765098278, + "grad_norm": 0.2184794694185257, + "learning_rate": 1.2237180263017009e-05, + "loss": 0.0729, + "step": 39441 + }, + { + "epoch": 0.7034923126315414, + "grad_norm": 0.27454760670661926, + "learning_rate": 1.223584189332668e-05, + "loss": 0.142, + "step": 39442 + }, + { + "epoch": 0.7035101487532551, + "grad_norm": 0.2277342975139618, + "learning_rate": 1.2234503573114628e-05, + "loss": 0.0954, + "step": 39443 + }, + { + "epoch": 0.7035279848749688, + "grad_norm": 0.3067512512207031, + "learning_rate": 1.2233165302386057e-05, + "loss": 0.1175, + "step": 39444 + }, + { + "epoch": 0.7035458209966825, + "grad_norm": 0.2913004159927368, + "learning_rate": 1.2231827081146147e-05, + "loss": 0.0935, + "step": 39445 + }, + { + "epoch": 0.7035636571183962, + "grad_norm": 0.26625627279281616, + "learning_rate": 1.2230488909400076e-05, + "loss": 0.1264, + "step": 39446 + }, + { + "epoch": 0.7035814932401099, + "grad_norm": 0.2908312678337097, + "learning_rate": 1.2229150787153049e-05, + "loss": 0.1131, + "step": 39447 + }, + { + "epoch": 0.7035993293618236, + "grad_norm": 0.2886578142642975, + "learning_rate": 1.2227812714410244e-05, + "loss": 0.1327, + "step": 39448 + }, + { + "epoch": 0.7036171654835373, + "grad_norm": 0.25169724225997925, + "learning_rate": 1.2226474691176852e-05, + "loss": 0.143, + "step": 39449 + }, + { + "epoch": 0.7036350016052509, + "grad_norm": 0.24674318730831146, + "learning_rate": 1.2225136717458041e-05, + "loss": 0.14, + "step": 39450 + }, + { + "epoch": 0.7036528377269646, + "grad_norm": 0.24743840098381042, + "learning_rate": 1.2223798793259026e-05, + "loss": 0.0917, + "step": 39451 + }, + { + "epoch": 0.7036706738486783, + "grad_norm": 0.2742423713207245, + "learning_rate": 1.2222460918584977e-05, + "loss": 0.1268, + "step": 39452 + }, + { + "epoch": 0.703688509970392, + "grad_norm": 0.2715749740600586, + "learning_rate": 1.2221123093441087e-05, + "loss": 0.1562, + "step": 39453 + }, + { + "epoch": 0.7037063460921057, + "grad_norm": 0.2605179250240326, + "learning_rate": 1.2219785317832525e-05, + "loss": 0.1122, + "step": 39454 + }, + { + "epoch": 0.7037241822138194, + "grad_norm": 0.21195285022258759, + "learning_rate": 1.2218447591764498e-05, + "loss": 0.1308, + "step": 39455 + }, + { + "epoch": 0.7037420183355331, + "grad_norm": 0.2376292645931244, + "learning_rate": 1.2217109915242173e-05, + "loss": 0.1094, + "step": 39456 + }, + { + "epoch": 0.7037598544572469, + "grad_norm": 0.27844753861427307, + "learning_rate": 1.2215772288270754e-05, + "loss": 0.1126, + "step": 39457 + }, + { + "epoch": 0.7037776905789606, + "grad_norm": 0.29905635118484497, + "learning_rate": 1.2214434710855422e-05, + "loss": 0.0924, + "step": 39458 + }, + { + "epoch": 0.7037955267006742, + "grad_norm": 0.3172041177749634, + "learning_rate": 1.2213097183001343e-05, + "loss": 0.1333, + "step": 39459 + }, + { + "epoch": 0.7038133628223879, + "grad_norm": 0.28696951270103455, + "learning_rate": 1.2211759704713726e-05, + "loss": 0.1156, + "step": 39460 + }, + { + "epoch": 0.7038311989441016, + "grad_norm": 0.33109989762306213, + "learning_rate": 1.2210422275997747e-05, + "loss": 0.1221, + "step": 39461 + }, + { + "epoch": 0.7038490350658153, + "grad_norm": 0.2560708820819855, + "learning_rate": 1.2209084896858586e-05, + "loss": 0.1148, + "step": 39462 + }, + { + "epoch": 0.703866871187529, + "grad_norm": 0.2727649211883545, + "learning_rate": 1.2207747567301423e-05, + "loss": 0.1355, + "step": 39463 + }, + { + "epoch": 0.7038847073092427, + "grad_norm": 0.22568921744823456, + "learning_rate": 1.2206410287331458e-05, + "loss": 0.14, + "step": 39464 + }, + { + "epoch": 0.7039025434309564, + "grad_norm": 0.46994584798812866, + "learning_rate": 1.2205073056953864e-05, + "loss": 0.0991, + "step": 39465 + }, + { + "epoch": 0.7039203795526701, + "grad_norm": 0.19712673127651215, + "learning_rate": 1.2203735876173825e-05, + "loss": 0.0513, + "step": 39466 + }, + { + "epoch": 0.7039382156743837, + "grad_norm": 0.37030521035194397, + "learning_rate": 1.2202398744996519e-05, + "loss": 0.18, + "step": 39467 + }, + { + "epoch": 0.7039560517960974, + "grad_norm": 0.23003219068050385, + "learning_rate": 1.2201061663427144e-05, + "loss": 0.1192, + "step": 39468 + }, + { + "epoch": 0.7039738879178111, + "grad_norm": 0.2154010534286499, + "learning_rate": 1.2199724631470874e-05, + "loss": 0.1354, + "step": 39469 + }, + { + "epoch": 0.7039917240395248, + "grad_norm": 0.3016541600227356, + "learning_rate": 1.2198387649132884e-05, + "loss": 0.1309, + "step": 39470 + }, + { + "epoch": 0.7040095601612385, + "grad_norm": 0.28513240814208984, + "learning_rate": 1.2197050716418373e-05, + "loss": 0.1045, + "step": 39471 + }, + { + "epoch": 0.7040273962829522, + "grad_norm": 0.2755373418331146, + "learning_rate": 1.2195713833332506e-05, + "loss": 0.1074, + "step": 39472 + }, + { + "epoch": 0.7040452324046659, + "grad_norm": 0.2639329433441162, + "learning_rate": 1.2194376999880484e-05, + "loss": 0.1108, + "step": 39473 + }, + { + "epoch": 0.7040630685263797, + "grad_norm": 0.3147179186344147, + "learning_rate": 1.219304021606748e-05, + "loss": 0.0998, + "step": 39474 + }, + { + "epoch": 0.7040809046480934, + "grad_norm": 0.2252577692270279, + "learning_rate": 1.2191703481898676e-05, + "loss": 0.0922, + "step": 39475 + }, + { + "epoch": 0.704098740769807, + "grad_norm": 0.26773354411125183, + "learning_rate": 1.2190366797379244e-05, + "loss": 0.0921, + "step": 39476 + }, + { + "epoch": 0.7041165768915207, + "grad_norm": 0.19199728965759277, + "learning_rate": 1.2189030162514384e-05, + "loss": 0.086, + "step": 39477 + }, + { + "epoch": 0.7041344130132344, + "grad_norm": 0.26217150688171387, + "learning_rate": 1.2187693577309267e-05, + "loss": 0.1318, + "step": 39478 + }, + { + "epoch": 0.7041522491349481, + "grad_norm": 0.40364983677864075, + "learning_rate": 1.2186357041769075e-05, + "loss": 0.0938, + "step": 39479 + }, + { + "epoch": 0.7041700852566618, + "grad_norm": 0.2648116946220398, + "learning_rate": 1.218502055589898e-05, + "loss": 0.1234, + "step": 39480 + }, + { + "epoch": 0.7041879213783755, + "grad_norm": 0.21393592655658722, + "learning_rate": 1.2183684119704181e-05, + "loss": 0.0978, + "step": 39481 + }, + { + "epoch": 0.7042057575000892, + "grad_norm": 0.5880461931228638, + "learning_rate": 1.218234773318985e-05, + "loss": 0.164, + "step": 39482 + }, + { + "epoch": 0.7042235936218029, + "grad_norm": 0.22025704383850098, + "learning_rate": 1.2181011396361152e-05, + "loss": 0.1547, + "step": 39483 + }, + { + "epoch": 0.7042414297435166, + "grad_norm": 0.320406049489975, + "learning_rate": 1.2179675109223296e-05, + "loss": 0.1678, + "step": 39484 + }, + { + "epoch": 0.7042592658652302, + "grad_norm": 0.38594451546669006, + "learning_rate": 1.2178338871781436e-05, + "loss": 0.1789, + "step": 39485 + }, + { + "epoch": 0.7042771019869439, + "grad_norm": 0.31850072741508484, + "learning_rate": 1.2177002684040773e-05, + "loss": 0.1568, + "step": 39486 + }, + { + "epoch": 0.7042949381086576, + "grad_norm": 0.2897709012031555, + "learning_rate": 1.2175666546006475e-05, + "loss": 0.1545, + "step": 39487 + }, + { + "epoch": 0.7043127742303713, + "grad_norm": 0.2247619777917862, + "learning_rate": 1.2174330457683724e-05, + "loss": 0.1233, + "step": 39488 + }, + { + "epoch": 0.704330610352085, + "grad_norm": 0.2698691487312317, + "learning_rate": 1.2172994419077688e-05, + "loss": 0.1577, + "step": 39489 + }, + { + "epoch": 0.7043484464737987, + "grad_norm": 0.197042316198349, + "learning_rate": 1.2171658430193566e-05, + "loss": 0.08, + "step": 39490 + }, + { + "epoch": 0.7043662825955125, + "grad_norm": 0.28770431876182556, + "learning_rate": 1.2170322491036529e-05, + "loss": 0.1744, + "step": 39491 + }, + { + "epoch": 0.7043841187172262, + "grad_norm": 0.2561630606651306, + "learning_rate": 1.2168986601611747e-05, + "loss": 0.1417, + "step": 39492 + }, + { + "epoch": 0.7044019548389399, + "grad_norm": 0.250017911195755, + "learning_rate": 1.2167650761924402e-05, + "loss": 0.1235, + "step": 39493 + }, + { + "epoch": 0.7044197909606535, + "grad_norm": 0.2779890298843384, + "learning_rate": 1.2166314971979681e-05, + "loss": 0.1395, + "step": 39494 + }, + { + "epoch": 0.7044376270823672, + "grad_norm": 0.3253113329410553, + "learning_rate": 1.2164979231782755e-05, + "loss": 0.156, + "step": 39495 + }, + { + "epoch": 0.7044554632040809, + "grad_norm": 0.6006177067756653, + "learning_rate": 1.2163643541338803e-05, + "loss": 0.1355, + "step": 39496 + }, + { + "epoch": 0.7044732993257946, + "grad_norm": 0.2981247305870056, + "learning_rate": 1.2162307900652994e-05, + "loss": 0.1408, + "step": 39497 + }, + { + "epoch": 0.7044911354475083, + "grad_norm": 0.3281283676624298, + "learning_rate": 1.2160972309730522e-05, + "loss": 0.1117, + "step": 39498 + }, + { + "epoch": 0.704508971569222, + "grad_norm": 0.31892332434654236, + "learning_rate": 1.2159636768576546e-05, + "loss": 0.1264, + "step": 39499 + }, + { + "epoch": 0.7045268076909357, + "grad_norm": 0.26161426305770874, + "learning_rate": 1.2158301277196263e-05, + "loss": 0.1437, + "step": 39500 + }, + { + "epoch": 0.7045446438126494, + "grad_norm": 0.2609124779701233, + "learning_rate": 1.2156965835594841e-05, + "loss": 0.1264, + "step": 39501 + }, + { + "epoch": 0.704562479934363, + "grad_norm": 0.2709504961967468, + "learning_rate": 1.2155630443777444e-05, + "loss": 0.116, + "step": 39502 + }, + { + "epoch": 0.7045803160560767, + "grad_norm": 0.21923157572746277, + "learning_rate": 1.215429510174927e-05, + "loss": 0.0948, + "step": 39503 + }, + { + "epoch": 0.7045981521777904, + "grad_norm": 0.35085877776145935, + "learning_rate": 1.2152959809515483e-05, + "loss": 0.1545, + "step": 39504 + }, + { + "epoch": 0.7046159882995041, + "grad_norm": 0.3090458810329437, + "learning_rate": 1.2151624567081263e-05, + "loss": 0.1022, + "step": 39505 + }, + { + "epoch": 0.7046338244212178, + "grad_norm": 0.24669237434864044, + "learning_rate": 1.2150289374451773e-05, + "loss": 0.1294, + "step": 39506 + }, + { + "epoch": 0.7046516605429315, + "grad_norm": 0.2640063762664795, + "learning_rate": 1.2148954231632212e-05, + "loss": 0.1388, + "step": 39507 + }, + { + "epoch": 0.7046694966646453, + "grad_norm": 0.3696783483028412, + "learning_rate": 1.2147619138627739e-05, + "loss": 0.1119, + "step": 39508 + }, + { + "epoch": 0.704687332786359, + "grad_norm": 0.3612895905971527, + "learning_rate": 1.2146284095443536e-05, + "loss": 0.1096, + "step": 39509 + }, + { + "epoch": 0.7047051689080727, + "grad_norm": 0.24098873138427734, + "learning_rate": 1.2144949102084774e-05, + "loss": 0.0826, + "step": 39510 + }, + { + "epoch": 0.7047230050297864, + "grad_norm": 0.3146705627441406, + "learning_rate": 1.2143614158556621e-05, + "loss": 0.153, + "step": 39511 + }, + { + "epoch": 0.7047408411515, + "grad_norm": 0.28849712014198303, + "learning_rate": 1.214227926486426e-05, + "loss": 0.1159, + "step": 39512 + }, + { + "epoch": 0.7047586772732137, + "grad_norm": 0.28125548362731934, + "learning_rate": 1.2140944421012873e-05, + "loss": 0.132, + "step": 39513 + }, + { + "epoch": 0.7047765133949274, + "grad_norm": 0.382479727268219, + "learning_rate": 1.2139609627007628e-05, + "loss": 0.0839, + "step": 39514 + }, + { + "epoch": 0.7047943495166411, + "grad_norm": 0.20573905110359192, + "learning_rate": 1.213827488285369e-05, + "loss": 0.0919, + "step": 39515 + }, + { + "epoch": 0.7048121856383548, + "grad_norm": 0.269321084022522, + "learning_rate": 1.2136940188556248e-05, + "loss": 0.1145, + "step": 39516 + }, + { + "epoch": 0.7048300217600685, + "grad_norm": 0.342955619096756, + "learning_rate": 1.2135605544120469e-05, + "loss": 0.1631, + "step": 39517 + }, + { + "epoch": 0.7048478578817822, + "grad_norm": 0.3823602795600891, + "learning_rate": 1.2134270949551526e-05, + "loss": 0.157, + "step": 39518 + }, + { + "epoch": 0.7048656940034959, + "grad_norm": 0.24534699320793152, + "learning_rate": 1.2132936404854583e-05, + "loss": 0.1119, + "step": 39519 + }, + { + "epoch": 0.7048835301252095, + "grad_norm": 0.32959961891174316, + "learning_rate": 1.2131601910034835e-05, + "loss": 0.1327, + "step": 39520 + }, + { + "epoch": 0.7049013662469232, + "grad_norm": 0.21906019747257233, + "learning_rate": 1.2130267465097439e-05, + "loss": 0.0624, + "step": 39521 + }, + { + "epoch": 0.7049192023686369, + "grad_norm": 0.2751816213130951, + "learning_rate": 1.2128933070047572e-05, + "loss": 0.0849, + "step": 39522 + }, + { + "epoch": 0.7049370384903506, + "grad_norm": 0.2616502046585083, + "learning_rate": 1.2127598724890407e-05, + "loss": 0.1097, + "step": 39523 + }, + { + "epoch": 0.7049548746120643, + "grad_norm": 0.2105613797903061, + "learning_rate": 1.2126264429631104e-05, + "loss": 0.0955, + "step": 39524 + }, + { + "epoch": 0.7049727107337781, + "grad_norm": 0.3224189281463623, + "learning_rate": 1.2124930184274857e-05, + "loss": 0.1294, + "step": 39525 + }, + { + "epoch": 0.7049905468554918, + "grad_norm": 0.22643953561782837, + "learning_rate": 1.212359598882682e-05, + "loss": 0.1383, + "step": 39526 + }, + { + "epoch": 0.7050083829772055, + "grad_norm": 0.3114086985588074, + "learning_rate": 1.212226184329218e-05, + "loss": 0.1441, + "step": 39527 + }, + { + "epoch": 0.7050262190989192, + "grad_norm": 0.2591535151004791, + "learning_rate": 1.2120927747676093e-05, + "loss": 0.1313, + "step": 39528 + }, + { + "epoch": 0.7050440552206328, + "grad_norm": 0.26897308230400085, + "learning_rate": 1.2119593701983745e-05, + "loss": 0.1032, + "step": 39529 + }, + { + "epoch": 0.7050618913423465, + "grad_norm": 0.2618337869644165, + "learning_rate": 1.2118259706220303e-05, + "loss": 0.1687, + "step": 39530 + }, + { + "epoch": 0.7050797274640602, + "grad_norm": 0.32848429679870605, + "learning_rate": 1.2116925760390934e-05, + "loss": 0.1022, + "step": 39531 + }, + { + "epoch": 0.7050975635857739, + "grad_norm": 0.4262397885322571, + "learning_rate": 1.21155918645008e-05, + "loss": 0.1925, + "step": 39532 + }, + { + "epoch": 0.7051153997074876, + "grad_norm": 0.26168233156204224, + "learning_rate": 1.2114258018555094e-05, + "loss": 0.107, + "step": 39533 + }, + { + "epoch": 0.7051332358292013, + "grad_norm": 0.30384665727615356, + "learning_rate": 1.2112924222558975e-05, + "loss": 0.1858, + "step": 39534 + }, + { + "epoch": 0.705151071950915, + "grad_norm": 0.24300090968608856, + "learning_rate": 1.2111590476517609e-05, + "loss": 0.1479, + "step": 39535 + }, + { + "epoch": 0.7051689080726287, + "grad_norm": 0.31717199087142944, + "learning_rate": 1.2110256780436174e-05, + "loss": 0.1077, + "step": 39536 + }, + { + "epoch": 0.7051867441943424, + "grad_norm": 0.2913479804992676, + "learning_rate": 1.2108923134319825e-05, + "loss": 0.1318, + "step": 39537 + }, + { + "epoch": 0.705204580316056, + "grad_norm": 0.33571234345436096, + "learning_rate": 1.210758953817375e-05, + "loss": 0.0726, + "step": 39538 + }, + { + "epoch": 0.7052224164377697, + "grad_norm": 0.24905113875865936, + "learning_rate": 1.2106255992003102e-05, + "loss": 0.1174, + "step": 39539 + }, + { + "epoch": 0.7052402525594834, + "grad_norm": 0.3410506844520569, + "learning_rate": 1.210492249581307e-05, + "loss": 0.1413, + "step": 39540 + }, + { + "epoch": 0.7052580886811972, + "grad_norm": 0.35247164964675903, + "learning_rate": 1.2103589049608802e-05, + "loss": 0.1353, + "step": 39541 + }, + { + "epoch": 0.7052759248029109, + "grad_norm": 0.22630973160266876, + "learning_rate": 1.2102255653395486e-05, + "loss": 0.0838, + "step": 39542 + }, + { + "epoch": 0.7052937609246246, + "grad_norm": 0.3605957329273224, + "learning_rate": 1.2100922307178284e-05, + "loss": 0.0771, + "step": 39543 + }, + { + "epoch": 0.7053115970463383, + "grad_norm": 0.23125700652599335, + "learning_rate": 1.2099589010962358e-05, + "loss": 0.1499, + "step": 39544 + }, + { + "epoch": 0.705329433168052, + "grad_norm": 0.2557556927204132, + "learning_rate": 1.2098255764752874e-05, + "loss": 0.1322, + "step": 39545 + }, + { + "epoch": 0.7053472692897657, + "grad_norm": 0.23560579121112823, + "learning_rate": 1.2096922568555016e-05, + "loss": 0.092, + "step": 39546 + }, + { + "epoch": 0.7053651054114793, + "grad_norm": 0.23158732056617737, + "learning_rate": 1.2095589422373946e-05, + "loss": 0.1317, + "step": 39547 + }, + { + "epoch": 0.705382941533193, + "grad_norm": 0.3390738368034363, + "learning_rate": 1.2094256326214823e-05, + "loss": 0.1297, + "step": 39548 + }, + { + "epoch": 0.7054007776549067, + "grad_norm": 0.3284175992012024, + "learning_rate": 1.2092923280082823e-05, + "loss": 0.0942, + "step": 39549 + }, + { + "epoch": 0.7054186137766204, + "grad_norm": 0.18790553510189056, + "learning_rate": 1.20915902839831e-05, + "loss": 0.0931, + "step": 39550 + }, + { + "epoch": 0.7054364498983341, + "grad_norm": 0.23232364654541016, + "learning_rate": 1.209025733792084e-05, + "loss": 0.1161, + "step": 39551 + }, + { + "epoch": 0.7054542860200478, + "grad_norm": 0.2700973153114319, + "learning_rate": 1.2088924441901203e-05, + "loss": 0.1284, + "step": 39552 + }, + { + "epoch": 0.7054721221417615, + "grad_norm": 0.20915038883686066, + "learning_rate": 1.2087591595929345e-05, + "loss": 0.082, + "step": 39553 + }, + { + "epoch": 0.7054899582634752, + "grad_norm": 0.4089609384536743, + "learning_rate": 1.208625880001045e-05, + "loss": 0.1622, + "step": 39554 + }, + { + "epoch": 0.7055077943851888, + "grad_norm": 0.37525680661201477, + "learning_rate": 1.2084926054149667e-05, + "loss": 0.1201, + "step": 39555 + }, + { + "epoch": 0.7055256305069025, + "grad_norm": 0.3497820794582367, + "learning_rate": 1.2083593358352182e-05, + "loss": 0.1389, + "step": 39556 + }, + { + "epoch": 0.7055434666286162, + "grad_norm": 0.2753267288208008, + "learning_rate": 1.208226071262315e-05, + "loss": 0.0948, + "step": 39557 + }, + { + "epoch": 0.70556130275033, + "grad_norm": 0.2556132972240448, + "learning_rate": 1.2080928116967726e-05, + "loss": 0.096, + "step": 39558 + }, + { + "epoch": 0.7055791388720437, + "grad_norm": 0.22126372158527374, + "learning_rate": 1.2079595571391098e-05, + "loss": 0.122, + "step": 39559 + }, + { + "epoch": 0.7055969749937574, + "grad_norm": 0.1990654319524765, + "learning_rate": 1.2078263075898419e-05, + "loss": 0.0653, + "step": 39560 + }, + { + "epoch": 0.7056148111154711, + "grad_norm": 0.19901281595230103, + "learning_rate": 1.2076930630494856e-05, + "loss": 0.066, + "step": 39561 + }, + { + "epoch": 0.7056326472371848, + "grad_norm": 0.30381327867507935, + "learning_rate": 1.2075598235185574e-05, + "loss": 0.1236, + "step": 39562 + }, + { + "epoch": 0.7056504833588985, + "grad_norm": 0.24971404671669006, + "learning_rate": 1.2074265889975725e-05, + "loss": 0.0918, + "step": 39563 + }, + { + "epoch": 0.7056683194806121, + "grad_norm": 0.3000890016555786, + "learning_rate": 1.2072933594870498e-05, + "loss": 0.1471, + "step": 39564 + }, + { + "epoch": 0.7056861556023258, + "grad_norm": 0.25084424018859863, + "learning_rate": 1.2071601349875045e-05, + "loss": 0.1427, + "step": 39565 + }, + { + "epoch": 0.7057039917240395, + "grad_norm": 0.3690944015979767, + "learning_rate": 1.2070269154994517e-05, + "loss": 0.1524, + "step": 39566 + }, + { + "epoch": 0.7057218278457532, + "grad_norm": 0.2730376422405243, + "learning_rate": 1.2068937010234105e-05, + "loss": 0.0929, + "step": 39567 + }, + { + "epoch": 0.7057396639674669, + "grad_norm": 0.27159979939460754, + "learning_rate": 1.2067604915598952e-05, + "loss": 0.1036, + "step": 39568 + }, + { + "epoch": 0.7057575000891806, + "grad_norm": 0.25176775455474854, + "learning_rate": 1.2066272871094233e-05, + "loss": 0.133, + "step": 39569 + }, + { + "epoch": 0.7057753362108943, + "grad_norm": 0.19935861229896545, + "learning_rate": 1.2064940876725111e-05, + "loss": 0.1033, + "step": 39570 + }, + { + "epoch": 0.705793172332608, + "grad_norm": 0.30074551701545715, + "learning_rate": 1.2063608932496736e-05, + "loss": 0.175, + "step": 39571 + }, + { + "epoch": 0.7058110084543217, + "grad_norm": 0.2840253412723541, + "learning_rate": 1.2062277038414291e-05, + "loss": 0.1232, + "step": 39572 + }, + { + "epoch": 0.7058288445760353, + "grad_norm": 0.4243912398815155, + "learning_rate": 1.2060945194482925e-05, + "loss": 0.1485, + "step": 39573 + }, + { + "epoch": 0.705846680697749, + "grad_norm": 0.232137992978096, + "learning_rate": 1.2059613400707809e-05, + "loss": 0.1552, + "step": 39574 + }, + { + "epoch": 0.7058645168194628, + "grad_norm": 0.22122007608413696, + "learning_rate": 1.2058281657094097e-05, + "loss": 0.0808, + "step": 39575 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.20192213356494904, + "learning_rate": 1.2056949963646948e-05, + "loss": 0.0747, + "step": 39576 + }, + { + "epoch": 0.7059001890628902, + "grad_norm": 0.3498326539993286, + "learning_rate": 1.2055618320371541e-05, + "loss": 0.1185, + "step": 39577 + }, + { + "epoch": 0.7059180251846039, + "grad_norm": 0.31498607993125916, + "learning_rate": 1.2054286727273028e-05, + "loss": 0.1426, + "step": 39578 + }, + { + "epoch": 0.7059358613063176, + "grad_norm": 0.4067002832889557, + "learning_rate": 1.205295518435657e-05, + "loss": 0.1821, + "step": 39579 + }, + { + "epoch": 0.7059536974280313, + "grad_norm": 0.29944872856140137, + "learning_rate": 1.2051623691627322e-05, + "loss": 0.103, + "step": 39580 + }, + { + "epoch": 0.705971533549745, + "grad_norm": 0.2615627646446228, + "learning_rate": 1.2050292249090462e-05, + "loss": 0.1034, + "step": 39581 + }, + { + "epoch": 0.7059893696714586, + "grad_norm": 0.38268721103668213, + "learning_rate": 1.2048960856751132e-05, + "loss": 0.1638, + "step": 39582 + }, + { + "epoch": 0.7060072057931723, + "grad_norm": 0.3006829619407654, + "learning_rate": 1.2047629514614512e-05, + "loss": 0.087, + "step": 39583 + }, + { + "epoch": 0.706025041914886, + "grad_norm": 0.2751002013683319, + "learning_rate": 1.2046298222685743e-05, + "loss": 0.1336, + "step": 39584 + }, + { + "epoch": 0.7060428780365997, + "grad_norm": 0.21429263055324554, + "learning_rate": 1.2044966980970008e-05, + "loss": 0.0848, + "step": 39585 + }, + { + "epoch": 0.7060607141583134, + "grad_norm": 0.3600725531578064, + "learning_rate": 1.2043635789472452e-05, + "loss": 0.157, + "step": 39586 + }, + { + "epoch": 0.7060785502800271, + "grad_norm": 0.2642713785171509, + "learning_rate": 1.204230464819824e-05, + "loss": 0.128, + "step": 39587 + }, + { + "epoch": 0.7060963864017408, + "grad_norm": 0.2573707699775696, + "learning_rate": 1.2040973557152533e-05, + "loss": 0.1207, + "step": 39588 + }, + { + "epoch": 0.7061142225234545, + "grad_norm": 0.23546798527240753, + "learning_rate": 1.2039642516340477e-05, + "loss": 0.0956, + "step": 39589 + }, + { + "epoch": 0.7061320586451681, + "grad_norm": 0.3157248795032501, + "learning_rate": 1.2038311525767252e-05, + "loss": 0.1373, + "step": 39590 + }, + { + "epoch": 0.7061498947668818, + "grad_norm": 0.2501681447029114, + "learning_rate": 1.203698058543801e-05, + "loss": 0.1755, + "step": 39591 + }, + { + "epoch": 0.7061677308885956, + "grad_norm": 0.2777828872203827, + "learning_rate": 1.2035649695357906e-05, + "loss": 0.0528, + "step": 39592 + }, + { + "epoch": 0.7061855670103093, + "grad_norm": 0.2489694356918335, + "learning_rate": 1.2034318855532095e-05, + "loss": 0.1044, + "step": 39593 + }, + { + "epoch": 0.706203403132023, + "grad_norm": 0.24624349176883698, + "learning_rate": 1.203298806596575e-05, + "loss": 0.0771, + "step": 39594 + }, + { + "epoch": 0.7062212392537367, + "grad_norm": 0.27731943130493164, + "learning_rate": 1.2031657326664014e-05, + "loss": 0.1418, + "step": 39595 + }, + { + "epoch": 0.7062390753754504, + "grad_norm": 0.3589498996734619, + "learning_rate": 1.2030326637632061e-05, + "loss": 0.1626, + "step": 39596 + }, + { + "epoch": 0.7062569114971641, + "grad_norm": 0.42085835337638855, + "learning_rate": 1.202899599887503e-05, + "loss": 0.1956, + "step": 39597 + }, + { + "epoch": 0.7062747476188778, + "grad_norm": 0.23246483504772186, + "learning_rate": 1.2027665410398106e-05, + "loss": 0.1078, + "step": 39598 + }, + { + "epoch": 0.7062925837405915, + "grad_norm": 0.24317991733551025, + "learning_rate": 1.2026334872206426e-05, + "loss": 0.1329, + "step": 39599 + }, + { + "epoch": 0.7063104198623051, + "grad_norm": 0.23957769572734833, + "learning_rate": 1.2025004384305155e-05, + "loss": 0.1369, + "step": 39600 + }, + { + "epoch": 0.7063282559840188, + "grad_norm": 0.23834265768527985, + "learning_rate": 1.2023673946699451e-05, + "loss": 0.0935, + "step": 39601 + }, + { + "epoch": 0.7063460921057325, + "grad_norm": 0.1824650913476944, + "learning_rate": 1.2022343559394456e-05, + "loss": 0.1157, + "step": 39602 + }, + { + "epoch": 0.7063639282274462, + "grad_norm": 0.4296053647994995, + "learning_rate": 1.202101322239535e-05, + "loss": 0.1839, + "step": 39603 + }, + { + "epoch": 0.7063817643491599, + "grad_norm": 0.2947930693626404, + "learning_rate": 1.201968293570728e-05, + "loss": 0.0864, + "step": 39604 + }, + { + "epoch": 0.7063996004708736, + "grad_norm": 0.21025703847408295, + "learning_rate": 1.20183526993354e-05, + "loss": 0.127, + "step": 39605 + }, + { + "epoch": 0.7064174365925873, + "grad_norm": 0.3835030496120453, + "learning_rate": 1.2017022513284862e-05, + "loss": 0.1938, + "step": 39606 + }, + { + "epoch": 0.706435272714301, + "grad_norm": 0.39272934198379517, + "learning_rate": 1.2015692377560836e-05, + "loss": 0.1223, + "step": 39607 + }, + { + "epoch": 0.7064531088360146, + "grad_norm": 0.2779324948787689, + "learning_rate": 1.2014362292168475e-05, + "loss": 0.1832, + "step": 39608 + }, + { + "epoch": 0.7064709449577284, + "grad_norm": 0.29282549023628235, + "learning_rate": 1.2013032257112917e-05, + "loss": 0.0701, + "step": 39609 + }, + { + "epoch": 0.7064887810794421, + "grad_norm": 0.2887338697910309, + "learning_rate": 1.2011702272399334e-05, + "loss": 0.0793, + "step": 39610 + }, + { + "epoch": 0.7065066172011558, + "grad_norm": 0.26466473937034607, + "learning_rate": 1.2010372338032885e-05, + "loss": 0.1165, + "step": 39611 + }, + { + "epoch": 0.7065244533228695, + "grad_norm": 0.3904156982898712, + "learning_rate": 1.2009042454018724e-05, + "loss": 0.1558, + "step": 39612 + }, + { + "epoch": 0.7065422894445832, + "grad_norm": 0.26360198855400085, + "learning_rate": 1.2007712620361999e-05, + "loss": 0.1097, + "step": 39613 + }, + { + "epoch": 0.7065601255662969, + "grad_norm": 0.21179459989070892, + "learning_rate": 1.2006382837067867e-05, + "loss": 0.0915, + "step": 39614 + }, + { + "epoch": 0.7065779616880106, + "grad_norm": 0.23796197772026062, + "learning_rate": 1.2005053104141475e-05, + "loss": 0.117, + "step": 39615 + }, + { + "epoch": 0.7065957978097243, + "grad_norm": 0.28057971596717834, + "learning_rate": 1.2003723421587993e-05, + "loss": 0.1409, + "step": 39616 + }, + { + "epoch": 0.706613633931438, + "grad_norm": 0.3126096725463867, + "learning_rate": 1.200239378941257e-05, + "loss": 0.1275, + "step": 39617 + }, + { + "epoch": 0.7066314700531516, + "grad_norm": 0.23418393731117249, + "learning_rate": 1.2001064207620355e-05, + "loss": 0.189, + "step": 39618 + }, + { + "epoch": 0.7066493061748653, + "grad_norm": 0.2520259916782379, + "learning_rate": 1.1999734676216498e-05, + "loss": 0.1263, + "step": 39619 + }, + { + "epoch": 0.706667142296579, + "grad_norm": 0.280765563249588, + "learning_rate": 1.199840519520617e-05, + "loss": 0.145, + "step": 39620 + }, + { + "epoch": 0.7066849784182927, + "grad_norm": 0.2633095681667328, + "learning_rate": 1.199707576459451e-05, + "loss": 0.1255, + "step": 39621 + }, + { + "epoch": 0.7067028145400064, + "grad_norm": 0.3299943208694458, + "learning_rate": 1.1995746384386669e-05, + "loss": 0.1497, + "step": 39622 + }, + { + "epoch": 0.7067206506617201, + "grad_norm": 0.3245101273059845, + "learning_rate": 1.1994417054587814e-05, + "loss": 0.1527, + "step": 39623 + }, + { + "epoch": 0.7067384867834338, + "grad_norm": 0.2601618766784668, + "learning_rate": 1.1993087775203083e-05, + "loss": 0.0991, + "step": 39624 + }, + { + "epoch": 0.7067563229051474, + "grad_norm": 0.19852134585380554, + "learning_rate": 1.1991758546237644e-05, + "loss": 0.1041, + "step": 39625 + }, + { + "epoch": 0.7067741590268612, + "grad_norm": 0.25954705476760864, + "learning_rate": 1.1990429367696642e-05, + "loss": 0.1074, + "step": 39626 + }, + { + "epoch": 0.7067919951485749, + "grad_norm": 0.4318484663963318, + "learning_rate": 1.198910023958523e-05, + "loss": 0.0962, + "step": 39627 + }, + { + "epoch": 0.7068098312702886, + "grad_norm": 0.264016717672348, + "learning_rate": 1.198777116190855e-05, + "loss": 0.1099, + "step": 39628 + }, + { + "epoch": 0.7068276673920023, + "grad_norm": 0.2937481105327606, + "learning_rate": 1.1986442134671772e-05, + "loss": 0.1449, + "step": 39629 + }, + { + "epoch": 0.706845503513716, + "grad_norm": 0.18546999990940094, + "learning_rate": 1.198511315788004e-05, + "loss": 0.1302, + "step": 39630 + }, + { + "epoch": 0.7068633396354297, + "grad_norm": 0.2516763210296631, + "learning_rate": 1.1983784231538502e-05, + "loss": 0.1221, + "step": 39631 + }, + { + "epoch": 0.7068811757571434, + "grad_norm": 0.19915767014026642, + "learning_rate": 1.1982455355652305e-05, + "loss": 0.105, + "step": 39632 + }, + { + "epoch": 0.7068990118788571, + "grad_norm": 0.47600650787353516, + "learning_rate": 1.1981126530226617e-05, + "loss": 0.1699, + "step": 39633 + }, + { + "epoch": 0.7069168480005708, + "grad_norm": 0.2385549247264862, + "learning_rate": 1.1979797755266579e-05, + "loss": 0.1292, + "step": 39634 + }, + { + "epoch": 0.7069346841222844, + "grad_norm": 0.2580333948135376, + "learning_rate": 1.197846903077734e-05, + "loss": 0.1122, + "step": 39635 + }, + { + "epoch": 0.7069525202439981, + "grad_norm": 0.24852822721004486, + "learning_rate": 1.1977140356764044e-05, + "loss": 0.1258, + "step": 39636 + }, + { + "epoch": 0.7069703563657118, + "grad_norm": 0.3117390275001526, + "learning_rate": 1.1975811733231851e-05, + "loss": 0.1282, + "step": 39637 + }, + { + "epoch": 0.7069881924874255, + "grad_norm": 0.23703408241271973, + "learning_rate": 1.1974483160185918e-05, + "loss": 0.1276, + "step": 39638 + }, + { + "epoch": 0.7070060286091392, + "grad_norm": 0.2623916268348694, + "learning_rate": 1.1973154637631386e-05, + "loss": 0.1578, + "step": 39639 + }, + { + "epoch": 0.7070238647308529, + "grad_norm": 0.21476981043815613, + "learning_rate": 1.197182616557341e-05, + "loss": 0.0866, + "step": 39640 + }, + { + "epoch": 0.7070417008525666, + "grad_norm": 0.22601446509361267, + "learning_rate": 1.1970497744017122e-05, + "loss": 0.1166, + "step": 39641 + }, + { + "epoch": 0.7070595369742804, + "grad_norm": 0.3203946650028229, + "learning_rate": 1.1969169372967698e-05, + "loss": 0.129, + "step": 39642 + }, + { + "epoch": 0.707077373095994, + "grad_norm": 0.19800713658332825, + "learning_rate": 1.1967841052430274e-05, + "loss": 0.109, + "step": 39643 + }, + { + "epoch": 0.7070952092177077, + "grad_norm": 0.24219807982444763, + "learning_rate": 1.1966512782409998e-05, + "loss": 0.0712, + "step": 39644 + }, + { + "epoch": 0.7071130453394214, + "grad_norm": 0.23886223137378693, + "learning_rate": 1.1965184562912008e-05, + "loss": 0.1404, + "step": 39645 + }, + { + "epoch": 0.7071308814611351, + "grad_norm": 0.29603704810142517, + "learning_rate": 1.1963856393941478e-05, + "loss": 0.1343, + "step": 39646 + }, + { + "epoch": 0.7071487175828488, + "grad_norm": 0.23609548807144165, + "learning_rate": 1.196252827550354e-05, + "loss": 0.136, + "step": 39647 + }, + { + "epoch": 0.7071665537045625, + "grad_norm": 0.27406346797943115, + "learning_rate": 1.1961200207603349e-05, + "loss": 0.141, + "step": 39648 + }, + { + "epoch": 0.7071843898262762, + "grad_norm": 0.28046584129333496, + "learning_rate": 1.1959872190246035e-05, + "loss": 0.1485, + "step": 39649 + }, + { + "epoch": 0.7072022259479899, + "grad_norm": 0.2155359834432602, + "learning_rate": 1.1958544223436774e-05, + "loss": 0.1326, + "step": 39650 + }, + { + "epoch": 0.7072200620697036, + "grad_norm": 0.3944588303565979, + "learning_rate": 1.1957216307180691e-05, + "loss": 0.1379, + "step": 39651 + }, + { + "epoch": 0.7072378981914172, + "grad_norm": 0.22116807103157043, + "learning_rate": 1.195588844148295e-05, + "loss": 0.1046, + "step": 39652 + }, + { + "epoch": 0.7072557343131309, + "grad_norm": 0.1930345892906189, + "learning_rate": 1.195456062634869e-05, + "loss": 0.0922, + "step": 39653 + }, + { + "epoch": 0.7072735704348446, + "grad_norm": 0.2768682837486267, + "learning_rate": 1.195323286178305e-05, + "loss": 0.1324, + "step": 39654 + }, + { + "epoch": 0.7072914065565583, + "grad_norm": 0.7779185175895691, + "learning_rate": 1.1951905147791195e-05, + "loss": 0.1455, + "step": 39655 + }, + { + "epoch": 0.707309242678272, + "grad_norm": 0.21959508955478668, + "learning_rate": 1.1950577484378263e-05, + "loss": 0.1162, + "step": 39656 + }, + { + "epoch": 0.7073270787999857, + "grad_norm": 0.28785791993141174, + "learning_rate": 1.1949249871549401e-05, + "loss": 0.0646, + "step": 39657 + }, + { + "epoch": 0.7073449149216994, + "grad_norm": 0.2897997200489044, + "learning_rate": 1.1947922309309742e-05, + "loss": 0.1517, + "step": 39658 + }, + { + "epoch": 0.7073627510434132, + "grad_norm": 0.24086059629917145, + "learning_rate": 1.1946594797664454e-05, + "loss": 0.1447, + "step": 39659 + }, + { + "epoch": 0.7073805871651269, + "grad_norm": 0.3726377487182617, + "learning_rate": 1.1945267336618673e-05, + "loss": 0.1547, + "step": 39660 + }, + { + "epoch": 0.7073984232868405, + "grad_norm": 0.33043307065963745, + "learning_rate": 1.1943939926177547e-05, + "loss": 0.1545, + "step": 39661 + }, + { + "epoch": 0.7074162594085542, + "grad_norm": 0.31618979573249817, + "learning_rate": 1.194261256634621e-05, + "loss": 0.1758, + "step": 39662 + }, + { + "epoch": 0.7074340955302679, + "grad_norm": 0.26249876618385315, + "learning_rate": 1.1941285257129822e-05, + "loss": 0.082, + "step": 39663 + }, + { + "epoch": 0.7074519316519816, + "grad_norm": 0.22928814589977264, + "learning_rate": 1.1939957998533528e-05, + "loss": 0.0806, + "step": 39664 + }, + { + "epoch": 0.7074697677736953, + "grad_norm": 0.2218640297651291, + "learning_rate": 1.1938630790562451e-05, + "loss": 0.1145, + "step": 39665 + }, + { + "epoch": 0.707487603895409, + "grad_norm": 0.2665131390094757, + "learning_rate": 1.1937303633221768e-05, + "loss": 0.0862, + "step": 39666 + }, + { + "epoch": 0.7075054400171227, + "grad_norm": 0.30140218138694763, + "learning_rate": 1.1935976526516596e-05, + "loss": 0.102, + "step": 39667 + }, + { + "epoch": 0.7075232761388364, + "grad_norm": 0.30023959279060364, + "learning_rate": 1.19346494704521e-05, + "loss": 0.1228, + "step": 39668 + }, + { + "epoch": 0.70754111226055, + "grad_norm": 0.3467610776424408, + "learning_rate": 1.1933322465033417e-05, + "loss": 0.1695, + "step": 39669 + }, + { + "epoch": 0.7075589483822637, + "grad_norm": 0.2699323892593384, + "learning_rate": 1.193199551026569e-05, + "loss": 0.1543, + "step": 39670 + }, + { + "epoch": 0.7075767845039774, + "grad_norm": 0.1974875032901764, + "learning_rate": 1.193066860615405e-05, + "loss": 0.0803, + "step": 39671 + }, + { + "epoch": 0.7075946206256911, + "grad_norm": 0.33543792366981506, + "learning_rate": 1.1929341752703663e-05, + "loss": 0.1515, + "step": 39672 + }, + { + "epoch": 0.7076124567474048, + "grad_norm": 0.249459370970726, + "learning_rate": 1.192801494991966e-05, + "loss": 0.0951, + "step": 39673 + }, + { + "epoch": 0.7076302928691185, + "grad_norm": 0.37445738911628723, + "learning_rate": 1.192668819780719e-05, + "loss": 0.1051, + "step": 39674 + }, + { + "epoch": 0.7076481289908322, + "grad_norm": 0.2494664192199707, + "learning_rate": 1.192536149637138e-05, + "loss": 0.1093, + "step": 39675 + }, + { + "epoch": 0.707665965112546, + "grad_norm": 0.33687901496887207, + "learning_rate": 1.1924034845617394e-05, + "loss": 0.1461, + "step": 39676 + }, + { + "epoch": 0.7076838012342597, + "grad_norm": 0.30348601937294006, + "learning_rate": 1.1922708245550366e-05, + "loss": 0.1372, + "step": 39677 + }, + { + "epoch": 0.7077016373559734, + "grad_norm": 0.3136875629425049, + "learning_rate": 1.1921381696175426e-05, + "loss": 0.1277, + "step": 39678 + }, + { + "epoch": 0.707719473477687, + "grad_norm": 0.25206974148750305, + "learning_rate": 1.1920055197497739e-05, + "loss": 0.1149, + "step": 39679 + }, + { + "epoch": 0.7077373095994007, + "grad_norm": 0.25457724928855896, + "learning_rate": 1.1918728749522426e-05, + "loss": 0.1241, + "step": 39680 + }, + { + "epoch": 0.7077551457211144, + "grad_norm": 0.29711389541625977, + "learning_rate": 1.1917402352254647e-05, + "loss": 0.1822, + "step": 39681 + }, + { + "epoch": 0.7077729818428281, + "grad_norm": 0.2780214548110962, + "learning_rate": 1.1916076005699536e-05, + "loss": 0.1076, + "step": 39682 + }, + { + "epoch": 0.7077908179645418, + "grad_norm": 0.3048097789287567, + "learning_rate": 1.1914749709862235e-05, + "loss": 0.1427, + "step": 39683 + }, + { + "epoch": 0.7078086540862555, + "grad_norm": 0.3513798415660858, + "learning_rate": 1.1913423464747872e-05, + "loss": 0.0911, + "step": 39684 + }, + { + "epoch": 0.7078264902079692, + "grad_norm": 0.2893765866756439, + "learning_rate": 1.1912097270361611e-05, + "loss": 0.1, + "step": 39685 + }, + { + "epoch": 0.7078443263296829, + "grad_norm": 0.22490662336349487, + "learning_rate": 1.191077112670858e-05, + "loss": 0.1265, + "step": 39686 + }, + { + "epoch": 0.7078621624513965, + "grad_norm": 0.20559197664260864, + "learning_rate": 1.190944503379392e-05, + "loss": 0.0745, + "step": 39687 + }, + { + "epoch": 0.7078799985731102, + "grad_norm": 0.2870340645313263, + "learning_rate": 1.1908118991622765e-05, + "loss": 0.1494, + "step": 39688 + }, + { + "epoch": 0.7078978346948239, + "grad_norm": 0.3129982054233551, + "learning_rate": 1.190679300020027e-05, + "loss": 0.2129, + "step": 39689 + }, + { + "epoch": 0.7079156708165376, + "grad_norm": 0.26966583728790283, + "learning_rate": 1.1905467059531569e-05, + "loss": 0.0832, + "step": 39690 + }, + { + "epoch": 0.7079335069382513, + "grad_norm": 0.2385096698999405, + "learning_rate": 1.1904141169621802e-05, + "loss": 0.1534, + "step": 39691 + }, + { + "epoch": 0.707951343059965, + "grad_norm": 0.26734763383865356, + "learning_rate": 1.1902815330476094e-05, + "loss": 0.1456, + "step": 39692 + }, + { + "epoch": 0.7079691791816788, + "grad_norm": 0.3229265511035919, + "learning_rate": 1.19014895420996e-05, + "loss": 0.155, + "step": 39693 + }, + { + "epoch": 0.7079870153033925, + "grad_norm": 0.23967275023460388, + "learning_rate": 1.1900163804497464e-05, + "loss": 0.1118, + "step": 39694 + }, + { + "epoch": 0.7080048514251062, + "grad_norm": 0.20419557392597198, + "learning_rate": 1.1898838117674819e-05, + "loss": 0.1304, + "step": 39695 + }, + { + "epoch": 0.7080226875468199, + "grad_norm": 0.27684780955314636, + "learning_rate": 1.1897512481636802e-05, + "loss": 0.1084, + "step": 39696 + }, + { + "epoch": 0.7080405236685335, + "grad_norm": 0.27113083004951477, + "learning_rate": 1.1896186896388542e-05, + "loss": 0.1187, + "step": 39697 + }, + { + "epoch": 0.7080583597902472, + "grad_norm": 0.19479693472385406, + "learning_rate": 1.18948613619352e-05, + "loss": 0.0853, + "step": 39698 + }, + { + "epoch": 0.7080761959119609, + "grad_norm": 0.33110857009887695, + "learning_rate": 1.1893535878281898e-05, + "loss": 0.1238, + "step": 39699 + }, + { + "epoch": 0.7080940320336746, + "grad_norm": 0.25000420212745667, + "learning_rate": 1.189221044543378e-05, + "loss": 0.1321, + "step": 39700 + }, + { + "epoch": 0.7081118681553883, + "grad_norm": 0.34915584325790405, + "learning_rate": 1.1890885063395971e-05, + "loss": 0.0949, + "step": 39701 + }, + { + "epoch": 0.708129704277102, + "grad_norm": 0.3054065704345703, + "learning_rate": 1.188955973217363e-05, + "loss": 0.1013, + "step": 39702 + }, + { + "epoch": 0.7081475403988157, + "grad_norm": 0.24145790934562683, + "learning_rate": 1.1888234451771882e-05, + "loss": 0.0958, + "step": 39703 + }, + { + "epoch": 0.7081653765205294, + "grad_norm": 0.4281059205532074, + "learning_rate": 1.1886909222195866e-05, + "loss": 0.1507, + "step": 39704 + }, + { + "epoch": 0.708183212642243, + "grad_norm": 0.31464439630508423, + "learning_rate": 1.1885584043450711e-05, + "loss": 0.145, + "step": 39705 + }, + { + "epoch": 0.7082010487639567, + "grad_norm": 0.4476488530635834, + "learning_rate": 1.1884258915541571e-05, + "loss": 0.1952, + "step": 39706 + }, + { + "epoch": 0.7082188848856704, + "grad_norm": 0.24803683161735535, + "learning_rate": 1.1882933838473562e-05, + "loss": 0.1896, + "step": 39707 + }, + { + "epoch": 0.7082367210073841, + "grad_norm": 0.2919997572898865, + "learning_rate": 1.1881608812251843e-05, + "loss": 0.0733, + "step": 39708 + }, + { + "epoch": 0.7082545571290978, + "grad_norm": 0.28765708208084106, + "learning_rate": 1.188028383688154e-05, + "loss": 0.1271, + "step": 39709 + }, + { + "epoch": 0.7082723932508116, + "grad_norm": 0.24165499210357666, + "learning_rate": 1.1878958912367778e-05, + "loss": 0.1478, + "step": 39710 + }, + { + "epoch": 0.7082902293725253, + "grad_norm": 0.27377569675445557, + "learning_rate": 1.1877634038715712e-05, + "loss": 0.0797, + "step": 39711 + }, + { + "epoch": 0.708308065494239, + "grad_norm": 0.2816867232322693, + "learning_rate": 1.187630921593047e-05, + "loss": 0.0988, + "step": 39712 + }, + { + "epoch": 0.7083259016159527, + "grad_norm": 0.2299519032239914, + "learning_rate": 1.1874984444017183e-05, + "loss": 0.0867, + "step": 39713 + }, + { + "epoch": 0.7083437377376663, + "grad_norm": 0.3553406894207001, + "learning_rate": 1.1873659722980985e-05, + "loss": 0.1144, + "step": 39714 + }, + { + "epoch": 0.70836157385938, + "grad_norm": 0.29719212651252747, + "learning_rate": 1.1872335052827021e-05, + "loss": 0.1036, + "step": 39715 + }, + { + "epoch": 0.7083794099810937, + "grad_norm": 0.3521648347377777, + "learning_rate": 1.1871010433560422e-05, + "loss": 0.0983, + "step": 39716 + }, + { + "epoch": 0.7083972461028074, + "grad_norm": 0.27065762877464294, + "learning_rate": 1.1869685865186322e-05, + "loss": 0.0778, + "step": 39717 + }, + { + "epoch": 0.7084150822245211, + "grad_norm": 0.23846198618412018, + "learning_rate": 1.1868361347709843e-05, + "loss": 0.1031, + "step": 39718 + }, + { + "epoch": 0.7084329183462348, + "grad_norm": 0.28253084421157837, + "learning_rate": 1.1867036881136142e-05, + "loss": 0.1451, + "step": 39719 + }, + { + "epoch": 0.7084507544679485, + "grad_norm": 0.2843198776245117, + "learning_rate": 1.1865712465470336e-05, + "loss": 0.1103, + "step": 39720 + }, + { + "epoch": 0.7084685905896622, + "grad_norm": 0.3050706088542938, + "learning_rate": 1.1864388100717569e-05, + "loss": 0.0992, + "step": 39721 + }, + { + "epoch": 0.7084864267113758, + "grad_norm": 0.30785882472991943, + "learning_rate": 1.1863063786882971e-05, + "loss": 0.1558, + "step": 39722 + }, + { + "epoch": 0.7085042628330895, + "grad_norm": 0.20489034056663513, + "learning_rate": 1.1861739523971668e-05, + "loss": 0.1242, + "step": 39723 + }, + { + "epoch": 0.7085220989548032, + "grad_norm": 0.30929896235466003, + "learning_rate": 1.186041531198881e-05, + "loss": 0.2091, + "step": 39724 + }, + { + "epoch": 0.7085399350765169, + "grad_norm": 0.26772361993789673, + "learning_rate": 1.185909115093952e-05, + "loss": 0.0658, + "step": 39725 + }, + { + "epoch": 0.7085577711982306, + "grad_norm": 0.2231772392988205, + "learning_rate": 1.185776704082893e-05, + "loss": 0.0826, + "step": 39726 + }, + { + "epoch": 0.7085756073199444, + "grad_norm": 0.2768242657184601, + "learning_rate": 1.1856442981662167e-05, + "loss": 0.1506, + "step": 39727 + }, + { + "epoch": 0.7085934434416581, + "grad_norm": 0.2602427005767822, + "learning_rate": 1.1855118973444377e-05, + "loss": 0.1084, + "step": 39728 + }, + { + "epoch": 0.7086112795633718, + "grad_norm": 0.2349170595407486, + "learning_rate": 1.1853795016180689e-05, + "loss": 0.1455, + "step": 39729 + }, + { + "epoch": 0.7086291156850855, + "grad_norm": 0.29324156045913696, + "learning_rate": 1.1852471109876229e-05, + "loss": 0.0987, + "step": 39730 + }, + { + "epoch": 0.7086469518067992, + "grad_norm": 0.26818355917930603, + "learning_rate": 1.1851147254536124e-05, + "loss": 0.1432, + "step": 39731 + }, + { + "epoch": 0.7086647879285128, + "grad_norm": 0.25412893295288086, + "learning_rate": 1.1849823450165524e-05, + "loss": 0.1282, + "step": 39732 + }, + { + "epoch": 0.7086826240502265, + "grad_norm": 0.2679944932460785, + "learning_rate": 1.1848499696769549e-05, + "loss": 0.1136, + "step": 39733 + }, + { + "epoch": 0.7087004601719402, + "grad_norm": 0.2822827994823456, + "learning_rate": 1.1847175994353324e-05, + "loss": 0.1348, + "step": 39734 + }, + { + "epoch": 0.7087182962936539, + "grad_norm": 0.24108313024044037, + "learning_rate": 1.1845852342921995e-05, + "loss": 0.1556, + "step": 39735 + }, + { + "epoch": 0.7087361324153676, + "grad_norm": 0.24654445052146912, + "learning_rate": 1.1844528742480677e-05, + "loss": 0.0782, + "step": 39736 + }, + { + "epoch": 0.7087539685370813, + "grad_norm": 0.2935083210468292, + "learning_rate": 1.184320519303452e-05, + "loss": 0.1154, + "step": 39737 + }, + { + "epoch": 0.708771804658795, + "grad_norm": 0.27358415722846985, + "learning_rate": 1.1841881694588642e-05, + "loss": 0.1145, + "step": 39738 + }, + { + "epoch": 0.7087896407805087, + "grad_norm": 0.2414022982120514, + "learning_rate": 1.1840558247148176e-05, + "loss": 0.0737, + "step": 39739 + }, + { + "epoch": 0.7088074769022223, + "grad_norm": 0.3104711174964905, + "learning_rate": 1.1839234850718242e-05, + "loss": 0.108, + "step": 39740 + }, + { + "epoch": 0.708825313023936, + "grad_norm": 0.3439747095108032, + "learning_rate": 1.1837911505303989e-05, + "loss": 0.1208, + "step": 39741 + }, + { + "epoch": 0.7088431491456497, + "grad_norm": 0.2752928137779236, + "learning_rate": 1.1836588210910535e-05, + "loss": 0.1049, + "step": 39742 + }, + { + "epoch": 0.7088609852673635, + "grad_norm": 0.24843448400497437, + "learning_rate": 1.1835264967543013e-05, + "loss": 0.0936, + "step": 39743 + }, + { + "epoch": 0.7088788213890772, + "grad_norm": 0.3079076409339905, + "learning_rate": 1.1833941775206541e-05, + "loss": 0.1277, + "step": 39744 + }, + { + "epoch": 0.7088966575107909, + "grad_norm": 0.24981015920639038, + "learning_rate": 1.183261863390627e-05, + "loss": 0.1374, + "step": 39745 + }, + { + "epoch": 0.7089144936325046, + "grad_norm": 0.25197288393974304, + "learning_rate": 1.1831295543647317e-05, + "loss": 0.1414, + "step": 39746 + }, + { + "epoch": 0.7089323297542183, + "grad_norm": 0.29533651471138, + "learning_rate": 1.1829972504434797e-05, + "loss": 0.1064, + "step": 39747 + }, + { + "epoch": 0.708950165875932, + "grad_norm": 0.2694208025932312, + "learning_rate": 1.1828649516273865e-05, + "loss": 0.093, + "step": 39748 + }, + { + "epoch": 0.7089680019976456, + "grad_norm": 0.23937560617923737, + "learning_rate": 1.1827326579169629e-05, + "loss": 0.1072, + "step": 39749 + }, + { + "epoch": 0.7089858381193593, + "grad_norm": 0.2120543271303177, + "learning_rate": 1.182600369312723e-05, + "loss": 0.1117, + "step": 39750 + }, + { + "epoch": 0.709003674241073, + "grad_norm": 0.2478790581226349, + "learning_rate": 1.1824680858151794e-05, + "loss": 0.1495, + "step": 39751 + }, + { + "epoch": 0.7090215103627867, + "grad_norm": 0.3088122308254242, + "learning_rate": 1.1823358074248444e-05, + "loss": 0.1022, + "step": 39752 + }, + { + "epoch": 0.7090393464845004, + "grad_norm": 0.35148707032203674, + "learning_rate": 1.18220353414223e-05, + "loss": 0.1098, + "step": 39753 + }, + { + "epoch": 0.7090571826062141, + "grad_norm": 0.24480833113193512, + "learning_rate": 1.1820712659678507e-05, + "loss": 0.0812, + "step": 39754 + }, + { + "epoch": 0.7090750187279278, + "grad_norm": 0.37409868836402893, + "learning_rate": 1.1819390029022186e-05, + "loss": 0.1252, + "step": 39755 + }, + { + "epoch": 0.7090928548496415, + "grad_norm": 0.24978578090667725, + "learning_rate": 1.1818067449458461e-05, + "loss": 0.1082, + "step": 39756 + }, + { + "epoch": 0.7091106909713552, + "grad_norm": 0.26101842522621155, + "learning_rate": 1.1816744920992448e-05, + "loss": 0.0933, + "step": 39757 + }, + { + "epoch": 0.7091285270930688, + "grad_norm": 0.28741735219955444, + "learning_rate": 1.1815422443629299e-05, + "loss": 0.1096, + "step": 39758 + }, + { + "epoch": 0.7091463632147825, + "grad_norm": 0.309205561876297, + "learning_rate": 1.1814100017374122e-05, + "loss": 0.1131, + "step": 39759 + }, + { + "epoch": 0.7091641993364963, + "grad_norm": 0.3159785568714142, + "learning_rate": 1.1812777642232048e-05, + "loss": 0.1239, + "step": 39760 + }, + { + "epoch": 0.70918203545821, + "grad_norm": 0.1896720677614212, + "learning_rate": 1.1811455318208195e-05, + "loss": 0.0665, + "step": 39761 + }, + { + "epoch": 0.7091998715799237, + "grad_norm": 0.2209632843732834, + "learning_rate": 1.1810133045307705e-05, + "loss": 0.1297, + "step": 39762 + }, + { + "epoch": 0.7092177077016374, + "grad_norm": 0.22897344827651978, + "learning_rate": 1.1808810823535684e-05, + "loss": 0.0881, + "step": 39763 + }, + { + "epoch": 0.7092355438233511, + "grad_norm": 0.37251970171928406, + "learning_rate": 1.180748865289728e-05, + "loss": 0.1761, + "step": 39764 + }, + { + "epoch": 0.7092533799450648, + "grad_norm": 0.37440046668052673, + "learning_rate": 1.1806166533397605e-05, + "loss": 0.128, + "step": 39765 + }, + { + "epoch": 0.7092712160667785, + "grad_norm": 0.3115336298942566, + "learning_rate": 1.1804844465041779e-05, + "loss": 0.1063, + "step": 39766 + }, + { + "epoch": 0.7092890521884921, + "grad_norm": 0.26212194561958313, + "learning_rate": 1.1803522447834942e-05, + "loss": 0.0902, + "step": 39767 + }, + { + "epoch": 0.7093068883102058, + "grad_norm": 0.3410329222679138, + "learning_rate": 1.1802200481782208e-05, + "loss": 0.1235, + "step": 39768 + }, + { + "epoch": 0.7093247244319195, + "grad_norm": 0.2521723210811615, + "learning_rate": 1.1800878566888705e-05, + "loss": 0.1204, + "step": 39769 + }, + { + "epoch": 0.7093425605536332, + "grad_norm": 0.19752517342567444, + "learning_rate": 1.1799556703159542e-05, + "loss": 0.0982, + "step": 39770 + }, + { + "epoch": 0.7093603966753469, + "grad_norm": 0.2872680425643921, + "learning_rate": 1.1798234890599868e-05, + "loss": 0.1722, + "step": 39771 + }, + { + "epoch": 0.7093782327970606, + "grad_norm": 0.29151684045791626, + "learning_rate": 1.1796913129214798e-05, + "loss": 0.1192, + "step": 39772 + }, + { + "epoch": 0.7093960689187743, + "grad_norm": 0.25677651166915894, + "learning_rate": 1.179559141900945e-05, + "loss": 0.1354, + "step": 39773 + }, + { + "epoch": 0.709413905040488, + "grad_norm": 0.31532707810401917, + "learning_rate": 1.1794269759988943e-05, + "loss": 0.0877, + "step": 39774 + }, + { + "epoch": 0.7094317411622016, + "grad_norm": 0.2908805012702942, + "learning_rate": 1.1792948152158417e-05, + "loss": 0.1285, + "step": 39775 + }, + { + "epoch": 0.7094495772839153, + "grad_norm": 0.26030611991882324, + "learning_rate": 1.1791626595522973e-05, + "loss": 0.175, + "step": 39776 + }, + { + "epoch": 0.7094674134056291, + "grad_norm": 0.2896346151828766, + "learning_rate": 1.1790305090087758e-05, + "loss": 0.1509, + "step": 39777 + }, + { + "epoch": 0.7094852495273428, + "grad_norm": 0.25536879897117615, + "learning_rate": 1.1788983635857884e-05, + "loss": 0.0908, + "step": 39778 + }, + { + "epoch": 0.7095030856490565, + "grad_norm": 0.23476728796958923, + "learning_rate": 1.1787662232838461e-05, + "loss": 0.1214, + "step": 39779 + }, + { + "epoch": 0.7095209217707702, + "grad_norm": 0.23477321863174438, + "learning_rate": 1.1786340881034632e-05, + "loss": 0.1035, + "step": 39780 + }, + { + "epoch": 0.7095387578924839, + "grad_norm": 0.22442224621772766, + "learning_rate": 1.1785019580451511e-05, + "loss": 0.1068, + "step": 39781 + }, + { + "epoch": 0.7095565940141976, + "grad_norm": 0.3498179316520691, + "learning_rate": 1.1783698331094217e-05, + "loss": 0.1358, + "step": 39782 + }, + { + "epoch": 0.7095744301359113, + "grad_norm": 0.2949180006980896, + "learning_rate": 1.1782377132967865e-05, + "loss": 0.1224, + "step": 39783 + }, + { + "epoch": 0.709592266257625, + "grad_norm": 0.25757506489753723, + "learning_rate": 1.1781055986077593e-05, + "loss": 0.0951, + "step": 39784 + }, + { + "epoch": 0.7096101023793386, + "grad_norm": 0.28989556431770325, + "learning_rate": 1.1779734890428515e-05, + "loss": 0.1323, + "step": 39785 + }, + { + "epoch": 0.7096279385010523, + "grad_norm": 0.4818069338798523, + "learning_rate": 1.1778413846025748e-05, + "loss": 0.162, + "step": 39786 + }, + { + "epoch": 0.709645774622766, + "grad_norm": 0.2923320531845093, + "learning_rate": 1.177709285287442e-05, + "loss": 0.1014, + "step": 39787 + }, + { + "epoch": 0.7096636107444797, + "grad_norm": 0.32751449942588806, + "learning_rate": 1.1775771910979633e-05, + "loss": 0.1181, + "step": 39788 + }, + { + "epoch": 0.7096814468661934, + "grad_norm": 0.2230789214372635, + "learning_rate": 1.1774451020346532e-05, + "loss": 0.0865, + "step": 39789 + }, + { + "epoch": 0.7096992829879071, + "grad_norm": 0.23723924160003662, + "learning_rate": 1.1773130180980218e-05, + "loss": 0.1024, + "step": 39790 + }, + { + "epoch": 0.7097171191096208, + "grad_norm": 0.251668781042099, + "learning_rate": 1.177180939288583e-05, + "loss": 0.0987, + "step": 39791 + }, + { + "epoch": 0.7097349552313345, + "grad_norm": 0.279910147190094, + "learning_rate": 1.1770488656068469e-05, + "loss": 0.1595, + "step": 39792 + }, + { + "epoch": 0.7097527913530481, + "grad_norm": 0.19495919346809387, + "learning_rate": 1.176916797053327e-05, + "loss": 0.0992, + "step": 39793 + }, + { + "epoch": 0.7097706274747619, + "grad_norm": 0.1858009248971939, + "learning_rate": 1.1767847336285348e-05, + "loss": 0.0774, + "step": 39794 + }, + { + "epoch": 0.7097884635964756, + "grad_norm": 0.21037927269935608, + "learning_rate": 1.1766526753329818e-05, + "loss": 0.1213, + "step": 39795 + }, + { + "epoch": 0.7098062997181893, + "grad_norm": 0.24584011733531952, + "learning_rate": 1.1765206221671792e-05, + "loss": 0.0834, + "step": 39796 + }, + { + "epoch": 0.709824135839903, + "grad_norm": 0.24783062934875488, + "learning_rate": 1.176388574131641e-05, + "loss": 0.1438, + "step": 39797 + }, + { + "epoch": 0.7098419719616167, + "grad_norm": 0.8838186264038086, + "learning_rate": 1.1762565312268775e-05, + "loss": 0.0986, + "step": 39798 + }, + { + "epoch": 0.7098598080833304, + "grad_norm": 0.23214347660541534, + "learning_rate": 1.1761244934534011e-05, + "loss": 0.1209, + "step": 39799 + }, + { + "epoch": 0.7098776442050441, + "grad_norm": 0.30486711859703064, + "learning_rate": 1.1759924608117235e-05, + "loss": 0.0851, + "step": 39800 + }, + { + "epoch": 0.7098954803267578, + "grad_norm": 0.3394198715686798, + "learning_rate": 1.1758604333023552e-05, + "loss": 0.1192, + "step": 39801 + }, + { + "epoch": 0.7099133164484714, + "grad_norm": 0.2624277174472809, + "learning_rate": 1.1757284109258102e-05, + "loss": 0.1015, + "step": 39802 + }, + { + "epoch": 0.7099311525701851, + "grad_norm": 0.3305242657661438, + "learning_rate": 1.1755963936825984e-05, + "loss": 0.1544, + "step": 39803 + }, + { + "epoch": 0.7099489886918988, + "grad_norm": 0.30331698060035706, + "learning_rate": 1.1754643815732336e-05, + "loss": 0.1486, + "step": 39804 + }, + { + "epoch": 0.7099668248136125, + "grad_norm": 0.5515194535255432, + "learning_rate": 1.175332374598225e-05, + "loss": 0.1323, + "step": 39805 + }, + { + "epoch": 0.7099846609353262, + "grad_norm": 0.3562612235546112, + "learning_rate": 1.1752003727580868e-05, + "loss": 0.1481, + "step": 39806 + }, + { + "epoch": 0.7100024970570399, + "grad_norm": 0.25684693455696106, + "learning_rate": 1.1750683760533293e-05, + "loss": 0.1298, + "step": 39807 + }, + { + "epoch": 0.7100203331787536, + "grad_norm": 0.2579234838485718, + "learning_rate": 1.1749363844844646e-05, + "loss": 0.0977, + "step": 39808 + }, + { + "epoch": 0.7100381693004673, + "grad_norm": 0.20638930797576904, + "learning_rate": 1.1748043980520032e-05, + "loss": 0.1049, + "step": 39809 + }, + { + "epoch": 0.710056005422181, + "grad_norm": 0.22392813861370087, + "learning_rate": 1.1746724167564585e-05, + "loss": 0.1192, + "step": 39810 + }, + { + "epoch": 0.7100738415438947, + "grad_norm": 0.2495158463716507, + "learning_rate": 1.1745404405983412e-05, + "loss": 0.1034, + "step": 39811 + }, + { + "epoch": 0.7100916776656084, + "grad_norm": 0.2554972767829895, + "learning_rate": 1.1744084695781633e-05, + "loss": 0.0995, + "step": 39812 + }, + { + "epoch": 0.7101095137873221, + "grad_norm": 0.20457206666469574, + "learning_rate": 1.1742765036964357e-05, + "loss": 0.0971, + "step": 39813 + }, + { + "epoch": 0.7101273499090358, + "grad_norm": 0.24943438172340393, + "learning_rate": 1.1741445429536693e-05, + "loss": 0.0921, + "step": 39814 + }, + { + "epoch": 0.7101451860307495, + "grad_norm": 0.31734800338745117, + "learning_rate": 1.1740125873503777e-05, + "loss": 0.0791, + "step": 39815 + }, + { + "epoch": 0.7101630221524632, + "grad_norm": 0.2341710925102234, + "learning_rate": 1.173880636887071e-05, + "loss": 0.0865, + "step": 39816 + }, + { + "epoch": 0.7101808582741769, + "grad_norm": 0.27869102358818054, + "learning_rate": 1.1737486915642603e-05, + "loss": 0.1206, + "step": 39817 + }, + { + "epoch": 0.7101986943958906, + "grad_norm": 0.35360386967658997, + "learning_rate": 1.1736167513824578e-05, + "loss": 0.1438, + "step": 39818 + }, + { + "epoch": 0.7102165305176042, + "grad_norm": 0.2731833755970001, + "learning_rate": 1.1734848163421757e-05, + "loss": 0.102, + "step": 39819 + }, + { + "epoch": 0.7102343666393179, + "grad_norm": 0.27589675784111023, + "learning_rate": 1.1733528864439248e-05, + "loss": 0.1358, + "step": 39820 + }, + { + "epoch": 0.7102522027610316, + "grad_norm": 0.3466830551624298, + "learning_rate": 1.1732209616882161e-05, + "loss": 0.1429, + "step": 39821 + }, + { + "epoch": 0.7102700388827453, + "grad_norm": 0.26109829545021057, + "learning_rate": 1.1730890420755603e-05, + "loss": 0.1045, + "step": 39822 + }, + { + "epoch": 0.710287875004459, + "grad_norm": 0.19484354555606842, + "learning_rate": 1.1729571276064708e-05, + "loss": 0.1031, + "step": 39823 + }, + { + "epoch": 0.7103057111261727, + "grad_norm": 0.24178259074687958, + "learning_rate": 1.1728252182814575e-05, + "loss": 0.1298, + "step": 39824 + }, + { + "epoch": 0.7103235472478864, + "grad_norm": 0.2804674506187439, + "learning_rate": 1.1726933141010325e-05, + "loss": 0.08, + "step": 39825 + }, + { + "epoch": 0.7103413833696001, + "grad_norm": 0.22235791385173798, + "learning_rate": 1.1725614150657061e-05, + "loss": 0.1146, + "step": 39826 + }, + { + "epoch": 0.7103592194913138, + "grad_norm": 0.21689453721046448, + "learning_rate": 1.1724295211759896e-05, + "loss": 0.0964, + "step": 39827 + }, + { + "epoch": 0.7103770556130276, + "grad_norm": 0.24945567548274994, + "learning_rate": 1.1722976324323956e-05, + "loss": 0.0886, + "step": 39828 + }, + { + "epoch": 0.7103948917347412, + "grad_norm": 0.281645268201828, + "learning_rate": 1.1721657488354346e-05, + "loss": 0.156, + "step": 39829 + }, + { + "epoch": 0.7104127278564549, + "grad_norm": 0.2604235112667084, + "learning_rate": 1.1720338703856169e-05, + "loss": 0.1375, + "step": 39830 + }, + { + "epoch": 0.7104305639781686, + "grad_norm": 0.2349165380001068, + "learning_rate": 1.1719019970834552e-05, + "loss": 0.0829, + "step": 39831 + }, + { + "epoch": 0.7104484000998823, + "grad_norm": 0.3044160306453705, + "learning_rate": 1.1717701289294593e-05, + "loss": 0.104, + "step": 39832 + }, + { + "epoch": 0.710466236221596, + "grad_norm": 0.288135826587677, + "learning_rate": 1.171638265924142e-05, + "loss": 0.116, + "step": 39833 + }, + { + "epoch": 0.7104840723433097, + "grad_norm": 0.33671990036964417, + "learning_rate": 1.1715064080680138e-05, + "loss": 0.1006, + "step": 39834 + }, + { + "epoch": 0.7105019084650234, + "grad_norm": 0.3152539134025574, + "learning_rate": 1.1713745553615846e-05, + "loss": 0.1163, + "step": 39835 + }, + { + "epoch": 0.7105197445867371, + "grad_norm": 0.3200022578239441, + "learning_rate": 1.1712427078053675e-05, + "loss": 0.1194, + "step": 39836 + }, + { + "epoch": 0.7105375807084507, + "grad_norm": 0.202154278755188, + "learning_rate": 1.1711108653998725e-05, + "loss": 0.1031, + "step": 39837 + }, + { + "epoch": 0.7105554168301644, + "grad_norm": 0.2518230080604553, + "learning_rate": 1.170979028145611e-05, + "loss": 0.1014, + "step": 39838 + }, + { + "epoch": 0.7105732529518781, + "grad_norm": 0.32290545105934143, + "learning_rate": 1.1708471960430934e-05, + "loss": 0.1175, + "step": 39839 + }, + { + "epoch": 0.7105910890735918, + "grad_norm": 0.3526301085948944, + "learning_rate": 1.1707153690928304e-05, + "loss": 0.1249, + "step": 39840 + }, + { + "epoch": 0.7106089251953055, + "grad_norm": 0.18469084799289703, + "learning_rate": 1.1705835472953346e-05, + "loss": 0.088, + "step": 39841 + }, + { + "epoch": 0.7106267613170192, + "grad_norm": 0.23163259029388428, + "learning_rate": 1.1704517306511165e-05, + "loss": 0.1381, + "step": 39842 + }, + { + "epoch": 0.7106445974387329, + "grad_norm": 0.20498667657375336, + "learning_rate": 1.1703199191606865e-05, + "loss": 0.1183, + "step": 39843 + }, + { + "epoch": 0.7106624335604467, + "grad_norm": 0.2499837428331375, + "learning_rate": 1.1701881128245545e-05, + "loss": 0.137, + "step": 39844 + }, + { + "epoch": 0.7106802696821604, + "grad_norm": 0.3577694296836853, + "learning_rate": 1.1700563116432339e-05, + "loss": 0.1609, + "step": 39845 + }, + { + "epoch": 0.710698105803874, + "grad_norm": 0.388089656829834, + "learning_rate": 1.1699245156172336e-05, + "loss": 0.1371, + "step": 39846 + }, + { + "epoch": 0.7107159419255877, + "grad_norm": 0.23920665681362152, + "learning_rate": 1.1697927247470661e-05, + "loss": 0.1068, + "step": 39847 + }, + { + "epoch": 0.7107337780473014, + "grad_norm": 0.27356773614883423, + "learning_rate": 1.1696609390332403e-05, + "loss": 0.1641, + "step": 39848 + }, + { + "epoch": 0.7107516141690151, + "grad_norm": 0.2568212151527405, + "learning_rate": 1.1695291584762693e-05, + "loss": 0.1534, + "step": 39849 + }, + { + "epoch": 0.7107694502907288, + "grad_norm": 0.3869524896144867, + "learning_rate": 1.1693973830766628e-05, + "loss": 0.1894, + "step": 39850 + }, + { + "epoch": 0.7107872864124425, + "grad_norm": 0.27857154607772827, + "learning_rate": 1.1692656128349316e-05, + "loss": 0.1019, + "step": 39851 + }, + { + "epoch": 0.7108051225341562, + "grad_norm": 0.28598251938819885, + "learning_rate": 1.1691338477515864e-05, + "loss": 0.1058, + "step": 39852 + }, + { + "epoch": 0.7108229586558699, + "grad_norm": 0.27139079570770264, + "learning_rate": 1.1690020878271371e-05, + "loss": 0.1023, + "step": 39853 + }, + { + "epoch": 0.7108407947775836, + "grad_norm": 0.3517839312553406, + "learning_rate": 1.1688703330620965e-05, + "loss": 0.1114, + "step": 39854 + }, + { + "epoch": 0.7108586308992972, + "grad_norm": 0.3066236674785614, + "learning_rate": 1.1687385834569745e-05, + "loss": 0.1831, + "step": 39855 + }, + { + "epoch": 0.7108764670210109, + "grad_norm": 0.28191259503364563, + "learning_rate": 1.1686068390122812e-05, + "loss": 0.1158, + "step": 39856 + }, + { + "epoch": 0.7108943031427246, + "grad_norm": 0.27088141441345215, + "learning_rate": 1.1684750997285267e-05, + "loss": 0.1334, + "step": 39857 + }, + { + "epoch": 0.7109121392644383, + "grad_norm": 0.27961984276771545, + "learning_rate": 1.1683433656062237e-05, + "loss": 0.0987, + "step": 39858 + }, + { + "epoch": 0.710929975386152, + "grad_norm": 0.3439388573169708, + "learning_rate": 1.1682116366458806e-05, + "loss": 0.0858, + "step": 39859 + }, + { + "epoch": 0.7109478115078657, + "grad_norm": 0.24691370129585266, + "learning_rate": 1.1680799128480103e-05, + "loss": 0.1199, + "step": 39860 + }, + { + "epoch": 0.7109656476295795, + "grad_norm": 0.280831515789032, + "learning_rate": 1.1679481942131212e-05, + "loss": 0.145, + "step": 39861 + }, + { + "epoch": 0.7109834837512932, + "grad_norm": 0.15910178422927856, + "learning_rate": 1.1678164807417261e-05, + "loss": 0.0848, + "step": 39862 + }, + { + "epoch": 0.7110013198730069, + "grad_norm": 0.2672363519668579, + "learning_rate": 1.1676847724343345e-05, + "loss": 0.1092, + "step": 39863 + }, + { + "epoch": 0.7110191559947205, + "grad_norm": 0.26946932077407837, + "learning_rate": 1.1675530692914566e-05, + "loss": 0.132, + "step": 39864 + }, + { + "epoch": 0.7110369921164342, + "grad_norm": 0.26551803946495056, + "learning_rate": 1.1674213713136034e-05, + "loss": 0.0819, + "step": 39865 + }, + { + "epoch": 0.7110548282381479, + "grad_norm": 0.2769637703895569, + "learning_rate": 1.1672896785012843e-05, + "loss": 0.1223, + "step": 39866 + }, + { + "epoch": 0.7110726643598616, + "grad_norm": 0.2650698721408844, + "learning_rate": 1.1671579908550117e-05, + "loss": 0.1504, + "step": 39867 + }, + { + "epoch": 0.7110905004815753, + "grad_norm": 0.2823314964771271, + "learning_rate": 1.167026308375295e-05, + "loss": 0.1384, + "step": 39868 + }, + { + "epoch": 0.711108336603289, + "grad_norm": 0.2580188512802124, + "learning_rate": 1.1668946310626447e-05, + "loss": 0.1126, + "step": 39869 + }, + { + "epoch": 0.7111261727250027, + "grad_norm": 0.28970351815223694, + "learning_rate": 1.1667629589175702e-05, + "loss": 0.0927, + "step": 39870 + }, + { + "epoch": 0.7111440088467164, + "grad_norm": 0.33309268951416016, + "learning_rate": 1.1666312919405841e-05, + "loss": 0.0797, + "step": 39871 + }, + { + "epoch": 0.71116184496843, + "grad_norm": 0.25777631998062134, + "learning_rate": 1.1664996301321957e-05, + "loss": 0.1216, + "step": 39872 + }, + { + "epoch": 0.7111796810901437, + "grad_norm": 0.22438335418701172, + "learning_rate": 1.1663679734929139e-05, + "loss": 0.1171, + "step": 39873 + }, + { + "epoch": 0.7111975172118574, + "grad_norm": 0.22949950397014618, + "learning_rate": 1.166236322023251e-05, + "loss": 0.114, + "step": 39874 + }, + { + "epoch": 0.7112153533335711, + "grad_norm": 0.2608383595943451, + "learning_rate": 1.1661046757237173e-05, + "loss": 0.1599, + "step": 39875 + }, + { + "epoch": 0.7112331894552848, + "grad_norm": 0.21117225289344788, + "learning_rate": 1.165973034594823e-05, + "loss": 0.1242, + "step": 39876 + }, + { + "epoch": 0.7112510255769985, + "grad_norm": 0.3097367286682129, + "learning_rate": 1.1658413986370777e-05, + "loss": 0.1309, + "step": 39877 + }, + { + "epoch": 0.7112688616987123, + "grad_norm": 0.294981449842453, + "learning_rate": 1.165709767850992e-05, + "loss": 0.1319, + "step": 39878 + }, + { + "epoch": 0.711286697820426, + "grad_norm": 0.26491573452949524, + "learning_rate": 1.165578142237075e-05, + "loss": 0.1454, + "step": 39879 + }, + { + "epoch": 0.7113045339421397, + "grad_norm": 0.23283720016479492, + "learning_rate": 1.1654465217958394e-05, + "loss": 0.1231, + "step": 39880 + }, + { + "epoch": 0.7113223700638533, + "grad_norm": 0.19688691198825836, + "learning_rate": 1.1653149065277935e-05, + "loss": 0.1029, + "step": 39881 + }, + { + "epoch": 0.711340206185567, + "grad_norm": 0.25711968541145325, + "learning_rate": 1.1651832964334483e-05, + "loss": 0.1094, + "step": 39882 + }, + { + "epoch": 0.7113580423072807, + "grad_norm": 0.24089841544628143, + "learning_rate": 1.1650516915133127e-05, + "loss": 0.113, + "step": 39883 + }, + { + "epoch": 0.7113758784289944, + "grad_norm": 0.30157706141471863, + "learning_rate": 1.1649200917678987e-05, + "loss": 0.1229, + "step": 39884 + }, + { + "epoch": 0.7113937145507081, + "grad_norm": 0.21007725596427917, + "learning_rate": 1.1647884971977158e-05, + "loss": 0.1437, + "step": 39885 + }, + { + "epoch": 0.7114115506724218, + "grad_norm": 0.18124797940254211, + "learning_rate": 1.1646569078032727e-05, + "loss": 0.1455, + "step": 39886 + }, + { + "epoch": 0.7114293867941355, + "grad_norm": 0.33128172159194946, + "learning_rate": 1.1645253235850815e-05, + "loss": 0.1075, + "step": 39887 + }, + { + "epoch": 0.7114472229158492, + "grad_norm": 0.23498903214931488, + "learning_rate": 1.1643937445436506e-05, + "loss": 0.0759, + "step": 39888 + }, + { + "epoch": 0.7114650590375629, + "grad_norm": 0.2730211615562439, + "learning_rate": 1.1642621706794915e-05, + "loss": 0.1404, + "step": 39889 + }, + { + "epoch": 0.7114828951592765, + "grad_norm": 0.28579044342041016, + "learning_rate": 1.1641306019931139e-05, + "loss": 0.0753, + "step": 39890 + }, + { + "epoch": 0.7115007312809902, + "grad_norm": 0.28043386340141296, + "learning_rate": 1.1639990384850275e-05, + "loss": 0.1019, + "step": 39891 + }, + { + "epoch": 0.7115185674027039, + "grad_norm": 0.3845950663089752, + "learning_rate": 1.1638674801557412e-05, + "loss": 0.1637, + "step": 39892 + }, + { + "epoch": 0.7115364035244176, + "grad_norm": 0.20041073858737946, + "learning_rate": 1.1637359270057669e-05, + "loss": 0.0995, + "step": 39893 + }, + { + "epoch": 0.7115542396461313, + "grad_norm": 0.23606139421463013, + "learning_rate": 1.1636043790356137e-05, + "loss": 0.1469, + "step": 39894 + }, + { + "epoch": 0.7115720757678451, + "grad_norm": 0.2565153241157532, + "learning_rate": 1.1634728362457916e-05, + "loss": 0.0997, + "step": 39895 + }, + { + "epoch": 0.7115899118895588, + "grad_norm": 0.16047993302345276, + "learning_rate": 1.1633412986368094e-05, + "loss": 0.082, + "step": 39896 + }, + { + "epoch": 0.7116077480112725, + "grad_norm": 0.27130910754203796, + "learning_rate": 1.163209766209179e-05, + "loss": 0.1164, + "step": 39897 + }, + { + "epoch": 0.7116255841329862, + "grad_norm": 0.2683762311935425, + "learning_rate": 1.1630782389634093e-05, + "loss": 0.1208, + "step": 39898 + }, + { + "epoch": 0.7116434202546998, + "grad_norm": 0.2111593335866928, + "learning_rate": 1.1629467169000099e-05, + "loss": 0.1147, + "step": 39899 + }, + { + "epoch": 0.7116612563764135, + "grad_norm": 0.32923048734664917, + "learning_rate": 1.1628152000194901e-05, + "loss": 0.1633, + "step": 39900 + }, + { + "epoch": 0.7116790924981272, + "grad_norm": 0.2416684925556183, + "learning_rate": 1.1626836883223604e-05, + "loss": 0.1471, + "step": 39901 + }, + { + "epoch": 0.7116969286198409, + "grad_norm": 0.33641648292541504, + "learning_rate": 1.1625521818091315e-05, + "loss": 0.1562, + "step": 39902 + }, + { + "epoch": 0.7117147647415546, + "grad_norm": 0.2838899791240692, + "learning_rate": 1.1624206804803123e-05, + "loss": 0.2105, + "step": 39903 + }, + { + "epoch": 0.7117326008632683, + "grad_norm": 0.2801453173160553, + "learning_rate": 1.1622891843364126e-05, + "loss": 0.1225, + "step": 39904 + }, + { + "epoch": 0.711750436984982, + "grad_norm": 0.28423693776130676, + "learning_rate": 1.162157693377941e-05, + "loss": 0.1148, + "step": 39905 + }, + { + "epoch": 0.7117682731066957, + "grad_norm": 0.252823144197464, + "learning_rate": 1.1620262076054093e-05, + "loss": 0.1486, + "step": 39906 + }, + { + "epoch": 0.7117861092284093, + "grad_norm": 0.30582118034362793, + "learning_rate": 1.161894727019326e-05, + "loss": 0.1409, + "step": 39907 + }, + { + "epoch": 0.711803945350123, + "grad_norm": 0.25981634855270386, + "learning_rate": 1.161763251620201e-05, + "loss": 0.1006, + "step": 39908 + }, + { + "epoch": 0.7118217814718367, + "grad_norm": 0.24681849777698517, + "learning_rate": 1.1616317814085428e-05, + "loss": 0.1185, + "step": 39909 + }, + { + "epoch": 0.7118396175935504, + "grad_norm": 0.2377425879240036, + "learning_rate": 1.1615003163848632e-05, + "loss": 0.1336, + "step": 39910 + }, + { + "epoch": 0.7118574537152641, + "grad_norm": 0.3168492913246155, + "learning_rate": 1.1613688565496705e-05, + "loss": 0.1671, + "step": 39911 + }, + { + "epoch": 0.7118752898369779, + "grad_norm": 0.2868969142436981, + "learning_rate": 1.1612374019034744e-05, + "loss": 0.1415, + "step": 39912 + }, + { + "epoch": 0.7118931259586916, + "grad_norm": 0.25970104336738586, + "learning_rate": 1.1611059524467838e-05, + "loss": 0.1523, + "step": 39913 + }, + { + "epoch": 0.7119109620804053, + "grad_norm": 0.3548128008842468, + "learning_rate": 1.1609745081801099e-05, + "loss": 0.1249, + "step": 39914 + }, + { + "epoch": 0.711928798202119, + "grad_norm": 0.35793355107307434, + "learning_rate": 1.1608430691039601e-05, + "loss": 0.1323, + "step": 39915 + }, + { + "epoch": 0.7119466343238327, + "grad_norm": 0.2647632658481598, + "learning_rate": 1.1607116352188463e-05, + "loss": 0.135, + "step": 39916 + }, + { + "epoch": 0.7119644704455463, + "grad_norm": 0.2214607149362564, + "learning_rate": 1.1605802065252769e-05, + "loss": 0.0931, + "step": 39917 + }, + { + "epoch": 0.71198230656726, + "grad_norm": 0.2038196325302124, + "learning_rate": 1.1604487830237598e-05, + "loss": 0.1203, + "step": 39918 + }, + { + "epoch": 0.7120001426889737, + "grad_norm": 0.32778993248939514, + "learning_rate": 1.1603173647148071e-05, + "loss": 0.1506, + "step": 39919 + }, + { + "epoch": 0.7120179788106874, + "grad_norm": 0.24346116185188293, + "learning_rate": 1.1601859515989272e-05, + "loss": 0.1426, + "step": 39920 + }, + { + "epoch": 0.7120358149324011, + "grad_norm": 0.23992453515529633, + "learning_rate": 1.1600545436766291e-05, + "loss": 0.1065, + "step": 39921 + }, + { + "epoch": 0.7120536510541148, + "grad_norm": 0.30495685338974, + "learning_rate": 1.1599231409484215e-05, + "loss": 0.1074, + "step": 39922 + }, + { + "epoch": 0.7120714871758285, + "grad_norm": 0.2628614902496338, + "learning_rate": 1.1597917434148156e-05, + "loss": 0.1456, + "step": 39923 + }, + { + "epoch": 0.7120893232975422, + "grad_norm": 0.21645644307136536, + "learning_rate": 1.1596603510763196e-05, + "loss": 0.1357, + "step": 39924 + }, + { + "epoch": 0.7121071594192558, + "grad_norm": 0.243134006857872, + "learning_rate": 1.1595289639334433e-05, + "loss": 0.0405, + "step": 39925 + }, + { + "epoch": 0.7121249955409695, + "grad_norm": 0.28979218006134033, + "learning_rate": 1.1593975819866945e-05, + "loss": 0.063, + "step": 39926 + }, + { + "epoch": 0.7121428316626832, + "grad_norm": 0.258320689201355, + "learning_rate": 1.159266205236585e-05, + "loss": 0.147, + "step": 39927 + }, + { + "epoch": 0.7121606677843969, + "grad_norm": 0.29123276472091675, + "learning_rate": 1.1591348336836216e-05, + "loss": 0.1222, + "step": 39928 + }, + { + "epoch": 0.7121785039061107, + "grad_norm": 0.19829247891902924, + "learning_rate": 1.1590034673283156e-05, + "loss": 0.1132, + "step": 39929 + }, + { + "epoch": 0.7121963400278244, + "grad_norm": 0.35753124952316284, + "learning_rate": 1.1588721061711755e-05, + "loss": 0.1115, + "step": 39930 + }, + { + "epoch": 0.7122141761495381, + "grad_norm": 0.27448055148124695, + "learning_rate": 1.1587407502127094e-05, + "loss": 0.131, + "step": 39931 + }, + { + "epoch": 0.7122320122712518, + "grad_norm": 0.22233644127845764, + "learning_rate": 1.158609399453428e-05, + "loss": 0.0915, + "step": 39932 + }, + { + "epoch": 0.7122498483929655, + "grad_norm": 0.38981086015701294, + "learning_rate": 1.1584780538938402e-05, + "loss": 0.0979, + "step": 39933 + }, + { + "epoch": 0.7122676845146791, + "grad_norm": 0.29726698994636536, + "learning_rate": 1.1583467135344547e-05, + "loss": 0.1086, + "step": 39934 + }, + { + "epoch": 0.7122855206363928, + "grad_norm": 0.2556772828102112, + "learning_rate": 1.15821537837578e-05, + "loss": 0.1252, + "step": 39935 + }, + { + "epoch": 0.7123033567581065, + "grad_norm": 0.2642151713371277, + "learning_rate": 1.1580840484183264e-05, + "loss": 0.1046, + "step": 39936 + }, + { + "epoch": 0.7123211928798202, + "grad_norm": 0.28623437881469727, + "learning_rate": 1.1579527236626028e-05, + "loss": 0.0988, + "step": 39937 + }, + { + "epoch": 0.7123390290015339, + "grad_norm": 0.24815405905246735, + "learning_rate": 1.1578214041091178e-05, + "loss": 0.1295, + "step": 39938 + }, + { + "epoch": 0.7123568651232476, + "grad_norm": 0.5346922874450684, + "learning_rate": 1.15769008975838e-05, + "loss": 0.2474, + "step": 39939 + }, + { + "epoch": 0.7123747012449613, + "grad_norm": 0.3100970685482025, + "learning_rate": 1.1575587806108999e-05, + "loss": 0.1292, + "step": 39940 + }, + { + "epoch": 0.712392537366675, + "grad_norm": 0.41137516498565674, + "learning_rate": 1.1574274766671856e-05, + "loss": 0.2248, + "step": 39941 + }, + { + "epoch": 0.7124103734883886, + "grad_norm": 0.288207471370697, + "learning_rate": 1.157296177927745e-05, + "loss": 0.167, + "step": 39942 + }, + { + "epoch": 0.7124282096101023, + "grad_norm": 0.25861960649490356, + "learning_rate": 1.1571648843930891e-05, + "loss": 0.1279, + "step": 39943 + }, + { + "epoch": 0.712446045731816, + "grad_norm": 0.405437707901001, + "learning_rate": 1.1570335960637253e-05, + "loss": 0.1383, + "step": 39944 + }, + { + "epoch": 0.7124638818535298, + "grad_norm": 0.2419949769973755, + "learning_rate": 1.1569023129401639e-05, + "loss": 0.1305, + "step": 39945 + }, + { + "epoch": 0.7124817179752435, + "grad_norm": 0.2325814664363861, + "learning_rate": 1.1567710350229127e-05, + "loss": 0.0656, + "step": 39946 + }, + { + "epoch": 0.7124995540969572, + "grad_norm": 0.2678578794002533, + "learning_rate": 1.156639762312481e-05, + "loss": 0.1239, + "step": 39947 + }, + { + "epoch": 0.7125173902186709, + "grad_norm": 0.2614659070968628, + "learning_rate": 1.1565084948093771e-05, + "loss": 0.1209, + "step": 39948 + }, + { + "epoch": 0.7125352263403846, + "grad_norm": 0.2686004340648651, + "learning_rate": 1.156377232514111e-05, + "loss": 0.1155, + "step": 39949 + }, + { + "epoch": 0.7125530624620983, + "grad_norm": 0.3356996476650238, + "learning_rate": 1.1562459754271907e-05, + "loss": 0.0829, + "step": 39950 + }, + { + "epoch": 0.712570898583812, + "grad_norm": 0.2051440328359604, + "learning_rate": 1.1561147235491252e-05, + "loss": 0.0791, + "step": 39951 + }, + { + "epoch": 0.7125887347055256, + "grad_norm": 0.2531212270259857, + "learning_rate": 1.155983476880422e-05, + "loss": 0.1063, + "step": 39952 + }, + { + "epoch": 0.7126065708272393, + "grad_norm": 0.29687801003456116, + "learning_rate": 1.1558522354215922e-05, + "loss": 0.1162, + "step": 39953 + }, + { + "epoch": 0.712624406948953, + "grad_norm": 0.258180171251297, + "learning_rate": 1.1557209991731435e-05, + "loss": 0.1244, + "step": 39954 + }, + { + "epoch": 0.7126422430706667, + "grad_norm": 0.289081335067749, + "learning_rate": 1.1555897681355834e-05, + "loss": 0.2352, + "step": 39955 + }, + { + "epoch": 0.7126600791923804, + "grad_norm": 0.2513437271118164, + "learning_rate": 1.1554585423094228e-05, + "loss": 0.0876, + "step": 39956 + }, + { + "epoch": 0.7126779153140941, + "grad_norm": 0.30848774313926697, + "learning_rate": 1.1553273216951682e-05, + "loss": 0.0932, + "step": 39957 + }, + { + "epoch": 0.7126957514358078, + "grad_norm": 0.2955001890659332, + "learning_rate": 1.1551961062933305e-05, + "loss": 0.1078, + "step": 39958 + }, + { + "epoch": 0.7127135875575215, + "grad_norm": 0.24780118465423584, + "learning_rate": 1.1550648961044169e-05, + "loss": 0.1196, + "step": 39959 + }, + { + "epoch": 0.7127314236792351, + "grad_norm": 0.2631162106990814, + "learning_rate": 1.1549336911289366e-05, + "loss": 0.1821, + "step": 39960 + }, + { + "epoch": 0.7127492598009488, + "grad_norm": 0.24644921720027924, + "learning_rate": 1.1548024913673967e-05, + "loss": 0.1472, + "step": 39961 + }, + { + "epoch": 0.7127670959226626, + "grad_norm": 0.22369952499866486, + "learning_rate": 1.1546712968203077e-05, + "loss": 0.0859, + "step": 39962 + }, + { + "epoch": 0.7127849320443763, + "grad_norm": 0.2760312259197235, + "learning_rate": 1.1545401074881779e-05, + "loss": 0.1494, + "step": 39963 + }, + { + "epoch": 0.71280276816609, + "grad_norm": 0.2596503794193268, + "learning_rate": 1.154408923371515e-05, + "loss": 0.1037, + "step": 39964 + }, + { + "epoch": 0.7128206042878037, + "grad_norm": 0.29482051730155945, + "learning_rate": 1.154277744470827e-05, + "loss": 0.1716, + "step": 39965 + }, + { + "epoch": 0.7128384404095174, + "grad_norm": 0.2661428451538086, + "learning_rate": 1.1541465707866243e-05, + "loss": 0.1037, + "step": 39966 + }, + { + "epoch": 0.7128562765312311, + "grad_norm": 0.24982422590255737, + "learning_rate": 1.1540154023194141e-05, + "loss": 0.1035, + "step": 39967 + }, + { + "epoch": 0.7128741126529448, + "grad_norm": 0.2818349301815033, + "learning_rate": 1.1538842390697056e-05, + "loss": 0.1576, + "step": 39968 + }, + { + "epoch": 0.7128919487746584, + "grad_norm": 0.5475735664367676, + "learning_rate": 1.153753081038005e-05, + "loss": 0.1002, + "step": 39969 + }, + { + "epoch": 0.7129097848963721, + "grad_norm": 0.2658008635044098, + "learning_rate": 1.1536219282248239e-05, + "loss": 0.1066, + "step": 39970 + }, + { + "epoch": 0.7129276210180858, + "grad_norm": 0.30838435888290405, + "learning_rate": 1.1534907806306683e-05, + "loss": 0.1119, + "step": 39971 + }, + { + "epoch": 0.7129454571397995, + "grad_norm": 0.25189292430877686, + "learning_rate": 1.1533596382560482e-05, + "loss": 0.1031, + "step": 39972 + }, + { + "epoch": 0.7129632932615132, + "grad_norm": 0.2695024311542511, + "learning_rate": 1.1532285011014715e-05, + "loss": 0.0975, + "step": 39973 + }, + { + "epoch": 0.7129811293832269, + "grad_norm": 0.21420122683048248, + "learning_rate": 1.1530973691674455e-05, + "loss": 0.1134, + "step": 39974 + }, + { + "epoch": 0.7129989655049406, + "grad_norm": 0.29684752225875854, + "learning_rate": 1.1529662424544799e-05, + "loss": 0.1073, + "step": 39975 + }, + { + "epoch": 0.7130168016266543, + "grad_norm": 0.26985830068588257, + "learning_rate": 1.1528351209630824e-05, + "loss": 0.079, + "step": 39976 + }, + { + "epoch": 0.713034637748368, + "grad_norm": 0.3190481960773468, + "learning_rate": 1.1527040046937615e-05, + "loss": 0.1801, + "step": 39977 + }, + { + "epoch": 0.7130524738700816, + "grad_norm": 0.30620959401130676, + "learning_rate": 1.152572893647024e-05, + "loss": 0.1292, + "step": 39978 + }, + { + "epoch": 0.7130703099917954, + "grad_norm": 0.21564139425754547, + "learning_rate": 1.1524417878233803e-05, + "loss": 0.0849, + "step": 39979 + }, + { + "epoch": 0.7130881461135091, + "grad_norm": 0.20728722214698792, + "learning_rate": 1.152310687223338e-05, + "loss": 0.119, + "step": 39980 + }, + { + "epoch": 0.7131059822352228, + "grad_norm": 0.2679125964641571, + "learning_rate": 1.1521795918474046e-05, + "loss": 0.0938, + "step": 39981 + }, + { + "epoch": 0.7131238183569365, + "grad_norm": 0.2899567782878876, + "learning_rate": 1.1520485016960881e-05, + "loss": 0.1134, + "step": 39982 + }, + { + "epoch": 0.7131416544786502, + "grad_norm": 0.32243776321411133, + "learning_rate": 1.1519174167698976e-05, + "loss": 0.1538, + "step": 39983 + }, + { + "epoch": 0.7131594906003639, + "grad_norm": 0.24156616628170013, + "learning_rate": 1.1517863370693403e-05, + "loss": 0.1025, + "step": 39984 + }, + { + "epoch": 0.7131773267220776, + "grad_norm": 0.3430239260196686, + "learning_rate": 1.1516552625949253e-05, + "loss": 0.1352, + "step": 39985 + }, + { + "epoch": 0.7131951628437913, + "grad_norm": 0.29557710886001587, + "learning_rate": 1.1515241933471607e-05, + "loss": 0.1177, + "step": 39986 + }, + { + "epoch": 0.7132129989655049, + "grad_norm": 0.2616858184337616, + "learning_rate": 1.1513931293265529e-05, + "loss": 0.1078, + "step": 39987 + }, + { + "epoch": 0.7132308350872186, + "grad_norm": 0.28580084443092346, + "learning_rate": 1.151262070533612e-05, + "loss": 0.1238, + "step": 39988 + }, + { + "epoch": 0.7132486712089323, + "grad_norm": 0.29453325271606445, + "learning_rate": 1.1511310169688452e-05, + "loss": 0.1496, + "step": 39989 + }, + { + "epoch": 0.713266507330646, + "grad_norm": 0.22635002434253693, + "learning_rate": 1.1509999686327604e-05, + "loss": 0.1202, + "step": 39990 + }, + { + "epoch": 0.7132843434523597, + "grad_norm": 0.39565804600715637, + "learning_rate": 1.1508689255258648e-05, + "loss": 0.1752, + "step": 39991 + }, + { + "epoch": 0.7133021795740734, + "grad_norm": 0.28307363390922546, + "learning_rate": 1.1507378876486682e-05, + "loss": 0.1115, + "step": 39992 + }, + { + "epoch": 0.7133200156957871, + "grad_norm": 0.2626785635948181, + "learning_rate": 1.1506068550016774e-05, + "loss": 0.0977, + "step": 39993 + }, + { + "epoch": 0.7133378518175008, + "grad_norm": 0.31536778807640076, + "learning_rate": 1.1504758275854008e-05, + "loss": 0.1066, + "step": 39994 + }, + { + "epoch": 0.7133556879392144, + "grad_norm": 0.2317369282245636, + "learning_rate": 1.1503448054003457e-05, + "loss": 0.1246, + "step": 39995 + }, + { + "epoch": 0.7133735240609282, + "grad_norm": 0.361154168844223, + "learning_rate": 1.1502137884470197e-05, + "loss": 0.0999, + "step": 39996 + }, + { + "epoch": 0.7133913601826419, + "grad_norm": 0.28814399242401123, + "learning_rate": 1.1500827767259317e-05, + "loss": 0.0934, + "step": 39997 + }, + { + "epoch": 0.7134091963043556, + "grad_norm": 0.31533944606781006, + "learning_rate": 1.1499517702375887e-05, + "loss": 0.1712, + "step": 39998 + }, + { + "epoch": 0.7134270324260693, + "grad_norm": 0.22883932292461395, + "learning_rate": 1.1498207689824995e-05, + "loss": 0.0722, + "step": 39999 + }, + { + "epoch": 0.713444868547783, + "grad_norm": 0.1899060159921646, + "learning_rate": 1.1496897729611706e-05, + "loss": 0.0477, + "step": 40000 + }, + { + "epoch": 0.713444868547783, + "eval_loss": 0.11541074514389038, + "eval_runtime": 107.4664, + "eval_samples_per_second": 9.529, + "eval_steps_per_second": 1.591, + "step": 40000 + }, + { + "epoch": 0.7134627046694967, + "grad_norm": 0.21473735570907593, + "learning_rate": 1.1495587821741113e-05, + "loss": 0.1387, + "step": 40001 + }, + { + "epoch": 0.7134805407912104, + "grad_norm": 0.22912606596946716, + "learning_rate": 1.1494277966218287e-05, + "loss": 0.1427, + "step": 40002 + }, + { + "epoch": 0.7134983769129241, + "grad_norm": 0.22071535885334015, + "learning_rate": 1.1492968163048302e-05, + "loss": 0.0871, + "step": 40003 + }, + { + "epoch": 0.7135162130346377, + "grad_norm": 0.3121480941772461, + "learning_rate": 1.1491658412236231e-05, + "loss": 0.118, + "step": 40004 + }, + { + "epoch": 0.7135340491563514, + "grad_norm": 0.24430778622627258, + "learning_rate": 1.1490348713787167e-05, + "loss": 0.1204, + "step": 40005 + }, + { + "epoch": 0.7135518852780651, + "grad_norm": 0.2277761995792389, + "learning_rate": 1.1489039067706176e-05, + "loss": 0.1101, + "step": 40006 + }, + { + "epoch": 0.7135697213997788, + "grad_norm": 0.23791418969631195, + "learning_rate": 1.1487729473998336e-05, + "loss": 0.1104, + "step": 40007 + }, + { + "epoch": 0.7135875575214925, + "grad_norm": 0.34284713864326477, + "learning_rate": 1.1486419932668725e-05, + "loss": 0.173, + "step": 40008 + }, + { + "epoch": 0.7136053936432062, + "grad_norm": 0.27452951669692993, + "learning_rate": 1.1485110443722404e-05, + "loss": 0.1048, + "step": 40009 + }, + { + "epoch": 0.7136232297649199, + "grad_norm": 0.30748361349105835, + "learning_rate": 1.1483801007164477e-05, + "loss": 0.0894, + "step": 40010 + }, + { + "epoch": 0.7136410658866336, + "grad_norm": 0.2984600067138672, + "learning_rate": 1.1482491622999996e-05, + "loss": 0.11, + "step": 40011 + }, + { + "epoch": 0.7136589020083473, + "grad_norm": 0.35017356276512146, + "learning_rate": 1.1481182291234054e-05, + "loss": 0.1131, + "step": 40012 + }, + { + "epoch": 0.713676738130061, + "grad_norm": 0.18534281849861145, + "learning_rate": 1.1479873011871712e-05, + "loss": 0.1286, + "step": 40013 + }, + { + "epoch": 0.7136945742517747, + "grad_norm": 0.22484050691127777, + "learning_rate": 1.1478563784918059e-05, + "loss": 0.125, + "step": 40014 + }, + { + "epoch": 0.7137124103734884, + "grad_norm": 0.21328525245189667, + "learning_rate": 1.1477254610378163e-05, + "loss": 0.1349, + "step": 40015 + }, + { + "epoch": 0.7137302464952021, + "grad_norm": 0.4013546407222748, + "learning_rate": 1.1475945488257097e-05, + "loss": 0.203, + "step": 40016 + }, + { + "epoch": 0.7137480826169158, + "grad_norm": 0.31322768330574036, + "learning_rate": 1.147463641855993e-05, + "loss": 0.1318, + "step": 40017 + }, + { + "epoch": 0.7137659187386295, + "grad_norm": 0.29802224040031433, + "learning_rate": 1.1473327401291753e-05, + "loss": 0.1095, + "step": 40018 + }, + { + "epoch": 0.7137837548603432, + "grad_norm": 0.2667621970176697, + "learning_rate": 1.1472018436457632e-05, + "loss": 0.1236, + "step": 40019 + }, + { + "epoch": 0.7138015909820569, + "grad_norm": 0.26889848709106445, + "learning_rate": 1.147070952406264e-05, + "loss": 0.098, + "step": 40020 + }, + { + "epoch": 0.7138194271037706, + "grad_norm": 0.33743032813072205, + "learning_rate": 1.1469400664111848e-05, + "loss": 0.1376, + "step": 40021 + }, + { + "epoch": 0.7138372632254842, + "grad_norm": 0.25778719782829285, + "learning_rate": 1.1468091856610325e-05, + "loss": 0.1337, + "step": 40022 + }, + { + "epoch": 0.7138550993471979, + "grad_norm": 0.27928870916366577, + "learning_rate": 1.146678310156316e-05, + "loss": 0.0992, + "step": 40023 + }, + { + "epoch": 0.7138729354689116, + "grad_norm": 0.22588548064231873, + "learning_rate": 1.1465474398975418e-05, + "loss": 0.1425, + "step": 40024 + }, + { + "epoch": 0.7138907715906253, + "grad_norm": 0.2689093351364136, + "learning_rate": 1.1464165748852166e-05, + "loss": 0.1281, + "step": 40025 + }, + { + "epoch": 0.713908607712339, + "grad_norm": 0.23568901419639587, + "learning_rate": 1.1462857151198485e-05, + "loss": 0.1234, + "step": 40026 + }, + { + "epoch": 0.7139264438340527, + "grad_norm": 0.3280787765979767, + "learning_rate": 1.1461548606019443e-05, + "loss": 0.1369, + "step": 40027 + }, + { + "epoch": 0.7139442799557664, + "grad_norm": 0.2050517201423645, + "learning_rate": 1.146024011332012e-05, + "loss": 0.0956, + "step": 40028 + }, + { + "epoch": 0.7139621160774801, + "grad_norm": 0.3899628221988678, + "learning_rate": 1.1458931673105585e-05, + "loss": 0.1645, + "step": 40029 + }, + { + "epoch": 0.7139799521991939, + "grad_norm": 0.2577247619628906, + "learning_rate": 1.1457623285380897e-05, + "loss": 0.1137, + "step": 40030 + }, + { + "epoch": 0.7139977883209075, + "grad_norm": 0.4088238775730133, + "learning_rate": 1.1456314950151147e-05, + "loss": 0.1057, + "step": 40031 + }, + { + "epoch": 0.7140156244426212, + "grad_norm": 0.2634453773498535, + "learning_rate": 1.14550066674214e-05, + "loss": 0.109, + "step": 40032 + }, + { + "epoch": 0.7140334605643349, + "grad_norm": 0.27273663878440857, + "learning_rate": 1.1453698437196725e-05, + "loss": 0.094, + "step": 40033 + }, + { + "epoch": 0.7140512966860486, + "grad_norm": 0.19101014733314514, + "learning_rate": 1.1452390259482192e-05, + "loss": 0.0899, + "step": 40034 + }, + { + "epoch": 0.7140691328077623, + "grad_norm": 0.27890104055404663, + "learning_rate": 1.1451082134282868e-05, + "loss": 0.1329, + "step": 40035 + }, + { + "epoch": 0.714086968929476, + "grad_norm": 0.22313927114009857, + "learning_rate": 1.1449774061603834e-05, + "loss": 0.1013, + "step": 40036 + }, + { + "epoch": 0.7141048050511897, + "grad_norm": 0.25917425751686096, + "learning_rate": 1.144846604145016e-05, + "loss": 0.1282, + "step": 40037 + }, + { + "epoch": 0.7141226411729034, + "grad_norm": 0.265948086977005, + "learning_rate": 1.14471580738269e-05, + "loss": 0.118, + "step": 40038 + }, + { + "epoch": 0.714140477294617, + "grad_norm": 0.2603774964809418, + "learning_rate": 1.1445850158739146e-05, + "loss": 0.1173, + "step": 40039 + }, + { + "epoch": 0.7141583134163307, + "grad_norm": 0.24207022786140442, + "learning_rate": 1.1444542296191952e-05, + "loss": 0.1585, + "step": 40040 + }, + { + "epoch": 0.7141761495380444, + "grad_norm": 0.2568919062614441, + "learning_rate": 1.1443234486190399e-05, + "loss": 0.1488, + "step": 40041 + }, + { + "epoch": 0.7141939856597581, + "grad_norm": 0.2640952467918396, + "learning_rate": 1.1441926728739552e-05, + "loss": 0.086, + "step": 40042 + }, + { + "epoch": 0.7142118217814718, + "grad_norm": 0.2652968466281891, + "learning_rate": 1.1440619023844473e-05, + "loss": 0.1742, + "step": 40043 + }, + { + "epoch": 0.7142296579031855, + "grad_norm": 0.27271100878715515, + "learning_rate": 1.1439311371510244e-05, + "loss": 0.1068, + "step": 40044 + }, + { + "epoch": 0.7142474940248992, + "grad_norm": 0.36714431643486023, + "learning_rate": 1.1438003771741931e-05, + "loss": 0.1012, + "step": 40045 + }, + { + "epoch": 0.714265330146613, + "grad_norm": 0.2434958517551422, + "learning_rate": 1.14366962245446e-05, + "loss": 0.0795, + "step": 40046 + }, + { + "epoch": 0.7142831662683267, + "grad_norm": 0.2660449147224426, + "learning_rate": 1.1435388729923317e-05, + "loss": 0.0609, + "step": 40047 + }, + { + "epoch": 0.7143010023900404, + "grad_norm": 0.353059858083725, + "learning_rate": 1.1434081287883142e-05, + "loss": 0.1114, + "step": 40048 + }, + { + "epoch": 0.714318838511754, + "grad_norm": 0.20561105012893677, + "learning_rate": 1.1432773898429165e-05, + "loss": 0.0941, + "step": 40049 + }, + { + "epoch": 0.7143366746334677, + "grad_norm": 0.2738844156265259, + "learning_rate": 1.1431466561566443e-05, + "loss": 0.1327, + "step": 40050 + }, + { + "epoch": 0.7143545107551814, + "grad_norm": 0.2646847665309906, + "learning_rate": 1.143015927730004e-05, + "loss": 0.108, + "step": 40051 + }, + { + "epoch": 0.7143723468768951, + "grad_norm": 0.2548307478427887, + "learning_rate": 1.1428852045635018e-05, + "loss": 0.1079, + "step": 40052 + }, + { + "epoch": 0.7143901829986088, + "grad_norm": 0.2920530438423157, + "learning_rate": 1.1427544866576465e-05, + "loss": 0.0767, + "step": 40053 + }, + { + "epoch": 0.7144080191203225, + "grad_norm": 0.24720653891563416, + "learning_rate": 1.1426237740129422e-05, + "loss": 0.1095, + "step": 40054 + }, + { + "epoch": 0.7144258552420362, + "grad_norm": 0.3039284646511078, + "learning_rate": 1.1424930666298983e-05, + "loss": 0.0825, + "step": 40055 + }, + { + "epoch": 0.7144436913637499, + "grad_norm": 0.37439247965812683, + "learning_rate": 1.1423623645090189e-05, + "loss": 0.1319, + "step": 40056 + }, + { + "epoch": 0.7144615274854635, + "grad_norm": 0.2087412029504776, + "learning_rate": 1.142231667650813e-05, + "loss": 0.0853, + "step": 40057 + }, + { + "epoch": 0.7144793636071772, + "grad_norm": 0.1958671361207962, + "learning_rate": 1.142100976055786e-05, + "loss": 0.0937, + "step": 40058 + }, + { + "epoch": 0.7144971997288909, + "grad_norm": 0.20561863481998444, + "learning_rate": 1.141970289724445e-05, + "loss": 0.0943, + "step": 40059 + }, + { + "epoch": 0.7145150358506046, + "grad_norm": 0.32751795649528503, + "learning_rate": 1.1418396086572957e-05, + "loss": 0.1361, + "step": 40060 + }, + { + "epoch": 0.7145328719723183, + "grad_norm": 0.3036060929298401, + "learning_rate": 1.1417089328548443e-05, + "loss": 0.1272, + "step": 40061 + }, + { + "epoch": 0.714550708094032, + "grad_norm": 0.2874866724014282, + "learning_rate": 1.1415782623175994e-05, + "loss": 0.1459, + "step": 40062 + }, + { + "epoch": 0.7145685442157458, + "grad_norm": 0.43067842721939087, + "learning_rate": 1.1414475970460664e-05, + "loss": 0.1653, + "step": 40063 + }, + { + "epoch": 0.7145863803374595, + "grad_norm": 0.26317596435546875, + "learning_rate": 1.1413169370407514e-05, + "loss": 0.1353, + "step": 40064 + }, + { + "epoch": 0.7146042164591732, + "grad_norm": 0.28594955801963806, + "learning_rate": 1.1411862823021607e-05, + "loss": 0.129, + "step": 40065 + }, + { + "epoch": 0.7146220525808868, + "grad_norm": 0.26611292362213135, + "learning_rate": 1.1410556328308019e-05, + "loss": 0.0854, + "step": 40066 + }, + { + "epoch": 0.7146398887026005, + "grad_norm": 0.3274479806423187, + "learning_rate": 1.14092498862718e-05, + "loss": 0.1513, + "step": 40067 + }, + { + "epoch": 0.7146577248243142, + "grad_norm": 0.235755056142807, + "learning_rate": 1.1407943496918034e-05, + "loss": 0.1307, + "step": 40068 + }, + { + "epoch": 0.7146755609460279, + "grad_norm": 0.22338080406188965, + "learning_rate": 1.1406637160251759e-05, + "loss": 0.0735, + "step": 40069 + }, + { + "epoch": 0.7146933970677416, + "grad_norm": 0.3805863559246063, + "learning_rate": 1.1405330876278067e-05, + "loss": 0.1381, + "step": 40070 + }, + { + "epoch": 0.7147112331894553, + "grad_norm": 0.3719305992126465, + "learning_rate": 1.1404024645002007e-05, + "loss": 0.1122, + "step": 40071 + }, + { + "epoch": 0.714729069311169, + "grad_norm": 0.30759960412979126, + "learning_rate": 1.1402718466428641e-05, + "loss": 0.1614, + "step": 40072 + }, + { + "epoch": 0.7147469054328827, + "grad_norm": 0.30556634068489075, + "learning_rate": 1.1401412340563039e-05, + "loss": 0.2038, + "step": 40073 + }, + { + "epoch": 0.7147647415545964, + "grad_norm": 0.25647082924842834, + "learning_rate": 1.1400106267410245e-05, + "loss": 0.1004, + "step": 40074 + }, + { + "epoch": 0.71478257767631, + "grad_norm": 0.2774975895881653, + "learning_rate": 1.139880024697535e-05, + "loss": 0.1405, + "step": 40075 + }, + { + "epoch": 0.7148004137980237, + "grad_norm": 0.3357665240764618, + "learning_rate": 1.13974942792634e-05, + "loss": 0.1111, + "step": 40076 + }, + { + "epoch": 0.7148182499197374, + "grad_norm": 0.2735438942909241, + "learning_rate": 1.1396188364279465e-05, + "loss": 0.088, + "step": 40077 + }, + { + "epoch": 0.7148360860414511, + "grad_norm": 0.3286462724208832, + "learning_rate": 1.139488250202859e-05, + "loss": 0.1005, + "step": 40078 + }, + { + "epoch": 0.7148539221631648, + "grad_norm": 0.35431915521621704, + "learning_rate": 1.1393576692515859e-05, + "loss": 0.1261, + "step": 40079 + }, + { + "epoch": 0.7148717582848786, + "grad_norm": 0.34066176414489746, + "learning_rate": 1.1392270935746324e-05, + "loss": 0.1066, + "step": 40080 + }, + { + "epoch": 0.7148895944065923, + "grad_norm": 0.22965796291828156, + "learning_rate": 1.1390965231725037e-05, + "loss": 0.1157, + "step": 40081 + }, + { + "epoch": 0.714907430528306, + "grad_norm": 0.21479332447052002, + "learning_rate": 1.1389659580457069e-05, + "loss": 0.1143, + "step": 40082 + }, + { + "epoch": 0.7149252666500197, + "grad_norm": 0.2500707507133484, + "learning_rate": 1.1388353981947492e-05, + "loss": 0.1036, + "step": 40083 + }, + { + "epoch": 0.7149431027717333, + "grad_norm": 0.20943470299243927, + "learning_rate": 1.1387048436201355e-05, + "loss": 0.1164, + "step": 40084 + }, + { + "epoch": 0.714960938893447, + "grad_norm": 0.22885392606258392, + "learning_rate": 1.1385742943223721e-05, + "loss": 0.0761, + "step": 40085 + }, + { + "epoch": 0.7149787750151607, + "grad_norm": 0.3258139193058014, + "learning_rate": 1.1384437503019649e-05, + "loss": 0.1669, + "step": 40086 + }, + { + "epoch": 0.7149966111368744, + "grad_norm": 0.26969093084335327, + "learning_rate": 1.1383132115594192e-05, + "loss": 0.0865, + "step": 40087 + }, + { + "epoch": 0.7150144472585881, + "grad_norm": 0.22007393836975098, + "learning_rate": 1.1381826780952425e-05, + "loss": 0.1036, + "step": 40088 + }, + { + "epoch": 0.7150322833803018, + "grad_norm": 0.23283235728740692, + "learning_rate": 1.1380521499099403e-05, + "loss": 0.1211, + "step": 40089 + }, + { + "epoch": 0.7150501195020155, + "grad_norm": 0.2629624605178833, + "learning_rate": 1.1379216270040183e-05, + "loss": 0.1628, + "step": 40090 + }, + { + "epoch": 0.7150679556237292, + "grad_norm": 0.26915791630744934, + "learning_rate": 1.1377911093779814e-05, + "loss": 0.0987, + "step": 40091 + }, + { + "epoch": 0.7150857917454428, + "grad_norm": 0.2294517159461975, + "learning_rate": 1.137660597032338e-05, + "loss": 0.1329, + "step": 40092 + }, + { + "epoch": 0.7151036278671565, + "grad_norm": 0.24991875886917114, + "learning_rate": 1.1375300899675922e-05, + "loss": 0.1512, + "step": 40093 + }, + { + "epoch": 0.7151214639888702, + "grad_norm": 0.27796271443367004, + "learning_rate": 1.1373995881842498e-05, + "loss": 0.1089, + "step": 40094 + }, + { + "epoch": 0.7151393001105839, + "grad_norm": 0.22614654898643494, + "learning_rate": 1.137269091682818e-05, + "loss": 0.1391, + "step": 40095 + }, + { + "epoch": 0.7151571362322976, + "grad_norm": 0.2226305902004242, + "learning_rate": 1.137138600463801e-05, + "loss": 0.1133, + "step": 40096 + }, + { + "epoch": 0.7151749723540114, + "grad_norm": 0.2698943614959717, + "learning_rate": 1.1370081145277061e-05, + "loss": 0.1434, + "step": 40097 + }, + { + "epoch": 0.7151928084757251, + "grad_norm": 0.2680029571056366, + "learning_rate": 1.136877633875039e-05, + "loss": 0.1185, + "step": 40098 + }, + { + "epoch": 0.7152106445974388, + "grad_norm": 0.2352083921432495, + "learning_rate": 1.1367471585063048e-05, + "loss": 0.1281, + "step": 40099 + }, + { + "epoch": 0.7152284807191525, + "grad_norm": 0.28520306944847107, + "learning_rate": 1.1366166884220084e-05, + "loss": 0.145, + "step": 40100 + }, + { + "epoch": 0.7152463168408661, + "grad_norm": 0.20300179719924927, + "learning_rate": 1.1364862236226575e-05, + "loss": 0.1157, + "step": 40101 + }, + { + "epoch": 0.7152641529625798, + "grad_norm": 0.27353599667549133, + "learning_rate": 1.1363557641087572e-05, + "loss": 0.1059, + "step": 40102 + }, + { + "epoch": 0.7152819890842935, + "grad_norm": 0.19483867287635803, + "learning_rate": 1.1362253098808129e-05, + "loss": 0.098, + "step": 40103 + }, + { + "epoch": 0.7152998252060072, + "grad_norm": 0.2075476348400116, + "learning_rate": 1.1360948609393293e-05, + "loss": 0.0954, + "step": 40104 + }, + { + "epoch": 0.7153176613277209, + "grad_norm": 0.4020407497882843, + "learning_rate": 1.1359644172848141e-05, + "loss": 0.1407, + "step": 40105 + }, + { + "epoch": 0.7153354974494346, + "grad_norm": 0.2157728523015976, + "learning_rate": 1.1358339789177718e-05, + "loss": 0.0919, + "step": 40106 + }, + { + "epoch": 0.7153533335711483, + "grad_norm": 0.27376970648765564, + "learning_rate": 1.1357035458387083e-05, + "loss": 0.1435, + "step": 40107 + }, + { + "epoch": 0.715371169692862, + "grad_norm": 0.26904457807540894, + "learning_rate": 1.1355731180481283e-05, + "loss": 0.1198, + "step": 40108 + }, + { + "epoch": 0.7153890058145757, + "grad_norm": 0.19689197838306427, + "learning_rate": 1.135442695546538e-05, + "loss": 0.1275, + "step": 40109 + }, + { + "epoch": 0.7154068419362893, + "grad_norm": 0.3161015808582306, + "learning_rate": 1.1353122783344438e-05, + "loss": 0.1388, + "step": 40110 + }, + { + "epoch": 0.715424678058003, + "grad_norm": 0.26607078313827515, + "learning_rate": 1.1351818664123509e-05, + "loss": 0.1361, + "step": 40111 + }, + { + "epoch": 0.7154425141797167, + "grad_norm": 0.26398906111717224, + "learning_rate": 1.1350514597807644e-05, + "loss": 0.1146, + "step": 40112 + }, + { + "epoch": 0.7154603503014304, + "grad_norm": 0.21497918665409088, + "learning_rate": 1.134921058440189e-05, + "loss": 0.1222, + "step": 40113 + }, + { + "epoch": 0.7154781864231442, + "grad_norm": 0.2684513032436371, + "learning_rate": 1.1347906623911316e-05, + "loss": 0.1392, + "step": 40114 + }, + { + "epoch": 0.7154960225448579, + "grad_norm": 0.29163411259651184, + "learning_rate": 1.1346602716340976e-05, + "loss": 0.1364, + "step": 40115 + }, + { + "epoch": 0.7155138586665716, + "grad_norm": 0.32668426632881165, + "learning_rate": 1.1345298861695917e-05, + "loss": 0.1681, + "step": 40116 + }, + { + "epoch": 0.7155316947882853, + "grad_norm": 0.2838995158672333, + "learning_rate": 1.1343995059981188e-05, + "loss": 0.1326, + "step": 40117 + }, + { + "epoch": 0.715549530909999, + "grad_norm": 0.2582722008228302, + "learning_rate": 1.1342691311201859e-05, + "loss": 0.0842, + "step": 40118 + }, + { + "epoch": 0.7155673670317126, + "grad_norm": 0.4089448153972626, + "learning_rate": 1.1341387615362976e-05, + "loss": 0.1377, + "step": 40119 + }, + { + "epoch": 0.7155852031534263, + "grad_norm": 0.2817435562610626, + "learning_rate": 1.1340083972469592e-05, + "loss": 0.0972, + "step": 40120 + }, + { + "epoch": 0.71560303927514, + "grad_norm": 0.22558709979057312, + "learning_rate": 1.1338780382526751e-05, + "loss": 0.0877, + "step": 40121 + }, + { + "epoch": 0.7156208753968537, + "grad_norm": 0.24946123361587524, + "learning_rate": 1.1337476845539524e-05, + "loss": 0.0978, + "step": 40122 + }, + { + "epoch": 0.7156387115185674, + "grad_norm": 0.3986477851867676, + "learning_rate": 1.1336173361512946e-05, + "loss": 0.143, + "step": 40123 + }, + { + "epoch": 0.7156565476402811, + "grad_norm": 0.290179044008255, + "learning_rate": 1.1334869930452093e-05, + "loss": 0.1776, + "step": 40124 + }, + { + "epoch": 0.7156743837619948, + "grad_norm": 0.30848413705825806, + "learning_rate": 1.133356655236199e-05, + "loss": 0.1863, + "step": 40125 + }, + { + "epoch": 0.7156922198837085, + "grad_norm": 0.26101839542388916, + "learning_rate": 1.1332263227247717e-05, + "loss": 0.1065, + "step": 40126 + }, + { + "epoch": 0.7157100560054221, + "grad_norm": 0.2588322162628174, + "learning_rate": 1.1330959955114309e-05, + "loss": 0.1127, + "step": 40127 + }, + { + "epoch": 0.7157278921271358, + "grad_norm": 0.20499493181705475, + "learning_rate": 1.1329656735966823e-05, + "loss": 0.0862, + "step": 40128 + }, + { + "epoch": 0.7157457282488495, + "grad_norm": 0.25368282198905945, + "learning_rate": 1.1328353569810307e-05, + "loss": 0.1409, + "step": 40129 + }, + { + "epoch": 0.7157635643705632, + "grad_norm": 0.199868306517601, + "learning_rate": 1.1327050456649807e-05, + "loss": 0.1121, + "step": 40130 + }, + { + "epoch": 0.715781400492277, + "grad_norm": 0.38141289353370667, + "learning_rate": 1.132574739649039e-05, + "loss": 0.1169, + "step": 40131 + }, + { + "epoch": 0.7157992366139907, + "grad_norm": 0.24573341012001038, + "learning_rate": 1.13244443893371e-05, + "loss": 0.1218, + "step": 40132 + }, + { + "epoch": 0.7158170727357044, + "grad_norm": 0.27360770106315613, + "learning_rate": 1.1323141435194987e-05, + "loss": 0.1932, + "step": 40133 + }, + { + "epoch": 0.7158349088574181, + "grad_norm": 0.2645307183265686, + "learning_rate": 1.1321838534069092e-05, + "loss": 0.1524, + "step": 40134 + }, + { + "epoch": 0.7158527449791318, + "grad_norm": 0.24422770738601685, + "learning_rate": 1.1320535685964484e-05, + "loss": 0.1253, + "step": 40135 + }, + { + "epoch": 0.7158705811008454, + "grad_norm": 0.21716158092021942, + "learning_rate": 1.1319232890886197e-05, + "loss": 0.1071, + "step": 40136 + }, + { + "epoch": 0.7158884172225591, + "grad_norm": 0.303070992231369, + "learning_rate": 1.1317930148839295e-05, + "loss": 0.146, + "step": 40137 + }, + { + "epoch": 0.7159062533442728, + "grad_norm": 0.2110755443572998, + "learning_rate": 1.1316627459828813e-05, + "loss": 0.1275, + "step": 40138 + }, + { + "epoch": 0.7159240894659865, + "grad_norm": 0.21533270180225372, + "learning_rate": 1.1315324823859819e-05, + "loss": 0.1129, + "step": 40139 + }, + { + "epoch": 0.7159419255877002, + "grad_norm": 0.3444138169288635, + "learning_rate": 1.1314022240937352e-05, + "loss": 0.1747, + "step": 40140 + }, + { + "epoch": 0.7159597617094139, + "grad_norm": 0.22958432137966156, + "learning_rate": 1.1312719711066463e-05, + "loss": 0.0426, + "step": 40141 + }, + { + "epoch": 0.7159775978311276, + "grad_norm": 0.5284539461135864, + "learning_rate": 1.1311417234252201e-05, + "loss": 0.0995, + "step": 40142 + }, + { + "epoch": 0.7159954339528413, + "grad_norm": 0.1922261267900467, + "learning_rate": 1.13101148104996e-05, + "loss": 0.0755, + "step": 40143 + }, + { + "epoch": 0.716013270074555, + "grad_norm": 0.2351703643798828, + "learning_rate": 1.1308812439813735e-05, + "loss": 0.112, + "step": 40144 + }, + { + "epoch": 0.7160311061962686, + "grad_norm": 0.3151635229587555, + "learning_rate": 1.1307510122199641e-05, + "loss": 0.1527, + "step": 40145 + }, + { + "epoch": 0.7160489423179823, + "grad_norm": 0.22693036496639252, + "learning_rate": 1.1306207857662365e-05, + "loss": 0.1397, + "step": 40146 + }, + { + "epoch": 0.7160667784396961, + "grad_norm": 0.24674342572689056, + "learning_rate": 1.130490564620695e-05, + "loss": 0.0998, + "step": 40147 + }, + { + "epoch": 0.7160846145614098, + "grad_norm": 0.22183175384998322, + "learning_rate": 1.130360348783846e-05, + "loss": 0.1104, + "step": 40148 + }, + { + "epoch": 0.7161024506831235, + "grad_norm": 0.24917374551296234, + "learning_rate": 1.1302301382561934e-05, + "loss": 0.0851, + "step": 40149 + }, + { + "epoch": 0.7161202868048372, + "grad_norm": 0.35170769691467285, + "learning_rate": 1.1300999330382409e-05, + "loss": 0.113, + "step": 40150 + }, + { + "epoch": 0.7161381229265509, + "grad_norm": 0.2818054258823395, + "learning_rate": 1.1299697331304952e-05, + "loss": 0.1032, + "step": 40151 + }, + { + "epoch": 0.7161559590482646, + "grad_norm": 0.34978222846984863, + "learning_rate": 1.1298395385334586e-05, + "loss": 0.1637, + "step": 40152 + }, + { + "epoch": 0.7161737951699783, + "grad_norm": 0.21895340085029602, + "learning_rate": 1.1297093492476387e-05, + "loss": 0.0675, + "step": 40153 + }, + { + "epoch": 0.7161916312916919, + "grad_norm": 0.3641444444656372, + "learning_rate": 1.1295791652735383e-05, + "loss": 0.1401, + "step": 40154 + }, + { + "epoch": 0.7162094674134056, + "grad_norm": 0.21930791437625885, + "learning_rate": 1.1294489866116625e-05, + "loss": 0.0834, + "step": 40155 + }, + { + "epoch": 0.7162273035351193, + "grad_norm": 0.29976096749305725, + "learning_rate": 1.1293188132625149e-05, + "loss": 0.0945, + "step": 40156 + }, + { + "epoch": 0.716245139656833, + "grad_norm": 0.2277435064315796, + "learning_rate": 1.129188645226602e-05, + "loss": 0.1119, + "step": 40157 + }, + { + "epoch": 0.7162629757785467, + "grad_norm": 0.26885366439819336, + "learning_rate": 1.1290584825044273e-05, + "loss": 0.0742, + "step": 40158 + }, + { + "epoch": 0.7162808119002604, + "grad_norm": 0.233631432056427, + "learning_rate": 1.1289283250964955e-05, + "loss": 0.1256, + "step": 40159 + }, + { + "epoch": 0.7162986480219741, + "grad_norm": 0.2285805344581604, + "learning_rate": 1.12879817300331e-05, + "loss": 0.1258, + "step": 40160 + }, + { + "epoch": 0.7163164841436878, + "grad_norm": 0.21788142621517181, + "learning_rate": 1.1286680262253774e-05, + "loss": 0.1005, + "step": 40161 + }, + { + "epoch": 0.7163343202654014, + "grad_norm": 0.21417208015918732, + "learning_rate": 1.1285378847632012e-05, + "loss": 0.0881, + "step": 40162 + }, + { + "epoch": 0.7163521563871151, + "grad_norm": 0.281019002199173, + "learning_rate": 1.1284077486172847e-05, + "loss": 0.1276, + "step": 40163 + }, + { + "epoch": 0.7163699925088289, + "grad_norm": 0.2836749255657196, + "learning_rate": 1.1282776177881346e-05, + "loss": 0.2059, + "step": 40164 + }, + { + "epoch": 0.7163878286305426, + "grad_norm": 0.2243744432926178, + "learning_rate": 1.1281474922762534e-05, + "loss": 0.0766, + "step": 40165 + }, + { + "epoch": 0.7164056647522563, + "grad_norm": 0.27451154589653015, + "learning_rate": 1.128017372082147e-05, + "loss": 0.1142, + "step": 40166 + }, + { + "epoch": 0.71642350087397, + "grad_norm": 0.23932093381881714, + "learning_rate": 1.1278872572063196e-05, + "loss": 0.1255, + "step": 40167 + }, + { + "epoch": 0.7164413369956837, + "grad_norm": 0.24328601360321045, + "learning_rate": 1.1277571476492746e-05, + "loss": 0.1154, + "step": 40168 + }, + { + "epoch": 0.7164591731173974, + "grad_norm": 0.23491215705871582, + "learning_rate": 1.127627043411516e-05, + "loss": 0.0947, + "step": 40169 + }, + { + "epoch": 0.7164770092391111, + "grad_norm": 0.2488335818052292, + "learning_rate": 1.12749694449355e-05, + "loss": 0.1221, + "step": 40170 + }, + { + "epoch": 0.7164948453608248, + "grad_norm": 0.21399731934070587, + "learning_rate": 1.12736685089588e-05, + "loss": 0.0813, + "step": 40171 + }, + { + "epoch": 0.7165126814825384, + "grad_norm": 0.280479371547699, + "learning_rate": 1.1272367626190103e-05, + "loss": 0.1527, + "step": 40172 + }, + { + "epoch": 0.7165305176042521, + "grad_norm": 0.16917753219604492, + "learning_rate": 1.127106679663444e-05, + "loss": 0.0683, + "step": 40173 + }, + { + "epoch": 0.7165483537259658, + "grad_norm": 0.2876274883747101, + "learning_rate": 1.1269766020296872e-05, + "loss": 0.1005, + "step": 40174 + }, + { + "epoch": 0.7165661898476795, + "grad_norm": 0.27877625823020935, + "learning_rate": 1.1268465297182431e-05, + "loss": 0.0939, + "step": 40175 + }, + { + "epoch": 0.7165840259693932, + "grad_norm": 0.2975919246673584, + "learning_rate": 1.1267164627296164e-05, + "loss": 0.1339, + "step": 40176 + }, + { + "epoch": 0.7166018620911069, + "grad_norm": 0.2593155801296234, + "learning_rate": 1.1265864010643101e-05, + "loss": 0.1245, + "step": 40177 + }, + { + "epoch": 0.7166196982128206, + "grad_norm": 0.24974867701530457, + "learning_rate": 1.1264563447228301e-05, + "loss": 0.1462, + "step": 40178 + }, + { + "epoch": 0.7166375343345343, + "grad_norm": 0.2695964276790619, + "learning_rate": 1.126326293705679e-05, + "loss": 0.1151, + "step": 40179 + }, + { + "epoch": 0.7166553704562479, + "grad_norm": 0.2947452962398529, + "learning_rate": 1.1261962480133626e-05, + "loss": 0.0953, + "step": 40180 + }, + { + "epoch": 0.7166732065779617, + "grad_norm": 0.25085800886154175, + "learning_rate": 1.1260662076463837e-05, + "loss": 0.1332, + "step": 40181 + }, + { + "epoch": 0.7166910426996754, + "grad_norm": 0.33529868721961975, + "learning_rate": 1.1259361726052459e-05, + "loss": 0.1467, + "step": 40182 + }, + { + "epoch": 0.7167088788213891, + "grad_norm": 0.3045383393764496, + "learning_rate": 1.1258061428904552e-05, + "loss": 0.1027, + "step": 40183 + }, + { + "epoch": 0.7167267149431028, + "grad_norm": 0.29902833700180054, + "learning_rate": 1.1256761185025142e-05, + "loss": 0.1416, + "step": 40184 + }, + { + "epoch": 0.7167445510648165, + "grad_norm": 0.23123466968536377, + "learning_rate": 1.1255460994419275e-05, + "loss": 0.1472, + "step": 40185 + }, + { + "epoch": 0.7167623871865302, + "grad_norm": 0.23703597486019135, + "learning_rate": 1.1254160857091977e-05, + "loss": 0.1014, + "step": 40186 + }, + { + "epoch": 0.7167802233082439, + "grad_norm": 0.29561474919319153, + "learning_rate": 1.125286077304831e-05, + "loss": 0.1523, + "step": 40187 + }, + { + "epoch": 0.7167980594299576, + "grad_norm": 0.22643722593784332, + "learning_rate": 1.1251560742293305e-05, + "loss": 0.1118, + "step": 40188 + }, + { + "epoch": 0.7168158955516712, + "grad_norm": 0.3007449805736542, + "learning_rate": 1.1250260764831993e-05, + "loss": 0.1288, + "step": 40189 + }, + { + "epoch": 0.7168337316733849, + "grad_norm": 0.21088233590126038, + "learning_rate": 1.1248960840669415e-05, + "loss": 0.1391, + "step": 40190 + }, + { + "epoch": 0.7168515677950986, + "grad_norm": 0.26974600553512573, + "learning_rate": 1.1247660969810622e-05, + "loss": 0.1323, + "step": 40191 + }, + { + "epoch": 0.7168694039168123, + "grad_norm": 0.28961166739463806, + "learning_rate": 1.1246361152260636e-05, + "loss": 0.1363, + "step": 40192 + }, + { + "epoch": 0.716887240038526, + "grad_norm": 0.2893197238445282, + "learning_rate": 1.1245061388024514e-05, + "loss": 0.1365, + "step": 40193 + }, + { + "epoch": 0.7169050761602397, + "grad_norm": 0.29267364740371704, + "learning_rate": 1.1243761677107285e-05, + "loss": 0.1177, + "step": 40194 + }, + { + "epoch": 0.7169229122819534, + "grad_norm": 0.34230703115463257, + "learning_rate": 1.1242462019513978e-05, + "loss": 0.1039, + "step": 40195 + }, + { + "epoch": 0.7169407484036671, + "grad_norm": 0.27069658041000366, + "learning_rate": 1.1241162415249648e-05, + "loss": 0.111, + "step": 40196 + }, + { + "epoch": 0.7169585845253807, + "grad_norm": 0.23034998774528503, + "learning_rate": 1.1239862864319326e-05, + "loss": 0.1025, + "step": 40197 + }, + { + "epoch": 0.7169764206470945, + "grad_norm": 0.20744100213050842, + "learning_rate": 1.1238563366728044e-05, + "loss": 0.1237, + "step": 40198 + }, + { + "epoch": 0.7169942567688082, + "grad_norm": 0.29052066802978516, + "learning_rate": 1.1237263922480839e-05, + "loss": 0.1711, + "step": 40199 + }, + { + "epoch": 0.7170120928905219, + "grad_norm": 0.23305657505989075, + "learning_rate": 1.123596453158276e-05, + "loss": 0.0902, + "step": 40200 + }, + { + "epoch": 0.7170299290122356, + "grad_norm": 0.1973293125629425, + "learning_rate": 1.1234665194038838e-05, + "loss": 0.1053, + "step": 40201 + }, + { + "epoch": 0.7170477651339493, + "grad_norm": 0.2828347980976105, + "learning_rate": 1.1233365909854104e-05, + "loss": 0.1023, + "step": 40202 + }, + { + "epoch": 0.717065601255663, + "grad_norm": 0.33049723505973816, + "learning_rate": 1.1232066679033592e-05, + "loss": 0.1449, + "step": 40203 + }, + { + "epoch": 0.7170834373773767, + "grad_norm": 0.2718612551689148, + "learning_rate": 1.1230767501582356e-05, + "loss": 0.1625, + "step": 40204 + }, + { + "epoch": 0.7171012734990904, + "grad_norm": 0.4590749740600586, + "learning_rate": 1.1229468377505418e-05, + "loss": 0.1576, + "step": 40205 + }, + { + "epoch": 0.717119109620804, + "grad_norm": 0.2407943159341812, + "learning_rate": 1.1228169306807809e-05, + "loss": 0.0804, + "step": 40206 + }, + { + "epoch": 0.7171369457425177, + "grad_norm": 0.27960464358329773, + "learning_rate": 1.1226870289494581e-05, + "loss": 0.1319, + "step": 40207 + }, + { + "epoch": 0.7171547818642314, + "grad_norm": 0.23117999732494354, + "learning_rate": 1.1225571325570753e-05, + "loss": 0.0736, + "step": 40208 + }, + { + "epoch": 0.7171726179859451, + "grad_norm": 0.2947548031806946, + "learning_rate": 1.1224272415041376e-05, + "loss": 0.1804, + "step": 40209 + }, + { + "epoch": 0.7171904541076588, + "grad_norm": 0.31633463501930237, + "learning_rate": 1.1222973557911477e-05, + "loss": 0.1715, + "step": 40210 + }, + { + "epoch": 0.7172082902293725, + "grad_norm": 0.25023147463798523, + "learning_rate": 1.122167475418609e-05, + "loss": 0.1319, + "step": 40211 + }, + { + "epoch": 0.7172261263510862, + "grad_norm": 0.19847945868968964, + "learning_rate": 1.1220376003870242e-05, + "loss": 0.0821, + "step": 40212 + }, + { + "epoch": 0.7172439624727999, + "grad_norm": 0.18981978297233582, + "learning_rate": 1.1219077306968987e-05, + "loss": 0.0881, + "step": 40213 + }, + { + "epoch": 0.7172617985945136, + "grad_norm": 0.33658257126808167, + "learning_rate": 1.1217778663487346e-05, + "loss": 0.1334, + "step": 40214 + }, + { + "epoch": 0.7172796347162274, + "grad_norm": 0.23656687140464783, + "learning_rate": 1.1216480073430356e-05, + "loss": 0.1568, + "step": 40215 + }, + { + "epoch": 0.717297470837941, + "grad_norm": 0.31604477763175964, + "learning_rate": 1.1215181536803041e-05, + "loss": 0.0778, + "step": 40216 + }, + { + "epoch": 0.7173153069596547, + "grad_norm": 0.29235780239105225, + "learning_rate": 1.1213883053610453e-05, + "loss": 0.1157, + "step": 40217 + }, + { + "epoch": 0.7173331430813684, + "grad_norm": 0.258181631565094, + "learning_rate": 1.1212584623857614e-05, + "loss": 0.1042, + "step": 40218 + }, + { + "epoch": 0.7173509792030821, + "grad_norm": 0.24919407069683075, + "learning_rate": 1.1211286247549549e-05, + "loss": 0.1175, + "step": 40219 + }, + { + "epoch": 0.7173688153247958, + "grad_norm": 0.223277747631073, + "learning_rate": 1.1209987924691312e-05, + "loss": 0.095, + "step": 40220 + }, + { + "epoch": 0.7173866514465095, + "grad_norm": 0.3121904134750366, + "learning_rate": 1.1208689655287915e-05, + "loss": 0.1349, + "step": 40221 + }, + { + "epoch": 0.7174044875682232, + "grad_norm": 0.38540464639663696, + "learning_rate": 1.1207391439344412e-05, + "loss": 0.1131, + "step": 40222 + }, + { + "epoch": 0.7174223236899369, + "grad_norm": 0.25548434257507324, + "learning_rate": 1.120609327686582e-05, + "loss": 0.1365, + "step": 40223 + }, + { + "epoch": 0.7174401598116505, + "grad_norm": 0.2876202464103699, + "learning_rate": 1.1204795167857176e-05, + "loss": 0.1263, + "step": 40224 + }, + { + "epoch": 0.7174579959333642, + "grad_norm": 0.2345791757106781, + "learning_rate": 1.1203497112323501e-05, + "loss": 0.1163, + "step": 40225 + }, + { + "epoch": 0.7174758320550779, + "grad_norm": 0.33381974697113037, + "learning_rate": 1.1202199110269847e-05, + "loss": 0.0909, + "step": 40226 + }, + { + "epoch": 0.7174936681767916, + "grad_norm": 0.30950236320495605, + "learning_rate": 1.1200901161701235e-05, + "loss": 0.1571, + "step": 40227 + }, + { + "epoch": 0.7175115042985053, + "grad_norm": 0.2546635568141937, + "learning_rate": 1.1199603266622696e-05, + "loss": 0.1043, + "step": 40228 + }, + { + "epoch": 0.717529340420219, + "grad_norm": 0.3141431212425232, + "learning_rate": 1.1198305425039252e-05, + "loss": 0.0736, + "step": 40229 + }, + { + "epoch": 0.7175471765419327, + "grad_norm": 0.39999258518218994, + "learning_rate": 1.1197007636955953e-05, + "loss": 0.133, + "step": 40230 + }, + { + "epoch": 0.7175650126636464, + "grad_norm": 0.25507768988609314, + "learning_rate": 1.1195709902377819e-05, + "loss": 0.1267, + "step": 40231 + }, + { + "epoch": 0.7175828487853602, + "grad_norm": 0.2966732382774353, + "learning_rate": 1.119441222130988e-05, + "loss": 0.1423, + "step": 40232 + }, + { + "epoch": 0.7176006849070738, + "grad_norm": 0.26241880655288696, + "learning_rate": 1.119311459375716e-05, + "loss": 0.1083, + "step": 40233 + }, + { + "epoch": 0.7176185210287875, + "grad_norm": 0.25376299023628235, + "learning_rate": 1.1191817019724704e-05, + "loss": 0.1143, + "step": 40234 + }, + { + "epoch": 0.7176363571505012, + "grad_norm": 0.32532167434692383, + "learning_rate": 1.1190519499217525e-05, + "loss": 0.1215, + "step": 40235 + }, + { + "epoch": 0.7176541932722149, + "grad_norm": 0.26989617943763733, + "learning_rate": 1.1189222032240672e-05, + "loss": 0.0771, + "step": 40236 + }, + { + "epoch": 0.7176720293939286, + "grad_norm": 0.21981695294380188, + "learning_rate": 1.1187924618799165e-05, + "loss": 0.0744, + "step": 40237 + }, + { + "epoch": 0.7176898655156423, + "grad_norm": 0.22327512502670288, + "learning_rate": 1.1186627258898025e-05, + "loss": 0.1102, + "step": 40238 + }, + { + "epoch": 0.717707701637356, + "grad_norm": 0.2637295126914978, + "learning_rate": 1.1185329952542296e-05, + "loss": 0.1365, + "step": 40239 + }, + { + "epoch": 0.7177255377590697, + "grad_norm": 0.23144227266311646, + "learning_rate": 1.1184032699736998e-05, + "loss": 0.1464, + "step": 40240 + }, + { + "epoch": 0.7177433738807834, + "grad_norm": 0.24773211777210236, + "learning_rate": 1.1182735500487162e-05, + "loss": 0.1303, + "step": 40241 + }, + { + "epoch": 0.717761210002497, + "grad_norm": 0.24347743391990662, + "learning_rate": 1.118143835479781e-05, + "loss": 0.1105, + "step": 40242 + }, + { + "epoch": 0.7177790461242107, + "grad_norm": 0.24755750596523285, + "learning_rate": 1.1180141262673982e-05, + "loss": 0.0882, + "step": 40243 + }, + { + "epoch": 0.7177968822459244, + "grad_norm": 0.3391305208206177, + "learning_rate": 1.1178844224120699e-05, + "loss": 0.1061, + "step": 40244 + }, + { + "epoch": 0.7178147183676381, + "grad_norm": 0.2461908459663391, + "learning_rate": 1.1177547239142991e-05, + "loss": 0.11, + "step": 40245 + }, + { + "epoch": 0.7178325544893518, + "grad_norm": 0.2127000242471695, + "learning_rate": 1.1176250307745875e-05, + "loss": 0.0986, + "step": 40246 + }, + { + "epoch": 0.7178503906110655, + "grad_norm": 0.2532394826412201, + "learning_rate": 1.1174953429934395e-05, + "loss": 0.145, + "step": 40247 + }, + { + "epoch": 0.7178682267327793, + "grad_norm": 0.2500765025615692, + "learning_rate": 1.1173656605713561e-05, + "loss": 0.1241, + "step": 40248 + }, + { + "epoch": 0.717886062854493, + "grad_norm": 0.27345430850982666, + "learning_rate": 1.117235983508842e-05, + "loss": 0.0909, + "step": 40249 + }, + { + "epoch": 0.7179038989762067, + "grad_norm": 0.2972443103790283, + "learning_rate": 1.1171063118063988e-05, + "loss": 0.1465, + "step": 40250 + }, + { + "epoch": 0.7179217350979203, + "grad_norm": 0.25764113664627075, + "learning_rate": 1.1169766454645283e-05, + "loss": 0.1082, + "step": 40251 + }, + { + "epoch": 0.717939571219634, + "grad_norm": 0.2752144932746887, + "learning_rate": 1.1168469844837348e-05, + "loss": 0.1096, + "step": 40252 + }, + { + "epoch": 0.7179574073413477, + "grad_norm": 0.2440723180770874, + "learning_rate": 1.1167173288645203e-05, + "loss": 0.1321, + "step": 40253 + }, + { + "epoch": 0.7179752434630614, + "grad_norm": 0.24367555975914001, + "learning_rate": 1.116587678607387e-05, + "loss": 0.1277, + "step": 40254 + }, + { + "epoch": 0.7179930795847751, + "grad_norm": 0.3189479410648346, + "learning_rate": 1.116458033712837e-05, + "loss": 0.1129, + "step": 40255 + }, + { + "epoch": 0.7180109157064888, + "grad_norm": 0.31693968176841736, + "learning_rate": 1.1163283941813742e-05, + "loss": 0.1074, + "step": 40256 + }, + { + "epoch": 0.7180287518282025, + "grad_norm": 0.2833858132362366, + "learning_rate": 1.1161987600135005e-05, + "loss": 0.1063, + "step": 40257 + }, + { + "epoch": 0.7180465879499162, + "grad_norm": 0.24045118689537048, + "learning_rate": 1.1160691312097186e-05, + "loss": 0.1226, + "step": 40258 + }, + { + "epoch": 0.7180644240716298, + "grad_norm": 0.17967744171619415, + "learning_rate": 1.1159395077705303e-05, + "loss": 0.077, + "step": 40259 + }, + { + "epoch": 0.7180822601933435, + "grad_norm": 0.386762410402298, + "learning_rate": 1.1158098896964378e-05, + "loss": 0.142, + "step": 40260 + }, + { + "epoch": 0.7181000963150572, + "grad_norm": 0.2182648628950119, + "learning_rate": 1.1156802769879452e-05, + "loss": 0.1361, + "step": 40261 + }, + { + "epoch": 0.7181179324367709, + "grad_norm": 0.23098449409008026, + "learning_rate": 1.115550669645553e-05, + "loss": 0.1094, + "step": 40262 + }, + { + "epoch": 0.7181357685584846, + "grad_norm": 0.28771594166755676, + "learning_rate": 1.1154210676697657e-05, + "loss": 0.1472, + "step": 40263 + }, + { + "epoch": 0.7181536046801983, + "grad_norm": 0.2458367645740509, + "learning_rate": 1.1152914710610835e-05, + "loss": 0.136, + "step": 40264 + }, + { + "epoch": 0.7181714408019121, + "grad_norm": 0.2774643003940582, + "learning_rate": 1.1151618798200106e-05, + "loss": 0.1222, + "step": 40265 + }, + { + "epoch": 0.7181892769236258, + "grad_norm": 0.3647003769874573, + "learning_rate": 1.1150322939470487e-05, + "loss": 0.1391, + "step": 40266 + }, + { + "epoch": 0.7182071130453395, + "grad_norm": 0.20595820248126984, + "learning_rate": 1.1149027134427e-05, + "loss": 0.1039, + "step": 40267 + }, + { + "epoch": 0.7182249491670532, + "grad_norm": 0.27136728167533875, + "learning_rate": 1.1147731383074658e-05, + "loss": 0.0996, + "step": 40268 + }, + { + "epoch": 0.7182427852887668, + "grad_norm": 0.3267781436443329, + "learning_rate": 1.1146435685418501e-05, + "loss": 0.0876, + "step": 40269 + }, + { + "epoch": 0.7182606214104805, + "grad_norm": 0.2590530812740326, + "learning_rate": 1.1145140041463547e-05, + "loss": 0.1312, + "step": 40270 + }, + { + "epoch": 0.7182784575321942, + "grad_norm": 0.22409094870090485, + "learning_rate": 1.1143844451214816e-05, + "loss": 0.0969, + "step": 40271 + }, + { + "epoch": 0.7182962936539079, + "grad_norm": 0.29956942796707153, + "learning_rate": 1.1142548914677329e-05, + "loss": 0.0724, + "step": 40272 + }, + { + "epoch": 0.7183141297756216, + "grad_norm": 0.24620793759822845, + "learning_rate": 1.1141253431856097e-05, + "loss": 0.1015, + "step": 40273 + }, + { + "epoch": 0.7183319658973353, + "grad_norm": 0.2617422640323639, + "learning_rate": 1.1139958002756166e-05, + "loss": 0.1041, + "step": 40274 + }, + { + "epoch": 0.718349802019049, + "grad_norm": 0.37246477603912354, + "learning_rate": 1.1138662627382535e-05, + "loss": 0.13, + "step": 40275 + }, + { + "epoch": 0.7183676381407627, + "grad_norm": 0.34235435724258423, + "learning_rate": 1.1137367305740243e-05, + "loss": 0.1141, + "step": 40276 + }, + { + "epoch": 0.7183854742624763, + "grad_norm": 0.2520189881324768, + "learning_rate": 1.1136072037834294e-05, + "loss": 0.1329, + "step": 40277 + }, + { + "epoch": 0.71840331038419, + "grad_norm": 0.33708319067955017, + "learning_rate": 1.1134776823669727e-05, + "loss": 0.1748, + "step": 40278 + }, + { + "epoch": 0.7184211465059037, + "grad_norm": 0.3198677897453308, + "learning_rate": 1.1133481663251556e-05, + "loss": 0.1373, + "step": 40279 + }, + { + "epoch": 0.7184389826276174, + "grad_norm": 0.21848969161510468, + "learning_rate": 1.1132186556584797e-05, + "loss": 0.0704, + "step": 40280 + }, + { + "epoch": 0.7184568187493311, + "grad_norm": 0.33849579095840454, + "learning_rate": 1.1130891503674465e-05, + "loss": 0.1301, + "step": 40281 + }, + { + "epoch": 0.7184746548710449, + "grad_norm": 0.25301992893218994, + "learning_rate": 1.1129596504525597e-05, + "loss": 0.1467, + "step": 40282 + }, + { + "epoch": 0.7184924909927586, + "grad_norm": 0.23240190744400024, + "learning_rate": 1.1128301559143204e-05, + "loss": 0.1475, + "step": 40283 + }, + { + "epoch": 0.7185103271144723, + "grad_norm": 0.3365060091018677, + "learning_rate": 1.1127006667532305e-05, + "loss": 0.1757, + "step": 40284 + }, + { + "epoch": 0.718528163236186, + "grad_norm": 0.2211749404668808, + "learning_rate": 1.112571182969792e-05, + "loss": 0.0343, + "step": 40285 + }, + { + "epoch": 0.7185459993578996, + "grad_norm": 0.31455111503601074, + "learning_rate": 1.1124417045645055e-05, + "loss": 0.0993, + "step": 40286 + }, + { + "epoch": 0.7185638354796133, + "grad_norm": 0.3140621781349182, + "learning_rate": 1.1123122315378756e-05, + "loss": 0.1227, + "step": 40287 + }, + { + "epoch": 0.718581671601327, + "grad_norm": 0.3453923463821411, + "learning_rate": 1.1121827638904026e-05, + "loss": 0.2112, + "step": 40288 + }, + { + "epoch": 0.7185995077230407, + "grad_norm": 0.26226431131362915, + "learning_rate": 1.1120533016225877e-05, + "loss": 0.1317, + "step": 40289 + }, + { + "epoch": 0.7186173438447544, + "grad_norm": 0.2679152488708496, + "learning_rate": 1.1119238447349334e-05, + "loss": 0.1146, + "step": 40290 + }, + { + "epoch": 0.7186351799664681, + "grad_norm": 0.2937815189361572, + "learning_rate": 1.1117943932279429e-05, + "loss": 0.1312, + "step": 40291 + }, + { + "epoch": 0.7186530160881818, + "grad_norm": 0.19895359873771667, + "learning_rate": 1.1116649471021165e-05, + "loss": 0.1218, + "step": 40292 + }, + { + "epoch": 0.7186708522098955, + "grad_norm": 0.35976168513298035, + "learning_rate": 1.1115355063579566e-05, + "loss": 0.125, + "step": 40293 + }, + { + "epoch": 0.7186886883316091, + "grad_norm": 0.29435932636260986, + "learning_rate": 1.1114060709959635e-05, + "loss": 0.1581, + "step": 40294 + }, + { + "epoch": 0.7187065244533228, + "grad_norm": 0.2642267048358917, + "learning_rate": 1.1112766410166411e-05, + "loss": 0.1293, + "step": 40295 + }, + { + "epoch": 0.7187243605750365, + "grad_norm": 0.25632554292678833, + "learning_rate": 1.11114721642049e-05, + "loss": 0.1607, + "step": 40296 + }, + { + "epoch": 0.7187421966967502, + "grad_norm": 0.3137747645378113, + "learning_rate": 1.1110177972080121e-05, + "loss": 0.1254, + "step": 40297 + }, + { + "epoch": 0.7187600328184639, + "grad_norm": 0.17836590111255646, + "learning_rate": 1.1108883833797088e-05, + "loss": 0.1025, + "step": 40298 + }, + { + "epoch": 0.7187778689401777, + "grad_norm": 0.27142930030822754, + "learning_rate": 1.110758974936081e-05, + "loss": 0.1639, + "step": 40299 + }, + { + "epoch": 0.7187957050618914, + "grad_norm": 0.2555834650993347, + "learning_rate": 1.1106295718776322e-05, + "loss": 0.1029, + "step": 40300 + }, + { + "epoch": 0.7188135411836051, + "grad_norm": 0.28176093101501465, + "learning_rate": 1.1105001742048632e-05, + "loss": 0.1062, + "step": 40301 + }, + { + "epoch": 0.7188313773053188, + "grad_norm": 0.27635452151298523, + "learning_rate": 1.1103707819182744e-05, + "loss": 0.1035, + "step": 40302 + }, + { + "epoch": 0.7188492134270325, + "grad_norm": 0.21727557480335236, + "learning_rate": 1.1102413950183692e-05, + "loss": 0.0906, + "step": 40303 + }, + { + "epoch": 0.7188670495487461, + "grad_norm": 0.27734559774398804, + "learning_rate": 1.1101120135056479e-05, + "loss": 0.1495, + "step": 40304 + }, + { + "epoch": 0.7188848856704598, + "grad_norm": 0.20844654738903046, + "learning_rate": 1.109982637380613e-05, + "loss": 0.0864, + "step": 40305 + }, + { + "epoch": 0.7189027217921735, + "grad_norm": 0.2925710678100586, + "learning_rate": 1.1098532666437655e-05, + "loss": 0.0967, + "step": 40306 + }, + { + "epoch": 0.7189205579138872, + "grad_norm": 0.20547480881214142, + "learning_rate": 1.109723901295606e-05, + "loss": 0.0975, + "step": 40307 + }, + { + "epoch": 0.7189383940356009, + "grad_norm": 0.23280969262123108, + "learning_rate": 1.109594541336638e-05, + "loss": 0.0827, + "step": 40308 + }, + { + "epoch": 0.7189562301573146, + "grad_norm": 0.24231484532356262, + "learning_rate": 1.1094651867673614e-05, + "loss": 0.0923, + "step": 40309 + }, + { + "epoch": 0.7189740662790283, + "grad_norm": 0.24786196649074554, + "learning_rate": 1.1093358375882781e-05, + "loss": 0.1084, + "step": 40310 + }, + { + "epoch": 0.718991902400742, + "grad_norm": 0.2697654664516449, + "learning_rate": 1.1092064937998897e-05, + "loss": 0.1326, + "step": 40311 + }, + { + "epoch": 0.7190097385224556, + "grad_norm": 0.22581391036510468, + "learning_rate": 1.109077155402696e-05, + "loss": 0.1144, + "step": 40312 + }, + { + "epoch": 0.7190275746441693, + "grad_norm": 0.2712780237197876, + "learning_rate": 1.1089478223972007e-05, + "loss": 0.1329, + "step": 40313 + }, + { + "epoch": 0.719045410765883, + "grad_norm": 0.24814161658287048, + "learning_rate": 1.1088184947839043e-05, + "loss": 0.1095, + "step": 40314 + }, + { + "epoch": 0.7190632468875967, + "grad_norm": 0.22884242236614227, + "learning_rate": 1.1086891725633078e-05, + "loss": 0.0957, + "step": 40315 + }, + { + "epoch": 0.7190810830093105, + "grad_norm": 0.2308569699525833, + "learning_rate": 1.1085598557359117e-05, + "loss": 0.0858, + "step": 40316 + }, + { + "epoch": 0.7190989191310242, + "grad_norm": 0.2363572120666504, + "learning_rate": 1.108430544302218e-05, + "loss": 0.0823, + "step": 40317 + }, + { + "epoch": 0.7191167552527379, + "grad_norm": 0.2792021930217743, + "learning_rate": 1.1083012382627295e-05, + "loss": 0.0868, + "step": 40318 + }, + { + "epoch": 0.7191345913744516, + "grad_norm": 0.283976674079895, + "learning_rate": 1.1081719376179456e-05, + "loss": 0.1306, + "step": 40319 + }, + { + "epoch": 0.7191524274961653, + "grad_norm": 0.2579628527164459, + "learning_rate": 1.1080426423683674e-05, + "loss": 0.1174, + "step": 40320 + }, + { + "epoch": 0.719170263617879, + "grad_norm": 0.3998417556285858, + "learning_rate": 1.1079133525144975e-05, + "loss": 0.1084, + "step": 40321 + }, + { + "epoch": 0.7191880997395926, + "grad_norm": 0.3299851417541504, + "learning_rate": 1.1077840680568361e-05, + "loss": 0.147, + "step": 40322 + }, + { + "epoch": 0.7192059358613063, + "grad_norm": 0.24805738031864166, + "learning_rate": 1.1076547889958846e-05, + "loss": 0.1008, + "step": 40323 + }, + { + "epoch": 0.71922377198302, + "grad_norm": 0.257362961769104, + "learning_rate": 1.107525515332144e-05, + "loss": 0.1068, + "step": 40324 + }, + { + "epoch": 0.7192416081047337, + "grad_norm": 0.23313094675540924, + "learning_rate": 1.1073962470661147e-05, + "loss": 0.0942, + "step": 40325 + }, + { + "epoch": 0.7192594442264474, + "grad_norm": 0.26476219296455383, + "learning_rate": 1.1072669841982995e-05, + "loss": 0.0917, + "step": 40326 + }, + { + "epoch": 0.7192772803481611, + "grad_norm": 0.266126811504364, + "learning_rate": 1.1071377267291983e-05, + "loss": 0.0904, + "step": 40327 + }, + { + "epoch": 0.7192951164698748, + "grad_norm": 0.24406197667121887, + "learning_rate": 1.1070084746593124e-05, + "loss": 0.1125, + "step": 40328 + }, + { + "epoch": 0.7193129525915885, + "grad_norm": 0.2179514616727829, + "learning_rate": 1.1068792279891419e-05, + "loss": 0.1357, + "step": 40329 + }, + { + "epoch": 0.7193307887133021, + "grad_norm": 0.2856987416744232, + "learning_rate": 1.1067499867191894e-05, + "loss": 0.1222, + "step": 40330 + }, + { + "epoch": 0.7193486248350158, + "grad_norm": 0.26420894265174866, + "learning_rate": 1.1066207508499547e-05, + "loss": 0.0908, + "step": 40331 + }, + { + "epoch": 0.7193664609567295, + "grad_norm": 0.20407073199748993, + "learning_rate": 1.1064915203819396e-05, + "loss": 0.1206, + "step": 40332 + }, + { + "epoch": 0.7193842970784433, + "grad_norm": 0.3084770441055298, + "learning_rate": 1.1063622953156442e-05, + "loss": 0.1834, + "step": 40333 + }, + { + "epoch": 0.719402133200157, + "grad_norm": 0.1914638727903366, + "learning_rate": 1.1062330756515704e-05, + "loss": 0.0983, + "step": 40334 + }, + { + "epoch": 0.7194199693218707, + "grad_norm": 0.38346391916275024, + "learning_rate": 1.1061038613902189e-05, + "loss": 0.1813, + "step": 40335 + }, + { + "epoch": 0.7194378054435844, + "grad_norm": 0.28849998116493225, + "learning_rate": 1.1059746525320902e-05, + "loss": 0.0853, + "step": 40336 + }, + { + "epoch": 0.7194556415652981, + "grad_norm": 0.22317031025886536, + "learning_rate": 1.1058454490776851e-05, + "loss": 0.0988, + "step": 40337 + }, + { + "epoch": 0.7194734776870118, + "grad_norm": 0.21158041059970856, + "learning_rate": 1.1057162510275037e-05, + "loss": 0.1033, + "step": 40338 + }, + { + "epoch": 0.7194913138087254, + "grad_norm": 0.24587425589561462, + "learning_rate": 1.1055870583820488e-05, + "loss": 0.0759, + "step": 40339 + }, + { + "epoch": 0.7195091499304391, + "grad_norm": 0.2673921287059784, + "learning_rate": 1.1054578711418197e-05, + "loss": 0.0978, + "step": 40340 + }, + { + "epoch": 0.7195269860521528, + "grad_norm": 0.297579288482666, + "learning_rate": 1.105328689307318e-05, + "loss": 0.0778, + "step": 40341 + }, + { + "epoch": 0.7195448221738665, + "grad_norm": 0.23593173921108246, + "learning_rate": 1.105199512879043e-05, + "loss": 0.1001, + "step": 40342 + }, + { + "epoch": 0.7195626582955802, + "grad_norm": 0.2672853171825409, + "learning_rate": 1.105070341857497e-05, + "loss": 0.1103, + "step": 40343 + }, + { + "epoch": 0.7195804944172939, + "grad_norm": 0.22973723709583282, + "learning_rate": 1.1049411762431804e-05, + "loss": 0.1147, + "step": 40344 + }, + { + "epoch": 0.7195983305390076, + "grad_norm": 0.292667031288147, + "learning_rate": 1.1048120160365927e-05, + "loss": 0.1133, + "step": 40345 + }, + { + "epoch": 0.7196161666607213, + "grad_norm": 0.25625690817832947, + "learning_rate": 1.1046828612382353e-05, + "loss": 0.0992, + "step": 40346 + }, + { + "epoch": 0.719634002782435, + "grad_norm": 0.2784973084926605, + "learning_rate": 1.1045537118486105e-05, + "loss": 0.1333, + "step": 40347 + }, + { + "epoch": 0.7196518389041486, + "grad_norm": 0.2758970260620117, + "learning_rate": 1.1044245678682169e-05, + "loss": 0.1686, + "step": 40348 + }, + { + "epoch": 0.7196696750258624, + "grad_norm": 0.2847234606742859, + "learning_rate": 1.104295429297556e-05, + "loss": 0.123, + "step": 40349 + }, + { + "epoch": 0.7196875111475761, + "grad_norm": 0.2947148084640503, + "learning_rate": 1.104166296137128e-05, + "loss": 0.1427, + "step": 40350 + }, + { + "epoch": 0.7197053472692898, + "grad_norm": 0.32902270555496216, + "learning_rate": 1.1040371683874326e-05, + "loss": 0.1188, + "step": 40351 + }, + { + "epoch": 0.7197231833910035, + "grad_norm": 0.22397801280021667, + "learning_rate": 1.1039080460489724e-05, + "loss": 0.0989, + "step": 40352 + }, + { + "epoch": 0.7197410195127172, + "grad_norm": 0.2822009027004242, + "learning_rate": 1.1037789291222466e-05, + "loss": 0.0844, + "step": 40353 + }, + { + "epoch": 0.7197588556344309, + "grad_norm": 0.2531472444534302, + "learning_rate": 1.1036498176077558e-05, + "loss": 0.0958, + "step": 40354 + }, + { + "epoch": 0.7197766917561446, + "grad_norm": 0.21755728125572205, + "learning_rate": 1.103520711506e-05, + "loss": 0.1121, + "step": 40355 + }, + { + "epoch": 0.7197945278778582, + "grad_norm": 0.24625883996486664, + "learning_rate": 1.103391610817481e-05, + "loss": 0.1189, + "step": 40356 + }, + { + "epoch": 0.7198123639995719, + "grad_norm": 0.28660720586776733, + "learning_rate": 1.1032625155426985e-05, + "loss": 0.1818, + "step": 40357 + }, + { + "epoch": 0.7198302001212856, + "grad_norm": 0.23317407071590424, + "learning_rate": 1.1031334256821521e-05, + "loss": 0.1241, + "step": 40358 + }, + { + "epoch": 0.7198480362429993, + "grad_norm": 0.335793137550354, + "learning_rate": 1.1030043412363439e-05, + "loss": 0.1087, + "step": 40359 + }, + { + "epoch": 0.719865872364713, + "grad_norm": 0.2195972353219986, + "learning_rate": 1.1028752622057722e-05, + "loss": 0.094, + "step": 40360 + }, + { + "epoch": 0.7198837084864267, + "grad_norm": 0.392493337392807, + "learning_rate": 1.1027461885909395e-05, + "loss": 0.1382, + "step": 40361 + }, + { + "epoch": 0.7199015446081404, + "grad_norm": 0.34013909101486206, + "learning_rate": 1.1026171203923455e-05, + "loss": 0.1696, + "step": 40362 + }, + { + "epoch": 0.7199193807298541, + "grad_norm": 0.2574087381362915, + "learning_rate": 1.1024880576104899e-05, + "loss": 0.129, + "step": 40363 + }, + { + "epoch": 0.7199372168515678, + "grad_norm": 0.32612553238868713, + "learning_rate": 1.1023590002458725e-05, + "loss": 0.1347, + "step": 40364 + }, + { + "epoch": 0.7199550529732814, + "grad_norm": 0.27196574211120605, + "learning_rate": 1.102229948298995e-05, + "loss": 0.1251, + "step": 40365 + }, + { + "epoch": 0.7199728890949952, + "grad_norm": 0.25601544976234436, + "learning_rate": 1.1021009017703573e-05, + "loss": 0.0676, + "step": 40366 + }, + { + "epoch": 0.7199907252167089, + "grad_norm": 0.3186188042163849, + "learning_rate": 1.1019718606604593e-05, + "loss": 0.1648, + "step": 40367 + }, + { + "epoch": 0.7200085613384226, + "grad_norm": 0.27588871121406555, + "learning_rate": 1.1018428249698e-05, + "loss": 0.1284, + "step": 40368 + }, + { + "epoch": 0.7200263974601363, + "grad_norm": 0.19100086390972137, + "learning_rate": 1.101713794698882e-05, + "loss": 0.0657, + "step": 40369 + }, + { + "epoch": 0.72004423358185, + "grad_norm": 0.2826154828071594, + "learning_rate": 1.1015847698482043e-05, + "loss": 0.1151, + "step": 40370 + }, + { + "epoch": 0.7200620697035637, + "grad_norm": 0.28423142433166504, + "learning_rate": 1.101455750418267e-05, + "loss": 0.125, + "step": 40371 + }, + { + "epoch": 0.7200799058252774, + "grad_norm": 0.30817824602127075, + "learning_rate": 1.1013267364095694e-05, + "loss": 0.1409, + "step": 40372 + }, + { + "epoch": 0.720097741946991, + "grad_norm": 0.2409517616033554, + "learning_rate": 1.1011977278226124e-05, + "loss": 0.119, + "step": 40373 + }, + { + "epoch": 0.7201155780687047, + "grad_norm": 0.21675218641757965, + "learning_rate": 1.101068724657897e-05, + "loss": 0.0839, + "step": 40374 + }, + { + "epoch": 0.7201334141904184, + "grad_norm": 0.293465793132782, + "learning_rate": 1.1009397269159224e-05, + "loss": 0.1013, + "step": 40375 + }, + { + "epoch": 0.7201512503121321, + "grad_norm": 0.3176478445529938, + "learning_rate": 1.1008107345971888e-05, + "loss": 0.1251, + "step": 40376 + }, + { + "epoch": 0.7201690864338458, + "grad_norm": 0.29263970255851746, + "learning_rate": 1.100681747702195e-05, + "loss": 0.1116, + "step": 40377 + }, + { + "epoch": 0.7201869225555595, + "grad_norm": 0.2971602976322174, + "learning_rate": 1.100552766231443e-05, + "loss": 0.123, + "step": 40378 + }, + { + "epoch": 0.7202047586772732, + "grad_norm": 0.2378644496202469, + "learning_rate": 1.1004237901854317e-05, + "loss": 0.1223, + "step": 40379 + }, + { + "epoch": 0.7202225947989869, + "grad_norm": 0.22653701901435852, + "learning_rate": 1.1002948195646612e-05, + "loss": 0.0922, + "step": 40380 + }, + { + "epoch": 0.7202404309207006, + "grad_norm": 0.23711735010147095, + "learning_rate": 1.1001658543696305e-05, + "loss": 0.0916, + "step": 40381 + }, + { + "epoch": 0.7202582670424142, + "grad_norm": 0.31709030270576477, + "learning_rate": 1.1000368946008417e-05, + "loss": 0.1273, + "step": 40382 + }, + { + "epoch": 0.720276103164128, + "grad_norm": 0.19927071034908295, + "learning_rate": 1.0999079402587931e-05, + "loss": 0.1338, + "step": 40383 + }, + { + "epoch": 0.7202939392858417, + "grad_norm": 0.2005443125963211, + "learning_rate": 1.0997789913439849e-05, + "loss": 0.0952, + "step": 40384 + }, + { + "epoch": 0.7203117754075554, + "grad_norm": 0.22356563806533813, + "learning_rate": 1.099650047856916e-05, + "loss": 0.1296, + "step": 40385 + }, + { + "epoch": 0.7203296115292691, + "grad_norm": 0.24035263061523438, + "learning_rate": 1.0995211097980885e-05, + "loss": 0.1028, + "step": 40386 + }, + { + "epoch": 0.7203474476509828, + "grad_norm": 0.22749945521354675, + "learning_rate": 1.0993921771679996e-05, + "loss": 0.069, + "step": 40387 + }, + { + "epoch": 0.7203652837726965, + "grad_norm": 0.2557878792285919, + "learning_rate": 1.0992632499671513e-05, + "loss": 0.0995, + "step": 40388 + }, + { + "epoch": 0.7203831198944102, + "grad_norm": 0.2746986448764801, + "learning_rate": 1.0991343281960426e-05, + "loss": 0.118, + "step": 40389 + }, + { + "epoch": 0.7204009560161239, + "grad_norm": 0.2301800549030304, + "learning_rate": 1.0990054118551722e-05, + "loss": 0.063, + "step": 40390 + }, + { + "epoch": 0.7204187921378375, + "grad_norm": 0.23514443635940552, + "learning_rate": 1.0988765009450414e-05, + "loss": 0.1085, + "step": 40391 + }, + { + "epoch": 0.7204366282595512, + "grad_norm": 0.513597846031189, + "learning_rate": 1.0987475954661491e-05, + "loss": 0.134, + "step": 40392 + }, + { + "epoch": 0.7204544643812649, + "grad_norm": 0.3090693950653076, + "learning_rate": 1.0986186954189955e-05, + "loss": 0.1597, + "step": 40393 + }, + { + "epoch": 0.7204723005029786, + "grad_norm": 0.24443571269512177, + "learning_rate": 1.0984898008040787e-05, + "loss": 0.1121, + "step": 40394 + }, + { + "epoch": 0.7204901366246923, + "grad_norm": 0.3139670193195343, + "learning_rate": 1.0983609116219004e-05, + "loss": 0.1141, + "step": 40395 + }, + { + "epoch": 0.720507972746406, + "grad_norm": 0.17939960956573486, + "learning_rate": 1.0982320278729596e-05, + "loss": 0.0721, + "step": 40396 + }, + { + "epoch": 0.7205258088681197, + "grad_norm": 0.36835068464279175, + "learning_rate": 1.0981031495577554e-05, + "loss": 0.1435, + "step": 40397 + }, + { + "epoch": 0.7205436449898334, + "grad_norm": 0.23543259501457214, + "learning_rate": 1.0979742766767867e-05, + "loss": 0.1134, + "step": 40398 + }, + { + "epoch": 0.720561481111547, + "grad_norm": 0.21438445150852203, + "learning_rate": 1.0978454092305548e-05, + "loss": 0.1163, + "step": 40399 + }, + { + "epoch": 0.7205793172332609, + "grad_norm": 0.303114116191864, + "learning_rate": 1.0977165472195575e-05, + "loss": 0.1259, + "step": 40400 + }, + { + "epoch": 0.7205971533549745, + "grad_norm": 0.25703418254852295, + "learning_rate": 1.0975876906442961e-05, + "loss": 0.1084, + "step": 40401 + }, + { + "epoch": 0.7206149894766882, + "grad_norm": 0.45374831557273865, + "learning_rate": 1.0974588395052693e-05, + "loss": 0.1496, + "step": 40402 + }, + { + "epoch": 0.7206328255984019, + "grad_norm": 0.3314122259616852, + "learning_rate": 1.0973299938029755e-05, + "loss": 0.1332, + "step": 40403 + }, + { + "epoch": 0.7206506617201156, + "grad_norm": 0.2612239122390747, + "learning_rate": 1.097201153537916e-05, + "loss": 0.1724, + "step": 40404 + }, + { + "epoch": 0.7206684978418293, + "grad_norm": 0.26297783851623535, + "learning_rate": 1.0970723187105891e-05, + "loss": 0.1136, + "step": 40405 + }, + { + "epoch": 0.720686333963543, + "grad_norm": 0.26427575945854187, + "learning_rate": 1.0969434893214947e-05, + "loss": 0.1085, + "step": 40406 + }, + { + "epoch": 0.7207041700852567, + "grad_norm": 0.23110325634479523, + "learning_rate": 1.0968146653711311e-05, + "loss": 0.1345, + "step": 40407 + }, + { + "epoch": 0.7207220062069704, + "grad_norm": 0.24362215399742126, + "learning_rate": 1.0966858468599994e-05, + "loss": 0.1322, + "step": 40408 + }, + { + "epoch": 0.720739842328684, + "grad_norm": 0.23025809228420258, + "learning_rate": 1.0965570337885978e-05, + "loss": 0.1313, + "step": 40409 + }, + { + "epoch": 0.7207576784503977, + "grad_norm": 0.3628450334072113, + "learning_rate": 1.096428226157426e-05, + "loss": 0.1012, + "step": 40410 + }, + { + "epoch": 0.7207755145721114, + "grad_norm": 0.20489072799682617, + "learning_rate": 1.0962994239669824e-05, + "loss": 0.0814, + "step": 40411 + }, + { + "epoch": 0.7207933506938251, + "grad_norm": 0.2180626541376114, + "learning_rate": 1.096170627217768e-05, + "loss": 0.1273, + "step": 40412 + }, + { + "epoch": 0.7208111868155388, + "grad_norm": 0.25559237599372864, + "learning_rate": 1.0960418359102808e-05, + "loss": 0.1111, + "step": 40413 + }, + { + "epoch": 0.7208290229372525, + "grad_norm": 0.32958081364631653, + "learning_rate": 1.0959130500450196e-05, + "loss": 0.1275, + "step": 40414 + }, + { + "epoch": 0.7208468590589662, + "grad_norm": 0.2854267358779907, + "learning_rate": 1.0957842696224854e-05, + "loss": 0.1454, + "step": 40415 + }, + { + "epoch": 0.7208646951806799, + "grad_norm": 0.29579558968544006, + "learning_rate": 1.0956554946431754e-05, + "loss": 0.1297, + "step": 40416 + }, + { + "epoch": 0.7208825313023937, + "grad_norm": 0.23838843405246735, + "learning_rate": 1.0955267251075907e-05, + "loss": 0.0826, + "step": 40417 + }, + { + "epoch": 0.7209003674241073, + "grad_norm": 0.25677451491355896, + "learning_rate": 1.0953979610162294e-05, + "loss": 0.1393, + "step": 40418 + }, + { + "epoch": 0.720918203545821, + "grad_norm": 0.2448953092098236, + "learning_rate": 1.0952692023695907e-05, + "loss": 0.121, + "step": 40419 + }, + { + "epoch": 0.7209360396675347, + "grad_norm": 0.27708911895751953, + "learning_rate": 1.095140449168173e-05, + "loss": 0.1408, + "step": 40420 + }, + { + "epoch": 0.7209538757892484, + "grad_norm": 0.29170405864715576, + "learning_rate": 1.095011701412477e-05, + "loss": 0.1072, + "step": 40421 + }, + { + "epoch": 0.7209717119109621, + "grad_norm": 0.23069795966148376, + "learning_rate": 1.0948829591030007e-05, + "loss": 0.0875, + "step": 40422 + }, + { + "epoch": 0.7209895480326758, + "grad_norm": 0.33636805415153503, + "learning_rate": 1.0947542222402435e-05, + "loss": 0.1609, + "step": 40423 + }, + { + "epoch": 0.7210073841543895, + "grad_norm": 0.2755163013935089, + "learning_rate": 1.0946254908247034e-05, + "loss": 0.1603, + "step": 40424 + }, + { + "epoch": 0.7210252202761032, + "grad_norm": 0.2104751169681549, + "learning_rate": 1.0944967648568814e-05, + "loss": 0.0767, + "step": 40425 + }, + { + "epoch": 0.7210430563978169, + "grad_norm": 0.24717546999454498, + "learning_rate": 1.094368044337275e-05, + "loss": 0.1541, + "step": 40426 + }, + { + "epoch": 0.7210608925195305, + "grad_norm": 0.3137086033821106, + "learning_rate": 1.094239329266383e-05, + "loss": 0.1309, + "step": 40427 + }, + { + "epoch": 0.7210787286412442, + "grad_norm": 0.2632220387458801, + "learning_rate": 1.0941106196447057e-05, + "loss": 0.1383, + "step": 40428 + }, + { + "epoch": 0.7210965647629579, + "grad_norm": 0.35149380564689636, + "learning_rate": 1.0939819154727402e-05, + "loss": 0.1152, + "step": 40429 + }, + { + "epoch": 0.7211144008846716, + "grad_norm": 0.3171868920326233, + "learning_rate": 1.0938532167509874e-05, + "loss": 0.1387, + "step": 40430 + }, + { + "epoch": 0.7211322370063853, + "grad_norm": 0.22164089977741241, + "learning_rate": 1.0937245234799451e-05, + "loss": 0.0849, + "step": 40431 + }, + { + "epoch": 0.721150073128099, + "grad_norm": 0.2974955439567566, + "learning_rate": 1.0935958356601123e-05, + "loss": 0.0982, + "step": 40432 + }, + { + "epoch": 0.7211679092498127, + "grad_norm": 0.27351701259613037, + "learning_rate": 1.0934671532919869e-05, + "loss": 0.1175, + "step": 40433 + }, + { + "epoch": 0.7211857453715265, + "grad_norm": 0.25590378046035767, + "learning_rate": 1.0933384763760695e-05, + "loss": 0.1601, + "step": 40434 + }, + { + "epoch": 0.7212035814932402, + "grad_norm": 0.28437310457229614, + "learning_rate": 1.0932098049128581e-05, + "loss": 0.1603, + "step": 40435 + }, + { + "epoch": 0.7212214176149538, + "grad_norm": 0.28727272152900696, + "learning_rate": 1.0930811389028515e-05, + "loss": 0.1036, + "step": 40436 + }, + { + "epoch": 0.7212392537366675, + "grad_norm": 0.26963332295417786, + "learning_rate": 1.0929524783465472e-05, + "loss": 0.1292, + "step": 40437 + }, + { + "epoch": 0.7212570898583812, + "grad_norm": 0.3264870047569275, + "learning_rate": 1.092823823244446e-05, + "loss": 0.1402, + "step": 40438 + }, + { + "epoch": 0.7212749259800949, + "grad_norm": 0.3137264549732208, + "learning_rate": 1.0926951735970458e-05, + "loss": 0.1318, + "step": 40439 + }, + { + "epoch": 0.7212927621018086, + "grad_norm": 0.25432273745536804, + "learning_rate": 1.092566529404845e-05, + "loss": 0.1109, + "step": 40440 + }, + { + "epoch": 0.7213105982235223, + "grad_norm": 0.3099328279495239, + "learning_rate": 1.092437890668342e-05, + "loss": 0.0764, + "step": 40441 + }, + { + "epoch": 0.721328434345236, + "grad_norm": 0.3787504732608795, + "learning_rate": 1.0923092573880362e-05, + "loss": 0.1364, + "step": 40442 + }, + { + "epoch": 0.7213462704669497, + "grad_norm": 0.34405717253685, + "learning_rate": 1.0921806295644254e-05, + "loss": 0.1224, + "step": 40443 + }, + { + "epoch": 0.7213641065886633, + "grad_norm": 0.2813001275062561, + "learning_rate": 1.0920520071980095e-05, + "loss": 0.1151, + "step": 40444 + }, + { + "epoch": 0.721381942710377, + "grad_norm": 0.22539572417736053, + "learning_rate": 1.0919233902892862e-05, + "loss": 0.1009, + "step": 40445 + }, + { + "epoch": 0.7213997788320907, + "grad_norm": 0.28210368752479553, + "learning_rate": 1.0917947788387533e-05, + "loss": 0.127, + "step": 40446 + }, + { + "epoch": 0.7214176149538044, + "grad_norm": 0.32122141122817993, + "learning_rate": 1.0916661728469112e-05, + "loss": 0.11, + "step": 40447 + }, + { + "epoch": 0.7214354510755181, + "grad_norm": 0.28604593873023987, + "learning_rate": 1.0915375723142577e-05, + "loss": 0.1338, + "step": 40448 + }, + { + "epoch": 0.7214532871972318, + "grad_norm": 0.2551625669002533, + "learning_rate": 1.0914089772412905e-05, + "loss": 0.0836, + "step": 40449 + }, + { + "epoch": 0.7214711233189455, + "grad_norm": 0.2863278388977051, + "learning_rate": 1.0912803876285079e-05, + "loss": 0.1174, + "step": 40450 + }, + { + "epoch": 0.7214889594406593, + "grad_norm": 0.35364723205566406, + "learning_rate": 1.0911518034764104e-05, + "loss": 0.086, + "step": 40451 + }, + { + "epoch": 0.721506795562373, + "grad_norm": 0.28567206859588623, + "learning_rate": 1.0910232247854946e-05, + "loss": 0.0906, + "step": 40452 + }, + { + "epoch": 0.7215246316840866, + "grad_norm": 0.5889822840690613, + "learning_rate": 1.0908946515562596e-05, + "loss": 0.1936, + "step": 40453 + }, + { + "epoch": 0.7215424678058003, + "grad_norm": 0.25866588950157166, + "learning_rate": 1.0907660837892028e-05, + "loss": 0.0875, + "step": 40454 + }, + { + "epoch": 0.721560303927514, + "grad_norm": 0.22260944545269012, + "learning_rate": 1.0906375214848241e-05, + "loss": 0.1044, + "step": 40455 + }, + { + "epoch": 0.7215781400492277, + "grad_norm": 0.19718752801418304, + "learning_rate": 1.0905089646436204e-05, + "loss": 0.085, + "step": 40456 + }, + { + "epoch": 0.7215959761709414, + "grad_norm": 0.3765561580657959, + "learning_rate": 1.0903804132660916e-05, + "loss": 0.1543, + "step": 40457 + }, + { + "epoch": 0.7216138122926551, + "grad_norm": 0.2711127698421478, + "learning_rate": 1.0902518673527354e-05, + "loss": 0.1377, + "step": 40458 + }, + { + "epoch": 0.7216316484143688, + "grad_norm": 0.3439038097858429, + "learning_rate": 1.0901233269040487e-05, + "loss": 0.1109, + "step": 40459 + }, + { + "epoch": 0.7216494845360825, + "grad_norm": 0.20267468690872192, + "learning_rate": 1.089994791920532e-05, + "loss": 0.1131, + "step": 40460 + }, + { + "epoch": 0.7216673206577962, + "grad_norm": 0.2884863615036011, + "learning_rate": 1.0898662624026823e-05, + "loss": 0.127, + "step": 40461 + }, + { + "epoch": 0.7216851567795098, + "grad_norm": 0.2465682029724121, + "learning_rate": 1.0897377383509983e-05, + "loss": 0.1645, + "step": 40462 + }, + { + "epoch": 0.7217029929012235, + "grad_norm": 0.3834037780761719, + "learning_rate": 1.0896092197659765e-05, + "loss": 0.0685, + "step": 40463 + }, + { + "epoch": 0.7217208290229372, + "grad_norm": 0.2938390374183655, + "learning_rate": 1.089480706648118e-05, + "loss": 0.1478, + "step": 40464 + }, + { + "epoch": 0.7217386651446509, + "grad_norm": 0.2629544734954834, + "learning_rate": 1.089352198997919e-05, + "loss": 0.1092, + "step": 40465 + }, + { + "epoch": 0.7217565012663646, + "grad_norm": 0.27181199193000793, + "learning_rate": 1.0892236968158783e-05, + "loss": 0.0874, + "step": 40466 + }, + { + "epoch": 0.7217743373880784, + "grad_norm": 0.22509315609931946, + "learning_rate": 1.089095200102494e-05, + "loss": 0.139, + "step": 40467 + }, + { + "epoch": 0.7217921735097921, + "grad_norm": 0.2534896433353424, + "learning_rate": 1.0889667088582628e-05, + "loss": 0.15, + "step": 40468 + }, + { + "epoch": 0.7218100096315058, + "grad_norm": 0.28012070059776306, + "learning_rate": 1.0888382230836852e-05, + "loss": 0.1062, + "step": 40469 + }, + { + "epoch": 0.7218278457532195, + "grad_norm": 0.3172220289707184, + "learning_rate": 1.088709742779257e-05, + "loss": 0.12, + "step": 40470 + }, + { + "epoch": 0.7218456818749331, + "grad_norm": 0.2582438886165619, + "learning_rate": 1.088581267945478e-05, + "loss": 0.1392, + "step": 40471 + }, + { + "epoch": 0.7218635179966468, + "grad_norm": 0.29339727759361267, + "learning_rate": 1.0884527985828446e-05, + "loss": 0.1497, + "step": 40472 + }, + { + "epoch": 0.7218813541183605, + "grad_norm": 0.40829694271087646, + "learning_rate": 1.088324334691857e-05, + "loss": 0.0991, + "step": 40473 + }, + { + "epoch": 0.7218991902400742, + "grad_norm": 0.2584668695926666, + "learning_rate": 1.0881958762730116e-05, + "loss": 0.1468, + "step": 40474 + }, + { + "epoch": 0.7219170263617879, + "grad_norm": 0.2535378932952881, + "learning_rate": 1.0880674233268065e-05, + "loss": 0.114, + "step": 40475 + }, + { + "epoch": 0.7219348624835016, + "grad_norm": 0.3629578649997711, + "learning_rate": 1.0879389758537389e-05, + "loss": 0.1616, + "step": 40476 + }, + { + "epoch": 0.7219526986052153, + "grad_norm": 0.33013126254081726, + "learning_rate": 1.0878105338543084e-05, + "loss": 0.1264, + "step": 40477 + }, + { + "epoch": 0.721970534726929, + "grad_norm": 0.23822833597660065, + "learning_rate": 1.0876820973290122e-05, + "loss": 0.1513, + "step": 40478 + }, + { + "epoch": 0.7219883708486426, + "grad_norm": 0.3148685097694397, + "learning_rate": 1.0875536662783479e-05, + "loss": 0.1493, + "step": 40479 + }, + { + "epoch": 0.7220062069703563, + "grad_norm": 0.32281985878944397, + "learning_rate": 1.0874252407028134e-05, + "loss": 0.1188, + "step": 40480 + }, + { + "epoch": 0.72202404309207, + "grad_norm": 0.27260199189186096, + "learning_rate": 1.0872968206029055e-05, + "loss": 0.1202, + "step": 40481 + }, + { + "epoch": 0.7220418792137837, + "grad_norm": 0.3343993127346039, + "learning_rate": 1.0871684059791243e-05, + "loss": 0.1536, + "step": 40482 + }, + { + "epoch": 0.7220597153354974, + "grad_norm": 0.3231378495693207, + "learning_rate": 1.0870399968319653e-05, + "loss": 0.1169, + "step": 40483 + }, + { + "epoch": 0.7220775514572112, + "grad_norm": 0.28927361965179443, + "learning_rate": 1.086911593161928e-05, + "loss": 0.0672, + "step": 40484 + }, + { + "epoch": 0.7220953875789249, + "grad_norm": 0.23475094139575958, + "learning_rate": 1.0867831949695086e-05, + "loss": 0.1357, + "step": 40485 + }, + { + "epoch": 0.7221132237006386, + "grad_norm": 0.23237106204032898, + "learning_rate": 1.0866548022552067e-05, + "loss": 0.0898, + "step": 40486 + }, + { + "epoch": 0.7221310598223523, + "grad_norm": 0.2454371303319931, + "learning_rate": 1.086526415019519e-05, + "loss": 0.1216, + "step": 40487 + }, + { + "epoch": 0.722148895944066, + "grad_norm": 0.2840881049633026, + "learning_rate": 1.0863980332629428e-05, + "loss": 0.1277, + "step": 40488 + }, + { + "epoch": 0.7221667320657796, + "grad_norm": 0.2540324926376343, + "learning_rate": 1.0862696569859754e-05, + "loss": 0.1076, + "step": 40489 + }, + { + "epoch": 0.7221845681874933, + "grad_norm": 0.2851029336452484, + "learning_rate": 1.0861412861891159e-05, + "loss": 0.1674, + "step": 40490 + }, + { + "epoch": 0.722202404309207, + "grad_norm": 0.28207066655158997, + "learning_rate": 1.0860129208728612e-05, + "loss": 0.1401, + "step": 40491 + }, + { + "epoch": 0.7222202404309207, + "grad_norm": 0.27512624859809875, + "learning_rate": 1.0858845610377088e-05, + "loss": 0.1177, + "step": 40492 + }, + { + "epoch": 0.7222380765526344, + "grad_norm": 0.2700352966785431, + "learning_rate": 1.0857562066841562e-05, + "loss": 0.0869, + "step": 40493 + }, + { + "epoch": 0.7222559126743481, + "grad_norm": 0.2876301109790802, + "learning_rate": 1.0856278578127e-05, + "loss": 0.11, + "step": 40494 + }, + { + "epoch": 0.7222737487960618, + "grad_norm": 0.2155667096376419, + "learning_rate": 1.0854995144238398e-05, + "loss": 0.1309, + "step": 40495 + }, + { + "epoch": 0.7222915849177755, + "grad_norm": 0.20146290957927704, + "learning_rate": 1.0853711765180719e-05, + "loss": 0.1505, + "step": 40496 + }, + { + "epoch": 0.7223094210394891, + "grad_norm": 0.26354163885116577, + "learning_rate": 1.0852428440958931e-05, + "loss": 0.0961, + "step": 40497 + }, + { + "epoch": 0.7223272571612028, + "grad_norm": 0.38613587617874146, + "learning_rate": 1.0851145171578026e-05, + "loss": 0.1145, + "step": 40498 + }, + { + "epoch": 0.7223450932829165, + "grad_norm": 0.24673140048980713, + "learning_rate": 1.0849861957042962e-05, + "loss": 0.096, + "step": 40499 + }, + { + "epoch": 0.7223629294046302, + "grad_norm": 0.2833535671234131, + "learning_rate": 1.0848578797358725e-05, + "loss": 0.1165, + "step": 40500 + }, + { + "epoch": 0.722380765526344, + "grad_norm": 0.2744133472442627, + "learning_rate": 1.0847295692530287e-05, + "loss": 0.0772, + "step": 40501 + }, + { + "epoch": 0.7223986016480577, + "grad_norm": 0.2528727054595947, + "learning_rate": 1.084601264256261e-05, + "loss": 0.1081, + "step": 40502 + }, + { + "epoch": 0.7224164377697714, + "grad_norm": 0.291473388671875, + "learning_rate": 1.0844729647460686e-05, + "loss": 0.1133, + "step": 40503 + }, + { + "epoch": 0.7224342738914851, + "grad_norm": 0.24958087503910065, + "learning_rate": 1.0843446707229476e-05, + "loss": 0.079, + "step": 40504 + }, + { + "epoch": 0.7224521100131988, + "grad_norm": 0.2474784106016159, + "learning_rate": 1.084216382187396e-05, + "loss": 0.1399, + "step": 40505 + }, + { + "epoch": 0.7224699461349124, + "grad_norm": 0.2323588877916336, + "learning_rate": 1.0840880991399105e-05, + "loss": 0.1125, + "step": 40506 + }, + { + "epoch": 0.7224877822566261, + "grad_norm": 0.2776666581630707, + "learning_rate": 1.0839598215809877e-05, + "loss": 0.1157, + "step": 40507 + }, + { + "epoch": 0.7225056183783398, + "grad_norm": 0.2597182095050812, + "learning_rate": 1.0838315495111265e-05, + "loss": 0.079, + "step": 40508 + }, + { + "epoch": 0.7225234545000535, + "grad_norm": 0.24726997315883636, + "learning_rate": 1.0837032829308236e-05, + "loss": 0.0916, + "step": 40509 + }, + { + "epoch": 0.7225412906217672, + "grad_norm": 0.19278456270694733, + "learning_rate": 1.0835750218405748e-05, + "loss": 0.088, + "step": 40510 + }, + { + "epoch": 0.7225591267434809, + "grad_norm": 0.31380847096443176, + "learning_rate": 1.0834467662408795e-05, + "loss": 0.1582, + "step": 40511 + }, + { + "epoch": 0.7225769628651946, + "grad_norm": 0.26442664861679077, + "learning_rate": 1.083318516132233e-05, + "loss": 0.0915, + "step": 40512 + }, + { + "epoch": 0.7225947989869083, + "grad_norm": 0.26641327142715454, + "learning_rate": 1.0831902715151338e-05, + "loss": 0.1137, + "step": 40513 + }, + { + "epoch": 0.722612635108622, + "grad_norm": 0.2733730971813202, + "learning_rate": 1.0830620323900787e-05, + "loss": 0.1403, + "step": 40514 + }, + { + "epoch": 0.7226304712303356, + "grad_norm": 0.35424867272377014, + "learning_rate": 1.0829337987575636e-05, + "loss": 0.1551, + "step": 40515 + }, + { + "epoch": 0.7226483073520493, + "grad_norm": 0.24846625328063965, + "learning_rate": 1.0828055706180873e-05, + "loss": 0.0955, + "step": 40516 + }, + { + "epoch": 0.722666143473763, + "grad_norm": 0.29416537284851074, + "learning_rate": 1.082677347972146e-05, + "loss": 0.1417, + "step": 40517 + }, + { + "epoch": 0.7226839795954768, + "grad_norm": 0.3694874048233032, + "learning_rate": 1.0825491308202374e-05, + "loss": 0.1717, + "step": 40518 + }, + { + "epoch": 0.7227018157171905, + "grad_norm": 0.26925182342529297, + "learning_rate": 1.0824209191628573e-05, + "loss": 0.0901, + "step": 40519 + }, + { + "epoch": 0.7227196518389042, + "grad_norm": 0.3647941052913666, + "learning_rate": 1.0822927130005029e-05, + "loss": 0.1004, + "step": 40520 + }, + { + "epoch": 0.7227374879606179, + "grad_norm": 0.27682557702064514, + "learning_rate": 1.0821645123336725e-05, + "loss": 0.1158, + "step": 40521 + }, + { + "epoch": 0.7227553240823316, + "grad_norm": 0.25689277052879333, + "learning_rate": 1.0820363171628617e-05, + "loss": 0.1296, + "step": 40522 + }, + { + "epoch": 0.7227731602040453, + "grad_norm": 0.2692350149154663, + "learning_rate": 1.081908127488568e-05, + "loss": 0.1255, + "step": 40523 + }, + { + "epoch": 0.7227909963257589, + "grad_norm": 0.3036006987094879, + "learning_rate": 1.0817799433112876e-05, + "loss": 0.1323, + "step": 40524 + }, + { + "epoch": 0.7228088324474726, + "grad_norm": 0.37668895721435547, + "learning_rate": 1.0816517646315189e-05, + "loss": 0.1071, + "step": 40525 + }, + { + "epoch": 0.7228266685691863, + "grad_norm": 0.26455116271972656, + "learning_rate": 1.0815235914497565e-05, + "loss": 0.1027, + "step": 40526 + }, + { + "epoch": 0.7228445046909, + "grad_norm": 0.2779602110385895, + "learning_rate": 1.0813954237665e-05, + "loss": 0.1196, + "step": 40527 + }, + { + "epoch": 0.7228623408126137, + "grad_norm": 0.33256569504737854, + "learning_rate": 1.0812672615822437e-05, + "loss": 0.1579, + "step": 40528 + }, + { + "epoch": 0.7228801769343274, + "grad_norm": 0.4240020215511322, + "learning_rate": 1.0811391048974862e-05, + "loss": 0.1397, + "step": 40529 + }, + { + "epoch": 0.7228980130560411, + "grad_norm": 0.27562037110328674, + "learning_rate": 1.0810109537127239e-05, + "loss": 0.1411, + "step": 40530 + }, + { + "epoch": 0.7229158491777548, + "grad_norm": 0.39022308588027954, + "learning_rate": 1.0808828080284531e-05, + "loss": 0.1093, + "step": 40531 + }, + { + "epoch": 0.7229336852994684, + "grad_norm": 0.3457445502281189, + "learning_rate": 1.0807546678451697e-05, + "loss": 0.1304, + "step": 40532 + }, + { + "epoch": 0.7229515214211821, + "grad_norm": 0.2470165193080902, + "learning_rate": 1.0806265331633722e-05, + "loss": 0.1228, + "step": 40533 + }, + { + "epoch": 0.7229693575428958, + "grad_norm": 0.2204384058713913, + "learning_rate": 1.080498403983557e-05, + "loss": 0.0901, + "step": 40534 + }, + { + "epoch": 0.7229871936646096, + "grad_norm": 0.35831955075263977, + "learning_rate": 1.0803702803062198e-05, + "loss": 0.1543, + "step": 40535 + }, + { + "epoch": 0.7230050297863233, + "grad_norm": 0.2470790296792984, + "learning_rate": 1.0802421621318578e-05, + "loss": 0.1104, + "step": 40536 + }, + { + "epoch": 0.723022865908037, + "grad_norm": 0.33692437410354614, + "learning_rate": 1.0801140494609668e-05, + "loss": 0.1954, + "step": 40537 + }, + { + "epoch": 0.7230407020297507, + "grad_norm": 0.3157949149608612, + "learning_rate": 1.079985942294045e-05, + "loss": 0.1204, + "step": 40538 + }, + { + "epoch": 0.7230585381514644, + "grad_norm": 0.18929168581962585, + "learning_rate": 1.079857840631587e-05, + "loss": 0.0997, + "step": 40539 + }, + { + "epoch": 0.7230763742731781, + "grad_norm": 0.28917384147644043, + "learning_rate": 1.0797297444740916e-05, + "loss": 0.128, + "step": 40540 + }, + { + "epoch": 0.7230942103948917, + "grad_norm": 0.32336491346359253, + "learning_rate": 1.0796016538220535e-05, + "loss": 0.0972, + "step": 40541 + }, + { + "epoch": 0.7231120465166054, + "grad_norm": 0.2406318336725235, + "learning_rate": 1.0794735686759708e-05, + "loss": 0.1301, + "step": 40542 + }, + { + "epoch": 0.7231298826383191, + "grad_norm": 0.18333131074905396, + "learning_rate": 1.0793454890363392e-05, + "loss": 0.1107, + "step": 40543 + }, + { + "epoch": 0.7231477187600328, + "grad_norm": 0.23352345824241638, + "learning_rate": 1.0792174149036549e-05, + "loss": 0.1302, + "step": 40544 + }, + { + "epoch": 0.7231655548817465, + "grad_norm": 0.3373039960861206, + "learning_rate": 1.0790893462784141e-05, + "loss": 0.128, + "step": 40545 + }, + { + "epoch": 0.7231833910034602, + "grad_norm": 0.23998256027698517, + "learning_rate": 1.0789612831611146e-05, + "loss": 0.059, + "step": 40546 + }, + { + "epoch": 0.7232012271251739, + "grad_norm": 0.2694459855556488, + "learning_rate": 1.0788332255522519e-05, + "loss": 0.0576, + "step": 40547 + }, + { + "epoch": 0.7232190632468876, + "grad_norm": 0.2804769277572632, + "learning_rate": 1.0787051734523224e-05, + "loss": 0.1157, + "step": 40548 + }, + { + "epoch": 0.7232368993686012, + "grad_norm": 0.3122931122779846, + "learning_rate": 1.0785771268618225e-05, + "loss": 0.1223, + "step": 40549 + }, + { + "epoch": 0.7232547354903149, + "grad_norm": 0.32900938391685486, + "learning_rate": 1.0784490857812476e-05, + "loss": 0.1307, + "step": 40550 + }, + { + "epoch": 0.7232725716120286, + "grad_norm": 0.204051673412323, + "learning_rate": 1.0783210502110963e-05, + "loss": 0.1052, + "step": 40551 + }, + { + "epoch": 0.7232904077337424, + "grad_norm": 0.2570563554763794, + "learning_rate": 1.0781930201518633e-05, + "loss": 0.1057, + "step": 40552 + }, + { + "epoch": 0.7233082438554561, + "grad_norm": 0.24053695797920227, + "learning_rate": 1.0780649956040445e-05, + "loss": 0.1338, + "step": 40553 + }, + { + "epoch": 0.7233260799771698, + "grad_norm": 0.27540016174316406, + "learning_rate": 1.077936976568137e-05, + "loss": 0.142, + "step": 40554 + }, + { + "epoch": 0.7233439160988835, + "grad_norm": 0.2392880916595459, + "learning_rate": 1.0778089630446378e-05, + "loss": 0.1149, + "step": 40555 + }, + { + "epoch": 0.7233617522205972, + "grad_norm": 0.3592996597290039, + "learning_rate": 1.077680955034042e-05, + "loss": 0.1115, + "step": 40556 + }, + { + "epoch": 0.7233795883423109, + "grad_norm": 0.25523144006729126, + "learning_rate": 1.0775529525368463e-05, + "loss": 0.1355, + "step": 40557 + }, + { + "epoch": 0.7233974244640246, + "grad_norm": 0.38922280073165894, + "learning_rate": 1.0774249555535457e-05, + "loss": 0.1277, + "step": 40558 + }, + { + "epoch": 0.7234152605857382, + "grad_norm": 0.37434661388397217, + "learning_rate": 1.0772969640846383e-05, + "loss": 0.113, + "step": 40559 + }, + { + "epoch": 0.7234330967074519, + "grad_norm": 0.3323158025741577, + "learning_rate": 1.0771689781306191e-05, + "loss": 0.1389, + "step": 40560 + }, + { + "epoch": 0.7234509328291656, + "grad_norm": 0.30588456988334656, + "learning_rate": 1.0770409976919846e-05, + "loss": 0.1445, + "step": 40561 + }, + { + "epoch": 0.7234687689508793, + "grad_norm": 0.3162062466144562, + "learning_rate": 1.0769130227692304e-05, + "loss": 0.1005, + "step": 40562 + }, + { + "epoch": 0.723486605072593, + "grad_norm": 0.40629053115844727, + "learning_rate": 1.0767850533628521e-05, + "loss": 0.1586, + "step": 40563 + }, + { + "epoch": 0.7235044411943067, + "grad_norm": 0.20779992640018463, + "learning_rate": 1.0766570894733475e-05, + "loss": 0.0905, + "step": 40564 + }, + { + "epoch": 0.7235222773160204, + "grad_norm": 0.36102133989334106, + "learning_rate": 1.0765291311012113e-05, + "loss": 0.1687, + "step": 40565 + }, + { + "epoch": 0.7235401134377341, + "grad_norm": 0.25982508063316345, + "learning_rate": 1.0764011782469391e-05, + "loss": 0.1206, + "step": 40566 + }, + { + "epoch": 0.7235579495594477, + "grad_norm": 0.35117021203041077, + "learning_rate": 1.0762732309110288e-05, + "loss": 0.1556, + "step": 40567 + }, + { + "epoch": 0.7235757856811615, + "grad_norm": 0.269761323928833, + "learning_rate": 1.0761452890939738e-05, + "loss": 0.1089, + "step": 40568 + }, + { + "epoch": 0.7235936218028752, + "grad_norm": 0.24880672991275787, + "learning_rate": 1.0760173527962728e-05, + "loss": 0.1257, + "step": 40569 + }, + { + "epoch": 0.7236114579245889, + "grad_norm": 0.3262292444705963, + "learning_rate": 1.0758894220184202e-05, + "loss": 0.1206, + "step": 40570 + }, + { + "epoch": 0.7236292940463026, + "grad_norm": 0.36932623386383057, + "learning_rate": 1.0757614967609112e-05, + "loss": 0.1227, + "step": 40571 + }, + { + "epoch": 0.7236471301680163, + "grad_norm": 0.2558470070362091, + "learning_rate": 1.0756335770242432e-05, + "loss": 0.1116, + "step": 40572 + }, + { + "epoch": 0.72366496628973, + "grad_norm": 0.27696770429611206, + "learning_rate": 1.0755056628089116e-05, + "loss": 0.1346, + "step": 40573 + }, + { + "epoch": 0.7236828024114437, + "grad_norm": 0.22319665551185608, + "learning_rate": 1.075377754115412e-05, + "loss": 0.127, + "step": 40574 + }, + { + "epoch": 0.7237006385331574, + "grad_norm": 0.23782727122306824, + "learning_rate": 1.0752498509442405e-05, + "loss": 0.1033, + "step": 40575 + }, + { + "epoch": 0.723718474654871, + "grad_norm": 0.23157502710819244, + "learning_rate": 1.0751219532958914e-05, + "loss": 0.086, + "step": 40576 + }, + { + "epoch": 0.7237363107765847, + "grad_norm": 0.24546676874160767, + "learning_rate": 1.074994061170863e-05, + "loss": 0.1039, + "step": 40577 + }, + { + "epoch": 0.7237541468982984, + "grad_norm": 0.21519996225833893, + "learning_rate": 1.0748661745696498e-05, + "loss": 0.0951, + "step": 40578 + }, + { + "epoch": 0.7237719830200121, + "grad_norm": 0.3266916871070862, + "learning_rate": 1.0747382934927471e-05, + "loss": 0.0963, + "step": 40579 + }, + { + "epoch": 0.7237898191417258, + "grad_norm": 0.48786115646362305, + "learning_rate": 1.0746104179406504e-05, + "loss": 0.1567, + "step": 40580 + }, + { + "epoch": 0.7238076552634395, + "grad_norm": 0.5185796618461609, + "learning_rate": 1.0744825479138562e-05, + "loss": 0.1389, + "step": 40581 + }, + { + "epoch": 0.7238254913851532, + "grad_norm": 0.3125360906124115, + "learning_rate": 1.074354683412861e-05, + "loss": 0.1037, + "step": 40582 + }, + { + "epoch": 0.7238433275068669, + "grad_norm": 0.2962605357170105, + "learning_rate": 1.074226824438159e-05, + "loss": 0.1441, + "step": 40583 + }, + { + "epoch": 0.7238611636285806, + "grad_norm": 0.25355395674705505, + "learning_rate": 1.0740989709902458e-05, + "loss": 0.0825, + "step": 40584 + }, + { + "epoch": 0.7238789997502943, + "grad_norm": 0.3399258255958557, + "learning_rate": 1.073971123069618e-05, + "loss": 0.1233, + "step": 40585 + }, + { + "epoch": 0.723896835872008, + "grad_norm": 0.2795931100845337, + "learning_rate": 1.073843280676771e-05, + "loss": 0.0977, + "step": 40586 + }, + { + "epoch": 0.7239146719937217, + "grad_norm": 0.27127134799957275, + "learning_rate": 1.0737154438121997e-05, + "loss": 0.1235, + "step": 40587 + }, + { + "epoch": 0.7239325081154354, + "grad_norm": 0.23472371697425842, + "learning_rate": 1.0735876124764002e-05, + "loss": 0.1222, + "step": 40588 + }, + { + "epoch": 0.7239503442371491, + "grad_norm": 0.24967920780181885, + "learning_rate": 1.0734597866698668e-05, + "loss": 0.1536, + "step": 40589 + }, + { + "epoch": 0.7239681803588628, + "grad_norm": 0.34597447514533997, + "learning_rate": 1.0733319663930968e-05, + "loss": 0.131, + "step": 40590 + }, + { + "epoch": 0.7239860164805765, + "grad_norm": 0.2651764154434204, + "learning_rate": 1.073204151646585e-05, + "loss": 0.1245, + "step": 40591 + }, + { + "epoch": 0.7240038526022902, + "grad_norm": 0.26230162382125854, + "learning_rate": 1.0730763424308266e-05, + "loss": 0.1035, + "step": 40592 + }, + { + "epoch": 0.7240216887240039, + "grad_norm": 0.3079476058483124, + "learning_rate": 1.0729485387463162e-05, + "loss": 0.1275, + "step": 40593 + }, + { + "epoch": 0.7240395248457175, + "grad_norm": 0.3106514811515808, + "learning_rate": 1.0728207405935511e-05, + "loss": 0.1038, + "step": 40594 + }, + { + "epoch": 0.7240573609674312, + "grad_norm": 0.20632345974445343, + "learning_rate": 1.0726929479730249e-05, + "loss": 0.0733, + "step": 40595 + }, + { + "epoch": 0.7240751970891449, + "grad_norm": 0.34770768880844116, + "learning_rate": 1.0725651608852347e-05, + "loss": 0.1419, + "step": 40596 + }, + { + "epoch": 0.7240930332108586, + "grad_norm": 0.21071726083755493, + "learning_rate": 1.0724373793306741e-05, + "loss": 0.098, + "step": 40597 + }, + { + "epoch": 0.7241108693325723, + "grad_norm": 0.23060128092765808, + "learning_rate": 1.0723096033098402e-05, + "loss": 0.1092, + "step": 40598 + }, + { + "epoch": 0.724128705454286, + "grad_norm": 0.2908787727355957, + "learning_rate": 1.0721818328232273e-05, + "loss": 0.1609, + "step": 40599 + }, + { + "epoch": 0.7241465415759997, + "grad_norm": 0.31680330634117126, + "learning_rate": 1.0720540678713306e-05, + "loss": 0.0566, + "step": 40600 + }, + { + "epoch": 0.7241643776977134, + "grad_norm": 0.2705176770687103, + "learning_rate": 1.0719263084546458e-05, + "loss": 0.1056, + "step": 40601 + }, + { + "epoch": 0.7241822138194272, + "grad_norm": 0.21999453008174896, + "learning_rate": 1.0717985545736669e-05, + "loss": 0.1007, + "step": 40602 + }, + { + "epoch": 0.7242000499411408, + "grad_norm": 0.32170572876930237, + "learning_rate": 1.0716708062288908e-05, + "loss": 0.086, + "step": 40603 + }, + { + "epoch": 0.7242178860628545, + "grad_norm": 0.24259944260120392, + "learning_rate": 1.0715430634208123e-05, + "loss": 0.0788, + "step": 40604 + }, + { + "epoch": 0.7242357221845682, + "grad_norm": 0.2975025475025177, + "learning_rate": 1.071415326149926e-05, + "loss": 0.1239, + "step": 40605 + }, + { + "epoch": 0.7242535583062819, + "grad_norm": 0.2233697474002838, + "learning_rate": 1.0712875944167267e-05, + "loss": 0.124, + "step": 40606 + }, + { + "epoch": 0.7242713944279956, + "grad_norm": 0.36306843161582947, + "learning_rate": 1.0711598682217109e-05, + "loss": 0.098, + "step": 40607 + }, + { + "epoch": 0.7242892305497093, + "grad_norm": 0.22403933107852936, + "learning_rate": 1.0710321475653717e-05, + "loss": 0.1243, + "step": 40608 + }, + { + "epoch": 0.724307066671423, + "grad_norm": 0.21132799983024597, + "learning_rate": 1.0709044324482068e-05, + "loss": 0.0862, + "step": 40609 + }, + { + "epoch": 0.7243249027931367, + "grad_norm": 0.34940826892852783, + "learning_rate": 1.0707767228707089e-05, + "loss": 0.1009, + "step": 40610 + }, + { + "epoch": 0.7243427389148503, + "grad_norm": 0.33472365140914917, + "learning_rate": 1.0706490188333748e-05, + "loss": 0.1405, + "step": 40611 + }, + { + "epoch": 0.724360575036564, + "grad_norm": 0.27359768748283386, + "learning_rate": 1.070521320336699e-05, + "loss": 0.1242, + "step": 40612 + }, + { + "epoch": 0.7243784111582777, + "grad_norm": 0.2877182066440582, + "learning_rate": 1.0703936273811763e-05, + "loss": 0.1489, + "step": 40613 + }, + { + "epoch": 0.7243962472799914, + "grad_norm": 0.28104862570762634, + "learning_rate": 1.0702659399673016e-05, + "loss": 0.0638, + "step": 40614 + }, + { + "epoch": 0.7244140834017051, + "grad_norm": 0.2790651321411133, + "learning_rate": 1.070138258095569e-05, + "loss": 0.0561, + "step": 40615 + }, + { + "epoch": 0.7244319195234188, + "grad_norm": 0.2962120473384857, + "learning_rate": 1.0700105817664751e-05, + "loss": 0.137, + "step": 40616 + }, + { + "epoch": 0.7244497556451325, + "grad_norm": 0.1861022263765335, + "learning_rate": 1.0698829109805144e-05, + "loss": 0.0939, + "step": 40617 + }, + { + "epoch": 0.7244675917668462, + "grad_norm": 0.3007505238056183, + "learning_rate": 1.0697552457381812e-05, + "loss": 0.1757, + "step": 40618 + }, + { + "epoch": 0.72448542788856, + "grad_norm": 0.2994762063026428, + "learning_rate": 1.0696275860399701e-05, + "loss": 0.1122, + "step": 40619 + }, + { + "epoch": 0.7245032640102737, + "grad_norm": 0.25408273935317993, + "learning_rate": 1.0694999318863772e-05, + "loss": 0.1507, + "step": 40620 + }, + { + "epoch": 0.7245211001319873, + "grad_norm": 0.2956840693950653, + "learning_rate": 1.0693722832778966e-05, + "loss": 0.1386, + "step": 40621 + }, + { + "epoch": 0.724538936253701, + "grad_norm": 0.2609332501888275, + "learning_rate": 1.0692446402150225e-05, + "loss": 0.1291, + "step": 40622 + }, + { + "epoch": 0.7245567723754147, + "grad_norm": 0.5064058303833008, + "learning_rate": 1.0691170026982509e-05, + "loss": 0.2121, + "step": 40623 + }, + { + "epoch": 0.7245746084971284, + "grad_norm": 0.19119513034820557, + "learning_rate": 1.0689893707280754e-05, + "loss": 0.1208, + "step": 40624 + }, + { + "epoch": 0.7245924446188421, + "grad_norm": 0.2072117179632187, + "learning_rate": 1.068861744304992e-05, + "loss": 0.1115, + "step": 40625 + }, + { + "epoch": 0.7246102807405558, + "grad_norm": 0.43047600984573364, + "learning_rate": 1.0687341234294948e-05, + "loss": 0.1051, + "step": 40626 + }, + { + "epoch": 0.7246281168622695, + "grad_norm": 0.3276810646057129, + "learning_rate": 1.0686065081020789e-05, + "loss": 0.1101, + "step": 40627 + }, + { + "epoch": 0.7246459529839832, + "grad_norm": 0.43187955021858215, + "learning_rate": 1.0684788983232371e-05, + "loss": 0.1586, + "step": 40628 + }, + { + "epoch": 0.7246637891056968, + "grad_norm": 0.36013561487197876, + "learning_rate": 1.0683512940934667e-05, + "loss": 0.114, + "step": 40629 + }, + { + "epoch": 0.7246816252274105, + "grad_norm": 0.27758336067199707, + "learning_rate": 1.0682236954132612e-05, + "loss": 0.1399, + "step": 40630 + }, + { + "epoch": 0.7246994613491242, + "grad_norm": 0.2186107337474823, + "learning_rate": 1.0680961022831151e-05, + "loss": 0.1134, + "step": 40631 + }, + { + "epoch": 0.7247172974708379, + "grad_norm": 0.15892332792282104, + "learning_rate": 1.067968514703522e-05, + "loss": 0.0742, + "step": 40632 + }, + { + "epoch": 0.7247351335925516, + "grad_norm": 0.24363532662391663, + "learning_rate": 1.0678409326749789e-05, + "loss": 0.1263, + "step": 40633 + }, + { + "epoch": 0.7247529697142653, + "grad_norm": 0.24483613669872284, + "learning_rate": 1.0677133561979786e-05, + "loss": 0.1235, + "step": 40634 + }, + { + "epoch": 0.724770805835979, + "grad_norm": 0.22202663123607635, + "learning_rate": 1.0675857852730151e-05, + "loss": 0.0976, + "step": 40635 + }, + { + "epoch": 0.7247886419576928, + "grad_norm": 0.2127566933631897, + "learning_rate": 1.0674582199005851e-05, + "loss": 0.1012, + "step": 40636 + }, + { + "epoch": 0.7248064780794065, + "grad_norm": 0.2635959982872009, + "learning_rate": 1.0673306600811806e-05, + "loss": 0.1092, + "step": 40637 + }, + { + "epoch": 0.7248243142011201, + "grad_norm": 0.29407212138175964, + "learning_rate": 1.0672031058152981e-05, + "loss": 0.1878, + "step": 40638 + }, + { + "epoch": 0.7248421503228338, + "grad_norm": 0.325975239276886, + "learning_rate": 1.0670755571034315e-05, + "loss": 0.1079, + "step": 40639 + }, + { + "epoch": 0.7248599864445475, + "grad_norm": 0.27838611602783203, + "learning_rate": 1.066948013946075e-05, + "loss": 0.093, + "step": 40640 + }, + { + "epoch": 0.7248778225662612, + "grad_norm": 0.5459676384925842, + "learning_rate": 1.066820476343722e-05, + "loss": 0.0752, + "step": 40641 + }, + { + "epoch": 0.7248956586879749, + "grad_norm": 0.21756823360919952, + "learning_rate": 1.0666929442968687e-05, + "loss": 0.1384, + "step": 40642 + }, + { + "epoch": 0.7249134948096886, + "grad_norm": 0.3354736566543579, + "learning_rate": 1.0665654178060086e-05, + "loss": 0.1396, + "step": 40643 + }, + { + "epoch": 0.7249313309314023, + "grad_norm": 0.3089248538017273, + "learning_rate": 1.066437896871636e-05, + "loss": 0.1311, + "step": 40644 + }, + { + "epoch": 0.724949167053116, + "grad_norm": 0.279687762260437, + "learning_rate": 1.0663103814942443e-05, + "loss": 0.116, + "step": 40645 + }, + { + "epoch": 0.7249670031748296, + "grad_norm": 0.26352062821388245, + "learning_rate": 1.0661828716743297e-05, + "loss": 0.1238, + "step": 40646 + }, + { + "epoch": 0.7249848392965433, + "grad_norm": 0.20644158124923706, + "learning_rate": 1.0660553674123855e-05, + "loss": 0.1263, + "step": 40647 + }, + { + "epoch": 0.725002675418257, + "grad_norm": 0.3081561028957367, + "learning_rate": 1.0659278687089059e-05, + "loss": 0.0916, + "step": 40648 + }, + { + "epoch": 0.7250205115399707, + "grad_norm": 0.2747787833213806, + "learning_rate": 1.0658003755643844e-05, + "loss": 0.0912, + "step": 40649 + }, + { + "epoch": 0.7250383476616844, + "grad_norm": 0.2721705138683319, + "learning_rate": 1.0656728879793169e-05, + "loss": 0.1419, + "step": 40650 + }, + { + "epoch": 0.7250561837833981, + "grad_norm": 0.23173055052757263, + "learning_rate": 1.0655454059541958e-05, + "loss": 0.1055, + "step": 40651 + }, + { + "epoch": 0.7250740199051118, + "grad_norm": 0.24216590821743011, + "learning_rate": 1.0654179294895169e-05, + "loss": 0.1174, + "step": 40652 + }, + { + "epoch": 0.7250918560268256, + "grad_norm": 0.22711877524852753, + "learning_rate": 1.0652904585857736e-05, + "loss": 0.134, + "step": 40653 + }, + { + "epoch": 0.7251096921485393, + "grad_norm": 0.25260481238365173, + "learning_rate": 1.0651629932434593e-05, + "loss": 0.1071, + "step": 40654 + }, + { + "epoch": 0.725127528270253, + "grad_norm": 0.30491986870765686, + "learning_rate": 1.0650355334630695e-05, + "loss": 0.1162, + "step": 40655 + }, + { + "epoch": 0.7251453643919666, + "grad_norm": 0.26913368701934814, + "learning_rate": 1.064908079245098e-05, + "loss": 0.1327, + "step": 40656 + }, + { + "epoch": 0.7251632005136803, + "grad_norm": 0.23623481392860413, + "learning_rate": 1.0647806305900382e-05, + "loss": 0.0936, + "step": 40657 + }, + { + "epoch": 0.725181036635394, + "grad_norm": 0.2867516577243805, + "learning_rate": 1.0646531874983834e-05, + "loss": 0.133, + "step": 40658 + }, + { + "epoch": 0.7251988727571077, + "grad_norm": 0.29122912883758545, + "learning_rate": 1.0645257499706296e-05, + "loss": 0.1006, + "step": 40659 + }, + { + "epoch": 0.7252167088788214, + "grad_norm": 0.34155797958374023, + "learning_rate": 1.0643983180072698e-05, + "loss": 0.1386, + "step": 40660 + }, + { + "epoch": 0.7252345450005351, + "grad_norm": 0.3489937484264374, + "learning_rate": 1.0642708916087978e-05, + "loss": 0.0961, + "step": 40661 + }, + { + "epoch": 0.7252523811222488, + "grad_norm": 0.2931669056415558, + "learning_rate": 1.064143470775707e-05, + "loss": 0.1012, + "step": 40662 + }, + { + "epoch": 0.7252702172439625, + "grad_norm": 0.24757510423660278, + "learning_rate": 1.0640160555084926e-05, + "loss": 0.1219, + "step": 40663 + }, + { + "epoch": 0.7252880533656761, + "grad_norm": 0.2876999080181122, + "learning_rate": 1.0638886458076474e-05, + "loss": 0.1219, + "step": 40664 + }, + { + "epoch": 0.7253058894873898, + "grad_norm": 0.3704746961593628, + "learning_rate": 1.0637612416736667e-05, + "loss": 0.1097, + "step": 40665 + }, + { + "epoch": 0.7253237256091035, + "grad_norm": 0.28447043895721436, + "learning_rate": 1.0636338431070433e-05, + "loss": 0.1298, + "step": 40666 + }, + { + "epoch": 0.7253415617308172, + "grad_norm": 0.27934443950653076, + "learning_rate": 1.0635064501082707e-05, + "loss": 0.1568, + "step": 40667 + }, + { + "epoch": 0.7253593978525309, + "grad_norm": 0.3355010449886322, + "learning_rate": 1.0633790626778437e-05, + "loss": 0.13, + "step": 40668 + }, + { + "epoch": 0.7253772339742447, + "grad_norm": 0.2632060945034027, + "learning_rate": 1.0632516808162557e-05, + "loss": 0.1216, + "step": 40669 + }, + { + "epoch": 0.7253950700959584, + "grad_norm": 0.2881837487220764, + "learning_rate": 1.0631243045240008e-05, + "loss": 0.1411, + "step": 40670 + }, + { + "epoch": 0.7254129062176721, + "grad_norm": 0.2924255430698395, + "learning_rate": 1.0629969338015711e-05, + "loss": 0.1128, + "step": 40671 + }, + { + "epoch": 0.7254307423393858, + "grad_norm": 0.26867061853408813, + "learning_rate": 1.0628695686494627e-05, + "loss": 0.1479, + "step": 40672 + }, + { + "epoch": 0.7254485784610994, + "grad_norm": 0.2628811001777649, + "learning_rate": 1.0627422090681683e-05, + "loss": 0.1058, + "step": 40673 + }, + { + "epoch": 0.7254664145828131, + "grad_norm": 0.3317461609840393, + "learning_rate": 1.0626148550581813e-05, + "loss": 0.1674, + "step": 40674 + }, + { + "epoch": 0.7254842507045268, + "grad_norm": 0.25950726866722107, + "learning_rate": 1.0624875066199947e-05, + "loss": 0.0797, + "step": 40675 + }, + { + "epoch": 0.7255020868262405, + "grad_norm": 0.2664039134979248, + "learning_rate": 1.0623601637541039e-05, + "loss": 0.1654, + "step": 40676 + }, + { + "epoch": 0.7255199229479542, + "grad_norm": 0.2518742084503174, + "learning_rate": 1.0622328264610018e-05, + "loss": 0.0821, + "step": 40677 + }, + { + "epoch": 0.7255377590696679, + "grad_norm": 0.37814977765083313, + "learning_rate": 1.0621054947411806e-05, + "loss": 0.1256, + "step": 40678 + }, + { + "epoch": 0.7255555951913816, + "grad_norm": 0.2022005021572113, + "learning_rate": 1.0619781685951366e-05, + "loss": 0.0784, + "step": 40679 + }, + { + "epoch": 0.7255734313130953, + "grad_norm": 0.2952006757259369, + "learning_rate": 1.0618508480233607e-05, + "loss": 0.1243, + "step": 40680 + }, + { + "epoch": 0.725591267434809, + "grad_norm": 0.19452205300331116, + "learning_rate": 1.0617235330263484e-05, + "loss": 0.1003, + "step": 40681 + }, + { + "epoch": 0.7256091035565226, + "grad_norm": 0.2793303430080414, + "learning_rate": 1.0615962236045926e-05, + "loss": 0.1871, + "step": 40682 + }, + { + "epoch": 0.7256269396782363, + "grad_norm": 0.22673064470291138, + "learning_rate": 1.0614689197585866e-05, + "loss": 0.0921, + "step": 40683 + }, + { + "epoch": 0.72564477579995, + "grad_norm": 0.23677125573158264, + "learning_rate": 1.061341621488823e-05, + "loss": 0.1017, + "step": 40684 + }, + { + "epoch": 0.7256626119216637, + "grad_norm": 0.17663273215293884, + "learning_rate": 1.0612143287957974e-05, + "loss": 0.0726, + "step": 40685 + }, + { + "epoch": 0.7256804480433775, + "grad_norm": 0.2574593126773834, + "learning_rate": 1.0610870416800017e-05, + "loss": 0.1483, + "step": 40686 + }, + { + "epoch": 0.7256982841650912, + "grad_norm": 0.2850920855998993, + "learning_rate": 1.0609597601419296e-05, + "loss": 0.1657, + "step": 40687 + }, + { + "epoch": 0.7257161202868049, + "grad_norm": 0.2715839743614197, + "learning_rate": 1.0608324841820735e-05, + "loss": 0.1042, + "step": 40688 + }, + { + "epoch": 0.7257339564085186, + "grad_norm": 0.2802274227142334, + "learning_rate": 1.0607052138009288e-05, + "loss": 0.0786, + "step": 40689 + }, + { + "epoch": 0.7257517925302323, + "grad_norm": 0.2860008478164673, + "learning_rate": 1.0605779489989878e-05, + "loss": 0.1347, + "step": 40690 + }, + { + "epoch": 0.7257696286519459, + "grad_norm": 0.3361153304576874, + "learning_rate": 1.060450689776743e-05, + "loss": 0.1278, + "step": 40691 + }, + { + "epoch": 0.7257874647736596, + "grad_norm": 0.2440606951713562, + "learning_rate": 1.0603234361346895e-05, + "loss": 0.1144, + "step": 40692 + }, + { + "epoch": 0.7258053008953733, + "grad_norm": 0.28858011960983276, + "learning_rate": 1.0601961880733186e-05, + "loss": 0.1271, + "step": 40693 + }, + { + "epoch": 0.725823137017087, + "grad_norm": 0.26654544472694397, + "learning_rate": 1.0600689455931254e-05, + "loss": 0.1427, + "step": 40694 + }, + { + "epoch": 0.7258409731388007, + "grad_norm": 0.31179121136665344, + "learning_rate": 1.0599417086946023e-05, + "loss": 0.1155, + "step": 40695 + }, + { + "epoch": 0.7258588092605144, + "grad_norm": 0.24408067762851715, + "learning_rate": 1.0598144773782425e-05, + "loss": 0.1096, + "step": 40696 + }, + { + "epoch": 0.7258766453822281, + "grad_norm": 0.27092304825782776, + "learning_rate": 1.0596872516445385e-05, + "loss": 0.0897, + "step": 40697 + }, + { + "epoch": 0.7258944815039418, + "grad_norm": 0.27914175391197205, + "learning_rate": 1.0595600314939849e-05, + "loss": 0.1244, + "step": 40698 + }, + { + "epoch": 0.7259123176256554, + "grad_norm": 0.3074016571044922, + "learning_rate": 1.0594328169270742e-05, + "loss": 0.0982, + "step": 40699 + }, + { + "epoch": 0.7259301537473691, + "grad_norm": 0.22875364124774933, + "learning_rate": 1.0593056079442995e-05, + "loss": 0.1224, + "step": 40700 + }, + { + "epoch": 0.7259479898690828, + "grad_norm": 0.30135592818260193, + "learning_rate": 1.0591784045461531e-05, + "loss": 0.1081, + "step": 40701 + }, + { + "epoch": 0.7259658259907965, + "grad_norm": 0.3393057584762573, + "learning_rate": 1.0590512067331294e-05, + "loss": 0.1704, + "step": 40702 + }, + { + "epoch": 0.7259836621125103, + "grad_norm": 0.26212579011917114, + "learning_rate": 1.058924014505721e-05, + "loss": 0.1682, + "step": 40703 + }, + { + "epoch": 0.726001498234224, + "grad_norm": 0.25574398040771484, + "learning_rate": 1.0587968278644212e-05, + "loss": 0.1011, + "step": 40704 + }, + { + "epoch": 0.7260193343559377, + "grad_norm": 0.30615395307540894, + "learning_rate": 1.0586696468097212e-05, + "loss": 0.1297, + "step": 40705 + }, + { + "epoch": 0.7260371704776514, + "grad_norm": 0.23416419327259064, + "learning_rate": 1.0585424713421168e-05, + "loss": 0.0811, + "step": 40706 + }, + { + "epoch": 0.7260550065993651, + "grad_norm": 0.2481381595134735, + "learning_rate": 1.0584153014620984e-05, + "loss": 0.11, + "step": 40707 + }, + { + "epoch": 0.7260728427210787, + "grad_norm": 0.2731077969074249, + "learning_rate": 1.0582881371701615e-05, + "loss": 0.1172, + "step": 40708 + }, + { + "epoch": 0.7260906788427924, + "grad_norm": 0.22165630757808685, + "learning_rate": 1.0581609784667976e-05, + "loss": 0.1274, + "step": 40709 + }, + { + "epoch": 0.7261085149645061, + "grad_norm": 0.28758513927459717, + "learning_rate": 1.0580338253524988e-05, + "loss": 0.1464, + "step": 40710 + }, + { + "epoch": 0.7261263510862198, + "grad_norm": 0.1941150724887848, + "learning_rate": 1.0579066778277596e-05, + "loss": 0.1186, + "step": 40711 + }, + { + "epoch": 0.7261441872079335, + "grad_norm": 0.33894768357276917, + "learning_rate": 1.0577795358930725e-05, + "loss": 0.1361, + "step": 40712 + }, + { + "epoch": 0.7261620233296472, + "grad_norm": 0.27559563517570496, + "learning_rate": 1.0576523995489299e-05, + "loss": 0.1109, + "step": 40713 + }, + { + "epoch": 0.7261798594513609, + "grad_norm": 0.27855145931243896, + "learning_rate": 1.057525268795824e-05, + "loss": 0.0911, + "step": 40714 + }, + { + "epoch": 0.7261976955730746, + "grad_norm": 0.3512547016143799, + "learning_rate": 1.0573981436342493e-05, + "loss": 0.0715, + "step": 40715 + }, + { + "epoch": 0.7262155316947883, + "grad_norm": 0.20762325823307037, + "learning_rate": 1.0572710240646973e-05, + "loss": 0.1055, + "step": 40716 + }, + { + "epoch": 0.7262333678165019, + "grad_norm": 0.2569544315338135, + "learning_rate": 1.0571439100876615e-05, + "loss": 0.1005, + "step": 40717 + }, + { + "epoch": 0.7262512039382156, + "grad_norm": 0.3328148424625397, + "learning_rate": 1.0570168017036333e-05, + "loss": 0.1147, + "step": 40718 + }, + { + "epoch": 0.7262690400599293, + "grad_norm": 0.27908775210380554, + "learning_rate": 1.0568896989131071e-05, + "loss": 0.1455, + "step": 40719 + }, + { + "epoch": 0.7262868761816431, + "grad_norm": 0.2935066521167755, + "learning_rate": 1.056762601716574e-05, + "loss": 0.1492, + "step": 40720 + }, + { + "epoch": 0.7263047123033568, + "grad_norm": 0.23233112692832947, + "learning_rate": 1.0566355101145286e-05, + "loss": 0.1116, + "step": 40721 + }, + { + "epoch": 0.7263225484250705, + "grad_norm": 0.22923393547534943, + "learning_rate": 1.0565084241074624e-05, + "loss": 0.0935, + "step": 40722 + }, + { + "epoch": 0.7263403845467842, + "grad_norm": 0.264287531375885, + "learning_rate": 1.0563813436958672e-05, + "loss": 0.0897, + "step": 40723 + }, + { + "epoch": 0.7263582206684979, + "grad_norm": 0.2624832093715668, + "learning_rate": 1.0562542688802374e-05, + "loss": 0.1543, + "step": 40724 + }, + { + "epoch": 0.7263760567902116, + "grad_norm": 0.5052750110626221, + "learning_rate": 1.0561271996610647e-05, + "loss": 0.1483, + "step": 40725 + }, + { + "epoch": 0.7263938929119252, + "grad_norm": 0.21963505446910858, + "learning_rate": 1.0560001360388416e-05, + "loss": 0.1085, + "step": 40726 + }, + { + "epoch": 0.7264117290336389, + "grad_norm": 0.43002766370773315, + "learning_rate": 1.0558730780140599e-05, + "loss": 0.1681, + "step": 40727 + }, + { + "epoch": 0.7264295651553526, + "grad_norm": 0.3028855621814728, + "learning_rate": 1.055746025587214e-05, + "loss": 0.1142, + "step": 40728 + }, + { + "epoch": 0.7264474012770663, + "grad_norm": 0.25368496775627136, + "learning_rate": 1.055618978758795e-05, + "loss": 0.113, + "step": 40729 + }, + { + "epoch": 0.72646523739878, + "grad_norm": 0.3520154356956482, + "learning_rate": 1.055491937529296e-05, + "loss": 0.1274, + "step": 40730 + }, + { + "epoch": 0.7264830735204937, + "grad_norm": 0.25041234493255615, + "learning_rate": 1.055364901899209e-05, + "loss": 0.1123, + "step": 40731 + }, + { + "epoch": 0.7265009096422074, + "grad_norm": 0.2312353104352951, + "learning_rate": 1.0552378718690257e-05, + "loss": 0.1384, + "step": 40732 + }, + { + "epoch": 0.7265187457639211, + "grad_norm": 0.3082735538482666, + "learning_rate": 1.0551108474392405e-05, + "loss": 0.0955, + "step": 40733 + }, + { + "epoch": 0.7265365818856347, + "grad_norm": 0.286031037569046, + "learning_rate": 1.0549838286103436e-05, + "loss": 0.1003, + "step": 40734 + }, + { + "epoch": 0.7265544180073484, + "grad_norm": 0.19576328992843628, + "learning_rate": 1.0548568153828292e-05, + "loss": 0.1212, + "step": 40735 + }, + { + "epoch": 0.7265722541290621, + "grad_norm": 0.38110560178756714, + "learning_rate": 1.0547298077571885e-05, + "loss": 0.1363, + "step": 40736 + }, + { + "epoch": 0.7265900902507759, + "grad_norm": 0.3293739855289459, + "learning_rate": 1.0546028057339147e-05, + "loss": 0.1305, + "step": 40737 + }, + { + "epoch": 0.7266079263724896, + "grad_norm": 0.3222125768661499, + "learning_rate": 1.0544758093134999e-05, + "loss": 0.1791, + "step": 40738 + }, + { + "epoch": 0.7266257624942033, + "grad_norm": 0.3354884088039398, + "learning_rate": 1.0543488184964363e-05, + "loss": 0.1107, + "step": 40739 + }, + { + "epoch": 0.726643598615917, + "grad_norm": 0.32864633202552795, + "learning_rate": 1.0542218332832147e-05, + "loss": 0.1082, + "step": 40740 + }, + { + "epoch": 0.7266614347376307, + "grad_norm": 0.2713450491428375, + "learning_rate": 1.0540948536743297e-05, + "loss": 0.1111, + "step": 40741 + }, + { + "epoch": 0.7266792708593444, + "grad_norm": 0.31537148356437683, + "learning_rate": 1.0539678796702724e-05, + "loss": 0.1238, + "step": 40742 + }, + { + "epoch": 0.726697106981058, + "grad_norm": 0.24227231740951538, + "learning_rate": 1.053840911271535e-05, + "loss": 0.1306, + "step": 40743 + }, + { + "epoch": 0.7267149431027717, + "grad_norm": 0.27465173602104187, + "learning_rate": 1.0537139484786098e-05, + "loss": 0.1109, + "step": 40744 + }, + { + "epoch": 0.7267327792244854, + "grad_norm": 0.31403836607933044, + "learning_rate": 1.0535869912919882e-05, + "loss": 0.1262, + "step": 40745 + }, + { + "epoch": 0.7267506153461991, + "grad_norm": 0.2626326382160187, + "learning_rate": 1.0534600397121635e-05, + "loss": 0.0917, + "step": 40746 + }, + { + "epoch": 0.7267684514679128, + "grad_norm": 0.3095690608024597, + "learning_rate": 1.0533330937396265e-05, + "loss": 0.1595, + "step": 40747 + }, + { + "epoch": 0.7267862875896265, + "grad_norm": 0.3072819411754608, + "learning_rate": 1.0532061533748713e-05, + "loss": 0.106, + "step": 40748 + }, + { + "epoch": 0.7268041237113402, + "grad_norm": 0.2610625922679901, + "learning_rate": 1.0530792186183875e-05, + "loss": 0.1397, + "step": 40749 + }, + { + "epoch": 0.7268219598330539, + "grad_norm": 0.24871870875358582, + "learning_rate": 1.0529522894706696e-05, + "loss": 0.0998, + "step": 40750 + }, + { + "epoch": 0.7268397959547676, + "grad_norm": 0.28653693199157715, + "learning_rate": 1.0528253659322083e-05, + "loss": 0.156, + "step": 40751 + }, + { + "epoch": 0.7268576320764812, + "grad_norm": 0.25087928771972656, + "learning_rate": 1.0526984480034954e-05, + "loss": 0.1453, + "step": 40752 + }, + { + "epoch": 0.7268754681981949, + "grad_norm": 0.3090744912624359, + "learning_rate": 1.0525715356850226e-05, + "loss": 0.0843, + "step": 40753 + }, + { + "epoch": 0.7268933043199087, + "grad_norm": 0.2786547839641571, + "learning_rate": 1.0524446289772832e-05, + "loss": 0.0893, + "step": 40754 + }, + { + "epoch": 0.7269111404416224, + "grad_norm": 0.29326432943344116, + "learning_rate": 1.0523177278807686e-05, + "loss": 0.0916, + "step": 40755 + }, + { + "epoch": 0.7269289765633361, + "grad_norm": 0.3040871322154999, + "learning_rate": 1.0521908323959703e-05, + "loss": 0.1159, + "step": 40756 + }, + { + "epoch": 0.7269468126850498, + "grad_norm": 0.24024918675422668, + "learning_rate": 1.0520639425233805e-05, + "loss": 0.121, + "step": 40757 + }, + { + "epoch": 0.7269646488067635, + "grad_norm": 0.21338613331317902, + "learning_rate": 1.0519370582634902e-05, + "loss": 0.0546, + "step": 40758 + }, + { + "epoch": 0.7269824849284772, + "grad_norm": 0.2051226645708084, + "learning_rate": 1.0518101796167929e-05, + "loss": 0.1154, + "step": 40759 + }, + { + "epoch": 0.7270003210501909, + "grad_norm": 0.2698860168457031, + "learning_rate": 1.0516833065837792e-05, + "loss": 0.1341, + "step": 40760 + }, + { + "epoch": 0.7270181571719045, + "grad_norm": 0.2239181101322174, + "learning_rate": 1.0515564391649405e-05, + "loss": 0.0772, + "step": 40761 + }, + { + "epoch": 0.7270359932936182, + "grad_norm": 0.26052507758140564, + "learning_rate": 1.0514295773607693e-05, + "loss": 0.1295, + "step": 40762 + }, + { + "epoch": 0.7270538294153319, + "grad_norm": 0.3422616720199585, + "learning_rate": 1.0513027211717582e-05, + "loss": 0.1196, + "step": 40763 + }, + { + "epoch": 0.7270716655370456, + "grad_norm": 0.21924665570259094, + "learning_rate": 1.0511758705983984e-05, + "loss": 0.1107, + "step": 40764 + }, + { + "epoch": 0.7270895016587593, + "grad_norm": 0.23988090455532074, + "learning_rate": 1.0510490256411811e-05, + "loss": 0.0641, + "step": 40765 + }, + { + "epoch": 0.727107337780473, + "grad_norm": 0.33719107508659363, + "learning_rate": 1.0509221863005974e-05, + "loss": 0.1443, + "step": 40766 + }, + { + "epoch": 0.7271251739021867, + "grad_norm": 0.2508661150932312, + "learning_rate": 1.0507953525771407e-05, + "loss": 0.0873, + "step": 40767 + }, + { + "epoch": 0.7271430100239004, + "grad_norm": 0.2774602472782135, + "learning_rate": 1.0506685244713019e-05, + "loss": 0.1042, + "step": 40768 + }, + { + "epoch": 0.727160846145614, + "grad_norm": 0.18078641593456268, + "learning_rate": 1.0505417019835722e-05, + "loss": 0.0813, + "step": 40769 + }, + { + "epoch": 0.7271786822673278, + "grad_norm": 0.2852545380592346, + "learning_rate": 1.0504148851144438e-05, + "loss": 0.1705, + "step": 40770 + }, + { + "epoch": 0.7271965183890415, + "grad_norm": 0.24146457016468048, + "learning_rate": 1.050288073864407e-05, + "loss": 0.0811, + "step": 40771 + }, + { + "epoch": 0.7272143545107552, + "grad_norm": 0.2205086201429367, + "learning_rate": 1.0501612682339551e-05, + "loss": 0.1028, + "step": 40772 + }, + { + "epoch": 0.7272321906324689, + "grad_norm": 0.2953667938709259, + "learning_rate": 1.0500344682235788e-05, + "loss": 0.1413, + "step": 40773 + }, + { + "epoch": 0.7272500267541826, + "grad_norm": 0.2932448387145996, + "learning_rate": 1.049907673833769e-05, + "loss": 0.1228, + "step": 40774 + }, + { + "epoch": 0.7272678628758963, + "grad_norm": 0.2763751745223999, + "learning_rate": 1.0497808850650188e-05, + "loss": 0.1222, + "step": 40775 + }, + { + "epoch": 0.72728569899761, + "grad_norm": 0.3213701844215393, + "learning_rate": 1.0496541019178177e-05, + "loss": 0.1331, + "step": 40776 + }, + { + "epoch": 0.7273035351193237, + "grad_norm": 0.3039744198322296, + "learning_rate": 1.0495273243926592e-05, + "loss": 0.1382, + "step": 40777 + }, + { + "epoch": 0.7273213712410374, + "grad_norm": 0.2129533588886261, + "learning_rate": 1.0494005524900338e-05, + "loss": 0.1229, + "step": 40778 + }, + { + "epoch": 0.727339207362751, + "grad_norm": 0.2732669413089752, + "learning_rate": 1.0492737862104321e-05, + "loss": 0.1163, + "step": 40779 + }, + { + "epoch": 0.7273570434844647, + "grad_norm": 0.2482658326625824, + "learning_rate": 1.049147025554347e-05, + "loss": 0.1309, + "step": 40780 + }, + { + "epoch": 0.7273748796061784, + "grad_norm": 0.2273804396390915, + "learning_rate": 1.049020270522269e-05, + "loss": 0.1273, + "step": 40781 + }, + { + "epoch": 0.7273927157278921, + "grad_norm": 0.2663029134273529, + "learning_rate": 1.04889352111469e-05, + "loss": 0.1417, + "step": 40782 + }, + { + "epoch": 0.7274105518496058, + "grad_norm": 0.2137305587530136, + "learning_rate": 1.0487667773321005e-05, + "loss": 0.0898, + "step": 40783 + }, + { + "epoch": 0.7274283879713195, + "grad_norm": 0.27498704195022583, + "learning_rate": 1.0486400391749918e-05, + "loss": 0.1329, + "step": 40784 + }, + { + "epoch": 0.7274462240930332, + "grad_norm": 0.3028446137905121, + "learning_rate": 1.048513306643856e-05, + "loss": 0.1083, + "step": 40785 + }, + { + "epoch": 0.7274640602147469, + "grad_norm": 0.29380154609680176, + "learning_rate": 1.0483865797391842e-05, + "loss": 0.1379, + "step": 40786 + }, + { + "epoch": 0.7274818963364607, + "grad_norm": 0.27396297454833984, + "learning_rate": 1.0482598584614673e-05, + "loss": 0.0995, + "step": 40787 + }, + { + "epoch": 0.7274997324581743, + "grad_norm": 0.314533531665802, + "learning_rate": 1.0481331428111957e-05, + "loss": 0.1104, + "step": 40788 + }, + { + "epoch": 0.727517568579888, + "grad_norm": 0.2690560221672058, + "learning_rate": 1.0480064327888617e-05, + "loss": 0.0946, + "step": 40789 + }, + { + "epoch": 0.7275354047016017, + "grad_norm": 0.29971808195114136, + "learning_rate": 1.0478797283949568e-05, + "loss": 0.1952, + "step": 40790 + }, + { + "epoch": 0.7275532408233154, + "grad_norm": 0.304678738117218, + "learning_rate": 1.047753029629972e-05, + "loss": 0.1086, + "step": 40791 + }, + { + "epoch": 0.7275710769450291, + "grad_norm": 0.27599942684173584, + "learning_rate": 1.047626336494397e-05, + "loss": 0.1004, + "step": 40792 + }, + { + "epoch": 0.7275889130667428, + "grad_norm": 0.24337460100650787, + "learning_rate": 1.0474996489887246e-05, + "loss": 0.1079, + "step": 40793 + }, + { + "epoch": 0.7276067491884565, + "grad_norm": 0.2940039336681366, + "learning_rate": 1.0473729671134455e-05, + "loss": 0.1528, + "step": 40794 + }, + { + "epoch": 0.7276245853101702, + "grad_norm": 0.26683154702186584, + "learning_rate": 1.0472462908690506e-05, + "loss": 0.1346, + "step": 40795 + }, + { + "epoch": 0.7276424214318838, + "grad_norm": 0.3066383898258209, + "learning_rate": 1.0471196202560307e-05, + "loss": 0.1323, + "step": 40796 + }, + { + "epoch": 0.7276602575535975, + "grad_norm": 0.2884158790111542, + "learning_rate": 1.0469929552748758e-05, + "loss": 0.1404, + "step": 40797 + }, + { + "epoch": 0.7276780936753112, + "grad_norm": 0.3149704337120056, + "learning_rate": 1.0468662959260794e-05, + "loss": 0.1565, + "step": 40798 + }, + { + "epoch": 0.7276959297970249, + "grad_norm": 0.38092759251594543, + "learning_rate": 1.0467396422101308e-05, + "loss": 0.1892, + "step": 40799 + }, + { + "epoch": 0.7277137659187386, + "grad_norm": 0.24065916240215302, + "learning_rate": 1.0466129941275216e-05, + "loss": 0.1419, + "step": 40800 + }, + { + "epoch": 0.7277316020404523, + "grad_norm": 0.3000841736793518, + "learning_rate": 1.0464863516787413e-05, + "loss": 0.1303, + "step": 40801 + }, + { + "epoch": 0.727749438162166, + "grad_norm": 0.2805483639240265, + "learning_rate": 1.0463597148642828e-05, + "loss": 0.1247, + "step": 40802 + }, + { + "epoch": 0.7277672742838797, + "grad_norm": 0.24328021705150604, + "learning_rate": 1.0462330836846355e-05, + "loss": 0.1913, + "step": 40803 + }, + { + "epoch": 0.7277851104055935, + "grad_norm": 0.1977032572031021, + "learning_rate": 1.0461064581402916e-05, + "loss": 0.0774, + "step": 40804 + }, + { + "epoch": 0.7278029465273071, + "grad_norm": 0.2659285068511963, + "learning_rate": 1.0459798382317402e-05, + "loss": 0.0917, + "step": 40805 + }, + { + "epoch": 0.7278207826490208, + "grad_norm": 0.3462757170200348, + "learning_rate": 1.0458532239594742e-05, + "loss": 0.1178, + "step": 40806 + }, + { + "epoch": 0.7278386187707345, + "grad_norm": 0.259512722492218, + "learning_rate": 1.0457266153239833e-05, + "loss": 0.1129, + "step": 40807 + }, + { + "epoch": 0.7278564548924482, + "grad_norm": 0.3088579773902893, + "learning_rate": 1.0456000123257584e-05, + "loss": 0.1272, + "step": 40808 + }, + { + "epoch": 0.7278742910141619, + "grad_norm": 0.2899314761161804, + "learning_rate": 1.04547341496529e-05, + "loss": 0.1091, + "step": 40809 + }, + { + "epoch": 0.7278921271358756, + "grad_norm": 0.21103590726852417, + "learning_rate": 1.0453468232430682e-05, + "loss": 0.1318, + "step": 40810 + }, + { + "epoch": 0.7279099632575893, + "grad_norm": 0.3091026544570923, + "learning_rate": 1.0452202371595856e-05, + "loss": 0.1021, + "step": 40811 + }, + { + "epoch": 0.727927799379303, + "grad_norm": 0.2287903130054474, + "learning_rate": 1.0450936567153316e-05, + "loss": 0.1329, + "step": 40812 + }, + { + "epoch": 0.7279456355010167, + "grad_norm": 0.18485434353351593, + "learning_rate": 1.0449670819107974e-05, + "loss": 0.0547, + "step": 40813 + }, + { + "epoch": 0.7279634716227303, + "grad_norm": 0.26258590817451477, + "learning_rate": 1.0448405127464722e-05, + "loss": 0.1122, + "step": 40814 + }, + { + "epoch": 0.727981307744444, + "grad_norm": 0.2268001139163971, + "learning_rate": 1.0447139492228486e-05, + "loss": 0.1335, + "step": 40815 + }, + { + "epoch": 0.7279991438661577, + "grad_norm": 0.2697557806968689, + "learning_rate": 1.0445873913404156e-05, + "loss": 0.103, + "step": 40816 + }, + { + "epoch": 0.7280169799878714, + "grad_norm": 0.22228126227855682, + "learning_rate": 1.0444608390996655e-05, + "loss": 0.0726, + "step": 40817 + }, + { + "epoch": 0.7280348161095851, + "grad_norm": 0.2938465476036072, + "learning_rate": 1.0443342925010868e-05, + "loss": 0.0857, + "step": 40818 + }, + { + "epoch": 0.7280526522312988, + "grad_norm": 0.2011396884918213, + "learning_rate": 1.0442077515451724e-05, + "loss": 0.0534, + "step": 40819 + }, + { + "epoch": 0.7280704883530125, + "grad_norm": 0.27762192487716675, + "learning_rate": 1.0440812162324113e-05, + "loss": 0.1058, + "step": 40820 + }, + { + "epoch": 0.7280883244747263, + "grad_norm": 0.25490036606788635, + "learning_rate": 1.0439546865632943e-05, + "loss": 0.0927, + "step": 40821 + }, + { + "epoch": 0.72810616059644, + "grad_norm": 0.36661574244499207, + "learning_rate": 1.043828162538312e-05, + "loss": 0.1161, + "step": 40822 + }, + { + "epoch": 0.7281239967181536, + "grad_norm": 0.24220754206180573, + "learning_rate": 1.0437016441579537e-05, + "loss": 0.1752, + "step": 40823 + }, + { + "epoch": 0.7281418328398673, + "grad_norm": 0.28342440724372864, + "learning_rate": 1.0435751314227116e-05, + "loss": 0.106, + "step": 40824 + }, + { + "epoch": 0.728159668961581, + "grad_norm": 0.26885533332824707, + "learning_rate": 1.0434486243330754e-05, + "loss": 0.1524, + "step": 40825 + }, + { + "epoch": 0.7281775050832947, + "grad_norm": 0.24018530547618866, + "learning_rate": 1.0433221228895354e-05, + "loss": 0.1057, + "step": 40826 + }, + { + "epoch": 0.7281953412050084, + "grad_norm": 0.20767271518707275, + "learning_rate": 1.0431956270925811e-05, + "loss": 0.1448, + "step": 40827 + }, + { + "epoch": 0.7282131773267221, + "grad_norm": 0.3597570061683655, + "learning_rate": 1.0430691369427045e-05, + "loss": 0.1453, + "step": 40828 + }, + { + "epoch": 0.7282310134484358, + "grad_norm": 0.23094920814037323, + "learning_rate": 1.0429426524403954e-05, + "loss": 0.0905, + "step": 40829 + }, + { + "epoch": 0.7282488495701495, + "grad_norm": 0.29736557602882385, + "learning_rate": 1.0428161735861428e-05, + "loss": 0.1724, + "step": 40830 + }, + { + "epoch": 0.7282666856918631, + "grad_norm": 0.2172999233007431, + "learning_rate": 1.0426897003804386e-05, + "loss": 0.1056, + "step": 40831 + }, + { + "epoch": 0.7282845218135768, + "grad_norm": 0.2563237249851227, + "learning_rate": 1.042563232823772e-05, + "loss": 0.1209, + "step": 40832 + }, + { + "epoch": 0.7283023579352905, + "grad_norm": 0.30967509746551514, + "learning_rate": 1.0424367709166344e-05, + "loss": 0.1121, + "step": 40833 + }, + { + "epoch": 0.7283201940570042, + "grad_norm": 0.2564481496810913, + "learning_rate": 1.0423103146595154e-05, + "loss": 0.0843, + "step": 40834 + }, + { + "epoch": 0.7283380301787179, + "grad_norm": 0.3186921775341034, + "learning_rate": 1.042183864052905e-05, + "loss": 0.1328, + "step": 40835 + }, + { + "epoch": 0.7283558663004316, + "grad_norm": 0.31604063510894775, + "learning_rate": 1.0420574190972926e-05, + "loss": 0.1429, + "step": 40836 + }, + { + "epoch": 0.7283737024221453, + "grad_norm": 0.25721925497055054, + "learning_rate": 1.0419309797931701e-05, + "loss": 0.0837, + "step": 40837 + }, + { + "epoch": 0.7283915385438591, + "grad_norm": 0.23972263932228088, + "learning_rate": 1.0418045461410263e-05, + "loss": 0.136, + "step": 40838 + }, + { + "epoch": 0.7284093746655728, + "grad_norm": 0.2746901214122772, + "learning_rate": 1.0416781181413523e-05, + "loss": 0.1381, + "step": 40839 + }, + { + "epoch": 0.7284272107872864, + "grad_norm": 0.29209262132644653, + "learning_rate": 1.0415516957946366e-05, + "loss": 0.1472, + "step": 40840 + }, + { + "epoch": 0.7284450469090001, + "grad_norm": 0.23592689633369446, + "learning_rate": 1.041425279101371e-05, + "loss": 0.0621, + "step": 40841 + }, + { + "epoch": 0.7284628830307138, + "grad_norm": 0.34710782766342163, + "learning_rate": 1.041298868062045e-05, + "loss": 0.1461, + "step": 40842 + }, + { + "epoch": 0.7284807191524275, + "grad_norm": 0.35806378722190857, + "learning_rate": 1.0411724626771482e-05, + "loss": 0.188, + "step": 40843 + }, + { + "epoch": 0.7284985552741412, + "grad_norm": 0.22632494568824768, + "learning_rate": 1.04104606294717e-05, + "loss": 0.0753, + "step": 40844 + }, + { + "epoch": 0.7285163913958549, + "grad_norm": 0.2383161336183548, + "learning_rate": 1.040919668872601e-05, + "loss": 0.1159, + "step": 40845 + }, + { + "epoch": 0.7285342275175686, + "grad_norm": 0.3089994788169861, + "learning_rate": 1.0407932804539325e-05, + "loss": 0.1064, + "step": 40846 + }, + { + "epoch": 0.7285520636392823, + "grad_norm": 0.27490413188934326, + "learning_rate": 1.0406668976916531e-05, + "loss": 0.1006, + "step": 40847 + }, + { + "epoch": 0.728569899760996, + "grad_norm": 0.31490302085876465, + "learning_rate": 1.0405405205862529e-05, + "loss": 0.1508, + "step": 40848 + }, + { + "epoch": 0.7285877358827096, + "grad_norm": 0.2667716443538666, + "learning_rate": 1.0404141491382208e-05, + "loss": 0.1151, + "step": 40849 + }, + { + "epoch": 0.7286055720044233, + "grad_norm": 0.20112928748130798, + "learning_rate": 1.0402877833480485e-05, + "loss": 0.1066, + "step": 40850 + }, + { + "epoch": 0.728623408126137, + "grad_norm": 0.28737518191337585, + "learning_rate": 1.0401614232162246e-05, + "loss": 0.1369, + "step": 40851 + }, + { + "epoch": 0.7286412442478507, + "grad_norm": 0.3094600439071655, + "learning_rate": 1.0400350687432395e-05, + "loss": 0.169, + "step": 40852 + }, + { + "epoch": 0.7286590803695644, + "grad_norm": 0.21896794438362122, + "learning_rate": 1.0399087199295819e-05, + "loss": 0.1649, + "step": 40853 + }, + { + "epoch": 0.7286769164912781, + "grad_norm": 0.27871015667915344, + "learning_rate": 1.0397823767757433e-05, + "loss": 0.1124, + "step": 40854 + }, + { + "epoch": 0.7286947526129919, + "grad_norm": 0.22240017354488373, + "learning_rate": 1.0396560392822124e-05, + "loss": 0.0806, + "step": 40855 + }, + { + "epoch": 0.7287125887347056, + "grad_norm": 0.43458759784698486, + "learning_rate": 1.0395297074494791e-05, + "loss": 0.1663, + "step": 40856 + }, + { + "epoch": 0.7287304248564193, + "grad_norm": 0.20958521962165833, + "learning_rate": 1.0394033812780323e-05, + "loss": 0.0499, + "step": 40857 + }, + { + "epoch": 0.728748260978133, + "grad_norm": 0.3532405495643616, + "learning_rate": 1.0392770607683633e-05, + "loss": 0.1398, + "step": 40858 + }, + { + "epoch": 0.7287660970998466, + "grad_norm": 0.2999124228954315, + "learning_rate": 1.0391507459209601e-05, + "loss": 0.1586, + "step": 40859 + }, + { + "epoch": 0.7287839332215603, + "grad_norm": 0.22298569977283478, + "learning_rate": 1.0390244367363139e-05, + "loss": 0.0801, + "step": 40860 + }, + { + "epoch": 0.728801769343274, + "grad_norm": 0.21821489930152893, + "learning_rate": 1.0388981332149138e-05, + "loss": 0.1066, + "step": 40861 + }, + { + "epoch": 0.7288196054649877, + "grad_norm": 0.2001788467168808, + "learning_rate": 1.0387718353572482e-05, + "loss": 0.07, + "step": 40862 + }, + { + "epoch": 0.7288374415867014, + "grad_norm": 0.2745591104030609, + "learning_rate": 1.0386455431638085e-05, + "loss": 0.1416, + "step": 40863 + }, + { + "epoch": 0.7288552777084151, + "grad_norm": 0.32037249207496643, + "learning_rate": 1.0385192566350835e-05, + "loss": 0.1115, + "step": 40864 + }, + { + "epoch": 0.7288731138301288, + "grad_norm": 0.2541063725948334, + "learning_rate": 1.0383929757715623e-05, + "loss": 0.0856, + "step": 40865 + }, + { + "epoch": 0.7288909499518424, + "grad_norm": 0.3164190649986267, + "learning_rate": 1.0382667005737343e-05, + "loss": 0.1198, + "step": 40866 + }, + { + "epoch": 0.7289087860735561, + "grad_norm": 0.23438000679016113, + "learning_rate": 1.0381404310420903e-05, + "loss": 0.1285, + "step": 40867 + }, + { + "epoch": 0.7289266221952698, + "grad_norm": 0.2584449350833893, + "learning_rate": 1.0380141671771188e-05, + "loss": 0.1283, + "step": 40868 + }, + { + "epoch": 0.7289444583169835, + "grad_norm": 0.27657485008239746, + "learning_rate": 1.0378879089793093e-05, + "loss": 0.1061, + "step": 40869 + }, + { + "epoch": 0.7289622944386972, + "grad_norm": 0.2460792362689972, + "learning_rate": 1.0377616564491502e-05, + "loss": 0.0803, + "step": 40870 + }, + { + "epoch": 0.728980130560411, + "grad_norm": 0.2133624404668808, + "learning_rate": 1.0376354095871333e-05, + "loss": 0.126, + "step": 40871 + }, + { + "epoch": 0.7289979666821247, + "grad_norm": 0.229832723736763, + "learning_rate": 1.0375091683937455e-05, + "loss": 0.1056, + "step": 40872 + }, + { + "epoch": 0.7290158028038384, + "grad_norm": 0.3540733754634857, + "learning_rate": 1.037382932869478e-05, + "loss": 0.1215, + "step": 40873 + }, + { + "epoch": 0.7290336389255521, + "grad_norm": 0.35751286149024963, + "learning_rate": 1.0372567030148194e-05, + "loss": 0.0879, + "step": 40874 + }, + { + "epoch": 0.7290514750472658, + "grad_norm": 0.28189775347709656, + "learning_rate": 1.0371304788302586e-05, + "loss": 0.0382, + "step": 40875 + }, + { + "epoch": 0.7290693111689794, + "grad_norm": 0.2872788906097412, + "learning_rate": 1.0370042603162858e-05, + "loss": 0.151, + "step": 40876 + }, + { + "epoch": 0.7290871472906931, + "grad_norm": 0.22581401467323303, + "learning_rate": 1.03687804747339e-05, + "loss": 0.1398, + "step": 40877 + }, + { + "epoch": 0.7291049834124068, + "grad_norm": 0.32426077127456665, + "learning_rate": 1.0367518403020602e-05, + "loss": 0.1541, + "step": 40878 + }, + { + "epoch": 0.7291228195341205, + "grad_norm": 0.2865886390209198, + "learning_rate": 1.0366256388027849e-05, + "loss": 0.109, + "step": 40879 + }, + { + "epoch": 0.7291406556558342, + "grad_norm": 0.20413914322853088, + "learning_rate": 1.0364994429760547e-05, + "loss": 0.0886, + "step": 40880 + }, + { + "epoch": 0.7291584917775479, + "grad_norm": 0.2653484642505646, + "learning_rate": 1.036373252822358e-05, + "loss": 0.091, + "step": 40881 + }, + { + "epoch": 0.7291763278992616, + "grad_norm": 0.3649868369102478, + "learning_rate": 1.0362470683421843e-05, + "loss": 0.1198, + "step": 40882 + }, + { + "epoch": 0.7291941640209753, + "grad_norm": 0.3148685097694397, + "learning_rate": 1.0361208895360216e-05, + "loss": 0.1691, + "step": 40883 + }, + { + "epoch": 0.7292120001426889, + "grad_norm": 0.22477386891841888, + "learning_rate": 1.035994716404361e-05, + "loss": 0.0829, + "step": 40884 + }, + { + "epoch": 0.7292298362644026, + "grad_norm": 0.28982433676719666, + "learning_rate": 1.0358685489476905e-05, + "loss": 0.1606, + "step": 40885 + }, + { + "epoch": 0.7292476723861163, + "grad_norm": 0.25430357456207275, + "learning_rate": 1.0357423871664982e-05, + "loss": 0.1298, + "step": 40886 + }, + { + "epoch": 0.72926550850783, + "grad_norm": 0.2837600111961365, + "learning_rate": 1.0356162310612749e-05, + "loss": 0.1094, + "step": 40887 + }, + { + "epoch": 0.7292833446295438, + "grad_norm": 0.23365193605422974, + "learning_rate": 1.0354900806325082e-05, + "loss": 0.1287, + "step": 40888 + }, + { + "epoch": 0.7293011807512575, + "grad_norm": 0.4007009267807007, + "learning_rate": 1.0353639358806885e-05, + "loss": 0.1036, + "step": 40889 + }, + { + "epoch": 0.7293190168729712, + "grad_norm": 0.22579741477966309, + "learning_rate": 1.0352377968063042e-05, + "loss": 0.1404, + "step": 40890 + }, + { + "epoch": 0.7293368529946849, + "grad_norm": 0.2736065685749054, + "learning_rate": 1.0351116634098443e-05, + "loss": 0.1069, + "step": 40891 + }, + { + "epoch": 0.7293546891163986, + "grad_norm": 0.29125985503196716, + "learning_rate": 1.0349855356917962e-05, + "loss": 0.1204, + "step": 40892 + }, + { + "epoch": 0.7293725252381122, + "grad_norm": 0.458539217710495, + "learning_rate": 1.0348594136526513e-05, + "loss": 0.1399, + "step": 40893 + }, + { + "epoch": 0.7293903613598259, + "grad_norm": 0.26064369082450867, + "learning_rate": 1.0347332972928975e-05, + "loss": 0.0511, + "step": 40894 + }, + { + "epoch": 0.7294081974815396, + "grad_norm": 0.21638596057891846, + "learning_rate": 1.0346071866130233e-05, + "loss": 0.1067, + "step": 40895 + }, + { + "epoch": 0.7294260336032533, + "grad_norm": 0.25604212284088135, + "learning_rate": 1.034481081613517e-05, + "loss": 0.118, + "step": 40896 + }, + { + "epoch": 0.729443869724967, + "grad_norm": 0.19671796262264252, + "learning_rate": 1.034354982294869e-05, + "loss": 0.0783, + "step": 40897 + }, + { + "epoch": 0.7294617058466807, + "grad_norm": 0.3067481815814972, + "learning_rate": 1.0342288886575674e-05, + "loss": 0.1624, + "step": 40898 + }, + { + "epoch": 0.7294795419683944, + "grad_norm": 0.3336236774921417, + "learning_rate": 1.0341028007020998e-05, + "loss": 0.0645, + "step": 40899 + }, + { + "epoch": 0.7294973780901081, + "grad_norm": 0.18999546766281128, + "learning_rate": 1.033976718428957e-05, + "loss": 0.0913, + "step": 40900 + }, + { + "epoch": 0.7295152142118217, + "grad_norm": 0.23737022280693054, + "learning_rate": 1.033850641838626e-05, + "loss": 0.1412, + "step": 40901 + }, + { + "epoch": 0.7295330503335354, + "grad_norm": 0.19350391626358032, + "learning_rate": 1.033724570931597e-05, + "loss": 0.1176, + "step": 40902 + }, + { + "epoch": 0.7295508864552491, + "grad_norm": 0.3428206145763397, + "learning_rate": 1.0335985057083583e-05, + "loss": 0.0995, + "step": 40903 + }, + { + "epoch": 0.7295687225769628, + "grad_norm": 0.23868238925933838, + "learning_rate": 1.0334724461693982e-05, + "loss": 0.1299, + "step": 40904 + }, + { + "epoch": 0.7295865586986766, + "grad_norm": 0.2400166541337967, + "learning_rate": 1.0333463923152045e-05, + "loss": 0.1075, + "step": 40905 + }, + { + "epoch": 0.7296043948203903, + "grad_norm": 0.22942416369915009, + "learning_rate": 1.0332203441462677e-05, + "loss": 0.0926, + "step": 40906 + }, + { + "epoch": 0.729622230942104, + "grad_norm": 0.2808311879634857, + "learning_rate": 1.0330943016630754e-05, + "loss": 0.1169, + "step": 40907 + }, + { + "epoch": 0.7296400670638177, + "grad_norm": 0.2582831382751465, + "learning_rate": 1.032968264866116e-05, + "loss": 0.1333, + "step": 40908 + }, + { + "epoch": 0.7296579031855314, + "grad_norm": 0.2564980387687683, + "learning_rate": 1.0328422337558777e-05, + "loss": 0.0955, + "step": 40909 + }, + { + "epoch": 0.729675739307245, + "grad_norm": 0.31204789876937866, + "learning_rate": 1.0327162083328504e-05, + "loss": 0.1235, + "step": 40910 + }, + { + "epoch": 0.7296935754289587, + "grad_norm": 0.20017872750759125, + "learning_rate": 1.032590188597522e-05, + "loss": 0.0743, + "step": 40911 + }, + { + "epoch": 0.7297114115506724, + "grad_norm": 0.18407808244228363, + "learning_rate": 1.0324641745503807e-05, + "loss": 0.0993, + "step": 40912 + }, + { + "epoch": 0.7297292476723861, + "grad_norm": 0.26999562978744507, + "learning_rate": 1.0323381661919143e-05, + "loss": 0.1504, + "step": 40913 + }, + { + "epoch": 0.7297470837940998, + "grad_norm": 0.25350305438041687, + "learning_rate": 1.0322121635226128e-05, + "loss": 0.1258, + "step": 40914 + }, + { + "epoch": 0.7297649199158135, + "grad_norm": 0.2944730818271637, + "learning_rate": 1.0320861665429635e-05, + "loss": 0.1009, + "step": 40915 + }, + { + "epoch": 0.7297827560375272, + "grad_norm": 0.3151612877845764, + "learning_rate": 1.0319601752534558e-05, + "loss": 0.113, + "step": 40916 + }, + { + "epoch": 0.7298005921592409, + "grad_norm": 0.28373289108276367, + "learning_rate": 1.0318341896545775e-05, + "loss": 0.1113, + "step": 40917 + }, + { + "epoch": 0.7298184282809546, + "grad_norm": 0.21229694783687592, + "learning_rate": 1.0317082097468158e-05, + "loss": 0.0744, + "step": 40918 + }, + { + "epoch": 0.7298362644026682, + "grad_norm": 0.33448320627212524, + "learning_rate": 1.0315822355306615e-05, + "loss": 0.107, + "step": 40919 + }, + { + "epoch": 0.7298541005243819, + "grad_norm": 0.24119000136852264, + "learning_rate": 1.0314562670066017e-05, + "loss": 0.1291, + "step": 40920 + }, + { + "epoch": 0.7298719366460956, + "grad_norm": 0.31554439663887024, + "learning_rate": 1.0313303041751243e-05, + "loss": 0.0911, + "step": 40921 + }, + { + "epoch": 0.7298897727678094, + "grad_norm": 0.27390286326408386, + "learning_rate": 1.0312043470367172e-05, + "loss": 0.1041, + "step": 40922 + }, + { + "epoch": 0.7299076088895231, + "grad_norm": 0.3328604996204376, + "learning_rate": 1.0310783955918702e-05, + "loss": 0.1559, + "step": 40923 + }, + { + "epoch": 0.7299254450112368, + "grad_norm": 0.27093109488487244, + "learning_rate": 1.0309524498410707e-05, + "loss": 0.1299, + "step": 40924 + }, + { + "epoch": 0.7299432811329505, + "grad_norm": 0.18972952663898468, + "learning_rate": 1.0308265097848069e-05, + "loss": 0.0853, + "step": 40925 + }, + { + "epoch": 0.7299611172546642, + "grad_norm": 0.2934782803058624, + "learning_rate": 1.030700575423566e-05, + "loss": 0.1114, + "step": 40926 + }, + { + "epoch": 0.7299789533763779, + "grad_norm": 0.33054137229919434, + "learning_rate": 1.0305746467578382e-05, + "loss": 0.1745, + "step": 40927 + }, + { + "epoch": 0.7299967894980915, + "grad_norm": 0.26389992237091064, + "learning_rate": 1.0304487237881098e-05, + "loss": 0.1264, + "step": 40928 + }, + { + "epoch": 0.7300146256198052, + "grad_norm": 0.16462776064872742, + "learning_rate": 1.0303228065148704e-05, + "loss": 0.0866, + "step": 40929 + }, + { + "epoch": 0.7300324617415189, + "grad_norm": 0.29492875933647156, + "learning_rate": 1.0301968949386074e-05, + "loss": 0.1468, + "step": 40930 + }, + { + "epoch": 0.7300502978632326, + "grad_norm": 0.24353215098381042, + "learning_rate": 1.0300709890598079e-05, + "loss": 0.1122, + "step": 40931 + }, + { + "epoch": 0.7300681339849463, + "grad_norm": 0.2887917459011078, + "learning_rate": 1.0299450888789618e-05, + "loss": 0.1339, + "step": 40932 + }, + { + "epoch": 0.73008597010666, + "grad_norm": 0.2835650146007538, + "learning_rate": 1.0298191943965566e-05, + "loss": 0.0888, + "step": 40933 + }, + { + "epoch": 0.7301038062283737, + "grad_norm": 0.36773163080215454, + "learning_rate": 1.02969330561308e-05, + "loss": 0.1469, + "step": 40934 + }, + { + "epoch": 0.7301216423500874, + "grad_norm": 0.20941372215747833, + "learning_rate": 1.0295674225290189e-05, + "loss": 0.0865, + "step": 40935 + }, + { + "epoch": 0.730139478471801, + "grad_norm": 0.2645230293273926, + "learning_rate": 1.0294415451448632e-05, + "loss": 0.0798, + "step": 40936 + }, + { + "epoch": 0.7301573145935147, + "grad_norm": 0.3101343810558319, + "learning_rate": 1.0293156734611e-05, + "loss": 0.0962, + "step": 40937 + }, + { + "epoch": 0.7301751507152284, + "grad_norm": 0.4218254089355469, + "learning_rate": 1.0291898074782172e-05, + "loss": 0.1678, + "step": 40938 + }, + { + "epoch": 0.7301929868369422, + "grad_norm": 0.23557502031326294, + "learning_rate": 1.029063947196702e-05, + "loss": 0.0933, + "step": 40939 + }, + { + "epoch": 0.7302108229586559, + "grad_norm": 0.21256248652935028, + "learning_rate": 1.0289380926170436e-05, + "loss": 0.0951, + "step": 40940 + }, + { + "epoch": 0.7302286590803696, + "grad_norm": 0.27800583839416504, + "learning_rate": 1.0288122437397296e-05, + "loss": 0.1106, + "step": 40941 + }, + { + "epoch": 0.7302464952020833, + "grad_norm": 0.31588152050971985, + "learning_rate": 1.0286864005652464e-05, + "loss": 0.145, + "step": 40942 + }, + { + "epoch": 0.730264331323797, + "grad_norm": 0.3558647632598877, + "learning_rate": 1.0285605630940836e-05, + "loss": 0.1338, + "step": 40943 + }, + { + "epoch": 0.7302821674455107, + "grad_norm": 0.32653316855430603, + "learning_rate": 1.0284347313267276e-05, + "loss": 0.1439, + "step": 40944 + }, + { + "epoch": 0.7303000035672244, + "grad_norm": 0.2534255385398865, + "learning_rate": 1.0283089052636677e-05, + "loss": 0.0873, + "step": 40945 + }, + { + "epoch": 0.730317839688938, + "grad_norm": 0.21963676810264587, + "learning_rate": 1.0281830849053906e-05, + "loss": 0.1158, + "step": 40946 + }, + { + "epoch": 0.7303356758106517, + "grad_norm": 0.2415035218000412, + "learning_rate": 1.0280572702523844e-05, + "loss": 0.0955, + "step": 40947 + }, + { + "epoch": 0.7303535119323654, + "grad_norm": 0.31052109599113464, + "learning_rate": 1.0279314613051358e-05, + "loss": 0.1055, + "step": 40948 + }, + { + "epoch": 0.7303713480540791, + "grad_norm": 0.2785680294036865, + "learning_rate": 1.027805658064134e-05, + "loss": 0.1033, + "step": 40949 + }, + { + "epoch": 0.7303891841757928, + "grad_norm": 0.24477392435073853, + "learning_rate": 1.0276798605298659e-05, + "loss": 0.1073, + "step": 40950 + }, + { + "epoch": 0.7304070202975065, + "grad_norm": 0.4114842414855957, + "learning_rate": 1.0275540687028196e-05, + "loss": 0.1648, + "step": 40951 + }, + { + "epoch": 0.7304248564192202, + "grad_norm": 0.2071211189031601, + "learning_rate": 1.0274282825834811e-05, + "loss": 0.1072, + "step": 40952 + }, + { + "epoch": 0.7304426925409339, + "grad_norm": 0.20276851952075958, + "learning_rate": 1.0273025021723398e-05, + "loss": 0.0716, + "step": 40953 + }, + { + "epoch": 0.7304605286626475, + "grad_norm": 0.31820905208587646, + "learning_rate": 1.0271767274698831e-05, + "loss": 0.1527, + "step": 40954 + }, + { + "epoch": 0.7304783647843612, + "grad_norm": 0.3157261312007904, + "learning_rate": 1.0270509584765969e-05, + "loss": 0.1635, + "step": 40955 + }, + { + "epoch": 0.730496200906075, + "grad_norm": 0.2503087818622589, + "learning_rate": 1.0269251951929712e-05, + "loss": 0.0973, + "step": 40956 + }, + { + "epoch": 0.7305140370277887, + "grad_norm": 0.3021951913833618, + "learning_rate": 1.0267994376194909e-05, + "loss": 0.1577, + "step": 40957 + }, + { + "epoch": 0.7305318731495024, + "grad_norm": 0.2571439743041992, + "learning_rate": 1.026673685756646e-05, + "loss": 0.1029, + "step": 40958 + }, + { + "epoch": 0.7305497092712161, + "grad_norm": 0.22617734968662262, + "learning_rate": 1.0265479396049227e-05, + "loss": 0.1126, + "step": 40959 + }, + { + "epoch": 0.7305675453929298, + "grad_norm": 0.2502675950527191, + "learning_rate": 1.0264221991648085e-05, + "loss": 0.1002, + "step": 40960 + }, + { + "epoch": 0.7305853815146435, + "grad_norm": 0.27962726354599, + "learning_rate": 1.0262964644367898e-05, + "loss": 0.1123, + "step": 40961 + }, + { + "epoch": 0.7306032176363572, + "grad_norm": 0.32183629274368286, + "learning_rate": 1.0261707354213559e-05, + "loss": 0.2326, + "step": 40962 + }, + { + "epoch": 0.7306210537580708, + "grad_norm": 0.2995951771736145, + "learning_rate": 1.0260450121189935e-05, + "loss": 0.1263, + "step": 40963 + }, + { + "epoch": 0.7306388898797845, + "grad_norm": 0.26489582657814026, + "learning_rate": 1.0259192945301896e-05, + "loss": 0.1101, + "step": 40964 + }, + { + "epoch": 0.7306567260014982, + "grad_norm": 0.2925983965396881, + "learning_rate": 1.0257935826554307e-05, + "loss": 0.1158, + "step": 40965 + }, + { + "epoch": 0.7306745621232119, + "grad_norm": 0.2978723645210266, + "learning_rate": 1.025667876495206e-05, + "loss": 0.1565, + "step": 40966 + }, + { + "epoch": 0.7306923982449256, + "grad_norm": 0.2961837351322174, + "learning_rate": 1.0255421760500017e-05, + "loss": 0.1452, + "step": 40967 + }, + { + "epoch": 0.7307102343666393, + "grad_norm": 0.26765406131744385, + "learning_rate": 1.0254164813203055e-05, + "loss": 0.1378, + "step": 40968 + }, + { + "epoch": 0.730728070488353, + "grad_norm": 0.2946990430355072, + "learning_rate": 1.025290792306603e-05, + "loss": 0.1435, + "step": 40969 + }, + { + "epoch": 0.7307459066100667, + "grad_norm": 0.24788260459899902, + "learning_rate": 1.025165109009383e-05, + "loss": 0.0772, + "step": 40970 + }, + { + "epoch": 0.7307637427317804, + "grad_norm": 0.3516397774219513, + "learning_rate": 1.0250394314291334e-05, + "loss": 0.0872, + "step": 40971 + }, + { + "epoch": 0.7307815788534942, + "grad_norm": 0.44780948758125305, + "learning_rate": 1.0249137595663402e-05, + "loss": 0.1377, + "step": 40972 + }, + { + "epoch": 0.7307994149752078, + "grad_norm": 0.38909098505973816, + "learning_rate": 1.024788093421491e-05, + "loss": 0.1216, + "step": 40973 + }, + { + "epoch": 0.7308172510969215, + "grad_norm": 0.2733481526374817, + "learning_rate": 1.0246624329950712e-05, + "loss": 0.1354, + "step": 40974 + }, + { + "epoch": 0.7308350872186352, + "grad_norm": 0.2599705159664154, + "learning_rate": 1.0245367782875709e-05, + "loss": 0.1649, + "step": 40975 + }, + { + "epoch": 0.7308529233403489, + "grad_norm": 0.1773041933774948, + "learning_rate": 1.0244111292994754e-05, + "loss": 0.0812, + "step": 40976 + }, + { + "epoch": 0.7308707594620626, + "grad_norm": 0.29490283131599426, + "learning_rate": 1.024285486031272e-05, + "loss": 0.0811, + "step": 40977 + }, + { + "epoch": 0.7308885955837763, + "grad_norm": 0.328452467918396, + "learning_rate": 1.024159848483447e-05, + "loss": 0.1474, + "step": 40978 + }, + { + "epoch": 0.73090643170549, + "grad_norm": 0.3337807059288025, + "learning_rate": 1.024034216656489e-05, + "loss": 0.086, + "step": 40979 + }, + { + "epoch": 0.7309242678272037, + "grad_norm": 0.27943354845046997, + "learning_rate": 1.0239085905508841e-05, + "loss": 0.126, + "step": 40980 + }, + { + "epoch": 0.7309421039489173, + "grad_norm": 0.2145722359418869, + "learning_rate": 1.0237829701671192e-05, + "loss": 0.0931, + "step": 40981 + }, + { + "epoch": 0.730959940070631, + "grad_norm": 0.24666835367679596, + "learning_rate": 1.0236573555056808e-05, + "loss": 0.1162, + "step": 40982 + }, + { + "epoch": 0.7309777761923447, + "grad_norm": 0.36359351873397827, + "learning_rate": 1.023531746567057e-05, + "loss": 0.1141, + "step": 40983 + }, + { + "epoch": 0.7309956123140584, + "grad_norm": 0.2992902398109436, + "learning_rate": 1.0234061433517333e-05, + "loss": 0.1467, + "step": 40984 + }, + { + "epoch": 0.7310134484357721, + "grad_norm": 0.24065323173999786, + "learning_rate": 1.0232805458601984e-05, + "loss": 0.1072, + "step": 40985 + }, + { + "epoch": 0.7310312845574858, + "grad_norm": 0.2959069609642029, + "learning_rate": 1.023154954092938e-05, + "loss": 0.1256, + "step": 40986 + }, + { + "epoch": 0.7310491206791995, + "grad_norm": 0.3075346350669861, + "learning_rate": 1.0230293680504383e-05, + "loss": 0.1162, + "step": 40987 + }, + { + "epoch": 0.7310669568009132, + "grad_norm": 0.28486353158950806, + "learning_rate": 1.022903787733188e-05, + "loss": 0.1266, + "step": 40988 + }, + { + "epoch": 0.731084792922627, + "grad_norm": 0.3788246512413025, + "learning_rate": 1.0227782131416724e-05, + "loss": 0.172, + "step": 40989 + }, + { + "epoch": 0.7311026290443406, + "grad_norm": 0.28341367840766907, + "learning_rate": 1.0226526442763787e-05, + "loss": 0.0893, + "step": 40990 + }, + { + "epoch": 0.7311204651660543, + "grad_norm": 0.2282809019088745, + "learning_rate": 1.0225270811377929e-05, + "loss": 0.1008, + "step": 40991 + }, + { + "epoch": 0.731138301287768, + "grad_norm": 0.3333247900009155, + "learning_rate": 1.0224015237264032e-05, + "loss": 0.1929, + "step": 40992 + }, + { + "epoch": 0.7311561374094817, + "grad_norm": 0.3236623704433441, + "learning_rate": 1.0222759720426956e-05, + "loss": 0.1044, + "step": 40993 + }, + { + "epoch": 0.7311739735311954, + "grad_norm": 0.2413376122713089, + "learning_rate": 1.0221504260871566e-05, + "loss": 0.111, + "step": 40994 + }, + { + "epoch": 0.7311918096529091, + "grad_norm": 0.2764434516429901, + "learning_rate": 1.0220248858602732e-05, + "loss": 0.1255, + "step": 40995 + }, + { + "epoch": 0.7312096457746228, + "grad_norm": 0.20417995750904083, + "learning_rate": 1.0218993513625308e-05, + "loss": 0.1282, + "step": 40996 + }, + { + "epoch": 0.7312274818963365, + "grad_norm": 0.228545680642128, + "learning_rate": 1.021773822594418e-05, + "loss": 0.1604, + "step": 40997 + }, + { + "epoch": 0.7312453180180501, + "grad_norm": 0.22014886140823364, + "learning_rate": 1.0216482995564195e-05, + "loss": 0.0728, + "step": 40998 + }, + { + "epoch": 0.7312631541397638, + "grad_norm": 0.22016765177249908, + "learning_rate": 1.0215227822490236e-05, + "loss": 0.0811, + "step": 40999 + }, + { + "epoch": 0.7312809902614775, + "grad_norm": 0.348111093044281, + "learning_rate": 1.0213972706727151e-05, + "loss": 0.148, + "step": 41000 + }, + { + "epoch": 0.7312809902614775, + "eval_loss": 0.11357378959655762, + "eval_runtime": 107.9055, + "eval_samples_per_second": 9.49, + "eval_steps_per_second": 1.585, + "step": 41000 + }, + { + "epoch": 0.7312988263831912, + "grad_norm": 0.2562297284603119, + "learning_rate": 1.0212717648279827e-05, + "loss": 0.1199, + "step": 41001 + }, + { + "epoch": 0.7313166625049049, + "grad_norm": 0.41424325108528137, + "learning_rate": 1.0211462647153114e-05, + "loss": 0.133, + "step": 41002 + }, + { + "epoch": 0.7313344986266186, + "grad_norm": 0.3072645664215088, + "learning_rate": 1.0210207703351878e-05, + "loss": 0.0798, + "step": 41003 + }, + { + "epoch": 0.7313523347483323, + "grad_norm": 0.2379826009273529, + "learning_rate": 1.0208952816880982e-05, + "loss": 0.0798, + "step": 41004 + }, + { + "epoch": 0.731370170870046, + "grad_norm": 0.23669207096099854, + "learning_rate": 1.0207697987745297e-05, + "loss": 0.1258, + "step": 41005 + }, + { + "epoch": 0.7313880069917598, + "grad_norm": 0.297948956489563, + "learning_rate": 1.0206443215949685e-05, + "loss": 0.1424, + "step": 41006 + }, + { + "epoch": 0.7314058431134735, + "grad_norm": 0.2605789005756378, + "learning_rate": 1.020518850149901e-05, + "loss": 0.1239, + "step": 41007 + }, + { + "epoch": 0.7314236792351871, + "grad_norm": 0.3622311055660248, + "learning_rate": 1.0203933844398134e-05, + "loss": 0.1041, + "step": 41008 + }, + { + "epoch": 0.7314415153569008, + "grad_norm": 0.18975771963596344, + "learning_rate": 1.0202679244651912e-05, + "loss": 0.0953, + "step": 41009 + }, + { + "epoch": 0.7314593514786145, + "grad_norm": 0.23391392827033997, + "learning_rate": 1.0201424702265224e-05, + "loss": 0.1148, + "step": 41010 + }, + { + "epoch": 0.7314771876003282, + "grad_norm": 0.30333054065704346, + "learning_rate": 1.0200170217242916e-05, + "loss": 0.0878, + "step": 41011 + }, + { + "epoch": 0.7314950237220419, + "grad_norm": 0.2512120008468628, + "learning_rate": 1.0198915789589869e-05, + "loss": 0.1514, + "step": 41012 + }, + { + "epoch": 0.7315128598437556, + "grad_norm": 0.3056817054748535, + "learning_rate": 1.0197661419310927e-05, + "loss": 0.1051, + "step": 41013 + }, + { + "epoch": 0.7315306959654693, + "grad_norm": 0.25697413086891174, + "learning_rate": 1.0196407106410974e-05, + "loss": 0.1144, + "step": 41014 + }, + { + "epoch": 0.731548532087183, + "grad_norm": 0.2934715747833252, + "learning_rate": 1.0195152850894857e-05, + "loss": 0.1328, + "step": 41015 + }, + { + "epoch": 0.7315663682088966, + "grad_norm": 0.30051085352897644, + "learning_rate": 1.0193898652767442e-05, + "loss": 0.1066, + "step": 41016 + }, + { + "epoch": 0.7315842043306103, + "grad_norm": 0.3459209203720093, + "learning_rate": 1.019264451203358e-05, + "loss": 0.0937, + "step": 41017 + }, + { + "epoch": 0.731602040452324, + "grad_norm": 0.28906819224357605, + "learning_rate": 1.0191390428698153e-05, + "loss": 0.1227, + "step": 41018 + }, + { + "epoch": 0.7316198765740377, + "grad_norm": 0.7798620462417603, + "learning_rate": 1.0190136402766009e-05, + "loss": 0.1106, + "step": 41019 + }, + { + "epoch": 0.7316377126957514, + "grad_norm": 0.3055686056613922, + "learning_rate": 1.0188882434242014e-05, + "loss": 0.0851, + "step": 41020 + }, + { + "epoch": 0.7316555488174651, + "grad_norm": 0.25559568405151367, + "learning_rate": 1.0187628523131027e-05, + "loss": 0.1293, + "step": 41021 + }, + { + "epoch": 0.7316733849391788, + "grad_norm": 0.29998308420181274, + "learning_rate": 1.0186374669437896e-05, + "loss": 0.1609, + "step": 41022 + }, + { + "epoch": 0.7316912210608926, + "grad_norm": 0.2601812779903412, + "learning_rate": 1.0185120873167503e-05, + "loss": 0.1152, + "step": 41023 + }, + { + "epoch": 0.7317090571826063, + "grad_norm": 0.20285411179065704, + "learning_rate": 1.0183867134324698e-05, + "loss": 0.1171, + "step": 41024 + }, + { + "epoch": 0.73172689330432, + "grad_norm": 0.296916663646698, + "learning_rate": 1.0182613452914333e-05, + "loss": 0.1314, + "step": 41025 + }, + { + "epoch": 0.7317447294260336, + "grad_norm": 0.22588127851486206, + "learning_rate": 1.0181359828941276e-05, + "loss": 0.1063, + "step": 41026 + }, + { + "epoch": 0.7317625655477473, + "grad_norm": 0.2552788555622101, + "learning_rate": 1.0180106262410394e-05, + "loss": 0.1262, + "step": 41027 + }, + { + "epoch": 0.731780401669461, + "grad_norm": 0.2651049792766571, + "learning_rate": 1.017885275332654e-05, + "loss": 0.1098, + "step": 41028 + }, + { + "epoch": 0.7317982377911747, + "grad_norm": 0.3077028691768646, + "learning_rate": 1.0177599301694573e-05, + "loss": 0.1278, + "step": 41029 + }, + { + "epoch": 0.7318160739128884, + "grad_norm": 0.24880871176719666, + "learning_rate": 1.017634590751934e-05, + "loss": 0.1185, + "step": 41030 + }, + { + "epoch": 0.7318339100346021, + "grad_norm": 0.22988176345825195, + "learning_rate": 1.017509257080572e-05, + "loss": 0.1206, + "step": 41031 + }, + { + "epoch": 0.7318517461563158, + "grad_norm": 0.23562580347061157, + "learning_rate": 1.0173839291558562e-05, + "loss": 0.1333, + "step": 41032 + }, + { + "epoch": 0.7318695822780295, + "grad_norm": 0.3394968807697296, + "learning_rate": 1.0172586069782721e-05, + "loss": 0.158, + "step": 41033 + }, + { + "epoch": 0.7318874183997431, + "grad_norm": 0.23663291335105896, + "learning_rate": 1.017133290548306e-05, + "loss": 0.1118, + "step": 41034 + }, + { + "epoch": 0.7319052545214568, + "grad_norm": 0.29905152320861816, + "learning_rate": 1.0170079798664425e-05, + "loss": 0.1271, + "step": 41035 + }, + { + "epoch": 0.7319230906431705, + "grad_norm": 0.26523861289024353, + "learning_rate": 1.0168826749331692e-05, + "loss": 0.1027, + "step": 41036 + }, + { + "epoch": 0.7319409267648842, + "grad_norm": 0.2506278455257416, + "learning_rate": 1.016757375748971e-05, + "loss": 0.1435, + "step": 41037 + }, + { + "epoch": 0.7319587628865979, + "grad_norm": 0.3426370918750763, + "learning_rate": 1.0166320823143324e-05, + "loss": 0.0802, + "step": 41038 + }, + { + "epoch": 0.7319765990083116, + "grad_norm": 0.3216326832771301, + "learning_rate": 1.0165067946297411e-05, + "loss": 0.1575, + "step": 41039 + }, + { + "epoch": 0.7319944351300254, + "grad_norm": 0.24299614131450653, + "learning_rate": 1.016381512695681e-05, + "loss": 0.1217, + "step": 41040 + }, + { + "epoch": 0.7320122712517391, + "grad_norm": 0.2927909195423126, + "learning_rate": 1.0162562365126396e-05, + "loss": 0.1448, + "step": 41041 + }, + { + "epoch": 0.7320301073734528, + "grad_norm": 0.3751700818538666, + "learning_rate": 1.0161309660811014e-05, + "loss": 0.0862, + "step": 41042 + }, + { + "epoch": 0.7320479434951664, + "grad_norm": 0.40837520360946655, + "learning_rate": 1.0160057014015512e-05, + "loss": 0.1305, + "step": 41043 + }, + { + "epoch": 0.7320657796168801, + "grad_norm": 0.27980414032936096, + "learning_rate": 1.015880442474476e-05, + "loss": 0.1072, + "step": 41044 + }, + { + "epoch": 0.7320836157385938, + "grad_norm": 0.3306896686553955, + "learning_rate": 1.0157551893003614e-05, + "loss": 0.153, + "step": 41045 + }, + { + "epoch": 0.7321014518603075, + "grad_norm": 0.21825826168060303, + "learning_rate": 1.015629941879692e-05, + "loss": 0.0947, + "step": 41046 + }, + { + "epoch": 0.7321192879820212, + "grad_norm": 0.18993951380252838, + "learning_rate": 1.0155047002129536e-05, + "loss": 0.0822, + "step": 41047 + }, + { + "epoch": 0.7321371241037349, + "grad_norm": 0.275922566652298, + "learning_rate": 1.015379464300631e-05, + "loss": 0.0526, + "step": 41048 + }, + { + "epoch": 0.7321549602254486, + "grad_norm": 0.3374969959259033, + "learning_rate": 1.0152542341432111e-05, + "loss": 0.116, + "step": 41049 + }, + { + "epoch": 0.7321727963471623, + "grad_norm": 0.34595680236816406, + "learning_rate": 1.0151290097411789e-05, + "loss": 0.1263, + "step": 41050 + }, + { + "epoch": 0.732190632468876, + "grad_norm": 0.2317531853914261, + "learning_rate": 1.0150037910950191e-05, + "loss": 0.1506, + "step": 41051 + }, + { + "epoch": 0.7322084685905896, + "grad_norm": 0.29938170313835144, + "learning_rate": 1.014878578205217e-05, + "loss": 0.1511, + "step": 41052 + }, + { + "epoch": 0.7322263047123033, + "grad_norm": 0.2171143889427185, + "learning_rate": 1.0147533710722582e-05, + "loss": 0.0922, + "step": 41053 + }, + { + "epoch": 0.732244140834017, + "grad_norm": 0.31345969438552856, + "learning_rate": 1.0146281696966293e-05, + "loss": 0.1202, + "step": 41054 + }, + { + "epoch": 0.7322619769557307, + "grad_norm": 0.18728560209274292, + "learning_rate": 1.0145029740788146e-05, + "loss": 0.0778, + "step": 41055 + }, + { + "epoch": 0.7322798130774444, + "grad_norm": 0.23062537610530853, + "learning_rate": 1.0143777842192986e-05, + "loss": 0.1022, + "step": 41056 + }, + { + "epoch": 0.7322976491991582, + "grad_norm": 0.29825496673583984, + "learning_rate": 1.0142526001185681e-05, + "loss": 0.1435, + "step": 41057 + }, + { + "epoch": 0.7323154853208719, + "grad_norm": 0.28086090087890625, + "learning_rate": 1.014127421777108e-05, + "loss": 0.1842, + "step": 41058 + }, + { + "epoch": 0.7323333214425856, + "grad_norm": 0.36868149042129517, + "learning_rate": 1.0140022491954032e-05, + "loss": 0.1198, + "step": 41059 + }, + { + "epoch": 0.7323511575642992, + "grad_norm": 0.17740115523338318, + "learning_rate": 1.0138770823739389e-05, + "loss": 0.1123, + "step": 41060 + }, + { + "epoch": 0.7323689936860129, + "grad_norm": 0.2053452730178833, + "learning_rate": 1.0137519213131993e-05, + "loss": 0.1298, + "step": 41061 + }, + { + "epoch": 0.7323868298077266, + "grad_norm": 0.23524436354637146, + "learning_rate": 1.0136267660136716e-05, + "loss": 0.1166, + "step": 41062 + }, + { + "epoch": 0.7324046659294403, + "grad_norm": 0.26659414172172546, + "learning_rate": 1.0135016164758399e-05, + "loss": 0.1337, + "step": 41063 + }, + { + "epoch": 0.732422502051154, + "grad_norm": 0.22963832318782806, + "learning_rate": 1.0133764727001893e-05, + "loss": 0.1145, + "step": 41064 + }, + { + "epoch": 0.7324403381728677, + "grad_norm": 0.42937949299812317, + "learning_rate": 1.013251334687204e-05, + "loss": 0.1807, + "step": 41065 + }, + { + "epoch": 0.7324581742945814, + "grad_norm": 0.2890181243419647, + "learning_rate": 1.013126202437371e-05, + "loss": 0.0552, + "step": 41066 + }, + { + "epoch": 0.7324760104162951, + "grad_norm": 0.34323248267173767, + "learning_rate": 1.0130010759511735e-05, + "loss": 0.1006, + "step": 41067 + }, + { + "epoch": 0.7324938465380088, + "grad_norm": 0.2769702076911926, + "learning_rate": 1.0128759552290984e-05, + "loss": 0.1589, + "step": 41068 + }, + { + "epoch": 0.7325116826597224, + "grad_norm": 0.2968181371688843, + "learning_rate": 1.0127508402716288e-05, + "loss": 0.1367, + "step": 41069 + }, + { + "epoch": 0.7325295187814361, + "grad_norm": 0.5595503449440002, + "learning_rate": 1.0126257310792515e-05, + "loss": 0.1397, + "step": 41070 + }, + { + "epoch": 0.7325473549031498, + "grad_norm": 0.2284081131219864, + "learning_rate": 1.0125006276524507e-05, + "loss": 0.1302, + "step": 41071 + }, + { + "epoch": 0.7325651910248635, + "grad_norm": 0.28476080298423767, + "learning_rate": 1.0123755299917109e-05, + "loss": 0.1251, + "step": 41072 + }, + { + "epoch": 0.7325830271465773, + "grad_norm": 0.2432744950056076, + "learning_rate": 1.0122504380975175e-05, + "loss": 0.1361, + "step": 41073 + }, + { + "epoch": 0.732600863268291, + "grad_norm": 0.24936284124851227, + "learning_rate": 1.0121253519703543e-05, + "loss": 0.1177, + "step": 41074 + }, + { + "epoch": 0.7326186993900047, + "grad_norm": 0.3589072525501251, + "learning_rate": 1.0120002716107083e-05, + "loss": 0.1277, + "step": 41075 + }, + { + "epoch": 0.7326365355117184, + "grad_norm": 0.24981731176376343, + "learning_rate": 1.0118751970190631e-05, + "loss": 0.1307, + "step": 41076 + }, + { + "epoch": 0.732654371633432, + "grad_norm": 0.28184929490089417, + "learning_rate": 1.0117501281959035e-05, + "loss": 0.1369, + "step": 41077 + }, + { + "epoch": 0.7326722077551457, + "grad_norm": 0.22753465175628662, + "learning_rate": 1.0116250651417136e-05, + "loss": 0.0609, + "step": 41078 + }, + { + "epoch": 0.7326900438768594, + "grad_norm": 0.327316015958786, + "learning_rate": 1.0115000078569801e-05, + "loss": 0.1269, + "step": 41079 + }, + { + "epoch": 0.7327078799985731, + "grad_norm": 0.21712535619735718, + "learning_rate": 1.0113749563421854e-05, + "loss": 0.1208, + "step": 41080 + }, + { + "epoch": 0.7327257161202868, + "grad_norm": 0.2620106637477875, + "learning_rate": 1.0112499105978166e-05, + "loss": 0.1104, + "step": 41081 + }, + { + "epoch": 0.7327435522420005, + "grad_norm": 0.26524466276168823, + "learning_rate": 1.0111248706243564e-05, + "loss": 0.1464, + "step": 41082 + }, + { + "epoch": 0.7327613883637142, + "grad_norm": 0.3362818956375122, + "learning_rate": 1.0109998364222915e-05, + "loss": 0.1502, + "step": 41083 + }, + { + "epoch": 0.7327792244854279, + "grad_norm": 0.19853998720645905, + "learning_rate": 1.0108748079921055e-05, + "loss": 0.0858, + "step": 41084 + }, + { + "epoch": 0.7327970606071416, + "grad_norm": 0.2885602116584778, + "learning_rate": 1.0107497853342829e-05, + "loss": 0.0986, + "step": 41085 + }, + { + "epoch": 0.7328148967288552, + "grad_norm": 0.29427123069763184, + "learning_rate": 1.0106247684493086e-05, + "loss": 0.0819, + "step": 41086 + }, + { + "epoch": 0.7328327328505689, + "grad_norm": 0.28571197390556335, + "learning_rate": 1.0104997573376662e-05, + "loss": 0.1078, + "step": 41087 + }, + { + "epoch": 0.7328505689722826, + "grad_norm": 0.24130010604858398, + "learning_rate": 1.0103747519998422e-05, + "loss": 0.1199, + "step": 41088 + }, + { + "epoch": 0.7328684050939963, + "grad_norm": 0.2578444182872772, + "learning_rate": 1.01024975243632e-05, + "loss": 0.1214, + "step": 41089 + }, + { + "epoch": 0.7328862412157101, + "grad_norm": 0.30509063601493835, + "learning_rate": 1.0101247586475842e-05, + "loss": 0.1885, + "step": 41090 + }, + { + "epoch": 0.7329040773374238, + "grad_norm": 0.30987098813056946, + "learning_rate": 1.0099997706341188e-05, + "loss": 0.1045, + "step": 41091 + }, + { + "epoch": 0.7329219134591375, + "grad_norm": 0.24301046133041382, + "learning_rate": 1.0098747883964096e-05, + "loss": 0.0904, + "step": 41092 + }, + { + "epoch": 0.7329397495808512, + "grad_norm": 0.2671217918395996, + "learning_rate": 1.0097498119349404e-05, + "loss": 0.0925, + "step": 41093 + }, + { + "epoch": 0.7329575857025649, + "grad_norm": 0.2784178555011749, + "learning_rate": 1.009624841250195e-05, + "loss": 0.101, + "step": 41094 + }, + { + "epoch": 0.7329754218242786, + "grad_norm": 0.2802357077598572, + "learning_rate": 1.0094998763426591e-05, + "loss": 0.091, + "step": 41095 + }, + { + "epoch": 0.7329932579459922, + "grad_norm": 0.26100000739097595, + "learning_rate": 1.0093749172128159e-05, + "loss": 0.1087, + "step": 41096 + }, + { + "epoch": 0.7330110940677059, + "grad_norm": 0.27671751379966736, + "learning_rate": 1.0092499638611508e-05, + "loss": 0.1341, + "step": 41097 + }, + { + "epoch": 0.7330289301894196, + "grad_norm": 0.2316170036792755, + "learning_rate": 1.009125016288148e-05, + "loss": 0.0509, + "step": 41098 + }, + { + "epoch": 0.7330467663111333, + "grad_norm": 0.22340913116931915, + "learning_rate": 1.0090000744942915e-05, + "loss": 0.1036, + "step": 41099 + }, + { + "epoch": 0.733064602432847, + "grad_norm": 0.26249369978904724, + "learning_rate": 1.0088751384800649e-05, + "loss": 0.1486, + "step": 41100 + }, + { + "epoch": 0.7330824385545607, + "grad_norm": 0.3222165107727051, + "learning_rate": 1.008750208245954e-05, + "loss": 0.1064, + "step": 41101 + }, + { + "epoch": 0.7331002746762744, + "grad_norm": 0.2129303365945816, + "learning_rate": 1.0086252837924427e-05, + "loss": 0.0652, + "step": 41102 + }, + { + "epoch": 0.733118110797988, + "grad_norm": 0.2269088327884674, + "learning_rate": 1.0085003651200145e-05, + "loss": 0.1037, + "step": 41103 + }, + { + "epoch": 0.7331359469197017, + "grad_norm": 0.26634708046913147, + "learning_rate": 1.0083754522291536e-05, + "loss": 0.0898, + "step": 41104 + }, + { + "epoch": 0.7331537830414154, + "grad_norm": 0.2972703278064728, + "learning_rate": 1.008250545120345e-05, + "loss": 0.1297, + "step": 41105 + }, + { + "epoch": 0.7331716191631291, + "grad_norm": 0.3696639835834503, + "learning_rate": 1.0081256437940729e-05, + "loss": 0.1236, + "step": 41106 + }, + { + "epoch": 0.7331894552848429, + "grad_norm": 0.28319859504699707, + "learning_rate": 1.00800074825082e-05, + "loss": 0.1528, + "step": 41107 + }, + { + "epoch": 0.7332072914065566, + "grad_norm": 0.3253980576992035, + "learning_rate": 1.0078758584910725e-05, + "loss": 0.105, + "step": 41108 + }, + { + "epoch": 0.7332251275282703, + "grad_norm": 0.23751689493656158, + "learning_rate": 1.0077509745153126e-05, + "loss": 0.0912, + "step": 41109 + }, + { + "epoch": 0.733242963649984, + "grad_norm": 0.4059242308139801, + "learning_rate": 1.0076260963240261e-05, + "loss": 0.0917, + "step": 41110 + }, + { + "epoch": 0.7332607997716977, + "grad_norm": 0.26670950651168823, + "learning_rate": 1.0075012239176965e-05, + "loss": 0.0896, + "step": 41111 + }, + { + "epoch": 0.7332786358934114, + "grad_norm": 0.23575033247470856, + "learning_rate": 1.0073763572968075e-05, + "loss": 0.1076, + "step": 41112 + }, + { + "epoch": 0.733296472015125, + "grad_norm": 0.3413923680782318, + "learning_rate": 1.0072514964618426e-05, + "loss": 0.1249, + "step": 41113 + }, + { + "epoch": 0.7333143081368387, + "grad_norm": 0.20009158551692963, + "learning_rate": 1.0071266414132871e-05, + "loss": 0.0838, + "step": 41114 + }, + { + "epoch": 0.7333321442585524, + "grad_norm": 0.3031262159347534, + "learning_rate": 1.0070017921516247e-05, + "loss": 0.1437, + "step": 41115 + }, + { + "epoch": 0.7333499803802661, + "grad_norm": 0.29012876749038696, + "learning_rate": 1.0068769486773388e-05, + "loss": 0.0875, + "step": 41116 + }, + { + "epoch": 0.7333678165019798, + "grad_norm": 0.3280128240585327, + "learning_rate": 1.0067521109909126e-05, + "loss": 0.1141, + "step": 41117 + }, + { + "epoch": 0.7333856526236935, + "grad_norm": 0.26290854811668396, + "learning_rate": 1.006627279092832e-05, + "loss": 0.14, + "step": 41118 + }, + { + "epoch": 0.7334034887454072, + "grad_norm": 0.33645305037498474, + "learning_rate": 1.0065024529835798e-05, + "loss": 0.1314, + "step": 41119 + }, + { + "epoch": 0.7334213248671209, + "grad_norm": 0.2620290517807007, + "learning_rate": 1.0063776326636399e-05, + "loss": 0.1362, + "step": 41120 + }, + { + "epoch": 0.7334391609888345, + "grad_norm": 0.3068559169769287, + "learning_rate": 1.0062528181334954e-05, + "loss": 0.0972, + "step": 41121 + }, + { + "epoch": 0.7334569971105482, + "grad_norm": 0.34216317534446716, + "learning_rate": 1.0061280093936315e-05, + "loss": 0.1097, + "step": 41122 + }, + { + "epoch": 0.7334748332322619, + "grad_norm": 0.28030675649642944, + "learning_rate": 1.0060032064445307e-05, + "loss": 0.1638, + "step": 41123 + }, + { + "epoch": 0.7334926693539757, + "grad_norm": 0.2601567208766937, + "learning_rate": 1.0058784092866785e-05, + "loss": 0.1102, + "step": 41124 + }, + { + "epoch": 0.7335105054756894, + "grad_norm": 0.29961302876472473, + "learning_rate": 1.0057536179205576e-05, + "loss": 0.1235, + "step": 41125 + }, + { + "epoch": 0.7335283415974031, + "grad_norm": 0.23860082030296326, + "learning_rate": 1.0056288323466507e-05, + "loss": 0.131, + "step": 41126 + }, + { + "epoch": 0.7335461777191168, + "grad_norm": 0.2541649639606476, + "learning_rate": 1.0055040525654436e-05, + "loss": 0.1805, + "step": 41127 + }, + { + "epoch": 0.7335640138408305, + "grad_norm": 0.31603550910949707, + "learning_rate": 1.0053792785774191e-05, + "loss": 0.1106, + "step": 41128 + }, + { + "epoch": 0.7335818499625442, + "grad_norm": 0.2143966406583786, + "learning_rate": 1.0052545103830605e-05, + "loss": 0.1204, + "step": 41129 + }, + { + "epoch": 0.7335996860842579, + "grad_norm": 0.345138281583786, + "learning_rate": 1.0051297479828508e-05, + "loss": 0.1677, + "step": 41130 + }, + { + "epoch": 0.7336175222059715, + "grad_norm": 0.4226834774017334, + "learning_rate": 1.0050049913772755e-05, + "loss": 0.2135, + "step": 41131 + }, + { + "epoch": 0.7336353583276852, + "grad_norm": 0.2573096752166748, + "learning_rate": 1.0048802405668173e-05, + "loss": 0.0939, + "step": 41132 + }, + { + "epoch": 0.7336531944493989, + "grad_norm": 0.2890351712703705, + "learning_rate": 1.0047554955519595e-05, + "loss": 0.1128, + "step": 41133 + }, + { + "epoch": 0.7336710305711126, + "grad_norm": 0.25333184003829956, + "learning_rate": 1.004630756333185e-05, + "loss": 0.1037, + "step": 41134 + }, + { + "epoch": 0.7336888666928263, + "grad_norm": 0.3420916199684143, + "learning_rate": 1.0045060229109793e-05, + "loss": 0.098, + "step": 41135 + }, + { + "epoch": 0.73370670281454, + "grad_norm": 0.29470258951187134, + "learning_rate": 1.0043812952858236e-05, + "loss": 0.0796, + "step": 41136 + }, + { + "epoch": 0.7337245389362537, + "grad_norm": 0.31416720151901245, + "learning_rate": 1.0042565734582036e-05, + "loss": 0.1404, + "step": 41137 + }, + { + "epoch": 0.7337423750579674, + "grad_norm": 0.3004606068134308, + "learning_rate": 1.0041318574286015e-05, + "loss": 0.0993, + "step": 41138 + }, + { + "epoch": 0.733760211179681, + "grad_norm": 0.21731922030448914, + "learning_rate": 1.0040071471975005e-05, + "loss": 0.0785, + "step": 41139 + }, + { + "epoch": 0.7337780473013947, + "grad_norm": 0.2686507999897003, + "learning_rate": 1.003882442765385e-05, + "loss": 0.0812, + "step": 41140 + }, + { + "epoch": 0.7337958834231085, + "grad_norm": 0.19689592719078064, + "learning_rate": 1.0037577441327382e-05, + "loss": 0.1281, + "step": 41141 + }, + { + "epoch": 0.7338137195448222, + "grad_norm": 0.32062211632728577, + "learning_rate": 1.003633051300043e-05, + "loss": 0.1415, + "step": 41142 + }, + { + "epoch": 0.7338315556665359, + "grad_norm": 0.3154888153076172, + "learning_rate": 1.003508364267782e-05, + "loss": 0.1299, + "step": 41143 + }, + { + "epoch": 0.7338493917882496, + "grad_norm": 0.25007688999176025, + "learning_rate": 1.0033836830364405e-05, + "loss": 0.1201, + "step": 41144 + }, + { + "epoch": 0.7338672279099633, + "grad_norm": 0.3194383382797241, + "learning_rate": 1.0032590076065004e-05, + "loss": 0.0947, + "step": 41145 + }, + { + "epoch": 0.733885064031677, + "grad_norm": 0.2792467176914215, + "learning_rate": 1.0031343379784458e-05, + "loss": 0.1429, + "step": 41146 + }, + { + "epoch": 0.7339029001533907, + "grad_norm": 0.24085290729999542, + "learning_rate": 1.0030096741527584e-05, + "loss": 0.1012, + "step": 41147 + }, + { + "epoch": 0.7339207362751043, + "grad_norm": 0.21871492266654968, + "learning_rate": 1.0028850161299234e-05, + "loss": 0.1118, + "step": 41148 + }, + { + "epoch": 0.733938572396818, + "grad_norm": 0.26251277327537537, + "learning_rate": 1.0027603639104233e-05, + "loss": 0.1202, + "step": 41149 + }, + { + "epoch": 0.7339564085185317, + "grad_norm": 0.30111393332481384, + "learning_rate": 1.0026357174947401e-05, + "loss": 0.1149, + "step": 41150 + }, + { + "epoch": 0.7339742446402454, + "grad_norm": 0.23938579857349396, + "learning_rate": 1.002511076883359e-05, + "loss": 0.0954, + "step": 41151 + }, + { + "epoch": 0.7339920807619591, + "grad_norm": 0.2065141648054123, + "learning_rate": 1.0023864420767612e-05, + "loss": 0.0999, + "step": 41152 + }, + { + "epoch": 0.7340099168836728, + "grad_norm": 0.25619766116142273, + "learning_rate": 1.0022618130754318e-05, + "loss": 0.0944, + "step": 41153 + }, + { + "epoch": 0.7340277530053865, + "grad_norm": 0.21290558576583862, + "learning_rate": 1.0021371898798532e-05, + "loss": 0.1121, + "step": 41154 + }, + { + "epoch": 0.7340455891271002, + "grad_norm": 0.33529436588287354, + "learning_rate": 1.0020125724905076e-05, + "loss": 0.1294, + "step": 41155 + }, + { + "epoch": 0.7340634252488139, + "grad_norm": 0.2827434837818146, + "learning_rate": 1.0018879609078782e-05, + "loss": 0.1738, + "step": 41156 + }, + { + "epoch": 0.7340812613705275, + "grad_norm": 0.2743864059448242, + "learning_rate": 1.0017633551324493e-05, + "loss": 0.0758, + "step": 41157 + }, + { + "epoch": 0.7340990974922413, + "grad_norm": 0.29576677083969116, + "learning_rate": 1.001638755164703e-05, + "loss": 0.1413, + "step": 41158 + }, + { + "epoch": 0.734116933613955, + "grad_norm": 0.33710238337516785, + "learning_rate": 1.0015141610051227e-05, + "loss": 0.1361, + "step": 41159 + }, + { + "epoch": 0.7341347697356687, + "grad_norm": 0.23217138648033142, + "learning_rate": 1.0013895726541898e-05, + "loss": 0.1208, + "step": 41160 + }, + { + "epoch": 0.7341526058573824, + "grad_norm": 0.23373915255069733, + "learning_rate": 1.0012649901123897e-05, + "loss": 0.0675, + "step": 41161 + }, + { + "epoch": 0.7341704419790961, + "grad_norm": 0.3140864968299866, + "learning_rate": 1.0011404133802039e-05, + "loss": 0.1318, + "step": 41162 + }, + { + "epoch": 0.7341882781008098, + "grad_norm": 0.2737288177013397, + "learning_rate": 1.0010158424581148e-05, + "loss": 0.0972, + "step": 41163 + }, + { + "epoch": 0.7342061142225235, + "grad_norm": 0.24355921149253845, + "learning_rate": 1.000891277346607e-05, + "loss": 0.1201, + "step": 41164 + }, + { + "epoch": 0.7342239503442372, + "grad_norm": 0.3785707950592041, + "learning_rate": 1.0007667180461613e-05, + "loss": 0.1537, + "step": 41165 + }, + { + "epoch": 0.7342417864659508, + "grad_norm": 0.27761510014533997, + "learning_rate": 1.0006421645572626e-05, + "loss": 0.1022, + "step": 41166 + }, + { + "epoch": 0.7342596225876645, + "grad_norm": 0.4070633351802826, + "learning_rate": 1.0005176168803925e-05, + "loss": 0.1119, + "step": 41167 + }, + { + "epoch": 0.7342774587093782, + "grad_norm": 0.29173749685287476, + "learning_rate": 1.0003930750160345e-05, + "loss": 0.1105, + "step": 41168 + }, + { + "epoch": 0.7342952948310919, + "grad_norm": 0.25865909457206726, + "learning_rate": 1.0002685389646696e-05, + "loss": 0.134, + "step": 41169 + }, + { + "epoch": 0.7343131309528056, + "grad_norm": 0.23558352887630463, + "learning_rate": 1.000144008726783e-05, + "loss": 0.1048, + "step": 41170 + }, + { + "epoch": 0.7343309670745193, + "grad_norm": 0.32705268263816833, + "learning_rate": 1.000019484302856e-05, + "loss": 0.1845, + "step": 41171 + }, + { + "epoch": 0.734348803196233, + "grad_norm": 0.40526434779167175, + "learning_rate": 9.998949656933718e-06, + "loss": 0.1598, + "step": 41172 + }, + { + "epoch": 0.7343666393179467, + "grad_norm": 0.2830185294151306, + "learning_rate": 9.997704528988117e-06, + "loss": 0.1132, + "step": 41173 + }, + { + "epoch": 0.7343844754396605, + "grad_norm": 0.2461305558681488, + "learning_rate": 9.996459459196606e-06, + "loss": 0.1153, + "step": 41174 + }, + { + "epoch": 0.7344023115613741, + "grad_norm": 0.23248641192913055, + "learning_rate": 9.995214447564e-06, + "loss": 0.114, + "step": 41175 + }, + { + "epoch": 0.7344201476830878, + "grad_norm": 0.42728564143180847, + "learning_rate": 9.993969494095123e-06, + "loss": 0.1718, + "step": 41176 + }, + { + "epoch": 0.7344379838048015, + "grad_norm": 0.31020474433898926, + "learning_rate": 9.992724598794794e-06, + "loss": 0.1451, + "step": 41177 + }, + { + "epoch": 0.7344558199265152, + "grad_norm": 0.21588747203350067, + "learning_rate": 9.99147976166786e-06, + "loss": 0.1217, + "step": 41178 + }, + { + "epoch": 0.7344736560482289, + "grad_norm": 0.29261553287506104, + "learning_rate": 9.990234982719124e-06, + "loss": 0.116, + "step": 41179 + }, + { + "epoch": 0.7344914921699426, + "grad_norm": 0.33601871132850647, + "learning_rate": 9.98899026195343e-06, + "loss": 0.1776, + "step": 41180 + }, + { + "epoch": 0.7345093282916563, + "grad_norm": 0.4971127212047577, + "learning_rate": 9.987745599375595e-06, + "loss": 0.1256, + "step": 41181 + }, + { + "epoch": 0.73452716441337, + "grad_norm": 0.37492135167121887, + "learning_rate": 9.986500994990433e-06, + "loss": 0.1535, + "step": 41182 + }, + { + "epoch": 0.7345450005350836, + "grad_norm": 0.34187525510787964, + "learning_rate": 9.985256448802788e-06, + "loss": 0.121, + "step": 41183 + }, + { + "epoch": 0.7345628366567973, + "grad_norm": 0.22877824306488037, + "learning_rate": 9.984011960817475e-06, + "loss": 0.1085, + "step": 41184 + }, + { + "epoch": 0.734580672778511, + "grad_norm": 0.2493501454591751, + "learning_rate": 9.982767531039319e-06, + "loss": 0.0967, + "step": 41185 + }, + { + "epoch": 0.7345985089002247, + "grad_norm": 0.26916593313217163, + "learning_rate": 9.981523159473133e-06, + "loss": 0.1105, + "step": 41186 + }, + { + "epoch": 0.7346163450219384, + "grad_norm": 0.26236462593078613, + "learning_rate": 9.980278846123759e-06, + "loss": 0.1087, + "step": 41187 + }, + { + "epoch": 0.7346341811436521, + "grad_norm": 0.29075196385383606, + "learning_rate": 9.97903459099601e-06, + "loss": 0.1085, + "step": 41188 + }, + { + "epoch": 0.7346520172653658, + "grad_norm": 0.25500744581222534, + "learning_rate": 9.977790394094715e-06, + "loss": 0.1043, + "step": 41189 + }, + { + "epoch": 0.7346698533870795, + "grad_norm": 0.2520645558834076, + "learning_rate": 9.976546255424682e-06, + "loss": 0.1459, + "step": 41190 + }, + { + "epoch": 0.7346876895087933, + "grad_norm": 0.24114584922790527, + "learning_rate": 9.975302174990755e-06, + "loss": 0.1004, + "step": 41191 + }, + { + "epoch": 0.734705525630507, + "grad_norm": 0.3239428400993347, + "learning_rate": 9.974058152797735e-06, + "loss": 0.1165, + "step": 41192 + }, + { + "epoch": 0.7347233617522206, + "grad_norm": 0.3203233480453491, + "learning_rate": 9.972814188850465e-06, + "loss": 0.1493, + "step": 41193 + }, + { + "epoch": 0.7347411978739343, + "grad_norm": 0.263615220785141, + "learning_rate": 9.971570283153759e-06, + "loss": 0.1275, + "step": 41194 + }, + { + "epoch": 0.734759033995648, + "grad_norm": 0.2871703505516052, + "learning_rate": 9.970326435712426e-06, + "loss": 0.1181, + "step": 41195 + }, + { + "epoch": 0.7347768701173617, + "grad_norm": 0.34165987372398376, + "learning_rate": 9.96908264653131e-06, + "loss": 0.0979, + "step": 41196 + }, + { + "epoch": 0.7347947062390754, + "grad_norm": 0.28180333971977234, + "learning_rate": 9.967838915615218e-06, + "loss": 0.0749, + "step": 41197 + }, + { + "epoch": 0.7348125423607891, + "grad_norm": 0.36381253600120544, + "learning_rate": 9.966595242968978e-06, + "loss": 0.1456, + "step": 41198 + }, + { + "epoch": 0.7348303784825028, + "grad_norm": 0.3145157992839813, + "learning_rate": 9.965351628597398e-06, + "loss": 0.1065, + "step": 41199 + }, + { + "epoch": 0.7348482146042165, + "grad_norm": 0.2439817190170288, + "learning_rate": 9.964108072505316e-06, + "loss": 0.1636, + "step": 41200 + }, + { + "epoch": 0.7348660507259301, + "grad_norm": 0.17858165502548218, + "learning_rate": 9.962864574697542e-06, + "loss": 0.1137, + "step": 41201 + }, + { + "epoch": 0.7348838868476438, + "grad_norm": 0.18529586493968964, + "learning_rate": 9.9616211351789e-06, + "loss": 0.0369, + "step": 41202 + }, + { + "epoch": 0.7349017229693575, + "grad_norm": 0.256927490234375, + "learning_rate": 9.960377753954208e-06, + "loss": 0.0958, + "step": 41203 + }, + { + "epoch": 0.7349195590910712, + "grad_norm": 0.232254296541214, + "learning_rate": 9.959134431028281e-06, + "loss": 0.0878, + "step": 41204 + }, + { + "epoch": 0.7349373952127849, + "grad_norm": 0.2753954231739044, + "learning_rate": 9.95789116640595e-06, + "loss": 0.1557, + "step": 41205 + }, + { + "epoch": 0.7349552313344986, + "grad_norm": 0.2848125100135803, + "learning_rate": 9.956647960092019e-06, + "loss": 0.1579, + "step": 41206 + }, + { + "epoch": 0.7349730674562123, + "grad_norm": 0.30386704206466675, + "learning_rate": 9.955404812091324e-06, + "loss": 0.114, + "step": 41207 + }, + { + "epoch": 0.7349909035779261, + "grad_norm": 0.3053191006183624, + "learning_rate": 9.95416172240867e-06, + "loss": 0.1416, + "step": 41208 + }, + { + "epoch": 0.7350087396996398, + "grad_norm": 0.24196003377437592, + "learning_rate": 9.952918691048891e-06, + "loss": 0.0772, + "step": 41209 + }, + { + "epoch": 0.7350265758213534, + "grad_norm": 0.25430259108543396, + "learning_rate": 9.951675718016792e-06, + "loss": 0.0858, + "step": 41210 + }, + { + "epoch": 0.7350444119430671, + "grad_norm": 0.31533169746398926, + "learning_rate": 9.9504328033172e-06, + "loss": 0.1907, + "step": 41211 + }, + { + "epoch": 0.7350622480647808, + "grad_norm": 0.2254267930984497, + "learning_rate": 9.949189946954918e-06, + "loss": 0.0928, + "step": 41212 + }, + { + "epoch": 0.7350800841864945, + "grad_norm": 0.2716812789440155, + "learning_rate": 9.947947148934783e-06, + "loss": 0.0872, + "step": 41213 + }, + { + "epoch": 0.7350979203082082, + "grad_norm": 0.28400975465774536, + "learning_rate": 9.946704409261604e-06, + "loss": 0.1271, + "step": 41214 + }, + { + "epoch": 0.7351157564299219, + "grad_norm": 0.34416598081588745, + "learning_rate": 9.945461727940198e-06, + "loss": 0.0915, + "step": 41215 + }, + { + "epoch": 0.7351335925516356, + "grad_norm": 0.3939386308193207, + "learning_rate": 9.94421910497538e-06, + "loss": 0.1359, + "step": 41216 + }, + { + "epoch": 0.7351514286733493, + "grad_norm": 0.21759319305419922, + "learning_rate": 9.942976540371962e-06, + "loss": 0.1305, + "step": 41217 + }, + { + "epoch": 0.735169264795063, + "grad_norm": 0.2846537232398987, + "learning_rate": 9.941734034134775e-06, + "loss": 0.1152, + "step": 41218 + }, + { + "epoch": 0.7351871009167766, + "grad_norm": 0.27095213532447815, + "learning_rate": 9.940491586268621e-06, + "loss": 0.1142, + "step": 41219 + }, + { + "epoch": 0.7352049370384903, + "grad_norm": 0.3897612690925598, + "learning_rate": 9.93924919677833e-06, + "loss": 0.1112, + "step": 41220 + }, + { + "epoch": 0.735222773160204, + "grad_norm": 0.36748987436294556, + "learning_rate": 9.938006865668704e-06, + "loss": 0.1001, + "step": 41221 + }, + { + "epoch": 0.7352406092819177, + "grad_norm": 0.23544777929782867, + "learning_rate": 9.936764592944572e-06, + "loss": 0.084, + "step": 41222 + }, + { + "epoch": 0.7352584454036314, + "grad_norm": 0.23080258071422577, + "learning_rate": 9.935522378610746e-06, + "loss": 0.1423, + "step": 41223 + }, + { + "epoch": 0.7352762815253451, + "grad_norm": 0.21409931778907776, + "learning_rate": 9.934280222672035e-06, + "loss": 0.1433, + "step": 41224 + }, + { + "epoch": 0.7352941176470589, + "grad_norm": 0.20853962004184723, + "learning_rate": 9.933038125133252e-06, + "loss": 0.0971, + "step": 41225 + }, + { + "epoch": 0.7353119537687726, + "grad_norm": 0.2655574083328247, + "learning_rate": 9.931796085999222e-06, + "loss": 0.1062, + "step": 41226 + }, + { + "epoch": 0.7353297898904863, + "grad_norm": 0.2580946385860443, + "learning_rate": 9.930554105274759e-06, + "loss": 0.099, + "step": 41227 + }, + { + "epoch": 0.7353476260121999, + "grad_norm": 0.32619747519493103, + "learning_rate": 9.929312182964671e-06, + "loss": 0.1373, + "step": 41228 + }, + { + "epoch": 0.7353654621339136, + "grad_norm": 0.32713520526885986, + "learning_rate": 9.928070319073773e-06, + "loss": 0.1593, + "step": 41229 + }, + { + "epoch": 0.7353832982556273, + "grad_norm": 0.2641415297985077, + "learning_rate": 9.926828513606874e-06, + "loss": 0.1066, + "step": 41230 + }, + { + "epoch": 0.735401134377341, + "grad_norm": 0.2015880048274994, + "learning_rate": 9.9255867665688e-06, + "loss": 0.1058, + "step": 41231 + }, + { + "epoch": 0.7354189704990547, + "grad_norm": 0.281227707862854, + "learning_rate": 9.924345077964359e-06, + "loss": 0.1086, + "step": 41232 + }, + { + "epoch": 0.7354368066207684, + "grad_norm": 0.5230772495269775, + "learning_rate": 9.923103447798357e-06, + "loss": 0.1017, + "step": 41233 + }, + { + "epoch": 0.7354546427424821, + "grad_norm": 0.3981606066226959, + "learning_rate": 9.92186187607561e-06, + "loss": 0.1324, + "step": 41234 + }, + { + "epoch": 0.7354724788641958, + "grad_norm": 0.23892194032669067, + "learning_rate": 9.920620362800945e-06, + "loss": 0.1277, + "step": 41235 + }, + { + "epoch": 0.7354903149859094, + "grad_norm": 0.3244689404964447, + "learning_rate": 9.919378907979163e-06, + "loss": 0.1264, + "step": 41236 + }, + { + "epoch": 0.7355081511076231, + "grad_norm": 0.3191315233707428, + "learning_rate": 9.918137511615078e-06, + "loss": 0.1446, + "step": 41237 + }, + { + "epoch": 0.7355259872293368, + "grad_norm": 0.3100046217441559, + "learning_rate": 9.916896173713492e-06, + "loss": 0.1196, + "step": 41238 + }, + { + "epoch": 0.7355438233510505, + "grad_norm": 0.29578694701194763, + "learning_rate": 9.915654894279234e-06, + "loss": 0.1501, + "step": 41239 + }, + { + "epoch": 0.7355616594727642, + "grad_norm": 0.23093993961811066, + "learning_rate": 9.914413673317108e-06, + "loss": 0.1164, + "step": 41240 + }, + { + "epoch": 0.7355794955944779, + "grad_norm": 0.25504809617996216, + "learning_rate": 9.913172510831925e-06, + "loss": 0.0783, + "step": 41241 + }, + { + "epoch": 0.7355973317161917, + "grad_norm": 0.2667956054210663, + "learning_rate": 9.911931406828495e-06, + "loss": 0.1214, + "step": 41242 + }, + { + "epoch": 0.7356151678379054, + "grad_norm": 0.31752878427505493, + "learning_rate": 9.910690361311623e-06, + "loss": 0.1053, + "step": 41243 + }, + { + "epoch": 0.7356330039596191, + "grad_norm": 0.246660515666008, + "learning_rate": 9.909449374286137e-06, + "loss": 0.0931, + "step": 41244 + }, + { + "epoch": 0.7356508400813327, + "grad_norm": 0.35422033071517944, + "learning_rate": 9.908208445756834e-06, + "loss": 0.1364, + "step": 41245 + }, + { + "epoch": 0.7356686762030464, + "grad_norm": 0.31301769614219666, + "learning_rate": 9.906967575728521e-06, + "loss": 0.1116, + "step": 41246 + }, + { + "epoch": 0.7356865123247601, + "grad_norm": 0.31700021028518677, + "learning_rate": 9.905726764206022e-06, + "loss": 0.15, + "step": 41247 + }, + { + "epoch": 0.7357043484464738, + "grad_norm": 0.21303242444992065, + "learning_rate": 9.904486011194131e-06, + "loss": 0.0974, + "step": 41248 + }, + { + "epoch": 0.7357221845681875, + "grad_norm": 0.2768758535385132, + "learning_rate": 9.903245316697676e-06, + "loss": 0.1695, + "step": 41249 + }, + { + "epoch": 0.7357400206899012, + "grad_norm": 0.2156328409910202, + "learning_rate": 9.902004680721455e-06, + "loss": 0.1148, + "step": 41250 + }, + { + "epoch": 0.7357578568116149, + "grad_norm": 0.3599184453487396, + "learning_rate": 9.900764103270272e-06, + "loss": 0.1301, + "step": 41251 + }, + { + "epoch": 0.7357756929333286, + "grad_norm": 0.25073978304862976, + "learning_rate": 9.899523584348947e-06, + "loss": 0.1078, + "step": 41252 + }, + { + "epoch": 0.7357935290550423, + "grad_norm": 0.22071605920791626, + "learning_rate": 9.898283123962287e-06, + "loss": 0.0655, + "step": 41253 + }, + { + "epoch": 0.7358113651767559, + "grad_norm": 0.21694721281528473, + "learning_rate": 9.897042722115098e-06, + "loss": 0.0712, + "step": 41254 + }, + { + "epoch": 0.7358292012984696, + "grad_norm": 0.27649515867233276, + "learning_rate": 9.895802378812185e-06, + "loss": 0.132, + "step": 41255 + }, + { + "epoch": 0.7358470374201833, + "grad_norm": 0.2669910788536072, + "learning_rate": 9.894562094058355e-06, + "loss": 0.1205, + "step": 41256 + }, + { + "epoch": 0.735864873541897, + "grad_norm": 0.32782435417175293, + "learning_rate": 9.893321867858424e-06, + "loss": 0.1631, + "step": 41257 + }, + { + "epoch": 0.7358827096636107, + "grad_norm": 0.2619032561779022, + "learning_rate": 9.892081700217199e-06, + "loss": 0.0974, + "step": 41258 + }, + { + "epoch": 0.7359005457853245, + "grad_norm": 0.3018103241920471, + "learning_rate": 9.89084159113948e-06, + "loss": 0.1293, + "step": 41259 + }, + { + "epoch": 0.7359183819070382, + "grad_norm": 0.23306900262832642, + "learning_rate": 9.88960154063007e-06, + "loss": 0.116, + "step": 41260 + }, + { + "epoch": 0.7359362180287519, + "grad_norm": 0.29921168088912964, + "learning_rate": 9.888361548693781e-06, + "loss": 0.1494, + "step": 41261 + }, + { + "epoch": 0.7359540541504656, + "grad_norm": 0.5553643107414246, + "learning_rate": 9.887121615335434e-06, + "loss": 0.1846, + "step": 41262 + }, + { + "epoch": 0.7359718902721792, + "grad_norm": 0.29240524768829346, + "learning_rate": 9.885881740559821e-06, + "loss": 0.1424, + "step": 41263 + }, + { + "epoch": 0.7359897263938929, + "grad_norm": 0.29825636744499207, + "learning_rate": 9.884641924371745e-06, + "loss": 0.1885, + "step": 41264 + }, + { + "epoch": 0.7360075625156066, + "grad_norm": 0.22996903955936432, + "learning_rate": 9.883402166776023e-06, + "loss": 0.1063, + "step": 41265 + }, + { + "epoch": 0.7360253986373203, + "grad_norm": 0.5425612330436707, + "learning_rate": 9.882162467777458e-06, + "loss": 0.1794, + "step": 41266 + }, + { + "epoch": 0.736043234759034, + "grad_norm": 0.2286163866519928, + "learning_rate": 9.88092282738085e-06, + "loss": 0.1112, + "step": 41267 + }, + { + "epoch": 0.7360610708807477, + "grad_norm": 0.21221517026424408, + "learning_rate": 9.87968324559101e-06, + "loss": 0.1621, + "step": 41268 + }, + { + "epoch": 0.7360789070024614, + "grad_norm": 0.33377403020858765, + "learning_rate": 9.878443722412731e-06, + "loss": 0.1457, + "step": 41269 + }, + { + "epoch": 0.7360967431241751, + "grad_norm": 0.28944864869117737, + "learning_rate": 9.877204257850834e-06, + "loss": 0.1175, + "step": 41270 + }, + { + "epoch": 0.7361145792458887, + "grad_norm": 0.21595631539821625, + "learning_rate": 9.875964851910119e-06, + "loss": 0.0879, + "step": 41271 + }, + { + "epoch": 0.7361324153676024, + "grad_norm": 0.24051351845264435, + "learning_rate": 9.874725504595386e-06, + "loss": 0.0932, + "step": 41272 + }, + { + "epoch": 0.7361502514893161, + "grad_norm": 0.22883793711662292, + "learning_rate": 9.873486215911432e-06, + "loss": 0.153, + "step": 41273 + }, + { + "epoch": 0.7361680876110298, + "grad_norm": 0.35602131485939026, + "learning_rate": 9.87224698586308e-06, + "loss": 0.1328, + "step": 41274 + }, + { + "epoch": 0.7361859237327435, + "grad_norm": 0.2803347706794739, + "learning_rate": 9.871007814455113e-06, + "loss": 0.0929, + "step": 41275 + }, + { + "epoch": 0.7362037598544573, + "grad_norm": 0.3134915232658386, + "learning_rate": 9.869768701692355e-06, + "loss": 0.1371, + "step": 41276 + }, + { + "epoch": 0.736221595976171, + "grad_norm": 0.2598631680011749, + "learning_rate": 9.868529647579592e-06, + "loss": 0.1169, + "step": 41277 + }, + { + "epoch": 0.7362394320978847, + "grad_norm": 0.19712214171886444, + "learning_rate": 9.86729065212164e-06, + "loss": 0.0689, + "step": 41278 + }, + { + "epoch": 0.7362572682195984, + "grad_norm": 0.26780885457992554, + "learning_rate": 9.8660517153233e-06, + "loss": 0.1358, + "step": 41279 + }, + { + "epoch": 0.736275104341312, + "grad_norm": 0.264740914106369, + "learning_rate": 9.864812837189366e-06, + "loss": 0.1412, + "step": 41280 + }, + { + "epoch": 0.7362929404630257, + "grad_norm": 0.21488817036151886, + "learning_rate": 9.86357401772465e-06, + "loss": 0.0913, + "step": 41281 + }, + { + "epoch": 0.7363107765847394, + "grad_norm": 0.2211741805076599, + "learning_rate": 9.862335256933936e-06, + "loss": 0.1101, + "step": 41282 + }, + { + "epoch": 0.7363286127064531, + "grad_norm": 0.18428611755371094, + "learning_rate": 9.86109655482205e-06, + "loss": 0.0828, + "step": 41283 + }, + { + "epoch": 0.7363464488281668, + "grad_norm": 0.32277899980545044, + "learning_rate": 9.859857911393783e-06, + "loss": 0.141, + "step": 41284 + }, + { + "epoch": 0.7363642849498805, + "grad_norm": 0.2758120596408844, + "learning_rate": 9.858619326653934e-06, + "loss": 0.0877, + "step": 41285 + }, + { + "epoch": 0.7363821210715942, + "grad_norm": 0.26657819747924805, + "learning_rate": 9.857380800607299e-06, + "loss": 0.1283, + "step": 41286 + }, + { + "epoch": 0.7363999571933079, + "grad_norm": 0.36549943685531616, + "learning_rate": 9.856142333258695e-06, + "loss": 0.0982, + "step": 41287 + }, + { + "epoch": 0.7364177933150216, + "grad_norm": 0.2409859597682953, + "learning_rate": 9.854903924612901e-06, + "loss": 0.1204, + "step": 41288 + }, + { + "epoch": 0.7364356294367352, + "grad_norm": 0.3054850995540619, + "learning_rate": 9.853665574674744e-06, + "loss": 0.118, + "step": 41289 + }, + { + "epoch": 0.7364534655584489, + "grad_norm": 0.3015710115432739, + "learning_rate": 9.852427283449001e-06, + "loss": 0.1545, + "step": 41290 + }, + { + "epoch": 0.7364713016801626, + "grad_norm": 0.30109190940856934, + "learning_rate": 9.85118905094049e-06, + "loss": 0.167, + "step": 41291 + }, + { + "epoch": 0.7364891378018764, + "grad_norm": 0.2459445595741272, + "learning_rate": 9.849950877154002e-06, + "loss": 0.0904, + "step": 41292 + }, + { + "epoch": 0.7365069739235901, + "grad_norm": 0.3391425311565399, + "learning_rate": 9.848712762094337e-06, + "loss": 0.129, + "step": 41293 + }, + { + "epoch": 0.7365248100453038, + "grad_norm": 0.23951072990894318, + "learning_rate": 9.847474705766294e-06, + "loss": 0.1369, + "step": 41294 + }, + { + "epoch": 0.7365426461670175, + "grad_norm": 0.32919031381607056, + "learning_rate": 9.846236708174663e-06, + "loss": 0.1044, + "step": 41295 + }, + { + "epoch": 0.7365604822887312, + "grad_norm": 0.23353825509548187, + "learning_rate": 9.844998769324265e-06, + "loss": 0.1165, + "step": 41296 + }, + { + "epoch": 0.7365783184104449, + "grad_norm": 0.35395652055740356, + "learning_rate": 9.843760889219883e-06, + "loss": 0.1528, + "step": 41297 + }, + { + "epoch": 0.7365961545321585, + "grad_norm": 0.26342421770095825, + "learning_rate": 9.842523067866318e-06, + "loss": 0.1178, + "step": 41298 + }, + { + "epoch": 0.7366139906538722, + "grad_norm": 0.3448401689529419, + "learning_rate": 9.841285305268364e-06, + "loss": 0.1047, + "step": 41299 + }, + { + "epoch": 0.7366318267755859, + "grad_norm": 0.24445965886116028, + "learning_rate": 9.840047601430829e-06, + "loss": 0.1253, + "step": 41300 + }, + { + "epoch": 0.7366496628972996, + "grad_norm": 0.22575442492961884, + "learning_rate": 9.838809956358505e-06, + "loss": 0.1364, + "step": 41301 + }, + { + "epoch": 0.7366674990190133, + "grad_norm": 0.27268749475479126, + "learning_rate": 9.837572370056183e-06, + "loss": 0.1396, + "step": 41302 + }, + { + "epoch": 0.736685335140727, + "grad_norm": 0.2491791993379593, + "learning_rate": 9.836334842528677e-06, + "loss": 0.1246, + "step": 41303 + }, + { + "epoch": 0.7367031712624407, + "grad_norm": 0.28346988558769226, + "learning_rate": 9.835097373780766e-06, + "loss": 0.1428, + "step": 41304 + }, + { + "epoch": 0.7367210073841544, + "grad_norm": 0.26530852913856506, + "learning_rate": 9.833859963817263e-06, + "loss": 0.1386, + "step": 41305 + }, + { + "epoch": 0.736738843505868, + "grad_norm": 0.31037837266921997, + "learning_rate": 9.832622612642956e-06, + "loss": 0.1025, + "step": 41306 + }, + { + "epoch": 0.7367566796275817, + "grad_norm": 0.27824413776397705, + "learning_rate": 9.831385320262643e-06, + "loss": 0.1625, + "step": 41307 + }, + { + "epoch": 0.7367745157492954, + "grad_norm": 0.3469172418117523, + "learning_rate": 9.830148086681112e-06, + "loss": 0.1169, + "step": 41308 + }, + { + "epoch": 0.7367923518710092, + "grad_norm": 0.30491894483566284, + "learning_rate": 9.828910911903175e-06, + "loss": 0.1054, + "step": 41309 + }, + { + "epoch": 0.7368101879927229, + "grad_norm": 0.22883087396621704, + "learning_rate": 9.827673795933618e-06, + "loss": 0.0977, + "step": 41310 + }, + { + "epoch": 0.7368280241144366, + "grad_norm": 0.2847735285758972, + "learning_rate": 9.826436738777237e-06, + "loss": 0.0708, + "step": 41311 + }, + { + "epoch": 0.7368458602361503, + "grad_norm": 0.3282276690006256, + "learning_rate": 9.82519974043882e-06, + "loss": 0.1307, + "step": 41312 + }, + { + "epoch": 0.736863696357864, + "grad_norm": 0.35099318623542786, + "learning_rate": 9.823962800923181e-06, + "loss": 0.1349, + "step": 41313 + }, + { + "epoch": 0.7368815324795777, + "grad_norm": 0.28953787684440613, + "learning_rate": 9.822725920235102e-06, + "loss": 0.1509, + "step": 41314 + }, + { + "epoch": 0.7368993686012913, + "grad_norm": 0.23363667726516724, + "learning_rate": 9.821489098379372e-06, + "loss": 0.0719, + "step": 41315 + }, + { + "epoch": 0.736917204723005, + "grad_norm": 0.2640970051288605, + "learning_rate": 9.8202523353608e-06, + "loss": 0.1172, + "step": 41316 + }, + { + "epoch": 0.7369350408447187, + "grad_norm": 0.22964787483215332, + "learning_rate": 9.819015631184164e-06, + "loss": 0.1352, + "step": 41317 + }, + { + "epoch": 0.7369528769664324, + "grad_norm": 0.29829102754592896, + "learning_rate": 9.817778985854276e-06, + "loss": 0.1285, + "step": 41318 + }, + { + "epoch": 0.7369707130881461, + "grad_norm": 0.27778011560440063, + "learning_rate": 9.81654239937592e-06, + "loss": 0.1213, + "step": 41319 + }, + { + "epoch": 0.7369885492098598, + "grad_norm": 0.3028642535209656, + "learning_rate": 9.815305871753891e-06, + "loss": 0.0751, + "step": 41320 + }, + { + "epoch": 0.7370063853315735, + "grad_norm": 0.29142358899116516, + "learning_rate": 9.81406940299297e-06, + "loss": 0.1571, + "step": 41321 + }, + { + "epoch": 0.7370242214532872, + "grad_norm": 0.24036431312561035, + "learning_rate": 9.812832993097973e-06, + "loss": 0.1054, + "step": 41322 + }, + { + "epoch": 0.7370420575750009, + "grad_norm": 0.31236696243286133, + "learning_rate": 9.811596642073678e-06, + "loss": 0.1085, + "step": 41323 + }, + { + "epoch": 0.7370598936967145, + "grad_norm": 0.2720494270324707, + "learning_rate": 9.810360349924883e-06, + "loss": 0.1317, + "step": 41324 + }, + { + "epoch": 0.7370777298184282, + "grad_norm": 0.2666541635990143, + "learning_rate": 9.809124116656365e-06, + "loss": 0.1219, + "step": 41325 + }, + { + "epoch": 0.737095565940142, + "grad_norm": 0.19292420148849487, + "learning_rate": 9.807887942272941e-06, + "loss": 0.0919, + "step": 41326 + }, + { + "epoch": 0.7371134020618557, + "grad_norm": 0.23057663440704346, + "learning_rate": 9.80665182677939e-06, + "loss": 0.0924, + "step": 41327 + }, + { + "epoch": 0.7371312381835694, + "grad_norm": 0.3075406551361084, + "learning_rate": 9.805415770180503e-06, + "loss": 0.117, + "step": 41328 + }, + { + "epoch": 0.7371490743052831, + "grad_norm": 0.3134598135948181, + "learning_rate": 9.804179772481064e-06, + "loss": 0.0958, + "step": 41329 + }, + { + "epoch": 0.7371669104269968, + "grad_norm": 0.24785654246807098, + "learning_rate": 9.802943833685881e-06, + "loss": 0.1259, + "step": 41330 + }, + { + "epoch": 0.7371847465487105, + "grad_norm": 0.23358578979969025, + "learning_rate": 9.80170795379973e-06, + "loss": 0.1315, + "step": 41331 + }, + { + "epoch": 0.7372025826704242, + "grad_norm": 0.2687293291091919, + "learning_rate": 9.800472132827415e-06, + "loss": 0.1012, + "step": 41332 + }, + { + "epoch": 0.7372204187921378, + "grad_norm": 0.23413026332855225, + "learning_rate": 9.799236370773712e-06, + "loss": 0.1316, + "step": 41333 + }, + { + "epoch": 0.7372382549138515, + "grad_norm": 0.2845030725002289, + "learning_rate": 9.798000667643426e-06, + "loss": 0.1515, + "step": 41334 + }, + { + "epoch": 0.7372560910355652, + "grad_norm": 0.24666526913642883, + "learning_rate": 9.796765023441342e-06, + "loss": 0.1009, + "step": 41335 + }, + { + "epoch": 0.7372739271572789, + "grad_norm": 0.26149916648864746, + "learning_rate": 9.795529438172247e-06, + "loss": 0.1522, + "step": 41336 + }, + { + "epoch": 0.7372917632789926, + "grad_norm": 0.310781329870224, + "learning_rate": 9.794293911840932e-06, + "loss": 0.1356, + "step": 41337 + }, + { + "epoch": 0.7373095994007063, + "grad_norm": 0.27617835998535156, + "learning_rate": 9.793058444452175e-06, + "loss": 0.1151, + "step": 41338 + }, + { + "epoch": 0.73732743552242, + "grad_norm": 0.2620704174041748, + "learning_rate": 9.791823036010785e-06, + "loss": 0.1065, + "step": 41339 + }, + { + "epoch": 0.7373452716441337, + "grad_norm": 0.21987231075763702, + "learning_rate": 9.790587686521544e-06, + "loss": 0.1243, + "step": 41340 + }, + { + "epoch": 0.7373631077658473, + "grad_norm": 0.2871493995189667, + "learning_rate": 9.789352395989238e-06, + "loss": 0.1247, + "step": 41341 + }, + { + "epoch": 0.737380943887561, + "grad_norm": 0.19367468357086182, + "learning_rate": 9.788117164418645e-06, + "loss": 0.0685, + "step": 41342 + }, + { + "epoch": 0.7373987800092748, + "grad_norm": 0.23209679126739502, + "learning_rate": 9.786881991814572e-06, + "loss": 0.0928, + "step": 41343 + }, + { + "epoch": 0.7374166161309885, + "grad_norm": 0.3238326609134674, + "learning_rate": 9.785646878181792e-06, + "loss": 0.1184, + "step": 41344 + }, + { + "epoch": 0.7374344522527022, + "grad_norm": 0.19274313747882843, + "learning_rate": 9.784411823525109e-06, + "loss": 0.1528, + "step": 41345 + }, + { + "epoch": 0.7374522883744159, + "grad_norm": 0.3104701638221741, + "learning_rate": 9.783176827849292e-06, + "loss": 0.0909, + "step": 41346 + }, + { + "epoch": 0.7374701244961296, + "grad_norm": 0.34914323687553406, + "learning_rate": 9.781941891159143e-06, + "loss": 0.143, + "step": 41347 + }, + { + "epoch": 0.7374879606178433, + "grad_norm": 0.19293087720870972, + "learning_rate": 9.780707013459447e-06, + "loss": 0.1033, + "step": 41348 + }, + { + "epoch": 0.737505796739557, + "grad_norm": 0.28701385855674744, + "learning_rate": 9.779472194754986e-06, + "loss": 0.109, + "step": 41349 + }, + { + "epoch": 0.7375236328612707, + "grad_norm": 0.38134685158729553, + "learning_rate": 9.778237435050547e-06, + "loss": 0.151, + "step": 41350 + }, + { + "epoch": 0.7375414689829843, + "grad_norm": 0.2430017739534378, + "learning_rate": 9.77700273435091e-06, + "loss": 0.1528, + "step": 41351 + }, + { + "epoch": 0.737559305104698, + "grad_norm": 0.31125009059906006, + "learning_rate": 9.775768092660876e-06, + "loss": 0.1459, + "step": 41352 + }, + { + "epoch": 0.7375771412264117, + "grad_norm": 0.2849879562854767, + "learning_rate": 9.77453350998522e-06, + "loss": 0.1533, + "step": 41353 + }, + { + "epoch": 0.7375949773481254, + "grad_norm": 0.2242075651884079, + "learning_rate": 9.773298986328735e-06, + "loss": 0.0902, + "step": 41354 + }, + { + "epoch": 0.7376128134698391, + "grad_norm": 0.17803502082824707, + "learning_rate": 9.772064521696192e-06, + "loss": 0.0798, + "step": 41355 + }, + { + "epoch": 0.7376306495915528, + "grad_norm": 0.28858867287635803, + "learning_rate": 9.770830116092398e-06, + "loss": 0.129, + "step": 41356 + }, + { + "epoch": 0.7376484857132665, + "grad_norm": 0.5552446246147156, + "learning_rate": 9.769595769522121e-06, + "loss": 0.159, + "step": 41357 + }, + { + "epoch": 0.7376663218349802, + "grad_norm": 0.26760727167129517, + "learning_rate": 9.76836148199015e-06, + "loss": 0.1145, + "step": 41358 + }, + { + "epoch": 0.7376841579566938, + "grad_norm": 0.3317415118217468, + "learning_rate": 9.767127253501273e-06, + "loss": 0.1462, + "step": 41359 + }, + { + "epoch": 0.7377019940784076, + "grad_norm": 0.2822422981262207, + "learning_rate": 9.765893084060265e-06, + "loss": 0.1271, + "step": 41360 + }, + { + "epoch": 0.7377198302001213, + "grad_norm": 0.3395298719406128, + "learning_rate": 9.764658973671925e-06, + "loss": 0.1073, + "step": 41361 + }, + { + "epoch": 0.737737666321835, + "grad_norm": 0.22872154414653778, + "learning_rate": 9.76342492234103e-06, + "loss": 0.1116, + "step": 41362 + }, + { + "epoch": 0.7377555024435487, + "grad_norm": 0.22709661722183228, + "learning_rate": 9.762190930072365e-06, + "loss": 0.0925, + "step": 41363 + }, + { + "epoch": 0.7377733385652624, + "grad_norm": 0.27168264985084534, + "learning_rate": 9.760956996870699e-06, + "loss": 0.1351, + "step": 41364 + }, + { + "epoch": 0.7377911746869761, + "grad_norm": 0.2419309765100479, + "learning_rate": 9.759723122740836e-06, + "loss": 0.1066, + "step": 41365 + }, + { + "epoch": 0.7378090108086898, + "grad_norm": 0.24501577019691467, + "learning_rate": 9.758489307687552e-06, + "loss": 0.1011, + "step": 41366 + }, + { + "epoch": 0.7378268469304035, + "grad_norm": 0.3001067340373993, + "learning_rate": 9.757255551715624e-06, + "loss": 0.1522, + "step": 41367 + }, + { + "epoch": 0.7378446830521171, + "grad_norm": 0.21468831598758698, + "learning_rate": 9.756021854829833e-06, + "loss": 0.0719, + "step": 41368 + }, + { + "epoch": 0.7378625191738308, + "grad_norm": 0.3272707760334015, + "learning_rate": 9.754788217034975e-06, + "loss": 0.1008, + "step": 41369 + }, + { + "epoch": 0.7378803552955445, + "grad_norm": 0.3034740388393402, + "learning_rate": 9.753554638335822e-06, + "loss": 0.1375, + "step": 41370 + }, + { + "epoch": 0.7378981914172582, + "grad_norm": 0.36299192905426025, + "learning_rate": 9.752321118737149e-06, + "loss": 0.154, + "step": 41371 + }, + { + "epoch": 0.7379160275389719, + "grad_norm": 0.2829195261001587, + "learning_rate": 9.751087658243754e-06, + "loss": 0.1302, + "step": 41372 + }, + { + "epoch": 0.7379338636606856, + "grad_norm": 0.37660980224609375, + "learning_rate": 9.749854256860399e-06, + "loss": 0.1014, + "step": 41373 + }, + { + "epoch": 0.7379516997823993, + "grad_norm": 0.26894327998161316, + "learning_rate": 9.748620914591888e-06, + "loss": 0.1102, + "step": 41374 + }, + { + "epoch": 0.737969535904113, + "grad_norm": 0.28775569796562195, + "learning_rate": 9.747387631442991e-06, + "loss": 0.1566, + "step": 41375 + }, + { + "epoch": 0.7379873720258266, + "grad_norm": 0.23961712419986725, + "learning_rate": 9.746154407418486e-06, + "loss": 0.1321, + "step": 41376 + }, + { + "epoch": 0.7380052081475404, + "grad_norm": 0.26509279012680054, + "learning_rate": 9.744921242523144e-06, + "loss": 0.135, + "step": 41377 + }, + { + "epoch": 0.7380230442692541, + "grad_norm": 0.3258817791938782, + "learning_rate": 9.743688136761769e-06, + "loss": 0.143, + "step": 41378 + }, + { + "epoch": 0.7380408803909678, + "grad_norm": 0.40079614520072937, + "learning_rate": 9.742455090139128e-06, + "loss": 0.1376, + "step": 41379 + }, + { + "epoch": 0.7380587165126815, + "grad_norm": 0.3613516688346863, + "learning_rate": 9.741222102659998e-06, + "loss": 0.1219, + "step": 41380 + }, + { + "epoch": 0.7380765526343952, + "grad_norm": 0.24138766527175903, + "learning_rate": 9.739989174329153e-06, + "loss": 0.1228, + "step": 41381 + }, + { + "epoch": 0.7380943887561089, + "grad_norm": 0.24906961619853973, + "learning_rate": 9.738756305151392e-06, + "loss": 0.0866, + "step": 41382 + }, + { + "epoch": 0.7381122248778226, + "grad_norm": 0.26027509570121765, + "learning_rate": 9.737523495131479e-06, + "loss": 0.1036, + "step": 41383 + }, + { + "epoch": 0.7381300609995363, + "grad_norm": 0.2520490288734436, + "learning_rate": 9.736290744274198e-06, + "loss": 0.0623, + "step": 41384 + }, + { + "epoch": 0.73814789712125, + "grad_norm": 0.21440115571022034, + "learning_rate": 9.735058052584316e-06, + "loss": 0.1032, + "step": 41385 + }, + { + "epoch": 0.7381657332429636, + "grad_norm": 0.23680466413497925, + "learning_rate": 9.733825420066634e-06, + "loss": 0.1249, + "step": 41386 + }, + { + "epoch": 0.7381835693646773, + "grad_norm": 0.13513095676898956, + "learning_rate": 9.732592846725905e-06, + "loss": 0.0483, + "step": 41387 + }, + { + "epoch": 0.738201405486391, + "grad_norm": 0.28293490409851074, + "learning_rate": 9.73136033256693e-06, + "loss": 0.1358, + "step": 41388 + }, + { + "epoch": 0.7382192416081047, + "grad_norm": 0.24058249592781067, + "learning_rate": 9.730127877594474e-06, + "loss": 0.0958, + "step": 41389 + }, + { + "epoch": 0.7382370777298184, + "grad_norm": 0.24740611016750336, + "learning_rate": 9.72889548181331e-06, + "loss": 0.0722, + "step": 41390 + }, + { + "epoch": 0.7382549138515321, + "grad_norm": 0.27616000175476074, + "learning_rate": 9.727663145228231e-06, + "loss": 0.0822, + "step": 41391 + }, + { + "epoch": 0.7382727499732458, + "grad_norm": 0.2812407612800598, + "learning_rate": 9.726430867844002e-06, + "loss": 0.1247, + "step": 41392 + }, + { + "epoch": 0.7382905860949596, + "grad_norm": 0.23269103467464447, + "learning_rate": 9.725198649665401e-06, + "loss": 0.1277, + "step": 41393 + }, + { + "epoch": 0.7383084222166733, + "grad_norm": 0.25381720066070557, + "learning_rate": 9.7239664906972e-06, + "loss": 0.0819, + "step": 41394 + }, + { + "epoch": 0.7383262583383869, + "grad_norm": 0.318613737821579, + "learning_rate": 9.722734390944188e-06, + "loss": 0.1469, + "step": 41395 + }, + { + "epoch": 0.7383440944601006, + "grad_norm": 0.32372286915779114, + "learning_rate": 9.721502350411136e-06, + "loss": 0.1107, + "step": 41396 + }, + { + "epoch": 0.7383619305818143, + "grad_norm": 0.19990098476409912, + "learning_rate": 9.720270369102818e-06, + "loss": 0.0929, + "step": 41397 + }, + { + "epoch": 0.738379766703528, + "grad_norm": 0.24862739443778992, + "learning_rate": 9.719038447023998e-06, + "loss": 0.1186, + "step": 41398 + }, + { + "epoch": 0.7383976028252417, + "grad_norm": 0.3199597895145416, + "learning_rate": 9.717806584179472e-06, + "loss": 0.1244, + "step": 41399 + }, + { + "epoch": 0.7384154389469554, + "grad_norm": 0.28124749660491943, + "learning_rate": 9.716574780574e-06, + "loss": 0.096, + "step": 41400 + }, + { + "epoch": 0.7384332750686691, + "grad_norm": 0.21386314928531647, + "learning_rate": 9.71534303621237e-06, + "loss": 0.0881, + "step": 41401 + }, + { + "epoch": 0.7384511111903828, + "grad_norm": 0.22377094626426697, + "learning_rate": 9.71411135109935e-06, + "loss": 0.1298, + "step": 41402 + }, + { + "epoch": 0.7384689473120964, + "grad_norm": 0.1951315551996231, + "learning_rate": 9.712879725239703e-06, + "loss": 0.0817, + "step": 41403 + }, + { + "epoch": 0.7384867834338101, + "grad_norm": 0.27208980917930603, + "learning_rate": 9.711648158638229e-06, + "loss": 0.1007, + "step": 41404 + }, + { + "epoch": 0.7385046195555238, + "grad_norm": 0.3466946482658386, + "learning_rate": 9.710416651299681e-06, + "loss": 0.1774, + "step": 41405 + }, + { + "epoch": 0.7385224556772375, + "grad_norm": 0.2540076971054077, + "learning_rate": 9.709185203228841e-06, + "loss": 0.2065, + "step": 41406 + }, + { + "epoch": 0.7385402917989512, + "grad_norm": 0.23163831233978271, + "learning_rate": 9.70795381443047e-06, + "loss": 0.1031, + "step": 41407 + }, + { + "epoch": 0.7385581279206649, + "grad_norm": 0.2710544764995575, + "learning_rate": 9.706722484909364e-06, + "loss": 0.1548, + "step": 41408 + }, + { + "epoch": 0.7385759640423786, + "grad_norm": 0.5077825784683228, + "learning_rate": 9.705491214670281e-06, + "loss": 0.1196, + "step": 41409 + }, + { + "epoch": 0.7385938001640924, + "grad_norm": 0.27410173416137695, + "learning_rate": 9.704260003717999e-06, + "loss": 0.0725, + "step": 41410 + }, + { + "epoch": 0.7386116362858061, + "grad_norm": 0.20637863874435425, + "learning_rate": 9.703028852057277e-06, + "loss": 0.0807, + "step": 41411 + }, + { + "epoch": 0.7386294724075197, + "grad_norm": 0.3210963308811188, + "learning_rate": 9.701797759692908e-06, + "loss": 0.1167, + "step": 41412 + }, + { + "epoch": 0.7386473085292334, + "grad_norm": 0.3024260997772217, + "learning_rate": 9.700566726629657e-06, + "loss": 0.1476, + "step": 41413 + }, + { + "epoch": 0.7386651446509471, + "grad_norm": 0.2640106976032257, + "learning_rate": 9.69933575287228e-06, + "loss": 0.1031, + "step": 41414 + }, + { + "epoch": 0.7386829807726608, + "grad_norm": 0.307650089263916, + "learning_rate": 9.698104838425575e-06, + "loss": 0.1178, + "step": 41415 + }, + { + "epoch": 0.7387008168943745, + "grad_norm": 0.2609955966472626, + "learning_rate": 9.696873983294292e-06, + "loss": 0.145, + "step": 41416 + }, + { + "epoch": 0.7387186530160882, + "grad_norm": 0.2551628351211548, + "learning_rate": 9.695643187483216e-06, + "loss": 0.0918, + "step": 41417 + }, + { + "epoch": 0.7387364891378019, + "grad_norm": 0.20740343630313873, + "learning_rate": 9.694412450997117e-06, + "loss": 0.0547, + "step": 41418 + }, + { + "epoch": 0.7387543252595156, + "grad_norm": 0.2992290258407593, + "learning_rate": 9.69318177384076e-06, + "loss": 0.1215, + "step": 41419 + }, + { + "epoch": 0.7387721613812293, + "grad_norm": 0.2691270411014557, + "learning_rate": 9.691951156018907e-06, + "loss": 0.1097, + "step": 41420 + }, + { + "epoch": 0.7387899975029429, + "grad_norm": 0.3057255148887634, + "learning_rate": 9.690720597536352e-06, + "loss": 0.1064, + "step": 41421 + }, + { + "epoch": 0.7388078336246566, + "grad_norm": 0.31872445344924927, + "learning_rate": 9.689490098397846e-06, + "loss": 0.092, + "step": 41422 + }, + { + "epoch": 0.7388256697463703, + "grad_norm": 0.3267669379711151, + "learning_rate": 9.688259658608167e-06, + "loss": 0.0895, + "step": 41423 + }, + { + "epoch": 0.738843505868084, + "grad_norm": 0.2612524926662445, + "learning_rate": 9.687029278172074e-06, + "loss": 0.1347, + "step": 41424 + }, + { + "epoch": 0.7388613419897977, + "grad_norm": 0.3430282473564148, + "learning_rate": 9.685798957094353e-06, + "loss": 0.0975, + "step": 41425 + }, + { + "epoch": 0.7388791781115114, + "grad_norm": 0.28120988607406616, + "learning_rate": 9.684568695379765e-06, + "loss": 0.0962, + "step": 41426 + }, + { + "epoch": 0.7388970142332252, + "grad_norm": 0.2897935211658478, + "learning_rate": 9.683338493033067e-06, + "loss": 0.0899, + "step": 41427 + }, + { + "epoch": 0.7389148503549389, + "grad_norm": 0.19540317356586456, + "learning_rate": 9.682108350059053e-06, + "loss": 0.103, + "step": 41428 + }, + { + "epoch": 0.7389326864766526, + "grad_norm": 0.3540865182876587, + "learning_rate": 9.680878266462464e-06, + "loss": 0.1765, + "step": 41429 + }, + { + "epoch": 0.7389505225983662, + "grad_norm": 0.3027437627315521, + "learning_rate": 9.679648242248093e-06, + "loss": 0.1468, + "step": 41430 + }, + { + "epoch": 0.7389683587200799, + "grad_norm": 0.31158390641212463, + "learning_rate": 9.678418277420695e-06, + "loss": 0.1179, + "step": 41431 + }, + { + "epoch": 0.7389861948417936, + "grad_norm": 0.2667568325996399, + "learning_rate": 9.677188371985044e-06, + "loss": 0.152, + "step": 41432 + }, + { + "epoch": 0.7390040309635073, + "grad_norm": 0.372336208820343, + "learning_rate": 9.675958525945891e-06, + "loss": 0.1307, + "step": 41433 + }, + { + "epoch": 0.739021867085221, + "grad_norm": 0.24716825783252716, + "learning_rate": 9.674728739308023e-06, + "loss": 0.1088, + "step": 41434 + }, + { + "epoch": 0.7390397032069347, + "grad_norm": 0.2810577154159546, + "learning_rate": 9.673499012076202e-06, + "loss": 0.1214, + "step": 41435 + }, + { + "epoch": 0.7390575393286484, + "grad_norm": 0.24963566660881042, + "learning_rate": 9.672269344255192e-06, + "loss": 0.1197, + "step": 41436 + }, + { + "epoch": 0.7390753754503621, + "grad_norm": 0.33228957653045654, + "learning_rate": 9.671039735849752e-06, + "loss": 0.1766, + "step": 41437 + }, + { + "epoch": 0.7390932115720757, + "grad_norm": 0.3481631577014923, + "learning_rate": 9.669810186864664e-06, + "loss": 0.1124, + "step": 41438 + }, + { + "epoch": 0.7391110476937894, + "grad_norm": 0.32490450143814087, + "learning_rate": 9.668580697304686e-06, + "loss": 0.0922, + "step": 41439 + }, + { + "epoch": 0.7391288838155031, + "grad_norm": 0.25999683141708374, + "learning_rate": 9.667351267174584e-06, + "loss": 0.2062, + "step": 41440 + }, + { + "epoch": 0.7391467199372168, + "grad_norm": 0.3724503815174103, + "learning_rate": 9.666121896479119e-06, + "loss": 0.1238, + "step": 41441 + }, + { + "epoch": 0.7391645560589305, + "grad_norm": 0.2098606377840042, + "learning_rate": 9.664892585223059e-06, + "loss": 0.07, + "step": 41442 + }, + { + "epoch": 0.7391823921806442, + "grad_norm": 0.21738047897815704, + "learning_rate": 9.66366333341118e-06, + "loss": 0.1037, + "step": 41443 + }, + { + "epoch": 0.739200228302358, + "grad_norm": 0.2584335505962372, + "learning_rate": 9.66243414104824e-06, + "loss": 0.1337, + "step": 41444 + }, + { + "epoch": 0.7392180644240717, + "grad_norm": 0.2703591287136078, + "learning_rate": 9.661205008139004e-06, + "loss": 0.1079, + "step": 41445 + }, + { + "epoch": 0.7392359005457854, + "grad_norm": 0.28691044449806213, + "learning_rate": 9.659975934688225e-06, + "loss": 0.0983, + "step": 41446 + }, + { + "epoch": 0.739253736667499, + "grad_norm": 0.3057778477668762, + "learning_rate": 9.658746920700687e-06, + "loss": 0.1004, + "step": 41447 + }, + { + "epoch": 0.7392715727892127, + "grad_norm": 0.26334884762763977, + "learning_rate": 9.657517966181145e-06, + "loss": 0.1571, + "step": 41448 + }, + { + "epoch": 0.7392894089109264, + "grad_norm": 0.3140193521976471, + "learning_rate": 9.656289071134361e-06, + "loss": 0.1536, + "step": 41449 + }, + { + "epoch": 0.7393072450326401, + "grad_norm": 0.2501097619533539, + "learning_rate": 9.655060235565091e-06, + "loss": 0.1178, + "step": 41450 + }, + { + "epoch": 0.7393250811543538, + "grad_norm": 0.24283044040203094, + "learning_rate": 9.653831459478118e-06, + "loss": 0.1108, + "step": 41451 + }, + { + "epoch": 0.7393429172760675, + "grad_norm": 0.20097772777080536, + "learning_rate": 9.652602742878195e-06, + "loss": 0.1354, + "step": 41452 + }, + { + "epoch": 0.7393607533977812, + "grad_norm": 0.26745274662971497, + "learning_rate": 9.651374085770081e-06, + "loss": 0.1005, + "step": 41453 + }, + { + "epoch": 0.7393785895194949, + "grad_norm": 0.30940866470336914, + "learning_rate": 9.650145488158537e-06, + "loss": 0.0985, + "step": 41454 + }, + { + "epoch": 0.7393964256412086, + "grad_norm": 0.2447596937417984, + "learning_rate": 9.64891695004834e-06, + "loss": 0.1198, + "step": 41455 + }, + { + "epoch": 0.7394142617629222, + "grad_norm": 0.22476989030838013, + "learning_rate": 9.647688471444233e-06, + "loss": 0.1109, + "step": 41456 + }, + { + "epoch": 0.7394320978846359, + "grad_norm": 0.38907331228256226, + "learning_rate": 9.646460052350994e-06, + "loss": 0.1189, + "step": 41457 + }, + { + "epoch": 0.7394499340063496, + "grad_norm": 0.22350060939788818, + "learning_rate": 9.645231692773382e-06, + "loss": 0.0924, + "step": 41458 + }, + { + "epoch": 0.7394677701280633, + "grad_norm": 0.24202854931354523, + "learning_rate": 9.644003392716148e-06, + "loss": 0.1164, + "step": 41459 + }, + { + "epoch": 0.739485606249777, + "grad_norm": 0.24680662155151367, + "learning_rate": 9.642775152184067e-06, + "loss": 0.1084, + "step": 41460 + }, + { + "epoch": 0.7395034423714908, + "grad_norm": 0.34029340744018555, + "learning_rate": 9.641546971181894e-06, + "loss": 0.1432, + "step": 41461 + }, + { + "epoch": 0.7395212784932045, + "grad_norm": 0.2243356704711914, + "learning_rate": 9.640318849714388e-06, + "loss": 0.0816, + "step": 41462 + }, + { + "epoch": 0.7395391146149182, + "grad_norm": 0.265844464302063, + "learning_rate": 9.639090787786306e-06, + "loss": 0.0997, + "step": 41463 + }, + { + "epoch": 0.7395569507366319, + "grad_norm": 0.2882643938064575, + "learning_rate": 9.63786278540242e-06, + "loss": 0.0959, + "step": 41464 + }, + { + "epoch": 0.7395747868583455, + "grad_norm": 0.35611018538475037, + "learning_rate": 9.636634842567486e-06, + "loss": 0.1454, + "step": 41465 + }, + { + "epoch": 0.7395926229800592, + "grad_norm": 0.26879075169563293, + "learning_rate": 9.635406959286259e-06, + "loss": 0.1022, + "step": 41466 + }, + { + "epoch": 0.7396104591017729, + "grad_norm": 0.3092162311077118, + "learning_rate": 9.6341791355635e-06, + "loss": 0.1094, + "step": 41467 + }, + { + "epoch": 0.7396282952234866, + "grad_norm": 0.29538553953170776, + "learning_rate": 9.632951371403964e-06, + "loss": 0.1378, + "step": 41468 + }, + { + "epoch": 0.7396461313452003, + "grad_norm": 0.2055298388004303, + "learning_rate": 9.631723666812418e-06, + "loss": 0.0861, + "step": 41469 + }, + { + "epoch": 0.739663967466914, + "grad_norm": 0.3825426995754242, + "learning_rate": 9.630496021793622e-06, + "loss": 0.1674, + "step": 41470 + }, + { + "epoch": 0.7396818035886277, + "grad_norm": 0.26604917645454407, + "learning_rate": 9.629268436352337e-06, + "loss": 0.1288, + "step": 41471 + }, + { + "epoch": 0.7396996397103414, + "grad_norm": 0.2953208386898041, + "learning_rate": 9.628040910493306e-06, + "loss": 0.1065, + "step": 41472 + }, + { + "epoch": 0.739717475832055, + "grad_norm": 0.3289187252521515, + "learning_rate": 9.626813444221306e-06, + "loss": 0.1117, + "step": 41473 + }, + { + "epoch": 0.7397353119537687, + "grad_norm": 0.38006505370140076, + "learning_rate": 9.625586037541088e-06, + "loss": 0.0758, + "step": 41474 + }, + { + "epoch": 0.7397531480754824, + "grad_norm": 0.18941110372543335, + "learning_rate": 9.624358690457408e-06, + "loss": 0.1157, + "step": 41475 + }, + { + "epoch": 0.7397709841971961, + "grad_norm": 0.3121732473373413, + "learning_rate": 9.623131402975014e-06, + "loss": 0.1247, + "step": 41476 + }, + { + "epoch": 0.7397888203189098, + "grad_norm": 0.28172236680984497, + "learning_rate": 9.621904175098684e-06, + "loss": 0.1376, + "step": 41477 + }, + { + "epoch": 0.7398066564406236, + "grad_norm": 0.31437012553215027, + "learning_rate": 9.620677006833165e-06, + "loss": 0.0999, + "step": 41478 + }, + { + "epoch": 0.7398244925623373, + "grad_norm": 0.26400426030158997, + "learning_rate": 9.619449898183214e-06, + "loss": 0.1017, + "step": 41479 + }, + { + "epoch": 0.739842328684051, + "grad_norm": 0.297380656003952, + "learning_rate": 9.618222849153585e-06, + "loss": 0.1542, + "step": 41480 + }, + { + "epoch": 0.7398601648057647, + "grad_norm": 0.2452584058046341, + "learning_rate": 9.616995859749032e-06, + "loss": 0.1185, + "step": 41481 + }, + { + "epoch": 0.7398780009274784, + "grad_norm": 0.25024178624153137, + "learning_rate": 9.615768929974323e-06, + "loss": 0.1063, + "step": 41482 + }, + { + "epoch": 0.739895837049192, + "grad_norm": 0.2582530677318573, + "learning_rate": 9.614542059834198e-06, + "loss": 0.1515, + "step": 41483 + }, + { + "epoch": 0.7399136731709057, + "grad_norm": 0.24570819735527039, + "learning_rate": 9.613315249333432e-06, + "loss": 0.131, + "step": 41484 + }, + { + "epoch": 0.7399315092926194, + "grad_norm": 0.2790364623069763, + "learning_rate": 9.61208849847676e-06, + "loss": 0.1227, + "step": 41485 + }, + { + "epoch": 0.7399493454143331, + "grad_norm": 0.2984522879123688, + "learning_rate": 9.610861807268956e-06, + "loss": 0.1595, + "step": 41486 + }, + { + "epoch": 0.7399671815360468, + "grad_norm": 0.27627497911453247, + "learning_rate": 9.60963517571477e-06, + "loss": 0.078, + "step": 41487 + }, + { + "epoch": 0.7399850176577605, + "grad_norm": 0.172231525182724, + "learning_rate": 9.608408603818952e-06, + "loss": 0.1011, + "step": 41488 + }, + { + "epoch": 0.7400028537794742, + "grad_norm": 0.2303430438041687, + "learning_rate": 9.60718209158625e-06, + "loss": 0.0972, + "step": 41489 + }, + { + "epoch": 0.7400206899011879, + "grad_norm": 0.3887530565261841, + "learning_rate": 9.605955639021433e-06, + "loss": 0.1544, + "step": 41490 + }, + { + "epoch": 0.7400385260229015, + "grad_norm": 0.2966137230396271, + "learning_rate": 9.604729246129252e-06, + "loss": 0.096, + "step": 41491 + }, + { + "epoch": 0.7400563621446152, + "grad_norm": 0.2255358248949051, + "learning_rate": 9.603502912914456e-06, + "loss": 0.1113, + "step": 41492 + }, + { + "epoch": 0.7400741982663289, + "grad_norm": 0.2501656115055084, + "learning_rate": 9.602276639381803e-06, + "loss": 0.1431, + "step": 41493 + }, + { + "epoch": 0.7400920343880427, + "grad_norm": 0.28245556354522705, + "learning_rate": 9.601050425536032e-06, + "loss": 0.1247, + "step": 41494 + }, + { + "epoch": 0.7401098705097564, + "grad_norm": 0.33549466729164124, + "learning_rate": 9.599824271381919e-06, + "loss": 0.1086, + "step": 41495 + }, + { + "epoch": 0.7401277066314701, + "grad_norm": 0.2642711400985718, + "learning_rate": 9.598598176924206e-06, + "loss": 0.1752, + "step": 41496 + }, + { + "epoch": 0.7401455427531838, + "grad_norm": 0.22594918310642242, + "learning_rate": 9.597372142167635e-06, + "loss": 0.1434, + "step": 41497 + }, + { + "epoch": 0.7401633788748975, + "grad_norm": 0.2276662290096283, + "learning_rate": 9.596146167116973e-06, + "loss": 0.1655, + "step": 41498 + }, + { + "epoch": 0.7401812149966112, + "grad_norm": 0.29121944308280945, + "learning_rate": 9.594920251776976e-06, + "loss": 0.1546, + "step": 41499 + }, + { + "epoch": 0.7401990511183248, + "grad_norm": 0.2935793697834015, + "learning_rate": 9.593694396152386e-06, + "loss": 0.0946, + "step": 41500 + }, + { + "epoch": 0.7402168872400385, + "grad_norm": 0.23710060119628906, + "learning_rate": 9.592468600247961e-06, + "loss": 0.136, + "step": 41501 + }, + { + "epoch": 0.7402347233617522, + "grad_norm": 0.290985643863678, + "learning_rate": 9.591242864068439e-06, + "loss": 0.1351, + "step": 41502 + }, + { + "epoch": 0.7402525594834659, + "grad_norm": 0.2703031003475189, + "learning_rate": 9.590017187618592e-06, + "loss": 0.1278, + "step": 41503 + }, + { + "epoch": 0.7402703956051796, + "grad_norm": 0.25689277052879333, + "learning_rate": 9.58879157090316e-06, + "loss": 0.0972, + "step": 41504 + }, + { + "epoch": 0.7402882317268933, + "grad_norm": 0.24157381057739258, + "learning_rate": 9.587566013926897e-06, + "loss": 0.1557, + "step": 41505 + }, + { + "epoch": 0.740306067848607, + "grad_norm": 0.19293861091136932, + "learning_rate": 9.586340516694548e-06, + "loss": 0.1046, + "step": 41506 + }, + { + "epoch": 0.7403239039703207, + "grad_norm": 0.2195795327425003, + "learning_rate": 9.585115079210857e-06, + "loss": 0.1099, + "step": 41507 + }, + { + "epoch": 0.7403417400920344, + "grad_norm": 0.2531551420688629, + "learning_rate": 9.583889701480598e-06, + "loss": 0.1054, + "step": 41508 + }, + { + "epoch": 0.740359576213748, + "grad_norm": 0.29554933309555054, + "learning_rate": 9.582664383508503e-06, + "loss": 0.1394, + "step": 41509 + }, + { + "epoch": 0.7403774123354617, + "grad_norm": 0.32142654061317444, + "learning_rate": 9.581439125299318e-06, + "loss": 0.112, + "step": 41510 + }, + { + "epoch": 0.7403952484571755, + "grad_norm": 0.2822263836860657, + "learning_rate": 9.580213926857809e-06, + "loss": 0.1423, + "step": 41511 + }, + { + "epoch": 0.7404130845788892, + "grad_norm": 0.34038910269737244, + "learning_rate": 9.57898878818871e-06, + "loss": 0.1379, + "step": 41512 + }, + { + "epoch": 0.7404309207006029, + "grad_norm": 0.2618231475353241, + "learning_rate": 9.577763709296783e-06, + "loss": 0.0961, + "step": 41513 + }, + { + "epoch": 0.7404487568223166, + "grad_norm": 0.21803507208824158, + "learning_rate": 9.57653869018677e-06, + "loss": 0.1146, + "step": 41514 + }, + { + "epoch": 0.7404665929440303, + "grad_norm": 0.2623039186000824, + "learning_rate": 9.575313730863414e-06, + "loss": 0.111, + "step": 41515 + }, + { + "epoch": 0.740484429065744, + "grad_norm": 0.5876278281211853, + "learning_rate": 9.574088831331476e-06, + "loss": 0.165, + "step": 41516 + }, + { + "epoch": 0.7405022651874577, + "grad_norm": 0.25911587476730347, + "learning_rate": 9.5728639915957e-06, + "loss": 0.0668, + "step": 41517 + }, + { + "epoch": 0.7405201013091713, + "grad_norm": 0.2928988039493561, + "learning_rate": 9.57163921166083e-06, + "loss": 0.0939, + "step": 41518 + }, + { + "epoch": 0.740537937430885, + "grad_norm": 0.27473384141921997, + "learning_rate": 9.570414491531612e-06, + "loss": 0.1441, + "step": 41519 + }, + { + "epoch": 0.7405557735525987, + "grad_norm": 0.338054895401001, + "learning_rate": 9.569189831212794e-06, + "loss": 0.1368, + "step": 41520 + }, + { + "epoch": 0.7405736096743124, + "grad_norm": 0.2573372423648834, + "learning_rate": 9.56796523070913e-06, + "loss": 0.0951, + "step": 41521 + }, + { + "epoch": 0.7405914457960261, + "grad_norm": 0.3364068865776062, + "learning_rate": 9.566740690025364e-06, + "loss": 0.1312, + "step": 41522 + }, + { + "epoch": 0.7406092819177398, + "grad_norm": 0.28443315625190735, + "learning_rate": 9.565516209166242e-06, + "loss": 0.123, + "step": 41523 + }, + { + "epoch": 0.7406271180394535, + "grad_norm": 0.26175040006637573, + "learning_rate": 9.564291788136503e-06, + "loss": 0.0803, + "step": 41524 + }, + { + "epoch": 0.7406449541611672, + "grad_norm": 0.2979171574115753, + "learning_rate": 9.563067426940897e-06, + "loss": 0.0977, + "step": 41525 + }, + { + "epoch": 0.7406627902828808, + "grad_norm": 0.3721378445625305, + "learning_rate": 9.561843125584186e-06, + "loss": 0.1492, + "step": 41526 + }, + { + "epoch": 0.7406806264045945, + "grad_norm": 0.198248028755188, + "learning_rate": 9.560618884071102e-06, + "loss": 0.0677, + "step": 41527 + }, + { + "epoch": 0.7406984625263083, + "grad_norm": 0.25049564242362976, + "learning_rate": 9.55939470240638e-06, + "loss": 0.1362, + "step": 41528 + }, + { + "epoch": 0.740716298648022, + "grad_norm": 0.3355697691440582, + "learning_rate": 9.558170580594789e-06, + "loss": 0.1048, + "step": 41529 + }, + { + "epoch": 0.7407341347697357, + "grad_norm": 0.23160558938980103, + "learning_rate": 9.556946518641061e-06, + "loss": 0.0722, + "step": 41530 + }, + { + "epoch": 0.7407519708914494, + "grad_norm": 0.25317880511283875, + "learning_rate": 9.555722516549942e-06, + "loss": 0.1171, + "step": 41531 + }, + { + "epoch": 0.7407698070131631, + "grad_norm": 0.23997081816196442, + "learning_rate": 9.554498574326176e-06, + "loss": 0.1327, + "step": 41532 + }, + { + "epoch": 0.7407876431348768, + "grad_norm": 0.47078171372413635, + "learning_rate": 9.5532746919745e-06, + "loss": 0.1193, + "step": 41533 + }, + { + "epoch": 0.7408054792565905, + "grad_norm": 0.22875575721263885, + "learning_rate": 9.552050869499679e-06, + "loss": 0.0959, + "step": 41534 + }, + { + "epoch": 0.7408233153783041, + "grad_norm": 0.3673996329307556, + "learning_rate": 9.55082710690644e-06, + "loss": 0.0944, + "step": 41535 + }, + { + "epoch": 0.7408411515000178, + "grad_norm": 0.24300555884838104, + "learning_rate": 9.549603404199534e-06, + "loss": 0.0866, + "step": 41536 + }, + { + "epoch": 0.7408589876217315, + "grad_norm": 0.2510339617729187, + "learning_rate": 9.54837976138369e-06, + "loss": 0.1618, + "step": 41537 + }, + { + "epoch": 0.7408768237434452, + "grad_norm": 0.21743349730968475, + "learning_rate": 9.547156178463673e-06, + "loss": 0.1469, + "step": 41538 + }, + { + "epoch": 0.7408946598651589, + "grad_norm": 0.186907559633255, + "learning_rate": 9.545932655444207e-06, + "loss": 0.0838, + "step": 41539 + }, + { + "epoch": 0.7409124959868726, + "grad_norm": 0.21787531673908234, + "learning_rate": 9.544709192330054e-06, + "loss": 0.062, + "step": 41540 + }, + { + "epoch": 0.7409303321085863, + "grad_norm": 0.32206791639328003, + "learning_rate": 9.543485789125933e-06, + "loss": 0.1646, + "step": 41541 + }, + { + "epoch": 0.7409481682303, + "grad_norm": 0.27628186345100403, + "learning_rate": 9.542262445836615e-06, + "loss": 0.1638, + "step": 41542 + }, + { + "epoch": 0.7409660043520137, + "grad_norm": 0.23920463025569916, + "learning_rate": 9.541039162466819e-06, + "loss": 0.114, + "step": 41543 + }, + { + "epoch": 0.7409838404737273, + "grad_norm": 0.25832855701446533, + "learning_rate": 9.539815939021302e-06, + "loss": 0.1492, + "step": 41544 + }, + { + "epoch": 0.7410016765954411, + "grad_norm": 0.27545973658561707, + "learning_rate": 9.538592775504793e-06, + "loss": 0.12, + "step": 41545 + }, + { + "epoch": 0.7410195127171548, + "grad_norm": 0.3088870048522949, + "learning_rate": 9.537369671922031e-06, + "loss": 0.08, + "step": 41546 + }, + { + "epoch": 0.7410373488388685, + "grad_norm": 0.3808946907520294, + "learning_rate": 9.536146628277775e-06, + "loss": 0.1004, + "step": 41547 + }, + { + "epoch": 0.7410551849605822, + "grad_norm": 0.30978134274482727, + "learning_rate": 9.534923644576754e-06, + "loss": 0.1648, + "step": 41548 + }, + { + "epoch": 0.7410730210822959, + "grad_norm": 0.2890113294124603, + "learning_rate": 9.533700720823713e-06, + "loss": 0.1383, + "step": 41549 + }, + { + "epoch": 0.7410908572040096, + "grad_norm": 0.22573745250701904, + "learning_rate": 9.532477857023378e-06, + "loss": 0.104, + "step": 41550 + }, + { + "epoch": 0.7411086933257233, + "grad_norm": 0.1530609428882599, + "learning_rate": 9.531255053180513e-06, + "loss": 0.0693, + "step": 41551 + }, + { + "epoch": 0.741126529447437, + "grad_norm": 0.2667560875415802, + "learning_rate": 9.530032309299835e-06, + "loss": 0.0942, + "step": 41552 + }, + { + "epoch": 0.7411443655691506, + "grad_norm": 0.27196401357650757, + "learning_rate": 9.528809625386103e-06, + "loss": 0.1479, + "step": 41553 + }, + { + "epoch": 0.7411622016908643, + "grad_norm": 0.26229971647262573, + "learning_rate": 9.527587001444042e-06, + "loss": 0.1587, + "step": 41554 + }, + { + "epoch": 0.741180037812578, + "grad_norm": 0.2506105601787567, + "learning_rate": 9.526364437478405e-06, + "loss": 0.105, + "step": 41555 + }, + { + "epoch": 0.7411978739342917, + "grad_norm": 0.39847680926322937, + "learning_rate": 9.525141933493925e-06, + "loss": 0.0968, + "step": 41556 + }, + { + "epoch": 0.7412157100560054, + "grad_norm": 0.2831707000732422, + "learning_rate": 9.52391948949534e-06, + "loss": 0.0951, + "step": 41557 + }, + { + "epoch": 0.7412335461777191, + "grad_norm": 0.2951960265636444, + "learning_rate": 9.522697105487386e-06, + "loss": 0.1528, + "step": 41558 + }, + { + "epoch": 0.7412513822994328, + "grad_norm": 0.2560045123100281, + "learning_rate": 9.521474781474795e-06, + "loss": 0.1544, + "step": 41559 + }, + { + "epoch": 0.7412692184211465, + "grad_norm": 0.30274707078933716, + "learning_rate": 9.520252517462324e-06, + "loss": 0.1416, + "step": 41560 + }, + { + "epoch": 0.7412870545428601, + "grad_norm": 0.3149712085723877, + "learning_rate": 9.519030313454702e-06, + "loss": 0.0914, + "step": 41561 + }, + { + "epoch": 0.741304890664574, + "grad_norm": 0.28240153193473816, + "learning_rate": 9.517808169456665e-06, + "loss": 0.1271, + "step": 41562 + }, + { + "epoch": 0.7413227267862876, + "grad_norm": 0.2329358011484146, + "learning_rate": 9.516586085472942e-06, + "loss": 0.1326, + "step": 41563 + }, + { + "epoch": 0.7413405629080013, + "grad_norm": 0.21053653955459595, + "learning_rate": 9.515364061508285e-06, + "loss": 0.1014, + "step": 41564 + }, + { + "epoch": 0.741358399029715, + "grad_norm": 0.21459347009658813, + "learning_rate": 9.51414209756743e-06, + "loss": 0.0801, + "step": 41565 + }, + { + "epoch": 0.7413762351514287, + "grad_norm": 0.26571404933929443, + "learning_rate": 9.512920193655098e-06, + "loss": 0.1171, + "step": 41566 + }, + { + "epoch": 0.7413940712731424, + "grad_norm": 0.21552519500255585, + "learning_rate": 9.511698349776044e-06, + "loss": 0.1118, + "step": 41567 + }, + { + "epoch": 0.7414119073948561, + "grad_norm": 0.22639957070350647, + "learning_rate": 9.51047656593499e-06, + "loss": 0.0571, + "step": 41568 + }, + { + "epoch": 0.7414297435165698, + "grad_norm": 0.2666633725166321, + "learning_rate": 9.509254842136683e-06, + "loss": 0.0961, + "step": 41569 + }, + { + "epoch": 0.7414475796382834, + "grad_norm": 0.3685295283794403, + "learning_rate": 9.508033178385858e-06, + "loss": 0.1326, + "step": 41570 + }, + { + "epoch": 0.7414654157599971, + "grad_norm": 0.2418726682662964, + "learning_rate": 9.506811574687249e-06, + "loss": 0.1272, + "step": 41571 + }, + { + "epoch": 0.7414832518817108, + "grad_norm": 0.24539007246494293, + "learning_rate": 9.505590031045577e-06, + "loss": 0.1288, + "step": 41572 + }, + { + "epoch": 0.7415010880034245, + "grad_norm": 0.25588253140449524, + "learning_rate": 9.5043685474656e-06, + "loss": 0.148, + "step": 41573 + }, + { + "epoch": 0.7415189241251382, + "grad_norm": 0.2501687705516815, + "learning_rate": 9.503147123952044e-06, + "loss": 0.1008, + "step": 41574 + }, + { + "epoch": 0.7415367602468519, + "grad_norm": 0.5458818674087524, + "learning_rate": 9.50192576050964e-06, + "loss": 0.1111, + "step": 41575 + }, + { + "epoch": 0.7415545963685656, + "grad_norm": 0.27154460549354553, + "learning_rate": 9.500704457143117e-06, + "loss": 0.0901, + "step": 41576 + }, + { + "epoch": 0.7415724324902793, + "grad_norm": 0.22834055125713348, + "learning_rate": 9.499483213857225e-06, + "loss": 0.129, + "step": 41577 + }, + { + "epoch": 0.741590268611993, + "grad_norm": 0.2994661331176758, + "learning_rate": 9.49826203065669e-06, + "loss": 0.1965, + "step": 41578 + }, + { + "epoch": 0.7416081047337068, + "grad_norm": 0.2642194926738739, + "learning_rate": 9.497040907546236e-06, + "loss": 0.1374, + "step": 41579 + }, + { + "epoch": 0.7416259408554204, + "grad_norm": 0.4218829870223999, + "learning_rate": 9.495819844530616e-06, + "loss": 0.1428, + "step": 41580 + }, + { + "epoch": 0.7416437769771341, + "grad_norm": 0.27938157320022583, + "learning_rate": 9.494598841614544e-06, + "loss": 0.1411, + "step": 41581 + }, + { + "epoch": 0.7416616130988478, + "grad_norm": 0.3028716742992401, + "learning_rate": 9.493377898802771e-06, + "loss": 0.0888, + "step": 41582 + }, + { + "epoch": 0.7416794492205615, + "grad_norm": 0.2963705062866211, + "learning_rate": 9.49215701610002e-06, + "loss": 0.1339, + "step": 41583 + }, + { + "epoch": 0.7416972853422752, + "grad_norm": 0.33887961506843567, + "learning_rate": 9.490936193511027e-06, + "loss": 0.1158, + "step": 41584 + }, + { + "epoch": 0.7417151214639889, + "grad_norm": 0.25907376408576965, + "learning_rate": 9.48971543104051e-06, + "loss": 0.0586, + "step": 41585 + }, + { + "epoch": 0.7417329575857026, + "grad_norm": 0.25996875762939453, + "learning_rate": 9.488494728693226e-06, + "loss": 0.1231, + "step": 41586 + }, + { + "epoch": 0.7417507937074163, + "grad_norm": 0.33476582169532776, + "learning_rate": 9.48727408647389e-06, + "loss": 0.129, + "step": 41587 + }, + { + "epoch": 0.7417686298291299, + "grad_norm": 0.2209702432155609, + "learning_rate": 9.486053504387241e-06, + "loss": 0.1152, + "step": 41588 + }, + { + "epoch": 0.7417864659508436, + "grad_norm": 0.29414498805999756, + "learning_rate": 9.484832982437996e-06, + "loss": 0.0627, + "step": 41589 + }, + { + "epoch": 0.7418043020725573, + "grad_norm": 0.22861534357070923, + "learning_rate": 9.483612520630905e-06, + "loss": 0.1152, + "step": 41590 + }, + { + "epoch": 0.741822138194271, + "grad_norm": 0.2732055187225342, + "learning_rate": 9.482392118970693e-06, + "loss": 0.1324, + "step": 41591 + }, + { + "epoch": 0.7418399743159847, + "grad_norm": 0.2440653145313263, + "learning_rate": 9.481171777462088e-06, + "loss": 0.1056, + "step": 41592 + }, + { + "epoch": 0.7418578104376984, + "grad_norm": 0.30423206090927124, + "learning_rate": 9.479951496109813e-06, + "loss": 0.1617, + "step": 41593 + }, + { + "epoch": 0.7418756465594121, + "grad_norm": 0.22783881425857544, + "learning_rate": 9.478731274918615e-06, + "loss": 0.0993, + "step": 41594 + }, + { + "epoch": 0.7418934826811259, + "grad_norm": 0.2019798755645752, + "learning_rate": 9.477511113893206e-06, + "loss": 0.0796, + "step": 41595 + }, + { + "epoch": 0.7419113188028396, + "grad_norm": 0.28052619099617004, + "learning_rate": 9.476291013038336e-06, + "loss": 0.0789, + "step": 41596 + }, + { + "epoch": 0.7419291549245532, + "grad_norm": 0.2511623203754425, + "learning_rate": 9.47507097235872e-06, + "loss": 0.0914, + "step": 41597 + }, + { + "epoch": 0.7419469910462669, + "grad_norm": 0.23255203664302826, + "learning_rate": 9.473850991859085e-06, + "loss": 0.1087, + "step": 41598 + }, + { + "epoch": 0.7419648271679806, + "grad_norm": 0.2532743513584137, + "learning_rate": 9.472631071544175e-06, + "loss": 0.1546, + "step": 41599 + }, + { + "epoch": 0.7419826632896943, + "grad_norm": 0.2276996374130249, + "learning_rate": 9.47141121141871e-06, + "loss": 0.1059, + "step": 41600 + }, + { + "epoch": 0.742000499411408, + "grad_norm": 0.21323199570178986, + "learning_rate": 9.470191411487416e-06, + "loss": 0.1058, + "step": 41601 + }, + { + "epoch": 0.7420183355331217, + "grad_norm": 0.23297640681266785, + "learning_rate": 9.468971671755018e-06, + "loss": 0.0949, + "step": 41602 + }, + { + "epoch": 0.7420361716548354, + "grad_norm": 0.3119436204433441, + "learning_rate": 9.46775199222626e-06, + "loss": 0.1469, + "step": 41603 + }, + { + "epoch": 0.7420540077765491, + "grad_norm": 0.25828230381011963, + "learning_rate": 9.466532372905856e-06, + "loss": 0.0885, + "step": 41604 + }, + { + "epoch": 0.7420718438982628, + "grad_norm": 0.36914077401161194, + "learning_rate": 9.465312813798537e-06, + "loss": 0.1114, + "step": 41605 + }, + { + "epoch": 0.7420896800199764, + "grad_norm": 0.24076122045516968, + "learning_rate": 9.464093314909025e-06, + "loss": 0.11, + "step": 41606 + }, + { + "epoch": 0.7421075161416901, + "grad_norm": 0.22149060666561127, + "learning_rate": 9.462873876242062e-06, + "loss": 0.0845, + "step": 41607 + }, + { + "epoch": 0.7421253522634038, + "grad_norm": 0.2874957323074341, + "learning_rate": 9.461654497802358e-06, + "loss": 0.1575, + "step": 41608 + }, + { + "epoch": 0.7421431883851175, + "grad_norm": 0.2312544286251068, + "learning_rate": 9.460435179594657e-06, + "loss": 0.101, + "step": 41609 + }, + { + "epoch": 0.7421610245068312, + "grad_norm": 0.21125483512878418, + "learning_rate": 9.459215921623673e-06, + "loss": 0.083, + "step": 41610 + }, + { + "epoch": 0.7421788606285449, + "grad_norm": 0.2313290387392044, + "learning_rate": 9.45799672389413e-06, + "loss": 0.1311, + "step": 41611 + }, + { + "epoch": 0.7421966967502587, + "grad_norm": 0.3128338158130646, + "learning_rate": 9.456777586410767e-06, + "loss": 0.1097, + "step": 41612 + }, + { + "epoch": 0.7422145328719724, + "grad_norm": 0.2656687796115875, + "learning_rate": 9.455558509178306e-06, + "loss": 0.1753, + "step": 41613 + }, + { + "epoch": 0.742232368993686, + "grad_norm": 0.2783730626106262, + "learning_rate": 9.454339492201464e-06, + "loss": 0.1045, + "step": 41614 + }, + { + "epoch": 0.7422502051153997, + "grad_norm": 0.2757885158061981, + "learning_rate": 9.453120535484967e-06, + "loss": 0.1274, + "step": 41615 + }, + { + "epoch": 0.7422680412371134, + "grad_norm": 0.35291942954063416, + "learning_rate": 9.45190163903355e-06, + "loss": 0.1221, + "step": 41616 + }, + { + "epoch": 0.7422858773588271, + "grad_norm": 0.30008742213249207, + "learning_rate": 9.450682802851934e-06, + "loss": 0.1236, + "step": 41617 + }, + { + "epoch": 0.7423037134805408, + "grad_norm": 0.20530201494693756, + "learning_rate": 9.449464026944841e-06, + "loss": 0.1177, + "step": 41618 + }, + { + "epoch": 0.7423215496022545, + "grad_norm": 0.23360618948936462, + "learning_rate": 9.448245311316989e-06, + "loss": 0.0906, + "step": 41619 + }, + { + "epoch": 0.7423393857239682, + "grad_norm": 0.40567994117736816, + "learning_rate": 9.447026655973118e-06, + "loss": 0.1394, + "step": 41620 + }, + { + "epoch": 0.7423572218456819, + "grad_norm": 0.26606419682502747, + "learning_rate": 9.445808060917943e-06, + "loss": 0.1421, + "step": 41621 + }, + { + "epoch": 0.7423750579673956, + "grad_norm": 0.39970555901527405, + "learning_rate": 9.44458952615618e-06, + "loss": 0.1361, + "step": 41622 + }, + { + "epoch": 0.7423928940891092, + "grad_norm": 0.3549562990665436, + "learning_rate": 9.443371051692568e-06, + "loss": 0.176, + "step": 41623 + }, + { + "epoch": 0.7424107302108229, + "grad_norm": 0.22855940461158752, + "learning_rate": 9.442152637531816e-06, + "loss": 0.0969, + "step": 41624 + }, + { + "epoch": 0.7424285663325366, + "grad_norm": 0.38711312413215637, + "learning_rate": 9.440934283678662e-06, + "loss": 0.119, + "step": 41625 + }, + { + "epoch": 0.7424464024542503, + "grad_norm": 0.31242865324020386, + "learning_rate": 9.43971599013782e-06, + "loss": 0.1494, + "step": 41626 + }, + { + "epoch": 0.742464238575964, + "grad_norm": 0.1833241730928421, + "learning_rate": 9.438497756914013e-06, + "loss": 0.1375, + "step": 41627 + }, + { + "epoch": 0.7424820746976777, + "grad_norm": 0.3609023094177246, + "learning_rate": 9.437279584011957e-06, + "loss": 0.1706, + "step": 41628 + }, + { + "epoch": 0.7424999108193915, + "grad_norm": 0.23171988129615784, + "learning_rate": 9.436061471436386e-06, + "loss": 0.0634, + "step": 41629 + }, + { + "epoch": 0.7425177469411052, + "grad_norm": 0.2211289405822754, + "learning_rate": 9.434843419192019e-06, + "loss": 0.1075, + "step": 41630 + }, + { + "epoch": 0.7425355830628189, + "grad_norm": 0.3138478696346283, + "learning_rate": 9.433625427283574e-06, + "loss": 0.1176, + "step": 41631 + }, + { + "epoch": 0.7425534191845325, + "grad_norm": 0.22442957758903503, + "learning_rate": 9.432407495715765e-06, + "loss": 0.1011, + "step": 41632 + }, + { + "epoch": 0.7425712553062462, + "grad_norm": 0.3194059729576111, + "learning_rate": 9.431189624493328e-06, + "loss": 0.1297, + "step": 41633 + }, + { + "epoch": 0.7425890914279599, + "grad_norm": 0.2304256409406662, + "learning_rate": 9.42997181362098e-06, + "loss": 0.078, + "step": 41634 + }, + { + "epoch": 0.7426069275496736, + "grad_norm": 0.3238835334777832, + "learning_rate": 9.428754063103429e-06, + "loss": 0.1025, + "step": 41635 + }, + { + "epoch": 0.7426247636713873, + "grad_norm": 0.25386375188827515, + "learning_rate": 9.427536372945414e-06, + "loss": 0.1182, + "step": 41636 + }, + { + "epoch": 0.742642599793101, + "grad_norm": 0.22706788778305054, + "learning_rate": 9.426318743151638e-06, + "loss": 0.0956, + "step": 41637 + }, + { + "epoch": 0.7426604359148147, + "grad_norm": 0.4195094704627991, + "learning_rate": 9.425101173726839e-06, + "loss": 0.1561, + "step": 41638 + }, + { + "epoch": 0.7426782720365284, + "grad_norm": 0.22369061410427094, + "learning_rate": 9.423883664675725e-06, + "loss": 0.119, + "step": 41639 + }, + { + "epoch": 0.742696108158242, + "grad_norm": 0.267567902803421, + "learning_rate": 9.422666216003021e-06, + "loss": 0.1405, + "step": 41640 + }, + { + "epoch": 0.7427139442799557, + "grad_norm": 0.25982314348220825, + "learning_rate": 9.42144882771343e-06, + "loss": 0.1121, + "step": 41641 + }, + { + "epoch": 0.7427317804016694, + "grad_norm": 0.3214619755744934, + "learning_rate": 9.420231499811696e-06, + "loss": 0.1267, + "step": 41642 + }, + { + "epoch": 0.7427496165233831, + "grad_norm": 0.256944864988327, + "learning_rate": 9.419014232302525e-06, + "loss": 0.0832, + "step": 41643 + }, + { + "epoch": 0.7427674526450968, + "grad_norm": 0.2596313953399658, + "learning_rate": 9.41779702519064e-06, + "loss": 0.0879, + "step": 41644 + }, + { + "epoch": 0.7427852887668105, + "grad_norm": 0.21887367963790894, + "learning_rate": 9.416579878480741e-06, + "loss": 0.0809, + "step": 41645 + }, + { + "epoch": 0.7428031248885243, + "grad_norm": 0.23044101893901825, + "learning_rate": 9.415362792177573e-06, + "loss": 0.1533, + "step": 41646 + }, + { + "epoch": 0.742820961010238, + "grad_norm": 0.2685229182243347, + "learning_rate": 9.41414576628584e-06, + "loss": 0.1467, + "step": 41647 + }, + { + "epoch": 0.7428387971319517, + "grad_norm": 0.32725685834884644, + "learning_rate": 9.412928800810262e-06, + "loss": 0.1515, + "step": 41648 + }, + { + "epoch": 0.7428566332536654, + "grad_norm": 0.2877674102783203, + "learning_rate": 9.411711895755546e-06, + "loss": 0.1085, + "step": 41649 + }, + { + "epoch": 0.742874469375379, + "grad_norm": 0.27718546986579895, + "learning_rate": 9.41049505112642e-06, + "loss": 0.1203, + "step": 41650 + }, + { + "epoch": 0.7428923054970927, + "grad_norm": 0.2505258619785309, + "learning_rate": 9.409278266927607e-06, + "loss": 0.1146, + "step": 41651 + }, + { + "epoch": 0.7429101416188064, + "grad_norm": 0.2803599536418915, + "learning_rate": 9.408061543163815e-06, + "loss": 0.155, + "step": 41652 + }, + { + "epoch": 0.7429279777405201, + "grad_norm": 0.24942843616008759, + "learning_rate": 9.406844879839766e-06, + "loss": 0.0881, + "step": 41653 + }, + { + "epoch": 0.7429458138622338, + "grad_norm": 0.2915973961353302, + "learning_rate": 9.40562827696016e-06, + "loss": 0.1182, + "step": 41654 + }, + { + "epoch": 0.7429636499839475, + "grad_norm": 0.27929285168647766, + "learning_rate": 9.404411734529734e-06, + "loss": 0.1444, + "step": 41655 + }, + { + "epoch": 0.7429814861056612, + "grad_norm": 0.2811204493045807, + "learning_rate": 9.403195252553197e-06, + "loss": 0.1637, + "step": 41656 + }, + { + "epoch": 0.7429993222273749, + "grad_norm": 0.1754535436630249, + "learning_rate": 9.401978831035257e-06, + "loss": 0.1145, + "step": 41657 + }, + { + "epoch": 0.7430171583490885, + "grad_norm": 0.17934869229793549, + "learning_rate": 9.400762469980631e-06, + "loss": 0.0898, + "step": 41658 + }, + { + "epoch": 0.7430349944708022, + "grad_norm": 0.2595594525337219, + "learning_rate": 9.399546169394044e-06, + "loss": 0.1311, + "step": 41659 + }, + { + "epoch": 0.7430528305925159, + "grad_norm": 0.2776778042316437, + "learning_rate": 9.398329929280203e-06, + "loss": 0.172, + "step": 41660 + }, + { + "epoch": 0.7430706667142296, + "grad_norm": 0.3605760335922241, + "learning_rate": 9.397113749643827e-06, + "loss": 0.1938, + "step": 41661 + }, + { + "epoch": 0.7430885028359433, + "grad_norm": 0.5109666585922241, + "learning_rate": 9.395897630489617e-06, + "loss": 0.1479, + "step": 41662 + }, + { + "epoch": 0.7431063389576571, + "grad_norm": 0.2370705008506775, + "learning_rate": 9.394681571822303e-06, + "loss": 0.071, + "step": 41663 + }, + { + "epoch": 0.7431241750793708, + "grad_norm": 0.3058117926120758, + "learning_rate": 9.393465573646587e-06, + "loss": 0.1264, + "step": 41664 + }, + { + "epoch": 0.7431420112010845, + "grad_norm": 0.3172210156917572, + "learning_rate": 9.3922496359672e-06, + "loss": 0.115, + "step": 41665 + }, + { + "epoch": 0.7431598473227982, + "grad_norm": 0.21875812113285065, + "learning_rate": 9.391033758788842e-06, + "loss": 0.1128, + "step": 41666 + }, + { + "epoch": 0.7431776834445118, + "grad_norm": 0.3352431654930115, + "learning_rate": 9.389817942116218e-06, + "loss": 0.0973, + "step": 41667 + }, + { + "epoch": 0.7431955195662255, + "grad_norm": 0.2931629717350006, + "learning_rate": 9.388602185954063e-06, + "loss": 0.0671, + "step": 41668 + }, + { + "epoch": 0.7432133556879392, + "grad_norm": 0.33040523529052734, + "learning_rate": 9.387386490307074e-06, + "loss": 0.1152, + "step": 41669 + }, + { + "epoch": 0.7432311918096529, + "grad_norm": 0.2503654658794403, + "learning_rate": 9.38617085517997e-06, + "loss": 0.1223, + "step": 41670 + }, + { + "epoch": 0.7432490279313666, + "grad_norm": 0.23381134867668152, + "learning_rate": 9.38495528057745e-06, + "loss": 0.1096, + "step": 41671 + }, + { + "epoch": 0.7432668640530803, + "grad_norm": 0.28774163126945496, + "learning_rate": 9.383739766504249e-06, + "loss": 0.1118, + "step": 41672 + }, + { + "epoch": 0.743284700174794, + "grad_norm": 0.29325708746910095, + "learning_rate": 9.382524312965063e-06, + "loss": 0.1174, + "step": 41673 + }, + { + "epoch": 0.7433025362965077, + "grad_norm": 0.28332558274269104, + "learning_rate": 9.38130891996461e-06, + "loss": 0.0771, + "step": 41674 + }, + { + "epoch": 0.7433203724182214, + "grad_norm": 0.34472018480300903, + "learning_rate": 9.380093587507596e-06, + "loss": 0.0835, + "step": 41675 + }, + { + "epoch": 0.743338208539935, + "grad_norm": 0.34437990188598633, + "learning_rate": 9.378878315598724e-06, + "loss": 0.1019, + "step": 41676 + }, + { + "epoch": 0.7433560446616487, + "grad_norm": 0.4410110116004944, + "learning_rate": 9.377663104242726e-06, + "loss": 0.1704, + "step": 41677 + }, + { + "epoch": 0.7433738807833624, + "grad_norm": 0.3495272397994995, + "learning_rate": 9.376447953444293e-06, + "loss": 0.1693, + "step": 41678 + }, + { + "epoch": 0.7433917169050761, + "grad_norm": 0.29913029074668884, + "learning_rate": 9.37523286320815e-06, + "loss": 0.1415, + "step": 41679 + }, + { + "epoch": 0.7434095530267899, + "grad_norm": 0.34118911623954773, + "learning_rate": 9.374017833538994e-06, + "loss": 0.1373, + "step": 41680 + }, + { + "epoch": 0.7434273891485036, + "grad_norm": 0.22000570595264435, + "learning_rate": 9.372802864441551e-06, + "loss": 0.1066, + "step": 41681 + }, + { + "epoch": 0.7434452252702173, + "grad_norm": 0.30710750818252563, + "learning_rate": 9.371587955920521e-06, + "loss": 0.0896, + "step": 41682 + }, + { + "epoch": 0.743463061391931, + "grad_norm": 0.42219114303588867, + "learning_rate": 9.370373107980614e-06, + "loss": 0.0753, + "step": 41683 + }, + { + "epoch": 0.7434808975136447, + "grad_norm": 0.24803954362869263, + "learning_rate": 9.369158320626533e-06, + "loss": 0.0828, + "step": 41684 + }, + { + "epoch": 0.7434987336353583, + "grad_norm": 0.25883179903030396, + "learning_rate": 9.367943593862998e-06, + "loss": 0.123, + "step": 41685 + }, + { + "epoch": 0.743516569757072, + "grad_norm": 0.2109529674053192, + "learning_rate": 9.366728927694715e-06, + "loss": 0.1172, + "step": 41686 + }, + { + "epoch": 0.7435344058787857, + "grad_norm": 0.46198832988739014, + "learning_rate": 9.36551432212639e-06, + "loss": 0.1567, + "step": 41687 + }, + { + "epoch": 0.7435522420004994, + "grad_norm": 0.30942273139953613, + "learning_rate": 9.364299777162733e-06, + "loss": 0.143, + "step": 41688 + }, + { + "epoch": 0.7435700781222131, + "grad_norm": 0.3172661364078522, + "learning_rate": 9.363085292808439e-06, + "loss": 0.1346, + "step": 41689 + }, + { + "epoch": 0.7435879142439268, + "grad_norm": 0.3130984902381897, + "learning_rate": 9.361870869068237e-06, + "loss": 0.1071, + "step": 41690 + }, + { + "epoch": 0.7436057503656405, + "grad_norm": 0.29129648208618164, + "learning_rate": 9.360656505946816e-06, + "loss": 0.116, + "step": 41691 + }, + { + "epoch": 0.7436235864873542, + "grad_norm": 0.3340189754962921, + "learning_rate": 9.359442203448898e-06, + "loss": 0.1475, + "step": 41692 + }, + { + "epoch": 0.7436414226090678, + "grad_norm": 0.2583788335323334, + "learning_rate": 9.35822796157918e-06, + "loss": 0.1531, + "step": 41693 + }, + { + "epoch": 0.7436592587307815, + "grad_norm": 0.27318018674850464, + "learning_rate": 9.35701378034238e-06, + "loss": 0.121, + "step": 41694 + }, + { + "epoch": 0.7436770948524952, + "grad_norm": 0.28395748138427734, + "learning_rate": 9.355799659743195e-06, + "loss": 0.1045, + "step": 41695 + }, + { + "epoch": 0.743694930974209, + "grad_norm": 0.26743993163108826, + "learning_rate": 9.354585599786336e-06, + "loss": 0.1335, + "step": 41696 + }, + { + "epoch": 0.7437127670959227, + "grad_norm": 0.26818326115608215, + "learning_rate": 9.353371600476498e-06, + "loss": 0.0945, + "step": 41697 + }, + { + "epoch": 0.7437306032176364, + "grad_norm": 0.33066803216934204, + "learning_rate": 9.352157661818406e-06, + "loss": 0.1129, + "step": 41698 + }, + { + "epoch": 0.7437484393393501, + "grad_norm": 0.2889024615287781, + "learning_rate": 9.350943783816754e-06, + "loss": 0.1323, + "step": 41699 + }, + { + "epoch": 0.7437662754610638, + "grad_norm": 0.27680182456970215, + "learning_rate": 9.349729966476249e-06, + "loss": 0.1289, + "step": 41700 + }, + { + "epoch": 0.7437841115827775, + "grad_norm": 0.22897565364837646, + "learning_rate": 9.348516209801597e-06, + "loss": 0.1019, + "step": 41701 + }, + { + "epoch": 0.7438019477044912, + "grad_norm": 0.2774960994720459, + "learning_rate": 9.347302513797492e-06, + "loss": 0.1127, + "step": 41702 + }, + { + "epoch": 0.7438197838262048, + "grad_norm": 0.2343015968799591, + "learning_rate": 9.34608887846866e-06, + "loss": 0.1015, + "step": 41703 + }, + { + "epoch": 0.7438376199479185, + "grad_norm": 0.2966124415397644, + "learning_rate": 9.344875303819789e-06, + "loss": 0.1197, + "step": 41704 + }, + { + "epoch": 0.7438554560696322, + "grad_norm": 0.20514456927776337, + "learning_rate": 9.343661789855584e-06, + "loss": 0.089, + "step": 41705 + }, + { + "epoch": 0.7438732921913459, + "grad_norm": 0.25766170024871826, + "learning_rate": 9.342448336580753e-06, + "loss": 0.0961, + "step": 41706 + }, + { + "epoch": 0.7438911283130596, + "grad_norm": 0.2840248942375183, + "learning_rate": 9.341234944000008e-06, + "loss": 0.0914, + "step": 41707 + }, + { + "epoch": 0.7439089644347733, + "grad_norm": 0.4357462227344513, + "learning_rate": 9.340021612118044e-06, + "loss": 0.1297, + "step": 41708 + }, + { + "epoch": 0.743926800556487, + "grad_norm": 0.18843814730644226, + "learning_rate": 9.338808340939567e-06, + "loss": 0.0712, + "step": 41709 + }, + { + "epoch": 0.7439446366782007, + "grad_norm": 0.3405860662460327, + "learning_rate": 9.337595130469267e-06, + "loss": 0.0777, + "step": 41710 + }, + { + "epoch": 0.7439624727999143, + "grad_norm": 0.2407132089138031, + "learning_rate": 9.336381980711867e-06, + "loss": 0.157, + "step": 41711 + }, + { + "epoch": 0.743980308921628, + "grad_norm": 0.31268998980522156, + "learning_rate": 9.335168891672064e-06, + "loss": 0.1321, + "step": 41712 + }, + { + "epoch": 0.7439981450433418, + "grad_norm": 0.21785883605480194, + "learning_rate": 9.333955863354551e-06, + "loss": 0.0967, + "step": 41713 + }, + { + "epoch": 0.7440159811650555, + "grad_norm": 0.28984639048576355, + "learning_rate": 9.332742895764041e-06, + "loss": 0.0917, + "step": 41714 + }, + { + "epoch": 0.7440338172867692, + "grad_norm": 0.2926657199859619, + "learning_rate": 9.33152998890522e-06, + "loss": 0.1134, + "step": 41715 + }, + { + "epoch": 0.7440516534084829, + "grad_norm": 0.20349322259426117, + "learning_rate": 9.33031714278281e-06, + "loss": 0.1034, + "step": 41716 + }, + { + "epoch": 0.7440694895301966, + "grad_norm": 0.37565159797668457, + "learning_rate": 9.329104357401503e-06, + "loss": 0.1565, + "step": 41717 + }, + { + "epoch": 0.7440873256519103, + "grad_norm": 0.25387582182884216, + "learning_rate": 9.32789163276599e-06, + "loss": 0.132, + "step": 41718 + }, + { + "epoch": 0.744105161773624, + "grad_norm": 0.310578852891922, + "learning_rate": 9.326678968880991e-06, + "loss": 0.1543, + "step": 41719 + }, + { + "epoch": 0.7441229978953376, + "grad_norm": 0.28139519691467285, + "learning_rate": 9.32546636575119e-06, + "loss": 0.1005, + "step": 41720 + }, + { + "epoch": 0.7441408340170513, + "grad_norm": 0.3438161313533783, + "learning_rate": 9.324253823381302e-06, + "loss": 0.1168, + "step": 41721 + }, + { + "epoch": 0.744158670138765, + "grad_norm": 0.2320399135351181, + "learning_rate": 9.323041341776023e-06, + "loss": 0.0977, + "step": 41722 + }, + { + "epoch": 0.7441765062604787, + "grad_norm": 0.2858046293258667, + "learning_rate": 9.32182892094004e-06, + "loss": 0.0854, + "step": 41723 + }, + { + "epoch": 0.7441943423821924, + "grad_norm": 0.27552470564842224, + "learning_rate": 9.320616560878073e-06, + "loss": 0.2017, + "step": 41724 + }, + { + "epoch": 0.7442121785039061, + "grad_norm": 0.22283224761486053, + "learning_rate": 9.319404261594811e-06, + "loss": 0.1036, + "step": 41725 + }, + { + "epoch": 0.7442300146256198, + "grad_norm": 0.2163795828819275, + "learning_rate": 9.318192023094951e-06, + "loss": 0.0761, + "step": 41726 + }, + { + "epoch": 0.7442478507473335, + "grad_norm": 0.27455413341522217, + "learning_rate": 9.3169798453832e-06, + "loss": 0.1096, + "step": 41727 + }, + { + "epoch": 0.7442656868690471, + "grad_norm": 0.22569192945957184, + "learning_rate": 9.315767728464241e-06, + "loss": 0.0914, + "step": 41728 + }, + { + "epoch": 0.7442835229907608, + "grad_norm": 0.26902055740356445, + "learning_rate": 9.314555672342792e-06, + "loss": 0.1045, + "step": 41729 + }, + { + "epoch": 0.7443013591124746, + "grad_norm": 0.29484423995018005, + "learning_rate": 9.313343677023542e-06, + "loss": 0.1213, + "step": 41730 + }, + { + "epoch": 0.7443191952341883, + "grad_norm": 0.26825323700904846, + "learning_rate": 9.312131742511192e-06, + "loss": 0.0943, + "step": 41731 + }, + { + "epoch": 0.744337031355902, + "grad_norm": 0.22511784732341766, + "learning_rate": 9.310919868810428e-06, + "loss": 0.0844, + "step": 41732 + }, + { + "epoch": 0.7443548674776157, + "grad_norm": 0.3141074478626251, + "learning_rate": 9.309708055925959e-06, + "loss": 0.1466, + "step": 41733 + }, + { + "epoch": 0.7443727035993294, + "grad_norm": 0.26542744040489197, + "learning_rate": 9.308496303862488e-06, + "loss": 0.1193, + "step": 41734 + }, + { + "epoch": 0.7443905397210431, + "grad_norm": 0.21172067523002625, + "learning_rate": 9.307284612624703e-06, + "loss": 0.1003, + "step": 41735 + }, + { + "epoch": 0.7444083758427568, + "grad_norm": 0.29889070987701416, + "learning_rate": 9.306072982217299e-06, + "loss": 0.1163, + "step": 41736 + }, + { + "epoch": 0.7444262119644705, + "grad_norm": 0.3729795217514038, + "learning_rate": 9.30486141264498e-06, + "loss": 0.1624, + "step": 41737 + }, + { + "epoch": 0.7444440480861841, + "grad_norm": 0.24151965975761414, + "learning_rate": 9.303649903912442e-06, + "loss": 0.0895, + "step": 41738 + }, + { + "epoch": 0.7444618842078978, + "grad_norm": 0.35498708486557007, + "learning_rate": 9.302438456024378e-06, + "loss": 0.113, + "step": 41739 + }, + { + "epoch": 0.7444797203296115, + "grad_norm": 0.27695244550704956, + "learning_rate": 9.301227068985477e-06, + "loss": 0.0955, + "step": 41740 + }, + { + "epoch": 0.7444975564513252, + "grad_norm": 0.18597276508808136, + "learning_rate": 9.300015742800449e-06, + "loss": 0.0615, + "step": 41741 + }, + { + "epoch": 0.7445153925730389, + "grad_norm": 0.28616073727607727, + "learning_rate": 9.298804477473983e-06, + "loss": 0.0898, + "step": 41742 + }, + { + "epoch": 0.7445332286947526, + "grad_norm": 0.29208970069885254, + "learning_rate": 9.297593273010774e-06, + "loss": 0.1568, + "step": 41743 + }, + { + "epoch": 0.7445510648164663, + "grad_norm": 0.2410343438386917, + "learning_rate": 9.296382129415515e-06, + "loss": 0.1178, + "step": 41744 + }, + { + "epoch": 0.74456890093818, + "grad_norm": 0.1810934841632843, + "learning_rate": 9.295171046692897e-06, + "loss": 0.0724, + "step": 41745 + }, + { + "epoch": 0.7445867370598936, + "grad_norm": 0.26009905338287354, + "learning_rate": 9.293960024847629e-06, + "loss": 0.0496, + "step": 41746 + }, + { + "epoch": 0.7446045731816074, + "grad_norm": 0.2826032340526581, + "learning_rate": 9.292749063884384e-06, + "loss": 0.1332, + "step": 41747 + }, + { + "epoch": 0.7446224093033211, + "grad_norm": 0.27991506457328796, + "learning_rate": 9.291538163807881e-06, + "loss": 0.0613, + "step": 41748 + }, + { + "epoch": 0.7446402454250348, + "grad_norm": 0.38155898451805115, + "learning_rate": 9.29032732462279e-06, + "loss": 0.0785, + "step": 41749 + }, + { + "epoch": 0.7446580815467485, + "grad_norm": 0.2500755190849304, + "learning_rate": 9.28911654633383e-06, + "loss": 0.1255, + "step": 41750 + }, + { + "epoch": 0.7446759176684622, + "grad_norm": 0.27290117740631104, + "learning_rate": 9.287905828945678e-06, + "loss": 0.0899, + "step": 41751 + }, + { + "epoch": 0.7446937537901759, + "grad_norm": 0.2371479868888855, + "learning_rate": 9.286695172463028e-06, + "loss": 0.0736, + "step": 41752 + }, + { + "epoch": 0.7447115899118896, + "grad_norm": 0.2186102271080017, + "learning_rate": 9.285484576890568e-06, + "loss": 0.0803, + "step": 41753 + }, + { + "epoch": 0.7447294260336033, + "grad_norm": 0.2484026998281479, + "learning_rate": 9.284274042233004e-06, + "loss": 0.1295, + "step": 41754 + }, + { + "epoch": 0.744747262155317, + "grad_norm": 0.35418692231178284, + "learning_rate": 9.283063568495024e-06, + "loss": 0.1442, + "step": 41755 + }, + { + "epoch": 0.7447650982770306, + "grad_norm": 0.2577676773071289, + "learning_rate": 9.281853155681316e-06, + "loss": 0.116, + "step": 41756 + }, + { + "epoch": 0.7447829343987443, + "grad_norm": 0.3181343674659729, + "learning_rate": 9.280642803796578e-06, + "loss": 0.0875, + "step": 41757 + }, + { + "epoch": 0.744800770520458, + "grad_norm": 0.34740081429481506, + "learning_rate": 9.279432512845485e-06, + "loss": 0.1719, + "step": 41758 + }, + { + "epoch": 0.7448186066421717, + "grad_norm": 0.24784712493419647, + "learning_rate": 9.278222282832752e-06, + "loss": 0.1246, + "step": 41759 + }, + { + "epoch": 0.7448364427638854, + "grad_norm": 0.24826543033123016, + "learning_rate": 9.277012113763051e-06, + "loss": 0.0715, + "step": 41760 + }, + { + "epoch": 0.7448542788855991, + "grad_norm": 0.3630891442298889, + "learning_rate": 9.275802005641091e-06, + "loss": 0.1775, + "step": 41761 + }, + { + "epoch": 0.7448721150073128, + "grad_norm": 0.22842375934123993, + "learning_rate": 9.274591958471542e-06, + "loss": 0.1188, + "step": 41762 + }, + { + "epoch": 0.7448899511290265, + "grad_norm": 0.3053423762321472, + "learning_rate": 9.273381972259116e-06, + "loss": 0.0667, + "step": 41763 + }, + { + "epoch": 0.7449077872507402, + "grad_norm": 0.30044570565223694, + "learning_rate": 9.272172047008493e-06, + "loss": 0.1381, + "step": 41764 + }, + { + "epoch": 0.7449256233724539, + "grad_norm": 0.332535982131958, + "learning_rate": 9.270962182724362e-06, + "loss": 0.111, + "step": 41765 + }, + { + "epoch": 0.7449434594941676, + "grad_norm": 0.2808431088924408, + "learning_rate": 9.269752379411408e-06, + "loss": 0.0989, + "step": 41766 + }, + { + "epoch": 0.7449612956158813, + "grad_norm": 0.23383773863315582, + "learning_rate": 9.268542637074335e-06, + "loss": 0.0917, + "step": 41767 + }, + { + "epoch": 0.744979131737595, + "grad_norm": 0.2602749466896057, + "learning_rate": 9.267332955717824e-06, + "loss": 0.1386, + "step": 41768 + }, + { + "epoch": 0.7449969678593087, + "grad_norm": 0.22628013789653778, + "learning_rate": 9.266123335346567e-06, + "loss": 0.0875, + "step": 41769 + }, + { + "epoch": 0.7450148039810224, + "grad_norm": 0.24421167373657227, + "learning_rate": 9.264913775965245e-06, + "loss": 0.1197, + "step": 41770 + }, + { + "epoch": 0.7450326401027361, + "grad_norm": 0.16801457107067108, + "learning_rate": 9.263704277578546e-06, + "loss": 0.0603, + "step": 41771 + }, + { + "epoch": 0.7450504762244498, + "grad_norm": 0.2294878214597702, + "learning_rate": 9.262494840191171e-06, + "loss": 0.0879, + "step": 41772 + }, + { + "epoch": 0.7450683123461634, + "grad_norm": 0.2590222954750061, + "learning_rate": 9.261285463807806e-06, + "loss": 0.0968, + "step": 41773 + }, + { + "epoch": 0.7450861484678771, + "grad_norm": 0.2385087013244629, + "learning_rate": 9.260076148433123e-06, + "loss": 0.1103, + "step": 41774 + }, + { + "epoch": 0.7451039845895908, + "grad_norm": 0.3576755225658417, + "learning_rate": 9.25886689407183e-06, + "loss": 0.0979, + "step": 41775 + }, + { + "epoch": 0.7451218207113045, + "grad_norm": 0.2186053842306137, + "learning_rate": 9.257657700728597e-06, + "loss": 0.082, + "step": 41776 + }, + { + "epoch": 0.7451396568330182, + "grad_norm": 0.2458042949438095, + "learning_rate": 9.25644856840813e-06, + "loss": 0.1449, + "step": 41777 + }, + { + "epoch": 0.7451574929547319, + "grad_norm": 0.24039867520332336, + "learning_rate": 9.255239497115103e-06, + "loss": 0.099, + "step": 41778 + }, + { + "epoch": 0.7451753290764456, + "grad_norm": 0.37679219245910645, + "learning_rate": 9.254030486854198e-06, + "loss": 0.1688, + "step": 41779 + }, + { + "epoch": 0.7451931651981593, + "grad_norm": 0.32191309332847595, + "learning_rate": 9.25282153763012e-06, + "loss": 0.1347, + "step": 41780 + }, + { + "epoch": 0.7452110013198731, + "grad_norm": 0.20005831122398376, + "learning_rate": 9.251612649447544e-06, + "loss": 0.105, + "step": 41781 + }, + { + "epoch": 0.7452288374415867, + "grad_norm": 0.23925656080245972, + "learning_rate": 9.250403822311158e-06, + "loss": 0.1005, + "step": 41782 + }, + { + "epoch": 0.7452466735633004, + "grad_norm": 0.26097792387008667, + "learning_rate": 9.249195056225643e-06, + "loss": 0.1252, + "step": 41783 + }, + { + "epoch": 0.7452645096850141, + "grad_norm": 0.35629400610923767, + "learning_rate": 9.247986351195681e-06, + "loss": 0.1504, + "step": 41784 + }, + { + "epoch": 0.7452823458067278, + "grad_norm": 0.49715733528137207, + "learning_rate": 9.246777707225973e-06, + "loss": 0.1504, + "step": 41785 + }, + { + "epoch": 0.7453001819284415, + "grad_norm": 0.21874253451824188, + "learning_rate": 9.245569124321196e-06, + "loss": 0.0918, + "step": 41786 + }, + { + "epoch": 0.7453180180501552, + "grad_norm": 0.26758113503456116, + "learning_rate": 9.244360602486027e-06, + "loss": 0.1203, + "step": 41787 + }, + { + "epoch": 0.7453358541718689, + "grad_norm": 0.1933550238609314, + "learning_rate": 9.243152141725165e-06, + "loss": 0.0584, + "step": 41788 + }, + { + "epoch": 0.7453536902935826, + "grad_norm": 0.2705863416194916, + "learning_rate": 9.24194374204328e-06, + "loss": 0.1201, + "step": 41789 + }, + { + "epoch": 0.7453715264152962, + "grad_norm": 0.2597334384918213, + "learning_rate": 9.24073540344507e-06, + "loss": 0.0875, + "step": 41790 + }, + { + "epoch": 0.7453893625370099, + "grad_norm": 0.2080470621585846, + "learning_rate": 9.239527125935216e-06, + "loss": 0.1186, + "step": 41791 + }, + { + "epoch": 0.7454071986587236, + "grad_norm": 0.4132150113582611, + "learning_rate": 9.238318909518387e-06, + "loss": 0.0939, + "step": 41792 + }, + { + "epoch": 0.7454250347804373, + "grad_norm": 0.26948103308677673, + "learning_rate": 9.237110754199287e-06, + "loss": 0.1208, + "step": 41793 + }, + { + "epoch": 0.745442870902151, + "grad_norm": 0.3922657370567322, + "learning_rate": 9.235902659982593e-06, + "loss": 0.1485, + "step": 41794 + }, + { + "epoch": 0.7454607070238647, + "grad_norm": 0.5338376760482788, + "learning_rate": 9.23469462687298e-06, + "loss": 0.1782, + "step": 41795 + }, + { + "epoch": 0.7454785431455784, + "grad_norm": 0.3584495782852173, + "learning_rate": 9.23348665487514e-06, + "loss": 0.123, + "step": 41796 + }, + { + "epoch": 0.7454963792672922, + "grad_norm": 0.2581416070461273, + "learning_rate": 9.232278743993742e-06, + "loss": 0.1079, + "step": 41797 + }, + { + "epoch": 0.7455142153890059, + "grad_norm": 0.24829868972301483, + "learning_rate": 9.231070894233484e-06, + "loss": 0.0715, + "step": 41798 + }, + { + "epoch": 0.7455320515107196, + "grad_norm": 0.22758758068084717, + "learning_rate": 9.229863105599044e-06, + "loss": 0.0634, + "step": 41799 + }, + { + "epoch": 0.7455498876324332, + "grad_norm": 0.2903639078140259, + "learning_rate": 9.2286553780951e-06, + "loss": 0.1261, + "step": 41800 + }, + { + "epoch": 0.7455677237541469, + "grad_norm": 0.2785182297229767, + "learning_rate": 9.227447711726325e-06, + "loss": 0.1041, + "step": 41801 + }, + { + "epoch": 0.7455855598758606, + "grad_norm": 0.3490147888660431, + "learning_rate": 9.226240106497421e-06, + "loss": 0.1769, + "step": 41802 + }, + { + "epoch": 0.7456033959975743, + "grad_norm": 0.29464948177337646, + "learning_rate": 9.22503256241305e-06, + "loss": 0.137, + "step": 41803 + }, + { + "epoch": 0.745621232119288, + "grad_norm": 0.25141823291778564, + "learning_rate": 9.223825079477911e-06, + "loss": 0.1141, + "step": 41804 + }, + { + "epoch": 0.7456390682410017, + "grad_norm": 0.24085195362567902, + "learning_rate": 9.222617657696664e-06, + "loss": 0.1138, + "step": 41805 + }, + { + "epoch": 0.7456569043627154, + "grad_norm": 0.2202644944190979, + "learning_rate": 9.22141029707401e-06, + "loss": 0.0877, + "step": 41806 + }, + { + "epoch": 0.745674740484429, + "grad_norm": 0.27243903279304504, + "learning_rate": 9.220202997614618e-06, + "loss": 0.1294, + "step": 41807 + }, + { + "epoch": 0.7456925766061427, + "grad_norm": 0.2497827112674713, + "learning_rate": 9.21899575932317e-06, + "loss": 0.1185, + "step": 41808 + }, + { + "epoch": 0.7457104127278564, + "grad_norm": 0.22586867213249207, + "learning_rate": 9.217788582204345e-06, + "loss": 0.1197, + "step": 41809 + }, + { + "epoch": 0.7457282488495701, + "grad_norm": 0.2817850410938263, + "learning_rate": 9.216581466262817e-06, + "loss": 0.1488, + "step": 41810 + }, + { + "epoch": 0.7457460849712838, + "grad_norm": 0.2709518373012543, + "learning_rate": 9.215374411503275e-06, + "loss": 0.1215, + "step": 41811 + }, + { + "epoch": 0.7457639210929975, + "grad_norm": 0.2206665277481079, + "learning_rate": 9.214167417930395e-06, + "loss": 0.1463, + "step": 41812 + }, + { + "epoch": 0.7457817572147112, + "grad_norm": 0.30456459522247314, + "learning_rate": 9.212960485548856e-06, + "loss": 0.1546, + "step": 41813 + }, + { + "epoch": 0.745799593336425, + "grad_norm": 0.23487740755081177, + "learning_rate": 9.211753614363326e-06, + "loss": 0.1532, + "step": 41814 + }, + { + "epoch": 0.7458174294581387, + "grad_norm": 0.260042667388916, + "learning_rate": 9.210546804378498e-06, + "loss": 0.1279, + "step": 41815 + }, + { + "epoch": 0.7458352655798524, + "grad_norm": 0.2977808713912964, + "learning_rate": 9.209340055599035e-06, + "loss": 0.1131, + "step": 41816 + }, + { + "epoch": 0.745853101701566, + "grad_norm": 0.30064624547958374, + "learning_rate": 9.208133368029633e-06, + "loss": 0.151, + "step": 41817 + }, + { + "epoch": 0.7458709378232797, + "grad_norm": 0.27859240770339966, + "learning_rate": 9.206926741674957e-06, + "loss": 0.1186, + "step": 41818 + }, + { + "epoch": 0.7458887739449934, + "grad_norm": 0.2517804801464081, + "learning_rate": 9.20572017653969e-06, + "loss": 0.11, + "step": 41819 + }, + { + "epoch": 0.7459066100667071, + "grad_norm": 0.3461153209209442, + "learning_rate": 9.20451367262851e-06, + "loss": 0.1277, + "step": 41820 + }, + { + "epoch": 0.7459244461884208, + "grad_norm": 0.4292374551296234, + "learning_rate": 9.20330722994609e-06, + "loss": 0.1332, + "step": 41821 + }, + { + "epoch": 0.7459422823101345, + "grad_norm": 0.2662316560745239, + "learning_rate": 9.202100848497106e-06, + "loss": 0.1502, + "step": 41822 + }, + { + "epoch": 0.7459601184318482, + "grad_norm": 0.3414556086063385, + "learning_rate": 9.200894528286227e-06, + "loss": 0.1396, + "step": 41823 + }, + { + "epoch": 0.7459779545535619, + "grad_norm": 0.21548150479793549, + "learning_rate": 9.199688269318149e-06, + "loss": 0.0896, + "step": 41824 + }, + { + "epoch": 0.7459957906752755, + "grad_norm": 0.24060171842575073, + "learning_rate": 9.198482071597533e-06, + "loss": 0.1242, + "step": 41825 + }, + { + "epoch": 0.7460136267969892, + "grad_norm": 0.26048561930656433, + "learning_rate": 9.197275935129062e-06, + "loss": 0.132, + "step": 41826 + }, + { + "epoch": 0.7460314629187029, + "grad_norm": 0.2417096197605133, + "learning_rate": 9.196069859917398e-06, + "loss": 0.1284, + "step": 41827 + }, + { + "epoch": 0.7460492990404166, + "grad_norm": 0.2765873074531555, + "learning_rate": 9.19486384596723e-06, + "loss": 0.1655, + "step": 41828 + }, + { + "epoch": 0.7460671351621303, + "grad_norm": 0.2587246596813202, + "learning_rate": 9.193657893283234e-06, + "loss": 0.1372, + "step": 41829 + }, + { + "epoch": 0.746084971283844, + "grad_norm": 0.21504376828670502, + "learning_rate": 9.192452001870067e-06, + "loss": 0.0769, + "step": 41830 + }, + { + "epoch": 0.7461028074055578, + "grad_norm": 0.3404028117656708, + "learning_rate": 9.191246171732427e-06, + "loss": 0.1434, + "step": 41831 + }, + { + "epoch": 0.7461206435272715, + "grad_norm": 0.32502004504203796, + "learning_rate": 9.190040402874966e-06, + "loss": 0.1412, + "step": 41832 + }, + { + "epoch": 0.7461384796489852, + "grad_norm": 0.35646936297416687, + "learning_rate": 9.18883469530238e-06, + "loss": 0.1476, + "step": 41833 + }, + { + "epoch": 0.7461563157706989, + "grad_norm": 0.24465464055538177, + "learning_rate": 9.18762904901933e-06, + "loss": 0.119, + "step": 41834 + }, + { + "epoch": 0.7461741518924125, + "grad_norm": 0.27104452252388, + "learning_rate": 9.186423464030492e-06, + "loss": 0.1813, + "step": 41835 + }, + { + "epoch": 0.7461919880141262, + "grad_norm": 0.2749364376068115, + "learning_rate": 9.18521794034053e-06, + "loss": 0.1251, + "step": 41836 + }, + { + "epoch": 0.7462098241358399, + "grad_norm": 0.21803925931453705, + "learning_rate": 9.184012477954134e-06, + "loss": 0.1068, + "step": 41837 + }, + { + "epoch": 0.7462276602575536, + "grad_norm": 0.18814244866371155, + "learning_rate": 9.182807076875969e-06, + "loss": 0.0738, + "step": 41838 + }, + { + "epoch": 0.7462454963792673, + "grad_norm": 0.25718310475349426, + "learning_rate": 9.181601737110707e-06, + "loss": 0.1538, + "step": 41839 + }, + { + "epoch": 0.746263332500981, + "grad_norm": 0.2642189562320709, + "learning_rate": 9.18039645866301e-06, + "loss": 0.108, + "step": 41840 + }, + { + "epoch": 0.7462811686226947, + "grad_norm": 0.37207135558128357, + "learning_rate": 9.179191241537568e-06, + "loss": 0.1048, + "step": 41841 + }, + { + "epoch": 0.7462990047444084, + "grad_norm": 0.2946400046348572, + "learning_rate": 9.177986085739046e-06, + "loss": 0.1288, + "step": 41842 + }, + { + "epoch": 0.746316840866122, + "grad_norm": 0.28700873255729675, + "learning_rate": 9.176780991272108e-06, + "loss": 0.1252, + "step": 41843 + }, + { + "epoch": 0.7463346769878357, + "grad_norm": 0.3128969073295593, + "learning_rate": 9.17557595814144e-06, + "loss": 0.0736, + "step": 41844 + }, + { + "epoch": 0.7463525131095494, + "grad_norm": 0.30640971660614014, + "learning_rate": 9.174370986351694e-06, + "loss": 0.1874, + "step": 41845 + }, + { + "epoch": 0.7463703492312631, + "grad_norm": 0.30637016892433167, + "learning_rate": 9.173166075907563e-06, + "loss": 0.1413, + "step": 41846 + }, + { + "epoch": 0.7463881853529768, + "grad_norm": 0.2354010045528412, + "learning_rate": 9.171961226813705e-06, + "loss": 0.1318, + "step": 41847 + }, + { + "epoch": 0.7464060214746906, + "grad_norm": 0.25598016381263733, + "learning_rate": 9.170756439074793e-06, + "loss": 0.1129, + "step": 41848 + }, + { + "epoch": 0.7464238575964043, + "grad_norm": 0.26738688349723816, + "learning_rate": 9.16955171269549e-06, + "loss": 0.1131, + "step": 41849 + }, + { + "epoch": 0.746441693718118, + "grad_norm": 0.25804081559181213, + "learning_rate": 9.168347047680478e-06, + "loss": 0.1105, + "step": 41850 + }, + { + "epoch": 0.7464595298398317, + "grad_norm": 0.23106598854064941, + "learning_rate": 9.167142444034421e-06, + "loss": 0.0633, + "step": 41851 + }, + { + "epoch": 0.7464773659615453, + "grad_norm": 0.28757357597351074, + "learning_rate": 9.16593790176199e-06, + "loss": 0.0743, + "step": 41852 + }, + { + "epoch": 0.746495202083259, + "grad_norm": 0.25843679904937744, + "learning_rate": 9.164733420867843e-06, + "loss": 0.0986, + "step": 41853 + }, + { + "epoch": 0.7465130382049727, + "grad_norm": 0.2869722843170166, + "learning_rate": 9.163529001356666e-06, + "loss": 0.1075, + "step": 41854 + }, + { + "epoch": 0.7465308743266864, + "grad_norm": 0.17662650346755981, + "learning_rate": 9.162324643233123e-06, + "loss": 0.1234, + "step": 41855 + }, + { + "epoch": 0.7465487104484001, + "grad_norm": 0.22272805869579315, + "learning_rate": 9.161120346501877e-06, + "loss": 0.147, + "step": 41856 + }, + { + "epoch": 0.7465665465701138, + "grad_norm": 0.376632422208786, + "learning_rate": 9.159916111167593e-06, + "loss": 0.1298, + "step": 41857 + }, + { + "epoch": 0.7465843826918275, + "grad_norm": 0.31782400608062744, + "learning_rate": 9.158711937234954e-06, + "loss": 0.1104, + "step": 41858 + }, + { + "epoch": 0.7466022188135412, + "grad_norm": 0.3512881398200989, + "learning_rate": 9.157507824708609e-06, + "loss": 0.2044, + "step": 41859 + }, + { + "epoch": 0.7466200549352549, + "grad_norm": 0.2636779546737671, + "learning_rate": 9.156303773593241e-06, + "loss": 0.0814, + "step": 41860 + }, + { + "epoch": 0.7466378910569685, + "grad_norm": 0.31799277663230896, + "learning_rate": 9.155099783893518e-06, + "loss": 0.135, + "step": 41861 + }, + { + "epoch": 0.7466557271786822, + "grad_norm": 0.2215995341539383, + "learning_rate": 9.15389585561409e-06, + "loss": 0.122, + "step": 41862 + }, + { + "epoch": 0.7466735633003959, + "grad_norm": 0.25963956117630005, + "learning_rate": 9.15269198875964e-06, + "loss": 0.1149, + "step": 41863 + }, + { + "epoch": 0.7466913994221096, + "grad_norm": 0.1972527652978897, + "learning_rate": 9.151488183334833e-06, + "loss": 0.1285, + "step": 41864 + }, + { + "epoch": 0.7467092355438234, + "grad_norm": 0.2374015599489212, + "learning_rate": 9.15028443934433e-06, + "loss": 0.1091, + "step": 41865 + }, + { + "epoch": 0.7467270716655371, + "grad_norm": 0.3324569761753082, + "learning_rate": 9.149080756792792e-06, + "loss": 0.1721, + "step": 41866 + }, + { + "epoch": 0.7467449077872508, + "grad_norm": 0.1996554434299469, + "learning_rate": 9.147877135684898e-06, + "loss": 0.0925, + "step": 41867 + }, + { + "epoch": 0.7467627439089645, + "grad_norm": 0.3236035704612732, + "learning_rate": 9.146673576025308e-06, + "loss": 0.1407, + "step": 41868 + }, + { + "epoch": 0.7467805800306782, + "grad_norm": 0.30322229862213135, + "learning_rate": 9.145470077818688e-06, + "loss": 0.2051, + "step": 41869 + }, + { + "epoch": 0.7467984161523918, + "grad_norm": 0.3037695288658142, + "learning_rate": 9.144266641069693e-06, + "loss": 0.1273, + "step": 41870 + }, + { + "epoch": 0.7468162522741055, + "grad_norm": 0.16107887029647827, + "learning_rate": 9.143063265783006e-06, + "loss": 0.1022, + "step": 41871 + }, + { + "epoch": 0.7468340883958192, + "grad_norm": 0.32069131731987, + "learning_rate": 9.141859951963273e-06, + "loss": 0.136, + "step": 41872 + }, + { + "epoch": 0.7468519245175329, + "grad_norm": 0.3409341871738434, + "learning_rate": 9.140656699615174e-06, + "loss": 0.2077, + "step": 41873 + }, + { + "epoch": 0.7468697606392466, + "grad_norm": 0.2663356065750122, + "learning_rate": 9.13945350874337e-06, + "loss": 0.1309, + "step": 41874 + }, + { + "epoch": 0.7468875967609603, + "grad_norm": 0.2814778983592987, + "learning_rate": 9.138250379352515e-06, + "loss": 0.0596, + "step": 41875 + }, + { + "epoch": 0.746905432882674, + "grad_norm": 0.33803728222846985, + "learning_rate": 9.137047311447289e-06, + "loss": 0.1574, + "step": 41876 + }, + { + "epoch": 0.7469232690043877, + "grad_norm": 0.2891872525215149, + "learning_rate": 9.135844305032343e-06, + "loss": 0.1087, + "step": 41877 + }, + { + "epoch": 0.7469411051261013, + "grad_norm": 0.3055242598056793, + "learning_rate": 9.134641360112345e-06, + "loss": 0.1313, + "step": 41878 + }, + { + "epoch": 0.746958941247815, + "grad_norm": 0.2337963730096817, + "learning_rate": 9.133438476691949e-06, + "loss": 0.0867, + "step": 41879 + }, + { + "epoch": 0.7469767773695287, + "grad_norm": 0.22477470338344574, + "learning_rate": 9.132235654775834e-06, + "loss": 0.1195, + "step": 41880 + }, + { + "epoch": 0.7469946134912424, + "grad_norm": 0.3041396737098694, + "learning_rate": 9.131032894368655e-06, + "loss": 0.1671, + "step": 41881 + }, + { + "epoch": 0.7470124496129562, + "grad_norm": 0.21302111446857452, + "learning_rate": 9.129830195475073e-06, + "loss": 0.1187, + "step": 41882 + }, + { + "epoch": 0.7470302857346699, + "grad_norm": 0.3032519221305847, + "learning_rate": 9.128627558099743e-06, + "loss": 0.1515, + "step": 41883 + }, + { + "epoch": 0.7470481218563836, + "grad_norm": 0.22368109226226807, + "learning_rate": 9.12742498224734e-06, + "loss": 0.1127, + "step": 41884 + }, + { + "epoch": 0.7470659579780973, + "grad_norm": 0.276267409324646, + "learning_rate": 9.12622246792252e-06, + "loss": 0.1083, + "step": 41885 + }, + { + "epoch": 0.747083794099811, + "grad_norm": 0.2245662808418274, + "learning_rate": 9.12502001512994e-06, + "loss": 0.1281, + "step": 41886 + }, + { + "epoch": 0.7471016302215246, + "grad_norm": 0.410659521818161, + "learning_rate": 9.123817623874271e-06, + "loss": 0.1719, + "step": 41887 + }, + { + "epoch": 0.7471194663432383, + "grad_norm": 0.23370492458343506, + "learning_rate": 9.122615294160159e-06, + "loss": 0.0795, + "step": 41888 + }, + { + "epoch": 0.747137302464952, + "grad_norm": 0.2496359646320343, + "learning_rate": 9.121413025992284e-06, + "loss": 0.1233, + "step": 41889 + }, + { + "epoch": 0.7471551385866657, + "grad_norm": 0.28118282556533813, + "learning_rate": 9.120210819375297e-06, + "loss": 0.1572, + "step": 41890 + }, + { + "epoch": 0.7471729747083794, + "grad_norm": 0.3433353006839752, + "learning_rate": 9.119008674313856e-06, + "loss": 0.0971, + "step": 41891 + }, + { + "epoch": 0.7471908108300931, + "grad_norm": 0.29779717326164246, + "learning_rate": 9.117806590812614e-06, + "loss": 0.0865, + "step": 41892 + }, + { + "epoch": 0.7472086469518068, + "grad_norm": 0.17946451902389526, + "learning_rate": 9.116604568876248e-06, + "loss": 0.0541, + "step": 41893 + }, + { + "epoch": 0.7472264830735205, + "grad_norm": 0.2596747875213623, + "learning_rate": 9.11540260850941e-06, + "loss": 0.1045, + "step": 41894 + }, + { + "epoch": 0.7472443191952342, + "grad_norm": 0.2925165593624115, + "learning_rate": 9.114200709716755e-06, + "loss": 0.0576, + "step": 41895 + }, + { + "epoch": 0.7472621553169478, + "grad_norm": 0.2701359987258911, + "learning_rate": 9.11299887250294e-06, + "loss": 0.1403, + "step": 41896 + }, + { + "epoch": 0.7472799914386615, + "grad_norm": 0.22653523087501526, + "learning_rate": 9.111797096872634e-06, + "loss": 0.1479, + "step": 41897 + }, + { + "epoch": 0.7472978275603753, + "grad_norm": 0.31115207076072693, + "learning_rate": 9.110595382830491e-06, + "loss": 0.0992, + "step": 41898 + }, + { + "epoch": 0.747315663682089, + "grad_norm": 0.2689175605773926, + "learning_rate": 9.109393730381161e-06, + "loss": 0.0828, + "step": 41899 + }, + { + "epoch": 0.7473334998038027, + "grad_norm": 0.25736773014068604, + "learning_rate": 9.108192139529317e-06, + "loss": 0.1645, + "step": 41900 + }, + { + "epoch": 0.7473513359255164, + "grad_norm": 0.18629489839076996, + "learning_rate": 9.1069906102796e-06, + "loss": 0.0726, + "step": 41901 + }, + { + "epoch": 0.7473691720472301, + "grad_norm": 0.2451358139514923, + "learning_rate": 9.105789142636686e-06, + "loss": 0.1332, + "step": 41902 + }, + { + "epoch": 0.7473870081689438, + "grad_norm": 0.29196950793266296, + "learning_rate": 9.104587736605222e-06, + "loss": 0.1207, + "step": 41903 + }, + { + "epoch": 0.7474048442906575, + "grad_norm": 0.2902083396911621, + "learning_rate": 9.103386392189866e-06, + "loss": 0.1817, + "step": 41904 + }, + { + "epoch": 0.7474226804123711, + "grad_norm": 0.3537599444389343, + "learning_rate": 9.102185109395267e-06, + "loss": 0.1035, + "step": 41905 + }, + { + "epoch": 0.7474405165340848, + "grad_norm": 0.24687115848064423, + "learning_rate": 9.100983888226097e-06, + "loss": 0.1422, + "step": 41906 + }, + { + "epoch": 0.7474583526557985, + "grad_norm": 0.36129337549209595, + "learning_rate": 9.099782728687006e-06, + "loss": 0.1328, + "step": 41907 + }, + { + "epoch": 0.7474761887775122, + "grad_norm": 0.3037579655647278, + "learning_rate": 9.098581630782648e-06, + "loss": 0.1267, + "step": 41908 + }, + { + "epoch": 0.7474940248992259, + "grad_norm": 0.2956200838088989, + "learning_rate": 9.097380594517674e-06, + "loss": 0.1144, + "step": 41909 + }, + { + "epoch": 0.7475118610209396, + "grad_norm": 0.2353220134973526, + "learning_rate": 9.096179619896753e-06, + "loss": 0.0871, + "step": 41910 + }, + { + "epoch": 0.7475296971426533, + "grad_norm": 0.31837642192840576, + "learning_rate": 9.094978706924531e-06, + "loss": 0.1366, + "step": 41911 + }, + { + "epoch": 0.747547533264367, + "grad_norm": 0.2602744996547699, + "learning_rate": 9.093777855605664e-06, + "loss": 0.0991, + "step": 41912 + }, + { + "epoch": 0.7475653693860806, + "grad_norm": 0.23165516555309296, + "learning_rate": 9.092577065944802e-06, + "loss": 0.0838, + "step": 41913 + }, + { + "epoch": 0.7475832055077943, + "grad_norm": 0.2634504437446594, + "learning_rate": 9.091376337946605e-06, + "loss": 0.1239, + "step": 41914 + }, + { + "epoch": 0.7476010416295081, + "grad_norm": 0.28072696924209595, + "learning_rate": 9.090175671615736e-06, + "loss": 0.1239, + "step": 41915 + }, + { + "epoch": 0.7476188777512218, + "grad_norm": 0.2594597041606903, + "learning_rate": 9.08897506695684e-06, + "loss": 0.124, + "step": 41916 + }, + { + "epoch": 0.7476367138729355, + "grad_norm": 0.25251322984695435, + "learning_rate": 9.087774523974575e-06, + "loss": 0.1194, + "step": 41917 + }, + { + "epoch": 0.7476545499946492, + "grad_norm": 0.2923622727394104, + "learning_rate": 9.086574042673578e-06, + "loss": 0.1283, + "step": 41918 + }, + { + "epoch": 0.7476723861163629, + "grad_norm": 0.26804476976394653, + "learning_rate": 9.085373623058529e-06, + "loss": 0.0917, + "step": 41919 + }, + { + "epoch": 0.7476902222380766, + "grad_norm": 0.27839505672454834, + "learning_rate": 9.084173265134068e-06, + "loss": 0.161, + "step": 41920 + }, + { + "epoch": 0.7477080583597903, + "grad_norm": 0.38782840967178345, + "learning_rate": 9.082972968904852e-06, + "loss": 0.0625, + "step": 41921 + }, + { + "epoch": 0.747725894481504, + "grad_norm": 0.26456138491630554, + "learning_rate": 9.081772734375519e-06, + "loss": 0.0916, + "step": 41922 + }, + { + "epoch": 0.7477437306032176, + "grad_norm": 0.1794004589319229, + "learning_rate": 9.080572561550737e-06, + "loss": 0.0508, + "step": 41923 + }, + { + "epoch": 0.7477615667249313, + "grad_norm": 0.369101881980896, + "learning_rate": 9.079372450435159e-06, + "loss": 0.1171, + "step": 41924 + }, + { + "epoch": 0.747779402846645, + "grad_norm": 0.2246226668357849, + "learning_rate": 9.078172401033433e-06, + "loss": 0.1299, + "step": 41925 + }, + { + "epoch": 0.7477972389683587, + "grad_norm": 0.23060117661952972, + "learning_rate": 9.0769724133502e-06, + "loss": 0.1031, + "step": 41926 + }, + { + "epoch": 0.7478150750900724, + "grad_norm": 0.24144943058490753, + "learning_rate": 9.075772487390128e-06, + "loss": 0.0823, + "step": 41927 + }, + { + "epoch": 0.7478329112117861, + "grad_norm": 0.30769750475883484, + "learning_rate": 9.074572623157856e-06, + "loss": 0.089, + "step": 41928 + }, + { + "epoch": 0.7478507473334998, + "grad_norm": 0.2637888193130493, + "learning_rate": 9.073372820658046e-06, + "loss": 0.1166, + "step": 41929 + }, + { + "epoch": 0.7478685834552135, + "grad_norm": 0.20476548373699188, + "learning_rate": 9.07217307989535e-06, + "loss": 0.078, + "step": 41930 + }, + { + "epoch": 0.7478864195769271, + "grad_norm": 0.24336165189743042, + "learning_rate": 9.0709734008744e-06, + "loss": 0.1012, + "step": 41931 + }, + { + "epoch": 0.7479042556986409, + "grad_norm": 0.343633770942688, + "learning_rate": 9.069773783599866e-06, + "loss": 0.1274, + "step": 41932 + }, + { + "epoch": 0.7479220918203546, + "grad_norm": 0.2840675711631775, + "learning_rate": 9.068574228076393e-06, + "loss": 0.1223, + "step": 41933 + }, + { + "epoch": 0.7479399279420683, + "grad_norm": 0.34654518961906433, + "learning_rate": 9.067374734308628e-06, + "loss": 0.1488, + "step": 41934 + }, + { + "epoch": 0.747957764063782, + "grad_norm": 0.275153785943985, + "learning_rate": 9.066175302301213e-06, + "loss": 0.1141, + "step": 41935 + }, + { + "epoch": 0.7479756001854957, + "grad_norm": 0.21194010972976685, + "learning_rate": 9.064975932058815e-06, + "loss": 0.0894, + "step": 41936 + }, + { + "epoch": 0.7479934363072094, + "grad_norm": 0.23151636123657227, + "learning_rate": 9.063776623586073e-06, + "loss": 0.1118, + "step": 41937 + }, + { + "epoch": 0.7480112724289231, + "grad_norm": 0.26441770792007446, + "learning_rate": 9.062577376887638e-06, + "loss": 0.1334, + "step": 41938 + }, + { + "epoch": 0.7480291085506368, + "grad_norm": 0.24598845839500427, + "learning_rate": 9.061378191968158e-06, + "loss": 0.1389, + "step": 41939 + }, + { + "epoch": 0.7480469446723504, + "grad_norm": 0.3421214520931244, + "learning_rate": 9.060179068832272e-06, + "loss": 0.1623, + "step": 41940 + }, + { + "epoch": 0.7480647807940641, + "grad_norm": 0.2248920500278473, + "learning_rate": 9.058980007484638e-06, + "loss": 0.0963, + "step": 41941 + }, + { + "epoch": 0.7480826169157778, + "grad_norm": 0.33854901790618896, + "learning_rate": 9.05778100792991e-06, + "loss": 0.1476, + "step": 41942 + }, + { + "epoch": 0.7481004530374915, + "grad_norm": 0.24359679222106934, + "learning_rate": 9.05658207017273e-06, + "loss": 0.1639, + "step": 41943 + }, + { + "epoch": 0.7481182891592052, + "grad_norm": 0.32375413179397583, + "learning_rate": 9.05538319421774e-06, + "loss": 0.1094, + "step": 41944 + }, + { + "epoch": 0.7481361252809189, + "grad_norm": 0.2316959798336029, + "learning_rate": 9.054184380069597e-06, + "loss": 0.102, + "step": 41945 + }, + { + "epoch": 0.7481539614026326, + "grad_norm": 0.2827676236629486, + "learning_rate": 9.052985627732941e-06, + "loss": 0.0951, + "step": 41946 + }, + { + "epoch": 0.7481717975243463, + "grad_norm": 0.33458948135375977, + "learning_rate": 9.051786937212422e-06, + "loss": 0.1243, + "step": 41947 + }, + { + "epoch": 0.74818963364606, + "grad_norm": 0.35230180621147156, + "learning_rate": 9.050588308512677e-06, + "loss": 0.1489, + "step": 41948 + }, + { + "epoch": 0.7482074697677737, + "grad_norm": 0.3614899814128876, + "learning_rate": 9.04938974163837e-06, + "loss": 0.0851, + "step": 41949 + }, + { + "epoch": 0.7482253058894874, + "grad_norm": 0.33453720808029175, + "learning_rate": 9.048191236594137e-06, + "loss": 0.1398, + "step": 41950 + }, + { + "epoch": 0.7482431420112011, + "grad_norm": 0.22242973744869232, + "learning_rate": 9.046992793384623e-06, + "loss": 0.0831, + "step": 41951 + }, + { + "epoch": 0.7482609781329148, + "grad_norm": 0.27579745650291443, + "learning_rate": 9.045794412014477e-06, + "loss": 0.1073, + "step": 41952 + }, + { + "epoch": 0.7482788142546285, + "grad_norm": 0.20397932827472687, + "learning_rate": 9.044596092488331e-06, + "loss": 0.1068, + "step": 41953 + }, + { + "epoch": 0.7482966503763422, + "grad_norm": 0.298859566450119, + "learning_rate": 9.043397834810852e-06, + "loss": 0.0699, + "step": 41954 + }, + { + "epoch": 0.7483144864980559, + "grad_norm": 0.30979788303375244, + "learning_rate": 9.042199638986665e-06, + "loss": 0.0881, + "step": 41955 + }, + { + "epoch": 0.7483323226197696, + "grad_norm": 0.3030558228492737, + "learning_rate": 9.041001505020433e-06, + "loss": 0.1029, + "step": 41956 + }, + { + "epoch": 0.7483501587414833, + "grad_norm": 0.271344393491745, + "learning_rate": 9.039803432916782e-06, + "loss": 0.1038, + "step": 41957 + }, + { + "epoch": 0.7483679948631969, + "grad_norm": 0.27243441343307495, + "learning_rate": 9.038605422680375e-06, + "loss": 0.1329, + "step": 41958 + }, + { + "epoch": 0.7483858309849106, + "grad_norm": 0.2747722566127777, + "learning_rate": 9.037407474315845e-06, + "loss": 0.0858, + "step": 41959 + }, + { + "epoch": 0.7484036671066243, + "grad_norm": 0.27515843510627747, + "learning_rate": 9.036209587827838e-06, + "loss": 0.1037, + "step": 41960 + }, + { + "epoch": 0.748421503228338, + "grad_norm": 0.2827395498752594, + "learning_rate": 9.035011763220984e-06, + "loss": 0.1217, + "step": 41961 + }, + { + "epoch": 0.7484393393500517, + "grad_norm": 0.3788807690143585, + "learning_rate": 9.03381400049995e-06, + "loss": 0.1032, + "step": 41962 + }, + { + "epoch": 0.7484571754717654, + "grad_norm": 0.3126762807369232, + "learning_rate": 9.03261629966937e-06, + "loss": 0.104, + "step": 41963 + }, + { + "epoch": 0.7484750115934791, + "grad_norm": 0.2632387578487396, + "learning_rate": 9.031418660733882e-06, + "loss": 0.1496, + "step": 41964 + }, + { + "epoch": 0.7484928477151928, + "grad_norm": 0.2825818657875061, + "learning_rate": 9.030221083698129e-06, + "loss": 0.1696, + "step": 41965 + }, + { + "epoch": 0.7485106838369066, + "grad_norm": 0.31543752551078796, + "learning_rate": 9.02902356856675e-06, + "loss": 0.1301, + "step": 41966 + }, + { + "epoch": 0.7485285199586202, + "grad_norm": 0.2320011854171753, + "learning_rate": 9.027826115344396e-06, + "loss": 0.0929, + "step": 41967 + }, + { + "epoch": 0.7485463560803339, + "grad_norm": 0.23497483134269714, + "learning_rate": 9.026628724035699e-06, + "loss": 0.1323, + "step": 41968 + }, + { + "epoch": 0.7485641922020476, + "grad_norm": 0.331898957490921, + "learning_rate": 9.025431394645315e-06, + "loss": 0.0704, + "step": 41969 + }, + { + "epoch": 0.7485820283237613, + "grad_norm": 0.24425993859767914, + "learning_rate": 9.024234127177866e-06, + "loss": 0.1031, + "step": 41970 + }, + { + "epoch": 0.748599864445475, + "grad_norm": 0.30189934372901917, + "learning_rate": 9.023036921638012e-06, + "loss": 0.102, + "step": 41971 + }, + { + "epoch": 0.7486177005671887, + "grad_norm": 0.29834675788879395, + "learning_rate": 9.021839778030386e-06, + "loss": 0.1233, + "step": 41972 + }, + { + "epoch": 0.7486355366889024, + "grad_norm": 0.3311809301376343, + "learning_rate": 9.020642696359628e-06, + "loss": 0.0887, + "step": 41973 + }, + { + "epoch": 0.7486533728106161, + "grad_norm": 0.34275487065315247, + "learning_rate": 9.019445676630368e-06, + "loss": 0.1317, + "step": 41974 + }, + { + "epoch": 0.7486712089323297, + "grad_norm": 0.2713060677051544, + "learning_rate": 9.018248718847266e-06, + "loss": 0.1187, + "step": 41975 + }, + { + "epoch": 0.7486890450540434, + "grad_norm": 0.38590869307518005, + "learning_rate": 9.017051823014952e-06, + "loss": 0.1114, + "step": 41976 + }, + { + "epoch": 0.7487068811757571, + "grad_norm": 0.266640841960907, + "learning_rate": 9.015854989138064e-06, + "loss": 0.1332, + "step": 41977 + }, + { + "epoch": 0.7487247172974708, + "grad_norm": 0.21670185029506683, + "learning_rate": 9.014658217221244e-06, + "loss": 0.0928, + "step": 41978 + }, + { + "epoch": 0.7487425534191845, + "grad_norm": 0.3271774351596832, + "learning_rate": 9.013461507269122e-06, + "loss": 0.0934, + "step": 41979 + }, + { + "epoch": 0.7487603895408982, + "grad_norm": 0.25897735357284546, + "learning_rate": 9.012264859286351e-06, + "loss": 0.0799, + "step": 41980 + }, + { + "epoch": 0.7487782256626119, + "grad_norm": 0.35139474272727966, + "learning_rate": 9.011068273277566e-06, + "loss": 0.1707, + "step": 41981 + }, + { + "epoch": 0.7487960617843256, + "grad_norm": 0.3317379355430603, + "learning_rate": 9.009871749247392e-06, + "loss": 0.1065, + "step": 41982 + }, + { + "epoch": 0.7488138979060394, + "grad_norm": 0.42130887508392334, + "learning_rate": 9.008675287200489e-06, + "loss": 0.1363, + "step": 41983 + }, + { + "epoch": 0.748831734027753, + "grad_norm": 0.2340133637189865, + "learning_rate": 9.007478887141471e-06, + "loss": 0.1094, + "step": 41984 + }, + { + "epoch": 0.7488495701494667, + "grad_norm": 0.24788811802864075, + "learning_rate": 9.006282549075001e-06, + "loss": 0.1536, + "step": 41985 + }, + { + "epoch": 0.7488674062711804, + "grad_norm": 0.2535175085067749, + "learning_rate": 9.005086273005703e-06, + "loss": 0.1235, + "step": 41986 + }, + { + "epoch": 0.7488852423928941, + "grad_norm": 0.2354431003332138, + "learning_rate": 9.003890058938205e-06, + "loss": 0.0584, + "step": 41987 + }, + { + "epoch": 0.7489030785146078, + "grad_norm": 0.3969897925853729, + "learning_rate": 9.002693906877164e-06, + "loss": 0.1905, + "step": 41988 + }, + { + "epoch": 0.7489209146363215, + "grad_norm": 0.18769477307796478, + "learning_rate": 9.001497816827205e-06, + "loss": 0.108, + "step": 41989 + }, + { + "epoch": 0.7489387507580352, + "grad_norm": 0.3645225465297699, + "learning_rate": 9.00030178879297e-06, + "loss": 0.183, + "step": 41990 + }, + { + "epoch": 0.7489565868797489, + "grad_norm": 0.2234112024307251, + "learning_rate": 8.999105822779089e-06, + "loss": 0.0927, + "step": 41991 + }, + { + "epoch": 0.7489744230014626, + "grad_norm": 0.3051113784313202, + "learning_rate": 8.99790991879019e-06, + "loss": 0.1254, + "step": 41992 + }, + { + "epoch": 0.7489922591231762, + "grad_norm": 0.3288295269012451, + "learning_rate": 8.996714076830931e-06, + "loss": 0.1049, + "step": 41993 + }, + { + "epoch": 0.7490100952448899, + "grad_norm": 0.26711562275886536, + "learning_rate": 8.995518296905934e-06, + "loss": 0.0957, + "step": 41994 + }, + { + "epoch": 0.7490279313666036, + "grad_norm": 0.267429918050766, + "learning_rate": 8.994322579019827e-06, + "loss": 0.1372, + "step": 41995 + }, + { + "epoch": 0.7490457674883173, + "grad_norm": 0.27344921231269836, + "learning_rate": 8.993126923177262e-06, + "loss": 0.1134, + "step": 41996 + }, + { + "epoch": 0.749063603610031, + "grad_norm": 0.2872573733329773, + "learning_rate": 8.991931329382857e-06, + "loss": 0.1371, + "step": 41997 + }, + { + "epoch": 0.7490814397317447, + "grad_norm": 0.19753947854042053, + "learning_rate": 8.990735797641268e-06, + "loss": 0.1119, + "step": 41998 + }, + { + "epoch": 0.7490992758534585, + "grad_norm": 0.4069095551967621, + "learning_rate": 8.98954032795711e-06, + "loss": 0.1658, + "step": 41999 + }, + { + "epoch": 0.7491171119751722, + "grad_norm": 0.28794756531715393, + "learning_rate": 8.988344920335018e-06, + "loss": 0.0889, + "step": 42000 + }, + { + "epoch": 0.7491171119751722, + "eval_loss": 0.11315001547336578, + "eval_runtime": 107.1699, + "eval_samples_per_second": 9.555, + "eval_steps_per_second": 1.596, + "step": 42000 + }, + { + "epoch": 0.7491349480968859, + "grad_norm": 0.33742472529411316, + "learning_rate": 8.98714957477964e-06, + "loss": 0.1144, + "step": 42001 + }, + { + "epoch": 0.7491527842185995, + "grad_norm": 0.28506872057914734, + "learning_rate": 8.9859542912956e-06, + "loss": 0.1627, + "step": 42002 + }, + { + "epoch": 0.7491706203403132, + "grad_norm": 0.28201717138290405, + "learning_rate": 8.984759069887535e-06, + "loss": 0.1457, + "step": 42003 + }, + { + "epoch": 0.7491884564620269, + "grad_norm": 0.2749691307544708, + "learning_rate": 8.983563910560073e-06, + "loss": 0.0858, + "step": 42004 + }, + { + "epoch": 0.7492062925837406, + "grad_norm": 0.47563955187797546, + "learning_rate": 8.98236881331784e-06, + "loss": 0.1744, + "step": 42005 + }, + { + "epoch": 0.7492241287054543, + "grad_norm": 0.2707219421863556, + "learning_rate": 8.981173778165488e-06, + "loss": 0.1143, + "step": 42006 + }, + { + "epoch": 0.749241964827168, + "grad_norm": 0.2272682785987854, + "learning_rate": 8.979978805107639e-06, + "loss": 0.1497, + "step": 42007 + }, + { + "epoch": 0.7492598009488817, + "grad_norm": 0.20125547051429749, + "learning_rate": 8.978783894148926e-06, + "loss": 0.1102, + "step": 42008 + }, + { + "epoch": 0.7492776370705954, + "grad_norm": 0.3066682517528534, + "learning_rate": 8.977589045293969e-06, + "loss": 0.117, + "step": 42009 + }, + { + "epoch": 0.749295473192309, + "grad_norm": 0.28412675857543945, + "learning_rate": 8.976394258547422e-06, + "loss": 0.1223, + "step": 42010 + }, + { + "epoch": 0.7493133093140227, + "grad_norm": 0.18476998805999756, + "learning_rate": 8.975199533913895e-06, + "loss": 0.096, + "step": 42011 + }, + { + "epoch": 0.7493311454357364, + "grad_norm": 0.3824542462825775, + "learning_rate": 8.974004871398036e-06, + "loss": 0.1145, + "step": 42012 + }, + { + "epoch": 0.7493489815574501, + "grad_norm": 0.3623155355453491, + "learning_rate": 8.972810271004463e-06, + "loss": 0.0959, + "step": 42013 + }, + { + "epoch": 0.7493668176791638, + "grad_norm": 0.2711127698421478, + "learning_rate": 8.971615732737823e-06, + "loss": 0.1612, + "step": 42014 + }, + { + "epoch": 0.7493846538008775, + "grad_norm": 0.23485594987869263, + "learning_rate": 8.970421256602735e-06, + "loss": 0.0911, + "step": 42015 + }, + { + "epoch": 0.7494024899225913, + "grad_norm": 0.24790982902050018, + "learning_rate": 8.96922684260383e-06, + "loss": 0.1246, + "step": 42016 + }, + { + "epoch": 0.749420326044305, + "grad_norm": 0.31698155403137207, + "learning_rate": 8.968032490745742e-06, + "loss": 0.1467, + "step": 42017 + }, + { + "epoch": 0.7494381621660187, + "grad_norm": 0.2183833122253418, + "learning_rate": 8.966838201033085e-06, + "loss": 0.142, + "step": 42018 + }, + { + "epoch": 0.7494559982877324, + "grad_norm": 0.2552938163280487, + "learning_rate": 8.965643973470511e-06, + "loss": 0.1235, + "step": 42019 + }, + { + "epoch": 0.749473834409446, + "grad_norm": 0.2738887667655945, + "learning_rate": 8.96444980806264e-06, + "loss": 0.1025, + "step": 42020 + }, + { + "epoch": 0.7494916705311597, + "grad_norm": 0.23545123636722565, + "learning_rate": 8.963255704814097e-06, + "loss": 0.1202, + "step": 42021 + }, + { + "epoch": 0.7495095066528734, + "grad_norm": 0.29631537199020386, + "learning_rate": 8.962061663729507e-06, + "loss": 0.1315, + "step": 42022 + }, + { + "epoch": 0.7495273427745871, + "grad_norm": 0.21105071902275085, + "learning_rate": 8.960867684813514e-06, + "loss": 0.073, + "step": 42023 + }, + { + "epoch": 0.7495451788963008, + "grad_norm": 0.2751389443874359, + "learning_rate": 8.959673768070728e-06, + "loss": 0.1646, + "step": 42024 + }, + { + "epoch": 0.7495630150180145, + "grad_norm": 0.27297112345695496, + "learning_rate": 8.958479913505796e-06, + "loss": 0.1509, + "step": 42025 + }, + { + "epoch": 0.7495808511397282, + "grad_norm": 0.2689000070095062, + "learning_rate": 8.95728612112333e-06, + "loss": 0.0719, + "step": 42026 + }, + { + "epoch": 0.7495986872614419, + "grad_norm": 0.192246213555336, + "learning_rate": 8.95609239092797e-06, + "loss": 0.1264, + "step": 42027 + }, + { + "epoch": 0.7496165233831555, + "grad_norm": 0.3367277681827545, + "learning_rate": 8.954898722924337e-06, + "loss": 0.0737, + "step": 42028 + }, + { + "epoch": 0.7496343595048692, + "grad_norm": 0.4084070026874542, + "learning_rate": 8.95370511711706e-06, + "loss": 0.1654, + "step": 42029 + }, + { + "epoch": 0.7496521956265829, + "grad_norm": 0.23909322917461395, + "learning_rate": 8.952511573510763e-06, + "loss": 0.17, + "step": 42030 + }, + { + "epoch": 0.7496700317482966, + "grad_norm": 0.26647239923477173, + "learning_rate": 8.951318092110064e-06, + "loss": 0.1123, + "step": 42031 + }, + { + "epoch": 0.7496878678700103, + "grad_norm": 0.215842142701149, + "learning_rate": 8.95012467291961e-06, + "loss": 0.0725, + "step": 42032 + }, + { + "epoch": 0.7497057039917241, + "grad_norm": 0.2804771959781647, + "learning_rate": 8.948931315944014e-06, + "loss": 0.143, + "step": 42033 + }, + { + "epoch": 0.7497235401134378, + "grad_norm": 0.2761063873767853, + "learning_rate": 8.947738021187907e-06, + "loss": 0.117, + "step": 42034 + }, + { + "epoch": 0.7497413762351515, + "grad_norm": 0.22390833497047424, + "learning_rate": 8.946544788655901e-06, + "loss": 0.1102, + "step": 42035 + }, + { + "epoch": 0.7497592123568652, + "grad_norm": 0.29880204796791077, + "learning_rate": 8.94535161835264e-06, + "loss": 0.0978, + "step": 42036 + }, + { + "epoch": 0.7497770484785788, + "grad_norm": 0.3341601490974426, + "learning_rate": 8.944158510282744e-06, + "loss": 0.1571, + "step": 42037 + }, + { + "epoch": 0.7497948846002925, + "grad_norm": 0.3323186933994293, + "learning_rate": 8.942965464450825e-06, + "loss": 0.1236, + "step": 42038 + }, + { + "epoch": 0.7498127207220062, + "grad_norm": 0.2128514051437378, + "learning_rate": 8.941772480861527e-06, + "loss": 0.1001, + "step": 42039 + }, + { + "epoch": 0.7498305568437199, + "grad_norm": 0.2780879735946655, + "learning_rate": 8.940579559519454e-06, + "loss": 0.1586, + "step": 42040 + }, + { + "epoch": 0.7498483929654336, + "grad_norm": 0.22825388610363007, + "learning_rate": 8.939386700429253e-06, + "loss": 0.0853, + "step": 42041 + }, + { + "epoch": 0.7498662290871473, + "grad_norm": 0.29734793305397034, + "learning_rate": 8.938193903595535e-06, + "loss": 0.076, + "step": 42042 + }, + { + "epoch": 0.749884065208861, + "grad_norm": 0.3439893126487732, + "learning_rate": 8.937001169022925e-06, + "loss": 0.106, + "step": 42043 + }, + { + "epoch": 0.7499019013305747, + "grad_norm": 0.22359788417816162, + "learning_rate": 8.935808496716038e-06, + "loss": 0.0617, + "step": 42044 + }, + { + "epoch": 0.7499197374522883, + "grad_norm": 0.25676581263542175, + "learning_rate": 8.934615886679515e-06, + "loss": 0.0967, + "step": 42045 + }, + { + "epoch": 0.749937573574002, + "grad_norm": 0.3949834704399109, + "learning_rate": 8.933423338917968e-06, + "loss": 0.1543, + "step": 42046 + }, + { + "epoch": 0.7499554096957157, + "grad_norm": 0.28987938165664673, + "learning_rate": 8.932230853436021e-06, + "loss": 0.1097, + "step": 42047 + }, + { + "epoch": 0.7499732458174294, + "grad_norm": 0.30169570446014404, + "learning_rate": 8.931038430238292e-06, + "loss": 0.1293, + "step": 42048 + }, + { + "epoch": 0.7499910819391431, + "grad_norm": 0.2319132685661316, + "learning_rate": 8.929846069329411e-06, + "loss": 0.1445, + "step": 42049 + }, + { + "epoch": 0.7500089180608569, + "grad_norm": 0.2882557213306427, + "learning_rate": 8.928653770714001e-06, + "loss": 0.1171, + "step": 42050 + }, + { + "epoch": 0.7500267541825706, + "grad_norm": 0.2730274796485901, + "learning_rate": 8.927461534396672e-06, + "loss": 0.1053, + "step": 42051 + }, + { + "epoch": 0.7500445903042843, + "grad_norm": 0.2675424814224243, + "learning_rate": 8.926269360382061e-06, + "loss": 0.178, + "step": 42052 + }, + { + "epoch": 0.750062426425998, + "grad_norm": 0.3200012445449829, + "learning_rate": 8.925077248674771e-06, + "loss": 0.1008, + "step": 42053 + }, + { + "epoch": 0.7500802625477117, + "grad_norm": 0.268690288066864, + "learning_rate": 8.923885199279444e-06, + "loss": 0.0805, + "step": 42054 + }, + { + "epoch": 0.7500980986694253, + "grad_norm": 0.2666533887386322, + "learning_rate": 8.922693212200692e-06, + "loss": 0.1516, + "step": 42055 + }, + { + "epoch": 0.750115934791139, + "grad_norm": 0.2992364168167114, + "learning_rate": 8.921501287443131e-06, + "loss": 0.1185, + "step": 42056 + }, + { + "epoch": 0.7501337709128527, + "grad_norm": 0.24742060899734497, + "learning_rate": 8.920309425011378e-06, + "loss": 0.0851, + "step": 42057 + }, + { + "epoch": 0.7501516070345664, + "grad_norm": 0.4028169512748718, + "learning_rate": 8.919117624910067e-06, + "loss": 0.1113, + "step": 42058 + }, + { + "epoch": 0.7501694431562801, + "grad_norm": 0.2818405330181122, + "learning_rate": 8.917925887143811e-06, + "loss": 0.1269, + "step": 42059 + }, + { + "epoch": 0.7501872792779938, + "grad_norm": 0.21878543496131897, + "learning_rate": 8.916734211717225e-06, + "loss": 0.1149, + "step": 42060 + }, + { + "epoch": 0.7502051153997075, + "grad_norm": 0.3269925117492676, + "learning_rate": 8.91554259863493e-06, + "loss": 0.1095, + "step": 42061 + }, + { + "epoch": 0.7502229515214212, + "grad_norm": 0.22564412653446198, + "learning_rate": 8.914351047901551e-06, + "loss": 0.074, + "step": 42062 + }, + { + "epoch": 0.7502407876431348, + "grad_norm": 0.25140830874443054, + "learning_rate": 8.913159559521705e-06, + "loss": 0.1103, + "step": 42063 + }, + { + "epoch": 0.7502586237648485, + "grad_norm": 0.28874045610427856, + "learning_rate": 8.91196813350001e-06, + "loss": 0.1067, + "step": 42064 + }, + { + "epoch": 0.7502764598865622, + "grad_norm": 0.3521924912929535, + "learning_rate": 8.910776769841073e-06, + "loss": 0.1517, + "step": 42065 + }, + { + "epoch": 0.7502942960082759, + "grad_norm": 0.25434020161628723, + "learning_rate": 8.90958546854953e-06, + "loss": 0.0957, + "step": 42066 + }, + { + "epoch": 0.7503121321299897, + "grad_norm": 0.33381780982017517, + "learning_rate": 8.908394229629983e-06, + "loss": 0.1397, + "step": 42067 + }, + { + "epoch": 0.7503299682517034, + "grad_norm": 0.1818884164094925, + "learning_rate": 8.907203053087068e-06, + "loss": 0.0949, + "step": 42068 + }, + { + "epoch": 0.7503478043734171, + "grad_norm": 0.25334861874580383, + "learning_rate": 8.906011938925391e-06, + "loss": 0.1604, + "step": 42069 + }, + { + "epoch": 0.7503656404951308, + "grad_norm": 0.29042163491249084, + "learning_rate": 8.904820887149562e-06, + "loss": 0.1299, + "step": 42070 + }, + { + "epoch": 0.7503834766168445, + "grad_norm": 0.25951075553894043, + "learning_rate": 8.903629897764215e-06, + "loss": 0.1354, + "step": 42071 + }, + { + "epoch": 0.7504013127385581, + "grad_norm": 0.30543404817581177, + "learning_rate": 8.902438970773958e-06, + "loss": 0.136, + "step": 42072 + }, + { + "epoch": 0.7504191488602718, + "grad_norm": 0.26674771308898926, + "learning_rate": 8.901248106183408e-06, + "loss": 0.118, + "step": 42073 + }, + { + "epoch": 0.7504369849819855, + "grad_norm": 0.2996059060096741, + "learning_rate": 8.900057303997169e-06, + "loss": 0.1178, + "step": 42074 + }, + { + "epoch": 0.7504548211036992, + "grad_norm": 0.21766085922718048, + "learning_rate": 8.898866564219882e-06, + "loss": 0.0646, + "step": 42075 + }, + { + "epoch": 0.7504726572254129, + "grad_norm": 0.2722904086112976, + "learning_rate": 8.897675886856147e-06, + "loss": 0.1126, + "step": 42076 + }, + { + "epoch": 0.7504904933471266, + "grad_norm": 0.26624128222465515, + "learning_rate": 8.896485271910582e-06, + "loss": 0.1117, + "step": 42077 + }, + { + "epoch": 0.7505083294688403, + "grad_norm": 0.3210342228412628, + "learning_rate": 8.895294719387792e-06, + "loss": 0.0903, + "step": 42078 + }, + { + "epoch": 0.750526165590554, + "grad_norm": 0.3359255790710449, + "learning_rate": 8.894104229292413e-06, + "loss": 0.1103, + "step": 42079 + }, + { + "epoch": 0.7505440017122676, + "grad_norm": 0.33223870396614075, + "learning_rate": 8.892913801629038e-06, + "loss": 0.1095, + "step": 42080 + }, + { + "epoch": 0.7505618378339813, + "grad_norm": 0.30413511395454407, + "learning_rate": 8.891723436402302e-06, + "loss": 0.0894, + "step": 42081 + }, + { + "epoch": 0.750579673955695, + "grad_norm": 0.31320711970329285, + "learning_rate": 8.89053313361681e-06, + "loss": 0.1239, + "step": 42082 + }, + { + "epoch": 0.7505975100774087, + "grad_norm": 0.252905011177063, + "learning_rate": 8.889342893277166e-06, + "loss": 0.0763, + "step": 42083 + }, + { + "epoch": 0.7506153461991225, + "grad_norm": 0.24671049416065216, + "learning_rate": 8.888152715388004e-06, + "loss": 0.1599, + "step": 42084 + }, + { + "epoch": 0.7506331823208362, + "grad_norm": 0.3065972328186035, + "learning_rate": 8.886962599953927e-06, + "loss": 0.1246, + "step": 42085 + }, + { + "epoch": 0.7506510184425499, + "grad_norm": 0.32748398184776306, + "learning_rate": 8.885772546979546e-06, + "loss": 0.1076, + "step": 42086 + }, + { + "epoch": 0.7506688545642636, + "grad_norm": 0.26008275151252747, + "learning_rate": 8.884582556469467e-06, + "loss": 0.0823, + "step": 42087 + }, + { + "epoch": 0.7506866906859773, + "grad_norm": 0.2635764479637146, + "learning_rate": 8.883392628428325e-06, + "loss": 0.129, + "step": 42088 + }, + { + "epoch": 0.750704526807691, + "grad_norm": 0.33368223905563354, + "learning_rate": 8.882202762860717e-06, + "loss": 0.1734, + "step": 42089 + }, + { + "epoch": 0.7507223629294046, + "grad_norm": 0.23162329196929932, + "learning_rate": 8.881012959771257e-06, + "loss": 0.0922, + "step": 42090 + }, + { + "epoch": 0.7507401990511183, + "grad_norm": 0.34869635105133057, + "learning_rate": 8.879823219164551e-06, + "loss": 0.1277, + "step": 42091 + }, + { + "epoch": 0.750758035172832, + "grad_norm": 0.3413008749485016, + "learning_rate": 8.878633541045226e-06, + "loss": 0.1309, + "step": 42092 + }, + { + "epoch": 0.7507758712945457, + "grad_norm": 0.20450599491596222, + "learning_rate": 8.877443925417886e-06, + "loss": 0.1234, + "step": 42093 + }, + { + "epoch": 0.7507937074162594, + "grad_norm": 0.22577449679374695, + "learning_rate": 8.876254372287132e-06, + "loss": 0.0909, + "step": 42094 + }, + { + "epoch": 0.7508115435379731, + "grad_norm": 0.33587026596069336, + "learning_rate": 8.875064881657593e-06, + "loss": 0.1207, + "step": 42095 + }, + { + "epoch": 0.7508293796596868, + "grad_norm": 0.2645242512226105, + "learning_rate": 8.873875453533868e-06, + "loss": 0.1638, + "step": 42096 + }, + { + "epoch": 0.7508472157814005, + "grad_norm": 0.21473512053489685, + "learning_rate": 8.872686087920574e-06, + "loss": 0.1181, + "step": 42097 + }, + { + "epoch": 0.7508650519031141, + "grad_norm": 0.24860872328281403, + "learning_rate": 8.871496784822323e-06, + "loss": 0.1196, + "step": 42098 + }, + { + "epoch": 0.7508828880248278, + "grad_norm": 0.29707661271095276, + "learning_rate": 8.87030754424372e-06, + "loss": 0.1088, + "step": 42099 + }, + { + "epoch": 0.7509007241465415, + "grad_norm": 0.2295612394809723, + "learning_rate": 8.869118366189365e-06, + "loss": 0.1054, + "step": 42100 + }, + { + "epoch": 0.7509185602682553, + "grad_norm": 0.23430395126342773, + "learning_rate": 8.86792925066389e-06, + "loss": 0.141, + "step": 42101 + }, + { + "epoch": 0.750936396389969, + "grad_norm": 0.22908875346183777, + "learning_rate": 8.866740197671895e-06, + "loss": 0.1395, + "step": 42102 + }, + { + "epoch": 0.7509542325116827, + "grad_norm": 0.26498180627822876, + "learning_rate": 8.86555120721798e-06, + "loss": 0.1307, + "step": 42103 + }, + { + "epoch": 0.7509720686333964, + "grad_norm": 0.2420375645160675, + "learning_rate": 8.86436227930676e-06, + "loss": 0.1279, + "step": 42104 + }, + { + "epoch": 0.7509899047551101, + "grad_norm": 0.26881036162376404, + "learning_rate": 8.863173413942851e-06, + "loss": 0.1142, + "step": 42105 + }, + { + "epoch": 0.7510077408768238, + "grad_norm": 0.26392707228660583, + "learning_rate": 8.861984611130855e-06, + "loss": 0.1785, + "step": 42106 + }, + { + "epoch": 0.7510255769985374, + "grad_norm": 0.2737892270088196, + "learning_rate": 8.86079587087537e-06, + "loss": 0.1179, + "step": 42107 + }, + { + "epoch": 0.7510434131202511, + "grad_norm": 0.30394309759140015, + "learning_rate": 8.859607193181027e-06, + "loss": 0.1558, + "step": 42108 + }, + { + "epoch": 0.7510612492419648, + "grad_norm": 0.26812347769737244, + "learning_rate": 8.858418578052411e-06, + "loss": 0.0888, + "step": 42109 + }, + { + "epoch": 0.7510790853636785, + "grad_norm": 0.2923065423965454, + "learning_rate": 8.857230025494148e-06, + "loss": 0.1145, + "step": 42110 + }, + { + "epoch": 0.7510969214853922, + "grad_norm": 0.22205254435539246, + "learning_rate": 8.856041535510836e-06, + "loss": 0.0985, + "step": 42111 + }, + { + "epoch": 0.7511147576071059, + "grad_norm": 0.27145302295684814, + "learning_rate": 8.854853108107086e-06, + "loss": 0.1168, + "step": 42112 + }, + { + "epoch": 0.7511325937288196, + "grad_norm": 0.253492146730423, + "learning_rate": 8.85366474328749e-06, + "loss": 0.0825, + "step": 42113 + }, + { + "epoch": 0.7511504298505333, + "grad_norm": 0.4730740487575531, + "learning_rate": 8.852476441056676e-06, + "loss": 0.1307, + "step": 42114 + }, + { + "epoch": 0.751168265972247, + "grad_norm": 0.2881898283958435, + "learning_rate": 8.85128820141924e-06, + "loss": 0.0702, + "step": 42115 + }, + { + "epoch": 0.7511861020939606, + "grad_norm": 0.23260222375392914, + "learning_rate": 8.85010002437979e-06, + "loss": 0.1007, + "step": 42116 + }, + { + "epoch": 0.7512039382156744, + "grad_norm": 0.270072340965271, + "learning_rate": 8.84891190994292e-06, + "loss": 0.1116, + "step": 42117 + }, + { + "epoch": 0.7512217743373881, + "grad_norm": 0.1961875855922699, + "learning_rate": 8.847723858113254e-06, + "loss": 0.0898, + "step": 42118 + }, + { + "epoch": 0.7512396104591018, + "grad_norm": 0.16710162162780762, + "learning_rate": 8.846535868895393e-06, + "loss": 0.0852, + "step": 42119 + }, + { + "epoch": 0.7512574465808155, + "grad_norm": 0.2354332059621811, + "learning_rate": 8.845347942293933e-06, + "loss": 0.1355, + "step": 42120 + }, + { + "epoch": 0.7512752827025292, + "grad_norm": 0.4211379885673523, + "learning_rate": 8.84416007831348e-06, + "loss": 0.1945, + "step": 42121 + }, + { + "epoch": 0.7512931188242429, + "grad_norm": 0.32310420274734497, + "learning_rate": 8.842972276958639e-06, + "loss": 0.1287, + "step": 42122 + }, + { + "epoch": 0.7513109549459566, + "grad_norm": 0.19457566738128662, + "learning_rate": 8.841784538234027e-06, + "loss": 0.0848, + "step": 42123 + }, + { + "epoch": 0.7513287910676703, + "grad_norm": 0.2943336069583893, + "learning_rate": 8.84059686214424e-06, + "loss": 0.1724, + "step": 42124 + }, + { + "epoch": 0.7513466271893839, + "grad_norm": 0.33193403482437134, + "learning_rate": 8.839409248693881e-06, + "loss": 0.1676, + "step": 42125 + }, + { + "epoch": 0.7513644633110976, + "grad_norm": 0.2646985352039337, + "learning_rate": 8.838221697887544e-06, + "loss": 0.1879, + "step": 42126 + }, + { + "epoch": 0.7513822994328113, + "grad_norm": 0.26498857140541077, + "learning_rate": 8.837034209729852e-06, + "loss": 0.1052, + "step": 42127 + }, + { + "epoch": 0.751400135554525, + "grad_norm": 0.2948433756828308, + "learning_rate": 8.835846784225398e-06, + "loss": 0.1756, + "step": 42128 + }, + { + "epoch": 0.7514179716762387, + "grad_norm": 0.23665866255760193, + "learning_rate": 8.834659421378783e-06, + "loss": 0.0923, + "step": 42129 + }, + { + "epoch": 0.7514358077979524, + "grad_norm": 0.24343526363372803, + "learning_rate": 8.833472121194602e-06, + "loss": 0.0862, + "step": 42130 + }, + { + "epoch": 0.7514536439196661, + "grad_norm": 0.23014704883098602, + "learning_rate": 8.832284883677478e-06, + "loss": 0.0784, + "step": 42131 + }, + { + "epoch": 0.7514714800413798, + "grad_norm": 0.27050164341926575, + "learning_rate": 8.831097708832001e-06, + "loss": 0.1361, + "step": 42132 + }, + { + "epoch": 0.7514893161630934, + "grad_norm": 0.25127875804901123, + "learning_rate": 8.829910596662773e-06, + "loss": 0.1104, + "step": 42133 + }, + { + "epoch": 0.7515071522848072, + "grad_norm": 0.3064620792865753, + "learning_rate": 8.828723547174389e-06, + "loss": 0.1409, + "step": 42134 + }, + { + "epoch": 0.7515249884065209, + "grad_norm": 0.26769953966140747, + "learning_rate": 8.827536560371467e-06, + "loss": 0.0876, + "step": 42135 + }, + { + "epoch": 0.7515428245282346, + "grad_norm": 0.20214343070983887, + "learning_rate": 8.826349636258591e-06, + "loss": 0.1015, + "step": 42136 + }, + { + "epoch": 0.7515606606499483, + "grad_norm": 0.2769550681114197, + "learning_rate": 8.825162774840376e-06, + "loss": 0.119, + "step": 42137 + }, + { + "epoch": 0.751578496771662, + "grad_norm": 0.3347737193107605, + "learning_rate": 8.823975976121418e-06, + "loss": 0.143, + "step": 42138 + }, + { + "epoch": 0.7515963328933757, + "grad_norm": 0.2858056426048279, + "learning_rate": 8.822789240106308e-06, + "loss": 0.163, + "step": 42139 + }, + { + "epoch": 0.7516141690150894, + "grad_norm": 0.2721172273159027, + "learning_rate": 8.821602566799662e-06, + "loss": 0.1657, + "step": 42140 + }, + { + "epoch": 0.7516320051368031, + "grad_norm": 0.20643888413906097, + "learning_rate": 8.82041595620607e-06, + "loss": 0.1161, + "step": 42141 + }, + { + "epoch": 0.7516498412585167, + "grad_norm": 0.282868355512619, + "learning_rate": 8.819229408330138e-06, + "loss": 0.131, + "step": 42142 + }, + { + "epoch": 0.7516676773802304, + "grad_norm": 0.6819544434547424, + "learning_rate": 8.818042923176453e-06, + "loss": 0.1338, + "step": 42143 + }, + { + "epoch": 0.7516855135019441, + "grad_norm": 0.25793537497520447, + "learning_rate": 8.816856500749629e-06, + "loss": 0.1096, + "step": 42144 + }, + { + "epoch": 0.7517033496236578, + "grad_norm": 0.3843628466129303, + "learning_rate": 8.815670141054261e-06, + "loss": 0.1021, + "step": 42145 + }, + { + "epoch": 0.7517211857453715, + "grad_norm": 0.3574337661266327, + "learning_rate": 8.814483844094943e-06, + "loss": 0.1316, + "step": 42146 + }, + { + "epoch": 0.7517390218670852, + "grad_norm": 0.22077126801013947, + "learning_rate": 8.813297609876268e-06, + "loss": 0.1019, + "step": 42147 + }, + { + "epoch": 0.7517568579887989, + "grad_norm": 0.2697865664958954, + "learning_rate": 8.81211143840285e-06, + "loss": 0.097, + "step": 42148 + }, + { + "epoch": 0.7517746941105126, + "grad_norm": 0.2687152922153473, + "learning_rate": 8.810925329679273e-06, + "loss": 0.1311, + "step": 42149 + }, + { + "epoch": 0.7517925302322263, + "grad_norm": 0.36930057406425476, + "learning_rate": 8.809739283710146e-06, + "loss": 0.1158, + "step": 42150 + }, + { + "epoch": 0.75181036635394, + "grad_norm": 0.32879677414894104, + "learning_rate": 8.808553300500066e-06, + "loss": 0.1233, + "step": 42151 + }, + { + "epoch": 0.7518282024756537, + "grad_norm": 0.29432424902915955, + "learning_rate": 8.807367380053613e-06, + "loss": 0.1199, + "step": 42152 + }, + { + "epoch": 0.7518460385973674, + "grad_norm": 0.43963679671287537, + "learning_rate": 8.806181522375409e-06, + "loss": 0.1315, + "step": 42153 + }, + { + "epoch": 0.7518638747190811, + "grad_norm": 0.25721198320388794, + "learning_rate": 8.804995727470036e-06, + "loss": 0.0829, + "step": 42154 + }, + { + "epoch": 0.7518817108407948, + "grad_norm": 0.24039192497730255, + "learning_rate": 8.803809995342094e-06, + "loss": 0.1069, + "step": 42155 + }, + { + "epoch": 0.7518995469625085, + "grad_norm": 0.24511951208114624, + "learning_rate": 8.80262432599617e-06, + "loss": 0.1143, + "step": 42156 + }, + { + "epoch": 0.7519173830842222, + "grad_norm": 0.2990073561668396, + "learning_rate": 8.801438719436877e-06, + "loss": 0.1181, + "step": 42157 + }, + { + "epoch": 0.7519352192059359, + "grad_norm": 0.26523932814598083, + "learning_rate": 8.800253175668801e-06, + "loss": 0.1305, + "step": 42158 + }, + { + "epoch": 0.7519530553276496, + "grad_norm": 0.25667643547058105, + "learning_rate": 8.799067694696542e-06, + "loss": 0.1472, + "step": 42159 + }, + { + "epoch": 0.7519708914493632, + "grad_norm": 0.2096024602651596, + "learning_rate": 8.79788227652468e-06, + "loss": 0.1276, + "step": 42160 + }, + { + "epoch": 0.7519887275710769, + "grad_norm": 0.30646803975105286, + "learning_rate": 8.796696921157833e-06, + "loss": 0.1345, + "step": 42161 + }, + { + "epoch": 0.7520065636927906, + "grad_norm": 0.2282247394323349, + "learning_rate": 8.795511628600583e-06, + "loss": 0.0936, + "step": 42162 + }, + { + "epoch": 0.7520243998145043, + "grad_norm": 0.344303160905838, + "learning_rate": 8.794326398857523e-06, + "loss": 0.1441, + "step": 42163 + }, + { + "epoch": 0.752042235936218, + "grad_norm": 0.19948624074459076, + "learning_rate": 8.793141231933255e-06, + "loss": 0.076, + "step": 42164 + }, + { + "epoch": 0.7520600720579317, + "grad_norm": 0.2920467257499695, + "learning_rate": 8.791956127832362e-06, + "loss": 0.1394, + "step": 42165 + }, + { + "epoch": 0.7520779081796454, + "grad_norm": 0.339828759431839, + "learning_rate": 8.790771086559455e-06, + "loss": 0.1638, + "step": 42166 + }, + { + "epoch": 0.7520957443013591, + "grad_norm": 0.22915126383304596, + "learning_rate": 8.789586108119115e-06, + "loss": 0.0778, + "step": 42167 + }, + { + "epoch": 0.7521135804230729, + "grad_norm": 0.2713801860809326, + "learning_rate": 8.78840119251594e-06, + "loss": 0.1187, + "step": 42168 + }, + { + "epoch": 0.7521314165447865, + "grad_norm": 0.2872539460659027, + "learning_rate": 8.787216339754514e-06, + "loss": 0.1087, + "step": 42169 + }, + { + "epoch": 0.7521492526665002, + "grad_norm": 0.2936089038848877, + "learning_rate": 8.786031549839447e-06, + "loss": 0.155, + "step": 42170 + }, + { + "epoch": 0.7521670887882139, + "grad_norm": 0.33259886503219604, + "learning_rate": 8.784846822775316e-06, + "loss": 0.1183, + "step": 42171 + }, + { + "epoch": 0.7521849249099276, + "grad_norm": 0.23290354013442993, + "learning_rate": 8.783662158566724e-06, + "loss": 0.1004, + "step": 42172 + }, + { + "epoch": 0.7522027610316413, + "grad_norm": 0.20739133656024933, + "learning_rate": 8.782477557218249e-06, + "loss": 0.0837, + "step": 42173 + }, + { + "epoch": 0.752220597153355, + "grad_norm": 0.3643436133861542, + "learning_rate": 8.7812930187345e-06, + "loss": 0.1055, + "step": 42174 + }, + { + "epoch": 0.7522384332750687, + "grad_norm": 0.27334144711494446, + "learning_rate": 8.780108543120061e-06, + "loss": 0.0968, + "step": 42175 + }, + { + "epoch": 0.7522562693967824, + "grad_norm": 0.24182043969631195, + "learning_rate": 8.778924130379523e-06, + "loss": 0.0909, + "step": 42176 + }, + { + "epoch": 0.752274105518496, + "grad_norm": 0.29700538516044617, + "learning_rate": 8.777739780517472e-06, + "loss": 0.1117, + "step": 42177 + }, + { + "epoch": 0.7522919416402097, + "grad_norm": 0.26775923371315, + "learning_rate": 8.776555493538502e-06, + "loss": 0.1028, + "step": 42178 + }, + { + "epoch": 0.7523097777619234, + "grad_norm": 0.2090633064508438, + "learning_rate": 8.775371269447213e-06, + "loss": 0.1015, + "step": 42179 + }, + { + "epoch": 0.7523276138836371, + "grad_norm": 0.2052198201417923, + "learning_rate": 8.77418710824819e-06, + "loss": 0.1269, + "step": 42180 + }, + { + "epoch": 0.7523454500053508, + "grad_norm": 0.2673207223415375, + "learning_rate": 8.773003009946026e-06, + "loss": 0.1354, + "step": 42181 + }, + { + "epoch": 0.7523632861270645, + "grad_norm": 0.25561168789863586, + "learning_rate": 8.771818974545296e-06, + "loss": 0.1073, + "step": 42182 + }, + { + "epoch": 0.7523811222487782, + "grad_norm": 0.3096112310886383, + "learning_rate": 8.770635002050609e-06, + "loss": 0.1189, + "step": 42183 + }, + { + "epoch": 0.7523989583704919, + "grad_norm": 0.27430427074432373, + "learning_rate": 8.769451092466544e-06, + "loss": 0.1025, + "step": 42184 + }, + { + "epoch": 0.7524167944922057, + "grad_norm": 0.3142137825489044, + "learning_rate": 8.768267245797696e-06, + "loss": 0.1188, + "step": 42185 + }, + { + "epoch": 0.7524346306139194, + "grad_norm": 0.28176233172416687, + "learning_rate": 8.767083462048639e-06, + "loss": 0.1374, + "step": 42186 + }, + { + "epoch": 0.752452466735633, + "grad_norm": 0.39823800325393677, + "learning_rate": 8.765899741223983e-06, + "loss": 0.1141, + "step": 42187 + }, + { + "epoch": 0.7524703028573467, + "grad_norm": 0.2888416647911072, + "learning_rate": 8.764716083328306e-06, + "loss": 0.1112, + "step": 42188 + }, + { + "epoch": 0.7524881389790604, + "grad_norm": 0.25727975368499756, + "learning_rate": 8.763532488366196e-06, + "loss": 0.1166, + "step": 42189 + }, + { + "epoch": 0.7525059751007741, + "grad_norm": 0.2711246609687805, + "learning_rate": 8.762348956342236e-06, + "loss": 0.1561, + "step": 42190 + }, + { + "epoch": 0.7525238112224878, + "grad_norm": 0.2138831615447998, + "learning_rate": 8.761165487261028e-06, + "loss": 0.0666, + "step": 42191 + }, + { + "epoch": 0.7525416473442015, + "grad_norm": 0.2878745496273041, + "learning_rate": 8.759982081127142e-06, + "loss": 0.1554, + "step": 42192 + }, + { + "epoch": 0.7525594834659152, + "grad_norm": 0.30273404717445374, + "learning_rate": 8.758798737945184e-06, + "loss": 0.102, + "step": 42193 + }, + { + "epoch": 0.7525773195876289, + "grad_norm": 0.4103369116783142, + "learning_rate": 8.757615457719732e-06, + "loss": 0.1088, + "step": 42194 + }, + { + "epoch": 0.7525951557093425, + "grad_norm": 0.2743113040924072, + "learning_rate": 8.756432240455361e-06, + "loss": 0.0959, + "step": 42195 + }, + { + "epoch": 0.7526129918310562, + "grad_norm": 0.2367207407951355, + "learning_rate": 8.755249086156677e-06, + "loss": 0.0994, + "step": 42196 + }, + { + "epoch": 0.7526308279527699, + "grad_norm": 0.23079419136047363, + "learning_rate": 8.754065994828261e-06, + "loss": 0.1263, + "step": 42197 + }, + { + "epoch": 0.7526486640744836, + "grad_norm": 0.30911049246788025, + "learning_rate": 8.752882966474696e-06, + "loss": 0.1336, + "step": 42198 + }, + { + "epoch": 0.7526665001961973, + "grad_norm": 0.22299621999263763, + "learning_rate": 8.75170000110056e-06, + "loss": 0.1152, + "step": 42199 + }, + { + "epoch": 0.752684336317911, + "grad_norm": 0.29472866654396057, + "learning_rate": 8.750517098710457e-06, + "loss": 0.1062, + "step": 42200 + }, + { + "epoch": 0.7527021724396247, + "grad_norm": 0.3956160843372345, + "learning_rate": 8.749334259308958e-06, + "loss": 0.1543, + "step": 42201 + }, + { + "epoch": 0.7527200085613385, + "grad_norm": 0.2704830467700958, + "learning_rate": 8.748151482900654e-06, + "loss": 0.135, + "step": 42202 + }, + { + "epoch": 0.7527378446830522, + "grad_norm": 0.4271145761013031, + "learning_rate": 8.746968769490127e-06, + "loss": 0.0818, + "step": 42203 + }, + { + "epoch": 0.7527556808047658, + "grad_norm": 0.2855283319950104, + "learning_rate": 8.745786119081955e-06, + "loss": 0.2005, + "step": 42204 + }, + { + "epoch": 0.7527735169264795, + "grad_norm": 0.26800844073295593, + "learning_rate": 8.744603531680732e-06, + "loss": 0.1568, + "step": 42205 + }, + { + "epoch": 0.7527913530481932, + "grad_norm": 0.28565552830696106, + "learning_rate": 8.743421007291047e-06, + "loss": 0.1197, + "step": 42206 + }, + { + "epoch": 0.7528091891699069, + "grad_norm": 0.2677529454231262, + "learning_rate": 8.742238545917478e-06, + "loss": 0.177, + "step": 42207 + }, + { + "epoch": 0.7528270252916206, + "grad_norm": 0.28748244047164917, + "learning_rate": 8.741056147564596e-06, + "loss": 0.1066, + "step": 42208 + }, + { + "epoch": 0.7528448614133343, + "grad_norm": 0.2905385494232178, + "learning_rate": 8.739873812237007e-06, + "loss": 0.106, + "step": 42209 + }, + { + "epoch": 0.752862697535048, + "grad_norm": 0.2593698799610138, + "learning_rate": 8.738691539939284e-06, + "loss": 0.1219, + "step": 42210 + }, + { + "epoch": 0.7528805336567617, + "grad_norm": 0.26137372851371765, + "learning_rate": 8.737509330676008e-06, + "loss": 0.0955, + "step": 42211 + }, + { + "epoch": 0.7528983697784754, + "grad_norm": 0.26841309666633606, + "learning_rate": 8.736327184451753e-06, + "loss": 0.0838, + "step": 42212 + }, + { + "epoch": 0.752916205900189, + "grad_norm": 0.23604892194271088, + "learning_rate": 8.735145101271122e-06, + "loss": 0.0847, + "step": 42213 + }, + { + "epoch": 0.7529340420219027, + "grad_norm": 0.2799234092235565, + "learning_rate": 8.733963081138686e-06, + "loss": 0.1298, + "step": 42214 + }, + { + "epoch": 0.7529518781436164, + "grad_norm": 0.18916910886764526, + "learning_rate": 8.732781124059026e-06, + "loss": 0.0911, + "step": 42215 + }, + { + "epoch": 0.7529697142653301, + "grad_norm": 0.29734471440315247, + "learning_rate": 8.731599230036725e-06, + "loss": 0.1386, + "step": 42216 + }, + { + "epoch": 0.7529875503870438, + "grad_norm": 0.4562632739543915, + "learning_rate": 8.730417399076355e-06, + "loss": 0.0927, + "step": 42217 + }, + { + "epoch": 0.7530053865087576, + "grad_norm": 0.221034973859787, + "learning_rate": 8.729235631182517e-06, + "loss": 0.1019, + "step": 42218 + }, + { + "epoch": 0.7530232226304713, + "grad_norm": 0.4050613343715668, + "learning_rate": 8.72805392635977e-06, + "loss": 0.1114, + "step": 42219 + }, + { + "epoch": 0.753041058752185, + "grad_norm": 0.25245755910873413, + "learning_rate": 8.726872284612716e-06, + "loss": 0.1214, + "step": 42220 + }, + { + "epoch": 0.7530588948738987, + "grad_norm": 0.28109344840049744, + "learning_rate": 8.725690705945918e-06, + "loss": 0.1019, + "step": 42221 + }, + { + "epoch": 0.7530767309956123, + "grad_norm": 0.2757278084754944, + "learning_rate": 8.724509190363972e-06, + "loss": 0.1264, + "step": 42222 + }, + { + "epoch": 0.753094567117326, + "grad_norm": 0.30457839369773865, + "learning_rate": 8.723327737871451e-06, + "loss": 0.1388, + "step": 42223 + }, + { + "epoch": 0.7531124032390397, + "grad_norm": 0.33599331974983215, + "learning_rate": 8.722146348472932e-06, + "loss": 0.1299, + "step": 42224 + }, + { + "epoch": 0.7531302393607534, + "grad_norm": 0.4407554268836975, + "learning_rate": 8.720965022172986e-06, + "loss": 0.1173, + "step": 42225 + }, + { + "epoch": 0.7531480754824671, + "grad_norm": 0.23061370849609375, + "learning_rate": 8.719783758976213e-06, + "loss": 0.0757, + "step": 42226 + }, + { + "epoch": 0.7531659116041808, + "grad_norm": 0.2715109586715698, + "learning_rate": 8.718602558887182e-06, + "loss": 0.1052, + "step": 42227 + }, + { + "epoch": 0.7531837477258945, + "grad_norm": 0.22626429796218872, + "learning_rate": 8.717421421910468e-06, + "loss": 0.1304, + "step": 42228 + }, + { + "epoch": 0.7532015838476082, + "grad_norm": 0.2100069224834442, + "learning_rate": 8.716240348050653e-06, + "loss": 0.1041, + "step": 42229 + }, + { + "epoch": 0.7532194199693218, + "grad_norm": 0.3300541341304779, + "learning_rate": 8.715059337312306e-06, + "loss": 0.1228, + "step": 42230 + }, + { + "epoch": 0.7532372560910355, + "grad_norm": 0.2597181797027588, + "learning_rate": 8.713878389700026e-06, + "loss": 0.1548, + "step": 42231 + }, + { + "epoch": 0.7532550922127492, + "grad_norm": 0.24442999064922333, + "learning_rate": 8.712697505218365e-06, + "loss": 0.0681, + "step": 42232 + }, + { + "epoch": 0.7532729283344629, + "grad_norm": 0.25236642360687256, + "learning_rate": 8.711516683871923e-06, + "loss": 0.1278, + "step": 42233 + }, + { + "epoch": 0.7532907644561766, + "grad_norm": 0.25417017936706543, + "learning_rate": 8.710335925665261e-06, + "loss": 0.1076, + "step": 42234 + }, + { + "epoch": 0.7533086005778904, + "grad_norm": 0.22272507846355438, + "learning_rate": 8.709155230602971e-06, + "loss": 0.1212, + "step": 42235 + }, + { + "epoch": 0.7533264366996041, + "grad_norm": 0.3872177004814148, + "learning_rate": 8.707974598689625e-06, + "loss": 0.1627, + "step": 42236 + }, + { + "epoch": 0.7533442728213178, + "grad_norm": 0.23775237798690796, + "learning_rate": 8.706794029929794e-06, + "loss": 0.1405, + "step": 42237 + }, + { + "epoch": 0.7533621089430315, + "grad_norm": 0.28898173570632935, + "learning_rate": 8.705613524328049e-06, + "loss": 0.1716, + "step": 42238 + }, + { + "epoch": 0.7533799450647451, + "grad_norm": 0.35293468832969666, + "learning_rate": 8.704433081888983e-06, + "loss": 0.101, + "step": 42239 + }, + { + "epoch": 0.7533977811864588, + "grad_norm": 0.2647443413734436, + "learning_rate": 8.703252702617159e-06, + "loss": 0.13, + "step": 42240 + }, + { + "epoch": 0.7534156173081725, + "grad_norm": 0.24611803889274597, + "learning_rate": 8.70207238651716e-06, + "loss": 0.0993, + "step": 42241 + }, + { + "epoch": 0.7534334534298862, + "grad_norm": 0.35614416003227234, + "learning_rate": 8.700892133593555e-06, + "loss": 0.0823, + "step": 42242 + }, + { + "epoch": 0.7534512895515999, + "grad_norm": 0.23924636840820312, + "learning_rate": 8.699711943850916e-06, + "loss": 0.1449, + "step": 42243 + }, + { + "epoch": 0.7534691256733136, + "grad_norm": 0.32560649514198303, + "learning_rate": 8.698531817293832e-06, + "loss": 0.1317, + "step": 42244 + }, + { + "epoch": 0.7534869617950273, + "grad_norm": 0.263988733291626, + "learning_rate": 8.697351753926866e-06, + "loss": 0.1172, + "step": 42245 + }, + { + "epoch": 0.753504797916741, + "grad_norm": 0.21988803148269653, + "learning_rate": 8.696171753754586e-06, + "loss": 0.0874, + "step": 42246 + }, + { + "epoch": 0.7535226340384547, + "grad_norm": 0.3509487807750702, + "learning_rate": 8.694991816781588e-06, + "loss": 0.148, + "step": 42247 + }, + { + "epoch": 0.7535404701601683, + "grad_norm": 0.33444318175315857, + "learning_rate": 8.693811943012422e-06, + "loss": 0.1505, + "step": 42248 + }, + { + "epoch": 0.753558306281882, + "grad_norm": 0.3227597773075104, + "learning_rate": 8.692632132451683e-06, + "loss": 0.1091, + "step": 42249 + }, + { + "epoch": 0.7535761424035957, + "grad_norm": 0.28614404797554016, + "learning_rate": 8.691452385103934e-06, + "loss": 0.1143, + "step": 42250 + }, + { + "epoch": 0.7535939785253094, + "grad_norm": 0.23036304116249084, + "learning_rate": 8.690272700973739e-06, + "loss": 0.0809, + "step": 42251 + }, + { + "epoch": 0.7536118146470232, + "grad_norm": 0.31473544239997864, + "learning_rate": 8.689093080065691e-06, + "loss": 0.1025, + "step": 42252 + }, + { + "epoch": 0.7536296507687369, + "grad_norm": 0.24815233051776886, + "learning_rate": 8.68791352238435e-06, + "loss": 0.1349, + "step": 42253 + }, + { + "epoch": 0.7536474868904506, + "grad_norm": 0.30203378200531006, + "learning_rate": 8.68673402793429e-06, + "loss": 0.1098, + "step": 42254 + }, + { + "epoch": 0.7536653230121643, + "grad_norm": 0.1972241997718811, + "learning_rate": 8.685554596720083e-06, + "loss": 0.1134, + "step": 42255 + }, + { + "epoch": 0.753683159133878, + "grad_norm": 0.3330599367618561, + "learning_rate": 8.684375228746295e-06, + "loss": 0.0788, + "step": 42256 + }, + { + "epoch": 0.7537009952555916, + "grad_norm": 0.3485974073410034, + "learning_rate": 8.68319592401751e-06, + "loss": 0.1232, + "step": 42257 + }, + { + "epoch": 0.7537188313773053, + "grad_norm": 0.33124592900276184, + "learning_rate": 8.682016682538293e-06, + "loss": 0.1677, + "step": 42258 + }, + { + "epoch": 0.753736667499019, + "grad_norm": 0.2172287553548813, + "learning_rate": 8.680837504313208e-06, + "loss": 0.0707, + "step": 42259 + }, + { + "epoch": 0.7537545036207327, + "grad_norm": 0.32111528515815735, + "learning_rate": 8.679658389346842e-06, + "loss": 0.1809, + "step": 42260 + }, + { + "epoch": 0.7537723397424464, + "grad_norm": 0.2719787657260895, + "learning_rate": 8.678479337643747e-06, + "loss": 0.0882, + "step": 42261 + }, + { + "epoch": 0.7537901758641601, + "grad_norm": 0.2695040702819824, + "learning_rate": 8.677300349208513e-06, + "loss": 0.1182, + "step": 42262 + }, + { + "epoch": 0.7538080119858738, + "grad_norm": 0.33145859837532043, + "learning_rate": 8.676121424045702e-06, + "loss": 0.131, + "step": 42263 + }, + { + "epoch": 0.7538258481075875, + "grad_norm": 0.29759299755096436, + "learning_rate": 8.67494256215987e-06, + "loss": 0.0912, + "step": 42264 + }, + { + "epoch": 0.7538436842293011, + "grad_norm": 0.297317236661911, + "learning_rate": 8.673763763555611e-06, + "loss": 0.1264, + "step": 42265 + }, + { + "epoch": 0.7538615203510148, + "grad_norm": 0.30711886286735535, + "learning_rate": 8.672585028237481e-06, + "loss": 0.1359, + "step": 42266 + }, + { + "epoch": 0.7538793564727285, + "grad_norm": 0.28112030029296875, + "learning_rate": 8.671406356210051e-06, + "loss": 0.1334, + "step": 42267 + }, + { + "epoch": 0.7538971925944422, + "grad_norm": 0.2673642933368683, + "learning_rate": 8.670227747477891e-06, + "loss": 0.1057, + "step": 42268 + }, + { + "epoch": 0.753915028716156, + "grad_norm": 0.27309855818748474, + "learning_rate": 8.66904920204556e-06, + "loss": 0.12, + "step": 42269 + }, + { + "epoch": 0.7539328648378697, + "grad_norm": 0.17130039632320404, + "learning_rate": 8.667870719917642e-06, + "loss": 0.0615, + "step": 42270 + }, + { + "epoch": 0.7539507009595834, + "grad_norm": 0.3982267677783966, + "learning_rate": 8.666692301098697e-06, + "loss": 0.1334, + "step": 42271 + }, + { + "epoch": 0.7539685370812971, + "grad_norm": 0.27218371629714966, + "learning_rate": 8.665513945593295e-06, + "loss": 0.1065, + "step": 42272 + }, + { + "epoch": 0.7539863732030108, + "grad_norm": 0.27529701590538025, + "learning_rate": 8.664335653405995e-06, + "loss": 0.1415, + "step": 42273 + }, + { + "epoch": 0.7540042093247245, + "grad_norm": 0.21380817890167236, + "learning_rate": 8.66315742454138e-06, + "loss": 0.0602, + "step": 42274 + }, + { + "epoch": 0.7540220454464381, + "grad_norm": 0.2720341086387634, + "learning_rate": 8.661979259004002e-06, + "loss": 0.1049, + "step": 42275 + }, + { + "epoch": 0.7540398815681518, + "grad_norm": 0.34371060132980347, + "learning_rate": 8.660801156798443e-06, + "loss": 0.1264, + "step": 42276 + }, + { + "epoch": 0.7540577176898655, + "grad_norm": 0.3421943783760071, + "learning_rate": 8.659623117929252e-06, + "loss": 0.0994, + "step": 42277 + }, + { + "epoch": 0.7540755538115792, + "grad_norm": 0.2658706605434418, + "learning_rate": 8.658445142401017e-06, + "loss": 0.1535, + "step": 42278 + }, + { + "epoch": 0.7540933899332929, + "grad_norm": 0.32612144947052, + "learning_rate": 8.65726723021829e-06, + "loss": 0.1846, + "step": 42279 + }, + { + "epoch": 0.7541112260550066, + "grad_norm": 0.2186860740184784, + "learning_rate": 8.656089381385641e-06, + "loss": 0.0946, + "step": 42280 + }, + { + "epoch": 0.7541290621767203, + "grad_norm": 0.2428620457649231, + "learning_rate": 8.654911595907635e-06, + "loss": 0.1171, + "step": 42281 + }, + { + "epoch": 0.754146898298434, + "grad_norm": 0.2928742468357086, + "learning_rate": 8.653733873788828e-06, + "loss": 0.1583, + "step": 42282 + }, + { + "epoch": 0.7541647344201476, + "grad_norm": 0.27857810258865356, + "learning_rate": 8.652556215033802e-06, + "loss": 0.1285, + "step": 42283 + }, + { + "epoch": 0.7541825705418613, + "grad_norm": 0.35628536343574524, + "learning_rate": 8.651378619647117e-06, + "loss": 0.1517, + "step": 42284 + }, + { + "epoch": 0.754200406663575, + "grad_norm": 0.2081788182258606, + "learning_rate": 8.650201087633334e-06, + "loss": 0.0751, + "step": 42285 + }, + { + "epoch": 0.7542182427852888, + "grad_norm": 0.21348069608211517, + "learning_rate": 8.64902361899701e-06, + "loss": 0.091, + "step": 42286 + }, + { + "epoch": 0.7542360789070025, + "grad_norm": 0.2190733253955841, + "learning_rate": 8.647846213742724e-06, + "loss": 0.1003, + "step": 42287 + }, + { + "epoch": 0.7542539150287162, + "grad_norm": 0.20201753079891205, + "learning_rate": 8.646668871875027e-06, + "loss": 0.1445, + "step": 42288 + }, + { + "epoch": 0.7542717511504299, + "grad_norm": 0.4274791181087494, + "learning_rate": 8.6454915933985e-06, + "loss": 0.1549, + "step": 42289 + }, + { + "epoch": 0.7542895872721436, + "grad_norm": 0.2638417184352875, + "learning_rate": 8.644314378317685e-06, + "loss": 0.0886, + "step": 42290 + }, + { + "epoch": 0.7543074233938573, + "grad_norm": 0.28225457668304443, + "learning_rate": 8.643137226637168e-06, + "loss": 0.0802, + "step": 42291 + }, + { + "epoch": 0.754325259515571, + "grad_norm": 0.33142709732055664, + "learning_rate": 8.641960138361499e-06, + "loss": 0.1372, + "step": 42292 + }, + { + "epoch": 0.7543430956372846, + "grad_norm": 0.22591717541217804, + "learning_rate": 8.640783113495243e-06, + "loss": 0.0581, + "step": 42293 + }, + { + "epoch": 0.7543609317589983, + "grad_norm": 0.23914481699466705, + "learning_rate": 8.639606152042962e-06, + "loss": 0.0822, + "step": 42294 + }, + { + "epoch": 0.754378767880712, + "grad_norm": 0.21988533437252045, + "learning_rate": 8.638429254009209e-06, + "loss": 0.1294, + "step": 42295 + }, + { + "epoch": 0.7543966040024257, + "grad_norm": 0.2980956733226776, + "learning_rate": 8.637252419398562e-06, + "loss": 0.0892, + "step": 42296 + }, + { + "epoch": 0.7544144401241394, + "grad_norm": 0.3343035876750946, + "learning_rate": 8.636075648215577e-06, + "loss": 0.1404, + "step": 42297 + }, + { + "epoch": 0.7544322762458531, + "grad_norm": 0.300118625164032, + "learning_rate": 8.634898940464817e-06, + "loss": 0.1312, + "step": 42298 + }, + { + "epoch": 0.7544501123675668, + "grad_norm": 0.29913660883903503, + "learning_rate": 8.633722296150832e-06, + "loss": 0.1603, + "step": 42299 + }, + { + "epoch": 0.7544679484892804, + "grad_norm": 0.31691840291023254, + "learning_rate": 8.632545715278201e-06, + "loss": 0.0997, + "step": 42300 + }, + { + "epoch": 0.7544857846109941, + "grad_norm": 0.33182668685913086, + "learning_rate": 8.631369197851474e-06, + "loss": 0.1535, + "step": 42301 + }, + { + "epoch": 0.7545036207327078, + "grad_norm": 0.23822741210460663, + "learning_rate": 8.630192743875207e-06, + "loss": 0.1517, + "step": 42302 + }, + { + "epoch": 0.7545214568544216, + "grad_norm": 0.23876644670963287, + "learning_rate": 8.629016353353963e-06, + "loss": 0.1021, + "step": 42303 + }, + { + "epoch": 0.7545392929761353, + "grad_norm": 0.38707587122917175, + "learning_rate": 8.62784002629232e-06, + "loss": 0.1986, + "step": 42304 + }, + { + "epoch": 0.754557129097849, + "grad_norm": 0.2862582802772522, + "learning_rate": 8.62666376269482e-06, + "loss": 0.1255, + "step": 42305 + }, + { + "epoch": 0.7545749652195627, + "grad_norm": 0.30288851261138916, + "learning_rate": 8.625487562566026e-06, + "loss": 0.155, + "step": 42306 + }, + { + "epoch": 0.7545928013412764, + "grad_norm": 0.2549150288105011, + "learning_rate": 8.6243114259105e-06, + "loss": 0.0928, + "step": 42307 + }, + { + "epoch": 0.7546106374629901, + "grad_norm": 0.23368658125400543, + "learning_rate": 8.62313535273279e-06, + "loss": 0.1274, + "step": 42308 + }, + { + "epoch": 0.7546284735847038, + "grad_norm": 0.23577933013439178, + "learning_rate": 8.621959343037472e-06, + "loss": 0.1007, + "step": 42309 + }, + { + "epoch": 0.7546463097064174, + "grad_norm": 0.23457945883274078, + "learning_rate": 8.620783396829097e-06, + "loss": 0.0829, + "step": 42310 + }, + { + "epoch": 0.7546641458281311, + "grad_norm": 0.24033674597740173, + "learning_rate": 8.619607514112221e-06, + "loss": 0.0851, + "step": 42311 + }, + { + "epoch": 0.7546819819498448, + "grad_norm": 0.2517170011997223, + "learning_rate": 8.618431694891397e-06, + "loss": 0.1629, + "step": 42312 + }, + { + "epoch": 0.7546998180715585, + "grad_norm": 0.26215705275535583, + "learning_rate": 8.617255939171199e-06, + "loss": 0.0941, + "step": 42313 + }, + { + "epoch": 0.7547176541932722, + "grad_norm": 0.31515082716941833, + "learning_rate": 8.616080246956173e-06, + "loss": 0.1458, + "step": 42314 + }, + { + "epoch": 0.7547354903149859, + "grad_norm": 0.32869940996170044, + "learning_rate": 8.614904618250872e-06, + "loss": 0.1122, + "step": 42315 + }, + { + "epoch": 0.7547533264366996, + "grad_norm": 0.3740622103214264, + "learning_rate": 8.613729053059866e-06, + "loss": 0.1294, + "step": 42316 + }, + { + "epoch": 0.7547711625584133, + "grad_norm": 0.2890958786010742, + "learning_rate": 8.6125535513877e-06, + "loss": 0.1185, + "step": 42317 + }, + { + "epoch": 0.7547889986801269, + "grad_norm": 0.2520935833454132, + "learning_rate": 8.611378113238944e-06, + "loss": 0.0921, + "step": 42318 + }, + { + "epoch": 0.7548068348018407, + "grad_norm": 0.3312392830848694, + "learning_rate": 8.610202738618147e-06, + "loss": 0.0869, + "step": 42319 + }, + { + "epoch": 0.7548246709235544, + "grad_norm": 0.361851304769516, + "learning_rate": 8.609027427529864e-06, + "loss": 0.1554, + "step": 42320 + }, + { + "epoch": 0.7548425070452681, + "grad_norm": 0.31280526518821716, + "learning_rate": 8.607852179978646e-06, + "loss": 0.1122, + "step": 42321 + }, + { + "epoch": 0.7548603431669818, + "grad_norm": 0.30120766162872314, + "learning_rate": 8.606676995969059e-06, + "loss": 0.1617, + "step": 42322 + }, + { + "epoch": 0.7548781792886955, + "grad_norm": 0.29599201679229736, + "learning_rate": 8.605501875505658e-06, + "loss": 0.1442, + "step": 42323 + }, + { + "epoch": 0.7548960154104092, + "grad_norm": 0.31141197681427, + "learning_rate": 8.604326818592992e-06, + "loss": 0.1091, + "step": 42324 + }, + { + "epoch": 0.7549138515321229, + "grad_norm": 0.17772111296653748, + "learning_rate": 8.603151825235612e-06, + "loss": 0.1337, + "step": 42325 + }, + { + "epoch": 0.7549316876538366, + "grad_norm": 0.27165862917900085, + "learning_rate": 8.601976895438086e-06, + "loss": 0.0647, + "step": 42326 + }, + { + "epoch": 0.7549495237755502, + "grad_norm": 0.2627803683280945, + "learning_rate": 8.60080202920496e-06, + "loss": 0.1277, + "step": 42327 + }, + { + "epoch": 0.7549673598972639, + "grad_norm": 0.2923004925251007, + "learning_rate": 8.59962722654079e-06, + "loss": 0.1025, + "step": 42328 + }, + { + "epoch": 0.7549851960189776, + "grad_norm": 0.25430062413215637, + "learning_rate": 8.598452487450124e-06, + "loss": 0.1583, + "step": 42329 + }, + { + "epoch": 0.7550030321406913, + "grad_norm": 0.2305556684732437, + "learning_rate": 8.597277811937526e-06, + "loss": 0.1213, + "step": 42330 + }, + { + "epoch": 0.755020868262405, + "grad_norm": 0.24316012859344482, + "learning_rate": 8.596103200007536e-06, + "loss": 0.1173, + "step": 42331 + }, + { + "epoch": 0.7550387043841187, + "grad_norm": 0.2630822956562042, + "learning_rate": 8.594928651664725e-06, + "loss": 0.1754, + "step": 42332 + }, + { + "epoch": 0.7550565405058324, + "grad_norm": 0.2831932604312897, + "learning_rate": 8.593754166913637e-06, + "loss": 0.1046, + "step": 42333 + }, + { + "epoch": 0.7550743766275461, + "grad_norm": 0.2862245440483093, + "learning_rate": 8.592579745758817e-06, + "loss": 0.168, + "step": 42334 + }, + { + "epoch": 0.7550922127492598, + "grad_norm": 0.23946191370487213, + "learning_rate": 8.591405388204831e-06, + "loss": 0.1035, + "step": 42335 + }, + { + "epoch": 0.7551100488709735, + "grad_norm": 0.2561069130897522, + "learning_rate": 8.590231094256226e-06, + "loss": 0.1139, + "step": 42336 + }, + { + "epoch": 0.7551278849926872, + "grad_norm": 0.3123854100704193, + "learning_rate": 8.589056863917553e-06, + "loss": 0.1445, + "step": 42337 + }, + { + "epoch": 0.7551457211144009, + "grad_norm": 0.26023247838020325, + "learning_rate": 8.587882697193356e-06, + "loss": 0.0894, + "step": 42338 + }, + { + "epoch": 0.7551635572361146, + "grad_norm": 0.2850258946418762, + "learning_rate": 8.586708594088203e-06, + "loss": 0.1053, + "step": 42339 + }, + { + "epoch": 0.7551813933578283, + "grad_norm": 0.25813084840774536, + "learning_rate": 8.585534554606634e-06, + "loss": 0.0917, + "step": 42340 + }, + { + "epoch": 0.755199229479542, + "grad_norm": 0.3162669539451599, + "learning_rate": 8.584360578753204e-06, + "loss": 0.1162, + "step": 42341 + }, + { + "epoch": 0.7552170656012557, + "grad_norm": 0.3151412010192871, + "learning_rate": 8.583186666532455e-06, + "loss": 0.1237, + "step": 42342 + }, + { + "epoch": 0.7552349017229694, + "grad_norm": 0.24522614479064941, + "learning_rate": 8.582012817948951e-06, + "loss": 0.0701, + "step": 42343 + }, + { + "epoch": 0.755252737844683, + "grad_norm": 0.27964454889297485, + "learning_rate": 8.58083903300723e-06, + "loss": 0.1228, + "step": 42344 + }, + { + "epoch": 0.7552705739663967, + "grad_norm": 0.31902268528938293, + "learning_rate": 8.579665311711854e-06, + "loss": 0.1672, + "step": 42345 + }, + { + "epoch": 0.7552884100881104, + "grad_norm": 0.23952889442443848, + "learning_rate": 8.578491654067366e-06, + "loss": 0.1348, + "step": 42346 + }, + { + "epoch": 0.7553062462098241, + "grad_norm": 0.2591972053050995, + "learning_rate": 8.57731806007831e-06, + "loss": 0.1314, + "step": 42347 + }, + { + "epoch": 0.7553240823315378, + "grad_norm": 0.243315190076828, + "learning_rate": 8.57614452974925e-06, + "loss": 0.0994, + "step": 42348 + }, + { + "epoch": 0.7553419184532515, + "grad_norm": 0.2919265925884247, + "learning_rate": 8.574971063084724e-06, + "loss": 0.1813, + "step": 42349 + }, + { + "epoch": 0.7553597545749652, + "grad_norm": 0.46110835671424866, + "learning_rate": 8.573797660089284e-06, + "loss": 0.1332, + "step": 42350 + }, + { + "epoch": 0.7553775906966789, + "grad_norm": 0.27178412675857544, + "learning_rate": 8.57262432076747e-06, + "loss": 0.115, + "step": 42351 + }, + { + "epoch": 0.7553954268183926, + "grad_norm": 0.24335041642189026, + "learning_rate": 8.571451045123846e-06, + "loss": 0.136, + "step": 42352 + }, + { + "epoch": 0.7554132629401064, + "grad_norm": 0.2885285019874573, + "learning_rate": 8.57027783316295e-06, + "loss": 0.111, + "step": 42353 + }, + { + "epoch": 0.75543109906182, + "grad_norm": 0.2938196063041687, + "learning_rate": 8.569104684889337e-06, + "loss": 0.1284, + "step": 42354 + }, + { + "epoch": 0.7554489351835337, + "grad_norm": 0.2460946887731552, + "learning_rate": 8.567931600307538e-06, + "loss": 0.083, + "step": 42355 + }, + { + "epoch": 0.7554667713052474, + "grad_norm": 0.2997353971004486, + "learning_rate": 8.566758579422118e-06, + "loss": 0.1258, + "step": 42356 + }, + { + "epoch": 0.7554846074269611, + "grad_norm": 0.31593987345695496, + "learning_rate": 8.56558562223762e-06, + "loss": 0.1225, + "step": 42357 + }, + { + "epoch": 0.7555024435486748, + "grad_norm": 0.2387072741985321, + "learning_rate": 8.56441272875858e-06, + "loss": 0.1044, + "step": 42358 + }, + { + "epoch": 0.7555202796703885, + "grad_norm": 0.26952114701271057, + "learning_rate": 8.563239898989562e-06, + "loss": 0.095, + "step": 42359 + }, + { + "epoch": 0.7555381157921022, + "grad_norm": 0.28935471177101135, + "learning_rate": 8.562067132935093e-06, + "loss": 0.1622, + "step": 42360 + }, + { + "epoch": 0.7555559519138159, + "grad_norm": 0.3453834652900696, + "learning_rate": 8.560894430599736e-06, + "loss": 0.1073, + "step": 42361 + }, + { + "epoch": 0.7555737880355295, + "grad_norm": 0.32495880126953125, + "learning_rate": 8.559721791988035e-06, + "loss": 0.1515, + "step": 42362 + }, + { + "epoch": 0.7555916241572432, + "grad_norm": 0.2073800414800644, + "learning_rate": 8.558549217104524e-06, + "loss": 0.1019, + "step": 42363 + }, + { + "epoch": 0.7556094602789569, + "grad_norm": 0.24889720976352692, + "learning_rate": 8.557376705953752e-06, + "loss": 0.0559, + "step": 42364 + }, + { + "epoch": 0.7556272964006706, + "grad_norm": 0.2713497579097748, + "learning_rate": 8.556204258540274e-06, + "loss": 0.0937, + "step": 42365 + }, + { + "epoch": 0.7556451325223843, + "grad_norm": 0.27397334575653076, + "learning_rate": 8.555031874868627e-06, + "loss": 0.1564, + "step": 42366 + }, + { + "epoch": 0.755662968644098, + "grad_norm": 0.22274106740951538, + "learning_rate": 8.553859554943358e-06, + "loss": 0.1032, + "step": 42367 + }, + { + "epoch": 0.7556808047658117, + "grad_norm": 0.2755008339881897, + "learning_rate": 8.552687298769e-06, + "loss": 0.1162, + "step": 42368 + }, + { + "epoch": 0.7556986408875254, + "grad_norm": 0.4210760295391083, + "learning_rate": 8.551515106350117e-06, + "loss": 0.1295, + "step": 42369 + }, + { + "epoch": 0.7557164770092392, + "grad_norm": 0.29186972975730896, + "learning_rate": 8.550342977691239e-06, + "loss": 0.1159, + "step": 42370 + }, + { + "epoch": 0.7557343131309529, + "grad_norm": 0.24795956909656525, + "learning_rate": 8.549170912796908e-06, + "loss": 0.093, + "step": 42371 + }, + { + "epoch": 0.7557521492526665, + "grad_norm": 0.23988518118858337, + "learning_rate": 8.547998911671678e-06, + "loss": 0.1144, + "step": 42372 + }, + { + "epoch": 0.7557699853743802, + "grad_norm": 0.2773982882499695, + "learning_rate": 8.54682697432008e-06, + "loss": 0.1324, + "step": 42373 + }, + { + "epoch": 0.7557878214960939, + "grad_norm": 0.2738858461380005, + "learning_rate": 8.545655100746672e-06, + "loss": 0.1391, + "step": 42374 + }, + { + "epoch": 0.7558056576178076, + "grad_norm": 0.18455122411251068, + "learning_rate": 8.544483290955988e-06, + "loss": 0.0845, + "step": 42375 + }, + { + "epoch": 0.7558234937395213, + "grad_norm": 0.27649593353271484, + "learning_rate": 8.54331154495257e-06, + "loss": 0.1454, + "step": 42376 + }, + { + "epoch": 0.755841329861235, + "grad_norm": 0.19789008796215057, + "learning_rate": 8.54213986274095e-06, + "loss": 0.0926, + "step": 42377 + }, + { + "epoch": 0.7558591659829487, + "grad_norm": 0.2960929274559021, + "learning_rate": 8.540968244325692e-06, + "loss": 0.1074, + "step": 42378 + }, + { + "epoch": 0.7558770021046624, + "grad_norm": 0.16888195276260376, + "learning_rate": 8.539796689711324e-06, + "loss": 0.0796, + "step": 42379 + }, + { + "epoch": 0.755894838226376, + "grad_norm": 0.2973552942276001, + "learning_rate": 8.538625198902389e-06, + "loss": 0.1171, + "step": 42380 + }, + { + "epoch": 0.7559126743480897, + "grad_norm": 0.23543117940425873, + "learning_rate": 8.53745377190342e-06, + "loss": 0.0878, + "step": 42381 + }, + { + "epoch": 0.7559305104698034, + "grad_norm": 0.25800907611846924, + "learning_rate": 8.536282408718976e-06, + "loss": 0.0762, + "step": 42382 + }, + { + "epoch": 0.7559483465915171, + "grad_norm": 0.3049534559249878, + "learning_rate": 8.535111109353586e-06, + "loss": 0.1633, + "step": 42383 + }, + { + "epoch": 0.7559661827132308, + "grad_norm": 0.3172808885574341, + "learning_rate": 8.533939873811794e-06, + "loss": 0.1086, + "step": 42384 + }, + { + "epoch": 0.7559840188349445, + "grad_norm": 0.28367358446121216, + "learning_rate": 8.532768702098129e-06, + "loss": 0.178, + "step": 42385 + }, + { + "epoch": 0.7560018549566582, + "grad_norm": 0.25366777181625366, + "learning_rate": 8.53159759421714e-06, + "loss": 0.1389, + "step": 42386 + }, + { + "epoch": 0.756019691078372, + "grad_norm": 0.20593418180942535, + "learning_rate": 8.530426550173373e-06, + "loss": 0.1257, + "step": 42387 + }, + { + "epoch": 0.7560375272000857, + "grad_norm": 0.23428650200366974, + "learning_rate": 8.529255569971364e-06, + "loss": 0.1387, + "step": 42388 + }, + { + "epoch": 0.7560553633217993, + "grad_norm": 0.22403877973556519, + "learning_rate": 8.528084653615647e-06, + "loss": 0.1006, + "step": 42389 + }, + { + "epoch": 0.756073199443513, + "grad_norm": 0.282473087310791, + "learning_rate": 8.526913801110758e-06, + "loss": 0.1233, + "step": 42390 + }, + { + "epoch": 0.7560910355652267, + "grad_norm": 0.29483911395072937, + "learning_rate": 8.525743012461245e-06, + "loss": 0.1792, + "step": 42391 + }, + { + "epoch": 0.7561088716869404, + "grad_norm": 0.259728342294693, + "learning_rate": 8.524572287671645e-06, + "loss": 0.112, + "step": 42392 + }, + { + "epoch": 0.7561267078086541, + "grad_norm": 0.31821489334106445, + "learning_rate": 8.523401626746495e-06, + "loss": 0.1115, + "step": 42393 + }, + { + "epoch": 0.7561445439303678, + "grad_norm": 0.29007548093795776, + "learning_rate": 8.52223102969032e-06, + "loss": 0.1334, + "step": 42394 + }, + { + "epoch": 0.7561623800520815, + "grad_norm": 0.25430139899253845, + "learning_rate": 8.521060496507677e-06, + "loss": 0.0783, + "step": 42395 + }, + { + "epoch": 0.7561802161737952, + "grad_norm": 0.27675554156303406, + "learning_rate": 8.519890027203096e-06, + "loss": 0.1297, + "step": 42396 + }, + { + "epoch": 0.7561980522955088, + "grad_norm": 0.2595537304878235, + "learning_rate": 8.518719621781112e-06, + "loss": 0.137, + "step": 42397 + }, + { + "epoch": 0.7562158884172225, + "grad_norm": 0.24917346239089966, + "learning_rate": 8.517549280246256e-06, + "loss": 0.0736, + "step": 42398 + }, + { + "epoch": 0.7562337245389362, + "grad_norm": 0.24004189670085907, + "learning_rate": 8.516379002603077e-06, + "loss": 0.0758, + "step": 42399 + }, + { + "epoch": 0.7562515606606499, + "grad_norm": 0.3062804639339447, + "learning_rate": 8.515208788856102e-06, + "loss": 0.1752, + "step": 42400 + }, + { + "epoch": 0.7562693967823636, + "grad_norm": 0.21575665473937988, + "learning_rate": 8.514038639009881e-06, + "loss": 0.0821, + "step": 42401 + }, + { + "epoch": 0.7562872329040773, + "grad_norm": 0.25711703300476074, + "learning_rate": 8.512868553068937e-06, + "loss": 0.1061, + "step": 42402 + }, + { + "epoch": 0.756305069025791, + "grad_norm": 0.32097741961479187, + "learning_rate": 8.5116985310378e-06, + "loss": 0.0659, + "step": 42403 + }, + { + "epoch": 0.7563229051475048, + "grad_norm": 0.24532966315746307, + "learning_rate": 8.510528572921025e-06, + "loss": 0.09, + "step": 42404 + }, + { + "epoch": 0.7563407412692185, + "grad_norm": 0.19473090767860413, + "learning_rate": 8.509358678723137e-06, + "loss": 0.0841, + "step": 42405 + }, + { + "epoch": 0.7563585773909322, + "grad_norm": 0.2386460155248642, + "learning_rate": 8.508188848448668e-06, + "loss": 0.0687, + "step": 42406 + }, + { + "epoch": 0.7563764135126458, + "grad_norm": 0.2659556567668915, + "learning_rate": 8.507019082102147e-06, + "loss": 0.1138, + "step": 42407 + }, + { + "epoch": 0.7563942496343595, + "grad_norm": 0.3707256019115448, + "learning_rate": 8.505849379688127e-06, + "loss": 0.0975, + "step": 42408 + }, + { + "epoch": 0.7564120857560732, + "grad_norm": 0.26180100440979004, + "learning_rate": 8.50467974121113e-06, + "loss": 0.1048, + "step": 42409 + }, + { + "epoch": 0.7564299218777869, + "grad_norm": 0.2367788702249527, + "learning_rate": 8.50351016667569e-06, + "loss": 0.1125, + "step": 42410 + }, + { + "epoch": 0.7564477579995006, + "grad_norm": 0.24839997291564941, + "learning_rate": 8.502340656086347e-06, + "loss": 0.1172, + "step": 42411 + }, + { + "epoch": 0.7564655941212143, + "grad_norm": 0.33145636320114136, + "learning_rate": 8.501171209447617e-06, + "loss": 0.1381, + "step": 42412 + }, + { + "epoch": 0.756483430242928, + "grad_norm": 0.35183119773864746, + "learning_rate": 8.500001826764048e-06, + "loss": 0.119, + "step": 42413 + }, + { + "epoch": 0.7565012663646417, + "grad_norm": 0.2977330982685089, + "learning_rate": 8.49883250804018e-06, + "loss": 0.1501, + "step": 42414 + }, + { + "epoch": 0.7565191024863553, + "grad_norm": 0.2004634290933609, + "learning_rate": 8.497663253280536e-06, + "loss": 0.0769, + "step": 42415 + }, + { + "epoch": 0.756536938608069, + "grad_norm": 0.2261001020669937, + "learning_rate": 8.496494062489638e-06, + "loss": 0.1087, + "step": 42416 + }, + { + "epoch": 0.7565547747297827, + "grad_norm": 0.21379733085632324, + "learning_rate": 8.495324935672039e-06, + "loss": 0.0955, + "step": 42417 + }, + { + "epoch": 0.7565726108514964, + "grad_norm": 0.3068874776363373, + "learning_rate": 8.494155872832262e-06, + "loss": 0.0923, + "step": 42418 + }, + { + "epoch": 0.7565904469732101, + "grad_norm": 0.3010251820087433, + "learning_rate": 8.492986873974837e-06, + "loss": 0.1671, + "step": 42419 + }, + { + "epoch": 0.7566082830949239, + "grad_norm": 0.2966093420982361, + "learning_rate": 8.491817939104287e-06, + "loss": 0.1015, + "step": 42420 + }, + { + "epoch": 0.7566261192166376, + "grad_norm": 0.2878933846950531, + "learning_rate": 8.490649068225165e-06, + "loss": 0.1143, + "step": 42421 + }, + { + "epoch": 0.7566439553383513, + "grad_norm": 0.25792810320854187, + "learning_rate": 8.489480261341986e-06, + "loss": 0.1213, + "step": 42422 + }, + { + "epoch": 0.756661791460065, + "grad_norm": 0.3670046031475067, + "learning_rate": 8.488311518459283e-06, + "loss": 0.0999, + "step": 42423 + }, + { + "epoch": 0.7566796275817786, + "grad_norm": 0.31089597940444946, + "learning_rate": 8.48714283958159e-06, + "loss": 0.1553, + "step": 42424 + }, + { + "epoch": 0.7566974637034923, + "grad_norm": 0.2926601767539978, + "learning_rate": 8.485974224713425e-06, + "loss": 0.127, + "step": 42425 + }, + { + "epoch": 0.756715299825206, + "grad_norm": 0.22717778384685516, + "learning_rate": 8.484805673859335e-06, + "loss": 0.0889, + "step": 42426 + }, + { + "epoch": 0.7567331359469197, + "grad_norm": 0.20641830563545227, + "learning_rate": 8.483637187023835e-06, + "loss": 0.0857, + "step": 42427 + }, + { + "epoch": 0.7567509720686334, + "grad_norm": 0.23882833123207092, + "learning_rate": 8.482468764211471e-06, + "loss": 0.0983, + "step": 42428 + }, + { + "epoch": 0.7567688081903471, + "grad_norm": 0.2915860414505005, + "learning_rate": 8.481300405426756e-06, + "loss": 0.1343, + "step": 42429 + }, + { + "epoch": 0.7567866443120608, + "grad_norm": 0.2899327576160431, + "learning_rate": 8.480132110674232e-06, + "loss": 0.1138, + "step": 42430 + }, + { + "epoch": 0.7568044804337745, + "grad_norm": 0.2871955931186676, + "learning_rate": 8.47896387995842e-06, + "loss": 0.0968, + "step": 42431 + }, + { + "epoch": 0.7568223165554882, + "grad_norm": 0.2323184609413147, + "learning_rate": 8.477795713283853e-06, + "loss": 0.1158, + "step": 42432 + }, + { + "epoch": 0.7568401526772018, + "grad_norm": 0.25878503918647766, + "learning_rate": 8.476627610655049e-06, + "loss": 0.0896, + "step": 42433 + }, + { + "epoch": 0.7568579887989155, + "grad_norm": 0.23402409255504608, + "learning_rate": 8.475459572076549e-06, + "loss": 0.1007, + "step": 42434 + }, + { + "epoch": 0.7568758249206292, + "grad_norm": 0.31241506338119507, + "learning_rate": 8.474291597552877e-06, + "loss": 0.1643, + "step": 42435 + }, + { + "epoch": 0.7568936610423429, + "grad_norm": 0.3281039297580719, + "learning_rate": 8.473123687088558e-06, + "loss": 0.1313, + "step": 42436 + }, + { + "epoch": 0.7569114971640567, + "grad_norm": 0.21797817945480347, + "learning_rate": 8.47195584068812e-06, + "loss": 0.0823, + "step": 42437 + }, + { + "epoch": 0.7569293332857704, + "grad_norm": 0.2519802749156952, + "learning_rate": 8.47078805835608e-06, + "loss": 0.1198, + "step": 42438 + }, + { + "epoch": 0.7569471694074841, + "grad_norm": 0.2547273337841034, + "learning_rate": 8.469620340096983e-06, + "loss": 0.0805, + "step": 42439 + }, + { + "epoch": 0.7569650055291978, + "grad_norm": 0.24511700868606567, + "learning_rate": 8.46845268591534e-06, + "loss": 0.1246, + "step": 42440 + }, + { + "epoch": 0.7569828416509115, + "grad_norm": 0.3022809624671936, + "learning_rate": 8.467285095815694e-06, + "loss": 0.138, + "step": 42441 + }, + { + "epoch": 0.7570006777726251, + "grad_norm": 0.2824322581291199, + "learning_rate": 8.46611756980255e-06, + "loss": 0.1396, + "step": 42442 + }, + { + "epoch": 0.7570185138943388, + "grad_norm": 0.36678019165992737, + "learning_rate": 8.464950107880453e-06, + "loss": 0.096, + "step": 42443 + }, + { + "epoch": 0.7570363500160525, + "grad_norm": 0.2552364766597748, + "learning_rate": 8.463782710053922e-06, + "loss": 0.1373, + "step": 42444 + }, + { + "epoch": 0.7570541861377662, + "grad_norm": 0.31479692459106445, + "learning_rate": 8.462615376327482e-06, + "loss": 0.0658, + "step": 42445 + }, + { + "epoch": 0.7570720222594799, + "grad_norm": 0.2944337725639343, + "learning_rate": 8.461448106705644e-06, + "loss": 0.1626, + "step": 42446 + }, + { + "epoch": 0.7570898583811936, + "grad_norm": 0.2544756233692169, + "learning_rate": 8.460280901192957e-06, + "loss": 0.1014, + "step": 42447 + }, + { + "epoch": 0.7571076945029073, + "grad_norm": 0.26997318863868713, + "learning_rate": 8.459113759793933e-06, + "loss": 0.1308, + "step": 42448 + }, + { + "epoch": 0.757125530624621, + "grad_norm": 0.29476282000541687, + "learning_rate": 8.4579466825131e-06, + "loss": 0.13, + "step": 42449 + }, + { + "epoch": 0.7571433667463346, + "grad_norm": 0.33941540122032166, + "learning_rate": 8.456779669354975e-06, + "loss": 0.1184, + "step": 42450 + }, + { + "epoch": 0.7571612028680483, + "grad_norm": 0.38993942737579346, + "learning_rate": 8.455612720324078e-06, + "loss": 0.1438, + "step": 42451 + }, + { + "epoch": 0.757179038989762, + "grad_norm": 0.30195915699005127, + "learning_rate": 8.454445835424948e-06, + "loss": 0.1412, + "step": 42452 + }, + { + "epoch": 0.7571968751114757, + "grad_norm": 0.31078389286994934, + "learning_rate": 8.453279014662101e-06, + "loss": 0.1843, + "step": 42453 + }, + { + "epoch": 0.7572147112331895, + "grad_norm": 0.22774893045425415, + "learning_rate": 8.452112258040054e-06, + "loss": 0.0847, + "step": 42454 + }, + { + "epoch": 0.7572325473549032, + "grad_norm": 0.3099285066127777, + "learning_rate": 8.450945565563342e-06, + "loss": 0.1199, + "step": 42455 + }, + { + "epoch": 0.7572503834766169, + "grad_norm": 0.26423385739326477, + "learning_rate": 8.449778937236472e-06, + "loss": 0.0943, + "step": 42456 + }, + { + "epoch": 0.7572682195983306, + "grad_norm": 0.23151858150959015, + "learning_rate": 8.448612373063985e-06, + "loss": 0.1277, + "step": 42457 + }, + { + "epoch": 0.7572860557200443, + "grad_norm": 0.26518574357032776, + "learning_rate": 8.447445873050392e-06, + "loss": 0.1428, + "step": 42458 + }, + { + "epoch": 0.757303891841758, + "grad_norm": 0.22830712795257568, + "learning_rate": 8.446279437200208e-06, + "loss": 0.0928, + "step": 42459 + }, + { + "epoch": 0.7573217279634716, + "grad_norm": 0.26572519540786743, + "learning_rate": 8.44511306551797e-06, + "loss": 0.0984, + "step": 42460 + }, + { + "epoch": 0.7573395640851853, + "grad_norm": 0.2624780237674713, + "learning_rate": 8.443946758008192e-06, + "loss": 0.1225, + "step": 42461 + }, + { + "epoch": 0.757357400206899, + "grad_norm": 0.2891901731491089, + "learning_rate": 8.442780514675395e-06, + "loss": 0.0605, + "step": 42462 + }, + { + "epoch": 0.7573752363286127, + "grad_norm": 0.21807990968227386, + "learning_rate": 8.441614335524098e-06, + "loss": 0.0899, + "step": 42463 + }, + { + "epoch": 0.7573930724503264, + "grad_norm": 0.3276066780090332, + "learning_rate": 8.440448220558817e-06, + "loss": 0.1323, + "step": 42464 + }, + { + "epoch": 0.7574109085720401, + "grad_norm": 0.22362357378005981, + "learning_rate": 8.439282169784083e-06, + "loss": 0.1019, + "step": 42465 + }, + { + "epoch": 0.7574287446937538, + "grad_norm": 0.2000609189271927, + "learning_rate": 8.438116183204414e-06, + "loss": 0.0836, + "step": 42466 + }, + { + "epoch": 0.7574465808154675, + "grad_norm": 0.19653943181037903, + "learning_rate": 8.436950260824317e-06, + "loss": 0.0752, + "step": 42467 + }, + { + "epoch": 0.7574644169371811, + "grad_norm": 0.2805749773979187, + "learning_rate": 8.43578440264833e-06, + "loss": 0.1054, + "step": 42468 + }, + { + "epoch": 0.7574822530588948, + "grad_norm": 0.32110410928726196, + "learning_rate": 8.434618608680959e-06, + "loss": 0.1299, + "step": 42469 + }, + { + "epoch": 0.7575000891806085, + "grad_norm": 0.24727442860603333, + "learning_rate": 8.433452878926731e-06, + "loss": 0.1158, + "step": 42470 + }, + { + "epoch": 0.7575179253023223, + "grad_norm": 0.30695629119873047, + "learning_rate": 8.432287213390164e-06, + "loss": 0.1622, + "step": 42471 + }, + { + "epoch": 0.757535761424036, + "grad_norm": 0.21809907257556915, + "learning_rate": 8.431121612075766e-06, + "loss": 0.091, + "step": 42472 + }, + { + "epoch": 0.7575535975457497, + "grad_norm": 0.32574430108070374, + "learning_rate": 8.429956074988071e-06, + "loss": 0.1056, + "step": 42473 + }, + { + "epoch": 0.7575714336674634, + "grad_norm": 0.24971674382686615, + "learning_rate": 8.42879060213159e-06, + "loss": 0.105, + "step": 42474 + }, + { + "epoch": 0.7575892697891771, + "grad_norm": 0.26743370294570923, + "learning_rate": 8.427625193510839e-06, + "loss": 0.1003, + "step": 42475 + }, + { + "epoch": 0.7576071059108908, + "grad_norm": 0.17743538320064545, + "learning_rate": 8.426459849130339e-06, + "loss": 0.0611, + "step": 42476 + }, + { + "epoch": 0.7576249420326044, + "grad_norm": 0.30535629391670227, + "learning_rate": 8.425294568994593e-06, + "loss": 0.1781, + "step": 42477 + }, + { + "epoch": 0.7576427781543181, + "grad_norm": 0.46546825766563416, + "learning_rate": 8.42412935310814e-06, + "loss": 0.1494, + "step": 42478 + }, + { + "epoch": 0.7576606142760318, + "grad_norm": 0.31428396701812744, + "learning_rate": 8.42296420147549e-06, + "loss": 0.1257, + "step": 42479 + }, + { + "epoch": 0.7576784503977455, + "grad_norm": 0.3928177058696747, + "learning_rate": 8.42179911410115e-06, + "loss": 0.1045, + "step": 42480 + }, + { + "epoch": 0.7576962865194592, + "grad_norm": 0.22976143658161163, + "learning_rate": 8.420634090989638e-06, + "loss": 0.0847, + "step": 42481 + }, + { + "epoch": 0.7577141226411729, + "grad_norm": 0.30515724420547485, + "learning_rate": 8.419469132145483e-06, + "loss": 0.1395, + "step": 42482 + }, + { + "epoch": 0.7577319587628866, + "grad_norm": 0.2620439827442169, + "learning_rate": 8.418304237573182e-06, + "loss": 0.135, + "step": 42483 + }, + { + "epoch": 0.7577497948846003, + "grad_norm": 0.3544202446937561, + "learning_rate": 8.417139407277273e-06, + "loss": 0.0561, + "step": 42484 + }, + { + "epoch": 0.757767631006314, + "grad_norm": 0.1993686854839325, + "learning_rate": 8.415974641262248e-06, + "loss": 0.0667, + "step": 42485 + }, + { + "epoch": 0.7577854671280276, + "grad_norm": 0.18143802881240845, + "learning_rate": 8.414809939532642e-06, + "loss": 0.0507, + "step": 42486 + }, + { + "epoch": 0.7578033032497413, + "grad_norm": 0.2108558714389801, + "learning_rate": 8.413645302092962e-06, + "loss": 0.1245, + "step": 42487 + }, + { + "epoch": 0.7578211393714551, + "grad_norm": 0.2468854784965515, + "learning_rate": 8.41248072894772e-06, + "loss": 0.1249, + "step": 42488 + }, + { + "epoch": 0.7578389754931688, + "grad_norm": 0.2811237871646881, + "learning_rate": 8.41131622010143e-06, + "loss": 0.1205, + "step": 42489 + }, + { + "epoch": 0.7578568116148825, + "grad_norm": 0.2078595757484436, + "learning_rate": 8.410151775558603e-06, + "loss": 0.0594, + "step": 42490 + }, + { + "epoch": 0.7578746477365962, + "grad_norm": 0.2517724931240082, + "learning_rate": 8.408987395323766e-06, + "loss": 0.0812, + "step": 42491 + }, + { + "epoch": 0.7578924838583099, + "grad_norm": 0.22889970242977142, + "learning_rate": 8.407823079401423e-06, + "loss": 0.0764, + "step": 42492 + }, + { + "epoch": 0.7579103199800236, + "grad_norm": 0.2879985272884369, + "learning_rate": 8.406658827796091e-06, + "loss": 0.1259, + "step": 42493 + }, + { + "epoch": 0.7579281561017372, + "grad_norm": 0.20380660891532898, + "learning_rate": 8.405494640512269e-06, + "loss": 0.0844, + "step": 42494 + }, + { + "epoch": 0.7579459922234509, + "grad_norm": 0.26806068420410156, + "learning_rate": 8.40433051755449e-06, + "loss": 0.1145, + "step": 42495 + }, + { + "epoch": 0.7579638283451646, + "grad_norm": 0.3038773536682129, + "learning_rate": 8.40316645892725e-06, + "loss": 0.1166, + "step": 42496 + }, + { + "epoch": 0.7579816644668783, + "grad_norm": 0.2840266823768616, + "learning_rate": 8.402002464635079e-06, + "loss": 0.1255, + "step": 42497 + }, + { + "epoch": 0.757999500588592, + "grad_norm": 0.24185702204704285, + "learning_rate": 8.400838534682468e-06, + "loss": 0.1047, + "step": 42498 + }, + { + "epoch": 0.7580173367103057, + "grad_norm": 0.202910378575325, + "learning_rate": 8.399674669073952e-06, + "loss": 0.1034, + "step": 42499 + }, + { + "epoch": 0.7580351728320194, + "grad_norm": 0.28981637954711914, + "learning_rate": 8.398510867814026e-06, + "loss": 0.1045, + "step": 42500 + }, + { + "epoch": 0.7580530089537331, + "grad_norm": 0.270341157913208, + "learning_rate": 8.39734713090721e-06, + "loss": 0.1438, + "step": 42501 + }, + { + "epoch": 0.7580708450754468, + "grad_norm": 0.295109361410141, + "learning_rate": 8.39618345835801e-06, + "loss": 0.1141, + "step": 42502 + }, + { + "epoch": 0.7580886811971604, + "grad_norm": 0.3369133174419403, + "learning_rate": 8.395019850170927e-06, + "loss": 0.1157, + "step": 42503 + }, + { + "epoch": 0.7581065173188741, + "grad_norm": 0.24019691348075867, + "learning_rate": 8.393856306350493e-06, + "loss": 0.1172, + "step": 42504 + }, + { + "epoch": 0.7581243534405879, + "grad_norm": 0.22990193963050842, + "learning_rate": 8.392692826901205e-06, + "loss": 0.1353, + "step": 42505 + }, + { + "epoch": 0.7581421895623016, + "grad_norm": 0.3955802619457245, + "learning_rate": 8.391529411827578e-06, + "loss": 0.1299, + "step": 42506 + }, + { + "epoch": 0.7581600256840153, + "grad_norm": 0.24784225225448608, + "learning_rate": 8.39036606113411e-06, + "loss": 0.1076, + "step": 42507 + }, + { + "epoch": 0.758177861805729, + "grad_norm": 0.2705511748790741, + "learning_rate": 8.389202774825328e-06, + "loss": 0.1149, + "step": 42508 + }, + { + "epoch": 0.7581956979274427, + "grad_norm": 0.3075313866138458, + "learning_rate": 8.388039552905735e-06, + "loss": 0.1325, + "step": 42509 + }, + { + "epoch": 0.7582135340491564, + "grad_norm": 0.32823804020881653, + "learning_rate": 8.386876395379828e-06, + "loss": 0.1776, + "step": 42510 + }, + { + "epoch": 0.7582313701708701, + "grad_norm": 0.3254458010196686, + "learning_rate": 8.385713302252136e-06, + "loss": 0.1512, + "step": 42511 + }, + { + "epoch": 0.7582492062925837, + "grad_norm": 0.20118288695812225, + "learning_rate": 8.384550273527148e-06, + "loss": 0.07, + "step": 42512 + }, + { + "epoch": 0.7582670424142974, + "grad_norm": 0.29054129123687744, + "learning_rate": 8.383387309209389e-06, + "loss": 0.1583, + "step": 42513 + }, + { + "epoch": 0.7582848785360111, + "grad_norm": 0.22485169768333435, + "learning_rate": 8.382224409303364e-06, + "loss": 0.111, + "step": 42514 + }, + { + "epoch": 0.7583027146577248, + "grad_norm": 0.3353945314884186, + "learning_rate": 8.381061573813573e-06, + "loss": 0.1327, + "step": 42515 + }, + { + "epoch": 0.7583205507794385, + "grad_norm": 0.26426833868026733, + "learning_rate": 8.37989880274452e-06, + "loss": 0.1025, + "step": 42516 + }, + { + "epoch": 0.7583383869011522, + "grad_norm": 0.2605058550834656, + "learning_rate": 8.378736096100728e-06, + "loss": 0.1123, + "step": 42517 + }, + { + "epoch": 0.7583562230228659, + "grad_norm": 0.32913678884506226, + "learning_rate": 8.377573453886698e-06, + "loss": 0.1327, + "step": 42518 + }, + { + "epoch": 0.7583740591445796, + "grad_norm": 0.21796973049640656, + "learning_rate": 8.376410876106932e-06, + "loss": 0.1402, + "step": 42519 + }, + { + "epoch": 0.7583918952662932, + "grad_norm": 0.3871913552284241, + "learning_rate": 8.375248362765931e-06, + "loss": 0.1332, + "step": 42520 + }, + { + "epoch": 0.758409731388007, + "grad_norm": 0.26409193873405457, + "learning_rate": 8.374085913868218e-06, + "loss": 0.1256, + "step": 42521 + }, + { + "epoch": 0.7584275675097207, + "grad_norm": 0.2507871985435486, + "learning_rate": 8.372923529418291e-06, + "loss": 0.1171, + "step": 42522 + }, + { + "epoch": 0.7584454036314344, + "grad_norm": 0.29590460658073425, + "learning_rate": 8.371761209420648e-06, + "loss": 0.1174, + "step": 42523 + }, + { + "epoch": 0.7584632397531481, + "grad_norm": 0.27333101630210876, + "learning_rate": 8.370598953879808e-06, + "loss": 0.1153, + "step": 42524 + }, + { + "epoch": 0.7584810758748618, + "grad_norm": 0.24664399027824402, + "learning_rate": 8.369436762800262e-06, + "loss": 0.1387, + "step": 42525 + }, + { + "epoch": 0.7584989119965755, + "grad_norm": 0.22566759586334229, + "learning_rate": 8.36827463618653e-06, + "loss": 0.1208, + "step": 42526 + }, + { + "epoch": 0.7585167481182892, + "grad_norm": 0.2601856589317322, + "learning_rate": 8.367112574043111e-06, + "loss": 0.1345, + "step": 42527 + }, + { + "epoch": 0.7585345842400029, + "grad_norm": 0.26377347111701965, + "learning_rate": 8.36595057637451e-06, + "loss": 0.0978, + "step": 42528 + }, + { + "epoch": 0.7585524203617166, + "grad_norm": 0.2287953644990921, + "learning_rate": 8.36478864318522e-06, + "loss": 0.0904, + "step": 42529 + }, + { + "epoch": 0.7585702564834302, + "grad_norm": 0.4153529107570648, + "learning_rate": 8.363626774479763e-06, + "loss": 0.1178, + "step": 42530 + }, + { + "epoch": 0.7585880926051439, + "grad_norm": 0.2789282500743866, + "learning_rate": 8.362464970262638e-06, + "loss": 0.1389, + "step": 42531 + }, + { + "epoch": 0.7586059287268576, + "grad_norm": 0.3188031315803528, + "learning_rate": 8.361303230538342e-06, + "loss": 0.1667, + "step": 42532 + }, + { + "epoch": 0.7586237648485713, + "grad_norm": 0.2907651960849762, + "learning_rate": 8.360141555311373e-06, + "loss": 0.0757, + "step": 42533 + }, + { + "epoch": 0.758641600970285, + "grad_norm": 0.23574650287628174, + "learning_rate": 8.35897994458625e-06, + "loss": 0.0459, + "step": 42534 + }, + { + "epoch": 0.7586594370919987, + "grad_norm": 0.2789841890335083, + "learning_rate": 8.35781839836747e-06, + "loss": 0.1178, + "step": 42535 + }, + { + "epoch": 0.7586772732137124, + "grad_norm": 0.22705766558647156, + "learning_rate": 8.356656916659534e-06, + "loss": 0.1056, + "step": 42536 + }, + { + "epoch": 0.758695109335426, + "grad_norm": 0.2200649380683899, + "learning_rate": 8.355495499466936e-06, + "loss": 0.1045, + "step": 42537 + }, + { + "epoch": 0.7587129454571399, + "grad_norm": 0.3629782497882843, + "learning_rate": 8.354334146794193e-06, + "loss": 0.1112, + "step": 42538 + }, + { + "epoch": 0.7587307815788535, + "grad_norm": 0.3256164789199829, + "learning_rate": 8.353172858645794e-06, + "loss": 0.1198, + "step": 42539 + }, + { + "epoch": 0.7587486177005672, + "grad_norm": 0.2853577435016632, + "learning_rate": 8.352011635026254e-06, + "loss": 0.1426, + "step": 42540 + }, + { + "epoch": 0.7587664538222809, + "grad_norm": 0.25813165307044983, + "learning_rate": 8.350850475940068e-06, + "loss": 0.1304, + "step": 42541 + }, + { + "epoch": 0.7587842899439946, + "grad_norm": 0.26125818490982056, + "learning_rate": 8.349689381391727e-06, + "loss": 0.1038, + "step": 42542 + }, + { + "epoch": 0.7588021260657083, + "grad_norm": 0.2559528648853302, + "learning_rate": 8.348528351385748e-06, + "loss": 0.0791, + "step": 42543 + }, + { + "epoch": 0.758819962187422, + "grad_norm": 0.22897832095623016, + "learning_rate": 8.347367385926624e-06, + "loss": 0.1054, + "step": 42544 + }, + { + "epoch": 0.7588377983091357, + "grad_norm": 0.2834038734436035, + "learning_rate": 8.346206485018856e-06, + "loss": 0.089, + "step": 42545 + }, + { + "epoch": 0.7588556344308494, + "grad_norm": 0.3164770305156708, + "learning_rate": 8.345045648666939e-06, + "loss": 0.1039, + "step": 42546 + }, + { + "epoch": 0.758873470552563, + "grad_norm": 0.2670660614967346, + "learning_rate": 8.34388487687538e-06, + "loss": 0.2029, + "step": 42547 + }, + { + "epoch": 0.7588913066742767, + "grad_norm": 0.23607979714870453, + "learning_rate": 8.34272416964868e-06, + "loss": 0.0802, + "step": 42548 + }, + { + "epoch": 0.7589091427959904, + "grad_norm": 0.23584768176078796, + "learning_rate": 8.341563526991333e-06, + "loss": 0.1708, + "step": 42549 + }, + { + "epoch": 0.7589269789177041, + "grad_norm": 0.2171461433172226, + "learning_rate": 8.340402948907832e-06, + "loss": 0.0772, + "step": 42550 + }, + { + "epoch": 0.7589448150394178, + "grad_norm": 0.33473798632621765, + "learning_rate": 8.339242435402692e-06, + "loss": 0.1016, + "step": 42551 + }, + { + "epoch": 0.7589626511611315, + "grad_norm": 0.21547746658325195, + "learning_rate": 8.338081986480392e-06, + "loss": 0.0735, + "step": 42552 + }, + { + "epoch": 0.7589804872828452, + "grad_norm": 0.249026820063591, + "learning_rate": 8.33692160214545e-06, + "loss": 0.0614, + "step": 42553 + }, + { + "epoch": 0.7589983234045589, + "grad_norm": 0.3808281123638153, + "learning_rate": 8.335761282402349e-06, + "loss": 0.1411, + "step": 42554 + }, + { + "epoch": 0.7590161595262727, + "grad_norm": 0.28560081124305725, + "learning_rate": 8.334601027255598e-06, + "loss": 0.132, + "step": 42555 + }, + { + "epoch": 0.7590339956479863, + "grad_norm": 0.3320583701133728, + "learning_rate": 8.33344083670969e-06, + "loss": 0.1517, + "step": 42556 + }, + { + "epoch": 0.7590518317697, + "grad_norm": 0.2528710961341858, + "learning_rate": 8.332280710769124e-06, + "loss": 0.146, + "step": 42557 + }, + { + "epoch": 0.7590696678914137, + "grad_norm": 0.20687957108020782, + "learning_rate": 8.331120649438396e-06, + "loss": 0.0975, + "step": 42558 + }, + { + "epoch": 0.7590875040131274, + "grad_norm": 0.23713906109333038, + "learning_rate": 8.32996065272199e-06, + "loss": 0.0865, + "step": 42559 + }, + { + "epoch": 0.7591053401348411, + "grad_norm": 0.24285969138145447, + "learning_rate": 8.328800720624424e-06, + "loss": 0.1184, + "step": 42560 + }, + { + "epoch": 0.7591231762565548, + "grad_norm": 0.24162907898426056, + "learning_rate": 8.327640853150184e-06, + "loss": 0.1142, + "step": 42561 + }, + { + "epoch": 0.7591410123782685, + "grad_norm": 0.2637457847595215, + "learning_rate": 8.326481050303766e-06, + "loss": 0.1234, + "step": 42562 + }, + { + "epoch": 0.7591588484999822, + "grad_norm": 0.27374517917633057, + "learning_rate": 8.325321312089658e-06, + "loss": 0.1305, + "step": 42563 + }, + { + "epoch": 0.7591766846216959, + "grad_norm": 0.21077707409858704, + "learning_rate": 8.324161638512373e-06, + "loss": 0.1107, + "step": 42564 + }, + { + "epoch": 0.7591945207434095, + "grad_norm": 0.25188934803009033, + "learning_rate": 8.323002029576397e-06, + "loss": 0.1056, + "step": 42565 + }, + { + "epoch": 0.7592123568651232, + "grad_norm": 0.24743735790252686, + "learning_rate": 8.321842485286216e-06, + "loss": 0.1006, + "step": 42566 + }, + { + "epoch": 0.7592301929868369, + "grad_norm": 0.41030362248420715, + "learning_rate": 8.320683005646335e-06, + "loss": 0.1114, + "step": 42567 + }, + { + "epoch": 0.7592480291085506, + "grad_norm": 0.2505373954772949, + "learning_rate": 8.319523590661255e-06, + "loss": 0.1549, + "step": 42568 + }, + { + "epoch": 0.7592658652302643, + "grad_norm": 0.2733801305294037, + "learning_rate": 8.318364240335463e-06, + "loss": 0.0787, + "step": 42569 + }, + { + "epoch": 0.759283701351978, + "grad_norm": 0.29073014855384827, + "learning_rate": 8.317204954673455e-06, + "loss": 0.1175, + "step": 42570 + }, + { + "epoch": 0.7593015374736917, + "grad_norm": 0.2475467324256897, + "learning_rate": 8.31604573367972e-06, + "loss": 0.1188, + "step": 42571 + }, + { + "epoch": 0.7593193735954055, + "grad_norm": 0.23538252711296082, + "learning_rate": 8.314886577358747e-06, + "loss": 0.1141, + "step": 42572 + }, + { + "epoch": 0.7593372097171192, + "grad_norm": 0.2203197479248047, + "learning_rate": 8.313727485715042e-06, + "loss": 0.106, + "step": 42573 + }, + { + "epoch": 0.7593550458388328, + "grad_norm": 0.33786508440971375, + "learning_rate": 8.312568458753095e-06, + "loss": 0.1069, + "step": 42574 + }, + { + "epoch": 0.7593728819605465, + "grad_norm": 0.29235002398490906, + "learning_rate": 8.311409496477399e-06, + "loss": 0.0876, + "step": 42575 + }, + { + "epoch": 0.7593907180822602, + "grad_norm": 0.4059620201587677, + "learning_rate": 8.31025059889243e-06, + "loss": 0.0718, + "step": 42576 + }, + { + "epoch": 0.7594085542039739, + "grad_norm": 0.32068195939064026, + "learning_rate": 8.309091766002708e-06, + "loss": 0.1316, + "step": 42577 + }, + { + "epoch": 0.7594263903256876, + "grad_norm": 0.3006241023540497, + "learning_rate": 8.307932997812706e-06, + "loss": 0.1222, + "step": 42578 + }, + { + "epoch": 0.7594442264474013, + "grad_norm": 0.2777824103832245, + "learning_rate": 8.306774294326916e-06, + "loss": 0.1245, + "step": 42579 + }, + { + "epoch": 0.759462062569115, + "grad_norm": 0.2003381997346878, + "learning_rate": 8.305615655549842e-06, + "loss": 0.1527, + "step": 42580 + }, + { + "epoch": 0.7594798986908287, + "grad_norm": 0.39176928997039795, + "learning_rate": 8.30445708148596e-06, + "loss": 0.1671, + "step": 42581 + }, + { + "epoch": 0.7594977348125423, + "grad_norm": 0.24290509521961212, + "learning_rate": 8.303298572139775e-06, + "loss": 0.1043, + "step": 42582 + }, + { + "epoch": 0.759515570934256, + "grad_norm": 0.3411197364330292, + "learning_rate": 8.302140127515773e-06, + "loss": 0.1449, + "step": 42583 + }, + { + "epoch": 0.7595334070559697, + "grad_norm": 0.2943204343318939, + "learning_rate": 8.300981747618444e-06, + "loss": 0.1528, + "step": 42584 + }, + { + "epoch": 0.7595512431776834, + "grad_norm": 0.25660815834999084, + "learning_rate": 8.299823432452267e-06, + "loss": 0.0929, + "step": 42585 + }, + { + "epoch": 0.7595690792993971, + "grad_norm": 0.26953575015068054, + "learning_rate": 8.298665182021753e-06, + "loss": 0.0697, + "step": 42586 + }, + { + "epoch": 0.7595869154211108, + "grad_norm": 0.23611371219158173, + "learning_rate": 8.29750699633138e-06, + "loss": 0.1118, + "step": 42587 + }, + { + "epoch": 0.7596047515428245, + "grad_norm": 0.32746422290802, + "learning_rate": 8.29634887538564e-06, + "loss": 0.1158, + "step": 42588 + }, + { + "epoch": 0.7596225876645383, + "grad_norm": 0.33476951718330383, + "learning_rate": 8.295190819189012e-06, + "loss": 0.1235, + "step": 42589 + }, + { + "epoch": 0.759640423786252, + "grad_norm": 0.3064015805721283, + "learning_rate": 8.294032827746002e-06, + "loss": 0.1271, + "step": 42590 + }, + { + "epoch": 0.7596582599079656, + "grad_norm": 0.26001209020614624, + "learning_rate": 8.29287490106109e-06, + "loss": 0.0895, + "step": 42591 + }, + { + "epoch": 0.7596760960296793, + "grad_norm": 0.26250532269477844, + "learning_rate": 8.291717039138765e-06, + "loss": 0.0881, + "step": 42592 + }, + { + "epoch": 0.759693932151393, + "grad_norm": 0.26296573877334595, + "learning_rate": 8.29055924198351e-06, + "loss": 0.1197, + "step": 42593 + }, + { + "epoch": 0.7597117682731067, + "grad_norm": 0.2375052124261856, + "learning_rate": 8.289401509599817e-06, + "loss": 0.1153, + "step": 42594 + }, + { + "epoch": 0.7597296043948204, + "grad_norm": 0.24330399930477142, + "learning_rate": 8.288243841992182e-06, + "loss": 0.0803, + "step": 42595 + }, + { + "epoch": 0.7597474405165341, + "grad_norm": 0.3472011387348175, + "learning_rate": 8.287086239165088e-06, + "loss": 0.1405, + "step": 42596 + }, + { + "epoch": 0.7597652766382478, + "grad_norm": 0.2950378358364105, + "learning_rate": 8.28592870112302e-06, + "loss": 0.1107, + "step": 42597 + }, + { + "epoch": 0.7597831127599615, + "grad_norm": 0.3199588656425476, + "learning_rate": 8.284771227870458e-06, + "loss": 0.1313, + "step": 42598 + }, + { + "epoch": 0.7598009488816752, + "grad_norm": 0.31105688214302063, + "learning_rate": 8.283613819411903e-06, + "loss": 0.1575, + "step": 42599 + }, + { + "epoch": 0.7598187850033888, + "grad_norm": 0.30236220359802246, + "learning_rate": 8.282456475751835e-06, + "loss": 0.1463, + "step": 42600 + }, + { + "epoch": 0.7598366211251025, + "grad_norm": 0.30073082447052, + "learning_rate": 8.281299196894737e-06, + "loss": 0.1457, + "step": 42601 + }, + { + "epoch": 0.7598544572468162, + "grad_norm": 0.3153354525566101, + "learning_rate": 8.280141982845093e-06, + "loss": 0.1342, + "step": 42602 + }, + { + "epoch": 0.7598722933685299, + "grad_norm": 0.2673802375793457, + "learning_rate": 8.2789848336074e-06, + "loss": 0.1323, + "step": 42603 + }, + { + "epoch": 0.7598901294902436, + "grad_norm": 0.26933273673057556, + "learning_rate": 8.277827749186137e-06, + "loss": 0.1065, + "step": 42604 + }, + { + "epoch": 0.7599079656119573, + "grad_norm": 0.20391269028186798, + "learning_rate": 8.276670729585793e-06, + "loss": 0.0766, + "step": 42605 + }, + { + "epoch": 0.7599258017336711, + "grad_norm": 0.44093039631843567, + "learning_rate": 8.275513774810837e-06, + "loss": 0.0868, + "step": 42606 + }, + { + "epoch": 0.7599436378553848, + "grad_norm": 0.23069758713245392, + "learning_rate": 8.274356884865775e-06, + "loss": 0.0801, + "step": 42607 + }, + { + "epoch": 0.7599614739770985, + "grad_norm": 0.270152747631073, + "learning_rate": 8.273200059755073e-06, + "loss": 0.1506, + "step": 42608 + }, + { + "epoch": 0.7599793100988121, + "grad_norm": 0.24902886152267456, + "learning_rate": 8.272043299483234e-06, + "loss": 0.1351, + "step": 42609 + }, + { + "epoch": 0.7599971462205258, + "grad_norm": 0.28365853428840637, + "learning_rate": 8.270886604054734e-06, + "loss": 0.1666, + "step": 42610 + }, + { + "epoch": 0.7600149823422395, + "grad_norm": 0.3187510669231415, + "learning_rate": 8.269729973474047e-06, + "loss": 0.1324, + "step": 42611 + }, + { + "epoch": 0.7600328184639532, + "grad_norm": 0.2684469223022461, + "learning_rate": 8.268573407745676e-06, + "loss": 0.1394, + "step": 42612 + }, + { + "epoch": 0.7600506545856669, + "grad_norm": 0.24168717861175537, + "learning_rate": 8.26741690687409e-06, + "loss": 0.1191, + "step": 42613 + }, + { + "epoch": 0.7600684907073806, + "grad_norm": 0.2750702500343323, + "learning_rate": 8.266260470863774e-06, + "loss": 0.1099, + "step": 42614 + }, + { + "epoch": 0.7600863268290943, + "grad_norm": 0.38823944330215454, + "learning_rate": 8.265104099719206e-06, + "loss": 0.0807, + "step": 42615 + }, + { + "epoch": 0.760104162950808, + "grad_norm": 0.28940507769584656, + "learning_rate": 8.263947793444882e-06, + "loss": 0.0924, + "step": 42616 + }, + { + "epoch": 0.7601219990725216, + "grad_norm": 0.3433786928653717, + "learning_rate": 8.262791552045277e-06, + "loss": 0.1597, + "step": 42617 + }, + { + "epoch": 0.7601398351942353, + "grad_norm": 0.2491087168455124, + "learning_rate": 8.261635375524874e-06, + "loss": 0.0832, + "step": 42618 + }, + { + "epoch": 0.760157671315949, + "grad_norm": 0.39373230934143066, + "learning_rate": 8.260479263888143e-06, + "loss": 0.1013, + "step": 42619 + }, + { + "epoch": 0.7601755074376627, + "grad_norm": 0.3260776102542877, + "learning_rate": 8.259323217139586e-06, + "loss": 0.1393, + "step": 42620 + }, + { + "epoch": 0.7601933435593764, + "grad_norm": 0.2427917867898941, + "learning_rate": 8.258167235283665e-06, + "loss": 0.0787, + "step": 42621 + }, + { + "epoch": 0.7602111796810902, + "grad_norm": 0.22220590710639954, + "learning_rate": 8.25701131832488e-06, + "loss": 0.1285, + "step": 42622 + }, + { + "epoch": 0.7602290158028039, + "grad_norm": 0.23178735375404358, + "learning_rate": 8.2558554662677e-06, + "loss": 0.0982, + "step": 42623 + }, + { + "epoch": 0.7602468519245176, + "grad_norm": 0.17036950588226318, + "learning_rate": 8.2546996791166e-06, + "loss": 0.0751, + "step": 42624 + }, + { + "epoch": 0.7602646880462313, + "grad_norm": 0.2511630654335022, + "learning_rate": 8.253543956876077e-06, + "loss": 0.1211, + "step": 42625 + }, + { + "epoch": 0.760282524167945, + "grad_norm": 0.24705013632774353, + "learning_rate": 8.252388299550601e-06, + "loss": 0.1367, + "step": 42626 + }, + { + "epoch": 0.7603003602896586, + "grad_norm": 0.21071964502334595, + "learning_rate": 8.251232707144652e-06, + "loss": 0.1128, + "step": 42627 + }, + { + "epoch": 0.7603181964113723, + "grad_norm": 0.2599203884601593, + "learning_rate": 8.250077179662701e-06, + "loss": 0.1135, + "step": 42628 + }, + { + "epoch": 0.760336032533086, + "grad_norm": 0.3434358537197113, + "learning_rate": 8.248921717109246e-06, + "loss": 0.1294, + "step": 42629 + }, + { + "epoch": 0.7603538686547997, + "grad_norm": 0.34271255135536194, + "learning_rate": 8.247766319488755e-06, + "loss": 0.1077, + "step": 42630 + }, + { + "epoch": 0.7603717047765134, + "grad_norm": 0.31806135177612305, + "learning_rate": 8.246610986805709e-06, + "loss": 0.1504, + "step": 42631 + }, + { + "epoch": 0.7603895408982271, + "grad_norm": 0.20858317613601685, + "learning_rate": 8.245455719064576e-06, + "loss": 0.1361, + "step": 42632 + }, + { + "epoch": 0.7604073770199408, + "grad_norm": 0.22832196950912476, + "learning_rate": 8.244300516269851e-06, + "loss": 0.1089, + "step": 42633 + }, + { + "epoch": 0.7604252131416545, + "grad_norm": 0.261154443025589, + "learning_rate": 8.243145378426003e-06, + "loss": 0.1381, + "step": 42634 + }, + { + "epoch": 0.7604430492633681, + "grad_norm": 0.3797610104084015, + "learning_rate": 8.241990305537506e-06, + "loss": 0.1869, + "step": 42635 + }, + { + "epoch": 0.7604608853850818, + "grad_norm": 0.21285179257392883, + "learning_rate": 8.240835297608851e-06, + "loss": 0.0838, + "step": 42636 + }, + { + "epoch": 0.7604787215067955, + "grad_norm": 0.2694287896156311, + "learning_rate": 8.239680354644496e-06, + "loss": 0.1309, + "step": 42637 + }, + { + "epoch": 0.7604965576285092, + "grad_norm": 0.2810622453689575, + "learning_rate": 8.238525476648939e-06, + "loss": 0.1103, + "step": 42638 + }, + { + "epoch": 0.760514393750223, + "grad_norm": 0.21039675176143646, + "learning_rate": 8.237370663626648e-06, + "loss": 0.0979, + "step": 42639 + }, + { + "epoch": 0.7605322298719367, + "grad_norm": 0.2081712931394577, + "learning_rate": 8.236215915582096e-06, + "loss": 0.0664, + "step": 42640 + }, + { + "epoch": 0.7605500659936504, + "grad_norm": 0.25597190856933594, + "learning_rate": 8.235061232519753e-06, + "loss": 0.1026, + "step": 42641 + }, + { + "epoch": 0.7605679021153641, + "grad_norm": 0.29841238260269165, + "learning_rate": 8.23390661444411e-06, + "loss": 0.1438, + "step": 42642 + }, + { + "epoch": 0.7605857382370778, + "grad_norm": 0.2845439910888672, + "learning_rate": 8.232752061359636e-06, + "loss": 0.0884, + "step": 42643 + }, + { + "epoch": 0.7606035743587914, + "grad_norm": 0.3196823298931122, + "learning_rate": 8.231597573270807e-06, + "loss": 0.1429, + "step": 42644 + }, + { + "epoch": 0.7606214104805051, + "grad_norm": 0.24257633090019226, + "learning_rate": 8.23044315018209e-06, + "loss": 0.1076, + "step": 42645 + }, + { + "epoch": 0.7606392466022188, + "grad_norm": 0.2125617414712906, + "learning_rate": 8.229288792097975e-06, + "loss": 0.0812, + "step": 42646 + }, + { + "epoch": 0.7606570827239325, + "grad_norm": 0.21107307076454163, + "learning_rate": 8.22813449902293e-06, + "loss": 0.13, + "step": 42647 + }, + { + "epoch": 0.7606749188456462, + "grad_norm": 0.25327208638191223, + "learning_rate": 8.226980270961418e-06, + "loss": 0.0698, + "step": 42648 + }, + { + "epoch": 0.7606927549673599, + "grad_norm": 0.29937681555747986, + "learning_rate": 8.225826107917933e-06, + "loss": 0.0705, + "step": 42649 + }, + { + "epoch": 0.7607105910890736, + "grad_norm": 0.21487462520599365, + "learning_rate": 8.224672009896934e-06, + "loss": 0.0646, + "step": 42650 + }, + { + "epoch": 0.7607284272107873, + "grad_norm": 0.3470233380794525, + "learning_rate": 8.223517976902903e-06, + "loss": 0.1011, + "step": 42651 + }, + { + "epoch": 0.760746263332501, + "grad_norm": 0.21214036643505096, + "learning_rate": 8.222364008940313e-06, + "loss": 0.0788, + "step": 42652 + }, + { + "epoch": 0.7607640994542146, + "grad_norm": 0.2770938277244568, + "learning_rate": 8.221210106013638e-06, + "loss": 0.1051, + "step": 42653 + }, + { + "epoch": 0.7607819355759283, + "grad_norm": 0.38843944668769836, + "learning_rate": 8.220056268127338e-06, + "loss": 0.0986, + "step": 42654 + }, + { + "epoch": 0.760799771697642, + "grad_norm": 0.3177003264427185, + "learning_rate": 8.218902495285901e-06, + "loss": 0.1292, + "step": 42655 + }, + { + "epoch": 0.7608176078193558, + "grad_norm": 0.25186216831207275, + "learning_rate": 8.217748787493798e-06, + "loss": 0.1658, + "step": 42656 + }, + { + "epoch": 0.7608354439410695, + "grad_norm": 0.2543962001800537, + "learning_rate": 8.216595144755493e-06, + "loss": 0.144, + "step": 42657 + }, + { + "epoch": 0.7608532800627832, + "grad_norm": 0.27730339765548706, + "learning_rate": 8.215441567075458e-06, + "loss": 0.0867, + "step": 42658 + }, + { + "epoch": 0.7608711161844969, + "grad_norm": 0.4992286264896393, + "learning_rate": 8.214288054458175e-06, + "loss": 0.1924, + "step": 42659 + }, + { + "epoch": 0.7608889523062106, + "grad_norm": 0.2354087382555008, + "learning_rate": 8.213134606908108e-06, + "loss": 0.1111, + "step": 42660 + }, + { + "epoch": 0.7609067884279243, + "grad_norm": 0.2836313843727112, + "learning_rate": 8.21198122442973e-06, + "loss": 0.1041, + "step": 42661 + }, + { + "epoch": 0.7609246245496379, + "grad_norm": 0.3752792775630951, + "learning_rate": 8.210827907027501e-06, + "loss": 0.1949, + "step": 42662 + }, + { + "epoch": 0.7609424606713516, + "grad_norm": 0.2538500428199768, + "learning_rate": 8.209674654705915e-06, + "loss": 0.1046, + "step": 42663 + }, + { + "epoch": 0.7609602967930653, + "grad_norm": 0.24966923892498016, + "learning_rate": 8.208521467469418e-06, + "loss": 0.1145, + "step": 42664 + }, + { + "epoch": 0.760978132914779, + "grad_norm": 0.21328851580619812, + "learning_rate": 8.2073683453225e-06, + "loss": 0.0999, + "step": 42665 + }, + { + "epoch": 0.7609959690364927, + "grad_norm": 0.27110686898231506, + "learning_rate": 8.206215288269622e-06, + "loss": 0.0703, + "step": 42666 + }, + { + "epoch": 0.7610138051582064, + "grad_norm": 0.25685498118400574, + "learning_rate": 8.205062296315244e-06, + "loss": 0.1302, + "step": 42667 + }, + { + "epoch": 0.7610316412799201, + "grad_norm": 0.23116253316402435, + "learning_rate": 8.203909369463855e-06, + "loss": 0.1198, + "step": 42668 + }, + { + "epoch": 0.7610494774016338, + "grad_norm": 0.25200340151786804, + "learning_rate": 8.202756507719916e-06, + "loss": 0.0899, + "step": 42669 + }, + { + "epoch": 0.7610673135233474, + "grad_norm": 0.27751943469047546, + "learning_rate": 8.201603711087894e-06, + "loss": 0.1033, + "step": 42670 + }, + { + "epoch": 0.7610851496450611, + "grad_norm": 0.16676461696624756, + "learning_rate": 8.200450979572247e-06, + "loss": 0.0919, + "step": 42671 + }, + { + "epoch": 0.7611029857667748, + "grad_norm": 0.30492642521858215, + "learning_rate": 8.199298313177465e-06, + "loss": 0.1279, + "step": 42672 + }, + { + "epoch": 0.7611208218884886, + "grad_norm": 0.2170400470495224, + "learning_rate": 8.198145711908006e-06, + "loss": 0.1106, + "step": 42673 + }, + { + "epoch": 0.7611386580102023, + "grad_norm": 0.24292133748531342, + "learning_rate": 8.196993175768336e-06, + "loss": 0.1041, + "step": 42674 + }, + { + "epoch": 0.761156494131916, + "grad_norm": 0.4326706826686859, + "learning_rate": 8.195840704762925e-06, + "loss": 0.1476, + "step": 42675 + }, + { + "epoch": 0.7611743302536297, + "grad_norm": 0.21665386855602264, + "learning_rate": 8.19468829889623e-06, + "loss": 0.1008, + "step": 42676 + }, + { + "epoch": 0.7611921663753434, + "grad_norm": 0.23884227871894836, + "learning_rate": 8.19353595817273e-06, + "loss": 0.1012, + "step": 42677 + }, + { + "epoch": 0.7612100024970571, + "grad_norm": 0.306518018245697, + "learning_rate": 8.192383682596894e-06, + "loss": 0.1448, + "step": 42678 + }, + { + "epoch": 0.7612278386187707, + "grad_norm": 0.22884008288383484, + "learning_rate": 8.191231472173185e-06, + "loss": 0.0715, + "step": 42679 + }, + { + "epoch": 0.7612456747404844, + "grad_norm": 0.2654304504394531, + "learning_rate": 8.190079326906061e-06, + "loss": 0.1754, + "step": 42680 + }, + { + "epoch": 0.7612635108621981, + "grad_norm": 0.3729139268398285, + "learning_rate": 8.188927246800004e-06, + "loss": 0.1211, + "step": 42681 + }, + { + "epoch": 0.7612813469839118, + "grad_norm": 0.25649964809417725, + "learning_rate": 8.18777523185947e-06, + "loss": 0.1104, + "step": 42682 + }, + { + "epoch": 0.7612991831056255, + "grad_norm": 0.25164562463760376, + "learning_rate": 8.18662328208893e-06, + "loss": 0.1542, + "step": 42683 + }, + { + "epoch": 0.7613170192273392, + "grad_norm": 0.22828686237335205, + "learning_rate": 8.185471397492834e-06, + "loss": 0.1112, + "step": 42684 + }, + { + "epoch": 0.7613348553490529, + "grad_norm": 0.2508590519428253, + "learning_rate": 8.184319578075664e-06, + "loss": 0.0943, + "step": 42685 + }, + { + "epoch": 0.7613526914707666, + "grad_norm": 0.2704280912876129, + "learning_rate": 8.183167823841883e-06, + "loss": 0.1374, + "step": 42686 + }, + { + "epoch": 0.7613705275924803, + "grad_norm": 0.3528907001018524, + "learning_rate": 8.18201613479595e-06, + "loss": 0.1435, + "step": 42687 + }, + { + "epoch": 0.7613883637141939, + "grad_norm": 0.20950253307819366, + "learning_rate": 8.180864510942332e-06, + "loss": 0.0956, + "step": 42688 + }, + { + "epoch": 0.7614061998359076, + "grad_norm": 0.24641311168670654, + "learning_rate": 8.179712952285485e-06, + "loss": 0.111, + "step": 42689 + }, + { + "epoch": 0.7614240359576214, + "grad_norm": 0.22809140384197235, + "learning_rate": 8.178561458829887e-06, + "loss": 0.1026, + "step": 42690 + }, + { + "epoch": 0.7614418720793351, + "grad_norm": 0.3638050854206085, + "learning_rate": 8.177410030579988e-06, + "loss": 0.1389, + "step": 42691 + }, + { + "epoch": 0.7614597082010488, + "grad_norm": 0.34730955958366394, + "learning_rate": 8.176258667540266e-06, + "loss": 0.1063, + "step": 42692 + }, + { + "epoch": 0.7614775443227625, + "grad_norm": 0.21242783963680267, + "learning_rate": 8.175107369715165e-06, + "loss": 0.1473, + "step": 42693 + }, + { + "epoch": 0.7614953804444762, + "grad_norm": 0.1886938512325287, + "learning_rate": 8.17395613710917e-06, + "loss": 0.1076, + "step": 42694 + }, + { + "epoch": 0.7615132165661899, + "grad_norm": 0.25451093912124634, + "learning_rate": 8.172804969726734e-06, + "loss": 0.1049, + "step": 42695 + }, + { + "epoch": 0.7615310526879036, + "grad_norm": 0.33973121643066406, + "learning_rate": 8.171653867572312e-06, + "loss": 0.0898, + "step": 42696 + }, + { + "epoch": 0.7615488888096172, + "grad_norm": 0.2229091078042984, + "learning_rate": 8.17050283065037e-06, + "loss": 0.1153, + "step": 42697 + }, + { + "epoch": 0.7615667249313309, + "grad_norm": 0.3007701337337494, + "learning_rate": 8.169351858965376e-06, + "loss": 0.1322, + "step": 42698 + }, + { + "epoch": 0.7615845610530446, + "grad_norm": 0.2611568570137024, + "learning_rate": 8.16820095252179e-06, + "loss": 0.0989, + "step": 42699 + }, + { + "epoch": 0.7616023971747583, + "grad_norm": 0.3249962627887726, + "learning_rate": 8.167050111324068e-06, + "loss": 0.0918, + "step": 42700 + }, + { + "epoch": 0.761620233296472, + "grad_norm": 0.2891022264957428, + "learning_rate": 8.165899335376676e-06, + "loss": 0.1012, + "step": 42701 + }, + { + "epoch": 0.7616380694181857, + "grad_norm": 0.3628145456314087, + "learning_rate": 8.164748624684062e-06, + "loss": 0.1013, + "step": 42702 + }, + { + "epoch": 0.7616559055398994, + "grad_norm": 0.24708378314971924, + "learning_rate": 8.163597979250707e-06, + "loss": 0.1523, + "step": 42703 + }, + { + "epoch": 0.7616737416616131, + "grad_norm": 0.20327432453632355, + "learning_rate": 8.162447399081052e-06, + "loss": 0.1012, + "step": 42704 + }, + { + "epoch": 0.7616915777833267, + "grad_norm": 0.41453808546066284, + "learning_rate": 8.161296884179575e-06, + "loss": 0.1305, + "step": 42705 + }, + { + "epoch": 0.7617094139050404, + "grad_norm": 0.3436262011528015, + "learning_rate": 8.16014643455072e-06, + "loss": 0.1722, + "step": 42706 + }, + { + "epoch": 0.7617272500267542, + "grad_norm": 0.2809482514858246, + "learning_rate": 8.15899605019896e-06, + "loss": 0.1181, + "step": 42707 + }, + { + "epoch": 0.7617450861484679, + "grad_norm": 0.19482646882534027, + "learning_rate": 8.15784573112875e-06, + "loss": 0.0882, + "step": 42708 + }, + { + "epoch": 0.7617629222701816, + "grad_norm": 0.3058270215988159, + "learning_rate": 8.156695477344547e-06, + "loss": 0.1295, + "step": 42709 + }, + { + "epoch": 0.7617807583918953, + "grad_norm": 0.24580976366996765, + "learning_rate": 8.1555452888508e-06, + "loss": 0.1282, + "step": 42710 + }, + { + "epoch": 0.761798594513609, + "grad_norm": 0.297408789396286, + "learning_rate": 8.154395165651988e-06, + "loss": 0.1302, + "step": 42711 + }, + { + "epoch": 0.7618164306353227, + "grad_norm": 0.24513469636440277, + "learning_rate": 8.153245107752555e-06, + "loss": 0.1251, + "step": 42712 + }, + { + "epoch": 0.7618342667570364, + "grad_norm": 0.2538241446018219, + "learning_rate": 8.152095115156966e-06, + "loss": 0.1191, + "step": 42713 + }, + { + "epoch": 0.76185210287875, + "grad_norm": 0.23211342096328735, + "learning_rate": 8.150945187869675e-06, + "loss": 0.1265, + "step": 42714 + }, + { + "epoch": 0.7618699390004637, + "grad_norm": 0.25968465209007263, + "learning_rate": 8.14979532589513e-06, + "loss": 0.0955, + "step": 42715 + }, + { + "epoch": 0.7618877751221774, + "grad_norm": 0.31376928091049194, + "learning_rate": 8.148645529237805e-06, + "loss": 0.0928, + "step": 42716 + }, + { + "epoch": 0.7619056112438911, + "grad_norm": 0.21094666421413422, + "learning_rate": 8.147495797902156e-06, + "loss": 0.1247, + "step": 42717 + }, + { + "epoch": 0.7619234473656048, + "grad_norm": 0.3071964383125305, + "learning_rate": 8.146346131892621e-06, + "loss": 0.0891, + "step": 42718 + }, + { + "epoch": 0.7619412834873185, + "grad_norm": 0.3281419575214386, + "learning_rate": 8.145196531213677e-06, + "loss": 0.1041, + "step": 42719 + }, + { + "epoch": 0.7619591196090322, + "grad_norm": 0.19709442555904388, + "learning_rate": 8.144046995869766e-06, + "loss": 0.1135, + "step": 42720 + }, + { + "epoch": 0.7619769557307459, + "grad_norm": 0.2718742787837982, + "learning_rate": 8.142897525865362e-06, + "loss": 0.1163, + "step": 42721 + }, + { + "epoch": 0.7619947918524596, + "grad_norm": 0.23442450165748596, + "learning_rate": 8.141748121204906e-06, + "loss": 0.0625, + "step": 42722 + }, + { + "epoch": 0.7620126279741734, + "grad_norm": 0.27153611183166504, + "learning_rate": 8.14059878189285e-06, + "loss": 0.0939, + "step": 42723 + }, + { + "epoch": 0.762030464095887, + "grad_norm": 0.27912840247154236, + "learning_rate": 8.139449507933664e-06, + "loss": 0.1233, + "step": 42724 + }, + { + "epoch": 0.7620483002176007, + "grad_norm": 0.34461748600006104, + "learning_rate": 8.138300299331794e-06, + "loss": 0.1356, + "step": 42725 + }, + { + "epoch": 0.7620661363393144, + "grad_norm": 0.24578818678855896, + "learning_rate": 8.137151156091696e-06, + "loss": 0.087, + "step": 42726 + }, + { + "epoch": 0.7620839724610281, + "grad_norm": 0.2059352695941925, + "learning_rate": 8.136002078217825e-06, + "loss": 0.1016, + "step": 42727 + }, + { + "epoch": 0.7621018085827418, + "grad_norm": 0.2727588713169098, + "learning_rate": 8.134853065714626e-06, + "loss": 0.1129, + "step": 42728 + }, + { + "epoch": 0.7621196447044555, + "grad_norm": 0.30707958340644836, + "learning_rate": 8.133704118586572e-06, + "loss": 0.16, + "step": 42729 + }, + { + "epoch": 0.7621374808261692, + "grad_norm": 0.23666512966156006, + "learning_rate": 8.132555236838104e-06, + "loss": 0.1283, + "step": 42730 + }, + { + "epoch": 0.7621553169478829, + "grad_norm": 0.2750038206577301, + "learning_rate": 8.13140642047367e-06, + "loss": 0.1299, + "step": 42731 + }, + { + "epoch": 0.7621731530695965, + "grad_norm": 0.2746490240097046, + "learning_rate": 8.13025766949774e-06, + "loss": 0.1268, + "step": 42732 + }, + { + "epoch": 0.7621909891913102, + "grad_norm": 0.2559909522533417, + "learning_rate": 8.129108983914749e-06, + "loss": 0.0939, + "step": 42733 + }, + { + "epoch": 0.7622088253130239, + "grad_norm": 0.3153408467769623, + "learning_rate": 8.127960363729164e-06, + "loss": 0.0861, + "step": 42734 + }, + { + "epoch": 0.7622266614347376, + "grad_norm": 0.2540927827358246, + "learning_rate": 8.126811808945436e-06, + "loss": 0.1087, + "step": 42735 + }, + { + "epoch": 0.7622444975564513, + "grad_norm": 0.2728038728237152, + "learning_rate": 8.125663319568002e-06, + "loss": 0.0803, + "step": 42736 + }, + { + "epoch": 0.762262333678165, + "grad_norm": 0.36184045672416687, + "learning_rate": 8.124514895601337e-06, + "loss": 0.1315, + "step": 42737 + }, + { + "epoch": 0.7622801697998787, + "grad_norm": 0.2298123687505722, + "learning_rate": 8.12336653704988e-06, + "loss": 0.0661, + "step": 42738 + }, + { + "epoch": 0.7622980059215924, + "grad_norm": 0.40313494205474854, + "learning_rate": 8.122218243918081e-06, + "loss": 0.1701, + "step": 42739 + }, + { + "epoch": 0.7623158420433062, + "grad_norm": 0.25307127833366394, + "learning_rate": 8.121070016210393e-06, + "loss": 0.1273, + "step": 42740 + }, + { + "epoch": 0.7623336781650198, + "grad_norm": 0.21520350873470306, + "learning_rate": 8.119921853931264e-06, + "loss": 0.1159, + "step": 42741 + }, + { + "epoch": 0.7623515142867335, + "grad_norm": 0.279513418674469, + "learning_rate": 8.118773757085152e-06, + "loss": 0.1358, + "step": 42742 + }, + { + "epoch": 0.7623693504084472, + "grad_norm": 0.3304174244403839, + "learning_rate": 8.117625725676506e-06, + "loss": 0.0917, + "step": 42743 + }, + { + "epoch": 0.7623871865301609, + "grad_norm": 0.2964650094509125, + "learning_rate": 8.116477759709773e-06, + "loss": 0.1097, + "step": 42744 + }, + { + "epoch": 0.7624050226518746, + "grad_norm": 0.27967938780784607, + "learning_rate": 8.115329859189394e-06, + "loss": 0.1357, + "step": 42745 + }, + { + "epoch": 0.7624228587735883, + "grad_norm": 0.2825847268104553, + "learning_rate": 8.114182024119838e-06, + "loss": 0.1152, + "step": 42746 + }, + { + "epoch": 0.762440694895302, + "grad_norm": 0.3004486560821533, + "learning_rate": 8.113034254505536e-06, + "loss": 0.1234, + "step": 42747 + }, + { + "epoch": 0.7624585310170157, + "grad_norm": 0.2610960900783539, + "learning_rate": 8.111886550350953e-06, + "loss": 0.0939, + "step": 42748 + }, + { + "epoch": 0.7624763671387293, + "grad_norm": 0.2529049813747406, + "learning_rate": 8.110738911660523e-06, + "loss": 0.138, + "step": 42749 + }, + { + "epoch": 0.762494203260443, + "grad_norm": 0.25084343552589417, + "learning_rate": 8.10959133843871e-06, + "loss": 0.1115, + "step": 42750 + }, + { + "epoch": 0.7625120393821567, + "grad_norm": 0.5162140727043152, + "learning_rate": 8.108443830689958e-06, + "loss": 0.1066, + "step": 42751 + }, + { + "epoch": 0.7625298755038704, + "grad_norm": 0.22267527878284454, + "learning_rate": 8.107296388418708e-06, + "loss": 0.0996, + "step": 42752 + }, + { + "epoch": 0.7625477116255841, + "grad_norm": 0.3033842444419861, + "learning_rate": 8.106149011629413e-06, + "loss": 0.1101, + "step": 42753 + }, + { + "epoch": 0.7625655477472978, + "grad_norm": 0.3041439652442932, + "learning_rate": 8.105001700326511e-06, + "loss": 0.1117, + "step": 42754 + }, + { + "epoch": 0.7625833838690115, + "grad_norm": 0.2612026333808899, + "learning_rate": 8.103854454514467e-06, + "loss": 0.1062, + "step": 42755 + }, + { + "epoch": 0.7626012199907252, + "grad_norm": 0.27075478434562683, + "learning_rate": 8.102707274197718e-06, + "loss": 0.1433, + "step": 42756 + }, + { + "epoch": 0.762619056112439, + "grad_norm": 0.3153226375579834, + "learning_rate": 8.10156015938071e-06, + "loss": 0.1141, + "step": 42757 + }, + { + "epoch": 0.7626368922341527, + "grad_norm": 0.21550028026103973, + "learning_rate": 8.100413110067886e-06, + "loss": 0.1039, + "step": 42758 + }, + { + "epoch": 0.7626547283558663, + "grad_norm": 0.29533568024635315, + "learning_rate": 8.099266126263704e-06, + "loss": 0.1636, + "step": 42759 + }, + { + "epoch": 0.76267256447758, + "grad_norm": 0.2324717938899994, + "learning_rate": 8.098119207972599e-06, + "loss": 0.1362, + "step": 42760 + }, + { + "epoch": 0.7626904005992937, + "grad_norm": 0.19602574408054352, + "learning_rate": 8.096972355199029e-06, + "loss": 0.0664, + "step": 42761 + }, + { + "epoch": 0.7627082367210074, + "grad_norm": 0.19509239494800568, + "learning_rate": 8.09582556794742e-06, + "loss": 0.1006, + "step": 42762 + }, + { + "epoch": 0.7627260728427211, + "grad_norm": 0.2777608036994934, + "learning_rate": 8.094678846222242e-06, + "loss": 0.1872, + "step": 42763 + }, + { + "epoch": 0.7627439089644348, + "grad_norm": 0.26951149106025696, + "learning_rate": 8.093532190027926e-06, + "loss": 0.143, + "step": 42764 + }, + { + "epoch": 0.7627617450861485, + "grad_norm": 0.2789444923400879, + "learning_rate": 8.092385599368917e-06, + "loss": 0.1058, + "step": 42765 + }, + { + "epoch": 0.7627795812078622, + "grad_norm": 0.25518038868904114, + "learning_rate": 8.091239074249663e-06, + "loss": 0.1114, + "step": 42766 + }, + { + "epoch": 0.7627974173295758, + "grad_norm": 0.2865860164165497, + "learning_rate": 8.090092614674599e-06, + "loss": 0.1407, + "step": 42767 + }, + { + "epoch": 0.7628152534512895, + "grad_norm": 0.30358266830444336, + "learning_rate": 8.088946220648184e-06, + "loss": 0.099, + "step": 42768 + }, + { + "epoch": 0.7628330895730032, + "grad_norm": 0.24380910396575928, + "learning_rate": 8.087799892174853e-06, + "loss": 0.1302, + "step": 42769 + }, + { + "epoch": 0.7628509256947169, + "grad_norm": 0.3392241597175598, + "learning_rate": 8.086653629259053e-06, + "loss": 0.1196, + "step": 42770 + }, + { + "epoch": 0.7628687618164306, + "grad_norm": 0.19452714920043945, + "learning_rate": 8.085507431905215e-06, + "loss": 0.0769, + "step": 42771 + }, + { + "epoch": 0.7628865979381443, + "grad_norm": 0.32577091455459595, + "learning_rate": 8.084361300117802e-06, + "loss": 0.1599, + "step": 42772 + }, + { + "epoch": 0.762904434059858, + "grad_norm": 0.2734892666339874, + "learning_rate": 8.083215233901245e-06, + "loss": 0.1136, + "step": 42773 + }, + { + "epoch": 0.7629222701815718, + "grad_norm": 0.3848903179168701, + "learning_rate": 8.082069233259984e-06, + "loss": 0.1064, + "step": 42774 + }, + { + "epoch": 0.7629401063032855, + "grad_norm": 0.2228815108537674, + "learning_rate": 8.080923298198461e-06, + "loss": 0.1149, + "step": 42775 + }, + { + "epoch": 0.7629579424249991, + "grad_norm": 0.40106096863746643, + "learning_rate": 8.079777428721136e-06, + "loss": 0.1379, + "step": 42776 + }, + { + "epoch": 0.7629757785467128, + "grad_norm": 0.25054147839546204, + "learning_rate": 8.078631624832436e-06, + "loss": 0.1448, + "step": 42777 + }, + { + "epoch": 0.7629936146684265, + "grad_norm": 0.22279147803783417, + "learning_rate": 8.077485886536803e-06, + "loss": 0.0845, + "step": 42778 + }, + { + "epoch": 0.7630114507901402, + "grad_norm": 0.25205790996551514, + "learning_rate": 8.076340213838682e-06, + "loss": 0.1062, + "step": 42779 + }, + { + "epoch": 0.7630292869118539, + "grad_norm": 0.33514559268951416, + "learning_rate": 8.075194606742501e-06, + "loss": 0.1679, + "step": 42780 + }, + { + "epoch": 0.7630471230335676, + "grad_norm": 0.24219800531864166, + "learning_rate": 8.07404906525272e-06, + "loss": 0.1025, + "step": 42781 + }, + { + "epoch": 0.7630649591552813, + "grad_norm": 0.25891268253326416, + "learning_rate": 8.072903589373773e-06, + "loss": 0.1458, + "step": 42782 + }, + { + "epoch": 0.763082795276995, + "grad_norm": 0.3182671368122101, + "learning_rate": 8.071758179110095e-06, + "loss": 0.1268, + "step": 42783 + }, + { + "epoch": 0.7631006313987087, + "grad_norm": 0.328510046005249, + "learning_rate": 8.070612834466124e-06, + "loss": 0.1326, + "step": 42784 + }, + { + "epoch": 0.7631184675204223, + "grad_norm": 0.301238477230072, + "learning_rate": 8.069467555446312e-06, + "loss": 0.1477, + "step": 42785 + }, + { + "epoch": 0.763136303642136, + "grad_norm": 0.24899804592132568, + "learning_rate": 8.06832234205509e-06, + "loss": 0.1145, + "step": 42786 + }, + { + "epoch": 0.7631541397638497, + "grad_norm": 0.27358248829841614, + "learning_rate": 8.06717719429689e-06, + "loss": 0.1235, + "step": 42787 + }, + { + "epoch": 0.7631719758855634, + "grad_norm": 0.33678731322288513, + "learning_rate": 8.06603211217617e-06, + "loss": 0.1246, + "step": 42788 + }, + { + "epoch": 0.7631898120072771, + "grad_norm": 0.2629486620426178, + "learning_rate": 8.06488709569735e-06, + "loss": 0.1393, + "step": 42789 + }, + { + "epoch": 0.7632076481289908, + "grad_norm": 0.2906598448753357, + "learning_rate": 8.063742144864882e-06, + "loss": 0.1418, + "step": 42790 + }, + { + "epoch": 0.7632254842507046, + "grad_norm": 0.2102222889661789, + "learning_rate": 8.062597259683202e-06, + "loss": 0.0897, + "step": 42791 + }, + { + "epoch": 0.7632433203724183, + "grad_norm": 0.2572406530380249, + "learning_rate": 8.061452440156744e-06, + "loss": 0.1587, + "step": 42792 + }, + { + "epoch": 0.763261156494132, + "grad_norm": 0.3155263662338257, + "learning_rate": 8.060307686289938e-06, + "loss": 0.1557, + "step": 42793 + }, + { + "epoch": 0.7632789926158456, + "grad_norm": 0.27527886629104614, + "learning_rate": 8.059162998087242e-06, + "loss": 0.1237, + "step": 42794 + }, + { + "epoch": 0.7632968287375593, + "grad_norm": 0.30883437395095825, + "learning_rate": 8.058018375553078e-06, + "loss": 0.1621, + "step": 42795 + }, + { + "epoch": 0.763314664859273, + "grad_norm": 0.2987239956855774, + "learning_rate": 8.056873818691884e-06, + "loss": 0.1348, + "step": 42796 + }, + { + "epoch": 0.7633325009809867, + "grad_norm": 0.27694231271743774, + "learning_rate": 8.055729327508097e-06, + "loss": 0.1293, + "step": 42797 + }, + { + "epoch": 0.7633503371027004, + "grad_norm": 0.26061302423477173, + "learning_rate": 8.05458490200616e-06, + "loss": 0.1288, + "step": 42798 + }, + { + "epoch": 0.7633681732244141, + "grad_norm": 0.3165815472602844, + "learning_rate": 8.053440542190505e-06, + "loss": 0.1044, + "step": 42799 + }, + { + "epoch": 0.7633860093461278, + "grad_norm": 0.3112483620643616, + "learning_rate": 8.052296248065565e-06, + "loss": 0.1471, + "step": 42800 + }, + { + "epoch": 0.7634038454678415, + "grad_norm": 0.2951695919036865, + "learning_rate": 8.051152019635774e-06, + "loss": 0.14, + "step": 42801 + }, + { + "epoch": 0.7634216815895551, + "grad_norm": 0.2529488503932953, + "learning_rate": 8.050007856905572e-06, + "loss": 0.1328, + "step": 42802 + }, + { + "epoch": 0.7634395177112688, + "grad_norm": 0.22331856191158295, + "learning_rate": 8.048863759879399e-06, + "loss": 0.1156, + "step": 42803 + }, + { + "epoch": 0.7634573538329825, + "grad_norm": 0.3526498079299927, + "learning_rate": 8.047719728561687e-06, + "loss": 0.0861, + "step": 42804 + }, + { + "epoch": 0.7634751899546962, + "grad_norm": 0.21404924988746643, + "learning_rate": 8.046575762956867e-06, + "loss": 0.0821, + "step": 42805 + }, + { + "epoch": 0.7634930260764099, + "grad_norm": 0.3276389241218567, + "learning_rate": 8.045431863069369e-06, + "loss": 0.1535, + "step": 42806 + }, + { + "epoch": 0.7635108621981236, + "grad_norm": 0.25100985169410706, + "learning_rate": 8.04428802890364e-06, + "loss": 0.1148, + "step": 42807 + }, + { + "epoch": 0.7635286983198374, + "grad_norm": 0.26612594723701477, + "learning_rate": 8.043144260464105e-06, + "loss": 0.1275, + "step": 42808 + }, + { + "epoch": 0.7635465344415511, + "grad_norm": 0.24870416522026062, + "learning_rate": 8.042000557755202e-06, + "loss": 0.1355, + "step": 42809 + }, + { + "epoch": 0.7635643705632648, + "grad_norm": 0.24198852479457855, + "learning_rate": 8.040856920781354e-06, + "loss": 0.1234, + "step": 42810 + }, + { + "epoch": 0.7635822066849784, + "grad_norm": 0.2864730954170227, + "learning_rate": 8.03971334954701e-06, + "loss": 0.1402, + "step": 42811 + }, + { + "epoch": 0.7636000428066921, + "grad_norm": 0.2514137923717499, + "learning_rate": 8.038569844056592e-06, + "loss": 0.1485, + "step": 42812 + }, + { + "epoch": 0.7636178789284058, + "grad_norm": 0.222085103392601, + "learning_rate": 8.037426404314538e-06, + "loss": 0.0942, + "step": 42813 + }, + { + "epoch": 0.7636357150501195, + "grad_norm": 0.3161686956882477, + "learning_rate": 8.03628303032527e-06, + "loss": 0.0805, + "step": 42814 + }, + { + "epoch": 0.7636535511718332, + "grad_norm": 0.2392972707748413, + "learning_rate": 8.035139722093235e-06, + "loss": 0.145, + "step": 42815 + }, + { + "epoch": 0.7636713872935469, + "grad_norm": 0.4221689701080322, + "learning_rate": 8.033996479622852e-06, + "loss": 0.1521, + "step": 42816 + }, + { + "epoch": 0.7636892234152606, + "grad_norm": 0.2915332615375519, + "learning_rate": 8.032853302918564e-06, + "loss": 0.0973, + "step": 42817 + }, + { + "epoch": 0.7637070595369743, + "grad_norm": 0.27898216247558594, + "learning_rate": 8.031710191984799e-06, + "loss": 0.1012, + "step": 42818 + }, + { + "epoch": 0.763724895658688, + "grad_norm": 0.2774139642715454, + "learning_rate": 8.030567146825976e-06, + "loss": 0.0539, + "step": 42819 + }, + { + "epoch": 0.7637427317804016, + "grad_norm": 0.20346693694591522, + "learning_rate": 8.029424167446545e-06, + "loss": 0.1282, + "step": 42820 + }, + { + "epoch": 0.7637605679021153, + "grad_norm": 0.27988263964653015, + "learning_rate": 8.028281253850928e-06, + "loss": 0.0787, + "step": 42821 + }, + { + "epoch": 0.763778404023829, + "grad_norm": 0.27412253618240356, + "learning_rate": 8.027138406043555e-06, + "loss": 0.1542, + "step": 42822 + }, + { + "epoch": 0.7637962401455427, + "grad_norm": 0.264974445104599, + "learning_rate": 8.025995624028846e-06, + "loss": 0.0777, + "step": 42823 + }, + { + "epoch": 0.7638140762672565, + "grad_norm": 0.27311447262763977, + "learning_rate": 8.024852907811248e-06, + "loss": 0.1348, + "step": 42824 + }, + { + "epoch": 0.7638319123889702, + "grad_norm": 0.386681467294693, + "learning_rate": 8.023710257395184e-06, + "loss": 0.1507, + "step": 42825 + }, + { + "epoch": 0.7638497485106839, + "grad_norm": 0.26128536462783813, + "learning_rate": 8.022567672785083e-06, + "loss": 0.1006, + "step": 42826 + }, + { + "epoch": 0.7638675846323976, + "grad_norm": 0.3179316818714142, + "learning_rate": 8.021425153985364e-06, + "loss": 0.0829, + "step": 42827 + }, + { + "epoch": 0.7638854207541113, + "grad_norm": 0.2926112115383148, + "learning_rate": 8.020282701000479e-06, + "loss": 0.1371, + "step": 42828 + }, + { + "epoch": 0.7639032568758249, + "grad_norm": 0.2479543387889862, + "learning_rate": 8.019140313834837e-06, + "loss": 0.1421, + "step": 42829 + }, + { + "epoch": 0.7639210929975386, + "grad_norm": 0.2865014374256134, + "learning_rate": 8.017997992492865e-06, + "loss": 0.0943, + "step": 42830 + }, + { + "epoch": 0.7639389291192523, + "grad_norm": 0.3328753411769867, + "learning_rate": 8.016855736979006e-06, + "loss": 0.146, + "step": 42831 + }, + { + "epoch": 0.763956765240966, + "grad_norm": 0.2740744650363922, + "learning_rate": 8.015713547297673e-06, + "loss": 0.0859, + "step": 42832 + }, + { + "epoch": 0.7639746013626797, + "grad_norm": 0.340349018573761, + "learning_rate": 8.01457142345331e-06, + "loss": 0.1233, + "step": 42833 + }, + { + "epoch": 0.7639924374843934, + "grad_norm": 0.27598676085472107, + "learning_rate": 8.013429365450332e-06, + "loss": 0.0671, + "step": 42834 + }, + { + "epoch": 0.7640102736061071, + "grad_norm": 0.2351517230272293, + "learning_rate": 8.01228737329317e-06, + "loss": 0.0945, + "step": 42835 + }, + { + "epoch": 0.7640281097278208, + "grad_norm": 0.29766207933425903, + "learning_rate": 8.01114544698624e-06, + "loss": 0.0734, + "step": 42836 + }, + { + "epoch": 0.7640459458495344, + "grad_norm": 0.26338836550712585, + "learning_rate": 8.010003586533988e-06, + "loss": 0.124, + "step": 42837 + }, + { + "epoch": 0.7640637819712481, + "grad_norm": 0.3375798463821411, + "learning_rate": 8.00886179194083e-06, + "loss": 0.1097, + "step": 42838 + }, + { + "epoch": 0.7640816180929618, + "grad_norm": 0.25835952162742615, + "learning_rate": 8.007720063211192e-06, + "loss": 0.1192, + "step": 42839 + }, + { + "epoch": 0.7640994542146755, + "grad_norm": 0.3264369070529938, + "learning_rate": 8.006578400349491e-06, + "loss": 0.1294, + "step": 42840 + }, + { + "epoch": 0.7641172903363893, + "grad_norm": 0.25550565123558044, + "learning_rate": 8.005436803360169e-06, + "loss": 0.1573, + "step": 42841 + }, + { + "epoch": 0.764135126458103, + "grad_norm": 0.4731195569038391, + "learning_rate": 8.004295272247647e-06, + "loss": 0.1315, + "step": 42842 + }, + { + "epoch": 0.7641529625798167, + "grad_norm": 0.2981793284416199, + "learning_rate": 8.003153807016337e-06, + "loss": 0.1418, + "step": 42843 + }, + { + "epoch": 0.7641707987015304, + "grad_norm": 0.2584768533706665, + "learning_rate": 8.002012407670681e-06, + "loss": 0.1093, + "step": 42844 + }, + { + "epoch": 0.7641886348232441, + "grad_norm": 0.29052045941352844, + "learning_rate": 8.000871074215086e-06, + "loss": 0.1246, + "step": 42845 + }, + { + "epoch": 0.7642064709449577, + "grad_norm": 0.2955264151096344, + "learning_rate": 7.999729806653997e-06, + "loss": 0.1378, + "step": 42846 + }, + { + "epoch": 0.7642243070666714, + "grad_norm": 0.307647705078125, + "learning_rate": 7.998588604991824e-06, + "loss": 0.1411, + "step": 42847 + }, + { + "epoch": 0.7642421431883851, + "grad_norm": 0.27249661087989807, + "learning_rate": 7.997447469232996e-06, + "loss": 0.0884, + "step": 42848 + }, + { + "epoch": 0.7642599793100988, + "grad_norm": 0.33691415190696716, + "learning_rate": 7.996306399381925e-06, + "loss": 0.1648, + "step": 42849 + }, + { + "epoch": 0.7642778154318125, + "grad_norm": 0.3107374906539917, + "learning_rate": 7.995165395443053e-06, + "loss": 0.1277, + "step": 42850 + }, + { + "epoch": 0.7642956515535262, + "grad_norm": 0.2589839696884155, + "learning_rate": 7.994024457420793e-06, + "loss": 0.0912, + "step": 42851 + }, + { + "epoch": 0.7643134876752399, + "grad_norm": 0.21271340548992157, + "learning_rate": 7.992883585319566e-06, + "loss": 0.1139, + "step": 42852 + }, + { + "epoch": 0.7643313237969536, + "grad_norm": 0.3283775746822357, + "learning_rate": 7.99174277914379e-06, + "loss": 0.1249, + "step": 42853 + }, + { + "epoch": 0.7643491599186673, + "grad_norm": 0.2842460572719574, + "learning_rate": 7.9906020388979e-06, + "loss": 0.1258, + "step": 42854 + }, + { + "epoch": 0.7643669960403809, + "grad_norm": 0.30483558773994446, + "learning_rate": 7.989461364586311e-06, + "loss": 0.2016, + "step": 42855 + }, + { + "epoch": 0.7643848321620946, + "grad_norm": 0.22662563621997833, + "learning_rate": 7.988320756213447e-06, + "loss": 0.1093, + "step": 42856 + }, + { + "epoch": 0.7644026682838083, + "grad_norm": 0.31030842661857605, + "learning_rate": 7.987180213783719e-06, + "loss": 0.1261, + "step": 42857 + }, + { + "epoch": 0.7644205044055221, + "grad_norm": 0.30658525228500366, + "learning_rate": 7.986039737301555e-06, + "loss": 0.1312, + "step": 42858 + }, + { + "epoch": 0.7644383405272358, + "grad_norm": 0.25996431708335876, + "learning_rate": 7.984899326771387e-06, + "loss": 0.0842, + "step": 42859 + }, + { + "epoch": 0.7644561766489495, + "grad_norm": 0.2168186753988266, + "learning_rate": 7.983758982197625e-06, + "loss": 0.1198, + "step": 42860 + }, + { + "epoch": 0.7644740127706632, + "grad_norm": 0.2856423258781433, + "learning_rate": 7.98261870358469e-06, + "loss": 0.0944, + "step": 42861 + }, + { + "epoch": 0.7644918488923769, + "grad_norm": 0.35936540365219116, + "learning_rate": 7.981478490936997e-06, + "loss": 0.1403, + "step": 42862 + }, + { + "epoch": 0.7645096850140906, + "grad_norm": 0.22587734460830688, + "learning_rate": 7.980338344258978e-06, + "loss": 0.1109, + "step": 42863 + }, + { + "epoch": 0.7645275211358042, + "grad_norm": 0.2973916828632355, + "learning_rate": 7.979198263555043e-06, + "loss": 0.1785, + "step": 42864 + }, + { + "epoch": 0.7645453572575179, + "grad_norm": 0.2513025104999542, + "learning_rate": 7.978058248829617e-06, + "loss": 0.1101, + "step": 42865 + }, + { + "epoch": 0.7645631933792316, + "grad_norm": 0.38973304629325867, + "learning_rate": 7.976918300087111e-06, + "loss": 0.117, + "step": 42866 + }, + { + "epoch": 0.7645810295009453, + "grad_norm": 0.3005039393901825, + "learning_rate": 7.975778417331953e-06, + "loss": 0.096, + "step": 42867 + }, + { + "epoch": 0.764598865622659, + "grad_norm": 0.27728769183158875, + "learning_rate": 7.974638600568559e-06, + "loss": 0.1327, + "step": 42868 + }, + { + "epoch": 0.7646167017443727, + "grad_norm": 0.28186824917793274, + "learning_rate": 7.973498849801348e-06, + "loss": 0.0921, + "step": 42869 + }, + { + "epoch": 0.7646345378660864, + "grad_norm": 0.3413606584072113, + "learning_rate": 7.972359165034727e-06, + "loss": 0.1237, + "step": 42870 + }, + { + "epoch": 0.7646523739878001, + "grad_norm": 0.3357499837875366, + "learning_rate": 7.971219546273131e-06, + "loss": 0.1279, + "step": 42871 + }, + { + "epoch": 0.7646702101095137, + "grad_norm": 0.2963908612728119, + "learning_rate": 7.970079993520959e-06, + "loss": 0.1147, + "step": 42872 + }, + { + "epoch": 0.7646880462312274, + "grad_norm": 0.2905164659023285, + "learning_rate": 7.968940506782652e-06, + "loss": 0.1164, + "step": 42873 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 0.2257183939218521, + "learning_rate": 7.96780108606261e-06, + "loss": 0.0851, + "step": 42874 + }, + { + "epoch": 0.7647237184746549, + "grad_norm": 0.2909891605377197, + "learning_rate": 7.966661731365246e-06, + "loss": 0.0933, + "step": 42875 + }, + { + "epoch": 0.7647415545963686, + "grad_norm": 0.26723378896713257, + "learning_rate": 7.965522442694994e-06, + "loss": 0.1451, + "step": 42876 + }, + { + "epoch": 0.7647593907180823, + "grad_norm": 0.27947598695755005, + "learning_rate": 7.964383220056259e-06, + "loss": 0.0804, + "step": 42877 + }, + { + "epoch": 0.764777226839796, + "grad_norm": 0.2634432911872864, + "learning_rate": 7.963244063453463e-06, + "loss": 0.1584, + "step": 42878 + }, + { + "epoch": 0.7647950629615097, + "grad_norm": 0.23158571124076843, + "learning_rate": 7.962104972891005e-06, + "loss": 0.1446, + "step": 42879 + }, + { + "epoch": 0.7648128990832234, + "grad_norm": 0.267520934343338, + "learning_rate": 7.960965948373319e-06, + "loss": 0.1311, + "step": 42880 + }, + { + "epoch": 0.764830735204937, + "grad_norm": 0.3257252275943756, + "learning_rate": 7.959826989904818e-06, + "loss": 0.1716, + "step": 42881 + }, + { + "epoch": 0.7648485713266507, + "grad_norm": 0.29919692873954773, + "learning_rate": 7.95868809748991e-06, + "loss": 0.1017, + "step": 42882 + }, + { + "epoch": 0.7648664074483644, + "grad_norm": 0.2358035147190094, + "learning_rate": 7.957549271133016e-06, + "loss": 0.1046, + "step": 42883 + }, + { + "epoch": 0.7648842435700781, + "grad_norm": 0.4467250108718872, + "learning_rate": 7.956410510838538e-06, + "loss": 0.164, + "step": 42884 + }, + { + "epoch": 0.7649020796917918, + "grad_norm": 0.2204703539609909, + "learning_rate": 7.955271816610899e-06, + "loss": 0.0903, + "step": 42885 + }, + { + "epoch": 0.7649199158135055, + "grad_norm": 0.3152219355106354, + "learning_rate": 7.954133188454522e-06, + "loss": 0.1704, + "step": 42886 + }, + { + "epoch": 0.7649377519352192, + "grad_norm": 0.3217666447162628, + "learning_rate": 7.95299462637381e-06, + "loss": 0.1276, + "step": 42887 + }, + { + "epoch": 0.7649555880569329, + "grad_norm": 0.32320696115493774, + "learning_rate": 7.951856130373176e-06, + "loss": 0.1104, + "step": 42888 + }, + { + "epoch": 0.7649734241786466, + "grad_norm": 0.2849743962287903, + "learning_rate": 7.950717700457038e-06, + "loss": 0.1441, + "step": 42889 + }, + { + "epoch": 0.7649912603003602, + "grad_norm": 0.2630385458469391, + "learning_rate": 7.94957933662981e-06, + "loss": 0.1331, + "step": 42890 + }, + { + "epoch": 0.7650090964220739, + "grad_norm": 0.6085127592086792, + "learning_rate": 7.9484410388959e-06, + "loss": 0.1371, + "step": 42891 + }, + { + "epoch": 0.7650269325437877, + "grad_norm": 0.30573567748069763, + "learning_rate": 7.947302807259715e-06, + "loss": 0.1232, + "step": 42892 + }, + { + "epoch": 0.7650447686655014, + "grad_norm": 0.18907184898853302, + "learning_rate": 7.946164641725685e-06, + "loss": 0.117, + "step": 42893 + }, + { + "epoch": 0.7650626047872151, + "grad_norm": 0.19128276407718658, + "learning_rate": 7.945026542298207e-06, + "loss": 0.0777, + "step": 42894 + }, + { + "epoch": 0.7650804409089288, + "grad_norm": 0.26594278216362, + "learning_rate": 7.943888508981697e-06, + "loss": 0.1218, + "step": 42895 + }, + { + "epoch": 0.7650982770306425, + "grad_norm": 0.1851966232061386, + "learning_rate": 7.942750541780567e-06, + "loss": 0.0716, + "step": 42896 + }, + { + "epoch": 0.7651161131523562, + "grad_norm": 0.27270033955574036, + "learning_rate": 7.94161264069922e-06, + "loss": 0.0909, + "step": 42897 + }, + { + "epoch": 0.7651339492740699, + "grad_norm": 0.287460058927536, + "learning_rate": 7.940474805742078e-06, + "loss": 0.1108, + "step": 42898 + }, + { + "epoch": 0.7651517853957835, + "grad_norm": 0.27612176537513733, + "learning_rate": 7.939337036913543e-06, + "loss": 0.0936, + "step": 42899 + }, + { + "epoch": 0.7651696215174972, + "grad_norm": 0.3355642259120941, + "learning_rate": 7.93819933421804e-06, + "loss": 0.1571, + "step": 42900 + }, + { + "epoch": 0.7651874576392109, + "grad_norm": 0.23417073488235474, + "learning_rate": 7.937061697659962e-06, + "loss": 0.1231, + "step": 42901 + }, + { + "epoch": 0.7652052937609246, + "grad_norm": 0.3518558740615845, + "learning_rate": 7.935924127243729e-06, + "loss": 0.09, + "step": 42902 + }, + { + "epoch": 0.7652231298826383, + "grad_norm": 0.25965461134910583, + "learning_rate": 7.934786622973753e-06, + "loss": 0.1058, + "step": 42903 + }, + { + "epoch": 0.765240966004352, + "grad_norm": 0.31411510705947876, + "learning_rate": 7.933649184854436e-06, + "loss": 0.113, + "step": 42904 + }, + { + "epoch": 0.7652588021260657, + "grad_norm": 0.28541257977485657, + "learning_rate": 7.932511812890182e-06, + "loss": 0.1066, + "step": 42905 + }, + { + "epoch": 0.7652766382477794, + "grad_norm": 0.2829614281654358, + "learning_rate": 7.931374507085412e-06, + "loss": 0.1214, + "step": 42906 + }, + { + "epoch": 0.765294474369493, + "grad_norm": 0.32959020137786865, + "learning_rate": 7.930237267444533e-06, + "loss": 0.1318, + "step": 42907 + }, + { + "epoch": 0.7653123104912067, + "grad_norm": 0.25511986017227173, + "learning_rate": 7.92910009397195e-06, + "loss": 0.0834, + "step": 42908 + }, + { + "epoch": 0.7653301466129205, + "grad_norm": 0.24911274015903473, + "learning_rate": 7.927962986672068e-06, + "loss": 0.1357, + "step": 42909 + }, + { + "epoch": 0.7653479827346342, + "grad_norm": 0.22685551643371582, + "learning_rate": 7.926825945549293e-06, + "loss": 0.0877, + "step": 42910 + }, + { + "epoch": 0.7653658188563479, + "grad_norm": 0.2752068340778351, + "learning_rate": 7.925688970608044e-06, + "loss": 0.1236, + "step": 42911 + }, + { + "epoch": 0.7653836549780616, + "grad_norm": 0.38870489597320557, + "learning_rate": 7.924552061852717e-06, + "loss": 0.0835, + "step": 42912 + }, + { + "epoch": 0.7654014910997753, + "grad_norm": 0.31101763248443604, + "learning_rate": 7.923415219287728e-06, + "loss": 0.125, + "step": 42913 + }, + { + "epoch": 0.765419327221489, + "grad_norm": 0.308556467294693, + "learning_rate": 7.922278442917474e-06, + "loss": 0.1541, + "step": 42914 + }, + { + "epoch": 0.7654371633432027, + "grad_norm": 0.3094073235988617, + "learning_rate": 7.921141732746373e-06, + "loss": 0.0929, + "step": 42915 + }, + { + "epoch": 0.7654549994649164, + "grad_norm": 0.2967217266559601, + "learning_rate": 7.920005088778828e-06, + "loss": 0.1124, + "step": 42916 + }, + { + "epoch": 0.76547283558663, + "grad_norm": 0.2879098057746887, + "learning_rate": 7.91886851101924e-06, + "loss": 0.0902, + "step": 42917 + }, + { + "epoch": 0.7654906717083437, + "grad_norm": 0.35333603620529175, + "learning_rate": 7.917731999472012e-06, + "loss": 0.1326, + "step": 42918 + }, + { + "epoch": 0.7655085078300574, + "grad_norm": 0.2966606318950653, + "learning_rate": 7.916595554141562e-06, + "loss": 0.091, + "step": 42919 + }, + { + "epoch": 0.7655263439517711, + "grad_norm": 0.2804579436779022, + "learning_rate": 7.915459175032286e-06, + "loss": 0.1368, + "step": 42920 + }, + { + "epoch": 0.7655441800734848, + "grad_norm": 0.35079076886177063, + "learning_rate": 7.914322862148594e-06, + "loss": 0.1218, + "step": 42921 + }, + { + "epoch": 0.7655620161951985, + "grad_norm": 0.37944865226745605, + "learning_rate": 7.913186615494886e-06, + "loss": 0.1731, + "step": 42922 + }, + { + "epoch": 0.7655798523169122, + "grad_norm": 0.3109481930732727, + "learning_rate": 7.91205043507556e-06, + "loss": 0.1578, + "step": 42923 + }, + { + "epoch": 0.7655976884386259, + "grad_norm": 0.3452800512313843, + "learning_rate": 7.910914320895038e-06, + "loss": 0.1382, + "step": 42924 + }, + { + "epoch": 0.7656155245603395, + "grad_norm": 0.2029651254415512, + "learning_rate": 7.909778272957712e-06, + "loss": 0.1134, + "step": 42925 + }, + { + "epoch": 0.7656333606820533, + "grad_norm": 0.2473873496055603, + "learning_rate": 7.908642291267982e-06, + "loss": 0.1144, + "step": 42926 + }, + { + "epoch": 0.765651196803767, + "grad_norm": 0.24521149694919586, + "learning_rate": 7.907506375830265e-06, + "loss": 0.1166, + "step": 42927 + }, + { + "epoch": 0.7656690329254807, + "grad_norm": 0.313917875289917, + "learning_rate": 7.906370526648951e-06, + "loss": 0.1574, + "step": 42928 + }, + { + "epoch": 0.7656868690471944, + "grad_norm": 0.2503361403942108, + "learning_rate": 7.905234743728455e-06, + "loss": 0.1482, + "step": 42929 + }, + { + "epoch": 0.7657047051689081, + "grad_norm": 0.2644880414009094, + "learning_rate": 7.904099027073172e-06, + "loss": 0.1387, + "step": 42930 + }, + { + "epoch": 0.7657225412906218, + "grad_norm": 0.31529226899147034, + "learning_rate": 7.9029633766875e-06, + "loss": 0.1557, + "step": 42931 + }, + { + "epoch": 0.7657403774123355, + "grad_norm": 0.5534952282905579, + "learning_rate": 7.901827792575856e-06, + "loss": 0.1785, + "step": 42932 + }, + { + "epoch": 0.7657582135340492, + "grad_norm": 0.2698407769203186, + "learning_rate": 7.90069227474263e-06, + "loss": 0.0972, + "step": 42933 + }, + { + "epoch": 0.7657760496557628, + "grad_norm": 0.33253052830696106, + "learning_rate": 7.899556823192229e-06, + "loss": 0.1594, + "step": 42934 + }, + { + "epoch": 0.7657938857774765, + "grad_norm": 0.2952536940574646, + "learning_rate": 7.898421437929052e-06, + "loss": 0.1483, + "step": 42935 + }, + { + "epoch": 0.7658117218991902, + "grad_norm": 0.2531408667564392, + "learning_rate": 7.89728611895749e-06, + "loss": 0.063, + "step": 42936 + }, + { + "epoch": 0.7658295580209039, + "grad_norm": 0.2943894863128662, + "learning_rate": 7.896150866281963e-06, + "loss": 0.1316, + "step": 42937 + }, + { + "epoch": 0.7658473941426176, + "grad_norm": 0.31141456961631775, + "learning_rate": 7.895015679906864e-06, + "loss": 0.1197, + "step": 42938 + }, + { + "epoch": 0.7658652302643313, + "grad_norm": 0.2847091853618622, + "learning_rate": 7.893880559836583e-06, + "loss": 0.1223, + "step": 42939 + }, + { + "epoch": 0.765883066386045, + "grad_norm": 0.26799747347831726, + "learning_rate": 7.892745506075538e-06, + "loss": 0.1525, + "step": 42940 + }, + { + "epoch": 0.7659009025077587, + "grad_norm": 0.20217901468276978, + "learning_rate": 7.891610518628112e-06, + "loss": 0.1184, + "step": 42941 + }, + { + "epoch": 0.7659187386294725, + "grad_norm": 0.27190306782722473, + "learning_rate": 7.89047559749872e-06, + "loss": 0.1105, + "step": 42942 + }, + { + "epoch": 0.7659365747511861, + "grad_norm": 0.2974618375301361, + "learning_rate": 7.889340742691753e-06, + "loss": 0.1277, + "step": 42943 + }, + { + "epoch": 0.7659544108728998, + "grad_norm": 0.30681005120277405, + "learning_rate": 7.888205954211606e-06, + "loss": 0.13, + "step": 42944 + }, + { + "epoch": 0.7659722469946135, + "grad_norm": 0.31746363639831543, + "learning_rate": 7.887071232062687e-06, + "loss": 0.1161, + "step": 42945 + }, + { + "epoch": 0.7659900831163272, + "grad_norm": 0.19313858449459076, + "learning_rate": 7.885936576249394e-06, + "loss": 0.0736, + "step": 42946 + }, + { + "epoch": 0.7660079192380409, + "grad_norm": 0.31254518032073975, + "learning_rate": 7.884801986776122e-06, + "loss": 0.0964, + "step": 42947 + }, + { + "epoch": 0.7660257553597546, + "grad_norm": 0.310528963804245, + "learning_rate": 7.88366746364726e-06, + "loss": 0.1586, + "step": 42948 + }, + { + "epoch": 0.7660435914814683, + "grad_norm": 0.20265977084636688, + "learning_rate": 7.882533006867221e-06, + "loss": 0.1296, + "step": 42949 + }, + { + "epoch": 0.766061427603182, + "grad_norm": 0.27496954798698425, + "learning_rate": 7.881398616440398e-06, + "loss": 0.1252, + "step": 42950 + }, + { + "epoch": 0.7660792637248957, + "grad_norm": 0.25272825360298157, + "learning_rate": 7.880264292371187e-06, + "loss": 0.1651, + "step": 42951 + }, + { + "epoch": 0.7660970998466093, + "grad_norm": 0.20538458228111267, + "learning_rate": 7.87913003466398e-06, + "loss": 0.1254, + "step": 42952 + }, + { + "epoch": 0.766114935968323, + "grad_norm": 0.28113630414009094, + "learning_rate": 7.877995843323178e-06, + "loss": 0.1175, + "step": 42953 + }, + { + "epoch": 0.7661327720900367, + "grad_norm": 0.187697172164917, + "learning_rate": 7.876861718353178e-06, + "loss": 0.1085, + "step": 42954 + }, + { + "epoch": 0.7661506082117504, + "grad_norm": 0.277017205953598, + "learning_rate": 7.875727659758374e-06, + "loss": 0.1154, + "step": 42955 + }, + { + "epoch": 0.7661684443334641, + "grad_norm": 0.2884194552898407, + "learning_rate": 7.874593667543171e-06, + "loss": 0.1038, + "step": 42956 + }, + { + "epoch": 0.7661862804551778, + "grad_norm": 0.24556809663772583, + "learning_rate": 7.873459741711947e-06, + "loss": 0.143, + "step": 42957 + }, + { + "epoch": 0.7662041165768915, + "grad_norm": 0.24674363434314728, + "learning_rate": 7.872325882269119e-06, + "loss": 0.1284, + "step": 42958 + }, + { + "epoch": 0.7662219526986053, + "grad_norm": 0.2722882926464081, + "learning_rate": 7.871192089219071e-06, + "loss": 0.1035, + "step": 42959 + }, + { + "epoch": 0.766239788820319, + "grad_norm": 0.30494844913482666, + "learning_rate": 7.870058362566198e-06, + "loss": 0.1019, + "step": 42960 + }, + { + "epoch": 0.7662576249420326, + "grad_norm": 0.31936338543891907, + "learning_rate": 7.868924702314889e-06, + "loss": 0.1845, + "step": 42961 + }, + { + "epoch": 0.7662754610637463, + "grad_norm": 0.29673469066619873, + "learning_rate": 7.867791108469552e-06, + "loss": 0.1377, + "step": 42962 + }, + { + "epoch": 0.76629329718546, + "grad_norm": 0.36143171787261963, + "learning_rate": 7.866657581034572e-06, + "loss": 0.1136, + "step": 42963 + }, + { + "epoch": 0.7663111333071737, + "grad_norm": 0.3071307837963104, + "learning_rate": 7.865524120014347e-06, + "loss": 0.1163, + "step": 42964 + }, + { + "epoch": 0.7663289694288874, + "grad_norm": 0.26397082209587097, + "learning_rate": 7.864390725413268e-06, + "loss": 0.0563, + "step": 42965 + }, + { + "epoch": 0.7663468055506011, + "grad_norm": 0.33258673548698425, + "learning_rate": 7.863257397235722e-06, + "loss": 0.1341, + "step": 42966 + }, + { + "epoch": 0.7663646416723148, + "grad_norm": 0.24217641353607178, + "learning_rate": 7.862124135486116e-06, + "loss": 0.1018, + "step": 42967 + }, + { + "epoch": 0.7663824777940285, + "grad_norm": 0.22889651358127594, + "learning_rate": 7.860990940168827e-06, + "loss": 0.108, + "step": 42968 + }, + { + "epoch": 0.7664003139157421, + "grad_norm": 0.23804038763046265, + "learning_rate": 7.859857811288265e-06, + "loss": 0.0909, + "step": 42969 + }, + { + "epoch": 0.7664181500374558, + "grad_norm": 0.2688756585121155, + "learning_rate": 7.858724748848809e-06, + "loss": 0.0831, + "step": 42970 + }, + { + "epoch": 0.7664359861591695, + "grad_norm": 0.22026517987251282, + "learning_rate": 7.857591752854862e-06, + "loss": 0.1221, + "step": 42971 + }, + { + "epoch": 0.7664538222808832, + "grad_norm": 0.20782138407230377, + "learning_rate": 7.856458823310809e-06, + "loss": 0.0754, + "step": 42972 + }, + { + "epoch": 0.7664716584025969, + "grad_norm": 0.8296259045600891, + "learning_rate": 7.855325960221044e-06, + "loss": 0.1608, + "step": 42973 + }, + { + "epoch": 0.7664894945243106, + "grad_norm": 0.24983401596546173, + "learning_rate": 7.85419316358995e-06, + "loss": 0.1135, + "step": 42974 + }, + { + "epoch": 0.7665073306460243, + "grad_norm": 0.21038931608200073, + "learning_rate": 7.853060433421933e-06, + "loss": 0.0632, + "step": 42975 + }, + { + "epoch": 0.7665251667677381, + "grad_norm": 0.2374330759048462, + "learning_rate": 7.851927769721376e-06, + "loss": 0.1256, + "step": 42976 + }, + { + "epoch": 0.7665430028894518, + "grad_norm": 0.31622299551963806, + "learning_rate": 7.85079517249267e-06, + "loss": 0.0641, + "step": 42977 + }, + { + "epoch": 0.7665608390111655, + "grad_norm": 0.3443428575992584, + "learning_rate": 7.849662641740204e-06, + "loss": 0.1255, + "step": 42978 + }, + { + "epoch": 0.7665786751328791, + "grad_norm": 0.21523240208625793, + "learning_rate": 7.848530177468361e-06, + "loss": 0.1117, + "step": 42979 + }, + { + "epoch": 0.7665965112545928, + "grad_norm": 0.3534828722476959, + "learning_rate": 7.847397779681548e-06, + "loss": 0.1646, + "step": 42980 + }, + { + "epoch": 0.7666143473763065, + "grad_norm": 0.28434497117996216, + "learning_rate": 7.846265448384147e-06, + "loss": 0.1286, + "step": 42981 + }, + { + "epoch": 0.7666321834980202, + "grad_norm": 0.24215789139270782, + "learning_rate": 7.845133183580536e-06, + "loss": 0.0744, + "step": 42982 + }, + { + "epoch": 0.7666500196197339, + "grad_norm": 0.24433310329914093, + "learning_rate": 7.844000985275112e-06, + "loss": 0.1075, + "step": 42983 + }, + { + "epoch": 0.7666678557414476, + "grad_norm": 0.24054983258247375, + "learning_rate": 7.842868853472276e-06, + "loss": 0.079, + "step": 42984 + }, + { + "epoch": 0.7666856918631613, + "grad_norm": 0.23580560088157654, + "learning_rate": 7.841736788176407e-06, + "loss": 0.0532, + "step": 42985 + }, + { + "epoch": 0.766703527984875, + "grad_norm": 0.2895699441432953, + "learning_rate": 7.84060478939189e-06, + "loss": 0.0886, + "step": 42986 + }, + { + "epoch": 0.7667213641065886, + "grad_norm": 0.24624425172805786, + "learning_rate": 7.839472857123109e-06, + "loss": 0.1534, + "step": 42987 + }, + { + "epoch": 0.7667392002283023, + "grad_norm": 0.2970411479473114, + "learning_rate": 7.838340991374465e-06, + "loss": 0.1066, + "step": 42988 + }, + { + "epoch": 0.766757036350016, + "grad_norm": 0.2494984120130539, + "learning_rate": 7.83720919215034e-06, + "loss": 0.0989, + "step": 42989 + }, + { + "epoch": 0.7667748724717297, + "grad_norm": 0.25810766220092773, + "learning_rate": 7.836077459455121e-06, + "loss": 0.1079, + "step": 42990 + }, + { + "epoch": 0.7667927085934434, + "grad_norm": 0.23424342274665833, + "learning_rate": 7.834945793293191e-06, + "loss": 0.108, + "step": 42991 + }, + { + "epoch": 0.7668105447151571, + "grad_norm": 0.38717761635780334, + "learning_rate": 7.833814193668935e-06, + "loss": 0.1331, + "step": 42992 + }, + { + "epoch": 0.7668283808368709, + "grad_norm": 0.2999991178512573, + "learning_rate": 7.832682660586751e-06, + "loss": 0.0709, + "step": 42993 + }, + { + "epoch": 0.7668462169585846, + "grad_norm": 0.25582024455070496, + "learning_rate": 7.831551194051017e-06, + "loss": 0.1361, + "step": 42994 + }, + { + "epoch": 0.7668640530802983, + "grad_norm": 0.26886799931526184, + "learning_rate": 7.830419794066116e-06, + "loss": 0.0976, + "step": 42995 + }, + { + "epoch": 0.766881889202012, + "grad_norm": 0.31064337491989136, + "learning_rate": 7.829288460636441e-06, + "loss": 0.0961, + "step": 42996 + }, + { + "epoch": 0.7668997253237256, + "grad_norm": 0.3979969322681427, + "learning_rate": 7.828157193766369e-06, + "loss": 0.1025, + "step": 42997 + }, + { + "epoch": 0.7669175614454393, + "grad_norm": 0.27119138836860657, + "learning_rate": 7.827025993460298e-06, + "loss": 0.093, + "step": 42998 + }, + { + "epoch": 0.766935397567153, + "grad_norm": 0.2645191252231598, + "learning_rate": 7.825894859722608e-06, + "loss": 0.1231, + "step": 42999 + }, + { + "epoch": 0.7669532336888667, + "grad_norm": 0.2291717380285263, + "learning_rate": 7.82476379255767e-06, + "loss": 0.0944, + "step": 43000 + }, + { + "epoch": 0.7669532336888667, + "eval_loss": 0.11168795824050903, + "eval_runtime": 107.0319, + "eval_samples_per_second": 9.567, + "eval_steps_per_second": 1.598, + "step": 43000 + }, + { + "epoch": 0.7669710698105804, + "grad_norm": 0.17766976356506348, + "learning_rate": 7.82363279196989e-06, + "loss": 0.0738, + "step": 43001 + }, + { + "epoch": 0.7669889059322941, + "grad_norm": 0.4053889214992523, + "learning_rate": 7.822501857963643e-06, + "loss": 0.1159, + "step": 43002 + }, + { + "epoch": 0.7670067420540078, + "grad_norm": 0.21890953183174133, + "learning_rate": 7.82137099054331e-06, + "loss": 0.1149, + "step": 43003 + }, + { + "epoch": 0.7670245781757214, + "grad_norm": 0.22025559842586517, + "learning_rate": 7.820240189713274e-06, + "loss": 0.1038, + "step": 43004 + }, + { + "epoch": 0.7670424142974351, + "grad_norm": 0.2734902799129486, + "learning_rate": 7.819109455477916e-06, + "loss": 0.1022, + "step": 43005 + }, + { + "epoch": 0.7670602504191488, + "grad_norm": 0.2672120928764343, + "learning_rate": 7.817978787841631e-06, + "loss": 0.1263, + "step": 43006 + }, + { + "epoch": 0.7670780865408625, + "grad_norm": 0.20441804826259613, + "learning_rate": 7.816848186808796e-06, + "loss": 0.0593, + "step": 43007 + }, + { + "epoch": 0.7670959226625762, + "grad_norm": 0.2629643380641937, + "learning_rate": 7.815717652383789e-06, + "loss": 0.1417, + "step": 43008 + }, + { + "epoch": 0.7671137587842899, + "grad_norm": 0.2767849266529083, + "learning_rate": 7.81458718457099e-06, + "loss": 0.0879, + "step": 43009 + }, + { + "epoch": 0.7671315949060037, + "grad_norm": 0.38085970282554626, + "learning_rate": 7.813456783374793e-06, + "loss": 0.177, + "step": 43010 + }, + { + "epoch": 0.7671494310277174, + "grad_norm": 0.2635781466960907, + "learning_rate": 7.812326448799568e-06, + "loss": 0.1124, + "step": 43011 + }, + { + "epoch": 0.7671672671494311, + "grad_norm": 0.2636691629886627, + "learning_rate": 7.811196180849708e-06, + "loss": 0.1527, + "step": 43012 + }, + { + "epoch": 0.7671851032711448, + "grad_norm": 0.2609785497188568, + "learning_rate": 7.810065979529579e-06, + "loss": 0.1075, + "step": 43013 + }, + { + "epoch": 0.7672029393928584, + "grad_norm": 0.260689914226532, + "learning_rate": 7.808935844843584e-06, + "loss": 0.1184, + "step": 43014 + }, + { + "epoch": 0.7672207755145721, + "grad_norm": 0.34491807222366333, + "learning_rate": 7.807805776796088e-06, + "loss": 0.1012, + "step": 43015 + }, + { + "epoch": 0.7672386116362858, + "grad_norm": 0.25932416319847107, + "learning_rate": 7.806675775391476e-06, + "loss": 0.093, + "step": 43016 + }, + { + "epoch": 0.7672564477579995, + "grad_norm": 0.317667156457901, + "learning_rate": 7.805545840634126e-06, + "loss": 0.0992, + "step": 43017 + }, + { + "epoch": 0.7672742838797132, + "grad_norm": 0.24547673761844635, + "learning_rate": 7.804415972528412e-06, + "loss": 0.0814, + "step": 43018 + }, + { + "epoch": 0.7672921200014269, + "grad_norm": 0.21152032911777496, + "learning_rate": 7.803286171078731e-06, + "loss": 0.0708, + "step": 43019 + }, + { + "epoch": 0.7673099561231406, + "grad_norm": 0.2532030940055847, + "learning_rate": 7.802156436289448e-06, + "loss": 0.149, + "step": 43020 + }, + { + "epoch": 0.7673277922448543, + "grad_norm": 0.34872791171073914, + "learning_rate": 7.801026768164949e-06, + "loss": 0.1289, + "step": 43021 + }, + { + "epoch": 0.767345628366568, + "grad_norm": 0.36275026202201843, + "learning_rate": 7.799897166709602e-06, + "loss": 0.1036, + "step": 43022 + }, + { + "epoch": 0.7673634644882816, + "grad_norm": 0.27639642357826233, + "learning_rate": 7.798767631927802e-06, + "loss": 0.0999, + "step": 43023 + }, + { + "epoch": 0.7673813006099953, + "grad_norm": 0.3113633692264557, + "learning_rate": 7.797638163823914e-06, + "loss": 0.1375, + "step": 43024 + }, + { + "epoch": 0.767399136731709, + "grad_norm": 0.2779654264450073, + "learning_rate": 7.796508762402327e-06, + "loss": 0.116, + "step": 43025 + }, + { + "epoch": 0.7674169728534227, + "grad_norm": 0.22081835567951202, + "learning_rate": 7.79537942766741e-06, + "loss": 0.1628, + "step": 43026 + }, + { + "epoch": 0.7674348089751365, + "grad_norm": 0.2872173488140106, + "learning_rate": 7.794250159623548e-06, + "loss": 0.1211, + "step": 43027 + }, + { + "epoch": 0.7674526450968502, + "grad_norm": 0.23371343314647675, + "learning_rate": 7.793120958275119e-06, + "loss": 0.1141, + "step": 43028 + }, + { + "epoch": 0.7674704812185639, + "grad_norm": 0.8620897531509399, + "learning_rate": 7.791991823626492e-06, + "loss": 0.1722, + "step": 43029 + }, + { + "epoch": 0.7674883173402776, + "grad_norm": 0.32448840141296387, + "learning_rate": 7.790862755682051e-06, + "loss": 0.1452, + "step": 43030 + }, + { + "epoch": 0.7675061534619912, + "grad_norm": 0.25936704874038696, + "learning_rate": 7.78973375444616e-06, + "loss": 0.0922, + "step": 43031 + }, + { + "epoch": 0.7675239895837049, + "grad_norm": 0.19083185493946075, + "learning_rate": 7.788604819923215e-06, + "loss": 0.1107, + "step": 43032 + }, + { + "epoch": 0.7675418257054186, + "grad_norm": 0.5480894446372986, + "learning_rate": 7.787475952117582e-06, + "loss": 0.1661, + "step": 43033 + }, + { + "epoch": 0.7675596618271323, + "grad_norm": 0.2621341943740845, + "learning_rate": 7.786347151033637e-06, + "loss": 0.1356, + "step": 43034 + }, + { + "epoch": 0.767577497948846, + "grad_norm": 0.24636195600032806, + "learning_rate": 7.785218416675746e-06, + "loss": 0.0985, + "step": 43035 + }, + { + "epoch": 0.7675953340705597, + "grad_norm": 0.27014729380607605, + "learning_rate": 7.784089749048307e-06, + "loss": 0.0849, + "step": 43036 + }, + { + "epoch": 0.7676131701922734, + "grad_norm": 0.3166449964046478, + "learning_rate": 7.78296114815568e-06, + "loss": 0.1402, + "step": 43037 + }, + { + "epoch": 0.7676310063139871, + "grad_norm": 0.1861816644668579, + "learning_rate": 7.781832614002232e-06, + "loss": 0.104, + "step": 43038 + }, + { + "epoch": 0.7676488424357008, + "grad_norm": 0.2678993344306946, + "learning_rate": 7.780704146592349e-06, + "loss": 0.1356, + "step": 43039 + }, + { + "epoch": 0.7676666785574144, + "grad_norm": 0.26752379536628723, + "learning_rate": 7.779575745930413e-06, + "loss": 0.1611, + "step": 43040 + }, + { + "epoch": 0.7676845146791281, + "grad_norm": 0.2167665660381317, + "learning_rate": 7.778447412020787e-06, + "loss": 0.1232, + "step": 43041 + }, + { + "epoch": 0.7677023508008418, + "grad_norm": 0.25560852885246277, + "learning_rate": 7.777319144867848e-06, + "loss": 0.0906, + "step": 43042 + }, + { + "epoch": 0.7677201869225556, + "grad_norm": 0.2128244936466217, + "learning_rate": 7.77619094447597e-06, + "loss": 0.1116, + "step": 43043 + }, + { + "epoch": 0.7677380230442693, + "grad_norm": 0.218234121799469, + "learning_rate": 7.775062810849515e-06, + "loss": 0.1165, + "step": 43044 + }, + { + "epoch": 0.767755859165983, + "grad_norm": 0.24815750122070312, + "learning_rate": 7.773934743992875e-06, + "loss": 0.143, + "step": 43045 + }, + { + "epoch": 0.7677736952876967, + "grad_norm": 0.29426339268684387, + "learning_rate": 7.772806743910412e-06, + "loss": 0.151, + "step": 43046 + }, + { + "epoch": 0.7677915314094104, + "grad_norm": 0.24082951247692108, + "learning_rate": 7.771678810606502e-06, + "loss": 0.1131, + "step": 43047 + }, + { + "epoch": 0.767809367531124, + "grad_norm": 0.25458139181137085, + "learning_rate": 7.770550944085508e-06, + "loss": 0.0835, + "step": 43048 + }, + { + "epoch": 0.7678272036528377, + "grad_norm": 0.22703048586845398, + "learning_rate": 7.769423144351814e-06, + "loss": 0.0894, + "step": 43049 + }, + { + "epoch": 0.7678450397745514, + "grad_norm": 0.45730623602867126, + "learning_rate": 7.76829541140979e-06, + "loss": 0.1164, + "step": 43050 + }, + { + "epoch": 0.7678628758962651, + "grad_norm": 0.2879440486431122, + "learning_rate": 7.767167745263796e-06, + "loss": 0.1348, + "step": 43051 + }, + { + "epoch": 0.7678807120179788, + "grad_norm": 0.2726638615131378, + "learning_rate": 7.766040145918221e-06, + "loss": 0.0969, + "step": 43052 + }, + { + "epoch": 0.7678985481396925, + "grad_norm": 0.24980737268924713, + "learning_rate": 7.764912613377418e-06, + "loss": 0.14, + "step": 43053 + }, + { + "epoch": 0.7679163842614062, + "grad_norm": 0.34120723605155945, + "learning_rate": 7.763785147645772e-06, + "loss": 0.1267, + "step": 43054 + }, + { + "epoch": 0.7679342203831199, + "grad_norm": 0.31987863779067993, + "learning_rate": 7.76265774872765e-06, + "loss": 0.0967, + "step": 43055 + }, + { + "epoch": 0.7679520565048336, + "grad_norm": 0.3227657973766327, + "learning_rate": 7.761530416627421e-06, + "loss": 0.112, + "step": 43056 + }, + { + "epoch": 0.7679698926265472, + "grad_norm": 0.25025060772895813, + "learning_rate": 7.760403151349446e-06, + "loss": 0.1579, + "step": 43057 + }, + { + "epoch": 0.7679877287482609, + "grad_norm": 0.3062143325805664, + "learning_rate": 7.75927595289811e-06, + "loss": 0.1051, + "step": 43058 + }, + { + "epoch": 0.7680055648699746, + "grad_norm": 0.22879207134246826, + "learning_rate": 7.758148821277775e-06, + "loss": 0.1279, + "step": 43059 + }, + { + "epoch": 0.7680234009916884, + "grad_norm": 0.28289875388145447, + "learning_rate": 7.75702175649281e-06, + "loss": 0.0872, + "step": 43060 + }, + { + "epoch": 0.7680412371134021, + "grad_norm": 0.2567320466041565, + "learning_rate": 7.755894758547578e-06, + "loss": 0.0963, + "step": 43061 + }, + { + "epoch": 0.7680590732351158, + "grad_norm": 0.3041916787624359, + "learning_rate": 7.754767827446461e-06, + "loss": 0.1513, + "step": 43062 + }, + { + "epoch": 0.7680769093568295, + "grad_norm": 0.2254086136817932, + "learning_rate": 7.753640963193817e-06, + "loss": 0.112, + "step": 43063 + }, + { + "epoch": 0.7680947454785432, + "grad_norm": 0.31808820366859436, + "learning_rate": 7.752514165794022e-06, + "loss": 0.1766, + "step": 43064 + }, + { + "epoch": 0.7681125816002569, + "grad_norm": 0.26063990592956543, + "learning_rate": 7.751387435251428e-06, + "loss": 0.1178, + "step": 43065 + }, + { + "epoch": 0.7681304177219705, + "grad_norm": 0.32291775941848755, + "learning_rate": 7.750260771570416e-06, + "loss": 0.208, + "step": 43066 + }, + { + "epoch": 0.7681482538436842, + "grad_norm": 0.1942048966884613, + "learning_rate": 7.749134174755357e-06, + "loss": 0.0798, + "step": 43067 + }, + { + "epoch": 0.7681660899653979, + "grad_norm": 0.31568217277526855, + "learning_rate": 7.748007644810614e-06, + "loss": 0.1406, + "step": 43068 + }, + { + "epoch": 0.7681839260871116, + "grad_norm": 0.229177787899971, + "learning_rate": 7.746881181740551e-06, + "loss": 0.089, + "step": 43069 + }, + { + "epoch": 0.7682017622088253, + "grad_norm": 0.34718164801597595, + "learning_rate": 7.745754785549528e-06, + "loss": 0.0819, + "step": 43070 + }, + { + "epoch": 0.768219598330539, + "grad_norm": 0.24727711081504822, + "learning_rate": 7.744628456241929e-06, + "loss": 0.106, + "step": 43071 + }, + { + "epoch": 0.7682374344522527, + "grad_norm": 0.24125796556472778, + "learning_rate": 7.743502193822106e-06, + "loss": 0.1592, + "step": 43072 + }, + { + "epoch": 0.7682552705739664, + "grad_norm": 0.2829814851284027, + "learning_rate": 7.742375998294431e-06, + "loss": 0.1101, + "step": 43073 + }, + { + "epoch": 0.76827310669568, + "grad_norm": 0.278268963098526, + "learning_rate": 7.741249869663259e-06, + "loss": 0.083, + "step": 43074 + }, + { + "epoch": 0.7682909428173937, + "grad_norm": 0.1862305998802185, + "learning_rate": 7.74012380793297e-06, + "loss": 0.0663, + "step": 43075 + }, + { + "epoch": 0.7683087789391074, + "grad_norm": 0.2176610827445984, + "learning_rate": 7.738997813107923e-06, + "loss": 0.0921, + "step": 43076 + }, + { + "epoch": 0.7683266150608212, + "grad_norm": 0.23432780802249908, + "learning_rate": 7.737871885192484e-06, + "loss": 0.1287, + "step": 43077 + }, + { + "epoch": 0.7683444511825349, + "grad_norm": 0.2174353152513504, + "learning_rate": 7.736746024191008e-06, + "loss": 0.0748, + "step": 43078 + }, + { + "epoch": 0.7683622873042486, + "grad_norm": 0.3002229928970337, + "learning_rate": 7.735620230107873e-06, + "loss": 0.125, + "step": 43079 + }, + { + "epoch": 0.7683801234259623, + "grad_norm": 0.2447798103094101, + "learning_rate": 7.73449450294743e-06, + "loss": 0.0967, + "step": 43080 + }, + { + "epoch": 0.768397959547676, + "grad_norm": 0.21270261704921722, + "learning_rate": 7.733368842714055e-06, + "loss": 0.1388, + "step": 43081 + }, + { + "epoch": 0.7684157956693897, + "grad_norm": 0.27022525668144226, + "learning_rate": 7.73224324941211e-06, + "loss": 0.1109, + "step": 43082 + }, + { + "epoch": 0.7684336317911034, + "grad_norm": 0.20504756271839142, + "learning_rate": 7.731117723045944e-06, + "loss": 0.0809, + "step": 43083 + }, + { + "epoch": 0.768451467912817, + "grad_norm": 0.2651057243347168, + "learning_rate": 7.72999226361994e-06, + "loss": 0.1094, + "step": 43084 + }, + { + "epoch": 0.7684693040345307, + "grad_norm": 0.18808798491954803, + "learning_rate": 7.728866871138448e-06, + "loss": 0.084, + "step": 43085 + }, + { + "epoch": 0.7684871401562444, + "grad_norm": 0.21226200461387634, + "learning_rate": 7.727741545605835e-06, + "loss": 0.1093, + "step": 43086 + }, + { + "epoch": 0.7685049762779581, + "grad_norm": 0.27938905358314514, + "learning_rate": 7.726616287026454e-06, + "loss": 0.1039, + "step": 43087 + }, + { + "epoch": 0.7685228123996718, + "grad_norm": 0.2202521711587906, + "learning_rate": 7.72549109540468e-06, + "loss": 0.0688, + "step": 43088 + }, + { + "epoch": 0.7685406485213855, + "grad_norm": 0.27696171402931213, + "learning_rate": 7.72436597074487e-06, + "loss": 0.1348, + "step": 43089 + }, + { + "epoch": 0.7685584846430992, + "grad_norm": 0.282012939453125, + "learning_rate": 7.723240913051385e-06, + "loss": 0.1008, + "step": 43090 + }, + { + "epoch": 0.7685763207648129, + "grad_norm": 0.31433621048927307, + "learning_rate": 7.722115922328577e-06, + "loss": 0.1134, + "step": 43091 + }, + { + "epoch": 0.7685941568865265, + "grad_norm": 0.352639764547348, + "learning_rate": 7.720990998580823e-06, + "loss": 0.1545, + "step": 43092 + }, + { + "epoch": 0.7686119930082402, + "grad_norm": 0.29540979862213135, + "learning_rate": 7.719866141812468e-06, + "loss": 0.1308, + "step": 43093 + }, + { + "epoch": 0.768629829129954, + "grad_norm": 0.40335536003112793, + "learning_rate": 7.718741352027889e-06, + "loss": 0.1397, + "step": 43094 + }, + { + "epoch": 0.7686476652516677, + "grad_norm": 0.26056864857673645, + "learning_rate": 7.717616629231436e-06, + "loss": 0.1102, + "step": 43095 + }, + { + "epoch": 0.7686655013733814, + "grad_norm": 0.25503790378570557, + "learning_rate": 7.716491973427465e-06, + "loss": 0.1223, + "step": 43096 + }, + { + "epoch": 0.7686833374950951, + "grad_norm": 0.2516988515853882, + "learning_rate": 7.715367384620345e-06, + "loss": 0.1234, + "step": 43097 + }, + { + "epoch": 0.7687011736168088, + "grad_norm": 0.17808571457862854, + "learning_rate": 7.714242862814433e-06, + "loss": 0.0727, + "step": 43098 + }, + { + "epoch": 0.7687190097385225, + "grad_norm": 0.2271338403224945, + "learning_rate": 7.713118408014087e-06, + "loss": 0.0984, + "step": 43099 + }, + { + "epoch": 0.7687368458602362, + "grad_norm": 0.20613867044448853, + "learning_rate": 7.711994020223654e-06, + "loss": 0.1261, + "step": 43100 + }, + { + "epoch": 0.7687546819819499, + "grad_norm": 0.24867239594459534, + "learning_rate": 7.710869699447512e-06, + "loss": 0.1343, + "step": 43101 + }, + { + "epoch": 0.7687725181036635, + "grad_norm": 0.29739993810653687, + "learning_rate": 7.709745445690012e-06, + "loss": 0.1776, + "step": 43102 + }, + { + "epoch": 0.7687903542253772, + "grad_norm": 0.35179978609085083, + "learning_rate": 7.708621258955509e-06, + "loss": 0.1211, + "step": 43103 + }, + { + "epoch": 0.7688081903470909, + "grad_norm": 0.25421440601348877, + "learning_rate": 7.707497139248355e-06, + "loss": 0.0683, + "step": 43104 + }, + { + "epoch": 0.7688260264688046, + "grad_norm": 0.37196245789527893, + "learning_rate": 7.70637308657292e-06, + "loss": 0.1131, + "step": 43105 + }, + { + "epoch": 0.7688438625905183, + "grad_norm": 0.2655698359012604, + "learning_rate": 7.70524910093356e-06, + "loss": 0.1016, + "step": 43106 + }, + { + "epoch": 0.768861698712232, + "grad_norm": 0.3636311888694763, + "learning_rate": 7.704125182334618e-06, + "loss": 0.1231, + "step": 43107 + }, + { + "epoch": 0.7688795348339457, + "grad_norm": 0.2517312467098236, + "learning_rate": 7.703001330780469e-06, + "loss": 0.1123, + "step": 43108 + }, + { + "epoch": 0.7688973709556594, + "grad_norm": 0.297485888004303, + "learning_rate": 7.701877546275451e-06, + "loss": 0.0843, + "step": 43109 + }, + { + "epoch": 0.768915207077373, + "grad_norm": 0.25403669476509094, + "learning_rate": 7.700753828823942e-06, + "loss": 0.162, + "step": 43110 + }, + { + "epoch": 0.7689330431990868, + "grad_norm": 0.274909645318985, + "learning_rate": 7.699630178430284e-06, + "loss": 0.1111, + "step": 43111 + }, + { + "epoch": 0.7689508793208005, + "grad_norm": 0.24657250940799713, + "learning_rate": 7.698506595098834e-06, + "loss": 0.0863, + "step": 43112 + }, + { + "epoch": 0.7689687154425142, + "grad_norm": 0.3488124907016754, + "learning_rate": 7.697383078833941e-06, + "loss": 0.1929, + "step": 43113 + }, + { + "epoch": 0.7689865515642279, + "grad_norm": 0.33446502685546875, + "learning_rate": 7.696259629639977e-06, + "loss": 0.1368, + "step": 43114 + }, + { + "epoch": 0.7690043876859416, + "grad_norm": 0.21536527574062347, + "learning_rate": 7.695136247521285e-06, + "loss": 0.0859, + "step": 43115 + }, + { + "epoch": 0.7690222238076553, + "grad_norm": 0.3449539542198181, + "learning_rate": 7.694012932482223e-06, + "loss": 0.114, + "step": 43116 + }, + { + "epoch": 0.769040059929369, + "grad_norm": 0.30397507548332214, + "learning_rate": 7.692889684527136e-06, + "loss": 0.1737, + "step": 43117 + }, + { + "epoch": 0.7690578960510827, + "grad_norm": 0.30209821462631226, + "learning_rate": 7.691766503660393e-06, + "loss": 0.1453, + "step": 43118 + }, + { + "epoch": 0.7690757321727963, + "grad_norm": 0.28950777649879456, + "learning_rate": 7.690643389886343e-06, + "loss": 0.0836, + "step": 43119 + }, + { + "epoch": 0.76909356829451, + "grad_norm": 0.3058173954486847, + "learning_rate": 7.689520343209327e-06, + "loss": 0.1149, + "step": 43120 + }, + { + "epoch": 0.7691114044162237, + "grad_norm": 0.31864744424819946, + "learning_rate": 7.68839736363372e-06, + "loss": 0.1192, + "step": 43121 + }, + { + "epoch": 0.7691292405379374, + "grad_norm": 0.4027446210384369, + "learning_rate": 7.687274451163853e-06, + "loss": 0.1276, + "step": 43122 + }, + { + "epoch": 0.7691470766596511, + "grad_norm": 0.26255398988723755, + "learning_rate": 7.686151605804102e-06, + "loss": 0.1718, + "step": 43123 + }, + { + "epoch": 0.7691649127813648, + "grad_norm": 0.2799380123615265, + "learning_rate": 7.685028827558804e-06, + "loss": 0.0861, + "step": 43124 + }, + { + "epoch": 0.7691827489030785, + "grad_norm": 0.2313477098941803, + "learning_rate": 7.683906116432316e-06, + "loss": 0.1247, + "step": 43125 + }, + { + "epoch": 0.7692005850247922, + "grad_norm": 0.2404094636440277, + "learning_rate": 7.682783472428981e-06, + "loss": 0.1359, + "step": 43126 + }, + { + "epoch": 0.7692184211465058, + "grad_norm": 0.18121837079524994, + "learning_rate": 7.681660895553163e-06, + "loss": 0.0565, + "step": 43127 + }, + { + "epoch": 0.7692362572682196, + "grad_norm": 0.283416748046875, + "learning_rate": 7.680538385809214e-06, + "loss": 0.114, + "step": 43128 + }, + { + "epoch": 0.7692540933899333, + "grad_norm": 0.2934994399547577, + "learning_rate": 7.679415943201476e-06, + "loss": 0.1048, + "step": 43129 + }, + { + "epoch": 0.769271929511647, + "grad_norm": 0.2772046625614166, + "learning_rate": 7.678293567734299e-06, + "loss": 0.0759, + "step": 43130 + }, + { + "epoch": 0.7692897656333607, + "grad_norm": 0.2413949966430664, + "learning_rate": 7.677171259412047e-06, + "loss": 0.0857, + "step": 43131 + }, + { + "epoch": 0.7693076017550744, + "grad_norm": 0.3355247676372528, + "learning_rate": 7.676049018239059e-06, + "loss": 0.1251, + "step": 43132 + }, + { + "epoch": 0.7693254378767881, + "grad_norm": 0.2650832235813141, + "learning_rate": 7.674926844219693e-06, + "loss": 0.1391, + "step": 43133 + }, + { + "epoch": 0.7693432739985018, + "grad_norm": 0.25431960821151733, + "learning_rate": 7.673804737358285e-06, + "loss": 0.0952, + "step": 43134 + }, + { + "epoch": 0.7693611101202155, + "grad_norm": 0.26480409502983093, + "learning_rate": 7.6726826976592e-06, + "loss": 0.1505, + "step": 43135 + }, + { + "epoch": 0.7693789462419292, + "grad_norm": 0.36791080236434937, + "learning_rate": 7.671560725126775e-06, + "loss": 0.1466, + "step": 43136 + }, + { + "epoch": 0.7693967823636428, + "grad_norm": 0.2084689885377884, + "learning_rate": 7.670438819765372e-06, + "loss": 0.0921, + "step": 43137 + }, + { + "epoch": 0.7694146184853565, + "grad_norm": 0.20457488298416138, + "learning_rate": 7.669316981579335e-06, + "loss": 0.0914, + "step": 43138 + }, + { + "epoch": 0.7694324546070702, + "grad_norm": 0.29849106073379517, + "learning_rate": 7.668195210573004e-06, + "loss": 0.1015, + "step": 43139 + }, + { + "epoch": 0.7694502907287839, + "grad_norm": 0.2425861954689026, + "learning_rate": 7.667073506750741e-06, + "loss": 0.1028, + "step": 43140 + }, + { + "epoch": 0.7694681268504976, + "grad_norm": 0.24990884959697723, + "learning_rate": 7.665951870116889e-06, + "loss": 0.1002, + "step": 43141 + }, + { + "epoch": 0.7694859629722113, + "grad_norm": 0.2745872437953949, + "learning_rate": 7.664830300675793e-06, + "loss": 0.1426, + "step": 43142 + }, + { + "epoch": 0.769503799093925, + "grad_norm": 0.23437552154064178, + "learning_rate": 7.663708798431795e-06, + "loss": 0.0955, + "step": 43143 + }, + { + "epoch": 0.7695216352156388, + "grad_norm": 0.2776063084602356, + "learning_rate": 7.662587363389258e-06, + "loss": 0.0853, + "step": 43144 + }, + { + "epoch": 0.7695394713373525, + "grad_norm": 0.19547830522060394, + "learning_rate": 7.661465995552523e-06, + "loss": 0.0723, + "step": 43145 + }, + { + "epoch": 0.7695573074590661, + "grad_norm": 0.39287278056144714, + "learning_rate": 7.66034469492593e-06, + "loss": 0.1068, + "step": 43146 + }, + { + "epoch": 0.7695751435807798, + "grad_norm": 0.2864287197589874, + "learning_rate": 7.659223461513823e-06, + "loss": 0.1222, + "step": 43147 + }, + { + "epoch": 0.7695929797024935, + "grad_norm": 0.34062066674232483, + "learning_rate": 7.658102295320562e-06, + "loss": 0.1781, + "step": 43148 + }, + { + "epoch": 0.7696108158242072, + "grad_norm": 0.2591913342475891, + "learning_rate": 7.656981196350482e-06, + "loss": 0.1317, + "step": 43149 + }, + { + "epoch": 0.7696286519459209, + "grad_norm": 0.2820344567298889, + "learning_rate": 7.655860164607936e-06, + "loss": 0.1086, + "step": 43150 + }, + { + "epoch": 0.7696464880676346, + "grad_norm": 0.2313556671142578, + "learning_rate": 7.654739200097271e-06, + "loss": 0.1358, + "step": 43151 + }, + { + "epoch": 0.7696643241893483, + "grad_norm": 0.21038925647735596, + "learning_rate": 7.653618302822818e-06, + "loss": 0.0951, + "step": 43152 + }, + { + "epoch": 0.769682160311062, + "grad_norm": 0.26829609274864197, + "learning_rate": 7.65249747278894e-06, + "loss": 0.0967, + "step": 43153 + }, + { + "epoch": 0.7696999964327756, + "grad_norm": 0.2607644498348236, + "learning_rate": 7.651376709999971e-06, + "loss": 0.1132, + "step": 43154 + }, + { + "epoch": 0.7697178325544893, + "grad_norm": 0.24704980850219727, + "learning_rate": 7.65025601446026e-06, + "loss": 0.1189, + "step": 43155 + }, + { + "epoch": 0.769735668676203, + "grad_norm": 0.25660374760627747, + "learning_rate": 7.649135386174142e-06, + "loss": 0.1115, + "step": 43156 + }, + { + "epoch": 0.7697535047979167, + "grad_norm": 0.23838426172733307, + "learning_rate": 7.648014825145974e-06, + "loss": 0.1433, + "step": 43157 + }, + { + "epoch": 0.7697713409196304, + "grad_norm": 0.3446713387966156, + "learning_rate": 7.646894331380097e-06, + "loss": 0.1324, + "step": 43158 + }, + { + "epoch": 0.7697891770413441, + "grad_norm": 0.3234025835990906, + "learning_rate": 7.645773904880849e-06, + "loss": 0.1433, + "step": 43159 + }, + { + "epoch": 0.7698070131630578, + "grad_norm": 0.21624432504177094, + "learning_rate": 7.644653545652575e-06, + "loss": 0.0794, + "step": 43160 + }, + { + "epoch": 0.7698248492847716, + "grad_norm": 0.25194352865219116, + "learning_rate": 7.643533253699611e-06, + "loss": 0.1056, + "step": 43161 + }, + { + "epoch": 0.7698426854064853, + "grad_norm": 0.3037458658218384, + "learning_rate": 7.642413029026314e-06, + "loss": 0.1188, + "step": 43162 + }, + { + "epoch": 0.769860521528199, + "grad_norm": 0.30823564529418945, + "learning_rate": 7.64129287163701e-06, + "loss": 0.1296, + "step": 43163 + }, + { + "epoch": 0.7698783576499126, + "grad_norm": 0.2188183218240738, + "learning_rate": 7.640172781536062e-06, + "loss": 0.1008, + "step": 43164 + }, + { + "epoch": 0.7698961937716263, + "grad_norm": 0.3287068009376526, + "learning_rate": 7.639052758727789e-06, + "loss": 0.1724, + "step": 43165 + }, + { + "epoch": 0.76991402989334, + "grad_norm": 0.24873261153697968, + "learning_rate": 7.637932803216552e-06, + "loss": 0.1291, + "step": 43166 + }, + { + "epoch": 0.7699318660150537, + "grad_norm": 0.3201192319393158, + "learning_rate": 7.636812915006686e-06, + "loss": 0.1184, + "step": 43167 + }, + { + "epoch": 0.7699497021367674, + "grad_norm": 0.25319814682006836, + "learning_rate": 7.635693094102528e-06, + "loss": 0.1361, + "step": 43168 + }, + { + "epoch": 0.7699675382584811, + "grad_norm": 0.28412410616874695, + "learning_rate": 7.634573340508413e-06, + "loss": 0.109, + "step": 43169 + }, + { + "epoch": 0.7699853743801948, + "grad_norm": 0.23510943353176117, + "learning_rate": 7.633453654228701e-06, + "loss": 0.1, + "step": 43170 + }, + { + "epoch": 0.7700032105019085, + "grad_norm": 0.3590015769004822, + "learning_rate": 7.632334035267719e-06, + "loss": 0.1351, + "step": 43171 + }, + { + "epoch": 0.7700210466236221, + "grad_norm": 0.2735082805156708, + "learning_rate": 7.631214483629806e-06, + "loss": 0.1187, + "step": 43172 + }, + { + "epoch": 0.7700388827453358, + "grad_norm": 0.36123690009117126, + "learning_rate": 7.630094999319309e-06, + "loss": 0.0946, + "step": 43173 + }, + { + "epoch": 0.7700567188670495, + "grad_norm": 0.3469754755496979, + "learning_rate": 7.628975582340553e-06, + "loss": 0.1557, + "step": 43174 + }, + { + "epoch": 0.7700745549887632, + "grad_norm": 0.2728753387928009, + "learning_rate": 7.627856232697894e-06, + "loss": 0.1065, + "step": 43175 + }, + { + "epoch": 0.7700923911104769, + "grad_norm": 0.4476653039455414, + "learning_rate": 7.626736950395661e-06, + "loss": 0.121, + "step": 43176 + }, + { + "epoch": 0.7701102272321906, + "grad_norm": 0.19168226420879364, + "learning_rate": 7.625617735438198e-06, + "loss": 0.1099, + "step": 43177 + }, + { + "epoch": 0.7701280633539044, + "grad_norm": 0.2618299424648285, + "learning_rate": 7.624498587829837e-06, + "loss": 0.1458, + "step": 43178 + }, + { + "epoch": 0.7701458994756181, + "grad_norm": 0.3376937210559845, + "learning_rate": 7.6233795075749296e-06, + "loss": 0.1014, + "step": 43179 + }, + { + "epoch": 0.7701637355973318, + "grad_norm": 0.3775985538959503, + "learning_rate": 7.622260494677805e-06, + "loss": 0.1242, + "step": 43180 + }, + { + "epoch": 0.7701815717190454, + "grad_norm": 0.22356335818767548, + "learning_rate": 7.621141549142799e-06, + "loss": 0.1148, + "step": 43181 + }, + { + "epoch": 0.7701994078407591, + "grad_norm": 0.24491247534751892, + "learning_rate": 7.620022670974241e-06, + "loss": 0.0939, + "step": 43182 + }, + { + "epoch": 0.7702172439624728, + "grad_norm": 0.2010246217250824, + "learning_rate": 7.61890386017649e-06, + "loss": 0.0719, + "step": 43183 + }, + { + "epoch": 0.7702350800841865, + "grad_norm": 0.30959445238113403, + "learning_rate": 7.61778511675387e-06, + "loss": 0.1612, + "step": 43184 + }, + { + "epoch": 0.7702529162059002, + "grad_norm": 0.2769760191440582, + "learning_rate": 7.616666440710718e-06, + "loss": 0.1484, + "step": 43185 + }, + { + "epoch": 0.7702707523276139, + "grad_norm": 0.25192350149154663, + "learning_rate": 7.61554783205137e-06, + "loss": 0.1144, + "step": 43186 + }, + { + "epoch": 0.7702885884493276, + "grad_norm": 0.22268660366535187, + "learning_rate": 7.614429290780156e-06, + "loss": 0.1207, + "step": 43187 + }, + { + "epoch": 0.7703064245710413, + "grad_norm": 0.24352645874023438, + "learning_rate": 7.613310816901426e-06, + "loss": 0.0927, + "step": 43188 + }, + { + "epoch": 0.770324260692755, + "grad_norm": 0.2507781684398651, + "learning_rate": 7.612192410419508e-06, + "loss": 0.117, + "step": 43189 + }, + { + "epoch": 0.7703420968144686, + "grad_norm": 0.21125119924545288, + "learning_rate": 7.6110740713387305e-06, + "loss": 0.1131, + "step": 43190 + }, + { + "epoch": 0.7703599329361823, + "grad_norm": 0.24588696658611298, + "learning_rate": 7.609955799663446e-06, + "loss": 0.1305, + "step": 43191 + }, + { + "epoch": 0.770377769057896, + "grad_norm": 0.14450868964195251, + "learning_rate": 7.608837595397969e-06, + "loss": 0.0344, + "step": 43192 + }, + { + "epoch": 0.7703956051796097, + "grad_norm": 0.3037028908729553, + "learning_rate": 7.607719458546652e-06, + "loss": 0.1346, + "step": 43193 + }, + { + "epoch": 0.7704134413013234, + "grad_norm": 0.2455395758152008, + "learning_rate": 7.606601389113821e-06, + "loss": 0.1241, + "step": 43194 + }, + { + "epoch": 0.7704312774230372, + "grad_norm": 0.30561378598213196, + "learning_rate": 7.605483387103804e-06, + "loss": 0.066, + "step": 43195 + }, + { + "epoch": 0.7704491135447509, + "grad_norm": 0.27732619643211365, + "learning_rate": 7.604365452520948e-06, + "loss": 0.1044, + "step": 43196 + }, + { + "epoch": 0.7704669496664646, + "grad_norm": 0.35555416345596313, + "learning_rate": 7.60324758536958e-06, + "loss": 0.126, + "step": 43197 + }, + { + "epoch": 0.7704847857881783, + "grad_norm": 0.30437901616096497, + "learning_rate": 7.602129785654036e-06, + "loss": 0.1546, + "step": 43198 + }, + { + "epoch": 0.7705026219098919, + "grad_norm": 0.27484020590782166, + "learning_rate": 7.6010120533786425e-06, + "loss": 0.1836, + "step": 43199 + }, + { + "epoch": 0.7705204580316056, + "grad_norm": 0.21924909949302673, + "learning_rate": 7.599894388547729e-06, + "loss": 0.1163, + "step": 43200 + }, + { + "epoch": 0.7705382941533193, + "grad_norm": 0.25036191940307617, + "learning_rate": 7.598776791165641e-06, + "loss": 0.0674, + "step": 43201 + }, + { + "epoch": 0.770556130275033, + "grad_norm": 0.2570417523384094, + "learning_rate": 7.5976592612367055e-06, + "loss": 0.1593, + "step": 43202 + }, + { + "epoch": 0.7705739663967467, + "grad_norm": 0.2015930712223053, + "learning_rate": 7.596541798765247e-06, + "loss": 0.122, + "step": 43203 + }, + { + "epoch": 0.7705918025184604, + "grad_norm": 0.3157704770565033, + "learning_rate": 7.5954244037556074e-06, + "loss": 0.1177, + "step": 43204 + }, + { + "epoch": 0.7706096386401741, + "grad_norm": 0.28714025020599365, + "learning_rate": 7.594307076212109e-06, + "loss": 0.1428, + "step": 43205 + }, + { + "epoch": 0.7706274747618878, + "grad_norm": 0.2482876032590866, + "learning_rate": 7.593189816139095e-06, + "loss": 0.0852, + "step": 43206 + }, + { + "epoch": 0.7706453108836014, + "grad_norm": 0.23907610774040222, + "learning_rate": 7.5920726235408915e-06, + "loss": 0.1202, + "step": 43207 + }, + { + "epoch": 0.7706631470053151, + "grad_norm": 0.24861156940460205, + "learning_rate": 7.590955498421817e-06, + "loss": 0.1075, + "step": 43208 + }, + { + "epoch": 0.7706809831270288, + "grad_norm": 0.3839176595211029, + "learning_rate": 7.5898384407862205e-06, + "loss": 0.1088, + "step": 43209 + }, + { + "epoch": 0.7706988192487425, + "grad_norm": 0.5693812966346741, + "learning_rate": 7.588721450638425e-06, + "loss": 0.1514, + "step": 43210 + }, + { + "epoch": 0.7707166553704562, + "grad_norm": 0.2105524241924286, + "learning_rate": 7.587604527982756e-06, + "loss": 0.1074, + "step": 43211 + }, + { + "epoch": 0.77073449149217, + "grad_norm": 0.24127443134784698, + "learning_rate": 7.5864876728235476e-06, + "loss": 0.141, + "step": 43212 + }, + { + "epoch": 0.7707523276138837, + "grad_norm": 0.4091591238975525, + "learning_rate": 7.585370885165119e-06, + "loss": 0.1191, + "step": 43213 + }, + { + "epoch": 0.7707701637355974, + "grad_norm": 0.27039700746536255, + "learning_rate": 7.584254165011817e-06, + "loss": 0.1024, + "step": 43214 + }, + { + "epoch": 0.7707879998573111, + "grad_norm": 0.2048932909965515, + "learning_rate": 7.583137512367958e-06, + "loss": 0.1047, + "step": 43215 + }, + { + "epoch": 0.7708058359790247, + "grad_norm": 0.2856503129005432, + "learning_rate": 7.582020927237876e-06, + "loss": 0.1115, + "step": 43216 + }, + { + "epoch": 0.7708236721007384, + "grad_norm": 0.20236904919147491, + "learning_rate": 7.580904409625889e-06, + "loss": 0.1033, + "step": 43217 + }, + { + "epoch": 0.7708415082224521, + "grad_norm": 0.2524934411048889, + "learning_rate": 7.579787959536339e-06, + "loss": 0.1336, + "step": 43218 + }, + { + "epoch": 0.7708593443441658, + "grad_norm": 0.22171279788017273, + "learning_rate": 7.578671576973539e-06, + "loss": 0.0897, + "step": 43219 + }, + { + "epoch": 0.7708771804658795, + "grad_norm": 0.35861000418663025, + "learning_rate": 7.5775552619418374e-06, + "loss": 0.147, + "step": 43220 + }, + { + "epoch": 0.7708950165875932, + "grad_norm": 0.2904147207736969, + "learning_rate": 7.576439014445538e-06, + "loss": 0.0885, + "step": 43221 + }, + { + "epoch": 0.7709128527093069, + "grad_norm": 0.2880682647228241, + "learning_rate": 7.575322834488988e-06, + "loss": 0.1041, + "step": 43222 + }, + { + "epoch": 0.7709306888310206, + "grad_norm": 0.24455687403678894, + "learning_rate": 7.574206722076505e-06, + "loss": 0.0799, + "step": 43223 + }, + { + "epoch": 0.7709485249527342, + "grad_norm": 0.24320228397846222, + "learning_rate": 7.573090677212413e-06, + "loss": 0.1201, + "step": 43224 + }, + { + "epoch": 0.7709663610744479, + "grad_norm": 0.23077481985092163, + "learning_rate": 7.5719746999010435e-06, + "loss": 0.0725, + "step": 43225 + }, + { + "epoch": 0.7709841971961616, + "grad_norm": 0.24306534230709076, + "learning_rate": 7.570858790146709e-06, + "loss": 0.1028, + "step": 43226 + }, + { + "epoch": 0.7710020333178753, + "grad_norm": 0.24609504640102386, + "learning_rate": 7.569742947953756e-06, + "loss": 0.1243, + "step": 43227 + }, + { + "epoch": 0.771019869439589, + "grad_norm": 0.3128361701965332, + "learning_rate": 7.568627173326498e-06, + "loss": 0.1152, + "step": 43228 + }, + { + "epoch": 0.7710377055613028, + "grad_norm": 0.27013352513313293, + "learning_rate": 7.5675114662692615e-06, + "loss": 0.1194, + "step": 43229 + }, + { + "epoch": 0.7710555416830165, + "grad_norm": 0.2624155879020691, + "learning_rate": 7.566395826786366e-06, + "loss": 0.14, + "step": 43230 + }, + { + "epoch": 0.7710733778047302, + "grad_norm": 0.2880672216415405, + "learning_rate": 7.565280254882148e-06, + "loss": 0.0942, + "step": 43231 + }, + { + "epoch": 0.7710912139264439, + "grad_norm": 0.23503975570201874, + "learning_rate": 7.564164750560918e-06, + "loss": 0.1155, + "step": 43232 + }, + { + "epoch": 0.7711090500481576, + "grad_norm": 0.22663715481758118, + "learning_rate": 7.563049313827014e-06, + "loss": 0.1041, + "step": 43233 + }, + { + "epoch": 0.7711268861698712, + "grad_norm": 0.27420687675476074, + "learning_rate": 7.5619339446847455e-06, + "loss": 0.0901, + "step": 43234 + }, + { + "epoch": 0.7711447222915849, + "grad_norm": 0.26121965050697327, + "learning_rate": 7.560818643138454e-06, + "loss": 0.0618, + "step": 43235 + }, + { + "epoch": 0.7711625584132986, + "grad_norm": 0.20403441786766052, + "learning_rate": 7.559703409192451e-06, + "loss": 0.0931, + "step": 43236 + }, + { + "epoch": 0.7711803945350123, + "grad_norm": 0.3251589238643646, + "learning_rate": 7.558588242851061e-06, + "loss": 0.1346, + "step": 43237 + }, + { + "epoch": 0.771198230656726, + "grad_norm": 0.2036709189414978, + "learning_rate": 7.55747314411861e-06, + "loss": 0.1236, + "step": 43238 + }, + { + "epoch": 0.7712160667784397, + "grad_norm": 0.32432326674461365, + "learning_rate": 7.556358112999406e-06, + "loss": 0.1552, + "step": 43239 + }, + { + "epoch": 0.7712339029001534, + "grad_norm": 0.21016792953014374, + "learning_rate": 7.555243149497793e-06, + "loss": 0.0719, + "step": 43240 + }, + { + "epoch": 0.771251739021867, + "grad_norm": 0.22607794404029846, + "learning_rate": 7.554128253618081e-06, + "loss": 0.1221, + "step": 43241 + }, + { + "epoch": 0.7712695751435807, + "grad_norm": 0.3194332718849182, + "learning_rate": 7.553013425364594e-06, + "loss": 0.1217, + "step": 43242 + }, + { + "epoch": 0.7712874112652944, + "grad_norm": 0.2569514811038971, + "learning_rate": 7.551898664741647e-06, + "loss": 0.1416, + "step": 43243 + }, + { + "epoch": 0.7713052473870081, + "grad_norm": 0.20603618025779724, + "learning_rate": 7.550783971753572e-06, + "loss": 0.1075, + "step": 43244 + }, + { + "epoch": 0.7713230835087219, + "grad_norm": 0.2324785590171814, + "learning_rate": 7.549669346404689e-06, + "loss": 0.1219, + "step": 43245 + }, + { + "epoch": 0.7713409196304356, + "grad_norm": 0.3243198096752167, + "learning_rate": 7.548554788699303e-06, + "loss": 0.1504, + "step": 43246 + }, + { + "epoch": 0.7713587557521493, + "grad_norm": 0.2696075141429901, + "learning_rate": 7.547440298641747e-06, + "loss": 0.1098, + "step": 43247 + }, + { + "epoch": 0.771376591873863, + "grad_norm": 0.28341400623321533, + "learning_rate": 7.546325876236351e-06, + "loss": 0.1112, + "step": 43248 + }, + { + "epoch": 0.7713944279955767, + "grad_norm": 0.30703896284103394, + "learning_rate": 7.545211521487422e-06, + "loss": 0.1266, + "step": 43249 + }, + { + "epoch": 0.7714122641172904, + "grad_norm": 0.2016613781452179, + "learning_rate": 7.5440972343992806e-06, + "loss": 0.08, + "step": 43250 + }, + { + "epoch": 0.771430100239004, + "grad_norm": 0.22971948981285095, + "learning_rate": 7.542983014976249e-06, + "loss": 0.1181, + "step": 43251 + }, + { + "epoch": 0.7714479363607177, + "grad_norm": 0.2425071746110916, + "learning_rate": 7.541868863222637e-06, + "loss": 0.1297, + "step": 43252 + }, + { + "epoch": 0.7714657724824314, + "grad_norm": 0.29377493262290955, + "learning_rate": 7.540754779142778e-06, + "loss": 0.1328, + "step": 43253 + }, + { + "epoch": 0.7714836086041451, + "grad_norm": 0.3360605537891388, + "learning_rate": 7.5396407627409845e-06, + "loss": 0.1367, + "step": 43254 + }, + { + "epoch": 0.7715014447258588, + "grad_norm": 0.25270095467567444, + "learning_rate": 7.538526814021574e-06, + "loss": 0.1148, + "step": 43255 + }, + { + "epoch": 0.7715192808475725, + "grad_norm": 0.26664939522743225, + "learning_rate": 7.5374129329888575e-06, + "loss": 0.1144, + "step": 43256 + }, + { + "epoch": 0.7715371169692862, + "grad_norm": 0.27126747369766235, + "learning_rate": 7.536299119647166e-06, + "loss": 0.107, + "step": 43257 + }, + { + "epoch": 0.7715549530909999, + "grad_norm": 0.2711648643016815, + "learning_rate": 7.535185374000811e-06, + "loss": 0.1216, + "step": 43258 + }, + { + "epoch": 0.7715727892127136, + "grad_norm": 0.2582990527153015, + "learning_rate": 7.5340716960541045e-06, + "loss": 0.1104, + "step": 43259 + }, + { + "epoch": 0.7715906253344272, + "grad_norm": 0.20992511510849, + "learning_rate": 7.532958085811373e-06, + "loss": 0.1147, + "step": 43260 + }, + { + "epoch": 0.7716084614561409, + "grad_norm": 0.24704952538013458, + "learning_rate": 7.531844543276922e-06, + "loss": 0.0846, + "step": 43261 + }, + { + "epoch": 0.7716262975778547, + "grad_norm": 0.2913043200969696, + "learning_rate": 7.530731068455086e-06, + "loss": 0.1163, + "step": 43262 + }, + { + "epoch": 0.7716441336995684, + "grad_norm": 0.26963940262794495, + "learning_rate": 7.529617661350166e-06, + "loss": 0.1304, + "step": 43263 + }, + { + "epoch": 0.7716619698212821, + "grad_norm": 0.24435272812843323, + "learning_rate": 7.5285043219664814e-06, + "loss": 0.1599, + "step": 43264 + }, + { + "epoch": 0.7716798059429958, + "grad_norm": 0.2214638590812683, + "learning_rate": 7.527391050308344e-06, + "loss": 0.0947, + "step": 43265 + }, + { + "epoch": 0.7716976420647095, + "grad_norm": 0.3008866608142853, + "learning_rate": 7.526277846380081e-06, + "loss": 0.1034, + "step": 43266 + }, + { + "epoch": 0.7717154781864232, + "grad_norm": 0.32285189628601074, + "learning_rate": 7.525164710185997e-06, + "loss": 0.1422, + "step": 43267 + }, + { + "epoch": 0.7717333143081369, + "grad_norm": 0.225494846701622, + "learning_rate": 7.524051641730415e-06, + "loss": 0.111, + "step": 43268 + }, + { + "epoch": 0.7717511504298505, + "grad_norm": 0.23391669988632202, + "learning_rate": 7.522938641017632e-06, + "loss": 0.0798, + "step": 43269 + }, + { + "epoch": 0.7717689865515642, + "grad_norm": 0.251701295375824, + "learning_rate": 7.5218257080519886e-06, + "loss": 0.1292, + "step": 43270 + }, + { + "epoch": 0.7717868226732779, + "grad_norm": 0.260171115398407, + "learning_rate": 7.520712842837782e-06, + "loss": 0.1339, + "step": 43271 + }, + { + "epoch": 0.7718046587949916, + "grad_norm": 0.25056910514831543, + "learning_rate": 7.519600045379329e-06, + "loss": 0.0967, + "step": 43272 + }, + { + "epoch": 0.7718224949167053, + "grad_norm": 0.3433469235897064, + "learning_rate": 7.518487315680936e-06, + "loss": 0.167, + "step": 43273 + }, + { + "epoch": 0.771840331038419, + "grad_norm": 0.41224896907806396, + "learning_rate": 7.517374653746925e-06, + "loss": 0.2071, + "step": 43274 + }, + { + "epoch": 0.7718581671601327, + "grad_norm": 0.2244434356689453, + "learning_rate": 7.516262059581616e-06, + "loss": 0.109, + "step": 43275 + }, + { + "epoch": 0.7718760032818464, + "grad_norm": 0.24160553514957428, + "learning_rate": 7.515149533189317e-06, + "loss": 0.114, + "step": 43276 + }, + { + "epoch": 0.77189383940356, + "grad_norm": 0.2449491024017334, + "learning_rate": 7.514037074574334e-06, + "loss": 0.0924, + "step": 43277 + }, + { + "epoch": 0.7719116755252737, + "grad_norm": 0.3116455078125, + "learning_rate": 7.512924683740974e-06, + "loss": 0.1561, + "step": 43278 + }, + { + "epoch": 0.7719295116469875, + "grad_norm": 0.4128992557525635, + "learning_rate": 7.511812360693568e-06, + "loss": 0.1172, + "step": 43279 + }, + { + "epoch": 0.7719473477687012, + "grad_norm": 0.2890704572200775, + "learning_rate": 7.5107001054364166e-06, + "loss": 0.1643, + "step": 43280 + }, + { + "epoch": 0.7719651838904149, + "grad_norm": 0.3013269305229187, + "learning_rate": 7.509587917973831e-06, + "loss": 0.0879, + "step": 43281 + }, + { + "epoch": 0.7719830200121286, + "grad_norm": 0.21916384994983673, + "learning_rate": 7.508475798310119e-06, + "loss": 0.0973, + "step": 43282 + }, + { + "epoch": 0.7720008561338423, + "grad_norm": 0.20679941773414612, + "learning_rate": 7.5073637464496e-06, + "loss": 0.1153, + "step": 43283 + }, + { + "epoch": 0.772018692255556, + "grad_norm": 0.23694542050361633, + "learning_rate": 7.506251762396585e-06, + "loss": 0.0857, + "step": 43284 + }, + { + "epoch": 0.7720365283772697, + "grad_norm": 0.23543240129947662, + "learning_rate": 7.505139846155377e-06, + "loss": 0.0868, + "step": 43285 + }, + { + "epoch": 0.7720543644989833, + "grad_norm": 0.23788268864154816, + "learning_rate": 7.504027997730284e-06, + "loss": 0.0977, + "step": 43286 + }, + { + "epoch": 0.772072200620697, + "grad_norm": 0.29897359013557434, + "learning_rate": 7.502916217125627e-06, + "loss": 0.0838, + "step": 43287 + }, + { + "epoch": 0.7720900367424107, + "grad_norm": 0.22732172906398773, + "learning_rate": 7.501804504345702e-06, + "loss": 0.1252, + "step": 43288 + }, + { + "epoch": 0.7721078728641244, + "grad_norm": 0.24480777978897095, + "learning_rate": 7.5006928593948364e-06, + "loss": 0.0812, + "step": 43289 + }, + { + "epoch": 0.7721257089858381, + "grad_norm": 0.2581625282764435, + "learning_rate": 7.499581282277329e-06, + "loss": 0.1317, + "step": 43290 + }, + { + "epoch": 0.7721435451075518, + "grad_norm": 0.29986411333084106, + "learning_rate": 7.49846977299748e-06, + "loss": 0.124, + "step": 43291 + }, + { + "epoch": 0.7721613812292655, + "grad_norm": 0.27072200179100037, + "learning_rate": 7.4973583315596145e-06, + "loss": 0.1195, + "step": 43292 + }, + { + "epoch": 0.7721792173509792, + "grad_norm": 0.266581654548645, + "learning_rate": 7.496246957968037e-06, + "loss": 0.118, + "step": 43293 + }, + { + "epoch": 0.7721970534726929, + "grad_norm": 0.23910745978355408, + "learning_rate": 7.495135652227048e-06, + "loss": 0.1144, + "step": 43294 + }, + { + "epoch": 0.7722148895944065, + "grad_norm": 0.31104573607444763, + "learning_rate": 7.494024414340952e-06, + "loss": 0.1925, + "step": 43295 + }, + { + "epoch": 0.7722327257161203, + "grad_norm": 0.23937399685382843, + "learning_rate": 7.492913244314073e-06, + "loss": 0.1161, + "step": 43296 + }, + { + "epoch": 0.772250561837834, + "grad_norm": 0.334472119808197, + "learning_rate": 7.491802142150709e-06, + "loss": 0.1006, + "step": 43297 + }, + { + "epoch": 0.7722683979595477, + "grad_norm": 0.25423452258110046, + "learning_rate": 7.490691107855166e-06, + "loss": 0.1052, + "step": 43298 + }, + { + "epoch": 0.7722862340812614, + "grad_norm": 0.20954209566116333, + "learning_rate": 7.489580141431743e-06, + "loss": 0.1136, + "step": 43299 + }, + { + "epoch": 0.7723040702029751, + "grad_norm": 0.2716667950153351, + "learning_rate": 7.488469242884766e-06, + "loss": 0.1223, + "step": 43300 + }, + { + "epoch": 0.7723219063246888, + "grad_norm": 0.2741115987300873, + "learning_rate": 7.48735841221852e-06, + "loss": 0.1086, + "step": 43301 + }, + { + "epoch": 0.7723397424464025, + "grad_norm": 0.36933889985084534, + "learning_rate": 7.486247649437331e-06, + "loss": 0.1531, + "step": 43302 + }, + { + "epoch": 0.7723575785681162, + "grad_norm": 0.2792014181613922, + "learning_rate": 7.485136954545493e-06, + "loss": 0.0814, + "step": 43303 + }, + { + "epoch": 0.7723754146898298, + "grad_norm": 0.23467305302619934, + "learning_rate": 7.484026327547308e-06, + "loss": 0.0986, + "step": 43304 + }, + { + "epoch": 0.7723932508115435, + "grad_norm": 0.3154553771018982, + "learning_rate": 7.482915768447093e-06, + "loss": 0.1199, + "step": 43305 + }, + { + "epoch": 0.7724110869332572, + "grad_norm": 0.2857285141944885, + "learning_rate": 7.4818052772491485e-06, + "loss": 0.1053, + "step": 43306 + }, + { + "epoch": 0.7724289230549709, + "grad_norm": 0.24907024204730988, + "learning_rate": 7.480694853957776e-06, + "loss": 0.112, + "step": 43307 + }, + { + "epoch": 0.7724467591766846, + "grad_norm": 0.2875986397266388, + "learning_rate": 7.479584498577271e-06, + "loss": 0.1438, + "step": 43308 + }, + { + "epoch": 0.7724645952983983, + "grad_norm": 0.3144265115261078, + "learning_rate": 7.47847421111196e-06, + "loss": 0.1411, + "step": 43309 + }, + { + "epoch": 0.772482431420112, + "grad_norm": 0.3387351334095001, + "learning_rate": 7.477363991566133e-06, + "loss": 0.1124, + "step": 43310 + }, + { + "epoch": 0.7725002675418257, + "grad_norm": 0.33802229166030884, + "learning_rate": 7.476253839944094e-06, + "loss": 0.1452, + "step": 43311 + }, + { + "epoch": 0.7725181036635393, + "grad_norm": 0.40700623393058777, + "learning_rate": 7.475143756250141e-06, + "loss": 0.1813, + "step": 43312 + }, + { + "epoch": 0.7725359397852531, + "grad_norm": 0.2382963001728058, + "learning_rate": 7.474033740488592e-06, + "loss": 0.1078, + "step": 43313 + }, + { + "epoch": 0.7725537759069668, + "grad_norm": 0.2745089530944824, + "learning_rate": 7.472923792663741e-06, + "loss": 0.1365, + "step": 43314 + }, + { + "epoch": 0.7725716120286805, + "grad_norm": 0.21949723362922668, + "learning_rate": 7.471813912779885e-06, + "loss": 0.1021, + "step": 43315 + }, + { + "epoch": 0.7725894481503942, + "grad_norm": 0.22324372828006744, + "learning_rate": 7.470704100841339e-06, + "loss": 0.0851, + "step": 43316 + }, + { + "epoch": 0.7726072842721079, + "grad_norm": 0.2681769132614136, + "learning_rate": 7.469594356852389e-06, + "loss": 0.1117, + "step": 43317 + }, + { + "epoch": 0.7726251203938216, + "grad_norm": 0.3031485974788666, + "learning_rate": 7.4684846808173555e-06, + "loss": 0.1148, + "step": 43318 + }, + { + "epoch": 0.7726429565155353, + "grad_norm": 0.36637088656425476, + "learning_rate": 7.467375072740529e-06, + "loss": 0.1237, + "step": 43319 + }, + { + "epoch": 0.772660792637249, + "grad_norm": 0.30085834860801697, + "learning_rate": 7.466265532626216e-06, + "loss": 0.142, + "step": 43320 + }, + { + "epoch": 0.7726786287589626, + "grad_norm": 0.3427233397960663, + "learning_rate": 7.465156060478701e-06, + "loss": 0.1475, + "step": 43321 + }, + { + "epoch": 0.7726964648806763, + "grad_norm": 0.3068648874759674, + "learning_rate": 7.464046656302307e-06, + "loss": 0.1164, + "step": 43322 + }, + { + "epoch": 0.77271430100239, + "grad_norm": 0.30954113602638245, + "learning_rate": 7.462937320101326e-06, + "loss": 0.0899, + "step": 43323 + }, + { + "epoch": 0.7727321371241037, + "grad_norm": 0.2714322805404663, + "learning_rate": 7.461828051880057e-06, + "loss": 0.1324, + "step": 43324 + }, + { + "epoch": 0.7727499732458174, + "grad_norm": 0.3573809862136841, + "learning_rate": 7.460718851642792e-06, + "loss": 0.1385, + "step": 43325 + }, + { + "epoch": 0.7727678093675311, + "grad_norm": 0.2611391544342041, + "learning_rate": 7.459609719393845e-06, + "loss": 0.0889, + "step": 43326 + }, + { + "epoch": 0.7727856454892448, + "grad_norm": 0.3394410312175751, + "learning_rate": 7.45850065513751e-06, + "loss": 0.1398, + "step": 43327 + }, + { + "epoch": 0.7728034816109585, + "grad_norm": 0.22016894817352295, + "learning_rate": 7.457391658878077e-06, + "loss": 0.0814, + "step": 43328 + }, + { + "epoch": 0.7728213177326722, + "grad_norm": 0.2938532531261444, + "learning_rate": 7.456282730619862e-06, + "loss": 0.1208, + "step": 43329 + }, + { + "epoch": 0.772839153854386, + "grad_norm": 0.2880353331565857, + "learning_rate": 7.4551738703671435e-06, + "loss": 0.1507, + "step": 43330 + }, + { + "epoch": 0.7728569899760996, + "grad_norm": 0.2728100121021271, + "learning_rate": 7.454065078124242e-06, + "loss": 0.0925, + "step": 43331 + }, + { + "epoch": 0.7728748260978133, + "grad_norm": 0.2868330478668213, + "learning_rate": 7.4529563538954434e-06, + "loss": 0.1747, + "step": 43332 + }, + { + "epoch": 0.772892662219527, + "grad_norm": 0.2272026687860489, + "learning_rate": 7.4518476976850464e-06, + "loss": 0.0701, + "step": 43333 + }, + { + "epoch": 0.7729104983412407, + "grad_norm": 0.25956249237060547, + "learning_rate": 7.45073910949734e-06, + "loss": 0.2031, + "step": 43334 + }, + { + "epoch": 0.7729283344629544, + "grad_norm": 0.20929618179798126, + "learning_rate": 7.449630589336639e-06, + "loss": 0.072, + "step": 43335 + }, + { + "epoch": 0.7729461705846681, + "grad_norm": 0.25738954544067383, + "learning_rate": 7.448522137207234e-06, + "loss": 0.0897, + "step": 43336 + }, + { + "epoch": 0.7729640067063818, + "grad_norm": 0.2278204709291458, + "learning_rate": 7.447413753113416e-06, + "loss": 0.1022, + "step": 43337 + }, + { + "epoch": 0.7729818428280955, + "grad_norm": 0.24706627428531647, + "learning_rate": 7.446305437059478e-06, + "loss": 0.1517, + "step": 43338 + }, + { + "epoch": 0.7729996789498091, + "grad_norm": 0.23114341497421265, + "learning_rate": 7.4451971890497295e-06, + "loss": 0.0803, + "step": 43339 + }, + { + "epoch": 0.7730175150715228, + "grad_norm": 0.23727920651435852, + "learning_rate": 7.4440890090884614e-06, + "loss": 0.1021, + "step": 43340 + }, + { + "epoch": 0.7730353511932365, + "grad_norm": 0.31540319323539734, + "learning_rate": 7.442980897179966e-06, + "loss": 0.1309, + "step": 43341 + }, + { + "epoch": 0.7730531873149502, + "grad_norm": 0.22448860108852386, + "learning_rate": 7.441872853328536e-06, + "loss": 0.11, + "step": 43342 + }, + { + "epoch": 0.7730710234366639, + "grad_norm": 0.26633214950561523, + "learning_rate": 7.440764877538475e-06, + "loss": 0.0981, + "step": 43343 + }, + { + "epoch": 0.7730888595583776, + "grad_norm": 0.279723197221756, + "learning_rate": 7.439656969814068e-06, + "loss": 0.165, + "step": 43344 + }, + { + "epoch": 0.7731066956800913, + "grad_norm": 0.24724264442920685, + "learning_rate": 7.438549130159625e-06, + "loss": 0.092, + "step": 43345 + }, + { + "epoch": 0.7731245318018051, + "grad_norm": 0.27358344197273254, + "learning_rate": 7.43744135857943e-06, + "loss": 0.0858, + "step": 43346 + }, + { + "epoch": 0.7731423679235188, + "grad_norm": 0.2298402488231659, + "learning_rate": 7.43633365507777e-06, + "loss": 0.1045, + "step": 43347 + }, + { + "epoch": 0.7731602040452324, + "grad_norm": 0.30903834104537964, + "learning_rate": 7.435226019658956e-06, + "loss": 0.1148, + "step": 43348 + }, + { + "epoch": 0.7731780401669461, + "grad_norm": 0.2901347875595093, + "learning_rate": 7.434118452327274e-06, + "loss": 0.133, + "step": 43349 + }, + { + "epoch": 0.7731958762886598, + "grad_norm": 0.2721090614795685, + "learning_rate": 7.433010953087013e-06, + "loss": 0.1042, + "step": 43350 + }, + { + "epoch": 0.7732137124103735, + "grad_norm": 0.31019127368927, + "learning_rate": 7.4319035219424625e-06, + "loss": 0.1364, + "step": 43351 + }, + { + "epoch": 0.7732315485320872, + "grad_norm": 0.19999803602695465, + "learning_rate": 7.430796158897929e-06, + "loss": 0.0897, + "step": 43352 + }, + { + "epoch": 0.7732493846538009, + "grad_norm": 0.20183596014976501, + "learning_rate": 7.429688863957698e-06, + "loss": 0.0978, + "step": 43353 + }, + { + "epoch": 0.7732672207755146, + "grad_norm": 0.2506796717643738, + "learning_rate": 7.428581637126061e-06, + "loss": 0.126, + "step": 43354 + }, + { + "epoch": 0.7732850568972283, + "grad_norm": 0.3142896294593811, + "learning_rate": 7.427474478407304e-06, + "loss": 0.1584, + "step": 43355 + }, + { + "epoch": 0.773302893018942, + "grad_norm": 0.28961101174354553, + "learning_rate": 7.426367387805733e-06, + "loss": 0.1144, + "step": 43356 + }, + { + "epoch": 0.7733207291406556, + "grad_norm": 0.5187827348709106, + "learning_rate": 7.425260365325623e-06, + "loss": 0.1032, + "step": 43357 + }, + { + "epoch": 0.7733385652623693, + "grad_norm": 0.43283024430274963, + "learning_rate": 7.424153410971283e-06, + "loss": 0.1058, + "step": 43358 + }, + { + "epoch": 0.773356401384083, + "grad_norm": 0.27138620615005493, + "learning_rate": 7.423046524746993e-06, + "loss": 0.094, + "step": 43359 + }, + { + "epoch": 0.7733742375057967, + "grad_norm": 0.2884177267551422, + "learning_rate": 7.421939706657038e-06, + "loss": 0.1431, + "step": 43360 + }, + { + "epoch": 0.7733920736275104, + "grad_norm": 0.2897762060165405, + "learning_rate": 7.420832956705726e-06, + "loss": 0.1115, + "step": 43361 + }, + { + "epoch": 0.7734099097492241, + "grad_norm": 0.2874641418457031, + "learning_rate": 7.4197262748973345e-06, + "loss": 0.133, + "step": 43362 + }, + { + "epoch": 0.7734277458709379, + "grad_norm": 0.2513301968574524, + "learning_rate": 7.418619661236157e-06, + "loss": 0.1323, + "step": 43363 + }, + { + "epoch": 0.7734455819926516, + "grad_norm": 0.2575463354587555, + "learning_rate": 7.417513115726474e-06, + "loss": 0.1026, + "step": 43364 + }, + { + "epoch": 0.7734634181143653, + "grad_norm": 0.2520389258861542, + "learning_rate": 7.4164066383725916e-06, + "loss": 0.1079, + "step": 43365 + }, + { + "epoch": 0.7734812542360789, + "grad_norm": 0.23377898335456848, + "learning_rate": 7.41530022917879e-06, + "loss": 0.0817, + "step": 43366 + }, + { + "epoch": 0.7734990903577926, + "grad_norm": 0.29210180044174194, + "learning_rate": 7.414193888149356e-06, + "loss": 0.0974, + "step": 43367 + }, + { + "epoch": 0.7735169264795063, + "grad_norm": 0.2617834806442261, + "learning_rate": 7.413087615288577e-06, + "loss": 0.0791, + "step": 43368 + }, + { + "epoch": 0.77353476260122, + "grad_norm": 0.22317801415920258, + "learning_rate": 7.411981410600749e-06, + "loss": 0.0834, + "step": 43369 + }, + { + "epoch": 0.7735525987229337, + "grad_norm": 0.2863292992115021, + "learning_rate": 7.410875274090157e-06, + "loss": 0.116, + "step": 43370 + }, + { + "epoch": 0.7735704348446474, + "grad_norm": 0.29589495062828064, + "learning_rate": 7.40976920576108e-06, + "loss": 0.1292, + "step": 43371 + }, + { + "epoch": 0.7735882709663611, + "grad_norm": 0.29509004950523376, + "learning_rate": 7.408663205617822e-06, + "loss": 0.2014, + "step": 43372 + }, + { + "epoch": 0.7736061070880748, + "grad_norm": 0.30128195881843567, + "learning_rate": 7.40755727366465e-06, + "loss": 0.1153, + "step": 43373 + }, + { + "epoch": 0.7736239432097884, + "grad_norm": 0.2588391900062561, + "learning_rate": 7.406451409905873e-06, + "loss": 0.1334, + "step": 43374 + }, + { + "epoch": 0.7736417793315021, + "grad_norm": 0.2026851326227188, + "learning_rate": 7.405345614345765e-06, + "loss": 0.0932, + "step": 43375 + }, + { + "epoch": 0.7736596154532158, + "grad_norm": 0.28502556681632996, + "learning_rate": 7.404239886988615e-06, + "loss": 0.1151, + "step": 43376 + }, + { + "epoch": 0.7736774515749295, + "grad_norm": 0.35120463371276855, + "learning_rate": 7.403134227838701e-06, + "loss": 0.1357, + "step": 43377 + }, + { + "epoch": 0.7736952876966432, + "grad_norm": 0.23151548206806183, + "learning_rate": 7.402028636900326e-06, + "loss": 0.1331, + "step": 43378 + }, + { + "epoch": 0.7737131238183569, + "grad_norm": 0.31900206208229065, + "learning_rate": 7.4009231141777655e-06, + "loss": 0.1607, + "step": 43379 + }, + { + "epoch": 0.7737309599400707, + "grad_norm": 0.26978549361228943, + "learning_rate": 7.3998176596753034e-06, + "loss": 0.1199, + "step": 43380 + }, + { + "epoch": 0.7737487960617844, + "grad_norm": 0.21533839404582977, + "learning_rate": 7.398712273397221e-06, + "loss": 0.0686, + "step": 43381 + }, + { + "epoch": 0.7737666321834981, + "grad_norm": 0.28145670890808105, + "learning_rate": 7.397606955347816e-06, + "loss": 0.1397, + "step": 43382 + }, + { + "epoch": 0.7737844683052117, + "grad_norm": 0.19150963425636292, + "learning_rate": 7.396501705531367e-06, + "loss": 0.0761, + "step": 43383 + }, + { + "epoch": 0.7738023044269254, + "grad_norm": 0.3306606113910675, + "learning_rate": 7.395396523952147e-06, + "loss": 0.1876, + "step": 43384 + }, + { + "epoch": 0.7738201405486391, + "grad_norm": 0.2997142970561981, + "learning_rate": 7.394291410614462e-06, + "loss": 0.1064, + "step": 43385 + }, + { + "epoch": 0.7738379766703528, + "grad_norm": 0.24205929040908813, + "learning_rate": 7.3931863655225755e-06, + "loss": 0.1176, + "step": 43386 + }, + { + "epoch": 0.7738558127920665, + "grad_norm": 0.26777875423431396, + "learning_rate": 7.3920813886807855e-06, + "loss": 0.1611, + "step": 43387 + }, + { + "epoch": 0.7738736489137802, + "grad_norm": 0.24796457588672638, + "learning_rate": 7.390976480093373e-06, + "loss": 0.1377, + "step": 43388 + }, + { + "epoch": 0.7738914850354939, + "grad_norm": 0.2466205507516861, + "learning_rate": 7.389871639764614e-06, + "loss": 0.1152, + "step": 43389 + }, + { + "epoch": 0.7739093211572076, + "grad_norm": 0.42343828082084656, + "learning_rate": 7.38876686769879e-06, + "loss": 0.1375, + "step": 43390 + }, + { + "epoch": 0.7739271572789213, + "grad_norm": 0.3893948793411255, + "learning_rate": 7.387662163900194e-06, + "loss": 0.1214, + "step": 43391 + }, + { + "epoch": 0.7739449934006349, + "grad_norm": 0.26313719153404236, + "learning_rate": 7.3865575283731034e-06, + "loss": 0.1185, + "step": 43392 + }, + { + "epoch": 0.7739628295223486, + "grad_norm": 0.4351363778114319, + "learning_rate": 7.3854529611218e-06, + "loss": 0.1413, + "step": 43393 + }, + { + "epoch": 0.7739806656440623, + "grad_norm": 0.2902747690677643, + "learning_rate": 7.384348462150556e-06, + "loss": 0.0958, + "step": 43394 + }, + { + "epoch": 0.773998501765776, + "grad_norm": 0.2046617716550827, + "learning_rate": 7.383244031463671e-06, + "loss": 0.0929, + "step": 43395 + }, + { + "epoch": 0.7740163378874897, + "grad_norm": 0.2876710891723633, + "learning_rate": 7.382139669065416e-06, + "loss": 0.1074, + "step": 43396 + }, + { + "epoch": 0.7740341740092035, + "grad_norm": 0.3248555064201355, + "learning_rate": 7.381035374960071e-06, + "loss": 0.0676, + "step": 43397 + }, + { + "epoch": 0.7740520101309172, + "grad_norm": 0.34729835391044617, + "learning_rate": 7.379931149151912e-06, + "loss": 0.093, + "step": 43398 + }, + { + "epoch": 0.7740698462526309, + "grad_norm": 0.2575656771659851, + "learning_rate": 7.378826991645232e-06, + "loss": 0.1671, + "step": 43399 + }, + { + "epoch": 0.7740876823743446, + "grad_norm": 0.2115790694952011, + "learning_rate": 7.377722902444301e-06, + "loss": 0.08, + "step": 43400 + }, + { + "epoch": 0.7741055184960582, + "grad_norm": 0.31736472249031067, + "learning_rate": 7.376618881553407e-06, + "loss": 0.0987, + "step": 43401 + }, + { + "epoch": 0.7741233546177719, + "grad_norm": 0.29819318652153015, + "learning_rate": 7.375514928976826e-06, + "loss": 0.1358, + "step": 43402 + }, + { + "epoch": 0.7741411907394856, + "grad_norm": 0.2615434229373932, + "learning_rate": 7.374411044718827e-06, + "loss": 0.1298, + "step": 43403 + }, + { + "epoch": 0.7741590268611993, + "grad_norm": 0.22527238726615906, + "learning_rate": 7.373307228783708e-06, + "loss": 0.0868, + "step": 43404 + }, + { + "epoch": 0.774176862982913, + "grad_norm": 0.22288206219673157, + "learning_rate": 7.372203481175738e-06, + "loss": 0.0744, + "step": 43405 + }, + { + "epoch": 0.7741946991046267, + "grad_norm": 0.31209704279899597, + "learning_rate": 7.371099801899195e-06, + "loss": 0.1052, + "step": 43406 + }, + { + "epoch": 0.7742125352263404, + "grad_norm": 0.2829226851463318, + "learning_rate": 7.3699961909583505e-06, + "loss": 0.1287, + "step": 43407 + }, + { + "epoch": 0.7742303713480541, + "grad_norm": 0.2901262938976288, + "learning_rate": 7.368892648357497e-06, + "loss": 0.151, + "step": 43408 + }, + { + "epoch": 0.7742482074697677, + "grad_norm": 0.225845605134964, + "learning_rate": 7.367789174100909e-06, + "loss": 0.1271, + "step": 43409 + }, + { + "epoch": 0.7742660435914814, + "grad_norm": 0.24322721362113953, + "learning_rate": 7.366685768192854e-06, + "loss": 0.1313, + "step": 43410 + }, + { + "epoch": 0.7742838797131951, + "grad_norm": 0.39144471287727356, + "learning_rate": 7.365582430637613e-06, + "loss": 0.1357, + "step": 43411 + }, + { + "epoch": 0.7743017158349088, + "grad_norm": 0.24938805401325226, + "learning_rate": 7.364479161439469e-06, + "loss": 0.1277, + "step": 43412 + }, + { + "epoch": 0.7743195519566225, + "grad_norm": 0.3017863631248474, + "learning_rate": 7.363375960602689e-06, + "loss": 0.1338, + "step": 43413 + }, + { + "epoch": 0.7743373880783363, + "grad_norm": 0.26391059160232544, + "learning_rate": 7.362272828131564e-06, + "loss": 0.1307, + "step": 43414 + }, + { + "epoch": 0.77435522420005, + "grad_norm": 0.2297331690788269, + "learning_rate": 7.361169764030363e-06, + "loss": 0.1112, + "step": 43415 + }, + { + "epoch": 0.7743730603217637, + "grad_norm": 0.2829309403896332, + "learning_rate": 7.360066768303348e-06, + "loss": 0.0924, + "step": 43416 + }, + { + "epoch": 0.7743908964434774, + "grad_norm": 0.3273833394050598, + "learning_rate": 7.3589638409548165e-06, + "loss": 0.1661, + "step": 43417 + }, + { + "epoch": 0.774408732565191, + "grad_norm": 0.2531827390193939, + "learning_rate": 7.357860981989034e-06, + "loss": 0.1243, + "step": 43418 + }, + { + "epoch": 0.7744265686869047, + "grad_norm": 0.16432201862335205, + "learning_rate": 7.356758191410277e-06, + "loss": 0.0374, + "step": 43419 + }, + { + "epoch": 0.7744444048086184, + "grad_norm": 0.20097115635871887, + "learning_rate": 7.355655469222808e-06, + "loss": 0.099, + "step": 43420 + }, + { + "epoch": 0.7744622409303321, + "grad_norm": 0.17405837774276733, + "learning_rate": 7.354552815430923e-06, + "loss": 0.0623, + "step": 43421 + }, + { + "epoch": 0.7744800770520458, + "grad_norm": 0.2694275975227356, + "learning_rate": 7.353450230038886e-06, + "loss": 0.11, + "step": 43422 + }, + { + "epoch": 0.7744979131737595, + "grad_norm": 0.2961132526397705, + "learning_rate": 7.352347713050969e-06, + "loss": 0.144, + "step": 43423 + }, + { + "epoch": 0.7745157492954732, + "grad_norm": 0.2935517430305481, + "learning_rate": 7.351245264471452e-06, + "loss": 0.1252, + "step": 43424 + }, + { + "epoch": 0.7745335854171869, + "grad_norm": 0.26306623220443726, + "learning_rate": 7.3501428843045925e-06, + "loss": 0.1086, + "step": 43425 + }, + { + "epoch": 0.7745514215389006, + "grad_norm": 0.3005884885787964, + "learning_rate": 7.3490405725546826e-06, + "loss": 0.139, + "step": 43426 + }, + { + "epoch": 0.7745692576606142, + "grad_norm": 0.26893794536590576, + "learning_rate": 7.347938329225982e-06, + "loss": 0.1695, + "step": 43427 + }, + { + "epoch": 0.7745870937823279, + "grad_norm": 0.22363987565040588, + "learning_rate": 7.346836154322776e-06, + "loss": 0.0891, + "step": 43428 + }, + { + "epoch": 0.7746049299040416, + "grad_norm": 0.25274941325187683, + "learning_rate": 7.34573404784932e-06, + "loss": 0.1053, + "step": 43429 + }, + { + "epoch": 0.7746227660257553, + "grad_norm": 0.2577058970928192, + "learning_rate": 7.344632009809907e-06, + "loss": 0.126, + "step": 43430 + }, + { + "epoch": 0.7746406021474691, + "grad_norm": 0.24778054654598236, + "learning_rate": 7.3435300402087985e-06, + "loss": 0.1266, + "step": 43431 + }, + { + "epoch": 0.7746584382691828, + "grad_norm": 0.3263584077358246, + "learning_rate": 7.342428139050267e-06, + "loss": 0.1346, + "step": 43432 + }, + { + "epoch": 0.7746762743908965, + "grad_norm": 0.2668343186378479, + "learning_rate": 7.3413263063385725e-06, + "loss": 0.1002, + "step": 43433 + }, + { + "epoch": 0.7746941105126102, + "grad_norm": 0.23315773904323578, + "learning_rate": 7.340224542078006e-06, + "loss": 0.092, + "step": 43434 + }, + { + "epoch": 0.7747119466343239, + "grad_norm": 0.2598983943462372, + "learning_rate": 7.339122846272828e-06, + "loss": 0.1412, + "step": 43435 + }, + { + "epoch": 0.7747297827560375, + "grad_norm": 0.22261232137680054, + "learning_rate": 7.3380212189273075e-06, + "loss": 0.1064, + "step": 43436 + }, + { + "epoch": 0.7747476188777512, + "grad_norm": 0.27725648880004883, + "learning_rate": 7.336919660045718e-06, + "loss": 0.1347, + "step": 43437 + }, + { + "epoch": 0.7747654549994649, + "grad_norm": 0.27133169770240784, + "learning_rate": 7.335818169632322e-06, + "loss": 0.0989, + "step": 43438 + }, + { + "epoch": 0.7747832911211786, + "grad_norm": 0.41076937317848206, + "learning_rate": 7.334716747691406e-06, + "loss": 0.1866, + "step": 43439 + }, + { + "epoch": 0.7748011272428923, + "grad_norm": 0.27317196130752563, + "learning_rate": 7.333615394227217e-06, + "loss": 0.1129, + "step": 43440 + }, + { + "epoch": 0.774818963364606, + "grad_norm": 0.2356339544057846, + "learning_rate": 7.332514109244046e-06, + "loss": 0.0872, + "step": 43441 + }, + { + "epoch": 0.7748367994863197, + "grad_norm": 0.27787521481513977, + "learning_rate": 7.331412892746145e-06, + "loss": 0.123, + "step": 43442 + }, + { + "epoch": 0.7748546356080334, + "grad_norm": 0.5402619242668152, + "learning_rate": 7.330311744737797e-06, + "loss": 0.1533, + "step": 43443 + }, + { + "epoch": 0.774872471729747, + "grad_norm": 0.23586773872375488, + "learning_rate": 7.329210665223265e-06, + "loss": 0.0996, + "step": 43444 + }, + { + "epoch": 0.7748903078514607, + "grad_norm": 0.47996529936790466, + "learning_rate": 7.3281096542068165e-06, + "loss": 0.1813, + "step": 43445 + }, + { + "epoch": 0.7749081439731744, + "grad_norm": 0.24311622977256775, + "learning_rate": 7.32700871169271e-06, + "loss": 0.1133, + "step": 43446 + }, + { + "epoch": 0.7749259800948882, + "grad_norm": 0.25032761693000793, + "learning_rate": 7.3259078376852285e-06, + "loss": 0.1792, + "step": 43447 + }, + { + "epoch": 0.7749438162166019, + "grad_norm": 0.3706215023994446, + "learning_rate": 7.324807032188632e-06, + "loss": 0.1017, + "step": 43448 + }, + { + "epoch": 0.7749616523383156, + "grad_norm": 0.23765237629413605, + "learning_rate": 7.323706295207189e-06, + "loss": 0.063, + "step": 43449 + }, + { + "epoch": 0.7749794884600293, + "grad_norm": 0.22569239139556885, + "learning_rate": 7.322605626745166e-06, + "loss": 0.1674, + "step": 43450 + }, + { + "epoch": 0.774997324581743, + "grad_norm": 0.22864854335784912, + "learning_rate": 7.321505026806821e-06, + "loss": 0.0812, + "step": 43451 + }, + { + "epoch": 0.7750151607034567, + "grad_norm": 0.25411897897720337, + "learning_rate": 7.320404495396438e-06, + "loss": 0.1205, + "step": 43452 + }, + { + "epoch": 0.7750329968251704, + "grad_norm": 0.2267070859670639, + "learning_rate": 7.31930403251827e-06, + "loss": 0.111, + "step": 43453 + }, + { + "epoch": 0.775050832946884, + "grad_norm": 0.34046271443367004, + "learning_rate": 7.318203638176582e-06, + "loss": 0.0912, + "step": 43454 + }, + { + "epoch": 0.7750686690685977, + "grad_norm": 0.2848954200744629, + "learning_rate": 7.317103312375642e-06, + "loss": 0.0814, + "step": 43455 + }, + { + "epoch": 0.7750865051903114, + "grad_norm": 0.23784367740154266, + "learning_rate": 7.316003055119724e-06, + "loss": 0.1097, + "step": 43456 + }, + { + "epoch": 0.7751043413120251, + "grad_norm": 0.2915710508823395, + "learning_rate": 7.3149028664130896e-06, + "loss": 0.1267, + "step": 43457 + }, + { + "epoch": 0.7751221774337388, + "grad_norm": 0.21106117963790894, + "learning_rate": 7.313802746259996e-06, + "loss": 0.1077, + "step": 43458 + }, + { + "epoch": 0.7751400135554525, + "grad_norm": 0.26949021220207214, + "learning_rate": 7.312702694664705e-06, + "loss": 0.1171, + "step": 43459 + }, + { + "epoch": 0.7751578496771662, + "grad_norm": 0.3910709023475647, + "learning_rate": 7.311602711631496e-06, + "loss": 0.1245, + "step": 43460 + }, + { + "epoch": 0.7751756857988799, + "grad_norm": 0.294808030128479, + "learning_rate": 7.3105027971646255e-06, + "loss": 0.0408, + "step": 43461 + }, + { + "epoch": 0.7751935219205935, + "grad_norm": 0.25726941227912903, + "learning_rate": 7.3094029512683545e-06, + "loss": 0.1387, + "step": 43462 + }, + { + "epoch": 0.7752113580423072, + "grad_norm": 0.31136077642440796, + "learning_rate": 7.308303173946945e-06, + "loss": 0.0917, + "step": 43463 + }, + { + "epoch": 0.775229194164021, + "grad_norm": 0.2807251214981079, + "learning_rate": 7.3072034652046595e-06, + "loss": 0.1309, + "step": 43464 + }, + { + "epoch": 0.7752470302857347, + "grad_norm": 0.20629748702049255, + "learning_rate": 7.306103825045771e-06, + "loss": 0.1002, + "step": 43465 + }, + { + "epoch": 0.7752648664074484, + "grad_norm": 0.33572325110435486, + "learning_rate": 7.3050042534745346e-06, + "loss": 0.1302, + "step": 43466 + }, + { + "epoch": 0.7752827025291621, + "grad_norm": 0.2864384949207306, + "learning_rate": 7.303904750495205e-06, + "loss": 0.1419, + "step": 43467 + }, + { + "epoch": 0.7753005386508758, + "grad_norm": 0.2772945761680603, + "learning_rate": 7.30280531611206e-06, + "loss": 0.1091, + "step": 43468 + }, + { + "epoch": 0.7753183747725895, + "grad_norm": 0.2192145586013794, + "learning_rate": 7.301705950329346e-06, + "loss": 0.0755, + "step": 43469 + }, + { + "epoch": 0.7753362108943032, + "grad_norm": 0.3089390993118286, + "learning_rate": 7.300606653151343e-06, + "loss": 0.162, + "step": 43470 + }, + { + "epoch": 0.7753540470160168, + "grad_norm": 0.24713397026062012, + "learning_rate": 7.2995074245822996e-06, + "loss": 0.1024, + "step": 43471 + }, + { + "epoch": 0.7753718831377305, + "grad_norm": 0.21032772958278656, + "learning_rate": 7.298408264626472e-06, + "loss": 0.0981, + "step": 43472 + }, + { + "epoch": 0.7753897192594442, + "grad_norm": 0.2417958527803421, + "learning_rate": 7.297309173288136e-06, + "loss": 0.1226, + "step": 43473 + }, + { + "epoch": 0.7754075553811579, + "grad_norm": 0.295732319355011, + "learning_rate": 7.296210150571542e-06, + "loss": 0.1179, + "step": 43474 + }, + { + "epoch": 0.7754253915028716, + "grad_norm": 0.42218971252441406, + "learning_rate": 7.295111196480956e-06, + "loss": 0.1888, + "step": 43475 + }, + { + "epoch": 0.7754432276245853, + "grad_norm": 0.2556533217430115, + "learning_rate": 7.2940123110206314e-06, + "loss": 0.1337, + "step": 43476 + }, + { + "epoch": 0.775461063746299, + "grad_norm": 0.31848812103271484, + "learning_rate": 7.292913494194823e-06, + "loss": 0.1362, + "step": 43477 + }, + { + "epoch": 0.7754788998680127, + "grad_norm": 0.29907524585723877, + "learning_rate": 7.2918147460078055e-06, + "loss": 0.1243, + "step": 43478 + }, + { + "epoch": 0.7754967359897263, + "grad_norm": 0.28146547079086304, + "learning_rate": 7.290716066463829e-06, + "loss": 0.1411, + "step": 43479 + }, + { + "epoch": 0.77551457211144, + "grad_norm": 0.2943083941936493, + "learning_rate": 7.2896174555671546e-06, + "loss": 0.1412, + "step": 43480 + }, + { + "epoch": 0.7755324082331538, + "grad_norm": 0.23451046645641327, + "learning_rate": 7.288518913322034e-06, + "loss": 0.0787, + "step": 43481 + }, + { + "epoch": 0.7755502443548675, + "grad_norm": 0.18192817270755768, + "learning_rate": 7.287420439732728e-06, + "loss": 0.0767, + "step": 43482 + }, + { + "epoch": 0.7755680804765812, + "grad_norm": 0.23433317244052887, + "learning_rate": 7.286322034803508e-06, + "loss": 0.1052, + "step": 43483 + }, + { + "epoch": 0.7755859165982949, + "grad_norm": 0.21425002813339233, + "learning_rate": 7.2852236985386235e-06, + "loss": 0.1225, + "step": 43484 + }, + { + "epoch": 0.7756037527200086, + "grad_norm": 0.43848171830177307, + "learning_rate": 7.28412543094232e-06, + "loss": 0.1409, + "step": 43485 + }, + { + "epoch": 0.7756215888417223, + "grad_norm": 0.32085123658180237, + "learning_rate": 7.2830272320188745e-06, + "loss": 0.138, + "step": 43486 + }, + { + "epoch": 0.775639424963436, + "grad_norm": 0.2725811302661896, + "learning_rate": 7.281929101772533e-06, + "loss": 0.1707, + "step": 43487 + }, + { + "epoch": 0.7756572610851497, + "grad_norm": 0.24344605207443237, + "learning_rate": 7.280831040207556e-06, + "loss": 0.1082, + "step": 43488 + }, + { + "epoch": 0.7756750972068633, + "grad_norm": 0.2600553035736084, + "learning_rate": 7.2797330473281965e-06, + "loss": 0.1186, + "step": 43489 + }, + { + "epoch": 0.775692933328577, + "grad_norm": 0.22441856563091278, + "learning_rate": 7.278635123138705e-06, + "loss": 0.1291, + "step": 43490 + }, + { + "epoch": 0.7757107694502907, + "grad_norm": 0.5371310710906982, + "learning_rate": 7.27753726764335e-06, + "loss": 0.1228, + "step": 43491 + }, + { + "epoch": 0.7757286055720044, + "grad_norm": 0.2149852216243744, + "learning_rate": 7.276439480846384e-06, + "loss": 0.0813, + "step": 43492 + }, + { + "epoch": 0.7757464416937181, + "grad_norm": 0.23638857901096344, + "learning_rate": 7.275341762752061e-06, + "loss": 0.1088, + "step": 43493 + }, + { + "epoch": 0.7757642778154318, + "grad_norm": 0.27985575795173645, + "learning_rate": 7.274244113364626e-06, + "loss": 0.0832, + "step": 43494 + }, + { + "epoch": 0.7757821139371455, + "grad_norm": 0.21184313297271729, + "learning_rate": 7.273146532688352e-06, + "loss": 0.0667, + "step": 43495 + }, + { + "epoch": 0.7757999500588592, + "grad_norm": 0.20760251581668854, + "learning_rate": 7.2720490207274755e-06, + "loss": 0.0763, + "step": 43496 + }, + { + "epoch": 0.7758177861805728, + "grad_norm": 0.2926557958126068, + "learning_rate": 7.2709515774862676e-06, + "loss": 0.1126, + "step": 43497 + }, + { + "epoch": 0.7758356223022866, + "grad_norm": 0.2546495795249939, + "learning_rate": 7.269854202968968e-06, + "loss": 0.1173, + "step": 43498 + }, + { + "epoch": 0.7758534584240003, + "grad_norm": 0.3374875485897064, + "learning_rate": 7.268756897179846e-06, + "loss": 0.1126, + "step": 43499 + }, + { + "epoch": 0.775871294545714, + "grad_norm": 0.2784263491630554, + "learning_rate": 7.267659660123144e-06, + "loss": 0.1551, + "step": 43500 + }, + { + "epoch": 0.7758891306674277, + "grad_norm": 0.2909505069255829, + "learning_rate": 7.26656249180312e-06, + "loss": 0.1648, + "step": 43501 + }, + { + "epoch": 0.7759069667891414, + "grad_norm": 0.21909500658512115, + "learning_rate": 7.2654653922240215e-06, + "loss": 0.1143, + "step": 43502 + }, + { + "epoch": 0.7759248029108551, + "grad_norm": 0.21381916105747223, + "learning_rate": 7.2643683613901e-06, + "loss": 0.1083, + "step": 43503 + }, + { + "epoch": 0.7759426390325688, + "grad_norm": 0.20347769558429718, + "learning_rate": 7.2632713993056185e-06, + "loss": 0.1243, + "step": 43504 + }, + { + "epoch": 0.7759604751542825, + "grad_norm": 0.2153291255235672, + "learning_rate": 7.2621745059748255e-06, + "loss": 0.1024, + "step": 43505 + }, + { + "epoch": 0.7759783112759961, + "grad_norm": 0.3248733878135681, + "learning_rate": 7.261077681401968e-06, + "loss": 0.0924, + "step": 43506 + }, + { + "epoch": 0.7759961473977098, + "grad_norm": 0.32055947184562683, + "learning_rate": 7.259980925591292e-06, + "loss": 0.0903, + "step": 43507 + }, + { + "epoch": 0.7760139835194235, + "grad_norm": 0.27737247943878174, + "learning_rate": 7.258884238547067e-06, + "loss": 0.1405, + "step": 43508 + }, + { + "epoch": 0.7760318196411372, + "grad_norm": 0.27074140310287476, + "learning_rate": 7.257787620273532e-06, + "loss": 0.0833, + "step": 43509 + }, + { + "epoch": 0.7760496557628509, + "grad_norm": 0.2653268873691559, + "learning_rate": 7.256691070774935e-06, + "loss": 0.1186, + "step": 43510 + }, + { + "epoch": 0.7760674918845646, + "grad_norm": 0.2698708772659302, + "learning_rate": 7.255594590055531e-06, + "loss": 0.1356, + "step": 43511 + }, + { + "epoch": 0.7760853280062783, + "grad_norm": 0.3289799690246582, + "learning_rate": 7.254498178119579e-06, + "loss": 0.1007, + "step": 43512 + }, + { + "epoch": 0.776103164127992, + "grad_norm": 0.23925469815731049, + "learning_rate": 7.2534018349713185e-06, + "loss": 0.116, + "step": 43513 + }, + { + "epoch": 0.7761210002497057, + "grad_norm": 0.1912168264389038, + "learning_rate": 7.252305560615005e-06, + "loss": 0.0858, + "step": 43514 + }, + { + "epoch": 0.7761388363714194, + "grad_norm": 0.22202201187610626, + "learning_rate": 7.251209355054883e-06, + "loss": 0.0965, + "step": 43515 + }, + { + "epoch": 0.7761566724931331, + "grad_norm": 0.23961244523525238, + "learning_rate": 7.250113218295198e-06, + "loss": 0.1022, + "step": 43516 + }, + { + "epoch": 0.7761745086148468, + "grad_norm": 0.28463515639305115, + "learning_rate": 7.249017150340212e-06, + "loss": 0.1027, + "step": 43517 + }, + { + "epoch": 0.7761923447365605, + "grad_norm": 0.2957436740398407, + "learning_rate": 7.247921151194164e-06, + "loss": 0.1765, + "step": 43518 + }, + { + "epoch": 0.7762101808582742, + "grad_norm": 0.32369574904441833, + "learning_rate": 7.246825220861308e-06, + "loss": 0.1213, + "step": 43519 + }, + { + "epoch": 0.7762280169799879, + "grad_norm": 0.30015018582344055, + "learning_rate": 7.245729359345882e-06, + "loss": 0.1496, + "step": 43520 + }, + { + "epoch": 0.7762458531017016, + "grad_norm": 0.200738787651062, + "learning_rate": 7.244633566652145e-06, + "loss": 0.0911, + "step": 43521 + }, + { + "epoch": 0.7762636892234153, + "grad_norm": 0.27351483702659607, + "learning_rate": 7.243537842784345e-06, + "loss": 0.0957, + "step": 43522 + }, + { + "epoch": 0.776281525345129, + "grad_norm": 0.24771127104759216, + "learning_rate": 7.242442187746715e-06, + "loss": 0.102, + "step": 43523 + }, + { + "epoch": 0.7762993614668426, + "grad_norm": 0.24154995381832123, + "learning_rate": 7.241346601543519e-06, + "loss": 0.0901, + "step": 43524 + }, + { + "epoch": 0.7763171975885563, + "grad_norm": 0.345503568649292, + "learning_rate": 7.240251084178992e-06, + "loss": 0.1372, + "step": 43525 + }, + { + "epoch": 0.77633503371027, + "grad_norm": 0.2833728790283203, + "learning_rate": 7.2391556356573935e-06, + "loss": 0.0944, + "step": 43526 + }, + { + "epoch": 0.7763528698319837, + "grad_norm": 0.23817670345306396, + "learning_rate": 7.238060255982962e-06, + "loss": 0.0596, + "step": 43527 + }, + { + "epoch": 0.7763707059536974, + "grad_norm": 0.35691890120506287, + "learning_rate": 7.236964945159943e-06, + "loss": 0.1073, + "step": 43528 + }, + { + "epoch": 0.7763885420754111, + "grad_norm": 0.2345840036869049, + "learning_rate": 7.235869703192574e-06, + "loss": 0.1353, + "step": 43529 + }, + { + "epoch": 0.7764063781971248, + "grad_norm": 0.2550492286682129, + "learning_rate": 7.23477453008512e-06, + "loss": 0.1097, + "step": 43530 + }, + { + "epoch": 0.7764242143188385, + "grad_norm": 0.3019869923591614, + "learning_rate": 7.233679425841814e-06, + "loss": 0.0804, + "step": 43531 + }, + { + "epoch": 0.7764420504405523, + "grad_norm": 0.284801721572876, + "learning_rate": 7.2325843904669035e-06, + "loss": 0.0745, + "step": 43532 + }, + { + "epoch": 0.7764598865622659, + "grad_norm": 0.26073747873306274, + "learning_rate": 7.231489423964624e-06, + "loss": 0.1362, + "step": 43533 + }, + { + "epoch": 0.7764777226839796, + "grad_norm": 0.25383538007736206, + "learning_rate": 7.230394526339238e-06, + "loss": 0.0673, + "step": 43534 + }, + { + "epoch": 0.7764955588056933, + "grad_norm": 0.27917978167533875, + "learning_rate": 7.229299697594979e-06, + "loss": 0.0887, + "step": 43535 + }, + { + "epoch": 0.776513394927407, + "grad_norm": 0.29004523158073425, + "learning_rate": 7.228204937736094e-06, + "loss": 0.1236, + "step": 43536 + }, + { + "epoch": 0.7765312310491207, + "grad_norm": 0.30774280428886414, + "learning_rate": 7.227110246766814e-06, + "loss": 0.1324, + "step": 43537 + }, + { + "epoch": 0.7765490671708344, + "grad_norm": 0.39555907249450684, + "learning_rate": 7.226015624691396e-06, + "loss": 0.1055, + "step": 43538 + }, + { + "epoch": 0.7765669032925481, + "grad_norm": 0.313202440738678, + "learning_rate": 7.2249210715140846e-06, + "loss": 0.0769, + "step": 43539 + }, + { + "epoch": 0.7765847394142618, + "grad_norm": 0.2521459758281708, + "learning_rate": 7.223826587239122e-06, + "loss": 0.0785, + "step": 43540 + }, + { + "epoch": 0.7766025755359754, + "grad_norm": 0.318236380815506, + "learning_rate": 7.222732171870747e-06, + "loss": 0.1222, + "step": 43541 + }, + { + "epoch": 0.7766204116576891, + "grad_norm": 0.32848671078681946, + "learning_rate": 7.221637825413191e-06, + "loss": 0.1277, + "step": 43542 + }, + { + "epoch": 0.7766382477794028, + "grad_norm": 0.3019261360168457, + "learning_rate": 7.220543547870717e-06, + "loss": 0.1862, + "step": 43543 + }, + { + "epoch": 0.7766560839011165, + "grad_norm": 0.24744179844856262, + "learning_rate": 7.219449339247558e-06, + "loss": 0.0613, + "step": 43544 + }, + { + "epoch": 0.7766739200228302, + "grad_norm": 0.35991716384887695, + "learning_rate": 7.218355199547955e-06, + "loss": 0.1728, + "step": 43545 + }, + { + "epoch": 0.7766917561445439, + "grad_norm": 0.23743069171905518, + "learning_rate": 7.21726112877614e-06, + "loss": 0.0585, + "step": 43546 + }, + { + "epoch": 0.7767095922662576, + "grad_norm": 0.24608206748962402, + "learning_rate": 7.216167126936371e-06, + "loss": 0.0763, + "step": 43547 + }, + { + "epoch": 0.7767274283879714, + "grad_norm": 0.5099205374717712, + "learning_rate": 7.215073194032879e-06, + "loss": 0.1104, + "step": 43548 + }, + { + "epoch": 0.7767452645096851, + "grad_norm": 0.18419383466243744, + "learning_rate": 7.213979330069909e-06, + "loss": 0.0689, + "step": 43549 + }, + { + "epoch": 0.7767631006313988, + "grad_norm": 0.2664598226547241, + "learning_rate": 7.212885535051689e-06, + "loss": 0.1694, + "step": 43550 + }, + { + "epoch": 0.7767809367531124, + "grad_norm": 0.2201850712299347, + "learning_rate": 7.211791808982474e-06, + "loss": 0.0898, + "step": 43551 + }, + { + "epoch": 0.7767987728748261, + "grad_norm": 0.2813500761985779, + "learning_rate": 7.210698151866491e-06, + "loss": 0.121, + "step": 43552 + }, + { + "epoch": 0.7768166089965398, + "grad_norm": 0.30298855900764465, + "learning_rate": 7.209604563707994e-06, + "loss": 0.1482, + "step": 43553 + }, + { + "epoch": 0.7768344451182535, + "grad_norm": 0.42618370056152344, + "learning_rate": 7.208511044511215e-06, + "loss": 0.1476, + "step": 43554 + }, + { + "epoch": 0.7768522812399672, + "grad_norm": 0.3566909432411194, + "learning_rate": 7.207417594280386e-06, + "loss": 0.1175, + "step": 43555 + }, + { + "epoch": 0.7768701173616809, + "grad_norm": 0.2191656231880188, + "learning_rate": 7.2063242130197575e-06, + "loss": 0.1219, + "step": 43556 + }, + { + "epoch": 0.7768879534833946, + "grad_norm": 0.33290231227874756, + "learning_rate": 7.205230900733562e-06, + "loss": 0.1655, + "step": 43557 + }, + { + "epoch": 0.7769057896051083, + "grad_norm": 0.24335452914237976, + "learning_rate": 7.204137657426038e-06, + "loss": 0.1282, + "step": 43558 + }, + { + "epoch": 0.7769236257268219, + "grad_norm": 0.27061861753463745, + "learning_rate": 7.203044483101415e-06, + "loss": 0.0953, + "step": 43559 + }, + { + "epoch": 0.7769414618485356, + "grad_norm": 0.2712450325489044, + "learning_rate": 7.201951377763947e-06, + "loss": 0.1299, + "step": 43560 + }, + { + "epoch": 0.7769592979702493, + "grad_norm": 0.23656640946865082, + "learning_rate": 7.200858341417863e-06, + "loss": 0.1164, + "step": 43561 + }, + { + "epoch": 0.776977134091963, + "grad_norm": 0.2635999321937561, + "learning_rate": 7.199765374067397e-06, + "loss": 0.1286, + "step": 43562 + }, + { + "epoch": 0.7769949702136767, + "grad_norm": 0.22753091156482697, + "learning_rate": 7.198672475716783e-06, + "loss": 0.1139, + "step": 43563 + }, + { + "epoch": 0.7770128063353904, + "grad_norm": 0.18766862154006958, + "learning_rate": 7.19757964637027e-06, + "loss": 0.0963, + "step": 43564 + }, + { + "epoch": 0.7770306424571042, + "grad_norm": 0.500789999961853, + "learning_rate": 7.19648688603208e-06, + "loss": 0.1844, + "step": 43565 + }, + { + "epoch": 0.7770484785788179, + "grad_norm": 0.23887291550636292, + "learning_rate": 7.1953941947064616e-06, + "loss": 0.1144, + "step": 43566 + }, + { + "epoch": 0.7770663147005316, + "grad_norm": 0.4070923626422882, + "learning_rate": 7.1943015723976465e-06, + "loss": 0.1489, + "step": 43567 + }, + { + "epoch": 0.7770841508222452, + "grad_norm": 0.30288925766944885, + "learning_rate": 7.193209019109862e-06, + "loss": 0.1275, + "step": 43568 + }, + { + "epoch": 0.7771019869439589, + "grad_norm": 0.2119552493095398, + "learning_rate": 7.192116534847354e-06, + "loss": 0.0704, + "step": 43569 + }, + { + "epoch": 0.7771198230656726, + "grad_norm": 0.253764808177948, + "learning_rate": 7.191024119614357e-06, + "loss": 0.1026, + "step": 43570 + }, + { + "epoch": 0.7771376591873863, + "grad_norm": 0.3002331554889679, + "learning_rate": 7.189931773415098e-06, + "loss": 0.1695, + "step": 43571 + }, + { + "epoch": 0.7771554953091, + "grad_norm": 0.3269325792789459, + "learning_rate": 7.188839496253807e-06, + "loss": 0.1196, + "step": 43572 + }, + { + "epoch": 0.7771733314308137, + "grad_norm": 0.3425743579864502, + "learning_rate": 7.187747288134736e-06, + "loss": 0.1387, + "step": 43573 + }, + { + "epoch": 0.7771911675525274, + "grad_norm": 0.24573343992233276, + "learning_rate": 7.186655149062108e-06, + "loss": 0.1496, + "step": 43574 + }, + { + "epoch": 0.7772090036742411, + "grad_norm": 0.25576549768447876, + "learning_rate": 7.1855630790401585e-06, + "loss": 0.1437, + "step": 43575 + }, + { + "epoch": 0.7772268397959547, + "grad_norm": 0.3472616970539093, + "learning_rate": 7.184471078073107e-06, + "loss": 0.1523, + "step": 43576 + }, + { + "epoch": 0.7772446759176684, + "grad_norm": 0.3288723826408386, + "learning_rate": 7.183379146165211e-06, + "loss": 0.1474, + "step": 43577 + }, + { + "epoch": 0.7772625120393821, + "grad_norm": 0.19933801889419556, + "learning_rate": 7.1822872833206905e-06, + "loss": 0.0717, + "step": 43578 + }, + { + "epoch": 0.7772803481610958, + "grad_norm": 0.24148492515087128, + "learning_rate": 7.181195489543768e-06, + "loss": 0.0922, + "step": 43579 + }, + { + "epoch": 0.7772981842828095, + "grad_norm": 0.3136390745639801, + "learning_rate": 7.180103764838697e-06, + "loss": 0.1601, + "step": 43580 + }, + { + "epoch": 0.7773160204045232, + "grad_norm": 0.27717170119285583, + "learning_rate": 7.17901210920969e-06, + "loss": 0.0722, + "step": 43581 + }, + { + "epoch": 0.777333856526237, + "grad_norm": 0.21808890998363495, + "learning_rate": 7.177920522660994e-06, + "loss": 0.0931, + "step": 43582 + }, + { + "epoch": 0.7773516926479507, + "grad_norm": 0.3988843262195587, + "learning_rate": 7.176829005196837e-06, + "loss": 0.1051, + "step": 43583 + }, + { + "epoch": 0.7773695287696644, + "grad_norm": 0.2618977129459381, + "learning_rate": 7.175737556821443e-06, + "loss": 0.1265, + "step": 43584 + }, + { + "epoch": 0.777387364891378, + "grad_norm": 0.23547561466693878, + "learning_rate": 7.174646177539041e-06, + "loss": 0.108, + "step": 43585 + }, + { + "epoch": 0.7774052010130917, + "grad_norm": 0.25493043661117554, + "learning_rate": 7.1735548673538735e-06, + "loss": 0.1058, + "step": 43586 + }, + { + "epoch": 0.7774230371348054, + "grad_norm": 0.3202263414859772, + "learning_rate": 7.172463626270165e-06, + "loss": 0.128, + "step": 43587 + }, + { + "epoch": 0.7774408732565191, + "grad_norm": 0.3321714997291565, + "learning_rate": 7.171372454292144e-06, + "loss": 0.1146, + "step": 43588 + }, + { + "epoch": 0.7774587093782328, + "grad_norm": 0.19986885786056519, + "learning_rate": 7.170281351424033e-06, + "loss": 0.1292, + "step": 43589 + }, + { + "epoch": 0.7774765454999465, + "grad_norm": 0.3647879660129547, + "learning_rate": 7.1691903176700795e-06, + "loss": 0.1721, + "step": 43590 + }, + { + "epoch": 0.7774943816216602, + "grad_norm": 0.22513028979301453, + "learning_rate": 7.168099353034502e-06, + "loss": 0.1125, + "step": 43591 + }, + { + "epoch": 0.7775122177433739, + "grad_norm": 0.23531781136989594, + "learning_rate": 7.1670084575215205e-06, + "loss": 0.0932, + "step": 43592 + }, + { + "epoch": 0.7775300538650876, + "grad_norm": 0.2683334946632385, + "learning_rate": 7.165917631135385e-06, + "loss": 0.1084, + "step": 43593 + }, + { + "epoch": 0.7775478899868012, + "grad_norm": 0.21299968659877777, + "learning_rate": 7.164826873880301e-06, + "loss": 0.1477, + "step": 43594 + }, + { + "epoch": 0.7775657261085149, + "grad_norm": 0.20810571312904358, + "learning_rate": 7.1637361857605195e-06, + "loss": 0.1326, + "step": 43595 + }, + { + "epoch": 0.7775835622302286, + "grad_norm": 0.19711872935295105, + "learning_rate": 7.162645566780255e-06, + "loss": 0.0863, + "step": 43596 + }, + { + "epoch": 0.7776013983519423, + "grad_norm": 0.24326525628566742, + "learning_rate": 7.161555016943738e-06, + "loss": 0.1018, + "step": 43597 + }, + { + "epoch": 0.777619234473656, + "grad_norm": 0.2896716594696045, + "learning_rate": 7.160464536255185e-06, + "loss": 0.1501, + "step": 43598 + }, + { + "epoch": 0.7776370705953698, + "grad_norm": 0.2527450919151306, + "learning_rate": 7.159374124718843e-06, + "loss": 0.0946, + "step": 43599 + }, + { + "epoch": 0.7776549067170835, + "grad_norm": 0.28956490755081177, + "learning_rate": 7.158283782338929e-06, + "loss": 0.0904, + "step": 43600 + }, + { + "epoch": 0.7776727428387972, + "grad_norm": 0.18599063158035278, + "learning_rate": 7.157193509119667e-06, + "loss": 0.0917, + "step": 43601 + }, + { + "epoch": 0.7776905789605109, + "grad_norm": 0.19335591793060303, + "learning_rate": 7.156103305065281e-06, + "loss": 0.0776, + "step": 43602 + }, + { + "epoch": 0.7777084150822245, + "grad_norm": 0.2681027054786682, + "learning_rate": 7.155013170180008e-06, + "loss": 0.155, + "step": 43603 + }, + { + "epoch": 0.7777262512039382, + "grad_norm": 0.30874451994895935, + "learning_rate": 7.153923104468066e-06, + "loss": 0.1095, + "step": 43604 + }, + { + "epoch": 0.7777440873256519, + "grad_norm": 0.45701950788497925, + "learning_rate": 7.152833107933685e-06, + "loss": 0.125, + "step": 43605 + }, + { + "epoch": 0.7777619234473656, + "grad_norm": 0.24174362421035767, + "learning_rate": 7.151743180581077e-06, + "loss": 0.1212, + "step": 43606 + }, + { + "epoch": 0.7777797595690793, + "grad_norm": 0.26570090651512146, + "learning_rate": 7.150653322414485e-06, + "loss": 0.1414, + "step": 43607 + }, + { + "epoch": 0.777797595690793, + "grad_norm": 0.33156126737594604, + "learning_rate": 7.149563533438117e-06, + "loss": 0.1548, + "step": 43608 + }, + { + "epoch": 0.7778154318125067, + "grad_norm": 0.2927214205265045, + "learning_rate": 7.148473813656218e-06, + "loss": 0.1135, + "step": 43609 + }, + { + "epoch": 0.7778332679342204, + "grad_norm": 0.27944502234458923, + "learning_rate": 7.147384163072995e-06, + "loss": 0.0862, + "step": 43610 + }, + { + "epoch": 0.777851104055934, + "grad_norm": 0.2414645105600357, + "learning_rate": 7.1462945816926724e-06, + "loss": 0.1081, + "step": 43611 + }, + { + "epoch": 0.7778689401776477, + "grad_norm": 0.32957446575164795, + "learning_rate": 7.145205069519484e-06, + "loss": 0.131, + "step": 43612 + }, + { + "epoch": 0.7778867762993614, + "grad_norm": 0.22420360147953033, + "learning_rate": 7.144115626557651e-06, + "loss": 0.1191, + "step": 43613 + }, + { + "epoch": 0.7779046124210751, + "grad_norm": 0.1976509541273117, + "learning_rate": 7.1430262528113915e-06, + "loss": 0.0555, + "step": 43614 + }, + { + "epoch": 0.7779224485427888, + "grad_norm": 0.3038027882575989, + "learning_rate": 7.1419369482849215e-06, + "loss": 0.1085, + "step": 43615 + }, + { + "epoch": 0.7779402846645026, + "grad_norm": 0.22419126331806183, + "learning_rate": 7.140847712982479e-06, + "loss": 0.1631, + "step": 43616 + }, + { + "epoch": 0.7779581207862163, + "grad_norm": 0.42043906450271606, + "learning_rate": 7.139758546908279e-06, + "loss": 0.0922, + "step": 43617 + }, + { + "epoch": 0.77797595690793, + "grad_norm": 0.27288374304771423, + "learning_rate": 7.138669450066546e-06, + "loss": 0.137, + "step": 43618 + }, + { + "epoch": 0.7779937930296437, + "grad_norm": 0.3137914538383484, + "learning_rate": 7.13758042246149e-06, + "loss": 0.085, + "step": 43619 + }, + { + "epoch": 0.7780116291513574, + "grad_norm": 0.2974531948566437, + "learning_rate": 7.13649146409735e-06, + "loss": 0.0788, + "step": 43620 + }, + { + "epoch": 0.778029465273071, + "grad_norm": 0.22917471826076508, + "learning_rate": 7.135402574978331e-06, + "loss": 0.0927, + "step": 43621 + }, + { + "epoch": 0.7780473013947847, + "grad_norm": 0.22131459414958954, + "learning_rate": 7.134313755108668e-06, + "loss": 0.0916, + "step": 43622 + }, + { + "epoch": 0.7780651375164984, + "grad_norm": 0.28505557775497437, + "learning_rate": 7.133225004492578e-06, + "loss": 0.1338, + "step": 43623 + }, + { + "epoch": 0.7780829736382121, + "grad_norm": 0.36708155274391174, + "learning_rate": 7.1321363231342696e-06, + "loss": 0.1873, + "step": 43624 + }, + { + "epoch": 0.7781008097599258, + "grad_norm": 0.19532674551010132, + "learning_rate": 7.13104771103798e-06, + "loss": 0.109, + "step": 43625 + }, + { + "epoch": 0.7781186458816395, + "grad_norm": 0.30337557196617126, + "learning_rate": 7.1299591682079225e-06, + "loss": 0.1589, + "step": 43626 + }, + { + "epoch": 0.7781364820033532, + "grad_norm": 0.2736426293849945, + "learning_rate": 7.1288706946483125e-06, + "loss": 0.098, + "step": 43627 + }, + { + "epoch": 0.7781543181250669, + "grad_norm": 0.19808503985404968, + "learning_rate": 7.127782290363369e-06, + "loss": 0.0867, + "step": 43628 + }, + { + "epoch": 0.7781721542467805, + "grad_norm": 0.24737536907196045, + "learning_rate": 7.126693955357319e-06, + "loss": 0.064, + "step": 43629 + }, + { + "epoch": 0.7781899903684942, + "grad_norm": 0.2788330614566803, + "learning_rate": 7.125605689634377e-06, + "loss": 0.1296, + "step": 43630 + }, + { + "epoch": 0.7782078264902079, + "grad_norm": 0.20348486304283142, + "learning_rate": 7.12451749319876e-06, + "loss": 0.0614, + "step": 43631 + }, + { + "epoch": 0.7782256626119216, + "grad_norm": 0.35838407278060913, + "learning_rate": 7.123429366054687e-06, + "loss": 0.1242, + "step": 43632 + }, + { + "epoch": 0.7782434987336354, + "grad_norm": 0.3719123303890228, + "learning_rate": 7.12234130820637e-06, + "loss": 0.1025, + "step": 43633 + }, + { + "epoch": 0.7782613348553491, + "grad_norm": 0.3032964766025543, + "learning_rate": 7.121253319658039e-06, + "loss": 0.123, + "step": 43634 + }, + { + "epoch": 0.7782791709770628, + "grad_norm": 0.30011990666389465, + "learning_rate": 7.1201654004139e-06, + "loss": 0.1127, + "step": 43635 + }, + { + "epoch": 0.7782970070987765, + "grad_norm": 0.2569604814052582, + "learning_rate": 7.119077550478182e-06, + "loss": 0.1195, + "step": 43636 + }, + { + "epoch": 0.7783148432204902, + "grad_norm": 0.22060345113277435, + "learning_rate": 7.117989769855085e-06, + "loss": 0.1273, + "step": 43637 + }, + { + "epoch": 0.7783326793422038, + "grad_norm": 0.4035508334636688, + "learning_rate": 7.116902058548847e-06, + "loss": 0.1217, + "step": 43638 + }, + { + "epoch": 0.7783505154639175, + "grad_norm": 0.3727690279483795, + "learning_rate": 7.115814416563671e-06, + "loss": 0.1288, + "step": 43639 + }, + { + "epoch": 0.7783683515856312, + "grad_norm": 0.2476806938648224, + "learning_rate": 7.1147268439037764e-06, + "loss": 0.1158, + "step": 43640 + }, + { + "epoch": 0.7783861877073449, + "grad_norm": 0.5200356245040894, + "learning_rate": 7.113639340573372e-06, + "loss": 0.1146, + "step": 43641 + }, + { + "epoch": 0.7784040238290586, + "grad_norm": 0.2770136296749115, + "learning_rate": 7.112551906576684e-06, + "loss": 0.1533, + "step": 43642 + }, + { + "epoch": 0.7784218599507723, + "grad_norm": 0.22945545613765717, + "learning_rate": 7.111464541917926e-06, + "loss": 0.1086, + "step": 43643 + }, + { + "epoch": 0.778439696072486, + "grad_norm": 0.33841952681541443, + "learning_rate": 7.110377246601307e-06, + "loss": 0.1326, + "step": 43644 + }, + { + "epoch": 0.7784575321941997, + "grad_norm": 0.30094030499458313, + "learning_rate": 7.109290020631046e-06, + "loss": 0.1588, + "step": 43645 + }, + { + "epoch": 0.7784753683159134, + "grad_norm": 0.3093232214450836, + "learning_rate": 7.108202864011349e-06, + "loss": 0.1426, + "step": 43646 + }, + { + "epoch": 0.778493204437627, + "grad_norm": 0.2519281208515167, + "learning_rate": 7.1071157767464424e-06, + "loss": 0.1142, + "step": 43647 + }, + { + "epoch": 0.7785110405593407, + "grad_norm": 0.25267964601516724, + "learning_rate": 7.10602875884053e-06, + "loss": 0.1733, + "step": 43648 + }, + { + "epoch": 0.7785288766810545, + "grad_norm": 0.2881571650505066, + "learning_rate": 7.104941810297836e-06, + "loss": 0.1081, + "step": 43649 + }, + { + "epoch": 0.7785467128027682, + "grad_norm": 0.18375305831432343, + "learning_rate": 7.103854931122561e-06, + "loss": 0.0927, + "step": 43650 + }, + { + "epoch": 0.7785645489244819, + "grad_norm": 0.23631274700164795, + "learning_rate": 7.102768121318934e-06, + "loss": 0.1031, + "step": 43651 + }, + { + "epoch": 0.7785823850461956, + "grad_norm": 0.2417704463005066, + "learning_rate": 7.101681380891159e-06, + "loss": 0.0633, + "step": 43652 + }, + { + "epoch": 0.7786002211679093, + "grad_norm": 0.30564403533935547, + "learning_rate": 7.10059470984345e-06, + "loss": 0.132, + "step": 43653 + }, + { + "epoch": 0.778618057289623, + "grad_norm": 0.2736362814903259, + "learning_rate": 7.099508108180011e-06, + "loss": 0.1251, + "step": 43654 + }, + { + "epoch": 0.7786358934113367, + "grad_norm": 0.2428872436285019, + "learning_rate": 7.098421575905068e-06, + "loss": 0.0933, + "step": 43655 + }, + { + "epoch": 0.7786537295330503, + "grad_norm": 0.4220563471317291, + "learning_rate": 7.097335113022824e-06, + "loss": 0.1694, + "step": 43656 + }, + { + "epoch": 0.778671565654764, + "grad_norm": 0.28112512826919556, + "learning_rate": 7.096248719537493e-06, + "loss": 0.1252, + "step": 43657 + }, + { + "epoch": 0.7786894017764777, + "grad_norm": 0.35753437876701355, + "learning_rate": 7.095162395453289e-06, + "loss": 0.1271, + "step": 43658 + }, + { + "epoch": 0.7787072378981914, + "grad_norm": 0.24700522422790527, + "learning_rate": 7.094076140774408e-06, + "loss": 0.0739, + "step": 43659 + }, + { + "epoch": 0.7787250740199051, + "grad_norm": 0.22753089666366577, + "learning_rate": 7.092989955505083e-06, + "loss": 0.1022, + "step": 43660 + }, + { + "epoch": 0.7787429101416188, + "grad_norm": 0.32332295179367065, + "learning_rate": 7.091903839649511e-06, + "loss": 0.1391, + "step": 43661 + }, + { + "epoch": 0.7787607462633325, + "grad_norm": 0.21662050485610962, + "learning_rate": 7.0908177932118984e-06, + "loss": 0.0832, + "step": 43662 + }, + { + "epoch": 0.7787785823850462, + "grad_norm": 0.33779171109199524, + "learning_rate": 7.089731816196471e-06, + "loss": 0.1601, + "step": 43663 + }, + { + "epoch": 0.7787964185067598, + "grad_norm": 0.3255383372306824, + "learning_rate": 7.0886459086074195e-06, + "loss": 0.1368, + "step": 43664 + }, + { + "epoch": 0.7788142546284735, + "grad_norm": 0.254547655582428, + "learning_rate": 7.087560070448973e-06, + "loss": 0.094, + "step": 43665 + }, + { + "epoch": 0.7788320907501873, + "grad_norm": 0.3348188102245331, + "learning_rate": 7.086474301725327e-06, + "loss": 0.1339, + "step": 43666 + }, + { + "epoch": 0.778849926871901, + "grad_norm": 0.32626059651374817, + "learning_rate": 7.0853886024406874e-06, + "loss": 0.1081, + "step": 43667 + }, + { + "epoch": 0.7788677629936147, + "grad_norm": 0.2311558872461319, + "learning_rate": 7.084302972599277e-06, + "loss": 0.0757, + "step": 43668 + }, + { + "epoch": 0.7788855991153284, + "grad_norm": 0.25786083936691284, + "learning_rate": 7.083217412205298e-06, + "loss": 0.1198, + "step": 43669 + }, + { + "epoch": 0.7789034352370421, + "grad_norm": 0.25509217381477356, + "learning_rate": 7.082131921262955e-06, + "loss": 0.1272, + "step": 43670 + }, + { + "epoch": 0.7789212713587558, + "grad_norm": 0.2245519757270813, + "learning_rate": 7.0810464997764555e-06, + "loss": 0.0757, + "step": 43671 + }, + { + "epoch": 0.7789391074804695, + "grad_norm": 0.2528594434261322, + "learning_rate": 7.0799611477500025e-06, + "loss": 0.1104, + "step": 43672 + }, + { + "epoch": 0.7789569436021831, + "grad_norm": 0.31715109944343567, + "learning_rate": 7.078875865187817e-06, + "loss": 0.1248, + "step": 43673 + }, + { + "epoch": 0.7789747797238968, + "grad_norm": 0.3272842466831207, + "learning_rate": 7.077790652094099e-06, + "loss": 0.1446, + "step": 43674 + }, + { + "epoch": 0.7789926158456105, + "grad_norm": 0.4566992223262787, + "learning_rate": 7.076705508473047e-06, + "loss": 0.1536, + "step": 43675 + }, + { + "epoch": 0.7790104519673242, + "grad_norm": 0.23735497891902924, + "learning_rate": 7.075620434328883e-06, + "loss": 0.0402, + "step": 43676 + }, + { + "epoch": 0.7790282880890379, + "grad_norm": 0.31610390543937683, + "learning_rate": 7.074535429665796e-06, + "loss": 0.088, + "step": 43677 + }, + { + "epoch": 0.7790461242107516, + "grad_norm": 0.2789989709854126, + "learning_rate": 7.07345049448801e-06, + "loss": 0.1305, + "step": 43678 + }, + { + "epoch": 0.7790639603324653, + "grad_norm": 0.35149288177490234, + "learning_rate": 7.072365628799721e-06, + "loss": 0.1631, + "step": 43679 + }, + { + "epoch": 0.779081796454179, + "grad_norm": 0.22422043979167938, + "learning_rate": 7.071280832605126e-06, + "loss": 0.0876, + "step": 43680 + }, + { + "epoch": 0.7790996325758927, + "grad_norm": 0.2994772493839264, + "learning_rate": 7.070196105908447e-06, + "loss": 0.0965, + "step": 43681 + }, + { + "epoch": 0.7791174686976063, + "grad_norm": 0.3156898021697998, + "learning_rate": 7.069111448713883e-06, + "loss": 0.1254, + "step": 43682 + }, + { + "epoch": 0.7791353048193201, + "grad_norm": 0.3250686228275299, + "learning_rate": 7.068026861025634e-06, + "loss": 0.1218, + "step": 43683 + }, + { + "epoch": 0.7791531409410338, + "grad_norm": 0.18285994231700897, + "learning_rate": 7.066942342847907e-06, + "loss": 0.0749, + "step": 43684 + }, + { + "epoch": 0.7791709770627475, + "grad_norm": 0.3119698166847229, + "learning_rate": 7.065857894184897e-06, + "loss": 0.1657, + "step": 43685 + }, + { + "epoch": 0.7791888131844612, + "grad_norm": 0.23059257864952087, + "learning_rate": 7.064773515040823e-06, + "loss": 0.092, + "step": 43686 + }, + { + "epoch": 0.7792066493061749, + "grad_norm": 0.30813658237457275, + "learning_rate": 7.063689205419882e-06, + "loss": 0.1569, + "step": 43687 + }, + { + "epoch": 0.7792244854278886, + "grad_norm": 0.22009910643100739, + "learning_rate": 7.062604965326278e-06, + "loss": 0.0809, + "step": 43688 + }, + { + "epoch": 0.7792423215496023, + "grad_norm": 0.25231876969337463, + "learning_rate": 7.061520794764204e-06, + "loss": 0.1231, + "step": 43689 + }, + { + "epoch": 0.779260157671316, + "grad_norm": 0.25878289341926575, + "learning_rate": 7.060436693737879e-06, + "loss": 0.1068, + "step": 43690 + }, + { + "epoch": 0.7792779937930296, + "grad_norm": 0.27812430262565613, + "learning_rate": 7.059352662251489e-06, + "loss": 0.1582, + "step": 43691 + }, + { + "epoch": 0.7792958299147433, + "grad_norm": 0.2660776674747467, + "learning_rate": 7.058268700309254e-06, + "loss": 0.1277, + "step": 43692 + }, + { + "epoch": 0.779313666036457, + "grad_norm": 0.2194720357656479, + "learning_rate": 7.057184807915357e-06, + "loss": 0.0938, + "step": 43693 + }, + { + "epoch": 0.7793315021581707, + "grad_norm": 0.26511961221694946, + "learning_rate": 7.0561009850740186e-06, + "loss": 0.0996, + "step": 43694 + }, + { + "epoch": 0.7793493382798844, + "grad_norm": 0.3341408669948578, + "learning_rate": 7.055017231789429e-06, + "loss": 0.1418, + "step": 43695 + }, + { + "epoch": 0.7793671744015981, + "grad_norm": 0.17767034471035004, + "learning_rate": 7.053933548065791e-06, + "loss": 0.1014, + "step": 43696 + }, + { + "epoch": 0.7793850105233118, + "grad_norm": 0.1988210678100586, + "learning_rate": 7.052849933907305e-06, + "loss": 0.1244, + "step": 43697 + }, + { + "epoch": 0.7794028466450255, + "grad_norm": 0.35376453399658203, + "learning_rate": 7.0517663893181624e-06, + "loss": 0.1153, + "step": 43698 + }, + { + "epoch": 0.7794206827667391, + "grad_norm": 0.18547537922859192, + "learning_rate": 7.050682914302581e-06, + "loss": 0.0629, + "step": 43699 + }, + { + "epoch": 0.779438518888453, + "grad_norm": 0.2151731550693512, + "learning_rate": 7.049599508864752e-06, + "loss": 0.1081, + "step": 43700 + }, + { + "epoch": 0.7794563550101666, + "grad_norm": 0.26846128702163696, + "learning_rate": 7.0485161730088765e-06, + "loss": 0.0834, + "step": 43701 + }, + { + "epoch": 0.7794741911318803, + "grad_norm": 0.28111574053764343, + "learning_rate": 7.047432906739143e-06, + "loss": 0.1437, + "step": 43702 + }, + { + "epoch": 0.779492027253594, + "grad_norm": 0.2761976420879364, + "learning_rate": 7.046349710059768e-06, + "loss": 0.1322, + "step": 43703 + }, + { + "epoch": 0.7795098633753077, + "grad_norm": 0.4626353681087494, + "learning_rate": 7.045266582974935e-06, + "loss": 0.1933, + "step": 43704 + }, + { + "epoch": 0.7795276994970214, + "grad_norm": 0.2717216908931732, + "learning_rate": 7.04418352548886e-06, + "loss": 0.0738, + "step": 43705 + }, + { + "epoch": 0.7795455356187351, + "grad_norm": 0.2565809488296509, + "learning_rate": 7.043100537605721e-06, + "loss": 0.1005, + "step": 43706 + }, + { + "epoch": 0.7795633717404488, + "grad_norm": 0.2800256609916687, + "learning_rate": 7.042017619329736e-06, + "loss": 0.069, + "step": 43707 + }, + { + "epoch": 0.7795812078621625, + "grad_norm": 0.2342686951160431, + "learning_rate": 7.0409347706650915e-06, + "loss": 0.1805, + "step": 43708 + }, + { + "epoch": 0.7795990439838761, + "grad_norm": 0.2554384171962738, + "learning_rate": 7.039851991615986e-06, + "loss": 0.1476, + "step": 43709 + }, + { + "epoch": 0.7796168801055898, + "grad_norm": 0.2566075026988983, + "learning_rate": 7.038769282186622e-06, + "loss": 0.1302, + "step": 43710 + }, + { + "epoch": 0.7796347162273035, + "grad_norm": 0.25781774520874023, + "learning_rate": 7.03768664238118e-06, + "loss": 0.1329, + "step": 43711 + }, + { + "epoch": 0.7796525523490172, + "grad_norm": 0.22653940320014954, + "learning_rate": 7.036604072203876e-06, + "loss": 0.0906, + "step": 43712 + }, + { + "epoch": 0.7796703884707309, + "grad_norm": 0.32700657844543457, + "learning_rate": 7.035521571658901e-06, + "loss": 0.0717, + "step": 43713 + }, + { + "epoch": 0.7796882245924446, + "grad_norm": 0.3249012529850006, + "learning_rate": 7.034439140750448e-06, + "loss": 0.129, + "step": 43714 + }, + { + "epoch": 0.7797060607141583, + "grad_norm": 0.2203959822654724, + "learning_rate": 7.033356779482708e-06, + "loss": 0.0785, + "step": 43715 + }, + { + "epoch": 0.779723896835872, + "grad_norm": 0.216635599732399, + "learning_rate": 7.032274487859888e-06, + "loss": 0.1112, + "step": 43716 + }, + { + "epoch": 0.7797417329575858, + "grad_norm": 0.34894847869873047, + "learning_rate": 7.03119226588618e-06, + "loss": 0.0887, + "step": 43717 + }, + { + "epoch": 0.7797595690792994, + "grad_norm": 0.27039068937301636, + "learning_rate": 7.030110113565766e-06, + "loss": 0.0596, + "step": 43718 + }, + { + "epoch": 0.7797774052010131, + "grad_norm": 0.28972846269607544, + "learning_rate": 7.0290280309028535e-06, + "loss": 0.1283, + "step": 43719 + }, + { + "epoch": 0.7797952413227268, + "grad_norm": 0.29053258895874023, + "learning_rate": 7.027946017901646e-06, + "loss": 0.1091, + "step": 43720 + }, + { + "epoch": 0.7798130774444405, + "grad_norm": 0.35831987857818604, + "learning_rate": 7.026864074566325e-06, + "loss": 0.1839, + "step": 43721 + }, + { + "epoch": 0.7798309135661542, + "grad_norm": 0.21541453897953033, + "learning_rate": 7.025782200901085e-06, + "loss": 0.0969, + "step": 43722 + }, + { + "epoch": 0.7798487496878679, + "grad_norm": 0.2554680109024048, + "learning_rate": 7.0247003969101225e-06, + "loss": 0.1407, + "step": 43723 + }, + { + "epoch": 0.7798665858095816, + "grad_norm": 0.2727867066860199, + "learning_rate": 7.0236186625976225e-06, + "loss": 0.1465, + "step": 43724 + }, + { + "epoch": 0.7798844219312953, + "grad_norm": 0.2268063724040985, + "learning_rate": 7.022536997967793e-06, + "loss": 0.0789, + "step": 43725 + }, + { + "epoch": 0.779902258053009, + "grad_norm": 0.3424607217311859, + "learning_rate": 7.021455403024818e-06, + "loss": 0.1049, + "step": 43726 + }, + { + "epoch": 0.7799200941747226, + "grad_norm": 0.3250359296798706, + "learning_rate": 7.020373877772893e-06, + "loss": 0.1836, + "step": 43727 + }, + { + "epoch": 0.7799379302964363, + "grad_norm": 0.32640814781188965, + "learning_rate": 7.019292422216201e-06, + "loss": 0.1267, + "step": 43728 + }, + { + "epoch": 0.77995576641815, + "grad_norm": 0.3251577913761139, + "learning_rate": 7.018211036358949e-06, + "loss": 0.1214, + "step": 43729 + }, + { + "epoch": 0.7799736025398637, + "grad_norm": 0.3441602289676666, + "learning_rate": 7.01712972020532e-06, + "loss": 0.1227, + "step": 43730 + }, + { + "epoch": 0.7799914386615774, + "grad_norm": 0.328098326921463, + "learning_rate": 7.016048473759501e-06, + "loss": 0.1382, + "step": 43731 + }, + { + "epoch": 0.7800092747832911, + "grad_norm": 0.1681549847126007, + "learning_rate": 7.014967297025698e-06, + "loss": 0.0841, + "step": 43732 + }, + { + "epoch": 0.7800271109050048, + "grad_norm": 0.26506534218788147, + "learning_rate": 7.0138861900080874e-06, + "loss": 0.1392, + "step": 43733 + }, + { + "epoch": 0.7800449470267186, + "grad_norm": 0.2814160883426666, + "learning_rate": 7.012805152710872e-06, + "loss": 0.0957, + "step": 43734 + }, + { + "epoch": 0.7800627831484322, + "grad_norm": 0.2107677459716797, + "learning_rate": 7.011724185138235e-06, + "loss": 0.1419, + "step": 43735 + }, + { + "epoch": 0.7800806192701459, + "grad_norm": 0.23588047921657562, + "learning_rate": 7.010643287294369e-06, + "loss": 0.0974, + "step": 43736 + }, + { + "epoch": 0.7800984553918596, + "grad_norm": 0.1811896413564682, + "learning_rate": 7.0095624591834555e-06, + "loss": 0.0858, + "step": 43737 + }, + { + "epoch": 0.7801162915135733, + "grad_norm": 0.3950812816619873, + "learning_rate": 7.008481700809699e-06, + "loss": 0.1537, + "step": 43738 + }, + { + "epoch": 0.780134127635287, + "grad_norm": 0.2639102041721344, + "learning_rate": 7.007401012177284e-06, + "loss": 0.1167, + "step": 43739 + }, + { + "epoch": 0.7801519637570007, + "grad_norm": 0.3729657828807831, + "learning_rate": 7.006320393290394e-06, + "loss": 0.1011, + "step": 43740 + }, + { + "epoch": 0.7801697998787144, + "grad_norm": 0.9485948085784912, + "learning_rate": 7.005239844153213e-06, + "loss": 0.1553, + "step": 43741 + }, + { + "epoch": 0.7801876360004281, + "grad_norm": 0.5171279907226562, + "learning_rate": 7.00415936476995e-06, + "loss": 0.1321, + "step": 43742 + }, + { + "epoch": 0.7802054721221418, + "grad_norm": 0.2862878143787384, + "learning_rate": 7.0030789551447765e-06, + "loss": 0.0837, + "step": 43743 + }, + { + "epoch": 0.7802233082438554, + "grad_norm": 0.280666321516037, + "learning_rate": 7.001998615281885e-06, + "loss": 0.1167, + "step": 43744 + }, + { + "epoch": 0.7802411443655691, + "grad_norm": 0.33776262402534485, + "learning_rate": 7.000918345185456e-06, + "loss": 0.137, + "step": 43745 + }, + { + "epoch": 0.7802589804872828, + "grad_norm": 0.2990826666355133, + "learning_rate": 6.999838144859686e-06, + "loss": 0.1449, + "step": 43746 + }, + { + "epoch": 0.7802768166089965, + "grad_norm": 0.21509036421775818, + "learning_rate": 6.998758014308765e-06, + "loss": 0.1123, + "step": 43747 + }, + { + "epoch": 0.7802946527307102, + "grad_norm": 0.26408934593200684, + "learning_rate": 6.997677953536877e-06, + "loss": 0.1579, + "step": 43748 + }, + { + "epoch": 0.7803124888524239, + "grad_norm": 0.22391390800476074, + "learning_rate": 6.996597962548207e-06, + "loss": 0.0671, + "step": 43749 + }, + { + "epoch": 0.7803303249741376, + "grad_norm": 0.24073286354541779, + "learning_rate": 6.995518041346935e-06, + "loss": 0.1353, + "step": 43750 + }, + { + "epoch": 0.7803481610958514, + "grad_norm": 0.3524903655052185, + "learning_rate": 6.99443818993726e-06, + "loss": 0.1179, + "step": 43751 + }, + { + "epoch": 0.780365997217565, + "grad_norm": 0.24867582321166992, + "learning_rate": 6.993358408323361e-06, + "loss": 0.1302, + "step": 43752 + }, + { + "epoch": 0.7803838333392787, + "grad_norm": 0.29625800251960754, + "learning_rate": 6.992278696509425e-06, + "loss": 0.1588, + "step": 43753 + }, + { + "epoch": 0.7804016694609924, + "grad_norm": 0.2628306746482849, + "learning_rate": 6.991199054499628e-06, + "loss": 0.121, + "step": 43754 + }, + { + "epoch": 0.7804195055827061, + "grad_norm": 0.243768110871315, + "learning_rate": 6.990119482298171e-06, + "loss": 0.0854, + "step": 43755 + }, + { + "epoch": 0.7804373417044198, + "grad_norm": 0.20008443295955658, + "learning_rate": 6.989039979909235e-06, + "loss": 0.0759, + "step": 43756 + }, + { + "epoch": 0.7804551778261335, + "grad_norm": 0.2615085542201996, + "learning_rate": 6.987960547336997e-06, + "loss": 0.1138, + "step": 43757 + }, + { + "epoch": 0.7804730139478472, + "grad_norm": 0.24916520714759827, + "learning_rate": 6.986881184585636e-06, + "loss": 0.1392, + "step": 43758 + }, + { + "epoch": 0.7804908500695609, + "grad_norm": 0.24652761220932007, + "learning_rate": 6.985801891659358e-06, + "loss": 0.102, + "step": 43759 + }, + { + "epoch": 0.7805086861912746, + "grad_norm": 0.2989273965358734, + "learning_rate": 6.984722668562321e-06, + "loss": 0.1385, + "step": 43760 + }, + { + "epoch": 0.7805265223129882, + "grad_norm": 0.3251599967479706, + "learning_rate": 6.983643515298732e-06, + "loss": 0.1139, + "step": 43761 + }, + { + "epoch": 0.7805443584347019, + "grad_norm": 0.23709046840667725, + "learning_rate": 6.982564431872754e-06, + "loss": 0.0833, + "step": 43762 + }, + { + "epoch": 0.7805621945564156, + "grad_norm": 0.33574166893959045, + "learning_rate": 6.981485418288589e-06, + "loss": 0.1587, + "step": 43763 + }, + { + "epoch": 0.7805800306781293, + "grad_norm": 0.26342105865478516, + "learning_rate": 6.980406474550408e-06, + "loss": 0.1179, + "step": 43764 + }, + { + "epoch": 0.780597866799843, + "grad_norm": 0.3333938419818878, + "learning_rate": 6.979327600662395e-06, + "loss": 0.1126, + "step": 43765 + }, + { + "epoch": 0.7806157029215567, + "grad_norm": 0.24686551094055176, + "learning_rate": 6.978248796628734e-06, + "loss": 0.1456, + "step": 43766 + }, + { + "epoch": 0.7806335390432705, + "grad_norm": 0.2772115170955658, + "learning_rate": 6.9771700624535945e-06, + "loss": 0.1519, + "step": 43767 + }, + { + "epoch": 0.7806513751649842, + "grad_norm": 0.25339657068252563, + "learning_rate": 6.9760913981411796e-06, + "loss": 0.1283, + "step": 43768 + }, + { + "epoch": 0.7806692112866979, + "grad_norm": 0.3239727318286896, + "learning_rate": 6.975012803695657e-06, + "loss": 0.1337, + "step": 43769 + }, + { + "epoch": 0.7806870474084115, + "grad_norm": 0.24212871491909027, + "learning_rate": 6.973934279121214e-06, + "loss": 0.1111, + "step": 43770 + }, + { + "epoch": 0.7807048835301252, + "grad_norm": 0.24759207665920258, + "learning_rate": 6.972855824422017e-06, + "loss": 0.0994, + "step": 43771 + }, + { + "epoch": 0.7807227196518389, + "grad_norm": 0.34922152757644653, + "learning_rate": 6.9717774396022674e-06, + "loss": 0.1085, + "step": 43772 + }, + { + "epoch": 0.7807405557735526, + "grad_norm": 0.26788589358329773, + "learning_rate": 6.970699124666124e-06, + "loss": 0.101, + "step": 43773 + }, + { + "epoch": 0.7807583918952663, + "grad_norm": 0.27522867918014526, + "learning_rate": 6.969620879617789e-06, + "loss": 0.1304, + "step": 43774 + }, + { + "epoch": 0.78077622801698, + "grad_norm": 0.2831469476222992, + "learning_rate": 6.968542704461423e-06, + "loss": 0.1364, + "step": 43775 + }, + { + "epoch": 0.7807940641386937, + "grad_norm": 0.32025599479675293, + "learning_rate": 6.9674645992012176e-06, + "loss": 0.1117, + "step": 43776 + }, + { + "epoch": 0.7808119002604074, + "grad_norm": 0.28036609292030334, + "learning_rate": 6.9663865638413515e-06, + "loss": 0.1368, + "step": 43777 + }, + { + "epoch": 0.780829736382121, + "grad_norm": 0.3188968300819397, + "learning_rate": 6.965308598385997e-06, + "loss": 0.155, + "step": 43778 + }, + { + "epoch": 0.7808475725038347, + "grad_norm": 0.24273408949375153, + "learning_rate": 6.964230702839339e-06, + "loss": 0.122, + "step": 43779 + }, + { + "epoch": 0.7808654086255484, + "grad_norm": 0.2649591267108917, + "learning_rate": 6.9631528772055424e-06, + "loss": 0.1227, + "step": 43780 + }, + { + "epoch": 0.7808832447472621, + "grad_norm": 0.25641506910324097, + "learning_rate": 6.962075121488801e-06, + "loss": 0.0858, + "step": 43781 + }, + { + "epoch": 0.7809010808689758, + "grad_norm": 0.24746544659137726, + "learning_rate": 6.960997435693286e-06, + "loss": 0.1175, + "step": 43782 + }, + { + "epoch": 0.7809189169906895, + "grad_norm": 0.35266461968421936, + "learning_rate": 6.95991981982318e-06, + "loss": 0.1546, + "step": 43783 + }, + { + "epoch": 0.7809367531124033, + "grad_norm": 0.23821569979190826, + "learning_rate": 6.958842273882643e-06, + "loss": 0.1033, + "step": 43784 + }, + { + "epoch": 0.780954589234117, + "grad_norm": 0.2249196618795395, + "learning_rate": 6.957764797875877e-06, + "loss": 0.1176, + "step": 43785 + }, + { + "epoch": 0.7809724253558307, + "grad_norm": 0.22049926221370697, + "learning_rate": 6.956687391807043e-06, + "loss": 0.0708, + "step": 43786 + }, + { + "epoch": 0.7809902614775444, + "grad_norm": 0.2715582549571991, + "learning_rate": 6.955610055680312e-06, + "loss": 0.0913, + "step": 43787 + }, + { + "epoch": 0.781008097599258, + "grad_norm": 0.23448802530765533, + "learning_rate": 6.954532789499879e-06, + "loss": 0.0853, + "step": 43788 + }, + { + "epoch": 0.7810259337209717, + "grad_norm": 0.2523050904273987, + "learning_rate": 6.9534555932699e-06, + "loss": 0.1449, + "step": 43789 + }, + { + "epoch": 0.7810437698426854, + "grad_norm": 0.23628056049346924, + "learning_rate": 6.952378466994569e-06, + "loss": 0.127, + "step": 43790 + }, + { + "epoch": 0.7810616059643991, + "grad_norm": 0.28274673223495483, + "learning_rate": 6.951301410678055e-06, + "loss": 0.161, + "step": 43791 + }, + { + "epoch": 0.7810794420861128, + "grad_norm": 0.21495795249938965, + "learning_rate": 6.950224424324525e-06, + "loss": 0.1287, + "step": 43792 + }, + { + "epoch": 0.7810972782078265, + "grad_norm": 0.2318679541349411, + "learning_rate": 6.949147507938156e-06, + "loss": 0.0907, + "step": 43793 + }, + { + "epoch": 0.7811151143295402, + "grad_norm": 0.2539827525615692, + "learning_rate": 6.9480706615231316e-06, + "loss": 0.1227, + "step": 43794 + }, + { + "epoch": 0.7811329504512539, + "grad_norm": 0.235306516289711, + "learning_rate": 6.946993885083622e-06, + "loss": 0.084, + "step": 43795 + }, + { + "epoch": 0.7811507865729675, + "grad_norm": 0.3072325885295868, + "learning_rate": 6.945917178623798e-06, + "loss": 0.1177, + "step": 43796 + }, + { + "epoch": 0.7811686226946812, + "grad_norm": 0.2889297604560852, + "learning_rate": 6.944840542147827e-06, + "loss": 0.0908, + "step": 43797 + }, + { + "epoch": 0.7811864588163949, + "grad_norm": 0.23642177879810333, + "learning_rate": 6.943763975659898e-06, + "loss": 0.1313, + "step": 43798 + }, + { + "epoch": 0.7812042949381086, + "grad_norm": 0.28191569447517395, + "learning_rate": 6.942687479164176e-06, + "loss": 0.108, + "step": 43799 + }, + { + "epoch": 0.7812221310598223, + "grad_norm": 0.25036439299583435, + "learning_rate": 6.941611052664826e-06, + "loss": 0.1163, + "step": 43800 + }, + { + "epoch": 0.7812399671815361, + "grad_norm": 0.3078230023384094, + "learning_rate": 6.9405346961660375e-06, + "loss": 0.1198, + "step": 43801 + }, + { + "epoch": 0.7812578033032498, + "grad_norm": 0.244432270526886, + "learning_rate": 6.939458409671964e-06, + "loss": 0.0806, + "step": 43802 + }, + { + "epoch": 0.7812756394249635, + "grad_norm": 0.39712393283843994, + "learning_rate": 6.938382193186798e-06, + "loss": 0.1227, + "step": 43803 + }, + { + "epoch": 0.7812934755466772, + "grad_norm": 0.3207155466079712, + "learning_rate": 6.937306046714701e-06, + "loss": 0.0907, + "step": 43804 + }, + { + "epoch": 0.7813113116683909, + "grad_norm": 0.2498425990343094, + "learning_rate": 6.936229970259844e-06, + "loss": 0.1045, + "step": 43805 + }, + { + "epoch": 0.7813291477901045, + "grad_norm": 0.2713736593723297, + "learning_rate": 6.935153963826391e-06, + "loss": 0.1068, + "step": 43806 + }, + { + "epoch": 0.7813469839118182, + "grad_norm": 0.22285813093185425, + "learning_rate": 6.934078027418525e-06, + "loss": 0.077, + "step": 43807 + }, + { + "epoch": 0.7813648200335319, + "grad_norm": 0.30948761105537415, + "learning_rate": 6.933002161040417e-06, + "loss": 0.0683, + "step": 43808 + }, + { + "epoch": 0.7813826561552456, + "grad_norm": 0.27965933084487915, + "learning_rate": 6.9319263646962314e-06, + "loss": 0.1248, + "step": 43809 + }, + { + "epoch": 0.7814004922769593, + "grad_norm": 0.35496416687965393, + "learning_rate": 6.93085063839013e-06, + "loss": 0.1466, + "step": 43810 + }, + { + "epoch": 0.781418328398673, + "grad_norm": 0.2777808904647827, + "learning_rate": 6.929774982126302e-06, + "loss": 0.1037, + "step": 43811 + }, + { + "epoch": 0.7814361645203867, + "grad_norm": 0.2724512219429016, + "learning_rate": 6.928699395908908e-06, + "loss": 0.1415, + "step": 43812 + }, + { + "epoch": 0.7814540006421004, + "grad_norm": 0.27665358781814575, + "learning_rate": 6.927623879742115e-06, + "loss": 0.0978, + "step": 43813 + }, + { + "epoch": 0.781471836763814, + "grad_norm": 0.2774742543697357, + "learning_rate": 6.926548433630087e-06, + "loss": 0.1109, + "step": 43814 + }, + { + "epoch": 0.7814896728855277, + "grad_norm": 0.3812437951564789, + "learning_rate": 6.925473057577009e-06, + "loss": 0.1205, + "step": 43815 + }, + { + "epoch": 0.7815075090072414, + "grad_norm": 0.2852420210838318, + "learning_rate": 6.92439775158703e-06, + "loss": 0.0946, + "step": 43816 + }, + { + "epoch": 0.7815253451289551, + "grad_norm": 0.2125966101884842, + "learning_rate": 6.923322515664335e-06, + "loss": 0.1242, + "step": 43817 + }, + { + "epoch": 0.7815431812506689, + "grad_norm": 0.2485707402229309, + "learning_rate": 6.922247349813088e-06, + "loss": 0.1039, + "step": 43818 + }, + { + "epoch": 0.7815610173723826, + "grad_norm": 0.4582907557487488, + "learning_rate": 6.921172254037448e-06, + "loss": 0.1862, + "step": 43819 + }, + { + "epoch": 0.7815788534940963, + "grad_norm": 0.3127819001674652, + "learning_rate": 6.920097228341593e-06, + "loss": 0.1136, + "step": 43820 + }, + { + "epoch": 0.78159668961581, + "grad_norm": 0.2834848463535309, + "learning_rate": 6.919022272729686e-06, + "loss": 0.1324, + "step": 43821 + }, + { + "epoch": 0.7816145257375237, + "grad_norm": 0.2748400568962097, + "learning_rate": 6.9179473872058946e-06, + "loss": 0.1222, + "step": 43822 + }, + { + "epoch": 0.7816323618592373, + "grad_norm": 0.3027372658252716, + "learning_rate": 6.916872571774377e-06, + "loss": 0.1306, + "step": 43823 + }, + { + "epoch": 0.781650197980951, + "grad_norm": 0.3411955237388611, + "learning_rate": 6.915797826439313e-06, + "loss": 0.1106, + "step": 43824 + }, + { + "epoch": 0.7816680341026647, + "grad_norm": 0.32070213556289673, + "learning_rate": 6.914723151204866e-06, + "loss": 0.1481, + "step": 43825 + }, + { + "epoch": 0.7816858702243784, + "grad_norm": 0.31144171953201294, + "learning_rate": 6.913648546075197e-06, + "loss": 0.1643, + "step": 43826 + }, + { + "epoch": 0.7817037063460921, + "grad_norm": 0.2526528537273407, + "learning_rate": 6.912574011054462e-06, + "loss": 0.0235, + "step": 43827 + }, + { + "epoch": 0.7817215424678058, + "grad_norm": 0.2426234781742096, + "learning_rate": 6.911499546146849e-06, + "loss": 0.1071, + "step": 43828 + }, + { + "epoch": 0.7817393785895195, + "grad_norm": 0.2782787084579468, + "learning_rate": 6.910425151356503e-06, + "loss": 0.0915, + "step": 43829 + }, + { + "epoch": 0.7817572147112332, + "grad_norm": 0.27718019485473633, + "learning_rate": 6.909350826687605e-06, + "loss": 0.1442, + "step": 43830 + }, + { + "epoch": 0.7817750508329468, + "grad_norm": 0.2478848546743393, + "learning_rate": 6.9082765721443096e-06, + "loss": 0.1174, + "step": 43831 + }, + { + "epoch": 0.7817928869546605, + "grad_norm": 0.3368293046951294, + "learning_rate": 6.907202387730777e-06, + "loss": 0.1697, + "step": 43832 + }, + { + "epoch": 0.7818107230763742, + "grad_norm": 0.30247530341148376, + "learning_rate": 6.906128273451184e-06, + "loss": 0.1477, + "step": 43833 + }, + { + "epoch": 0.7818285591980879, + "grad_norm": 0.2598891854286194, + "learning_rate": 6.905054229309688e-06, + "loss": 0.1296, + "step": 43834 + }, + { + "epoch": 0.7818463953198017, + "grad_norm": 0.26453354954719543, + "learning_rate": 6.903980255310449e-06, + "loss": 0.0963, + "step": 43835 + }, + { + "epoch": 0.7818642314415154, + "grad_norm": 0.2543056309223175, + "learning_rate": 6.902906351457628e-06, + "loss": 0.1073, + "step": 43836 + }, + { + "epoch": 0.7818820675632291, + "grad_norm": 0.32321298122406006, + "learning_rate": 6.901832517755399e-06, + "loss": 0.0419, + "step": 43837 + }, + { + "epoch": 0.7818999036849428, + "grad_norm": 0.32585689425468445, + "learning_rate": 6.900758754207917e-06, + "loss": 0.1218, + "step": 43838 + }, + { + "epoch": 0.7819177398066565, + "grad_norm": 0.24500715732574463, + "learning_rate": 6.899685060819347e-06, + "loss": 0.1076, + "step": 43839 + }, + { + "epoch": 0.7819355759283702, + "grad_norm": 0.2550657093524933, + "learning_rate": 6.898611437593841e-06, + "loss": 0.1249, + "step": 43840 + }, + { + "epoch": 0.7819534120500838, + "grad_norm": 0.22559019923210144, + "learning_rate": 6.897537884535579e-06, + "loss": 0.1215, + "step": 43841 + }, + { + "epoch": 0.7819712481717975, + "grad_norm": 0.2994731068611145, + "learning_rate": 6.8964644016487085e-06, + "loss": 0.163, + "step": 43842 + }, + { + "epoch": 0.7819890842935112, + "grad_norm": 0.267466276884079, + "learning_rate": 6.895390988937389e-06, + "loss": 0.0943, + "step": 43843 + }, + { + "epoch": 0.7820069204152249, + "grad_norm": 0.24043338000774384, + "learning_rate": 6.894317646405796e-06, + "loss": 0.1274, + "step": 43844 + }, + { + "epoch": 0.7820247565369386, + "grad_norm": 0.2896980941295624, + "learning_rate": 6.893244374058072e-06, + "loss": 0.1213, + "step": 43845 + }, + { + "epoch": 0.7820425926586523, + "grad_norm": 0.23170100152492523, + "learning_rate": 6.892171171898396e-06, + "loss": 0.1357, + "step": 43846 + }, + { + "epoch": 0.782060428780366, + "grad_norm": 0.23925979435443878, + "learning_rate": 6.891098039930918e-06, + "loss": 0.0695, + "step": 43847 + }, + { + "epoch": 0.7820782649020797, + "grad_norm": 0.4208838939666748, + "learning_rate": 6.890024978159798e-06, + "loss": 0.1702, + "step": 43848 + }, + { + "epoch": 0.7820961010237933, + "grad_norm": 0.32660186290740967, + "learning_rate": 6.888951986589187e-06, + "loss": 0.202, + "step": 43849 + }, + { + "epoch": 0.782113937145507, + "grad_norm": 0.350597620010376, + "learning_rate": 6.8878790652232655e-06, + "loss": 0.1263, + "step": 43850 + }, + { + "epoch": 0.7821317732672207, + "grad_norm": 0.28322306275367737, + "learning_rate": 6.886806214066177e-06, + "loss": 0.1368, + "step": 43851 + }, + { + "epoch": 0.7821496093889345, + "grad_norm": 0.3919888138771057, + "learning_rate": 6.885733433122085e-06, + "loss": 0.1047, + "step": 43852 + }, + { + "epoch": 0.7821674455106482, + "grad_norm": 0.2230241596698761, + "learning_rate": 6.884660722395137e-06, + "loss": 0.0808, + "step": 43853 + }, + { + "epoch": 0.7821852816323619, + "grad_norm": 0.25251758098602295, + "learning_rate": 6.883588081889511e-06, + "loss": 0.1184, + "step": 43854 + }, + { + "epoch": 0.7822031177540756, + "grad_norm": 0.23842795193195343, + "learning_rate": 6.882515511609353e-06, + "loss": 0.0665, + "step": 43855 + }, + { + "epoch": 0.7822209538757893, + "grad_norm": 0.2097027748823166, + "learning_rate": 6.881443011558817e-06, + "loss": 0.117, + "step": 43856 + }, + { + "epoch": 0.782238789997503, + "grad_norm": 0.2694172263145447, + "learning_rate": 6.880370581742071e-06, + "loss": 0.1148, + "step": 43857 + }, + { + "epoch": 0.7822566261192166, + "grad_norm": 0.22149275243282318, + "learning_rate": 6.8792982221632606e-06, + "loss": 0.1118, + "step": 43858 + }, + { + "epoch": 0.7822744622409303, + "grad_norm": 0.2684837281703949, + "learning_rate": 6.878225932826557e-06, + "loss": 0.1523, + "step": 43859 + }, + { + "epoch": 0.782292298362644, + "grad_norm": 0.26369479298591614, + "learning_rate": 6.877153713736109e-06, + "loss": 0.1392, + "step": 43860 + }, + { + "epoch": 0.7823101344843577, + "grad_norm": 0.24295039474964142, + "learning_rate": 6.8760815648960725e-06, + "loss": 0.0775, + "step": 43861 + }, + { + "epoch": 0.7823279706060714, + "grad_norm": 0.33857327699661255, + "learning_rate": 6.8750094863105936e-06, + "loss": 0.0966, + "step": 43862 + }, + { + "epoch": 0.7823458067277851, + "grad_norm": 0.2506985664367676, + "learning_rate": 6.87393747798385e-06, + "loss": 0.1054, + "step": 43863 + }, + { + "epoch": 0.7823636428494988, + "grad_norm": 0.3456091582775116, + "learning_rate": 6.8728655399199815e-06, + "loss": 0.1244, + "step": 43864 + }, + { + "epoch": 0.7823814789712125, + "grad_norm": 0.25720977783203125, + "learning_rate": 6.8717936721231475e-06, + "loss": 0.1144, + "step": 43865 + }, + { + "epoch": 0.7823993150929262, + "grad_norm": 0.3041876256465912, + "learning_rate": 6.870721874597497e-06, + "loss": 0.147, + "step": 43866 + }, + { + "epoch": 0.7824171512146398, + "grad_norm": 0.2692136764526367, + "learning_rate": 6.869650147347196e-06, + "loss": 0.1315, + "step": 43867 + }, + { + "epoch": 0.7824349873363536, + "grad_norm": 0.22703337669372559, + "learning_rate": 6.868578490376393e-06, + "loss": 0.1152, + "step": 43868 + }, + { + "epoch": 0.7824528234580673, + "grad_norm": 0.318968802690506, + "learning_rate": 6.867506903689244e-06, + "loss": 0.0974, + "step": 43869 + }, + { + "epoch": 0.782470659579781, + "grad_norm": 0.251120388507843, + "learning_rate": 6.866435387289893e-06, + "loss": 0.1455, + "step": 43870 + }, + { + "epoch": 0.7824884957014947, + "grad_norm": 0.435168594121933, + "learning_rate": 6.865363941182507e-06, + "loss": 0.1353, + "step": 43871 + }, + { + "epoch": 0.7825063318232084, + "grad_norm": 0.28813475370407104, + "learning_rate": 6.864292565371228e-06, + "loss": 0.1158, + "step": 43872 + }, + { + "epoch": 0.7825241679449221, + "grad_norm": 0.3141791820526123, + "learning_rate": 6.863221259860223e-06, + "loss": 0.1158, + "step": 43873 + }, + { + "epoch": 0.7825420040666358, + "grad_norm": 0.2665664255619049, + "learning_rate": 6.862150024653635e-06, + "loss": 0.081, + "step": 43874 + }, + { + "epoch": 0.7825598401883495, + "grad_norm": 0.2959728538990021, + "learning_rate": 6.861078859755613e-06, + "loss": 0.1067, + "step": 43875 + }, + { + "epoch": 0.7825776763100631, + "grad_norm": 0.4040279984474182, + "learning_rate": 6.860007765170318e-06, + "loss": 0.0859, + "step": 43876 + }, + { + "epoch": 0.7825955124317768, + "grad_norm": 0.1971277892589569, + "learning_rate": 6.8589367409019e-06, + "loss": 0.0733, + "step": 43877 + }, + { + "epoch": 0.7826133485534905, + "grad_norm": 0.28245672583580017, + "learning_rate": 6.857865786954509e-06, + "loss": 0.1077, + "step": 43878 + }, + { + "epoch": 0.7826311846752042, + "grad_norm": 0.2788733243942261, + "learning_rate": 6.856794903332292e-06, + "loss": 0.081, + "step": 43879 + }, + { + "epoch": 0.7826490207969179, + "grad_norm": 0.2858203053474426, + "learning_rate": 6.855724090039408e-06, + "loss": 0.1732, + "step": 43880 + }, + { + "epoch": 0.7826668569186316, + "grad_norm": 0.30907875299453735, + "learning_rate": 6.8546533470800066e-06, + "loss": 0.1307, + "step": 43881 + }, + { + "epoch": 0.7826846930403453, + "grad_norm": 0.2967752516269684, + "learning_rate": 6.853582674458234e-06, + "loss": 0.1237, + "step": 43882 + }, + { + "epoch": 0.782702529162059, + "grad_norm": 0.2370687872171402, + "learning_rate": 6.852512072178236e-06, + "loss": 0.1414, + "step": 43883 + }, + { + "epoch": 0.7827203652837726, + "grad_norm": 0.23565343022346497, + "learning_rate": 6.851441540244175e-06, + "loss": 0.0707, + "step": 43884 + }, + { + "epoch": 0.7827382014054864, + "grad_norm": 0.31024396419525146, + "learning_rate": 6.850371078660189e-06, + "loss": 0.1524, + "step": 43885 + }, + { + "epoch": 0.7827560375272001, + "grad_norm": 0.31341469287872314, + "learning_rate": 6.84930068743044e-06, + "loss": 0.0962, + "step": 43886 + }, + { + "epoch": 0.7827738736489138, + "grad_norm": 0.30206796526908875, + "learning_rate": 6.8482303665590716e-06, + "loss": 0.1373, + "step": 43887 + }, + { + "epoch": 0.7827917097706275, + "grad_norm": 0.3254294693470001, + "learning_rate": 6.847160116050225e-06, + "loss": 0.1002, + "step": 43888 + }, + { + "epoch": 0.7828095458923412, + "grad_norm": 0.235929474234581, + "learning_rate": 6.8460899359080625e-06, + "loss": 0.0718, + "step": 43889 + }, + { + "epoch": 0.7828273820140549, + "grad_norm": 0.41313448548316956, + "learning_rate": 6.845019826136726e-06, + "loss": 0.1527, + "step": 43890 + }, + { + "epoch": 0.7828452181357686, + "grad_norm": 0.28257110714912415, + "learning_rate": 6.843949786740362e-06, + "loss": 0.0807, + "step": 43891 + }, + { + "epoch": 0.7828630542574823, + "grad_norm": 0.30631351470947266, + "learning_rate": 6.8428798177231104e-06, + "loss": 0.1376, + "step": 43892 + }, + { + "epoch": 0.782880890379196, + "grad_norm": 0.2863081395626068, + "learning_rate": 6.841809919089137e-06, + "loss": 0.0975, + "step": 43893 + }, + { + "epoch": 0.7828987265009096, + "grad_norm": 0.257034033536911, + "learning_rate": 6.840740090842582e-06, + "loss": 0.0913, + "step": 43894 + }, + { + "epoch": 0.7829165626226233, + "grad_norm": 0.230775848031044, + "learning_rate": 6.839670332987588e-06, + "loss": 0.1023, + "step": 43895 + }, + { + "epoch": 0.782934398744337, + "grad_norm": 0.3579098582267761, + "learning_rate": 6.8386006455283035e-06, + "loss": 0.1564, + "step": 43896 + }, + { + "epoch": 0.7829522348660507, + "grad_norm": 0.24949531257152557, + "learning_rate": 6.837531028468871e-06, + "loss": 0.1204, + "step": 43897 + }, + { + "epoch": 0.7829700709877644, + "grad_norm": 0.24813711643218994, + "learning_rate": 6.836461481813447e-06, + "loss": 0.1109, + "step": 43898 + }, + { + "epoch": 0.7829879071094781, + "grad_norm": 0.21334555745124817, + "learning_rate": 6.835392005566166e-06, + "loss": 0.1135, + "step": 43899 + }, + { + "epoch": 0.7830057432311918, + "grad_norm": 0.43501394987106323, + "learning_rate": 6.834322599731185e-06, + "loss": 0.2158, + "step": 43900 + }, + { + "epoch": 0.7830235793529055, + "grad_norm": 0.2504563331604004, + "learning_rate": 6.833253264312637e-06, + "loss": 0.1289, + "step": 43901 + }, + { + "epoch": 0.7830414154746193, + "grad_norm": 0.23282556235790253, + "learning_rate": 6.832183999314682e-06, + "loss": 0.1426, + "step": 43902 + }, + { + "epoch": 0.7830592515963329, + "grad_norm": 0.24387699365615845, + "learning_rate": 6.831114804741453e-06, + "loss": 0.098, + "step": 43903 + }, + { + "epoch": 0.7830770877180466, + "grad_norm": 0.35548272728919983, + "learning_rate": 6.830045680597103e-06, + "loss": 0.1442, + "step": 43904 + }, + { + "epoch": 0.7830949238397603, + "grad_norm": 0.31087082624435425, + "learning_rate": 6.828976626885763e-06, + "loss": 0.0827, + "step": 43905 + }, + { + "epoch": 0.783112759961474, + "grad_norm": 0.24772372841835022, + "learning_rate": 6.827907643611592e-06, + "loss": 0.1621, + "step": 43906 + }, + { + "epoch": 0.7831305960831877, + "grad_norm": 0.2577557861804962, + "learning_rate": 6.826838730778728e-06, + "loss": 0.095, + "step": 43907 + }, + { + "epoch": 0.7831484322049014, + "grad_norm": 0.250766783952713, + "learning_rate": 6.825769888391315e-06, + "loss": 0.1094, + "step": 43908 + }, + { + "epoch": 0.7831662683266151, + "grad_norm": 0.28177720308303833, + "learning_rate": 6.824701116453494e-06, + "loss": 0.063, + "step": 43909 + }, + { + "epoch": 0.7831841044483288, + "grad_norm": 0.2567859888076782, + "learning_rate": 6.823632414969402e-06, + "loss": 0.0855, + "step": 43910 + }, + { + "epoch": 0.7832019405700424, + "grad_norm": 0.30823659896850586, + "learning_rate": 6.822563783943195e-06, + "loss": 0.1175, + "step": 43911 + }, + { + "epoch": 0.7832197766917561, + "grad_norm": 0.28615644574165344, + "learning_rate": 6.8214952233790015e-06, + "loss": 0.1461, + "step": 43912 + }, + { + "epoch": 0.7832376128134698, + "grad_norm": 0.2536158859729767, + "learning_rate": 6.82042673328098e-06, + "loss": 0.0681, + "step": 43913 + }, + { + "epoch": 0.7832554489351835, + "grad_norm": 0.3210618495941162, + "learning_rate": 6.8193583136532565e-06, + "loss": 0.1579, + "step": 43914 + }, + { + "epoch": 0.7832732850568972, + "grad_norm": 0.33921271562576294, + "learning_rate": 6.818289964499985e-06, + "loss": 0.1581, + "step": 43915 + }, + { + "epoch": 0.7832911211786109, + "grad_norm": 0.20877552032470703, + "learning_rate": 6.8172216858253035e-06, + "loss": 0.0845, + "step": 43916 + }, + { + "epoch": 0.7833089573003246, + "grad_norm": 0.2081398218870163, + "learning_rate": 6.81615347763335e-06, + "loss": 0.0666, + "step": 43917 + }, + { + "epoch": 0.7833267934220383, + "grad_norm": 0.3053925633430481, + "learning_rate": 6.81508533992826e-06, + "loss": 0.114, + "step": 43918 + }, + { + "epoch": 0.7833446295437521, + "grad_norm": 0.2868819534778595, + "learning_rate": 6.814017272714185e-06, + "loss": 0.1223, + "step": 43919 + }, + { + "epoch": 0.7833624656654657, + "grad_norm": 0.27690690755844116, + "learning_rate": 6.812949275995262e-06, + "loss": 0.1664, + "step": 43920 + }, + { + "epoch": 0.7833803017871794, + "grad_norm": 0.3040023446083069, + "learning_rate": 6.811881349775628e-06, + "loss": 0.0935, + "step": 43921 + }, + { + "epoch": 0.7833981379088931, + "grad_norm": 0.29304078221321106, + "learning_rate": 6.810813494059423e-06, + "loss": 0.1231, + "step": 43922 + }, + { + "epoch": 0.7834159740306068, + "grad_norm": 0.21517883241176605, + "learning_rate": 6.8097457088507815e-06, + "loss": 0.1323, + "step": 43923 + }, + { + "epoch": 0.7834338101523205, + "grad_norm": 0.2635672688484192, + "learning_rate": 6.808677994153856e-06, + "loss": 0.1296, + "step": 43924 + }, + { + "epoch": 0.7834516462740342, + "grad_norm": 0.36073535680770874, + "learning_rate": 6.807610349972776e-06, + "loss": 0.0946, + "step": 43925 + }, + { + "epoch": 0.7834694823957479, + "grad_norm": 0.2448207139968872, + "learning_rate": 6.806542776311678e-06, + "loss": 0.0882, + "step": 43926 + }, + { + "epoch": 0.7834873185174616, + "grad_norm": 0.2364901900291443, + "learning_rate": 6.8054752731746996e-06, + "loss": 0.0632, + "step": 43927 + }, + { + "epoch": 0.7835051546391752, + "grad_norm": 0.33054494857788086, + "learning_rate": 6.8044078405659934e-06, + "loss": 0.1269, + "step": 43928 + }, + { + "epoch": 0.7835229907608889, + "grad_norm": 0.308354914188385, + "learning_rate": 6.803340478489686e-06, + "loss": 0.1209, + "step": 43929 + }, + { + "epoch": 0.7835408268826026, + "grad_norm": 0.28551816940307617, + "learning_rate": 6.802273186949914e-06, + "loss": 0.0817, + "step": 43930 + }, + { + "epoch": 0.7835586630043163, + "grad_norm": 0.20581507682800293, + "learning_rate": 6.801205965950813e-06, + "loss": 0.0769, + "step": 43931 + }, + { + "epoch": 0.78357649912603, + "grad_norm": 0.32127198576927185, + "learning_rate": 6.800138815496529e-06, + "loss": 0.1092, + "step": 43932 + }, + { + "epoch": 0.7835943352477437, + "grad_norm": 0.2638777196407318, + "learning_rate": 6.799071735591192e-06, + "loss": 0.1252, + "step": 43933 + }, + { + "epoch": 0.7836121713694574, + "grad_norm": 0.29972749948501587, + "learning_rate": 6.798004726238941e-06, + "loss": 0.1693, + "step": 43934 + }, + { + "epoch": 0.7836300074911711, + "grad_norm": 0.20652666687965393, + "learning_rate": 6.796937787443908e-06, + "loss": 0.0846, + "step": 43935 + }, + { + "epoch": 0.7836478436128849, + "grad_norm": 0.42658916115760803, + "learning_rate": 6.795870919210226e-06, + "loss": 0.1372, + "step": 43936 + }, + { + "epoch": 0.7836656797345986, + "grad_norm": 0.43499237298965454, + "learning_rate": 6.794804121542042e-06, + "loss": 0.0947, + "step": 43937 + }, + { + "epoch": 0.7836835158563122, + "grad_norm": 0.2992672026157379, + "learning_rate": 6.793737394443486e-06, + "loss": 0.0922, + "step": 43938 + }, + { + "epoch": 0.7837013519780259, + "grad_norm": 0.17826256155967712, + "learning_rate": 6.792670737918685e-06, + "loss": 0.0666, + "step": 43939 + }, + { + "epoch": 0.7837191880997396, + "grad_norm": 0.2414674162864685, + "learning_rate": 6.791604151971786e-06, + "loss": 0.1049, + "step": 43940 + }, + { + "epoch": 0.7837370242214533, + "grad_norm": 0.24372652173042297, + "learning_rate": 6.790537636606914e-06, + "loss": 0.0896, + "step": 43941 + }, + { + "epoch": 0.783754860343167, + "grad_norm": 0.37273260951042175, + "learning_rate": 6.789471191828215e-06, + "loss": 0.1177, + "step": 43942 + }, + { + "epoch": 0.7837726964648807, + "grad_norm": 0.2619059681892395, + "learning_rate": 6.788404817639812e-06, + "loss": 0.1578, + "step": 43943 + }, + { + "epoch": 0.7837905325865944, + "grad_norm": 0.2871662974357605, + "learning_rate": 6.7873385140458355e-06, + "loss": 0.1044, + "step": 43944 + }, + { + "epoch": 0.7838083687083081, + "grad_norm": 0.2847343981266022, + "learning_rate": 6.786272281050435e-06, + "loss": 0.1309, + "step": 43945 + }, + { + "epoch": 0.7838262048300217, + "grad_norm": 0.23448815941810608, + "learning_rate": 6.785206118657733e-06, + "loss": 0.1646, + "step": 43946 + }, + { + "epoch": 0.7838440409517354, + "grad_norm": 0.26971331238746643, + "learning_rate": 6.784140026871863e-06, + "loss": 0.1352, + "step": 43947 + }, + { + "epoch": 0.7838618770734491, + "grad_norm": 0.3233478367328644, + "learning_rate": 6.783074005696957e-06, + "loss": 0.1504, + "step": 43948 + }, + { + "epoch": 0.7838797131951628, + "grad_norm": 0.24761062860488892, + "learning_rate": 6.78200805513714e-06, + "loss": 0.099, + "step": 43949 + }, + { + "epoch": 0.7838975493168765, + "grad_norm": 0.29679733514785767, + "learning_rate": 6.7809421751965616e-06, + "loss": 0.141, + "step": 43950 + }, + { + "epoch": 0.7839153854385902, + "grad_norm": 0.4595220983028412, + "learning_rate": 6.779876365879342e-06, + "loss": 0.1813, + "step": 43951 + }, + { + "epoch": 0.7839332215603039, + "grad_norm": 0.47264882922172546, + "learning_rate": 6.778810627189616e-06, + "loss": 0.1052, + "step": 43952 + }, + { + "epoch": 0.7839510576820177, + "grad_norm": 0.19188813865184784, + "learning_rate": 6.7777449591315074e-06, + "loss": 0.0832, + "step": 43953 + }, + { + "epoch": 0.7839688938037314, + "grad_norm": 0.26854994893074036, + "learning_rate": 6.776679361709151e-06, + "loss": 0.1306, + "step": 43954 + }, + { + "epoch": 0.783986729925445, + "grad_norm": 0.23165906965732574, + "learning_rate": 6.775613834926686e-06, + "loss": 0.1024, + "step": 43955 + }, + { + "epoch": 0.7840045660471587, + "grad_norm": 0.24162918329238892, + "learning_rate": 6.77454837878824e-06, + "loss": 0.1252, + "step": 43956 + }, + { + "epoch": 0.7840224021688724, + "grad_norm": 0.18442334234714508, + "learning_rate": 6.773482993297928e-06, + "loss": 0.0696, + "step": 43957 + }, + { + "epoch": 0.7840402382905861, + "grad_norm": 0.3295544981956482, + "learning_rate": 6.772417678459902e-06, + "loss": 0.1308, + "step": 43958 + }, + { + "epoch": 0.7840580744122998, + "grad_norm": 0.32126617431640625, + "learning_rate": 6.7713524342782776e-06, + "loss": 0.1189, + "step": 43959 + }, + { + "epoch": 0.7840759105340135, + "grad_norm": 0.2620071470737457, + "learning_rate": 6.770287260757188e-06, + "loss": 0.0904, + "step": 43960 + }, + { + "epoch": 0.7840937466557272, + "grad_norm": 0.21310366690158844, + "learning_rate": 6.769222157900762e-06, + "loss": 0.0881, + "step": 43961 + }, + { + "epoch": 0.7841115827774409, + "grad_norm": 0.3438452482223511, + "learning_rate": 6.7681571257131205e-06, + "loss": 0.1422, + "step": 43962 + }, + { + "epoch": 0.7841294188991546, + "grad_norm": 0.24785585701465607, + "learning_rate": 6.767092164198407e-06, + "loss": 0.113, + "step": 43963 + }, + { + "epoch": 0.7841472550208682, + "grad_norm": 0.3359091281890869, + "learning_rate": 6.76602727336074e-06, + "loss": 0.1273, + "step": 43964 + }, + { + "epoch": 0.7841650911425819, + "grad_norm": 0.30333253741264343, + "learning_rate": 6.764962453204249e-06, + "loss": 0.1639, + "step": 43965 + }, + { + "epoch": 0.7841829272642956, + "grad_norm": 0.31708574295043945, + "learning_rate": 6.763897703733055e-06, + "loss": 0.1451, + "step": 43966 + }, + { + "epoch": 0.7842007633860093, + "grad_norm": 0.3266368508338928, + "learning_rate": 6.762833024951301e-06, + "loss": 0.1222, + "step": 43967 + }, + { + "epoch": 0.784218599507723, + "grad_norm": 0.25479280948638916, + "learning_rate": 6.761768416863096e-06, + "loss": 0.1451, + "step": 43968 + }, + { + "epoch": 0.7842364356294368, + "grad_norm": 0.2788829803466797, + "learning_rate": 6.760703879472582e-06, + "loss": 0.1289, + "step": 43969 + }, + { + "epoch": 0.7842542717511505, + "grad_norm": 0.2246149480342865, + "learning_rate": 6.759639412783875e-06, + "loss": 0.131, + "step": 43970 + }, + { + "epoch": 0.7842721078728642, + "grad_norm": 0.2637142539024353, + "learning_rate": 6.758575016801111e-06, + "loss": 0.1328, + "step": 43971 + }, + { + "epoch": 0.7842899439945779, + "grad_norm": 0.27741870284080505, + "learning_rate": 6.757510691528412e-06, + "loss": 0.1065, + "step": 43972 + }, + { + "epoch": 0.7843077801162915, + "grad_norm": 0.2824453115463257, + "learning_rate": 6.7564464369699015e-06, + "loss": 0.1518, + "step": 43973 + }, + { + "epoch": 0.7843256162380052, + "grad_norm": 0.23903805017471313, + "learning_rate": 6.755382253129705e-06, + "loss": 0.1283, + "step": 43974 + }, + { + "epoch": 0.7843434523597189, + "grad_norm": 0.33735576272010803, + "learning_rate": 6.754318140011942e-06, + "loss": 0.0976, + "step": 43975 + }, + { + "epoch": 0.7843612884814326, + "grad_norm": 0.20487698912620544, + "learning_rate": 6.753254097620751e-06, + "loss": 0.0272, + "step": 43976 + }, + { + "epoch": 0.7843791246031463, + "grad_norm": 0.31918108463287354, + "learning_rate": 6.752190125960251e-06, + "loss": 0.1709, + "step": 43977 + }, + { + "epoch": 0.78439696072486, + "grad_norm": 0.31330522894859314, + "learning_rate": 6.7511262250345625e-06, + "loss": 0.1598, + "step": 43978 + }, + { + "epoch": 0.7844147968465737, + "grad_norm": 0.4117167592048645, + "learning_rate": 6.7500623948478035e-06, + "loss": 0.1416, + "step": 43979 + }, + { + "epoch": 0.7844326329682874, + "grad_norm": 0.2615671455860138, + "learning_rate": 6.748998635404114e-06, + "loss": 0.1172, + "step": 43980 + }, + { + "epoch": 0.784450469090001, + "grad_norm": 0.2784390449523926, + "learning_rate": 6.747934946707604e-06, + "loss": 0.1363, + "step": 43981 + }, + { + "epoch": 0.7844683052117147, + "grad_norm": 0.2899610102176666, + "learning_rate": 6.746871328762408e-06, + "loss": 0.0806, + "step": 43982 + }, + { + "epoch": 0.7844861413334284, + "grad_norm": 0.23531195521354675, + "learning_rate": 6.745807781572636e-06, + "loss": 0.1121, + "step": 43983 + }, + { + "epoch": 0.7845039774551421, + "grad_norm": 0.27561086416244507, + "learning_rate": 6.744744305142425e-06, + "loss": 0.1054, + "step": 43984 + }, + { + "epoch": 0.7845218135768558, + "grad_norm": 0.3087584376335144, + "learning_rate": 6.743680899475891e-06, + "loss": 0.1488, + "step": 43985 + }, + { + "epoch": 0.7845396496985696, + "grad_norm": 0.23982226848602295, + "learning_rate": 6.742617564577152e-06, + "loss": 0.0784, + "step": 43986 + }, + { + "epoch": 0.7845574858202833, + "grad_norm": 0.26415035128593445, + "learning_rate": 6.741554300450339e-06, + "loss": 0.1149, + "step": 43987 + }, + { + "epoch": 0.784575321941997, + "grad_norm": 0.2885240614414215, + "learning_rate": 6.740491107099556e-06, + "loss": 0.1686, + "step": 43988 + }, + { + "epoch": 0.7845931580637107, + "grad_norm": 0.2962561249732971, + "learning_rate": 6.739427984528945e-06, + "loss": 0.1542, + "step": 43989 + }, + { + "epoch": 0.7846109941854243, + "grad_norm": 0.5219005942344666, + "learning_rate": 6.738364932742619e-06, + "loss": 0.2019, + "step": 43990 + }, + { + "epoch": 0.784628830307138, + "grad_norm": 0.30370572209358215, + "learning_rate": 6.737301951744696e-06, + "loss": 0.1721, + "step": 43991 + }, + { + "epoch": 0.7846466664288517, + "grad_norm": 0.2811250388622284, + "learning_rate": 6.736239041539294e-06, + "loss": 0.1278, + "step": 43992 + }, + { + "epoch": 0.7846645025505654, + "grad_norm": 0.2856088876724243, + "learning_rate": 6.7351762021305415e-06, + "loss": 0.0667, + "step": 43993 + }, + { + "epoch": 0.7846823386722791, + "grad_norm": 0.26672953367233276, + "learning_rate": 6.734113433522557e-06, + "loss": 0.1022, + "step": 43994 + }, + { + "epoch": 0.7847001747939928, + "grad_norm": 0.2474367469549179, + "learning_rate": 6.733050735719448e-06, + "loss": 0.1071, + "step": 43995 + }, + { + "epoch": 0.7847180109157065, + "grad_norm": 0.3611794710159302, + "learning_rate": 6.731988108725354e-06, + "loss": 0.0933, + "step": 43996 + }, + { + "epoch": 0.7847358470374202, + "grad_norm": 0.24332818388938904, + "learning_rate": 6.730925552544373e-06, + "loss": 0.1158, + "step": 43997 + }, + { + "epoch": 0.7847536831591339, + "grad_norm": 0.27692005038261414, + "learning_rate": 6.729863067180645e-06, + "loss": 0.1256, + "step": 43998 + }, + { + "epoch": 0.7847715192808475, + "grad_norm": 0.40714526176452637, + "learning_rate": 6.728800652638276e-06, + "loss": 0.1782, + "step": 43999 + }, + { + "epoch": 0.7847893554025612, + "grad_norm": 0.3263236880302429, + "learning_rate": 6.727738308921386e-06, + "loss": 0.0926, + "step": 44000 + }, + { + "epoch": 0.7847893554025612, + "eval_loss": 0.11120539903640747, + "eval_runtime": 108.1296, + "eval_samples_per_second": 9.47, + "eval_steps_per_second": 1.581, + "step": 44000 + }, + { + "epoch": 0.7848071915242749, + "grad_norm": 0.18879170715808868, + "learning_rate": 6.726676036034086e-06, + "loss": 0.072, + "step": 44001 + }, + { + "epoch": 0.7848250276459886, + "grad_norm": 0.25284790992736816, + "learning_rate": 6.725613833980509e-06, + "loss": 0.1251, + "step": 44002 + }, + { + "epoch": 0.7848428637677024, + "grad_norm": 0.22584082186222076, + "learning_rate": 6.724551702764764e-06, + "loss": 0.0748, + "step": 44003 + }, + { + "epoch": 0.7848606998894161, + "grad_norm": 0.2852792739868164, + "learning_rate": 6.7234896423909705e-06, + "loss": 0.1283, + "step": 44004 + }, + { + "epoch": 0.7848785360111298, + "grad_norm": 0.2150062769651413, + "learning_rate": 6.722427652863236e-06, + "loss": 0.0903, + "step": 44005 + }, + { + "epoch": 0.7848963721328435, + "grad_norm": 0.26435598731040955, + "learning_rate": 6.721365734185692e-06, + "loss": 0.0964, + "step": 44006 + }, + { + "epoch": 0.7849142082545572, + "grad_norm": 0.3176110088825226, + "learning_rate": 6.720303886362444e-06, + "loss": 0.142, + "step": 44007 + }, + { + "epoch": 0.7849320443762708, + "grad_norm": 0.28956207633018494, + "learning_rate": 6.719242109397617e-06, + "loss": 0.0835, + "step": 44008 + }, + { + "epoch": 0.7849498804979845, + "grad_norm": 0.2351217269897461, + "learning_rate": 6.71818040329531e-06, + "loss": 0.0728, + "step": 44009 + }, + { + "epoch": 0.7849677166196982, + "grad_norm": 0.22506709396839142, + "learning_rate": 6.717118768059652e-06, + "loss": 0.1348, + "step": 44010 + }, + { + "epoch": 0.7849855527414119, + "grad_norm": 0.3307000696659088, + "learning_rate": 6.716057203694764e-06, + "loss": 0.1138, + "step": 44011 + }, + { + "epoch": 0.7850033888631256, + "grad_norm": 0.25659558176994324, + "learning_rate": 6.714995710204755e-06, + "loss": 0.092, + "step": 44012 + }, + { + "epoch": 0.7850212249848393, + "grad_norm": 0.2781889736652374, + "learning_rate": 6.713934287593734e-06, + "loss": 0.0997, + "step": 44013 + }, + { + "epoch": 0.785039061106553, + "grad_norm": 0.22953982651233673, + "learning_rate": 6.712872935865816e-06, + "loss": 0.12, + "step": 44014 + }, + { + "epoch": 0.7850568972282667, + "grad_norm": 0.22984986007213593, + "learning_rate": 6.711811655025124e-06, + "loss": 0.1173, + "step": 44015 + }, + { + "epoch": 0.7850747333499803, + "grad_norm": 0.19141502678394318, + "learning_rate": 6.710750445075767e-06, + "loss": 0.0783, + "step": 44016 + }, + { + "epoch": 0.785092569471694, + "grad_norm": 0.24057647585868835, + "learning_rate": 6.709689306021857e-06, + "loss": 0.0946, + "step": 44017 + }, + { + "epoch": 0.7851104055934077, + "grad_norm": 0.25510695576667786, + "learning_rate": 6.7086282378675035e-06, + "loss": 0.13, + "step": 44018 + }, + { + "epoch": 0.7851282417151214, + "grad_norm": 0.1844690442085266, + "learning_rate": 6.70756724061683e-06, + "loss": 0.0952, + "step": 44019 + }, + { + "epoch": 0.7851460778368352, + "grad_norm": 0.2828642725944519, + "learning_rate": 6.706506314273944e-06, + "loss": 0.1141, + "step": 44020 + }, + { + "epoch": 0.7851639139585489, + "grad_norm": 0.3131805658340454, + "learning_rate": 6.70544545884296e-06, + "loss": 0.1497, + "step": 44021 + }, + { + "epoch": 0.7851817500802626, + "grad_norm": 0.2940351068973541, + "learning_rate": 6.7043846743279794e-06, + "loss": 0.1044, + "step": 44022 + }, + { + "epoch": 0.7851995862019763, + "grad_norm": 0.23246759176254272, + "learning_rate": 6.703323960733129e-06, + "loss": 0.0636, + "step": 44023 + }, + { + "epoch": 0.78521742232369, + "grad_norm": 0.2708224952220917, + "learning_rate": 6.70226331806251e-06, + "loss": 0.1226, + "step": 44024 + }, + { + "epoch": 0.7852352584454036, + "grad_norm": 0.30450719594955444, + "learning_rate": 6.701202746320243e-06, + "loss": 0.1517, + "step": 44025 + }, + { + "epoch": 0.7852530945671173, + "grad_norm": 0.3627087473869324, + "learning_rate": 6.700142245510435e-06, + "loss": 0.1171, + "step": 44026 + }, + { + "epoch": 0.785270930688831, + "grad_norm": 0.526991069316864, + "learning_rate": 6.699081815637187e-06, + "loss": 0.1053, + "step": 44027 + }, + { + "epoch": 0.7852887668105447, + "grad_norm": 0.33565235137939453, + "learning_rate": 6.698021456704628e-06, + "loss": 0.1473, + "step": 44028 + }, + { + "epoch": 0.7853066029322584, + "grad_norm": 0.4455755650997162, + "learning_rate": 6.696961168716862e-06, + "loss": 0.177, + "step": 44029 + }, + { + "epoch": 0.7853244390539721, + "grad_norm": 0.3626958131790161, + "learning_rate": 6.695900951677992e-06, + "loss": 0.1453, + "step": 44030 + }, + { + "epoch": 0.7853422751756858, + "grad_norm": 0.29024064540863037, + "learning_rate": 6.6948408055921255e-06, + "loss": 0.1466, + "step": 44031 + }, + { + "epoch": 0.7853601112973995, + "grad_norm": 0.23067434132099152, + "learning_rate": 6.693780730463387e-06, + "loss": 0.094, + "step": 44032 + }, + { + "epoch": 0.7853779474191132, + "grad_norm": 0.2677970826625824, + "learning_rate": 6.692720726295876e-06, + "loss": 0.1596, + "step": 44033 + }, + { + "epoch": 0.7853957835408268, + "grad_norm": 0.2243032306432724, + "learning_rate": 6.691660793093704e-06, + "loss": 0.129, + "step": 44034 + }, + { + "epoch": 0.7854136196625405, + "grad_norm": 0.22577647864818573, + "learning_rate": 6.690600930860969e-06, + "loss": 0.1229, + "step": 44035 + }, + { + "epoch": 0.7854314557842542, + "grad_norm": 0.24724464118480682, + "learning_rate": 6.689541139601799e-06, + "loss": 0.156, + "step": 44036 + }, + { + "epoch": 0.785449291905968, + "grad_norm": 0.2320672869682312, + "learning_rate": 6.68848141932028e-06, + "loss": 0.0798, + "step": 44037 + }, + { + "epoch": 0.7854671280276817, + "grad_norm": 0.3871418237686157, + "learning_rate": 6.687421770020541e-06, + "loss": 0.0982, + "step": 44038 + }, + { + "epoch": 0.7854849641493954, + "grad_norm": 0.24611538648605347, + "learning_rate": 6.686362191706683e-06, + "loss": 0.0809, + "step": 44039 + }, + { + "epoch": 0.7855028002711091, + "grad_norm": 0.1619541347026825, + "learning_rate": 6.685302684382799e-06, + "loss": 0.0761, + "step": 44040 + }, + { + "epoch": 0.7855206363928228, + "grad_norm": 0.2616454064846039, + "learning_rate": 6.684243248053018e-06, + "loss": 0.1387, + "step": 44041 + }, + { + "epoch": 0.7855384725145365, + "grad_norm": 0.2392408400774002, + "learning_rate": 6.683183882721434e-06, + "loss": 0.1228, + "step": 44042 + }, + { + "epoch": 0.7855563086362501, + "grad_norm": 0.3478841781616211, + "learning_rate": 6.682124588392155e-06, + "loss": 0.1072, + "step": 44043 + }, + { + "epoch": 0.7855741447579638, + "grad_norm": 0.29405397176742554, + "learning_rate": 6.681065365069283e-06, + "loss": 0.0961, + "step": 44044 + }, + { + "epoch": 0.7855919808796775, + "grad_norm": 0.2923218011856079, + "learning_rate": 6.680006212756934e-06, + "loss": 0.1052, + "step": 44045 + }, + { + "epoch": 0.7856098170013912, + "grad_norm": 0.3023574948310852, + "learning_rate": 6.678947131459213e-06, + "loss": 0.092, + "step": 44046 + }, + { + "epoch": 0.7856276531231049, + "grad_norm": 0.23581300675868988, + "learning_rate": 6.677888121180214e-06, + "loss": 0.1415, + "step": 44047 + }, + { + "epoch": 0.7856454892448186, + "grad_norm": 0.2426312118768692, + "learning_rate": 6.676829181924046e-06, + "loss": 0.097, + "step": 44048 + }, + { + "epoch": 0.7856633253665323, + "grad_norm": 0.26105406880378723, + "learning_rate": 6.6757703136948235e-06, + "loss": 0.1386, + "step": 44049 + }, + { + "epoch": 0.785681161488246, + "grad_norm": 0.28056710958480835, + "learning_rate": 6.674711516496643e-06, + "loss": 0.1255, + "step": 44050 + }, + { + "epoch": 0.7856989976099596, + "grad_norm": 0.33771297335624695, + "learning_rate": 6.673652790333604e-06, + "loss": 0.1507, + "step": 44051 + }, + { + "epoch": 0.7857168337316733, + "grad_norm": 0.29584062099456787, + "learning_rate": 6.672594135209823e-06, + "loss": 0.1335, + "step": 44052 + }, + { + "epoch": 0.785734669853387, + "grad_norm": 0.26861950755119324, + "learning_rate": 6.6715355511293905e-06, + "loss": 0.1004, + "step": 44053 + }, + { + "epoch": 0.7857525059751008, + "grad_norm": 0.47461599111557007, + "learning_rate": 6.6704770380964235e-06, + "loss": 0.1927, + "step": 44054 + }, + { + "epoch": 0.7857703420968145, + "grad_norm": 0.22906163334846497, + "learning_rate": 6.669418596115018e-06, + "loss": 0.1195, + "step": 44055 + }, + { + "epoch": 0.7857881782185282, + "grad_norm": 0.3668989837169647, + "learning_rate": 6.668360225189277e-06, + "loss": 0.099, + "step": 44056 + }, + { + "epoch": 0.7858060143402419, + "grad_norm": 0.2638819217681885, + "learning_rate": 6.667301925323296e-06, + "loss": 0.1338, + "step": 44057 + }, + { + "epoch": 0.7858238504619556, + "grad_norm": 0.22786612808704376, + "learning_rate": 6.666243696521194e-06, + "loss": 0.1199, + "step": 44058 + }, + { + "epoch": 0.7858416865836693, + "grad_norm": 0.303611159324646, + "learning_rate": 6.665185538787061e-06, + "loss": 0.1326, + "step": 44059 + }, + { + "epoch": 0.785859522705383, + "grad_norm": 0.293671578168869, + "learning_rate": 6.664127452125005e-06, + "loss": 0.1643, + "step": 44060 + }, + { + "epoch": 0.7858773588270966, + "grad_norm": 0.2638128995895386, + "learning_rate": 6.663069436539113e-06, + "loss": 0.1016, + "step": 44061 + }, + { + "epoch": 0.7858951949488103, + "grad_norm": 0.3524613082408905, + "learning_rate": 6.662011492033507e-06, + "loss": 0.1341, + "step": 44062 + }, + { + "epoch": 0.785913031070524, + "grad_norm": 0.16846193373203278, + "learning_rate": 6.660953618612276e-06, + "loss": 0.0622, + "step": 44063 + }, + { + "epoch": 0.7859308671922377, + "grad_norm": 0.2662205696105957, + "learning_rate": 6.659895816279518e-06, + "loss": 0.0639, + "step": 44064 + }, + { + "epoch": 0.7859487033139514, + "grad_norm": 0.2711627185344696, + "learning_rate": 6.658838085039343e-06, + "loss": 0.1012, + "step": 44065 + }, + { + "epoch": 0.7859665394356651, + "grad_norm": 0.30686184763908386, + "learning_rate": 6.65778042489584e-06, + "loss": 0.0938, + "step": 44066 + }, + { + "epoch": 0.7859843755573788, + "grad_norm": 0.22379377484321594, + "learning_rate": 6.656722835853124e-06, + "loss": 0.0838, + "step": 44067 + }, + { + "epoch": 0.7860022116790925, + "grad_norm": 0.28610849380493164, + "learning_rate": 6.655665317915286e-06, + "loss": 0.0515, + "step": 44068 + }, + { + "epoch": 0.7860200478008061, + "grad_norm": 0.2630413770675659, + "learning_rate": 6.654607871086424e-06, + "loss": 0.1497, + "step": 44069 + }, + { + "epoch": 0.7860378839225199, + "grad_norm": 0.22608932852745056, + "learning_rate": 6.653550495370631e-06, + "loss": 0.1221, + "step": 44070 + }, + { + "epoch": 0.7860557200442336, + "grad_norm": 0.29861024022102356, + "learning_rate": 6.652493190772019e-06, + "loss": 0.1484, + "step": 44071 + }, + { + "epoch": 0.7860735561659473, + "grad_norm": 0.2642778158187866, + "learning_rate": 6.651435957294683e-06, + "loss": 0.1031, + "step": 44072 + }, + { + "epoch": 0.786091392287661, + "grad_norm": 0.24304579198360443, + "learning_rate": 6.650378794942716e-06, + "loss": 0.1514, + "step": 44073 + }, + { + "epoch": 0.7861092284093747, + "grad_norm": 0.20899224281311035, + "learning_rate": 6.649321703720212e-06, + "loss": 0.0754, + "step": 44074 + }, + { + "epoch": 0.7861270645310884, + "grad_norm": 0.443224161863327, + "learning_rate": 6.648264683631284e-06, + "loss": 0.1127, + "step": 44075 + }, + { + "epoch": 0.7861449006528021, + "grad_norm": 0.2710306644439697, + "learning_rate": 6.647207734680019e-06, + "loss": 0.1265, + "step": 44076 + }, + { + "epoch": 0.7861627367745158, + "grad_norm": 0.2565361559391022, + "learning_rate": 6.646150856870517e-06, + "loss": 0.1612, + "step": 44077 + }, + { + "epoch": 0.7861805728962294, + "grad_norm": 0.2519514858722687, + "learning_rate": 6.645094050206865e-06, + "loss": 0.114, + "step": 44078 + }, + { + "epoch": 0.7861984090179431, + "grad_norm": 0.2312752604484558, + "learning_rate": 6.644037314693174e-06, + "loss": 0.0598, + "step": 44079 + }, + { + "epoch": 0.7862162451396568, + "grad_norm": 0.3019639253616333, + "learning_rate": 6.64298065033353e-06, + "loss": 0.1264, + "step": 44080 + }, + { + "epoch": 0.7862340812613705, + "grad_norm": 0.18915559351444244, + "learning_rate": 6.64192405713204e-06, + "loss": 0.0664, + "step": 44081 + }, + { + "epoch": 0.7862519173830842, + "grad_norm": 0.5290731191635132, + "learning_rate": 6.640867535092793e-06, + "loss": 0.1023, + "step": 44082 + }, + { + "epoch": 0.7862697535047979, + "grad_norm": 0.2455829232931137, + "learning_rate": 6.6398110842198765e-06, + "loss": 0.17, + "step": 44083 + }, + { + "epoch": 0.7862875896265116, + "grad_norm": 0.2832101881504059, + "learning_rate": 6.6387547045174e-06, + "loss": 0.0911, + "step": 44084 + }, + { + "epoch": 0.7863054257482253, + "grad_norm": 0.23851561546325684, + "learning_rate": 6.637698395989453e-06, + "loss": 0.0893, + "step": 44085 + }, + { + "epoch": 0.786323261869939, + "grad_norm": 0.2527659833431244, + "learning_rate": 6.63664215864013e-06, + "loss": 0.0606, + "step": 44086 + }, + { + "epoch": 0.7863410979916527, + "grad_norm": 0.2977469861507416, + "learning_rate": 6.635585992473517e-06, + "loss": 0.1266, + "step": 44087 + }, + { + "epoch": 0.7863589341133664, + "grad_norm": 0.26551225781440735, + "learning_rate": 6.634529897493721e-06, + "loss": 0.0934, + "step": 44088 + }, + { + "epoch": 0.7863767702350801, + "grad_norm": 0.3453245460987091, + "learning_rate": 6.633473873704832e-06, + "loss": 0.1202, + "step": 44089 + }, + { + "epoch": 0.7863946063567938, + "grad_norm": 0.24251382052898407, + "learning_rate": 6.632417921110942e-06, + "loss": 0.0762, + "step": 44090 + }, + { + "epoch": 0.7864124424785075, + "grad_norm": 0.2546616494655609, + "learning_rate": 6.631362039716136e-06, + "loss": 0.0995, + "step": 44091 + }, + { + "epoch": 0.7864302786002212, + "grad_norm": 0.26139745116233826, + "learning_rate": 6.630306229524521e-06, + "loss": 0.1043, + "step": 44092 + }, + { + "epoch": 0.7864481147219349, + "grad_norm": 0.29433515667915344, + "learning_rate": 6.629250490540178e-06, + "loss": 0.0932, + "step": 44093 + }, + { + "epoch": 0.7864659508436486, + "grad_norm": 0.21292799711227417, + "learning_rate": 6.628194822767214e-06, + "loss": 0.1255, + "step": 44094 + }, + { + "epoch": 0.7864837869653623, + "grad_norm": 0.29793331027030945, + "learning_rate": 6.627139226209711e-06, + "loss": 0.1582, + "step": 44095 + }, + { + "epoch": 0.7865016230870759, + "grad_norm": 0.26597556471824646, + "learning_rate": 6.626083700871755e-06, + "loss": 0.1383, + "step": 44096 + }, + { + "epoch": 0.7865194592087896, + "grad_norm": 0.22141043841838837, + "learning_rate": 6.62502824675745e-06, + "loss": 0.1089, + "step": 44097 + }, + { + "epoch": 0.7865372953305033, + "grad_norm": 0.2063092589378357, + "learning_rate": 6.623972863870884e-06, + "loss": 0.092, + "step": 44098 + }, + { + "epoch": 0.786555131452217, + "grad_norm": 0.24598990380764008, + "learning_rate": 6.622917552216146e-06, + "loss": 0.1184, + "step": 44099 + }, + { + "epoch": 0.7865729675739307, + "grad_norm": 0.26491016149520874, + "learning_rate": 6.621862311797319e-06, + "loss": 0.1118, + "step": 44100 + }, + { + "epoch": 0.7865908036956444, + "grad_norm": 0.26054850220680237, + "learning_rate": 6.6208071426185095e-06, + "loss": 0.1312, + "step": 44101 + }, + { + "epoch": 0.7866086398173581, + "grad_norm": 0.22832149267196655, + "learning_rate": 6.619752044683799e-06, + "loss": 0.1279, + "step": 44102 + }, + { + "epoch": 0.7866264759390718, + "grad_norm": 0.2645712196826935, + "learning_rate": 6.6186970179972775e-06, + "loss": 0.162, + "step": 44103 + }, + { + "epoch": 0.7866443120607856, + "grad_norm": 0.32220274209976196, + "learning_rate": 6.617642062563034e-06, + "loss": 0.1027, + "step": 44104 + }, + { + "epoch": 0.7866621481824992, + "grad_norm": 0.2885728180408478, + "learning_rate": 6.616587178385153e-06, + "loss": 0.093, + "step": 44105 + }, + { + "epoch": 0.7866799843042129, + "grad_norm": 0.2289877086877823, + "learning_rate": 6.615532365467738e-06, + "loss": 0.0963, + "step": 44106 + }, + { + "epoch": 0.7866978204259266, + "grad_norm": 0.314732164144516, + "learning_rate": 6.614477623814861e-06, + "loss": 0.1155, + "step": 44107 + }, + { + "epoch": 0.7867156565476403, + "grad_norm": 0.29838791489601135, + "learning_rate": 6.613422953430625e-06, + "loss": 0.2084, + "step": 44108 + }, + { + "epoch": 0.786733492669354, + "grad_norm": 0.26508355140686035, + "learning_rate": 6.612368354319107e-06, + "loss": 0.1286, + "step": 44109 + }, + { + "epoch": 0.7867513287910677, + "grad_norm": 0.2413455992937088, + "learning_rate": 6.611313826484405e-06, + "loss": 0.1452, + "step": 44110 + }, + { + "epoch": 0.7867691649127814, + "grad_norm": 0.29571884870529175, + "learning_rate": 6.610259369930605e-06, + "loss": 0.1165, + "step": 44111 + }, + { + "epoch": 0.7867870010344951, + "grad_norm": 0.3609950840473175, + "learning_rate": 6.609204984661787e-06, + "loss": 0.1727, + "step": 44112 + }, + { + "epoch": 0.7868048371562087, + "grad_norm": 0.3429317772388458, + "learning_rate": 6.60815067068204e-06, + "loss": 0.0949, + "step": 44113 + }, + { + "epoch": 0.7868226732779224, + "grad_norm": 0.16995365917682648, + "learning_rate": 6.607096427995457e-06, + "loss": 0.1056, + "step": 44114 + }, + { + "epoch": 0.7868405093996361, + "grad_norm": 0.2643730938434601, + "learning_rate": 6.606042256606121e-06, + "loss": 0.1085, + "step": 44115 + }, + { + "epoch": 0.7868583455213498, + "grad_norm": 0.3374778628349304, + "learning_rate": 6.604988156518122e-06, + "loss": 0.1198, + "step": 44116 + }, + { + "epoch": 0.7868761816430635, + "grad_norm": 0.24649089574813843, + "learning_rate": 6.603934127735539e-06, + "loss": 0.1162, + "step": 44117 + }, + { + "epoch": 0.7868940177647772, + "grad_norm": 0.2864466905593872, + "learning_rate": 6.602880170262455e-06, + "loss": 0.1448, + "step": 44118 + }, + { + "epoch": 0.7869118538864909, + "grad_norm": 0.2783842980861664, + "learning_rate": 6.601826284102966e-06, + "loss": 0.0748, + "step": 44119 + }, + { + "epoch": 0.7869296900082046, + "grad_norm": 0.28984659910202026, + "learning_rate": 6.600772469261149e-06, + "loss": 0.1792, + "step": 44120 + }, + { + "epoch": 0.7869475261299184, + "grad_norm": 0.26915243268013, + "learning_rate": 6.599718725741099e-06, + "loss": 0.1742, + "step": 44121 + }, + { + "epoch": 0.786965362251632, + "grad_norm": 0.2268848568201065, + "learning_rate": 6.598665053546888e-06, + "loss": 0.1124, + "step": 44122 + }, + { + "epoch": 0.7869831983733457, + "grad_norm": 0.25053054094314575, + "learning_rate": 6.597611452682614e-06, + "loss": 0.1393, + "step": 44123 + }, + { + "epoch": 0.7870010344950594, + "grad_norm": 0.25598159432411194, + "learning_rate": 6.596557923152352e-06, + "loss": 0.0828, + "step": 44124 + }, + { + "epoch": 0.7870188706167731, + "grad_norm": 0.2450539916753769, + "learning_rate": 6.59550446496019e-06, + "loss": 0.1358, + "step": 44125 + }, + { + "epoch": 0.7870367067384868, + "grad_norm": 0.27337446808815, + "learning_rate": 6.594451078110201e-06, + "loss": 0.0911, + "step": 44126 + }, + { + "epoch": 0.7870545428602005, + "grad_norm": 0.40898844599723816, + "learning_rate": 6.5933977626064845e-06, + "loss": 0.2003, + "step": 44127 + }, + { + "epoch": 0.7870723789819142, + "grad_norm": 0.3594771921634674, + "learning_rate": 6.592344518453117e-06, + "loss": 0.0781, + "step": 44128 + }, + { + "epoch": 0.7870902151036279, + "grad_norm": 0.20938251912593842, + "learning_rate": 6.591291345654177e-06, + "loss": 0.1095, + "step": 44129 + }, + { + "epoch": 0.7871080512253416, + "grad_norm": 0.4531751871109009, + "learning_rate": 6.590238244213753e-06, + "loss": 0.1569, + "step": 44130 + }, + { + "epoch": 0.7871258873470552, + "grad_norm": 0.2849150002002716, + "learning_rate": 6.589185214135915e-06, + "loss": 0.1223, + "step": 44131 + }, + { + "epoch": 0.7871437234687689, + "grad_norm": 0.4003971815109253, + "learning_rate": 6.588132255424762e-06, + "loss": 0.0958, + "step": 44132 + }, + { + "epoch": 0.7871615595904826, + "grad_norm": 0.2347901463508606, + "learning_rate": 6.5870793680843676e-06, + "loss": 0.0801, + "step": 44133 + }, + { + "epoch": 0.7871793957121963, + "grad_norm": 0.30664703249931335, + "learning_rate": 6.586026552118804e-06, + "loss": 0.1286, + "step": 44134 + }, + { + "epoch": 0.78719723183391, + "grad_norm": 0.326532244682312, + "learning_rate": 6.584973807532163e-06, + "loss": 0.1538, + "step": 44135 + }, + { + "epoch": 0.7872150679556237, + "grad_norm": 0.3017213046550751, + "learning_rate": 6.58392113432853e-06, + "loss": 0.1564, + "step": 44136 + }, + { + "epoch": 0.7872329040773374, + "grad_norm": 0.2550777196884155, + "learning_rate": 6.582868532511982e-06, + "loss": 0.0493, + "step": 44137 + }, + { + "epoch": 0.7872507401990512, + "grad_norm": 0.25276365876197815, + "learning_rate": 6.5818160020865935e-06, + "loss": 0.1335, + "step": 44138 + }, + { + "epoch": 0.7872685763207649, + "grad_norm": 0.2225196659564972, + "learning_rate": 6.580763543056442e-06, + "loss": 0.1137, + "step": 44139 + }, + { + "epoch": 0.7872864124424785, + "grad_norm": 0.24990713596343994, + "learning_rate": 6.579711155425622e-06, + "loss": 0.0981, + "step": 44140 + }, + { + "epoch": 0.7873042485641922, + "grad_norm": 0.23552751541137695, + "learning_rate": 6.578658839198201e-06, + "loss": 0.127, + "step": 44141 + }, + { + "epoch": 0.7873220846859059, + "grad_norm": 0.27550774812698364, + "learning_rate": 6.577606594378261e-06, + "loss": 0.1125, + "step": 44142 + }, + { + "epoch": 0.7873399208076196, + "grad_norm": 0.3579949140548706, + "learning_rate": 6.5765544209698825e-06, + "loss": 0.1141, + "step": 44143 + }, + { + "epoch": 0.7873577569293333, + "grad_norm": 0.25921282172203064, + "learning_rate": 6.575502318977134e-06, + "loss": 0.0946, + "step": 44144 + }, + { + "epoch": 0.787375593051047, + "grad_norm": 0.3165900409221649, + "learning_rate": 6.5744502884041105e-06, + "loss": 0.1219, + "step": 44145 + }, + { + "epoch": 0.7873934291727607, + "grad_norm": 0.20930802822113037, + "learning_rate": 6.573398329254879e-06, + "loss": 0.0919, + "step": 44146 + }, + { + "epoch": 0.7874112652944744, + "grad_norm": 0.2014750987291336, + "learning_rate": 6.572346441533514e-06, + "loss": 0.066, + "step": 44147 + }, + { + "epoch": 0.787429101416188, + "grad_norm": 0.2759567201137543, + "learning_rate": 6.571294625244107e-06, + "loss": 0.1018, + "step": 44148 + }, + { + "epoch": 0.7874469375379017, + "grad_norm": 0.22385810315608978, + "learning_rate": 6.5702428803907176e-06, + "loss": 0.1042, + "step": 44149 + }, + { + "epoch": 0.7874647736596154, + "grad_norm": 0.20451262593269348, + "learning_rate": 6.569191206977443e-06, + "loss": 0.0971, + "step": 44150 + }, + { + "epoch": 0.7874826097813291, + "grad_norm": 0.30356380343437195, + "learning_rate": 6.568139605008347e-06, + "loss": 0.0889, + "step": 44151 + }, + { + "epoch": 0.7875004459030428, + "grad_norm": 0.21762573719024658, + "learning_rate": 6.567088074487499e-06, + "loss": 0.0701, + "step": 44152 + }, + { + "epoch": 0.7875182820247565, + "grad_norm": 0.32316359877586365, + "learning_rate": 6.566036615418994e-06, + "loss": 0.1323, + "step": 44153 + }, + { + "epoch": 0.7875361181464702, + "grad_norm": 0.20996029675006866, + "learning_rate": 6.564985227806897e-06, + "loss": 0.0836, + "step": 44154 + }, + { + "epoch": 0.787553954268184, + "grad_norm": 0.28935426473617554, + "learning_rate": 6.563933911655285e-06, + "loss": 0.089, + "step": 44155 + }, + { + "epoch": 0.7875717903898977, + "grad_norm": 0.26105326414108276, + "learning_rate": 6.562882666968234e-06, + "loss": 0.0968, + "step": 44156 + }, + { + "epoch": 0.7875896265116114, + "grad_norm": 0.2796895503997803, + "learning_rate": 6.561831493749809e-06, + "loss": 0.0987, + "step": 44157 + }, + { + "epoch": 0.787607462633325, + "grad_norm": 0.36162832379341125, + "learning_rate": 6.560780392004101e-06, + "loss": 0.0869, + "step": 44158 + }, + { + "epoch": 0.7876252987550387, + "grad_norm": 0.32262855768203735, + "learning_rate": 6.559729361735176e-06, + "loss": 0.1833, + "step": 44159 + }, + { + "epoch": 0.7876431348767524, + "grad_norm": 0.25811102986335754, + "learning_rate": 6.5586784029471105e-06, + "loss": 0.1136, + "step": 44160 + }, + { + "epoch": 0.7876609709984661, + "grad_norm": 0.2625448703765869, + "learning_rate": 6.557627515643968e-06, + "loss": 0.0869, + "step": 44161 + }, + { + "epoch": 0.7876788071201798, + "grad_norm": 0.2576397657394409, + "learning_rate": 6.5565766998298385e-06, + "loss": 0.0654, + "step": 44162 + }, + { + "epoch": 0.7876966432418935, + "grad_norm": 0.19497458636760712, + "learning_rate": 6.555525955508782e-06, + "loss": 0.1238, + "step": 44163 + }, + { + "epoch": 0.7877144793636072, + "grad_norm": 0.24587573111057281, + "learning_rate": 6.5544752826848825e-06, + "loss": 0.1382, + "step": 44164 + }, + { + "epoch": 0.7877323154853209, + "grad_norm": 0.44457289576530457, + "learning_rate": 6.553424681362202e-06, + "loss": 0.1091, + "step": 44165 + }, + { + "epoch": 0.7877501516070345, + "grad_norm": 0.25439682602882385, + "learning_rate": 6.552374151544827e-06, + "loss": 0.108, + "step": 44166 + }, + { + "epoch": 0.7877679877287482, + "grad_norm": 0.3162674307823181, + "learning_rate": 6.551323693236822e-06, + "loss": 0.1064, + "step": 44167 + }, + { + "epoch": 0.7877858238504619, + "grad_norm": 0.31190550327301025, + "learning_rate": 6.550273306442256e-06, + "loss": 0.1059, + "step": 44168 + }, + { + "epoch": 0.7878036599721756, + "grad_norm": 0.3222202658653259, + "learning_rate": 6.549222991165197e-06, + "loss": 0.1667, + "step": 44169 + }, + { + "epoch": 0.7878214960938893, + "grad_norm": 0.2259027659893036, + "learning_rate": 6.548172747409728e-06, + "loss": 0.0863, + "step": 44170 + }, + { + "epoch": 0.7878393322156031, + "grad_norm": 0.22337354719638824, + "learning_rate": 6.547122575179915e-06, + "loss": 0.1119, + "step": 44171 + }, + { + "epoch": 0.7878571683373168, + "grad_norm": 0.21057939529418945, + "learning_rate": 6.546072474479828e-06, + "loss": 0.09, + "step": 44172 + }, + { + "epoch": 0.7878750044590305, + "grad_norm": 0.1994534283876419, + "learning_rate": 6.5450224453135394e-06, + "loss": 0.0961, + "step": 44173 + }, + { + "epoch": 0.7878928405807442, + "grad_norm": 0.2664986848831177, + "learning_rate": 6.54397248768511e-06, + "loss": 0.0644, + "step": 44174 + }, + { + "epoch": 0.7879106767024578, + "grad_norm": 0.2602781355381012, + "learning_rate": 6.542922601598625e-06, + "loss": 0.1645, + "step": 44175 + }, + { + "epoch": 0.7879285128241715, + "grad_norm": 0.21840496361255646, + "learning_rate": 6.54187278705814e-06, + "loss": 0.0982, + "step": 44176 + }, + { + "epoch": 0.7879463489458852, + "grad_norm": 0.22354631125926971, + "learning_rate": 6.540823044067737e-06, + "loss": 0.0952, + "step": 44177 + }, + { + "epoch": 0.7879641850675989, + "grad_norm": 0.255677729845047, + "learning_rate": 6.5397733726314716e-06, + "loss": 0.1202, + "step": 44178 + }, + { + "epoch": 0.7879820211893126, + "grad_norm": 0.24608290195465088, + "learning_rate": 6.53872377275343e-06, + "loss": 0.0968, + "step": 44179 + }, + { + "epoch": 0.7879998573110263, + "grad_norm": 0.34032246470451355, + "learning_rate": 6.537674244437672e-06, + "loss": 0.1532, + "step": 44180 + }, + { + "epoch": 0.78801769343274, + "grad_norm": 0.2574082911014557, + "learning_rate": 6.536624787688264e-06, + "loss": 0.1178, + "step": 44181 + }, + { + "epoch": 0.7880355295544537, + "grad_norm": 0.28018561005592346, + "learning_rate": 6.535575402509269e-06, + "loss": 0.1359, + "step": 44182 + }, + { + "epoch": 0.7880533656761673, + "grad_norm": 0.24346445500850677, + "learning_rate": 6.534526088904769e-06, + "loss": 0.0768, + "step": 44183 + }, + { + "epoch": 0.788071201797881, + "grad_norm": 0.24253524839878082, + "learning_rate": 6.533476846878822e-06, + "loss": 0.1442, + "step": 44184 + }, + { + "epoch": 0.7880890379195947, + "grad_norm": 0.2577386498451233, + "learning_rate": 6.5324276764354956e-06, + "loss": 0.1385, + "step": 44185 + }, + { + "epoch": 0.7881068740413084, + "grad_norm": 0.29873839020729065, + "learning_rate": 6.531378577578862e-06, + "loss": 0.0844, + "step": 44186 + }, + { + "epoch": 0.7881247101630221, + "grad_norm": 0.2075805813074112, + "learning_rate": 6.530329550312972e-06, + "loss": 0.106, + "step": 44187 + }, + { + "epoch": 0.7881425462847359, + "grad_norm": 0.29521387815475464, + "learning_rate": 6.529280594641915e-06, + "loss": 0.0875, + "step": 44188 + }, + { + "epoch": 0.7881603824064496, + "grad_norm": 0.2747701108455658, + "learning_rate": 6.528231710569744e-06, + "loss": 0.1558, + "step": 44189 + }, + { + "epoch": 0.7881782185281633, + "grad_norm": 0.19759778678417206, + "learning_rate": 6.52718289810052e-06, + "loss": 0.1004, + "step": 44190 + }, + { + "epoch": 0.788196054649877, + "grad_norm": 0.3707346022129059, + "learning_rate": 6.526134157238315e-06, + "loss": 0.1194, + "step": 44191 + }, + { + "epoch": 0.7882138907715907, + "grad_norm": 0.31901875138282776, + "learning_rate": 6.525085487987203e-06, + "loss": 0.0578, + "step": 44192 + }, + { + "epoch": 0.7882317268933043, + "grad_norm": 0.285643994808197, + "learning_rate": 6.52403689035124e-06, + "loss": 0.1198, + "step": 44193 + }, + { + "epoch": 0.788249563015018, + "grad_norm": 0.34529829025268555, + "learning_rate": 6.52298836433449e-06, + "loss": 0.1275, + "step": 44194 + }, + { + "epoch": 0.7882673991367317, + "grad_norm": 0.23882043361663818, + "learning_rate": 6.521939909941013e-06, + "loss": 0.1015, + "step": 44195 + }, + { + "epoch": 0.7882852352584454, + "grad_norm": 0.28525108098983765, + "learning_rate": 6.5208915271748886e-06, + "loss": 0.1171, + "step": 44196 + }, + { + "epoch": 0.7883030713801591, + "grad_norm": 0.28349438309669495, + "learning_rate": 6.519843216040167e-06, + "loss": 0.1545, + "step": 44197 + }, + { + "epoch": 0.7883209075018728, + "grad_norm": 0.22000150382518768, + "learning_rate": 6.5187949765409175e-06, + "loss": 0.1256, + "step": 44198 + }, + { + "epoch": 0.7883387436235865, + "grad_norm": 0.20423874258995056, + "learning_rate": 6.517746808681202e-06, + "loss": 0.0935, + "step": 44199 + }, + { + "epoch": 0.7883565797453002, + "grad_norm": 0.35680824518203735, + "learning_rate": 6.516698712465077e-06, + "loss": 0.1289, + "step": 44200 + }, + { + "epoch": 0.7883744158670138, + "grad_norm": 0.343405544757843, + "learning_rate": 6.515650687896619e-06, + "loss": 0.1619, + "step": 44201 + }, + { + "epoch": 0.7883922519887275, + "grad_norm": 0.30054977536201477, + "learning_rate": 6.514602734979883e-06, + "loss": 0.1145, + "step": 44202 + }, + { + "epoch": 0.7884100881104412, + "grad_norm": 0.27992501854896545, + "learning_rate": 6.513554853718923e-06, + "loss": 0.0883, + "step": 44203 + }, + { + "epoch": 0.7884279242321549, + "grad_norm": 0.2517690658569336, + "learning_rate": 6.512507044117816e-06, + "loss": 0.1296, + "step": 44204 + }, + { + "epoch": 0.7884457603538687, + "grad_norm": 0.391408234834671, + "learning_rate": 6.511459306180612e-06, + "loss": 0.1029, + "step": 44205 + }, + { + "epoch": 0.7884635964755824, + "grad_norm": 0.3042883276939392, + "learning_rate": 6.5104116399113845e-06, + "loss": 0.0887, + "step": 44206 + }, + { + "epoch": 0.7884814325972961, + "grad_norm": 0.24673470854759216, + "learning_rate": 6.509364045314187e-06, + "loss": 0.0883, + "step": 44207 + }, + { + "epoch": 0.7884992687190098, + "grad_norm": 0.26099276542663574, + "learning_rate": 6.508316522393074e-06, + "loss": 0.1171, + "step": 44208 + }, + { + "epoch": 0.7885171048407235, + "grad_norm": 0.26348787546157837, + "learning_rate": 6.507269071152119e-06, + "loss": 0.0777, + "step": 44209 + }, + { + "epoch": 0.7885349409624371, + "grad_norm": 0.3050234615802765, + "learning_rate": 6.5062216915953766e-06, + "loss": 0.091, + "step": 44210 + }, + { + "epoch": 0.7885527770841508, + "grad_norm": 0.32991647720336914, + "learning_rate": 6.505174383726908e-06, + "loss": 0.1302, + "step": 44211 + }, + { + "epoch": 0.7885706132058645, + "grad_norm": 0.3159943222999573, + "learning_rate": 6.50412714755077e-06, + "loss": 0.1138, + "step": 44212 + }, + { + "epoch": 0.7885884493275782, + "grad_norm": 0.31988975405693054, + "learning_rate": 6.503079983071017e-06, + "loss": 0.152, + "step": 44213 + }, + { + "epoch": 0.7886062854492919, + "grad_norm": 0.21109743416309357, + "learning_rate": 6.50203289029172e-06, + "loss": 0.109, + "step": 44214 + }, + { + "epoch": 0.7886241215710056, + "grad_norm": 0.3054670989513397, + "learning_rate": 6.500985869216936e-06, + "loss": 0.0839, + "step": 44215 + }, + { + "epoch": 0.7886419576927193, + "grad_norm": 0.21854069828987122, + "learning_rate": 6.499938919850718e-06, + "loss": 0.0688, + "step": 44216 + }, + { + "epoch": 0.788659793814433, + "grad_norm": 0.3289863169193268, + "learning_rate": 6.498892042197119e-06, + "loss": 0.1317, + "step": 44217 + }, + { + "epoch": 0.7886776299361467, + "grad_norm": 0.30067890882492065, + "learning_rate": 6.497845236260205e-06, + "loss": 0.1565, + "step": 44218 + }, + { + "epoch": 0.7886954660578603, + "grad_norm": 0.2696108818054199, + "learning_rate": 6.496798502044041e-06, + "loss": 0.145, + "step": 44219 + }, + { + "epoch": 0.788713302179574, + "grad_norm": 0.25394347310066223, + "learning_rate": 6.495751839552675e-06, + "loss": 0.0787, + "step": 44220 + }, + { + "epoch": 0.7887311383012877, + "grad_norm": 0.2914705276489258, + "learning_rate": 6.494705248790162e-06, + "loss": 0.1398, + "step": 44221 + }, + { + "epoch": 0.7887489744230015, + "grad_norm": 0.22038881480693817, + "learning_rate": 6.493658729760565e-06, + "loss": 0.0369, + "step": 44222 + }, + { + "epoch": 0.7887668105447152, + "grad_norm": 0.19869518280029297, + "learning_rate": 6.492612282467944e-06, + "loss": 0.0431, + "step": 44223 + }, + { + "epoch": 0.7887846466664289, + "grad_norm": 0.39566680788993835, + "learning_rate": 6.491565906916347e-06, + "loss": 0.0935, + "step": 44224 + }, + { + "epoch": 0.7888024827881426, + "grad_norm": 0.2697812616825104, + "learning_rate": 6.490519603109835e-06, + "loss": 0.0966, + "step": 44225 + }, + { + "epoch": 0.7888203189098563, + "grad_norm": 0.29709675908088684, + "learning_rate": 6.489473371052454e-06, + "loss": 0.0789, + "step": 44226 + }, + { + "epoch": 0.78883815503157, + "grad_norm": 0.2259664386510849, + "learning_rate": 6.488427210748274e-06, + "loss": 0.1021, + "step": 44227 + }, + { + "epoch": 0.7888559911532836, + "grad_norm": 0.22971664369106293, + "learning_rate": 6.487381122201344e-06, + "loss": 0.1566, + "step": 44228 + }, + { + "epoch": 0.7888738272749973, + "grad_norm": 0.25513756275177, + "learning_rate": 6.486335105415719e-06, + "loss": 0.0943, + "step": 44229 + }, + { + "epoch": 0.788891663396711, + "grad_norm": 0.3283088207244873, + "learning_rate": 6.485289160395447e-06, + "loss": 0.1692, + "step": 44230 + }, + { + "epoch": 0.7889094995184247, + "grad_norm": 0.2767787575721741, + "learning_rate": 6.484243287144595e-06, + "loss": 0.096, + "step": 44231 + }, + { + "epoch": 0.7889273356401384, + "grad_norm": 0.21235200762748718, + "learning_rate": 6.4831974856672054e-06, + "loss": 0.0677, + "step": 44232 + }, + { + "epoch": 0.7889451717618521, + "grad_norm": 0.30044808983802795, + "learning_rate": 6.482151755967345e-06, + "loss": 0.1193, + "step": 44233 + }, + { + "epoch": 0.7889630078835658, + "grad_norm": 0.2610991597175598, + "learning_rate": 6.481106098049053e-06, + "loss": 0.1082, + "step": 44234 + }, + { + "epoch": 0.7889808440052795, + "grad_norm": 0.2509284019470215, + "learning_rate": 6.480060511916394e-06, + "loss": 0.1052, + "step": 44235 + }, + { + "epoch": 0.7889986801269931, + "grad_norm": 0.2177760750055313, + "learning_rate": 6.479014997573421e-06, + "loss": 0.0933, + "step": 44236 + }, + { + "epoch": 0.7890165162487068, + "grad_norm": 0.24624665081501007, + "learning_rate": 6.477969555024182e-06, + "loss": 0.0783, + "step": 44237 + }, + { + "epoch": 0.7890343523704205, + "grad_norm": 0.2457483559846878, + "learning_rate": 6.476924184272729e-06, + "loss": 0.1013, + "step": 44238 + }, + { + "epoch": 0.7890521884921343, + "grad_norm": 0.28774693608283997, + "learning_rate": 6.475878885323108e-06, + "loss": 0.1572, + "step": 44239 + }, + { + "epoch": 0.789070024613848, + "grad_norm": 0.32484716176986694, + "learning_rate": 6.474833658179388e-06, + "loss": 0.0958, + "step": 44240 + }, + { + "epoch": 0.7890878607355617, + "grad_norm": 0.2363191843032837, + "learning_rate": 6.47378850284561e-06, + "loss": 0.091, + "step": 44241 + }, + { + "epoch": 0.7891056968572754, + "grad_norm": 0.31062182784080505, + "learning_rate": 6.472743419325827e-06, + "loss": 0.1044, + "step": 44242 + }, + { + "epoch": 0.7891235329789891, + "grad_norm": 0.3855638802051544, + "learning_rate": 6.471698407624083e-06, + "loss": 0.1304, + "step": 44243 + }, + { + "epoch": 0.7891413691007028, + "grad_norm": 0.2847430109977722, + "learning_rate": 6.470653467744439e-06, + "loss": 0.0995, + "step": 44244 + }, + { + "epoch": 0.7891592052224164, + "grad_norm": 0.1684645116329193, + "learning_rate": 6.469608599690938e-06, + "loss": 0.0404, + "step": 44245 + }, + { + "epoch": 0.7891770413441301, + "grad_norm": 0.3510969579219818, + "learning_rate": 6.4685638034676396e-06, + "loss": 0.0967, + "step": 44246 + }, + { + "epoch": 0.7891948774658438, + "grad_norm": 0.2813323140144348, + "learning_rate": 6.467519079078582e-06, + "loss": 0.1072, + "step": 44247 + }, + { + "epoch": 0.7892127135875575, + "grad_norm": 0.29802244901657104, + "learning_rate": 6.4664744265278295e-06, + "loss": 0.0921, + "step": 44248 + }, + { + "epoch": 0.7892305497092712, + "grad_norm": 0.23579160869121552, + "learning_rate": 6.465429845819424e-06, + "loss": 0.0725, + "step": 44249 + }, + { + "epoch": 0.7892483858309849, + "grad_norm": 0.3534276783466339, + "learning_rate": 6.464385336957413e-06, + "loss": 0.1132, + "step": 44250 + }, + { + "epoch": 0.7892662219526986, + "grad_norm": 0.2933705151081085, + "learning_rate": 6.463340899945847e-06, + "loss": 0.1093, + "step": 44251 + }, + { + "epoch": 0.7892840580744123, + "grad_norm": 0.34300175309181213, + "learning_rate": 6.462296534788764e-06, + "loss": 0.122, + "step": 44252 + }, + { + "epoch": 0.789301894196126, + "grad_norm": 0.19918321073055267, + "learning_rate": 6.4612522414902315e-06, + "loss": 0.0873, + "step": 44253 + }, + { + "epoch": 0.7893197303178396, + "grad_norm": 0.3685144782066345, + "learning_rate": 6.46020802005429e-06, + "loss": 0.1464, + "step": 44254 + }, + { + "epoch": 0.7893375664395533, + "grad_norm": 0.2875499725341797, + "learning_rate": 6.459163870484983e-06, + "loss": 0.0845, + "step": 44255 + }, + { + "epoch": 0.7893554025612671, + "grad_norm": 0.24322588741779327, + "learning_rate": 6.458119792786355e-06, + "loss": 0.1259, + "step": 44256 + }, + { + "epoch": 0.7893732386829808, + "grad_norm": 0.364108145236969, + "learning_rate": 6.457075786962466e-06, + "loss": 0.1165, + "step": 44257 + }, + { + "epoch": 0.7893910748046945, + "grad_norm": 0.22436676919460297, + "learning_rate": 6.456031853017356e-06, + "loss": 0.0877, + "step": 44258 + }, + { + "epoch": 0.7894089109264082, + "grad_norm": 0.3096891939640045, + "learning_rate": 6.454987990955063e-06, + "loss": 0.12, + "step": 44259 + }, + { + "epoch": 0.7894267470481219, + "grad_norm": 0.23672813177108765, + "learning_rate": 6.453944200779649e-06, + "loss": 0.0963, + "step": 44260 + }, + { + "epoch": 0.7894445831698356, + "grad_norm": 0.24908851087093353, + "learning_rate": 6.452900482495147e-06, + "loss": 0.1125, + "step": 44261 + }, + { + "epoch": 0.7894624192915493, + "grad_norm": 0.2525533437728882, + "learning_rate": 6.451856836105616e-06, + "loss": 0.1196, + "step": 44262 + }, + { + "epoch": 0.7894802554132629, + "grad_norm": 0.22468255460262299, + "learning_rate": 6.450813261615093e-06, + "loss": 0.0908, + "step": 44263 + }, + { + "epoch": 0.7894980915349766, + "grad_norm": 0.23806005716323853, + "learning_rate": 6.449769759027624e-06, + "loss": 0.107, + "step": 44264 + }, + { + "epoch": 0.7895159276566903, + "grad_norm": 0.29737845063209534, + "learning_rate": 6.448726328347249e-06, + "loss": 0.1501, + "step": 44265 + }, + { + "epoch": 0.789533763778404, + "grad_norm": 0.2882936894893646, + "learning_rate": 6.447682969578026e-06, + "loss": 0.0901, + "step": 44266 + }, + { + "epoch": 0.7895515999001177, + "grad_norm": 0.2789209187030792, + "learning_rate": 6.44663968272399e-06, + "loss": 0.0918, + "step": 44267 + }, + { + "epoch": 0.7895694360218314, + "grad_norm": 0.21692532300949097, + "learning_rate": 6.445596467789186e-06, + "loss": 0.092, + "step": 44268 + }, + { + "epoch": 0.7895872721435451, + "grad_norm": 0.23506997525691986, + "learning_rate": 6.444553324777652e-06, + "loss": 0.0972, + "step": 44269 + }, + { + "epoch": 0.7896051082652588, + "grad_norm": 0.28100505471229553, + "learning_rate": 6.4435102536934464e-06, + "loss": 0.1062, + "step": 44270 + }, + { + "epoch": 0.7896229443869724, + "grad_norm": 0.24430091679096222, + "learning_rate": 6.4424672545406016e-06, + "loss": 0.1265, + "step": 44271 + }, + { + "epoch": 0.7896407805086862, + "grad_norm": 0.24555829167366028, + "learning_rate": 6.441424327323159e-06, + "loss": 0.1244, + "step": 44272 + }, + { + "epoch": 0.7896586166303999, + "grad_norm": 0.1920609325170517, + "learning_rate": 6.4403814720451705e-06, + "loss": 0.1347, + "step": 44273 + }, + { + "epoch": 0.7896764527521136, + "grad_norm": 0.25160709023475647, + "learning_rate": 6.439338688710669e-06, + "loss": 0.0834, + "step": 44274 + }, + { + "epoch": 0.7896942888738273, + "grad_norm": 0.2756267488002777, + "learning_rate": 6.438295977323708e-06, + "loss": 0.1066, + "step": 44275 + }, + { + "epoch": 0.789712124995541, + "grad_norm": 0.3179618716239929, + "learning_rate": 6.437253337888321e-06, + "loss": 0.1626, + "step": 44276 + }, + { + "epoch": 0.7897299611172547, + "grad_norm": 0.2695470154285431, + "learning_rate": 6.436210770408551e-06, + "loss": 0.113, + "step": 44277 + }, + { + "epoch": 0.7897477972389684, + "grad_norm": 0.27090758085250854, + "learning_rate": 6.4351682748884365e-06, + "loss": 0.0555, + "step": 44278 + }, + { + "epoch": 0.7897656333606821, + "grad_norm": 0.2736127972602844, + "learning_rate": 6.4341258513320266e-06, + "loss": 0.1131, + "step": 44279 + }, + { + "epoch": 0.7897834694823958, + "grad_norm": 0.2599885165691376, + "learning_rate": 6.433083499743356e-06, + "loss": 0.0776, + "step": 44280 + }, + { + "epoch": 0.7898013056041094, + "grad_norm": 0.329611599445343, + "learning_rate": 6.432041220126469e-06, + "loss": 0.1161, + "step": 44281 + }, + { + "epoch": 0.7898191417258231, + "grad_norm": 0.34924349188804626, + "learning_rate": 6.430999012485395e-06, + "loss": 0.1431, + "step": 44282 + }, + { + "epoch": 0.7898369778475368, + "grad_norm": 0.21205946803092957, + "learning_rate": 6.4299568768241894e-06, + "loss": 0.0844, + "step": 44283 + }, + { + "epoch": 0.7898548139692505, + "grad_norm": 0.3495330810546875, + "learning_rate": 6.4289148131468855e-06, + "loss": 0.1286, + "step": 44284 + }, + { + "epoch": 0.7898726500909642, + "grad_norm": 0.25523877143859863, + "learning_rate": 6.427872821457523e-06, + "loss": 0.1101, + "step": 44285 + }, + { + "epoch": 0.7898904862126779, + "grad_norm": 0.22484418749809265, + "learning_rate": 6.426830901760131e-06, + "loss": 0.1269, + "step": 44286 + }, + { + "epoch": 0.7899083223343916, + "grad_norm": 0.21227017045021057, + "learning_rate": 6.4257890540587645e-06, + "loss": 0.1236, + "step": 44287 + }, + { + "epoch": 0.7899261584561053, + "grad_norm": 0.22469249367713928, + "learning_rate": 6.424747278357449e-06, + "loss": 0.1086, + "step": 44288 + }, + { + "epoch": 0.789943994577819, + "grad_norm": 0.23707841336727142, + "learning_rate": 6.423705574660235e-06, + "loss": 0.0741, + "step": 44289 + }, + { + "epoch": 0.7899618306995327, + "grad_norm": 0.258785605430603, + "learning_rate": 6.422663942971155e-06, + "loss": 0.0958, + "step": 44290 + }, + { + "epoch": 0.7899796668212464, + "grad_norm": 0.26216599345207214, + "learning_rate": 6.421622383294237e-06, + "loss": 0.1234, + "step": 44291 + }, + { + "epoch": 0.7899975029429601, + "grad_norm": 0.2765684723854065, + "learning_rate": 6.420580895633538e-06, + "loss": 0.1436, + "step": 44292 + }, + { + "epoch": 0.7900153390646738, + "grad_norm": 0.2912364602088928, + "learning_rate": 6.419539479993081e-06, + "loss": 0.1785, + "step": 44293 + }, + { + "epoch": 0.7900331751863875, + "grad_norm": 0.2888382077217102, + "learning_rate": 6.418498136376907e-06, + "loss": 0.1175, + "step": 44294 + }, + { + "epoch": 0.7900510113081012, + "grad_norm": 0.24683113396167755, + "learning_rate": 6.417456864789048e-06, + "loss": 0.118, + "step": 44295 + }, + { + "epoch": 0.7900688474298149, + "grad_norm": 0.28047481179237366, + "learning_rate": 6.4164156652335495e-06, + "loss": 0.1135, + "step": 44296 + }, + { + "epoch": 0.7900866835515286, + "grad_norm": 0.23094502091407776, + "learning_rate": 6.415374537714442e-06, + "loss": 0.0918, + "step": 44297 + }, + { + "epoch": 0.7901045196732422, + "grad_norm": 0.25547873973846436, + "learning_rate": 6.414333482235763e-06, + "loss": 0.144, + "step": 44298 + }, + { + "epoch": 0.7901223557949559, + "grad_norm": 0.2930883765220642, + "learning_rate": 6.413292498801538e-06, + "loss": 0.0691, + "step": 44299 + }, + { + "epoch": 0.7901401919166696, + "grad_norm": 0.3080432116985321, + "learning_rate": 6.412251587415818e-06, + "loss": 0.1842, + "step": 44300 + }, + { + "epoch": 0.7901580280383833, + "grad_norm": 0.3612186014652252, + "learning_rate": 6.411210748082625e-06, + "loss": 0.1709, + "step": 44301 + }, + { + "epoch": 0.790175864160097, + "grad_norm": 0.22774101793766022, + "learning_rate": 6.410169980806005e-06, + "loss": 0.1101, + "step": 44302 + }, + { + "epoch": 0.7901937002818107, + "grad_norm": 0.2938668727874756, + "learning_rate": 6.409129285589988e-06, + "loss": 0.1111, + "step": 44303 + }, + { + "epoch": 0.7902115364035244, + "grad_norm": 0.34203335642814636, + "learning_rate": 6.408088662438599e-06, + "loss": 0.1319, + "step": 44304 + }, + { + "epoch": 0.7902293725252381, + "grad_norm": 0.26302456855773926, + "learning_rate": 6.40704811135589e-06, + "loss": 0.091, + "step": 44305 + }, + { + "epoch": 0.7902472086469519, + "grad_norm": 0.2113720327615738, + "learning_rate": 6.4060076323458816e-06, + "loss": 0.0854, + "step": 44306 + }, + { + "epoch": 0.7902650447686655, + "grad_norm": 0.25970640778541565, + "learning_rate": 6.404967225412609e-06, + "loss": 0.0988, + "step": 44307 + }, + { + "epoch": 0.7902828808903792, + "grad_norm": 0.21835792064666748, + "learning_rate": 6.403926890560099e-06, + "loss": 0.1336, + "step": 44308 + }, + { + "epoch": 0.7903007170120929, + "grad_norm": 0.22718465328216553, + "learning_rate": 6.402886627792401e-06, + "loss": 0.1361, + "step": 44309 + }, + { + "epoch": 0.7903185531338066, + "grad_norm": 0.23819546401500702, + "learning_rate": 6.401846437113537e-06, + "loss": 0.1397, + "step": 44310 + }, + { + "epoch": 0.7903363892555203, + "grad_norm": 0.26632481813430786, + "learning_rate": 6.400806318527539e-06, + "loss": 0.0824, + "step": 44311 + }, + { + "epoch": 0.790354225377234, + "grad_norm": 0.2611066997051239, + "learning_rate": 6.399766272038432e-06, + "loss": 0.1306, + "step": 44312 + }, + { + "epoch": 0.7903720614989477, + "grad_norm": 0.3123415410518646, + "learning_rate": 6.398726297650262e-06, + "loss": 0.1418, + "step": 44313 + }, + { + "epoch": 0.7903898976206614, + "grad_norm": 0.2951338291168213, + "learning_rate": 6.3976863953670554e-06, + "loss": 0.0809, + "step": 44314 + }, + { + "epoch": 0.790407733742375, + "grad_norm": 0.2597586214542389, + "learning_rate": 6.396646565192832e-06, + "loss": 0.1155, + "step": 44315 + }, + { + "epoch": 0.7904255698640887, + "grad_norm": 0.27297112345695496, + "learning_rate": 6.395606807131641e-06, + "loss": 0.0877, + "step": 44316 + }, + { + "epoch": 0.7904434059858024, + "grad_norm": 0.2519475519657135, + "learning_rate": 6.394567121187494e-06, + "loss": 0.1167, + "step": 44317 + }, + { + "epoch": 0.7904612421075161, + "grad_norm": 0.7212458848953247, + "learning_rate": 6.393527507364442e-06, + "loss": 0.1943, + "step": 44318 + }, + { + "epoch": 0.7904790782292298, + "grad_norm": 0.23843009769916534, + "learning_rate": 6.392487965666502e-06, + "loss": 0.0988, + "step": 44319 + }, + { + "epoch": 0.7904969143509435, + "grad_norm": 0.6382744312286377, + "learning_rate": 6.3914484960977045e-06, + "loss": 0.1134, + "step": 44320 + }, + { + "epoch": 0.7905147504726572, + "grad_norm": 0.2201121598482132, + "learning_rate": 6.390409098662073e-06, + "loss": 0.1179, + "step": 44321 + }, + { + "epoch": 0.7905325865943709, + "grad_norm": 0.2574596703052521, + "learning_rate": 6.389369773363651e-06, + "loss": 0.1397, + "step": 44322 + }, + { + "epoch": 0.7905504227160847, + "grad_norm": 0.2841186225414276, + "learning_rate": 6.388330520206459e-06, + "loss": 0.1324, + "step": 44323 + }, + { + "epoch": 0.7905682588377984, + "grad_norm": 0.3004613518714905, + "learning_rate": 6.387291339194529e-06, + "loss": 0.1052, + "step": 44324 + }, + { + "epoch": 0.790586094959512, + "grad_norm": 0.23583769798278809, + "learning_rate": 6.386252230331877e-06, + "loss": 0.0936, + "step": 44325 + }, + { + "epoch": 0.7906039310812257, + "grad_norm": 0.20927172899246216, + "learning_rate": 6.3852131936225465e-06, + "loss": 0.0771, + "step": 44326 + }, + { + "epoch": 0.7906217672029394, + "grad_norm": 0.2318449765443802, + "learning_rate": 6.384174229070561e-06, + "loss": 0.0976, + "step": 44327 + }, + { + "epoch": 0.7906396033246531, + "grad_norm": 0.2622761130332947, + "learning_rate": 6.383135336679935e-06, + "loss": 0.1164, + "step": 44328 + }, + { + "epoch": 0.7906574394463668, + "grad_norm": 0.24776549637317657, + "learning_rate": 6.382096516454716e-06, + "loss": 0.1304, + "step": 44329 + }, + { + "epoch": 0.7906752755680805, + "grad_norm": 0.26426374912261963, + "learning_rate": 6.381057768398915e-06, + "loss": 0.1223, + "step": 44330 + }, + { + "epoch": 0.7906931116897942, + "grad_norm": 0.29112571477890015, + "learning_rate": 6.380019092516573e-06, + "loss": 0.1097, + "step": 44331 + }, + { + "epoch": 0.7907109478115079, + "grad_norm": 0.2609473466873169, + "learning_rate": 6.378980488811706e-06, + "loss": 0.1146, + "step": 44332 + }, + { + "epoch": 0.7907287839332215, + "grad_norm": 0.19225920736789703, + "learning_rate": 6.377941957288344e-06, + "loss": 0.099, + "step": 44333 + }, + { + "epoch": 0.7907466200549352, + "grad_norm": 0.23920294642448425, + "learning_rate": 6.376903497950501e-06, + "loss": 0.102, + "step": 44334 + }, + { + "epoch": 0.7907644561766489, + "grad_norm": 0.36490070819854736, + "learning_rate": 6.375865110802221e-06, + "loss": 0.1569, + "step": 44335 + }, + { + "epoch": 0.7907822922983626, + "grad_norm": 0.26597926020622253, + "learning_rate": 6.374826795847522e-06, + "loss": 0.128, + "step": 44336 + }, + { + "epoch": 0.7908001284200763, + "grad_norm": 0.24884183704853058, + "learning_rate": 6.3737885530904235e-06, + "loss": 0.1356, + "step": 44337 + }, + { + "epoch": 0.79081796454179, + "grad_norm": 0.2525271773338318, + "learning_rate": 6.3727503825349504e-06, + "loss": 0.1488, + "step": 44338 + }, + { + "epoch": 0.7908358006635037, + "grad_norm": 0.2771569490432739, + "learning_rate": 6.3717122841851356e-06, + "loss": 0.1417, + "step": 44339 + }, + { + "epoch": 0.7908536367852175, + "grad_norm": 0.310963898897171, + "learning_rate": 6.370674258044998e-06, + "loss": 0.1113, + "step": 44340 + }, + { + "epoch": 0.7908714729069312, + "grad_norm": 0.23170971870422363, + "learning_rate": 6.369636304118562e-06, + "loss": 0.131, + "step": 44341 + }, + { + "epoch": 0.7908893090286448, + "grad_norm": 0.28286445140838623, + "learning_rate": 6.3685984224098436e-06, + "loss": 0.1089, + "step": 44342 + }, + { + "epoch": 0.7909071451503585, + "grad_norm": 0.22349539399147034, + "learning_rate": 6.367560612922879e-06, + "loss": 0.152, + "step": 44343 + }, + { + "epoch": 0.7909249812720722, + "grad_norm": 0.3122520446777344, + "learning_rate": 6.366522875661676e-06, + "loss": 0.1842, + "step": 44344 + }, + { + "epoch": 0.7909428173937859, + "grad_norm": 0.2264707386493683, + "learning_rate": 6.365485210630276e-06, + "loss": 0.1202, + "step": 44345 + }, + { + "epoch": 0.7909606535154996, + "grad_norm": 0.20424042642116547, + "learning_rate": 6.364447617832692e-06, + "loss": 0.1114, + "step": 44346 + }, + { + "epoch": 0.7909784896372133, + "grad_norm": 0.28276780247688293, + "learning_rate": 6.363410097272937e-06, + "loss": 0.1058, + "step": 44347 + }, + { + "epoch": 0.790996325758927, + "grad_norm": 0.27995067834854126, + "learning_rate": 6.362372648955048e-06, + "loss": 0.0951, + "step": 44348 + }, + { + "epoch": 0.7910141618806407, + "grad_norm": 0.23828350007534027, + "learning_rate": 6.36133527288304e-06, + "loss": 0.0799, + "step": 44349 + }, + { + "epoch": 0.7910319980023544, + "grad_norm": 0.23907338082790375, + "learning_rate": 6.360297969060935e-06, + "loss": 0.0971, + "step": 44350 + }, + { + "epoch": 0.791049834124068, + "grad_norm": 0.33674052357673645, + "learning_rate": 6.359260737492745e-06, + "loss": 0.1667, + "step": 44351 + }, + { + "epoch": 0.7910676702457817, + "grad_norm": 0.2130574882030487, + "learning_rate": 6.358223578182507e-06, + "loss": 0.0965, + "step": 44352 + }, + { + "epoch": 0.7910855063674954, + "grad_norm": 0.3549567461013794, + "learning_rate": 6.357186491134232e-06, + "loss": 0.1677, + "step": 44353 + }, + { + "epoch": 0.7911033424892091, + "grad_norm": 0.23653914034366608, + "learning_rate": 6.356149476351942e-06, + "loss": 0.0855, + "step": 44354 + }, + { + "epoch": 0.7911211786109228, + "grad_norm": 0.2747228145599365, + "learning_rate": 6.355112533839649e-06, + "loss": 0.1328, + "step": 44355 + }, + { + "epoch": 0.7911390147326365, + "grad_norm": 0.23279783129692078, + "learning_rate": 6.354075663601386e-06, + "loss": 0.0876, + "step": 44356 + }, + { + "epoch": 0.7911568508543503, + "grad_norm": 0.28982770442962646, + "learning_rate": 6.353038865641159e-06, + "loss": 0.1482, + "step": 44357 + }, + { + "epoch": 0.791174686976064, + "grad_norm": 0.3094983696937561, + "learning_rate": 6.352002139963001e-06, + "loss": 0.0791, + "step": 44358 + }, + { + "epoch": 0.7911925230977777, + "grad_norm": 0.2396293431520462, + "learning_rate": 6.3509654865709224e-06, + "loss": 0.0938, + "step": 44359 + }, + { + "epoch": 0.7912103592194913, + "grad_norm": 0.22900012135505676, + "learning_rate": 6.349928905468935e-06, + "loss": 0.1143, + "step": 44360 + }, + { + "epoch": 0.791228195341205, + "grad_norm": 0.27662521600723267, + "learning_rate": 6.348892396661074e-06, + "loss": 0.1062, + "step": 44361 + }, + { + "epoch": 0.7912460314629187, + "grad_norm": 0.30716392397880554, + "learning_rate": 6.347855960151347e-06, + "loss": 0.1341, + "step": 44362 + }, + { + "epoch": 0.7912638675846324, + "grad_norm": 0.2991967797279358, + "learning_rate": 6.346819595943773e-06, + "loss": 0.1172, + "step": 44363 + }, + { + "epoch": 0.7912817037063461, + "grad_norm": 0.21833184361457825, + "learning_rate": 6.345783304042363e-06, + "loss": 0.1, + "step": 44364 + }, + { + "epoch": 0.7912995398280598, + "grad_norm": 0.3661629259586334, + "learning_rate": 6.344747084451144e-06, + "loss": 0.1691, + "step": 44365 + }, + { + "epoch": 0.7913173759497735, + "grad_norm": 0.2607227563858032, + "learning_rate": 6.343710937174132e-06, + "loss": 0.1024, + "step": 44366 + }, + { + "epoch": 0.7913352120714872, + "grad_norm": 0.31266945600509644, + "learning_rate": 6.342674862215337e-06, + "loss": 0.0569, + "step": 44367 + }, + { + "epoch": 0.7913530481932008, + "grad_norm": 0.3785927891731262, + "learning_rate": 6.34163885957878e-06, + "loss": 0.0808, + "step": 44368 + }, + { + "epoch": 0.7913708843149145, + "grad_norm": 0.23941653966903687, + "learning_rate": 6.340602929268469e-06, + "loss": 0.17, + "step": 44369 + }, + { + "epoch": 0.7913887204366282, + "grad_norm": 0.332263320684433, + "learning_rate": 6.33956707128843e-06, + "loss": 0.1079, + "step": 44370 + }, + { + "epoch": 0.7914065565583419, + "grad_norm": 0.3225208818912506, + "learning_rate": 6.338531285642668e-06, + "loss": 0.1639, + "step": 44371 + }, + { + "epoch": 0.7914243926800556, + "grad_norm": 0.27201715111732483, + "learning_rate": 6.337495572335211e-06, + "loss": 0.1179, + "step": 44372 + }, + { + "epoch": 0.7914422288017694, + "grad_norm": 0.2180429846048355, + "learning_rate": 6.336459931370062e-06, + "loss": 0.1165, + "step": 44373 + }, + { + "epoch": 0.7914600649234831, + "grad_norm": 0.21508382260799408, + "learning_rate": 6.335424362751247e-06, + "loss": 0.0668, + "step": 44374 + }, + { + "epoch": 0.7914779010451968, + "grad_norm": 0.3044489324092865, + "learning_rate": 6.3343888664827714e-06, + "loss": 0.1478, + "step": 44375 + }, + { + "epoch": 0.7914957371669105, + "grad_norm": 0.25947749614715576, + "learning_rate": 6.333353442568654e-06, + "loss": 0.1202, + "step": 44376 + }, + { + "epoch": 0.7915135732886242, + "grad_norm": 0.31481418013572693, + "learning_rate": 6.332318091012898e-06, + "loss": 0.1986, + "step": 44377 + }, + { + "epoch": 0.7915314094103378, + "grad_norm": 0.22756798565387726, + "learning_rate": 6.331282811819531e-06, + "loss": 0.1422, + "step": 44378 + }, + { + "epoch": 0.7915492455320515, + "grad_norm": 0.3005802631378174, + "learning_rate": 6.330247604992562e-06, + "loss": 0.0889, + "step": 44379 + }, + { + "epoch": 0.7915670816537652, + "grad_norm": 0.2736281752586365, + "learning_rate": 6.329212470535997e-06, + "loss": 0.1018, + "step": 44380 + }, + { + "epoch": 0.7915849177754789, + "grad_norm": 0.2804303765296936, + "learning_rate": 6.328177408453859e-06, + "loss": 0.0881, + "step": 44381 + }, + { + "epoch": 0.7916027538971926, + "grad_norm": 0.2587086260318756, + "learning_rate": 6.327142418750143e-06, + "loss": 0.1246, + "step": 44382 + }, + { + "epoch": 0.7916205900189063, + "grad_norm": 0.24446746706962585, + "learning_rate": 6.326107501428883e-06, + "loss": 0.1079, + "step": 44383 + }, + { + "epoch": 0.79163842614062, + "grad_norm": 0.2886585295200348, + "learning_rate": 6.325072656494069e-06, + "loss": 0.1096, + "step": 44384 + }, + { + "epoch": 0.7916562622623337, + "grad_norm": 0.39698928594589233, + "learning_rate": 6.3240378839497325e-06, + "loss": 0.1073, + "step": 44385 + }, + { + "epoch": 0.7916740983840473, + "grad_norm": 0.38284069299697876, + "learning_rate": 6.323003183799869e-06, + "loss": 0.0584, + "step": 44386 + }, + { + "epoch": 0.791691934505761, + "grad_norm": 0.36693501472473145, + "learning_rate": 6.3219685560485024e-06, + "loss": 0.0932, + "step": 44387 + }, + { + "epoch": 0.7917097706274747, + "grad_norm": 0.19956958293914795, + "learning_rate": 6.320934000699638e-06, + "loss": 0.1126, + "step": 44388 + }, + { + "epoch": 0.7917276067491884, + "grad_norm": 0.22734947502613068, + "learning_rate": 6.319899517757283e-06, + "loss": 0.1307, + "step": 44389 + }, + { + "epoch": 0.7917454428709022, + "grad_norm": 0.20080597698688507, + "learning_rate": 6.318865107225444e-06, + "loss": 0.0711, + "step": 44390 + }, + { + "epoch": 0.7917632789926159, + "grad_norm": 0.42116034030914307, + "learning_rate": 6.317830769108141e-06, + "loss": 0.1971, + "step": 44391 + }, + { + "epoch": 0.7917811151143296, + "grad_norm": 0.2806299924850464, + "learning_rate": 6.31679650340938e-06, + "loss": 0.1295, + "step": 44392 + }, + { + "epoch": 0.7917989512360433, + "grad_norm": 0.2743414342403412, + "learning_rate": 6.315762310133169e-06, + "loss": 0.1546, + "step": 44393 + }, + { + "epoch": 0.791816787357757, + "grad_norm": 0.3417699635028839, + "learning_rate": 6.314728189283517e-06, + "loss": 0.1423, + "step": 44394 + }, + { + "epoch": 0.7918346234794706, + "grad_norm": 0.24632671475410461, + "learning_rate": 6.313694140864423e-06, + "loss": 0.1335, + "step": 44395 + }, + { + "epoch": 0.7918524596011843, + "grad_norm": 0.2817908823490143, + "learning_rate": 6.312660164879916e-06, + "loss": 0.1166, + "step": 44396 + }, + { + "epoch": 0.791870295722898, + "grad_norm": 0.18462194502353668, + "learning_rate": 6.311626261333989e-06, + "loss": 0.0677, + "step": 44397 + }, + { + "epoch": 0.7918881318446117, + "grad_norm": 0.23208342492580414, + "learning_rate": 6.310592430230647e-06, + "loss": 0.1124, + "step": 44398 + }, + { + "epoch": 0.7919059679663254, + "grad_norm": 0.20546483993530273, + "learning_rate": 6.309558671573903e-06, + "loss": 0.0856, + "step": 44399 + }, + { + "epoch": 0.7919238040880391, + "grad_norm": 0.27161574363708496, + "learning_rate": 6.3085249853677725e-06, + "loss": 0.1051, + "step": 44400 + }, + { + "epoch": 0.7919416402097528, + "grad_norm": 0.23061218857765198, + "learning_rate": 6.307491371616256e-06, + "loss": 0.0978, + "step": 44401 + }, + { + "epoch": 0.7919594763314665, + "grad_norm": 0.24145588278770447, + "learning_rate": 6.306457830323359e-06, + "loss": 0.1359, + "step": 44402 + }, + { + "epoch": 0.7919773124531801, + "grad_norm": 0.25866636633872986, + "learning_rate": 6.305424361493081e-06, + "loss": 0.1215, + "step": 44403 + }, + { + "epoch": 0.7919951485748938, + "grad_norm": 0.33015015721321106, + "learning_rate": 6.304390965129442e-06, + "loss": 0.1328, + "step": 44404 + }, + { + "epoch": 0.7920129846966075, + "grad_norm": 0.23680657148361206, + "learning_rate": 6.30335764123644e-06, + "loss": 0.1173, + "step": 44405 + }, + { + "epoch": 0.7920308208183212, + "grad_norm": 0.23965995013713837, + "learning_rate": 6.302324389818081e-06, + "loss": 0.1155, + "step": 44406 + }, + { + "epoch": 0.792048656940035, + "grad_norm": 0.28786271810531616, + "learning_rate": 6.301291210878374e-06, + "loss": 0.122, + "step": 44407 + }, + { + "epoch": 0.7920664930617487, + "grad_norm": 0.3374701142311096, + "learning_rate": 6.300258104421311e-06, + "loss": 0.1199, + "step": 44408 + }, + { + "epoch": 0.7920843291834624, + "grad_norm": 0.233884796500206, + "learning_rate": 6.299225070450912e-06, + "loss": 0.1134, + "step": 44409 + }, + { + "epoch": 0.7921021653051761, + "grad_norm": 0.36059561371803284, + "learning_rate": 6.2981921089711765e-06, + "loss": 0.1654, + "step": 44410 + }, + { + "epoch": 0.7921200014268898, + "grad_norm": 0.4107843339443207, + "learning_rate": 6.297159219986101e-06, + "loss": 0.1174, + "step": 44411 + }, + { + "epoch": 0.7921378375486035, + "grad_norm": 0.3168709874153137, + "learning_rate": 6.296126403499705e-06, + "loss": 0.1351, + "step": 44412 + }, + { + "epoch": 0.7921556736703171, + "grad_norm": 0.2797567546367645, + "learning_rate": 6.295093659515974e-06, + "loss": 0.1258, + "step": 44413 + }, + { + "epoch": 0.7921735097920308, + "grad_norm": 0.39774811267852783, + "learning_rate": 6.2940609880389304e-06, + "loss": 0.1765, + "step": 44414 + }, + { + "epoch": 0.7921913459137445, + "grad_norm": 0.24939168989658356, + "learning_rate": 6.293028389072564e-06, + "loss": 0.1237, + "step": 44415 + }, + { + "epoch": 0.7922091820354582, + "grad_norm": 0.29042425751686096, + "learning_rate": 6.291995862620876e-06, + "loss": 0.1718, + "step": 44416 + }, + { + "epoch": 0.7922270181571719, + "grad_norm": 0.27344655990600586, + "learning_rate": 6.290963408687878e-06, + "loss": 0.1619, + "step": 44417 + }, + { + "epoch": 0.7922448542788856, + "grad_norm": 0.27999576926231384, + "learning_rate": 6.289931027277568e-06, + "loss": 0.0995, + "step": 44418 + }, + { + "epoch": 0.7922626904005993, + "grad_norm": 0.25451192259788513, + "learning_rate": 6.288898718393948e-06, + "loss": 0.122, + "step": 44419 + }, + { + "epoch": 0.792280526522313, + "grad_norm": 0.33843305706977844, + "learning_rate": 6.2878664820410215e-06, + "loss": 0.1036, + "step": 44420 + }, + { + "epoch": 0.7922983626440266, + "grad_norm": 0.3082786798477173, + "learning_rate": 6.286834318222779e-06, + "loss": 0.0973, + "step": 44421 + }, + { + "epoch": 0.7923161987657403, + "grad_norm": 0.3015062212944031, + "learning_rate": 6.285802226943235e-06, + "loss": 0.1296, + "step": 44422 + }, + { + "epoch": 0.792334034887454, + "grad_norm": 0.26641401648521423, + "learning_rate": 6.284770208206389e-06, + "loss": 0.1037, + "step": 44423 + }, + { + "epoch": 0.7923518710091678, + "grad_norm": 0.2790592610836029, + "learning_rate": 6.283738262016234e-06, + "loss": 0.1166, + "step": 44424 + }, + { + "epoch": 0.7923697071308815, + "grad_norm": 0.3925189971923828, + "learning_rate": 6.282706388376769e-06, + "loss": 0.1631, + "step": 44425 + }, + { + "epoch": 0.7923875432525952, + "grad_norm": 0.30460017919540405, + "learning_rate": 6.281674587292e-06, + "loss": 0.1136, + "step": 44426 + }, + { + "epoch": 0.7924053793743089, + "grad_norm": 0.26576054096221924, + "learning_rate": 6.280642858765929e-06, + "loss": 0.1147, + "step": 44427 + }, + { + "epoch": 0.7924232154960226, + "grad_norm": 0.31471821665763855, + "learning_rate": 6.279611202802554e-06, + "loss": 0.1233, + "step": 44428 + }, + { + "epoch": 0.7924410516177363, + "grad_norm": 0.22131535410881042, + "learning_rate": 6.278579619405864e-06, + "loss": 0.1139, + "step": 44429 + }, + { + "epoch": 0.79245888773945, + "grad_norm": 0.25198453664779663, + "learning_rate": 6.277548108579873e-06, + "loss": 0.1384, + "step": 44430 + }, + { + "epoch": 0.7924767238611636, + "grad_norm": 0.3263704776763916, + "learning_rate": 6.276516670328572e-06, + "loss": 0.1599, + "step": 44431 + }, + { + "epoch": 0.7924945599828773, + "grad_norm": 0.3255254030227661, + "learning_rate": 6.275485304655959e-06, + "loss": 0.1315, + "step": 44432 + }, + { + "epoch": 0.792512396104591, + "grad_norm": 0.22428010404109955, + "learning_rate": 6.274454011566033e-06, + "loss": 0.0739, + "step": 44433 + }, + { + "epoch": 0.7925302322263047, + "grad_norm": 0.29232409596443176, + "learning_rate": 6.273422791062783e-06, + "loss": 0.1154, + "step": 44434 + }, + { + "epoch": 0.7925480683480184, + "grad_norm": 0.3024180233478546, + "learning_rate": 6.272391643150222e-06, + "loss": 0.1802, + "step": 44435 + }, + { + "epoch": 0.7925659044697321, + "grad_norm": 0.24665972590446472, + "learning_rate": 6.271360567832338e-06, + "loss": 0.0968, + "step": 44436 + }, + { + "epoch": 0.7925837405914458, + "grad_norm": 0.25616490840911865, + "learning_rate": 6.270329565113131e-06, + "loss": 0.1047, + "step": 44437 + }, + { + "epoch": 0.7926015767131595, + "grad_norm": 0.2507772147655487, + "learning_rate": 6.269298634996587e-06, + "loss": 0.1619, + "step": 44438 + }, + { + "epoch": 0.7926194128348731, + "grad_norm": 0.2172888219356537, + "learning_rate": 6.26826777748672e-06, + "loss": 0.1379, + "step": 44439 + }, + { + "epoch": 0.7926372489565868, + "grad_norm": 0.2531301975250244, + "learning_rate": 6.267236992587505e-06, + "loss": 0.1099, + "step": 44440 + }, + { + "epoch": 0.7926550850783006, + "grad_norm": 0.22529707849025726, + "learning_rate": 6.266206280302961e-06, + "loss": 0.092, + "step": 44441 + }, + { + "epoch": 0.7926729212000143, + "grad_norm": 0.3378083109855652, + "learning_rate": 6.265175640637064e-06, + "loss": 0.1823, + "step": 44442 + }, + { + "epoch": 0.792690757321728, + "grad_norm": 0.3259870707988739, + "learning_rate": 6.264145073593822e-06, + "loss": 0.156, + "step": 44443 + }, + { + "epoch": 0.7927085934434417, + "grad_norm": 0.3068363666534424, + "learning_rate": 6.263114579177229e-06, + "loss": 0.0946, + "step": 44444 + }, + { + "epoch": 0.7927264295651554, + "grad_norm": 0.2478625476360321, + "learning_rate": 6.2620841573912705e-06, + "loss": 0.1094, + "step": 44445 + }, + { + "epoch": 0.7927442656868691, + "grad_norm": 0.2599877715110779, + "learning_rate": 6.26105380823995e-06, + "loss": 0.1056, + "step": 44446 + }, + { + "epoch": 0.7927621018085828, + "grad_norm": 0.23376449942588806, + "learning_rate": 6.2600235317272465e-06, + "loss": 0.0785, + "step": 44447 + }, + { + "epoch": 0.7927799379302964, + "grad_norm": 0.35235193371772766, + "learning_rate": 6.258993327857174e-06, + "loss": 0.1466, + "step": 44448 + }, + { + "epoch": 0.7927977740520101, + "grad_norm": 0.25425320863723755, + "learning_rate": 6.257963196633715e-06, + "loss": 0.0804, + "step": 44449 + }, + { + "epoch": 0.7928156101737238, + "grad_norm": 0.21206626296043396, + "learning_rate": 6.2569331380608635e-06, + "loss": 0.141, + "step": 44450 + }, + { + "epoch": 0.7928334462954375, + "grad_norm": 0.32689717411994934, + "learning_rate": 6.255903152142609e-06, + "loss": 0.1756, + "step": 44451 + }, + { + "epoch": 0.7928512824171512, + "grad_norm": 0.339819997549057, + "learning_rate": 6.254873238882952e-06, + "loss": 0.0814, + "step": 44452 + }, + { + "epoch": 0.7928691185388649, + "grad_norm": 0.2103695273399353, + "learning_rate": 6.253843398285875e-06, + "loss": 0.1139, + "step": 44453 + }, + { + "epoch": 0.7928869546605786, + "grad_norm": 0.22478380799293518, + "learning_rate": 6.2528136303553805e-06, + "loss": 0.0908, + "step": 44454 + }, + { + "epoch": 0.7929047907822923, + "grad_norm": 0.2647428512573242, + "learning_rate": 6.251783935095449e-06, + "loss": 0.1358, + "step": 44455 + }, + { + "epoch": 0.792922626904006, + "grad_norm": 0.3228669762611389, + "learning_rate": 6.250754312510087e-06, + "loss": 0.144, + "step": 44456 + }, + { + "epoch": 0.7929404630257196, + "grad_norm": 0.284750759601593, + "learning_rate": 6.249724762603276e-06, + "loss": 0.1203, + "step": 44457 + }, + { + "epoch": 0.7929582991474334, + "grad_norm": 0.31602364778518677, + "learning_rate": 6.248695285379008e-06, + "loss": 0.1117, + "step": 44458 + }, + { + "epoch": 0.7929761352691471, + "grad_norm": 0.3020690381526947, + "learning_rate": 6.247665880841275e-06, + "loss": 0.172, + "step": 44459 + }, + { + "epoch": 0.7929939713908608, + "grad_norm": 0.25382643938064575, + "learning_rate": 6.246636548994056e-06, + "loss": 0.132, + "step": 44460 + }, + { + "epoch": 0.7930118075125745, + "grad_norm": 0.281044065952301, + "learning_rate": 6.24560728984136e-06, + "loss": 0.0716, + "step": 44461 + }, + { + "epoch": 0.7930296436342882, + "grad_norm": 0.2512074112892151, + "learning_rate": 6.244578103387169e-06, + "loss": 0.1412, + "step": 44462 + }, + { + "epoch": 0.7930474797560019, + "grad_norm": 0.24529364705085754, + "learning_rate": 6.24354898963547e-06, + "loss": 0.1639, + "step": 44463 + }, + { + "epoch": 0.7930653158777156, + "grad_norm": 0.26589420437812805, + "learning_rate": 6.242519948590245e-06, + "loss": 0.0737, + "step": 44464 + }, + { + "epoch": 0.7930831519994292, + "grad_norm": 0.26851651072502136, + "learning_rate": 6.241490980255499e-06, + "loss": 0.1284, + "step": 44465 + }, + { + "epoch": 0.7931009881211429, + "grad_norm": 0.3239595890045166, + "learning_rate": 6.240462084635213e-06, + "loss": 0.1128, + "step": 44466 + }, + { + "epoch": 0.7931188242428566, + "grad_norm": 0.23602654039859772, + "learning_rate": 6.239433261733368e-06, + "loss": 0.1119, + "step": 44467 + }, + { + "epoch": 0.7931366603645703, + "grad_norm": 0.277048259973526, + "learning_rate": 6.2384045115539666e-06, + "loss": 0.0961, + "step": 44468 + }, + { + "epoch": 0.793154496486284, + "grad_norm": 0.3234660029411316, + "learning_rate": 6.237375834100983e-06, + "loss": 0.1143, + "step": 44469 + }, + { + "epoch": 0.7931723326079977, + "grad_norm": 0.22434797883033752, + "learning_rate": 6.236347229378417e-06, + "loss": 0.1028, + "step": 44470 + }, + { + "epoch": 0.7931901687297114, + "grad_norm": 0.27867910265922546, + "learning_rate": 6.2353186973902524e-06, + "loss": 0.0652, + "step": 44471 + }, + { + "epoch": 0.7932080048514251, + "grad_norm": 0.2899375259876251, + "learning_rate": 6.234290238140472e-06, + "loss": 0.0899, + "step": 44472 + }, + { + "epoch": 0.7932258409731388, + "grad_norm": 0.20784126222133636, + "learning_rate": 6.2332618516330545e-06, + "loss": 0.14, + "step": 44473 + }, + { + "epoch": 0.7932436770948526, + "grad_norm": 0.26721426844596863, + "learning_rate": 6.232233537872006e-06, + "loss": 0.0977, + "step": 44474 + }, + { + "epoch": 0.7932615132165662, + "grad_norm": 0.27338501811027527, + "learning_rate": 6.231205296861303e-06, + "loss": 0.0918, + "step": 44475 + }, + { + "epoch": 0.7932793493382799, + "grad_norm": 0.45138785243034363, + "learning_rate": 6.230177128604927e-06, + "loss": 0.1527, + "step": 44476 + }, + { + "epoch": 0.7932971854599936, + "grad_norm": 0.4324415922164917, + "learning_rate": 6.229149033106866e-06, + "loss": 0.1662, + "step": 44477 + }, + { + "epoch": 0.7933150215817073, + "grad_norm": 0.28207719326019287, + "learning_rate": 6.22812101037111e-06, + "loss": 0.1932, + "step": 44478 + }, + { + "epoch": 0.793332857703421, + "grad_norm": 0.2585281431674957, + "learning_rate": 6.22709306040164e-06, + "loss": 0.1057, + "step": 44479 + }, + { + "epoch": 0.7933506938251347, + "grad_norm": 0.34037742018699646, + "learning_rate": 6.226065183202437e-06, + "loss": 0.0905, + "step": 44480 + }, + { + "epoch": 0.7933685299468484, + "grad_norm": 0.25283282995224, + "learning_rate": 6.225037378777493e-06, + "loss": 0.1054, + "step": 44481 + }, + { + "epoch": 0.793386366068562, + "grad_norm": 0.18677212297916412, + "learning_rate": 6.224009647130785e-06, + "loss": 0.0813, + "step": 44482 + }, + { + "epoch": 0.7934042021902757, + "grad_norm": 0.2476346343755722, + "learning_rate": 6.2229819882663045e-06, + "loss": 0.1496, + "step": 44483 + }, + { + "epoch": 0.7934220383119894, + "grad_norm": 0.2059735655784607, + "learning_rate": 6.221954402188035e-06, + "loss": 0.1264, + "step": 44484 + }, + { + "epoch": 0.7934398744337031, + "grad_norm": 0.429033100605011, + "learning_rate": 6.220926888899953e-06, + "loss": 0.1534, + "step": 44485 + }, + { + "epoch": 0.7934577105554168, + "grad_norm": 0.2717283368110657, + "learning_rate": 6.219899448406039e-06, + "loss": 0.1492, + "step": 44486 + }, + { + "epoch": 0.7934755466771305, + "grad_norm": 0.26215073466300964, + "learning_rate": 6.2188720807102865e-06, + "loss": 0.1139, + "step": 44487 + }, + { + "epoch": 0.7934933827988442, + "grad_norm": 0.26236769556999207, + "learning_rate": 6.217844785816676e-06, + "loss": 0.1237, + "step": 44488 + }, + { + "epoch": 0.7935112189205579, + "grad_norm": 0.2535540461540222, + "learning_rate": 6.2168175637291825e-06, + "loss": 0.1169, + "step": 44489 + }, + { + "epoch": 0.7935290550422716, + "grad_norm": 0.29390227794647217, + "learning_rate": 6.215790414451786e-06, + "loss": 0.1076, + "step": 44490 + }, + { + "epoch": 0.7935468911639854, + "grad_norm": 0.2571776807308197, + "learning_rate": 6.214763337988483e-06, + "loss": 0.0798, + "step": 44491 + }, + { + "epoch": 0.793564727285699, + "grad_norm": 0.25504249334335327, + "learning_rate": 6.2137363343432425e-06, + "loss": 0.1405, + "step": 44492 + }, + { + "epoch": 0.7935825634074127, + "grad_norm": 0.27247005701065063, + "learning_rate": 6.212709403520048e-06, + "loss": 0.127, + "step": 44493 + }, + { + "epoch": 0.7936003995291264, + "grad_norm": 0.23732003569602966, + "learning_rate": 6.211682545522876e-06, + "loss": 0.1384, + "step": 44494 + }, + { + "epoch": 0.7936182356508401, + "grad_norm": 0.23826231062412262, + "learning_rate": 6.210655760355718e-06, + "loss": 0.0838, + "step": 44495 + }, + { + "epoch": 0.7936360717725538, + "grad_norm": 0.2973790466785431, + "learning_rate": 6.209629048022541e-06, + "loss": 0.0851, + "step": 44496 + }, + { + "epoch": 0.7936539078942675, + "grad_norm": 0.23042847216129303, + "learning_rate": 6.208602408527339e-06, + "loss": 0.091, + "step": 44497 + }, + { + "epoch": 0.7936717440159812, + "grad_norm": 0.2569870054721832, + "learning_rate": 6.207575841874083e-06, + "loss": 0.1015, + "step": 44498 + }, + { + "epoch": 0.7936895801376949, + "grad_norm": 0.27724137902259827, + "learning_rate": 6.206549348066748e-06, + "loss": 0.1023, + "step": 44499 + }, + { + "epoch": 0.7937074162594085, + "grad_norm": 0.31265929341316223, + "learning_rate": 6.205522927109325e-06, + "loss": 0.141, + "step": 44500 + }, + { + "epoch": 0.7937252523811222, + "grad_norm": 0.3048022985458374, + "learning_rate": 6.204496579005789e-06, + "loss": 0.1378, + "step": 44501 + }, + { + "epoch": 0.7937430885028359, + "grad_norm": 0.25097495317459106, + "learning_rate": 6.203470303760114e-06, + "loss": 0.0996, + "step": 44502 + }, + { + "epoch": 0.7937609246245496, + "grad_norm": 0.2565118670463562, + "learning_rate": 6.202444101376273e-06, + "loss": 0.1173, + "step": 44503 + }, + { + "epoch": 0.7937787607462633, + "grad_norm": 0.25220057368278503, + "learning_rate": 6.201417971858259e-06, + "loss": 0.1132, + "step": 44504 + }, + { + "epoch": 0.793796596867977, + "grad_norm": 0.22475945949554443, + "learning_rate": 6.200391915210041e-06, + "loss": 0.0707, + "step": 44505 + }, + { + "epoch": 0.7938144329896907, + "grad_norm": 0.24830904603004456, + "learning_rate": 6.1993659314355965e-06, + "loss": 0.0731, + "step": 44506 + }, + { + "epoch": 0.7938322691114044, + "grad_norm": 0.23916874825954437, + "learning_rate": 6.198340020538898e-06, + "loss": 0.0759, + "step": 44507 + }, + { + "epoch": 0.7938501052331182, + "grad_norm": 0.27155864238739014, + "learning_rate": 6.197314182523933e-06, + "loss": 0.0894, + "step": 44508 + }, + { + "epoch": 0.7938679413548319, + "grad_norm": 0.2335289865732193, + "learning_rate": 6.196288417394666e-06, + "loss": 0.0996, + "step": 44509 + }, + { + "epoch": 0.7938857774765455, + "grad_norm": 0.2389247715473175, + "learning_rate": 6.195262725155085e-06, + "loss": 0.0763, + "step": 44510 + }, + { + "epoch": 0.7939036135982592, + "grad_norm": 0.22371631860733032, + "learning_rate": 6.194237105809164e-06, + "loss": 0.0809, + "step": 44511 + }, + { + "epoch": 0.7939214497199729, + "grad_norm": 0.2821425497531891, + "learning_rate": 6.193211559360864e-06, + "loss": 0.1418, + "step": 44512 + }, + { + "epoch": 0.7939392858416866, + "grad_norm": 0.291654109954834, + "learning_rate": 6.192186085814183e-06, + "loss": 0.1344, + "step": 44513 + }, + { + "epoch": 0.7939571219634003, + "grad_norm": 0.21793454885482788, + "learning_rate": 6.191160685173083e-06, + "loss": 0.1059, + "step": 44514 + }, + { + "epoch": 0.793974958085114, + "grad_norm": 0.38878750801086426, + "learning_rate": 6.19013535744154e-06, + "loss": 0.106, + "step": 44515 + }, + { + "epoch": 0.7939927942068277, + "grad_norm": 0.2460489124059677, + "learning_rate": 6.1891101026235215e-06, + "loss": 0.0705, + "step": 44516 + }, + { + "epoch": 0.7940106303285414, + "grad_norm": 0.27745917439460754, + "learning_rate": 6.188084920723019e-06, + "loss": 0.1146, + "step": 44517 + }, + { + "epoch": 0.794028466450255, + "grad_norm": 0.2203586846590042, + "learning_rate": 6.187059811743995e-06, + "loss": 0.0859, + "step": 44518 + }, + { + "epoch": 0.7940463025719687, + "grad_norm": 0.23983919620513916, + "learning_rate": 6.186034775690424e-06, + "loss": 0.0793, + "step": 44519 + }, + { + "epoch": 0.7940641386936824, + "grad_norm": 0.3579029142856598, + "learning_rate": 6.185009812566275e-06, + "loss": 0.1266, + "step": 44520 + }, + { + "epoch": 0.7940819748153961, + "grad_norm": 0.2203003615140915, + "learning_rate": 6.183984922375533e-06, + "loss": 0.1256, + "step": 44521 + }, + { + "epoch": 0.7940998109371098, + "grad_norm": 0.2137032002210617, + "learning_rate": 6.182960105122165e-06, + "loss": 0.1141, + "step": 44522 + }, + { + "epoch": 0.7941176470588235, + "grad_norm": 0.2582463026046753, + "learning_rate": 6.181935360810134e-06, + "loss": 0.1038, + "step": 44523 + }, + { + "epoch": 0.7941354831805372, + "grad_norm": 0.21422302722930908, + "learning_rate": 6.180910689443428e-06, + "loss": 0.082, + "step": 44524 + }, + { + "epoch": 0.794153319302251, + "grad_norm": 0.3135656416416168, + "learning_rate": 6.179886091026008e-06, + "loss": 0.1697, + "step": 44525 + }, + { + "epoch": 0.7941711554239647, + "grad_norm": 0.20665223896503448, + "learning_rate": 6.178861565561855e-06, + "loss": 0.1129, + "step": 44526 + }, + { + "epoch": 0.7941889915456783, + "grad_norm": 0.25325390696525574, + "learning_rate": 6.177837113054935e-06, + "loss": 0.0978, + "step": 44527 + }, + { + "epoch": 0.794206827667392, + "grad_norm": 0.25414136052131653, + "learning_rate": 6.176812733509219e-06, + "loss": 0.1279, + "step": 44528 + }, + { + "epoch": 0.7942246637891057, + "grad_norm": 0.2939840853214264, + "learning_rate": 6.175788426928672e-06, + "loss": 0.101, + "step": 44529 + }, + { + "epoch": 0.7942424999108194, + "grad_norm": 0.23991821706295013, + "learning_rate": 6.1747641933172754e-06, + "loss": 0.1139, + "step": 44530 + }, + { + "epoch": 0.7942603360325331, + "grad_norm": 0.370981901884079, + "learning_rate": 6.173740032678999e-06, + "loss": 0.1572, + "step": 44531 + }, + { + "epoch": 0.7942781721542468, + "grad_norm": 0.28368422389030457, + "learning_rate": 6.172715945017807e-06, + "loss": 0.1206, + "step": 44532 + }, + { + "epoch": 0.7942960082759605, + "grad_norm": 0.3193915784358978, + "learning_rate": 6.171691930337661e-06, + "loss": 0.0806, + "step": 44533 + }, + { + "epoch": 0.7943138443976742, + "grad_norm": 0.2615368068218231, + "learning_rate": 6.17066798864255e-06, + "loss": 0.0811, + "step": 44534 + }, + { + "epoch": 0.7943316805193879, + "grad_norm": 0.22125034034252167, + "learning_rate": 6.169644119936432e-06, + "loss": 0.0808, + "step": 44535 + }, + { + "epoch": 0.7943495166411015, + "grad_norm": 0.3314410448074341, + "learning_rate": 6.16862032422327e-06, + "loss": 0.1334, + "step": 44536 + }, + { + "epoch": 0.7943673527628152, + "grad_norm": 0.2585088908672333, + "learning_rate": 6.167596601507047e-06, + "loss": 0.085, + "step": 44537 + }, + { + "epoch": 0.7943851888845289, + "grad_norm": 0.2760941982269287, + "learning_rate": 6.1665729517917185e-06, + "loss": 0.1452, + "step": 44538 + }, + { + "epoch": 0.7944030250062426, + "grad_norm": 0.3180933892726898, + "learning_rate": 6.165549375081264e-06, + "loss": 0.0648, + "step": 44539 + }, + { + "epoch": 0.7944208611279563, + "grad_norm": 0.22410927712917328, + "learning_rate": 6.164525871379648e-06, + "loss": 0.0976, + "step": 44540 + }, + { + "epoch": 0.79443869724967, + "grad_norm": 0.3005995750427246, + "learning_rate": 6.163502440690832e-06, + "loss": 0.0832, + "step": 44541 + }, + { + "epoch": 0.7944565333713838, + "grad_norm": 0.30551469326019287, + "learning_rate": 6.16247908301878e-06, + "loss": 0.0642, + "step": 44542 + }, + { + "epoch": 0.7944743694930975, + "grad_norm": 0.2784944474697113, + "learning_rate": 6.161455798367472e-06, + "loss": 0.1351, + "step": 44543 + }, + { + "epoch": 0.7944922056148112, + "grad_norm": 0.2333540916442871, + "learning_rate": 6.160432586740869e-06, + "loss": 0.1492, + "step": 44544 + }, + { + "epoch": 0.7945100417365248, + "grad_norm": 0.3198825716972351, + "learning_rate": 6.159409448142936e-06, + "loss": 0.1047, + "step": 44545 + }, + { + "epoch": 0.7945278778582385, + "grad_norm": 0.25376787781715393, + "learning_rate": 6.1583863825776326e-06, + "loss": 0.1181, + "step": 44546 + }, + { + "epoch": 0.7945457139799522, + "grad_norm": 0.29596784710884094, + "learning_rate": 6.1573633900489366e-06, + "loss": 0.1085, + "step": 44547 + }, + { + "epoch": 0.7945635501016659, + "grad_norm": 0.28149908781051636, + "learning_rate": 6.156340470560809e-06, + "loss": 0.2084, + "step": 44548 + }, + { + "epoch": 0.7945813862233796, + "grad_norm": 0.3240255117416382, + "learning_rate": 6.1553176241172155e-06, + "loss": 0.1484, + "step": 44549 + }, + { + "epoch": 0.7945992223450933, + "grad_norm": 0.2148492932319641, + "learning_rate": 6.154294850722112e-06, + "loss": 0.1056, + "step": 44550 + }, + { + "epoch": 0.794617058466807, + "grad_norm": 0.2300100475549698, + "learning_rate": 6.1532721503794775e-06, + "loss": 0.0888, + "step": 44551 + }, + { + "epoch": 0.7946348945885207, + "grad_norm": 0.3131921589374542, + "learning_rate": 6.152249523093262e-06, + "loss": 0.1333, + "step": 44552 + }, + { + "epoch": 0.7946527307102343, + "grad_norm": 0.3293219208717346, + "learning_rate": 6.151226968867443e-06, + "loss": 0.1047, + "step": 44553 + }, + { + "epoch": 0.794670566831948, + "grad_norm": 0.19201670587062836, + "learning_rate": 6.150204487705982e-06, + "loss": 0.0926, + "step": 44554 + }, + { + "epoch": 0.7946884029536617, + "grad_norm": 0.29677143692970276, + "learning_rate": 6.149182079612828e-06, + "loss": 0.0903, + "step": 44555 + }, + { + "epoch": 0.7947062390753754, + "grad_norm": 0.2133985310792923, + "learning_rate": 6.148159744591966e-06, + "loss": 0.081, + "step": 44556 + }, + { + "epoch": 0.7947240751970891, + "grad_norm": 0.25235363841056824, + "learning_rate": 6.147137482647344e-06, + "loss": 0.1127, + "step": 44557 + }, + { + "epoch": 0.7947419113188028, + "grad_norm": 0.31030669808387756, + "learning_rate": 6.1461152937829335e-06, + "loss": 0.1009, + "step": 44558 + }, + { + "epoch": 0.7947597474405166, + "grad_norm": 0.42810261249542236, + "learning_rate": 6.145093178002681e-06, + "loss": 0.1793, + "step": 44559 + }, + { + "epoch": 0.7947775835622303, + "grad_norm": 0.32217907905578613, + "learning_rate": 6.1440711353105665e-06, + "loss": 0.1543, + "step": 44560 + }, + { + "epoch": 0.794795419683944, + "grad_norm": 0.28193140029907227, + "learning_rate": 6.143049165710549e-06, + "loss": 0.1, + "step": 44561 + }, + { + "epoch": 0.7948132558056576, + "grad_norm": 0.29300662875175476, + "learning_rate": 6.142027269206582e-06, + "loss": 0.1146, + "step": 44562 + }, + { + "epoch": 0.7948310919273713, + "grad_norm": 0.24098087847232819, + "learning_rate": 6.141005445802625e-06, + "loss": 0.1265, + "step": 44563 + }, + { + "epoch": 0.794848928049085, + "grad_norm": 0.2366962730884552, + "learning_rate": 6.139983695502652e-06, + "loss": 0.1291, + "step": 44564 + }, + { + "epoch": 0.7948667641707987, + "grad_norm": 0.2564215064048767, + "learning_rate": 6.13896201831061e-06, + "loss": 0.1208, + "step": 44565 + }, + { + "epoch": 0.7948846002925124, + "grad_norm": 0.41292035579681396, + "learning_rate": 6.13794041423047e-06, + "loss": 0.15, + "step": 44566 + }, + { + "epoch": 0.7949024364142261, + "grad_norm": 0.29780304431915283, + "learning_rate": 6.136918883266188e-06, + "loss": 0.1376, + "step": 44567 + }, + { + "epoch": 0.7949202725359398, + "grad_norm": 0.24170997738838196, + "learning_rate": 6.1358974254217185e-06, + "loss": 0.0818, + "step": 44568 + }, + { + "epoch": 0.7949381086576535, + "grad_norm": 0.23589842021465302, + "learning_rate": 6.134876040701032e-06, + "loss": 0.0819, + "step": 44569 + }, + { + "epoch": 0.7949559447793672, + "grad_norm": 0.25793835520744324, + "learning_rate": 6.1338547291080825e-06, + "loss": 0.1173, + "step": 44570 + }, + { + "epoch": 0.7949737809010808, + "grad_norm": 0.3261379599571228, + "learning_rate": 6.1328334906468275e-06, + "loss": 0.1239, + "step": 44571 + }, + { + "epoch": 0.7949916170227945, + "grad_norm": 0.3002544939517975, + "learning_rate": 6.131812325321218e-06, + "loss": 0.0956, + "step": 44572 + }, + { + "epoch": 0.7950094531445082, + "grad_norm": 0.30837634205818176, + "learning_rate": 6.13079123313523e-06, + "loss": 0.093, + "step": 44573 + }, + { + "epoch": 0.7950272892662219, + "grad_norm": 0.24205940961837769, + "learning_rate": 6.129770214092812e-06, + "loss": 0.1495, + "step": 44574 + }, + { + "epoch": 0.7950451253879357, + "grad_norm": 0.3052802085876465, + "learning_rate": 6.128749268197923e-06, + "loss": 0.1284, + "step": 44575 + }, + { + "epoch": 0.7950629615096494, + "grad_norm": 0.22689349949359894, + "learning_rate": 6.12772839545451e-06, + "loss": 0.1016, + "step": 44576 + }, + { + "epoch": 0.7950807976313631, + "grad_norm": 0.20815925300121307, + "learning_rate": 6.1267075958665484e-06, + "loss": 0.0928, + "step": 44577 + }, + { + "epoch": 0.7950986337530768, + "grad_norm": 0.23941612243652344, + "learning_rate": 6.125686869437988e-06, + "loss": 0.0704, + "step": 44578 + }, + { + "epoch": 0.7951164698747905, + "grad_norm": 0.2548205852508545, + "learning_rate": 6.1246662161727755e-06, + "loss": 0.1272, + "step": 44579 + }, + { + "epoch": 0.7951343059965041, + "grad_norm": 0.2453823983669281, + "learning_rate": 6.1236456360748846e-06, + "loss": 0.0806, + "step": 44580 + }, + { + "epoch": 0.7951521421182178, + "grad_norm": 0.3220204710960388, + "learning_rate": 6.122625129148254e-06, + "loss": 0.1581, + "step": 44581 + }, + { + "epoch": 0.7951699782399315, + "grad_norm": 0.3273429274559021, + "learning_rate": 6.121604695396857e-06, + "loss": 0.0941, + "step": 44582 + }, + { + "epoch": 0.7951878143616452, + "grad_norm": 0.248832568526268, + "learning_rate": 6.120584334824642e-06, + "loss": 0.0775, + "step": 44583 + }, + { + "epoch": 0.7952056504833589, + "grad_norm": 0.21122051775455475, + "learning_rate": 6.119564047435561e-06, + "loss": 0.0798, + "step": 44584 + }, + { + "epoch": 0.7952234866050726, + "grad_norm": 0.28824931383132935, + "learning_rate": 6.118543833233562e-06, + "loss": 0.1304, + "step": 44585 + }, + { + "epoch": 0.7952413227267863, + "grad_norm": 0.22789673507213593, + "learning_rate": 6.117523692222618e-06, + "loss": 0.1034, + "step": 44586 + }, + { + "epoch": 0.7952591588485, + "grad_norm": 0.3981494605541229, + "learning_rate": 6.116503624406675e-06, + "loss": 0.1692, + "step": 44587 + }, + { + "epoch": 0.7952769949702136, + "grad_norm": 0.3661998212337494, + "learning_rate": 6.115483629789684e-06, + "loss": 0.1225, + "step": 44588 + }, + { + "epoch": 0.7952948310919273, + "grad_norm": 0.26195618510246277, + "learning_rate": 6.114463708375595e-06, + "loss": 0.0743, + "step": 44589 + }, + { + "epoch": 0.795312667213641, + "grad_norm": 0.2997496724128723, + "learning_rate": 6.1134438601683716e-06, + "loss": 0.1042, + "step": 44590 + }, + { + "epoch": 0.7953305033353547, + "grad_norm": 0.3216645419597626, + "learning_rate": 6.1124240851719665e-06, + "loss": 0.1218, + "step": 44591 + }, + { + "epoch": 0.7953483394570685, + "grad_norm": 0.35883399844169617, + "learning_rate": 6.11140438339032e-06, + "loss": 0.1306, + "step": 44592 + }, + { + "epoch": 0.7953661755787822, + "grad_norm": 0.29565638303756714, + "learning_rate": 6.110384754827403e-06, + "loss": 0.0907, + "step": 44593 + }, + { + "epoch": 0.7953840117004959, + "grad_norm": 0.2854943573474884, + "learning_rate": 6.109365199487152e-06, + "loss": 0.1296, + "step": 44594 + }, + { + "epoch": 0.7954018478222096, + "grad_norm": 0.3640003204345703, + "learning_rate": 6.1083457173735315e-06, + "loss": 0.0883, + "step": 44595 + }, + { + "epoch": 0.7954196839439233, + "grad_norm": 0.2673422396183014, + "learning_rate": 6.107326308490488e-06, + "loss": 0.1095, + "step": 44596 + }, + { + "epoch": 0.795437520065637, + "grad_norm": 0.27546003460884094, + "learning_rate": 6.106306972841974e-06, + "loss": 0.1191, + "step": 44597 + }, + { + "epoch": 0.7954553561873506, + "grad_norm": 0.24141313135623932, + "learning_rate": 6.105287710431934e-06, + "loss": 0.0908, + "step": 44598 + }, + { + "epoch": 0.7954731923090643, + "grad_norm": 0.29652106761932373, + "learning_rate": 6.104268521264331e-06, + "loss": 0.1171, + "step": 44599 + }, + { + "epoch": 0.795491028430778, + "grad_norm": 0.27894893288612366, + "learning_rate": 6.103249405343109e-06, + "loss": 0.0891, + "step": 44600 + }, + { + "epoch": 0.7955088645524917, + "grad_norm": 0.24989303946495056, + "learning_rate": 6.102230362672218e-06, + "loss": 0.1147, + "step": 44601 + }, + { + "epoch": 0.7955267006742054, + "grad_norm": 0.32122403383255005, + "learning_rate": 6.101211393255602e-06, + "loss": 0.0934, + "step": 44602 + }, + { + "epoch": 0.7955445367959191, + "grad_norm": 0.25291064381599426, + "learning_rate": 6.1001924970972265e-06, + "loss": 0.0841, + "step": 44603 + }, + { + "epoch": 0.7955623729176328, + "grad_norm": 0.2650677561759949, + "learning_rate": 6.099173674201034e-06, + "loss": 0.1391, + "step": 44604 + }, + { + "epoch": 0.7955802090393465, + "grad_norm": 0.25109490752220154, + "learning_rate": 6.09815492457097e-06, + "loss": 0.0925, + "step": 44605 + }, + { + "epoch": 0.7955980451610601, + "grad_norm": 0.3072631359100342, + "learning_rate": 6.0971362482109805e-06, + "loss": 0.1582, + "step": 44606 + }, + { + "epoch": 0.7956158812827738, + "grad_norm": 0.24367989599704742, + "learning_rate": 6.096117645125018e-06, + "loss": 0.0806, + "step": 44607 + }, + { + "epoch": 0.7956337174044875, + "grad_norm": 0.28080081939697266, + "learning_rate": 6.095099115317041e-06, + "loss": 0.1297, + "step": 44608 + }, + { + "epoch": 0.7956515535262013, + "grad_norm": 0.2517644762992859, + "learning_rate": 6.0940806587909904e-06, + "loss": 0.1446, + "step": 44609 + }, + { + "epoch": 0.795669389647915, + "grad_norm": 0.3113599717617035, + "learning_rate": 6.093062275550812e-06, + "loss": 0.138, + "step": 44610 + }, + { + "epoch": 0.7956872257696287, + "grad_norm": 0.20456230640411377, + "learning_rate": 6.092043965600447e-06, + "loss": 0.0884, + "step": 44611 + }, + { + "epoch": 0.7957050618913424, + "grad_norm": 0.4639633595943451, + "learning_rate": 6.091025728943858e-06, + "loss": 0.1647, + "step": 44612 + }, + { + "epoch": 0.7957228980130561, + "grad_norm": 0.21989652514457703, + "learning_rate": 6.090007565584982e-06, + "loss": 0.1022, + "step": 44613 + }, + { + "epoch": 0.7957407341347698, + "grad_norm": 0.35461410880088806, + "learning_rate": 6.088989475527771e-06, + "loss": 0.1562, + "step": 44614 + }, + { + "epoch": 0.7957585702564834, + "grad_norm": 0.28047627210617065, + "learning_rate": 6.08797145877616e-06, + "loss": 0.1617, + "step": 44615 + }, + { + "epoch": 0.7957764063781971, + "grad_norm": 0.24063174426555634, + "learning_rate": 6.086953515334109e-06, + "loss": 0.1073, + "step": 44616 + }, + { + "epoch": 0.7957942424999108, + "grad_norm": 0.38832253217697144, + "learning_rate": 6.085935645205562e-06, + "loss": 0.0877, + "step": 44617 + }, + { + "epoch": 0.7958120786216245, + "grad_norm": 0.30571722984313965, + "learning_rate": 6.0849178483944585e-06, + "loss": 0.0976, + "step": 44618 + }, + { + "epoch": 0.7958299147433382, + "grad_norm": 0.27339184284210205, + "learning_rate": 6.0839001249047425e-06, + "loss": 0.1694, + "step": 44619 + }, + { + "epoch": 0.7958477508650519, + "grad_norm": 0.2583093047142029, + "learning_rate": 6.082882474740367e-06, + "loss": 0.0939, + "step": 44620 + }, + { + "epoch": 0.7958655869867656, + "grad_norm": 0.2858339846134186, + "learning_rate": 6.0818648979052675e-06, + "loss": 0.1473, + "step": 44621 + }, + { + "epoch": 0.7958834231084793, + "grad_norm": 0.309569776058197, + "learning_rate": 6.0808473944033995e-06, + "loss": 0.1099, + "step": 44622 + }, + { + "epoch": 0.795901259230193, + "grad_norm": 0.28179314732551575, + "learning_rate": 6.079829964238704e-06, + "loss": 0.096, + "step": 44623 + }, + { + "epoch": 0.7959190953519066, + "grad_norm": 0.2213532030582428, + "learning_rate": 6.07881260741511e-06, + "loss": 0.1289, + "step": 44624 + }, + { + "epoch": 0.7959369314736203, + "grad_norm": 0.2249257117509842, + "learning_rate": 6.077795323936586e-06, + "loss": 0.0726, + "step": 44625 + }, + { + "epoch": 0.7959547675953341, + "grad_norm": 0.37136173248291016, + "learning_rate": 6.07677811380706e-06, + "loss": 0.0576, + "step": 44626 + }, + { + "epoch": 0.7959726037170478, + "grad_norm": 0.21767722070217133, + "learning_rate": 6.0757609770304785e-06, + "loss": 0.0667, + "step": 44627 + }, + { + "epoch": 0.7959904398387615, + "grad_norm": 0.3218723237514496, + "learning_rate": 6.0747439136107785e-06, + "loss": 0.1565, + "step": 44628 + }, + { + "epoch": 0.7960082759604752, + "grad_norm": 0.23924113810062408, + "learning_rate": 6.073726923551912e-06, + "loss": 0.1174, + "step": 44629 + }, + { + "epoch": 0.7960261120821889, + "grad_norm": 0.3085210919380188, + "learning_rate": 6.072710006857818e-06, + "loss": 0.0882, + "step": 44630 + }, + { + "epoch": 0.7960439482039026, + "grad_norm": 0.3239879906177521, + "learning_rate": 6.071693163532438e-06, + "loss": 0.1402, + "step": 44631 + }, + { + "epoch": 0.7960617843256163, + "grad_norm": 0.35857465863227844, + "learning_rate": 6.070676393579711e-06, + "loss": 0.19, + "step": 44632 + }, + { + "epoch": 0.7960796204473299, + "grad_norm": 0.2874899208545685, + "learning_rate": 6.069659697003574e-06, + "loss": 0.1096, + "step": 44633 + }, + { + "epoch": 0.7960974565690436, + "grad_norm": 0.32893192768096924, + "learning_rate": 6.068643073807975e-06, + "loss": 0.1301, + "step": 44634 + }, + { + "epoch": 0.7961152926907573, + "grad_norm": 0.25399476289749146, + "learning_rate": 6.067626523996861e-06, + "loss": 0.1149, + "step": 44635 + }, + { + "epoch": 0.796133128812471, + "grad_norm": 0.25364476442337036, + "learning_rate": 6.066610047574164e-06, + "loss": 0.0856, + "step": 44636 + }, + { + "epoch": 0.7961509649341847, + "grad_norm": 0.2583354711532593, + "learning_rate": 6.065593644543821e-06, + "loss": 0.0986, + "step": 44637 + }, + { + "epoch": 0.7961688010558984, + "grad_norm": 0.30223697423934937, + "learning_rate": 6.064577314909784e-06, + "loss": 0.1833, + "step": 44638 + }, + { + "epoch": 0.7961866371776121, + "grad_norm": 0.39697685837745667, + "learning_rate": 6.063561058675985e-06, + "loss": 0.1055, + "step": 44639 + }, + { + "epoch": 0.7962044732993258, + "grad_norm": 0.24909250438213348, + "learning_rate": 6.062544875846362e-06, + "loss": 0.1085, + "step": 44640 + }, + { + "epoch": 0.7962223094210394, + "grad_norm": 0.2788204848766327, + "learning_rate": 6.061528766424851e-06, + "loss": 0.0938, + "step": 44641 + }, + { + "epoch": 0.7962401455427531, + "grad_norm": 0.23912757635116577, + "learning_rate": 6.060512730415402e-06, + "loss": 0.0762, + "step": 44642 + }, + { + "epoch": 0.7962579816644669, + "grad_norm": 0.2298707365989685, + "learning_rate": 6.0594967678219455e-06, + "loss": 0.1152, + "step": 44643 + }, + { + "epoch": 0.7962758177861806, + "grad_norm": 0.2764599323272705, + "learning_rate": 6.058480878648423e-06, + "loss": 0.1827, + "step": 44644 + }, + { + "epoch": 0.7962936539078943, + "grad_norm": 0.22942933440208435, + "learning_rate": 6.05746506289877e-06, + "loss": 0.0749, + "step": 44645 + }, + { + "epoch": 0.796311490029608, + "grad_norm": 0.28335922956466675, + "learning_rate": 6.05644932057692e-06, + "loss": 0.128, + "step": 44646 + }, + { + "epoch": 0.7963293261513217, + "grad_norm": 0.3464876115322113, + "learning_rate": 6.055433651686821e-06, + "loss": 0.1522, + "step": 44647 + }, + { + "epoch": 0.7963471622730354, + "grad_norm": 0.24680723249912262, + "learning_rate": 6.054418056232397e-06, + "loss": 0.1112, + "step": 44648 + }, + { + "epoch": 0.7963649983947491, + "grad_norm": 0.31065842509269714, + "learning_rate": 6.053402534217601e-06, + "loss": 0.1648, + "step": 44649 + }, + { + "epoch": 0.7963828345164627, + "grad_norm": 0.3023083806037903, + "learning_rate": 6.052387085646349e-06, + "loss": 0.1195, + "step": 44650 + }, + { + "epoch": 0.7964006706381764, + "grad_norm": 0.2382739782333374, + "learning_rate": 6.051371710522599e-06, + "loss": 0.1388, + "step": 44651 + }, + { + "epoch": 0.7964185067598901, + "grad_norm": 0.2436676025390625, + "learning_rate": 6.050356408850277e-06, + "loss": 0.1158, + "step": 44652 + }, + { + "epoch": 0.7964363428816038, + "grad_norm": 0.30252712965011597, + "learning_rate": 6.0493411806333195e-06, + "loss": 0.178, + "step": 44653 + }, + { + "epoch": 0.7964541790033175, + "grad_norm": 0.27006739377975464, + "learning_rate": 6.048326025875653e-06, + "loss": 0.1361, + "step": 44654 + }, + { + "epoch": 0.7964720151250312, + "grad_norm": 0.2818959057331085, + "learning_rate": 6.047310944581228e-06, + "loss": 0.1102, + "step": 44655 + }, + { + "epoch": 0.7964898512467449, + "grad_norm": 0.2770169973373413, + "learning_rate": 6.046295936753971e-06, + "loss": 0.1406, + "step": 44656 + }, + { + "epoch": 0.7965076873684586, + "grad_norm": 0.422354519367218, + "learning_rate": 6.045281002397818e-06, + "loss": 0.1102, + "step": 44657 + }, + { + "epoch": 0.7965255234901722, + "grad_norm": 0.30090510845184326, + "learning_rate": 6.0442661415167026e-06, + "loss": 0.1127, + "step": 44658 + }, + { + "epoch": 0.7965433596118859, + "grad_norm": 0.20314498245716095, + "learning_rate": 6.043251354114552e-06, + "loss": 0.1054, + "step": 44659 + }, + { + "epoch": 0.7965611957335997, + "grad_norm": 0.32014185190200806, + "learning_rate": 6.042236640195312e-06, + "loss": 0.1379, + "step": 44660 + }, + { + "epoch": 0.7965790318553134, + "grad_norm": 0.3027096092700958, + "learning_rate": 6.041221999762911e-06, + "loss": 0.1125, + "step": 44661 + }, + { + "epoch": 0.7965968679770271, + "grad_norm": 0.2541898787021637, + "learning_rate": 6.0402074328212764e-06, + "loss": 0.0664, + "step": 44662 + }, + { + "epoch": 0.7966147040987408, + "grad_norm": 0.2724062204360962, + "learning_rate": 6.039192939374344e-06, + "loss": 0.0824, + "step": 44663 + }, + { + "epoch": 0.7966325402204545, + "grad_norm": 0.2679769992828369, + "learning_rate": 6.038178519426057e-06, + "loss": 0.0988, + "step": 44664 + }, + { + "epoch": 0.7966503763421682, + "grad_norm": 0.23171375691890717, + "learning_rate": 6.0371641729803395e-06, + "loss": 0.1044, + "step": 44665 + }, + { + "epoch": 0.7966682124638819, + "grad_norm": 0.2951165735721588, + "learning_rate": 6.036149900041124e-06, + "loss": 0.0964, + "step": 44666 + }, + { + "epoch": 0.7966860485855956, + "grad_norm": 0.28471213579177856, + "learning_rate": 6.035135700612332e-06, + "loss": 0.1364, + "step": 44667 + }, + { + "epoch": 0.7967038847073092, + "grad_norm": 0.34432467818260193, + "learning_rate": 6.034121574697912e-06, + "loss": 0.1306, + "step": 44668 + }, + { + "epoch": 0.7967217208290229, + "grad_norm": 0.24009296298027039, + "learning_rate": 6.03310752230179e-06, + "loss": 0.1078, + "step": 44669 + }, + { + "epoch": 0.7967395569507366, + "grad_norm": 0.2597629129886627, + "learning_rate": 6.032093543427892e-06, + "loss": 0.0942, + "step": 44670 + }, + { + "epoch": 0.7967573930724503, + "grad_norm": 0.23199214041233063, + "learning_rate": 6.03107963808015e-06, + "loss": 0.1127, + "step": 44671 + }, + { + "epoch": 0.796775229194164, + "grad_norm": 0.2756112515926361, + "learning_rate": 6.030065806262489e-06, + "loss": 0.1017, + "step": 44672 + }, + { + "epoch": 0.7967930653158777, + "grad_norm": 0.26526373624801636, + "learning_rate": 6.0290520479788506e-06, + "loss": 0.1064, + "step": 44673 + }, + { + "epoch": 0.7968109014375914, + "grad_norm": 0.29982566833496094, + "learning_rate": 6.02803836323316e-06, + "loss": 0.1343, + "step": 44674 + }, + { + "epoch": 0.7968287375593051, + "grad_norm": 0.2448246330022812, + "learning_rate": 6.02702475202934e-06, + "loss": 0.0778, + "step": 44675 + }, + { + "epoch": 0.7968465736810187, + "grad_norm": 0.2666827440261841, + "learning_rate": 6.02601121437133e-06, + "loss": 0.0803, + "step": 44676 + }, + { + "epoch": 0.7968644098027325, + "grad_norm": 0.2296871840953827, + "learning_rate": 6.024997750263045e-06, + "loss": 0.1407, + "step": 44677 + }, + { + "epoch": 0.7968822459244462, + "grad_norm": 0.2853383719921112, + "learning_rate": 6.023984359708432e-06, + "loss": 0.1397, + "step": 44678 + }, + { + "epoch": 0.7969000820461599, + "grad_norm": 0.25889334082603455, + "learning_rate": 6.0229710427114115e-06, + "loss": 0.127, + "step": 44679 + }, + { + "epoch": 0.7969179181678736, + "grad_norm": 0.22680220007896423, + "learning_rate": 6.021957799275898e-06, + "loss": 0.1438, + "step": 44680 + }, + { + "epoch": 0.7969357542895873, + "grad_norm": 0.31417372822761536, + "learning_rate": 6.020944629405842e-06, + "loss": 0.1408, + "step": 44681 + }, + { + "epoch": 0.796953590411301, + "grad_norm": 0.25277039408683777, + "learning_rate": 6.019931533105158e-06, + "loss": 0.0941, + "step": 44682 + }, + { + "epoch": 0.7969714265330147, + "grad_norm": 0.35460928082466125, + "learning_rate": 6.018918510377777e-06, + "loss": 0.1429, + "step": 44683 + }, + { + "epoch": 0.7969892626547284, + "grad_norm": 0.28174686431884766, + "learning_rate": 6.017905561227621e-06, + "loss": 0.1385, + "step": 44684 + }, + { + "epoch": 0.797007098776442, + "grad_norm": 0.24566136300563812, + "learning_rate": 6.0168926856586126e-06, + "loss": 0.0646, + "step": 44685 + }, + { + "epoch": 0.7970249348981557, + "grad_norm": 0.27640655636787415, + "learning_rate": 6.015879883674694e-06, + "loss": 0.1158, + "step": 44686 + }, + { + "epoch": 0.7970427710198694, + "grad_norm": 0.4684193730354309, + "learning_rate": 6.014867155279779e-06, + "loss": 0.1207, + "step": 44687 + }, + { + "epoch": 0.7970606071415831, + "grad_norm": 0.3309164047241211, + "learning_rate": 6.0138545004777976e-06, + "loss": 0.1212, + "step": 44688 + }, + { + "epoch": 0.7970784432632968, + "grad_norm": 0.21871930360794067, + "learning_rate": 6.012841919272666e-06, + "loss": 0.123, + "step": 44689 + }, + { + "epoch": 0.7970962793850105, + "grad_norm": 0.37281590700149536, + "learning_rate": 6.011829411668318e-06, + "loss": 0.1214, + "step": 44690 + }, + { + "epoch": 0.7971141155067242, + "grad_norm": 0.4736943542957306, + "learning_rate": 6.010816977668684e-06, + "loss": 0.1526, + "step": 44691 + }, + { + "epoch": 0.7971319516284379, + "grad_norm": 0.22863249480724335, + "learning_rate": 6.009804617277681e-06, + "loss": 0.0725, + "step": 44692 + }, + { + "epoch": 0.7971497877501517, + "grad_norm": 0.19314421713352203, + "learning_rate": 6.00879233049923e-06, + "loss": 0.072, + "step": 44693 + }, + { + "epoch": 0.7971676238718653, + "grad_norm": 0.2543048858642578, + "learning_rate": 6.007780117337264e-06, + "loss": 0.1051, + "step": 44694 + }, + { + "epoch": 0.797185459993579, + "grad_norm": 0.29834023118019104, + "learning_rate": 6.0067679777957045e-06, + "loss": 0.1117, + "step": 44695 + }, + { + "epoch": 0.7972032961152927, + "grad_norm": 0.3886660039424896, + "learning_rate": 6.005755911878469e-06, + "loss": 0.1145, + "step": 44696 + }, + { + "epoch": 0.7972211322370064, + "grad_norm": 0.31640034914016724, + "learning_rate": 6.004743919589487e-06, + "loss": 0.191, + "step": 44697 + }, + { + "epoch": 0.7972389683587201, + "grad_norm": 0.3111501634120941, + "learning_rate": 6.003732000932669e-06, + "loss": 0.1042, + "step": 44698 + }, + { + "epoch": 0.7972568044804338, + "grad_norm": 0.2425641417503357, + "learning_rate": 6.002720155911956e-06, + "loss": 0.1194, + "step": 44699 + }, + { + "epoch": 0.7972746406021475, + "grad_norm": 0.26602521538734436, + "learning_rate": 6.00170838453126e-06, + "loss": 0.1091, + "step": 44700 + }, + { + "epoch": 0.7972924767238612, + "grad_norm": 0.28066277503967285, + "learning_rate": 6.000696686794502e-06, + "loss": 0.1315, + "step": 44701 + }, + { + "epoch": 0.7973103128455749, + "grad_norm": 0.2511526942253113, + "learning_rate": 5.999685062705601e-06, + "loss": 0.0851, + "step": 44702 + }, + { + "epoch": 0.7973281489672885, + "grad_norm": 0.18181085586547852, + "learning_rate": 5.99867351226849e-06, + "loss": 0.098, + "step": 44703 + }, + { + "epoch": 0.7973459850890022, + "grad_norm": 0.24556975066661835, + "learning_rate": 5.997662035487075e-06, + "loss": 0.1115, + "step": 44704 + }, + { + "epoch": 0.7973638212107159, + "grad_norm": 0.3535599708557129, + "learning_rate": 5.996650632365294e-06, + "loss": 0.1115, + "step": 44705 + }, + { + "epoch": 0.7973816573324296, + "grad_norm": 0.22648532688617706, + "learning_rate": 5.995639302907052e-06, + "loss": 0.1191, + "step": 44706 + }, + { + "epoch": 0.7973994934541433, + "grad_norm": 0.27012690901756287, + "learning_rate": 5.994628047116282e-06, + "loss": 0.0924, + "step": 44707 + }, + { + "epoch": 0.797417329575857, + "grad_norm": 0.3251741826534271, + "learning_rate": 5.993616864996898e-06, + "loss": 0.1351, + "step": 44708 + }, + { + "epoch": 0.7974351656975707, + "grad_norm": 0.33112552762031555, + "learning_rate": 5.992605756552819e-06, + "loss": 0.1522, + "step": 44709 + }, + { + "epoch": 0.7974530018192845, + "grad_norm": 0.2304413616657257, + "learning_rate": 5.991594721787966e-06, + "loss": 0.1318, + "step": 44710 + }, + { + "epoch": 0.7974708379409982, + "grad_norm": 0.2530154883861542, + "learning_rate": 5.99058376070625e-06, + "loss": 0.0619, + "step": 44711 + }, + { + "epoch": 0.7974886740627118, + "grad_norm": 0.290822297334671, + "learning_rate": 5.989572873311605e-06, + "loss": 0.093, + "step": 44712 + }, + { + "epoch": 0.7975065101844255, + "grad_norm": 0.27113252878189087, + "learning_rate": 5.988562059607939e-06, + "loss": 0.1283, + "step": 44713 + }, + { + "epoch": 0.7975243463061392, + "grad_norm": 0.18801848590373993, + "learning_rate": 5.987551319599174e-06, + "loss": 0.0841, + "step": 44714 + }, + { + "epoch": 0.7975421824278529, + "grad_norm": 0.3197059631347656, + "learning_rate": 5.986540653289221e-06, + "loss": 0.0869, + "step": 44715 + }, + { + "epoch": 0.7975600185495666, + "grad_norm": 0.2285647839307785, + "learning_rate": 5.98553006068201e-06, + "loss": 0.0883, + "step": 44716 + }, + { + "epoch": 0.7975778546712803, + "grad_norm": 0.26284387707710266, + "learning_rate": 5.984519541781444e-06, + "loss": 0.0914, + "step": 44717 + }, + { + "epoch": 0.797595690792994, + "grad_norm": 0.2824154496192932, + "learning_rate": 5.9835090965914555e-06, + "loss": 0.1089, + "step": 44718 + }, + { + "epoch": 0.7976135269147077, + "grad_norm": 0.2424059361219406, + "learning_rate": 5.9824987251159455e-06, + "loss": 0.1448, + "step": 44719 + }, + { + "epoch": 0.7976313630364213, + "grad_norm": 0.26008403301239014, + "learning_rate": 5.981488427358847e-06, + "loss": 0.1072, + "step": 44720 + }, + { + "epoch": 0.797649199158135, + "grad_norm": 0.32084017992019653, + "learning_rate": 5.980478203324069e-06, + "loss": 0.1134, + "step": 44721 + }, + { + "epoch": 0.7976670352798487, + "grad_norm": 0.40345853567123413, + "learning_rate": 5.9794680530155246e-06, + "loss": 0.1691, + "step": 44722 + }, + { + "epoch": 0.7976848714015624, + "grad_norm": 0.34984374046325684, + "learning_rate": 5.978457976437132e-06, + "loss": 0.1388, + "step": 44723 + }, + { + "epoch": 0.7977027075232761, + "grad_norm": 0.2532453238964081, + "learning_rate": 5.977447973592801e-06, + "loss": 0.1023, + "step": 44724 + }, + { + "epoch": 0.7977205436449898, + "grad_norm": 0.2549111545085907, + "learning_rate": 5.976438044486457e-06, + "loss": 0.1178, + "step": 44725 + }, + { + "epoch": 0.7977383797667035, + "grad_norm": 0.29619550704956055, + "learning_rate": 5.975428189122009e-06, + "loss": 0.1309, + "step": 44726 + }, + { + "epoch": 0.7977562158884173, + "grad_norm": 0.29465508460998535, + "learning_rate": 5.974418407503374e-06, + "loss": 0.1491, + "step": 44727 + }, + { + "epoch": 0.797774052010131, + "grad_norm": 0.36146080493927, + "learning_rate": 5.973408699634459e-06, + "loss": 0.1108, + "step": 44728 + }, + { + "epoch": 0.7977918881318447, + "grad_norm": 0.30643779039382935, + "learning_rate": 5.972399065519188e-06, + "loss": 0.195, + "step": 44729 + }, + { + "epoch": 0.7978097242535583, + "grad_norm": 0.34295615553855896, + "learning_rate": 5.97138950516147e-06, + "loss": 0.1128, + "step": 44730 + }, + { + "epoch": 0.797827560375272, + "grad_norm": 0.2404564768075943, + "learning_rate": 5.970380018565211e-06, + "loss": 0.1023, + "step": 44731 + }, + { + "epoch": 0.7978453964969857, + "grad_norm": 0.31376343965530396, + "learning_rate": 5.969370605734339e-06, + "loss": 0.0984, + "step": 44732 + }, + { + "epoch": 0.7978632326186994, + "grad_norm": 0.2687996029853821, + "learning_rate": 5.968361266672754e-06, + "loss": 0.0991, + "step": 44733 + }, + { + "epoch": 0.7978810687404131, + "grad_norm": 0.20550496876239777, + "learning_rate": 5.96735200138438e-06, + "loss": 0.1258, + "step": 44734 + }, + { + "epoch": 0.7978989048621268, + "grad_norm": 0.27825260162353516, + "learning_rate": 5.966342809873124e-06, + "loss": 0.1251, + "step": 44735 + }, + { + "epoch": 0.7979167409838405, + "grad_norm": 0.22712254524230957, + "learning_rate": 5.965333692142896e-06, + "loss": 0.1051, + "step": 44736 + }, + { + "epoch": 0.7979345771055542, + "grad_norm": 0.30901479721069336, + "learning_rate": 5.964324648197603e-06, + "loss": 0.1157, + "step": 44737 + }, + { + "epoch": 0.7979524132272678, + "grad_norm": 0.26727601885795593, + "learning_rate": 5.9633156780411705e-06, + "loss": 0.0934, + "step": 44738 + }, + { + "epoch": 0.7979702493489815, + "grad_norm": 0.22644126415252686, + "learning_rate": 5.962306781677499e-06, + "loss": 0.1207, + "step": 44739 + }, + { + "epoch": 0.7979880854706952, + "grad_norm": 0.27483099699020386, + "learning_rate": 5.961297959110504e-06, + "loss": 0.0998, + "step": 44740 + }, + { + "epoch": 0.7980059215924089, + "grad_norm": 0.33201122283935547, + "learning_rate": 5.960289210344086e-06, + "loss": 0.1532, + "step": 44741 + }, + { + "epoch": 0.7980237577141226, + "grad_norm": 0.289569616317749, + "learning_rate": 5.9592805353821695e-06, + "loss": 0.1328, + "step": 44742 + }, + { + "epoch": 0.7980415938358363, + "grad_norm": 0.30282092094421387, + "learning_rate": 5.958271934228659e-06, + "loss": 0.1476, + "step": 44743 + }, + { + "epoch": 0.7980594299575501, + "grad_norm": 0.2952158749103546, + "learning_rate": 5.957263406887456e-06, + "loss": 0.1093, + "step": 44744 + }, + { + "epoch": 0.7980772660792638, + "grad_norm": 0.2503235340118408, + "learning_rate": 5.956254953362483e-06, + "loss": 0.1085, + "step": 44745 + }, + { + "epoch": 0.7980951022009775, + "grad_norm": 0.35521817207336426, + "learning_rate": 5.9552465736576385e-06, + "loss": 0.1364, + "step": 44746 + }, + { + "epoch": 0.7981129383226911, + "grad_norm": 0.2889169454574585, + "learning_rate": 5.954238267776841e-06, + "loss": 0.1091, + "step": 44747 + }, + { + "epoch": 0.7981307744444048, + "grad_norm": 0.2498825639486313, + "learning_rate": 5.9532300357239944e-06, + "loss": 0.099, + "step": 44748 + }, + { + "epoch": 0.7981486105661185, + "grad_norm": 0.25176000595092773, + "learning_rate": 5.952221877503009e-06, + "loss": 0.0684, + "step": 44749 + }, + { + "epoch": 0.7981664466878322, + "grad_norm": 0.2770783305168152, + "learning_rate": 5.951213793117783e-06, + "loss": 0.097, + "step": 44750 + }, + { + "epoch": 0.7981842828095459, + "grad_norm": 0.2908453345298767, + "learning_rate": 5.950205782572235e-06, + "loss": 0.114, + "step": 44751 + }, + { + "epoch": 0.7982021189312596, + "grad_norm": 0.35370782017707825, + "learning_rate": 5.9491978458702716e-06, + "loss": 0.1459, + "step": 44752 + }, + { + "epoch": 0.7982199550529733, + "grad_norm": 0.2853609025478363, + "learning_rate": 5.948189983015798e-06, + "loss": 0.1001, + "step": 44753 + }, + { + "epoch": 0.798237791174687, + "grad_norm": 0.2976193428039551, + "learning_rate": 5.947182194012712e-06, + "loss": 0.1096, + "step": 44754 + }, + { + "epoch": 0.7982556272964006, + "grad_norm": 0.3756830096244812, + "learning_rate": 5.946174478864938e-06, + "loss": 0.1281, + "step": 44755 + }, + { + "epoch": 0.7982734634181143, + "grad_norm": 0.2182513028383255, + "learning_rate": 5.945166837576369e-06, + "loss": 0.1608, + "step": 44756 + }, + { + "epoch": 0.798291299539828, + "grad_norm": 0.20052571594715118, + "learning_rate": 5.944159270150917e-06, + "loss": 0.0709, + "step": 44757 + }, + { + "epoch": 0.7983091356615417, + "grad_norm": 0.21268951892852783, + "learning_rate": 5.943151776592479e-06, + "loss": 0.1195, + "step": 44758 + }, + { + "epoch": 0.7983269717832554, + "grad_norm": 0.18963152170181274, + "learning_rate": 5.9421443569049715e-06, + "loss": 0.0838, + "step": 44759 + }, + { + "epoch": 0.7983448079049691, + "grad_norm": 0.21648621559143066, + "learning_rate": 5.941137011092288e-06, + "loss": 0.0591, + "step": 44760 + }, + { + "epoch": 0.7983626440266829, + "grad_norm": 0.29309189319610596, + "learning_rate": 5.940129739158348e-06, + "loss": 0.139, + "step": 44761 + }, + { + "epoch": 0.7983804801483966, + "grad_norm": 0.26586008071899414, + "learning_rate": 5.939122541107048e-06, + "loss": 0.0551, + "step": 44762 + }, + { + "epoch": 0.7983983162701103, + "grad_norm": 0.2932027280330658, + "learning_rate": 5.938115416942286e-06, + "loss": 0.1666, + "step": 44763 + }, + { + "epoch": 0.798416152391824, + "grad_norm": 0.28911998867988586, + "learning_rate": 5.937108366667979e-06, + "loss": 0.0879, + "step": 44764 + }, + { + "epoch": 0.7984339885135376, + "grad_norm": 0.2317652851343155, + "learning_rate": 5.936101390288023e-06, + "loss": 0.0831, + "step": 44765 + }, + { + "epoch": 0.7984518246352513, + "grad_norm": 0.22962601482868195, + "learning_rate": 5.9350944878063256e-06, + "loss": 0.1318, + "step": 44766 + }, + { + "epoch": 0.798469660756965, + "grad_norm": 0.3798615038394928, + "learning_rate": 5.934087659226775e-06, + "loss": 0.1247, + "step": 44767 + }, + { + "epoch": 0.7984874968786787, + "grad_norm": 0.2868122458457947, + "learning_rate": 5.933080904553298e-06, + "loss": 0.083, + "step": 44768 + }, + { + "epoch": 0.7985053330003924, + "grad_norm": 0.374931663274765, + "learning_rate": 5.9320742237897825e-06, + "loss": 0.0878, + "step": 44769 + }, + { + "epoch": 0.7985231691221061, + "grad_norm": 0.3661167621612549, + "learning_rate": 5.9310676169401304e-06, + "loss": 0.1071, + "step": 44770 + }, + { + "epoch": 0.7985410052438198, + "grad_norm": 0.29262712597846985, + "learning_rate": 5.9300610840082425e-06, + "loss": 0.1313, + "step": 44771 + }, + { + "epoch": 0.7985588413655335, + "grad_norm": 0.22950060665607452, + "learning_rate": 5.929054624998032e-06, + "loss": 0.1068, + "step": 44772 + }, + { + "epoch": 0.7985766774872471, + "grad_norm": 0.24029165506362915, + "learning_rate": 5.928048239913384e-06, + "loss": 0.1175, + "step": 44773 + }, + { + "epoch": 0.7985945136089608, + "grad_norm": 0.23541101813316345, + "learning_rate": 5.927041928758217e-06, + "loss": 0.0694, + "step": 44774 + }, + { + "epoch": 0.7986123497306745, + "grad_norm": 0.28525310754776, + "learning_rate": 5.926035691536421e-06, + "loss": 0.1085, + "step": 44775 + }, + { + "epoch": 0.7986301858523882, + "grad_norm": 0.29268792271614075, + "learning_rate": 5.925029528251894e-06, + "loss": 0.1015, + "step": 44776 + }, + { + "epoch": 0.7986480219741019, + "grad_norm": 0.23851439356803894, + "learning_rate": 5.924023438908546e-06, + "loss": 0.0766, + "step": 44777 + }, + { + "epoch": 0.7986658580958157, + "grad_norm": 0.21398480236530304, + "learning_rate": 5.923017423510271e-06, + "loss": 0.0756, + "step": 44778 + }, + { + "epoch": 0.7986836942175294, + "grad_norm": 0.31902143359184265, + "learning_rate": 5.922011482060971e-06, + "loss": 0.1305, + "step": 44779 + }, + { + "epoch": 0.7987015303392431, + "grad_norm": 0.3252716064453125, + "learning_rate": 5.921005614564537e-06, + "loss": 0.14, + "step": 44780 + }, + { + "epoch": 0.7987193664609568, + "grad_norm": 0.26099663972854614, + "learning_rate": 5.919999821024883e-06, + "loss": 0.1109, + "step": 44781 + }, + { + "epoch": 0.7987372025826704, + "grad_norm": 0.3772627115249634, + "learning_rate": 5.918994101445896e-06, + "loss": 0.1663, + "step": 44782 + }, + { + "epoch": 0.7987550387043841, + "grad_norm": 0.2689274251461029, + "learning_rate": 5.91798845583148e-06, + "loss": 0.1152, + "step": 44783 + }, + { + "epoch": 0.7987728748260978, + "grad_norm": 0.3023391366004944, + "learning_rate": 5.916982884185526e-06, + "loss": 0.1308, + "step": 44784 + }, + { + "epoch": 0.7987907109478115, + "grad_norm": 0.37498417496681213, + "learning_rate": 5.915977386511942e-06, + "loss": 0.1142, + "step": 44785 + }, + { + "epoch": 0.7988085470695252, + "grad_norm": 0.3197740316390991, + "learning_rate": 5.914971962814622e-06, + "loss": 0.1826, + "step": 44786 + }, + { + "epoch": 0.7988263831912389, + "grad_norm": 0.2431715875864029, + "learning_rate": 5.9139666130974574e-06, + "loss": 0.1326, + "step": 44787 + }, + { + "epoch": 0.7988442193129526, + "grad_norm": 0.4366306662559509, + "learning_rate": 5.9129613373643536e-06, + "loss": 0.0868, + "step": 44788 + }, + { + "epoch": 0.7988620554346663, + "grad_norm": 0.42088744044303894, + "learning_rate": 5.911956135619201e-06, + "loss": 0.1481, + "step": 44789 + }, + { + "epoch": 0.79887989155638, + "grad_norm": 0.32417958974838257, + "learning_rate": 5.910951007865903e-06, + "loss": 0.1423, + "step": 44790 + }, + { + "epoch": 0.7988977276780936, + "grad_norm": 0.3563603162765503, + "learning_rate": 5.909945954108354e-06, + "loss": 0.0791, + "step": 44791 + }, + { + "epoch": 0.7989155637998073, + "grad_norm": 0.20565392076969147, + "learning_rate": 5.908940974350446e-06, + "loss": 0.1102, + "step": 44792 + }, + { + "epoch": 0.798933399921521, + "grad_norm": 0.25399577617645264, + "learning_rate": 5.90793606859607e-06, + "loss": 0.1182, + "step": 44793 + }, + { + "epoch": 0.7989512360432348, + "grad_norm": 0.3509679436683655, + "learning_rate": 5.9069312368491365e-06, + "loss": 0.116, + "step": 44794 + }, + { + "epoch": 0.7989690721649485, + "grad_norm": 0.2200997918844223, + "learning_rate": 5.9059264791135326e-06, + "loss": 0.0994, + "step": 44795 + }, + { + "epoch": 0.7989869082866622, + "grad_norm": 0.37163692712783813, + "learning_rate": 5.904921795393151e-06, + "loss": 0.1444, + "step": 44796 + }, + { + "epoch": 0.7990047444083759, + "grad_norm": 0.4139951467514038, + "learning_rate": 5.903917185691881e-06, + "loss": 0.1314, + "step": 44797 + }, + { + "epoch": 0.7990225805300896, + "grad_norm": 0.5621646046638489, + "learning_rate": 5.902912650013631e-06, + "loss": 0.1038, + "step": 44798 + }, + { + "epoch": 0.7990404166518033, + "grad_norm": 0.28355610370635986, + "learning_rate": 5.901908188362288e-06, + "loss": 0.1275, + "step": 44799 + }, + { + "epoch": 0.7990582527735169, + "grad_norm": 0.26689136028289795, + "learning_rate": 5.9009038007417365e-06, + "loss": 0.0829, + "step": 44800 + }, + { + "epoch": 0.7990760888952306, + "grad_norm": 0.17263464629650116, + "learning_rate": 5.899899487155886e-06, + "loss": 0.0763, + "step": 44801 + }, + { + "epoch": 0.7990939250169443, + "grad_norm": 0.2657405436038971, + "learning_rate": 5.898895247608616e-06, + "loss": 0.081, + "step": 44802 + }, + { + "epoch": 0.799111761138658, + "grad_norm": 0.27339357137680054, + "learning_rate": 5.897891082103832e-06, + "loss": 0.1332, + "step": 44803 + }, + { + "epoch": 0.7991295972603717, + "grad_norm": 0.17493490874767303, + "learning_rate": 5.89688699064542e-06, + "loss": 0.0746, + "step": 44804 + }, + { + "epoch": 0.7991474333820854, + "grad_norm": 0.2501816749572754, + "learning_rate": 5.895882973237271e-06, + "loss": 0.1197, + "step": 44805 + }, + { + "epoch": 0.7991652695037991, + "grad_norm": 0.24011845886707306, + "learning_rate": 5.894879029883271e-06, + "loss": 0.1051, + "step": 44806 + }, + { + "epoch": 0.7991831056255128, + "grad_norm": 0.17578662931919098, + "learning_rate": 5.893875160587325e-06, + "loss": 0.0816, + "step": 44807 + }, + { + "epoch": 0.7992009417472264, + "grad_norm": 0.2163763791322708, + "learning_rate": 5.89287136535332e-06, + "loss": 0.0955, + "step": 44808 + }, + { + "epoch": 0.7992187778689401, + "grad_norm": 0.22871968150138855, + "learning_rate": 5.891867644185145e-06, + "loss": 0.0851, + "step": 44809 + }, + { + "epoch": 0.7992366139906538, + "grad_norm": 0.298725426197052, + "learning_rate": 5.8908639970866815e-06, + "loss": 0.1068, + "step": 44810 + }, + { + "epoch": 0.7992544501123676, + "grad_norm": 0.2751402258872986, + "learning_rate": 5.889860424061838e-06, + "loss": 0.1447, + "step": 44811 + }, + { + "epoch": 0.7992722862340813, + "grad_norm": 0.2923526167869568, + "learning_rate": 5.888856925114494e-06, + "loss": 0.0838, + "step": 44812 + }, + { + "epoch": 0.799290122355795, + "grad_norm": 0.2836334705352783, + "learning_rate": 5.887853500248544e-06, + "loss": 0.179, + "step": 44813 + }, + { + "epoch": 0.7993079584775087, + "grad_norm": 0.21170085668563843, + "learning_rate": 5.886850149467868e-06, + "loss": 0.104, + "step": 44814 + }, + { + "epoch": 0.7993257945992224, + "grad_norm": 0.3128630220890045, + "learning_rate": 5.88584687277636e-06, + "loss": 0.0634, + "step": 44815 + }, + { + "epoch": 0.7993436307209361, + "grad_norm": 0.284605473279953, + "learning_rate": 5.8848436701779195e-06, + "loss": 0.0745, + "step": 44816 + }, + { + "epoch": 0.7993614668426497, + "grad_norm": 0.3086356818675995, + "learning_rate": 5.883840541676428e-06, + "loss": 0.1369, + "step": 44817 + }, + { + "epoch": 0.7993793029643634, + "grad_norm": 0.2522403299808502, + "learning_rate": 5.882837487275769e-06, + "loss": 0.1047, + "step": 44818 + }, + { + "epoch": 0.7993971390860771, + "grad_norm": 0.2797737717628479, + "learning_rate": 5.881834506979833e-06, + "loss": 0.0854, + "step": 44819 + }, + { + "epoch": 0.7994149752077908, + "grad_norm": 0.3291873335838318, + "learning_rate": 5.880831600792514e-06, + "loss": 0.1122, + "step": 44820 + }, + { + "epoch": 0.7994328113295045, + "grad_norm": 0.2671118676662445, + "learning_rate": 5.879828768717696e-06, + "loss": 0.1256, + "step": 44821 + }, + { + "epoch": 0.7994506474512182, + "grad_norm": 0.30141565203666687, + "learning_rate": 5.878826010759264e-06, + "loss": 0.1044, + "step": 44822 + }, + { + "epoch": 0.7994684835729319, + "grad_norm": 0.4954824149608612, + "learning_rate": 5.8778233269211e-06, + "loss": 0.1582, + "step": 44823 + }, + { + "epoch": 0.7994863196946456, + "grad_norm": 0.24249166250228882, + "learning_rate": 5.876820717207107e-06, + "loss": 0.1092, + "step": 44824 + }, + { + "epoch": 0.7995041558163593, + "grad_norm": 0.2866115868091583, + "learning_rate": 5.875818181621159e-06, + "loss": 0.11, + "step": 44825 + }, + { + "epoch": 0.7995219919380729, + "grad_norm": 0.28984498977661133, + "learning_rate": 5.874815720167145e-06, + "loss": 0.1341, + "step": 44826 + }, + { + "epoch": 0.7995398280597866, + "grad_norm": 0.220419779419899, + "learning_rate": 5.873813332848946e-06, + "loss": 0.0618, + "step": 44827 + }, + { + "epoch": 0.7995576641815004, + "grad_norm": 0.2809552550315857, + "learning_rate": 5.87281101967046e-06, + "loss": 0.0615, + "step": 44828 + }, + { + "epoch": 0.7995755003032141, + "grad_norm": 0.2695557177066803, + "learning_rate": 5.871808780635555e-06, + "loss": 0.1046, + "step": 44829 + }, + { + "epoch": 0.7995933364249278, + "grad_norm": 0.23549309372901917, + "learning_rate": 5.870806615748134e-06, + "loss": 0.1201, + "step": 44830 + }, + { + "epoch": 0.7996111725466415, + "grad_norm": 0.33861011266708374, + "learning_rate": 5.869804525012074e-06, + "loss": 0.1121, + "step": 44831 + }, + { + "epoch": 0.7996290086683552, + "grad_norm": 0.2549445629119873, + "learning_rate": 5.8688025084312505e-06, + "loss": 0.1328, + "step": 44832 + }, + { + "epoch": 0.7996468447900689, + "grad_norm": 0.3028838336467743, + "learning_rate": 5.867800566009565e-06, + "loss": 0.1569, + "step": 44833 + }, + { + "epoch": 0.7996646809117826, + "grad_norm": 0.252324640750885, + "learning_rate": 5.8667986977508915e-06, + "loss": 0.0899, + "step": 44834 + }, + { + "epoch": 0.7996825170334962, + "grad_norm": 0.2836589813232422, + "learning_rate": 5.865796903659115e-06, + "loss": 0.1394, + "step": 44835 + }, + { + "epoch": 0.7997003531552099, + "grad_norm": 0.22945933043956757, + "learning_rate": 5.8647951837381114e-06, + "loss": 0.0957, + "step": 44836 + }, + { + "epoch": 0.7997181892769236, + "grad_norm": 0.36276066303253174, + "learning_rate": 5.863793537991779e-06, + "loss": 0.071, + "step": 44837 + }, + { + "epoch": 0.7997360253986373, + "grad_norm": 0.2796826660633087, + "learning_rate": 5.8627919664239915e-06, + "loss": 0.0829, + "step": 44838 + }, + { + "epoch": 0.799753861520351, + "grad_norm": 0.2649815082550049, + "learning_rate": 5.86179046903863e-06, + "loss": 0.1025, + "step": 44839 + }, + { + "epoch": 0.7997716976420647, + "grad_norm": 0.26657721400260925, + "learning_rate": 5.860789045839582e-06, + "loss": 0.0982, + "step": 44840 + }, + { + "epoch": 0.7997895337637784, + "grad_norm": 0.2324734777212143, + "learning_rate": 5.859787696830718e-06, + "loss": 0.1298, + "step": 44841 + }, + { + "epoch": 0.7998073698854921, + "grad_norm": 0.21991927921772003, + "learning_rate": 5.8587864220159325e-06, + "loss": 0.1046, + "step": 44842 + }, + { + "epoch": 0.7998252060072057, + "grad_norm": 0.20212183892726898, + "learning_rate": 5.857785221399098e-06, + "loss": 0.1175, + "step": 44843 + }, + { + "epoch": 0.7998430421289194, + "grad_norm": 0.365593820810318, + "learning_rate": 5.856784094984105e-06, + "loss": 0.1121, + "step": 44844 + }, + { + "epoch": 0.7998608782506332, + "grad_norm": 0.24445931613445282, + "learning_rate": 5.855783042774823e-06, + "loss": 0.0966, + "step": 44845 + }, + { + "epoch": 0.7998787143723469, + "grad_norm": 0.3151172399520874, + "learning_rate": 5.854782064775144e-06, + "loss": 0.1302, + "step": 44846 + }, + { + "epoch": 0.7998965504940606, + "grad_norm": 0.22199754416942596, + "learning_rate": 5.8537811609889415e-06, + "loss": 0.07, + "step": 44847 + }, + { + "epoch": 0.7999143866157743, + "grad_norm": 0.32313480973243713, + "learning_rate": 5.852780331420096e-06, + "loss": 0.1293, + "step": 44848 + }, + { + "epoch": 0.799932222737488, + "grad_norm": 0.20466674864292145, + "learning_rate": 5.851779576072483e-06, + "loss": 0.1114, + "step": 44849 + }, + { + "epoch": 0.7999500588592017, + "grad_norm": 0.2798991799354553, + "learning_rate": 5.8507788949499905e-06, + "loss": 0.1249, + "step": 44850 + }, + { + "epoch": 0.7999678949809154, + "grad_norm": 0.36040040850639343, + "learning_rate": 5.849778288056492e-06, + "loss": 0.1169, + "step": 44851 + }, + { + "epoch": 0.799985731102629, + "grad_norm": 0.3420540988445282, + "learning_rate": 5.848777755395871e-06, + "loss": 0.0663, + "step": 44852 + }, + { + "epoch": 0.8000035672243427, + "grad_norm": 0.27414995431900024, + "learning_rate": 5.847777296971998e-06, + "loss": 0.1217, + "step": 44853 + }, + { + "epoch": 0.8000214033460564, + "grad_norm": 0.20114260911941528, + "learning_rate": 5.846776912788751e-06, + "loss": 0.0662, + "step": 44854 + }, + { + "epoch": 0.8000392394677701, + "grad_norm": 0.2768368422985077, + "learning_rate": 5.8457766028500175e-06, + "loss": 0.1094, + "step": 44855 + }, + { + "epoch": 0.8000570755894838, + "grad_norm": 0.2530750036239624, + "learning_rate": 5.844776367159663e-06, + "loss": 0.1176, + "step": 44856 + }, + { + "epoch": 0.8000749117111975, + "grad_norm": 0.23680151998996735, + "learning_rate": 5.8437762057215796e-06, + "loss": 0.0809, + "step": 44857 + }, + { + "epoch": 0.8000927478329112, + "grad_norm": 0.24132221937179565, + "learning_rate": 5.842776118539628e-06, + "loss": 0.1462, + "step": 44858 + }, + { + "epoch": 0.8001105839546249, + "grad_norm": 0.24527911841869354, + "learning_rate": 5.841776105617699e-06, + "loss": 0.087, + "step": 44859 + }, + { + "epoch": 0.8001284200763386, + "grad_norm": 0.2281140834093094, + "learning_rate": 5.840776166959663e-06, + "loss": 0.059, + "step": 44860 + }, + { + "epoch": 0.8001462561980522, + "grad_norm": 0.39640456438064575, + "learning_rate": 5.839776302569397e-06, + "loss": 0.0838, + "step": 44861 + }, + { + "epoch": 0.800164092319766, + "grad_norm": 0.2991827428340912, + "learning_rate": 5.838776512450767e-06, + "loss": 0.1645, + "step": 44862 + }, + { + "epoch": 0.8001819284414797, + "grad_norm": 0.25122809410095215, + "learning_rate": 5.837776796607666e-06, + "loss": 0.1443, + "step": 44863 + }, + { + "epoch": 0.8001997645631934, + "grad_norm": 0.3652280867099762, + "learning_rate": 5.83677715504396e-06, + "loss": 0.1713, + "step": 44864 + }, + { + "epoch": 0.8002176006849071, + "grad_norm": 0.283097505569458, + "learning_rate": 5.835777587763524e-06, + "loss": 0.1034, + "step": 44865 + }, + { + "epoch": 0.8002354368066208, + "grad_norm": 0.35019248723983765, + "learning_rate": 5.8347780947702324e-06, + "loss": 0.0533, + "step": 44866 + }, + { + "epoch": 0.8002532729283345, + "grad_norm": 0.2504485547542572, + "learning_rate": 5.833778676067955e-06, + "loss": 0.0958, + "step": 44867 + }, + { + "epoch": 0.8002711090500482, + "grad_norm": 0.2404722273349762, + "learning_rate": 5.832779331660579e-06, + "loss": 0.1273, + "step": 44868 + }, + { + "epoch": 0.8002889451717619, + "grad_norm": 0.2237074375152588, + "learning_rate": 5.83178006155197e-06, + "loss": 0.0864, + "step": 44869 + }, + { + "epoch": 0.8003067812934755, + "grad_norm": 0.2370162308216095, + "learning_rate": 5.830780865745994e-06, + "loss": 0.1284, + "step": 44870 + }, + { + "epoch": 0.8003246174151892, + "grad_norm": 0.25863733887672424, + "learning_rate": 5.829781744246532e-06, + "loss": 0.134, + "step": 44871 + }, + { + "epoch": 0.8003424535369029, + "grad_norm": 0.2798623740673065, + "learning_rate": 5.828782697057464e-06, + "loss": 0.1293, + "step": 44872 + }, + { + "epoch": 0.8003602896586166, + "grad_norm": 0.27565184235572815, + "learning_rate": 5.827783724182656e-06, + "loss": 0.143, + "step": 44873 + }, + { + "epoch": 0.8003781257803303, + "grad_norm": 0.31971368193626404, + "learning_rate": 5.82678482562598e-06, + "loss": 0.1029, + "step": 44874 + }, + { + "epoch": 0.800395961902044, + "grad_norm": 0.25721362233161926, + "learning_rate": 5.8257860013913035e-06, + "loss": 0.1667, + "step": 44875 + }, + { + "epoch": 0.8004137980237577, + "grad_norm": 0.2920261323451996, + "learning_rate": 5.824787251482508e-06, + "loss": 0.0892, + "step": 44876 + }, + { + "epoch": 0.8004316341454714, + "grad_norm": 0.24468368291854858, + "learning_rate": 5.82378857590346e-06, + "loss": 0.1073, + "step": 44877 + }, + { + "epoch": 0.800449470267185, + "grad_norm": 0.29385194182395935, + "learning_rate": 5.82278997465803e-06, + "loss": 0.0745, + "step": 44878 + }, + { + "epoch": 0.8004673063888988, + "grad_norm": 0.32971900701522827, + "learning_rate": 5.821791447750092e-06, + "loss": 0.1335, + "step": 44879 + }, + { + "epoch": 0.8004851425106125, + "grad_norm": 0.377947598695755, + "learning_rate": 5.820792995183505e-06, + "loss": 0.1464, + "step": 44880 + }, + { + "epoch": 0.8005029786323262, + "grad_norm": 0.22087959945201874, + "learning_rate": 5.819794616962157e-06, + "loss": 0.093, + "step": 44881 + }, + { + "epoch": 0.8005208147540399, + "grad_norm": 0.24202612042427063, + "learning_rate": 5.818796313089911e-06, + "loss": 0.0985, + "step": 44882 + }, + { + "epoch": 0.8005386508757536, + "grad_norm": 0.2946437895298004, + "learning_rate": 5.8177980835706244e-06, + "loss": 0.1381, + "step": 44883 + }, + { + "epoch": 0.8005564869974673, + "grad_norm": 0.21484124660491943, + "learning_rate": 5.81679992840819e-06, + "loss": 0.0929, + "step": 44884 + }, + { + "epoch": 0.800574323119181, + "grad_norm": 0.36438870429992676, + "learning_rate": 5.815801847606453e-06, + "loss": 0.1202, + "step": 44885 + }, + { + "epoch": 0.8005921592408947, + "grad_norm": 0.3173169493675232, + "learning_rate": 5.814803841169303e-06, + "loss": 0.1715, + "step": 44886 + }, + { + "epoch": 0.8006099953626084, + "grad_norm": 0.27175572514533997, + "learning_rate": 5.813805909100601e-06, + "loss": 0.1258, + "step": 44887 + }, + { + "epoch": 0.800627831484322, + "grad_norm": 0.325126975774765, + "learning_rate": 5.8128080514042034e-06, + "loss": 0.1532, + "step": 44888 + }, + { + "epoch": 0.8006456676060357, + "grad_norm": 0.27757707238197327, + "learning_rate": 5.811810268084001e-06, + "loss": 0.1217, + "step": 44889 + }, + { + "epoch": 0.8006635037277494, + "grad_norm": 0.2835046947002411, + "learning_rate": 5.810812559143844e-06, + "loss": 0.2006, + "step": 44890 + }, + { + "epoch": 0.8006813398494631, + "grad_norm": 0.25689783692359924, + "learning_rate": 5.809814924587609e-06, + "loss": 0.1409, + "step": 44891 + }, + { + "epoch": 0.8006991759711768, + "grad_norm": 0.3308696150779724, + "learning_rate": 5.8088173644191586e-06, + "loss": 0.0936, + "step": 44892 + }, + { + "epoch": 0.8007170120928905, + "grad_norm": 0.3758254647254944, + "learning_rate": 5.807819878642351e-06, + "loss": 0.1516, + "step": 44893 + }, + { + "epoch": 0.8007348482146042, + "grad_norm": 0.22307826578617096, + "learning_rate": 5.806822467261072e-06, + "loss": 0.1058, + "step": 44894 + }, + { + "epoch": 0.800752684336318, + "grad_norm": 0.303335040807724, + "learning_rate": 5.805825130279177e-06, + "loss": 0.088, + "step": 44895 + }, + { + "epoch": 0.8007705204580317, + "grad_norm": 0.21667756140232086, + "learning_rate": 5.8048278677005355e-06, + "loss": 0.1272, + "step": 44896 + }, + { + "epoch": 0.8007883565797453, + "grad_norm": 0.23222732543945312, + "learning_rate": 5.803830679529001e-06, + "loss": 0.0619, + "step": 44897 + }, + { + "epoch": 0.800806192701459, + "grad_norm": 0.23286797106266022, + "learning_rate": 5.8028335657684506e-06, + "loss": 0.1616, + "step": 44898 + }, + { + "epoch": 0.8008240288231727, + "grad_norm": 0.2144184410572052, + "learning_rate": 5.801836526422755e-06, + "loss": 0.1033, + "step": 44899 + }, + { + "epoch": 0.8008418649448864, + "grad_norm": 0.31754070520401, + "learning_rate": 5.800839561495772e-06, + "loss": 0.1438, + "step": 44900 + }, + { + "epoch": 0.8008597010666001, + "grad_norm": 0.24232779443264008, + "learning_rate": 5.799842670991356e-06, + "loss": 0.1111, + "step": 44901 + }, + { + "epoch": 0.8008775371883138, + "grad_norm": 0.2632555365562439, + "learning_rate": 5.79884585491339e-06, + "loss": 0.0846, + "step": 44902 + }, + { + "epoch": 0.8008953733100275, + "grad_norm": 0.1810254454612732, + "learning_rate": 5.797849113265732e-06, + "loss": 0.0778, + "step": 44903 + }, + { + "epoch": 0.8009132094317412, + "grad_norm": 0.2572481334209442, + "learning_rate": 5.796852446052239e-06, + "loss": 0.0706, + "step": 44904 + }, + { + "epoch": 0.8009310455534548, + "grad_norm": 0.3234332203865051, + "learning_rate": 5.795855853276783e-06, + "loss": 0.1554, + "step": 44905 + }, + { + "epoch": 0.8009488816751685, + "grad_norm": 0.27203384041786194, + "learning_rate": 5.794859334943212e-06, + "loss": 0.1256, + "step": 44906 + }, + { + "epoch": 0.8009667177968822, + "grad_norm": 0.2624721825122833, + "learning_rate": 5.793862891055407e-06, + "loss": 0.1572, + "step": 44907 + }, + { + "epoch": 0.8009845539185959, + "grad_norm": 0.33851128816604614, + "learning_rate": 5.792866521617224e-06, + "loss": 0.1316, + "step": 44908 + }, + { + "epoch": 0.8010023900403096, + "grad_norm": 0.2457079291343689, + "learning_rate": 5.791870226632523e-06, + "loss": 0.072, + "step": 44909 + }, + { + "epoch": 0.8010202261620233, + "grad_norm": 0.23305246233940125, + "learning_rate": 5.790874006105162e-06, + "loss": 0.1482, + "step": 44910 + }, + { + "epoch": 0.801038062283737, + "grad_norm": 0.3221266567707062, + "learning_rate": 5.789877860039014e-06, + "loss": 0.138, + "step": 44911 + }, + { + "epoch": 0.8010558984054508, + "grad_norm": 0.2858541011810303, + "learning_rate": 5.788881788437927e-06, + "loss": 0.1273, + "step": 44912 + }, + { + "epoch": 0.8010737345271645, + "grad_norm": 0.22206105291843414, + "learning_rate": 5.787885791305778e-06, + "loss": 0.1104, + "step": 44913 + }, + { + "epoch": 0.8010915706488781, + "grad_norm": 0.27187642455101013, + "learning_rate": 5.786889868646411e-06, + "loss": 0.1274, + "step": 44914 + }, + { + "epoch": 0.8011094067705918, + "grad_norm": 0.27411359548568726, + "learning_rate": 5.785894020463703e-06, + "loss": 0.1212, + "step": 44915 + }, + { + "epoch": 0.8011272428923055, + "grad_norm": 0.29509982466697693, + "learning_rate": 5.7848982467615056e-06, + "loss": 0.1132, + "step": 44916 + }, + { + "epoch": 0.8011450790140192, + "grad_norm": 0.30816808342933655, + "learning_rate": 5.783902547543676e-06, + "loss": 0.1258, + "step": 44917 + }, + { + "epoch": 0.8011629151357329, + "grad_norm": 0.28559520840644836, + "learning_rate": 5.782906922814082e-06, + "loss": 0.1386, + "step": 44918 + }, + { + "epoch": 0.8011807512574466, + "grad_norm": 0.27879440784454346, + "learning_rate": 5.7819113725765675e-06, + "loss": 0.1241, + "step": 44919 + }, + { + "epoch": 0.8011985873791603, + "grad_norm": 0.2690989077091217, + "learning_rate": 5.780915896835013e-06, + "loss": 0.0802, + "step": 44920 + }, + { + "epoch": 0.801216423500874, + "grad_norm": 0.2710437476634979, + "learning_rate": 5.779920495593263e-06, + "loss": 0.1222, + "step": 44921 + }, + { + "epoch": 0.8012342596225877, + "grad_norm": 0.2932475507259369, + "learning_rate": 5.778925168855182e-06, + "loss": 0.1234, + "step": 44922 + }, + { + "epoch": 0.8012520957443013, + "grad_norm": 0.3456650674343109, + "learning_rate": 5.777929916624619e-06, + "loss": 0.0959, + "step": 44923 + }, + { + "epoch": 0.801269931866015, + "grad_norm": 0.3106158673763275, + "learning_rate": 5.776934738905443e-06, + "loss": 0.1218, + "step": 44924 + }, + { + "epoch": 0.8012877679877287, + "grad_norm": 0.28460800647735596, + "learning_rate": 5.7759396357015025e-06, + "loss": 0.124, + "step": 44925 + }, + { + "epoch": 0.8013056041094424, + "grad_norm": 0.29398831725120544, + "learning_rate": 5.774944607016666e-06, + "loss": 0.1621, + "step": 44926 + }, + { + "epoch": 0.8013234402311561, + "grad_norm": 0.31669294834136963, + "learning_rate": 5.773949652854779e-06, + "loss": 0.1293, + "step": 44927 + }, + { + "epoch": 0.8013412763528698, + "grad_norm": 0.2677053213119507, + "learning_rate": 5.772954773219707e-06, + "loss": 0.1502, + "step": 44928 + }, + { + "epoch": 0.8013591124745836, + "grad_norm": 0.3948040008544922, + "learning_rate": 5.771959968115306e-06, + "loss": 0.1507, + "step": 44929 + }, + { + "epoch": 0.8013769485962973, + "grad_norm": 0.38298657536506653, + "learning_rate": 5.7709652375454285e-06, + "loss": 0.1694, + "step": 44930 + }, + { + "epoch": 0.801394784718011, + "grad_norm": 0.18793480098247528, + "learning_rate": 5.769970581513931e-06, + "loss": 0.0756, + "step": 44931 + }, + { + "epoch": 0.8014126208397246, + "grad_norm": 0.2480316311120987, + "learning_rate": 5.768976000024664e-06, + "loss": 0.138, + "step": 44932 + }, + { + "epoch": 0.8014304569614383, + "grad_norm": 0.32688817381858826, + "learning_rate": 5.767981493081492e-06, + "loss": 0.1159, + "step": 44933 + }, + { + "epoch": 0.801448293083152, + "grad_norm": 0.20329535007476807, + "learning_rate": 5.766987060688269e-06, + "loss": 0.0846, + "step": 44934 + }, + { + "epoch": 0.8014661292048657, + "grad_norm": 0.2714150547981262, + "learning_rate": 5.765992702848847e-06, + "loss": 0.1133, + "step": 44935 + }, + { + "epoch": 0.8014839653265794, + "grad_norm": 0.3031516373157501, + "learning_rate": 5.764998419567072e-06, + "loss": 0.1632, + "step": 44936 + }, + { + "epoch": 0.8015018014482931, + "grad_norm": 0.43094682693481445, + "learning_rate": 5.764004210846815e-06, + "loss": 0.0945, + "step": 44937 + }, + { + "epoch": 0.8015196375700068, + "grad_norm": 0.289556622505188, + "learning_rate": 5.763010076691921e-06, + "loss": 0.1406, + "step": 44938 + }, + { + "epoch": 0.8015374736917205, + "grad_norm": 0.33710530400276184, + "learning_rate": 5.762016017106237e-06, + "loss": 0.1114, + "step": 44939 + }, + { + "epoch": 0.8015553098134341, + "grad_norm": 0.22856399416923523, + "learning_rate": 5.76102203209363e-06, + "loss": 0.083, + "step": 44940 + }, + { + "epoch": 0.8015731459351478, + "grad_norm": 0.21168820559978485, + "learning_rate": 5.76002812165794e-06, + "loss": 0.1254, + "step": 44941 + }, + { + "epoch": 0.8015909820568615, + "grad_norm": 0.2864570617675781, + "learning_rate": 5.759034285803033e-06, + "loss": 0.1056, + "step": 44942 + }, + { + "epoch": 0.8016088181785752, + "grad_norm": 0.20837455987930298, + "learning_rate": 5.758040524532756e-06, + "loss": 0.152, + "step": 44943 + }, + { + "epoch": 0.8016266543002889, + "grad_norm": 0.274724543094635, + "learning_rate": 5.75704683785096e-06, + "loss": 0.1126, + "step": 44944 + }, + { + "epoch": 0.8016444904220026, + "grad_norm": 0.5435351729393005, + "learning_rate": 5.756053225761488e-06, + "loss": 0.0977, + "step": 44945 + }, + { + "epoch": 0.8016623265437164, + "grad_norm": 0.1976233720779419, + "learning_rate": 5.7550596882682075e-06, + "loss": 0.0733, + "step": 44946 + }, + { + "epoch": 0.8016801626654301, + "grad_norm": 0.2799014151096344, + "learning_rate": 5.754066225374966e-06, + "loss": 0.0841, + "step": 44947 + }, + { + "epoch": 0.8016979987871438, + "grad_norm": 0.25921720266342163, + "learning_rate": 5.7530728370856076e-06, + "loss": 0.1272, + "step": 44948 + }, + { + "epoch": 0.8017158349088574, + "grad_norm": 0.24318300187587738, + "learning_rate": 5.75207952340398e-06, + "loss": 0.1599, + "step": 44949 + }, + { + "epoch": 0.8017336710305711, + "grad_norm": 0.241105854511261, + "learning_rate": 5.751086284333948e-06, + "loss": 0.0895, + "step": 44950 + }, + { + "epoch": 0.8017515071522848, + "grad_norm": 0.5471779704093933, + "learning_rate": 5.750093119879352e-06, + "loss": 0.1493, + "step": 44951 + }, + { + "epoch": 0.8017693432739985, + "grad_norm": 0.3248126804828644, + "learning_rate": 5.749100030044038e-06, + "loss": 0.1263, + "step": 44952 + }, + { + "epoch": 0.8017871793957122, + "grad_norm": 0.22908511757850647, + "learning_rate": 5.748107014831869e-06, + "loss": 0.1114, + "step": 44953 + }, + { + "epoch": 0.8018050155174259, + "grad_norm": 0.23781849443912506, + "learning_rate": 5.747114074246679e-06, + "loss": 0.1068, + "step": 44954 + }, + { + "epoch": 0.8018228516391396, + "grad_norm": 0.25892335176467896, + "learning_rate": 5.746121208292332e-06, + "loss": 0.1089, + "step": 44955 + }, + { + "epoch": 0.8018406877608533, + "grad_norm": 0.5099577307701111, + "learning_rate": 5.74512841697267e-06, + "loss": 0.1345, + "step": 44956 + }, + { + "epoch": 0.801858523882567, + "grad_norm": 0.2208169847726822, + "learning_rate": 5.7441357002915395e-06, + "loss": 0.0771, + "step": 44957 + }, + { + "epoch": 0.8018763600042806, + "grad_norm": 0.2946223318576813, + "learning_rate": 5.743143058252784e-06, + "loss": 0.1411, + "step": 44958 + }, + { + "epoch": 0.8018941961259943, + "grad_norm": 0.30401602387428284, + "learning_rate": 5.742150490860262e-06, + "loss": 0.1807, + "step": 44959 + }, + { + "epoch": 0.801912032247708, + "grad_norm": 0.2889178395271301, + "learning_rate": 5.741157998117816e-06, + "loss": 0.1386, + "step": 44960 + }, + { + "epoch": 0.8019298683694217, + "grad_norm": 0.35939544439315796, + "learning_rate": 5.740165580029297e-06, + "loss": 0.121, + "step": 44961 + }, + { + "epoch": 0.8019477044911354, + "grad_norm": 0.41577592492103577, + "learning_rate": 5.739173236598539e-06, + "loss": 0.2049, + "step": 44962 + }, + { + "epoch": 0.8019655406128492, + "grad_norm": 0.2711866497993469, + "learning_rate": 5.738180967829404e-06, + "loss": 0.1185, + "step": 44963 + }, + { + "epoch": 0.8019833767345629, + "grad_norm": 0.24924075603485107, + "learning_rate": 5.7371887737257325e-06, + "loss": 0.1036, + "step": 44964 + }, + { + "epoch": 0.8020012128562766, + "grad_norm": 0.2239152491092682, + "learning_rate": 5.736196654291373e-06, + "loss": 0.1349, + "step": 44965 + }, + { + "epoch": 0.8020190489779903, + "grad_norm": 0.2896084487438202, + "learning_rate": 5.7352046095301595e-06, + "loss": 0.1499, + "step": 44966 + }, + { + "epoch": 0.802036885099704, + "grad_norm": 0.24542298913002014, + "learning_rate": 5.734212639445954e-06, + "loss": 0.0808, + "step": 44967 + }, + { + "epoch": 0.8020547212214176, + "grad_norm": 0.27352175116539, + "learning_rate": 5.733220744042589e-06, + "loss": 0.1103, + "step": 44968 + }, + { + "epoch": 0.8020725573431313, + "grad_norm": 0.3354392349720001, + "learning_rate": 5.7322289233239206e-06, + "loss": 0.1046, + "step": 44969 + }, + { + "epoch": 0.802090393464845, + "grad_norm": 0.25808966159820557, + "learning_rate": 5.73123717729378e-06, + "loss": 0.0842, + "step": 44970 + }, + { + "epoch": 0.8021082295865587, + "grad_norm": 0.26042869687080383, + "learning_rate": 5.7302455059560275e-06, + "loss": 0.1277, + "step": 44971 + }, + { + "epoch": 0.8021260657082724, + "grad_norm": 0.42380964756011963, + "learning_rate": 5.7292539093145e-06, + "loss": 0.1286, + "step": 44972 + }, + { + "epoch": 0.8021439018299861, + "grad_norm": 0.33321458101272583, + "learning_rate": 5.7282623873730374e-06, + "loss": 0.1733, + "step": 44973 + }, + { + "epoch": 0.8021617379516998, + "grad_norm": 0.28361377120018005, + "learning_rate": 5.727270940135487e-06, + "loss": 0.1312, + "step": 44974 + }, + { + "epoch": 0.8021795740734134, + "grad_norm": 0.3127893805503845, + "learning_rate": 5.726279567605686e-06, + "loss": 0.1212, + "step": 44975 + }, + { + "epoch": 0.8021974101951271, + "grad_norm": 0.22174373269081116, + "learning_rate": 5.725288269787488e-06, + "loss": 0.1147, + "step": 44976 + }, + { + "epoch": 0.8022152463168408, + "grad_norm": 0.2840960621833801, + "learning_rate": 5.724297046684729e-06, + "loss": 0.0992, + "step": 44977 + }, + { + "epoch": 0.8022330824385545, + "grad_norm": 0.26874086260795593, + "learning_rate": 5.723305898301254e-06, + "loss": 0.1451, + "step": 44978 + }, + { + "epoch": 0.8022509185602682, + "grad_norm": 0.28927677869796753, + "learning_rate": 5.7223148246408954e-06, + "loss": 0.1316, + "step": 44979 + }, + { + "epoch": 0.802268754681982, + "grad_norm": 0.23745930194854736, + "learning_rate": 5.7213238257075115e-06, + "loss": 0.1121, + "step": 44980 + }, + { + "epoch": 0.8022865908036957, + "grad_norm": 0.28322634100914, + "learning_rate": 5.720332901504926e-06, + "loss": 0.1221, + "step": 44981 + }, + { + "epoch": 0.8023044269254094, + "grad_norm": 0.2268662005662918, + "learning_rate": 5.719342052036997e-06, + "loss": 0.0858, + "step": 44982 + }, + { + "epoch": 0.8023222630471231, + "grad_norm": 0.2538077235221863, + "learning_rate": 5.718351277307549e-06, + "loss": 0.1306, + "step": 44983 + }, + { + "epoch": 0.8023400991688368, + "grad_norm": 0.2929036617279053, + "learning_rate": 5.717360577320441e-06, + "loss": 0.1177, + "step": 44984 + }, + { + "epoch": 0.8023579352905504, + "grad_norm": 0.2052081972360611, + "learning_rate": 5.7163699520795e-06, + "loss": 0.0672, + "step": 44985 + }, + { + "epoch": 0.8023757714122641, + "grad_norm": 0.26290860772132874, + "learning_rate": 5.715379401588572e-06, + "loss": 0.1204, + "step": 44986 + }, + { + "epoch": 0.8023936075339778, + "grad_norm": 0.24862372875213623, + "learning_rate": 5.714388925851494e-06, + "loss": 0.1206, + "step": 44987 + }, + { + "epoch": 0.8024114436556915, + "grad_norm": 0.3110763728618622, + "learning_rate": 5.713398524872096e-06, + "loss": 0.1015, + "step": 44988 + }, + { + "epoch": 0.8024292797774052, + "grad_norm": 0.32171696424484253, + "learning_rate": 5.712408198654237e-06, + "loss": 0.1295, + "step": 44989 + }, + { + "epoch": 0.8024471158991189, + "grad_norm": 0.36702901124954224, + "learning_rate": 5.711417947201745e-06, + "loss": 0.1469, + "step": 44990 + }, + { + "epoch": 0.8024649520208326, + "grad_norm": 0.26032930612564087, + "learning_rate": 5.710427770518456e-06, + "loss": 0.1298, + "step": 44991 + }, + { + "epoch": 0.8024827881425463, + "grad_norm": 0.2838994562625885, + "learning_rate": 5.709437668608208e-06, + "loss": 0.1518, + "step": 44992 + }, + { + "epoch": 0.8025006242642599, + "grad_norm": 0.2227109670639038, + "learning_rate": 5.708447641474848e-06, + "loss": 0.0845, + "step": 44993 + }, + { + "epoch": 0.8025184603859736, + "grad_norm": 0.3563428223133087, + "learning_rate": 5.707457689122208e-06, + "loss": 0.1895, + "step": 44994 + }, + { + "epoch": 0.8025362965076873, + "grad_norm": 0.20921310782432556, + "learning_rate": 5.706467811554117e-06, + "loss": 0.1238, + "step": 44995 + }, + { + "epoch": 0.8025541326294011, + "grad_norm": 0.26908808946609497, + "learning_rate": 5.705478008774428e-06, + "loss": 0.1051, + "step": 44996 + }, + { + "epoch": 0.8025719687511148, + "grad_norm": 0.3211827576160431, + "learning_rate": 5.704488280786966e-06, + "loss": 0.1125, + "step": 44997 + }, + { + "epoch": 0.8025898048728285, + "grad_norm": 0.22332863509655, + "learning_rate": 5.703498627595577e-06, + "loss": 0.0809, + "step": 44998 + }, + { + "epoch": 0.8026076409945422, + "grad_norm": 0.2842158377170563, + "learning_rate": 5.702509049204094e-06, + "loss": 0.1111, + "step": 44999 + }, + { + "epoch": 0.8026254771162559, + "grad_norm": 0.3417893350124359, + "learning_rate": 5.701519545616349e-06, + "loss": 0.1101, + "step": 45000 + }, + { + "epoch": 0.8026254771162559, + "eval_loss": 0.11037413775920868, + "eval_runtime": 108.5419, + "eval_samples_per_second": 9.434, + "eval_steps_per_second": 1.575, + "step": 45000 + }, + { + "epoch": 0.8026433132379696, + "grad_norm": 0.2643965482711792, + "learning_rate": 5.700530116836173e-06, + "loss": 0.118, + "step": 45001 + }, + { + "epoch": 0.8026611493596832, + "grad_norm": 0.3477681875228882, + "learning_rate": 5.699540762867414e-06, + "loss": 0.0635, + "step": 45002 + }, + { + "epoch": 0.8026789854813969, + "grad_norm": 0.3364926874637604, + "learning_rate": 5.698551483713902e-06, + "loss": 0.1621, + "step": 45003 + }, + { + "epoch": 0.8026968216031106, + "grad_norm": 0.2213757187128067, + "learning_rate": 5.697562279379473e-06, + "loss": 0.1022, + "step": 45004 + }, + { + "epoch": 0.8027146577248243, + "grad_norm": 0.2655234932899475, + "learning_rate": 5.6965731498679484e-06, + "loss": 0.1055, + "step": 45005 + }, + { + "epoch": 0.802732493846538, + "grad_norm": 0.25524941086769104, + "learning_rate": 5.695584095183184e-06, + "loss": 0.1076, + "step": 45006 + }, + { + "epoch": 0.8027503299682517, + "grad_norm": 0.24004822969436646, + "learning_rate": 5.694595115329002e-06, + "loss": 0.0847, + "step": 45007 + }, + { + "epoch": 0.8027681660899654, + "grad_norm": 0.3697361946105957, + "learning_rate": 5.69360621030923e-06, + "loss": 0.127, + "step": 45008 + }, + { + "epoch": 0.8027860022116791, + "grad_norm": 0.25982481241226196, + "learning_rate": 5.692617380127716e-06, + "loss": 0.1009, + "step": 45009 + }, + { + "epoch": 0.8028038383333927, + "grad_norm": 0.25787749886512756, + "learning_rate": 5.691628624788278e-06, + "loss": 0.1046, + "step": 45010 + }, + { + "epoch": 0.8028216744551064, + "grad_norm": 0.2753685712814331, + "learning_rate": 5.690639944294765e-06, + "loss": 0.0778, + "step": 45011 + }, + { + "epoch": 0.8028395105768201, + "grad_norm": 0.2629126310348511, + "learning_rate": 5.6896513386509976e-06, + "loss": 0.13, + "step": 45012 + }, + { + "epoch": 0.8028573466985339, + "grad_norm": 0.2808989882469177, + "learning_rate": 5.688662807860815e-06, + "loss": 0.1068, + "step": 45013 + }, + { + "epoch": 0.8028751828202476, + "grad_norm": 0.2294870913028717, + "learning_rate": 5.687674351928035e-06, + "loss": 0.0419, + "step": 45014 + }, + { + "epoch": 0.8028930189419613, + "grad_norm": 0.23658153414726257, + "learning_rate": 5.686685970856509e-06, + "loss": 0.0937, + "step": 45015 + }, + { + "epoch": 0.802910855063675, + "grad_norm": 0.2728012502193451, + "learning_rate": 5.685697664650055e-06, + "loss": 0.0936, + "step": 45016 + }, + { + "epoch": 0.8029286911853887, + "grad_norm": 0.2915666997432709, + "learning_rate": 5.684709433312513e-06, + "loss": 0.1582, + "step": 45017 + }, + { + "epoch": 0.8029465273071024, + "grad_norm": 0.33082664012908936, + "learning_rate": 5.683721276847698e-06, + "loss": 0.1472, + "step": 45018 + }, + { + "epoch": 0.802964363428816, + "grad_norm": 0.2335931956768036, + "learning_rate": 5.68273319525946e-06, + "loss": 0.1156, + "step": 45019 + }, + { + "epoch": 0.8029821995505297, + "grad_norm": 0.2518908977508545, + "learning_rate": 5.6817451885516185e-06, + "loss": 0.1128, + "step": 45020 + }, + { + "epoch": 0.8030000356722434, + "grad_norm": 0.2515953779220581, + "learning_rate": 5.6807572567280035e-06, + "loss": 0.1001, + "step": 45021 + }, + { + "epoch": 0.8030178717939571, + "grad_norm": 0.4360179603099823, + "learning_rate": 5.679769399792442e-06, + "loss": 0.0831, + "step": 45022 + }, + { + "epoch": 0.8030357079156708, + "grad_norm": 0.30175352096557617, + "learning_rate": 5.678781617748774e-06, + "loss": 0.1145, + "step": 45023 + }, + { + "epoch": 0.8030535440373845, + "grad_norm": 0.3068403899669647, + "learning_rate": 5.677793910600812e-06, + "loss": 0.1065, + "step": 45024 + }, + { + "epoch": 0.8030713801590982, + "grad_norm": 0.3034062087535858, + "learning_rate": 5.6768062783524035e-06, + "loss": 0.0966, + "step": 45025 + }, + { + "epoch": 0.8030892162808119, + "grad_norm": 0.30345284938812256, + "learning_rate": 5.67581872100737e-06, + "loss": 0.1235, + "step": 45026 + }, + { + "epoch": 0.8031070524025256, + "grad_norm": 0.22523249685764313, + "learning_rate": 5.674831238569528e-06, + "loss": 0.1522, + "step": 45027 + }, + { + "epoch": 0.8031248885242392, + "grad_norm": 0.2335573434829712, + "learning_rate": 5.673843831042722e-06, + "loss": 0.1215, + "step": 45028 + }, + { + "epoch": 0.8031427246459529, + "grad_norm": 0.3100409507751465, + "learning_rate": 5.672856498430773e-06, + "loss": 0.067, + "step": 45029 + }, + { + "epoch": 0.8031605607676667, + "grad_norm": 0.27990153431892395, + "learning_rate": 5.671869240737507e-06, + "loss": 0.1515, + "step": 45030 + }, + { + "epoch": 0.8031783968893804, + "grad_norm": 0.2517462968826294, + "learning_rate": 5.670882057966745e-06, + "loss": 0.109, + "step": 45031 + }, + { + "epoch": 0.8031962330110941, + "grad_norm": 0.29710128903388977, + "learning_rate": 5.669894950122326e-06, + "loss": 0.0914, + "step": 45032 + }, + { + "epoch": 0.8032140691328078, + "grad_norm": 0.23807238042354584, + "learning_rate": 5.668907917208072e-06, + "loss": 0.1419, + "step": 45033 + }, + { + "epoch": 0.8032319052545215, + "grad_norm": 0.23975948989391327, + "learning_rate": 5.667920959227807e-06, + "loss": 0.095, + "step": 45034 + }, + { + "epoch": 0.8032497413762352, + "grad_norm": 0.31046345829963684, + "learning_rate": 5.666934076185351e-06, + "loss": 0.1099, + "step": 45035 + }, + { + "epoch": 0.8032675774979489, + "grad_norm": 0.2461196482181549, + "learning_rate": 5.6659472680845425e-06, + "loss": 0.0827, + "step": 45036 + }, + { + "epoch": 0.8032854136196625, + "grad_norm": 0.21382002532482147, + "learning_rate": 5.664960534929192e-06, + "loss": 0.1192, + "step": 45037 + }, + { + "epoch": 0.8033032497413762, + "grad_norm": 0.22482174634933472, + "learning_rate": 5.663973876723142e-06, + "loss": 0.0935, + "step": 45038 + }, + { + "epoch": 0.8033210858630899, + "grad_norm": 0.2876605689525604, + "learning_rate": 5.662987293470207e-06, + "loss": 0.0794, + "step": 45039 + }, + { + "epoch": 0.8033389219848036, + "grad_norm": 0.23094086349010468, + "learning_rate": 5.662000785174204e-06, + "loss": 0.0753, + "step": 45040 + }, + { + "epoch": 0.8033567581065173, + "grad_norm": 0.2665034830570221, + "learning_rate": 5.661014351838972e-06, + "loss": 0.1044, + "step": 45041 + }, + { + "epoch": 0.803374594228231, + "grad_norm": 0.2721155881881714, + "learning_rate": 5.660027993468328e-06, + "loss": 0.1105, + "step": 45042 + }, + { + "epoch": 0.8033924303499447, + "grad_norm": 0.3114945590496063, + "learning_rate": 5.659041710066096e-06, + "loss": 0.1109, + "step": 45043 + }, + { + "epoch": 0.8034102664716584, + "grad_norm": 0.23734666407108307, + "learning_rate": 5.658055501636092e-06, + "loss": 0.0938, + "step": 45044 + }, + { + "epoch": 0.803428102593372, + "grad_norm": 0.2699252665042877, + "learning_rate": 5.65706936818215e-06, + "loss": 0.1524, + "step": 45045 + }, + { + "epoch": 0.8034459387150857, + "grad_norm": 0.36435607075691223, + "learning_rate": 5.656083309708091e-06, + "loss": 0.1789, + "step": 45046 + }, + { + "epoch": 0.8034637748367995, + "grad_norm": 0.38901329040527344, + "learning_rate": 5.655097326217732e-06, + "loss": 0.1393, + "step": 45047 + }, + { + "epoch": 0.8034816109585132, + "grad_norm": 0.2220451831817627, + "learning_rate": 5.654111417714892e-06, + "loss": 0.0944, + "step": 45048 + }, + { + "epoch": 0.8034994470802269, + "grad_norm": 0.2760920226573944, + "learning_rate": 5.653125584203403e-06, + "loss": 0.1498, + "step": 45049 + }, + { + "epoch": 0.8035172832019406, + "grad_norm": 0.2670243978500366, + "learning_rate": 5.652139825687083e-06, + "loss": 0.1334, + "step": 45050 + }, + { + "epoch": 0.8035351193236543, + "grad_norm": 0.26425203680992126, + "learning_rate": 5.651154142169743e-06, + "loss": 0.0639, + "step": 45051 + }, + { + "epoch": 0.803552955445368, + "grad_norm": 0.21602346003055573, + "learning_rate": 5.6501685336552204e-06, + "loss": 0.0859, + "step": 45052 + }, + { + "epoch": 0.8035707915670817, + "grad_norm": 0.24008624255657196, + "learning_rate": 5.649183000147323e-06, + "loss": 0.1167, + "step": 45053 + }, + { + "epoch": 0.8035886276887954, + "grad_norm": 0.2429155856370926, + "learning_rate": 5.64819754164988e-06, + "loss": 0.0958, + "step": 45054 + }, + { + "epoch": 0.803606463810509, + "grad_norm": 0.2998778223991394, + "learning_rate": 5.647212158166706e-06, + "loss": 0.0978, + "step": 45055 + }, + { + "epoch": 0.8036242999322227, + "grad_norm": 0.24849781394004822, + "learning_rate": 5.646226849701625e-06, + "loss": 0.0737, + "step": 45056 + }, + { + "epoch": 0.8036421360539364, + "grad_norm": 0.2383917272090912, + "learning_rate": 5.645241616258446e-06, + "loss": 0.0945, + "step": 45057 + }, + { + "epoch": 0.8036599721756501, + "grad_norm": 0.2024536281824112, + "learning_rate": 5.6442564578410015e-06, + "loss": 0.127, + "step": 45058 + }, + { + "epoch": 0.8036778082973638, + "grad_norm": 0.3398240804672241, + "learning_rate": 5.6432713744531046e-06, + "loss": 0.1413, + "step": 45059 + }, + { + "epoch": 0.8036956444190775, + "grad_norm": 0.2868044674396515, + "learning_rate": 5.642286366098573e-06, + "loss": 0.1235, + "step": 45060 + }, + { + "epoch": 0.8037134805407912, + "grad_norm": 0.24181649088859558, + "learning_rate": 5.641301432781218e-06, + "loss": 0.0952, + "step": 45061 + }, + { + "epoch": 0.8037313166625049, + "grad_norm": 0.28084084391593933, + "learning_rate": 5.6403165745048725e-06, + "loss": 0.1019, + "step": 45062 + }, + { + "epoch": 0.8037491527842185, + "grad_norm": 0.25780850648880005, + "learning_rate": 5.639331791273347e-06, + "loss": 0.1362, + "step": 45063 + }, + { + "epoch": 0.8037669889059323, + "grad_norm": 0.28611013293266296, + "learning_rate": 5.638347083090451e-06, + "loss": 0.1187, + "step": 45064 + }, + { + "epoch": 0.803784825027646, + "grad_norm": 0.2852856516838074, + "learning_rate": 5.637362449960015e-06, + "loss": 0.1133, + "step": 45065 + }, + { + "epoch": 0.8038026611493597, + "grad_norm": 0.25416988134384155, + "learning_rate": 5.636377891885844e-06, + "loss": 0.167, + "step": 45066 + }, + { + "epoch": 0.8038204972710734, + "grad_norm": 0.2378191500902176, + "learning_rate": 5.635393408871767e-06, + "loss": 0.1333, + "step": 45067 + }, + { + "epoch": 0.8038383333927871, + "grad_norm": 0.31271690130233765, + "learning_rate": 5.634409000921595e-06, + "loss": 0.1027, + "step": 45068 + }, + { + "epoch": 0.8038561695145008, + "grad_norm": 0.3686150908470154, + "learning_rate": 5.633424668039142e-06, + "loss": 0.152, + "step": 45069 + }, + { + "epoch": 0.8038740056362145, + "grad_norm": 0.31732410192489624, + "learning_rate": 5.6324404102282156e-06, + "loss": 0.0975, + "step": 45070 + }, + { + "epoch": 0.8038918417579282, + "grad_norm": 0.2282390594482422, + "learning_rate": 5.631456227492646e-06, + "loss": 0.0809, + "step": 45071 + }, + { + "epoch": 0.8039096778796418, + "grad_norm": 0.23849719762802124, + "learning_rate": 5.630472119836242e-06, + "loss": 0.0827, + "step": 45072 + }, + { + "epoch": 0.8039275140013555, + "grad_norm": 0.3120070993900299, + "learning_rate": 5.629488087262819e-06, + "loss": 0.1176, + "step": 45073 + }, + { + "epoch": 0.8039453501230692, + "grad_norm": 0.3191993534564972, + "learning_rate": 5.6285041297761825e-06, + "loss": 0.1107, + "step": 45074 + }, + { + "epoch": 0.8039631862447829, + "grad_norm": 0.24546119570732117, + "learning_rate": 5.627520247380164e-06, + "loss": 0.0797, + "step": 45075 + }, + { + "epoch": 0.8039810223664966, + "grad_norm": 0.2516654431819916, + "learning_rate": 5.626536440078564e-06, + "loss": 0.1097, + "step": 45076 + }, + { + "epoch": 0.8039988584882103, + "grad_norm": 0.2875223159790039, + "learning_rate": 5.625552707875203e-06, + "loss": 0.1137, + "step": 45077 + }, + { + "epoch": 0.804016694609924, + "grad_norm": 0.3131698966026306, + "learning_rate": 5.624569050773884e-06, + "loss": 0.0742, + "step": 45078 + }, + { + "epoch": 0.8040345307316377, + "grad_norm": 0.17964769899845123, + "learning_rate": 5.6235854687784276e-06, + "loss": 0.0731, + "step": 45079 + }, + { + "epoch": 0.8040523668533514, + "grad_norm": 0.22102344036102295, + "learning_rate": 5.622601961892654e-06, + "loss": 0.0792, + "step": 45080 + }, + { + "epoch": 0.8040702029750652, + "grad_norm": 0.24348902702331543, + "learning_rate": 5.621618530120367e-06, + "loss": 0.0926, + "step": 45081 + }, + { + "epoch": 0.8040880390967788, + "grad_norm": 0.3022458851337433, + "learning_rate": 5.62063517346538e-06, + "loss": 0.0714, + "step": 45082 + }, + { + "epoch": 0.8041058752184925, + "grad_norm": 0.3298971652984619, + "learning_rate": 5.619651891931496e-06, + "loss": 0.1466, + "step": 45083 + }, + { + "epoch": 0.8041237113402062, + "grad_norm": 0.2805505096912384, + "learning_rate": 5.618668685522544e-06, + "loss": 0.1626, + "step": 45084 + }, + { + "epoch": 0.8041415474619199, + "grad_norm": 0.3055839538574219, + "learning_rate": 5.617685554242325e-06, + "loss": 0.1402, + "step": 45085 + }, + { + "epoch": 0.8041593835836336, + "grad_norm": 0.2969001233577728, + "learning_rate": 5.616702498094653e-06, + "loss": 0.1344, + "step": 45086 + }, + { + "epoch": 0.8041772197053473, + "grad_norm": 0.23529979586601257, + "learning_rate": 5.615719517083328e-06, + "loss": 0.0963, + "step": 45087 + }, + { + "epoch": 0.804195055827061, + "grad_norm": 0.20105794072151184, + "learning_rate": 5.614736611212176e-06, + "loss": 0.0634, + "step": 45088 + }, + { + "epoch": 0.8042128919487747, + "grad_norm": 0.333313912153244, + "learning_rate": 5.613753780485001e-06, + "loss": 0.1504, + "step": 45089 + }, + { + "epoch": 0.8042307280704883, + "grad_norm": 0.2451031357049942, + "learning_rate": 5.6127710249056135e-06, + "loss": 0.1081, + "step": 45090 + }, + { + "epoch": 0.804248564192202, + "grad_norm": 0.2931724786758423, + "learning_rate": 5.611788344477814e-06, + "loss": 0.1219, + "step": 45091 + }, + { + "epoch": 0.8042664003139157, + "grad_norm": 0.216327965259552, + "learning_rate": 5.6108057392054244e-06, + "loss": 0.0641, + "step": 45092 + }, + { + "epoch": 0.8042842364356294, + "grad_norm": 0.27867671847343445, + "learning_rate": 5.6098232090922434e-06, + "loss": 0.0718, + "step": 45093 + }, + { + "epoch": 0.8043020725573431, + "grad_norm": 0.4103490114212036, + "learning_rate": 5.608840754142092e-06, + "loss": 0.1015, + "step": 45094 + }, + { + "epoch": 0.8043199086790568, + "grad_norm": 0.24888038635253906, + "learning_rate": 5.607858374358771e-06, + "loss": 0.0836, + "step": 45095 + }, + { + "epoch": 0.8043377448007705, + "grad_norm": 0.22790265083312988, + "learning_rate": 5.606876069746081e-06, + "loss": 0.098, + "step": 45096 + }, + { + "epoch": 0.8043555809224843, + "grad_norm": 0.20715153217315674, + "learning_rate": 5.605893840307846e-06, + "loss": 0.0817, + "step": 45097 + }, + { + "epoch": 0.804373417044198, + "grad_norm": 0.26539748907089233, + "learning_rate": 5.604911686047865e-06, + "loss": 0.0888, + "step": 45098 + }, + { + "epoch": 0.8043912531659116, + "grad_norm": 0.2161572426557541, + "learning_rate": 5.603929606969943e-06, + "loss": 0.1207, + "step": 45099 + }, + { + "epoch": 0.8044090892876253, + "grad_norm": 0.3037853538990021, + "learning_rate": 5.602947603077882e-06, + "loss": 0.0725, + "step": 45100 + }, + { + "epoch": 0.804426925409339, + "grad_norm": 0.20741458237171173, + "learning_rate": 5.601965674375503e-06, + "loss": 0.1112, + "step": 45101 + }, + { + "epoch": 0.8044447615310527, + "grad_norm": 0.4309438169002533, + "learning_rate": 5.600983820866607e-06, + "loss": 0.1519, + "step": 45102 + }, + { + "epoch": 0.8044625976527664, + "grad_norm": 0.2536090910434723, + "learning_rate": 5.600002042554997e-06, + "loss": 0.1144, + "step": 45103 + }, + { + "epoch": 0.8044804337744801, + "grad_norm": 0.19193938374519348, + "learning_rate": 5.59902033944448e-06, + "loss": 0.0928, + "step": 45104 + }, + { + "epoch": 0.8044982698961938, + "grad_norm": 0.23917387425899506, + "learning_rate": 5.598038711538853e-06, + "loss": 0.0635, + "step": 45105 + }, + { + "epoch": 0.8045161060179075, + "grad_norm": 0.3996824622154236, + "learning_rate": 5.597057158841929e-06, + "loss": 0.1706, + "step": 45106 + }, + { + "epoch": 0.8045339421396211, + "grad_norm": 0.32945168018341064, + "learning_rate": 5.596075681357521e-06, + "loss": 0.1093, + "step": 45107 + }, + { + "epoch": 0.8045517782613348, + "grad_norm": 0.2145969420671463, + "learning_rate": 5.595094279089425e-06, + "loss": 0.0858, + "step": 45108 + }, + { + "epoch": 0.8045696143830485, + "grad_norm": 0.262411504983902, + "learning_rate": 5.5941129520414385e-06, + "loss": 0.1032, + "step": 45109 + }, + { + "epoch": 0.8045874505047622, + "grad_norm": 0.32691875100135803, + "learning_rate": 5.59313170021738e-06, + "loss": 0.0888, + "step": 45110 + }, + { + "epoch": 0.8046052866264759, + "grad_norm": 0.1854233592748642, + "learning_rate": 5.592150523621045e-06, + "loss": 0.0895, + "step": 45111 + }, + { + "epoch": 0.8046231227481896, + "grad_norm": 0.2635454833507538, + "learning_rate": 5.59116942225624e-06, + "loss": 0.1418, + "step": 45112 + }, + { + "epoch": 0.8046409588699033, + "grad_norm": 0.29499322175979614, + "learning_rate": 5.590188396126758e-06, + "loss": 0.0955, + "step": 45113 + }, + { + "epoch": 0.8046587949916171, + "grad_norm": 0.5843046307563782, + "learning_rate": 5.589207445236417e-06, + "loss": 0.1424, + "step": 45114 + }, + { + "epoch": 0.8046766311133308, + "grad_norm": 0.33900749683380127, + "learning_rate": 5.588226569589011e-06, + "loss": 0.1168, + "step": 45115 + }, + { + "epoch": 0.8046944672350445, + "grad_norm": 0.2698301374912262, + "learning_rate": 5.587245769188345e-06, + "loss": 0.1295, + "step": 45116 + }, + { + "epoch": 0.8047123033567581, + "grad_norm": 0.29449552297592163, + "learning_rate": 5.586265044038219e-06, + "loss": 0.1378, + "step": 45117 + }, + { + "epoch": 0.8047301394784718, + "grad_norm": 0.24851654469966888, + "learning_rate": 5.585284394142426e-06, + "loss": 0.0987, + "step": 45118 + }, + { + "epoch": 0.8047479756001855, + "grad_norm": 0.2852621376514435, + "learning_rate": 5.5843038195047855e-06, + "loss": 0.0903, + "step": 45119 + }, + { + "epoch": 0.8047658117218992, + "grad_norm": 0.2575474977493286, + "learning_rate": 5.583323320129083e-06, + "loss": 0.1368, + "step": 45120 + }, + { + "epoch": 0.8047836478436129, + "grad_norm": 0.31236347556114197, + "learning_rate": 5.58234289601913e-06, + "loss": 0.0863, + "step": 45121 + }, + { + "epoch": 0.8048014839653266, + "grad_norm": 0.25784802436828613, + "learning_rate": 5.581362547178717e-06, + "loss": 0.1434, + "step": 45122 + }, + { + "epoch": 0.8048193200870403, + "grad_norm": 0.17321154475212097, + "learning_rate": 5.580382273611656e-06, + "loss": 0.0842, + "step": 45123 + }, + { + "epoch": 0.804837156208754, + "grad_norm": 0.251720666885376, + "learning_rate": 5.579402075321743e-06, + "loss": 0.0548, + "step": 45124 + }, + { + "epoch": 0.8048549923304676, + "grad_norm": 0.2474765032529831, + "learning_rate": 5.5784219523127735e-06, + "loss": 0.0995, + "step": 45125 + }, + { + "epoch": 0.8048728284521813, + "grad_norm": 0.29034796357154846, + "learning_rate": 5.57744190458854e-06, + "loss": 0.1727, + "step": 45126 + }, + { + "epoch": 0.804890664573895, + "grad_norm": 0.170152947306633, + "learning_rate": 5.57646193215286e-06, + "loss": 0.0699, + "step": 45127 + }, + { + "epoch": 0.8049085006956087, + "grad_norm": 0.22174742817878723, + "learning_rate": 5.5754820350095185e-06, + "loss": 0.0966, + "step": 45128 + }, + { + "epoch": 0.8049263368173224, + "grad_norm": 0.3623199164867401, + "learning_rate": 5.574502213162322e-06, + "loss": 0.1249, + "step": 45129 + }, + { + "epoch": 0.8049441729390361, + "grad_norm": 0.23711462318897247, + "learning_rate": 5.573522466615061e-06, + "loss": 0.1066, + "step": 45130 + }, + { + "epoch": 0.8049620090607499, + "grad_norm": 0.3087281882762909, + "learning_rate": 5.572542795371532e-06, + "loss": 0.0881, + "step": 45131 + }, + { + "epoch": 0.8049798451824636, + "grad_norm": 0.28367748856544495, + "learning_rate": 5.571563199435542e-06, + "loss": 0.0928, + "step": 45132 + }, + { + "epoch": 0.8049976813041773, + "grad_norm": 0.2632720172405243, + "learning_rate": 5.570583678810878e-06, + "loss": 0.1397, + "step": 45133 + }, + { + "epoch": 0.805015517425891, + "grad_norm": 0.19186727702617645, + "learning_rate": 5.569604233501349e-06, + "loss": 0.0988, + "step": 45134 + }, + { + "epoch": 0.8050333535476046, + "grad_norm": 0.23219752311706543, + "learning_rate": 5.568624863510738e-06, + "loss": 0.1139, + "step": 45135 + }, + { + "epoch": 0.8050511896693183, + "grad_norm": 0.25439921021461487, + "learning_rate": 5.567645568842855e-06, + "loss": 0.1123, + "step": 45136 + }, + { + "epoch": 0.805069025791032, + "grad_norm": 0.25233781337738037, + "learning_rate": 5.5666663495014895e-06, + "loss": 0.134, + "step": 45137 + }, + { + "epoch": 0.8050868619127457, + "grad_norm": 0.35540053248405457, + "learning_rate": 5.5656872054904355e-06, + "loss": 0.0841, + "step": 45138 + }, + { + "epoch": 0.8051046980344594, + "grad_norm": 0.33396226167678833, + "learning_rate": 5.564708136813487e-06, + "loss": 0.1098, + "step": 45139 + }, + { + "epoch": 0.8051225341561731, + "grad_norm": 0.23016005754470825, + "learning_rate": 5.563729143474447e-06, + "loss": 0.0999, + "step": 45140 + }, + { + "epoch": 0.8051403702778868, + "grad_norm": 0.224091574549675, + "learning_rate": 5.562750225477106e-06, + "loss": 0.1301, + "step": 45141 + }, + { + "epoch": 0.8051582063996005, + "grad_norm": 0.27392706274986267, + "learning_rate": 5.5617713828252595e-06, + "loss": 0.1573, + "step": 45142 + }, + { + "epoch": 0.8051760425213141, + "grad_norm": 0.24790118634700775, + "learning_rate": 5.5607926155227e-06, + "loss": 0.134, + "step": 45143 + }, + { + "epoch": 0.8051938786430278, + "grad_norm": 0.297635018825531, + "learning_rate": 5.559813923573218e-06, + "loss": 0.0862, + "step": 45144 + }, + { + "epoch": 0.8052117147647415, + "grad_norm": 0.2815331816673279, + "learning_rate": 5.558835306980614e-06, + "loss": 0.1119, + "step": 45145 + }, + { + "epoch": 0.8052295508864552, + "grad_norm": 0.2936416566371918, + "learning_rate": 5.557856765748684e-06, + "loss": 0.0927, + "step": 45146 + }, + { + "epoch": 0.8052473870081689, + "grad_norm": 0.2213561236858368, + "learning_rate": 5.556878299881208e-06, + "loss": 0.0969, + "step": 45147 + }, + { + "epoch": 0.8052652231298827, + "grad_norm": 0.25681453943252563, + "learning_rate": 5.555899909381992e-06, + "loss": 0.1215, + "step": 45148 + }, + { + "epoch": 0.8052830592515964, + "grad_norm": 0.275356262922287, + "learning_rate": 5.5549215942548165e-06, + "loss": 0.1122, + "step": 45149 + }, + { + "epoch": 0.8053008953733101, + "grad_norm": 0.24249376356601715, + "learning_rate": 5.553943354503491e-06, + "loss": 0.0878, + "step": 45150 + }, + { + "epoch": 0.8053187314950238, + "grad_norm": 0.2888849675655365, + "learning_rate": 5.552965190131795e-06, + "loss": 0.1032, + "step": 45151 + }, + { + "epoch": 0.8053365676167374, + "grad_norm": 0.2656308114528656, + "learning_rate": 5.551987101143516e-06, + "loss": 0.1198, + "step": 45152 + }, + { + "epoch": 0.8053544037384511, + "grad_norm": 0.2765926420688629, + "learning_rate": 5.5510090875424606e-06, + "loss": 0.1681, + "step": 45153 + }, + { + "epoch": 0.8053722398601648, + "grad_norm": 0.2646331191062927, + "learning_rate": 5.55003114933241e-06, + "loss": 0.1447, + "step": 45154 + }, + { + "epoch": 0.8053900759818785, + "grad_norm": 0.3718184232711792, + "learning_rate": 5.549053286517159e-06, + "loss": 0.0848, + "step": 45155 + }, + { + "epoch": 0.8054079121035922, + "grad_norm": 0.17985741794109344, + "learning_rate": 5.548075499100492e-06, + "loss": 0.0805, + "step": 45156 + }, + { + "epoch": 0.8054257482253059, + "grad_norm": 0.28909024596214294, + "learning_rate": 5.547097787086197e-06, + "loss": 0.1047, + "step": 45157 + }, + { + "epoch": 0.8054435843470196, + "grad_norm": 0.18139904737472534, + "learning_rate": 5.546120150478076e-06, + "loss": 0.0923, + "step": 45158 + }, + { + "epoch": 0.8054614204687333, + "grad_norm": 0.22457453608512878, + "learning_rate": 5.545142589279914e-06, + "loss": 0.0563, + "step": 45159 + }, + { + "epoch": 0.805479256590447, + "grad_norm": 0.27773961424827576, + "learning_rate": 5.544165103495494e-06, + "loss": 0.1149, + "step": 45160 + }, + { + "epoch": 0.8054970927121606, + "grad_norm": 0.300950288772583, + "learning_rate": 5.5431876931286125e-06, + "loss": 0.1301, + "step": 45161 + }, + { + "epoch": 0.8055149288338743, + "grad_norm": 0.32615914940834045, + "learning_rate": 5.54221035818305e-06, + "loss": 0.1465, + "step": 45162 + }, + { + "epoch": 0.805532764955588, + "grad_norm": 0.21602334082126617, + "learning_rate": 5.54123309866261e-06, + "loss": 0.0476, + "step": 45163 + }, + { + "epoch": 0.8055506010773017, + "grad_norm": 0.2839105725288391, + "learning_rate": 5.540255914571069e-06, + "loss": 0.089, + "step": 45164 + }, + { + "epoch": 0.8055684371990155, + "grad_norm": 0.3230873942375183, + "learning_rate": 5.539278805912209e-06, + "loss": 0.099, + "step": 45165 + }, + { + "epoch": 0.8055862733207292, + "grad_norm": 0.3063286542892456, + "learning_rate": 5.5383017726898355e-06, + "loss": 0.1526, + "step": 45166 + }, + { + "epoch": 0.8056041094424429, + "grad_norm": 0.2745344638824463, + "learning_rate": 5.537324814907727e-06, + "loss": 0.1161, + "step": 45167 + }, + { + "epoch": 0.8056219455641566, + "grad_norm": 0.22862201929092407, + "learning_rate": 5.536347932569669e-06, + "loss": 0.1224, + "step": 45168 + }, + { + "epoch": 0.8056397816858702, + "grad_norm": 0.21719032526016235, + "learning_rate": 5.535371125679448e-06, + "loss": 0.0551, + "step": 45169 + }, + { + "epoch": 0.8056576178075839, + "grad_norm": 0.23315177857875824, + "learning_rate": 5.534394394240844e-06, + "loss": 0.123, + "step": 45170 + }, + { + "epoch": 0.8056754539292976, + "grad_norm": 0.31189781427383423, + "learning_rate": 5.533417738257657e-06, + "loss": 0.1532, + "step": 45171 + }, + { + "epoch": 0.8056932900510113, + "grad_norm": 0.2474830448627472, + "learning_rate": 5.532441157733667e-06, + "loss": 0.1223, + "step": 45172 + }, + { + "epoch": 0.805711126172725, + "grad_norm": 0.46391186118125916, + "learning_rate": 5.531464652672661e-06, + "loss": 0.1326, + "step": 45173 + }, + { + "epoch": 0.8057289622944387, + "grad_norm": 0.28167542815208435, + "learning_rate": 5.5304882230784115e-06, + "loss": 0.1295, + "step": 45174 + }, + { + "epoch": 0.8057467984161524, + "grad_norm": 0.3139234483242035, + "learning_rate": 5.5295118689547235e-06, + "loss": 0.121, + "step": 45175 + }, + { + "epoch": 0.8057646345378661, + "grad_norm": 0.2539048194885254, + "learning_rate": 5.528535590305364e-06, + "loss": 0.0924, + "step": 45176 + }, + { + "epoch": 0.8057824706595798, + "grad_norm": 0.2819989025592804, + "learning_rate": 5.527559387134135e-06, + "loss": 0.094, + "step": 45177 + }, + { + "epoch": 0.8058003067812934, + "grad_norm": 0.3445577323436737, + "learning_rate": 5.526583259444803e-06, + "loss": 0.082, + "step": 45178 + }, + { + "epoch": 0.8058181429030071, + "grad_norm": 0.2964233160018921, + "learning_rate": 5.525607207241168e-06, + "loss": 0.1327, + "step": 45179 + }, + { + "epoch": 0.8058359790247208, + "grad_norm": 0.20467011630535126, + "learning_rate": 5.524631230527006e-06, + "loss": 0.0885, + "step": 45180 + }, + { + "epoch": 0.8058538151464345, + "grad_norm": 0.34546440839767456, + "learning_rate": 5.5236553293060984e-06, + "loss": 0.1146, + "step": 45181 + }, + { + "epoch": 0.8058716512681483, + "grad_norm": 0.22661687433719635, + "learning_rate": 5.522679503582231e-06, + "loss": 0.1388, + "step": 45182 + }, + { + "epoch": 0.805889487389862, + "grad_norm": 0.255628764629364, + "learning_rate": 5.521703753359178e-06, + "loss": 0.1476, + "step": 45183 + }, + { + "epoch": 0.8059073235115757, + "grad_norm": 0.4289093017578125, + "learning_rate": 5.520728078640733e-06, + "loss": 0.1218, + "step": 45184 + }, + { + "epoch": 0.8059251596332894, + "grad_norm": 0.2473369538784027, + "learning_rate": 5.519752479430676e-06, + "loss": 0.1099, + "step": 45185 + }, + { + "epoch": 0.805942995755003, + "grad_norm": 0.23515115678310394, + "learning_rate": 5.518776955732788e-06, + "loss": 0.0996, + "step": 45186 + }, + { + "epoch": 0.8059608318767167, + "grad_norm": 0.2763703167438507, + "learning_rate": 5.51780150755084e-06, + "loss": 0.1374, + "step": 45187 + }, + { + "epoch": 0.8059786679984304, + "grad_norm": 0.33515024185180664, + "learning_rate": 5.516826134888631e-06, + "loss": 0.1016, + "step": 45188 + }, + { + "epoch": 0.8059965041201441, + "grad_norm": 0.36895760893821716, + "learning_rate": 5.515850837749925e-06, + "loss": 0.1364, + "step": 45189 + }, + { + "epoch": 0.8060143402418578, + "grad_norm": 0.2607574164867401, + "learning_rate": 5.514875616138518e-06, + "loss": 0.1351, + "step": 45190 + }, + { + "epoch": 0.8060321763635715, + "grad_norm": 0.332884281873703, + "learning_rate": 5.5139004700581785e-06, + "loss": 0.1566, + "step": 45191 + }, + { + "epoch": 0.8060500124852852, + "grad_norm": 0.30799686908721924, + "learning_rate": 5.512925399512694e-06, + "loss": 0.1178, + "step": 45192 + }, + { + "epoch": 0.8060678486069989, + "grad_norm": 0.25493577122688293, + "learning_rate": 5.511950404505845e-06, + "loss": 0.1161, + "step": 45193 + }, + { + "epoch": 0.8060856847287126, + "grad_norm": 0.31678780913352966, + "learning_rate": 5.510975485041403e-06, + "loss": 0.1276, + "step": 45194 + }, + { + "epoch": 0.8061035208504262, + "grad_norm": 0.2563844621181488, + "learning_rate": 5.510000641123153e-06, + "loss": 0.0574, + "step": 45195 + }, + { + "epoch": 0.8061213569721399, + "grad_norm": 0.31718015670776367, + "learning_rate": 5.509025872754866e-06, + "loss": 0.116, + "step": 45196 + }, + { + "epoch": 0.8061391930938536, + "grad_norm": 0.2768203914165497, + "learning_rate": 5.508051179940335e-06, + "loss": 0.0763, + "step": 45197 + }, + { + "epoch": 0.8061570292155674, + "grad_norm": 0.36266446113586426, + "learning_rate": 5.507076562683327e-06, + "loss": 0.083, + "step": 45198 + }, + { + "epoch": 0.8061748653372811, + "grad_norm": 0.29332104325294495, + "learning_rate": 5.506102020987625e-06, + "loss": 0.0915, + "step": 45199 + }, + { + "epoch": 0.8061927014589948, + "grad_norm": 0.2650769054889679, + "learning_rate": 5.5051275548569956e-06, + "loss": 0.1006, + "step": 45200 + }, + { + "epoch": 0.8062105375807085, + "grad_norm": 0.3163204789161682, + "learning_rate": 5.504153164295234e-06, + "loss": 0.1396, + "step": 45201 + }, + { + "epoch": 0.8062283737024222, + "grad_norm": 0.2902640700340271, + "learning_rate": 5.503178849306107e-06, + "loss": 0.1449, + "step": 45202 + }, + { + "epoch": 0.8062462098241359, + "grad_norm": 0.20983174443244934, + "learning_rate": 5.502204609893388e-06, + "loss": 0.0708, + "step": 45203 + }, + { + "epoch": 0.8062640459458496, + "grad_norm": 0.48085594177246094, + "learning_rate": 5.501230446060862e-06, + "loss": 0.0937, + "step": 45204 + }, + { + "epoch": 0.8062818820675632, + "grad_norm": 0.35073500871658325, + "learning_rate": 5.5002563578122975e-06, + "loss": 0.1547, + "step": 45205 + }, + { + "epoch": 0.8062997181892769, + "grad_norm": 0.32338613271713257, + "learning_rate": 5.49928234515148e-06, + "loss": 0.1266, + "step": 45206 + }, + { + "epoch": 0.8063175543109906, + "grad_norm": 0.27109187841415405, + "learning_rate": 5.498308408082179e-06, + "loss": 0.1025, + "step": 45207 + }, + { + "epoch": 0.8063353904327043, + "grad_norm": 0.3133688271045685, + "learning_rate": 5.497334546608171e-06, + "loss": 0.1282, + "step": 45208 + }, + { + "epoch": 0.806353226554418, + "grad_norm": 0.3348323106765747, + "learning_rate": 5.496360760733221e-06, + "loss": 0.0999, + "step": 45209 + }, + { + "epoch": 0.8063710626761317, + "grad_norm": 0.2402358502149582, + "learning_rate": 5.495387050461123e-06, + "loss": 0.1361, + "step": 45210 + }, + { + "epoch": 0.8063888987978454, + "grad_norm": 0.29074907302856445, + "learning_rate": 5.49441341579564e-06, + "loss": 0.1058, + "step": 45211 + }, + { + "epoch": 0.806406734919559, + "grad_norm": 0.269368052482605, + "learning_rate": 5.4934398567405486e-06, + "loss": 0.1044, + "step": 45212 + }, + { + "epoch": 0.8064245710412727, + "grad_norm": 0.22036467492580414, + "learning_rate": 5.492466373299615e-06, + "loss": 0.0945, + "step": 45213 + }, + { + "epoch": 0.8064424071629864, + "grad_norm": 0.28387391567230225, + "learning_rate": 5.491492965476624e-06, + "loss": 0.1768, + "step": 45214 + }, + { + "epoch": 0.8064602432847002, + "grad_norm": 0.2515878975391388, + "learning_rate": 5.490519633275348e-06, + "loss": 0.1027, + "step": 45215 + }, + { + "epoch": 0.8064780794064139, + "grad_norm": 0.274103581905365, + "learning_rate": 5.489546376699548e-06, + "loss": 0.1066, + "step": 45216 + }, + { + "epoch": 0.8064959155281276, + "grad_norm": 0.29180335998535156, + "learning_rate": 5.48857319575301e-06, + "loss": 0.0624, + "step": 45217 + }, + { + "epoch": 0.8065137516498413, + "grad_norm": 0.3230469822883606, + "learning_rate": 5.487600090439496e-06, + "loss": 0.1331, + "step": 45218 + }, + { + "epoch": 0.806531587771555, + "grad_norm": 0.1601676493883133, + "learning_rate": 5.486627060762789e-06, + "loss": 0.0728, + "step": 45219 + }, + { + "epoch": 0.8065494238932687, + "grad_norm": 0.31972986459732056, + "learning_rate": 5.485654106726657e-06, + "loss": 0.09, + "step": 45220 + }, + { + "epoch": 0.8065672600149824, + "grad_norm": 0.3033163547515869, + "learning_rate": 5.484681228334867e-06, + "loss": 0.0946, + "step": 45221 + }, + { + "epoch": 0.806585096136696, + "grad_norm": 0.3145233988761902, + "learning_rate": 5.483708425591188e-06, + "loss": 0.1253, + "step": 45222 + }, + { + "epoch": 0.8066029322584097, + "grad_norm": 0.327591210603714, + "learning_rate": 5.482735698499403e-06, + "loss": 0.1036, + "step": 45223 + }, + { + "epoch": 0.8066207683801234, + "grad_norm": 0.28029635548591614, + "learning_rate": 5.4817630470632734e-06, + "loss": 0.1382, + "step": 45224 + }, + { + "epoch": 0.8066386045018371, + "grad_norm": 0.39999592304229736, + "learning_rate": 5.480790471286573e-06, + "loss": 0.141, + "step": 45225 + }, + { + "epoch": 0.8066564406235508, + "grad_norm": 0.2242717742919922, + "learning_rate": 5.4798179711730655e-06, + "loss": 0.0792, + "step": 45226 + }, + { + "epoch": 0.8066742767452645, + "grad_norm": 0.3036152720451355, + "learning_rate": 5.47884554672653e-06, + "loss": 0.1648, + "step": 45227 + }, + { + "epoch": 0.8066921128669782, + "grad_norm": 0.216957688331604, + "learning_rate": 5.477873197950733e-06, + "loss": 0.0882, + "step": 45228 + }, + { + "epoch": 0.8067099489886919, + "grad_norm": 0.2038707137107849, + "learning_rate": 5.47690092484944e-06, + "loss": 0.0897, + "step": 45229 + }, + { + "epoch": 0.8067277851104055, + "grad_norm": 0.23714205622673035, + "learning_rate": 5.4759287274264185e-06, + "loss": 0.1102, + "step": 45230 + }, + { + "epoch": 0.8067456212321192, + "grad_norm": 0.27829429507255554, + "learning_rate": 5.474956605685444e-06, + "loss": 0.0913, + "step": 45231 + }, + { + "epoch": 0.806763457353833, + "grad_norm": 0.24379919469356537, + "learning_rate": 5.473984559630277e-06, + "loss": 0.083, + "step": 45232 + }, + { + "epoch": 0.8067812934755467, + "grad_norm": 0.23219719529151917, + "learning_rate": 5.4730125892646976e-06, + "loss": 0.0956, + "step": 45233 + }, + { + "epoch": 0.8067991295972604, + "grad_norm": 0.2861798405647278, + "learning_rate": 5.472040694592465e-06, + "loss": 0.1157, + "step": 45234 + }, + { + "epoch": 0.8068169657189741, + "grad_norm": 0.2962949872016907, + "learning_rate": 5.471068875617339e-06, + "loss": 0.1185, + "step": 45235 + }, + { + "epoch": 0.8068348018406878, + "grad_norm": 0.2821102738380432, + "learning_rate": 5.470097132343102e-06, + "loss": 0.1205, + "step": 45236 + }, + { + "epoch": 0.8068526379624015, + "grad_norm": 0.2512767016887665, + "learning_rate": 5.469125464773514e-06, + "loss": 0.0966, + "step": 45237 + }, + { + "epoch": 0.8068704740841152, + "grad_norm": 0.23213151097297668, + "learning_rate": 5.468153872912343e-06, + "loss": 0.0818, + "step": 45238 + }, + { + "epoch": 0.8068883102058289, + "grad_norm": 0.40025150775909424, + "learning_rate": 5.467182356763345e-06, + "loss": 0.14, + "step": 45239 + }, + { + "epoch": 0.8069061463275425, + "grad_norm": 0.28509029746055603, + "learning_rate": 5.466210916330303e-06, + "loss": 0.0973, + "step": 45240 + }, + { + "epoch": 0.8069239824492562, + "grad_norm": 0.2456107884645462, + "learning_rate": 5.465239551616974e-06, + "loss": 0.1296, + "step": 45241 + }, + { + "epoch": 0.8069418185709699, + "grad_norm": 0.23805761337280273, + "learning_rate": 5.464268262627123e-06, + "loss": 0.1172, + "step": 45242 + }, + { + "epoch": 0.8069596546926836, + "grad_norm": 0.26924633979797363, + "learning_rate": 5.46329704936451e-06, + "loss": 0.0835, + "step": 45243 + }, + { + "epoch": 0.8069774908143973, + "grad_norm": 0.23496891558170319, + "learning_rate": 5.462325911832911e-06, + "loss": 0.1314, + "step": 45244 + }, + { + "epoch": 0.806995326936111, + "grad_norm": 0.27983197569847107, + "learning_rate": 5.461354850036077e-06, + "loss": 0.1264, + "step": 45245 + }, + { + "epoch": 0.8070131630578247, + "grad_norm": 0.2608453929424286, + "learning_rate": 5.4603838639777884e-06, + "loss": 0.0947, + "step": 45246 + }, + { + "epoch": 0.8070309991795384, + "grad_norm": 0.29466572403907776, + "learning_rate": 5.459412953661803e-06, + "loss": 0.0936, + "step": 45247 + }, + { + "epoch": 0.807048835301252, + "grad_norm": 0.28669387102127075, + "learning_rate": 5.458442119091872e-06, + "loss": 0.1141, + "step": 45248 + }, + { + "epoch": 0.8070666714229658, + "grad_norm": 0.2387753576040268, + "learning_rate": 5.457471360271777e-06, + "loss": 0.0931, + "step": 45249 + }, + { + "epoch": 0.8070845075446795, + "grad_norm": 0.35160550475120544, + "learning_rate": 5.45650067720527e-06, + "loss": 0.1409, + "step": 45250 + }, + { + "epoch": 0.8071023436663932, + "grad_norm": 0.28481483459472656, + "learning_rate": 5.45553006989612e-06, + "loss": 0.0896, + "step": 45251 + }, + { + "epoch": 0.8071201797881069, + "grad_norm": 0.33091050386428833, + "learning_rate": 5.454559538348078e-06, + "loss": 0.1419, + "step": 45252 + }, + { + "epoch": 0.8071380159098206, + "grad_norm": 0.27689868211746216, + "learning_rate": 5.453589082564919e-06, + "loss": 0.0966, + "step": 45253 + }, + { + "epoch": 0.8071558520315343, + "grad_norm": 0.2798336148262024, + "learning_rate": 5.452618702550402e-06, + "loss": 0.0675, + "step": 45254 + }, + { + "epoch": 0.807173688153248, + "grad_norm": 0.3700137138366699, + "learning_rate": 5.4516483983082845e-06, + "loss": 0.1241, + "step": 45255 + }, + { + "epoch": 0.8071915242749617, + "grad_norm": 0.38202112913131714, + "learning_rate": 5.450678169842324e-06, + "loss": 0.1495, + "step": 45256 + }, + { + "epoch": 0.8072093603966753, + "grad_norm": 0.36776942014694214, + "learning_rate": 5.449708017156294e-06, + "loss": 0.1109, + "step": 45257 + }, + { + "epoch": 0.807227196518389, + "grad_norm": 0.23688314855098724, + "learning_rate": 5.448737940253948e-06, + "loss": 0.0926, + "step": 45258 + }, + { + "epoch": 0.8072450326401027, + "grad_norm": 0.2886868417263031, + "learning_rate": 5.447767939139039e-06, + "loss": 0.1216, + "step": 45259 + }, + { + "epoch": 0.8072628687618164, + "grad_norm": 0.2926797866821289, + "learning_rate": 5.446798013815343e-06, + "loss": 0.1306, + "step": 45260 + }, + { + "epoch": 0.8072807048835301, + "grad_norm": 0.27875691652297974, + "learning_rate": 5.4458281642866035e-06, + "loss": 0.1107, + "step": 45261 + }, + { + "epoch": 0.8072985410052438, + "grad_norm": 0.2355824112892151, + "learning_rate": 5.444858390556596e-06, + "loss": 0.1308, + "step": 45262 + }, + { + "epoch": 0.8073163771269575, + "grad_norm": 0.29015809297561646, + "learning_rate": 5.443888692629071e-06, + "loss": 0.1068, + "step": 45263 + }, + { + "epoch": 0.8073342132486712, + "grad_norm": 0.3303722143173218, + "learning_rate": 5.442919070507788e-06, + "loss": 0.1743, + "step": 45264 + }, + { + "epoch": 0.8073520493703848, + "grad_norm": 0.22878235578536987, + "learning_rate": 5.441949524196496e-06, + "loss": 0.1249, + "step": 45265 + }, + { + "epoch": 0.8073698854920986, + "grad_norm": 0.20490944385528564, + "learning_rate": 5.440980053698971e-06, + "loss": 0.1136, + "step": 45266 + }, + { + "epoch": 0.8073877216138123, + "grad_norm": 0.44089174270629883, + "learning_rate": 5.440010659018965e-06, + "loss": 0.0872, + "step": 45267 + }, + { + "epoch": 0.807405557735526, + "grad_norm": 0.3440295457839966, + "learning_rate": 5.439041340160231e-06, + "loss": 0.1273, + "step": 45268 + }, + { + "epoch": 0.8074233938572397, + "grad_norm": 0.3534582853317261, + "learning_rate": 5.438072097126523e-06, + "loss": 0.1736, + "step": 45269 + }, + { + "epoch": 0.8074412299789534, + "grad_norm": 0.229813814163208, + "learning_rate": 5.437102929921612e-06, + "loss": 0.0702, + "step": 45270 + }, + { + "epoch": 0.8074590661006671, + "grad_norm": 0.2391553819179535, + "learning_rate": 5.436133838549243e-06, + "loss": 0.0805, + "step": 45271 + }, + { + "epoch": 0.8074769022223808, + "grad_norm": 0.2872280478477478, + "learning_rate": 5.435164823013172e-06, + "loss": 0.1754, + "step": 45272 + }, + { + "epoch": 0.8074947383440945, + "grad_norm": 0.25781434774398804, + "learning_rate": 5.434195883317167e-06, + "loss": 0.1228, + "step": 45273 + }, + { + "epoch": 0.8075125744658082, + "grad_norm": 0.23661333322525024, + "learning_rate": 5.43322701946497e-06, + "loss": 0.1015, + "step": 45274 + }, + { + "epoch": 0.8075304105875218, + "grad_norm": 0.3195432722568512, + "learning_rate": 5.432258231460349e-06, + "loss": 0.127, + "step": 45275 + }, + { + "epoch": 0.8075482467092355, + "grad_norm": 0.33412986993789673, + "learning_rate": 5.4312895193070534e-06, + "loss": 0.1047, + "step": 45276 + }, + { + "epoch": 0.8075660828309492, + "grad_norm": 0.2995836138725281, + "learning_rate": 5.430320883008841e-06, + "loss": 0.1401, + "step": 45277 + }, + { + "epoch": 0.8075839189526629, + "grad_norm": 0.3090651035308838, + "learning_rate": 5.429352322569453e-06, + "loss": 0.0981, + "step": 45278 + }, + { + "epoch": 0.8076017550743766, + "grad_norm": 0.1910848319530487, + "learning_rate": 5.428383837992665e-06, + "loss": 0.0838, + "step": 45279 + }, + { + "epoch": 0.8076195911960903, + "grad_norm": 0.29134660959243774, + "learning_rate": 5.42741542928222e-06, + "loss": 0.1018, + "step": 45280 + }, + { + "epoch": 0.807637427317804, + "grad_norm": 0.23909783363342285, + "learning_rate": 5.4264470964418725e-06, + "loss": 0.1075, + "step": 45281 + }, + { + "epoch": 0.8076552634395177, + "grad_norm": 0.3901885747909546, + "learning_rate": 5.42547883947537e-06, + "loss": 0.1206, + "step": 45282 + }, + { + "epoch": 0.8076730995612315, + "grad_norm": 0.23201575875282288, + "learning_rate": 5.424510658386478e-06, + "loss": 0.0905, + "step": 45283 + }, + { + "epoch": 0.8076909356829451, + "grad_norm": 0.3962501883506775, + "learning_rate": 5.423542553178945e-06, + "loss": 0.1488, + "step": 45284 + }, + { + "epoch": 0.8077087718046588, + "grad_norm": 0.27960798144340515, + "learning_rate": 5.422574523856524e-06, + "loss": 0.1044, + "step": 45285 + }, + { + "epoch": 0.8077266079263725, + "grad_norm": 0.26997601985931396, + "learning_rate": 5.421606570422957e-06, + "loss": 0.0945, + "step": 45286 + }, + { + "epoch": 0.8077444440480862, + "grad_norm": 0.31664034724235535, + "learning_rate": 5.420638692882007e-06, + "loss": 0.1457, + "step": 45287 + }, + { + "epoch": 0.8077622801697999, + "grad_norm": 0.18644289672374725, + "learning_rate": 5.41967089123743e-06, + "loss": 0.0652, + "step": 45288 + }, + { + "epoch": 0.8077801162915136, + "grad_norm": 0.2650397717952728, + "learning_rate": 5.41870316549297e-06, + "loss": 0.0951, + "step": 45289 + }, + { + "epoch": 0.8077979524132273, + "grad_norm": 0.33292874693870544, + "learning_rate": 5.4177355156523805e-06, + "loss": 0.1262, + "step": 45290 + }, + { + "epoch": 0.807815788534941, + "grad_norm": 0.2799188196659088, + "learning_rate": 5.4167679417194054e-06, + "loss": 0.1109, + "step": 45291 + }, + { + "epoch": 0.8078336246566546, + "grad_norm": 0.19513505697250366, + "learning_rate": 5.415800443697808e-06, + "loss": 0.0601, + "step": 45292 + }, + { + "epoch": 0.8078514607783683, + "grad_norm": 0.33457663655281067, + "learning_rate": 5.414833021591334e-06, + "loss": 0.0982, + "step": 45293 + }, + { + "epoch": 0.807869296900082, + "grad_norm": 0.28720855712890625, + "learning_rate": 5.413865675403729e-06, + "loss": 0.1105, + "step": 45294 + }, + { + "epoch": 0.8078871330217957, + "grad_norm": 0.2925865650177002, + "learning_rate": 5.412898405138738e-06, + "loss": 0.1344, + "step": 45295 + }, + { + "epoch": 0.8079049691435094, + "grad_norm": 0.43825265765190125, + "learning_rate": 5.411931210800128e-06, + "loss": 0.1363, + "step": 45296 + }, + { + "epoch": 0.8079228052652231, + "grad_norm": 0.2749407887458801, + "learning_rate": 5.410964092391638e-06, + "loss": 0.1115, + "step": 45297 + }, + { + "epoch": 0.8079406413869368, + "grad_norm": 0.28501713275909424, + "learning_rate": 5.409997049917015e-06, + "loss": 0.1518, + "step": 45298 + }, + { + "epoch": 0.8079584775086506, + "grad_norm": 0.24590031802654266, + "learning_rate": 5.409030083380001e-06, + "loss": 0.1574, + "step": 45299 + }, + { + "epoch": 0.8079763136303643, + "grad_norm": 0.2159082591533661, + "learning_rate": 5.408063192784363e-06, + "loss": 0.1042, + "step": 45300 + }, + { + "epoch": 0.807994149752078, + "grad_norm": 0.22653235495090485, + "learning_rate": 5.4070963781338325e-06, + "loss": 0.1135, + "step": 45301 + }, + { + "epoch": 0.8080119858737916, + "grad_norm": 0.25408491492271423, + "learning_rate": 5.40612963943217e-06, + "loss": 0.1055, + "step": 45302 + }, + { + "epoch": 0.8080298219955053, + "grad_norm": 0.23485468327999115, + "learning_rate": 5.405162976683115e-06, + "loss": 0.0987, + "step": 45303 + }, + { + "epoch": 0.808047658117219, + "grad_norm": 0.2339015007019043, + "learning_rate": 5.404196389890409e-06, + "loss": 0.0752, + "step": 45304 + }, + { + "epoch": 0.8080654942389327, + "grad_norm": 0.23099899291992188, + "learning_rate": 5.403229879057814e-06, + "loss": 0.1144, + "step": 45305 + }, + { + "epoch": 0.8080833303606464, + "grad_norm": 0.25047290325164795, + "learning_rate": 5.402263444189068e-06, + "loss": 0.103, + "step": 45306 + }, + { + "epoch": 0.8081011664823601, + "grad_norm": 0.20365071296691895, + "learning_rate": 5.401297085287918e-06, + "loss": 0.0788, + "step": 45307 + }, + { + "epoch": 0.8081190026040738, + "grad_norm": 0.28805556893348694, + "learning_rate": 5.400330802358103e-06, + "loss": 0.1513, + "step": 45308 + }, + { + "epoch": 0.8081368387257875, + "grad_norm": 0.2784179151058197, + "learning_rate": 5.399364595403381e-06, + "loss": 0.0606, + "step": 45309 + }, + { + "epoch": 0.8081546748475011, + "grad_norm": 0.23879724740982056, + "learning_rate": 5.398398464427493e-06, + "loss": 0.0809, + "step": 45310 + }, + { + "epoch": 0.8081725109692148, + "grad_norm": 0.21566037833690643, + "learning_rate": 5.3974324094341835e-06, + "loss": 0.1426, + "step": 45311 + }, + { + "epoch": 0.8081903470909285, + "grad_norm": 0.27589720487594604, + "learning_rate": 5.396466430427194e-06, + "loss": 0.0978, + "step": 45312 + }, + { + "epoch": 0.8082081832126422, + "grad_norm": 0.3204553723335266, + "learning_rate": 5.395500527410266e-06, + "loss": 0.1047, + "step": 45313 + }, + { + "epoch": 0.8082260193343559, + "grad_norm": 0.19571274518966675, + "learning_rate": 5.394534700387149e-06, + "loss": 0.0942, + "step": 45314 + }, + { + "epoch": 0.8082438554560696, + "grad_norm": 0.2984962463378906, + "learning_rate": 5.3935689493615935e-06, + "loss": 0.1027, + "step": 45315 + }, + { + "epoch": 0.8082616915777834, + "grad_norm": 0.3122525215148926, + "learning_rate": 5.392603274337338e-06, + "loss": 0.1513, + "step": 45316 + }, + { + "epoch": 0.8082795276994971, + "grad_norm": 0.28807875514030457, + "learning_rate": 5.391637675318117e-06, + "loss": 0.0864, + "step": 45317 + }, + { + "epoch": 0.8082973638212108, + "grad_norm": 0.27992531657218933, + "learning_rate": 5.390672152307688e-06, + "loss": 0.129, + "step": 45318 + }, + { + "epoch": 0.8083151999429244, + "grad_norm": 0.252861887216568, + "learning_rate": 5.389706705309786e-06, + "loss": 0.1353, + "step": 45319 + }, + { + "epoch": 0.8083330360646381, + "grad_norm": 0.25114619731903076, + "learning_rate": 5.388741334328154e-06, + "loss": 0.1115, + "step": 45320 + }, + { + "epoch": 0.8083508721863518, + "grad_norm": 0.25021761655807495, + "learning_rate": 5.387776039366527e-06, + "loss": 0.1112, + "step": 45321 + }, + { + "epoch": 0.8083687083080655, + "grad_norm": 0.30372557044029236, + "learning_rate": 5.3868108204286585e-06, + "loss": 0.0697, + "step": 45322 + }, + { + "epoch": 0.8083865444297792, + "grad_norm": 0.22994790971279144, + "learning_rate": 5.385845677518289e-06, + "loss": 0.1065, + "step": 45323 + }, + { + "epoch": 0.8084043805514929, + "grad_norm": 0.25752994418144226, + "learning_rate": 5.384880610639156e-06, + "loss": 0.096, + "step": 45324 + }, + { + "epoch": 0.8084222166732066, + "grad_norm": 0.31726139783859253, + "learning_rate": 5.383915619794999e-06, + "loss": 0.0661, + "step": 45325 + }, + { + "epoch": 0.8084400527949203, + "grad_norm": 0.33966130018234253, + "learning_rate": 5.382950704989554e-06, + "loss": 0.1742, + "step": 45326 + }, + { + "epoch": 0.808457888916634, + "grad_norm": 0.33929044008255005, + "learning_rate": 5.381985866226574e-06, + "loss": 0.1721, + "step": 45327 + }, + { + "epoch": 0.8084757250383476, + "grad_norm": 0.276747465133667, + "learning_rate": 5.381021103509787e-06, + "loss": 0.1305, + "step": 45328 + }, + { + "epoch": 0.8084935611600613, + "grad_norm": 0.33432528376579285, + "learning_rate": 5.380056416842943e-06, + "loss": 0.1515, + "step": 45329 + }, + { + "epoch": 0.808511397281775, + "grad_norm": 0.26534348726272583, + "learning_rate": 5.379091806229772e-06, + "loss": 0.1102, + "step": 45330 + }, + { + "epoch": 0.8085292334034887, + "grad_norm": 0.3019811809062958, + "learning_rate": 5.3781272716740245e-06, + "loss": 0.0961, + "step": 45331 + }, + { + "epoch": 0.8085470695252024, + "grad_norm": 0.25023484230041504, + "learning_rate": 5.377162813179434e-06, + "loss": 0.0622, + "step": 45332 + }, + { + "epoch": 0.8085649056469162, + "grad_norm": 0.2626052796840668, + "learning_rate": 5.376198430749735e-06, + "loss": 0.0995, + "step": 45333 + }, + { + "epoch": 0.8085827417686299, + "grad_norm": 0.27859359979629517, + "learning_rate": 5.375234124388665e-06, + "loss": 0.0833, + "step": 45334 + }, + { + "epoch": 0.8086005778903436, + "grad_norm": 0.35295358300209045, + "learning_rate": 5.37426989409997e-06, + "loss": 0.1261, + "step": 45335 + }, + { + "epoch": 0.8086184140120573, + "grad_norm": 0.2497841715812683, + "learning_rate": 5.373305739887385e-06, + "loss": 0.1603, + "step": 45336 + }, + { + "epoch": 0.8086362501337709, + "grad_norm": 0.33090224862098694, + "learning_rate": 5.372341661754646e-06, + "loss": 0.0806, + "step": 45337 + }, + { + "epoch": 0.8086540862554846, + "grad_norm": 0.29360634088516235, + "learning_rate": 5.371377659705487e-06, + "loss": 0.1249, + "step": 45338 + }, + { + "epoch": 0.8086719223771983, + "grad_norm": 0.2407098114490509, + "learning_rate": 5.3704137337436424e-06, + "loss": 0.1363, + "step": 45339 + }, + { + "epoch": 0.808689758498912, + "grad_norm": 0.23742075264453888, + "learning_rate": 5.369449883872863e-06, + "loss": 0.068, + "step": 45340 + }, + { + "epoch": 0.8087075946206257, + "grad_norm": 0.3116680383682251, + "learning_rate": 5.368486110096874e-06, + "loss": 0.15, + "step": 45341 + }, + { + "epoch": 0.8087254307423394, + "grad_norm": 0.22685708105564117, + "learning_rate": 5.367522412419407e-06, + "loss": 0.0923, + "step": 45342 + }, + { + "epoch": 0.8087432668640531, + "grad_norm": 0.35855111479759216, + "learning_rate": 5.366558790844203e-06, + "loss": 0.1252, + "step": 45343 + }, + { + "epoch": 0.8087611029857668, + "grad_norm": 0.2319633960723877, + "learning_rate": 5.365595245375007e-06, + "loss": 0.102, + "step": 45344 + }, + { + "epoch": 0.8087789391074804, + "grad_norm": 0.24426911771297455, + "learning_rate": 5.364631776015544e-06, + "loss": 0.1571, + "step": 45345 + }, + { + "epoch": 0.8087967752291941, + "grad_norm": 0.23831485211849213, + "learning_rate": 5.363668382769551e-06, + "loss": 0.0956, + "step": 45346 + }, + { + "epoch": 0.8088146113509078, + "grad_norm": 0.23909638822078705, + "learning_rate": 5.362705065640755e-06, + "loss": 0.1011, + "step": 45347 + }, + { + "epoch": 0.8088324474726215, + "grad_norm": 0.2797805070877075, + "learning_rate": 5.361741824632901e-06, + "loss": 0.151, + "step": 45348 + }, + { + "epoch": 0.8088502835943352, + "grad_norm": 0.27477607131004333, + "learning_rate": 5.360778659749721e-06, + "loss": 0.0876, + "step": 45349 + }, + { + "epoch": 0.808868119716049, + "grad_norm": 0.3827594518661499, + "learning_rate": 5.359815570994945e-06, + "loss": 0.1061, + "step": 45350 + }, + { + "epoch": 0.8088859558377627, + "grad_norm": 0.3047603964805603, + "learning_rate": 5.358852558372307e-06, + "loss": 0.108, + "step": 45351 + }, + { + "epoch": 0.8089037919594764, + "grad_norm": 0.270510733127594, + "learning_rate": 5.357889621885534e-06, + "loss": 0.0744, + "step": 45352 + }, + { + "epoch": 0.8089216280811901, + "grad_norm": 0.3078206479549408, + "learning_rate": 5.356926761538372e-06, + "loss": 0.1057, + "step": 45353 + }, + { + "epoch": 0.8089394642029037, + "grad_norm": 0.20678578317165375, + "learning_rate": 5.355963977334547e-06, + "loss": 0.1026, + "step": 45354 + }, + { + "epoch": 0.8089573003246174, + "grad_norm": 0.31052365899086, + "learning_rate": 5.355001269277784e-06, + "loss": 0.1168, + "step": 45355 + }, + { + "epoch": 0.8089751364463311, + "grad_norm": 0.27801328897476196, + "learning_rate": 5.354038637371827e-06, + "loss": 0.1204, + "step": 45356 + }, + { + "epoch": 0.8089929725680448, + "grad_norm": 0.3298667073249817, + "learning_rate": 5.353076081620395e-06, + "loss": 0.1019, + "step": 45357 + }, + { + "epoch": 0.8090108086897585, + "grad_norm": 0.28805720806121826, + "learning_rate": 5.352113602027231e-06, + "loss": 0.0737, + "step": 45358 + }, + { + "epoch": 0.8090286448114722, + "grad_norm": 0.26523420214653015, + "learning_rate": 5.351151198596063e-06, + "loss": 0.1193, + "step": 45359 + }, + { + "epoch": 0.8090464809331859, + "grad_norm": 0.27644097805023193, + "learning_rate": 5.350188871330614e-06, + "loss": 0.1108, + "step": 45360 + }, + { + "epoch": 0.8090643170548996, + "grad_norm": 0.24303224682807922, + "learning_rate": 5.349226620234624e-06, + "loss": 0.0607, + "step": 45361 + }, + { + "epoch": 0.8090821531766133, + "grad_norm": 0.407665878534317, + "learning_rate": 5.348264445311818e-06, + "loss": 0.1127, + "step": 45362 + }, + { + "epoch": 0.8090999892983269, + "grad_norm": 0.2172544300556183, + "learning_rate": 5.34730234656593e-06, + "loss": 0.1218, + "step": 45363 + }, + { + "epoch": 0.8091178254200406, + "grad_norm": 0.2502395808696747, + "learning_rate": 5.346340324000681e-06, + "loss": 0.1047, + "step": 45364 + }, + { + "epoch": 0.8091356615417543, + "grad_norm": 0.2945944368839264, + "learning_rate": 5.345378377619803e-06, + "loss": 0.1256, + "step": 45365 + }, + { + "epoch": 0.809153497663468, + "grad_norm": 0.3464438021183014, + "learning_rate": 5.344416507427033e-06, + "loss": 0.1232, + "step": 45366 + }, + { + "epoch": 0.8091713337851818, + "grad_norm": 0.2606756091117859, + "learning_rate": 5.343454713426091e-06, + "loss": 0.1027, + "step": 45367 + }, + { + "epoch": 0.8091891699068955, + "grad_norm": 0.3260132074356079, + "learning_rate": 5.34249299562071e-06, + "loss": 0.0947, + "step": 45368 + }, + { + "epoch": 0.8092070060286092, + "grad_norm": 0.23424182832241058, + "learning_rate": 5.341531354014606e-06, + "loss": 0.112, + "step": 45369 + }, + { + "epoch": 0.8092248421503229, + "grad_norm": 0.2808634042739868, + "learning_rate": 5.340569788611518e-06, + "loss": 0.1647, + "step": 45370 + }, + { + "epoch": 0.8092426782720366, + "grad_norm": 0.3497547507286072, + "learning_rate": 5.33960829941518e-06, + "loss": 0.1384, + "step": 45371 + }, + { + "epoch": 0.8092605143937502, + "grad_norm": 0.3503676652908325, + "learning_rate": 5.338646886429308e-06, + "loss": 0.1763, + "step": 45372 + }, + { + "epoch": 0.8092783505154639, + "grad_norm": 0.39602136611938477, + "learning_rate": 5.337685549657626e-06, + "loss": 0.1178, + "step": 45373 + }, + { + "epoch": 0.8092961866371776, + "grad_norm": 0.23481430113315582, + "learning_rate": 5.336724289103873e-06, + "loss": 0.1499, + "step": 45374 + }, + { + "epoch": 0.8093140227588913, + "grad_norm": 0.265619158744812, + "learning_rate": 5.335763104771768e-06, + "loss": 0.0962, + "step": 45375 + }, + { + "epoch": 0.809331858880605, + "grad_norm": 0.3164747655391693, + "learning_rate": 5.334801996665037e-06, + "loss": 0.1536, + "step": 45376 + }, + { + "epoch": 0.8093496950023187, + "grad_norm": 0.3203544020652771, + "learning_rate": 5.333840964787398e-06, + "loss": 0.1213, + "step": 45377 + }, + { + "epoch": 0.8093675311240324, + "grad_norm": 0.2461533397436142, + "learning_rate": 5.332880009142594e-06, + "loss": 0.0993, + "step": 45378 + }, + { + "epoch": 0.8093853672457461, + "grad_norm": 0.2611466944217682, + "learning_rate": 5.331919129734336e-06, + "loss": 0.0835, + "step": 45379 + }, + { + "epoch": 0.8094032033674597, + "grad_norm": 0.24052175879478455, + "learning_rate": 5.330958326566354e-06, + "loss": 0.1321, + "step": 45380 + }, + { + "epoch": 0.8094210394891734, + "grad_norm": 0.35289251804351807, + "learning_rate": 5.3299975996423695e-06, + "loss": 0.0838, + "step": 45381 + }, + { + "epoch": 0.8094388756108871, + "grad_norm": 0.343667209148407, + "learning_rate": 5.329036948966104e-06, + "loss": 0.1083, + "step": 45382 + }, + { + "epoch": 0.8094567117326008, + "grad_norm": 0.264646977186203, + "learning_rate": 5.328076374541291e-06, + "loss": 0.1264, + "step": 45383 + }, + { + "epoch": 0.8094745478543146, + "grad_norm": 0.29366007447242737, + "learning_rate": 5.327115876371641e-06, + "loss": 0.0981, + "step": 45384 + }, + { + "epoch": 0.8094923839760283, + "grad_norm": 0.2466009557247162, + "learning_rate": 5.3261554544608904e-06, + "loss": 0.1015, + "step": 45385 + }, + { + "epoch": 0.809510220097742, + "grad_norm": 0.4913146495819092, + "learning_rate": 5.3251951088127526e-06, + "loss": 0.1348, + "step": 45386 + }, + { + "epoch": 0.8095280562194557, + "grad_norm": 0.27386918663978577, + "learning_rate": 5.324234839430958e-06, + "loss": 0.1125, + "step": 45387 + }, + { + "epoch": 0.8095458923411694, + "grad_norm": 0.2521750330924988, + "learning_rate": 5.3232746463192265e-06, + "loss": 0.0946, + "step": 45388 + }, + { + "epoch": 0.809563728462883, + "grad_norm": 0.21190997958183289, + "learning_rate": 5.3223145294812785e-06, + "loss": 0.0708, + "step": 45389 + }, + { + "epoch": 0.8095815645845967, + "grad_norm": 0.24246685206890106, + "learning_rate": 5.3213544889208295e-06, + "loss": 0.118, + "step": 45390 + }, + { + "epoch": 0.8095994007063104, + "grad_norm": 0.30345553159713745, + "learning_rate": 5.320394524641614e-06, + "loss": 0.0848, + "step": 45391 + }, + { + "epoch": 0.8096172368280241, + "grad_norm": 0.27112045884132385, + "learning_rate": 5.319434636647347e-06, + "loss": 0.1116, + "step": 45392 + }, + { + "epoch": 0.8096350729497378, + "grad_norm": 0.2395063042640686, + "learning_rate": 5.318474824941747e-06, + "loss": 0.0912, + "step": 45393 + }, + { + "epoch": 0.8096529090714515, + "grad_norm": 0.4231487810611725, + "learning_rate": 5.317515089528535e-06, + "loss": 0.1551, + "step": 45394 + }, + { + "epoch": 0.8096707451931652, + "grad_norm": 0.2898949980735779, + "learning_rate": 5.31655543041143e-06, + "loss": 0.1214, + "step": 45395 + }, + { + "epoch": 0.8096885813148789, + "grad_norm": 0.29833000898361206, + "learning_rate": 5.31559584759416e-06, + "loss": 0.1317, + "step": 45396 + }, + { + "epoch": 0.8097064174365926, + "grad_norm": 0.22038598358631134, + "learning_rate": 5.314636341080431e-06, + "loss": 0.1067, + "step": 45397 + }, + { + "epoch": 0.8097242535583062, + "grad_norm": 0.38286957144737244, + "learning_rate": 5.313676910873977e-06, + "loss": 0.102, + "step": 45398 + }, + { + "epoch": 0.8097420896800199, + "grad_norm": 0.37689468264579773, + "learning_rate": 5.312717556978506e-06, + "loss": 0.1172, + "step": 45399 + }, + { + "epoch": 0.8097599258017337, + "grad_norm": 0.2364596724510193, + "learning_rate": 5.311758279397747e-06, + "loss": 0.0719, + "step": 45400 + }, + { + "epoch": 0.8097777619234474, + "grad_norm": 0.297269731760025, + "learning_rate": 5.310799078135415e-06, + "loss": 0.0741, + "step": 45401 + }, + { + "epoch": 0.8097955980451611, + "grad_norm": 0.19961655139923096, + "learning_rate": 5.309839953195222e-06, + "loss": 0.1392, + "step": 45402 + }, + { + "epoch": 0.8098134341668748, + "grad_norm": 0.36211714148521423, + "learning_rate": 5.308880904580887e-06, + "loss": 0.137, + "step": 45403 + }, + { + "epoch": 0.8098312702885885, + "grad_norm": 0.3193663954734802, + "learning_rate": 5.307921932296136e-06, + "loss": 0.1574, + "step": 45404 + }, + { + "epoch": 0.8098491064103022, + "grad_norm": 0.2649797797203064, + "learning_rate": 5.3069630363446835e-06, + "loss": 0.1191, + "step": 45405 + }, + { + "epoch": 0.8098669425320159, + "grad_norm": 0.21580186486244202, + "learning_rate": 5.306004216730243e-06, + "loss": 0.1256, + "step": 45406 + }, + { + "epoch": 0.8098847786537295, + "grad_norm": 0.35072061419487, + "learning_rate": 5.30504547345653e-06, + "loss": 0.1801, + "step": 45407 + }, + { + "epoch": 0.8099026147754432, + "grad_norm": 0.2494833618402481, + "learning_rate": 5.304086806527259e-06, + "loss": 0.1623, + "step": 45408 + }, + { + "epoch": 0.8099204508971569, + "grad_norm": 0.3665267825126648, + "learning_rate": 5.303128215946154e-06, + "loss": 0.1888, + "step": 45409 + }, + { + "epoch": 0.8099382870188706, + "grad_norm": 0.22827298939228058, + "learning_rate": 5.30216970171693e-06, + "loss": 0.1056, + "step": 45410 + }, + { + "epoch": 0.8099561231405843, + "grad_norm": 0.28469589352607727, + "learning_rate": 5.301211263843292e-06, + "loss": 0.0809, + "step": 45411 + }, + { + "epoch": 0.809973959262298, + "grad_norm": 0.24700549244880676, + "learning_rate": 5.300252902328967e-06, + "loss": 0.1034, + "step": 45412 + }, + { + "epoch": 0.8099917953840117, + "grad_norm": 0.307685524225235, + "learning_rate": 5.299294617177664e-06, + "loss": 0.0926, + "step": 45413 + }, + { + "epoch": 0.8100096315057254, + "grad_norm": 0.36144816875457764, + "learning_rate": 5.298336408393101e-06, + "loss": 0.1243, + "step": 45414 + }, + { + "epoch": 0.810027467627439, + "grad_norm": 0.37847742438316345, + "learning_rate": 5.297378275978995e-06, + "loss": 0.1035, + "step": 45415 + }, + { + "epoch": 0.8100453037491527, + "grad_norm": 0.19188730418682098, + "learning_rate": 5.2964202199390465e-06, + "loss": 0.0844, + "step": 45416 + }, + { + "epoch": 0.8100631398708665, + "grad_norm": 0.30874645709991455, + "learning_rate": 5.295462240276988e-06, + "loss": 0.091, + "step": 45417 + }, + { + "epoch": 0.8100809759925802, + "grad_norm": 0.3077942728996277, + "learning_rate": 5.294504336996523e-06, + "loss": 0.1251, + "step": 45418 + }, + { + "epoch": 0.8100988121142939, + "grad_norm": 0.2177252471446991, + "learning_rate": 5.293546510101363e-06, + "loss": 0.0859, + "step": 45419 + }, + { + "epoch": 0.8101166482360076, + "grad_norm": 0.266027569770813, + "learning_rate": 5.292588759595224e-06, + "loss": 0.1066, + "step": 45420 + }, + { + "epoch": 0.8101344843577213, + "grad_norm": 0.3502812385559082, + "learning_rate": 5.291631085481813e-06, + "loss": 0.1464, + "step": 45421 + }, + { + "epoch": 0.810152320479435, + "grad_norm": 0.2796567678451538, + "learning_rate": 5.29067348776485e-06, + "loss": 0.1476, + "step": 45422 + }, + { + "epoch": 0.8101701566011487, + "grad_norm": 0.33431828022003174, + "learning_rate": 5.2897159664480474e-06, + "loss": 0.1369, + "step": 45423 + }, + { + "epoch": 0.8101879927228623, + "grad_norm": 0.2717862129211426, + "learning_rate": 5.288758521535106e-06, + "loss": 0.1159, + "step": 45424 + }, + { + "epoch": 0.810205828844576, + "grad_norm": 0.2470252364873886, + "learning_rate": 5.28780115302975e-06, + "loss": 0.0777, + "step": 45425 + }, + { + "epoch": 0.8102236649662897, + "grad_norm": 0.4683379828929901, + "learning_rate": 5.286843860935678e-06, + "loss": 0.1295, + "step": 45426 + }, + { + "epoch": 0.8102415010880034, + "grad_norm": 0.27637097239494324, + "learning_rate": 5.285886645256616e-06, + "loss": 0.139, + "step": 45427 + }, + { + "epoch": 0.8102593372097171, + "grad_norm": 0.25384750962257385, + "learning_rate": 5.284929505996266e-06, + "loss": 0.0546, + "step": 45428 + }, + { + "epoch": 0.8102771733314308, + "grad_norm": 0.2812712490558624, + "learning_rate": 5.283972443158333e-06, + "loss": 0.106, + "step": 45429 + }, + { + "epoch": 0.8102950094531445, + "grad_norm": 0.2187226265668869, + "learning_rate": 5.283015456746537e-06, + "loss": 0.0937, + "step": 45430 + }, + { + "epoch": 0.8103128455748582, + "grad_norm": 0.22594816982746124, + "learning_rate": 5.2820585467645844e-06, + "loss": 0.0994, + "step": 45431 + }, + { + "epoch": 0.8103306816965719, + "grad_norm": 0.278167188167572, + "learning_rate": 5.281101713216183e-06, + "loss": 0.095, + "step": 45432 + }, + { + "epoch": 0.8103485178182855, + "grad_norm": 0.23320238292217255, + "learning_rate": 5.280144956105043e-06, + "loss": 0.1177, + "step": 45433 + }, + { + "epoch": 0.8103663539399993, + "grad_norm": 0.26970675587654114, + "learning_rate": 5.279188275434865e-06, + "loss": 0.0894, + "step": 45434 + }, + { + "epoch": 0.810384190061713, + "grad_norm": 0.234837144613266, + "learning_rate": 5.278231671209371e-06, + "loss": 0.1126, + "step": 45435 + }, + { + "epoch": 0.8104020261834267, + "grad_norm": 0.3310607373714447, + "learning_rate": 5.277275143432262e-06, + "loss": 0.1434, + "step": 45436 + }, + { + "epoch": 0.8104198623051404, + "grad_norm": 0.24881210923194885, + "learning_rate": 5.2763186921072465e-06, + "loss": 0.0972, + "step": 45437 + }, + { + "epoch": 0.8104376984268541, + "grad_norm": 0.378174751996994, + "learning_rate": 5.275362317238028e-06, + "loss": 0.1244, + "step": 45438 + }, + { + "epoch": 0.8104555345485678, + "grad_norm": 0.3615255355834961, + "learning_rate": 5.274406018828321e-06, + "loss": 0.0749, + "step": 45439 + }, + { + "epoch": 0.8104733706702815, + "grad_norm": 0.25198599696159363, + "learning_rate": 5.273449796881824e-06, + "loss": 0.1348, + "step": 45440 + }, + { + "epoch": 0.8104912067919952, + "grad_norm": 0.28914254903793335, + "learning_rate": 5.2724936514022575e-06, + "loss": 0.2376, + "step": 45441 + }, + { + "epoch": 0.8105090429137088, + "grad_norm": 0.21748897433280945, + "learning_rate": 5.27153758239331e-06, + "loss": 0.1057, + "step": 45442 + }, + { + "epoch": 0.8105268790354225, + "grad_norm": 0.3717272877693176, + "learning_rate": 5.270581589858703e-06, + "loss": 0.1122, + "step": 45443 + }, + { + "epoch": 0.8105447151571362, + "grad_norm": 0.200483039021492, + "learning_rate": 5.269625673802139e-06, + "loss": 0.0788, + "step": 45444 + }, + { + "epoch": 0.8105625512788499, + "grad_norm": 0.2612393796443939, + "learning_rate": 5.268669834227319e-06, + "loss": 0.0993, + "step": 45445 + }, + { + "epoch": 0.8105803874005636, + "grad_norm": 0.24470101296901703, + "learning_rate": 5.26771407113795e-06, + "loss": 0.12, + "step": 45446 + }, + { + "epoch": 0.8105982235222773, + "grad_norm": 0.302722305059433, + "learning_rate": 5.266758384537729e-06, + "loss": 0.1116, + "step": 45447 + }, + { + "epoch": 0.810616059643991, + "grad_norm": 0.24601230025291443, + "learning_rate": 5.2658027744303755e-06, + "loss": 0.1039, + "step": 45448 + }, + { + "epoch": 0.8106338957657047, + "grad_norm": 0.3152858316898346, + "learning_rate": 5.264847240819587e-06, + "loss": 0.1576, + "step": 45449 + }, + { + "epoch": 0.8106517318874183, + "grad_norm": 0.22193369269371033, + "learning_rate": 5.263891783709066e-06, + "loss": 0.0933, + "step": 45450 + }, + { + "epoch": 0.8106695680091321, + "grad_norm": 0.2663132846355438, + "learning_rate": 5.262936403102511e-06, + "loss": 0.1239, + "step": 45451 + }, + { + "epoch": 0.8106874041308458, + "grad_norm": 0.20785358548164368, + "learning_rate": 5.261981099003638e-06, + "loss": 0.0885, + "step": 45452 + }, + { + "epoch": 0.8107052402525595, + "grad_norm": 0.26577529311180115, + "learning_rate": 5.261025871416137e-06, + "loss": 0.1332, + "step": 45453 + }, + { + "epoch": 0.8107230763742732, + "grad_norm": 0.40488943457603455, + "learning_rate": 5.260070720343724e-06, + "loss": 0.1805, + "step": 45454 + }, + { + "epoch": 0.8107409124959869, + "grad_norm": 0.2246762365102768, + "learning_rate": 5.259115645790086e-06, + "loss": 0.0791, + "step": 45455 + }, + { + "epoch": 0.8107587486177006, + "grad_norm": 0.3010610044002533, + "learning_rate": 5.2581606477589415e-06, + "loss": 0.0714, + "step": 45456 + }, + { + "epoch": 0.8107765847394143, + "grad_norm": 0.2886185050010681, + "learning_rate": 5.257205726253989e-06, + "loss": 0.0949, + "step": 45457 + }, + { + "epoch": 0.810794420861128, + "grad_norm": 0.2818879783153534, + "learning_rate": 5.25625088127892e-06, + "loss": 0.1343, + "step": 45458 + }, + { + "epoch": 0.8108122569828417, + "grad_norm": 0.3077107071876526, + "learning_rate": 5.255296112837446e-06, + "loss": 0.154, + "step": 45459 + }, + { + "epoch": 0.8108300931045553, + "grad_norm": 0.3322410583496094, + "learning_rate": 5.254341420933256e-06, + "loss": 0.2221, + "step": 45460 + }, + { + "epoch": 0.810847929226269, + "grad_norm": 0.3636000454425812, + "learning_rate": 5.253386805570065e-06, + "loss": 0.0897, + "step": 45461 + }, + { + "epoch": 0.8108657653479827, + "grad_norm": 0.288411408662796, + "learning_rate": 5.25243226675157e-06, + "loss": 0.1188, + "step": 45462 + }, + { + "epoch": 0.8108836014696964, + "grad_norm": 0.2190844714641571, + "learning_rate": 5.251477804481464e-06, + "loss": 0.097, + "step": 45463 + }, + { + "epoch": 0.8109014375914101, + "grad_norm": 0.2701435983181, + "learning_rate": 5.250523418763445e-06, + "loss": 0.1339, + "step": 45464 + }, + { + "epoch": 0.8109192737131238, + "grad_norm": 0.24628344178199768, + "learning_rate": 5.249569109601227e-06, + "loss": 0.1358, + "step": 45465 + }, + { + "epoch": 0.8109371098348375, + "grad_norm": 0.22886212170124054, + "learning_rate": 5.2486148769985004e-06, + "loss": 0.1247, + "step": 45466 + }, + { + "epoch": 0.8109549459565512, + "grad_norm": 0.3496599793434143, + "learning_rate": 5.247660720958955e-06, + "loss": 0.1343, + "step": 45467 + }, + { + "epoch": 0.810972782078265, + "grad_norm": 0.27672773599624634, + "learning_rate": 5.246706641486301e-06, + "loss": 0.0802, + "step": 45468 + }, + { + "epoch": 0.8109906181999786, + "grad_norm": 0.21181270480155945, + "learning_rate": 5.245752638584242e-06, + "loss": 0.064, + "step": 45469 + }, + { + "epoch": 0.8110084543216923, + "grad_norm": 0.2689070701599121, + "learning_rate": 5.244798712256469e-06, + "loss": 0.1083, + "step": 45470 + }, + { + "epoch": 0.811026290443406, + "grad_norm": 0.27168264985084534, + "learning_rate": 5.243844862506677e-06, + "loss": 0.111, + "step": 45471 + }, + { + "epoch": 0.8110441265651197, + "grad_norm": 0.2746426463127136, + "learning_rate": 5.242891089338567e-06, + "loss": 0.1578, + "step": 45472 + }, + { + "epoch": 0.8110619626868334, + "grad_norm": 0.29743823409080505, + "learning_rate": 5.24193739275583e-06, + "loss": 0.1282, + "step": 45473 + }, + { + "epoch": 0.8110797988085471, + "grad_norm": 0.3167383372783661, + "learning_rate": 5.240983772762173e-06, + "loss": 0.1252, + "step": 45474 + }, + { + "epoch": 0.8110976349302608, + "grad_norm": 0.2434619814157486, + "learning_rate": 5.240030229361287e-06, + "loss": 0.101, + "step": 45475 + }, + { + "epoch": 0.8111154710519745, + "grad_norm": 0.41078394651412964, + "learning_rate": 5.239076762556869e-06, + "loss": 0.1384, + "step": 45476 + }, + { + "epoch": 0.8111333071736881, + "grad_norm": 0.28844285011291504, + "learning_rate": 5.2381233723526075e-06, + "loss": 0.1179, + "step": 45477 + }, + { + "epoch": 0.8111511432954018, + "grad_norm": 0.31216683983802795, + "learning_rate": 5.2371700587522136e-06, + "loss": 0.1591, + "step": 45478 + }, + { + "epoch": 0.8111689794171155, + "grad_norm": 0.3527508080005646, + "learning_rate": 5.236216821759373e-06, + "loss": 0.1439, + "step": 45479 + }, + { + "epoch": 0.8111868155388292, + "grad_norm": 0.24327363073825836, + "learning_rate": 5.235263661377776e-06, + "loss": 0.1285, + "step": 45480 + }, + { + "epoch": 0.8112046516605429, + "grad_norm": 0.2778516113758087, + "learning_rate": 5.2343105776111315e-06, + "loss": 0.1402, + "step": 45481 + }, + { + "epoch": 0.8112224877822566, + "grad_norm": 0.2080429047346115, + "learning_rate": 5.233357570463118e-06, + "loss": 0.1142, + "step": 45482 + }, + { + "epoch": 0.8112403239039703, + "grad_norm": 0.34583908319473267, + "learning_rate": 5.232404639937444e-06, + "loss": 0.1003, + "step": 45483 + }, + { + "epoch": 0.811258160025684, + "grad_norm": 0.30335095524787903, + "learning_rate": 5.231451786037797e-06, + "loss": 0.1015, + "step": 45484 + }, + { + "epoch": 0.8112759961473978, + "grad_norm": 0.3012371361255646, + "learning_rate": 5.2304990087678726e-06, + "loss": 0.0951, + "step": 45485 + }, + { + "epoch": 0.8112938322691114, + "grad_norm": 0.22829729318618774, + "learning_rate": 5.229546308131353e-06, + "loss": 0.0971, + "step": 45486 + }, + { + "epoch": 0.8113116683908251, + "grad_norm": 0.2483467012643814, + "learning_rate": 5.22859368413195e-06, + "loss": 0.1393, + "step": 45487 + }, + { + "epoch": 0.8113295045125388, + "grad_norm": 0.23309765756130219, + "learning_rate": 5.227641136773345e-06, + "loss": 0.1102, + "step": 45488 + }, + { + "epoch": 0.8113473406342525, + "grad_norm": 0.409842848777771, + "learning_rate": 5.226688666059232e-06, + "loss": 0.1037, + "step": 45489 + }, + { + "epoch": 0.8113651767559662, + "grad_norm": 0.295044481754303, + "learning_rate": 5.225736271993295e-06, + "loss": 0.0911, + "step": 45490 + }, + { + "epoch": 0.8113830128776799, + "grad_norm": 0.30489203333854675, + "learning_rate": 5.224783954579243e-06, + "loss": 0.14, + "step": 45491 + }, + { + "epoch": 0.8114008489993936, + "grad_norm": 0.22889581322669983, + "learning_rate": 5.22383171382076e-06, + "loss": 0.1037, + "step": 45492 + }, + { + "epoch": 0.8114186851211073, + "grad_norm": 0.3222343921661377, + "learning_rate": 5.222879549721532e-06, + "loss": 0.0725, + "step": 45493 + }, + { + "epoch": 0.811436521242821, + "grad_norm": 0.3243221342563629, + "learning_rate": 5.22192746228525e-06, + "loss": 0.158, + "step": 45494 + }, + { + "epoch": 0.8114543573645346, + "grad_norm": 0.2591312825679779, + "learning_rate": 5.220975451515614e-06, + "loss": 0.1031, + "step": 45495 + }, + { + "epoch": 0.8114721934862483, + "grad_norm": 0.42368006706237793, + "learning_rate": 5.2200235174163e-06, + "loss": 0.1648, + "step": 45496 + }, + { + "epoch": 0.811490029607962, + "grad_norm": 0.20873235166072845, + "learning_rate": 5.219071659991015e-06, + "loss": 0.0738, + "step": 45497 + }, + { + "epoch": 0.8115078657296757, + "grad_norm": 0.2601599097251892, + "learning_rate": 5.218119879243441e-06, + "loss": 0.086, + "step": 45498 + }, + { + "epoch": 0.8115257018513894, + "grad_norm": 0.270443320274353, + "learning_rate": 5.217168175177259e-06, + "loss": 0.0952, + "step": 45499 + }, + { + "epoch": 0.8115435379731031, + "grad_norm": 0.24155418574810028, + "learning_rate": 5.216216547796174e-06, + "loss": 0.1, + "step": 45500 + }, + { + "epoch": 0.8115613740948168, + "grad_norm": 0.3387152850627899, + "learning_rate": 5.215264997103866e-06, + "loss": 0.1008, + "step": 45501 + }, + { + "epoch": 0.8115792102165306, + "grad_norm": 0.24622072279453278, + "learning_rate": 5.214313523104023e-06, + "loss": 0.0912, + "step": 45502 + }, + { + "epoch": 0.8115970463382443, + "grad_norm": 0.20400360226631165, + "learning_rate": 5.213362125800328e-06, + "loss": 0.1189, + "step": 45503 + }, + { + "epoch": 0.8116148824599579, + "grad_norm": 0.35512739419937134, + "learning_rate": 5.212410805196486e-06, + "loss": 0.1417, + "step": 45504 + }, + { + "epoch": 0.8116327185816716, + "grad_norm": 0.2782239317893982, + "learning_rate": 5.2114595612961695e-06, + "loss": 0.0987, + "step": 45505 + }, + { + "epoch": 0.8116505547033853, + "grad_norm": 0.35960808396339417, + "learning_rate": 5.210508394103072e-06, + "loss": 0.0482, + "step": 45506 + }, + { + "epoch": 0.811668390825099, + "grad_norm": 0.24351949989795685, + "learning_rate": 5.209557303620874e-06, + "loss": 0.1027, + "step": 45507 + }, + { + "epoch": 0.8116862269468127, + "grad_norm": 0.20865245163440704, + "learning_rate": 5.208606289853271e-06, + "loss": 0.0702, + "step": 45508 + }, + { + "epoch": 0.8117040630685264, + "grad_norm": 0.2898094356060028, + "learning_rate": 5.207655352803942e-06, + "loss": 0.1421, + "step": 45509 + }, + { + "epoch": 0.8117218991902401, + "grad_norm": 0.2679760158061981, + "learning_rate": 5.20670449247658e-06, + "loss": 0.1076, + "step": 45510 + }, + { + "epoch": 0.8117397353119538, + "grad_norm": 0.21027739346027374, + "learning_rate": 5.205753708874872e-06, + "loss": 0.111, + "step": 45511 + }, + { + "epoch": 0.8117575714336674, + "grad_norm": 0.3852689266204834, + "learning_rate": 5.20480300200249e-06, + "loss": 0.1523, + "step": 45512 + }, + { + "epoch": 0.8117754075553811, + "grad_norm": 0.3098362982273102, + "learning_rate": 5.2038523718631354e-06, + "loss": 0.0951, + "step": 45513 + }, + { + "epoch": 0.8117932436770948, + "grad_norm": 0.26572349667549133, + "learning_rate": 5.202901818460487e-06, + "loss": 0.1287, + "step": 45514 + }, + { + "epoch": 0.8118110797988085, + "grad_norm": 0.3147519528865814, + "learning_rate": 5.201951341798228e-06, + "loss": 0.1277, + "step": 45515 + }, + { + "epoch": 0.8118289159205222, + "grad_norm": 0.18713118135929108, + "learning_rate": 5.201000941880038e-06, + "loss": 0.0824, + "step": 45516 + }, + { + "epoch": 0.8118467520422359, + "grad_norm": 0.31820791959762573, + "learning_rate": 5.200050618709615e-06, + "loss": 0.1576, + "step": 45517 + }, + { + "epoch": 0.8118645881639497, + "grad_norm": 0.2600247859954834, + "learning_rate": 5.19910037229063e-06, + "loss": 0.0805, + "step": 45518 + }, + { + "epoch": 0.8118824242856634, + "grad_norm": 0.3205106854438782, + "learning_rate": 5.198150202626776e-06, + "loss": 0.1539, + "step": 45519 + }, + { + "epoch": 0.8119002604073771, + "grad_norm": 0.28935134410858154, + "learning_rate": 5.19720010972172e-06, + "loss": 0.1462, + "step": 45520 + }, + { + "epoch": 0.8119180965290907, + "grad_norm": 0.2988860607147217, + "learning_rate": 5.196250093579166e-06, + "loss": 0.1603, + "step": 45521 + }, + { + "epoch": 0.8119359326508044, + "grad_norm": 0.2744669020175934, + "learning_rate": 5.195300154202784e-06, + "loss": 0.0897, + "step": 45522 + }, + { + "epoch": 0.8119537687725181, + "grad_norm": 0.33278608322143555, + "learning_rate": 5.1943502915962536e-06, + "loss": 0.1208, + "step": 45523 + }, + { + "epoch": 0.8119716048942318, + "grad_norm": 0.3321242332458496, + "learning_rate": 5.193400505763269e-06, + "loss": 0.1725, + "step": 45524 + }, + { + "epoch": 0.8119894410159455, + "grad_norm": 0.2584283649921417, + "learning_rate": 5.192450796707498e-06, + "loss": 0.1263, + "step": 45525 + }, + { + "epoch": 0.8120072771376592, + "grad_norm": 0.3969503343105316, + "learning_rate": 5.191501164432635e-06, + "loss": 0.1509, + "step": 45526 + }, + { + "epoch": 0.8120251132593729, + "grad_norm": 0.3709224760532379, + "learning_rate": 5.190551608942357e-06, + "loss": 0.1851, + "step": 45527 + }, + { + "epoch": 0.8120429493810866, + "grad_norm": 0.20300635695457458, + "learning_rate": 5.189602130240342e-06, + "loss": 0.1082, + "step": 45528 + }, + { + "epoch": 0.8120607855028003, + "grad_norm": 0.2492382526397705, + "learning_rate": 5.188652728330265e-06, + "loss": 0.0996, + "step": 45529 + }, + { + "epoch": 0.8120786216245139, + "grad_norm": 0.2658096253871918, + "learning_rate": 5.187703403215818e-06, + "loss": 0.0918, + "step": 45530 + }, + { + "epoch": 0.8120964577462276, + "grad_norm": 0.28822076320648193, + "learning_rate": 5.186754154900678e-06, + "loss": 0.1254, + "step": 45531 + }, + { + "epoch": 0.8121142938679413, + "grad_norm": 0.28774335980415344, + "learning_rate": 5.18580498338852e-06, + "loss": 0.1015, + "step": 45532 + }, + { + "epoch": 0.812132129989655, + "grad_norm": 0.1896011084318161, + "learning_rate": 5.184855888683019e-06, + "loss": 0.0675, + "step": 45533 + }, + { + "epoch": 0.8121499661113687, + "grad_norm": 0.27161189913749695, + "learning_rate": 5.183906870787869e-06, + "loss": 0.1315, + "step": 45534 + }, + { + "epoch": 0.8121678022330825, + "grad_norm": 0.3012565076351166, + "learning_rate": 5.182957929706738e-06, + "loss": 0.1459, + "step": 45535 + }, + { + "epoch": 0.8121856383547962, + "grad_norm": 0.2427787482738495, + "learning_rate": 5.182009065443302e-06, + "loss": 0.1037, + "step": 45536 + }, + { + "epoch": 0.8122034744765099, + "grad_norm": 0.2582356035709381, + "learning_rate": 5.181060278001249e-06, + "loss": 0.0865, + "step": 45537 + }, + { + "epoch": 0.8122213105982236, + "grad_norm": 0.23960572481155396, + "learning_rate": 5.180111567384244e-06, + "loss": 0.0821, + "step": 45538 + }, + { + "epoch": 0.8122391467199372, + "grad_norm": 0.20808137953281403, + "learning_rate": 5.17916293359598e-06, + "loss": 0.0954, + "step": 45539 + }, + { + "epoch": 0.8122569828416509, + "grad_norm": 0.21785911917686462, + "learning_rate": 5.1782143766401285e-06, + "loss": 0.0678, + "step": 45540 + }, + { + "epoch": 0.8122748189633646, + "grad_norm": 0.2476358413696289, + "learning_rate": 5.1772658965203615e-06, + "loss": 0.1017, + "step": 45541 + }, + { + "epoch": 0.8122926550850783, + "grad_norm": 0.27477723360061646, + "learning_rate": 5.176317493240351e-06, + "loss": 0.1249, + "step": 45542 + }, + { + "epoch": 0.812310491206792, + "grad_norm": 0.2926079332828522, + "learning_rate": 5.17536916680379e-06, + "loss": 0.1193, + "step": 45543 + }, + { + "epoch": 0.8123283273285057, + "grad_norm": 0.2808745205402374, + "learning_rate": 5.174420917214345e-06, + "loss": 0.1096, + "step": 45544 + }, + { + "epoch": 0.8123461634502194, + "grad_norm": 0.32176074385643005, + "learning_rate": 5.173472744475691e-06, + "loss": 0.0952, + "step": 45545 + }, + { + "epoch": 0.8123639995719331, + "grad_norm": 0.2728419303894043, + "learning_rate": 5.172524648591498e-06, + "loss": 0.1039, + "step": 45546 + }, + { + "epoch": 0.8123818356936467, + "grad_norm": 0.2153676450252533, + "learning_rate": 5.171576629565456e-06, + "loss": 0.1234, + "step": 45547 + }, + { + "epoch": 0.8123996718153604, + "grad_norm": 0.25571686029434204, + "learning_rate": 5.170628687401227e-06, + "loss": 0.1074, + "step": 45548 + }, + { + "epoch": 0.8124175079370741, + "grad_norm": 0.5360467433929443, + "learning_rate": 5.1696808221024934e-06, + "loss": 0.1582, + "step": 45549 + }, + { + "epoch": 0.8124353440587878, + "grad_norm": 0.24733681976795197, + "learning_rate": 5.16873303367292e-06, + "loss": 0.081, + "step": 45550 + }, + { + "epoch": 0.8124531801805015, + "grad_norm": 0.24323980510234833, + "learning_rate": 5.1677853221161865e-06, + "loss": 0.1176, + "step": 45551 + }, + { + "epoch": 0.8124710163022153, + "grad_norm": 0.22319646179676056, + "learning_rate": 5.1668376874359715e-06, + "loss": 0.1065, + "step": 45552 + }, + { + "epoch": 0.812488852423929, + "grad_norm": 0.2035447210073471, + "learning_rate": 5.165890129635947e-06, + "loss": 0.0655, + "step": 45553 + }, + { + "epoch": 0.8125066885456427, + "grad_norm": 0.22651202976703644, + "learning_rate": 5.164942648719781e-06, + "loss": 0.1002, + "step": 45554 + }, + { + "epoch": 0.8125245246673564, + "grad_norm": 0.33447906374931335, + "learning_rate": 5.163995244691142e-06, + "loss": 0.0735, + "step": 45555 + }, + { + "epoch": 0.81254236078907, + "grad_norm": 0.300897479057312, + "learning_rate": 5.163047917553715e-06, + "loss": 0.1373, + "step": 45556 + }, + { + "epoch": 0.8125601969107837, + "grad_norm": 0.2702885568141937, + "learning_rate": 5.162100667311165e-06, + "loss": 0.1619, + "step": 45557 + }, + { + "epoch": 0.8125780330324974, + "grad_norm": 0.5546003580093384, + "learning_rate": 5.161153493967164e-06, + "loss": 0.1464, + "step": 45558 + }, + { + "epoch": 0.8125958691542111, + "grad_norm": 0.25141334533691406, + "learning_rate": 5.16020639752538e-06, + "loss": 0.121, + "step": 45559 + }, + { + "epoch": 0.8126137052759248, + "grad_norm": 0.4828642010688782, + "learning_rate": 5.159259377989492e-06, + "loss": 0.1217, + "step": 45560 + }, + { + "epoch": 0.8126315413976385, + "grad_norm": 0.3760753870010376, + "learning_rate": 5.158312435363169e-06, + "loss": 0.1516, + "step": 45561 + }, + { + "epoch": 0.8126493775193522, + "grad_norm": 0.5467627048492432, + "learning_rate": 5.15736556965008e-06, + "loss": 0.1251, + "step": 45562 + }, + { + "epoch": 0.8126672136410659, + "grad_norm": 0.32679885625839233, + "learning_rate": 5.156418780853892e-06, + "loss": 0.1064, + "step": 45563 + }, + { + "epoch": 0.8126850497627796, + "grad_norm": 0.24771232903003693, + "learning_rate": 5.1554720689782815e-06, + "loss": 0.1102, + "step": 45564 + }, + { + "epoch": 0.8127028858844932, + "grad_norm": 0.2394687682390213, + "learning_rate": 5.154525434026908e-06, + "loss": 0.1624, + "step": 45565 + }, + { + "epoch": 0.8127207220062069, + "grad_norm": 0.261261522769928, + "learning_rate": 5.153578876003457e-06, + "loss": 0.1228, + "step": 45566 + }, + { + "epoch": 0.8127385581279206, + "grad_norm": 0.25472745299339294, + "learning_rate": 5.152632394911589e-06, + "loss": 0.1455, + "step": 45567 + }, + { + "epoch": 0.8127563942496343, + "grad_norm": 0.24779358506202698, + "learning_rate": 5.151685990754965e-06, + "loss": 0.1109, + "step": 45568 + }, + { + "epoch": 0.8127742303713481, + "grad_norm": 0.21714620292186737, + "learning_rate": 5.150739663537269e-06, + "loss": 0.1037, + "step": 45569 + }, + { + "epoch": 0.8127920664930618, + "grad_norm": 0.2893657684326172, + "learning_rate": 5.149793413262163e-06, + "loss": 0.1416, + "step": 45570 + }, + { + "epoch": 0.8128099026147755, + "grad_norm": 0.22147493064403534, + "learning_rate": 5.1488472399333105e-06, + "loss": 0.0647, + "step": 45571 + }, + { + "epoch": 0.8128277387364892, + "grad_norm": 0.27817443013191223, + "learning_rate": 5.147901143554379e-06, + "loss": 0.0669, + "step": 45572 + }, + { + "epoch": 0.8128455748582029, + "grad_norm": 0.24891194701194763, + "learning_rate": 5.146955124129044e-06, + "loss": 0.1303, + "step": 45573 + }, + { + "epoch": 0.8128634109799165, + "grad_norm": 0.28080177307128906, + "learning_rate": 5.146009181660968e-06, + "loss": 0.1156, + "step": 45574 + }, + { + "epoch": 0.8128812471016302, + "grad_norm": 0.2582859396934509, + "learning_rate": 5.14506331615382e-06, + "loss": 0.135, + "step": 45575 + }, + { + "epoch": 0.8128990832233439, + "grad_norm": 0.7043549418449402, + "learning_rate": 5.1441175276112615e-06, + "loss": 0.1255, + "step": 45576 + }, + { + "epoch": 0.8129169193450576, + "grad_norm": 0.29657599329948425, + "learning_rate": 5.1431718160369565e-06, + "loss": 0.1221, + "step": 45577 + }, + { + "epoch": 0.8129347554667713, + "grad_norm": 0.3936665654182434, + "learning_rate": 5.142226181434576e-06, + "loss": 0.1425, + "step": 45578 + }, + { + "epoch": 0.812952591588485, + "grad_norm": 0.3372699022293091, + "learning_rate": 5.141280623807792e-06, + "loss": 0.1076, + "step": 45579 + }, + { + "epoch": 0.8129704277101987, + "grad_norm": 0.27011269330978394, + "learning_rate": 5.140335143160263e-06, + "loss": 0.0886, + "step": 45580 + }, + { + "epoch": 0.8129882638319124, + "grad_norm": 0.26672306656837463, + "learning_rate": 5.139389739495645e-06, + "loss": 0.1002, + "step": 45581 + }, + { + "epoch": 0.813006099953626, + "grad_norm": 0.21756118535995483, + "learning_rate": 5.138444412817623e-06, + "loss": 0.1249, + "step": 45582 + }, + { + "epoch": 0.8130239360753397, + "grad_norm": 0.33472296595573425, + "learning_rate": 5.137499163129849e-06, + "loss": 0.1548, + "step": 45583 + }, + { + "epoch": 0.8130417721970534, + "grad_norm": 0.28582215309143066, + "learning_rate": 5.1365539904359884e-06, + "loss": 0.0851, + "step": 45584 + }, + { + "epoch": 0.8130596083187671, + "grad_norm": 0.35146069526672363, + "learning_rate": 5.135608894739696e-06, + "loss": 0.1057, + "step": 45585 + }, + { + "epoch": 0.8130774444404809, + "grad_norm": 0.25881344079971313, + "learning_rate": 5.134663876044654e-06, + "loss": 0.1455, + "step": 45586 + }, + { + "epoch": 0.8130952805621946, + "grad_norm": 0.25682884454727173, + "learning_rate": 5.1337189343545146e-06, + "loss": 0.1213, + "step": 45587 + }, + { + "epoch": 0.8131131166839083, + "grad_norm": 0.30452170968055725, + "learning_rate": 5.132774069672944e-06, + "loss": 0.1082, + "step": 45588 + }, + { + "epoch": 0.813130952805622, + "grad_norm": 0.26689738035202026, + "learning_rate": 5.131829282003603e-06, + "loss": 0.0904, + "step": 45589 + }, + { + "epoch": 0.8131487889273357, + "grad_norm": 0.32830995321273804, + "learning_rate": 5.130884571350144e-06, + "loss": 0.1182, + "step": 45590 + }, + { + "epoch": 0.8131666250490494, + "grad_norm": 0.2156449407339096, + "learning_rate": 5.1299399377162495e-06, + "loss": 0.1106, + "step": 45591 + }, + { + "epoch": 0.813184461170763, + "grad_norm": 0.3808838129043579, + "learning_rate": 5.128995381105561e-06, + "loss": 0.1609, + "step": 45592 + }, + { + "epoch": 0.8132022972924767, + "grad_norm": 0.2852393090724945, + "learning_rate": 5.1280509015217585e-06, + "loss": 0.1308, + "step": 45593 + }, + { + "epoch": 0.8132201334141904, + "grad_norm": 0.25662487745285034, + "learning_rate": 5.1271064989684865e-06, + "loss": 0.109, + "step": 45594 + }, + { + "epoch": 0.8132379695359041, + "grad_norm": 0.2496776431798935, + "learning_rate": 5.126162173449422e-06, + "loss": 0.0821, + "step": 45595 + }, + { + "epoch": 0.8132558056576178, + "grad_norm": 0.2867102324962616, + "learning_rate": 5.1252179249682144e-06, + "loss": 0.1423, + "step": 45596 + }, + { + "epoch": 0.8132736417793315, + "grad_norm": 0.29804888367652893, + "learning_rate": 5.12427375352853e-06, + "loss": 0.1729, + "step": 45597 + }, + { + "epoch": 0.8132914779010452, + "grad_norm": 0.3755727708339691, + "learning_rate": 5.123329659134016e-06, + "loss": 0.1786, + "step": 45598 + }, + { + "epoch": 0.8133093140227589, + "grad_norm": 0.2618981897830963, + "learning_rate": 5.122385641788349e-06, + "loss": 0.1396, + "step": 45599 + }, + { + "epoch": 0.8133271501444725, + "grad_norm": 0.19386625289916992, + "learning_rate": 5.121441701495181e-06, + "loss": 0.0789, + "step": 45600 + }, + { + "epoch": 0.8133449862661862, + "grad_norm": 0.3044191002845764, + "learning_rate": 5.12049783825817e-06, + "loss": 0.077, + "step": 45601 + }, + { + "epoch": 0.8133628223878999, + "grad_norm": 0.2679678201675415, + "learning_rate": 5.1195540520809745e-06, + "loss": 0.097, + "step": 45602 + }, + { + "epoch": 0.8133806585096137, + "grad_norm": 0.24042688310146332, + "learning_rate": 5.118610342967248e-06, + "loss": 0.0959, + "step": 45603 + }, + { + "epoch": 0.8133984946313274, + "grad_norm": 0.25474920868873596, + "learning_rate": 5.117666710920663e-06, + "loss": 0.1522, + "step": 45604 + }, + { + "epoch": 0.8134163307530411, + "grad_norm": 0.3378788232803345, + "learning_rate": 5.116723155944861e-06, + "loss": 0.1436, + "step": 45605 + }, + { + "epoch": 0.8134341668747548, + "grad_norm": 0.3056812882423401, + "learning_rate": 5.115779678043514e-06, + "loss": 0.1352, + "step": 45606 + }, + { + "epoch": 0.8134520029964685, + "grad_norm": 0.2749176621437073, + "learning_rate": 5.114836277220267e-06, + "loss": 0.1294, + "step": 45607 + }, + { + "epoch": 0.8134698391181822, + "grad_norm": 0.27322229743003845, + "learning_rate": 5.113892953478788e-06, + "loss": 0.0488, + "step": 45608 + }, + { + "epoch": 0.8134876752398958, + "grad_norm": 0.2960701584815979, + "learning_rate": 5.112949706822731e-06, + "loss": 0.1122, + "step": 45609 + }, + { + "epoch": 0.8135055113616095, + "grad_norm": 0.37147918343544006, + "learning_rate": 5.112006537255748e-06, + "loss": 0.1299, + "step": 45610 + }, + { + "epoch": 0.8135233474833232, + "grad_norm": 0.2725314795970917, + "learning_rate": 5.111063444781489e-06, + "loss": 0.1314, + "step": 45611 + }, + { + "epoch": 0.8135411836050369, + "grad_norm": 0.2502182424068451, + "learning_rate": 5.1101204294036255e-06, + "loss": 0.1237, + "step": 45612 + }, + { + "epoch": 0.8135590197267506, + "grad_norm": 0.29174044728279114, + "learning_rate": 5.109177491125805e-06, + "loss": 0.111, + "step": 45613 + }, + { + "epoch": 0.8135768558484643, + "grad_norm": 0.25557634234428406, + "learning_rate": 5.108234629951683e-06, + "loss": 0.1001, + "step": 45614 + }, + { + "epoch": 0.813594691970178, + "grad_norm": 0.29499757289886475, + "learning_rate": 5.1072918458849124e-06, + "loss": 0.1387, + "step": 45615 + }, + { + "epoch": 0.8136125280918917, + "grad_norm": 0.2681562304496765, + "learning_rate": 5.1063491389291425e-06, + "loss": 0.1106, + "step": 45616 + }, + { + "epoch": 0.8136303642136054, + "grad_norm": 0.21198689937591553, + "learning_rate": 5.105406509088042e-06, + "loss": 0.095, + "step": 45617 + }, + { + "epoch": 0.813648200335319, + "grad_norm": 0.2974684238433838, + "learning_rate": 5.104463956365258e-06, + "loss": 0.1522, + "step": 45618 + }, + { + "epoch": 0.8136660364570328, + "grad_norm": 0.26667389273643494, + "learning_rate": 5.1035214807644366e-06, + "loss": 0.1414, + "step": 45619 + }, + { + "epoch": 0.8136838725787465, + "grad_norm": 0.34514445066452026, + "learning_rate": 5.102579082289244e-06, + "loss": 0.1932, + "step": 45620 + }, + { + "epoch": 0.8137017087004602, + "grad_norm": 0.2809849679470062, + "learning_rate": 5.101636760943321e-06, + "loss": 0.1031, + "step": 45621 + }, + { + "epoch": 0.8137195448221739, + "grad_norm": 0.2639085650444031, + "learning_rate": 5.1006945167303314e-06, + "loss": 0.0896, + "step": 45622 + }, + { + "epoch": 0.8137373809438876, + "grad_norm": 0.33750295639038086, + "learning_rate": 5.099752349653924e-06, + "loss": 0.1268, + "step": 45623 + }, + { + "epoch": 0.8137552170656013, + "grad_norm": 0.28890225291252136, + "learning_rate": 5.0988102597177455e-06, + "loss": 0.0819, + "step": 45624 + }, + { + "epoch": 0.813773053187315, + "grad_norm": 0.2128099948167801, + "learning_rate": 5.097868246925455e-06, + "loss": 0.1091, + "step": 45625 + }, + { + "epoch": 0.8137908893090287, + "grad_norm": 0.32496771216392517, + "learning_rate": 5.096926311280703e-06, + "loss": 0.0741, + "step": 45626 + }, + { + "epoch": 0.8138087254307423, + "grad_norm": 0.23552510142326355, + "learning_rate": 5.095984452787139e-06, + "loss": 0.0979, + "step": 45627 + }, + { + "epoch": 0.813826561552456, + "grad_norm": 0.29348504543304443, + "learning_rate": 5.095042671448413e-06, + "loss": 0.1294, + "step": 45628 + }, + { + "epoch": 0.8138443976741697, + "grad_norm": 0.2769838869571686, + "learning_rate": 5.094100967268173e-06, + "loss": 0.1222, + "step": 45629 + }, + { + "epoch": 0.8138622337958834, + "grad_norm": 0.25167563557624817, + "learning_rate": 5.0931593402500756e-06, + "loss": 0.1268, + "step": 45630 + }, + { + "epoch": 0.8138800699175971, + "grad_norm": 0.23509718477725983, + "learning_rate": 5.092217790397771e-06, + "loss": 0.1037, + "step": 45631 + }, + { + "epoch": 0.8138979060393108, + "grad_norm": 0.24677452445030212, + "learning_rate": 5.091276317714896e-06, + "loss": 0.0859, + "step": 45632 + }, + { + "epoch": 0.8139157421610245, + "grad_norm": 0.24758413434028625, + "learning_rate": 5.09033492220512e-06, + "loss": 0.103, + "step": 45633 + }, + { + "epoch": 0.8139335782827382, + "grad_norm": 0.35802656412124634, + "learning_rate": 5.089393603872075e-06, + "loss": 0.113, + "step": 45634 + }, + { + "epoch": 0.8139514144044518, + "grad_norm": 0.2309873104095459, + "learning_rate": 5.088452362719426e-06, + "loss": 0.1004, + "step": 45635 + }, + { + "epoch": 0.8139692505261656, + "grad_norm": 0.21573497354984283, + "learning_rate": 5.087511198750811e-06, + "loss": 0.0821, + "step": 45636 + }, + { + "epoch": 0.8139870866478793, + "grad_norm": 0.3483726382255554, + "learning_rate": 5.086570111969871e-06, + "loss": 0.2048, + "step": 45637 + }, + { + "epoch": 0.814004922769593, + "grad_norm": 0.2416190803050995, + "learning_rate": 5.085629102380274e-06, + "loss": 0.094, + "step": 45638 + }, + { + "epoch": 0.8140227588913067, + "grad_norm": 0.216502845287323, + "learning_rate": 5.0846881699856545e-06, + "loss": 0.0955, + "step": 45639 + }, + { + "epoch": 0.8140405950130204, + "grad_norm": 0.17814892530441284, + "learning_rate": 5.0837473147896605e-06, + "loss": 0.0998, + "step": 45640 + }, + { + "epoch": 0.8140584311347341, + "grad_norm": 0.2848738431930542, + "learning_rate": 5.082806536795945e-06, + "loss": 0.1058, + "step": 45641 + }, + { + "epoch": 0.8140762672564478, + "grad_norm": 0.26333603262901306, + "learning_rate": 5.08186583600814e-06, + "loss": 0.0717, + "step": 45642 + }, + { + "epoch": 0.8140941033781615, + "grad_norm": 0.1773180216550827, + "learning_rate": 5.080925212429913e-06, + "loss": 0.1241, + "step": 45643 + }, + { + "epoch": 0.8141119394998751, + "grad_norm": 0.23399409651756287, + "learning_rate": 5.079984666064897e-06, + "loss": 0.129, + "step": 45644 + }, + { + "epoch": 0.8141297756215888, + "grad_norm": 0.32898133993148804, + "learning_rate": 5.079044196916741e-06, + "loss": 0.1564, + "step": 45645 + }, + { + "epoch": 0.8141476117433025, + "grad_norm": 0.25006625056266785, + "learning_rate": 5.078103804989082e-06, + "loss": 0.1198, + "step": 45646 + }, + { + "epoch": 0.8141654478650162, + "grad_norm": 0.2780681550502777, + "learning_rate": 5.077163490285583e-06, + "loss": 0.1327, + "step": 45647 + }, + { + "epoch": 0.8141832839867299, + "grad_norm": 0.2767432928085327, + "learning_rate": 5.076223252809873e-06, + "loss": 0.1238, + "step": 45648 + }, + { + "epoch": 0.8142011201084436, + "grad_norm": 0.2799425721168518, + "learning_rate": 5.075283092565605e-06, + "loss": 0.1291, + "step": 45649 + }, + { + "epoch": 0.8142189562301573, + "grad_norm": 0.2554751932621002, + "learning_rate": 5.0743430095564205e-06, + "loss": 0.132, + "step": 45650 + }, + { + "epoch": 0.814236792351871, + "grad_norm": 0.28389936685562134, + "learning_rate": 5.073403003785967e-06, + "loss": 0.1562, + "step": 45651 + }, + { + "epoch": 0.8142546284735847, + "grad_norm": 0.427094042301178, + "learning_rate": 5.072463075257889e-06, + "loss": 0.0977, + "step": 45652 + }, + { + "epoch": 0.8142724645952985, + "grad_norm": 0.3419346213340759, + "learning_rate": 5.071523223975824e-06, + "loss": 0.118, + "step": 45653 + }, + { + "epoch": 0.8142903007170121, + "grad_norm": 0.24643169343471527, + "learning_rate": 5.0705834499434225e-06, + "loss": 0.0629, + "step": 45654 + }, + { + "epoch": 0.8143081368387258, + "grad_norm": 0.2812938392162323, + "learning_rate": 5.069643753164313e-06, + "loss": 0.1073, + "step": 45655 + }, + { + "epoch": 0.8143259729604395, + "grad_norm": 0.2212173491716385, + "learning_rate": 5.068704133642155e-06, + "loss": 0.118, + "step": 45656 + }, + { + "epoch": 0.8143438090821532, + "grad_norm": 0.2015131711959839, + "learning_rate": 5.067764591380583e-06, + "loss": 0.0895, + "step": 45657 + }, + { + "epoch": 0.8143616452038669, + "grad_norm": 0.23904988169670105, + "learning_rate": 5.066825126383243e-06, + "loss": 0.1093, + "step": 45658 + }, + { + "epoch": 0.8143794813255806, + "grad_norm": 0.41175538301467896, + "learning_rate": 5.0658857386537635e-06, + "loss": 0.0687, + "step": 45659 + }, + { + "epoch": 0.8143973174472943, + "grad_norm": 0.39152294397354126, + "learning_rate": 5.064946428195805e-06, + "loss": 0.1213, + "step": 45660 + }, + { + "epoch": 0.814415153569008, + "grad_norm": 0.26551494002342224, + "learning_rate": 5.06400719501299e-06, + "loss": 0.0754, + "step": 45661 + }, + { + "epoch": 0.8144329896907216, + "grad_norm": 0.322030633687973, + "learning_rate": 5.0630680391089755e-06, + "loss": 0.0933, + "step": 45662 + }, + { + "epoch": 0.8144508258124353, + "grad_norm": 0.23832233250141144, + "learning_rate": 5.0621289604873915e-06, + "loss": 0.0859, + "step": 45663 + }, + { + "epoch": 0.814468661934149, + "grad_norm": 0.2087278962135315, + "learning_rate": 5.061189959151888e-06, + "loss": 0.1146, + "step": 45664 + }, + { + "epoch": 0.8144864980558627, + "grad_norm": 0.2993241548538208, + "learning_rate": 5.060251035106098e-06, + "loss": 0.1284, + "step": 45665 + }, + { + "epoch": 0.8145043341775764, + "grad_norm": 0.2602251172065735, + "learning_rate": 5.059312188353662e-06, + "loss": 0.1124, + "step": 45666 + }, + { + "epoch": 0.8145221702992901, + "grad_norm": 0.20799730718135834, + "learning_rate": 5.058373418898219e-06, + "loss": 0.1142, + "step": 45667 + }, + { + "epoch": 0.8145400064210038, + "grad_norm": 0.3496764004230499, + "learning_rate": 5.057434726743401e-06, + "loss": 0.1039, + "step": 45668 + }, + { + "epoch": 0.8145578425427175, + "grad_norm": 0.25862595438957214, + "learning_rate": 5.056496111892864e-06, + "loss": 0.0815, + "step": 45669 + }, + { + "epoch": 0.8145756786644313, + "grad_norm": 0.28510621190071106, + "learning_rate": 5.055557574350234e-06, + "loss": 0.1165, + "step": 45670 + }, + { + "epoch": 0.814593514786145, + "grad_norm": 0.25111693143844604, + "learning_rate": 5.05461911411915e-06, + "loss": 0.1061, + "step": 45671 + }, + { + "epoch": 0.8146113509078586, + "grad_norm": 0.2907634377479553, + "learning_rate": 5.053680731203245e-06, + "loss": 0.0933, + "step": 45672 + }, + { + "epoch": 0.8146291870295723, + "grad_norm": 0.32620877027511597, + "learning_rate": 5.0527424256061715e-06, + "loss": 0.1024, + "step": 45673 + }, + { + "epoch": 0.814647023151286, + "grad_norm": 0.29583412408828735, + "learning_rate": 5.051804197331555e-06, + "loss": 0.127, + "step": 45674 + }, + { + "epoch": 0.8146648592729997, + "grad_norm": 0.23249945044517517, + "learning_rate": 5.050866046383032e-06, + "loss": 0.0914, + "step": 45675 + }, + { + "epoch": 0.8146826953947134, + "grad_norm": 0.24768604338169098, + "learning_rate": 5.049927972764246e-06, + "loss": 0.0955, + "step": 45676 + }, + { + "epoch": 0.8147005315164271, + "grad_norm": 0.4107540249824524, + "learning_rate": 5.048989976478824e-06, + "loss": 0.1162, + "step": 45677 + }, + { + "epoch": 0.8147183676381408, + "grad_norm": 0.2767286002635956, + "learning_rate": 5.048052057530417e-06, + "loss": 0.0876, + "step": 45678 + }, + { + "epoch": 0.8147362037598544, + "grad_norm": 0.3293555974960327, + "learning_rate": 5.047114215922649e-06, + "loss": 0.1332, + "step": 45679 + }, + { + "epoch": 0.8147540398815681, + "grad_norm": 0.29267212748527527, + "learning_rate": 5.046176451659157e-06, + "loss": 0.1168, + "step": 45680 + }, + { + "epoch": 0.8147718760032818, + "grad_norm": 0.38418206572532654, + "learning_rate": 5.04523876474357e-06, + "loss": 0.1405, + "step": 45681 + }, + { + "epoch": 0.8147897121249955, + "grad_norm": 0.3118765950202942, + "learning_rate": 5.04430115517954e-06, + "loss": 0.1173, + "step": 45682 + }, + { + "epoch": 0.8148075482467092, + "grad_norm": 0.3011217415332794, + "learning_rate": 5.043363622970687e-06, + "loss": 0.1225, + "step": 45683 + }, + { + "epoch": 0.8148253843684229, + "grad_norm": 0.2403777688741684, + "learning_rate": 5.042426168120653e-06, + "loss": 0.1244, + "step": 45684 + }, + { + "epoch": 0.8148432204901366, + "grad_norm": 0.2491941601037979, + "learning_rate": 5.041488790633059e-06, + "loss": 0.0733, + "step": 45685 + }, + { + "epoch": 0.8148610566118503, + "grad_norm": 0.27331098914146423, + "learning_rate": 5.040551490511555e-06, + "loss": 0.1204, + "step": 45686 + }, + { + "epoch": 0.8148788927335641, + "grad_norm": 0.27458685636520386, + "learning_rate": 5.0396142677597676e-06, + "loss": 0.1111, + "step": 45687 + }, + { + "epoch": 0.8148967288552778, + "grad_norm": 0.2450343519449234, + "learning_rate": 5.0386771223813244e-06, + "loss": 0.1056, + "step": 45688 + }, + { + "epoch": 0.8149145649769914, + "grad_norm": 0.2235519289970398, + "learning_rate": 5.037740054379866e-06, + "loss": 0.0772, + "step": 45689 + }, + { + "epoch": 0.8149324010987051, + "grad_norm": 0.24457727372646332, + "learning_rate": 5.036803063759018e-06, + "loss": 0.0935, + "step": 45690 + }, + { + "epoch": 0.8149502372204188, + "grad_norm": 0.19804850220680237, + "learning_rate": 5.0358661505224226e-06, + "loss": 0.12, + "step": 45691 + }, + { + "epoch": 0.8149680733421325, + "grad_norm": 0.3335053622722626, + "learning_rate": 5.0349293146737055e-06, + "loss": 0.1271, + "step": 45692 + }, + { + "epoch": 0.8149859094638462, + "grad_norm": 0.19297271966934204, + "learning_rate": 5.033992556216499e-06, + "loss": 0.0987, + "step": 45693 + }, + { + "epoch": 0.8150037455855599, + "grad_norm": 0.26616060733795166, + "learning_rate": 5.0330558751544245e-06, + "loss": 0.1063, + "step": 45694 + }, + { + "epoch": 0.8150215817072736, + "grad_norm": 0.29165148735046387, + "learning_rate": 5.032119271491129e-06, + "loss": 0.1173, + "step": 45695 + }, + { + "epoch": 0.8150394178289873, + "grad_norm": 0.22301040589809418, + "learning_rate": 5.031182745230237e-06, + "loss": 0.1062, + "step": 45696 + }, + { + "epoch": 0.8150572539507009, + "grad_norm": 0.2579430639743805, + "learning_rate": 5.030246296375377e-06, + "loss": 0.0845, + "step": 45697 + }, + { + "epoch": 0.8150750900724146, + "grad_norm": 0.3751802444458008, + "learning_rate": 5.029309924930173e-06, + "loss": 0.1316, + "step": 45698 + }, + { + "epoch": 0.8150929261941283, + "grad_norm": 0.2684709429740906, + "learning_rate": 5.028373630898267e-06, + "loss": 0.1166, + "step": 45699 + }, + { + "epoch": 0.815110762315842, + "grad_norm": 0.22469931840896606, + "learning_rate": 5.027437414283284e-06, + "loss": 0.0979, + "step": 45700 + }, + { + "epoch": 0.8151285984375557, + "grad_norm": 0.26601627469062805, + "learning_rate": 5.026501275088852e-06, + "loss": 0.1554, + "step": 45701 + }, + { + "epoch": 0.8151464345592694, + "grad_norm": 0.3379831314086914, + "learning_rate": 5.0255652133185925e-06, + "loss": 0.1192, + "step": 45702 + }, + { + "epoch": 0.8151642706809831, + "grad_norm": 0.24312078952789307, + "learning_rate": 5.0246292289761494e-06, + "loss": 0.088, + "step": 45703 + }, + { + "epoch": 0.8151821068026969, + "grad_norm": 0.22759027779102325, + "learning_rate": 5.023693322065134e-06, + "loss": 0.1286, + "step": 45704 + }, + { + "epoch": 0.8151999429244106, + "grad_norm": 0.2826850116252899, + "learning_rate": 5.022757492589192e-06, + "loss": 0.0865, + "step": 45705 + }, + { + "epoch": 0.8152177790461242, + "grad_norm": 0.3726387917995453, + "learning_rate": 5.021821740551938e-06, + "loss": 0.1297, + "step": 45706 + }, + { + "epoch": 0.8152356151678379, + "grad_norm": 0.3333369493484497, + "learning_rate": 5.020886065957001e-06, + "loss": 0.1793, + "step": 45707 + }, + { + "epoch": 0.8152534512895516, + "grad_norm": 0.3990958333015442, + "learning_rate": 5.019950468808013e-06, + "loss": 0.1521, + "step": 45708 + }, + { + "epoch": 0.8152712874112653, + "grad_norm": 0.2901363968849182, + "learning_rate": 5.019014949108599e-06, + "loss": 0.1265, + "step": 45709 + }, + { + "epoch": 0.815289123532979, + "grad_norm": 0.4082321226596832, + "learning_rate": 5.018079506862386e-06, + "loss": 0.1229, + "step": 45710 + }, + { + "epoch": 0.8153069596546927, + "grad_norm": 0.23928017914295197, + "learning_rate": 5.017144142072988e-06, + "loss": 0.1095, + "step": 45711 + }, + { + "epoch": 0.8153247957764064, + "grad_norm": 0.29541197419166565, + "learning_rate": 5.01620885474405e-06, + "loss": 0.1406, + "step": 45712 + }, + { + "epoch": 0.8153426318981201, + "grad_norm": 0.22327785193920135, + "learning_rate": 5.0152736448791895e-06, + "loss": 0.0964, + "step": 45713 + }, + { + "epoch": 0.8153604680198338, + "grad_norm": 0.3019932806491852, + "learning_rate": 5.014338512482031e-06, + "loss": 0.1351, + "step": 45714 + }, + { + "epoch": 0.8153783041415474, + "grad_norm": 0.23647254705429077, + "learning_rate": 5.013403457556193e-06, + "loss": 0.0657, + "step": 45715 + }, + { + "epoch": 0.8153961402632611, + "grad_norm": 0.29700228571891785, + "learning_rate": 5.0124684801053115e-06, + "loss": 0.0881, + "step": 45716 + }, + { + "epoch": 0.8154139763849748, + "grad_norm": 0.2415122091770172, + "learning_rate": 5.011533580132999e-06, + "loss": 0.0727, + "step": 45717 + }, + { + "epoch": 0.8154318125066885, + "grad_norm": 0.22231937944889069, + "learning_rate": 5.0105987576428925e-06, + "loss": 0.0797, + "step": 45718 + }, + { + "epoch": 0.8154496486284022, + "grad_norm": 0.3903101682662964, + "learning_rate": 5.0096640126386095e-06, + "loss": 0.1282, + "step": 45719 + }, + { + "epoch": 0.815467484750116, + "grad_norm": 0.43967410922050476, + "learning_rate": 5.008729345123767e-06, + "loss": 0.1407, + "step": 45720 + }, + { + "epoch": 0.8154853208718297, + "grad_norm": 0.22630712389945984, + "learning_rate": 5.0077947551020025e-06, + "loss": 0.0905, + "step": 45721 + }, + { + "epoch": 0.8155031569935434, + "grad_norm": 0.25379830598831177, + "learning_rate": 5.0068602425769275e-06, + "loss": 0.1629, + "step": 45722 + }, + { + "epoch": 0.815520993115257, + "grad_norm": 0.3421266973018646, + "learning_rate": 5.00592580755217e-06, + "loss": 0.1377, + "step": 45723 + }, + { + "epoch": 0.8155388292369707, + "grad_norm": 0.260030597448349, + "learning_rate": 5.004991450031341e-06, + "loss": 0.0868, + "step": 45724 + }, + { + "epoch": 0.8155566653586844, + "grad_norm": 0.3151165246963501, + "learning_rate": 5.004057170018081e-06, + "loss": 0.1177, + "step": 45725 + }, + { + "epoch": 0.8155745014803981, + "grad_norm": 0.2416413575410843, + "learning_rate": 5.003122967515999e-06, + "loss": 0.1296, + "step": 45726 + }, + { + "epoch": 0.8155923376021118, + "grad_norm": 0.28530260920524597, + "learning_rate": 5.0021888425287215e-06, + "loss": 0.0892, + "step": 45727 + }, + { + "epoch": 0.8156101737238255, + "grad_norm": 0.23772463202476501, + "learning_rate": 5.001254795059857e-06, + "loss": 0.1304, + "step": 45728 + }, + { + "epoch": 0.8156280098455392, + "grad_norm": 0.22185170650482178, + "learning_rate": 5.000320825113045e-06, + "loss": 0.1085, + "step": 45729 + }, + { + "epoch": 0.8156458459672529, + "grad_norm": 0.26586902141571045, + "learning_rate": 4.999386932691896e-06, + "loss": 0.1079, + "step": 45730 + }, + { + "epoch": 0.8156636820889666, + "grad_norm": 0.21446923911571503, + "learning_rate": 4.998453117800026e-06, + "loss": 0.0996, + "step": 45731 + }, + { + "epoch": 0.8156815182106802, + "grad_norm": 0.30676373839378357, + "learning_rate": 4.997519380441068e-06, + "loss": 0.1463, + "step": 45732 + }, + { + "epoch": 0.8156993543323939, + "grad_norm": 0.2974746823310852, + "learning_rate": 4.996585720618624e-06, + "loss": 0.1027, + "step": 45733 + }, + { + "epoch": 0.8157171904541076, + "grad_norm": 0.30245092511177063, + "learning_rate": 4.995652138336329e-06, + "loss": 0.1176, + "step": 45734 + }, + { + "epoch": 0.8157350265758213, + "grad_norm": 0.22816167771816254, + "learning_rate": 4.994718633597798e-06, + "loss": 0.0915, + "step": 45735 + }, + { + "epoch": 0.815752862697535, + "grad_norm": 0.33669912815093994, + "learning_rate": 4.993785206406648e-06, + "loss": 0.1251, + "step": 45736 + }, + { + "epoch": 0.8157706988192488, + "grad_norm": 0.3293575346469879, + "learning_rate": 4.992851856766487e-06, + "loss": 0.1637, + "step": 45737 + }, + { + "epoch": 0.8157885349409625, + "grad_norm": 0.2731926143169403, + "learning_rate": 4.991918584680949e-06, + "loss": 0.1454, + "step": 45738 + }, + { + "epoch": 0.8158063710626762, + "grad_norm": 0.2395857721567154, + "learning_rate": 4.990985390153647e-06, + "loss": 0.1157, + "step": 45739 + }, + { + "epoch": 0.8158242071843899, + "grad_norm": 0.24014852941036224, + "learning_rate": 4.9900522731881945e-06, + "loss": 0.0914, + "step": 45740 + }, + { + "epoch": 0.8158420433061035, + "grad_norm": 0.35486292839050293, + "learning_rate": 4.989119233788206e-06, + "loss": 0.181, + "step": 45741 + }, + { + "epoch": 0.8158598794278172, + "grad_norm": 0.24551841616630554, + "learning_rate": 4.98818627195731e-06, + "loss": 0.1261, + "step": 45742 + }, + { + "epoch": 0.8158777155495309, + "grad_norm": 0.23259590566158295, + "learning_rate": 4.987253387699115e-06, + "loss": 0.0665, + "step": 45743 + }, + { + "epoch": 0.8158955516712446, + "grad_norm": 0.341943621635437, + "learning_rate": 4.986320581017231e-06, + "loss": 0.1485, + "step": 45744 + }, + { + "epoch": 0.8159133877929583, + "grad_norm": 0.36857911944389343, + "learning_rate": 4.985387851915288e-06, + "loss": 0.122, + "step": 45745 + }, + { + "epoch": 0.815931223914672, + "grad_norm": 0.265610933303833, + "learning_rate": 4.984455200396887e-06, + "loss": 0.1259, + "step": 45746 + }, + { + "epoch": 0.8159490600363857, + "grad_norm": 0.2793954908847809, + "learning_rate": 4.983522626465658e-06, + "loss": 0.1138, + "step": 45747 + }, + { + "epoch": 0.8159668961580994, + "grad_norm": 0.24487145245075226, + "learning_rate": 4.982590130125208e-06, + "loss": 0.1315, + "step": 45748 + }, + { + "epoch": 0.815984732279813, + "grad_norm": 0.27068623900413513, + "learning_rate": 4.981657711379154e-06, + "loss": 0.0811, + "step": 45749 + }, + { + "epoch": 0.8160025684015267, + "grad_norm": 0.24784843623638153, + "learning_rate": 4.980725370231101e-06, + "loss": 0.0863, + "step": 45750 + }, + { + "epoch": 0.8160204045232404, + "grad_norm": 0.3247925937175751, + "learning_rate": 4.979793106684677e-06, + "loss": 0.0895, + "step": 45751 + }, + { + "epoch": 0.8160382406449541, + "grad_norm": 0.2142827957868576, + "learning_rate": 4.978860920743492e-06, + "loss": 0.0794, + "step": 45752 + }, + { + "epoch": 0.8160560767666678, + "grad_norm": 0.31806308031082153, + "learning_rate": 4.977928812411156e-06, + "loss": 0.1146, + "step": 45753 + }, + { + "epoch": 0.8160739128883816, + "grad_norm": 0.25226280093193054, + "learning_rate": 4.976996781691276e-06, + "loss": 0.1248, + "step": 45754 + }, + { + "epoch": 0.8160917490100953, + "grad_norm": 0.22675657272338867, + "learning_rate": 4.976064828587479e-06, + "loss": 0.1015, + "step": 45755 + }, + { + "epoch": 0.816109585131809, + "grad_norm": 0.2481113225221634, + "learning_rate": 4.975132953103371e-06, + "loss": 0.0821, + "step": 45756 + }, + { + "epoch": 0.8161274212535227, + "grad_norm": 0.1804397851228714, + "learning_rate": 4.974201155242564e-06, + "loss": 0.0781, + "step": 45757 + }, + { + "epoch": 0.8161452573752364, + "grad_norm": 0.26124024391174316, + "learning_rate": 4.973269435008662e-06, + "loss": 0.1038, + "step": 45758 + }, + { + "epoch": 0.81616309349695, + "grad_norm": 0.2906644344329834, + "learning_rate": 4.972337792405288e-06, + "loss": 0.1582, + "step": 45759 + }, + { + "epoch": 0.8161809296186637, + "grad_norm": 0.25993266701698303, + "learning_rate": 4.971406227436054e-06, + "loss": 0.2159, + "step": 45760 + }, + { + "epoch": 0.8161987657403774, + "grad_norm": 0.29438257217407227, + "learning_rate": 4.970474740104569e-06, + "loss": 0.1117, + "step": 45761 + }, + { + "epoch": 0.8162166018620911, + "grad_norm": 0.27353140711784363, + "learning_rate": 4.969543330414439e-06, + "loss": 0.1076, + "step": 45762 + }, + { + "epoch": 0.8162344379838048, + "grad_norm": 0.25626784563064575, + "learning_rate": 4.968611998369274e-06, + "loss": 0.087, + "step": 45763 + }, + { + "epoch": 0.8162522741055185, + "grad_norm": 0.24238912761211395, + "learning_rate": 4.967680743972691e-06, + "loss": 0.0801, + "step": 45764 + }, + { + "epoch": 0.8162701102272322, + "grad_norm": 0.37771111726760864, + "learning_rate": 4.966749567228298e-06, + "loss": 0.1855, + "step": 45765 + }, + { + "epoch": 0.8162879463489459, + "grad_norm": 0.2881709039211273, + "learning_rate": 4.965818468139705e-06, + "loss": 0.0789, + "step": 45766 + }, + { + "epoch": 0.8163057824706595, + "grad_norm": 0.28422465920448303, + "learning_rate": 4.964887446710509e-06, + "loss": 0.1139, + "step": 45767 + }, + { + "epoch": 0.8163236185923732, + "grad_norm": 0.3086110055446625, + "learning_rate": 4.9639565029443395e-06, + "loss": 0.1484, + "step": 45768 + }, + { + "epoch": 0.8163414547140869, + "grad_norm": 0.2513750493526459, + "learning_rate": 4.96302563684479e-06, + "loss": 0.1055, + "step": 45769 + }, + { + "epoch": 0.8163592908358006, + "grad_norm": 0.2533184885978699, + "learning_rate": 4.9620948484154775e-06, + "loss": 0.1157, + "step": 45770 + }, + { + "epoch": 0.8163771269575144, + "grad_norm": 0.232883483171463, + "learning_rate": 4.961164137659999e-06, + "loss": 0.106, + "step": 45771 + }, + { + "epoch": 0.8163949630792281, + "grad_norm": 0.38878074288368225, + "learning_rate": 4.960233504581973e-06, + "loss": 0.1654, + "step": 45772 + }, + { + "epoch": 0.8164127992009418, + "grad_norm": 0.2574453055858612, + "learning_rate": 4.959302949185002e-06, + "loss": 0.1214, + "step": 45773 + }, + { + "epoch": 0.8164306353226555, + "grad_norm": 0.20646406710147858, + "learning_rate": 4.958372471472697e-06, + "loss": 0.091, + "step": 45774 + }, + { + "epoch": 0.8164484714443692, + "grad_norm": 0.2746943235397339, + "learning_rate": 4.957442071448665e-06, + "loss": 0.1221, + "step": 45775 + }, + { + "epoch": 0.8164663075660828, + "grad_norm": 0.21967127919197083, + "learning_rate": 4.956511749116502e-06, + "loss": 0.0915, + "step": 45776 + }, + { + "epoch": 0.8164841436877965, + "grad_norm": 0.26342082023620605, + "learning_rate": 4.955581504479831e-06, + "loss": 0.1027, + "step": 45777 + }, + { + "epoch": 0.8165019798095102, + "grad_norm": 0.25215309858322144, + "learning_rate": 4.954651337542246e-06, + "loss": 0.0958, + "step": 45778 + }, + { + "epoch": 0.8165198159312239, + "grad_norm": 0.2854245901107788, + "learning_rate": 4.953721248307358e-06, + "loss": 0.1234, + "step": 45779 + }, + { + "epoch": 0.8165376520529376, + "grad_norm": 0.3793368339538574, + "learning_rate": 4.9527912367787615e-06, + "loss": 0.1675, + "step": 45780 + }, + { + "epoch": 0.8165554881746513, + "grad_norm": 0.2954586148262024, + "learning_rate": 4.951861302960078e-06, + "loss": 0.1387, + "step": 45781 + }, + { + "epoch": 0.816573324296365, + "grad_norm": 0.2922453284263611, + "learning_rate": 4.950931446854906e-06, + "loss": 0.1192, + "step": 45782 + }, + { + "epoch": 0.8165911604180787, + "grad_norm": 0.2825854420661926, + "learning_rate": 4.950001668466847e-06, + "loss": 0.142, + "step": 45783 + }, + { + "epoch": 0.8166089965397924, + "grad_norm": 0.21359816193580627, + "learning_rate": 4.949071967799501e-06, + "loss": 0.0717, + "step": 45784 + }, + { + "epoch": 0.816626832661506, + "grad_norm": 0.23116125166416168, + "learning_rate": 4.948142344856482e-06, + "loss": 0.06, + "step": 45785 + }, + { + "epoch": 0.8166446687832197, + "grad_norm": 0.30504778027534485, + "learning_rate": 4.9472127996413846e-06, + "loss": 0.1091, + "step": 45786 + }, + { + "epoch": 0.8166625049049334, + "grad_norm": 0.31183409690856934, + "learning_rate": 4.946283332157822e-06, + "loss": 0.128, + "step": 45787 + }, + { + "epoch": 0.8166803410266472, + "grad_norm": 0.26938846707344055, + "learning_rate": 4.945353942409392e-06, + "loss": 0.1011, + "step": 45788 + }, + { + "epoch": 0.8166981771483609, + "grad_norm": 0.36265090107917786, + "learning_rate": 4.944424630399688e-06, + "loss": 0.1167, + "step": 45789 + }, + { + "epoch": 0.8167160132700746, + "grad_norm": 0.31251442432403564, + "learning_rate": 4.94349539613233e-06, + "loss": 0.0947, + "step": 45790 + }, + { + "epoch": 0.8167338493917883, + "grad_norm": 0.2830224335193634, + "learning_rate": 4.94256623961091e-06, + "loss": 0.1542, + "step": 45791 + }, + { + "epoch": 0.816751685513502, + "grad_norm": 0.21515852212905884, + "learning_rate": 4.941637160839033e-06, + "loss": 0.077, + "step": 45792 + }, + { + "epoch": 0.8167695216352157, + "grad_norm": 0.23835642635822296, + "learning_rate": 4.94070815982029e-06, + "loss": 0.1479, + "step": 45793 + }, + { + "epoch": 0.8167873577569293, + "grad_norm": 0.38214388489723206, + "learning_rate": 4.939779236558298e-06, + "loss": 0.1306, + "step": 45794 + }, + { + "epoch": 0.816805193878643, + "grad_norm": 0.21305690705776215, + "learning_rate": 4.938850391056648e-06, + "loss": 0.0799, + "step": 45795 + }, + { + "epoch": 0.8168230300003567, + "grad_norm": 0.4495021104812622, + "learning_rate": 4.937921623318944e-06, + "loss": 0.1168, + "step": 45796 + }, + { + "epoch": 0.8168408661220704, + "grad_norm": 0.31790244579315186, + "learning_rate": 4.93699293334878e-06, + "loss": 0.1797, + "step": 45797 + }, + { + "epoch": 0.8168587022437841, + "grad_norm": 0.3388820290565491, + "learning_rate": 4.936064321149766e-06, + "loss": 0.1488, + "step": 45798 + }, + { + "epoch": 0.8168765383654978, + "grad_norm": 0.30388545989990234, + "learning_rate": 4.935135786725498e-06, + "loss": 0.1315, + "step": 45799 + }, + { + "epoch": 0.8168943744872115, + "grad_norm": 0.20960137248039246, + "learning_rate": 4.9342073300795654e-06, + "loss": 0.0885, + "step": 45800 + }, + { + "epoch": 0.8169122106089252, + "grad_norm": 0.22565042972564697, + "learning_rate": 4.933278951215581e-06, + "loss": 0.1111, + "step": 45801 + }, + { + "epoch": 0.8169300467306388, + "grad_norm": 0.26642322540283203, + "learning_rate": 4.932350650137135e-06, + "loss": 0.1104, + "step": 45802 + }, + { + "epoch": 0.8169478828523525, + "grad_norm": 0.2083117812871933, + "learning_rate": 4.931422426847834e-06, + "loss": 0.1007, + "step": 45803 + }, + { + "epoch": 0.8169657189740662, + "grad_norm": 0.25963038206100464, + "learning_rate": 4.930494281351269e-06, + "loss": 0.1256, + "step": 45804 + }, + { + "epoch": 0.81698355509578, + "grad_norm": 0.26066914200782776, + "learning_rate": 4.9295662136510434e-06, + "loss": 0.0713, + "step": 45805 + }, + { + "epoch": 0.8170013912174937, + "grad_norm": 0.23632046580314636, + "learning_rate": 4.928638223750745e-06, + "loss": 0.1243, + "step": 45806 + }, + { + "epoch": 0.8170192273392074, + "grad_norm": 0.2195056676864624, + "learning_rate": 4.927710311653982e-06, + "loss": 0.104, + "step": 45807 + }, + { + "epoch": 0.8170370634609211, + "grad_norm": 0.19980207085609436, + "learning_rate": 4.926782477364348e-06, + "loss": 0.0877, + "step": 45808 + }, + { + "epoch": 0.8170548995826348, + "grad_norm": 0.237391859292984, + "learning_rate": 4.925854720885436e-06, + "loss": 0.0976, + "step": 45809 + }, + { + "epoch": 0.8170727357043485, + "grad_norm": 0.2724595069885254, + "learning_rate": 4.924927042220839e-06, + "loss": 0.0809, + "step": 45810 + }, + { + "epoch": 0.8170905718260622, + "grad_norm": 0.21402789652347565, + "learning_rate": 4.923999441374164e-06, + "loss": 0.0892, + "step": 45811 + }, + { + "epoch": 0.8171084079477758, + "grad_norm": 0.2948894202709198, + "learning_rate": 4.923071918349003e-06, + "loss": 0.0972, + "step": 45812 + }, + { + "epoch": 0.8171262440694895, + "grad_norm": 0.2897847592830658, + "learning_rate": 4.922144473148943e-06, + "loss": 0.1289, + "step": 45813 + }, + { + "epoch": 0.8171440801912032, + "grad_norm": 0.2478969246149063, + "learning_rate": 4.92121710577759e-06, + "loss": 0.1016, + "step": 45814 + }, + { + "epoch": 0.8171619163129169, + "grad_norm": 0.21380673348903656, + "learning_rate": 4.92028981623853e-06, + "loss": 0.1043, + "step": 45815 + }, + { + "epoch": 0.8171797524346306, + "grad_norm": 0.21537034213542938, + "learning_rate": 4.919362604535368e-06, + "loss": 0.0798, + "step": 45816 + }, + { + "epoch": 0.8171975885563443, + "grad_norm": 0.2611224055290222, + "learning_rate": 4.918435470671692e-06, + "loss": 0.146, + "step": 45817 + }, + { + "epoch": 0.817215424678058, + "grad_norm": 0.3332725167274475, + "learning_rate": 4.917508414651095e-06, + "loss": 0.1237, + "step": 45818 + }, + { + "epoch": 0.8172332607997717, + "grad_norm": 0.4206140637397766, + "learning_rate": 4.916581436477169e-06, + "loss": 0.0917, + "step": 45819 + }, + { + "epoch": 0.8172510969214853, + "grad_norm": 0.38903236389160156, + "learning_rate": 4.915654536153513e-06, + "loss": 0.1404, + "step": 45820 + }, + { + "epoch": 0.8172689330431991, + "grad_norm": 0.26197049021720886, + "learning_rate": 4.914727713683717e-06, + "loss": 0.0757, + "step": 45821 + }, + { + "epoch": 0.8172867691649128, + "grad_norm": 0.2817513048648834, + "learning_rate": 4.913800969071375e-06, + "loss": 0.0934, + "step": 45822 + }, + { + "epoch": 0.8173046052866265, + "grad_norm": 0.28478750586509705, + "learning_rate": 4.912874302320069e-06, + "loss": 0.1077, + "step": 45823 + }, + { + "epoch": 0.8173224414083402, + "grad_norm": 0.25764408707618713, + "learning_rate": 4.91194771343341e-06, + "loss": 0.1108, + "step": 45824 + }, + { + "epoch": 0.8173402775300539, + "grad_norm": 0.29493528604507446, + "learning_rate": 4.911021202414978e-06, + "loss": 0.1475, + "step": 45825 + }, + { + "epoch": 0.8173581136517676, + "grad_norm": 0.2621796131134033, + "learning_rate": 4.910094769268367e-06, + "loss": 0.1698, + "step": 45826 + }, + { + "epoch": 0.8173759497734813, + "grad_norm": 0.3376453220844269, + "learning_rate": 4.909168413997162e-06, + "loss": 0.127, + "step": 45827 + }, + { + "epoch": 0.817393785895195, + "grad_norm": 0.23872889578342438, + "learning_rate": 4.908242136604962e-06, + "loss": 0.1166, + "step": 45828 + }, + { + "epoch": 0.8174116220169086, + "grad_norm": 0.21681855618953705, + "learning_rate": 4.907315937095352e-06, + "loss": 0.0882, + "step": 45829 + }, + { + "epoch": 0.8174294581386223, + "grad_norm": 0.26990383863449097, + "learning_rate": 4.906389815471929e-06, + "loss": 0.0902, + "step": 45830 + }, + { + "epoch": 0.817447294260336, + "grad_norm": 0.23880477249622345, + "learning_rate": 4.905463771738283e-06, + "loss": 0.0397, + "step": 45831 + }, + { + "epoch": 0.8174651303820497, + "grad_norm": 0.21226686239242554, + "learning_rate": 4.9045378058979905e-06, + "loss": 0.1026, + "step": 45832 + }, + { + "epoch": 0.8174829665037634, + "grad_norm": 0.2717452645301819, + "learning_rate": 4.903611917954656e-06, + "loss": 0.1114, + "step": 45833 + }, + { + "epoch": 0.8175008026254771, + "grad_norm": 0.37122642993927, + "learning_rate": 4.902686107911867e-06, + "loss": 0.1324, + "step": 45834 + }, + { + "epoch": 0.8175186387471908, + "grad_norm": 0.24966545403003693, + "learning_rate": 4.901760375773204e-06, + "loss": 0.0978, + "step": 45835 + }, + { + "epoch": 0.8175364748689045, + "grad_norm": 0.2755209505558014, + "learning_rate": 4.900834721542255e-06, + "loss": 0.1006, + "step": 45836 + }, + { + "epoch": 0.8175543109906181, + "grad_norm": 0.20548062026500702, + "learning_rate": 4.899909145222617e-06, + "loss": 0.0453, + "step": 45837 + }, + { + "epoch": 0.817572147112332, + "grad_norm": 0.22613300383090973, + "learning_rate": 4.898983646817876e-06, + "loss": 0.136, + "step": 45838 + }, + { + "epoch": 0.8175899832340456, + "grad_norm": 0.22451215982437134, + "learning_rate": 4.898058226331615e-06, + "loss": 0.1168, + "step": 45839 + }, + { + "epoch": 0.8176078193557593, + "grad_norm": 0.2814333438873291, + "learning_rate": 4.897132883767425e-06, + "loss": 0.1011, + "step": 45840 + }, + { + "epoch": 0.817625655477473, + "grad_norm": 0.26289355754852295, + "learning_rate": 4.896207619128884e-06, + "loss": 0.1122, + "step": 45841 + }, + { + "epoch": 0.8176434915991867, + "grad_norm": 0.2840304374694824, + "learning_rate": 4.895282432419585e-06, + "loss": 0.0935, + "step": 45842 + }, + { + "epoch": 0.8176613277209004, + "grad_norm": 0.27413636445999146, + "learning_rate": 4.894357323643123e-06, + "loss": 0.083, + "step": 45843 + }, + { + "epoch": 0.8176791638426141, + "grad_norm": 0.27589163184165955, + "learning_rate": 4.8934322928030764e-06, + "loss": 0.1403, + "step": 45844 + }, + { + "epoch": 0.8176969999643278, + "grad_norm": 0.3354912996292114, + "learning_rate": 4.892507339903024e-06, + "loss": 0.1162, + "step": 45845 + }, + { + "epoch": 0.8177148360860415, + "grad_norm": 0.2817024886608124, + "learning_rate": 4.891582464946562e-06, + "loss": 0.1184, + "step": 45846 + }, + { + "epoch": 0.8177326722077551, + "grad_norm": 0.2827165424823761, + "learning_rate": 4.890657667937276e-06, + "loss": 0.1107, + "step": 45847 + }, + { + "epoch": 0.8177505083294688, + "grad_norm": 0.27988970279693604, + "learning_rate": 4.889732948878745e-06, + "loss": 0.0916, + "step": 45848 + }, + { + "epoch": 0.8177683444511825, + "grad_norm": 0.3029497563838959, + "learning_rate": 4.888808307774545e-06, + "loss": 0.1428, + "step": 45849 + }, + { + "epoch": 0.8177861805728962, + "grad_norm": 0.306348979473114, + "learning_rate": 4.88788374462828e-06, + "loss": 0.0998, + "step": 45850 + }, + { + "epoch": 0.8178040166946099, + "grad_norm": 0.2668544054031372, + "learning_rate": 4.886959259443524e-06, + "loss": 0.1205, + "step": 45851 + }, + { + "epoch": 0.8178218528163236, + "grad_norm": 0.2521647810935974, + "learning_rate": 4.886034852223862e-06, + "loss": 0.1194, + "step": 45852 + }, + { + "epoch": 0.8178396889380373, + "grad_norm": 0.2065790295600891, + "learning_rate": 4.885110522972875e-06, + "loss": 0.0756, + "step": 45853 + }, + { + "epoch": 0.817857525059751, + "grad_norm": 0.24508579075336456, + "learning_rate": 4.884186271694138e-06, + "loss": 0.0932, + "step": 45854 + }, + { + "epoch": 0.8178753611814648, + "grad_norm": 0.2725170850753784, + "learning_rate": 4.88326209839125e-06, + "loss": 0.1723, + "step": 45855 + }, + { + "epoch": 0.8178931973031784, + "grad_norm": 0.28779685497283936, + "learning_rate": 4.882338003067783e-06, + "loss": 0.1604, + "step": 45856 + }, + { + "epoch": 0.8179110334248921, + "grad_norm": 0.2571767568588257, + "learning_rate": 4.881413985727326e-06, + "loss": 0.1409, + "step": 45857 + }, + { + "epoch": 0.8179288695466058, + "grad_norm": 0.2688414752483368, + "learning_rate": 4.880490046373451e-06, + "loss": 0.1, + "step": 45858 + }, + { + "epoch": 0.8179467056683195, + "grad_norm": 0.23567961156368256, + "learning_rate": 4.879566185009754e-06, + "loss": 0.1245, + "step": 45859 + }, + { + "epoch": 0.8179645417900332, + "grad_norm": 0.2957104742527008, + "learning_rate": 4.878642401639805e-06, + "loss": 0.1138, + "step": 45860 + }, + { + "epoch": 0.8179823779117469, + "grad_norm": 0.28867942094802856, + "learning_rate": 4.877718696267189e-06, + "loss": 0.0653, + "step": 45861 + }, + { + "epoch": 0.8180002140334606, + "grad_norm": 0.35218754410743713, + "learning_rate": 4.876795068895479e-06, + "loss": 0.1095, + "step": 45862 + }, + { + "epoch": 0.8180180501551743, + "grad_norm": 0.33524566888809204, + "learning_rate": 4.875871519528269e-06, + "loss": 0.1696, + "step": 45863 + }, + { + "epoch": 0.818035886276888, + "grad_norm": 0.2608015537261963, + "learning_rate": 4.874948048169131e-06, + "loss": 0.0963, + "step": 45864 + }, + { + "epoch": 0.8180537223986016, + "grad_norm": 0.2816460430622101, + "learning_rate": 4.874024654821643e-06, + "loss": 0.1386, + "step": 45865 + }, + { + "epoch": 0.8180715585203153, + "grad_norm": 0.25960037112236023, + "learning_rate": 4.873101339489392e-06, + "loss": 0.1348, + "step": 45866 + }, + { + "epoch": 0.818089394642029, + "grad_norm": 0.2543811500072479, + "learning_rate": 4.872178102175939e-06, + "loss": 0.0835, + "step": 45867 + }, + { + "epoch": 0.8181072307637427, + "grad_norm": 0.27094703912734985, + "learning_rate": 4.8712549428848865e-06, + "loss": 0.1189, + "step": 45868 + }, + { + "epoch": 0.8181250668854564, + "grad_norm": 0.27168506383895874, + "learning_rate": 4.870331861619795e-06, + "loss": 0.1252, + "step": 45869 + }, + { + "epoch": 0.8181429030071701, + "grad_norm": 0.2607700526714325, + "learning_rate": 4.869408858384256e-06, + "loss": 0.0899, + "step": 45870 + }, + { + "epoch": 0.8181607391288838, + "grad_norm": 0.2877371907234192, + "learning_rate": 4.868485933181832e-06, + "loss": 0.0862, + "step": 45871 + }, + { + "epoch": 0.8181785752505976, + "grad_norm": 0.2727615237236023, + "learning_rate": 4.867563086016119e-06, + "loss": 0.1597, + "step": 45872 + }, + { + "epoch": 0.8181964113723112, + "grad_norm": 0.2519082725048065, + "learning_rate": 4.866640316890686e-06, + "loss": 0.1317, + "step": 45873 + }, + { + "epoch": 0.8182142474940249, + "grad_norm": 0.32453885674476624, + "learning_rate": 4.865717625809108e-06, + "loss": 0.1045, + "step": 45874 + }, + { + "epoch": 0.8182320836157386, + "grad_norm": 0.2822349965572357, + "learning_rate": 4.864795012774953e-06, + "loss": 0.1303, + "step": 45875 + }, + { + "epoch": 0.8182499197374523, + "grad_norm": 0.2806457579135895, + "learning_rate": 4.863872477791817e-06, + "loss": 0.1222, + "step": 45876 + }, + { + "epoch": 0.818267755859166, + "grad_norm": 0.3050583600997925, + "learning_rate": 4.862950020863266e-06, + "loss": 0.136, + "step": 45877 + }, + { + "epoch": 0.8182855919808797, + "grad_norm": 0.22583162784576416, + "learning_rate": 4.862027641992875e-06, + "loss": 0.1082, + "step": 45878 + }, + { + "epoch": 0.8183034281025934, + "grad_norm": 0.3267974853515625, + "learning_rate": 4.861105341184219e-06, + "loss": 0.1137, + "step": 45879 + }, + { + "epoch": 0.8183212642243071, + "grad_norm": 0.3673345446586609, + "learning_rate": 4.86018311844087e-06, + "loss": 0.1569, + "step": 45880 + }, + { + "epoch": 0.8183391003460208, + "grad_norm": 0.2517510652542114, + "learning_rate": 4.859260973766413e-06, + "loss": 0.1102, + "step": 45881 + }, + { + "epoch": 0.8183569364677344, + "grad_norm": 0.3128267824649811, + "learning_rate": 4.858338907164417e-06, + "loss": 0.0868, + "step": 45882 + }, + { + "epoch": 0.8183747725894481, + "grad_norm": 0.2551311254501343, + "learning_rate": 4.857416918638449e-06, + "loss": 0.1136, + "step": 45883 + }, + { + "epoch": 0.8183926087111618, + "grad_norm": 0.2428465038537979, + "learning_rate": 4.856495008192097e-06, + "loss": 0.1141, + "step": 45884 + }, + { + "epoch": 0.8184104448328755, + "grad_norm": 0.3564857244491577, + "learning_rate": 4.85557317582892e-06, + "loss": 0.1493, + "step": 45885 + }, + { + "epoch": 0.8184282809545892, + "grad_norm": 0.31332963705062866, + "learning_rate": 4.854651421552509e-06, + "loss": 0.1231, + "step": 45886 + }, + { + "epoch": 0.8184461170763029, + "grad_norm": 0.27633076906204224, + "learning_rate": 4.853729745366423e-06, + "loss": 0.1179, + "step": 45887 + }, + { + "epoch": 0.8184639531980166, + "grad_norm": 0.2887042164802551, + "learning_rate": 4.852808147274235e-06, + "loss": 0.0956, + "step": 45888 + }, + { + "epoch": 0.8184817893197304, + "grad_norm": 0.3558990955352783, + "learning_rate": 4.851886627279525e-06, + "loss": 0.1406, + "step": 45889 + }, + { + "epoch": 0.8184996254414441, + "grad_norm": 0.3414379954338074, + "learning_rate": 4.850965185385864e-06, + "loss": 0.0849, + "step": 45890 + }, + { + "epoch": 0.8185174615631577, + "grad_norm": 0.2530554234981537, + "learning_rate": 4.8500438215968226e-06, + "loss": 0.1132, + "step": 45891 + }, + { + "epoch": 0.8185352976848714, + "grad_norm": 0.347713828086853, + "learning_rate": 4.849122535915968e-06, + "loss": 0.1423, + "step": 45892 + }, + { + "epoch": 0.8185531338065851, + "grad_norm": 0.2625904977321625, + "learning_rate": 4.848201328346869e-06, + "loss": 0.083, + "step": 45893 + }, + { + "epoch": 0.8185709699282988, + "grad_norm": 0.2691967189311981, + "learning_rate": 4.847280198893106e-06, + "loss": 0.1555, + "step": 45894 + }, + { + "epoch": 0.8185888060500125, + "grad_norm": 0.27361661195755005, + "learning_rate": 4.846359147558249e-06, + "loss": 0.0712, + "step": 45895 + }, + { + "epoch": 0.8186066421717262, + "grad_norm": 0.2857093811035156, + "learning_rate": 4.845438174345856e-06, + "loss": 0.1275, + "step": 45896 + }, + { + "epoch": 0.8186244782934399, + "grad_norm": 0.26945415139198303, + "learning_rate": 4.844517279259514e-06, + "loss": 0.11, + "step": 45897 + }, + { + "epoch": 0.8186423144151536, + "grad_norm": 0.27251341938972473, + "learning_rate": 4.843596462302777e-06, + "loss": 0.1218, + "step": 45898 + }, + { + "epoch": 0.8186601505368672, + "grad_norm": 0.2999931871891022, + "learning_rate": 4.842675723479229e-06, + "loss": 0.1198, + "step": 45899 + }, + { + "epoch": 0.8186779866585809, + "grad_norm": 0.3390146493911743, + "learning_rate": 4.8417550627924305e-06, + "loss": 0.1462, + "step": 45900 + }, + { + "epoch": 0.8186958227802946, + "grad_norm": 0.24994206428527832, + "learning_rate": 4.840834480245945e-06, + "loss": 0.1209, + "step": 45901 + }, + { + "epoch": 0.8187136589020083, + "grad_norm": 0.3419128358364105, + "learning_rate": 4.839913975843355e-06, + "loss": 0.0972, + "step": 45902 + }, + { + "epoch": 0.818731495023722, + "grad_norm": 0.25583016872406006, + "learning_rate": 4.838993549588222e-06, + "loss": 0.1276, + "step": 45903 + }, + { + "epoch": 0.8187493311454357, + "grad_norm": 0.3033185303211212, + "learning_rate": 4.838073201484114e-06, + "loss": 0.1267, + "step": 45904 + }, + { + "epoch": 0.8187671672671494, + "grad_norm": 0.2594734728336334, + "learning_rate": 4.837152931534597e-06, + "loss": 0.1089, + "step": 45905 + }, + { + "epoch": 0.8187850033888632, + "grad_norm": 0.3624280095100403, + "learning_rate": 4.836232739743235e-06, + "loss": 0.1091, + "step": 45906 + }, + { + "epoch": 0.8188028395105769, + "grad_norm": 0.26875096559524536, + "learning_rate": 4.835312626113603e-06, + "loss": 0.0885, + "step": 45907 + }, + { + "epoch": 0.8188206756322906, + "grad_norm": 0.2868305742740631, + "learning_rate": 4.8343925906492646e-06, + "loss": 0.1384, + "step": 45908 + }, + { + "epoch": 0.8188385117540042, + "grad_norm": 0.22457817196846008, + "learning_rate": 4.833472633353786e-06, + "loss": 0.105, + "step": 45909 + }, + { + "epoch": 0.8188563478757179, + "grad_norm": 0.37026724219322205, + "learning_rate": 4.832552754230726e-06, + "loss": 0.0971, + "step": 45910 + }, + { + "epoch": 0.8188741839974316, + "grad_norm": 0.2729056775569916, + "learning_rate": 4.831632953283663e-06, + "loss": 0.1161, + "step": 45911 + }, + { + "epoch": 0.8188920201191453, + "grad_norm": 0.22457444667816162, + "learning_rate": 4.8307132305161515e-06, + "loss": 0.1265, + "step": 45912 + }, + { + "epoch": 0.818909856240859, + "grad_norm": 0.22164063155651093, + "learning_rate": 4.829793585931766e-06, + "loss": 0.0669, + "step": 45913 + }, + { + "epoch": 0.8189276923625727, + "grad_norm": 0.32471734285354614, + "learning_rate": 4.828874019534063e-06, + "loss": 0.0635, + "step": 45914 + }, + { + "epoch": 0.8189455284842864, + "grad_norm": 0.22897087037563324, + "learning_rate": 4.827954531326617e-06, + "loss": 0.0971, + "step": 45915 + }, + { + "epoch": 0.818963364606, + "grad_norm": 0.2785676419734955, + "learning_rate": 4.827035121312987e-06, + "loss": 0.1101, + "step": 45916 + }, + { + "epoch": 0.8189812007277137, + "grad_norm": 0.1943778693675995, + "learning_rate": 4.8261157894967355e-06, + "loss": 0.052, + "step": 45917 + }, + { + "epoch": 0.8189990368494274, + "grad_norm": 0.20942983031272888, + "learning_rate": 4.8251965358814265e-06, + "loss": 0.0987, + "step": 45918 + }, + { + "epoch": 0.8190168729711411, + "grad_norm": 0.22504980862140656, + "learning_rate": 4.824277360470619e-06, + "loss": 0.1372, + "step": 45919 + }, + { + "epoch": 0.8190347090928548, + "grad_norm": 0.34018179774284363, + "learning_rate": 4.823358263267885e-06, + "loss": 0.122, + "step": 45920 + }, + { + "epoch": 0.8190525452145685, + "grad_norm": 0.2242036610841751, + "learning_rate": 4.822439244276786e-06, + "loss": 0.0938, + "step": 45921 + }, + { + "epoch": 0.8190703813362823, + "grad_norm": 0.29010704159736633, + "learning_rate": 4.8215203035008785e-06, + "loss": 0.127, + "step": 45922 + }, + { + "epoch": 0.819088217457996, + "grad_norm": 0.35392218828201294, + "learning_rate": 4.820601440943723e-06, + "loss": 0.1512, + "step": 45923 + }, + { + "epoch": 0.8191060535797097, + "grad_norm": 0.2635032534599304, + "learning_rate": 4.8196826566088926e-06, + "loss": 0.1185, + "step": 45924 + }, + { + "epoch": 0.8191238897014234, + "grad_norm": 0.2950946092605591, + "learning_rate": 4.818763950499935e-06, + "loss": 0.1308, + "step": 45925 + }, + { + "epoch": 0.819141725823137, + "grad_norm": 0.2535664439201355, + "learning_rate": 4.817845322620426e-06, + "loss": 0.0633, + "step": 45926 + }, + { + "epoch": 0.8191595619448507, + "grad_norm": 0.33581188321113586, + "learning_rate": 4.816926772973915e-06, + "loss": 0.0996, + "step": 45927 + }, + { + "epoch": 0.8191773980665644, + "grad_norm": 0.2544093132019043, + "learning_rate": 4.81600830156397e-06, + "loss": 0.1417, + "step": 45928 + }, + { + "epoch": 0.8191952341882781, + "grad_norm": 0.25684642791748047, + "learning_rate": 4.815089908394149e-06, + "loss": 0.1048, + "step": 45929 + }, + { + "epoch": 0.8192130703099918, + "grad_norm": 0.39580997824668884, + "learning_rate": 4.814171593468011e-06, + "loss": 0.1266, + "step": 45930 + }, + { + "epoch": 0.8192309064317055, + "grad_norm": 0.36412855982780457, + "learning_rate": 4.813253356789116e-06, + "loss": 0.1319, + "step": 45931 + }, + { + "epoch": 0.8192487425534192, + "grad_norm": 0.2551431953907013, + "learning_rate": 4.812335198361018e-06, + "loss": 0.1167, + "step": 45932 + }, + { + "epoch": 0.8192665786751329, + "grad_norm": 0.2499801218509674, + "learning_rate": 4.8114171181872865e-06, + "loss": 0.1393, + "step": 45933 + }, + { + "epoch": 0.8192844147968465, + "grad_norm": 0.25826510787010193, + "learning_rate": 4.810499116271475e-06, + "loss": 0.1789, + "step": 45934 + }, + { + "epoch": 0.8193022509185602, + "grad_norm": 0.27540749311447144, + "learning_rate": 4.809581192617143e-06, + "loss": 0.1421, + "step": 45935 + }, + { + "epoch": 0.8193200870402739, + "grad_norm": 0.4772101640701294, + "learning_rate": 4.80866334722784e-06, + "loss": 0.153, + "step": 45936 + }, + { + "epoch": 0.8193379231619876, + "grad_norm": 0.24885480105876923, + "learning_rate": 4.80774558010714e-06, + "loss": 0.1363, + "step": 45937 + }, + { + "epoch": 0.8193557592837013, + "grad_norm": 0.2563723921775818, + "learning_rate": 4.8068278912585915e-06, + "loss": 0.0514, + "step": 45938 + }, + { + "epoch": 0.8193735954054151, + "grad_norm": 0.46413376927375793, + "learning_rate": 4.805910280685746e-06, + "loss": 0.1271, + "step": 45939 + }, + { + "epoch": 0.8193914315271288, + "grad_norm": 0.29271721839904785, + "learning_rate": 4.804992748392168e-06, + "loss": 0.0525, + "step": 45940 + }, + { + "epoch": 0.8194092676488425, + "grad_norm": 0.2971426248550415, + "learning_rate": 4.804075294381416e-06, + "loss": 0.1054, + "step": 45941 + }, + { + "epoch": 0.8194271037705562, + "grad_norm": 0.2233019918203354, + "learning_rate": 4.803157918657048e-06, + "loss": 0.1254, + "step": 45942 + }, + { + "epoch": 0.8194449398922699, + "grad_norm": 0.3524329662322998, + "learning_rate": 4.802240621222615e-06, + "loss": 0.1251, + "step": 45943 + }, + { + "epoch": 0.8194627760139835, + "grad_norm": 0.2646976113319397, + "learning_rate": 4.80132340208167e-06, + "loss": 0.1607, + "step": 45944 + }, + { + "epoch": 0.8194806121356972, + "grad_norm": 0.23442672193050385, + "learning_rate": 4.800406261237769e-06, + "loss": 0.0622, + "step": 45945 + }, + { + "epoch": 0.8194984482574109, + "grad_norm": 0.38687658309936523, + "learning_rate": 4.799489198694476e-06, + "loss": 0.1152, + "step": 45946 + }, + { + "epoch": 0.8195162843791246, + "grad_norm": 0.3659440279006958, + "learning_rate": 4.798572214455338e-06, + "loss": 0.1753, + "step": 45947 + }, + { + "epoch": 0.8195341205008383, + "grad_norm": 0.2823337912559509, + "learning_rate": 4.797655308523913e-06, + "loss": 0.0914, + "step": 45948 + }, + { + "epoch": 0.819551956622552, + "grad_norm": 0.3602820336818695, + "learning_rate": 4.796738480903748e-06, + "loss": 0.112, + "step": 45949 + }, + { + "epoch": 0.8195697927442657, + "grad_norm": 0.22025497257709503, + "learning_rate": 4.795821731598407e-06, + "loss": 0.0834, + "step": 45950 + }, + { + "epoch": 0.8195876288659794, + "grad_norm": 0.20211149752140045, + "learning_rate": 4.794905060611441e-06, + "loss": 0.0685, + "step": 45951 + }, + { + "epoch": 0.819605464987693, + "grad_norm": 0.4158110022544861, + "learning_rate": 4.793988467946395e-06, + "loss": 0.1474, + "step": 45952 + }, + { + "epoch": 0.8196233011094067, + "grad_norm": 0.28092360496520996, + "learning_rate": 4.793071953606834e-06, + "loss": 0.1187, + "step": 45953 + }, + { + "epoch": 0.8196411372311204, + "grad_norm": 0.19877710938453674, + "learning_rate": 4.7921555175963e-06, + "loss": 0.1029, + "step": 45954 + }, + { + "epoch": 0.8196589733528341, + "grad_norm": 0.3380809724330902, + "learning_rate": 4.791239159918357e-06, + "loss": 0.1204, + "step": 45955 + }, + { + "epoch": 0.8196768094745479, + "grad_norm": 0.2953563928604126, + "learning_rate": 4.7903228805765504e-06, + "loss": 0.1664, + "step": 45956 + }, + { + "epoch": 0.8196946455962616, + "grad_norm": 0.3931449353694916, + "learning_rate": 4.789406679574432e-06, + "loss": 0.1175, + "step": 45957 + }, + { + "epoch": 0.8197124817179753, + "grad_norm": 0.24556396901607513, + "learning_rate": 4.788490556915548e-06, + "loss": 0.1005, + "step": 45958 + }, + { + "epoch": 0.819730317839689, + "grad_norm": 0.2688406705856323, + "learning_rate": 4.787574512603462e-06, + "loss": 0.1612, + "step": 45959 + }, + { + "epoch": 0.8197481539614027, + "grad_norm": 0.23852618038654327, + "learning_rate": 4.78665854664172e-06, + "loss": 0.1307, + "step": 45960 + }, + { + "epoch": 0.8197659900831163, + "grad_norm": 0.23295815289020538, + "learning_rate": 4.785742659033867e-06, + "loss": 0.0887, + "step": 45961 + }, + { + "epoch": 0.81978382620483, + "grad_norm": 0.24690911173820496, + "learning_rate": 4.784826849783455e-06, + "loss": 0.106, + "step": 45962 + }, + { + "epoch": 0.8198016623265437, + "grad_norm": 0.22683537006378174, + "learning_rate": 4.783911118894041e-06, + "loss": 0.1655, + "step": 45963 + }, + { + "epoch": 0.8198194984482574, + "grad_norm": 0.3383115828037262, + "learning_rate": 4.782995466369169e-06, + "loss": 0.1226, + "step": 45964 + }, + { + "epoch": 0.8198373345699711, + "grad_norm": 0.25798147916793823, + "learning_rate": 4.782079892212391e-06, + "loss": 0.157, + "step": 45965 + }, + { + "epoch": 0.8198551706916848, + "grad_norm": 0.21800567209720612, + "learning_rate": 4.781164396427246e-06, + "loss": 0.1141, + "step": 45966 + }, + { + "epoch": 0.8198730068133985, + "grad_norm": 0.2954353094100952, + "learning_rate": 4.780248979017293e-06, + "loss": 0.1138, + "step": 45967 + }, + { + "epoch": 0.8198908429351122, + "grad_norm": 0.24430164694786072, + "learning_rate": 4.7793336399860834e-06, + "loss": 0.0797, + "step": 45968 + }, + { + "epoch": 0.8199086790568259, + "grad_norm": 0.31240203976631165, + "learning_rate": 4.778418379337163e-06, + "loss": 0.13, + "step": 45969 + }, + { + "epoch": 0.8199265151785395, + "grad_norm": 0.3194526433944702, + "learning_rate": 4.777503197074074e-06, + "loss": 0.1572, + "step": 45970 + }, + { + "epoch": 0.8199443513002532, + "grad_norm": 0.3290162682533264, + "learning_rate": 4.776588093200362e-06, + "loss": 0.1071, + "step": 45971 + }, + { + "epoch": 0.8199621874219669, + "grad_norm": 0.3180892765522003, + "learning_rate": 4.775673067719588e-06, + "loss": 0.1232, + "step": 45972 + }, + { + "epoch": 0.8199800235436807, + "grad_norm": 0.36144909262657166, + "learning_rate": 4.774758120635289e-06, + "loss": 0.1918, + "step": 45973 + }, + { + "epoch": 0.8199978596653944, + "grad_norm": 0.24784095585346222, + "learning_rate": 4.773843251951013e-06, + "loss": 0.1172, + "step": 45974 + }, + { + "epoch": 0.8200156957871081, + "grad_norm": 0.2473553866147995, + "learning_rate": 4.772928461670298e-06, + "loss": 0.1039, + "step": 45975 + }, + { + "epoch": 0.8200335319088218, + "grad_norm": 0.2709693908691406, + "learning_rate": 4.772013749796708e-06, + "loss": 0.0885, + "step": 45976 + }, + { + "epoch": 0.8200513680305355, + "grad_norm": 0.2229028344154358, + "learning_rate": 4.771099116333777e-06, + "loss": 0.1504, + "step": 45977 + }, + { + "epoch": 0.8200692041522492, + "grad_norm": 0.27017247676849365, + "learning_rate": 4.770184561285054e-06, + "loss": 0.1208, + "step": 45978 + }, + { + "epoch": 0.8200870402739628, + "grad_norm": 0.37522339820861816, + "learning_rate": 4.769270084654076e-06, + "loss": 0.0739, + "step": 45979 + }, + { + "epoch": 0.8201048763956765, + "grad_norm": 0.2650935649871826, + "learning_rate": 4.7683556864444e-06, + "loss": 0.1614, + "step": 45980 + }, + { + "epoch": 0.8201227125173902, + "grad_norm": 0.40602293610572815, + "learning_rate": 4.7674413666595615e-06, + "loss": 0.1094, + "step": 45981 + }, + { + "epoch": 0.8201405486391039, + "grad_norm": 0.24445992708206177, + "learning_rate": 4.766527125303113e-06, + "loss": 0.1026, + "step": 45982 + }, + { + "epoch": 0.8201583847608176, + "grad_norm": 0.22367045283317566, + "learning_rate": 4.765612962378593e-06, + "loss": 0.1301, + "step": 45983 + }, + { + "epoch": 0.8201762208825313, + "grad_norm": 0.2645864188671112, + "learning_rate": 4.7646988778895425e-06, + "loss": 0.099, + "step": 45984 + }, + { + "epoch": 0.820194057004245, + "grad_norm": 0.28095898032188416, + "learning_rate": 4.76378487183951e-06, + "loss": 0.123, + "step": 45985 + }, + { + "epoch": 0.8202118931259587, + "grad_norm": 0.276136577129364, + "learning_rate": 4.76287094423204e-06, + "loss": 0.1173, + "step": 45986 + }, + { + "epoch": 0.8202297292476723, + "grad_norm": 0.2820354402065277, + "learning_rate": 4.761957095070671e-06, + "loss": 0.1207, + "step": 45987 + }, + { + "epoch": 0.820247565369386, + "grad_norm": 0.2825775146484375, + "learning_rate": 4.761043324358941e-06, + "loss": 0.1193, + "step": 45988 + }, + { + "epoch": 0.8202654014910997, + "grad_norm": 0.26971086859703064, + "learning_rate": 4.760129632100402e-06, + "loss": 0.1242, + "step": 45989 + }, + { + "epoch": 0.8202832376128135, + "grad_norm": 0.2302245944738388, + "learning_rate": 4.759216018298593e-06, + "loss": 0.1128, + "step": 45990 + }, + { + "epoch": 0.8203010737345272, + "grad_norm": 0.2266547679901123, + "learning_rate": 4.7583024829570525e-06, + "loss": 0.095, + "step": 45991 + }, + { + "epoch": 0.8203189098562409, + "grad_norm": 0.30688589811325073, + "learning_rate": 4.757389026079317e-06, + "loss": 0.135, + "step": 45992 + }, + { + "epoch": 0.8203367459779546, + "grad_norm": 0.24869315326213837, + "learning_rate": 4.75647564766894e-06, + "loss": 0.0882, + "step": 45993 + }, + { + "epoch": 0.8203545820996683, + "grad_norm": 0.2837980389595032, + "learning_rate": 4.7555623477294535e-06, + "loss": 0.1092, + "step": 45994 + }, + { + "epoch": 0.820372418221382, + "grad_norm": 0.6464115381240845, + "learning_rate": 4.754649126264394e-06, + "loss": 0.1296, + "step": 45995 + }, + { + "epoch": 0.8203902543430956, + "grad_norm": 0.2385513186454773, + "learning_rate": 4.753735983277313e-06, + "loss": 0.083, + "step": 45996 + }, + { + "epoch": 0.8204080904648093, + "grad_norm": 0.2110484391450882, + "learning_rate": 4.752822918771738e-06, + "loss": 0.1038, + "step": 45997 + }, + { + "epoch": 0.820425926586523, + "grad_norm": 0.258468896150589, + "learning_rate": 4.751909932751223e-06, + "loss": 0.1293, + "step": 45998 + }, + { + "epoch": 0.8204437627082367, + "grad_norm": 0.2708444893360138, + "learning_rate": 4.750997025219295e-06, + "loss": 0.1405, + "step": 45999 + }, + { + "epoch": 0.8204615988299504, + "grad_norm": 0.3401334881782532, + "learning_rate": 4.750084196179499e-06, + "loss": 0.1088, + "step": 46000 + }, + { + "epoch": 0.8204615988299504, + "eval_loss": 0.11009883880615234, + "eval_runtime": 107.0669, + "eval_samples_per_second": 9.564, + "eval_steps_per_second": 1.597, + "step": 46000 + }, + { + "epoch": 0.8204794349516641, + "grad_norm": 0.21945084631443024, + "learning_rate": 4.749171445635362e-06, + "loss": 0.1046, + "step": 46001 + }, + { + "epoch": 0.8204972710733778, + "grad_norm": 0.266580194234848, + "learning_rate": 4.748258773590439e-06, + "loss": 0.0913, + "step": 46002 + }, + { + "epoch": 0.8205151071950915, + "grad_norm": 0.34601345658302307, + "learning_rate": 4.747346180048259e-06, + "loss": 0.1417, + "step": 46003 + }, + { + "epoch": 0.8205329433168052, + "grad_norm": 0.26941192150115967, + "learning_rate": 4.746433665012362e-06, + "loss": 0.0878, + "step": 46004 + }, + { + "epoch": 0.8205507794385188, + "grad_norm": 0.21882638335227966, + "learning_rate": 4.745521228486274e-06, + "loss": 0.0973, + "step": 46005 + }, + { + "epoch": 0.8205686155602325, + "grad_norm": 0.39612606167793274, + "learning_rate": 4.744608870473552e-06, + "loss": 0.1076, + "step": 46006 + }, + { + "epoch": 0.8205864516819463, + "grad_norm": 0.24326537549495697, + "learning_rate": 4.743696590977717e-06, + "loss": 0.1012, + "step": 46007 + }, + { + "epoch": 0.82060428780366, + "grad_norm": 0.23805387318134308, + "learning_rate": 4.742784390002308e-06, + "loss": 0.1202, + "step": 46008 + }, + { + "epoch": 0.8206221239253737, + "grad_norm": 0.24482424557209015, + "learning_rate": 4.741872267550868e-06, + "loss": 0.0827, + "step": 46009 + }, + { + "epoch": 0.8206399600470874, + "grad_norm": 0.27266693115234375, + "learning_rate": 4.740960223626922e-06, + "loss": 0.1414, + "step": 46010 + }, + { + "epoch": 0.8206577961688011, + "grad_norm": 0.2890705168247223, + "learning_rate": 4.740048258234017e-06, + "loss": 0.1029, + "step": 46011 + }, + { + "epoch": 0.8206756322905148, + "grad_norm": 0.23045755922794342, + "learning_rate": 4.739136371375683e-06, + "loss": 0.0576, + "step": 46012 + }, + { + "epoch": 0.8206934684122285, + "grad_norm": 0.26319772005081177, + "learning_rate": 4.7382245630554545e-06, + "loss": 0.1277, + "step": 46013 + }, + { + "epoch": 0.8207113045339421, + "grad_norm": 0.2503160536289215, + "learning_rate": 4.737312833276861e-06, + "loss": 0.0872, + "step": 46014 + }, + { + "epoch": 0.8207291406556558, + "grad_norm": 0.2337036430835724, + "learning_rate": 4.7364011820434455e-06, + "loss": 0.1105, + "step": 46015 + }, + { + "epoch": 0.8207469767773695, + "grad_norm": 0.27023717761039734, + "learning_rate": 4.735489609358737e-06, + "loss": 0.1145, + "step": 46016 + }, + { + "epoch": 0.8207648128990832, + "grad_norm": 0.19023779034614563, + "learning_rate": 4.734578115226271e-06, + "loss": 0.0691, + "step": 46017 + }, + { + "epoch": 0.8207826490207969, + "grad_norm": 0.21077920496463776, + "learning_rate": 4.733666699649575e-06, + "loss": 0.1108, + "step": 46018 + }, + { + "epoch": 0.8208004851425106, + "grad_norm": 0.28194013237953186, + "learning_rate": 4.732755362632191e-06, + "loss": 0.1055, + "step": 46019 + }, + { + "epoch": 0.8208183212642243, + "grad_norm": 0.27430495619773865, + "learning_rate": 4.731844104177649e-06, + "loss": 0.1107, + "step": 46020 + }, + { + "epoch": 0.820836157385938, + "grad_norm": 0.29253271222114563, + "learning_rate": 4.730932924289477e-06, + "loss": 0.0811, + "step": 46021 + }, + { + "epoch": 0.8208539935076516, + "grad_norm": 0.3712082505226135, + "learning_rate": 4.7300218229712065e-06, + "loss": 0.1573, + "step": 46022 + }, + { + "epoch": 0.8208718296293654, + "grad_norm": 0.37444156408309937, + "learning_rate": 4.729110800226372e-06, + "loss": 0.1089, + "step": 46023 + }, + { + "epoch": 0.8208896657510791, + "grad_norm": 0.22761695086956024, + "learning_rate": 4.72819985605851e-06, + "loss": 0.1267, + "step": 46024 + }, + { + "epoch": 0.8209075018727928, + "grad_norm": 0.35790693759918213, + "learning_rate": 4.727288990471146e-06, + "loss": 0.1264, + "step": 46025 + }, + { + "epoch": 0.8209253379945065, + "grad_norm": 0.24947425723075867, + "learning_rate": 4.7263782034678154e-06, + "loss": 0.1254, + "step": 46026 + }, + { + "epoch": 0.8209431741162202, + "grad_norm": 0.26250845193862915, + "learning_rate": 4.725467495052035e-06, + "loss": 0.053, + "step": 46027 + }, + { + "epoch": 0.8209610102379339, + "grad_norm": 0.3390832543373108, + "learning_rate": 4.724556865227356e-06, + "loss": 0.1046, + "step": 46028 + }, + { + "epoch": 0.8209788463596476, + "grad_norm": 0.2783950865268707, + "learning_rate": 4.723646313997293e-06, + "loss": 0.1201, + "step": 46029 + }, + { + "epoch": 0.8209966824813613, + "grad_norm": 0.19751502573490143, + "learning_rate": 4.722735841365383e-06, + "loss": 0.0877, + "step": 46030 + }, + { + "epoch": 0.821014518603075, + "grad_norm": 0.28201955556869507, + "learning_rate": 4.7218254473351455e-06, + "loss": 0.168, + "step": 46031 + }, + { + "epoch": 0.8210323547247886, + "grad_norm": 0.275782972574234, + "learning_rate": 4.720915131910122e-06, + "loss": 0.1407, + "step": 46032 + }, + { + "epoch": 0.8210501908465023, + "grad_norm": 0.25187429785728455, + "learning_rate": 4.720004895093835e-06, + "loss": 0.166, + "step": 46033 + }, + { + "epoch": 0.821068026968216, + "grad_norm": 0.2605955898761749, + "learning_rate": 4.719094736889815e-06, + "loss": 0.0994, + "step": 46034 + }, + { + "epoch": 0.8210858630899297, + "grad_norm": 0.4518495500087738, + "learning_rate": 4.718184657301583e-06, + "loss": 0.1708, + "step": 46035 + }, + { + "epoch": 0.8211036992116434, + "grad_norm": 0.23418988287448883, + "learning_rate": 4.7172746563326765e-06, + "loss": 0.0974, + "step": 46036 + }, + { + "epoch": 0.8211215353333571, + "grad_norm": 0.19696015119552612, + "learning_rate": 4.716364733986614e-06, + "loss": 0.0911, + "step": 46037 + }, + { + "epoch": 0.8211393714550708, + "grad_norm": 0.21459051966667175, + "learning_rate": 4.715454890266932e-06, + "loss": 0.0535, + "step": 46038 + }, + { + "epoch": 0.8211572075767845, + "grad_norm": 0.40713441371917725, + "learning_rate": 4.714545125177156e-06, + "loss": 0.1223, + "step": 46039 + }, + { + "epoch": 0.8211750436984983, + "grad_norm": 0.33374494314193726, + "learning_rate": 4.7136354387208006e-06, + "loss": 0.1311, + "step": 46040 + }, + { + "epoch": 0.8211928798202119, + "grad_norm": 0.3240242600440979, + "learning_rate": 4.71272583090141e-06, + "loss": 0.1002, + "step": 46041 + }, + { + "epoch": 0.8212107159419256, + "grad_norm": 0.2358447015285492, + "learning_rate": 4.711816301722499e-06, + "loss": 0.0964, + "step": 46042 + }, + { + "epoch": 0.8212285520636393, + "grad_norm": 0.21508219838142395, + "learning_rate": 4.710906851187594e-06, + "loss": 0.1051, + "step": 46043 + }, + { + "epoch": 0.821246388185353, + "grad_norm": 0.2570163905620575, + "learning_rate": 4.709997479300219e-06, + "loss": 0.1179, + "step": 46044 + }, + { + "epoch": 0.8212642243070667, + "grad_norm": 0.31717947125434875, + "learning_rate": 4.709088186063903e-06, + "loss": 0.0704, + "step": 46045 + }, + { + "epoch": 0.8212820604287804, + "grad_norm": 0.2874803841114044, + "learning_rate": 4.7081789714821745e-06, + "loss": 0.099, + "step": 46046 + }, + { + "epoch": 0.8212998965504941, + "grad_norm": 0.19525286555290222, + "learning_rate": 4.70726983555855e-06, + "loss": 0.0587, + "step": 46047 + }, + { + "epoch": 0.8213177326722078, + "grad_norm": 0.288347452878952, + "learning_rate": 4.706360778296557e-06, + "loss": 0.146, + "step": 46048 + }, + { + "epoch": 0.8213355687939214, + "grad_norm": 0.23644642531871796, + "learning_rate": 4.705451799699711e-06, + "loss": 0.1047, + "step": 46049 + }, + { + "epoch": 0.8213534049156351, + "grad_norm": 0.2333265244960785, + "learning_rate": 4.7045428997715465e-06, + "loss": 0.0921, + "step": 46050 + }, + { + "epoch": 0.8213712410373488, + "grad_norm": 0.3645409643650055, + "learning_rate": 4.703634078515589e-06, + "loss": 0.0753, + "step": 46051 + }, + { + "epoch": 0.8213890771590625, + "grad_norm": 0.2837405800819397, + "learning_rate": 4.702725335935354e-06, + "loss": 0.0862, + "step": 46052 + }, + { + "epoch": 0.8214069132807762, + "grad_norm": 0.29357659816741943, + "learning_rate": 4.701816672034362e-06, + "loss": 0.0872, + "step": 46053 + }, + { + "epoch": 0.8214247494024899, + "grad_norm": 0.3213426172733307, + "learning_rate": 4.700908086816144e-06, + "loss": 0.1171, + "step": 46054 + }, + { + "epoch": 0.8214425855242036, + "grad_norm": 0.35706669092178345, + "learning_rate": 4.69999958028422e-06, + "loss": 0.1341, + "step": 46055 + }, + { + "epoch": 0.8214604216459173, + "grad_norm": 0.28023555874824524, + "learning_rate": 4.699091152442106e-06, + "loss": 0.1708, + "step": 46056 + }, + { + "epoch": 0.8214782577676311, + "grad_norm": 0.3517388105392456, + "learning_rate": 4.698182803293321e-06, + "loss": 0.0939, + "step": 46057 + }, + { + "epoch": 0.8214960938893447, + "grad_norm": 0.30235356092453003, + "learning_rate": 4.697274532841398e-06, + "loss": 0.113, + "step": 46058 + }, + { + "epoch": 0.8215139300110584, + "grad_norm": 0.29654595255851746, + "learning_rate": 4.696366341089853e-06, + "loss": 0.0862, + "step": 46059 + }, + { + "epoch": 0.8215317661327721, + "grad_norm": 0.384662002325058, + "learning_rate": 4.695458228042202e-06, + "loss": 0.1445, + "step": 46060 + }, + { + "epoch": 0.8215496022544858, + "grad_norm": 0.22031594812870026, + "learning_rate": 4.694550193701968e-06, + "loss": 0.11, + "step": 46061 + }, + { + "epoch": 0.8215674383761995, + "grad_norm": 0.344844788312912, + "learning_rate": 4.693642238072666e-06, + "loss": 0.0608, + "step": 46062 + }, + { + "epoch": 0.8215852744979132, + "grad_norm": 0.2458350956439972, + "learning_rate": 4.692734361157827e-06, + "loss": 0.0962, + "step": 46063 + }, + { + "epoch": 0.8216031106196269, + "grad_norm": 0.39920634031295776, + "learning_rate": 4.691826562960955e-06, + "loss": 0.1046, + "step": 46064 + }, + { + "epoch": 0.8216209467413406, + "grad_norm": 0.3210431635379791, + "learning_rate": 4.690918843485584e-06, + "loss": 0.1007, + "step": 46065 + }, + { + "epoch": 0.8216387828630543, + "grad_norm": 0.3173997104167938, + "learning_rate": 4.690011202735223e-06, + "loss": 0.1173, + "step": 46066 + }, + { + "epoch": 0.8216566189847679, + "grad_norm": 0.34847620129585266, + "learning_rate": 4.689103640713397e-06, + "loss": 0.1162, + "step": 46067 + }, + { + "epoch": 0.8216744551064816, + "grad_norm": 0.21079584956169128, + "learning_rate": 4.688196157423619e-06, + "loss": 0.0804, + "step": 46068 + }, + { + "epoch": 0.8216922912281953, + "grad_norm": 0.26405075192451477, + "learning_rate": 4.687288752869412e-06, + "loss": 0.1703, + "step": 46069 + }, + { + "epoch": 0.821710127349909, + "grad_norm": 0.1882033795118332, + "learning_rate": 4.686381427054279e-06, + "loss": 0.0751, + "step": 46070 + }, + { + "epoch": 0.8217279634716227, + "grad_norm": 0.2501179277896881, + "learning_rate": 4.685474179981758e-06, + "loss": 0.109, + "step": 46071 + }, + { + "epoch": 0.8217457995933364, + "grad_norm": 0.2737066447734833, + "learning_rate": 4.684567011655353e-06, + "loss": 0.1259, + "step": 46072 + }, + { + "epoch": 0.8217636357150501, + "grad_norm": 0.24505580961704254, + "learning_rate": 4.683659922078584e-06, + "loss": 0.0697, + "step": 46073 + }, + { + "epoch": 0.8217814718367639, + "grad_norm": 0.42098796367645264, + "learning_rate": 4.682752911254965e-06, + "loss": 0.1164, + "step": 46074 + }, + { + "epoch": 0.8217993079584776, + "grad_norm": 0.3601228892803192, + "learning_rate": 4.681845979188007e-06, + "loss": 0.1656, + "step": 46075 + }, + { + "epoch": 0.8218171440801912, + "grad_norm": 0.23303711414337158, + "learning_rate": 4.680939125881239e-06, + "loss": 0.1239, + "step": 46076 + }, + { + "epoch": 0.8218349802019049, + "grad_norm": 0.19640538096427917, + "learning_rate": 4.680032351338162e-06, + "loss": 0.082, + "step": 46077 + }, + { + "epoch": 0.8218528163236186, + "grad_norm": 0.28218916058540344, + "learning_rate": 4.679125655562306e-06, + "loss": 0.1414, + "step": 46078 + }, + { + "epoch": 0.8218706524453323, + "grad_norm": 0.3526959717273712, + "learning_rate": 4.678219038557169e-06, + "loss": 0.14, + "step": 46079 + }, + { + "epoch": 0.821888488567046, + "grad_norm": 0.3054324686527252, + "learning_rate": 4.677312500326281e-06, + "loss": 0.1145, + "step": 46080 + }, + { + "epoch": 0.8219063246887597, + "grad_norm": 0.21161223948001862, + "learning_rate": 4.676406040873149e-06, + "loss": 0.0559, + "step": 46081 + }, + { + "epoch": 0.8219241608104734, + "grad_norm": 0.2855243682861328, + "learning_rate": 4.675499660201288e-06, + "loss": 0.1074, + "step": 46082 + }, + { + "epoch": 0.8219419969321871, + "grad_norm": 0.3317650258541107, + "learning_rate": 4.6745933583142e-06, + "loss": 0.1019, + "step": 46083 + }, + { + "epoch": 0.8219598330539007, + "grad_norm": 0.22176623344421387, + "learning_rate": 4.6736871352154185e-06, + "loss": 0.0769, + "step": 46084 + }, + { + "epoch": 0.8219776691756144, + "grad_norm": 0.2839299142360687, + "learning_rate": 4.672780990908446e-06, + "loss": 0.1151, + "step": 46085 + }, + { + "epoch": 0.8219955052973281, + "grad_norm": 0.2464563250541687, + "learning_rate": 4.671874925396794e-06, + "loss": 0.1275, + "step": 46086 + }, + { + "epoch": 0.8220133414190418, + "grad_norm": 0.18365976214408875, + "learning_rate": 4.670968938683975e-06, + "loss": 0.089, + "step": 46087 + }, + { + "epoch": 0.8220311775407555, + "grad_norm": 0.2941281199455261, + "learning_rate": 4.670063030773497e-06, + "loss": 0.1436, + "step": 46088 + }, + { + "epoch": 0.8220490136624692, + "grad_norm": 0.24149273335933685, + "learning_rate": 4.669157201668881e-06, + "loss": 0.0949, + "step": 46089 + }, + { + "epoch": 0.8220668497841829, + "grad_norm": 0.30592742562294006, + "learning_rate": 4.668251451373634e-06, + "loss": 0.1302, + "step": 46090 + }, + { + "epoch": 0.8220846859058967, + "grad_norm": 0.27544450759887695, + "learning_rate": 4.66734577989126e-06, + "loss": 0.0993, + "step": 46091 + }, + { + "epoch": 0.8221025220276104, + "grad_norm": 0.2548133134841919, + "learning_rate": 4.666440187225285e-06, + "loss": 0.1098, + "step": 46092 + }, + { + "epoch": 0.822120358149324, + "grad_norm": 0.2702014744281769, + "learning_rate": 4.665534673379204e-06, + "loss": 0.1056, + "step": 46093 + }, + { + "epoch": 0.8221381942710377, + "grad_norm": 0.3659670352935791, + "learning_rate": 4.664629238356541e-06, + "loss": 0.1448, + "step": 46094 + }, + { + "epoch": 0.8221560303927514, + "grad_norm": 0.32081031799316406, + "learning_rate": 4.663723882160797e-06, + "loss": 0.0991, + "step": 46095 + }, + { + "epoch": 0.8221738665144651, + "grad_norm": 0.2757716178894043, + "learning_rate": 4.662818604795477e-06, + "loss": 0.1485, + "step": 46096 + }, + { + "epoch": 0.8221917026361788, + "grad_norm": 0.28562456369400024, + "learning_rate": 4.661913406264101e-06, + "loss": 0.1212, + "step": 46097 + }, + { + "epoch": 0.8222095387578925, + "grad_norm": 0.26772695779800415, + "learning_rate": 4.661008286570176e-06, + "loss": 0.1072, + "step": 46098 + }, + { + "epoch": 0.8222273748796062, + "grad_norm": 0.2566722631454468, + "learning_rate": 4.660103245717207e-06, + "loss": 0.1075, + "step": 46099 + }, + { + "epoch": 0.8222452110013199, + "grad_norm": 0.32634469866752625, + "learning_rate": 4.659198283708705e-06, + "loss": 0.1117, + "step": 46100 + }, + { + "epoch": 0.8222630471230336, + "grad_norm": 0.1959335058927536, + "learning_rate": 4.658293400548166e-06, + "loss": 0.0658, + "step": 46101 + }, + { + "epoch": 0.8222808832447472, + "grad_norm": 0.2890969216823578, + "learning_rate": 4.657388596239115e-06, + "loss": 0.1041, + "step": 46102 + }, + { + "epoch": 0.8222987193664609, + "grad_norm": 0.2475145161151886, + "learning_rate": 4.6564838707850525e-06, + "loss": 0.1031, + "step": 46103 + }, + { + "epoch": 0.8223165554881746, + "grad_norm": 0.3062688410282135, + "learning_rate": 4.655579224189477e-06, + "loss": 0.1241, + "step": 46104 + }, + { + "epoch": 0.8223343916098883, + "grad_norm": 0.2573624849319458, + "learning_rate": 4.654674656455912e-06, + "loss": 0.1172, + "step": 46105 + }, + { + "epoch": 0.822352227731602, + "grad_norm": 0.2881820499897003, + "learning_rate": 4.653770167587848e-06, + "loss": 0.1329, + "step": 46106 + }, + { + "epoch": 0.8223700638533157, + "grad_norm": 0.29084765911102295, + "learning_rate": 4.652865757588803e-06, + "loss": 0.0892, + "step": 46107 + }, + { + "epoch": 0.8223878999750295, + "grad_norm": 0.3494262397289276, + "learning_rate": 4.6519614264622785e-06, + "loss": 0.1266, + "step": 46108 + }, + { + "epoch": 0.8224057360967432, + "grad_norm": 0.34002685546875, + "learning_rate": 4.651057174211776e-06, + "loss": 0.0869, + "step": 46109 + }, + { + "epoch": 0.8224235722184569, + "grad_norm": 0.24071046710014343, + "learning_rate": 4.650153000840807e-06, + "loss": 0.0915, + "step": 46110 + }, + { + "epoch": 0.8224414083401705, + "grad_norm": 0.3147448003292084, + "learning_rate": 4.6492489063528736e-06, + "loss": 0.0709, + "step": 46111 + }, + { + "epoch": 0.8224592444618842, + "grad_norm": 0.284736692905426, + "learning_rate": 4.648344890751483e-06, + "loss": 0.1317, + "step": 46112 + }, + { + "epoch": 0.8224770805835979, + "grad_norm": 0.30460789799690247, + "learning_rate": 4.647440954040136e-06, + "loss": 0.0767, + "step": 46113 + }, + { + "epoch": 0.8224949167053116, + "grad_norm": 0.28746193647384644, + "learning_rate": 4.646537096222331e-06, + "loss": 0.1052, + "step": 46114 + }, + { + "epoch": 0.8225127528270253, + "grad_norm": 0.2532510459423065, + "learning_rate": 4.645633317301584e-06, + "loss": 0.0849, + "step": 46115 + }, + { + "epoch": 0.822530588948739, + "grad_norm": 0.22827962040901184, + "learning_rate": 4.64472961728139e-06, + "loss": 0.044, + "step": 46116 + }, + { + "epoch": 0.8225484250704527, + "grad_norm": 0.40525200963020325, + "learning_rate": 4.643825996165257e-06, + "loss": 0.1295, + "step": 46117 + }, + { + "epoch": 0.8225662611921664, + "grad_norm": 0.25670236349105835, + "learning_rate": 4.64292245395668e-06, + "loss": 0.0794, + "step": 46118 + }, + { + "epoch": 0.82258409731388, + "grad_norm": 0.2593821585178375, + "learning_rate": 4.642018990659172e-06, + "loss": 0.109, + "step": 46119 + }, + { + "epoch": 0.8226019334355937, + "grad_norm": 0.26307791471481323, + "learning_rate": 4.641115606276222e-06, + "loss": 0.156, + "step": 46120 + }, + { + "epoch": 0.8226197695573074, + "grad_norm": 0.4257407784461975, + "learning_rate": 4.640212300811348e-06, + "loss": 0.1378, + "step": 46121 + }, + { + "epoch": 0.8226376056790211, + "grad_norm": 0.2438928633928299, + "learning_rate": 4.639309074268036e-06, + "loss": 0.1343, + "step": 46122 + }, + { + "epoch": 0.8226554418007348, + "grad_norm": 0.3015868663787842, + "learning_rate": 4.638405926649802e-06, + "loss": 0.1374, + "step": 46123 + }, + { + "epoch": 0.8226732779224486, + "grad_norm": 0.2525673508644104, + "learning_rate": 4.637502857960138e-06, + "loss": 0.0661, + "step": 46124 + }, + { + "epoch": 0.8226911140441623, + "grad_norm": 0.2619667649269104, + "learning_rate": 4.636599868202546e-06, + "loss": 0.1491, + "step": 46125 + }, + { + "epoch": 0.822708950165876, + "grad_norm": 0.30909499526023865, + "learning_rate": 4.6356969573805255e-06, + "loss": 0.0896, + "step": 46126 + }, + { + "epoch": 0.8227267862875897, + "grad_norm": 0.3124559819698334, + "learning_rate": 4.6347941254975716e-06, + "loss": 0.141, + "step": 46127 + }, + { + "epoch": 0.8227446224093033, + "grad_norm": 0.21617282927036285, + "learning_rate": 4.633891372557195e-06, + "loss": 0.0803, + "step": 46128 + }, + { + "epoch": 0.822762458531017, + "grad_norm": 0.29223906993865967, + "learning_rate": 4.632988698562893e-06, + "loss": 0.1427, + "step": 46129 + }, + { + "epoch": 0.8227802946527307, + "grad_norm": 0.27456992864608765, + "learning_rate": 4.632086103518157e-06, + "loss": 0.1054, + "step": 46130 + }, + { + "epoch": 0.8227981307744444, + "grad_norm": 0.2903895080089569, + "learning_rate": 4.631183587426485e-06, + "loss": 0.1329, + "step": 46131 + }, + { + "epoch": 0.8228159668961581, + "grad_norm": 0.26002299785614014, + "learning_rate": 4.630281150291388e-06, + "loss": 0.0917, + "step": 46132 + }, + { + "epoch": 0.8228338030178718, + "grad_norm": 0.3393615186214447, + "learning_rate": 4.629378792116351e-06, + "loss": 0.1507, + "step": 46133 + }, + { + "epoch": 0.8228516391395855, + "grad_norm": 0.26019203662872314, + "learning_rate": 4.628476512904884e-06, + "loss": 0.1179, + "step": 46134 + }, + { + "epoch": 0.8228694752612992, + "grad_norm": 0.25363609194755554, + "learning_rate": 4.627574312660468e-06, + "loss": 0.0938, + "step": 46135 + }, + { + "epoch": 0.8228873113830129, + "grad_norm": 0.3216564655303955, + "learning_rate": 4.626672191386622e-06, + "loss": 0.1426, + "step": 46136 + }, + { + "epoch": 0.8229051475047265, + "grad_norm": 0.3058150112628937, + "learning_rate": 4.6257701490868275e-06, + "loss": 0.1369, + "step": 46137 + }, + { + "epoch": 0.8229229836264402, + "grad_norm": 0.3566642999649048, + "learning_rate": 4.624868185764586e-06, + "loss": 0.142, + "step": 46138 + }, + { + "epoch": 0.8229408197481539, + "grad_norm": 0.38568368554115295, + "learning_rate": 4.623966301423393e-06, + "loss": 0.1114, + "step": 46139 + }, + { + "epoch": 0.8229586558698676, + "grad_norm": 0.2623521089553833, + "learning_rate": 4.623064496066737e-06, + "loss": 0.0951, + "step": 46140 + }, + { + "epoch": 0.8229764919915814, + "grad_norm": 0.2472400814294815, + "learning_rate": 4.62216276969813e-06, + "loss": 0.0708, + "step": 46141 + }, + { + "epoch": 0.8229943281132951, + "grad_norm": 0.2786538600921631, + "learning_rate": 4.621261122321055e-06, + "loss": 0.0978, + "step": 46142 + }, + { + "epoch": 0.8230121642350088, + "grad_norm": 0.344588041305542, + "learning_rate": 4.620359553939013e-06, + "loss": 0.1347, + "step": 46143 + }, + { + "epoch": 0.8230300003567225, + "grad_norm": 0.26819539070129395, + "learning_rate": 4.6194580645554875e-06, + "loss": 0.1014, + "step": 46144 + }, + { + "epoch": 0.8230478364784362, + "grad_norm": 0.24067746102809906, + "learning_rate": 4.618556654173991e-06, + "loss": 0.0911, + "step": 46145 + }, + { + "epoch": 0.8230656726001498, + "grad_norm": 0.31288135051727295, + "learning_rate": 4.617655322798006e-06, + "loss": 0.0821, + "step": 46146 + }, + { + "epoch": 0.8230835087218635, + "grad_norm": 0.22926175594329834, + "learning_rate": 4.616754070431023e-06, + "loss": 0.0784, + "step": 46147 + }, + { + "epoch": 0.8231013448435772, + "grad_norm": 0.29319366812705994, + "learning_rate": 4.615852897076542e-06, + "loss": 0.1395, + "step": 46148 + }, + { + "epoch": 0.8231191809652909, + "grad_norm": 0.2678544521331787, + "learning_rate": 4.614951802738063e-06, + "loss": 0.1451, + "step": 46149 + }, + { + "epoch": 0.8231370170870046, + "grad_norm": 0.4900416433811188, + "learning_rate": 4.6140507874190694e-06, + "loss": 0.1076, + "step": 46150 + }, + { + "epoch": 0.8231548532087183, + "grad_norm": 0.2736718952655792, + "learning_rate": 4.613149851123058e-06, + "loss": 0.0868, + "step": 46151 + }, + { + "epoch": 0.823172689330432, + "grad_norm": 0.25595220923423767, + "learning_rate": 4.6122489938535185e-06, + "loss": 0.1107, + "step": 46152 + }, + { + "epoch": 0.8231905254521457, + "grad_norm": 0.26881736516952515, + "learning_rate": 4.6113482156139366e-06, + "loss": 0.1092, + "step": 46153 + }, + { + "epoch": 0.8232083615738593, + "grad_norm": 0.2667938768863678, + "learning_rate": 4.6104475164078175e-06, + "loss": 0.136, + "step": 46154 + }, + { + "epoch": 0.823226197695573, + "grad_norm": 0.2897258400917053, + "learning_rate": 4.609546896238645e-06, + "loss": 0.0871, + "step": 46155 + }, + { + "epoch": 0.8232440338172867, + "grad_norm": 0.2286834865808487, + "learning_rate": 4.6086463551099135e-06, + "loss": 0.1065, + "step": 46156 + }, + { + "epoch": 0.8232618699390004, + "grad_norm": 0.23880279064178467, + "learning_rate": 4.607745893025106e-06, + "loss": 0.1048, + "step": 46157 + }, + { + "epoch": 0.8232797060607142, + "grad_norm": 0.3318907618522644, + "learning_rate": 4.606845509987723e-06, + "loss": 0.155, + "step": 46158 + }, + { + "epoch": 0.8232975421824279, + "grad_norm": 0.31443554162979126, + "learning_rate": 4.6059452060012495e-06, + "loss": 0.147, + "step": 46159 + }, + { + "epoch": 0.8233153783041416, + "grad_norm": 0.30736300349235535, + "learning_rate": 4.605044981069173e-06, + "loss": 0.1438, + "step": 46160 + }, + { + "epoch": 0.8233332144258553, + "grad_norm": 0.4022292494773865, + "learning_rate": 4.60414483519499e-06, + "loss": 0.1387, + "step": 46161 + }, + { + "epoch": 0.823351050547569, + "grad_norm": 0.27604565024375916, + "learning_rate": 4.603244768382181e-06, + "loss": 0.0981, + "step": 46162 + }, + { + "epoch": 0.8233688866692827, + "grad_norm": 0.26068148016929626, + "learning_rate": 4.602344780634247e-06, + "loss": 0.0869, + "step": 46163 + }, + { + "epoch": 0.8233867227909963, + "grad_norm": 0.24158501625061035, + "learning_rate": 4.601444871954669e-06, + "loss": 0.0925, + "step": 46164 + }, + { + "epoch": 0.82340455891271, + "grad_norm": 0.2656979560852051, + "learning_rate": 4.6005450423469345e-06, + "loss": 0.1353, + "step": 46165 + }, + { + "epoch": 0.8234223950344237, + "grad_norm": 0.2927142381668091, + "learning_rate": 4.599645291814531e-06, + "loss": 0.1221, + "step": 46166 + }, + { + "epoch": 0.8234402311561374, + "grad_norm": 0.21333450078964233, + "learning_rate": 4.59874562036095e-06, + "loss": 0.1057, + "step": 46167 + }, + { + "epoch": 0.8234580672778511, + "grad_norm": 0.3720265328884125, + "learning_rate": 4.5978460279896805e-06, + "loss": 0.1283, + "step": 46168 + }, + { + "epoch": 0.8234759033995648, + "grad_norm": 0.21119123697280884, + "learning_rate": 4.596946514704206e-06, + "loss": 0.0924, + "step": 46169 + }, + { + "epoch": 0.8234937395212785, + "grad_norm": 0.26303210854530334, + "learning_rate": 4.5960470805080065e-06, + "loss": 0.0894, + "step": 46170 + }, + { + "epoch": 0.8235115756429922, + "grad_norm": 0.2832821011543274, + "learning_rate": 4.595147725404583e-06, + "loss": 0.1359, + "step": 46171 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.20838138461112976, + "learning_rate": 4.594248449397415e-06, + "loss": 0.0952, + "step": 46172 + }, + { + "epoch": 0.8235472478864195, + "grad_norm": 0.3012889325618744, + "learning_rate": 4.593349252489987e-06, + "loss": 0.139, + "step": 46173 + }, + { + "epoch": 0.8235650840081332, + "grad_norm": 0.2740786671638489, + "learning_rate": 4.592450134685778e-06, + "loss": 0.1676, + "step": 46174 + }, + { + "epoch": 0.823582920129847, + "grad_norm": 0.4201247990131378, + "learning_rate": 4.591551095988289e-06, + "loss": 0.1753, + "step": 46175 + }, + { + "epoch": 0.8236007562515607, + "grad_norm": 0.2660380005836487, + "learning_rate": 4.590652136400989e-06, + "loss": 0.1123, + "step": 46176 + }, + { + "epoch": 0.8236185923732744, + "grad_norm": 0.2980002462863922, + "learning_rate": 4.589753255927378e-06, + "loss": 0.1459, + "step": 46177 + }, + { + "epoch": 0.8236364284949881, + "grad_norm": 0.24808001518249512, + "learning_rate": 4.588854454570926e-06, + "loss": 0.1427, + "step": 46178 + }, + { + "epoch": 0.8236542646167018, + "grad_norm": 0.20960798859596252, + "learning_rate": 4.587955732335131e-06, + "loss": 0.0938, + "step": 46179 + }, + { + "epoch": 0.8236721007384155, + "grad_norm": 0.3316652774810791, + "learning_rate": 4.58705708922347e-06, + "loss": 0.0954, + "step": 46180 + }, + { + "epoch": 0.8236899368601291, + "grad_norm": 0.31050950288772583, + "learning_rate": 4.586158525239425e-06, + "loss": 0.091, + "step": 46181 + }, + { + "epoch": 0.8237077729818428, + "grad_norm": 0.2365371286869049, + "learning_rate": 4.58526004038648e-06, + "loss": 0.111, + "step": 46182 + }, + { + "epoch": 0.8237256091035565, + "grad_norm": 0.2544407546520233, + "learning_rate": 4.584361634668113e-06, + "loss": 0.0996, + "step": 46183 + }, + { + "epoch": 0.8237434452252702, + "grad_norm": 0.28068748116493225, + "learning_rate": 4.583463308087815e-06, + "loss": 0.1494, + "step": 46184 + }, + { + "epoch": 0.8237612813469839, + "grad_norm": 0.230164036154747, + "learning_rate": 4.5825650606490685e-06, + "loss": 0.0944, + "step": 46185 + }, + { + "epoch": 0.8237791174686976, + "grad_norm": 0.38114142417907715, + "learning_rate": 4.581666892355352e-06, + "loss": 0.0768, + "step": 46186 + }, + { + "epoch": 0.8237969535904113, + "grad_norm": 0.3075874447822571, + "learning_rate": 4.580768803210137e-06, + "loss": 0.1374, + "step": 46187 + }, + { + "epoch": 0.823814789712125, + "grad_norm": 0.27401652932167053, + "learning_rate": 4.579870793216923e-06, + "loss": 0.0906, + "step": 46188 + }, + { + "epoch": 0.8238326258338386, + "grad_norm": 0.30371198058128357, + "learning_rate": 4.578972862379177e-06, + "loss": 0.1769, + "step": 46189 + }, + { + "epoch": 0.8238504619555523, + "grad_norm": 0.34023985266685486, + "learning_rate": 4.578075010700392e-06, + "loss": 0.1147, + "step": 46190 + }, + { + "epoch": 0.823868298077266, + "grad_norm": 0.1968194991350174, + "learning_rate": 4.577177238184033e-06, + "loss": 0.043, + "step": 46191 + }, + { + "epoch": 0.8238861341989798, + "grad_norm": 0.23457229137420654, + "learning_rate": 4.576279544833597e-06, + "loss": 0.0852, + "step": 46192 + }, + { + "epoch": 0.8239039703206935, + "grad_norm": 0.2830899655818939, + "learning_rate": 4.575381930652556e-06, + "loss": 0.1041, + "step": 46193 + }, + { + "epoch": 0.8239218064424072, + "grad_norm": 0.27478930354118347, + "learning_rate": 4.574484395644388e-06, + "loss": 0.1207, + "step": 46194 + }, + { + "epoch": 0.8239396425641209, + "grad_norm": 0.32030749320983887, + "learning_rate": 4.573586939812574e-06, + "loss": 0.1171, + "step": 46195 + }, + { + "epoch": 0.8239574786858346, + "grad_norm": 0.2744278609752655, + "learning_rate": 4.5726895631605845e-06, + "loss": 0.0863, + "step": 46196 + }, + { + "epoch": 0.8239753148075483, + "grad_norm": 0.17130441963672638, + "learning_rate": 4.571792265691913e-06, + "loss": 0.0488, + "step": 46197 + }, + { + "epoch": 0.823993150929262, + "grad_norm": 0.30343884229660034, + "learning_rate": 4.570895047410032e-06, + "loss": 0.0778, + "step": 46198 + }, + { + "epoch": 0.8240109870509756, + "grad_norm": 0.2665086090564728, + "learning_rate": 4.569997908318416e-06, + "loss": 0.1172, + "step": 46199 + }, + { + "epoch": 0.8240288231726893, + "grad_norm": 0.2738859951496124, + "learning_rate": 4.569100848420538e-06, + "loss": 0.1355, + "step": 46200 + }, + { + "epoch": 0.824046659294403, + "grad_norm": 0.34839731454849243, + "learning_rate": 4.568203867719886e-06, + "loss": 0.137, + "step": 46201 + }, + { + "epoch": 0.8240644954161167, + "grad_norm": 0.2935768663883209, + "learning_rate": 4.567306966219937e-06, + "loss": 0.0888, + "step": 46202 + }, + { + "epoch": 0.8240823315378304, + "grad_norm": 0.2638930678367615, + "learning_rate": 4.566410143924152e-06, + "loss": 0.0843, + "step": 46203 + }, + { + "epoch": 0.8241001676595441, + "grad_norm": 0.2797393798828125, + "learning_rate": 4.565513400836022e-06, + "loss": 0.0732, + "step": 46204 + }, + { + "epoch": 0.8241180037812578, + "grad_norm": 0.35857200622558594, + "learning_rate": 4.564616736959027e-06, + "loss": 0.12, + "step": 46205 + }, + { + "epoch": 0.8241358399029715, + "grad_norm": 0.24700576066970825, + "learning_rate": 4.5637201522966355e-06, + "loss": 0.1195, + "step": 46206 + }, + { + "epoch": 0.8241536760246851, + "grad_norm": 0.21895363926887512, + "learning_rate": 4.562823646852321e-06, + "loss": 0.068, + "step": 46207 + }, + { + "epoch": 0.8241715121463988, + "grad_norm": 0.2787195146083832, + "learning_rate": 4.561927220629561e-06, + "loss": 0.1097, + "step": 46208 + }, + { + "epoch": 0.8241893482681126, + "grad_norm": 0.23904530704021454, + "learning_rate": 4.561030873631828e-06, + "loss": 0.0588, + "step": 46209 + }, + { + "epoch": 0.8242071843898263, + "grad_norm": 0.2670755684375763, + "learning_rate": 4.560134605862601e-06, + "loss": 0.1081, + "step": 46210 + }, + { + "epoch": 0.82422502051154, + "grad_norm": 0.22265464067459106, + "learning_rate": 4.559238417325351e-06, + "loss": 0.0867, + "step": 46211 + }, + { + "epoch": 0.8242428566332537, + "grad_norm": 0.3074725270271301, + "learning_rate": 4.558342308023556e-06, + "loss": 0.1971, + "step": 46212 + }, + { + "epoch": 0.8242606927549674, + "grad_norm": 0.2584645748138428, + "learning_rate": 4.557446277960678e-06, + "loss": 0.1079, + "step": 46213 + }, + { + "epoch": 0.8242785288766811, + "grad_norm": 0.30519789457321167, + "learning_rate": 4.556550327140207e-06, + "loss": 0.1144, + "step": 46214 + }, + { + "epoch": 0.8242963649983948, + "grad_norm": 0.19832868874073029, + "learning_rate": 4.555654455565609e-06, + "loss": 0.1403, + "step": 46215 + }, + { + "epoch": 0.8243142011201084, + "grad_norm": 0.21667800843715668, + "learning_rate": 4.554758663240347e-06, + "loss": 0.0774, + "step": 46216 + }, + { + "epoch": 0.8243320372418221, + "grad_norm": 0.2568855881690979, + "learning_rate": 4.553862950167908e-06, + "loss": 0.1546, + "step": 46217 + }, + { + "epoch": 0.8243498733635358, + "grad_norm": 0.25933149456977844, + "learning_rate": 4.552967316351753e-06, + "loss": 0.2038, + "step": 46218 + }, + { + "epoch": 0.8243677094852495, + "grad_norm": 0.3272765874862671, + "learning_rate": 4.552071761795363e-06, + "loss": 0.1183, + "step": 46219 + }, + { + "epoch": 0.8243855456069632, + "grad_norm": 0.4056278467178345, + "learning_rate": 4.551176286502207e-06, + "loss": 0.1323, + "step": 46220 + }, + { + "epoch": 0.8244033817286769, + "grad_norm": 0.24973776936531067, + "learning_rate": 4.550280890475756e-06, + "loss": 0.1307, + "step": 46221 + }, + { + "epoch": 0.8244212178503906, + "grad_norm": 0.2523420453071594, + "learning_rate": 4.54938557371947e-06, + "loss": 0.0872, + "step": 46222 + }, + { + "epoch": 0.8244390539721043, + "grad_norm": 0.3047141432762146, + "learning_rate": 4.548490336236838e-06, + "loss": 0.1639, + "step": 46223 + }, + { + "epoch": 0.824456890093818, + "grad_norm": 0.28480643033981323, + "learning_rate": 4.5475951780313194e-06, + "loss": 0.1451, + "step": 46224 + }, + { + "epoch": 0.8244747262155318, + "grad_norm": 0.2801607847213745, + "learning_rate": 4.546700099106385e-06, + "loss": 0.1047, + "step": 46225 + }, + { + "epoch": 0.8244925623372454, + "grad_norm": 0.2784700393676758, + "learning_rate": 4.545805099465503e-06, + "loss": 0.109, + "step": 46226 + }, + { + "epoch": 0.8245103984589591, + "grad_norm": 0.27022579312324524, + "learning_rate": 4.544910179112146e-06, + "loss": 0.0845, + "step": 46227 + }, + { + "epoch": 0.8245282345806728, + "grad_norm": 0.30743691325187683, + "learning_rate": 4.5440153380497865e-06, + "loss": 0.099, + "step": 46228 + }, + { + "epoch": 0.8245460707023865, + "grad_norm": 0.2754661440849304, + "learning_rate": 4.5431205762818865e-06, + "loss": 0.0459, + "step": 46229 + }, + { + "epoch": 0.8245639068241002, + "grad_norm": 0.23122017085552216, + "learning_rate": 4.542225893811913e-06, + "loss": 0.1036, + "step": 46230 + }, + { + "epoch": 0.8245817429458139, + "grad_norm": 0.23332935571670532, + "learning_rate": 4.541331290643336e-06, + "loss": 0.1177, + "step": 46231 + }, + { + "epoch": 0.8245995790675276, + "grad_norm": 0.28356999158859253, + "learning_rate": 4.540436766779632e-06, + "loss": 0.147, + "step": 46232 + }, + { + "epoch": 0.8246174151892413, + "grad_norm": 0.28929442167282104, + "learning_rate": 4.539542322224261e-06, + "loss": 0.1079, + "step": 46233 + }, + { + "epoch": 0.8246352513109549, + "grad_norm": 0.3091000020503998, + "learning_rate": 4.5386479569806905e-06, + "loss": 0.1382, + "step": 46234 + }, + { + "epoch": 0.8246530874326686, + "grad_norm": 0.31929466128349304, + "learning_rate": 4.537753671052381e-06, + "loss": 0.0648, + "step": 46235 + }, + { + "epoch": 0.8246709235543823, + "grad_norm": 0.2694339156150818, + "learning_rate": 4.536859464442814e-06, + "loss": 0.104, + "step": 46236 + }, + { + "epoch": 0.824688759676096, + "grad_norm": 0.3294632136821747, + "learning_rate": 4.535965337155445e-06, + "loss": 0.1256, + "step": 46237 + }, + { + "epoch": 0.8247065957978097, + "grad_norm": 0.3106650114059448, + "learning_rate": 4.535071289193743e-06, + "loss": 0.0953, + "step": 46238 + }, + { + "epoch": 0.8247244319195234, + "grad_norm": 0.30092278122901917, + "learning_rate": 4.534177320561167e-06, + "loss": 0.1142, + "step": 46239 + }, + { + "epoch": 0.8247422680412371, + "grad_norm": 0.34116989374160767, + "learning_rate": 4.533283431261195e-06, + "loss": 0.1467, + "step": 46240 + }, + { + "epoch": 0.8247601041629508, + "grad_norm": 0.30845415592193604, + "learning_rate": 4.532389621297287e-06, + "loss": 0.1024, + "step": 46241 + }, + { + "epoch": 0.8247779402846646, + "grad_norm": 0.28233957290649414, + "learning_rate": 4.531495890672904e-06, + "loss": 0.1427, + "step": 46242 + }, + { + "epoch": 0.8247957764063782, + "grad_norm": 0.5807080268859863, + "learning_rate": 4.530602239391507e-06, + "loss": 0.1022, + "step": 46243 + }, + { + "epoch": 0.8248136125280919, + "grad_norm": 0.2583715319633484, + "learning_rate": 4.529708667456572e-06, + "loss": 0.1166, + "step": 46244 + }, + { + "epoch": 0.8248314486498056, + "grad_norm": 0.2822147607803345, + "learning_rate": 4.528815174871548e-06, + "loss": 0.1254, + "step": 46245 + }, + { + "epoch": 0.8248492847715193, + "grad_norm": 0.20499397814273834, + "learning_rate": 4.527921761639916e-06, + "loss": 0.0606, + "step": 46246 + }, + { + "epoch": 0.824867120893233, + "grad_norm": 0.24845187366008759, + "learning_rate": 4.527028427765129e-06, + "loss": 0.0838, + "step": 46247 + }, + { + "epoch": 0.8248849570149467, + "grad_norm": 0.2561642825603485, + "learning_rate": 4.526135173250645e-06, + "loss": 0.0948, + "step": 46248 + }, + { + "epoch": 0.8249027931366604, + "grad_norm": 0.5120947360992432, + "learning_rate": 4.5252419980999384e-06, + "loss": 0.1016, + "step": 46249 + }, + { + "epoch": 0.8249206292583741, + "grad_norm": 0.2862080931663513, + "learning_rate": 4.5243489023164665e-06, + "loss": 0.1459, + "step": 46250 + }, + { + "epoch": 0.8249384653800877, + "grad_norm": 0.31652697920799255, + "learning_rate": 4.523455885903688e-06, + "loss": 0.0719, + "step": 46251 + }, + { + "epoch": 0.8249563015018014, + "grad_norm": 0.3754929304122925, + "learning_rate": 4.522562948865064e-06, + "loss": 0.1352, + "step": 46252 + }, + { + "epoch": 0.8249741376235151, + "grad_norm": 0.2790130078792572, + "learning_rate": 4.521670091204061e-06, + "loss": 0.0869, + "step": 46253 + }, + { + "epoch": 0.8249919737452288, + "grad_norm": 0.28048959374427795, + "learning_rate": 4.520777312924141e-06, + "loss": 0.0873, + "step": 46254 + }, + { + "epoch": 0.8250098098669425, + "grad_norm": 0.29519322514533997, + "learning_rate": 4.51988461402876e-06, + "loss": 0.1262, + "step": 46255 + }, + { + "epoch": 0.8250276459886562, + "grad_norm": 0.2821556329727173, + "learning_rate": 4.518991994521374e-06, + "loss": 0.0849, + "step": 46256 + }, + { + "epoch": 0.8250454821103699, + "grad_norm": 0.2634526491165161, + "learning_rate": 4.518099454405456e-06, + "loss": 0.0891, + "step": 46257 + }, + { + "epoch": 0.8250633182320836, + "grad_norm": 0.28975245356559753, + "learning_rate": 4.51720699368445e-06, + "loss": 0.1095, + "step": 46258 + }, + { + "epoch": 0.8250811543537974, + "grad_norm": 0.3463969826698303, + "learning_rate": 4.5163146123618325e-06, + "loss": 0.0975, + "step": 46259 + }, + { + "epoch": 0.825098990475511, + "grad_norm": 0.17858460545539856, + "learning_rate": 4.515422310441053e-06, + "loss": 0.0596, + "step": 46260 + }, + { + "epoch": 0.8251168265972247, + "grad_norm": 0.29375067353248596, + "learning_rate": 4.514530087925567e-06, + "loss": 0.1491, + "step": 46261 + }, + { + "epoch": 0.8251346627189384, + "grad_norm": 0.28398069739341736, + "learning_rate": 4.5136379448188436e-06, + "loss": 0.1232, + "step": 46262 + }, + { + "epoch": 0.8251524988406521, + "grad_norm": 0.2173001617193222, + "learning_rate": 4.512745881124333e-06, + "loss": 0.0766, + "step": 46263 + }, + { + "epoch": 0.8251703349623658, + "grad_norm": 0.358110636472702, + "learning_rate": 4.5118538968455e-06, + "loss": 0.2114, + "step": 46264 + }, + { + "epoch": 0.8251881710840795, + "grad_norm": 0.4235260486602783, + "learning_rate": 4.5109619919857875e-06, + "loss": 0.1333, + "step": 46265 + }, + { + "epoch": 0.8252060072057932, + "grad_norm": 0.32002028822898865, + "learning_rate": 4.510070166548671e-06, + "loss": 0.1416, + "step": 46266 + }, + { + "epoch": 0.8252238433275069, + "grad_norm": 0.3162918984889984, + "learning_rate": 4.509178420537599e-06, + "loss": 0.1469, + "step": 46267 + }, + { + "epoch": 0.8252416794492206, + "grad_norm": 0.20543918013572693, + "learning_rate": 4.5082867539560285e-06, + "loss": 0.0878, + "step": 46268 + }, + { + "epoch": 0.8252595155709342, + "grad_norm": 0.2594909071922302, + "learning_rate": 4.507395166807413e-06, + "loss": 0.0916, + "step": 46269 + }, + { + "epoch": 0.8252773516926479, + "grad_norm": 0.22302532196044922, + "learning_rate": 4.506503659095216e-06, + "loss": 0.0878, + "step": 46270 + }, + { + "epoch": 0.8252951878143616, + "grad_norm": 0.23516328632831573, + "learning_rate": 4.505612230822887e-06, + "loss": 0.1346, + "step": 46271 + }, + { + "epoch": 0.8253130239360753, + "grad_norm": 0.2626056969165802, + "learning_rate": 4.504720881993879e-06, + "loss": 0.0833, + "step": 46272 + }, + { + "epoch": 0.825330860057789, + "grad_norm": 0.23956821858882904, + "learning_rate": 4.503829612611657e-06, + "loss": 0.1006, + "step": 46273 + }, + { + "epoch": 0.8253486961795027, + "grad_norm": 0.33588707447052, + "learning_rate": 4.502938422679665e-06, + "loss": 0.1494, + "step": 46274 + }, + { + "epoch": 0.8253665323012164, + "grad_norm": 0.2861526906490326, + "learning_rate": 4.5020473122013685e-06, + "loss": 0.1241, + "step": 46275 + }, + { + "epoch": 0.8253843684229302, + "grad_norm": 0.22129428386688232, + "learning_rate": 4.5011562811802184e-06, + "loss": 0.1017, + "step": 46276 + }, + { + "epoch": 0.8254022045446439, + "grad_norm": 0.22234410047531128, + "learning_rate": 4.500265329619665e-06, + "loss": 0.137, + "step": 46277 + }, + { + "epoch": 0.8254200406663575, + "grad_norm": 0.2388879954814911, + "learning_rate": 4.4993744575231585e-06, + "loss": 0.0911, + "step": 46278 + }, + { + "epoch": 0.8254378767880712, + "grad_norm": 0.3475572168827057, + "learning_rate": 4.498483664894162e-06, + "loss": 0.1091, + "step": 46279 + }, + { + "epoch": 0.8254557129097849, + "grad_norm": 0.21472719311714172, + "learning_rate": 4.497592951736124e-06, + "loss": 0.1065, + "step": 46280 + }, + { + "epoch": 0.8254735490314986, + "grad_norm": 0.2855726480484009, + "learning_rate": 4.496702318052498e-06, + "loss": 0.1256, + "step": 46281 + }, + { + "epoch": 0.8254913851532123, + "grad_norm": 0.2390160858631134, + "learning_rate": 4.49581176384673e-06, + "loss": 0.0907, + "step": 46282 + }, + { + "epoch": 0.825509221274926, + "grad_norm": 0.2769002616405487, + "learning_rate": 4.494921289122281e-06, + "loss": 0.0521, + "step": 46283 + }, + { + "epoch": 0.8255270573966397, + "grad_norm": 0.3119021952152252, + "learning_rate": 4.494030893882603e-06, + "loss": 0.1294, + "step": 46284 + }, + { + "epoch": 0.8255448935183534, + "grad_norm": 0.27752071619033813, + "learning_rate": 4.493140578131136e-06, + "loss": 0.098, + "step": 46285 + }, + { + "epoch": 0.825562729640067, + "grad_norm": 0.2968054711818695, + "learning_rate": 4.492250341871343e-06, + "loss": 0.1231, + "step": 46286 + }, + { + "epoch": 0.8255805657617807, + "grad_norm": 0.259860634803772, + "learning_rate": 4.491360185106666e-06, + "loss": 0.1023, + "step": 46287 + }, + { + "epoch": 0.8255984018834944, + "grad_norm": 0.3266686201095581, + "learning_rate": 4.490470107840566e-06, + "loss": 0.0819, + "step": 46288 + }, + { + "epoch": 0.8256162380052081, + "grad_norm": 0.24233923852443695, + "learning_rate": 4.4895801100764915e-06, + "loss": 0.1191, + "step": 46289 + }, + { + "epoch": 0.8256340741269218, + "grad_norm": 0.24481399357318878, + "learning_rate": 4.4886901918178844e-06, + "loss": 0.0793, + "step": 46290 + }, + { + "epoch": 0.8256519102486355, + "grad_norm": 0.26234865188598633, + "learning_rate": 4.487800353068192e-06, + "loss": 0.1013, + "step": 46291 + }, + { + "epoch": 0.8256697463703492, + "grad_norm": 0.21920229494571686, + "learning_rate": 4.486910593830878e-06, + "loss": 0.1008, + "step": 46292 + }, + { + "epoch": 0.825687582492063, + "grad_norm": 0.215499609708786, + "learning_rate": 4.486020914109384e-06, + "loss": 0.1011, + "step": 46293 + }, + { + "epoch": 0.8257054186137767, + "grad_norm": 0.31064313650131226, + "learning_rate": 4.485131313907156e-06, + "loss": 0.0958, + "step": 46294 + }, + { + "epoch": 0.8257232547354904, + "grad_norm": 0.2392728477716446, + "learning_rate": 4.484241793227642e-06, + "loss": 0.0961, + "step": 46295 + }, + { + "epoch": 0.825741090857204, + "grad_norm": 0.37710610032081604, + "learning_rate": 4.483352352074297e-06, + "loss": 0.114, + "step": 46296 + }, + { + "epoch": 0.8257589269789177, + "grad_norm": 0.2994675636291504, + "learning_rate": 4.4824629904505644e-06, + "loss": 0.0897, + "step": 46297 + }, + { + "epoch": 0.8257767631006314, + "grad_norm": 0.2258623093366623, + "learning_rate": 4.48157370835989e-06, + "loss": 0.0714, + "step": 46298 + }, + { + "epoch": 0.8257945992223451, + "grad_norm": 0.23060540854930878, + "learning_rate": 4.480684505805718e-06, + "loss": 0.106, + "step": 46299 + }, + { + "epoch": 0.8258124353440588, + "grad_norm": 0.3615747392177582, + "learning_rate": 4.479795382791508e-06, + "loss": 0.102, + "step": 46300 + }, + { + "epoch": 0.8258302714657725, + "grad_norm": 0.3808033764362335, + "learning_rate": 4.478906339320691e-06, + "loss": 0.1548, + "step": 46301 + }, + { + "epoch": 0.8258481075874862, + "grad_norm": 0.2565228343009949, + "learning_rate": 4.478017375396726e-06, + "loss": 0.0784, + "step": 46302 + }, + { + "epoch": 0.8258659437091999, + "grad_norm": 0.24148888885974884, + "learning_rate": 4.477128491023055e-06, + "loss": 0.0957, + "step": 46303 + }, + { + "epoch": 0.8258837798309135, + "grad_norm": 0.24815578758716583, + "learning_rate": 4.476239686203115e-06, + "loss": 0.083, + "step": 46304 + }, + { + "epoch": 0.8259016159526272, + "grad_norm": 0.22411227226257324, + "learning_rate": 4.475350960940366e-06, + "loss": 0.0841, + "step": 46305 + }, + { + "epoch": 0.8259194520743409, + "grad_norm": 0.27995002269744873, + "learning_rate": 4.474462315238246e-06, + "loss": 0.1576, + "step": 46306 + }, + { + "epoch": 0.8259372881960546, + "grad_norm": 0.25976744294166565, + "learning_rate": 4.473573749100199e-06, + "loss": 0.0941, + "step": 46307 + }, + { + "epoch": 0.8259551243177683, + "grad_norm": 0.2552463114261627, + "learning_rate": 4.472685262529664e-06, + "loss": 0.097, + "step": 46308 + }, + { + "epoch": 0.825972960439482, + "grad_norm": 0.6652151346206665, + "learning_rate": 4.471796855530095e-06, + "loss": 0.1407, + "step": 46309 + }, + { + "epoch": 0.8259907965611958, + "grad_norm": 0.24483048915863037, + "learning_rate": 4.470908528104933e-06, + "loss": 0.0697, + "step": 46310 + }, + { + "epoch": 0.8260086326829095, + "grad_norm": 0.34648269414901733, + "learning_rate": 4.470020280257619e-06, + "loss": 0.1634, + "step": 46311 + }, + { + "epoch": 0.8260264688046232, + "grad_norm": 0.2337142676115036, + "learning_rate": 4.469132111991592e-06, + "loss": 0.0973, + "step": 46312 + }, + { + "epoch": 0.8260443049263368, + "grad_norm": 0.2229405641555786, + "learning_rate": 4.468244023310305e-06, + "loss": 0.1279, + "step": 46313 + }, + { + "epoch": 0.8260621410480505, + "grad_norm": 0.3589451014995575, + "learning_rate": 4.467356014217192e-06, + "loss": 0.134, + "step": 46314 + }, + { + "epoch": 0.8260799771697642, + "grad_norm": 0.23785343766212463, + "learning_rate": 4.466468084715702e-06, + "loss": 0.0762, + "step": 46315 + }, + { + "epoch": 0.8260978132914779, + "grad_norm": 0.29191070795059204, + "learning_rate": 4.4655802348092775e-06, + "loss": 0.1351, + "step": 46316 + }, + { + "epoch": 0.8261156494131916, + "grad_norm": 0.2532224953174591, + "learning_rate": 4.4646924645013465e-06, + "loss": 0.0793, + "step": 46317 + }, + { + "epoch": 0.8261334855349053, + "grad_norm": 0.524084210395813, + "learning_rate": 4.463804773795369e-06, + "loss": 0.1477, + "step": 46318 + }, + { + "epoch": 0.826151321656619, + "grad_norm": 0.21385720372200012, + "learning_rate": 4.4629171626947765e-06, + "loss": 0.1053, + "step": 46319 + }, + { + "epoch": 0.8261691577783327, + "grad_norm": 0.35684239864349365, + "learning_rate": 4.46202963120301e-06, + "loss": 0.1692, + "step": 46320 + }, + { + "epoch": 0.8261869939000464, + "grad_norm": 0.24282434582710266, + "learning_rate": 4.461142179323502e-06, + "loss": 0.0716, + "step": 46321 + }, + { + "epoch": 0.82620483002176, + "grad_norm": 0.20248641073703766, + "learning_rate": 4.46025480705971e-06, + "loss": 0.0557, + "step": 46322 + }, + { + "epoch": 0.8262226661434737, + "grad_norm": 0.2696966528892517, + "learning_rate": 4.459367514415064e-06, + "loss": 0.0908, + "step": 46323 + }, + { + "epoch": 0.8262405022651874, + "grad_norm": 0.20231105387210846, + "learning_rate": 4.458480301393003e-06, + "loss": 0.0957, + "step": 46324 + }, + { + "epoch": 0.8262583383869011, + "grad_norm": 0.3420659303665161, + "learning_rate": 4.457593167996968e-06, + "loss": 0.0811, + "step": 46325 + }, + { + "epoch": 0.8262761745086148, + "grad_norm": 0.34777867794036865, + "learning_rate": 4.456706114230391e-06, + "loss": 0.1331, + "step": 46326 + }, + { + "epoch": 0.8262940106303286, + "grad_norm": 0.2817397117614746, + "learning_rate": 4.455819140096723e-06, + "loss": 0.1117, + "step": 46327 + }, + { + "epoch": 0.8263118467520423, + "grad_norm": 0.3114819824695587, + "learning_rate": 4.454932245599389e-06, + "loss": 0.0879, + "step": 46328 + }, + { + "epoch": 0.826329682873756, + "grad_norm": 0.2996848523616791, + "learning_rate": 4.454045430741838e-06, + "loss": 0.129, + "step": 46329 + }, + { + "epoch": 0.8263475189954697, + "grad_norm": 0.2899375557899475, + "learning_rate": 4.453158695527499e-06, + "loss": 0.1045, + "step": 46330 + }, + { + "epoch": 0.8263653551171833, + "grad_norm": 0.22163324058055878, + "learning_rate": 4.452272039959821e-06, + "loss": 0.0987, + "step": 46331 + }, + { + "epoch": 0.826383191238897, + "grad_norm": 0.31059330701828003, + "learning_rate": 4.451385464042229e-06, + "loss": 0.1088, + "step": 46332 + }, + { + "epoch": 0.8264010273606107, + "grad_norm": 0.3378532826900482, + "learning_rate": 4.4504989677781685e-06, + "loss": 0.1295, + "step": 46333 + }, + { + "epoch": 0.8264188634823244, + "grad_norm": 0.28912967443466187, + "learning_rate": 4.449612551171064e-06, + "loss": 0.0594, + "step": 46334 + }, + { + "epoch": 0.8264366996040381, + "grad_norm": 0.2329649031162262, + "learning_rate": 4.448726214224366e-06, + "loss": 0.1071, + "step": 46335 + }, + { + "epoch": 0.8264545357257518, + "grad_norm": 0.2518095076084137, + "learning_rate": 4.447839956941502e-06, + "loss": 0.0512, + "step": 46336 + }, + { + "epoch": 0.8264723718474655, + "grad_norm": 0.27875518798828125, + "learning_rate": 4.44695377932591e-06, + "loss": 0.1064, + "step": 46337 + }, + { + "epoch": 0.8264902079691792, + "grad_norm": 0.2622019648551941, + "learning_rate": 4.446067681381022e-06, + "loss": 0.0838, + "step": 46338 + }, + { + "epoch": 0.8265080440908928, + "grad_norm": 0.318490207195282, + "learning_rate": 4.4451816631102704e-06, + "loss": 0.1198, + "step": 46339 + }, + { + "epoch": 0.8265258802126065, + "grad_norm": 0.21269266307353973, + "learning_rate": 4.4442957245171e-06, + "loss": 0.1194, + "step": 46340 + }, + { + "epoch": 0.8265437163343202, + "grad_norm": 0.23801380395889282, + "learning_rate": 4.443409865604933e-06, + "loss": 0.1064, + "step": 46341 + }, + { + "epoch": 0.8265615524560339, + "grad_norm": 0.32886651158332825, + "learning_rate": 4.442524086377217e-06, + "loss": 0.106, + "step": 46342 + }, + { + "epoch": 0.8265793885777477, + "grad_norm": 0.293696790933609, + "learning_rate": 4.441638386837368e-06, + "loss": 0.125, + "step": 46343 + }, + { + "epoch": 0.8265972246994614, + "grad_norm": 0.32515859603881836, + "learning_rate": 4.44075276698884e-06, + "loss": 0.1691, + "step": 46344 + }, + { + "epoch": 0.8266150608211751, + "grad_norm": 0.2994956970214844, + "learning_rate": 4.439867226835051e-06, + "loss": 0.1323, + "step": 46345 + }, + { + "epoch": 0.8266328969428888, + "grad_norm": 0.2532998323440552, + "learning_rate": 4.438981766379441e-06, + "loss": 0.0928, + "step": 46346 + }, + { + "epoch": 0.8266507330646025, + "grad_norm": 0.2636529803276062, + "learning_rate": 4.438096385625431e-06, + "loss": 0.0928, + "step": 46347 + }, + { + "epoch": 0.8266685691863161, + "grad_norm": 0.21411831676959991, + "learning_rate": 4.437211084576467e-06, + "loss": 0.0853, + "step": 46348 + }, + { + "epoch": 0.8266864053080298, + "grad_norm": 0.2518365979194641, + "learning_rate": 4.436325863235976e-06, + "loss": 0.0997, + "step": 46349 + }, + { + "epoch": 0.8267042414297435, + "grad_norm": 0.24956312775611877, + "learning_rate": 4.435440721607389e-06, + "loss": 0.1168, + "step": 46350 + }, + { + "epoch": 0.8267220775514572, + "grad_norm": 0.34560057520866394, + "learning_rate": 4.434555659694137e-06, + "loss": 0.1014, + "step": 46351 + }, + { + "epoch": 0.8267399136731709, + "grad_norm": 0.3348969519138336, + "learning_rate": 4.433670677499643e-06, + "loss": 0.1598, + "step": 46352 + }, + { + "epoch": 0.8267577497948846, + "grad_norm": 0.25283145904541016, + "learning_rate": 4.4327857750273514e-06, + "loss": 0.1496, + "step": 46353 + }, + { + "epoch": 0.8267755859165983, + "grad_norm": 0.30321258306503296, + "learning_rate": 4.431900952280685e-06, + "loss": 0.1309, + "step": 46354 + }, + { + "epoch": 0.826793422038312, + "grad_norm": 0.3203035891056061, + "learning_rate": 4.43101620926307e-06, + "loss": 0.1259, + "step": 46355 + }, + { + "epoch": 0.8268112581600257, + "grad_norm": 0.26212987303733826, + "learning_rate": 4.430131545977945e-06, + "loss": 0.1284, + "step": 46356 + }, + { + "epoch": 0.8268290942817393, + "grad_norm": 0.25806617736816406, + "learning_rate": 4.429246962428729e-06, + "loss": 0.0977, + "step": 46357 + }, + { + "epoch": 0.826846930403453, + "grad_norm": 0.22036178410053253, + "learning_rate": 4.428362458618865e-06, + "loss": 0.1076, + "step": 46358 + }, + { + "epoch": 0.8268647665251667, + "grad_norm": 0.31598806381225586, + "learning_rate": 4.427478034551771e-06, + "loss": 0.1135, + "step": 46359 + }, + { + "epoch": 0.8268826026468805, + "grad_norm": 0.25243866443634033, + "learning_rate": 4.4265936902308706e-06, + "loss": 0.1123, + "step": 46360 + }, + { + "epoch": 0.8269004387685942, + "grad_norm": 0.34924691915512085, + "learning_rate": 4.425709425659608e-06, + "loss": 0.1209, + "step": 46361 + }, + { + "epoch": 0.8269182748903079, + "grad_norm": 0.293104350566864, + "learning_rate": 4.424825240841399e-06, + "loss": 0.1347, + "step": 46362 + }, + { + "epoch": 0.8269361110120216, + "grad_norm": 0.2945515513420105, + "learning_rate": 4.423941135779674e-06, + "loss": 0.1256, + "step": 46363 + }, + { + "epoch": 0.8269539471337353, + "grad_norm": 0.3119187355041504, + "learning_rate": 4.423057110477863e-06, + "loss": 0.0517, + "step": 46364 + }, + { + "epoch": 0.826971783255449, + "grad_norm": 0.3108843266963959, + "learning_rate": 4.42217316493938e-06, + "loss": 0.1516, + "step": 46365 + }, + { + "epoch": 0.8269896193771626, + "grad_norm": 0.26875820755958557, + "learning_rate": 4.421289299167669e-06, + "loss": 0.1423, + "step": 46366 + }, + { + "epoch": 0.8270074554988763, + "grad_norm": 0.2618597745895386, + "learning_rate": 4.420405513166148e-06, + "loss": 0.1166, + "step": 46367 + }, + { + "epoch": 0.82702529162059, + "grad_norm": 0.2817710041999817, + "learning_rate": 4.4195218069382375e-06, + "loss": 0.1372, + "step": 46368 + }, + { + "epoch": 0.8270431277423037, + "grad_norm": 0.22582118213176727, + "learning_rate": 4.418638180487375e-06, + "loss": 0.0818, + "step": 46369 + }, + { + "epoch": 0.8270609638640174, + "grad_norm": 0.25454017519950867, + "learning_rate": 4.417754633816973e-06, + "loss": 0.1304, + "step": 46370 + }, + { + "epoch": 0.8270787999857311, + "grad_norm": 0.1895970106124878, + "learning_rate": 4.416871166930467e-06, + "loss": 0.1234, + "step": 46371 + }, + { + "epoch": 0.8270966361074448, + "grad_norm": 0.2849210798740387, + "learning_rate": 4.415987779831279e-06, + "loss": 0.1014, + "step": 46372 + }, + { + "epoch": 0.8271144722291585, + "grad_norm": 0.36823350191116333, + "learning_rate": 4.415104472522827e-06, + "loss": 0.1161, + "step": 46373 + }, + { + "epoch": 0.8271323083508721, + "grad_norm": 0.2871535122394562, + "learning_rate": 4.414221245008546e-06, + "loss": 0.0987, + "step": 46374 + }, + { + "epoch": 0.8271501444725858, + "grad_norm": 0.23042576014995575, + "learning_rate": 4.413338097291852e-06, + "loss": 0.0877, + "step": 46375 + }, + { + "epoch": 0.8271679805942995, + "grad_norm": 0.2570904791355133, + "learning_rate": 4.412455029376172e-06, + "loss": 0.1335, + "step": 46376 + }, + { + "epoch": 0.8271858167160133, + "grad_norm": 0.2820962071418762, + "learning_rate": 4.411572041264925e-06, + "loss": 0.1145, + "step": 46377 + }, + { + "epoch": 0.827203652837727, + "grad_norm": 0.15524698793888092, + "learning_rate": 4.410689132961529e-06, + "loss": 0.0478, + "step": 46378 + }, + { + "epoch": 0.8272214889594407, + "grad_norm": 0.23548926413059235, + "learning_rate": 4.409806304469422e-06, + "loss": 0.115, + "step": 46379 + }, + { + "epoch": 0.8272393250811544, + "grad_norm": 0.3157966136932373, + "learning_rate": 4.408923555792016e-06, + "loss": 0.1111, + "step": 46380 + }, + { + "epoch": 0.8272571612028681, + "grad_norm": 0.269752562046051, + "learning_rate": 4.408040886932735e-06, + "loss": 0.1242, + "step": 46381 + }, + { + "epoch": 0.8272749973245818, + "grad_norm": 0.3442416489124298, + "learning_rate": 4.4071582978949925e-06, + "loss": 0.1069, + "step": 46382 + }, + { + "epoch": 0.8272928334462955, + "grad_norm": 0.27273133397102356, + "learning_rate": 4.4062757886822235e-06, + "loss": 0.1114, + "step": 46383 + }, + { + "epoch": 0.8273106695680091, + "grad_norm": 0.255541056394577, + "learning_rate": 4.405393359297835e-06, + "loss": 0.1315, + "step": 46384 + }, + { + "epoch": 0.8273285056897228, + "grad_norm": 0.3046436011791229, + "learning_rate": 4.404511009745263e-06, + "loss": 0.0974, + "step": 46385 + }, + { + "epoch": 0.8273463418114365, + "grad_norm": 0.33289432525634766, + "learning_rate": 4.403628740027912e-06, + "loss": 0.1784, + "step": 46386 + }, + { + "epoch": 0.8273641779331502, + "grad_norm": 0.4371652603149414, + "learning_rate": 4.4027465501492174e-06, + "loss": 0.1443, + "step": 46387 + }, + { + "epoch": 0.8273820140548639, + "grad_norm": 0.27950721979141235, + "learning_rate": 4.401864440112591e-06, + "loss": 0.109, + "step": 46388 + }, + { + "epoch": 0.8273998501765776, + "grad_norm": 0.31119170784950256, + "learning_rate": 4.400982409921453e-06, + "loss": 0.1158, + "step": 46389 + }, + { + "epoch": 0.8274176862982913, + "grad_norm": 0.29666703939437866, + "learning_rate": 4.40010045957922e-06, + "loss": 0.1124, + "step": 46390 + }, + { + "epoch": 0.827435522420005, + "grad_norm": 0.35534292459487915, + "learning_rate": 4.3992185890893075e-06, + "loss": 0.1291, + "step": 46391 + }, + { + "epoch": 0.8274533585417186, + "grad_norm": 0.2469429224729538, + "learning_rate": 4.398336798455147e-06, + "loss": 0.1092, + "step": 46392 + }, + { + "epoch": 0.8274711946634323, + "grad_norm": 0.34191715717315674, + "learning_rate": 4.397455087680147e-06, + "loss": 0.1406, + "step": 46393 + }, + { + "epoch": 0.8274890307851461, + "grad_norm": 0.5258873701095581, + "learning_rate": 4.396573456767725e-06, + "loss": 0.1693, + "step": 46394 + }, + { + "epoch": 0.8275068669068598, + "grad_norm": 0.257668137550354, + "learning_rate": 4.395691905721297e-06, + "loss": 0.095, + "step": 46395 + }, + { + "epoch": 0.8275247030285735, + "grad_norm": 0.33640992641448975, + "learning_rate": 4.394810434544288e-06, + "loss": 0.0933, + "step": 46396 + }, + { + "epoch": 0.8275425391502872, + "grad_norm": 0.21505111455917358, + "learning_rate": 4.393929043240105e-06, + "loss": 0.1331, + "step": 46397 + }, + { + "epoch": 0.8275603752720009, + "grad_norm": 0.23586580157279968, + "learning_rate": 4.393047731812177e-06, + "loss": 0.1049, + "step": 46398 + }, + { + "epoch": 0.8275782113937146, + "grad_norm": 0.2532597482204437, + "learning_rate": 4.392166500263906e-06, + "loss": 0.1283, + "step": 46399 + }, + { + "epoch": 0.8275960475154283, + "grad_norm": 0.2986949682235718, + "learning_rate": 4.391285348598722e-06, + "loss": 0.1165, + "step": 46400 + }, + { + "epoch": 0.827613883637142, + "grad_norm": 0.23167075216770172, + "learning_rate": 4.390404276820037e-06, + "loss": 0.1167, + "step": 46401 + }, + { + "epoch": 0.8276317197588556, + "grad_norm": 0.2582891881465912, + "learning_rate": 4.3895232849312605e-06, + "loss": 0.1255, + "step": 46402 + }, + { + "epoch": 0.8276495558805693, + "grad_norm": 0.2820345461368561, + "learning_rate": 4.388642372935811e-06, + "loss": 0.1101, + "step": 46403 + }, + { + "epoch": 0.827667392002283, + "grad_norm": 0.2451031506061554, + "learning_rate": 4.387761540837096e-06, + "loss": 0.1137, + "step": 46404 + }, + { + "epoch": 0.8276852281239967, + "grad_norm": 0.30854934453964233, + "learning_rate": 4.386880788638542e-06, + "loss": 0.1129, + "step": 46405 + }, + { + "epoch": 0.8277030642457104, + "grad_norm": 0.30441048741340637, + "learning_rate": 4.386000116343558e-06, + "loss": 0.1149, + "step": 46406 + }, + { + "epoch": 0.8277209003674241, + "grad_norm": 0.280292272567749, + "learning_rate": 4.3851195239555586e-06, + "loss": 0.1061, + "step": 46407 + }, + { + "epoch": 0.8277387364891378, + "grad_norm": 0.35672205686569214, + "learning_rate": 4.384239011477947e-06, + "loss": 0.1367, + "step": 46408 + }, + { + "epoch": 0.8277565726108514, + "grad_norm": 0.23385368287563324, + "learning_rate": 4.383358578914154e-06, + "loss": 0.0763, + "step": 46409 + }, + { + "epoch": 0.8277744087325651, + "grad_norm": 0.2788551449775696, + "learning_rate": 4.382478226267583e-06, + "loss": 0.0703, + "step": 46410 + }, + { + "epoch": 0.8277922448542789, + "grad_norm": 0.29580509662628174, + "learning_rate": 4.381597953541644e-06, + "loss": 0.1328, + "step": 46411 + }, + { + "epoch": 0.8278100809759926, + "grad_norm": 0.269625186920166, + "learning_rate": 4.38071776073975e-06, + "loss": 0.1018, + "step": 46412 + }, + { + "epoch": 0.8278279170977063, + "grad_norm": 0.2543848752975464, + "learning_rate": 4.379837647865323e-06, + "loss": 0.162, + "step": 46413 + }, + { + "epoch": 0.82784575321942, + "grad_norm": 0.21752704679965973, + "learning_rate": 4.378957614921766e-06, + "loss": 0.0679, + "step": 46414 + }, + { + "epoch": 0.8278635893411337, + "grad_norm": 0.2891313433647156, + "learning_rate": 4.3780776619124915e-06, + "loss": 0.0784, + "step": 46415 + }, + { + "epoch": 0.8278814254628474, + "grad_norm": 0.23833556473255157, + "learning_rate": 4.377197788840912e-06, + "loss": 0.1365, + "step": 46416 + }, + { + "epoch": 0.8278992615845611, + "grad_norm": 0.2756827473640442, + "learning_rate": 4.37631799571043e-06, + "loss": 0.131, + "step": 46417 + }, + { + "epoch": 0.8279170977062748, + "grad_norm": 0.1818789839744568, + "learning_rate": 4.375438282524469e-06, + "loss": 0.0905, + "step": 46418 + }, + { + "epoch": 0.8279349338279884, + "grad_norm": 0.3355312645435333, + "learning_rate": 4.374558649286431e-06, + "loss": 0.1281, + "step": 46419 + }, + { + "epoch": 0.8279527699497021, + "grad_norm": 0.29590240120887756, + "learning_rate": 4.37367909599973e-06, + "loss": 0.1002, + "step": 46420 + }, + { + "epoch": 0.8279706060714158, + "grad_norm": 0.29002490639686584, + "learning_rate": 4.372799622667764e-06, + "loss": 0.1004, + "step": 46421 + }, + { + "epoch": 0.8279884421931295, + "grad_norm": 0.2694943845272064, + "learning_rate": 4.37192022929396e-06, + "loss": 0.1225, + "step": 46422 + }, + { + "epoch": 0.8280062783148432, + "grad_norm": 0.3024190664291382, + "learning_rate": 4.371040915881716e-06, + "loss": 0.0978, + "step": 46423 + }, + { + "epoch": 0.8280241144365569, + "grad_norm": 0.30632027983665466, + "learning_rate": 4.370161682434437e-06, + "loss": 0.1643, + "step": 46424 + }, + { + "epoch": 0.8280419505582706, + "grad_norm": 0.31171733140945435, + "learning_rate": 4.369282528955543e-06, + "loss": 0.1489, + "step": 46425 + }, + { + "epoch": 0.8280597866799843, + "grad_norm": 0.31796911358833313, + "learning_rate": 4.368403455448428e-06, + "loss": 0.1015, + "step": 46426 + }, + { + "epoch": 0.8280776228016979, + "grad_norm": 0.282086044549942, + "learning_rate": 4.367524461916514e-06, + "loss": 0.1115, + "step": 46427 + }, + { + "epoch": 0.8280954589234117, + "grad_norm": 0.3228877782821655, + "learning_rate": 4.366645548363202e-06, + "loss": 0.1144, + "step": 46428 + }, + { + "epoch": 0.8281132950451254, + "grad_norm": 0.5565622448921204, + "learning_rate": 4.3657667147918995e-06, + "loss": 0.0955, + "step": 46429 + }, + { + "epoch": 0.8281311311668391, + "grad_norm": 0.2789391875267029, + "learning_rate": 4.3648879612060045e-06, + "loss": 0.0448, + "step": 46430 + }, + { + "epoch": 0.8281489672885528, + "grad_norm": 0.26229822635650635, + "learning_rate": 4.364009287608936e-06, + "loss": 0.0762, + "step": 46431 + }, + { + "epoch": 0.8281668034102665, + "grad_norm": 0.2685633599758148, + "learning_rate": 4.3631306940040975e-06, + "loss": 0.0933, + "step": 46432 + }, + { + "epoch": 0.8281846395319802, + "grad_norm": 0.2916639745235443, + "learning_rate": 4.3622521803948905e-06, + "loss": 0.1458, + "step": 46433 + }, + { + "epoch": 0.8282024756536939, + "grad_norm": 0.21343868970870972, + "learning_rate": 4.3613737467847164e-06, + "loss": 0.098, + "step": 46434 + }, + { + "epoch": 0.8282203117754076, + "grad_norm": 0.1641271561384201, + "learning_rate": 4.3604953931769915e-06, + "loss": 0.0167, + "step": 46435 + }, + { + "epoch": 0.8282381478971212, + "grad_norm": 0.29804444313049316, + "learning_rate": 4.3596171195751156e-06, + "loss": 0.1086, + "step": 46436 + }, + { + "epoch": 0.8282559840188349, + "grad_norm": 0.23752933740615845, + "learning_rate": 4.358738925982495e-06, + "loss": 0.1133, + "step": 46437 + }, + { + "epoch": 0.8282738201405486, + "grad_norm": 0.2890227138996124, + "learning_rate": 4.35786081240252e-06, + "loss": 0.093, + "step": 46438 + }, + { + "epoch": 0.8282916562622623, + "grad_norm": 0.22662892937660217, + "learning_rate": 4.356982778838612e-06, + "loss": 0.0973, + "step": 46439 + }, + { + "epoch": 0.828309492383976, + "grad_norm": 0.23539596796035767, + "learning_rate": 4.356104825294172e-06, + "loss": 0.1063, + "step": 46440 + }, + { + "epoch": 0.8283273285056897, + "grad_norm": 0.25857070088386536, + "learning_rate": 4.355226951772598e-06, + "loss": 0.084, + "step": 46441 + }, + { + "epoch": 0.8283451646274034, + "grad_norm": 0.26156559586524963, + "learning_rate": 4.354349158277296e-06, + "loss": 0.1011, + "step": 46442 + }, + { + "epoch": 0.8283630007491171, + "grad_norm": 0.2422717958688736, + "learning_rate": 4.3534714448116635e-06, + "loss": 0.1304, + "step": 46443 + }, + { + "epoch": 0.8283808368708309, + "grad_norm": 0.23752345144748688, + "learning_rate": 4.35259381137911e-06, + "loss": 0.0977, + "step": 46444 + }, + { + "epoch": 0.8283986729925445, + "grad_norm": 0.33036893606185913, + "learning_rate": 4.351716257983035e-06, + "loss": 0.1468, + "step": 46445 + }, + { + "epoch": 0.8284165091142582, + "grad_norm": 0.29562899470329285, + "learning_rate": 4.3508387846268425e-06, + "loss": 0.0903, + "step": 46446 + }, + { + "epoch": 0.8284343452359719, + "grad_norm": 0.3533494174480438, + "learning_rate": 4.349961391313922e-06, + "loss": 0.1626, + "step": 46447 + }, + { + "epoch": 0.8284521813576856, + "grad_norm": 0.24387799203395844, + "learning_rate": 4.3490840780476884e-06, + "loss": 0.1007, + "step": 46448 + }, + { + "epoch": 0.8284700174793993, + "grad_norm": 0.3472498655319214, + "learning_rate": 4.34820684483154e-06, + "loss": 0.1368, + "step": 46449 + }, + { + "epoch": 0.828487853601113, + "grad_norm": 0.21397210657596588, + "learning_rate": 4.347329691668875e-06, + "loss": 0.1037, + "step": 46450 + }, + { + "epoch": 0.8285056897228267, + "grad_norm": 0.23745764791965485, + "learning_rate": 4.346452618563085e-06, + "loss": 0.1307, + "step": 46451 + }, + { + "epoch": 0.8285235258445404, + "grad_norm": 0.4360826909542084, + "learning_rate": 4.345575625517584e-06, + "loss": 0.115, + "step": 46452 + }, + { + "epoch": 0.828541361966254, + "grad_norm": 0.22171275317668915, + "learning_rate": 4.344698712535761e-06, + "loss": 0.0879, + "step": 46453 + }, + { + "epoch": 0.8285591980879677, + "grad_norm": 0.21346944570541382, + "learning_rate": 4.343821879621027e-06, + "loss": 0.1111, + "step": 46454 + }, + { + "epoch": 0.8285770342096814, + "grad_norm": 0.25692903995513916, + "learning_rate": 4.342945126776773e-06, + "loss": 0.1122, + "step": 46455 + }, + { + "epoch": 0.8285948703313951, + "grad_norm": 0.3024471402168274, + "learning_rate": 4.34206845400639e-06, + "loss": 0.1272, + "step": 46456 + }, + { + "epoch": 0.8286127064531088, + "grad_norm": 0.25844213366508484, + "learning_rate": 4.341191861313293e-06, + "loss": 0.1124, + "step": 46457 + }, + { + "epoch": 0.8286305425748225, + "grad_norm": 0.3144952356815338, + "learning_rate": 4.3403153487008714e-06, + "loss": 0.0786, + "step": 46458 + }, + { + "epoch": 0.8286483786965362, + "grad_norm": 0.20591102540493011, + "learning_rate": 4.339438916172525e-06, + "loss": 0.1376, + "step": 46459 + }, + { + "epoch": 0.8286662148182499, + "grad_norm": 0.2556131184101105, + "learning_rate": 4.338562563731641e-06, + "loss": 0.0854, + "step": 46460 + }, + { + "epoch": 0.8286840509399637, + "grad_norm": 0.21406857669353485, + "learning_rate": 4.337686291381629e-06, + "loss": 0.133, + "step": 46461 + }, + { + "epoch": 0.8287018870616774, + "grad_norm": 0.23430363833904266, + "learning_rate": 4.336810099125884e-06, + "loss": 0.0808, + "step": 46462 + }, + { + "epoch": 0.828719723183391, + "grad_norm": 0.24600858986377716, + "learning_rate": 4.335933986967799e-06, + "loss": 0.1123, + "step": 46463 + }, + { + "epoch": 0.8287375593051047, + "grad_norm": 0.40453529357910156, + "learning_rate": 4.335057954910768e-06, + "loss": 0.1743, + "step": 46464 + }, + { + "epoch": 0.8287553954268184, + "grad_norm": 0.24288958311080933, + "learning_rate": 4.334182002958192e-06, + "loss": 0.0964, + "step": 46465 + }, + { + "epoch": 0.8287732315485321, + "grad_norm": 0.26050734519958496, + "learning_rate": 4.33330613111346e-06, + "loss": 0.133, + "step": 46466 + }, + { + "epoch": 0.8287910676702458, + "grad_norm": 0.3225926458835602, + "learning_rate": 4.332430339379978e-06, + "loss": 0.1277, + "step": 46467 + }, + { + "epoch": 0.8288089037919595, + "grad_norm": 0.268846720457077, + "learning_rate": 4.331554627761134e-06, + "loss": 0.116, + "step": 46468 + }, + { + "epoch": 0.8288267399136732, + "grad_norm": 0.31195732951164246, + "learning_rate": 4.330678996260315e-06, + "loss": 0.1482, + "step": 46469 + }, + { + "epoch": 0.8288445760353869, + "grad_norm": 0.24562333524227142, + "learning_rate": 4.329803444880931e-06, + "loss": 0.1483, + "step": 46470 + }, + { + "epoch": 0.8288624121571005, + "grad_norm": 0.19142258167266846, + "learning_rate": 4.328927973626368e-06, + "loss": 0.0765, + "step": 46471 + }, + { + "epoch": 0.8288802482788142, + "grad_norm": 0.2689981460571289, + "learning_rate": 4.32805258250002e-06, + "loss": 0.1365, + "step": 46472 + }, + { + "epoch": 0.8288980844005279, + "grad_norm": 0.23739860951900482, + "learning_rate": 4.327177271505273e-06, + "loss": 0.1339, + "step": 46473 + }, + { + "epoch": 0.8289159205222416, + "grad_norm": 0.4686424732208252, + "learning_rate": 4.326302040645533e-06, + "loss": 0.1034, + "step": 46474 + }, + { + "epoch": 0.8289337566439553, + "grad_norm": 0.39115047454833984, + "learning_rate": 4.325426889924186e-06, + "loss": 0.1397, + "step": 46475 + }, + { + "epoch": 0.828951592765669, + "grad_norm": 0.279804527759552, + "learning_rate": 4.324551819344627e-06, + "loss": 0.1332, + "step": 46476 + }, + { + "epoch": 0.8289694288873827, + "grad_norm": 0.22898860275745392, + "learning_rate": 4.323676828910239e-06, + "loss": 0.0547, + "step": 46477 + }, + { + "epoch": 0.8289872650090965, + "grad_norm": 0.36812683939933777, + "learning_rate": 4.3228019186244275e-06, + "loss": 0.1859, + "step": 46478 + }, + { + "epoch": 0.8290051011308102, + "grad_norm": 0.437730073928833, + "learning_rate": 4.321927088490577e-06, + "loss": 0.1114, + "step": 46479 + }, + { + "epoch": 0.8290229372525239, + "grad_norm": 0.23340614140033722, + "learning_rate": 4.321052338512074e-06, + "loss": 0.0591, + "step": 46480 + }, + { + "epoch": 0.8290407733742375, + "grad_norm": 0.19981735944747925, + "learning_rate": 4.320177668692321e-06, + "loss": 0.1089, + "step": 46481 + }, + { + "epoch": 0.8290586094959512, + "grad_norm": 0.27109524607658386, + "learning_rate": 4.319303079034695e-06, + "loss": 0.0906, + "step": 46482 + }, + { + "epoch": 0.8290764456176649, + "grad_norm": 0.3076198995113373, + "learning_rate": 4.318428569542599e-06, + "loss": 0.0648, + "step": 46483 + }, + { + "epoch": 0.8290942817393786, + "grad_norm": 0.3023016154766083, + "learning_rate": 4.3175541402194155e-06, + "loss": 0.1178, + "step": 46484 + }, + { + "epoch": 0.8291121178610923, + "grad_norm": 0.3083108365535736, + "learning_rate": 4.31667979106854e-06, + "loss": 0.1241, + "step": 46485 + }, + { + "epoch": 0.829129953982806, + "grad_norm": 0.1969596892595291, + "learning_rate": 4.315805522093347e-06, + "loss": 0.0916, + "step": 46486 + }, + { + "epoch": 0.8291477901045197, + "grad_norm": 0.231434166431427, + "learning_rate": 4.3149313332972425e-06, + "loss": 0.0824, + "step": 46487 + }, + { + "epoch": 0.8291656262262334, + "grad_norm": 0.22454939782619476, + "learning_rate": 4.3140572246836095e-06, + "loss": 0.0892, + "step": 46488 + }, + { + "epoch": 0.829183462347947, + "grad_norm": 0.257914274930954, + "learning_rate": 4.313183196255838e-06, + "loss": 0.0983, + "step": 46489 + }, + { + "epoch": 0.8292012984696607, + "grad_norm": 0.41265565156936646, + "learning_rate": 4.312309248017305e-06, + "loss": 0.1369, + "step": 46490 + }, + { + "epoch": 0.8292191345913744, + "grad_norm": 0.18557557463645935, + "learning_rate": 4.3114353799714126e-06, + "loss": 0.0868, + "step": 46491 + }, + { + "epoch": 0.8292369707130881, + "grad_norm": 0.2766319513320923, + "learning_rate": 4.310561592121539e-06, + "loss": 0.1607, + "step": 46492 + }, + { + "epoch": 0.8292548068348018, + "grad_norm": 0.308193176984787, + "learning_rate": 4.309687884471081e-06, + "loss": 0.0952, + "step": 46493 + }, + { + "epoch": 0.8292726429565155, + "grad_norm": 0.2726615369319916, + "learning_rate": 4.308814257023408e-06, + "loss": 0.2323, + "step": 46494 + }, + { + "epoch": 0.8292904790782293, + "grad_norm": 0.4430800974369049, + "learning_rate": 4.307940709781918e-06, + "loss": 0.1004, + "step": 46495 + }, + { + "epoch": 0.829308315199943, + "grad_norm": 0.2686435878276825, + "learning_rate": 4.307067242750007e-06, + "loss": 0.1287, + "step": 46496 + }, + { + "epoch": 0.8293261513216567, + "grad_norm": 0.22180284559726715, + "learning_rate": 4.306193855931046e-06, + "loss": 0.1109, + "step": 46497 + }, + { + "epoch": 0.8293439874433703, + "grad_norm": 0.2933272123336792, + "learning_rate": 4.305320549328426e-06, + "loss": 0.1394, + "step": 46498 + }, + { + "epoch": 0.829361823565084, + "grad_norm": 0.2671579122543335, + "learning_rate": 4.304447322945526e-06, + "loss": 0.1148, + "step": 46499 + }, + { + "epoch": 0.8293796596867977, + "grad_norm": 0.228309765458107, + "learning_rate": 4.303574176785741e-06, + "loss": 0.0652, + "step": 46500 + }, + { + "epoch": 0.8293974958085114, + "grad_norm": 0.2643977999687195, + "learning_rate": 4.302701110852453e-06, + "loss": 0.1005, + "step": 46501 + }, + { + "epoch": 0.8294153319302251, + "grad_norm": 0.2623671293258667, + "learning_rate": 4.301828125149043e-06, + "loss": 0.0935, + "step": 46502 + }, + { + "epoch": 0.8294331680519388, + "grad_norm": 0.2562272250652313, + "learning_rate": 4.3009552196788896e-06, + "loss": 0.1088, + "step": 46503 + }, + { + "epoch": 0.8294510041736525, + "grad_norm": 0.3042919337749481, + "learning_rate": 4.30008239444539e-06, + "loss": 0.1359, + "step": 46504 + }, + { + "epoch": 0.8294688402953662, + "grad_norm": 0.25932884216308594, + "learning_rate": 4.299209649451918e-06, + "loss": 0.1304, + "step": 46505 + }, + { + "epoch": 0.8294866764170798, + "grad_norm": 0.18472617864608765, + "learning_rate": 4.298336984701862e-06, + "loss": 0.0747, + "step": 46506 + }, + { + "epoch": 0.8295045125387935, + "grad_norm": 0.2599087953567505, + "learning_rate": 4.297464400198595e-06, + "loss": 0.1407, + "step": 46507 + }, + { + "epoch": 0.8295223486605072, + "grad_norm": 0.24152512848377228, + "learning_rate": 4.296591895945512e-06, + "loss": 0.1273, + "step": 46508 + }, + { + "epoch": 0.8295401847822209, + "grad_norm": 0.40741512179374695, + "learning_rate": 4.295719471945983e-06, + "loss": 0.1459, + "step": 46509 + }, + { + "epoch": 0.8295580209039346, + "grad_norm": 0.19509044289588928, + "learning_rate": 4.2948471282034054e-06, + "loss": 0.1089, + "step": 46510 + }, + { + "epoch": 0.8295758570256483, + "grad_norm": 0.23058395087718964, + "learning_rate": 4.293974864721148e-06, + "loss": 0.079, + "step": 46511 + }, + { + "epoch": 0.8295936931473621, + "grad_norm": 0.26388972997665405, + "learning_rate": 4.293102681502592e-06, + "loss": 0.1081, + "step": 46512 + }, + { + "epoch": 0.8296115292690758, + "grad_norm": 0.28935322165489197, + "learning_rate": 4.292230578551126e-06, + "loss": 0.1199, + "step": 46513 + }, + { + "epoch": 0.8296293653907895, + "grad_norm": 0.32966896891593933, + "learning_rate": 4.291358555870129e-06, + "loss": 0.1422, + "step": 46514 + }, + { + "epoch": 0.8296472015125032, + "grad_norm": 0.2210475355386734, + "learning_rate": 4.290486613462977e-06, + "loss": 0.0911, + "step": 46515 + }, + { + "epoch": 0.8296650376342168, + "grad_norm": 0.23138806223869324, + "learning_rate": 4.289614751333046e-06, + "loss": 0.0849, + "step": 46516 + }, + { + "epoch": 0.8296828737559305, + "grad_norm": 0.34146055579185486, + "learning_rate": 4.288742969483727e-06, + "loss": 0.1037, + "step": 46517 + }, + { + "epoch": 0.8297007098776442, + "grad_norm": 0.29106923937797546, + "learning_rate": 4.287871267918395e-06, + "loss": 0.0816, + "step": 46518 + }, + { + "epoch": 0.8297185459993579, + "grad_norm": 0.24652424454689026, + "learning_rate": 4.286999646640428e-06, + "loss": 0.1399, + "step": 46519 + }, + { + "epoch": 0.8297363821210716, + "grad_norm": 0.38204681873321533, + "learning_rate": 4.286128105653203e-06, + "loss": 0.1478, + "step": 46520 + }, + { + "epoch": 0.8297542182427853, + "grad_norm": 0.18774308264255524, + "learning_rate": 4.2852566449600925e-06, + "loss": 0.0624, + "step": 46521 + }, + { + "epoch": 0.829772054364499, + "grad_norm": 0.19842013716697693, + "learning_rate": 4.284385264564483e-06, + "loss": 0.1161, + "step": 46522 + }, + { + "epoch": 0.8297898904862127, + "grad_norm": 0.2144027203321457, + "learning_rate": 4.283513964469759e-06, + "loss": 0.1017, + "step": 46523 + }, + { + "epoch": 0.8298077266079263, + "grad_norm": 0.24248646199703217, + "learning_rate": 4.282642744679289e-06, + "loss": 0.1182, + "step": 46524 + }, + { + "epoch": 0.82982556272964, + "grad_norm": 0.2572486102581024, + "learning_rate": 4.281771605196444e-06, + "loss": 0.1021, + "step": 46525 + }, + { + "epoch": 0.8298433988513537, + "grad_norm": 0.25692108273506165, + "learning_rate": 4.280900546024616e-06, + "loss": 0.1206, + "step": 46526 + }, + { + "epoch": 0.8298612349730674, + "grad_norm": 0.19257889688014984, + "learning_rate": 4.280029567167174e-06, + "loss": 0.0551, + "step": 46527 + }, + { + "epoch": 0.8298790710947811, + "grad_norm": 0.20198261737823486, + "learning_rate": 4.279158668627492e-06, + "loss": 0.1652, + "step": 46528 + }, + { + "epoch": 0.8298969072164949, + "grad_norm": 0.19261956214904785, + "learning_rate": 4.278287850408941e-06, + "loss": 0.0934, + "step": 46529 + }, + { + "epoch": 0.8299147433382086, + "grad_norm": 0.22037526965141296, + "learning_rate": 4.2774171125149115e-06, + "loss": 0.108, + "step": 46530 + }, + { + "epoch": 0.8299325794599223, + "grad_norm": 0.22742339968681335, + "learning_rate": 4.276546454948768e-06, + "loss": 0.1072, + "step": 46531 + }, + { + "epoch": 0.829950415581636, + "grad_norm": 0.22602617740631104, + "learning_rate": 4.275675877713891e-06, + "loss": 0.071, + "step": 46532 + }, + { + "epoch": 0.8299682517033496, + "grad_norm": 0.3259783387184143, + "learning_rate": 4.274805380813651e-06, + "loss": 0.123, + "step": 46533 + }, + { + "epoch": 0.8299860878250633, + "grad_norm": 0.2571578621864319, + "learning_rate": 4.273934964251419e-06, + "loss": 0.1206, + "step": 46534 + }, + { + "epoch": 0.830003923946777, + "grad_norm": 0.3353583514690399, + "learning_rate": 4.273064628030576e-06, + "loss": 0.0905, + "step": 46535 + }, + { + "epoch": 0.8300217600684907, + "grad_norm": 0.2520037591457367, + "learning_rate": 4.272194372154492e-06, + "loss": 0.1502, + "step": 46536 + }, + { + "epoch": 0.8300395961902044, + "grad_norm": 0.2779551148414612, + "learning_rate": 4.2713241966265445e-06, + "loss": 0.1721, + "step": 46537 + }, + { + "epoch": 0.8300574323119181, + "grad_norm": 0.2768584191799164, + "learning_rate": 4.2704541014501e-06, + "loss": 0.138, + "step": 46538 + }, + { + "epoch": 0.8300752684336318, + "grad_norm": 0.25665348768234253, + "learning_rate": 4.269584086628539e-06, + "loss": 0.0784, + "step": 46539 + }, + { + "epoch": 0.8300931045553455, + "grad_norm": 0.34828171133995056, + "learning_rate": 4.2687141521652315e-06, + "loss": 0.1183, + "step": 46540 + }, + { + "epoch": 0.8301109406770592, + "grad_norm": 0.26388639211654663, + "learning_rate": 4.267844298063547e-06, + "loss": 0.1164, + "step": 46541 + }, + { + "epoch": 0.8301287767987728, + "grad_norm": 0.29912909865379333, + "learning_rate": 4.266974524326856e-06, + "loss": 0.1079, + "step": 46542 + }, + { + "epoch": 0.8301466129204865, + "grad_norm": 0.25899645686149597, + "learning_rate": 4.2661048309585335e-06, + "loss": 0.1096, + "step": 46543 + }, + { + "epoch": 0.8301644490422002, + "grad_norm": 0.3737677037715912, + "learning_rate": 4.2652352179619546e-06, + "loss": 0.1594, + "step": 46544 + }, + { + "epoch": 0.830182285163914, + "grad_norm": 0.3143066465854645, + "learning_rate": 4.264365685340482e-06, + "loss": 0.1373, + "step": 46545 + }, + { + "epoch": 0.8302001212856277, + "grad_norm": 0.27195504307746887, + "learning_rate": 4.2634962330974945e-06, + "loss": 0.088, + "step": 46546 + }, + { + "epoch": 0.8302179574073414, + "grad_norm": 0.31021612882614136, + "learning_rate": 4.262626861236346e-06, + "loss": 0.1611, + "step": 46547 + }, + { + "epoch": 0.8302357935290551, + "grad_norm": 0.2511056661605835, + "learning_rate": 4.261757569760427e-06, + "loss": 0.1415, + "step": 46548 + }, + { + "epoch": 0.8302536296507688, + "grad_norm": 0.3210453987121582, + "learning_rate": 4.260888358673093e-06, + "loss": 0.137, + "step": 46549 + }, + { + "epoch": 0.8302714657724825, + "grad_norm": 0.29189273715019226, + "learning_rate": 4.260019227977724e-06, + "loss": 0.1781, + "step": 46550 + }, + { + "epoch": 0.8302893018941961, + "grad_norm": 0.2712114453315735, + "learning_rate": 4.259150177677679e-06, + "loss": 0.0818, + "step": 46551 + }, + { + "epoch": 0.8303071380159098, + "grad_norm": 0.22185178101062775, + "learning_rate": 4.2582812077763375e-06, + "loss": 0.1278, + "step": 46552 + }, + { + "epoch": 0.8303249741376235, + "grad_norm": 0.2672707140445709, + "learning_rate": 4.2574123182770595e-06, + "loss": 0.112, + "step": 46553 + }, + { + "epoch": 0.8303428102593372, + "grad_norm": 0.1874978393316269, + "learning_rate": 4.256543509183219e-06, + "loss": 0.0852, + "step": 46554 + }, + { + "epoch": 0.8303606463810509, + "grad_norm": 0.34851282835006714, + "learning_rate": 4.2556747804981724e-06, + "loss": 0.099, + "step": 46555 + }, + { + "epoch": 0.8303784825027646, + "grad_norm": 0.33474960923194885, + "learning_rate": 4.2548061322253e-06, + "loss": 0.0762, + "step": 46556 + }, + { + "epoch": 0.8303963186244783, + "grad_norm": 0.2960427403450012, + "learning_rate": 4.253937564367968e-06, + "loss": 0.1513, + "step": 46557 + }, + { + "epoch": 0.830414154746192, + "grad_norm": 0.2283165454864502, + "learning_rate": 4.2530690769295365e-06, + "loss": 0.0627, + "step": 46558 + }, + { + "epoch": 0.8304319908679056, + "grad_norm": 0.24846839904785156, + "learning_rate": 4.2522006699133754e-06, + "loss": 0.0777, + "step": 46559 + }, + { + "epoch": 0.8304498269896193, + "grad_norm": 0.21080508828163147, + "learning_rate": 4.2513323433228445e-06, + "loss": 0.0649, + "step": 46560 + }, + { + "epoch": 0.830467663111333, + "grad_norm": 0.24394956231117249, + "learning_rate": 4.2504640971613216e-06, + "loss": 0.1151, + "step": 46561 + }, + { + "epoch": 0.8304854992330468, + "grad_norm": 0.2898518741130829, + "learning_rate": 4.249595931432168e-06, + "loss": 0.1126, + "step": 46562 + }, + { + "epoch": 0.8305033353547605, + "grad_norm": 0.27317920327186584, + "learning_rate": 4.248727846138742e-06, + "loss": 0.0922, + "step": 46563 + }, + { + "epoch": 0.8305211714764742, + "grad_norm": 0.2531094551086426, + "learning_rate": 4.247859841284418e-06, + "loss": 0.0733, + "step": 46564 + }, + { + "epoch": 0.8305390075981879, + "grad_norm": 0.219723641872406, + "learning_rate": 4.24699191687255e-06, + "loss": 0.1047, + "step": 46565 + }, + { + "epoch": 0.8305568437199016, + "grad_norm": 0.2982277274131775, + "learning_rate": 4.246124072906518e-06, + "loss": 0.1106, + "step": 46566 + }, + { + "epoch": 0.8305746798416153, + "grad_norm": 0.23950234055519104, + "learning_rate": 4.2452563093896754e-06, + "loss": 0.1226, + "step": 46567 + }, + { + "epoch": 0.830592515963329, + "grad_norm": 0.289177805185318, + "learning_rate": 4.244388626325382e-06, + "loss": 0.1303, + "step": 46568 + }, + { + "epoch": 0.8306103520850426, + "grad_norm": 0.32485687732696533, + "learning_rate": 4.243521023717015e-06, + "loss": 0.1227, + "step": 46569 + }, + { + "epoch": 0.8306281882067563, + "grad_norm": 0.3352840542793274, + "learning_rate": 4.242653501567928e-06, + "loss": 0.1133, + "step": 46570 + }, + { + "epoch": 0.83064602432847, + "grad_norm": 0.3036191463470459, + "learning_rate": 4.241786059881484e-06, + "loss": 0.0958, + "step": 46571 + }, + { + "epoch": 0.8306638604501837, + "grad_norm": 0.335553914308548, + "learning_rate": 4.24091869866105e-06, + "loss": 0.0876, + "step": 46572 + }, + { + "epoch": 0.8306816965718974, + "grad_norm": 0.24407418072223663, + "learning_rate": 4.2400514179099765e-06, + "loss": 0.1022, + "step": 46573 + }, + { + "epoch": 0.8306995326936111, + "grad_norm": 0.3260498046875, + "learning_rate": 4.2391842176316425e-06, + "loss": 0.1167, + "step": 46574 + }, + { + "epoch": 0.8307173688153248, + "grad_norm": 0.27406740188598633, + "learning_rate": 4.238317097829397e-06, + "loss": 0.073, + "step": 46575 + }, + { + "epoch": 0.8307352049370385, + "grad_norm": 0.23395362496376038, + "learning_rate": 4.237450058506603e-06, + "loss": 0.0834, + "step": 46576 + }, + { + "epoch": 0.8307530410587521, + "grad_norm": 0.2595573365688324, + "learning_rate": 4.236583099666628e-06, + "loss": 0.0924, + "step": 46577 + }, + { + "epoch": 0.8307708771804658, + "grad_norm": 0.22945165634155273, + "learning_rate": 4.235716221312821e-06, + "loss": 0.1095, + "step": 46578 + }, + { + "epoch": 0.8307887133021796, + "grad_norm": 0.2776692509651184, + "learning_rate": 4.234849423448559e-06, + "loss": 0.0843, + "step": 46579 + }, + { + "epoch": 0.8308065494238933, + "grad_norm": 0.3470524251461029, + "learning_rate": 4.233982706077191e-06, + "loss": 0.1758, + "step": 46580 + }, + { + "epoch": 0.830824385545607, + "grad_norm": 0.29722994565963745, + "learning_rate": 4.233116069202072e-06, + "loss": 0.0907, + "step": 46581 + }, + { + "epoch": 0.8308422216673207, + "grad_norm": 0.4577447772026062, + "learning_rate": 4.2322495128265745e-06, + "loss": 0.0909, + "step": 46582 + }, + { + "epoch": 0.8308600577890344, + "grad_norm": 0.3017619252204895, + "learning_rate": 4.231383036954051e-06, + "loss": 0.0982, + "step": 46583 + }, + { + "epoch": 0.8308778939107481, + "grad_norm": 0.21828629076480865, + "learning_rate": 4.230516641587859e-06, + "loss": 0.1159, + "step": 46584 + }, + { + "epoch": 0.8308957300324618, + "grad_norm": 0.374336302280426, + "learning_rate": 4.229650326731352e-06, + "loss": 0.0989, + "step": 46585 + }, + { + "epoch": 0.8309135661541754, + "grad_norm": 0.31565165519714355, + "learning_rate": 4.2287840923879025e-06, + "loss": 0.1038, + "step": 46586 + }, + { + "epoch": 0.8309314022758891, + "grad_norm": 0.323011577129364, + "learning_rate": 4.227917938560857e-06, + "loss": 0.1221, + "step": 46587 + }, + { + "epoch": 0.8309492383976028, + "grad_norm": 0.47113168239593506, + "learning_rate": 4.227051865253579e-06, + "loss": 0.1116, + "step": 46588 + }, + { + "epoch": 0.8309670745193165, + "grad_norm": 0.3476635813713074, + "learning_rate": 4.226185872469421e-06, + "loss": 0.1423, + "step": 46589 + }, + { + "epoch": 0.8309849106410302, + "grad_norm": 0.2949986755847931, + "learning_rate": 4.225319960211735e-06, + "loss": 0.1232, + "step": 46590 + }, + { + "epoch": 0.8310027467627439, + "grad_norm": 0.30801641941070557, + "learning_rate": 4.22445412848389e-06, + "loss": 0.0947, + "step": 46591 + }, + { + "epoch": 0.8310205828844576, + "grad_norm": 0.3262016475200653, + "learning_rate": 4.223588377289231e-06, + "loss": 0.0994, + "step": 46592 + }, + { + "epoch": 0.8310384190061713, + "grad_norm": 0.292354017496109, + "learning_rate": 4.222722706631127e-06, + "loss": 0.1307, + "step": 46593 + }, + { + "epoch": 0.831056255127885, + "grad_norm": 0.24663913249969482, + "learning_rate": 4.22185711651292e-06, + "loss": 0.0575, + "step": 46594 + }, + { + "epoch": 0.8310740912495986, + "grad_norm": 0.2890048623085022, + "learning_rate": 4.220991606937974e-06, + "loss": 0.1745, + "step": 46595 + }, + { + "epoch": 0.8310919273713124, + "grad_norm": 0.2675207257270813, + "learning_rate": 4.220126177909645e-06, + "loss": 0.0928, + "step": 46596 + }, + { + "epoch": 0.8311097634930261, + "grad_norm": 0.21993179619312286, + "learning_rate": 4.219260829431282e-06, + "loss": 0.0773, + "step": 46597 + }, + { + "epoch": 0.8311275996147398, + "grad_norm": 0.20542635023593903, + "learning_rate": 4.218395561506236e-06, + "loss": 0.1015, + "step": 46598 + }, + { + "epoch": 0.8311454357364535, + "grad_norm": 0.2960578501224518, + "learning_rate": 4.217530374137873e-06, + "loss": 0.147, + "step": 46599 + }, + { + "epoch": 0.8311632718581672, + "grad_norm": 0.2885647118091583, + "learning_rate": 4.216665267329539e-06, + "loss": 0.0968, + "step": 46600 + }, + { + "epoch": 0.8311811079798809, + "grad_norm": 0.340796560049057, + "learning_rate": 4.215800241084591e-06, + "loss": 0.0823, + "step": 46601 + }, + { + "epoch": 0.8311989441015946, + "grad_norm": 0.2635718584060669, + "learning_rate": 4.2149352954063775e-06, + "loss": 0.1112, + "step": 46602 + }, + { + "epoch": 0.8312167802233082, + "grad_norm": 0.28207260370254517, + "learning_rate": 4.21407043029825e-06, + "loss": 0.1301, + "step": 46603 + }, + { + "epoch": 0.8312346163450219, + "grad_norm": 0.31943976879119873, + "learning_rate": 4.213205645763569e-06, + "loss": 0.1206, + "step": 46604 + }, + { + "epoch": 0.8312524524667356, + "grad_norm": 0.3963462710380554, + "learning_rate": 4.212340941805676e-06, + "loss": 0.1239, + "step": 46605 + }, + { + "epoch": 0.8312702885884493, + "grad_norm": 0.23490403592586517, + "learning_rate": 4.211476318427937e-06, + "loss": 0.0965, + "step": 46606 + }, + { + "epoch": 0.831288124710163, + "grad_norm": 0.2724912464618683, + "learning_rate": 4.210611775633688e-06, + "loss": 0.1058, + "step": 46607 + }, + { + "epoch": 0.8313059608318767, + "grad_norm": 0.3340529203414917, + "learning_rate": 4.209747313426296e-06, + "loss": 0.0825, + "step": 46608 + }, + { + "epoch": 0.8313237969535904, + "grad_norm": 0.21293847262859344, + "learning_rate": 4.208882931809105e-06, + "loss": 0.0949, + "step": 46609 + }, + { + "epoch": 0.8313416330753041, + "grad_norm": 0.2156914323568344, + "learning_rate": 4.208018630785462e-06, + "loss": 0.1264, + "step": 46610 + }, + { + "epoch": 0.8313594691970178, + "grad_norm": 0.32299378514289856, + "learning_rate": 4.207154410358716e-06, + "loss": 0.1143, + "step": 46611 + }, + { + "epoch": 0.8313773053187314, + "grad_norm": 0.64911288022995, + "learning_rate": 4.206290270532226e-06, + "loss": 0.1278, + "step": 46612 + }, + { + "epoch": 0.8313951414404452, + "grad_norm": 0.3406936526298523, + "learning_rate": 4.205426211309338e-06, + "loss": 0.1239, + "step": 46613 + }, + { + "epoch": 0.8314129775621589, + "grad_norm": 0.3430907130241394, + "learning_rate": 4.204562232693399e-06, + "loss": 0.106, + "step": 46614 + }, + { + "epoch": 0.8314308136838726, + "grad_norm": 0.2644566595554352, + "learning_rate": 4.203698334687761e-06, + "loss": 0.1303, + "step": 46615 + }, + { + "epoch": 0.8314486498055863, + "grad_norm": 0.3030645251274109, + "learning_rate": 4.2028345172957614e-06, + "loss": 0.1029, + "step": 46616 + }, + { + "epoch": 0.8314664859273, + "grad_norm": 0.24397756159305573, + "learning_rate": 4.201970780520767e-06, + "loss": 0.148, + "step": 46617 + }, + { + "epoch": 0.8314843220490137, + "grad_norm": 0.24367259442806244, + "learning_rate": 4.2011071243661185e-06, + "loss": 0.1221, + "step": 46618 + }, + { + "epoch": 0.8315021581707274, + "grad_norm": 0.3192799389362335, + "learning_rate": 4.200243548835156e-06, + "loss": 0.124, + "step": 46619 + }, + { + "epoch": 0.8315199942924411, + "grad_norm": 0.2924949824810028, + "learning_rate": 4.199380053931232e-06, + "loss": 0.1145, + "step": 46620 + }, + { + "epoch": 0.8315378304141547, + "grad_norm": 0.2935815453529358, + "learning_rate": 4.198516639657701e-06, + "loss": 0.0927, + "step": 46621 + }, + { + "epoch": 0.8315556665358684, + "grad_norm": 0.2474581003189087, + "learning_rate": 4.197653306017904e-06, + "loss": 0.1291, + "step": 46622 + }, + { + "epoch": 0.8315735026575821, + "grad_norm": 0.24212592840194702, + "learning_rate": 4.19679005301519e-06, + "loss": 0.0827, + "step": 46623 + }, + { + "epoch": 0.8315913387792958, + "grad_norm": 0.2553001344203949, + "learning_rate": 4.195926880652895e-06, + "loss": 0.0955, + "step": 46624 + }, + { + "epoch": 0.8316091749010095, + "grad_norm": 0.30935606360435486, + "learning_rate": 4.195063788934381e-06, + "loss": 0.1389, + "step": 46625 + }, + { + "epoch": 0.8316270110227232, + "grad_norm": 0.2515186071395874, + "learning_rate": 4.194200777862984e-06, + "loss": 0.0863, + "step": 46626 + }, + { + "epoch": 0.8316448471444369, + "grad_norm": 0.2340743988752365, + "learning_rate": 4.1933378474420495e-06, + "loss": 0.0684, + "step": 46627 + }, + { + "epoch": 0.8316626832661506, + "grad_norm": 0.28276729583740234, + "learning_rate": 4.192474997674925e-06, + "loss": 0.1053, + "step": 46628 + }, + { + "epoch": 0.8316805193878642, + "grad_norm": 0.2271561175584793, + "learning_rate": 4.191612228564948e-06, + "loss": 0.0783, + "step": 46629 + }, + { + "epoch": 0.831698355509578, + "grad_norm": 0.2162674516439438, + "learning_rate": 4.190749540115477e-06, + "loss": 0.086, + "step": 46630 + }, + { + "epoch": 0.8317161916312917, + "grad_norm": 0.2562078535556793, + "learning_rate": 4.189886932329845e-06, + "loss": 0.0698, + "step": 46631 + }, + { + "epoch": 0.8317340277530054, + "grad_norm": 0.33728837966918945, + "learning_rate": 4.189024405211394e-06, + "loss": 0.1417, + "step": 46632 + }, + { + "epoch": 0.8317518638747191, + "grad_norm": 0.23697872459888458, + "learning_rate": 4.188161958763481e-06, + "loss": 0.112, + "step": 46633 + }, + { + "epoch": 0.8317696999964328, + "grad_norm": 0.2588939070701599, + "learning_rate": 4.18729959298943e-06, + "loss": 0.0676, + "step": 46634 + }, + { + "epoch": 0.8317875361181465, + "grad_norm": 0.2359021157026291, + "learning_rate": 4.186437307892604e-06, + "loss": 0.121, + "step": 46635 + }, + { + "epoch": 0.8318053722398602, + "grad_norm": 0.24883480370044708, + "learning_rate": 4.185575103476333e-06, + "loss": 0.0929, + "step": 46636 + }, + { + "epoch": 0.8318232083615739, + "grad_norm": 0.25182873010635376, + "learning_rate": 4.184712979743957e-06, + "loss": 0.1416, + "step": 46637 + }, + { + "epoch": 0.8318410444832876, + "grad_norm": 0.41528597474098206, + "learning_rate": 4.18385093669883e-06, + "loss": 0.2181, + "step": 46638 + }, + { + "epoch": 0.8318588806050012, + "grad_norm": 0.2870693504810333, + "learning_rate": 4.182988974344285e-06, + "loss": 0.1018, + "step": 46639 + }, + { + "epoch": 0.8318767167267149, + "grad_norm": 0.3066214323043823, + "learning_rate": 4.182127092683663e-06, + "loss": 0.1379, + "step": 46640 + }, + { + "epoch": 0.8318945528484286, + "grad_norm": 0.22343546152114868, + "learning_rate": 4.18126529172031e-06, + "loss": 0.0724, + "step": 46641 + }, + { + "epoch": 0.8319123889701423, + "grad_norm": 0.2248062938451767, + "learning_rate": 4.1804035714575555e-06, + "loss": 0.1227, + "step": 46642 + }, + { + "epoch": 0.831930225091856, + "grad_norm": 0.22939786314964294, + "learning_rate": 4.179541931898753e-06, + "loss": 0.1078, + "step": 46643 + }, + { + "epoch": 0.8319480612135697, + "grad_norm": 0.23790012300014496, + "learning_rate": 4.178680373047239e-06, + "loss": 0.0855, + "step": 46644 + }, + { + "epoch": 0.8319658973352834, + "grad_norm": 0.3449121117591858, + "learning_rate": 4.177818894906352e-06, + "loss": 0.121, + "step": 46645 + }, + { + "epoch": 0.8319837334569972, + "grad_norm": 0.24393446743488312, + "learning_rate": 4.176957497479423e-06, + "loss": 0.0791, + "step": 46646 + }, + { + "epoch": 0.8320015695787109, + "grad_norm": 0.3197662830352783, + "learning_rate": 4.176096180769798e-06, + "loss": 0.0727, + "step": 46647 + }, + { + "epoch": 0.8320194057004245, + "grad_norm": 0.24342453479766846, + "learning_rate": 4.175234944780823e-06, + "loss": 0.0936, + "step": 46648 + }, + { + "epoch": 0.8320372418221382, + "grad_norm": 0.26099687814712524, + "learning_rate": 4.174373789515831e-06, + "loss": 0.1441, + "step": 46649 + }, + { + "epoch": 0.8320550779438519, + "grad_norm": 0.17786675691604614, + "learning_rate": 4.173512714978153e-06, + "loss": 0.0614, + "step": 46650 + }, + { + "epoch": 0.8320729140655656, + "grad_norm": 0.2534498870372772, + "learning_rate": 4.172651721171139e-06, + "loss": 0.1234, + "step": 46651 + }, + { + "epoch": 0.8320907501872793, + "grad_norm": 0.29591962695121765, + "learning_rate": 4.17179080809812e-06, + "loss": 0.1142, + "step": 46652 + }, + { + "epoch": 0.832108586308993, + "grad_norm": 0.36470091342926025, + "learning_rate": 4.170929975762433e-06, + "loss": 0.1495, + "step": 46653 + }, + { + "epoch": 0.8321264224307067, + "grad_norm": 0.3425239324569702, + "learning_rate": 4.170069224167416e-06, + "loss": 0.1515, + "step": 46654 + }, + { + "epoch": 0.8321442585524204, + "grad_norm": 0.3186365067958832, + "learning_rate": 4.1692085533164005e-06, + "loss": 0.086, + "step": 46655 + }, + { + "epoch": 0.832162094674134, + "grad_norm": 0.38707998394966125, + "learning_rate": 4.168347963212732e-06, + "loss": 0.166, + "step": 46656 + }, + { + "epoch": 0.8321799307958477, + "grad_norm": 0.306901216506958, + "learning_rate": 4.167487453859742e-06, + "loss": 0.1191, + "step": 46657 + }, + { + "epoch": 0.8321977669175614, + "grad_norm": 0.34486865997314453, + "learning_rate": 4.166627025260766e-06, + "loss": 0.1032, + "step": 46658 + }, + { + "epoch": 0.8322156030392751, + "grad_norm": 0.34938499331474304, + "learning_rate": 4.165766677419133e-06, + "loss": 0.1291, + "step": 46659 + }, + { + "epoch": 0.8322334391609888, + "grad_norm": 0.21423789858818054, + "learning_rate": 4.1649064103381905e-06, + "loss": 0.1111, + "step": 46660 + }, + { + "epoch": 0.8322512752827025, + "grad_norm": 0.29759204387664795, + "learning_rate": 4.164046224021262e-06, + "loss": 0.0982, + "step": 46661 + }, + { + "epoch": 0.8322691114044162, + "grad_norm": 0.2146986573934555, + "learning_rate": 4.163186118471691e-06, + "loss": 0.1001, + "step": 46662 + }, + { + "epoch": 0.83228694752613, + "grad_norm": 0.308400422334671, + "learning_rate": 4.162326093692803e-06, + "loss": 0.0859, + "step": 46663 + }, + { + "epoch": 0.8323047836478437, + "grad_norm": 0.25380659103393555, + "learning_rate": 4.16146614968794e-06, + "loss": 0.0856, + "step": 46664 + }, + { + "epoch": 0.8323226197695573, + "grad_norm": 0.25684964656829834, + "learning_rate": 4.160606286460433e-06, + "loss": 0.0846, + "step": 46665 + }, + { + "epoch": 0.832340455891271, + "grad_norm": 0.2833910584449768, + "learning_rate": 4.159746504013615e-06, + "loss": 0.1369, + "step": 46666 + }, + { + "epoch": 0.8323582920129847, + "grad_norm": 0.21197587251663208, + "learning_rate": 4.158886802350817e-06, + "loss": 0.1061, + "step": 46667 + }, + { + "epoch": 0.8323761281346984, + "grad_norm": 0.25691279768943787, + "learning_rate": 4.158027181475366e-06, + "loss": 0.1277, + "step": 46668 + }, + { + "epoch": 0.8323939642564121, + "grad_norm": 0.38181617856025696, + "learning_rate": 4.1571676413906055e-06, + "loss": 0.0966, + "step": 46669 + }, + { + "epoch": 0.8324118003781258, + "grad_norm": 0.33105215430259705, + "learning_rate": 4.156308182099861e-06, + "loss": 0.1494, + "step": 46670 + }, + { + "epoch": 0.8324296364998395, + "grad_norm": 0.2425295114517212, + "learning_rate": 4.155448803606466e-06, + "loss": 0.087, + "step": 46671 + }, + { + "epoch": 0.8324474726215532, + "grad_norm": 0.30603736639022827, + "learning_rate": 4.154589505913745e-06, + "loss": 0.1143, + "step": 46672 + }, + { + "epoch": 0.8324653087432669, + "grad_norm": 0.288798063993454, + "learning_rate": 4.153730289025043e-06, + "loss": 0.1005, + "step": 46673 + }, + { + "epoch": 0.8324831448649805, + "grad_norm": 0.3551574945449829, + "learning_rate": 4.1528711529436795e-06, + "loss": 0.0958, + "step": 46674 + }, + { + "epoch": 0.8325009809866942, + "grad_norm": 0.3450653851032257, + "learning_rate": 4.152012097672983e-06, + "loss": 0.0912, + "step": 46675 + }, + { + "epoch": 0.8325188171084079, + "grad_norm": 0.2784174978733063, + "learning_rate": 4.151153123216286e-06, + "loss": 0.0918, + "step": 46676 + }, + { + "epoch": 0.8325366532301216, + "grad_norm": 0.2502913177013397, + "learning_rate": 4.150294229576931e-06, + "loss": 0.1075, + "step": 46677 + }, + { + "epoch": 0.8325544893518353, + "grad_norm": 0.25268688797950745, + "learning_rate": 4.149435416758235e-06, + "loss": 0.1509, + "step": 46678 + }, + { + "epoch": 0.832572325473549, + "grad_norm": 0.30086496472358704, + "learning_rate": 4.1485766847635275e-06, + "loss": 0.1271, + "step": 46679 + }, + { + "epoch": 0.8325901615952628, + "grad_norm": 0.5147128701210022, + "learning_rate": 4.14771803359614e-06, + "loss": 0.1055, + "step": 46680 + }, + { + "epoch": 0.8326079977169765, + "grad_norm": 0.23225994408130646, + "learning_rate": 4.146859463259392e-06, + "loss": 0.1267, + "step": 46681 + }, + { + "epoch": 0.8326258338386902, + "grad_norm": 0.3003656268119812, + "learning_rate": 4.1460009737566255e-06, + "loss": 0.1079, + "step": 46682 + }, + { + "epoch": 0.8326436699604038, + "grad_norm": 0.24094876646995544, + "learning_rate": 4.145142565091165e-06, + "loss": 0.089, + "step": 46683 + }, + { + "epoch": 0.8326615060821175, + "grad_norm": 0.2228277176618576, + "learning_rate": 4.14428423726633e-06, + "loss": 0.1076, + "step": 46684 + }, + { + "epoch": 0.8326793422038312, + "grad_norm": 0.2801830470561981, + "learning_rate": 4.143425990285449e-06, + "loss": 0.1062, + "step": 46685 + }, + { + "epoch": 0.8326971783255449, + "grad_norm": 0.2542019784450531, + "learning_rate": 4.14256782415186e-06, + "loss": 0.1087, + "step": 46686 + }, + { + "epoch": 0.8327150144472586, + "grad_norm": 0.24306438863277435, + "learning_rate": 4.141709738868879e-06, + "loss": 0.1016, + "step": 46687 + }, + { + "epoch": 0.8327328505689723, + "grad_norm": 0.24475422501564026, + "learning_rate": 4.140851734439832e-06, + "loss": 0.1119, + "step": 46688 + }, + { + "epoch": 0.832750686690686, + "grad_norm": 0.22002215683460236, + "learning_rate": 4.139993810868053e-06, + "loss": 0.1236, + "step": 46689 + }, + { + "epoch": 0.8327685228123997, + "grad_norm": 0.3026716709136963, + "learning_rate": 4.139135968156857e-06, + "loss": 0.1357, + "step": 46690 + }, + { + "epoch": 0.8327863589341133, + "grad_norm": 0.23907902836799622, + "learning_rate": 4.138278206309582e-06, + "loss": 0.0865, + "step": 46691 + }, + { + "epoch": 0.832804195055827, + "grad_norm": 0.27678099274635315, + "learning_rate": 4.137420525329544e-06, + "loss": 0.0992, + "step": 46692 + }, + { + "epoch": 0.8328220311775407, + "grad_norm": 0.20918892323970795, + "learning_rate": 4.136562925220072e-06, + "loss": 0.0913, + "step": 46693 + }, + { + "epoch": 0.8328398672992544, + "grad_norm": 0.28752925992012024, + "learning_rate": 4.13570540598448e-06, + "loss": 0.0961, + "step": 46694 + }, + { + "epoch": 0.8328577034209681, + "grad_norm": 0.2905171811580658, + "learning_rate": 4.134847967626107e-06, + "loss": 0.1062, + "step": 46695 + }, + { + "epoch": 0.8328755395426818, + "grad_norm": 0.34169983863830566, + "learning_rate": 4.1339906101482724e-06, + "loss": 0.1313, + "step": 46696 + }, + { + "epoch": 0.8328933756643956, + "grad_norm": 0.2458985447883606, + "learning_rate": 4.133133333554292e-06, + "loss": 0.0977, + "step": 46697 + }, + { + "epoch": 0.8329112117861093, + "grad_norm": 0.1946738064289093, + "learning_rate": 4.132276137847491e-06, + "loss": 0.0814, + "step": 46698 + }, + { + "epoch": 0.832929047907823, + "grad_norm": 0.251022607088089, + "learning_rate": 4.131419023031202e-06, + "loss": 0.1238, + "step": 46699 + }, + { + "epoch": 0.8329468840295366, + "grad_norm": 0.32583874464035034, + "learning_rate": 4.130561989108739e-06, + "loss": 0.1181, + "step": 46700 + }, + { + "epoch": 0.8329647201512503, + "grad_norm": 0.23731474578380585, + "learning_rate": 4.129705036083426e-06, + "loss": 0.1067, + "step": 46701 + }, + { + "epoch": 0.832982556272964, + "grad_norm": 0.29119330644607544, + "learning_rate": 4.128848163958579e-06, + "loss": 0.0569, + "step": 46702 + }, + { + "epoch": 0.8330003923946777, + "grad_norm": 0.4712277352809906, + "learning_rate": 4.127991372737525e-06, + "loss": 0.1925, + "step": 46703 + }, + { + "epoch": 0.8330182285163914, + "grad_norm": 0.4032341241836548, + "learning_rate": 4.127134662423593e-06, + "loss": 0.1282, + "step": 46704 + }, + { + "epoch": 0.8330360646381051, + "grad_norm": 0.28733837604522705, + "learning_rate": 4.126278033020095e-06, + "loss": 0.1509, + "step": 46705 + }, + { + "epoch": 0.8330539007598188, + "grad_norm": 0.3174244165420532, + "learning_rate": 4.125421484530351e-06, + "loss": 0.1459, + "step": 46706 + }, + { + "epoch": 0.8330717368815325, + "grad_norm": 0.22439810633659363, + "learning_rate": 4.12456501695768e-06, + "loss": 0.0993, + "step": 46707 + }, + { + "epoch": 0.8330895730032462, + "grad_norm": 0.3191872537136078, + "learning_rate": 4.123708630305409e-06, + "loss": 0.099, + "step": 46708 + }, + { + "epoch": 0.8331074091249598, + "grad_norm": 0.31039050221443176, + "learning_rate": 4.122852324576856e-06, + "loss": 0.0942, + "step": 46709 + }, + { + "epoch": 0.8331252452466735, + "grad_norm": 0.3705736994743347, + "learning_rate": 4.121996099775335e-06, + "loss": 0.1404, + "step": 46710 + }, + { + "epoch": 0.8331430813683872, + "grad_norm": 0.20622700452804565, + "learning_rate": 4.121139955904166e-06, + "loss": 0.1335, + "step": 46711 + }, + { + "epoch": 0.8331609174901009, + "grad_norm": 0.20084848999977112, + "learning_rate": 4.120283892966673e-06, + "loss": 0.1195, + "step": 46712 + }, + { + "epoch": 0.8331787536118146, + "grad_norm": 0.24524636566638947, + "learning_rate": 4.119427910966173e-06, + "loss": 0.1232, + "step": 46713 + }, + { + "epoch": 0.8331965897335284, + "grad_norm": 0.29304173588752747, + "learning_rate": 4.1185720099059825e-06, + "loss": 0.0549, + "step": 46714 + }, + { + "epoch": 0.8332144258552421, + "grad_norm": 0.2700052559375763, + "learning_rate": 4.1177161897894116e-06, + "loss": 0.0938, + "step": 46715 + }, + { + "epoch": 0.8332322619769558, + "grad_norm": 0.3403674066066742, + "learning_rate": 4.116860450619792e-06, + "loss": 0.1064, + "step": 46716 + }, + { + "epoch": 0.8332500980986695, + "grad_norm": 0.2700015902519226, + "learning_rate": 4.116004792400427e-06, + "loss": 0.1134, + "step": 46717 + }, + { + "epoch": 0.8332679342203831, + "grad_norm": 0.22742699086666107, + "learning_rate": 4.115149215134648e-06, + "loss": 0.1297, + "step": 46718 + }, + { + "epoch": 0.8332857703420968, + "grad_norm": 0.2205462008714676, + "learning_rate": 4.114293718825762e-06, + "loss": 0.0961, + "step": 46719 + }, + { + "epoch": 0.8333036064638105, + "grad_norm": 0.32220178842544556, + "learning_rate": 4.113438303477085e-06, + "loss": 0.0772, + "step": 46720 + }, + { + "epoch": 0.8333214425855242, + "grad_norm": 0.24213223159313202, + "learning_rate": 4.112582969091938e-06, + "loss": 0.1005, + "step": 46721 + }, + { + "epoch": 0.8333392787072379, + "grad_norm": 0.3152848184108734, + "learning_rate": 4.111727715673632e-06, + "loss": 0.1024, + "step": 46722 + }, + { + "epoch": 0.8333571148289516, + "grad_norm": 0.2606961727142334, + "learning_rate": 4.1108725432254886e-06, + "loss": 0.1131, + "step": 46723 + }, + { + "epoch": 0.8333749509506653, + "grad_norm": 0.16872140765190125, + "learning_rate": 4.110017451750811e-06, + "loss": 0.0776, + "step": 46724 + }, + { + "epoch": 0.833392787072379, + "grad_norm": 0.27245399355888367, + "learning_rate": 4.109162441252926e-06, + "loss": 0.1081, + "step": 46725 + }, + { + "epoch": 0.8334106231940926, + "grad_norm": 0.2449917048215866, + "learning_rate": 4.108307511735141e-06, + "loss": 0.115, + "step": 46726 + }, + { + "epoch": 0.8334284593158063, + "grad_norm": 0.31169965863227844, + "learning_rate": 4.107452663200776e-06, + "loss": 0.1232, + "step": 46727 + }, + { + "epoch": 0.83344629543752, + "grad_norm": 0.26030436158180237, + "learning_rate": 4.1065978956531305e-06, + "loss": 0.1558, + "step": 46728 + }, + { + "epoch": 0.8334641315592337, + "grad_norm": 0.8080865740776062, + "learning_rate": 4.105743209095536e-06, + "loss": 0.1482, + "step": 46729 + }, + { + "epoch": 0.8334819676809474, + "grad_norm": 0.3591577112674713, + "learning_rate": 4.10488860353129e-06, + "loss": 0.1426, + "step": 46730 + }, + { + "epoch": 0.8334998038026612, + "grad_norm": 0.26323091983795166, + "learning_rate": 4.1040340789637195e-06, + "loss": 0.1314, + "step": 46731 + }, + { + "epoch": 0.8335176399243749, + "grad_norm": 0.2543598413467407, + "learning_rate": 4.103179635396132e-06, + "loss": 0.0763, + "step": 46732 + }, + { + "epoch": 0.8335354760460886, + "grad_norm": 0.29716119170188904, + "learning_rate": 4.102325272831828e-06, + "loss": 0.0851, + "step": 46733 + }, + { + "epoch": 0.8335533121678023, + "grad_norm": 0.21345782279968262, + "learning_rate": 4.10147099127414e-06, + "loss": 0.0749, + "step": 46734 + }, + { + "epoch": 0.833571148289516, + "grad_norm": 0.2515353858470917, + "learning_rate": 4.100616790726364e-06, + "loss": 0.0835, + "step": 46735 + }, + { + "epoch": 0.8335889844112296, + "grad_norm": 0.25271734595298767, + "learning_rate": 4.099762671191817e-06, + "loss": 0.1085, + "step": 46736 + }, + { + "epoch": 0.8336068205329433, + "grad_norm": 0.20195934176445007, + "learning_rate": 4.0989086326738035e-06, + "loss": 0.0617, + "step": 46737 + }, + { + "epoch": 0.833624656654657, + "grad_norm": 0.2418886125087738, + "learning_rate": 4.098054675175644e-06, + "loss": 0.065, + "step": 46738 + }, + { + "epoch": 0.8336424927763707, + "grad_norm": 0.3445945382118225, + "learning_rate": 4.097200798700645e-06, + "loss": 0.1462, + "step": 46739 + }, + { + "epoch": 0.8336603288980844, + "grad_norm": 0.4444994032382965, + "learning_rate": 4.0963470032521155e-06, + "loss": 0.1428, + "step": 46740 + }, + { + "epoch": 0.8336781650197981, + "grad_norm": 0.26164713501930237, + "learning_rate": 4.095493288833358e-06, + "loss": 0.0968, + "step": 46741 + }, + { + "epoch": 0.8336960011415118, + "grad_norm": 0.3222672641277313, + "learning_rate": 4.0946396554476934e-06, + "loss": 0.1396, + "step": 46742 + }, + { + "epoch": 0.8337138372632255, + "grad_norm": 0.2579457461833954, + "learning_rate": 4.093786103098429e-06, + "loss": 0.1307, + "step": 46743 + }, + { + "epoch": 0.8337316733849391, + "grad_norm": 0.266775906085968, + "learning_rate": 4.092932631788862e-06, + "loss": 0.1326, + "step": 46744 + }, + { + "epoch": 0.8337495095066528, + "grad_norm": 0.30796027183532715, + "learning_rate": 4.0920792415223175e-06, + "loss": 0.1359, + "step": 46745 + }, + { + "epoch": 0.8337673456283665, + "grad_norm": 0.32440727949142456, + "learning_rate": 4.091225932302087e-06, + "loss": 0.1591, + "step": 46746 + }, + { + "epoch": 0.8337851817500803, + "grad_norm": 0.2423679083585739, + "learning_rate": 4.0903727041314925e-06, + "loss": 0.1254, + "step": 46747 + }, + { + "epoch": 0.833803017871794, + "grad_norm": 0.31258848309516907, + "learning_rate": 4.089519557013838e-06, + "loss": 0.1288, + "step": 46748 + }, + { + "epoch": 0.8338208539935077, + "grad_norm": 0.2675144374370575, + "learning_rate": 4.0886664909524256e-06, + "loss": 0.11, + "step": 46749 + }, + { + "epoch": 0.8338386901152214, + "grad_norm": 0.2953801155090332, + "learning_rate": 4.0878135059505576e-06, + "loss": 0.0988, + "step": 46750 + }, + { + "epoch": 0.8338565262369351, + "grad_norm": 0.31908726692199707, + "learning_rate": 4.086960602011555e-06, + "loss": 0.1111, + "step": 46751 + }, + { + "epoch": 0.8338743623586488, + "grad_norm": 0.27898141741752625, + "learning_rate": 4.086107779138718e-06, + "loss": 0.1202, + "step": 46752 + }, + { + "epoch": 0.8338921984803624, + "grad_norm": 0.23802019655704498, + "learning_rate": 4.085255037335348e-06, + "loss": 0.08, + "step": 46753 + }, + { + "epoch": 0.8339100346020761, + "grad_norm": 0.33159416913986206, + "learning_rate": 4.084402376604749e-06, + "loss": 0.1149, + "step": 46754 + }, + { + "epoch": 0.8339278707237898, + "grad_norm": 0.29219233989715576, + "learning_rate": 4.083549796950234e-06, + "loss": 0.1394, + "step": 46755 + }, + { + "epoch": 0.8339457068455035, + "grad_norm": 0.26689863204956055, + "learning_rate": 4.082697298375104e-06, + "loss": 0.0921, + "step": 46756 + }, + { + "epoch": 0.8339635429672172, + "grad_norm": 0.34107667207717896, + "learning_rate": 4.08184488088266e-06, + "loss": 0.161, + "step": 46757 + }, + { + "epoch": 0.8339813790889309, + "grad_norm": 0.2929271161556244, + "learning_rate": 4.080992544476217e-06, + "loss": 0.0746, + "step": 46758 + }, + { + "epoch": 0.8339992152106446, + "grad_norm": 0.2556752562522888, + "learning_rate": 4.080140289159063e-06, + "loss": 0.1088, + "step": 46759 + }, + { + "epoch": 0.8340170513323583, + "grad_norm": 0.2826586365699768, + "learning_rate": 4.079288114934518e-06, + "loss": 0.0692, + "step": 46760 + }, + { + "epoch": 0.834034887454072, + "grad_norm": 0.30315613746643066, + "learning_rate": 4.078436021805879e-06, + "loss": 0.0843, + "step": 46761 + }, + { + "epoch": 0.8340527235757856, + "grad_norm": 0.2464943826198578, + "learning_rate": 4.077584009776448e-06, + "loss": 0.0923, + "step": 46762 + }, + { + "epoch": 0.8340705596974993, + "grad_norm": 0.2595449984073639, + "learning_rate": 4.0767320788495215e-06, + "loss": 0.1292, + "step": 46763 + }, + { + "epoch": 0.8340883958192131, + "grad_norm": 0.3096935451030731, + "learning_rate": 4.075880229028412e-06, + "loss": 0.1329, + "step": 46764 + }, + { + "epoch": 0.8341062319409268, + "grad_norm": 0.2075943797826767, + "learning_rate": 4.075028460316421e-06, + "loss": 0.063, + "step": 46765 + }, + { + "epoch": 0.8341240680626405, + "grad_norm": 0.2840370535850525, + "learning_rate": 4.074176772716845e-06, + "loss": 0.1198, + "step": 46766 + }, + { + "epoch": 0.8341419041843542, + "grad_norm": 0.33895233273506165, + "learning_rate": 4.07332516623298e-06, + "loss": 0.0955, + "step": 46767 + }, + { + "epoch": 0.8341597403060679, + "grad_norm": 0.37132859230041504, + "learning_rate": 4.072473640868143e-06, + "loss": 0.1652, + "step": 46768 + }, + { + "epoch": 0.8341775764277816, + "grad_norm": 0.3085973560810089, + "learning_rate": 4.071622196625627e-06, + "loss": 0.1625, + "step": 46769 + }, + { + "epoch": 0.8341954125494953, + "grad_norm": 0.31743186712265015, + "learning_rate": 4.07077083350873e-06, + "loss": 0.1368, + "step": 46770 + }, + { + "epoch": 0.8342132486712089, + "grad_norm": 0.2818658947944641, + "learning_rate": 4.069919551520748e-06, + "loss": 0.1741, + "step": 46771 + }, + { + "epoch": 0.8342310847929226, + "grad_norm": 0.24712364375591278, + "learning_rate": 4.069068350664992e-06, + "loss": 0.1296, + "step": 46772 + }, + { + "epoch": 0.8342489209146363, + "grad_norm": 0.25967642664909363, + "learning_rate": 4.068217230944754e-06, + "loss": 0.105, + "step": 46773 + }, + { + "epoch": 0.83426675703635, + "grad_norm": 0.2966439425945282, + "learning_rate": 4.067366192363339e-06, + "loss": 0.1131, + "step": 46774 + }, + { + "epoch": 0.8342845931580637, + "grad_norm": 0.2699515223503113, + "learning_rate": 4.066515234924043e-06, + "loss": 0.0998, + "step": 46775 + }, + { + "epoch": 0.8343024292797774, + "grad_norm": 0.3081156015396118, + "learning_rate": 4.065664358630156e-06, + "loss": 0.1329, + "step": 46776 + }, + { + "epoch": 0.8343202654014911, + "grad_norm": 0.2245841771364212, + "learning_rate": 4.064813563484993e-06, + "loss": 0.0649, + "step": 46777 + }, + { + "epoch": 0.8343381015232048, + "grad_norm": 0.34844109416007996, + "learning_rate": 4.063962849491842e-06, + "loss": 0.1189, + "step": 46778 + }, + { + "epoch": 0.8343559376449184, + "grad_norm": 0.5935043096542358, + "learning_rate": 4.063112216654002e-06, + "loss": 0.0917, + "step": 46779 + }, + { + "epoch": 0.8343737737666321, + "grad_norm": 0.27321234345436096, + "learning_rate": 4.062261664974767e-06, + "loss": 0.059, + "step": 46780 + }, + { + "epoch": 0.8343916098883459, + "grad_norm": 0.28098952770233154, + "learning_rate": 4.0614111944574405e-06, + "loss": 0.1446, + "step": 46781 + }, + { + "epoch": 0.8344094460100596, + "grad_norm": 0.3820337951183319, + "learning_rate": 4.060560805105318e-06, + "loss": 0.1708, + "step": 46782 + }, + { + "epoch": 0.8344272821317733, + "grad_norm": 0.22245419025421143, + "learning_rate": 4.059710496921692e-06, + "loss": 0.1091, + "step": 46783 + }, + { + "epoch": 0.834445118253487, + "grad_norm": 0.2427702397108078, + "learning_rate": 4.058860269909859e-06, + "loss": 0.1004, + "step": 46784 + }, + { + "epoch": 0.8344629543752007, + "grad_norm": 0.26275157928466797, + "learning_rate": 4.0580101240731166e-06, + "loss": 0.1138, + "step": 46785 + }, + { + "epoch": 0.8344807904969144, + "grad_norm": 0.28306999802589417, + "learning_rate": 4.057160059414758e-06, + "loss": 0.0887, + "step": 46786 + }, + { + "epoch": 0.8344986266186281, + "grad_norm": 0.3550335764884949, + "learning_rate": 4.0563100759380854e-06, + "loss": 0.109, + "step": 46787 + }, + { + "epoch": 0.8345164627403417, + "grad_norm": 0.27959829568862915, + "learning_rate": 4.0554601736463884e-06, + "loss": 0.134, + "step": 46788 + }, + { + "epoch": 0.8345342988620554, + "grad_norm": 0.23582394421100616, + "learning_rate": 4.054610352542956e-06, + "loss": 0.0994, + "step": 46789 + }, + { + "epoch": 0.8345521349837691, + "grad_norm": 0.2507306933403015, + "learning_rate": 4.053760612631094e-06, + "loss": 0.0784, + "step": 46790 + }, + { + "epoch": 0.8345699711054828, + "grad_norm": 0.26044243574142456, + "learning_rate": 4.052910953914091e-06, + "loss": 0.1328, + "step": 46791 + }, + { + "epoch": 0.8345878072271965, + "grad_norm": 0.25004643201828003, + "learning_rate": 4.052061376395241e-06, + "loss": 0.0767, + "step": 46792 + }, + { + "epoch": 0.8346056433489102, + "grad_norm": 0.26451361179351807, + "learning_rate": 4.051211880077829e-06, + "loss": 0.0802, + "step": 46793 + }, + { + "epoch": 0.8346234794706239, + "grad_norm": 0.23932376503944397, + "learning_rate": 4.05036246496516e-06, + "loss": 0.1119, + "step": 46794 + }, + { + "epoch": 0.8346413155923376, + "grad_norm": 0.3065779507160187, + "learning_rate": 4.0495131310605224e-06, + "loss": 0.1113, + "step": 46795 + }, + { + "epoch": 0.8346591517140513, + "grad_norm": 0.31209102272987366, + "learning_rate": 4.0486638783672095e-06, + "loss": 0.1528, + "step": 46796 + }, + { + "epoch": 0.8346769878357649, + "grad_norm": 0.2687644958496094, + "learning_rate": 4.0478147068885115e-06, + "loss": 0.0634, + "step": 46797 + }, + { + "epoch": 0.8346948239574787, + "grad_norm": 0.24032819271087646, + "learning_rate": 4.0469656166277126e-06, + "loss": 0.1028, + "step": 46798 + }, + { + "epoch": 0.8347126600791924, + "grad_norm": 0.23493680357933044, + "learning_rate": 4.046116607588121e-06, + "loss": 0.095, + "step": 46799 + }, + { + "epoch": 0.8347304962009061, + "grad_norm": 0.39956793189048767, + "learning_rate": 4.04526767977301e-06, + "loss": 0.1102, + "step": 46800 + }, + { + "epoch": 0.8347483323226198, + "grad_norm": 0.28877198696136475, + "learning_rate": 4.044418833185684e-06, + "loss": 0.0883, + "step": 46801 + }, + { + "epoch": 0.8347661684443335, + "grad_norm": 0.30131542682647705, + "learning_rate": 4.0435700678294245e-06, + "loss": 0.1169, + "step": 46802 + }, + { + "epoch": 0.8347840045660472, + "grad_norm": 0.3342926800251007, + "learning_rate": 4.042721383707532e-06, + "loss": 0.1382, + "step": 46803 + }, + { + "epoch": 0.8348018406877609, + "grad_norm": 0.2899371087551117, + "learning_rate": 4.041872780823289e-06, + "loss": 0.1457, + "step": 46804 + }, + { + "epoch": 0.8348196768094746, + "grad_norm": 0.4322609007358551, + "learning_rate": 4.041024259179988e-06, + "loss": 0.0975, + "step": 46805 + }, + { + "epoch": 0.8348375129311882, + "grad_norm": 0.2646285593509674, + "learning_rate": 4.040175818780906e-06, + "loss": 0.0725, + "step": 46806 + }, + { + "epoch": 0.8348553490529019, + "grad_norm": 0.25836241245269775, + "learning_rate": 4.03932745962935e-06, + "loss": 0.1129, + "step": 46807 + }, + { + "epoch": 0.8348731851746156, + "grad_norm": 0.26384419202804565, + "learning_rate": 4.038479181728599e-06, + "loss": 0.0886, + "step": 46808 + }, + { + "epoch": 0.8348910212963293, + "grad_norm": 0.25899654626846313, + "learning_rate": 4.037630985081945e-06, + "loss": 0.1188, + "step": 46809 + }, + { + "epoch": 0.834908857418043, + "grad_norm": 0.25273481011390686, + "learning_rate": 4.036782869692671e-06, + "loss": 0.1639, + "step": 46810 + }, + { + "epoch": 0.8349266935397567, + "grad_norm": 0.19507236778736115, + "learning_rate": 4.035934835564062e-06, + "loss": 0.0874, + "step": 46811 + }, + { + "epoch": 0.8349445296614704, + "grad_norm": 0.2970743775367737, + "learning_rate": 4.035086882699418e-06, + "loss": 0.1612, + "step": 46812 + }, + { + "epoch": 0.8349623657831841, + "grad_norm": 0.21159812808036804, + "learning_rate": 4.034239011102009e-06, + "loss": 0.1011, + "step": 46813 + }, + { + "epoch": 0.8349802019048977, + "grad_norm": 0.3126099705696106, + "learning_rate": 4.03339122077514e-06, + "loss": 0.1217, + "step": 46814 + }, + { + "epoch": 0.8349980380266115, + "grad_norm": 0.32722559571266174, + "learning_rate": 4.032543511722081e-06, + "loss": 0.1008, + "step": 46815 + }, + { + "epoch": 0.8350158741483252, + "grad_norm": 0.6440924406051636, + "learning_rate": 4.031695883946129e-06, + "loss": 0.0991, + "step": 46816 + }, + { + "epoch": 0.8350337102700389, + "grad_norm": 0.32656651735305786, + "learning_rate": 4.030848337450568e-06, + "loss": 0.084, + "step": 46817 + }, + { + "epoch": 0.8350515463917526, + "grad_norm": 0.29383066296577454, + "learning_rate": 4.030000872238681e-06, + "loss": 0.0911, + "step": 46818 + }, + { + "epoch": 0.8350693825134663, + "grad_norm": 0.30996939539909363, + "learning_rate": 4.02915348831375e-06, + "loss": 0.1044, + "step": 46819 + }, + { + "epoch": 0.83508721863518, + "grad_norm": 0.17640818655490875, + "learning_rate": 4.028306185679068e-06, + "loss": 0.108, + "step": 46820 + }, + { + "epoch": 0.8351050547568937, + "grad_norm": 0.22094972431659698, + "learning_rate": 4.0274589643379126e-06, + "loss": 0.0653, + "step": 46821 + }, + { + "epoch": 0.8351228908786074, + "grad_norm": 0.26132553815841675, + "learning_rate": 4.026611824293572e-06, + "loss": 0.1169, + "step": 46822 + }, + { + "epoch": 0.835140727000321, + "grad_norm": 0.2136087417602539, + "learning_rate": 4.025764765549325e-06, + "loss": 0.1159, + "step": 46823 + }, + { + "epoch": 0.8351585631220347, + "grad_norm": 0.26849299669265747, + "learning_rate": 4.0249177881084566e-06, + "loss": 0.0753, + "step": 46824 + }, + { + "epoch": 0.8351763992437484, + "grad_norm": 0.296371191740036, + "learning_rate": 4.0240708919742544e-06, + "loss": 0.1749, + "step": 46825 + }, + { + "epoch": 0.8351942353654621, + "grad_norm": 0.24800507724285126, + "learning_rate": 4.0232240771499975e-06, + "loss": 0.1424, + "step": 46826 + }, + { + "epoch": 0.8352120714871758, + "grad_norm": 0.22743044793605804, + "learning_rate": 4.022377343638964e-06, + "loss": 0.1074, + "step": 46827 + }, + { + "epoch": 0.8352299076088895, + "grad_norm": 0.2506360113620758, + "learning_rate": 4.02153069144445e-06, + "loss": 0.1074, + "step": 46828 + }, + { + "epoch": 0.8352477437306032, + "grad_norm": 0.2383185774087906, + "learning_rate": 4.020684120569721e-06, + "loss": 0.107, + "step": 46829 + }, + { + "epoch": 0.8352655798523169, + "grad_norm": 0.22619059681892395, + "learning_rate": 4.01983763101807e-06, + "loss": 0.0842, + "step": 46830 + }, + { + "epoch": 0.8352834159740306, + "grad_norm": 0.24542072415351868, + "learning_rate": 4.018991222792776e-06, + "loss": 0.1214, + "step": 46831 + }, + { + "epoch": 0.8353012520957444, + "grad_norm": 0.46733415126800537, + "learning_rate": 4.018144895897113e-06, + "loss": 0.1242, + "step": 46832 + }, + { + "epoch": 0.835319088217458, + "grad_norm": 0.2272469699382782, + "learning_rate": 4.017298650334375e-06, + "loss": 0.0692, + "step": 46833 + }, + { + "epoch": 0.8353369243391717, + "grad_norm": 0.3833679258823395, + "learning_rate": 4.016452486107833e-06, + "loss": 0.1136, + "step": 46834 + }, + { + "epoch": 0.8353547604608854, + "grad_norm": 0.32804858684539795, + "learning_rate": 4.015606403220767e-06, + "loss": 0.1552, + "step": 46835 + }, + { + "epoch": 0.8353725965825991, + "grad_norm": 0.25129881501197815, + "learning_rate": 4.0147604016764624e-06, + "loss": 0.1243, + "step": 46836 + }, + { + "epoch": 0.8353904327043128, + "grad_norm": 0.3041929006576538, + "learning_rate": 4.013914481478187e-06, + "loss": 0.0725, + "step": 46837 + }, + { + "epoch": 0.8354082688260265, + "grad_norm": 0.2951223850250244, + "learning_rate": 4.01306864262923e-06, + "loss": 0.1092, + "step": 46838 + }, + { + "epoch": 0.8354261049477402, + "grad_norm": 0.20860454440116882, + "learning_rate": 4.012222885132872e-06, + "loss": 0.0606, + "step": 46839 + }, + { + "epoch": 0.8354439410694539, + "grad_norm": 0.2729339301586151, + "learning_rate": 4.0113772089923785e-06, + "loss": 0.0907, + "step": 46840 + }, + { + "epoch": 0.8354617771911675, + "grad_norm": 0.24473010003566742, + "learning_rate": 4.010531614211044e-06, + "loss": 0.1177, + "step": 46841 + }, + { + "epoch": 0.8354796133128812, + "grad_norm": 0.24658413231372833, + "learning_rate": 4.0096861007921315e-06, + "loss": 0.0767, + "step": 46842 + }, + { + "epoch": 0.8354974494345949, + "grad_norm": 0.2385355681180954, + "learning_rate": 4.0088406687389306e-06, + "loss": 0.1395, + "step": 46843 + }, + { + "epoch": 0.8355152855563086, + "grad_norm": 0.378973126411438, + "learning_rate": 4.007995318054714e-06, + "loss": 0.1173, + "step": 46844 + }, + { + "epoch": 0.8355331216780223, + "grad_norm": 0.2509481608867645, + "learning_rate": 4.007150048742753e-06, + "loss": 0.1361, + "step": 46845 + }, + { + "epoch": 0.835550957799736, + "grad_norm": 0.2565433382987976, + "learning_rate": 4.006304860806334e-06, + "loss": 0.1267, + "step": 46846 + }, + { + "epoch": 0.8355687939214497, + "grad_norm": 0.2195865958929062, + "learning_rate": 4.005459754248731e-06, + "loss": 0.1106, + "step": 46847 + }, + { + "epoch": 0.8355866300431635, + "grad_norm": 0.20758330821990967, + "learning_rate": 4.004614729073214e-06, + "loss": 0.0674, + "step": 46848 + }, + { + "epoch": 0.8356044661648772, + "grad_norm": 0.3654841482639313, + "learning_rate": 4.003769785283063e-06, + "loss": 0.161, + "step": 46849 + }, + { + "epoch": 0.8356223022865908, + "grad_norm": 0.2653590440750122, + "learning_rate": 4.002924922881546e-06, + "loss": 0.0824, + "step": 46850 + }, + { + "epoch": 0.8356401384083045, + "grad_norm": 0.36507052183151245, + "learning_rate": 4.00208014187195e-06, + "loss": 0.119, + "step": 46851 + }, + { + "epoch": 0.8356579745300182, + "grad_norm": 0.31233513355255127, + "learning_rate": 4.001235442257545e-06, + "loss": 0.1276, + "step": 46852 + }, + { + "epoch": 0.8356758106517319, + "grad_norm": 0.2052009105682373, + "learning_rate": 4.0003908240416025e-06, + "loss": 0.1036, + "step": 46853 + }, + { + "epoch": 0.8356936467734456, + "grad_norm": 0.2272307276725769, + "learning_rate": 3.999546287227393e-06, + "loss": 0.0952, + "step": 46854 + }, + { + "epoch": 0.8357114828951593, + "grad_norm": 0.27024638652801514, + "learning_rate": 3.9987018318182e-06, + "loss": 0.115, + "step": 46855 + }, + { + "epoch": 0.835729319016873, + "grad_norm": 0.2789422273635864, + "learning_rate": 3.997857457817289e-06, + "loss": 0.1592, + "step": 46856 + }, + { + "epoch": 0.8357471551385867, + "grad_norm": 0.2676437199115753, + "learning_rate": 3.997013165227939e-06, + "loss": 0.1505, + "step": 46857 + }, + { + "epoch": 0.8357649912603003, + "grad_norm": 0.3002021908760071, + "learning_rate": 3.996168954053417e-06, + "loss": 0.122, + "step": 46858 + }, + { + "epoch": 0.835782827382014, + "grad_norm": 0.21754351258277893, + "learning_rate": 3.9953248242970036e-06, + "loss": 0.1182, + "step": 46859 + }, + { + "epoch": 0.8358006635037277, + "grad_norm": 0.23948244750499725, + "learning_rate": 3.9944807759619676e-06, + "loss": 0.1128, + "step": 46860 + }, + { + "epoch": 0.8358184996254414, + "grad_norm": 0.20021620392799377, + "learning_rate": 3.993636809051576e-06, + "loss": 0.072, + "step": 46861 + }, + { + "epoch": 0.8358363357471551, + "grad_norm": 0.2331341952085495, + "learning_rate": 3.992792923569105e-06, + "loss": 0.0975, + "step": 46862 + }, + { + "epoch": 0.8358541718688688, + "grad_norm": 0.24508824944496155, + "learning_rate": 3.991949119517818e-06, + "loss": 0.0909, + "step": 46863 + }, + { + "epoch": 0.8358720079905825, + "grad_norm": 0.28833362460136414, + "learning_rate": 3.991105396901001e-06, + "loss": 0.0998, + "step": 46864 + }, + { + "epoch": 0.8358898441122963, + "grad_norm": 0.3301422595977783, + "learning_rate": 3.9902617557219135e-06, + "loss": 0.1115, + "step": 46865 + }, + { + "epoch": 0.83590768023401, + "grad_norm": 0.26082515716552734, + "learning_rate": 3.9894181959838266e-06, + "loss": 0.1509, + "step": 46866 + }, + { + "epoch": 0.8359255163557237, + "grad_norm": 0.19411340355873108, + "learning_rate": 3.988574717690008e-06, + "loss": 0.1115, + "step": 46867 + }, + { + "epoch": 0.8359433524774373, + "grad_norm": 0.24536892771720886, + "learning_rate": 3.987731320843735e-06, + "loss": 0.0862, + "step": 46868 + }, + { + "epoch": 0.835961188599151, + "grad_norm": 0.23225244879722595, + "learning_rate": 3.986888005448266e-06, + "loss": 0.1198, + "step": 46869 + }, + { + "epoch": 0.8359790247208647, + "grad_norm": 0.256185919046402, + "learning_rate": 3.986044771506886e-06, + "loss": 0.126, + "step": 46870 + }, + { + "epoch": 0.8359968608425784, + "grad_norm": 0.35747939348220825, + "learning_rate": 3.985201619022847e-06, + "loss": 0.1258, + "step": 46871 + }, + { + "epoch": 0.8360146969642921, + "grad_norm": 0.26201823353767395, + "learning_rate": 3.984358547999431e-06, + "loss": 0.1186, + "step": 46872 + }, + { + "epoch": 0.8360325330860058, + "grad_norm": 0.29478102922439575, + "learning_rate": 3.983515558439899e-06, + "loss": 0.1143, + "step": 46873 + }, + { + "epoch": 0.8360503692077195, + "grad_norm": 0.218222513794899, + "learning_rate": 3.982672650347521e-06, + "loss": 0.1088, + "step": 46874 + }, + { + "epoch": 0.8360682053294332, + "grad_norm": 0.28917232155799866, + "learning_rate": 3.981829823725561e-06, + "loss": 0.1161, + "step": 46875 + }, + { + "epoch": 0.8360860414511468, + "grad_norm": 0.214401513338089, + "learning_rate": 3.980987078577284e-06, + "loss": 0.1125, + "step": 46876 + }, + { + "epoch": 0.8361038775728605, + "grad_norm": 0.3000587224960327, + "learning_rate": 3.980144414905965e-06, + "loss": 0.0852, + "step": 46877 + }, + { + "epoch": 0.8361217136945742, + "grad_norm": 0.1898646503686905, + "learning_rate": 3.979301832714869e-06, + "loss": 0.0736, + "step": 46878 + }, + { + "epoch": 0.8361395498162879, + "grad_norm": 0.23889465630054474, + "learning_rate": 3.978459332007256e-06, + "loss": 0.0719, + "step": 46879 + }, + { + "epoch": 0.8361573859380016, + "grad_norm": 0.22260203957557678, + "learning_rate": 3.9776169127863915e-06, + "loss": 0.0732, + "step": 46880 + }, + { + "epoch": 0.8361752220597153, + "grad_norm": 0.3586057722568512, + "learning_rate": 3.9767745750555505e-06, + "loss": 0.1349, + "step": 46881 + }, + { + "epoch": 0.8361930581814291, + "grad_norm": 0.23854921758174896, + "learning_rate": 3.97593231881799e-06, + "loss": 0.0924, + "step": 46882 + }, + { + "epoch": 0.8362108943031428, + "grad_norm": 0.3031052350997925, + "learning_rate": 3.975090144076973e-06, + "loss": 0.1192, + "step": 46883 + }, + { + "epoch": 0.8362287304248565, + "grad_norm": 0.2748461961746216, + "learning_rate": 3.974248050835769e-06, + "loss": 0.0848, + "step": 46884 + }, + { + "epoch": 0.8362465665465701, + "grad_norm": 0.21828046441078186, + "learning_rate": 3.973406039097646e-06, + "loss": 0.1026, + "step": 46885 + }, + { + "epoch": 0.8362644026682838, + "grad_norm": 0.365934818983078, + "learning_rate": 3.972564108865867e-06, + "loss": 0.1624, + "step": 46886 + }, + { + "epoch": 0.8362822387899975, + "grad_norm": 0.2756219506263733, + "learning_rate": 3.97172226014369e-06, + "loss": 0.1263, + "step": 46887 + }, + { + "epoch": 0.8363000749117112, + "grad_norm": 0.24824893474578857, + "learning_rate": 3.9708804929343785e-06, + "loss": 0.0601, + "step": 46888 + }, + { + "epoch": 0.8363179110334249, + "grad_norm": 0.36520570516586304, + "learning_rate": 3.970038807241194e-06, + "loss": 0.1233, + "step": 46889 + }, + { + "epoch": 0.8363357471551386, + "grad_norm": 0.25544849038124084, + "learning_rate": 3.96919720306741e-06, + "loss": 0.0809, + "step": 46890 + }, + { + "epoch": 0.8363535832768523, + "grad_norm": 0.31959447264671326, + "learning_rate": 3.968355680416278e-06, + "loss": 0.1209, + "step": 46891 + }, + { + "epoch": 0.836371419398566, + "grad_norm": 0.26477327942848206, + "learning_rate": 3.9675142392910645e-06, + "loss": 0.1237, + "step": 46892 + }, + { + "epoch": 0.8363892555202797, + "grad_norm": 0.22651013731956482, + "learning_rate": 3.9666728796950245e-06, + "loss": 0.1006, + "step": 46893 + }, + { + "epoch": 0.8364070916419933, + "grad_norm": 0.3021063804626465, + "learning_rate": 3.965831601631431e-06, + "loss": 0.097, + "step": 46894 + }, + { + "epoch": 0.836424927763707, + "grad_norm": 0.38781633973121643, + "learning_rate": 3.964990405103539e-06, + "loss": 0.1516, + "step": 46895 + }, + { + "epoch": 0.8364427638854207, + "grad_norm": 0.37945935130119324, + "learning_rate": 3.964149290114605e-06, + "loss": 0.1364, + "step": 46896 + }, + { + "epoch": 0.8364606000071344, + "grad_norm": 0.29736632108688354, + "learning_rate": 3.963308256667897e-06, + "loss": 0.104, + "step": 46897 + }, + { + "epoch": 0.8364784361288481, + "grad_norm": 0.19665658473968506, + "learning_rate": 3.962467304766668e-06, + "loss": 0.0821, + "step": 46898 + }, + { + "epoch": 0.8364962722505619, + "grad_norm": 0.23883381485939026, + "learning_rate": 3.961626434414189e-06, + "loss": 0.1174, + "step": 46899 + }, + { + "epoch": 0.8365141083722756, + "grad_norm": 0.3611498773097992, + "learning_rate": 3.96078564561371e-06, + "loss": 0.151, + "step": 46900 + }, + { + "epoch": 0.8365319444939893, + "grad_norm": 0.2536463439464569, + "learning_rate": 3.9599449383684925e-06, + "loss": 0.0697, + "step": 46901 + }, + { + "epoch": 0.836549780615703, + "grad_norm": 0.2697376310825348, + "learning_rate": 3.9591043126817915e-06, + "loss": 0.0631, + "step": 46902 + }, + { + "epoch": 0.8365676167374166, + "grad_norm": 0.23434416949748993, + "learning_rate": 3.958263768556875e-06, + "loss": 0.1144, + "step": 46903 + }, + { + "epoch": 0.8365854528591303, + "grad_norm": 0.24046054482460022, + "learning_rate": 3.9574233059969965e-06, + "loss": 0.1427, + "step": 46904 + }, + { + "epoch": 0.836603288980844, + "grad_norm": 0.25384941697120667, + "learning_rate": 3.9565829250054085e-06, + "loss": 0.0602, + "step": 46905 + }, + { + "epoch": 0.8366211251025577, + "grad_norm": 0.3528127372264862, + "learning_rate": 3.955742625585371e-06, + "loss": 0.119, + "step": 46906 + }, + { + "epoch": 0.8366389612242714, + "grad_norm": 0.2794913649559021, + "learning_rate": 3.954902407740149e-06, + "loss": 0.1026, + "step": 46907 + }, + { + "epoch": 0.8366567973459851, + "grad_norm": 0.260948121547699, + "learning_rate": 3.954062271472994e-06, + "loss": 0.181, + "step": 46908 + }, + { + "epoch": 0.8366746334676988, + "grad_norm": 0.254564106464386, + "learning_rate": 3.9532222167871645e-06, + "loss": 0.1028, + "step": 46909 + }, + { + "epoch": 0.8366924695894125, + "grad_norm": 0.2630186080932617, + "learning_rate": 3.952382243685907e-06, + "loss": 0.0993, + "step": 46910 + }, + { + "epoch": 0.8367103057111261, + "grad_norm": 0.23984605073928833, + "learning_rate": 3.951542352172485e-06, + "loss": 0.0932, + "step": 46911 + }, + { + "epoch": 0.8367281418328398, + "grad_norm": 0.23631185293197632, + "learning_rate": 3.950702542250162e-06, + "loss": 0.1026, + "step": 46912 + }, + { + "epoch": 0.8367459779545535, + "grad_norm": 0.2506595551967621, + "learning_rate": 3.9498628139221865e-06, + "loss": 0.0749, + "step": 46913 + }, + { + "epoch": 0.8367638140762672, + "grad_norm": 0.22263506054878235, + "learning_rate": 3.949023167191812e-06, + "loss": 0.1047, + "step": 46914 + }, + { + "epoch": 0.8367816501979809, + "grad_norm": 0.33422768115997314, + "learning_rate": 3.9481836020622884e-06, + "loss": 0.076, + "step": 46915 + }, + { + "epoch": 0.8367994863196947, + "grad_norm": 0.26921719312667847, + "learning_rate": 3.947344118536883e-06, + "loss": 0.1326, + "step": 46916 + }, + { + "epoch": 0.8368173224414084, + "grad_norm": 0.41757893562316895, + "learning_rate": 3.946504716618843e-06, + "loss": 0.1969, + "step": 46917 + }, + { + "epoch": 0.8368351585631221, + "grad_norm": 0.24989396333694458, + "learning_rate": 3.945665396311421e-06, + "loss": 0.0699, + "step": 46918 + }, + { + "epoch": 0.8368529946848358, + "grad_norm": 0.37077704071998596, + "learning_rate": 3.944826157617867e-06, + "loss": 0.1601, + "step": 46919 + }, + { + "epoch": 0.8368708308065494, + "grad_norm": 0.2764621078968048, + "learning_rate": 3.9439870005414465e-06, + "loss": 0.0821, + "step": 46920 + }, + { + "epoch": 0.8368886669282631, + "grad_norm": 0.2903653383255005, + "learning_rate": 3.943147925085403e-06, + "loss": 0.114, + "step": 46921 + }, + { + "epoch": 0.8369065030499768, + "grad_norm": 0.2788127362728119, + "learning_rate": 3.942308931252992e-06, + "loss": 0.0937, + "step": 46922 + }, + { + "epoch": 0.8369243391716905, + "grad_norm": 0.2614961266517639, + "learning_rate": 3.941470019047458e-06, + "loss": 0.0863, + "step": 46923 + }, + { + "epoch": 0.8369421752934042, + "grad_norm": 0.2824731767177582, + "learning_rate": 3.940631188472063e-06, + "loss": 0.1104, + "step": 46924 + }, + { + "epoch": 0.8369600114151179, + "grad_norm": 0.3636886179447174, + "learning_rate": 3.939792439530055e-06, + "loss": 0.1083, + "step": 46925 + }, + { + "epoch": 0.8369778475368316, + "grad_norm": 0.338789701461792, + "learning_rate": 3.938953772224688e-06, + "loss": 0.115, + "step": 46926 + }, + { + "epoch": 0.8369956836585453, + "grad_norm": 0.24709048867225647, + "learning_rate": 3.938115186559211e-06, + "loss": 0.1144, + "step": 46927 + }, + { + "epoch": 0.837013519780259, + "grad_norm": 0.24254465103149414, + "learning_rate": 3.937276682536867e-06, + "loss": 0.1096, + "step": 46928 + }, + { + "epoch": 0.8370313559019726, + "grad_norm": 0.21065448224544525, + "learning_rate": 3.936438260160918e-06, + "loss": 0.0692, + "step": 46929 + }, + { + "epoch": 0.8370491920236863, + "grad_norm": 0.22972606122493744, + "learning_rate": 3.935599919434613e-06, + "loss": 0.098, + "step": 46930 + }, + { + "epoch": 0.8370670281454, + "grad_norm": 0.26478683948516846, + "learning_rate": 3.934761660361197e-06, + "loss": 0.1162, + "step": 46931 + }, + { + "epoch": 0.8370848642671137, + "grad_norm": 0.2467789649963379, + "learning_rate": 3.933923482943913e-06, + "loss": 0.1304, + "step": 46932 + }, + { + "epoch": 0.8371027003888275, + "grad_norm": 0.3902963697910309, + "learning_rate": 3.9330853871860255e-06, + "loss": 0.1596, + "step": 46933 + }, + { + "epoch": 0.8371205365105412, + "grad_norm": 0.20513826608657837, + "learning_rate": 3.932247373090775e-06, + "loss": 0.1199, + "step": 46934 + }, + { + "epoch": 0.8371383726322549, + "grad_norm": 0.2696496546268463, + "learning_rate": 3.93140944066141e-06, + "loss": 0.0724, + "step": 46935 + }, + { + "epoch": 0.8371562087539686, + "grad_norm": 0.26693907380104065, + "learning_rate": 3.930571589901172e-06, + "loss": 0.1264, + "step": 46936 + }, + { + "epoch": 0.8371740448756823, + "grad_norm": 0.26409271359443665, + "learning_rate": 3.929733820813322e-06, + "loss": 0.076, + "step": 46937 + }, + { + "epoch": 0.8371918809973959, + "grad_norm": 0.2790376543998718, + "learning_rate": 3.928896133401097e-06, + "loss": 0.0947, + "step": 46938 + }, + { + "epoch": 0.8372097171191096, + "grad_norm": 0.4949314594268799, + "learning_rate": 3.928058527667752e-06, + "loss": 0.18, + "step": 46939 + }, + { + "epoch": 0.8372275532408233, + "grad_norm": 0.22583194077014923, + "learning_rate": 3.927221003616533e-06, + "loss": 0.104, + "step": 46940 + }, + { + "epoch": 0.837245389362537, + "grad_norm": 0.32609236240386963, + "learning_rate": 3.926383561250674e-06, + "loss": 0.0782, + "step": 46941 + }, + { + "epoch": 0.8372632254842507, + "grad_norm": 0.334391713142395, + "learning_rate": 3.925546200573438e-06, + "loss": 0.1414, + "step": 46942 + }, + { + "epoch": 0.8372810616059644, + "grad_norm": 0.2561906576156616, + "learning_rate": 3.9247089215880665e-06, + "loss": 0.1149, + "step": 46943 + }, + { + "epoch": 0.8372988977276781, + "grad_norm": 0.28949081897735596, + "learning_rate": 3.923871724297801e-06, + "loss": 0.1039, + "step": 46944 + }, + { + "epoch": 0.8373167338493918, + "grad_norm": 0.26782864332199097, + "learning_rate": 3.9230346087058806e-06, + "loss": 0.1093, + "step": 46945 + }, + { + "epoch": 0.8373345699711054, + "grad_norm": 0.3119884729385376, + "learning_rate": 3.922197574815564e-06, + "loss": 0.1102, + "step": 46946 + }, + { + "epoch": 0.8373524060928191, + "grad_norm": 0.30032435059547424, + "learning_rate": 3.921360622630091e-06, + "loss": 0.1161, + "step": 46947 + }, + { + "epoch": 0.8373702422145328, + "grad_norm": 0.5193751454353333, + "learning_rate": 3.920523752152702e-06, + "loss": 0.187, + "step": 46948 + }, + { + "epoch": 0.8373880783362466, + "grad_norm": 0.2769351005554199, + "learning_rate": 3.91968696338664e-06, + "loss": 0.152, + "step": 46949 + }, + { + "epoch": 0.8374059144579603, + "grad_norm": 0.2023262232542038, + "learning_rate": 3.918850256335157e-06, + "loss": 0.09, + "step": 46950 + }, + { + "epoch": 0.837423750579674, + "grad_norm": 0.27765601873397827, + "learning_rate": 3.918013631001494e-06, + "loss": 0.125, + "step": 46951 + }, + { + "epoch": 0.8374415867013877, + "grad_norm": 0.21986523270606995, + "learning_rate": 3.917177087388885e-06, + "loss": 0.0647, + "step": 46952 + }, + { + "epoch": 0.8374594228231014, + "grad_norm": 0.3394491374492645, + "learning_rate": 3.916340625500583e-06, + "loss": 0.0861, + "step": 46953 + }, + { + "epoch": 0.8374772589448151, + "grad_norm": 0.31232213973999023, + "learning_rate": 3.915504245339821e-06, + "loss": 0.121, + "step": 46954 + }, + { + "epoch": 0.8374950950665287, + "grad_norm": 0.3081868886947632, + "learning_rate": 3.914667946909856e-06, + "loss": 0.1276, + "step": 46955 + }, + { + "epoch": 0.8375129311882424, + "grad_norm": 0.2707572877407074, + "learning_rate": 3.913831730213918e-06, + "loss": 0.0955, + "step": 46956 + }, + { + "epoch": 0.8375307673099561, + "grad_norm": 0.28062984347343445, + "learning_rate": 3.912995595255253e-06, + "loss": 0.0909, + "step": 46957 + }, + { + "epoch": 0.8375486034316698, + "grad_norm": 0.30293378233909607, + "learning_rate": 3.912159542037092e-06, + "loss": 0.1146, + "step": 46958 + }, + { + "epoch": 0.8375664395533835, + "grad_norm": 0.2180204838514328, + "learning_rate": 3.9113235705626915e-06, + "loss": 0.0953, + "step": 46959 + }, + { + "epoch": 0.8375842756750972, + "grad_norm": 0.314439982175827, + "learning_rate": 3.910487680835284e-06, + "loss": 0.0863, + "step": 46960 + }, + { + "epoch": 0.8376021117968109, + "grad_norm": 0.22588887810707092, + "learning_rate": 3.909651872858111e-06, + "loss": 0.1089, + "step": 46961 + }, + { + "epoch": 0.8376199479185246, + "grad_norm": 0.25089627504348755, + "learning_rate": 3.908816146634406e-06, + "loss": 0.0891, + "step": 46962 + }, + { + "epoch": 0.8376377840402383, + "grad_norm": 0.28479036688804626, + "learning_rate": 3.907980502167419e-06, + "loss": 0.1245, + "step": 46963 + }, + { + "epoch": 0.8376556201619519, + "grad_norm": 0.19454067945480347, + "learning_rate": 3.907144939460386e-06, + "loss": 0.054, + "step": 46964 + }, + { + "epoch": 0.8376734562836656, + "grad_norm": 0.2349027693271637, + "learning_rate": 3.906309458516538e-06, + "loss": 0.0735, + "step": 46965 + }, + { + "epoch": 0.8376912924053794, + "grad_norm": 0.2898387014865875, + "learning_rate": 3.905474059339126e-06, + "loss": 0.1199, + "step": 46966 + }, + { + "epoch": 0.8377091285270931, + "grad_norm": 0.2563013434410095, + "learning_rate": 3.904638741931374e-06, + "loss": 0.1208, + "step": 46967 + }, + { + "epoch": 0.8377269646488068, + "grad_norm": 0.26255422830581665, + "learning_rate": 3.903803506296538e-06, + "loss": 0.0536, + "step": 46968 + }, + { + "epoch": 0.8377448007705205, + "grad_norm": 0.27324071526527405, + "learning_rate": 3.902968352437844e-06, + "loss": 0.1349, + "step": 46969 + }, + { + "epoch": 0.8377626368922342, + "grad_norm": 0.30811983346939087, + "learning_rate": 3.902133280358533e-06, + "loss": 0.1424, + "step": 46970 + }, + { + "epoch": 0.8377804730139479, + "grad_norm": 0.26903247833251953, + "learning_rate": 3.901298290061831e-06, + "loss": 0.1067, + "step": 46971 + }, + { + "epoch": 0.8377983091356616, + "grad_norm": 0.27189138531684875, + "learning_rate": 3.900463381550993e-06, + "loss": 0.1248, + "step": 46972 + }, + { + "epoch": 0.8378161452573752, + "grad_norm": 0.40170568227767944, + "learning_rate": 3.899628554829246e-06, + "loss": 0.1353, + "step": 46973 + }, + { + "epoch": 0.8378339813790889, + "grad_norm": 0.3523317575454712, + "learning_rate": 3.898793809899826e-06, + "loss": 0.1398, + "step": 46974 + }, + { + "epoch": 0.8378518175008026, + "grad_norm": 0.28895458579063416, + "learning_rate": 3.897959146765965e-06, + "loss": 0.1153, + "step": 46975 + }, + { + "epoch": 0.8378696536225163, + "grad_norm": 0.41567447781562805, + "learning_rate": 3.897124565430907e-06, + "loss": 0.1426, + "step": 46976 + }, + { + "epoch": 0.83788748974423, + "grad_norm": 0.1604563295841217, + "learning_rate": 3.896290065897881e-06, + "loss": 0.0535, + "step": 46977 + }, + { + "epoch": 0.8379053258659437, + "grad_norm": 0.1963864117860794, + "learning_rate": 3.895455648170127e-06, + "loss": 0.0655, + "step": 46978 + }, + { + "epoch": 0.8379231619876574, + "grad_norm": 0.17659622430801392, + "learning_rate": 3.894621312250871e-06, + "loss": 0.0883, + "step": 46979 + }, + { + "epoch": 0.8379409981093711, + "grad_norm": 0.329047828912735, + "learning_rate": 3.893787058143355e-06, + "loss": 0.1082, + "step": 46980 + }, + { + "epoch": 0.8379588342310847, + "grad_norm": 0.2841615080833435, + "learning_rate": 3.892952885850806e-06, + "loss": 0.1017, + "step": 46981 + }, + { + "epoch": 0.8379766703527984, + "grad_norm": 0.2920970916748047, + "learning_rate": 3.892118795376467e-06, + "loss": 0.1252, + "step": 46982 + }, + { + "epoch": 0.8379945064745122, + "grad_norm": 0.297720730304718, + "learning_rate": 3.891284786723568e-06, + "loss": 0.1574, + "step": 46983 + }, + { + "epoch": 0.8380123425962259, + "grad_norm": 0.2133379429578781, + "learning_rate": 3.890450859895331e-06, + "loss": 0.0698, + "step": 46984 + }, + { + "epoch": 0.8380301787179396, + "grad_norm": 0.28827035427093506, + "learning_rate": 3.889617014895006e-06, + "loss": 0.085, + "step": 46985 + }, + { + "epoch": 0.8380480148396533, + "grad_norm": 0.19673387706279755, + "learning_rate": 3.8887832517258135e-06, + "loss": 0.0777, + "step": 46986 + }, + { + "epoch": 0.838065850961367, + "grad_norm": 0.3254310190677643, + "learning_rate": 3.887949570390992e-06, + "loss": 0.0969, + "step": 46987 + }, + { + "epoch": 0.8380836870830807, + "grad_norm": 0.2297883927822113, + "learning_rate": 3.887115970893762e-06, + "loss": 0.0653, + "step": 46988 + }, + { + "epoch": 0.8381015232047944, + "grad_norm": 0.33748525381088257, + "learning_rate": 3.886282453237369e-06, + "loss": 0.0744, + "step": 46989 + }, + { + "epoch": 0.838119359326508, + "grad_norm": 0.24912230670452118, + "learning_rate": 3.885449017425039e-06, + "loss": 0.0823, + "step": 46990 + }, + { + "epoch": 0.8381371954482217, + "grad_norm": 0.2941327393054962, + "learning_rate": 3.8846156634600005e-06, + "loss": 0.1199, + "step": 46991 + }, + { + "epoch": 0.8381550315699354, + "grad_norm": 0.24996671080589294, + "learning_rate": 3.883782391345478e-06, + "loss": 0.0876, + "step": 46992 + }, + { + "epoch": 0.8381728676916491, + "grad_norm": 0.26333895325660706, + "learning_rate": 3.882949201084715e-06, + "loss": 0.1206, + "step": 46993 + }, + { + "epoch": 0.8381907038133628, + "grad_norm": 0.34092408418655396, + "learning_rate": 3.882116092680926e-06, + "loss": 0.132, + "step": 46994 + }, + { + "epoch": 0.8382085399350765, + "grad_norm": 0.26638486981391907, + "learning_rate": 3.881283066137359e-06, + "loss": 0.1176, + "step": 46995 + }, + { + "epoch": 0.8382263760567902, + "grad_norm": 0.2559550106525421, + "learning_rate": 3.880450121457229e-06, + "loss": 0.1165, + "step": 46996 + }, + { + "epoch": 0.8382442121785039, + "grad_norm": 0.20587700605392456, + "learning_rate": 3.879617258643767e-06, + "loss": 0.0732, + "step": 46997 + }, + { + "epoch": 0.8382620483002176, + "grad_norm": 0.24560266733169556, + "learning_rate": 3.878784477700206e-06, + "loss": 0.0987, + "step": 46998 + }, + { + "epoch": 0.8382798844219312, + "grad_norm": 0.21615678071975708, + "learning_rate": 3.877951778629773e-06, + "loss": 0.1035, + "step": 46999 + }, + { + "epoch": 0.838297720543645, + "grad_norm": 0.3579657971858978, + "learning_rate": 3.8771191614356935e-06, + "loss": 0.1236, + "step": 47000 + }, + { + "epoch": 0.838297720543645, + "eval_loss": 0.10923902690410614, + "eval_runtime": 108.4643, + "eval_samples_per_second": 9.441, + "eval_steps_per_second": 1.577, + "step": 47000 + }, + { + "epoch": 0.8383155566653587, + "grad_norm": 0.30751803517341614, + "learning_rate": 3.876286626121189e-06, + "loss": 0.1314, + "step": 47001 + }, + { + "epoch": 0.8383333927870724, + "grad_norm": 0.3310813009738922, + "learning_rate": 3.875454172689502e-06, + "loss": 0.1384, + "step": 47002 + }, + { + "epoch": 0.8383512289087861, + "grad_norm": 0.2768760621547699, + "learning_rate": 3.874621801143849e-06, + "loss": 0.1063, + "step": 47003 + }, + { + "epoch": 0.8383690650304998, + "grad_norm": 0.41729825735092163, + "learning_rate": 3.873789511487458e-06, + "loss": 0.1367, + "step": 47004 + }, + { + "epoch": 0.8383869011522135, + "grad_norm": 0.26368626952171326, + "learning_rate": 3.8729573037235515e-06, + "loss": 0.1624, + "step": 47005 + }, + { + "epoch": 0.8384047372739272, + "grad_norm": 0.27840062975883484, + "learning_rate": 3.8721251778553626e-06, + "loss": 0.0701, + "step": 47006 + }, + { + "epoch": 0.8384225733956409, + "grad_norm": 0.19710266590118408, + "learning_rate": 3.871293133886117e-06, + "loss": 0.0829, + "step": 47007 + }, + { + "epoch": 0.8384404095173545, + "grad_norm": 0.3673233389854431, + "learning_rate": 3.870461171819029e-06, + "loss": 0.122, + "step": 47008 + }, + { + "epoch": 0.8384582456390682, + "grad_norm": 0.28950875997543335, + "learning_rate": 3.869629291657337e-06, + "loss": 0.1261, + "step": 47009 + }, + { + "epoch": 0.8384760817607819, + "grad_norm": 0.22746475040912628, + "learning_rate": 3.868797493404255e-06, + "loss": 0.0651, + "step": 47010 + }, + { + "epoch": 0.8384939178824956, + "grad_norm": 0.2901719808578491, + "learning_rate": 3.8679657770630175e-06, + "loss": 0.1506, + "step": 47011 + }, + { + "epoch": 0.8385117540042093, + "grad_norm": 0.25291070342063904, + "learning_rate": 3.8671341426368444e-06, + "loss": 0.18, + "step": 47012 + }, + { + "epoch": 0.838529590125923, + "grad_norm": 0.40263795852661133, + "learning_rate": 3.866302590128959e-06, + "loss": 0.1225, + "step": 47013 + }, + { + "epoch": 0.8385474262476367, + "grad_norm": 0.24514323472976685, + "learning_rate": 3.865471119542577e-06, + "loss": 0.1131, + "step": 47014 + }, + { + "epoch": 0.8385652623693504, + "grad_norm": 0.2155514657497406, + "learning_rate": 3.864639730880934e-06, + "loss": 0.1073, + "step": 47015 + }, + { + "epoch": 0.838583098491064, + "grad_norm": 0.323479026556015, + "learning_rate": 3.86380842414725e-06, + "loss": 0.1243, + "step": 47016 + }, + { + "epoch": 0.8386009346127778, + "grad_norm": 0.24916298687458038, + "learning_rate": 3.862977199344742e-06, + "loss": 0.12, + "step": 47017 + }, + { + "epoch": 0.8386187707344915, + "grad_norm": 0.49524566531181335, + "learning_rate": 3.862146056476629e-06, + "loss": 0.1144, + "step": 47018 + }, + { + "epoch": 0.8386366068562052, + "grad_norm": 0.2047007977962494, + "learning_rate": 3.861314995546145e-06, + "loss": 0.1089, + "step": 47019 + }, + { + "epoch": 0.8386544429779189, + "grad_norm": 0.321159303188324, + "learning_rate": 3.860484016556507e-06, + "loss": 0.14, + "step": 47020 + }, + { + "epoch": 0.8386722790996326, + "grad_norm": 0.2594822645187378, + "learning_rate": 3.8596531195109256e-06, + "loss": 0.1322, + "step": 47021 + }, + { + "epoch": 0.8386901152213463, + "grad_norm": 0.22090981900691986, + "learning_rate": 3.858822304412638e-06, + "loss": 0.107, + "step": 47022 + }, + { + "epoch": 0.83870795134306, + "grad_norm": 0.3495054841041565, + "learning_rate": 3.857991571264852e-06, + "loss": 0.0941, + "step": 47023 + }, + { + "epoch": 0.8387257874647737, + "grad_norm": 0.21863503754138947, + "learning_rate": 3.857160920070796e-06, + "loss": 0.0771, + "step": 47024 + }, + { + "epoch": 0.8387436235864874, + "grad_norm": 0.3025311827659607, + "learning_rate": 3.85633035083369e-06, + "loss": 0.0721, + "step": 47025 + }, + { + "epoch": 0.838761459708201, + "grad_norm": 0.19793066382408142, + "learning_rate": 3.855499863556747e-06, + "loss": 0.0827, + "step": 47026 + }, + { + "epoch": 0.8387792958299147, + "grad_norm": 0.23742131888866425, + "learning_rate": 3.8546694582431845e-06, + "loss": 0.1133, + "step": 47027 + }, + { + "epoch": 0.8387971319516284, + "grad_norm": 0.26667022705078125, + "learning_rate": 3.853839134896234e-06, + "loss": 0.1008, + "step": 47028 + }, + { + "epoch": 0.8388149680733421, + "grad_norm": 0.31092989444732666, + "learning_rate": 3.853008893519105e-06, + "loss": 0.114, + "step": 47029 + }, + { + "epoch": 0.8388328041950558, + "grad_norm": 0.19924920797348022, + "learning_rate": 3.85217873411502e-06, + "loss": 0.1028, + "step": 47030 + }, + { + "epoch": 0.8388506403167695, + "grad_norm": 0.27402082085609436, + "learning_rate": 3.851348656687187e-06, + "loss": 0.1854, + "step": 47031 + }, + { + "epoch": 0.8388684764384832, + "grad_norm": 0.2764783799648285, + "learning_rate": 3.850518661238836e-06, + "loss": 0.0718, + "step": 47032 + }, + { + "epoch": 0.8388863125601969, + "grad_norm": 0.2881247401237488, + "learning_rate": 3.849688747773181e-06, + "loss": 0.1085, + "step": 47033 + }, + { + "epoch": 0.8389041486819107, + "grad_norm": 0.2363775372505188, + "learning_rate": 3.848858916293438e-06, + "loss": 0.1064, + "step": 47034 + }, + { + "epoch": 0.8389219848036243, + "grad_norm": 0.2641986310482025, + "learning_rate": 3.8480291668028165e-06, + "loss": 0.1458, + "step": 47035 + }, + { + "epoch": 0.838939820925338, + "grad_norm": 0.2976537048816681, + "learning_rate": 3.847199499304543e-06, + "loss": 0.089, + "step": 47036 + }, + { + "epoch": 0.8389576570470517, + "grad_norm": 0.3063163757324219, + "learning_rate": 3.846369913801828e-06, + "loss": 0.0853, + "step": 47037 + }, + { + "epoch": 0.8389754931687654, + "grad_norm": 0.30863767862319946, + "learning_rate": 3.845540410297893e-06, + "loss": 0.1223, + "step": 47038 + }, + { + "epoch": 0.8389933292904791, + "grad_norm": 0.37166640162467957, + "learning_rate": 3.844710988795952e-06, + "loss": 0.1261, + "step": 47039 + }, + { + "epoch": 0.8390111654121928, + "grad_norm": 0.26229560375213623, + "learning_rate": 3.8438816492992106e-06, + "loss": 0.1092, + "step": 47040 + }, + { + "epoch": 0.8390290015339065, + "grad_norm": 0.2673822343349457, + "learning_rate": 3.843052391810897e-06, + "loss": 0.0785, + "step": 47041 + }, + { + "epoch": 0.8390468376556202, + "grad_norm": 0.31263089179992676, + "learning_rate": 3.84222321633422e-06, + "loss": 0.1425, + "step": 47042 + }, + { + "epoch": 0.8390646737773338, + "grad_norm": 0.24355749785900116, + "learning_rate": 3.841394122872394e-06, + "loss": 0.1108, + "step": 47043 + }, + { + "epoch": 0.8390825098990475, + "grad_norm": 0.24575401842594147, + "learning_rate": 3.840565111428626e-06, + "loss": 0.0815, + "step": 47044 + }, + { + "epoch": 0.8391003460207612, + "grad_norm": 0.48401495814323425, + "learning_rate": 3.83973618200614e-06, + "loss": 0.087, + "step": 47045 + }, + { + "epoch": 0.8391181821424749, + "grad_norm": 0.2347661852836609, + "learning_rate": 3.838907334608146e-06, + "loss": 0.0922, + "step": 47046 + }, + { + "epoch": 0.8391360182641886, + "grad_norm": 0.31247958540916443, + "learning_rate": 3.838078569237857e-06, + "loss": 0.1571, + "step": 47047 + }, + { + "epoch": 0.8391538543859023, + "grad_norm": 0.25748899579048157, + "learning_rate": 3.837249885898481e-06, + "loss": 0.1124, + "step": 47048 + }, + { + "epoch": 0.839171690507616, + "grad_norm": 0.44267764687538147, + "learning_rate": 3.836421284593236e-06, + "loss": 0.1052, + "step": 47049 + }, + { + "epoch": 0.8391895266293298, + "grad_norm": 0.31609952449798584, + "learning_rate": 3.835592765325327e-06, + "loss": 0.1443, + "step": 47050 + }, + { + "epoch": 0.8392073627510435, + "grad_norm": 0.37156808376312256, + "learning_rate": 3.834764328097976e-06, + "loss": 0.1624, + "step": 47051 + }, + { + "epoch": 0.8392251988727571, + "grad_norm": 0.3659432828426361, + "learning_rate": 3.833935972914388e-06, + "loss": 0.0868, + "step": 47052 + }, + { + "epoch": 0.8392430349944708, + "grad_norm": 0.3445312976837158, + "learning_rate": 3.833107699777768e-06, + "loss": 0.1182, + "step": 47053 + }, + { + "epoch": 0.8392608711161845, + "grad_norm": 0.25811076164245605, + "learning_rate": 3.832279508691344e-06, + "loss": 0.0863, + "step": 47054 + }, + { + "epoch": 0.8392787072378982, + "grad_norm": 0.2872600555419922, + "learning_rate": 3.8314513996583115e-06, + "loss": 0.1272, + "step": 47055 + }, + { + "epoch": 0.8392965433596119, + "grad_norm": 0.24091780185699463, + "learning_rate": 3.830623372681885e-06, + "loss": 0.1253, + "step": 47056 + }, + { + "epoch": 0.8393143794813256, + "grad_norm": 0.250893235206604, + "learning_rate": 3.82979542776527e-06, + "loss": 0.0907, + "step": 47057 + }, + { + "epoch": 0.8393322156030393, + "grad_norm": 0.24283510446548462, + "learning_rate": 3.828967564911684e-06, + "loss": 0.136, + "step": 47058 + }, + { + "epoch": 0.839350051724753, + "grad_norm": 0.21466827392578125, + "learning_rate": 3.828139784124335e-06, + "loss": 0.1238, + "step": 47059 + }, + { + "epoch": 0.8393678878464667, + "grad_norm": 0.26720157265663147, + "learning_rate": 3.827312085406426e-06, + "loss": 0.1198, + "step": 47060 + }, + { + "epoch": 0.8393857239681803, + "grad_norm": 0.22254715859889984, + "learning_rate": 3.826484468761168e-06, + "loss": 0.1126, + "step": 47061 + }, + { + "epoch": 0.839403560089894, + "grad_norm": 0.2646508514881134, + "learning_rate": 3.825656934191763e-06, + "loss": 0.0744, + "step": 47062 + }, + { + "epoch": 0.8394213962116077, + "grad_norm": 0.22626057267189026, + "learning_rate": 3.824829481701434e-06, + "loss": 0.1143, + "step": 47063 + }, + { + "epoch": 0.8394392323333214, + "grad_norm": 0.2767675220966339, + "learning_rate": 3.8240021112933706e-06, + "loss": 0.1091, + "step": 47064 + }, + { + "epoch": 0.8394570684550351, + "grad_norm": 0.25615358352661133, + "learning_rate": 3.823174822970796e-06, + "loss": 0.1448, + "step": 47065 + }, + { + "epoch": 0.8394749045767488, + "grad_norm": 0.2957111597061157, + "learning_rate": 3.822347616736904e-06, + "loss": 0.0643, + "step": 47066 + }, + { + "epoch": 0.8394927406984626, + "grad_norm": 0.24870948493480682, + "learning_rate": 3.821520492594913e-06, + "loss": 0.1296, + "step": 47067 + }, + { + "epoch": 0.8395105768201763, + "grad_norm": 0.29272136092185974, + "learning_rate": 3.820693450548024e-06, + "loss": 0.1187, + "step": 47068 + }, + { + "epoch": 0.83952841294189, + "grad_norm": 0.25594907999038696, + "learning_rate": 3.819866490599442e-06, + "loss": 0.1481, + "step": 47069 + }, + { + "epoch": 0.8395462490636036, + "grad_norm": 0.21848732233047485, + "learning_rate": 3.819039612752367e-06, + "loss": 0.0946, + "step": 47070 + }, + { + "epoch": 0.8395640851853173, + "grad_norm": 0.2692481279373169, + "learning_rate": 3.818212817010014e-06, + "loss": 0.118, + "step": 47071 + }, + { + "epoch": 0.839581921307031, + "grad_norm": 0.26383277773857117, + "learning_rate": 3.8173861033755864e-06, + "loss": 0.1263, + "step": 47072 + }, + { + "epoch": 0.8395997574287447, + "grad_norm": 0.2758817672729492, + "learning_rate": 3.8165594718522855e-06, + "loss": 0.1059, + "step": 47073 + }, + { + "epoch": 0.8396175935504584, + "grad_norm": 0.26964548230171204, + "learning_rate": 3.815732922443319e-06, + "loss": 0.1155, + "step": 47074 + }, + { + "epoch": 0.8396354296721721, + "grad_norm": 0.34769174456596375, + "learning_rate": 3.814906455151879e-06, + "loss": 0.1203, + "step": 47075 + }, + { + "epoch": 0.8396532657938858, + "grad_norm": 0.31437820196151733, + "learning_rate": 3.8140800699811878e-06, + "loss": 0.144, + "step": 47076 + }, + { + "epoch": 0.8396711019155995, + "grad_norm": 0.46026819944381714, + "learning_rate": 3.8132537669344325e-06, + "loss": 0.1693, + "step": 47077 + }, + { + "epoch": 0.8396889380373131, + "grad_norm": 0.29425153136253357, + "learning_rate": 3.8124275460148268e-06, + "loss": 0.0718, + "step": 47078 + }, + { + "epoch": 0.8397067741590268, + "grad_norm": 0.31868693232536316, + "learning_rate": 3.811601407225568e-06, + "loss": 0.1218, + "step": 47079 + }, + { + "epoch": 0.8397246102807405, + "grad_norm": 0.4028632640838623, + "learning_rate": 3.8107753505698644e-06, + "loss": 0.1006, + "step": 47080 + }, + { + "epoch": 0.8397424464024542, + "grad_norm": 0.2891925275325775, + "learning_rate": 3.809949376050914e-06, + "loss": 0.0761, + "step": 47081 + }, + { + "epoch": 0.8397602825241679, + "grad_norm": 0.24664293229579926, + "learning_rate": 3.8091234836719168e-06, + "loss": 0.1266, + "step": 47082 + }, + { + "epoch": 0.8397781186458816, + "grad_norm": 0.25259220600128174, + "learning_rate": 3.8082976734360733e-06, + "loss": 0.1418, + "step": 47083 + }, + { + "epoch": 0.8397959547675954, + "grad_norm": 0.24793516099452972, + "learning_rate": 3.8074719453465912e-06, + "loss": 0.0996, + "step": 47084 + }, + { + "epoch": 0.8398137908893091, + "grad_norm": 0.31412291526794434, + "learning_rate": 3.8066462994066686e-06, + "loss": 0.1953, + "step": 47085 + }, + { + "epoch": 0.8398316270110228, + "grad_norm": 0.3072883188724518, + "learning_rate": 3.805820735619506e-06, + "loss": 0.1475, + "step": 47086 + }, + { + "epoch": 0.8398494631327365, + "grad_norm": 0.2971581518650055, + "learning_rate": 3.8049952539883e-06, + "loss": 0.057, + "step": 47087 + }, + { + "epoch": 0.8398672992544501, + "grad_norm": 0.3154681622982025, + "learning_rate": 3.8041698545162484e-06, + "loss": 0.1271, + "step": 47088 + }, + { + "epoch": 0.8398851353761638, + "grad_norm": 0.2794531583786011, + "learning_rate": 3.80334453720656e-06, + "loss": 0.1074, + "step": 47089 + }, + { + "epoch": 0.8399029714978775, + "grad_norm": 0.250113844871521, + "learning_rate": 3.8025193020624297e-06, + "loss": 0.1052, + "step": 47090 + }, + { + "epoch": 0.8399208076195912, + "grad_norm": 0.20722746849060059, + "learning_rate": 3.801694149087051e-06, + "loss": 0.0753, + "step": 47091 + }, + { + "epoch": 0.8399386437413049, + "grad_norm": 0.3281230330467224, + "learning_rate": 3.8008690782836283e-06, + "loss": 0.1639, + "step": 47092 + }, + { + "epoch": 0.8399564798630186, + "grad_norm": 0.18458016216754913, + "learning_rate": 3.8000440896553636e-06, + "loss": 0.0763, + "step": 47093 + }, + { + "epoch": 0.8399743159847323, + "grad_norm": 0.304382860660553, + "learning_rate": 3.7992191832054493e-06, + "loss": 0.14, + "step": 47094 + }, + { + "epoch": 0.839992152106446, + "grad_norm": 0.22286227345466614, + "learning_rate": 3.7983943589370852e-06, + "loss": 0.13, + "step": 47095 + }, + { + "epoch": 0.8400099882281596, + "grad_norm": 0.2638026177883148, + "learning_rate": 3.797569616853461e-06, + "loss": 0.1371, + "step": 47096 + }, + { + "epoch": 0.8400278243498733, + "grad_norm": 0.2293182909488678, + "learning_rate": 3.7967449569577847e-06, + "loss": 0.0941, + "step": 47097 + }, + { + "epoch": 0.840045660471587, + "grad_norm": 0.2778116464614868, + "learning_rate": 3.795920379253251e-06, + "loss": 0.1189, + "step": 47098 + }, + { + "epoch": 0.8400634965933007, + "grad_norm": 0.2454736977815628, + "learning_rate": 3.7950958837430523e-06, + "loss": 0.0714, + "step": 47099 + }, + { + "epoch": 0.8400813327150144, + "grad_norm": 0.2624067962169647, + "learning_rate": 3.7942714704303854e-06, + "loss": 0.0935, + "step": 47100 + }, + { + "epoch": 0.8400991688367282, + "grad_norm": 0.2683139145374298, + "learning_rate": 3.79344713931844e-06, + "loss": 0.1073, + "step": 47101 + }, + { + "epoch": 0.8401170049584419, + "grad_norm": 0.22290875017642975, + "learning_rate": 3.7926228904104245e-06, + "loss": 0.1077, + "step": 47102 + }, + { + "epoch": 0.8401348410801556, + "grad_norm": 0.3065320551395416, + "learning_rate": 3.7917987237095277e-06, + "loss": 0.1504, + "step": 47103 + }, + { + "epoch": 0.8401526772018693, + "grad_norm": 0.3087966740131378, + "learning_rate": 3.7909746392189388e-06, + "loss": 0.1338, + "step": 47104 + }, + { + "epoch": 0.840170513323583, + "grad_norm": 0.2408827394247055, + "learning_rate": 3.7901506369418636e-06, + "loss": 0.0934, + "step": 47105 + }, + { + "epoch": 0.8401883494452966, + "grad_norm": 0.24439117312431335, + "learning_rate": 3.7893267168814833e-06, + "loss": 0.1007, + "step": 47106 + }, + { + "epoch": 0.8402061855670103, + "grad_norm": 0.2937263250350952, + "learning_rate": 3.788502879041006e-06, + "loss": 0.1313, + "step": 47107 + }, + { + "epoch": 0.840224021688724, + "grad_norm": 0.29831087589263916, + "learning_rate": 3.7876791234236155e-06, + "loss": 0.1037, + "step": 47108 + }, + { + "epoch": 0.8402418578104377, + "grad_norm": 0.2955385744571686, + "learning_rate": 3.7868554500325033e-06, + "loss": 0.1233, + "step": 47109 + }, + { + "epoch": 0.8402596939321514, + "grad_norm": 0.29125431180000305, + "learning_rate": 3.7860318588708697e-06, + "loss": 0.1003, + "step": 47110 + }, + { + "epoch": 0.8402775300538651, + "grad_norm": 0.35177454352378845, + "learning_rate": 3.7852083499419073e-06, + "loss": 0.1228, + "step": 47111 + }, + { + "epoch": 0.8402953661755788, + "grad_norm": 0.35921815037727356, + "learning_rate": 3.7843849232488017e-06, + "loss": 0.1427, + "step": 47112 + }, + { + "epoch": 0.8403132022972924, + "grad_norm": 0.24147279560565948, + "learning_rate": 3.7835615787947475e-06, + "loss": 0.0934, + "step": 47113 + }, + { + "epoch": 0.8403310384190061, + "grad_norm": 0.24416770040988922, + "learning_rate": 3.7827383165829343e-06, + "loss": 0.1241, + "step": 47114 + }, + { + "epoch": 0.8403488745407198, + "grad_norm": 0.2558423578739166, + "learning_rate": 3.7819151366165566e-06, + "loss": 0.0878, + "step": 47115 + }, + { + "epoch": 0.8403667106624335, + "grad_norm": 0.35122865438461304, + "learning_rate": 3.7810920388988093e-06, + "loss": 0.1217, + "step": 47116 + }, + { + "epoch": 0.8403845467841472, + "grad_norm": 0.2535398006439209, + "learning_rate": 3.7802690234328754e-06, + "loss": 0.109, + "step": 47117 + }, + { + "epoch": 0.840402382905861, + "grad_norm": 0.26111915707588196, + "learning_rate": 3.7794460902219416e-06, + "loss": 0.1035, + "step": 47118 + }, + { + "epoch": 0.8404202190275747, + "grad_norm": 0.22617845237255096, + "learning_rate": 3.778623239269202e-06, + "loss": 0.0858, + "step": 47119 + }, + { + "epoch": 0.8404380551492884, + "grad_norm": 0.4816853106021881, + "learning_rate": 3.777800470577858e-06, + "loss": 0.1381, + "step": 47120 + }, + { + "epoch": 0.8404558912710021, + "grad_norm": 0.3486986756324768, + "learning_rate": 3.7769777841510894e-06, + "loss": 0.1224, + "step": 47121 + }, + { + "epoch": 0.8404737273927158, + "grad_norm": 0.28654128313064575, + "learning_rate": 3.7761551799920773e-06, + "loss": 0.119, + "step": 47122 + }, + { + "epoch": 0.8404915635144294, + "grad_norm": 0.25931915640830994, + "learning_rate": 3.7753326581040247e-06, + "loss": 0.1158, + "step": 47123 + }, + { + "epoch": 0.8405093996361431, + "grad_norm": 0.22576895356178284, + "learning_rate": 3.7745102184901128e-06, + "loss": 0.1459, + "step": 47124 + }, + { + "epoch": 0.8405272357578568, + "grad_norm": 0.22982607781887054, + "learning_rate": 3.7736878611535297e-06, + "loss": 0.1105, + "step": 47125 + }, + { + "epoch": 0.8405450718795705, + "grad_norm": 0.3667536675930023, + "learning_rate": 3.7728655860974653e-06, + "loss": 0.1097, + "step": 47126 + }, + { + "epoch": 0.8405629080012842, + "grad_norm": 0.23805628716945648, + "learning_rate": 3.7720433933251004e-06, + "loss": 0.1245, + "step": 47127 + }, + { + "epoch": 0.8405807441229979, + "grad_norm": 0.26823189854621887, + "learning_rate": 3.771221282839632e-06, + "loss": 0.1295, + "step": 47128 + }, + { + "epoch": 0.8405985802447116, + "grad_norm": 0.2834422290325165, + "learning_rate": 3.7703992546442414e-06, + "loss": 0.1266, + "step": 47129 + }, + { + "epoch": 0.8406164163664253, + "grad_norm": 0.28688740730285645, + "learning_rate": 3.769577308742117e-06, + "loss": 0.0754, + "step": 47130 + }, + { + "epoch": 0.8406342524881389, + "grad_norm": 0.2513481378555298, + "learning_rate": 3.768755445136438e-06, + "loss": 0.1497, + "step": 47131 + }, + { + "epoch": 0.8406520886098526, + "grad_norm": 0.42138680815696716, + "learning_rate": 3.7679336638304004e-06, + "loss": 0.1684, + "step": 47132 + }, + { + "epoch": 0.8406699247315663, + "grad_norm": 0.21968132257461548, + "learning_rate": 3.7671119648271803e-06, + "loss": 0.0901, + "step": 47133 + }, + { + "epoch": 0.84068776085328, + "grad_norm": 0.2773551940917969, + "learning_rate": 3.766290348129972e-06, + "loss": 0.0878, + "step": 47134 + }, + { + "epoch": 0.8407055969749938, + "grad_norm": 0.13072916865348816, + "learning_rate": 3.7654688137419543e-06, + "loss": 0.04, + "step": 47135 + }, + { + "epoch": 0.8407234330967075, + "grad_norm": 0.35075798630714417, + "learning_rate": 3.764647361666315e-06, + "loss": 0.1314, + "step": 47136 + }, + { + "epoch": 0.8407412692184212, + "grad_norm": 0.30211958289146423, + "learning_rate": 3.763825991906239e-06, + "loss": 0.1325, + "step": 47137 + }, + { + "epoch": 0.8407591053401349, + "grad_norm": 0.2839547097682953, + "learning_rate": 3.763004704464909e-06, + "loss": 0.1234, + "step": 47138 + }, + { + "epoch": 0.8407769414618486, + "grad_norm": 0.2972462475299835, + "learning_rate": 3.762183499345506e-06, + "loss": 0.1574, + "step": 47139 + }, + { + "epoch": 0.8407947775835622, + "grad_norm": 0.3199516236782074, + "learning_rate": 3.7613623765512113e-06, + "loss": 0.1281, + "step": 47140 + }, + { + "epoch": 0.8408126137052759, + "grad_norm": 0.25842055678367615, + "learning_rate": 3.760541336085216e-06, + "loss": 0.1084, + "step": 47141 + }, + { + "epoch": 0.8408304498269896, + "grad_norm": 0.3160461485385895, + "learning_rate": 3.7597203779506982e-06, + "loss": 0.1834, + "step": 47142 + }, + { + "epoch": 0.8408482859487033, + "grad_norm": 0.2349184900522232, + "learning_rate": 3.7588995021508424e-06, + "loss": 0.1533, + "step": 47143 + }, + { + "epoch": 0.840866122070417, + "grad_norm": 0.22943779826164246, + "learning_rate": 3.75807870868882e-06, + "loss": 0.1094, + "step": 47144 + }, + { + "epoch": 0.8408839581921307, + "grad_norm": 0.27372583746910095, + "learning_rate": 3.7572579975678284e-06, + "loss": 0.1123, + "step": 47145 + }, + { + "epoch": 0.8409017943138444, + "grad_norm": 0.2682541012763977, + "learning_rate": 3.7564373687910354e-06, + "loss": 0.1846, + "step": 47146 + }, + { + "epoch": 0.8409196304355581, + "grad_norm": 0.2469344586133957, + "learning_rate": 3.755616822361635e-06, + "loss": 0.1391, + "step": 47147 + }, + { + "epoch": 0.8409374665572718, + "grad_norm": 0.21677681803703308, + "learning_rate": 3.7547963582827974e-06, + "loss": 0.1145, + "step": 47148 + }, + { + "epoch": 0.8409553026789854, + "grad_norm": 0.2241763323545456, + "learning_rate": 3.753975976557708e-06, + "loss": 0.1464, + "step": 47149 + }, + { + "epoch": 0.8409731388006991, + "grad_norm": 0.2170475572347641, + "learning_rate": 3.7531556771895486e-06, + "loss": 0.1003, + "step": 47150 + }, + { + "epoch": 0.8409909749224128, + "grad_norm": 0.24082481861114502, + "learning_rate": 3.752335460181497e-06, + "loss": 0.1018, + "step": 47151 + }, + { + "epoch": 0.8410088110441266, + "grad_norm": 0.21301297843456268, + "learning_rate": 3.751515325536731e-06, + "loss": 0.0804, + "step": 47152 + }, + { + "epoch": 0.8410266471658403, + "grad_norm": 0.2911536991596222, + "learning_rate": 3.7506952732584256e-06, + "loss": 0.1019, + "step": 47153 + }, + { + "epoch": 0.841044483287554, + "grad_norm": 0.35989171266555786, + "learning_rate": 3.74987530334977e-06, + "loss": 0.1255, + "step": 47154 + }, + { + "epoch": 0.8410623194092677, + "grad_norm": 0.21408697962760925, + "learning_rate": 3.7490554158139372e-06, + "loss": 0.0817, + "step": 47155 + }, + { + "epoch": 0.8410801555309814, + "grad_norm": 0.2765394151210785, + "learning_rate": 3.748235610654105e-06, + "loss": 0.1084, + "step": 47156 + }, + { + "epoch": 0.841097991652695, + "grad_norm": 0.1714177131652832, + "learning_rate": 3.7474158878734456e-06, + "loss": 0.0657, + "step": 47157 + }, + { + "epoch": 0.8411158277744087, + "grad_norm": 0.27308839559555054, + "learning_rate": 3.746596247475148e-06, + "loss": 0.1047, + "step": 47158 + }, + { + "epoch": 0.8411336638961224, + "grad_norm": 0.306439071893692, + "learning_rate": 3.7457766894623854e-06, + "loss": 0.0981, + "step": 47159 + }, + { + "epoch": 0.8411515000178361, + "grad_norm": 0.6013026237487793, + "learning_rate": 3.7449572138383266e-06, + "loss": 0.1126, + "step": 47160 + }, + { + "epoch": 0.8411693361395498, + "grad_norm": 0.29482531547546387, + "learning_rate": 3.744137820606161e-06, + "loss": 0.1117, + "step": 47161 + }, + { + "epoch": 0.8411871722612635, + "grad_norm": 0.269481360912323, + "learning_rate": 3.7433185097690533e-06, + "loss": 0.1343, + "step": 47162 + }, + { + "epoch": 0.8412050083829772, + "grad_norm": 0.30088502168655396, + "learning_rate": 3.7424992813301916e-06, + "loss": 0.148, + "step": 47163 + }, + { + "epoch": 0.8412228445046909, + "grad_norm": 0.3397041857242584, + "learning_rate": 3.741680135292744e-06, + "loss": 0.0479, + "step": 47164 + }, + { + "epoch": 0.8412406806264046, + "grad_norm": 0.24093548953533173, + "learning_rate": 3.740861071659885e-06, + "loss": 0.1071, + "step": 47165 + }, + { + "epoch": 0.8412585167481182, + "grad_norm": 0.32494139671325684, + "learning_rate": 3.7400420904347864e-06, + "loss": 0.17, + "step": 47166 + }, + { + "epoch": 0.8412763528698319, + "grad_norm": 0.34829822182655334, + "learning_rate": 3.7392231916206334e-06, + "loss": 0.1609, + "step": 47167 + }, + { + "epoch": 0.8412941889915457, + "grad_norm": 0.2576776146888733, + "learning_rate": 3.7384043752205944e-06, + "loss": 0.0842, + "step": 47168 + }, + { + "epoch": 0.8413120251132594, + "grad_norm": 0.19019296765327454, + "learning_rate": 3.737585641237845e-06, + "loss": 0.114, + "step": 47169 + }, + { + "epoch": 0.8413298612349731, + "grad_norm": 0.22499391436576843, + "learning_rate": 3.73676698967555e-06, + "loss": 0.0678, + "step": 47170 + }, + { + "epoch": 0.8413476973566868, + "grad_norm": 0.2032153308391571, + "learning_rate": 3.7359484205368973e-06, + "loss": 0.1002, + "step": 47171 + }, + { + "epoch": 0.8413655334784005, + "grad_norm": 0.2861258387565613, + "learning_rate": 3.7351299338250517e-06, + "loss": 0.0874, + "step": 47172 + }, + { + "epoch": 0.8413833696001142, + "grad_norm": 0.28178420662879944, + "learning_rate": 3.7343115295431884e-06, + "loss": 0.1331, + "step": 47173 + }, + { + "epoch": 0.8414012057218279, + "grad_norm": 0.2825622260570526, + "learning_rate": 3.7334932076944717e-06, + "loss": 0.1188, + "step": 47174 + }, + { + "epoch": 0.8414190418435415, + "grad_norm": 0.2185794860124588, + "learning_rate": 3.7326749682820794e-06, + "loss": 0.0711, + "step": 47175 + }, + { + "epoch": 0.8414368779652552, + "grad_norm": 0.22176779806613922, + "learning_rate": 3.7318568113091924e-06, + "loss": 0.1064, + "step": 47176 + }, + { + "epoch": 0.8414547140869689, + "grad_norm": 0.28279492259025574, + "learning_rate": 3.7310387367789745e-06, + "loss": 0.0701, + "step": 47177 + }, + { + "epoch": 0.8414725502086826, + "grad_norm": 0.36179807782173157, + "learning_rate": 3.730220744694596e-06, + "loss": 0.1244, + "step": 47178 + }, + { + "epoch": 0.8414903863303963, + "grad_norm": 0.3489610552787781, + "learning_rate": 3.7294028350592204e-06, + "loss": 0.1266, + "step": 47179 + }, + { + "epoch": 0.84150822245211, + "grad_norm": 0.32125207781791687, + "learning_rate": 3.728585007876034e-06, + "loss": 0.1113, + "step": 47180 + }, + { + "epoch": 0.8415260585738237, + "grad_norm": 0.27386870980262756, + "learning_rate": 3.7277672631481965e-06, + "loss": 0.1386, + "step": 47181 + }, + { + "epoch": 0.8415438946955374, + "grad_norm": 0.23990623652935028, + "learning_rate": 3.7269496008788845e-06, + "loss": 0.0609, + "step": 47182 + }, + { + "epoch": 0.841561730817251, + "grad_norm": 0.3674491047859192, + "learning_rate": 3.7261320210712547e-06, + "loss": 0.0983, + "step": 47183 + }, + { + "epoch": 0.8415795669389647, + "grad_norm": 0.3062206506729126, + "learning_rate": 3.7253145237284903e-06, + "loss": 0.1934, + "step": 47184 + }, + { + "epoch": 0.8415974030606785, + "grad_norm": 0.23742343485355377, + "learning_rate": 3.7244971088537556e-06, + "loss": 0.1062, + "step": 47185 + }, + { + "epoch": 0.8416152391823922, + "grad_norm": 0.24429085850715637, + "learning_rate": 3.7236797764502174e-06, + "loss": 0.0987, + "step": 47186 + }, + { + "epoch": 0.8416330753041059, + "grad_norm": 0.24329285323619843, + "learning_rate": 3.7228625265210428e-06, + "loss": 0.1077, + "step": 47187 + }, + { + "epoch": 0.8416509114258196, + "grad_norm": 0.25650450587272644, + "learning_rate": 3.7220453590694038e-06, + "loss": 0.0791, + "step": 47188 + }, + { + "epoch": 0.8416687475475333, + "grad_norm": 0.26111355423927307, + "learning_rate": 3.7212282740984624e-06, + "loss": 0.0947, + "step": 47189 + }, + { + "epoch": 0.841686583669247, + "grad_norm": 0.2582864463329315, + "learning_rate": 3.720411271611393e-06, + "loss": 0.0823, + "step": 47190 + }, + { + "epoch": 0.8417044197909607, + "grad_norm": 0.3363458812236786, + "learning_rate": 3.7195943516113637e-06, + "loss": 0.1281, + "step": 47191 + }, + { + "epoch": 0.8417222559126744, + "grad_norm": 0.30207955837249756, + "learning_rate": 3.7187775141015265e-06, + "loss": 0.1042, + "step": 47192 + }, + { + "epoch": 0.841740092034388, + "grad_norm": 0.23146916925907135, + "learning_rate": 3.7179607590850653e-06, + "loss": 0.0852, + "step": 47193 + }, + { + "epoch": 0.8417579281561017, + "grad_norm": 0.27197352051734924, + "learning_rate": 3.7171440865651412e-06, + "loss": 0.1207, + "step": 47194 + }, + { + "epoch": 0.8417757642778154, + "grad_norm": 0.33557239174842834, + "learning_rate": 3.7163274965449133e-06, + "loss": 0.1045, + "step": 47195 + }, + { + "epoch": 0.8417936003995291, + "grad_norm": 0.23681728541851044, + "learning_rate": 3.7155109890275487e-06, + "loss": 0.1272, + "step": 47196 + }, + { + "epoch": 0.8418114365212428, + "grad_norm": 0.3109634518623352, + "learning_rate": 3.714694564016219e-06, + "loss": 0.1025, + "step": 47197 + }, + { + "epoch": 0.8418292726429565, + "grad_norm": 0.2443607896566391, + "learning_rate": 3.713878221514086e-06, + "loss": 0.1103, + "step": 47198 + }, + { + "epoch": 0.8418471087646702, + "grad_norm": 0.17061349749565125, + "learning_rate": 3.7130619615243113e-06, + "loss": 0.0491, + "step": 47199 + }, + { + "epoch": 0.8418649448863839, + "grad_norm": 0.267388254404068, + "learning_rate": 3.7122457840500557e-06, + "loss": 0.1152, + "step": 47200 + }, + { + "epoch": 0.8418827810080975, + "grad_norm": 0.21858885884284973, + "learning_rate": 3.7114296890944947e-06, + "loss": 0.0826, + "step": 47201 + }, + { + "epoch": 0.8419006171298113, + "grad_norm": 0.1882113814353943, + "learning_rate": 3.710613676660779e-06, + "loss": 0.0611, + "step": 47202 + }, + { + "epoch": 0.841918453251525, + "grad_norm": 0.19960112869739532, + "learning_rate": 3.7097977467520832e-06, + "loss": 0.0676, + "step": 47203 + }, + { + "epoch": 0.8419362893732387, + "grad_norm": 0.24927645921707153, + "learning_rate": 3.7089818993715638e-06, + "loss": 0.1044, + "step": 47204 + }, + { + "epoch": 0.8419541254949524, + "grad_norm": 0.25596579909324646, + "learning_rate": 3.708166134522381e-06, + "loss": 0.0917, + "step": 47205 + }, + { + "epoch": 0.8419719616166661, + "grad_norm": 0.5513489246368408, + "learning_rate": 3.7073504522077028e-06, + "loss": 0.1379, + "step": 47206 + }, + { + "epoch": 0.8419897977383798, + "grad_norm": 0.2549701929092407, + "learning_rate": 3.7065348524306902e-06, + "loss": 0.0674, + "step": 47207 + }, + { + "epoch": 0.8420076338600935, + "grad_norm": 0.6302520036697388, + "learning_rate": 3.705719335194502e-06, + "loss": 0.1477, + "step": 47208 + }, + { + "epoch": 0.8420254699818072, + "grad_norm": 0.2891847491264343, + "learning_rate": 3.704903900502296e-06, + "loss": 0.0834, + "step": 47209 + }, + { + "epoch": 0.8420433061035208, + "grad_norm": 0.19554609060287476, + "learning_rate": 3.704088548357243e-06, + "loss": 0.0636, + "step": 47210 + }, + { + "epoch": 0.8420611422252345, + "grad_norm": 0.3935930132865906, + "learning_rate": 3.703273278762498e-06, + "loss": 0.1661, + "step": 47211 + }, + { + "epoch": 0.8420789783469482, + "grad_norm": 0.35310694575309753, + "learning_rate": 3.7024580917212225e-06, + "loss": 0.1535, + "step": 47212 + }, + { + "epoch": 0.8420968144686619, + "grad_norm": 0.32745763659477234, + "learning_rate": 3.701642987236567e-06, + "loss": 0.0668, + "step": 47213 + }, + { + "epoch": 0.8421146505903756, + "grad_norm": 0.2037838250398636, + "learning_rate": 3.7008279653117093e-06, + "loss": 0.0848, + "step": 47214 + }, + { + "epoch": 0.8421324867120893, + "grad_norm": 0.24979925155639648, + "learning_rate": 3.7000130259497944e-06, + "loss": 0.1398, + "step": 47215 + }, + { + "epoch": 0.842150322833803, + "grad_norm": 0.3978791832923889, + "learning_rate": 3.6991981691539835e-06, + "loss": 0.1498, + "step": 47216 + }, + { + "epoch": 0.8421681589555167, + "grad_norm": 0.23118583858013153, + "learning_rate": 3.6983833949274407e-06, + "loss": 0.1009, + "step": 47217 + }, + { + "epoch": 0.8421859950772304, + "grad_norm": 0.22183813154697418, + "learning_rate": 3.6975687032733185e-06, + "loss": 0.0854, + "step": 47218 + }, + { + "epoch": 0.8422038311989442, + "grad_norm": 0.3228885531425476, + "learning_rate": 3.696754094194782e-06, + "loss": 0.1048, + "step": 47219 + }, + { + "epoch": 0.8422216673206578, + "grad_norm": 0.2788088619709015, + "learning_rate": 3.695939567694984e-06, + "loss": 0.1152, + "step": 47220 + }, + { + "epoch": 0.8422395034423715, + "grad_norm": 0.2885431945323944, + "learning_rate": 3.695125123777082e-06, + "loss": 0.0798, + "step": 47221 + }, + { + "epoch": 0.8422573395640852, + "grad_norm": 0.22045758366584778, + "learning_rate": 3.694310762444228e-06, + "loss": 0.0895, + "step": 47222 + }, + { + "epoch": 0.8422751756857989, + "grad_norm": 0.2593652606010437, + "learning_rate": 3.6934964836995907e-06, + "loss": 0.084, + "step": 47223 + }, + { + "epoch": 0.8422930118075126, + "grad_norm": 0.2291940152645111, + "learning_rate": 3.69268228754632e-06, + "loss": 0.132, + "step": 47224 + }, + { + "epoch": 0.8423108479292263, + "grad_norm": 0.38843873143196106, + "learning_rate": 3.69186817398757e-06, + "loss": 0.0973, + "step": 47225 + }, + { + "epoch": 0.84232868405094, + "grad_norm": 0.2545664310455322, + "learning_rate": 3.6910541430264955e-06, + "loss": 0.1159, + "step": 47226 + }, + { + "epoch": 0.8423465201726537, + "grad_norm": 0.23164300620555878, + "learning_rate": 3.6902401946662585e-06, + "loss": 0.0816, + "step": 47227 + }, + { + "epoch": 0.8423643562943673, + "grad_norm": 0.22125941514968872, + "learning_rate": 3.6894263289100115e-06, + "loss": 0.0718, + "step": 47228 + }, + { + "epoch": 0.842382192416081, + "grad_norm": 0.3010319471359253, + "learning_rate": 3.6886125457609056e-06, + "loss": 0.1211, + "step": 47229 + }, + { + "epoch": 0.8424000285377947, + "grad_norm": 0.2706752419471741, + "learning_rate": 3.6877988452221014e-06, + "loss": 0.1108, + "step": 47230 + }, + { + "epoch": 0.8424178646595084, + "grad_norm": 0.27833816409111023, + "learning_rate": 3.6869852272967435e-06, + "loss": 0.1409, + "step": 47231 + }, + { + "epoch": 0.8424357007812221, + "grad_norm": 0.241594597697258, + "learning_rate": 3.686171691987997e-06, + "loss": 0.1379, + "step": 47232 + }, + { + "epoch": 0.8424535369029358, + "grad_norm": 0.3220077455043793, + "learning_rate": 3.6853582392990138e-06, + "loss": 0.095, + "step": 47233 + }, + { + "epoch": 0.8424713730246495, + "grad_norm": 0.3200768232345581, + "learning_rate": 3.684544869232942e-06, + "loss": 0.1218, + "step": 47234 + }, + { + "epoch": 0.8424892091463632, + "grad_norm": 0.20809081196784973, + "learning_rate": 3.6837315817929318e-06, + "loss": 0.0845, + "step": 47235 + }, + { + "epoch": 0.842507045268077, + "grad_norm": 0.2517867982387543, + "learning_rate": 3.6829183769821445e-06, + "loss": 0.1304, + "step": 47236 + }, + { + "epoch": 0.8425248813897906, + "grad_norm": 0.332599014043808, + "learning_rate": 3.6821052548037276e-06, + "loss": 0.0906, + "step": 47237 + }, + { + "epoch": 0.8425427175115043, + "grad_norm": 0.2423558384180069, + "learning_rate": 3.681292215260837e-06, + "loss": 0.1135, + "step": 47238 + }, + { + "epoch": 0.842560553633218, + "grad_norm": 0.18209552764892578, + "learning_rate": 3.6804792583566116e-06, + "loss": 0.1141, + "step": 47239 + }, + { + "epoch": 0.8425783897549317, + "grad_norm": 0.21611355245113373, + "learning_rate": 3.6796663840942215e-06, + "loss": 0.0859, + "step": 47240 + }, + { + "epoch": 0.8425962258766454, + "grad_norm": 0.34535863995552063, + "learning_rate": 3.6788535924768053e-06, + "loss": 0.2071, + "step": 47241 + }, + { + "epoch": 0.8426140619983591, + "grad_norm": 0.21731248497962952, + "learning_rate": 3.6780408835075192e-06, + "loss": 0.117, + "step": 47242 + }, + { + "epoch": 0.8426318981200728, + "grad_norm": 0.28919926285743713, + "learning_rate": 3.677228257189502e-06, + "loss": 0.0837, + "step": 47243 + }, + { + "epoch": 0.8426497342417865, + "grad_norm": 0.20098577439785004, + "learning_rate": 3.6764157135259213e-06, + "loss": 0.1228, + "step": 47244 + }, + { + "epoch": 0.8426675703635002, + "grad_norm": 0.24654437601566315, + "learning_rate": 3.67560325251991e-06, + "loss": 0.1268, + "step": 47245 + }, + { + "epoch": 0.8426854064852138, + "grad_norm": 0.24432142078876495, + "learning_rate": 3.674790874174636e-06, + "loss": 0.1164, + "step": 47246 + }, + { + "epoch": 0.8427032426069275, + "grad_norm": 0.2780130207538605, + "learning_rate": 3.673978578493234e-06, + "loss": 0.0699, + "step": 47247 + }, + { + "epoch": 0.8427210787286412, + "grad_norm": 0.2677513062953949, + "learning_rate": 3.67316636547885e-06, + "loss": 0.1012, + "step": 47248 + }, + { + "epoch": 0.8427389148503549, + "grad_norm": 0.23376131057739258, + "learning_rate": 3.672354235134648e-06, + "loss": 0.068, + "step": 47249 + }, + { + "epoch": 0.8427567509720686, + "grad_norm": 0.24621011316776276, + "learning_rate": 3.671542187463767e-06, + "loss": 0.1482, + "step": 47250 + }, + { + "epoch": 0.8427745870937823, + "grad_norm": 0.18394404649734497, + "learning_rate": 3.670730222469354e-06, + "loss": 0.0717, + "step": 47251 + }, + { + "epoch": 0.842792423215496, + "grad_norm": 0.3231929838657379, + "learning_rate": 3.6699183401545516e-06, + "loss": 0.1223, + "step": 47252 + }, + { + "epoch": 0.8428102593372098, + "grad_norm": 0.25396332144737244, + "learning_rate": 3.6691065405225212e-06, + "loss": 0.0893, + "step": 47253 + }, + { + "epoch": 0.8428280954589235, + "grad_norm": 0.265811562538147, + "learning_rate": 3.6682948235763988e-06, + "loss": 0.0967, + "step": 47254 + }, + { + "epoch": 0.8428459315806371, + "grad_norm": 0.3177051842212677, + "learning_rate": 3.6674831893193347e-06, + "loss": 0.1091, + "step": 47255 + }, + { + "epoch": 0.8428637677023508, + "grad_norm": 0.25777530670166016, + "learning_rate": 3.6666716377544653e-06, + "loss": 0.1545, + "step": 47256 + }, + { + "epoch": 0.8428816038240645, + "grad_norm": 0.288001149892807, + "learning_rate": 3.665860168884955e-06, + "loss": 0.0827, + "step": 47257 + }, + { + "epoch": 0.8428994399457782, + "grad_norm": 0.30498483777046204, + "learning_rate": 3.6650487827139312e-06, + "loss": 0.158, + "step": 47258 + }, + { + "epoch": 0.8429172760674919, + "grad_norm": 0.31337258219718933, + "learning_rate": 3.6642374792445504e-06, + "loss": 0.1595, + "step": 47259 + }, + { + "epoch": 0.8429351121892056, + "grad_norm": 0.2762129008769989, + "learning_rate": 3.6634262584799574e-06, + "loss": 0.0865, + "step": 47260 + }, + { + "epoch": 0.8429529483109193, + "grad_norm": 0.254638135433197, + "learning_rate": 3.662615120423288e-06, + "loss": 0.0836, + "step": 47261 + }, + { + "epoch": 0.842970784432633, + "grad_norm": 0.2897399663925171, + "learning_rate": 3.661804065077698e-06, + "loss": 0.1213, + "step": 47262 + }, + { + "epoch": 0.8429886205543466, + "grad_norm": 0.2596624791622162, + "learning_rate": 3.6609930924463242e-06, + "loss": 0.1179, + "step": 47263 + }, + { + "epoch": 0.8430064566760603, + "grad_norm": 0.27492207288742065, + "learning_rate": 3.6601822025323112e-06, + "loss": 0.0906, + "step": 47264 + }, + { + "epoch": 0.843024292797774, + "grad_norm": 0.29992106556892395, + "learning_rate": 3.659371395338798e-06, + "loss": 0.1697, + "step": 47265 + }, + { + "epoch": 0.8430421289194877, + "grad_norm": 0.2744263708591461, + "learning_rate": 3.658560670868938e-06, + "loss": 0.0537, + "step": 47266 + }, + { + "epoch": 0.8430599650412014, + "grad_norm": 0.26597514748573303, + "learning_rate": 3.6577500291258672e-06, + "loss": 0.07, + "step": 47267 + }, + { + "epoch": 0.8430778011629151, + "grad_norm": 0.28149116039276123, + "learning_rate": 3.6569394701127273e-06, + "loss": 0.0714, + "step": 47268 + }, + { + "epoch": 0.8430956372846289, + "grad_norm": 0.29164019227027893, + "learning_rate": 3.6561289938326635e-06, + "loss": 0.1425, + "step": 47269 + }, + { + "epoch": 0.8431134734063426, + "grad_norm": 0.2393653690814972, + "learning_rate": 3.6553186002888095e-06, + "loss": 0.0994, + "step": 47270 + }, + { + "epoch": 0.8431313095280563, + "grad_norm": 0.3910019099712372, + "learning_rate": 3.654508289484318e-06, + "loss": 0.1224, + "step": 47271 + }, + { + "epoch": 0.84314914564977, + "grad_norm": 0.3971771001815796, + "learning_rate": 3.65369806142232e-06, + "loss": 0.0783, + "step": 47272 + }, + { + "epoch": 0.8431669817714836, + "grad_norm": 0.26440882682800293, + "learning_rate": 3.6528879161059655e-06, + "loss": 0.1391, + "step": 47273 + }, + { + "epoch": 0.8431848178931973, + "grad_norm": 0.34841808676719666, + "learning_rate": 3.6520778535383853e-06, + "loss": 0.1167, + "step": 47274 + }, + { + "epoch": 0.843202654014911, + "grad_norm": 0.3352072238922119, + "learning_rate": 3.6512678737227304e-06, + "loss": 0.1141, + "step": 47275 + }, + { + "epoch": 0.8432204901366247, + "grad_norm": 0.29006505012512207, + "learning_rate": 3.650457976662136e-06, + "loss": 0.1092, + "step": 47276 + }, + { + "epoch": 0.8432383262583384, + "grad_norm": 0.2419443279504776, + "learning_rate": 3.6496481623597393e-06, + "loss": 0.0855, + "step": 47277 + }, + { + "epoch": 0.8432561623800521, + "grad_norm": 0.30216437578201294, + "learning_rate": 3.6488384308186743e-06, + "loss": 0.1055, + "step": 47278 + }, + { + "epoch": 0.8432739985017658, + "grad_norm": 0.23101922869682312, + "learning_rate": 3.648028782042093e-06, + "loss": 0.1323, + "step": 47279 + }, + { + "epoch": 0.8432918346234795, + "grad_norm": 0.39689844846725464, + "learning_rate": 3.6472192160331264e-06, + "loss": 0.1208, + "step": 47280 + }, + { + "epoch": 0.8433096707451931, + "grad_norm": 0.21119514107704163, + "learning_rate": 3.6464097327949142e-06, + "loss": 0.0729, + "step": 47281 + }, + { + "epoch": 0.8433275068669068, + "grad_norm": 0.2485380619764328, + "learning_rate": 3.64560033233059e-06, + "loss": 0.0981, + "step": 47282 + }, + { + "epoch": 0.8433453429886205, + "grad_norm": 0.21566466987133026, + "learning_rate": 3.6447910146432923e-06, + "loss": 0.1072, + "step": 47283 + }, + { + "epoch": 0.8433631791103342, + "grad_norm": 0.20807580649852753, + "learning_rate": 3.643981779736164e-06, + "loss": 0.0643, + "step": 47284 + }, + { + "epoch": 0.8433810152320479, + "grad_norm": 0.28790777921676636, + "learning_rate": 3.6431726276123345e-06, + "loss": 0.128, + "step": 47285 + }, + { + "epoch": 0.8433988513537617, + "grad_norm": 0.2902168333530426, + "learning_rate": 3.642363558274947e-06, + "loss": 0.1373, + "step": 47286 + }, + { + "epoch": 0.8434166874754754, + "grad_norm": 0.24145163595676422, + "learning_rate": 3.641554571727132e-06, + "loss": 0.0932, + "step": 47287 + }, + { + "epoch": 0.8434345235971891, + "grad_norm": 0.42242881655693054, + "learning_rate": 3.640745667972034e-06, + "loss": 0.1574, + "step": 47288 + }, + { + "epoch": 0.8434523597189028, + "grad_norm": 0.3094054162502289, + "learning_rate": 3.6399368470127813e-06, + "loss": 0.1037, + "step": 47289 + }, + { + "epoch": 0.8434701958406164, + "grad_norm": 0.2627965211868286, + "learning_rate": 3.639128108852513e-06, + "loss": 0.1186, + "step": 47290 + }, + { + "epoch": 0.8434880319623301, + "grad_norm": 0.2837042212486267, + "learning_rate": 3.6383194534943544e-06, + "loss": 0.1556, + "step": 47291 + }, + { + "epoch": 0.8435058680840438, + "grad_norm": 0.22625494003295898, + "learning_rate": 3.6375108809414555e-06, + "loss": 0.109, + "step": 47292 + }, + { + "epoch": 0.8435237042057575, + "grad_norm": 0.20810389518737793, + "learning_rate": 3.6367023911969417e-06, + "loss": 0.0677, + "step": 47293 + }, + { + "epoch": 0.8435415403274712, + "grad_norm": 0.2682685852050781, + "learning_rate": 3.635893984263947e-06, + "loss": 0.1439, + "step": 47294 + }, + { + "epoch": 0.8435593764491849, + "grad_norm": 0.25742673873901367, + "learning_rate": 3.635085660145607e-06, + "loss": 0.1165, + "step": 47295 + }, + { + "epoch": 0.8435772125708986, + "grad_norm": 0.37175577878952026, + "learning_rate": 3.63427741884505e-06, + "loss": 0.0821, + "step": 47296 + }, + { + "epoch": 0.8435950486926123, + "grad_norm": 0.22788120806217194, + "learning_rate": 3.6334692603654184e-06, + "loss": 0.1169, + "step": 47297 + }, + { + "epoch": 0.843612884814326, + "grad_norm": 0.2793791592121124, + "learning_rate": 3.6326611847098375e-06, + "loss": 0.1012, + "step": 47298 + }, + { + "epoch": 0.8436307209360396, + "grad_norm": 0.3146321773529053, + "learning_rate": 3.6318531918814373e-06, + "loss": 0.1108, + "step": 47299 + }, + { + "epoch": 0.8436485570577533, + "grad_norm": 0.29969334602355957, + "learning_rate": 3.6310452818833553e-06, + "loss": 0.1067, + "step": 47300 + }, + { + "epoch": 0.843666393179467, + "grad_norm": 0.26239240169525146, + "learning_rate": 3.6302374547187266e-06, + "loss": 0.151, + "step": 47301 + }, + { + "epoch": 0.8436842293011807, + "grad_norm": 0.2955058515071869, + "learning_rate": 3.6294297103906803e-06, + "loss": 0.0925, + "step": 47302 + }, + { + "epoch": 0.8437020654228945, + "grad_norm": 0.2752370536327362, + "learning_rate": 3.6286220489023444e-06, + "loss": 0.0749, + "step": 47303 + }, + { + "epoch": 0.8437199015446082, + "grad_norm": 0.2915973365306854, + "learning_rate": 3.6278144702568462e-06, + "loss": 0.1361, + "step": 47304 + }, + { + "epoch": 0.8437377376663219, + "grad_norm": 0.26207631826400757, + "learning_rate": 3.6270069744573252e-06, + "loss": 0.0861, + "step": 47305 + }, + { + "epoch": 0.8437555737880356, + "grad_norm": 0.2538292407989502, + "learning_rate": 3.6261995615069074e-06, + "loss": 0.1599, + "step": 47306 + }, + { + "epoch": 0.8437734099097493, + "grad_norm": 0.27881550788879395, + "learning_rate": 3.6253922314087225e-06, + "loss": 0.0918, + "step": 47307 + }, + { + "epoch": 0.8437912460314629, + "grad_norm": 0.32826316356658936, + "learning_rate": 3.624584984165899e-06, + "loss": 0.1032, + "step": 47308 + }, + { + "epoch": 0.8438090821531766, + "grad_norm": 0.35892581939697266, + "learning_rate": 3.6237778197815624e-06, + "loss": 0.1221, + "step": 47309 + }, + { + "epoch": 0.8438269182748903, + "grad_norm": 0.27105948328971863, + "learning_rate": 3.622970738258852e-06, + "loss": 0.1215, + "step": 47310 + }, + { + "epoch": 0.843844754396604, + "grad_norm": 0.2932094931602478, + "learning_rate": 3.6221637396008895e-06, + "loss": 0.1253, + "step": 47311 + }, + { + "epoch": 0.8438625905183177, + "grad_norm": 0.22844460606575012, + "learning_rate": 3.621356823810798e-06, + "loss": 0.0827, + "step": 47312 + }, + { + "epoch": 0.8438804266400314, + "grad_norm": 0.2548949122428894, + "learning_rate": 3.6205499908917196e-06, + "loss": 0.0986, + "step": 47313 + }, + { + "epoch": 0.8438982627617451, + "grad_norm": 0.24343626201152802, + "learning_rate": 3.6197432408467653e-06, + "loss": 0.0956, + "step": 47314 + }, + { + "epoch": 0.8439160988834588, + "grad_norm": 0.3302106559276581, + "learning_rate": 3.618936573679077e-06, + "loss": 0.1021, + "step": 47315 + }, + { + "epoch": 0.8439339350051724, + "grad_norm": 0.2927775979042053, + "learning_rate": 3.6181299893917777e-06, + "loss": 0.1189, + "step": 47316 + }, + { + "epoch": 0.8439517711268861, + "grad_norm": 0.4156448543071747, + "learning_rate": 3.6173234879879837e-06, + "loss": 0.1202, + "step": 47317 + }, + { + "epoch": 0.8439696072485998, + "grad_norm": 0.2190490961074829, + "learning_rate": 3.6165170694708343e-06, + "loss": 0.1087, + "step": 47318 + }, + { + "epoch": 0.8439874433703135, + "grad_norm": 0.42909204959869385, + "learning_rate": 3.61571073384345e-06, + "loss": 0.059, + "step": 47319 + }, + { + "epoch": 0.8440052794920273, + "grad_norm": 0.2620382606983185, + "learning_rate": 3.6149044811089576e-06, + "loss": 0.1272, + "step": 47320 + }, + { + "epoch": 0.844023115613741, + "grad_norm": 0.3202836513519287, + "learning_rate": 3.6140983112704836e-06, + "loss": 0.0863, + "step": 47321 + }, + { + "epoch": 0.8440409517354547, + "grad_norm": 0.24201911687850952, + "learning_rate": 3.613292224331144e-06, + "loss": 0.1144, + "step": 47322 + }, + { + "epoch": 0.8440587878571684, + "grad_norm": 0.31293395161628723, + "learning_rate": 3.6124862202940756e-06, + "loss": 0.1436, + "step": 47323 + }, + { + "epoch": 0.8440766239788821, + "grad_norm": 0.2554352581501007, + "learning_rate": 3.611680299162398e-06, + "loss": 0.0687, + "step": 47324 + }, + { + "epoch": 0.8440944601005957, + "grad_norm": 0.2146880179643631, + "learning_rate": 3.6108744609392343e-06, + "loss": 0.1078, + "step": 47325 + }, + { + "epoch": 0.8441122962223094, + "grad_norm": 0.2741975486278534, + "learning_rate": 3.6100687056277033e-06, + "loss": 0.1299, + "step": 47326 + }, + { + "epoch": 0.8441301323440231, + "grad_norm": 0.3124517500400543, + "learning_rate": 3.6092630332309392e-06, + "loss": 0.1704, + "step": 47327 + }, + { + "epoch": 0.8441479684657368, + "grad_norm": 0.22870393097400665, + "learning_rate": 3.6084574437520557e-06, + "loss": 0.0664, + "step": 47328 + }, + { + "epoch": 0.8441658045874505, + "grad_norm": 0.3728640079498291, + "learning_rate": 3.6076519371941842e-06, + "loss": 0.1805, + "step": 47329 + }, + { + "epoch": 0.8441836407091642, + "grad_norm": 0.3032281696796417, + "learning_rate": 3.6068465135604355e-06, + "loss": 0.0802, + "step": 47330 + }, + { + "epoch": 0.8442014768308779, + "grad_norm": 0.3359457552433014, + "learning_rate": 3.6060411728539466e-06, + "loss": 0.1278, + "step": 47331 + }, + { + "epoch": 0.8442193129525916, + "grad_norm": 0.3064267039299011, + "learning_rate": 3.6052359150778285e-06, + "loss": 0.1469, + "step": 47332 + }, + { + "epoch": 0.8442371490743052, + "grad_norm": 0.23197786509990692, + "learning_rate": 3.6044307402352095e-06, + "loss": 0.0958, + "step": 47333 + }, + { + "epoch": 0.8442549851960189, + "grad_norm": 0.2805977165699005, + "learning_rate": 3.6036256483292036e-06, + "loss": 0.0858, + "step": 47334 + }, + { + "epoch": 0.8442728213177326, + "grad_norm": 0.29823020100593567, + "learning_rate": 3.6028206393629304e-06, + "loss": 0.1231, + "step": 47335 + }, + { + "epoch": 0.8442906574394463, + "grad_norm": 0.22281208634376526, + "learning_rate": 3.6020157133395185e-06, + "loss": 0.1185, + "step": 47336 + }, + { + "epoch": 0.8443084935611601, + "grad_norm": 0.2655372619628906, + "learning_rate": 3.6012108702620866e-06, + "loss": 0.1016, + "step": 47337 + }, + { + "epoch": 0.8443263296828738, + "grad_norm": 0.2519085705280304, + "learning_rate": 3.6004061101337527e-06, + "loss": 0.109, + "step": 47338 + }, + { + "epoch": 0.8443441658045875, + "grad_norm": 0.24964651465415955, + "learning_rate": 3.599601432957628e-06, + "loss": 0.125, + "step": 47339 + }, + { + "epoch": 0.8443620019263012, + "grad_norm": 0.24692180752754211, + "learning_rate": 3.5987968387368453e-06, + "loss": 0.0633, + "step": 47340 + }, + { + "epoch": 0.8443798380480149, + "grad_norm": 0.2330651432275772, + "learning_rate": 3.597992327474514e-06, + "loss": 0.1086, + "step": 47341 + }, + { + "epoch": 0.8443976741697286, + "grad_norm": 0.24702002108097076, + "learning_rate": 3.5971878991737623e-06, + "loss": 0.1098, + "step": 47342 + }, + { + "epoch": 0.8444155102914422, + "grad_norm": 0.4640333652496338, + "learning_rate": 3.5963835538376956e-06, + "loss": 0.1358, + "step": 47343 + }, + { + "epoch": 0.8444333464131559, + "grad_norm": 0.30912286043167114, + "learning_rate": 3.5955792914694446e-06, + "loss": 0.1519, + "step": 47344 + }, + { + "epoch": 0.8444511825348696, + "grad_norm": 0.28649622201919556, + "learning_rate": 3.5947751120721213e-06, + "loss": 0.1731, + "step": 47345 + }, + { + "epoch": 0.8444690186565833, + "grad_norm": 0.29214009642601013, + "learning_rate": 3.5939710156488423e-06, + "loss": 0.0992, + "step": 47346 + }, + { + "epoch": 0.844486854778297, + "grad_norm": 0.2939797043800354, + "learning_rate": 3.5931670022027243e-06, + "loss": 0.1076, + "step": 47347 + }, + { + "epoch": 0.8445046909000107, + "grad_norm": 0.28265005350112915, + "learning_rate": 3.5923630717368815e-06, + "loss": 0.1495, + "step": 47348 + }, + { + "epoch": 0.8445225270217244, + "grad_norm": 0.21351316571235657, + "learning_rate": 3.591559224254437e-06, + "loss": 0.0875, + "step": 47349 + }, + { + "epoch": 0.844540363143438, + "grad_norm": 0.2238522619009018, + "learning_rate": 3.5907554597585048e-06, + "loss": 0.1065, + "step": 47350 + }, + { + "epoch": 0.8445581992651517, + "grad_norm": 0.21403296291828156, + "learning_rate": 3.5899517782521987e-06, + "loss": 0.0839, + "step": 47351 + }, + { + "epoch": 0.8445760353868654, + "grad_norm": 0.3860260844230652, + "learning_rate": 3.589148179738627e-06, + "loss": 0.1393, + "step": 47352 + }, + { + "epoch": 0.8445938715085791, + "grad_norm": 0.3730800747871399, + "learning_rate": 3.588344664220919e-06, + "loss": 0.1159, + "step": 47353 + }, + { + "epoch": 0.8446117076302929, + "grad_norm": 0.27137458324432373, + "learning_rate": 3.5875412317021824e-06, + "loss": 0.1235, + "step": 47354 + }, + { + "epoch": 0.8446295437520066, + "grad_norm": 0.2496897280216217, + "learning_rate": 3.586737882185526e-06, + "loss": 0.0575, + "step": 47355 + }, + { + "epoch": 0.8446473798737203, + "grad_norm": 0.29221904277801514, + "learning_rate": 3.5859346156740694e-06, + "loss": 0.1031, + "step": 47356 + }, + { + "epoch": 0.844665215995434, + "grad_norm": 0.33788564801216125, + "learning_rate": 3.5851314321709294e-06, + "loss": 0.1222, + "step": 47357 + }, + { + "epoch": 0.8446830521171477, + "grad_norm": 0.14239247143268585, + "learning_rate": 3.584328331679221e-06, + "loss": 0.0396, + "step": 47358 + }, + { + "epoch": 0.8447008882388614, + "grad_norm": 0.20731514692306519, + "learning_rate": 3.5835253142020493e-06, + "loss": 0.137, + "step": 47359 + }, + { + "epoch": 0.844718724360575, + "grad_norm": 0.2822842597961426, + "learning_rate": 3.5827223797425315e-06, + "loss": 0.0812, + "step": 47360 + }, + { + "epoch": 0.8447365604822887, + "grad_norm": 0.3031470775604248, + "learning_rate": 3.581919528303773e-06, + "loss": 0.1061, + "step": 47361 + }, + { + "epoch": 0.8447543966040024, + "grad_norm": 0.2415207028388977, + "learning_rate": 3.5811167598889e-06, + "loss": 0.1291, + "step": 47362 + }, + { + "epoch": 0.8447722327257161, + "grad_norm": 0.28498753905296326, + "learning_rate": 3.5803140745010148e-06, + "loss": 0.0835, + "step": 47363 + }, + { + "epoch": 0.8447900688474298, + "grad_norm": 0.28839969635009766, + "learning_rate": 3.5795114721432293e-06, + "loss": 0.0994, + "step": 47364 + }, + { + "epoch": 0.8448079049691435, + "grad_norm": 0.26176631450653076, + "learning_rate": 3.578708952818652e-06, + "loss": 0.1081, + "step": 47365 + }, + { + "epoch": 0.8448257410908572, + "grad_norm": 0.2957668900489807, + "learning_rate": 3.5779065165304023e-06, + "loss": 0.1612, + "step": 47366 + }, + { + "epoch": 0.8448435772125709, + "grad_norm": 0.2040109932422638, + "learning_rate": 3.577104163281586e-06, + "loss": 0.1232, + "step": 47367 + }, + { + "epoch": 0.8448614133342846, + "grad_norm": 0.2923802137374878, + "learning_rate": 3.5763018930753067e-06, + "loss": 0.107, + "step": 47368 + }, + { + "epoch": 0.8448792494559982, + "grad_norm": 0.23593628406524658, + "learning_rate": 3.5754997059146867e-06, + "loss": 0.1288, + "step": 47369 + }, + { + "epoch": 0.844897085577712, + "grad_norm": 0.2378631979227066, + "learning_rate": 3.5746976018028233e-06, + "loss": 0.0929, + "step": 47370 + }, + { + "epoch": 0.8449149216994257, + "grad_norm": 0.27903860807418823, + "learning_rate": 3.5738955807428393e-06, + "loss": 0.1091, + "step": 47371 + }, + { + "epoch": 0.8449327578211394, + "grad_norm": 0.26834574341773987, + "learning_rate": 3.573093642737835e-06, + "loss": 0.0719, + "step": 47372 + }, + { + "epoch": 0.8449505939428531, + "grad_norm": 0.3107265830039978, + "learning_rate": 3.5722917877909213e-06, + "loss": 0.1485, + "step": 47373 + }, + { + "epoch": 0.8449684300645668, + "grad_norm": 0.24711358547210693, + "learning_rate": 3.571490015905199e-06, + "loss": 0.1127, + "step": 47374 + }, + { + "epoch": 0.8449862661862805, + "grad_norm": 0.28482112288475037, + "learning_rate": 3.5706883270837903e-06, + "loss": 0.118, + "step": 47375 + }, + { + "epoch": 0.8450041023079942, + "grad_norm": 0.4752761125564575, + "learning_rate": 3.569886721329793e-06, + "loss": 0.0893, + "step": 47376 + }, + { + "epoch": 0.8450219384297079, + "grad_norm": 0.27395132184028625, + "learning_rate": 3.569085198646316e-06, + "loss": 0.1118, + "step": 47377 + }, + { + "epoch": 0.8450397745514215, + "grad_norm": 0.3193538784980774, + "learning_rate": 3.568283759036464e-06, + "loss": 0.1463, + "step": 47378 + }, + { + "epoch": 0.8450576106731352, + "grad_norm": 0.3200143575668335, + "learning_rate": 3.56748240250335e-06, + "loss": 0.1255, + "step": 47379 + }, + { + "epoch": 0.8450754467948489, + "grad_norm": 0.23412470519542694, + "learning_rate": 3.566681129050076e-06, + "loss": 0.1276, + "step": 47380 + }, + { + "epoch": 0.8450932829165626, + "grad_norm": 0.25210949778556824, + "learning_rate": 3.56587993867975e-06, + "loss": 0.0611, + "step": 47381 + }, + { + "epoch": 0.8451111190382763, + "grad_norm": 0.31062883138656616, + "learning_rate": 3.5650788313954716e-06, + "loss": 0.1343, + "step": 47382 + }, + { + "epoch": 0.84512895515999, + "grad_norm": 0.20535065233707428, + "learning_rate": 3.5642778072003504e-06, + "loss": 0.1134, + "step": 47383 + }, + { + "epoch": 0.8451467912817037, + "grad_norm": 0.19917286932468414, + "learning_rate": 3.5634768660974956e-06, + "loss": 0.061, + "step": 47384 + }, + { + "epoch": 0.8451646274034174, + "grad_norm": 0.2603689730167389, + "learning_rate": 3.56267600809001e-06, + "loss": 0.0836, + "step": 47385 + }, + { + "epoch": 0.845182463525131, + "grad_norm": 0.26594775915145874, + "learning_rate": 3.5618752331809974e-06, + "loss": 0.0993, + "step": 47386 + }, + { + "epoch": 0.8452002996468448, + "grad_norm": 0.3551575243473053, + "learning_rate": 3.5610745413735546e-06, + "loss": 0.1809, + "step": 47387 + }, + { + "epoch": 0.8452181357685585, + "grad_norm": 0.2946797013282776, + "learning_rate": 3.5602739326707962e-06, + "loss": 0.1626, + "step": 47388 + }, + { + "epoch": 0.8452359718902722, + "grad_norm": 0.30070677399635315, + "learning_rate": 3.5594734070758223e-06, + "loss": 0.0839, + "step": 47389 + }, + { + "epoch": 0.8452538080119859, + "grad_norm": 0.25644591450691223, + "learning_rate": 3.558672964591736e-06, + "loss": 0.1137, + "step": 47390 + }, + { + "epoch": 0.8452716441336996, + "grad_norm": 0.43660518527030945, + "learning_rate": 3.557872605221632e-06, + "loss": 0.1705, + "step": 47391 + }, + { + "epoch": 0.8452894802554133, + "grad_norm": 0.1884179711341858, + "learning_rate": 3.5570723289686247e-06, + "loss": 0.1029, + "step": 47392 + }, + { + "epoch": 0.845307316377127, + "grad_norm": 0.3108867108821869, + "learning_rate": 3.5562721358358114e-06, + "loss": 0.1132, + "step": 47393 + }, + { + "epoch": 0.8453251524988407, + "grad_norm": 0.30239301919937134, + "learning_rate": 3.555472025826295e-06, + "loss": 0.1641, + "step": 47394 + }, + { + "epoch": 0.8453429886205543, + "grad_norm": 0.33123305439949036, + "learning_rate": 3.554671998943168e-06, + "loss": 0.0976, + "step": 47395 + }, + { + "epoch": 0.845360824742268, + "grad_norm": 0.28957194089889526, + "learning_rate": 3.5538720551895473e-06, + "loss": 0.0864, + "step": 47396 + }, + { + "epoch": 0.8453786608639817, + "grad_norm": 0.22637823224067688, + "learning_rate": 3.5530721945685184e-06, + "loss": 0.1022, + "step": 47397 + }, + { + "epoch": 0.8453964969856954, + "grad_norm": 0.22780421376228333, + "learning_rate": 3.552272417083194e-06, + "loss": 0.0693, + "step": 47398 + }, + { + "epoch": 0.8454143331074091, + "grad_norm": 0.3114381432533264, + "learning_rate": 3.551472722736665e-06, + "loss": 0.1105, + "step": 47399 + }, + { + "epoch": 0.8454321692291228, + "grad_norm": 0.2825041711330414, + "learning_rate": 3.5506731115320406e-06, + "loss": 0.1362, + "step": 47400 + }, + { + "epoch": 0.8454500053508365, + "grad_norm": 0.24676674604415894, + "learning_rate": 3.5498735834724183e-06, + "loss": 0.0988, + "step": 47401 + }, + { + "epoch": 0.8454678414725502, + "grad_norm": 0.2726946175098419, + "learning_rate": 3.549074138560893e-06, + "loss": 0.107, + "step": 47402 + }, + { + "epoch": 0.8454856775942639, + "grad_norm": 0.4542251527309418, + "learning_rate": 3.5482747768005643e-06, + "loss": 0.1515, + "step": 47403 + }, + { + "epoch": 0.8455035137159777, + "grad_norm": 0.31879186630249023, + "learning_rate": 3.5474754981945272e-06, + "loss": 0.0973, + "step": 47404 + }, + { + "epoch": 0.8455213498376913, + "grad_norm": 0.35587841272354126, + "learning_rate": 3.5466763027458884e-06, + "loss": 0.1494, + "step": 47405 + }, + { + "epoch": 0.845539185959405, + "grad_norm": 0.2953929901123047, + "learning_rate": 3.545877190457744e-06, + "loss": 0.1112, + "step": 47406 + }, + { + "epoch": 0.8455570220811187, + "grad_norm": 0.3713397979736328, + "learning_rate": 3.54507816133319e-06, + "loss": 0.1583, + "step": 47407 + }, + { + "epoch": 0.8455748582028324, + "grad_norm": 0.2765306532382965, + "learning_rate": 3.544279215375315e-06, + "loss": 0.1225, + "step": 47408 + }, + { + "epoch": 0.8455926943245461, + "grad_norm": 0.26851508021354675, + "learning_rate": 3.5434803525872335e-06, + "loss": 0.0814, + "step": 47409 + }, + { + "epoch": 0.8456105304462598, + "grad_norm": 0.20504872500896454, + "learning_rate": 3.542681572972026e-06, + "loss": 0.0594, + "step": 47410 + }, + { + "epoch": 0.8456283665679735, + "grad_norm": 0.3291880190372467, + "learning_rate": 3.541882876532801e-06, + "loss": 0.0756, + "step": 47411 + }, + { + "epoch": 0.8456462026896872, + "grad_norm": 0.19813303649425507, + "learning_rate": 3.5410842632726426e-06, + "loss": 0.0903, + "step": 47412 + }, + { + "epoch": 0.8456640388114008, + "grad_norm": 0.29659345746040344, + "learning_rate": 3.540285733194662e-06, + "loss": 0.1175, + "step": 47413 + }, + { + "epoch": 0.8456818749331145, + "grad_norm": 0.25105470418930054, + "learning_rate": 3.539487286301943e-06, + "loss": 0.1092, + "step": 47414 + }, + { + "epoch": 0.8456997110548282, + "grad_norm": 0.308626651763916, + "learning_rate": 3.5386889225975855e-06, + "loss": 0.1271, + "step": 47415 + }, + { + "epoch": 0.8457175471765419, + "grad_norm": 0.19902539253234863, + "learning_rate": 3.537890642084679e-06, + "loss": 0.0701, + "step": 47416 + }, + { + "epoch": 0.8457353832982556, + "grad_norm": 0.2225547879934311, + "learning_rate": 3.5370924447663184e-06, + "loss": 0.1215, + "step": 47417 + }, + { + "epoch": 0.8457532194199693, + "grad_norm": 0.2490280419588089, + "learning_rate": 3.5362943306456063e-06, + "loss": 0.1084, + "step": 47418 + }, + { + "epoch": 0.845771055541683, + "grad_norm": 0.35284045338630676, + "learning_rate": 3.5354962997256295e-06, + "loss": 0.0924, + "step": 47419 + }, + { + "epoch": 0.8457888916633967, + "grad_norm": 0.2676253914833069, + "learning_rate": 3.5346983520094827e-06, + "loss": 0.0972, + "step": 47420 + }, + { + "epoch": 0.8458067277851105, + "grad_norm": 0.2826889753341675, + "learning_rate": 3.533900487500255e-06, + "loss": 0.1009, + "step": 47421 + }, + { + "epoch": 0.8458245639068241, + "grad_norm": 0.3616482615470886, + "learning_rate": 3.533102706201047e-06, + "loss": 0.1495, + "step": 47422 + }, + { + "epoch": 0.8458424000285378, + "grad_norm": 0.3494814932346344, + "learning_rate": 3.5323050081149475e-06, + "loss": 0.1563, + "step": 47423 + }, + { + "epoch": 0.8458602361502515, + "grad_norm": 0.26231318712234497, + "learning_rate": 3.531507393245043e-06, + "loss": 0.0961, + "step": 47424 + }, + { + "epoch": 0.8458780722719652, + "grad_norm": 0.28493982553482056, + "learning_rate": 3.5307098615944363e-06, + "loss": 0.0861, + "step": 47425 + }, + { + "epoch": 0.8458959083936789, + "grad_norm": 0.2542304992675781, + "learning_rate": 3.5299124131662085e-06, + "loss": 0.1067, + "step": 47426 + }, + { + "epoch": 0.8459137445153926, + "grad_norm": 0.2479018270969391, + "learning_rate": 3.5291150479634605e-06, + "loss": 0.0851, + "step": 47427 + }, + { + "epoch": 0.8459315806371063, + "grad_norm": 0.23945067822933197, + "learning_rate": 3.528317765989278e-06, + "loss": 0.124, + "step": 47428 + }, + { + "epoch": 0.84594941675882, + "grad_norm": 0.2718231976032257, + "learning_rate": 3.5275205672467533e-06, + "loss": 0.112, + "step": 47429 + }, + { + "epoch": 0.8459672528805336, + "grad_norm": 0.2712881565093994, + "learning_rate": 3.52672345173897e-06, + "loss": 0.1395, + "step": 47430 + }, + { + "epoch": 0.8459850890022473, + "grad_norm": 0.26649418473243713, + "learning_rate": 3.5259264194690278e-06, + "loss": 0.1096, + "step": 47431 + }, + { + "epoch": 0.846002925123961, + "grad_norm": 0.2700745761394501, + "learning_rate": 3.525129470440011e-06, + "loss": 0.0961, + "step": 47432 + }, + { + "epoch": 0.8460207612456747, + "grad_norm": 0.3305078446865082, + "learning_rate": 3.524332604655012e-06, + "loss": 0.1, + "step": 47433 + }, + { + "epoch": 0.8460385973673884, + "grad_norm": 0.24270285665988922, + "learning_rate": 3.5235358221171078e-06, + "loss": 0.1, + "step": 47434 + }, + { + "epoch": 0.8460564334891021, + "grad_norm": 0.383864164352417, + "learning_rate": 3.522739122829405e-06, + "loss": 0.1013, + "step": 47435 + }, + { + "epoch": 0.8460742696108158, + "grad_norm": 0.21216845512390137, + "learning_rate": 3.521942506794984e-06, + "loss": 0.1128, + "step": 47436 + }, + { + "epoch": 0.8460921057325295, + "grad_norm": 0.23016326129436493, + "learning_rate": 3.5211459740169235e-06, + "loss": 0.0759, + "step": 47437 + }, + { + "epoch": 0.8461099418542433, + "grad_norm": 0.3105771243572235, + "learning_rate": 3.5203495244983286e-06, + "loss": 0.0865, + "step": 47438 + }, + { + "epoch": 0.846127777975957, + "grad_norm": 0.26920071244239807, + "learning_rate": 3.51955315824227e-06, + "loss": 0.1212, + "step": 47439 + }, + { + "epoch": 0.8461456140976706, + "grad_norm": 0.21645879745483398, + "learning_rate": 3.5187568752518496e-06, + "loss": 0.092, + "step": 47440 + }, + { + "epoch": 0.8461634502193843, + "grad_norm": 0.24848566949367523, + "learning_rate": 3.5179606755301463e-06, + "loss": 0.0917, + "step": 47441 + }, + { + "epoch": 0.846181286341098, + "grad_norm": 0.2764410674571991, + "learning_rate": 3.5171645590802465e-06, + "loss": 0.0942, + "step": 47442 + }, + { + "epoch": 0.8461991224628117, + "grad_norm": 0.28439468145370483, + "learning_rate": 3.516368525905231e-06, + "loss": 0.1564, + "step": 47443 + }, + { + "epoch": 0.8462169585845254, + "grad_norm": 0.348168283700943, + "learning_rate": 3.515572576008197e-06, + "loss": 0.0812, + "step": 47444 + }, + { + "epoch": 0.8462347947062391, + "grad_norm": 0.2516811490058899, + "learning_rate": 3.5147767093922257e-06, + "loss": 0.1298, + "step": 47445 + }, + { + "epoch": 0.8462526308279528, + "grad_norm": 0.3369740843772888, + "learning_rate": 3.513980926060398e-06, + "loss": 0.1045, + "step": 47446 + }, + { + "epoch": 0.8462704669496665, + "grad_norm": 0.2666391432285309, + "learning_rate": 3.5131852260157972e-06, + "loss": 0.1383, + "step": 47447 + }, + { + "epoch": 0.8462883030713801, + "grad_norm": 0.2141275256872177, + "learning_rate": 3.5123896092615184e-06, + "loss": 0.0949, + "step": 47448 + }, + { + "epoch": 0.8463061391930938, + "grad_norm": 0.26325419545173645, + "learning_rate": 3.5115940758006394e-06, + "loss": 0.1133, + "step": 47449 + }, + { + "epoch": 0.8463239753148075, + "grad_norm": 0.23058579862117767, + "learning_rate": 3.5107986256362415e-06, + "loss": 0.1165, + "step": 47450 + }, + { + "epoch": 0.8463418114365212, + "grad_norm": 0.2442142814397812, + "learning_rate": 3.5100032587714048e-06, + "loss": 0.1293, + "step": 47451 + }, + { + "epoch": 0.8463596475582349, + "grad_norm": 0.24232317507266998, + "learning_rate": 3.509207975209225e-06, + "loss": 0.0895, + "step": 47452 + }, + { + "epoch": 0.8463774836799486, + "grad_norm": 0.29069605469703674, + "learning_rate": 3.508412774952771e-06, + "loss": 0.1178, + "step": 47453 + }, + { + "epoch": 0.8463953198016623, + "grad_norm": 0.2489462047815323, + "learning_rate": 3.5076176580051385e-06, + "loss": 0.1146, + "step": 47454 + }, + { + "epoch": 0.8464131559233761, + "grad_norm": 0.27243614196777344, + "learning_rate": 3.506822624369402e-06, + "loss": 0.1372, + "step": 47455 + }, + { + "epoch": 0.8464309920450898, + "grad_norm": 0.2851302921772003, + "learning_rate": 3.50602767404864e-06, + "loss": 0.1194, + "step": 47456 + }, + { + "epoch": 0.8464488281668034, + "grad_norm": 0.40311288833618164, + "learning_rate": 3.505232807045944e-06, + "loss": 0.1388, + "step": 47457 + }, + { + "epoch": 0.8464666642885171, + "grad_norm": 0.49015572667121887, + "learning_rate": 3.504438023364387e-06, + "loss": 0.1762, + "step": 47458 + }, + { + "epoch": 0.8464845004102308, + "grad_norm": 0.31908130645751953, + "learning_rate": 3.5036433230070553e-06, + "loss": 0.0949, + "step": 47459 + }, + { + "epoch": 0.8465023365319445, + "grad_norm": 0.29065850377082825, + "learning_rate": 3.5028487059770217e-06, + "loss": 0.0957, + "step": 47460 + }, + { + "epoch": 0.8465201726536582, + "grad_norm": 0.27658164501190186, + "learning_rate": 3.5020541722773746e-06, + "loss": 0.0841, + "step": 47461 + }, + { + "epoch": 0.8465380087753719, + "grad_norm": 0.30853772163391113, + "learning_rate": 3.5012597219111933e-06, + "loss": 0.1396, + "step": 47462 + }, + { + "epoch": 0.8465558448970856, + "grad_norm": 0.29472896456718445, + "learning_rate": 3.5004653548815547e-06, + "loss": 0.1255, + "step": 47463 + }, + { + "epoch": 0.8465736810187993, + "grad_norm": 0.2253725826740265, + "learning_rate": 3.4996710711915293e-06, + "loss": 0.1204, + "step": 47464 + }, + { + "epoch": 0.846591517140513, + "grad_norm": 0.24394692480564117, + "learning_rate": 3.4988768708442143e-06, + "loss": 0.0891, + "step": 47465 + }, + { + "epoch": 0.8466093532622266, + "grad_norm": 0.2033083587884903, + "learning_rate": 3.4980827538426686e-06, + "loss": 0.1318, + "step": 47466 + }, + { + "epoch": 0.8466271893839403, + "grad_norm": 0.25815555453300476, + "learning_rate": 3.4972887201899896e-06, + "loss": 0.075, + "step": 47467 + }, + { + "epoch": 0.846645025505654, + "grad_norm": 0.2514638304710388, + "learning_rate": 3.496494769889247e-06, + "loss": 0.0929, + "step": 47468 + }, + { + "epoch": 0.8466628616273677, + "grad_norm": 0.4418571889400482, + "learning_rate": 3.4957009029435108e-06, + "loss": 0.0933, + "step": 47469 + }, + { + "epoch": 0.8466806977490814, + "grad_norm": 0.2199210673570633, + "learning_rate": 3.4949071193558698e-06, + "loss": 0.1008, + "step": 47470 + }, + { + "epoch": 0.8466985338707952, + "grad_norm": 0.33602094650268555, + "learning_rate": 3.4941134191293968e-06, + "loss": 0.1686, + "step": 47471 + }, + { + "epoch": 0.8467163699925089, + "grad_norm": 0.22433800995349884, + "learning_rate": 3.49331980226717e-06, + "loss": 0.0765, + "step": 47472 + }, + { + "epoch": 0.8467342061142226, + "grad_norm": 0.2530791759490967, + "learning_rate": 3.492526268772256e-06, + "loss": 0.0936, + "step": 47473 + }, + { + "epoch": 0.8467520422359363, + "grad_norm": 0.3687301576137543, + "learning_rate": 3.491732818647747e-06, + "loss": 0.1595, + "step": 47474 + }, + { + "epoch": 0.8467698783576499, + "grad_norm": 0.23837460577487946, + "learning_rate": 3.4909394518967076e-06, + "loss": 0.1027, + "step": 47475 + }, + { + "epoch": 0.8467877144793636, + "grad_norm": 0.2569620609283447, + "learning_rate": 3.4901461685222183e-06, + "loss": 0.1236, + "step": 47476 + }, + { + "epoch": 0.8468055506010773, + "grad_norm": 0.34468820691108704, + "learning_rate": 3.489352968527343e-06, + "loss": 0.1015, + "step": 47477 + }, + { + "epoch": 0.846823386722791, + "grad_norm": 0.202947735786438, + "learning_rate": 3.4885598519151713e-06, + "loss": 0.0873, + "step": 47478 + }, + { + "epoch": 0.8468412228445047, + "grad_norm": 0.3023819029331207, + "learning_rate": 3.487766818688773e-06, + "loss": 0.0683, + "step": 47479 + }, + { + "epoch": 0.8468590589662184, + "grad_norm": 0.2978323698043823, + "learning_rate": 3.486973868851215e-06, + "loss": 0.1133, + "step": 47480 + }, + { + "epoch": 0.8468768950879321, + "grad_norm": 0.19978269934654236, + "learning_rate": 3.4861810024055806e-06, + "loss": 0.1152, + "step": 47481 + }, + { + "epoch": 0.8468947312096458, + "grad_norm": 0.2727530598640442, + "learning_rate": 3.4853882193549346e-06, + "loss": 0.113, + "step": 47482 + }, + { + "epoch": 0.8469125673313594, + "grad_norm": 0.22486057877540588, + "learning_rate": 3.4845955197023606e-06, + "loss": 0.0565, + "step": 47483 + }, + { + "epoch": 0.8469304034530731, + "grad_norm": 0.2608107030391693, + "learning_rate": 3.4838029034509278e-06, + "loss": 0.1, + "step": 47484 + }, + { + "epoch": 0.8469482395747868, + "grad_norm": 0.24796272814273834, + "learning_rate": 3.4830103706037038e-06, + "loss": 0.1201, + "step": 47485 + }, + { + "epoch": 0.8469660756965005, + "grad_norm": 0.22716866433620453, + "learning_rate": 3.4822179211637577e-06, + "loss": 0.1335, + "step": 47486 + }, + { + "epoch": 0.8469839118182142, + "grad_norm": 0.2889477610588074, + "learning_rate": 3.481425555134171e-06, + "loss": 0.145, + "step": 47487 + }, + { + "epoch": 0.847001747939928, + "grad_norm": 0.3446348309516907, + "learning_rate": 3.4806332725180133e-06, + "loss": 0.1498, + "step": 47488 + }, + { + "epoch": 0.8470195840616417, + "grad_norm": 0.4340376555919647, + "learning_rate": 3.4798410733183547e-06, + "loss": 0.1223, + "step": 47489 + }, + { + "epoch": 0.8470374201833554, + "grad_norm": 0.28792908787727356, + "learning_rate": 3.4790489575382586e-06, + "loss": 0.0703, + "step": 47490 + }, + { + "epoch": 0.8470552563050691, + "grad_norm": 0.27349692583084106, + "learning_rate": 3.478256925180806e-06, + "loss": 0.1537, + "step": 47491 + }, + { + "epoch": 0.8470730924267827, + "grad_norm": 0.2780435085296631, + "learning_rate": 3.4774649762490617e-06, + "loss": 0.0818, + "step": 47492 + }, + { + "epoch": 0.8470909285484964, + "grad_norm": 0.2728854715824127, + "learning_rate": 3.476673110746095e-06, + "loss": 0.1346, + "step": 47493 + }, + { + "epoch": 0.8471087646702101, + "grad_norm": 0.2203640341758728, + "learning_rate": 3.475881328674982e-06, + "loss": 0.1143, + "step": 47494 + }, + { + "epoch": 0.8471266007919238, + "grad_norm": 0.24378696084022522, + "learning_rate": 3.4750896300387804e-06, + "loss": 0.0936, + "step": 47495 + }, + { + "epoch": 0.8471444369136375, + "grad_norm": 0.4511106610298157, + "learning_rate": 3.474298014840571e-06, + "loss": 0.1096, + "step": 47496 + }, + { + "epoch": 0.8471622730353512, + "grad_norm": 0.23231393098831177, + "learning_rate": 3.4735064830834164e-06, + "loss": 0.1398, + "step": 47497 + }, + { + "epoch": 0.8471801091570649, + "grad_norm": 0.3215214014053345, + "learning_rate": 3.472715034770388e-06, + "loss": 0.0876, + "step": 47498 + }, + { + "epoch": 0.8471979452787786, + "grad_norm": 0.17329655587673187, + "learning_rate": 3.4719236699045447e-06, + "loss": 0.0997, + "step": 47499 + }, + { + "epoch": 0.8472157814004923, + "grad_norm": 0.2522377371788025, + "learning_rate": 3.471132388488968e-06, + "loss": 0.0822, + "step": 47500 + }, + { + "epoch": 0.8472336175222059, + "grad_norm": 0.21335873007774353, + "learning_rate": 3.4703411905267158e-06, + "loss": 0.1188, + "step": 47501 + }, + { + "epoch": 0.8472514536439196, + "grad_norm": 0.22173181176185608, + "learning_rate": 3.469550076020858e-06, + "loss": 0.1183, + "step": 47502 + }, + { + "epoch": 0.8472692897656333, + "grad_norm": 0.2797526717185974, + "learning_rate": 3.468759044974454e-06, + "loss": 0.1624, + "step": 47503 + }, + { + "epoch": 0.847287125887347, + "grad_norm": 0.22542330622673035, + "learning_rate": 3.4679680973905837e-06, + "loss": 0.0925, + "step": 47504 + }, + { + "epoch": 0.8473049620090608, + "grad_norm": 0.23701652884483337, + "learning_rate": 3.467177233272306e-06, + "loss": 0.1184, + "step": 47505 + }, + { + "epoch": 0.8473227981307745, + "grad_norm": 0.5262187719345093, + "learning_rate": 3.4663864526226856e-06, + "loss": 0.1524, + "step": 47506 + }, + { + "epoch": 0.8473406342524882, + "grad_norm": 0.29847222566604614, + "learning_rate": 3.4655957554447836e-06, + "loss": 0.1051, + "step": 47507 + }, + { + "epoch": 0.8473584703742019, + "grad_norm": 0.2641250789165497, + "learning_rate": 3.4648051417416754e-06, + "loss": 0.1126, + "step": 47508 + }, + { + "epoch": 0.8473763064959156, + "grad_norm": 0.27309516072273254, + "learning_rate": 3.464014611516417e-06, + "loss": 0.1278, + "step": 47509 + }, + { + "epoch": 0.8473941426176292, + "grad_norm": 0.24848908185958862, + "learning_rate": 3.4632241647720774e-06, + "loss": 0.092, + "step": 47510 + }, + { + "epoch": 0.8474119787393429, + "grad_norm": 0.2899356782436371, + "learning_rate": 3.4624338015117246e-06, + "loss": 0.0799, + "step": 47511 + }, + { + "epoch": 0.8474298148610566, + "grad_norm": 0.2715561091899872, + "learning_rate": 3.4616435217384084e-06, + "loss": 0.1176, + "step": 47512 + }, + { + "epoch": 0.8474476509827703, + "grad_norm": 0.33382654190063477, + "learning_rate": 3.460853325455207e-06, + "loss": 0.0816, + "step": 47513 + }, + { + "epoch": 0.847465487104484, + "grad_norm": 0.3166084587574005, + "learning_rate": 3.4600632126651793e-06, + "loss": 0.1558, + "step": 47514 + }, + { + "epoch": 0.8474833232261977, + "grad_norm": 0.2971554100513458, + "learning_rate": 3.4592731833713836e-06, + "loss": 0.1521, + "step": 47515 + }, + { + "epoch": 0.8475011593479114, + "grad_norm": 0.20303970575332642, + "learning_rate": 3.4584832375768816e-06, + "loss": 0.0728, + "step": 47516 + }, + { + "epoch": 0.8475189954696251, + "grad_norm": 0.2333047240972519, + "learning_rate": 3.457693375284743e-06, + "loss": 0.0881, + "step": 47517 + }, + { + "epoch": 0.8475368315913387, + "grad_norm": 0.30360716581344604, + "learning_rate": 3.4569035964980264e-06, + "loss": 0.1277, + "step": 47518 + }, + { + "epoch": 0.8475546677130524, + "grad_norm": 0.322036474943161, + "learning_rate": 3.45611390121979e-06, + "loss": 0.1308, + "step": 47519 + }, + { + "epoch": 0.8475725038347661, + "grad_norm": 0.31250110268592834, + "learning_rate": 3.4553242894530937e-06, + "loss": 0.1826, + "step": 47520 + }, + { + "epoch": 0.8475903399564798, + "grad_norm": 0.2294568568468094, + "learning_rate": 3.454534761201006e-06, + "loss": 0.1206, + "step": 47521 + }, + { + "epoch": 0.8476081760781936, + "grad_norm": 0.3135239779949188, + "learning_rate": 3.4537453164665806e-06, + "loss": 0.1583, + "step": 47522 + }, + { + "epoch": 0.8476260121999073, + "grad_norm": 0.2727711498737335, + "learning_rate": 3.452955955252882e-06, + "loss": 0.1329, + "step": 47523 + }, + { + "epoch": 0.847643848321621, + "grad_norm": 0.2505616545677185, + "learning_rate": 3.4521666775629706e-06, + "loss": 0.1155, + "step": 47524 + }, + { + "epoch": 0.8476616844433347, + "grad_norm": 0.21993312239646912, + "learning_rate": 3.451377483399895e-06, + "loss": 0.0611, + "step": 47525 + }, + { + "epoch": 0.8476795205650484, + "grad_norm": 0.23521654307842255, + "learning_rate": 3.450588372766733e-06, + "loss": 0.0856, + "step": 47526 + }, + { + "epoch": 0.847697356686762, + "grad_norm": 0.2705284655094147, + "learning_rate": 3.4497993456665294e-06, + "loss": 0.1344, + "step": 47527 + }, + { + "epoch": 0.8477151928084757, + "grad_norm": 0.26749953627586365, + "learning_rate": 3.4490104021023484e-06, + "loss": 0.1487, + "step": 47528 + }, + { + "epoch": 0.8477330289301894, + "grad_norm": 0.2799796760082245, + "learning_rate": 3.44822154207724e-06, + "loss": 0.1504, + "step": 47529 + }, + { + "epoch": 0.8477508650519031, + "grad_norm": 0.24230726063251495, + "learning_rate": 3.4474327655942774e-06, + "loss": 0.0937, + "step": 47530 + }, + { + "epoch": 0.8477687011736168, + "grad_norm": 0.2881892919540405, + "learning_rate": 3.4466440726565077e-06, + "loss": 0.1073, + "step": 47531 + }, + { + "epoch": 0.8477865372953305, + "grad_norm": 0.25630617141723633, + "learning_rate": 3.4458554632669893e-06, + "loss": 0.092, + "step": 47532 + }, + { + "epoch": 0.8478043734170442, + "grad_norm": 0.22776642441749573, + "learning_rate": 3.4450669374287787e-06, + "loss": 0.0936, + "step": 47533 + }, + { + "epoch": 0.8478222095387579, + "grad_norm": 0.25008949637413025, + "learning_rate": 3.4442784951449284e-06, + "loss": 0.1394, + "step": 47534 + }, + { + "epoch": 0.8478400456604716, + "grad_norm": 0.34504738450050354, + "learning_rate": 3.4434901364185062e-06, + "loss": 0.128, + "step": 47535 + }, + { + "epoch": 0.8478578817821852, + "grad_norm": 0.2894987463951111, + "learning_rate": 3.4427018612525564e-06, + "loss": 0.1002, + "step": 47536 + }, + { + "epoch": 0.8478757179038989, + "grad_norm": 0.29916563630104065, + "learning_rate": 3.441913669650143e-06, + "loss": 0.0618, + "step": 47537 + }, + { + "epoch": 0.8478935540256126, + "grad_norm": 0.30666837096214294, + "learning_rate": 3.441125561614314e-06, + "loss": 0.1237, + "step": 47538 + }, + { + "epoch": 0.8479113901473264, + "grad_norm": 0.3498028516769409, + "learning_rate": 3.440337537148136e-06, + "loss": 0.1099, + "step": 47539 + }, + { + "epoch": 0.8479292262690401, + "grad_norm": 0.3540576100349426, + "learning_rate": 3.4395495962546543e-06, + "loss": 0.1443, + "step": 47540 + }, + { + "epoch": 0.8479470623907538, + "grad_norm": 0.2628134787082672, + "learning_rate": 3.4387617389369242e-06, + "loss": 0.0942, + "step": 47541 + }, + { + "epoch": 0.8479648985124675, + "grad_norm": 0.5075153708457947, + "learning_rate": 3.4379739651979963e-06, + "loss": 0.1307, + "step": 47542 + }, + { + "epoch": 0.8479827346341812, + "grad_norm": 0.24735112488269806, + "learning_rate": 3.437186275040935e-06, + "loss": 0.1075, + "step": 47543 + }, + { + "epoch": 0.8480005707558949, + "grad_norm": 0.34918805956840515, + "learning_rate": 3.436398668468785e-06, + "loss": 0.1336, + "step": 47544 + }, + { + "epoch": 0.8480184068776085, + "grad_norm": 0.23646296560764313, + "learning_rate": 3.4356111454846048e-06, + "loss": 0.1388, + "step": 47545 + }, + { + "epoch": 0.8480362429993222, + "grad_norm": 0.27730274200439453, + "learning_rate": 3.434823706091442e-06, + "loss": 0.1163, + "step": 47546 + }, + { + "epoch": 0.8480540791210359, + "grad_norm": 0.25620508193969727, + "learning_rate": 3.434036350292344e-06, + "loss": 0.0942, + "step": 47547 + }, + { + "epoch": 0.8480719152427496, + "grad_norm": 0.295282244682312, + "learning_rate": 3.4332490780903754e-06, + "loss": 0.1269, + "step": 47548 + }, + { + "epoch": 0.8480897513644633, + "grad_norm": 0.32848411798477173, + "learning_rate": 3.432461889488578e-06, + "loss": 0.1531, + "step": 47549 + }, + { + "epoch": 0.848107587486177, + "grad_norm": 0.338253915309906, + "learning_rate": 3.4316747844900133e-06, + "loss": 0.1175, + "step": 47550 + }, + { + "epoch": 0.8481254236078907, + "grad_norm": 0.2497081309556961, + "learning_rate": 3.4308877630977203e-06, + "loss": 0.1326, + "step": 47551 + }, + { + "epoch": 0.8481432597296044, + "grad_norm": 0.2737867534160614, + "learning_rate": 3.4301008253147637e-06, + "loss": 0.1174, + "step": 47552 + }, + { + "epoch": 0.848161095851318, + "grad_norm": 0.2538291811943054, + "learning_rate": 3.4293139711441852e-06, + "loss": 0.1166, + "step": 47553 + }, + { + "epoch": 0.8481789319730317, + "grad_norm": 0.2532767951488495, + "learning_rate": 3.428527200589035e-06, + "loss": 0.1146, + "step": 47554 + }, + { + "epoch": 0.8481967680947454, + "grad_norm": 0.2863558232784271, + "learning_rate": 3.4277405136523582e-06, + "loss": 0.0997, + "step": 47555 + }, + { + "epoch": 0.8482146042164592, + "grad_norm": 0.2636297941207886, + "learning_rate": 3.426953910337216e-06, + "loss": 0.0772, + "step": 47556 + }, + { + "epoch": 0.8482324403381729, + "grad_norm": 0.3280991315841675, + "learning_rate": 3.4261673906466508e-06, + "loss": 0.1403, + "step": 47557 + }, + { + "epoch": 0.8482502764598866, + "grad_norm": 0.34249600768089294, + "learning_rate": 3.425380954583715e-06, + "loss": 0.079, + "step": 47558 + }, + { + "epoch": 0.8482681125816003, + "grad_norm": 0.3236106336116791, + "learning_rate": 3.4245946021514515e-06, + "loss": 0.1064, + "step": 47559 + }, + { + "epoch": 0.848285948703314, + "grad_norm": 0.1811663955450058, + "learning_rate": 3.423808333352907e-06, + "loss": 0.0791, + "step": 47560 + }, + { + "epoch": 0.8483037848250277, + "grad_norm": 0.23325011134147644, + "learning_rate": 3.423022148191138e-06, + "loss": 0.0893, + "step": 47561 + }, + { + "epoch": 0.8483216209467414, + "grad_norm": 0.33347317576408386, + "learning_rate": 3.4222360466691888e-06, + "loss": 0.153, + "step": 47562 + }, + { + "epoch": 0.848339457068455, + "grad_norm": 0.2656535804271698, + "learning_rate": 3.421450028790099e-06, + "loss": 0.1191, + "step": 47563 + }, + { + "epoch": 0.8483572931901687, + "grad_norm": 0.23828156292438507, + "learning_rate": 3.4206640945569217e-06, + "loss": 0.0908, + "step": 47564 + }, + { + "epoch": 0.8483751293118824, + "grad_norm": 0.23775924742221832, + "learning_rate": 3.4198782439727102e-06, + "loss": 0.1254, + "step": 47565 + }, + { + "epoch": 0.8483929654335961, + "grad_norm": 0.24574865400791168, + "learning_rate": 3.419092477040503e-06, + "loss": 0.1228, + "step": 47566 + }, + { + "epoch": 0.8484108015553098, + "grad_norm": 0.20682393014431, + "learning_rate": 3.4183067937633486e-06, + "loss": 0.1044, + "step": 47567 + }, + { + "epoch": 0.8484286376770235, + "grad_norm": 0.2883542478084564, + "learning_rate": 3.417521194144285e-06, + "loss": 0.1119, + "step": 47568 + }, + { + "epoch": 0.8484464737987372, + "grad_norm": 0.2940611243247986, + "learning_rate": 3.4167356781863696e-06, + "loss": 0.0963, + "step": 47569 + }, + { + "epoch": 0.8484643099204509, + "grad_norm": 0.25916025042533875, + "learning_rate": 3.415950245892641e-06, + "loss": 0.0745, + "step": 47570 + }, + { + "epoch": 0.8484821460421645, + "grad_norm": 0.23165477812290192, + "learning_rate": 3.4151648972661432e-06, + "loss": 0.0807, + "step": 47571 + }, + { + "epoch": 0.8484999821638783, + "grad_norm": 0.42337608337402344, + "learning_rate": 3.414379632309922e-06, + "loss": 0.1239, + "step": 47572 + }, + { + "epoch": 0.848517818285592, + "grad_norm": 0.27986547350883484, + "learning_rate": 3.4135944510270133e-06, + "loss": 0.101, + "step": 47573 + }, + { + "epoch": 0.8485356544073057, + "grad_norm": 0.5604416728019714, + "learning_rate": 3.412809353420476e-06, + "loss": 0.1333, + "step": 47574 + }, + { + "epoch": 0.8485534905290194, + "grad_norm": 0.34631314873695374, + "learning_rate": 3.412024339493347e-06, + "loss": 0.1424, + "step": 47575 + }, + { + "epoch": 0.8485713266507331, + "grad_norm": 0.2847054600715637, + "learning_rate": 3.4112394092486594e-06, + "loss": 0.1335, + "step": 47576 + }, + { + "epoch": 0.8485891627724468, + "grad_norm": 0.23745013773441315, + "learning_rate": 3.410454562689472e-06, + "loss": 0.093, + "step": 47577 + }, + { + "epoch": 0.8486069988941605, + "grad_norm": 0.4014798104763031, + "learning_rate": 3.4096697998188127e-06, + "loss": 0.1327, + "step": 47578 + }, + { + "epoch": 0.8486248350158742, + "grad_norm": 0.2616578936576843, + "learning_rate": 3.4088851206397347e-06, + "loss": 0.084, + "step": 47579 + }, + { + "epoch": 0.8486426711375878, + "grad_norm": 0.2699027359485626, + "learning_rate": 3.408100525155278e-06, + "loss": 0.0988, + "step": 47580 + }, + { + "epoch": 0.8486605072593015, + "grad_norm": 0.2636522054672241, + "learning_rate": 3.407316013368475e-06, + "loss": 0.0472, + "step": 47581 + }, + { + "epoch": 0.8486783433810152, + "grad_norm": 0.3269490897655487, + "learning_rate": 3.406531585282377e-06, + "loss": 0.089, + "step": 47582 + }, + { + "epoch": 0.8486961795027289, + "grad_norm": 0.32814884185791016, + "learning_rate": 3.405747240900023e-06, + "loss": 0.112, + "step": 47583 + }, + { + "epoch": 0.8487140156244426, + "grad_norm": 0.24202416837215424, + "learning_rate": 3.4049629802244493e-06, + "loss": 0.1023, + "step": 47584 + }, + { + "epoch": 0.8487318517461563, + "grad_norm": 0.2033878117799759, + "learning_rate": 3.4041788032587006e-06, + "loss": 0.064, + "step": 47585 + }, + { + "epoch": 0.84874968786787, + "grad_norm": 0.29180243611335754, + "learning_rate": 3.4033947100058055e-06, + "loss": 0.1218, + "step": 47586 + }, + { + "epoch": 0.8487675239895837, + "grad_norm": 0.2479466199874878, + "learning_rate": 3.4026107004688167e-06, + "loss": 0.0972, + "step": 47587 + }, + { + "epoch": 0.8487853601112973, + "grad_norm": 0.2755719721317291, + "learning_rate": 3.401826774650771e-06, + "loss": 0.1174, + "step": 47588 + }, + { + "epoch": 0.8488031962330111, + "grad_norm": 0.27944543957710266, + "learning_rate": 3.4010429325547016e-06, + "loss": 0.1332, + "step": 47589 + }, + { + "epoch": 0.8488210323547248, + "grad_norm": 0.32449841499328613, + "learning_rate": 3.4002591741836454e-06, + "loss": 0.0959, + "step": 47590 + }, + { + "epoch": 0.8488388684764385, + "grad_norm": 0.2929539084434509, + "learning_rate": 3.399475499540647e-06, + "loss": 0.1349, + "step": 47591 + }, + { + "epoch": 0.8488567045981522, + "grad_norm": 0.19197528064250946, + "learning_rate": 3.3986919086287454e-06, + "loss": 0.0771, + "step": 47592 + }, + { + "epoch": 0.8488745407198659, + "grad_norm": 0.4546467661857605, + "learning_rate": 3.3979084014509747e-06, + "loss": 0.1159, + "step": 47593 + }, + { + "epoch": 0.8488923768415796, + "grad_norm": 0.37847015261650085, + "learning_rate": 3.3971249780103688e-06, + "loss": 0.0879, + "step": 47594 + }, + { + "epoch": 0.8489102129632933, + "grad_norm": 0.28531503677368164, + "learning_rate": 3.3963416383099718e-06, + "loss": 0.1276, + "step": 47595 + }, + { + "epoch": 0.848928049085007, + "grad_norm": 0.35991430282592773, + "learning_rate": 3.395558382352815e-06, + "loss": 0.0973, + "step": 47596 + }, + { + "epoch": 0.8489458852067207, + "grad_norm": 0.278708279132843, + "learning_rate": 3.394775210141937e-06, + "loss": 0.093, + "step": 47597 + }, + { + "epoch": 0.8489637213284343, + "grad_norm": 0.30353036522865295, + "learning_rate": 3.3939921216803726e-06, + "loss": 0.181, + "step": 47598 + }, + { + "epoch": 0.848981557450148, + "grad_norm": 0.26497504115104675, + "learning_rate": 3.393209116971152e-06, + "loss": 0.0823, + "step": 47599 + }, + { + "epoch": 0.8489993935718617, + "grad_norm": 0.32025888562202454, + "learning_rate": 3.392426196017323e-06, + "loss": 0.1543, + "step": 47600 + }, + { + "epoch": 0.8490172296935754, + "grad_norm": 0.26852986216545105, + "learning_rate": 3.391643358821911e-06, + "loss": 0.1028, + "step": 47601 + }, + { + "epoch": 0.8490350658152891, + "grad_norm": 0.21738412976264954, + "learning_rate": 3.3908606053879522e-06, + "loss": 0.0829, + "step": 47602 + }, + { + "epoch": 0.8490529019370028, + "grad_norm": 0.2731854021549225, + "learning_rate": 3.3900779357184746e-06, + "loss": 0.0593, + "step": 47603 + }, + { + "epoch": 0.8490707380587165, + "grad_norm": 0.29218968749046326, + "learning_rate": 3.3892953498165265e-06, + "loss": 0.1307, + "step": 47604 + }, + { + "epoch": 0.8490885741804302, + "grad_norm": 0.2593521475791931, + "learning_rate": 3.3885128476851268e-06, + "loss": 0.0864, + "step": 47605 + }, + { + "epoch": 0.849106410302144, + "grad_norm": 0.4156484603881836, + "learning_rate": 3.387730429327324e-06, + "loss": 0.0817, + "step": 47606 + }, + { + "epoch": 0.8491242464238576, + "grad_norm": 0.2704949975013733, + "learning_rate": 3.386948094746134e-06, + "loss": 0.1571, + "step": 47607 + }, + { + "epoch": 0.8491420825455713, + "grad_norm": 0.27881699800491333, + "learning_rate": 3.3861658439446054e-06, + "loss": 0.1339, + "step": 47608 + }, + { + "epoch": 0.849159918667285, + "grad_norm": 0.280504435300827, + "learning_rate": 3.385383676925763e-06, + "loss": 0.0979, + "step": 47609 + }, + { + "epoch": 0.8491777547889987, + "grad_norm": 0.2182489037513733, + "learning_rate": 3.384601593692638e-06, + "loss": 0.1075, + "step": 47610 + }, + { + "epoch": 0.8491955909107124, + "grad_norm": 0.2650125026702881, + "learning_rate": 3.3838195942482635e-06, + "loss": 0.1101, + "step": 47611 + }, + { + "epoch": 0.8492134270324261, + "grad_norm": 0.21502986550331116, + "learning_rate": 3.383037678595663e-06, + "loss": 0.1209, + "step": 47612 + }, + { + "epoch": 0.8492312631541398, + "grad_norm": 0.23931175470352173, + "learning_rate": 3.3822558467378834e-06, + "loss": 0.1281, + "step": 47613 + }, + { + "epoch": 0.8492490992758535, + "grad_norm": 0.27963826060295105, + "learning_rate": 3.381474098677945e-06, + "loss": 0.1187, + "step": 47614 + }, + { + "epoch": 0.8492669353975671, + "grad_norm": 0.32648712396621704, + "learning_rate": 3.3806924344188806e-06, + "loss": 0.0978, + "step": 47615 + }, + { + "epoch": 0.8492847715192808, + "grad_norm": 0.3141188323497772, + "learning_rate": 3.379910853963711e-06, + "loss": 0.1415, + "step": 47616 + }, + { + "epoch": 0.8493026076409945, + "grad_norm": 0.23769818246364594, + "learning_rate": 3.379129357315483e-06, + "loss": 0.0781, + "step": 47617 + }, + { + "epoch": 0.8493204437627082, + "grad_norm": 0.2554595172405243, + "learning_rate": 3.3783479444772113e-06, + "loss": 0.1255, + "step": 47618 + }, + { + "epoch": 0.8493382798844219, + "grad_norm": 0.2738015651702881, + "learning_rate": 3.3775666154519355e-06, + "loss": 0.111, + "step": 47619 + }, + { + "epoch": 0.8493561160061356, + "grad_norm": 0.5938237905502319, + "learning_rate": 3.376785370242674e-06, + "loss": 0.1128, + "step": 47620 + }, + { + "epoch": 0.8493739521278493, + "grad_norm": 0.3143916130065918, + "learning_rate": 3.3760042088524654e-06, + "loss": 0.1276, + "step": 47621 + }, + { + "epoch": 0.849391788249563, + "grad_norm": 0.2929554283618927, + "learning_rate": 3.375223131284336e-06, + "loss": 0.1721, + "step": 47622 + }, + { + "epoch": 0.8494096243712768, + "grad_norm": 0.30695393681526184, + "learning_rate": 3.374442137541309e-06, + "loss": 0.0733, + "step": 47623 + }, + { + "epoch": 0.8494274604929904, + "grad_norm": 0.29187247157096863, + "learning_rate": 3.3736612276264123e-06, + "loss": 0.1015, + "step": 47624 + }, + { + "epoch": 0.8494452966147041, + "grad_norm": 0.24599182605743408, + "learning_rate": 3.372880401542672e-06, + "loss": 0.0805, + "step": 47625 + }, + { + "epoch": 0.8494631327364178, + "grad_norm": 0.2022215723991394, + "learning_rate": 3.3720996592931183e-06, + "loss": 0.1316, + "step": 47626 + }, + { + "epoch": 0.8494809688581315, + "grad_norm": 0.28157946467399597, + "learning_rate": 3.3713190008807793e-06, + "loss": 0.1176, + "step": 47627 + }, + { + "epoch": 0.8494988049798452, + "grad_norm": 0.24914538860321045, + "learning_rate": 3.3705384263086783e-06, + "loss": 0.1245, + "step": 47628 + }, + { + "epoch": 0.8495166411015589, + "grad_norm": 0.2940198481082916, + "learning_rate": 3.369757935579834e-06, + "loss": 0.1294, + "step": 47629 + }, + { + "epoch": 0.8495344772232726, + "grad_norm": 0.27949362993240356, + "learning_rate": 3.368977528697284e-06, + "loss": 0.1031, + "step": 47630 + }, + { + "epoch": 0.8495523133449863, + "grad_norm": 0.3019295930862427, + "learning_rate": 3.3681972056640502e-06, + "loss": 0.1311, + "step": 47631 + }, + { + "epoch": 0.8495701494667, + "grad_norm": 0.22180651128292084, + "learning_rate": 3.36741696648315e-06, + "loss": 0.1203, + "step": 47632 + }, + { + "epoch": 0.8495879855884136, + "grad_norm": 0.2812873423099518, + "learning_rate": 3.366636811157617e-06, + "loss": 0.133, + "step": 47633 + }, + { + "epoch": 0.8496058217101273, + "grad_norm": 0.2788574993610382, + "learning_rate": 3.365856739690465e-06, + "loss": 0.0779, + "step": 47634 + }, + { + "epoch": 0.849623657831841, + "grad_norm": 0.2598767876625061, + "learning_rate": 3.365076752084734e-06, + "loss": 0.1323, + "step": 47635 + }, + { + "epoch": 0.8496414939535547, + "grad_norm": 0.2924971580505371, + "learning_rate": 3.3642968483434343e-06, + "loss": 0.1306, + "step": 47636 + }, + { + "epoch": 0.8496593300752684, + "grad_norm": 0.24843446910381317, + "learning_rate": 3.3635170284695955e-06, + "loss": 0.1004, + "step": 47637 + }, + { + "epoch": 0.8496771661969821, + "grad_norm": 0.21223753690719604, + "learning_rate": 3.362737292466231e-06, + "loss": 0.1246, + "step": 47638 + }, + { + "epoch": 0.8496950023186958, + "grad_norm": 0.23976342380046844, + "learning_rate": 3.3619576403363744e-06, + "loss": 0.1319, + "step": 47639 + }, + { + "epoch": 0.8497128384404096, + "grad_norm": 0.28250566124916077, + "learning_rate": 3.3611780720830433e-06, + "loss": 0.1247, + "step": 47640 + }, + { + "epoch": 0.8497306745621233, + "grad_norm": 0.28882476687431335, + "learning_rate": 3.3603985877092627e-06, + "loss": 0.1597, + "step": 47641 + }, + { + "epoch": 0.8497485106838369, + "grad_norm": 0.23271915316581726, + "learning_rate": 3.359619187218044e-06, + "loss": 0.1114, + "step": 47642 + }, + { + "epoch": 0.8497663468055506, + "grad_norm": 0.23010171949863434, + "learning_rate": 3.358839870612421e-06, + "loss": 0.077, + "step": 47643 + }, + { + "epoch": 0.8497841829272643, + "grad_norm": 0.3984963297843933, + "learning_rate": 3.3580606378954078e-06, + "loss": 0.1015, + "step": 47644 + }, + { + "epoch": 0.849802019048978, + "grad_norm": 0.24417617917060852, + "learning_rate": 3.3572814890700243e-06, + "loss": 0.0876, + "step": 47645 + }, + { + "epoch": 0.8498198551706917, + "grad_norm": 0.2776118218898773, + "learning_rate": 3.356502424139296e-06, + "loss": 0.0763, + "step": 47646 + }, + { + "epoch": 0.8498376912924054, + "grad_norm": 0.24080964922904968, + "learning_rate": 3.3557234431062344e-06, + "loss": 0.0836, + "step": 47647 + }, + { + "epoch": 0.8498555274141191, + "grad_norm": 0.32639598846435547, + "learning_rate": 3.3549445459738706e-06, + "loss": 0.1107, + "step": 47648 + }, + { + "epoch": 0.8498733635358328, + "grad_norm": 0.24550041556358337, + "learning_rate": 3.354165732745218e-06, + "loss": 0.0753, + "step": 47649 + }, + { + "epoch": 0.8498911996575464, + "grad_norm": 0.20576785504817963, + "learning_rate": 3.3533870034232945e-06, + "loss": 0.0805, + "step": 47650 + }, + { + "epoch": 0.8499090357792601, + "grad_norm": 0.1855613738298416, + "learning_rate": 3.352608358011114e-06, + "loss": 0.0864, + "step": 47651 + }, + { + "epoch": 0.8499268719009738, + "grad_norm": 0.27259737253189087, + "learning_rate": 3.3518297965117073e-06, + "loss": 0.0771, + "step": 47652 + }, + { + "epoch": 0.8499447080226875, + "grad_norm": 0.32468804717063904, + "learning_rate": 3.3510513189280836e-06, + "loss": 0.1401, + "step": 47653 + }, + { + "epoch": 0.8499625441444012, + "grad_norm": 0.29628005623817444, + "learning_rate": 3.350272925263265e-06, + "loss": 0.1058, + "step": 47654 + }, + { + "epoch": 0.8499803802661149, + "grad_norm": 0.23224864900112152, + "learning_rate": 3.3494946155202573e-06, + "loss": 0.0842, + "step": 47655 + }, + { + "epoch": 0.8499982163878286, + "grad_norm": 0.26734307408332825, + "learning_rate": 3.348716389702092e-06, + "loss": 0.0539, + "step": 47656 + }, + { + "epoch": 0.8500160525095424, + "grad_norm": 0.2762904167175293, + "learning_rate": 3.3479382478117825e-06, + "loss": 0.0931, + "step": 47657 + }, + { + "epoch": 0.8500338886312561, + "grad_norm": 0.32132840156555176, + "learning_rate": 3.3471601898523415e-06, + "loss": 0.1487, + "step": 47658 + }, + { + "epoch": 0.8500517247529698, + "grad_norm": 0.26144763827323914, + "learning_rate": 3.3463822158267794e-06, + "loss": 0.1556, + "step": 47659 + }, + { + "epoch": 0.8500695608746834, + "grad_norm": 0.29807576537132263, + "learning_rate": 3.345604325738125e-06, + "loss": 0.1217, + "step": 47660 + }, + { + "epoch": 0.8500873969963971, + "grad_norm": 0.21024297177791595, + "learning_rate": 3.3448265195893836e-06, + "loss": 0.0921, + "step": 47661 + }, + { + "epoch": 0.8501052331181108, + "grad_norm": 0.2591642141342163, + "learning_rate": 3.3440487973835784e-06, + "loss": 0.1186, + "step": 47662 + }, + { + "epoch": 0.8501230692398245, + "grad_norm": 0.26573431491851807, + "learning_rate": 3.3432711591237175e-06, + "loss": 0.1097, + "step": 47663 + }, + { + "epoch": 0.8501409053615382, + "grad_norm": 0.27268293499946594, + "learning_rate": 3.3424936048128156e-06, + "loss": 0.1142, + "step": 47664 + }, + { + "epoch": 0.8501587414832519, + "grad_norm": 0.3256499767303467, + "learning_rate": 3.3417161344538928e-06, + "loss": 0.1587, + "step": 47665 + }, + { + "epoch": 0.8501765776049656, + "grad_norm": 0.29039502143859863, + "learning_rate": 3.3409387480499597e-06, + "loss": 0.0996, + "step": 47666 + }, + { + "epoch": 0.8501944137266793, + "grad_norm": 0.29900890588760376, + "learning_rate": 3.3401614456040285e-06, + "loss": 0.132, + "step": 47667 + }, + { + "epoch": 0.8502122498483929, + "grad_norm": 0.20569966733455658, + "learning_rate": 3.339384227119105e-06, + "loss": 0.0865, + "step": 47668 + }, + { + "epoch": 0.8502300859701066, + "grad_norm": 0.42728641629219055, + "learning_rate": 3.338607092598217e-06, + "loss": 0.0957, + "step": 47669 + }, + { + "epoch": 0.8502479220918203, + "grad_norm": 0.40185558795928955, + "learning_rate": 3.337830042044368e-06, + "loss": 0.1192, + "step": 47670 + }, + { + "epoch": 0.850265758213534, + "grad_norm": 0.2814464569091797, + "learning_rate": 3.337053075460575e-06, + "loss": 0.0627, + "step": 47671 + }, + { + "epoch": 0.8502835943352477, + "grad_norm": 0.2782944440841675, + "learning_rate": 3.336276192849838e-06, + "loss": 0.1495, + "step": 47672 + }, + { + "epoch": 0.8503014304569615, + "grad_norm": 0.3199179768562317, + "learning_rate": 3.335499394215183e-06, + "loss": 0.096, + "step": 47673 + }, + { + "epoch": 0.8503192665786752, + "grad_norm": 0.24381811916828156, + "learning_rate": 3.33472267955961e-06, + "loss": 0.1353, + "step": 47674 + }, + { + "epoch": 0.8503371027003889, + "grad_norm": 0.26505932211875916, + "learning_rate": 3.3339460488861385e-06, + "loss": 0.1086, + "step": 47675 + }, + { + "epoch": 0.8503549388221026, + "grad_norm": 0.23636607825756073, + "learning_rate": 3.3331695021977778e-06, + "loss": 0.0722, + "step": 47676 + }, + { + "epoch": 0.8503727749438162, + "grad_norm": 0.2680937647819519, + "learning_rate": 3.3323930394975305e-06, + "loss": 0.1267, + "step": 47677 + }, + { + "epoch": 0.8503906110655299, + "grad_norm": 0.20951932668685913, + "learning_rate": 3.3316166607884143e-06, + "loss": 0.0525, + "step": 47678 + }, + { + "epoch": 0.8504084471872436, + "grad_norm": 0.24381721019744873, + "learning_rate": 3.3308403660734374e-06, + "loss": 0.0768, + "step": 47679 + }, + { + "epoch": 0.8504262833089573, + "grad_norm": 0.27948907017707825, + "learning_rate": 3.3300641553556083e-06, + "loss": 0.0675, + "step": 47680 + }, + { + "epoch": 0.850444119430671, + "grad_norm": 0.2873189449310303, + "learning_rate": 3.329288028637928e-06, + "loss": 0.0843, + "step": 47681 + }, + { + "epoch": 0.8504619555523847, + "grad_norm": 0.25985538959503174, + "learning_rate": 3.3285119859234185e-06, + "loss": 0.1054, + "step": 47682 + }, + { + "epoch": 0.8504797916740984, + "grad_norm": 0.29643651843070984, + "learning_rate": 3.327736027215081e-06, + "loss": 0.1169, + "step": 47683 + }, + { + "epoch": 0.8504976277958121, + "grad_norm": 0.3242148756980896, + "learning_rate": 3.326960152515926e-06, + "loss": 0.1364, + "step": 47684 + }, + { + "epoch": 0.8505154639175257, + "grad_norm": 0.29610541462898254, + "learning_rate": 3.3261843618289517e-06, + "loss": 0.1275, + "step": 47685 + }, + { + "epoch": 0.8505333000392394, + "grad_norm": 0.29560166597366333, + "learning_rate": 3.3254086551571777e-06, + "loss": 0.0669, + "step": 47686 + }, + { + "epoch": 0.8505511361609531, + "grad_norm": 0.25385379791259766, + "learning_rate": 3.3246330325036075e-06, + "loss": 0.0983, + "step": 47687 + }, + { + "epoch": 0.8505689722826668, + "grad_norm": 0.2635827958583832, + "learning_rate": 3.323857493871238e-06, + "loss": 0.1566, + "step": 47688 + }, + { + "epoch": 0.8505868084043805, + "grad_norm": 0.23801691830158234, + "learning_rate": 3.3230820392630924e-06, + "loss": 0.1026, + "step": 47689 + }, + { + "epoch": 0.8506046445260943, + "grad_norm": 0.2759699821472168, + "learning_rate": 3.3223066686821597e-06, + "loss": 0.1222, + "step": 47690 + }, + { + "epoch": 0.850622480647808, + "grad_norm": 0.2693772614002228, + "learning_rate": 3.3215313821314625e-06, + "loss": 0.1118, + "step": 47691 + }, + { + "epoch": 0.8506403167695217, + "grad_norm": 0.22049814462661743, + "learning_rate": 3.320756179613993e-06, + "loss": 0.056, + "step": 47692 + }, + { + "epoch": 0.8506581528912354, + "grad_norm": 0.21508820354938507, + "learning_rate": 3.3199810611327625e-06, + "loss": 0.091, + "step": 47693 + }, + { + "epoch": 0.850675989012949, + "grad_norm": 0.2161596268415451, + "learning_rate": 3.319206026690769e-06, + "loss": 0.0974, + "step": 47694 + }, + { + "epoch": 0.8506938251346627, + "grad_norm": 0.30514243245124817, + "learning_rate": 3.318431076291023e-06, + "loss": 0.0887, + "step": 47695 + }, + { + "epoch": 0.8507116612563764, + "grad_norm": 0.26499712467193604, + "learning_rate": 3.3176562099365316e-06, + "loss": 0.123, + "step": 47696 + }, + { + "epoch": 0.8507294973780901, + "grad_norm": 0.3640264868736267, + "learning_rate": 3.316881427630289e-06, + "loss": 0.1122, + "step": 47697 + }, + { + "epoch": 0.8507473334998038, + "grad_norm": 0.3829297721385956, + "learning_rate": 3.3161067293753013e-06, + "loss": 0.195, + "step": 47698 + }, + { + "epoch": 0.8507651696215175, + "grad_norm": 0.40436848998069763, + "learning_rate": 3.315332115174577e-06, + "loss": 0.1181, + "step": 47699 + }, + { + "epoch": 0.8507830057432312, + "grad_norm": 0.3490535616874695, + "learning_rate": 3.314557585031114e-06, + "loss": 0.1378, + "step": 47700 + }, + { + "epoch": 0.8508008418649449, + "grad_norm": 0.3003726005554199, + "learning_rate": 3.3137831389479124e-06, + "loss": 0.0864, + "step": 47701 + }, + { + "epoch": 0.8508186779866586, + "grad_norm": 0.2347470074892044, + "learning_rate": 3.313008776927981e-06, + "loss": 0.1122, + "step": 47702 + }, + { + "epoch": 0.8508365141083722, + "grad_norm": 0.24559372663497925, + "learning_rate": 3.3122344989743147e-06, + "loss": 0.0723, + "step": 47703 + }, + { + "epoch": 0.8508543502300859, + "grad_norm": 0.24125739932060242, + "learning_rate": 3.311460305089922e-06, + "loss": 0.0973, + "step": 47704 + }, + { + "epoch": 0.8508721863517996, + "grad_norm": 0.33484894037246704, + "learning_rate": 3.3106861952778013e-06, + "loss": 0.1782, + "step": 47705 + }, + { + "epoch": 0.8508900224735133, + "grad_norm": 0.2980024516582489, + "learning_rate": 3.309912169540952e-06, + "loss": 0.1576, + "step": 47706 + }, + { + "epoch": 0.8509078585952271, + "grad_norm": 0.28373217582702637, + "learning_rate": 3.309138227882369e-06, + "loss": 0.1352, + "step": 47707 + }, + { + "epoch": 0.8509256947169408, + "grad_norm": 0.24331073462963104, + "learning_rate": 3.3083643703050615e-06, + "loss": 0.0757, + "step": 47708 + }, + { + "epoch": 0.8509435308386545, + "grad_norm": 0.2487875521183014, + "learning_rate": 3.30759059681203e-06, + "loss": 0.1837, + "step": 47709 + }, + { + "epoch": 0.8509613669603682, + "grad_norm": 0.3310057818889618, + "learning_rate": 3.3068169074062657e-06, + "loss": 0.1377, + "step": 47710 + }, + { + "epoch": 0.8509792030820819, + "grad_norm": 0.32067668437957764, + "learning_rate": 3.3060433020907667e-06, + "loss": 0.1251, + "step": 47711 + }, + { + "epoch": 0.8509970392037955, + "grad_norm": 0.21814998984336853, + "learning_rate": 3.3052697808685446e-06, + "loss": 0.1237, + "step": 47712 + }, + { + "epoch": 0.8510148753255092, + "grad_norm": 0.3243829309940338, + "learning_rate": 3.3044963437425885e-06, + "loss": 0.1505, + "step": 47713 + }, + { + "epoch": 0.8510327114472229, + "grad_norm": 0.334989994764328, + "learning_rate": 3.303722990715896e-06, + "loss": 0.1217, + "step": 47714 + }, + { + "epoch": 0.8510505475689366, + "grad_norm": 0.37650513648986816, + "learning_rate": 3.302949721791465e-06, + "loss": 0.1212, + "step": 47715 + }, + { + "epoch": 0.8510683836906503, + "grad_norm": 0.27502763271331787, + "learning_rate": 3.3021765369722985e-06, + "loss": 0.1398, + "step": 47716 + }, + { + "epoch": 0.851086219812364, + "grad_norm": 0.2925994098186493, + "learning_rate": 3.3014034362613825e-06, + "loss": 0.1263, + "step": 47717 + }, + { + "epoch": 0.8511040559340777, + "grad_norm": 0.2871912121772766, + "learning_rate": 3.3006304196617294e-06, + "loss": 0.11, + "step": 47718 + }, + { + "epoch": 0.8511218920557914, + "grad_norm": 0.3749898672103882, + "learning_rate": 3.2998574871763277e-06, + "loss": 0.1471, + "step": 47719 + }, + { + "epoch": 0.851139728177505, + "grad_norm": 0.23590585589408875, + "learning_rate": 3.299084638808167e-06, + "loss": 0.0963, + "step": 47720 + }, + { + "epoch": 0.8511575642992187, + "grad_norm": 0.24640442430973053, + "learning_rate": 3.298311874560256e-06, + "loss": 0.1058, + "step": 47721 + }, + { + "epoch": 0.8511754004209324, + "grad_norm": 0.25836181640625, + "learning_rate": 3.297539194435581e-06, + "loss": 0.0808, + "step": 47722 + }, + { + "epoch": 0.8511932365426461, + "grad_norm": 0.2222335785627365, + "learning_rate": 3.296766598437143e-06, + "loss": 0.0854, + "step": 47723 + }, + { + "epoch": 0.8512110726643599, + "grad_norm": 0.3288305401802063, + "learning_rate": 3.2959940865679273e-06, + "loss": 0.1156, + "step": 47724 + }, + { + "epoch": 0.8512289087860736, + "grad_norm": 0.3277072608470917, + "learning_rate": 3.2952216588309383e-06, + "loss": 0.1571, + "step": 47725 + }, + { + "epoch": 0.8512467449077873, + "grad_norm": 0.26358556747436523, + "learning_rate": 3.29444931522917e-06, + "loss": 0.0842, + "step": 47726 + }, + { + "epoch": 0.851264581029501, + "grad_norm": 0.21426579356193542, + "learning_rate": 3.2936770557656096e-06, + "loss": 0.0892, + "step": 47727 + }, + { + "epoch": 0.8512824171512147, + "grad_norm": 0.1993224024772644, + "learning_rate": 3.2929048804432513e-06, + "loss": 0.0823, + "step": 47728 + }, + { + "epoch": 0.8513002532729284, + "grad_norm": 0.31235823035240173, + "learning_rate": 3.2921327892650955e-06, + "loss": 0.0738, + "step": 47729 + }, + { + "epoch": 0.851318089394642, + "grad_norm": 0.2811439037322998, + "learning_rate": 3.2913607822341235e-06, + "loss": 0.1235, + "step": 47730 + }, + { + "epoch": 0.8513359255163557, + "grad_norm": 0.3290042579174042, + "learning_rate": 3.290588859353344e-06, + "loss": 0.1163, + "step": 47731 + }, + { + "epoch": 0.8513537616380694, + "grad_norm": 0.3252313733100891, + "learning_rate": 3.2898170206257377e-06, + "loss": 0.1069, + "step": 47732 + }, + { + "epoch": 0.8513715977597831, + "grad_norm": 0.27864041924476624, + "learning_rate": 3.2890452660542942e-06, + "loss": 0.1041, + "step": 47733 + }, + { + "epoch": 0.8513894338814968, + "grad_norm": 0.29188069701194763, + "learning_rate": 3.288273595642016e-06, + "loss": 0.1586, + "step": 47734 + }, + { + "epoch": 0.8514072700032105, + "grad_norm": 0.30595695972442627, + "learning_rate": 3.287502009391888e-06, + "loss": 0.0856, + "step": 47735 + }, + { + "epoch": 0.8514251061249242, + "grad_norm": 0.3293379545211792, + "learning_rate": 3.286730507306901e-06, + "loss": 0.1251, + "step": 47736 + }, + { + "epoch": 0.8514429422466379, + "grad_norm": 0.40624716877937317, + "learning_rate": 3.2859590893900423e-06, + "loss": 0.1425, + "step": 47737 + }, + { + "epoch": 0.8514607783683515, + "grad_norm": 0.3774282932281494, + "learning_rate": 3.285187755644309e-06, + "loss": 0.101, + "step": 47738 + }, + { + "epoch": 0.8514786144900652, + "grad_norm": 0.2766306698322296, + "learning_rate": 3.2844165060726905e-06, + "loss": 0.1138, + "step": 47739 + }, + { + "epoch": 0.8514964506117789, + "grad_norm": 0.3679528832435608, + "learning_rate": 3.283645340678171e-06, + "loss": 0.1442, + "step": 47740 + }, + { + "epoch": 0.8515142867334927, + "grad_norm": 0.3985013961791992, + "learning_rate": 3.2828742594637445e-06, + "loss": 0.135, + "step": 47741 + }, + { + "epoch": 0.8515321228552064, + "grad_norm": 0.28673189878463745, + "learning_rate": 3.2821032624323926e-06, + "loss": 0.0919, + "step": 47742 + }, + { + "epoch": 0.8515499589769201, + "grad_norm": 0.2633587121963501, + "learning_rate": 3.2813323495871156e-06, + "loss": 0.1185, + "step": 47743 + }, + { + "epoch": 0.8515677950986338, + "grad_norm": 0.28822004795074463, + "learning_rate": 3.280561520930889e-06, + "loss": 0.0884, + "step": 47744 + }, + { + "epoch": 0.8515856312203475, + "grad_norm": 0.29646366834640503, + "learning_rate": 3.279790776466715e-06, + "loss": 0.1309, + "step": 47745 + }, + { + "epoch": 0.8516034673420612, + "grad_norm": 0.3249569535255432, + "learning_rate": 3.2790201161975675e-06, + "loss": 0.0893, + "step": 47746 + }, + { + "epoch": 0.8516213034637748, + "grad_norm": 0.32418540120124817, + "learning_rate": 3.2782495401264463e-06, + "loss": 0.1752, + "step": 47747 + }, + { + "epoch": 0.8516391395854885, + "grad_norm": 0.33033403754234314, + "learning_rate": 3.2774790482563293e-06, + "loss": 0.1643, + "step": 47748 + }, + { + "epoch": 0.8516569757072022, + "grad_norm": 0.24495819211006165, + "learning_rate": 3.276708640590209e-06, + "loss": 0.0898, + "step": 47749 + }, + { + "epoch": 0.8516748118289159, + "grad_norm": 0.3480449318885803, + "learning_rate": 3.2759383171310638e-06, + "loss": 0.1162, + "step": 47750 + }, + { + "epoch": 0.8516926479506296, + "grad_norm": 0.20499290525913239, + "learning_rate": 3.2751680778818876e-06, + "loss": 0.0775, + "step": 47751 + }, + { + "epoch": 0.8517104840723433, + "grad_norm": 0.3261464238166809, + "learning_rate": 3.2743979228456678e-06, + "loss": 0.1723, + "step": 47752 + }, + { + "epoch": 0.851728320194057, + "grad_norm": 0.27224817872047424, + "learning_rate": 3.2736278520253823e-06, + "loss": 0.0904, + "step": 47753 + }, + { + "epoch": 0.8517461563157707, + "grad_norm": 0.31743139028549194, + "learning_rate": 3.27285786542402e-06, + "loss": 0.1329, + "step": 47754 + }, + { + "epoch": 0.8517639924374844, + "grad_norm": 0.2498500943183899, + "learning_rate": 3.2720879630445595e-06, + "loss": 0.1189, + "step": 47755 + }, + { + "epoch": 0.851781828559198, + "grad_norm": 0.3125590980052948, + "learning_rate": 3.2713181448899954e-06, + "loss": 0.1475, + "step": 47756 + }, + { + "epoch": 0.8517996646809117, + "grad_norm": 0.2251376509666443, + "learning_rate": 3.2705484109633034e-06, + "loss": 0.0971, + "step": 47757 + }, + { + "epoch": 0.8518175008026255, + "grad_norm": 0.2038601189851761, + "learning_rate": 3.269778761267475e-06, + "loss": 0.1233, + "step": 47758 + }, + { + "epoch": 0.8518353369243392, + "grad_norm": 0.25741422176361084, + "learning_rate": 3.2690091958054834e-06, + "loss": 0.1228, + "step": 47759 + }, + { + "epoch": 0.8518531730460529, + "grad_norm": 0.19874070584774017, + "learning_rate": 3.268239714580323e-06, + "loss": 0.076, + "step": 47760 + }, + { + "epoch": 0.8518710091677666, + "grad_norm": 0.2819465398788452, + "learning_rate": 3.2674703175949727e-06, + "loss": 0.1078, + "step": 47761 + }, + { + "epoch": 0.8518888452894803, + "grad_norm": 0.25806084275245667, + "learning_rate": 3.2667010048524157e-06, + "loss": 0.1111, + "step": 47762 + }, + { + "epoch": 0.851906681411194, + "grad_norm": 0.28122764825820923, + "learning_rate": 3.2659317763556242e-06, + "loss": 0.126, + "step": 47763 + }, + { + "epoch": 0.8519245175329077, + "grad_norm": 0.26505550742149353, + "learning_rate": 3.2651626321075906e-06, + "loss": 0.1356, + "step": 47764 + }, + { + "epoch": 0.8519423536546213, + "grad_norm": 0.3219742178916931, + "learning_rate": 3.264393572111299e-06, + "loss": 0.125, + "step": 47765 + }, + { + "epoch": 0.851960189776335, + "grad_norm": 0.3135806620121002, + "learning_rate": 3.2636245963697217e-06, + "loss": 0.1464, + "step": 47766 + }, + { + "epoch": 0.8519780258980487, + "grad_norm": 0.3126005530357361, + "learning_rate": 3.2628557048858423e-06, + "loss": 0.1204, + "step": 47767 + }, + { + "epoch": 0.8519958620197624, + "grad_norm": 0.4257054626941681, + "learning_rate": 3.262086897662639e-06, + "loss": 0.1017, + "step": 47768 + }, + { + "epoch": 0.8520136981414761, + "grad_norm": 0.4266127645969391, + "learning_rate": 3.2613181747030984e-06, + "loss": 0.1627, + "step": 47769 + }, + { + "epoch": 0.8520315342631898, + "grad_norm": 0.29566749930381775, + "learning_rate": 3.2605495360101988e-06, + "loss": 0.091, + "step": 47770 + }, + { + "epoch": 0.8520493703849035, + "grad_norm": 0.26643410325050354, + "learning_rate": 3.25978098158691e-06, + "loss": 0.1108, + "step": 47771 + }, + { + "epoch": 0.8520672065066172, + "grad_norm": 0.3051968216896057, + "learning_rate": 3.2590125114362213e-06, + "loss": 0.1001, + "step": 47772 + }, + { + "epoch": 0.8520850426283308, + "grad_norm": 0.2870226204395294, + "learning_rate": 3.2582441255611134e-06, + "loss": 0.099, + "step": 47773 + }, + { + "epoch": 0.8521028787500446, + "grad_norm": 0.191124826669693, + "learning_rate": 3.257475823964562e-06, + "loss": 0.09, + "step": 47774 + }, + { + "epoch": 0.8521207148717583, + "grad_norm": 0.290499210357666, + "learning_rate": 3.256707606649542e-06, + "loss": 0.1516, + "step": 47775 + }, + { + "epoch": 0.852138550993472, + "grad_norm": 0.2909443974494934, + "learning_rate": 3.2559394736190294e-06, + "loss": 0.1089, + "step": 47776 + }, + { + "epoch": 0.8521563871151857, + "grad_norm": 0.262471079826355, + "learning_rate": 3.2551714248760077e-06, + "loss": 0.0987, + "step": 47777 + }, + { + "epoch": 0.8521742232368994, + "grad_norm": 0.5307853817939758, + "learning_rate": 3.254403460423455e-06, + "loss": 0.0972, + "step": 47778 + }, + { + "epoch": 0.8521920593586131, + "grad_norm": 0.43064555525779724, + "learning_rate": 3.253635580264344e-06, + "loss": 0.1416, + "step": 47779 + }, + { + "epoch": 0.8522098954803268, + "grad_norm": 0.2886241376399994, + "learning_rate": 3.2528677844016557e-06, + "loss": 0.1132, + "step": 47780 + }, + { + "epoch": 0.8522277316020405, + "grad_norm": 0.30424654483795166, + "learning_rate": 3.2521000728383543e-06, + "loss": 0.1101, + "step": 47781 + }, + { + "epoch": 0.8522455677237541, + "grad_norm": 0.24711903929710388, + "learning_rate": 3.251332445577429e-06, + "loss": 0.16, + "step": 47782 + }, + { + "epoch": 0.8522634038454678, + "grad_norm": 0.25032106041908264, + "learning_rate": 3.250564902621853e-06, + "loss": 0.0778, + "step": 47783 + }, + { + "epoch": 0.8522812399671815, + "grad_norm": 0.19252462685108185, + "learning_rate": 3.2497974439745948e-06, + "loss": 0.0923, + "step": 47784 + }, + { + "epoch": 0.8522990760888952, + "grad_norm": 0.29898732900619507, + "learning_rate": 3.249030069638637e-06, + "loss": 0.1264, + "step": 47785 + }, + { + "epoch": 0.8523169122106089, + "grad_norm": 0.27894526720046997, + "learning_rate": 3.248262779616948e-06, + "loss": 0.1461, + "step": 47786 + }, + { + "epoch": 0.8523347483323226, + "grad_norm": 0.3338276147842407, + "learning_rate": 3.2474955739125096e-06, + "loss": 0.1237, + "step": 47787 + }, + { + "epoch": 0.8523525844540363, + "grad_norm": 0.24039319157600403, + "learning_rate": 3.2467284525282915e-06, + "loss": 0.1248, + "step": 47788 + }, + { + "epoch": 0.85237042057575, + "grad_norm": 0.2494104653596878, + "learning_rate": 3.245961415467261e-06, + "loss": 0.0784, + "step": 47789 + }, + { + "epoch": 0.8523882566974637, + "grad_norm": 0.2040245532989502, + "learning_rate": 3.245194462732404e-06, + "loss": 0.0987, + "step": 47790 + }, + { + "epoch": 0.8524060928191775, + "grad_norm": 0.22220703959465027, + "learning_rate": 3.2444275943266876e-06, + "loss": 0.0756, + "step": 47791 + }, + { + "epoch": 0.8524239289408911, + "grad_norm": 0.24491311609745026, + "learning_rate": 3.2436608102530823e-06, + "loss": 0.1064, + "step": 47792 + }, + { + "epoch": 0.8524417650626048, + "grad_norm": 0.24906878173351288, + "learning_rate": 3.2428941105145576e-06, + "loss": 0.127, + "step": 47793 + }, + { + "epoch": 0.8524596011843185, + "grad_norm": 0.24240249395370483, + "learning_rate": 3.242127495114097e-06, + "loss": 0.1572, + "step": 47794 + }, + { + "epoch": 0.8524774373060322, + "grad_norm": 0.4117433726787567, + "learning_rate": 3.2413609640546627e-06, + "loss": 0.1485, + "step": 47795 + }, + { + "epoch": 0.8524952734277459, + "grad_norm": 0.2782477140426636, + "learning_rate": 3.2405945173392293e-06, + "loss": 0.1061, + "step": 47796 + }, + { + "epoch": 0.8525131095494596, + "grad_norm": 0.21209150552749634, + "learning_rate": 3.2398281549707673e-06, + "loss": 0.0869, + "step": 47797 + }, + { + "epoch": 0.8525309456711733, + "grad_norm": 0.25119513273239136, + "learning_rate": 3.239061876952243e-06, + "loss": 0.0864, + "step": 47798 + }, + { + "epoch": 0.852548781792887, + "grad_norm": 0.26370301842689514, + "learning_rate": 3.2382956832866267e-06, + "loss": 0.1258, + "step": 47799 + }, + { + "epoch": 0.8525666179146006, + "grad_norm": 0.2579623758792877, + "learning_rate": 3.2375295739769025e-06, + "loss": 0.0939, + "step": 47800 + }, + { + "epoch": 0.8525844540363143, + "grad_norm": 0.34552836418151855, + "learning_rate": 3.2367635490260284e-06, + "loss": 0.1276, + "step": 47801 + }, + { + "epoch": 0.852602290158028, + "grad_norm": 0.384785532951355, + "learning_rate": 3.2359976084369693e-06, + "loss": 0.1169, + "step": 47802 + }, + { + "epoch": 0.8526201262797417, + "grad_norm": 0.2821256220340729, + "learning_rate": 3.2352317522127086e-06, + "loss": 0.1238, + "step": 47803 + }, + { + "epoch": 0.8526379624014554, + "grad_norm": 0.2784591019153595, + "learning_rate": 3.234465980356205e-06, + "loss": 0.117, + "step": 47804 + }, + { + "epoch": 0.8526557985231691, + "grad_norm": 0.37206172943115234, + "learning_rate": 3.2337002928704287e-06, + "loss": 0.1608, + "step": 47805 + }, + { + "epoch": 0.8526736346448828, + "grad_norm": 0.24293984472751617, + "learning_rate": 3.2329346897583405e-06, + "loss": 0.0822, + "step": 47806 + }, + { + "epoch": 0.8526914707665965, + "grad_norm": 0.2291383594274521, + "learning_rate": 3.232169171022925e-06, + "loss": 0.1034, + "step": 47807 + }, + { + "epoch": 0.8527093068883103, + "grad_norm": 0.29689157009124756, + "learning_rate": 3.231403736667138e-06, + "loss": 0.1082, + "step": 47808 + }, + { + "epoch": 0.852727143010024, + "grad_norm": 0.19419559836387634, + "learning_rate": 3.230638386693949e-06, + "loss": 0.0995, + "step": 47809 + }, + { + "epoch": 0.8527449791317376, + "grad_norm": 0.38792508840560913, + "learning_rate": 3.229873121106325e-06, + "loss": 0.085, + "step": 47810 + }, + { + "epoch": 0.8527628152534513, + "grad_norm": 0.27517664432525635, + "learning_rate": 3.229107939907225e-06, + "loss": 0.1313, + "step": 47811 + }, + { + "epoch": 0.852780651375165, + "grad_norm": 0.3076547682285309, + "learning_rate": 3.22834284309963e-06, + "loss": 0.132, + "step": 47812 + }, + { + "epoch": 0.8527984874968787, + "grad_norm": 0.24541279673576355, + "learning_rate": 3.22757783068649e-06, + "loss": 0.1226, + "step": 47813 + }, + { + "epoch": 0.8528163236185924, + "grad_norm": 0.24003036320209503, + "learning_rate": 3.2268129026707838e-06, + "loss": 0.0913, + "step": 47814 + }, + { + "epoch": 0.8528341597403061, + "grad_norm": 0.20849540829658508, + "learning_rate": 3.226048059055467e-06, + "loss": 0.0956, + "step": 47815 + }, + { + "epoch": 0.8528519958620198, + "grad_norm": 0.3137829005718231, + "learning_rate": 3.225283299843515e-06, + "loss": 0.097, + "step": 47816 + }, + { + "epoch": 0.8528698319837335, + "grad_norm": 0.3306083083152771, + "learning_rate": 3.224518625037884e-06, + "loss": 0.1822, + "step": 47817 + }, + { + "epoch": 0.8528876681054471, + "grad_norm": 0.2567809522151947, + "learning_rate": 3.223754034641538e-06, + "loss": 0.1184, + "step": 47818 + }, + { + "epoch": 0.8529055042271608, + "grad_norm": 0.26851823925971985, + "learning_rate": 3.222989528657441e-06, + "loss": 0.0905, + "step": 47819 + }, + { + "epoch": 0.8529233403488745, + "grad_norm": 0.41028735041618347, + "learning_rate": 3.222225107088561e-06, + "loss": 0.1234, + "step": 47820 + }, + { + "epoch": 0.8529411764705882, + "grad_norm": 0.22635671496391296, + "learning_rate": 3.221460769937859e-06, + "loss": 0.0779, + "step": 47821 + }, + { + "epoch": 0.8529590125923019, + "grad_norm": 0.3000737428665161, + "learning_rate": 3.2206965172082963e-06, + "loss": 0.1122, + "step": 47822 + }, + { + "epoch": 0.8529768487140156, + "grad_norm": 0.24610120058059692, + "learning_rate": 3.2199323489028375e-06, + "loss": 0.1026, + "step": 47823 + }, + { + "epoch": 0.8529946848357293, + "grad_norm": 0.2923082113265991, + "learning_rate": 3.219168265024436e-06, + "loss": 0.1404, + "step": 47824 + }, + { + "epoch": 0.8530125209574431, + "grad_norm": 0.40196898579597473, + "learning_rate": 3.2184042655760693e-06, + "loss": 0.1803, + "step": 47825 + }, + { + "epoch": 0.8530303570791568, + "grad_norm": 0.24313712120056152, + "learning_rate": 3.2176403505606883e-06, + "loss": 0.138, + "step": 47826 + }, + { + "epoch": 0.8530481932008704, + "grad_norm": 0.30020010471343994, + "learning_rate": 3.2168765199812543e-06, + "loss": 0.1368, + "step": 47827 + }, + { + "epoch": 0.8530660293225841, + "grad_norm": 0.31225764751434326, + "learning_rate": 3.2161127738407294e-06, + "loss": 0.1169, + "step": 47828 + }, + { + "epoch": 0.8530838654442978, + "grad_norm": 0.3640735149383545, + "learning_rate": 3.21534911214208e-06, + "loss": 0.0992, + "step": 47829 + }, + { + "epoch": 0.8531017015660115, + "grad_norm": 0.1927071213722229, + "learning_rate": 3.2145855348882618e-06, + "loss": 0.1082, + "step": 47830 + }, + { + "epoch": 0.8531195376877252, + "grad_norm": 0.24293819069862366, + "learning_rate": 3.2138220420822348e-06, + "loss": 0.1612, + "step": 47831 + }, + { + "epoch": 0.8531373738094389, + "grad_norm": 0.2787235379219055, + "learning_rate": 3.2130586337269507e-06, + "loss": 0.095, + "step": 47832 + }, + { + "epoch": 0.8531552099311526, + "grad_norm": 0.3321904242038727, + "learning_rate": 3.212295309825383e-06, + "loss": 0.1456, + "step": 47833 + }, + { + "epoch": 0.8531730460528663, + "grad_norm": 0.25644099712371826, + "learning_rate": 3.2115320703804852e-06, + "loss": 0.0679, + "step": 47834 + }, + { + "epoch": 0.85319088217458, + "grad_norm": 0.2552539110183716, + "learning_rate": 3.2107689153952152e-06, + "loss": 0.1412, + "step": 47835 + }, + { + "epoch": 0.8532087182962936, + "grad_norm": 0.29224133491516113, + "learning_rate": 3.2100058448725267e-06, + "loss": 0.0989, + "step": 47836 + }, + { + "epoch": 0.8532265544180073, + "grad_norm": 0.19636109471321106, + "learning_rate": 3.209242858815381e-06, + "loss": 0.0658, + "step": 47837 + }, + { + "epoch": 0.853244390539721, + "grad_norm": 0.29283997416496277, + "learning_rate": 3.20847995722674e-06, + "loss": 0.0877, + "step": 47838 + }, + { + "epoch": 0.8532622266614347, + "grad_norm": 0.21656470000743866, + "learning_rate": 3.2077171401095562e-06, + "loss": 0.1431, + "step": 47839 + }, + { + "epoch": 0.8532800627831484, + "grad_norm": 0.3101223111152649, + "learning_rate": 3.2069544074667806e-06, + "loss": 0.0634, + "step": 47840 + }, + { + "epoch": 0.8532978989048621, + "grad_norm": 0.2811841070652008, + "learning_rate": 3.2061917593013855e-06, + "loss": 0.0901, + "step": 47841 + }, + { + "epoch": 0.8533157350265759, + "grad_norm": 0.4255165755748749, + "learning_rate": 3.205429195616311e-06, + "loss": 0.1548, + "step": 47842 + }, + { + "epoch": 0.8533335711482896, + "grad_norm": 0.2860971689224243, + "learning_rate": 3.204666716414528e-06, + "loss": 0.1354, + "step": 47843 + }, + { + "epoch": 0.8533514072700032, + "grad_norm": 0.415427029132843, + "learning_rate": 3.2039043216989833e-06, + "loss": 0.1167, + "step": 47844 + }, + { + "epoch": 0.8533692433917169, + "grad_norm": 0.3244549036026001, + "learning_rate": 3.203142011472626e-06, + "loss": 0.0872, + "step": 47845 + }, + { + "epoch": 0.8533870795134306, + "grad_norm": 0.29142245650291443, + "learning_rate": 3.202379785738427e-06, + "loss": 0.1317, + "step": 47846 + }, + { + "epoch": 0.8534049156351443, + "grad_norm": 0.2919248044490814, + "learning_rate": 3.2016176444993327e-06, + "loss": 0.1375, + "step": 47847 + }, + { + "epoch": 0.853422751756858, + "grad_norm": 0.22123698890209198, + "learning_rate": 3.2008555877582974e-06, + "loss": 0.0868, + "step": 47848 + }, + { + "epoch": 0.8534405878785717, + "grad_norm": 0.25368037819862366, + "learning_rate": 3.2000936155182735e-06, + "loss": 0.0894, + "step": 47849 + }, + { + "epoch": 0.8534584240002854, + "grad_norm": 0.2924588918685913, + "learning_rate": 3.199331727782212e-06, + "loss": 0.1124, + "step": 47850 + }, + { + "epoch": 0.8534762601219991, + "grad_norm": 0.37117618322372437, + "learning_rate": 3.1985699245530743e-06, + "loss": 0.0694, + "step": 47851 + }, + { + "epoch": 0.8534940962437128, + "grad_norm": 0.1976107656955719, + "learning_rate": 3.1978082058338105e-06, + "loss": 0.1177, + "step": 47852 + }, + { + "epoch": 0.8535119323654264, + "grad_norm": 0.3082253038883209, + "learning_rate": 3.197046571627374e-06, + "loss": 0.1707, + "step": 47853 + }, + { + "epoch": 0.8535297684871401, + "grad_norm": 0.274643212556839, + "learning_rate": 3.196285021936707e-06, + "loss": 0.1209, + "step": 47854 + }, + { + "epoch": 0.8535476046088538, + "grad_norm": 0.2921876311302185, + "learning_rate": 3.195523556764771e-06, + "loss": 0.0756, + "step": 47855 + }, + { + "epoch": 0.8535654407305675, + "grad_norm": 0.2211625874042511, + "learning_rate": 3.1947621761145243e-06, + "loss": 0.116, + "step": 47856 + }, + { + "epoch": 0.8535832768522812, + "grad_norm": 0.25704023241996765, + "learning_rate": 3.1940008799889094e-06, + "loss": 0.115, + "step": 47857 + }, + { + "epoch": 0.8536011129739949, + "grad_norm": 0.28190675377845764, + "learning_rate": 3.193239668390871e-06, + "loss": 0.1425, + "step": 47858 + }, + { + "epoch": 0.8536189490957087, + "grad_norm": 0.25090017914772034, + "learning_rate": 3.1924785413233763e-06, + "loss": 0.1444, + "step": 47859 + }, + { + "epoch": 0.8536367852174224, + "grad_norm": 0.24649174511432648, + "learning_rate": 3.191717498789365e-06, + "loss": 0.1168, + "step": 47860 + }, + { + "epoch": 0.853654621339136, + "grad_norm": 0.23139570653438568, + "learning_rate": 3.19095654079179e-06, + "loss": 0.1014, + "step": 47861 + }, + { + "epoch": 0.8536724574608497, + "grad_norm": 0.32303106784820557, + "learning_rate": 3.190195667333598e-06, + "loss": 0.082, + "step": 47862 + }, + { + "epoch": 0.8536902935825634, + "grad_norm": 0.2713688910007477, + "learning_rate": 3.189434878417735e-06, + "loss": 0.0901, + "step": 47863 + }, + { + "epoch": 0.8537081297042771, + "grad_norm": 0.20291899144649506, + "learning_rate": 3.1886741740471626e-06, + "loss": 0.0884, + "step": 47864 + }, + { + "epoch": 0.8537259658259908, + "grad_norm": 0.3431152105331421, + "learning_rate": 3.1879135542248223e-06, + "loss": 0.1535, + "step": 47865 + }, + { + "epoch": 0.8537438019477045, + "grad_norm": 0.3150855302810669, + "learning_rate": 3.187153018953665e-06, + "loss": 0.105, + "step": 47866 + }, + { + "epoch": 0.8537616380694182, + "grad_norm": 0.327764630317688, + "learning_rate": 3.1863925682366265e-06, + "loss": 0.0711, + "step": 47867 + }, + { + "epoch": 0.8537794741911319, + "grad_norm": 0.33379560708999634, + "learning_rate": 3.1856322020766715e-06, + "loss": 0.1646, + "step": 47868 + }, + { + "epoch": 0.8537973103128456, + "grad_norm": 0.22928795218467712, + "learning_rate": 3.184871920476737e-06, + "loss": 0.1212, + "step": 47869 + }, + { + "epoch": 0.8538151464345592, + "grad_norm": 0.4106377065181732, + "learning_rate": 3.1841117234397782e-06, + "loss": 0.1119, + "step": 47870 + }, + { + "epoch": 0.8538329825562729, + "grad_norm": 0.21120992302894592, + "learning_rate": 3.1833516109687324e-06, + "loss": 0.1117, + "step": 47871 + }, + { + "epoch": 0.8538508186779866, + "grad_norm": 0.2546467185020447, + "learning_rate": 3.1825915830665547e-06, + "loss": 0.1265, + "step": 47872 + }, + { + "epoch": 0.8538686547997003, + "grad_norm": 0.2843717634677887, + "learning_rate": 3.181831639736188e-06, + "loss": 0.1439, + "step": 47873 + }, + { + "epoch": 0.853886490921414, + "grad_norm": 0.30135223269462585, + "learning_rate": 3.1810717809805796e-06, + "loss": 0.1229, + "step": 47874 + }, + { + "epoch": 0.8539043270431278, + "grad_norm": 0.15905214846134186, + "learning_rate": 3.1803120068026713e-06, + "loss": 0.0656, + "step": 47875 + }, + { + "epoch": 0.8539221631648415, + "grad_norm": 0.2464081197977066, + "learning_rate": 3.179552317205403e-06, + "loss": 0.1209, + "step": 47876 + }, + { + "epoch": 0.8539399992865552, + "grad_norm": 0.27057844400405884, + "learning_rate": 3.1787927121917333e-06, + "loss": 0.0732, + "step": 47877 + }, + { + "epoch": 0.8539578354082689, + "grad_norm": 0.26727089285850525, + "learning_rate": 3.178033191764601e-06, + "loss": 0.073, + "step": 47878 + }, + { + "epoch": 0.8539756715299825, + "grad_norm": 0.3358921408653259, + "learning_rate": 3.177273755926949e-06, + "loss": 0.1594, + "step": 47879 + }, + { + "epoch": 0.8539935076516962, + "grad_norm": 0.2509421706199646, + "learning_rate": 3.176514404681713e-06, + "loss": 0.0877, + "step": 47880 + }, + { + "epoch": 0.8540113437734099, + "grad_norm": 0.18894881010055542, + "learning_rate": 3.1757551380318525e-06, + "loss": 0.0774, + "step": 47881 + }, + { + "epoch": 0.8540291798951236, + "grad_norm": 0.2663378417491913, + "learning_rate": 3.174995955980298e-06, + "loss": 0.1289, + "step": 47882 + }, + { + "epoch": 0.8540470160168373, + "grad_norm": 0.24263593554496765, + "learning_rate": 3.1742368585300026e-06, + "loss": 0.1251, + "step": 47883 + }, + { + "epoch": 0.854064852138551, + "grad_norm": 0.2619529068470001, + "learning_rate": 3.1734778456838976e-06, + "loss": 0.1001, + "step": 47884 + }, + { + "epoch": 0.8540826882602647, + "grad_norm": 0.2646733224391937, + "learning_rate": 3.1727189174449363e-06, + "loss": 0.094, + "step": 47885 + }, + { + "epoch": 0.8541005243819784, + "grad_norm": 0.26845771074295044, + "learning_rate": 3.171960073816055e-06, + "loss": 0.1508, + "step": 47886 + }, + { + "epoch": 0.854118360503692, + "grad_norm": 0.2413213849067688, + "learning_rate": 3.1712013148001955e-06, + "loss": 0.0723, + "step": 47887 + }, + { + "epoch": 0.8541361966254057, + "grad_norm": 0.2584402859210968, + "learning_rate": 3.170442640400301e-06, + "loss": 0.088, + "step": 47888 + }, + { + "epoch": 0.8541540327471194, + "grad_norm": 0.3772064745426178, + "learning_rate": 3.169684050619301e-06, + "loss": 0.1367, + "step": 47889 + }, + { + "epoch": 0.8541718688688331, + "grad_norm": 0.2587524354457855, + "learning_rate": 3.1689255454601554e-06, + "loss": 0.0915, + "step": 47890 + }, + { + "epoch": 0.8541897049905468, + "grad_norm": 0.21464362740516663, + "learning_rate": 3.1681671249257915e-06, + "loss": 0.1097, + "step": 47891 + }, + { + "epoch": 0.8542075411122606, + "grad_norm": 0.26134949922561646, + "learning_rate": 3.167408789019155e-06, + "loss": 0.1095, + "step": 47892 + }, + { + "epoch": 0.8542253772339743, + "grad_norm": 0.27121084928512573, + "learning_rate": 3.1666505377431767e-06, + "loss": 0.0892, + "step": 47893 + }, + { + "epoch": 0.854243213355688, + "grad_norm": 0.21713709831237793, + "learning_rate": 3.1658923711008067e-06, + "loss": 0.081, + "step": 47894 + }, + { + "epoch": 0.8542610494774017, + "grad_norm": 0.2829188406467438, + "learning_rate": 3.165134289094979e-06, + "loss": 0.1446, + "step": 47895 + }, + { + "epoch": 0.8542788855991154, + "grad_norm": 0.2501837909221649, + "learning_rate": 3.164376291728627e-06, + "loss": 0.1042, + "step": 47896 + }, + { + "epoch": 0.854296721720829, + "grad_norm": 0.3427073657512665, + "learning_rate": 3.1636183790046984e-06, + "loss": 0.17, + "step": 47897 + }, + { + "epoch": 0.8543145578425427, + "grad_norm": 0.22886112332344055, + "learning_rate": 3.162860550926122e-06, + "loss": 0.0775, + "step": 47898 + }, + { + "epoch": 0.8543323939642564, + "grad_norm": 0.26632001996040344, + "learning_rate": 3.162102807495848e-06, + "loss": 0.1177, + "step": 47899 + }, + { + "epoch": 0.8543502300859701, + "grad_norm": 0.2824237644672394, + "learning_rate": 3.1613451487168046e-06, + "loss": 0.1391, + "step": 47900 + }, + { + "epoch": 0.8543680662076838, + "grad_norm": 0.2267133891582489, + "learning_rate": 3.1605875745919307e-06, + "loss": 0.0758, + "step": 47901 + }, + { + "epoch": 0.8543859023293975, + "grad_norm": 0.2908259332180023, + "learning_rate": 3.1598300851241576e-06, + "loss": 0.0792, + "step": 47902 + }, + { + "epoch": 0.8544037384511112, + "grad_norm": 0.24099712073802948, + "learning_rate": 3.15907268031643e-06, + "loss": 0.0626, + "step": 47903 + }, + { + "epoch": 0.8544215745728249, + "grad_norm": 0.29692861437797546, + "learning_rate": 3.158315360171682e-06, + "loss": 0.1614, + "step": 47904 + }, + { + "epoch": 0.8544394106945385, + "grad_norm": 0.28205573558807373, + "learning_rate": 3.157558124692847e-06, + "loss": 0.1156, + "step": 47905 + }, + { + "epoch": 0.8544572468162522, + "grad_norm": 0.3561142683029175, + "learning_rate": 3.1568009738828565e-06, + "loss": 0.1219, + "step": 47906 + }, + { + "epoch": 0.8544750829379659, + "grad_norm": 0.21445907652378082, + "learning_rate": 3.1560439077446517e-06, + "loss": 0.0393, + "step": 47907 + }, + { + "epoch": 0.8544929190596796, + "grad_norm": 0.2777465879917145, + "learning_rate": 3.1552869262811674e-06, + "loss": 0.0881, + "step": 47908 + }, + { + "epoch": 0.8545107551813934, + "grad_norm": 0.23240768909454346, + "learning_rate": 3.1545300294953313e-06, + "loss": 0.0849, + "step": 47909 + }, + { + "epoch": 0.8545285913031071, + "grad_norm": 0.34284019470214844, + "learning_rate": 3.153773217390088e-06, + "loss": 0.1167, + "step": 47910 + }, + { + "epoch": 0.8545464274248208, + "grad_norm": 0.23494671285152435, + "learning_rate": 3.153016489968358e-06, + "loss": 0.0832, + "step": 47911 + }, + { + "epoch": 0.8545642635465345, + "grad_norm": 0.2619295120239258, + "learning_rate": 3.1522598472330882e-06, + "loss": 0.1206, + "step": 47912 + }, + { + "epoch": 0.8545820996682482, + "grad_norm": 0.2884621322154999, + "learning_rate": 3.1515032891872046e-06, + "loss": 0.1073, + "step": 47913 + }, + { + "epoch": 0.8545999357899619, + "grad_norm": 0.23793882131576538, + "learning_rate": 3.150746815833641e-06, + "loss": 0.0557, + "step": 47914 + }, + { + "epoch": 0.8546177719116755, + "grad_norm": 0.20460525155067444, + "learning_rate": 3.1499904271753227e-06, + "loss": 0.1007, + "step": 47915 + }, + { + "epoch": 0.8546356080333892, + "grad_norm": 0.2746855914592743, + "learning_rate": 3.1492341232151947e-06, + "loss": 0.121, + "step": 47916 + }, + { + "epoch": 0.8546534441551029, + "grad_norm": 0.2409539520740509, + "learning_rate": 3.1484779039561816e-06, + "loss": 0.0939, + "step": 47917 + }, + { + "epoch": 0.8546712802768166, + "grad_norm": 0.36384958028793335, + "learning_rate": 3.147721769401216e-06, + "loss": 0.1548, + "step": 47918 + }, + { + "epoch": 0.8546891163985303, + "grad_norm": 0.3874285817146301, + "learning_rate": 3.1469657195532243e-06, + "loss": 0.1456, + "step": 47919 + }, + { + "epoch": 0.854706952520244, + "grad_norm": 0.35579147934913635, + "learning_rate": 3.146209754415144e-06, + "loss": 0.082, + "step": 47920 + }, + { + "epoch": 0.8547247886419577, + "grad_norm": 0.1876312643289566, + "learning_rate": 3.1454538739899038e-06, + "loss": 0.082, + "step": 47921 + }, + { + "epoch": 0.8547426247636714, + "grad_norm": 0.20282939076423645, + "learning_rate": 3.1446980782804337e-06, + "loss": 0.035, + "step": 47922 + }, + { + "epoch": 0.854760460885385, + "grad_norm": 0.26944705843925476, + "learning_rate": 3.1439423672896566e-06, + "loss": 0.127, + "step": 47923 + }, + { + "epoch": 0.8547782970070987, + "grad_norm": 0.40221691131591797, + "learning_rate": 3.1431867410205125e-06, + "loss": 0.1595, + "step": 47924 + }, + { + "epoch": 0.8547961331288124, + "grad_norm": 0.2583920359611511, + "learning_rate": 3.1424311994759202e-06, + "loss": 0.0894, + "step": 47925 + }, + { + "epoch": 0.8548139692505262, + "grad_norm": 0.29089394211769104, + "learning_rate": 3.14167574265882e-06, + "loss": 0.1111, + "step": 47926 + }, + { + "epoch": 0.8548318053722399, + "grad_norm": 0.32535338401794434, + "learning_rate": 3.140920370572134e-06, + "loss": 0.1281, + "step": 47927 + }, + { + "epoch": 0.8548496414939536, + "grad_norm": 0.282058447599411, + "learning_rate": 3.1401650832187853e-06, + "loss": 0.1501, + "step": 47928 + }, + { + "epoch": 0.8548674776156673, + "grad_norm": 0.2828871011734009, + "learning_rate": 3.1394098806017125e-06, + "loss": 0.1162, + "step": 47929 + }, + { + "epoch": 0.854885313737381, + "grad_norm": 0.28532809019088745, + "learning_rate": 3.138654762723836e-06, + "loss": 0.1211, + "step": 47930 + }, + { + "epoch": 0.8549031498590947, + "grad_norm": 0.289338618516922, + "learning_rate": 3.137899729588084e-06, + "loss": 0.1349, + "step": 47931 + }, + { + "epoch": 0.8549209859808083, + "grad_norm": 0.26051798462867737, + "learning_rate": 3.137144781197379e-06, + "loss": 0.1127, + "step": 47932 + }, + { + "epoch": 0.854938822102522, + "grad_norm": 0.26813679933547974, + "learning_rate": 3.136389917554658e-06, + "loss": 0.1463, + "step": 47933 + }, + { + "epoch": 0.8549566582242357, + "grad_norm": 0.49779587984085083, + "learning_rate": 3.135635138662843e-06, + "loss": 0.0917, + "step": 47934 + }, + { + "epoch": 0.8549744943459494, + "grad_norm": 0.260145366191864, + "learning_rate": 3.1348804445248543e-06, + "loss": 0.0898, + "step": 47935 + }, + { + "epoch": 0.8549923304676631, + "grad_norm": 0.21998216211795807, + "learning_rate": 3.134125835143617e-06, + "loss": 0.1239, + "step": 47936 + }, + { + "epoch": 0.8550101665893768, + "grad_norm": 0.2710752785205841, + "learning_rate": 3.133371310522065e-06, + "loss": 0.1019, + "step": 47937 + }, + { + "epoch": 0.8550280027110905, + "grad_norm": 0.24698343873023987, + "learning_rate": 3.132616870663116e-06, + "loss": 0.1185, + "step": 47938 + }, + { + "epoch": 0.8550458388328042, + "grad_norm": 0.2897292673587799, + "learning_rate": 3.1318625155696975e-06, + "loss": 0.16, + "step": 47939 + }, + { + "epoch": 0.8550636749545178, + "grad_norm": 0.2503184974193573, + "learning_rate": 3.1311082452447376e-06, + "loss": 0.0858, + "step": 47940 + }, + { + "epoch": 0.8550815110762315, + "grad_norm": 0.2755941152572632, + "learning_rate": 3.130354059691146e-06, + "loss": 0.1141, + "step": 47941 + }, + { + "epoch": 0.8550993471979452, + "grad_norm": 0.33380693197250366, + "learning_rate": 3.1295999589118637e-06, + "loss": 0.1254, + "step": 47942 + }, + { + "epoch": 0.855117183319659, + "grad_norm": 0.264660120010376, + "learning_rate": 3.1288459429098028e-06, + "loss": 0.1104, + "step": 47943 + }, + { + "epoch": 0.8551350194413727, + "grad_norm": 0.278068870306015, + "learning_rate": 3.1280920116878916e-06, + "loss": 0.1325, + "step": 47944 + }, + { + "epoch": 0.8551528555630864, + "grad_norm": 0.40675032138824463, + "learning_rate": 3.127338165249044e-06, + "loss": 0.1353, + "step": 47945 + }, + { + "epoch": 0.8551706916848001, + "grad_norm": 0.2137865573167801, + "learning_rate": 3.1265844035961944e-06, + "loss": 0.0282, + "step": 47946 + }, + { + "epoch": 0.8551885278065138, + "grad_norm": 0.34834906458854675, + "learning_rate": 3.1258307267322567e-06, + "loss": 0.1414, + "step": 47947 + }, + { + "epoch": 0.8552063639282275, + "grad_norm": 0.21771539747714996, + "learning_rate": 3.1250771346601565e-06, + "loss": 0.1351, + "step": 47948 + }, + { + "epoch": 0.8552242000499412, + "grad_norm": 0.3137732446193695, + "learning_rate": 3.1243236273828053e-06, + "loss": 0.0977, + "step": 47949 + }, + { + "epoch": 0.8552420361716548, + "grad_norm": 0.3677910566329956, + "learning_rate": 3.123570204903137e-06, + "loss": 0.1325, + "step": 47950 + }, + { + "epoch": 0.8552598722933685, + "grad_norm": 0.2628314793109894, + "learning_rate": 3.122816867224068e-06, + "loss": 0.1073, + "step": 47951 + }, + { + "epoch": 0.8552777084150822, + "grad_norm": 0.33903229236602783, + "learning_rate": 3.1220636143485084e-06, + "loss": 0.1474, + "step": 47952 + }, + { + "epoch": 0.8552955445367959, + "grad_norm": 0.23210380971431732, + "learning_rate": 3.1213104462793964e-06, + "loss": 0.1121, + "step": 47953 + }, + { + "epoch": 0.8553133806585096, + "grad_norm": 0.28530988097190857, + "learning_rate": 3.120557363019633e-06, + "loss": 0.1303, + "step": 47954 + }, + { + "epoch": 0.8553312167802233, + "grad_norm": 0.23036034405231476, + "learning_rate": 3.1198043645721516e-06, + "loss": 0.0833, + "step": 47955 + }, + { + "epoch": 0.855349052901937, + "grad_norm": 0.2138405740261078, + "learning_rate": 3.1190514509398667e-06, + "loss": 0.102, + "step": 47956 + }, + { + "epoch": 0.8553668890236507, + "grad_norm": 0.28536635637283325, + "learning_rate": 3.1182986221256956e-06, + "loss": 0.0733, + "step": 47957 + }, + { + "epoch": 0.8553847251453643, + "grad_norm": 0.33033108711242676, + "learning_rate": 3.117545878132552e-06, + "loss": 0.1123, + "step": 47958 + }, + { + "epoch": 0.855402561267078, + "grad_norm": 0.31263771653175354, + "learning_rate": 3.1167932189633625e-06, + "loss": 0.1399, + "step": 47959 + }, + { + "epoch": 0.8554203973887918, + "grad_norm": 0.2632431983947754, + "learning_rate": 3.1160406446210432e-06, + "loss": 0.0485, + "step": 47960 + }, + { + "epoch": 0.8554382335105055, + "grad_norm": 0.29876360297203064, + "learning_rate": 3.115288155108506e-06, + "loss": 0.0651, + "step": 47961 + }, + { + "epoch": 0.8554560696322192, + "grad_norm": 0.26709580421447754, + "learning_rate": 3.114535750428668e-06, + "loss": 0.1415, + "step": 47962 + }, + { + "epoch": 0.8554739057539329, + "grad_norm": 0.2062436044216156, + "learning_rate": 3.1137834305844523e-06, + "loss": 0.0859, + "step": 47963 + }, + { + "epoch": 0.8554917418756466, + "grad_norm": 0.2934791147708893, + "learning_rate": 3.1130311955787724e-06, + "loss": 0.164, + "step": 47964 + }, + { + "epoch": 0.8555095779973603, + "grad_norm": 0.23830966651439667, + "learning_rate": 3.1122790454145345e-06, + "loss": 0.1073, + "step": 47965 + }, + { + "epoch": 0.855527414119074, + "grad_norm": 0.25940176844596863, + "learning_rate": 3.11152698009467e-06, + "loss": 0.0966, + "step": 47966 + }, + { + "epoch": 0.8555452502407876, + "grad_norm": 0.30525127053260803, + "learning_rate": 3.110774999622082e-06, + "loss": 0.1309, + "step": 47967 + }, + { + "epoch": 0.8555630863625013, + "grad_norm": 0.32313302159309387, + "learning_rate": 3.1100231039996957e-06, + "loss": 0.1435, + "step": 47968 + }, + { + "epoch": 0.855580922484215, + "grad_norm": 0.26141560077667236, + "learning_rate": 3.10927129323042e-06, + "loss": 0.1013, + "step": 47969 + }, + { + "epoch": 0.8555987586059287, + "grad_norm": 0.3228697180747986, + "learning_rate": 3.1085195673171722e-06, + "loss": 0.1385, + "step": 47970 + }, + { + "epoch": 0.8556165947276424, + "grad_norm": 0.2343200147151947, + "learning_rate": 3.1077679262628555e-06, + "loss": 0.1534, + "step": 47971 + }, + { + "epoch": 0.8556344308493561, + "grad_norm": 0.18854576349258423, + "learning_rate": 3.107016370070398e-06, + "loss": 0.0751, + "step": 47972 + }, + { + "epoch": 0.8556522669710698, + "grad_norm": 0.2713789939880371, + "learning_rate": 3.1062648987427058e-06, + "loss": 0.0681, + "step": 47973 + }, + { + "epoch": 0.8556701030927835, + "grad_norm": 0.33594071865081787, + "learning_rate": 3.1055135122826926e-06, + "loss": 0.1413, + "step": 47974 + }, + { + "epoch": 0.8556879392144972, + "grad_norm": 0.20694705843925476, + "learning_rate": 3.1047622106932654e-06, + "loss": 0.1003, + "step": 47975 + }, + { + "epoch": 0.855705775336211, + "grad_norm": 0.1782122403383255, + "learning_rate": 3.104010993977349e-06, + "loss": 0.1007, + "step": 47976 + }, + { + "epoch": 0.8557236114579246, + "grad_norm": 0.2469550371170044, + "learning_rate": 3.1032598621378473e-06, + "loss": 0.0853, + "step": 47977 + }, + { + "epoch": 0.8557414475796383, + "grad_norm": 0.32729366421699524, + "learning_rate": 3.1025088151776764e-06, + "loss": 0.1541, + "step": 47978 + }, + { + "epoch": 0.855759283701352, + "grad_norm": 0.2528505325317383, + "learning_rate": 3.1017578530997347e-06, + "loss": 0.1325, + "step": 47979 + }, + { + "epoch": 0.8557771198230657, + "grad_norm": 0.21704956889152527, + "learning_rate": 3.101006975906945e-06, + "loss": 0.1037, + "step": 47980 + }, + { + "epoch": 0.8557949559447794, + "grad_norm": 0.6580010652542114, + "learning_rate": 3.100256183602221e-06, + "loss": 0.219, + "step": 47981 + }, + { + "epoch": 0.8558127920664931, + "grad_norm": 0.21591049432754517, + "learning_rate": 3.099505476188469e-06, + "loss": 0.0886, + "step": 47982 + }, + { + "epoch": 0.8558306281882068, + "grad_norm": 0.18405179679393768, + "learning_rate": 3.0987548536685977e-06, + "loss": 0.0499, + "step": 47983 + }, + { + "epoch": 0.8558484643099205, + "grad_norm": 0.2931574285030365, + "learning_rate": 3.098004316045511e-06, + "loss": 0.1338, + "step": 47984 + }, + { + "epoch": 0.8558663004316341, + "grad_norm": 0.2667537033557892, + "learning_rate": 3.0972538633221304e-06, + "loss": 0.0954, + "step": 47985 + }, + { + "epoch": 0.8558841365533478, + "grad_norm": 0.23011080920696259, + "learning_rate": 3.096503495501357e-06, + "loss": 0.07, + "step": 47986 + }, + { + "epoch": 0.8559019726750615, + "grad_norm": 0.3122994601726532, + "learning_rate": 3.095753212586103e-06, + "loss": 0.1312, + "step": 47987 + }, + { + "epoch": 0.8559198087967752, + "grad_norm": 0.22048909962177277, + "learning_rate": 3.0950030145792703e-06, + "loss": 0.0871, + "step": 47988 + }, + { + "epoch": 0.8559376449184889, + "grad_norm": 0.3142906427383423, + "learning_rate": 3.094252901483777e-06, + "loss": 0.0991, + "step": 47989 + }, + { + "epoch": 0.8559554810402026, + "grad_norm": 0.43747803568840027, + "learning_rate": 3.0935028733025227e-06, + "loss": 0.1841, + "step": 47990 + }, + { + "epoch": 0.8559733171619163, + "grad_norm": 0.3561456799507141, + "learning_rate": 3.09275293003842e-06, + "loss": 0.1813, + "step": 47991 + }, + { + "epoch": 0.85599115328363, + "grad_norm": 0.30997058749198914, + "learning_rate": 3.092003071694366e-06, + "loss": 0.1952, + "step": 47992 + }, + { + "epoch": 0.8560089894053438, + "grad_norm": 0.251335084438324, + "learning_rate": 3.0912532982732835e-06, + "loss": 0.0929, + "step": 47993 + }, + { + "epoch": 0.8560268255270574, + "grad_norm": 0.31679704785346985, + "learning_rate": 3.0905036097780613e-06, + "loss": 0.1004, + "step": 47994 + }, + { + "epoch": 0.8560446616487711, + "grad_norm": 0.25929996371269226, + "learning_rate": 3.089754006211623e-06, + "loss": 0.077, + "step": 47995 + }, + { + "epoch": 0.8560624977704848, + "grad_norm": 0.340159147977829, + "learning_rate": 3.089004487576863e-06, + "loss": 0.1604, + "step": 47996 + }, + { + "epoch": 0.8560803338921985, + "grad_norm": 0.3275354504585266, + "learning_rate": 3.0882550538766847e-06, + "loss": 0.1095, + "step": 47997 + }, + { + "epoch": 0.8560981700139122, + "grad_norm": 0.41673585772514343, + "learning_rate": 3.087505705114005e-06, + "loss": 0.1338, + "step": 47998 + }, + { + "epoch": 0.8561160061356259, + "grad_norm": 0.24002967774868011, + "learning_rate": 3.0867564412917187e-06, + "loss": 0.1215, + "step": 47999 + }, + { + "epoch": 0.8561338422573396, + "grad_norm": 0.20746806263923645, + "learning_rate": 3.086007262412735e-06, + "loss": 0.1069, + "step": 48000 + }, + { + "epoch": 0.8561338422573396, + "eval_loss": 0.10846409201622009, + "eval_runtime": 107.4048, + "eval_samples_per_second": 9.534, + "eval_steps_per_second": 1.592, + "step": 48000 + }, + { + "epoch": 0.8561516783790533, + "grad_norm": 0.31325647234916687, + "learning_rate": 3.0852581684799515e-06, + "loss": 0.1354, + "step": 48001 + }, + { + "epoch": 0.856169514500767, + "grad_norm": 0.2544112205505371, + "learning_rate": 3.0845091594962793e-06, + "loss": 0.126, + "step": 48002 + }, + { + "epoch": 0.8561873506224806, + "grad_norm": 0.18850870430469513, + "learning_rate": 3.0837602354646195e-06, + "loss": 0.0908, + "step": 48003 + }, + { + "epoch": 0.8562051867441943, + "grad_norm": 0.362602561712265, + "learning_rate": 3.0830113963878778e-06, + "loss": 0.1251, + "step": 48004 + }, + { + "epoch": 0.856223022865908, + "grad_norm": 0.31313273310661316, + "learning_rate": 3.082262642268949e-06, + "loss": 0.1418, + "step": 48005 + }, + { + "epoch": 0.8562408589876217, + "grad_norm": 0.3185657262802124, + "learning_rate": 3.0815139731107388e-06, + "loss": 0.1281, + "step": 48006 + }, + { + "epoch": 0.8562586951093354, + "grad_norm": 0.4028942584991455, + "learning_rate": 3.0807653889161542e-06, + "loss": 0.2, + "step": 48007 + }, + { + "epoch": 0.8562765312310491, + "grad_norm": 0.2397243231534958, + "learning_rate": 3.0800168896880892e-06, + "loss": 0.0951, + "step": 48008 + }, + { + "epoch": 0.8562943673527628, + "grad_norm": 0.22202181816101074, + "learning_rate": 3.0792684754294533e-06, + "loss": 0.1391, + "step": 48009 + }, + { + "epoch": 0.8563122034744766, + "grad_norm": 0.2454749196767807, + "learning_rate": 3.078520146143141e-06, + "loss": 0.0895, + "step": 48010 + }, + { + "epoch": 0.8563300395961903, + "grad_norm": 0.2699190676212311, + "learning_rate": 3.0777719018320604e-06, + "loss": 0.0834, + "step": 48011 + }, + { + "epoch": 0.8563478757179039, + "grad_norm": 0.275518536567688, + "learning_rate": 3.0770237424991077e-06, + "loss": 0.103, + "step": 48012 + }, + { + "epoch": 0.8563657118396176, + "grad_norm": 0.22893822193145752, + "learning_rate": 3.076275668147183e-06, + "loss": 0.1009, + "step": 48013 + }, + { + "epoch": 0.8563835479613313, + "grad_norm": 0.2821848690509796, + "learning_rate": 3.0755276787791804e-06, + "loss": 0.097, + "step": 48014 + }, + { + "epoch": 0.856401384083045, + "grad_norm": 0.27042025327682495, + "learning_rate": 3.0747797743980096e-06, + "loss": 0.0876, + "step": 48015 + }, + { + "epoch": 0.8564192202047587, + "grad_norm": 0.30033254623413086, + "learning_rate": 3.074031955006565e-06, + "loss": 0.1297, + "step": 48016 + }, + { + "epoch": 0.8564370563264724, + "grad_norm": 0.23884430527687073, + "learning_rate": 3.073284220607747e-06, + "loss": 0.1034, + "step": 48017 + }, + { + "epoch": 0.8564548924481861, + "grad_norm": 0.35646435618400574, + "learning_rate": 3.0725365712044513e-06, + "loss": 0.0793, + "step": 48018 + }, + { + "epoch": 0.8564727285698998, + "grad_norm": 0.30801934003829956, + "learning_rate": 3.0717890067995746e-06, + "loss": 0.1364, + "step": 48019 + }, + { + "epoch": 0.8564905646916134, + "grad_norm": 0.21928814053535461, + "learning_rate": 3.071041527396021e-06, + "loss": 0.1053, + "step": 48020 + }, + { + "epoch": 0.8565084008133271, + "grad_norm": 0.30654457211494446, + "learning_rate": 3.070294132996679e-06, + "loss": 0.1449, + "step": 48021 + }, + { + "epoch": 0.8565262369350408, + "grad_norm": 0.23428907990455627, + "learning_rate": 3.069546823604455e-06, + "loss": 0.1216, + "step": 48022 + }, + { + "epoch": 0.8565440730567545, + "grad_norm": 0.29217803478240967, + "learning_rate": 3.0687995992222387e-06, + "loss": 0.0812, + "step": 48023 + }, + { + "epoch": 0.8565619091784682, + "grad_norm": 0.19916154444217682, + "learning_rate": 3.0680524598529354e-06, + "loss": 0.0847, + "step": 48024 + }, + { + "epoch": 0.8565797453001819, + "grad_norm": 0.23355744779109955, + "learning_rate": 3.067305405499435e-06, + "loss": 0.0834, + "step": 48025 + }, + { + "epoch": 0.8565975814218956, + "grad_norm": 0.26400062441825867, + "learning_rate": 3.066558436164635e-06, + "loss": 0.1072, + "step": 48026 + }, + { + "epoch": 0.8566154175436094, + "grad_norm": 0.26424655318260193, + "learning_rate": 3.065811551851425e-06, + "loss": 0.1266, + "step": 48027 + }, + { + "epoch": 0.8566332536653231, + "grad_norm": 0.25961050391197205, + "learning_rate": 3.0650647525627074e-06, + "loss": 0.0948, + "step": 48028 + }, + { + "epoch": 0.8566510897870367, + "grad_norm": 0.2581084668636322, + "learning_rate": 3.064318038301378e-06, + "loss": 0.1038, + "step": 48029 + }, + { + "epoch": 0.8566689259087504, + "grad_norm": 0.29015183448791504, + "learning_rate": 3.063571409070329e-06, + "loss": 0.0953, + "step": 48030 + }, + { + "epoch": 0.8566867620304641, + "grad_norm": 0.35230720043182373, + "learning_rate": 3.0628248648724517e-06, + "loss": 0.0737, + "step": 48031 + }, + { + "epoch": 0.8567045981521778, + "grad_norm": 0.29183217883110046, + "learning_rate": 3.0620784057106363e-06, + "loss": 0.107, + "step": 48032 + }, + { + "epoch": 0.8567224342738915, + "grad_norm": 0.289079874753952, + "learning_rate": 3.061332031587791e-06, + "loss": 0.0863, + "step": 48033 + }, + { + "epoch": 0.8567402703956052, + "grad_norm": 0.2917121648788452, + "learning_rate": 3.060585742506797e-06, + "loss": 0.1415, + "step": 48034 + }, + { + "epoch": 0.8567581065173189, + "grad_norm": 0.2507006824016571, + "learning_rate": 3.0598395384705437e-06, + "loss": 0.1157, + "step": 48035 + }, + { + "epoch": 0.8567759426390326, + "grad_norm": 0.35034796595573425, + "learning_rate": 3.0590934194819342e-06, + "loss": 0.1561, + "step": 48036 + }, + { + "epoch": 0.8567937787607462, + "grad_norm": 0.3205254375934601, + "learning_rate": 3.058347385543858e-06, + "loss": 0.1142, + "step": 48037 + }, + { + "epoch": 0.8568116148824599, + "grad_norm": 0.2830381393432617, + "learning_rate": 3.0576014366592075e-06, + "loss": 0.1057, + "step": 48038 + }, + { + "epoch": 0.8568294510041736, + "grad_norm": 0.2835957705974579, + "learning_rate": 3.0568555728308746e-06, + "loss": 0.1481, + "step": 48039 + }, + { + "epoch": 0.8568472871258873, + "grad_norm": 0.2971174716949463, + "learning_rate": 3.0561097940617402e-06, + "loss": 0.1223, + "step": 48040 + }, + { + "epoch": 0.856865123247601, + "grad_norm": 0.3129143714904785, + "learning_rate": 3.05536410035471e-06, + "loss": 0.0785, + "step": 48041 + }, + { + "epoch": 0.8568829593693147, + "grad_norm": 0.16936978697776794, + "learning_rate": 3.0546184917126687e-06, + "loss": 0.0896, + "step": 48042 + }, + { + "epoch": 0.8569007954910284, + "grad_norm": 0.291711688041687, + "learning_rate": 3.0538729681385046e-06, + "loss": 0.1192, + "step": 48043 + }, + { + "epoch": 0.8569186316127422, + "grad_norm": 0.32878220081329346, + "learning_rate": 3.053127529635108e-06, + "loss": 0.0833, + "step": 48044 + }, + { + "epoch": 0.8569364677344559, + "grad_norm": 0.2906176447868347, + "learning_rate": 3.052382176205365e-06, + "loss": 0.0563, + "step": 48045 + }, + { + "epoch": 0.8569543038561696, + "grad_norm": 0.3261968195438385, + "learning_rate": 3.051636907852176e-06, + "loss": 0.1238, + "step": 48046 + }, + { + "epoch": 0.8569721399778832, + "grad_norm": 0.26035913825035095, + "learning_rate": 3.0508917245784197e-06, + "loss": 0.1004, + "step": 48047 + }, + { + "epoch": 0.8569899760995969, + "grad_norm": 0.29857370257377625, + "learning_rate": 3.050146626386985e-06, + "loss": 0.102, + "step": 48048 + }, + { + "epoch": 0.8570078122213106, + "grad_norm": 0.29833894968032837, + "learning_rate": 3.0494016132807696e-06, + "loss": 0.0959, + "step": 48049 + }, + { + "epoch": 0.8570256483430243, + "grad_norm": 0.23905020952224731, + "learning_rate": 3.0486566852626493e-06, + "loss": 0.0961, + "step": 48050 + }, + { + "epoch": 0.857043484464738, + "grad_norm": 0.3541562855243683, + "learning_rate": 3.0479118423355212e-06, + "loss": 0.1217, + "step": 48051 + }, + { + "epoch": 0.8570613205864517, + "grad_norm": 0.17636990547180176, + "learning_rate": 3.047167084502267e-06, + "loss": 0.0874, + "step": 48052 + }, + { + "epoch": 0.8570791567081654, + "grad_norm": 0.300947368144989, + "learning_rate": 3.046422411765773e-06, + "loss": 0.1094, + "step": 48053 + }, + { + "epoch": 0.8570969928298791, + "grad_norm": 0.270541250705719, + "learning_rate": 3.045677824128934e-06, + "loss": 0.1258, + "step": 48054 + }, + { + "epoch": 0.8571148289515927, + "grad_norm": 0.24894101917743683, + "learning_rate": 3.0449333215946285e-06, + "loss": 0.1002, + "step": 48055 + }, + { + "epoch": 0.8571326650733064, + "grad_norm": 0.27555519342422485, + "learning_rate": 3.0441889041657457e-06, + "loss": 0.1189, + "step": 48056 + }, + { + "epoch": 0.8571505011950201, + "grad_norm": 0.21403095126152039, + "learning_rate": 3.043444571845172e-06, + "loss": 0.0634, + "step": 48057 + }, + { + "epoch": 0.8571683373167338, + "grad_norm": 0.3495869040489197, + "learning_rate": 3.0427003246357834e-06, + "loss": 0.0899, + "step": 48058 + }, + { + "epoch": 0.8571861734384475, + "grad_norm": 0.2624898850917816, + "learning_rate": 3.0419561625404768e-06, + "loss": 0.1385, + "step": 48059 + }, + { + "epoch": 0.8572040095601612, + "grad_norm": 0.279317706823349, + "learning_rate": 3.041212085562131e-06, + "loss": 0.1241, + "step": 48060 + }, + { + "epoch": 0.857221845681875, + "grad_norm": 0.21027113497257233, + "learning_rate": 3.040468093703633e-06, + "loss": 0.1002, + "step": 48061 + }, + { + "epoch": 0.8572396818035887, + "grad_norm": 0.2977741062641144, + "learning_rate": 3.03972418696786e-06, + "loss": 0.1633, + "step": 48062 + }, + { + "epoch": 0.8572575179253024, + "grad_norm": 0.22352856397628784, + "learning_rate": 3.038980365357702e-06, + "loss": 0.0865, + "step": 48063 + }, + { + "epoch": 0.857275354047016, + "grad_norm": 0.23762989044189453, + "learning_rate": 3.0382366288760454e-06, + "loss": 0.0846, + "step": 48064 + }, + { + "epoch": 0.8572931901687297, + "grad_norm": 0.3085680603981018, + "learning_rate": 3.037492977525769e-06, + "loss": 0.1363, + "step": 48065 + }, + { + "epoch": 0.8573110262904434, + "grad_norm": 0.2413221299648285, + "learning_rate": 3.03674941130975e-06, + "loss": 0.0942, + "step": 48066 + }, + { + "epoch": 0.8573288624121571, + "grad_norm": 0.2554912269115448, + "learning_rate": 3.0360059302308812e-06, + "loss": 0.076, + "step": 48067 + }, + { + "epoch": 0.8573466985338708, + "grad_norm": 0.3013289272785187, + "learning_rate": 3.035262534292041e-06, + "loss": 0.0649, + "step": 48068 + }, + { + "epoch": 0.8573645346555845, + "grad_norm": 0.356425017118454, + "learning_rate": 3.03451922349611e-06, + "loss": 0.1375, + "step": 48069 + }, + { + "epoch": 0.8573823707772982, + "grad_norm": 0.26607567071914673, + "learning_rate": 3.0337759978459667e-06, + "loss": 0.1224, + "step": 48070 + }, + { + "epoch": 0.8574002068990119, + "grad_norm": 0.2775510251522064, + "learning_rate": 3.0330328573444892e-06, + "loss": 0.1502, + "step": 48071 + }, + { + "epoch": 0.8574180430207256, + "grad_norm": 0.32446298003196716, + "learning_rate": 3.03228980199457e-06, + "loss": 0.1031, + "step": 48072 + }, + { + "epoch": 0.8574358791424392, + "grad_norm": 0.29804527759552, + "learning_rate": 3.0315468317990844e-06, + "loss": 0.1374, + "step": 48073 + }, + { + "epoch": 0.8574537152641529, + "grad_norm": 0.297566682100296, + "learning_rate": 3.0308039467609075e-06, + "loss": 0.0869, + "step": 48074 + }, + { + "epoch": 0.8574715513858666, + "grad_norm": 0.27729323506355286, + "learning_rate": 3.0300611468829203e-06, + "loss": 0.1453, + "step": 48075 + }, + { + "epoch": 0.8574893875075803, + "grad_norm": 0.3541051745414734, + "learning_rate": 3.029318432168007e-06, + "loss": 0.1838, + "step": 48076 + }, + { + "epoch": 0.857507223629294, + "grad_norm": 0.2368578016757965, + "learning_rate": 3.0285758026190407e-06, + "loss": 0.1023, + "step": 48077 + }, + { + "epoch": 0.8575250597510078, + "grad_norm": 0.23215742409229279, + "learning_rate": 3.0278332582389075e-06, + "loss": 0.0908, + "step": 48078 + }, + { + "epoch": 0.8575428958727215, + "grad_norm": 0.2790238559246063, + "learning_rate": 3.027090799030477e-06, + "loss": 0.1254, + "step": 48079 + }, + { + "epoch": 0.8575607319944352, + "grad_norm": 0.2865852117538452, + "learning_rate": 3.0263484249966364e-06, + "loss": 0.092, + "step": 48080 + }, + { + "epoch": 0.8575785681161489, + "grad_norm": 0.23618711531162262, + "learning_rate": 3.0256061361402578e-06, + "loss": 0.1363, + "step": 48081 + }, + { + "epoch": 0.8575964042378625, + "grad_norm": 0.2805115878582001, + "learning_rate": 3.02486393246422e-06, + "loss": 0.1544, + "step": 48082 + }, + { + "epoch": 0.8576142403595762, + "grad_norm": 0.24137824773788452, + "learning_rate": 3.0241218139714005e-06, + "loss": 0.1278, + "step": 48083 + }, + { + "epoch": 0.8576320764812899, + "grad_norm": 0.4007607102394104, + "learning_rate": 3.02337978066467e-06, + "loss": 0.1262, + "step": 48084 + }, + { + "epoch": 0.8576499126030036, + "grad_norm": 0.42904749512672424, + "learning_rate": 3.0226378325469153e-06, + "loss": 0.1514, + "step": 48085 + }, + { + "epoch": 0.8576677487247173, + "grad_norm": 0.17265328764915466, + "learning_rate": 3.0218959696210055e-06, + "loss": 0.0668, + "step": 48086 + }, + { + "epoch": 0.857685584846431, + "grad_norm": 0.37499865889549255, + "learning_rate": 3.021154191889819e-06, + "loss": 0.0888, + "step": 48087 + }, + { + "epoch": 0.8577034209681447, + "grad_norm": 0.2551610767841339, + "learning_rate": 3.020412499356223e-06, + "loss": 0.1238, + "step": 48088 + }, + { + "epoch": 0.8577212570898584, + "grad_norm": 0.3171907663345337, + "learning_rate": 3.0196708920231074e-06, + "loss": 0.1169, + "step": 48089 + }, + { + "epoch": 0.857739093211572, + "grad_norm": 0.32601386308670044, + "learning_rate": 3.0189293698933336e-06, + "loss": 0.1702, + "step": 48090 + }, + { + "epoch": 0.8577569293332857, + "grad_norm": 0.24266552925109863, + "learning_rate": 3.018187932969785e-06, + "loss": 0.0678, + "step": 48091 + }, + { + "epoch": 0.8577747654549994, + "grad_norm": 0.2656879723072052, + "learning_rate": 3.0174465812553288e-06, + "loss": 0.1218, + "step": 48092 + }, + { + "epoch": 0.8577926015767131, + "grad_norm": 0.20636171102523804, + "learning_rate": 3.016705314752846e-06, + "loss": 0.1272, + "step": 48093 + }, + { + "epoch": 0.8578104376984269, + "grad_norm": 0.2429744452238083, + "learning_rate": 3.0159641334652102e-06, + "loss": 0.1082, + "step": 48094 + }, + { + "epoch": 0.8578282738201406, + "grad_norm": 0.24921710789203644, + "learning_rate": 3.0152230373952874e-06, + "loss": 0.1322, + "step": 48095 + }, + { + "epoch": 0.8578461099418543, + "grad_norm": 0.28033968806266785, + "learning_rate": 3.0144820265459508e-06, + "loss": 0.0985, + "step": 48096 + }, + { + "epoch": 0.857863946063568, + "grad_norm": 0.3121553957462311, + "learning_rate": 3.0137411009200727e-06, + "loss": 0.0912, + "step": 48097 + }, + { + "epoch": 0.8578817821852817, + "grad_norm": 0.2222258597612381, + "learning_rate": 3.0130002605205347e-06, + "loss": 0.1123, + "step": 48098 + }, + { + "epoch": 0.8578996183069953, + "grad_norm": 0.3027598559856415, + "learning_rate": 3.012259505350201e-06, + "loss": 0.1442, + "step": 48099 + }, + { + "epoch": 0.857917454428709, + "grad_norm": 0.320352166891098, + "learning_rate": 3.011518835411942e-06, + "loss": 0.1373, + "step": 48100 + }, + { + "epoch": 0.8579352905504227, + "grad_norm": 0.3319607973098755, + "learning_rate": 3.010778250708626e-06, + "loss": 0.122, + "step": 48101 + }, + { + "epoch": 0.8579531266721364, + "grad_norm": 0.23992297053337097, + "learning_rate": 3.0100377512431334e-06, + "loss": 0.1179, + "step": 48102 + }, + { + "epoch": 0.8579709627938501, + "grad_norm": 0.23372259736061096, + "learning_rate": 3.0092973370183324e-06, + "loss": 0.0933, + "step": 48103 + }, + { + "epoch": 0.8579887989155638, + "grad_norm": 0.25746750831604004, + "learning_rate": 3.00855700803708e-06, + "loss": 0.1043, + "step": 48104 + }, + { + "epoch": 0.8580066350372775, + "grad_norm": 0.22758682072162628, + "learning_rate": 3.0078167643022655e-06, + "loss": 0.1198, + "step": 48105 + }, + { + "epoch": 0.8580244711589912, + "grad_norm": 0.34161216020584106, + "learning_rate": 3.0070766058167415e-06, + "loss": 0.1302, + "step": 48106 + }, + { + "epoch": 0.8580423072807049, + "grad_norm": 0.2670750319957733, + "learning_rate": 3.0063365325833925e-06, + "loss": 0.1021, + "step": 48107 + }, + { + "epoch": 0.8580601434024185, + "grad_norm": 0.25317394733428955, + "learning_rate": 3.005596544605077e-06, + "loss": 0.1139, + "step": 48108 + }, + { + "epoch": 0.8580779795241322, + "grad_norm": 0.26877349615097046, + "learning_rate": 3.004856641884668e-06, + "loss": 0.0459, + "step": 48109 + }, + { + "epoch": 0.8580958156458459, + "grad_norm": 0.2264004945755005, + "learning_rate": 3.004116824425024e-06, + "loss": 0.0978, + "step": 48110 + }, + { + "epoch": 0.8581136517675597, + "grad_norm": 0.23884214460849762, + "learning_rate": 3.003377092229026e-06, + "loss": 0.0846, + "step": 48111 + }, + { + "epoch": 0.8581314878892734, + "grad_norm": 0.25644227862358093, + "learning_rate": 3.002637445299539e-06, + "loss": 0.0964, + "step": 48112 + }, + { + "epoch": 0.8581493240109871, + "grad_norm": 0.29159095883369446, + "learning_rate": 3.0018978836394235e-06, + "loss": 0.1273, + "step": 48113 + }, + { + "epoch": 0.8581671601327008, + "grad_norm": 0.25391098856925964, + "learning_rate": 3.001158407251545e-06, + "loss": 0.1292, + "step": 48114 + }, + { + "epoch": 0.8581849962544145, + "grad_norm": 0.22363534569740295, + "learning_rate": 3.000419016138781e-06, + "loss": 0.0863, + "step": 48115 + }, + { + "epoch": 0.8582028323761282, + "grad_norm": 0.27557283639907837, + "learning_rate": 2.99967971030399e-06, + "loss": 0.1611, + "step": 48116 + }, + { + "epoch": 0.8582206684978418, + "grad_norm": 0.3189485967159271, + "learning_rate": 2.998940489750035e-06, + "loss": 0.1497, + "step": 48117 + }, + { + "epoch": 0.8582385046195555, + "grad_norm": 0.2348957061767578, + "learning_rate": 2.9982013544797904e-06, + "loss": 0.1009, + "step": 48118 + }, + { + "epoch": 0.8582563407412692, + "grad_norm": 0.17953412234783173, + "learning_rate": 2.99746230449611e-06, + "loss": 0.0502, + "step": 48119 + }, + { + "epoch": 0.8582741768629829, + "grad_norm": 0.24287880957126617, + "learning_rate": 2.9967233398018714e-06, + "loss": 0.1195, + "step": 48120 + }, + { + "epoch": 0.8582920129846966, + "grad_norm": 0.6242601871490479, + "learning_rate": 2.9959844603999335e-06, + "loss": 0.142, + "step": 48121 + }, + { + "epoch": 0.8583098491064103, + "grad_norm": 0.3184574246406555, + "learning_rate": 2.995245666293159e-06, + "loss": 0.1495, + "step": 48122 + }, + { + "epoch": 0.858327685228124, + "grad_norm": 0.2637436091899872, + "learning_rate": 2.994506957484408e-06, + "loss": 0.087, + "step": 48123 + }, + { + "epoch": 0.8583455213498377, + "grad_norm": 0.21575382351875305, + "learning_rate": 2.9937683339765543e-06, + "loss": 0.1153, + "step": 48124 + }, + { + "epoch": 0.8583633574715513, + "grad_norm": 0.33434629440307617, + "learning_rate": 2.9930297957724536e-06, + "loss": 0.1456, + "step": 48125 + }, + { + "epoch": 0.858381193593265, + "grad_norm": 0.27986589074134827, + "learning_rate": 2.99229134287497e-06, + "loss": 0.1498, + "step": 48126 + }, + { + "epoch": 0.8583990297149787, + "grad_norm": 0.2441047877073288, + "learning_rate": 2.9915529752869625e-06, + "loss": 0.1096, + "step": 48127 + }, + { + "epoch": 0.8584168658366925, + "grad_norm": 0.2196342647075653, + "learning_rate": 2.990814693011301e-06, + "loss": 0.1106, + "step": 48128 + }, + { + "epoch": 0.8584347019584062, + "grad_norm": 0.1972104012966156, + "learning_rate": 2.9900764960508447e-06, + "loss": 0.0821, + "step": 48129 + }, + { + "epoch": 0.8584525380801199, + "grad_norm": 0.3377305567264557, + "learning_rate": 2.9893383844084548e-06, + "loss": 0.1371, + "step": 48130 + }, + { + "epoch": 0.8584703742018336, + "grad_norm": 0.26327067613601685, + "learning_rate": 2.9886003580869847e-06, + "loss": 0.1618, + "step": 48131 + }, + { + "epoch": 0.8584882103235473, + "grad_norm": 0.24589316546916962, + "learning_rate": 2.987862417089307e-06, + "loss": 0.0802, + "step": 48132 + }, + { + "epoch": 0.858506046445261, + "grad_norm": 0.29374364018440247, + "learning_rate": 2.9871245614182723e-06, + "loss": 0.0716, + "step": 48133 + }, + { + "epoch": 0.8585238825669746, + "grad_norm": 0.3158067762851715, + "learning_rate": 2.9863867910767534e-06, + "loss": 0.155, + "step": 48134 + }, + { + "epoch": 0.8585417186886883, + "grad_norm": 0.3153727352619171, + "learning_rate": 2.9856491060676007e-06, + "loss": 0.1376, + "step": 48135 + }, + { + "epoch": 0.858559554810402, + "grad_norm": 0.21420489251613617, + "learning_rate": 2.98491150639367e-06, + "loss": 0.0757, + "step": 48136 + }, + { + "epoch": 0.8585773909321157, + "grad_norm": 0.2597881853580475, + "learning_rate": 2.9841739920578314e-06, + "loss": 0.0884, + "step": 48137 + }, + { + "epoch": 0.8585952270538294, + "grad_norm": 0.25425538420677185, + "learning_rate": 2.9834365630629386e-06, + "loss": 0.122, + "step": 48138 + }, + { + "epoch": 0.8586130631755431, + "grad_norm": 0.2531696856021881, + "learning_rate": 2.9826992194118495e-06, + "loss": 0.0883, + "step": 48139 + }, + { + "epoch": 0.8586308992972568, + "grad_norm": 0.9652997255325317, + "learning_rate": 2.9819619611074186e-06, + "loss": 0.1574, + "step": 48140 + }, + { + "epoch": 0.8586487354189705, + "grad_norm": 0.2712922990322113, + "learning_rate": 2.9812247881525147e-06, + "loss": 0.0923, + "step": 48141 + }, + { + "epoch": 0.8586665715406842, + "grad_norm": 0.3017910122871399, + "learning_rate": 2.980487700549986e-06, + "loss": 0.1505, + "step": 48142 + }, + { + "epoch": 0.8586844076623978, + "grad_norm": 0.26514509320259094, + "learning_rate": 2.9797506983026914e-06, + "loss": 0.1003, + "step": 48143 + }, + { + "epoch": 0.8587022437841115, + "grad_norm": 0.17610181868076324, + "learning_rate": 2.9790137814134864e-06, + "loss": 0.0589, + "step": 48144 + }, + { + "epoch": 0.8587200799058253, + "grad_norm": 0.29654136300086975, + "learning_rate": 2.9782769498852338e-06, + "loss": 0.139, + "step": 48145 + }, + { + "epoch": 0.858737916027539, + "grad_norm": 0.3049066662788391, + "learning_rate": 2.97754020372078e-06, + "loss": 0.1078, + "step": 48146 + }, + { + "epoch": 0.8587557521492527, + "grad_norm": 0.30414435267448425, + "learning_rate": 2.97680354292299e-06, + "loss": 0.0998, + "step": 48147 + }, + { + "epoch": 0.8587735882709664, + "grad_norm": 0.3989607095718384, + "learning_rate": 2.97606696749472e-06, + "loss": 0.1082, + "step": 48148 + }, + { + "epoch": 0.8587914243926801, + "grad_norm": 0.32944050431251526, + "learning_rate": 2.9753304774388148e-06, + "loss": 0.1258, + "step": 48149 + }, + { + "epoch": 0.8588092605143938, + "grad_norm": 0.3494466245174408, + "learning_rate": 2.9745940727581385e-06, + "loss": 0.101, + "step": 48150 + }, + { + "epoch": 0.8588270966361075, + "grad_norm": 0.27578550577163696, + "learning_rate": 2.9738577534555445e-06, + "loss": 0.1053, + "step": 48151 + }, + { + "epoch": 0.8588449327578211, + "grad_norm": 0.23384353518486023, + "learning_rate": 2.9731215195338863e-06, + "loss": 0.0971, + "step": 48152 + }, + { + "epoch": 0.8588627688795348, + "grad_norm": 0.26223936676979065, + "learning_rate": 2.9723853709960114e-06, + "loss": 0.0903, + "step": 48153 + }, + { + "epoch": 0.8588806050012485, + "grad_norm": 0.2957221269607544, + "learning_rate": 2.9716493078447844e-06, + "loss": 0.1664, + "step": 48154 + }, + { + "epoch": 0.8588984411229622, + "grad_norm": 0.3796936273574829, + "learning_rate": 2.970913330083053e-06, + "loss": 0.1355, + "step": 48155 + }, + { + "epoch": 0.8589162772446759, + "grad_norm": 0.45391565561294556, + "learning_rate": 2.970177437713667e-06, + "loss": 0.1153, + "step": 48156 + }, + { + "epoch": 0.8589341133663896, + "grad_norm": 0.3027273118495941, + "learning_rate": 2.969441630739481e-06, + "loss": 0.1854, + "step": 48157 + }, + { + "epoch": 0.8589519494881033, + "grad_norm": 0.26276105642318726, + "learning_rate": 2.968705909163352e-06, + "loss": 0.1005, + "step": 48158 + }, + { + "epoch": 0.858969785609817, + "grad_norm": 0.2261965423822403, + "learning_rate": 2.967970272988127e-06, + "loss": 0.0689, + "step": 48159 + }, + { + "epoch": 0.8589876217315306, + "grad_norm": 0.27368614077568054, + "learning_rate": 2.967234722216655e-06, + "loss": 0.0948, + "step": 48160 + }, + { + "epoch": 0.8590054578532443, + "grad_norm": 0.25188788771629333, + "learning_rate": 2.9664992568517985e-06, + "loss": 0.1044, + "step": 48161 + }, + { + "epoch": 0.8590232939749581, + "grad_norm": 0.2663467228412628, + "learning_rate": 2.9657638768963907e-06, + "loss": 0.0983, + "step": 48162 + }, + { + "epoch": 0.8590411300966718, + "grad_norm": 0.32396388053894043, + "learning_rate": 2.965028582353302e-06, + "loss": 0.1088, + "step": 48163 + }, + { + "epoch": 0.8590589662183855, + "grad_norm": 0.33476510643959045, + "learning_rate": 2.9642933732253715e-06, + "loss": 0.1344, + "step": 48164 + }, + { + "epoch": 0.8590768023400992, + "grad_norm": 0.27767929434776306, + "learning_rate": 2.9635582495154525e-06, + "loss": 0.111, + "step": 48165 + }, + { + "epoch": 0.8590946384618129, + "grad_norm": 0.27167049050331116, + "learning_rate": 2.9628232112263842e-06, + "loss": 0.1217, + "step": 48166 + }, + { + "epoch": 0.8591124745835266, + "grad_norm": 0.21474626660346985, + "learning_rate": 2.9620882583610317e-06, + "loss": 0.0978, + "step": 48167 + }, + { + "epoch": 0.8591303107052403, + "grad_norm": 0.2251388281583786, + "learning_rate": 2.961353390922236e-06, + "loss": 0.0818, + "step": 48168 + }, + { + "epoch": 0.859148146826954, + "grad_norm": 0.3174282908439636, + "learning_rate": 2.960618608912849e-06, + "loss": 0.1045, + "step": 48169 + }, + { + "epoch": 0.8591659829486676, + "grad_norm": 0.3171255588531494, + "learning_rate": 2.959883912335709e-06, + "loss": 0.0715, + "step": 48170 + }, + { + "epoch": 0.8591838190703813, + "grad_norm": 0.33975762128829956, + "learning_rate": 2.9591493011936755e-06, + "loss": 0.1548, + "step": 48171 + }, + { + "epoch": 0.859201655192095, + "grad_norm": 0.2754475772380829, + "learning_rate": 2.958414775489596e-06, + "loss": 0.1127, + "step": 48172 + }, + { + "epoch": 0.8592194913138087, + "grad_norm": 0.27586838603019714, + "learning_rate": 2.957680335226304e-06, + "loss": 0.0801, + "step": 48173 + }, + { + "epoch": 0.8592373274355224, + "grad_norm": 0.23614056408405304, + "learning_rate": 2.956945980406664e-06, + "loss": 0.1058, + "step": 48174 + }, + { + "epoch": 0.8592551635572361, + "grad_norm": 0.23251068592071533, + "learning_rate": 2.9562117110335103e-06, + "loss": 0.1084, + "step": 48175 + }, + { + "epoch": 0.8592729996789498, + "grad_norm": 0.27740004658699036, + "learning_rate": 2.9554775271096957e-06, + "loss": 0.0906, + "step": 48176 + }, + { + "epoch": 0.8592908358006635, + "grad_norm": 0.35315370559692383, + "learning_rate": 2.9547434286380655e-06, + "loss": 0.1538, + "step": 48177 + }, + { + "epoch": 0.8593086719223771, + "grad_norm": 0.2677737772464752, + "learning_rate": 2.9540094156214642e-06, + "loss": 0.0833, + "step": 48178 + }, + { + "epoch": 0.8593265080440909, + "grad_norm": 0.23308239877223969, + "learning_rate": 2.9532754880627344e-06, + "loss": 0.1066, + "step": 48179 + }, + { + "epoch": 0.8593443441658046, + "grad_norm": 0.24560680985450745, + "learning_rate": 2.952541645964724e-06, + "loss": 0.091, + "step": 48180 + }, + { + "epoch": 0.8593621802875183, + "grad_norm": 0.38948020339012146, + "learning_rate": 2.9518078893302797e-06, + "loss": 0.138, + "step": 48181 + }, + { + "epoch": 0.859380016409232, + "grad_norm": 0.296694815158844, + "learning_rate": 2.951074218162245e-06, + "loss": 0.1152, + "step": 48182 + }, + { + "epoch": 0.8593978525309457, + "grad_norm": 0.2746095061302185, + "learning_rate": 2.9503406324634523e-06, + "loss": 0.1077, + "step": 48183 + }, + { + "epoch": 0.8594156886526594, + "grad_norm": 0.3277067542076111, + "learning_rate": 2.9496071322367645e-06, + "loss": 0.1484, + "step": 48184 + }, + { + "epoch": 0.8594335247743731, + "grad_norm": 0.2806954085826874, + "learning_rate": 2.9488737174850124e-06, + "loss": 0.1126, + "step": 48185 + }, + { + "epoch": 0.8594513608960868, + "grad_norm": 0.20883244276046753, + "learning_rate": 2.948140388211043e-06, + "loss": 0.0793, + "step": 48186 + }, + { + "epoch": 0.8594691970178004, + "grad_norm": 0.24078921973705292, + "learning_rate": 2.9474071444176936e-06, + "loss": 0.1169, + "step": 48187 + }, + { + "epoch": 0.8594870331395141, + "grad_norm": 0.2193739414215088, + "learning_rate": 2.9466739861078146e-06, + "loss": 0.0839, + "step": 48188 + }, + { + "epoch": 0.8595048692612278, + "grad_norm": 0.2722387909889221, + "learning_rate": 2.94594091328424e-06, + "loss": 0.1082, + "step": 48189 + }, + { + "epoch": 0.8595227053829415, + "grad_norm": 0.26688340306282043, + "learning_rate": 2.9452079259498196e-06, + "loss": 0.1041, + "step": 48190 + }, + { + "epoch": 0.8595405415046552, + "grad_norm": 0.3343719244003296, + "learning_rate": 2.944475024107393e-06, + "loss": 0.1232, + "step": 48191 + }, + { + "epoch": 0.8595583776263689, + "grad_norm": 0.33834537863731384, + "learning_rate": 2.943742207759792e-06, + "loss": 0.1154, + "step": 48192 + }, + { + "epoch": 0.8595762137480826, + "grad_norm": 0.2993813157081604, + "learning_rate": 2.9430094769098693e-06, + "loss": 0.1131, + "step": 48193 + }, + { + "epoch": 0.8595940498697963, + "grad_norm": 0.25525423884391785, + "learning_rate": 2.9422768315604615e-06, + "loss": 0.0857, + "step": 48194 + }, + { + "epoch": 0.8596118859915101, + "grad_norm": 0.22723130881786346, + "learning_rate": 2.9415442717144053e-06, + "loss": 0.0817, + "step": 48195 + }, + { + "epoch": 0.8596297221132237, + "grad_norm": 0.25193265080451965, + "learning_rate": 2.9408117973745404e-06, + "loss": 0.1243, + "step": 48196 + }, + { + "epoch": 0.8596475582349374, + "grad_norm": 0.23626454174518585, + "learning_rate": 2.940079408543708e-06, + "loss": 0.1159, + "step": 48197 + }, + { + "epoch": 0.8596653943566511, + "grad_norm": 0.3407726287841797, + "learning_rate": 2.9393471052247518e-06, + "loss": 0.1472, + "step": 48198 + }, + { + "epoch": 0.8596832304783648, + "grad_norm": 0.23359082639217377, + "learning_rate": 2.9386148874205044e-06, + "loss": 0.0737, + "step": 48199 + }, + { + "epoch": 0.8597010666000785, + "grad_norm": 0.5486839413642883, + "learning_rate": 2.9378827551338e-06, + "loss": 0.1126, + "step": 48200 + }, + { + "epoch": 0.8597189027217922, + "grad_norm": 0.2252131998538971, + "learning_rate": 2.9371507083674864e-06, + "loss": 0.082, + "step": 48201 + }, + { + "epoch": 0.8597367388435059, + "grad_norm": 0.29695749282836914, + "learning_rate": 2.936418747124392e-06, + "loss": 0.116, + "step": 48202 + }, + { + "epoch": 0.8597545749652196, + "grad_norm": 0.27731817960739136, + "learning_rate": 2.9356868714073644e-06, + "loss": 0.1503, + "step": 48203 + }, + { + "epoch": 0.8597724110869333, + "grad_norm": 0.3915056586265564, + "learning_rate": 2.9349550812192344e-06, + "loss": 0.1512, + "step": 48204 + }, + { + "epoch": 0.8597902472086469, + "grad_norm": 0.28447309136390686, + "learning_rate": 2.9342233765628363e-06, + "loss": 0.1028, + "step": 48205 + }, + { + "epoch": 0.8598080833303606, + "grad_norm": 0.2695890963077545, + "learning_rate": 2.933491757441015e-06, + "loss": 0.1166, + "step": 48206 + }, + { + "epoch": 0.8598259194520743, + "grad_norm": 0.2625380754470825, + "learning_rate": 2.9327602238565984e-06, + "loss": 0.1371, + "step": 48207 + }, + { + "epoch": 0.859843755573788, + "grad_norm": 0.2839668095111847, + "learning_rate": 2.932028775812426e-06, + "loss": 0.1171, + "step": 48208 + }, + { + "epoch": 0.8598615916955017, + "grad_norm": 0.28277912735939026, + "learning_rate": 2.931297413311329e-06, + "loss": 0.07, + "step": 48209 + }, + { + "epoch": 0.8598794278172154, + "grad_norm": 0.4427390396595001, + "learning_rate": 2.9305661363561497e-06, + "loss": 0.1108, + "step": 48210 + }, + { + "epoch": 0.8598972639389291, + "grad_norm": 0.2555272579193115, + "learning_rate": 2.9298349449497166e-06, + "loss": 0.1262, + "step": 48211 + }, + { + "epoch": 0.8599151000606429, + "grad_norm": 0.2379920482635498, + "learning_rate": 2.9291038390948682e-06, + "loss": 0.0541, + "step": 48212 + }, + { + "epoch": 0.8599329361823566, + "grad_norm": 0.2259339839220047, + "learning_rate": 2.928372818794431e-06, + "loss": 0.1078, + "step": 48213 + }, + { + "epoch": 0.8599507723040702, + "grad_norm": 0.24760255217552185, + "learning_rate": 2.9276418840512493e-06, + "loss": 0.1112, + "step": 48214 + }, + { + "epoch": 0.8599686084257839, + "grad_norm": 0.1892232745885849, + "learning_rate": 2.926911034868149e-06, + "loss": 0.0663, + "step": 48215 + }, + { + "epoch": 0.8599864445474976, + "grad_norm": 0.2524602711200714, + "learning_rate": 2.9261802712479635e-06, + "loss": 0.0822, + "step": 48216 + }, + { + "epoch": 0.8600042806692113, + "grad_norm": 0.2967257797718048, + "learning_rate": 2.92544959319353e-06, + "loss": 0.0769, + "step": 48217 + }, + { + "epoch": 0.860022116790925, + "grad_norm": 0.3313281536102295, + "learning_rate": 2.9247190007076736e-06, + "loss": 0.066, + "step": 48218 + }, + { + "epoch": 0.8600399529126387, + "grad_norm": 0.3659050166606903, + "learning_rate": 2.9239884937932366e-06, + "loss": 0.1361, + "step": 48219 + }, + { + "epoch": 0.8600577890343524, + "grad_norm": 0.33097416162490845, + "learning_rate": 2.9232580724530444e-06, + "loss": 0.1609, + "step": 48220 + }, + { + "epoch": 0.8600756251560661, + "grad_norm": 0.22080326080322266, + "learning_rate": 2.9225277366899285e-06, + "loss": 0.0848, + "step": 48221 + }, + { + "epoch": 0.8600934612777797, + "grad_norm": 0.25807568430900574, + "learning_rate": 2.9217974865067164e-06, + "loss": 0.1547, + "step": 48222 + }, + { + "epoch": 0.8601112973994934, + "grad_norm": 0.2913421094417572, + "learning_rate": 2.9210673219062484e-06, + "loss": 0.0989, + "step": 48223 + }, + { + "epoch": 0.8601291335212071, + "grad_norm": 0.22137191891670227, + "learning_rate": 2.9203372428913493e-06, + "loss": 0.1097, + "step": 48224 + }, + { + "epoch": 0.8601469696429208, + "grad_norm": 0.37268900871276855, + "learning_rate": 2.919607249464848e-06, + "loss": 0.1271, + "step": 48225 + }, + { + "epoch": 0.8601648057646345, + "grad_norm": 0.2568928301334381, + "learning_rate": 2.9188773416295695e-06, + "loss": 0.1258, + "step": 48226 + }, + { + "epoch": 0.8601826418863482, + "grad_norm": 0.25159287452697754, + "learning_rate": 2.9181475193883563e-06, + "loss": 0.064, + "step": 48227 + }, + { + "epoch": 0.8602004780080619, + "grad_norm": 0.213983952999115, + "learning_rate": 2.917417782744028e-06, + "loss": 0.0917, + "step": 48228 + }, + { + "epoch": 0.8602183141297757, + "grad_norm": 0.21965080499649048, + "learning_rate": 2.9166881316994106e-06, + "loss": 0.1035, + "step": 48229 + }, + { + "epoch": 0.8602361502514894, + "grad_norm": 0.22449922561645508, + "learning_rate": 2.915958566257343e-06, + "loss": 0.1138, + "step": 48230 + }, + { + "epoch": 0.860253986373203, + "grad_norm": 0.27136072516441345, + "learning_rate": 2.915229086420643e-06, + "loss": 0.087, + "step": 48231 + }, + { + "epoch": 0.8602718224949167, + "grad_norm": 0.3956851065158844, + "learning_rate": 2.914499692192149e-06, + "loss": 0.1499, + "step": 48232 + }, + { + "epoch": 0.8602896586166304, + "grad_norm": 0.2291734665632248, + "learning_rate": 2.9137703835746794e-06, + "loss": 0.1058, + "step": 48233 + }, + { + "epoch": 0.8603074947383441, + "grad_norm": 0.3187664747238159, + "learning_rate": 2.913041160571067e-06, + "loss": 0.142, + "step": 48234 + }, + { + "epoch": 0.8603253308600578, + "grad_norm": 0.2660762369632721, + "learning_rate": 2.9123120231841273e-06, + "loss": 0.1175, + "step": 48235 + }, + { + "epoch": 0.8603431669817715, + "grad_norm": 0.33615776896476746, + "learning_rate": 2.9115829714167016e-06, + "loss": 0.1174, + "step": 48236 + }, + { + "epoch": 0.8603610031034852, + "grad_norm": 0.34933218359947205, + "learning_rate": 2.91085400527161e-06, + "loss": 0.1424, + "step": 48237 + }, + { + "epoch": 0.8603788392251989, + "grad_norm": 0.23214490711688995, + "learning_rate": 2.910125124751678e-06, + "loss": 0.0859, + "step": 48238 + }, + { + "epoch": 0.8603966753469126, + "grad_norm": 0.27387022972106934, + "learning_rate": 2.909396329859726e-06, + "loss": 0.148, + "step": 48239 + }, + { + "epoch": 0.8604145114686262, + "grad_norm": 0.2868475317955017, + "learning_rate": 2.9086676205985872e-06, + "loss": 0.1329, + "step": 48240 + }, + { + "epoch": 0.8604323475903399, + "grad_norm": 0.3276348412036896, + "learning_rate": 2.9079389969710823e-06, + "loss": 0.0915, + "step": 48241 + }, + { + "epoch": 0.8604501837120536, + "grad_norm": 0.2806035578250885, + "learning_rate": 2.907210458980039e-06, + "loss": 0.0903, + "step": 48242 + }, + { + "epoch": 0.8604680198337673, + "grad_norm": 0.26185864210128784, + "learning_rate": 2.9064820066282695e-06, + "loss": 0.1487, + "step": 48243 + }, + { + "epoch": 0.860485855955481, + "grad_norm": 0.24755623936653137, + "learning_rate": 2.9057536399186096e-06, + "loss": 0.1161, + "step": 48244 + }, + { + "epoch": 0.8605036920771947, + "grad_norm": 0.2634468078613281, + "learning_rate": 2.9050253588538855e-06, + "loss": 0.0996, + "step": 48245 + }, + { + "epoch": 0.8605215281989085, + "grad_norm": 0.2416146695613861, + "learning_rate": 2.9042971634369113e-06, + "loss": 0.0962, + "step": 48246 + }, + { + "epoch": 0.8605393643206222, + "grad_norm": 0.30893415212631226, + "learning_rate": 2.903569053670516e-06, + "loss": 0.163, + "step": 48247 + }, + { + "epoch": 0.8605572004423359, + "grad_norm": 0.3862980604171753, + "learning_rate": 2.9028410295575125e-06, + "loss": 0.1409, + "step": 48248 + }, + { + "epoch": 0.8605750365640495, + "grad_norm": 0.29877278208732605, + "learning_rate": 2.9021130911007333e-06, + "loss": 0.0812, + "step": 48249 + }, + { + "epoch": 0.8605928726857632, + "grad_norm": 0.29208359122276306, + "learning_rate": 2.9013852383029976e-06, + "loss": 0.1073, + "step": 48250 + }, + { + "epoch": 0.8606107088074769, + "grad_norm": 0.31787821650505066, + "learning_rate": 2.900657471167123e-06, + "loss": 0.1235, + "step": 48251 + }, + { + "epoch": 0.8606285449291906, + "grad_norm": 0.25469255447387695, + "learning_rate": 2.8999297896959294e-06, + "loss": 0.106, + "step": 48252 + }, + { + "epoch": 0.8606463810509043, + "grad_norm": 0.29802897572517395, + "learning_rate": 2.8992021938922475e-06, + "loss": 0.1344, + "step": 48253 + }, + { + "epoch": 0.860664217172618, + "grad_norm": 0.2553252875804901, + "learning_rate": 2.8984746837588895e-06, + "loss": 0.1493, + "step": 48254 + }, + { + "epoch": 0.8606820532943317, + "grad_norm": 0.2799240052700043, + "learning_rate": 2.8977472592986776e-06, + "loss": 0.1269, + "step": 48255 + }, + { + "epoch": 0.8606998894160454, + "grad_norm": 0.26853373646736145, + "learning_rate": 2.8970199205144266e-06, + "loss": 0.1647, + "step": 48256 + }, + { + "epoch": 0.860717725537759, + "grad_norm": 0.3006648123264313, + "learning_rate": 2.8962926674089673e-06, + "loss": 0.101, + "step": 48257 + }, + { + "epoch": 0.8607355616594727, + "grad_norm": 0.32147276401519775, + "learning_rate": 2.8955654999851033e-06, + "loss": 0.1185, + "step": 48258 + }, + { + "epoch": 0.8607533977811864, + "grad_norm": 0.3249829113483429, + "learning_rate": 2.8948384182456684e-06, + "loss": 0.1015, + "step": 48259 + }, + { + "epoch": 0.8607712339029001, + "grad_norm": 0.19432179629802704, + "learning_rate": 2.894111422193477e-06, + "loss": 0.1341, + "step": 48260 + }, + { + "epoch": 0.8607890700246138, + "grad_norm": 0.23653775453567505, + "learning_rate": 2.8933845118313374e-06, + "loss": 0.0743, + "step": 48261 + }, + { + "epoch": 0.8608069061463275, + "grad_norm": 0.3151514232158661, + "learning_rate": 2.8926576871620787e-06, + "loss": 0.1215, + "step": 48262 + }, + { + "epoch": 0.8608247422680413, + "grad_norm": 0.2512984871864319, + "learning_rate": 2.8919309481885176e-06, + "loss": 0.1018, + "step": 48263 + }, + { + "epoch": 0.860842578389755, + "grad_norm": 0.24243074655532837, + "learning_rate": 2.891204294913466e-06, + "loss": 0.0969, + "step": 48264 + }, + { + "epoch": 0.8608604145114687, + "grad_norm": 0.2831850051879883, + "learning_rate": 2.890477727339738e-06, + "loss": 0.1195, + "step": 48265 + }, + { + "epoch": 0.8608782506331824, + "grad_norm": 0.36379262804985046, + "learning_rate": 2.88975124547016e-06, + "loss": 0.117, + "step": 48266 + }, + { + "epoch": 0.860896086754896, + "grad_norm": 0.25840455293655396, + "learning_rate": 2.8890248493075424e-06, + "loss": 0.0917, + "step": 48267 + }, + { + "epoch": 0.8609139228766097, + "grad_norm": 0.3034076690673828, + "learning_rate": 2.8882985388547028e-06, + "loss": 0.12, + "step": 48268 + }, + { + "epoch": 0.8609317589983234, + "grad_norm": 0.2777332663536072, + "learning_rate": 2.8875723141144535e-06, + "loss": 0.1366, + "step": 48269 + }, + { + "epoch": 0.8609495951200371, + "grad_norm": 0.2748354971408844, + "learning_rate": 2.8868461750896054e-06, + "loss": 0.12, + "step": 48270 + }, + { + "epoch": 0.8609674312417508, + "grad_norm": 0.27870067954063416, + "learning_rate": 2.8861201217829814e-06, + "loss": 0.1128, + "step": 48271 + }, + { + "epoch": 0.8609852673634645, + "grad_norm": 0.39468151330947876, + "learning_rate": 2.885394154197399e-06, + "loss": 0.1263, + "step": 48272 + }, + { + "epoch": 0.8610031034851782, + "grad_norm": 0.2181810885667801, + "learning_rate": 2.8846682723356667e-06, + "loss": 0.1091, + "step": 48273 + }, + { + "epoch": 0.8610209396068919, + "grad_norm": 0.3015720546245575, + "learning_rate": 2.8839424762005905e-06, + "loss": 0.1019, + "step": 48274 + }, + { + "epoch": 0.8610387757286055, + "grad_norm": 0.29595452547073364, + "learning_rate": 2.883216765794999e-06, + "loss": 0.0942, + "step": 48275 + }, + { + "epoch": 0.8610566118503192, + "grad_norm": 0.25546884536743164, + "learning_rate": 2.8824911411216986e-06, + "loss": 0.1056, + "step": 48276 + }, + { + "epoch": 0.8610744479720329, + "grad_norm": 0.20199479162693024, + "learning_rate": 2.8817656021835034e-06, + "loss": 0.0789, + "step": 48277 + }, + { + "epoch": 0.8610922840937466, + "grad_norm": 0.34259548783302307, + "learning_rate": 2.8810401489832163e-06, + "loss": 0.1157, + "step": 48278 + }, + { + "epoch": 0.8611101202154603, + "grad_norm": 0.26792624592781067, + "learning_rate": 2.880314781523663e-06, + "loss": 0.1085, + "step": 48279 + }, + { + "epoch": 0.8611279563371741, + "grad_norm": 0.22999395430088043, + "learning_rate": 2.87958949980765e-06, + "loss": 0.0876, + "step": 48280 + }, + { + "epoch": 0.8611457924588878, + "grad_norm": 0.3116852045059204, + "learning_rate": 2.8788643038379886e-06, + "loss": 0.0934, + "step": 48281 + }, + { + "epoch": 0.8611636285806015, + "grad_norm": 0.19808802008628845, + "learning_rate": 2.8781391936174877e-06, + "loss": 0.1092, + "step": 48282 + }, + { + "epoch": 0.8611814647023152, + "grad_norm": 0.416028767824173, + "learning_rate": 2.8774141691489565e-06, + "loss": 0.1197, + "step": 48283 + }, + { + "epoch": 0.8611993008240288, + "grad_norm": 0.2539530098438263, + "learning_rate": 2.8766892304352115e-06, + "loss": 0.0962, + "step": 48284 + }, + { + "epoch": 0.8612171369457425, + "grad_norm": 0.22953978180885315, + "learning_rate": 2.8759643774790597e-06, + "loss": 0.0922, + "step": 48285 + }, + { + "epoch": 0.8612349730674562, + "grad_norm": 0.31616419553756714, + "learning_rate": 2.8752396102833117e-06, + "loss": 0.0962, + "step": 48286 + }, + { + "epoch": 0.8612528091891699, + "grad_norm": 0.33783987164497375, + "learning_rate": 2.8745149288507744e-06, + "loss": 0.1035, + "step": 48287 + }, + { + "epoch": 0.8612706453108836, + "grad_norm": 0.28543201088905334, + "learning_rate": 2.8737903331842647e-06, + "loss": 0.1822, + "step": 48288 + }, + { + "epoch": 0.8612884814325973, + "grad_norm": 0.27050888538360596, + "learning_rate": 2.8730658232865833e-06, + "loss": 0.1122, + "step": 48289 + }, + { + "epoch": 0.861306317554311, + "grad_norm": 0.1776435375213623, + "learning_rate": 2.8723413991605415e-06, + "loss": 0.0964, + "step": 48290 + }, + { + "epoch": 0.8613241536760247, + "grad_norm": 0.2499695122241974, + "learning_rate": 2.8716170608089425e-06, + "loss": 0.0777, + "step": 48291 + }, + { + "epoch": 0.8613419897977383, + "grad_norm": 0.36284545063972473, + "learning_rate": 2.870892808234604e-06, + "loss": 0.1336, + "step": 48292 + }, + { + "epoch": 0.861359825919452, + "grad_norm": 0.40035828948020935, + "learning_rate": 2.8701686414403294e-06, + "loss": 0.1128, + "step": 48293 + }, + { + "epoch": 0.8613776620411657, + "grad_norm": 0.20022915303707123, + "learning_rate": 2.8694445604289242e-06, + "loss": 0.075, + "step": 48294 + }, + { + "epoch": 0.8613954981628794, + "grad_norm": 0.24053792655467987, + "learning_rate": 2.8687205652031953e-06, + "loss": 0.1005, + "step": 48295 + }, + { + "epoch": 0.8614133342845932, + "grad_norm": 0.38681307435035706, + "learning_rate": 2.867996655765942e-06, + "loss": 0.1181, + "step": 48296 + }, + { + "epoch": 0.8614311704063069, + "grad_norm": 0.2893669307231903, + "learning_rate": 2.8672728321199854e-06, + "loss": 0.1156, + "step": 48297 + }, + { + "epoch": 0.8614490065280206, + "grad_norm": 0.22892136871814728, + "learning_rate": 2.8665490942681173e-06, + "loss": 0.1183, + "step": 48298 + }, + { + "epoch": 0.8614668426497343, + "grad_norm": 0.26217445731163025, + "learning_rate": 2.865825442213155e-06, + "loss": 0.1105, + "step": 48299 + }, + { + "epoch": 0.861484678771448, + "grad_norm": 0.2384805530309677, + "learning_rate": 2.8651018759578956e-06, + "loss": 0.0929, + "step": 48300 + }, + { + "epoch": 0.8615025148931617, + "grad_norm": 0.3178929388523102, + "learning_rate": 2.864378395505149e-06, + "loss": 0.1182, + "step": 48301 + }, + { + "epoch": 0.8615203510148753, + "grad_norm": 0.2603487968444824, + "learning_rate": 2.863655000857718e-06, + "loss": 0.1418, + "step": 48302 + }, + { + "epoch": 0.861538187136589, + "grad_norm": 0.33509957790374756, + "learning_rate": 2.862931692018403e-06, + "loss": 0.1692, + "step": 48303 + }, + { + "epoch": 0.8615560232583027, + "grad_norm": 0.38891443610191345, + "learning_rate": 2.8622084689900076e-06, + "loss": 0.1718, + "step": 48304 + }, + { + "epoch": 0.8615738593800164, + "grad_norm": 0.2718924880027771, + "learning_rate": 2.861485331775346e-06, + "loss": 0.0925, + "step": 48305 + }, + { + "epoch": 0.8615916955017301, + "grad_norm": 0.3322236239910126, + "learning_rate": 2.8607622803772106e-06, + "loss": 0.1895, + "step": 48306 + }, + { + "epoch": 0.8616095316234438, + "grad_norm": 0.23104922473430634, + "learning_rate": 2.860039314798407e-06, + "loss": 0.1109, + "step": 48307 + }, + { + "epoch": 0.8616273677451575, + "grad_norm": 0.30988621711730957, + "learning_rate": 2.8593164350417396e-06, + "loss": 0.1278, + "step": 48308 + }, + { + "epoch": 0.8616452038668712, + "grad_norm": 0.3167951703071594, + "learning_rate": 2.8585936411100022e-06, + "loss": 0.1265, + "step": 48309 + }, + { + "epoch": 0.8616630399885848, + "grad_norm": 0.2360583394765854, + "learning_rate": 2.85787093300601e-06, + "loss": 0.1104, + "step": 48310 + }, + { + "epoch": 0.8616808761102985, + "grad_norm": 0.376623272895813, + "learning_rate": 2.8571483107325576e-06, + "loss": 0.082, + "step": 48311 + }, + { + "epoch": 0.8616987122320122, + "grad_norm": 0.2665676474571228, + "learning_rate": 2.85642577429244e-06, + "loss": 0.1452, + "step": 48312 + }, + { + "epoch": 0.861716548353726, + "grad_norm": 0.28887245059013367, + "learning_rate": 2.855703323688469e-06, + "loss": 0.1181, + "step": 48313 + }, + { + "epoch": 0.8617343844754397, + "grad_norm": 0.3180290758609772, + "learning_rate": 2.8549809589234365e-06, + "loss": 0.1466, + "step": 48314 + }, + { + "epoch": 0.8617522205971534, + "grad_norm": 0.2863643765449524, + "learning_rate": 2.854258680000152e-06, + "loss": 0.1142, + "step": 48315 + }, + { + "epoch": 0.8617700567188671, + "grad_norm": 0.2878120243549347, + "learning_rate": 2.853536486921407e-06, + "loss": 0.1292, + "step": 48316 + }, + { + "epoch": 0.8617878928405808, + "grad_norm": 0.23969271779060364, + "learning_rate": 2.8528143796899996e-06, + "loss": 0.1097, + "step": 48317 + }, + { + "epoch": 0.8618057289622945, + "grad_norm": 0.1940971463918686, + "learning_rate": 2.8520923583087416e-06, + "loss": 0.1109, + "step": 48318 + }, + { + "epoch": 0.8618235650840081, + "grad_norm": 0.25445297360420227, + "learning_rate": 2.8513704227804193e-06, + "loss": 0.1095, + "step": 48319 + }, + { + "epoch": 0.8618414012057218, + "grad_norm": 0.3247835040092468, + "learning_rate": 2.8506485731078364e-06, + "loss": 0.1358, + "step": 48320 + }, + { + "epoch": 0.8618592373274355, + "grad_norm": 0.29928672313690186, + "learning_rate": 2.84992680929379e-06, + "loss": 0.1234, + "step": 48321 + }, + { + "epoch": 0.8618770734491492, + "grad_norm": 0.27817854285240173, + "learning_rate": 2.8492051313410733e-06, + "loss": 0.0946, + "step": 48322 + }, + { + "epoch": 0.8618949095708629, + "grad_norm": 0.24670279026031494, + "learning_rate": 2.8484835392524917e-06, + "loss": 0.0816, + "step": 48323 + }, + { + "epoch": 0.8619127456925766, + "grad_norm": 0.28345784544944763, + "learning_rate": 2.8477620330308404e-06, + "loss": 0.1221, + "step": 48324 + }, + { + "epoch": 0.8619305818142903, + "grad_norm": 0.27013736963272095, + "learning_rate": 2.8470406126789083e-06, + "loss": 0.1284, + "step": 48325 + }, + { + "epoch": 0.861948417936004, + "grad_norm": 0.28505557775497437, + "learning_rate": 2.8463192781995025e-06, + "loss": 0.0967, + "step": 48326 + }, + { + "epoch": 0.8619662540577177, + "grad_norm": 0.2967435419559479, + "learning_rate": 2.8455980295954115e-06, + "loss": 0.1106, + "step": 48327 + }, + { + "epoch": 0.8619840901794313, + "grad_norm": 0.3118458688259125, + "learning_rate": 2.8448768668694393e-06, + "loss": 0.1181, + "step": 48328 + }, + { + "epoch": 0.862001926301145, + "grad_norm": 0.2458171844482422, + "learning_rate": 2.844155790024375e-06, + "loss": 0.0724, + "step": 48329 + }, + { + "epoch": 0.8620197624228588, + "grad_norm": 0.24572700262069702, + "learning_rate": 2.8434347990630107e-06, + "loss": 0.0963, + "step": 48330 + }, + { + "epoch": 0.8620375985445725, + "grad_norm": 0.2939291000366211, + "learning_rate": 2.8427138939881527e-06, + "loss": 0.1018, + "step": 48331 + }, + { + "epoch": 0.8620554346662862, + "grad_norm": 0.20687086880207062, + "learning_rate": 2.8419930748025846e-06, + "loss": 0.0415, + "step": 48332 + }, + { + "epoch": 0.8620732707879999, + "grad_norm": 0.32366690039634705, + "learning_rate": 2.8412723415091076e-06, + "loss": 0.1345, + "step": 48333 + }, + { + "epoch": 0.8620911069097136, + "grad_norm": 0.24365437030792236, + "learning_rate": 2.840551694110513e-06, + "loss": 0.1158, + "step": 48334 + }, + { + "epoch": 0.8621089430314273, + "grad_norm": 0.20163924992084503, + "learning_rate": 2.8398311326095856e-06, + "loss": 0.074, + "step": 48335 + }, + { + "epoch": 0.862126779153141, + "grad_norm": 0.3035610318183899, + "learning_rate": 2.8391106570091336e-06, + "loss": 0.1334, + "step": 48336 + }, + { + "epoch": 0.8621446152748546, + "grad_norm": 0.398946076631546, + "learning_rate": 2.8383902673119438e-06, + "loss": 0.1628, + "step": 48337 + }, + { + "epoch": 0.8621624513965683, + "grad_norm": 0.2273399829864502, + "learning_rate": 2.8376699635208058e-06, + "loss": 0.1027, + "step": 48338 + }, + { + "epoch": 0.862180287518282, + "grad_norm": 0.2744402289390564, + "learning_rate": 2.836949745638509e-06, + "loss": 0.1102, + "step": 48339 + }, + { + "epoch": 0.8621981236399957, + "grad_norm": 0.19958682358264923, + "learning_rate": 2.836229613667857e-06, + "loss": 0.0732, + "step": 48340 + }, + { + "epoch": 0.8622159597617094, + "grad_norm": 0.250897079706192, + "learning_rate": 2.8355095676116246e-06, + "loss": 0.1526, + "step": 48341 + }, + { + "epoch": 0.8622337958834231, + "grad_norm": 0.2800920605659485, + "learning_rate": 2.834789607472621e-06, + "loss": 0.0889, + "step": 48342 + }, + { + "epoch": 0.8622516320051368, + "grad_norm": 0.22744520008563995, + "learning_rate": 2.834069733253622e-06, + "loss": 0.0808, + "step": 48343 + }, + { + "epoch": 0.8622694681268505, + "grad_norm": 0.31362220644950867, + "learning_rate": 2.833349944957431e-06, + "loss": 0.1239, + "step": 48344 + }, + { + "epoch": 0.8622873042485641, + "grad_norm": 0.29813653230667114, + "learning_rate": 2.832630242586831e-06, + "loss": 0.1599, + "step": 48345 + }, + { + "epoch": 0.8623051403702778, + "grad_norm": 0.21249863505363464, + "learning_rate": 2.831910626144613e-06, + "loss": 0.0663, + "step": 48346 + }, + { + "epoch": 0.8623229764919916, + "grad_norm": 0.2554519772529602, + "learning_rate": 2.8311910956335646e-06, + "loss": 0.1316, + "step": 48347 + }, + { + "epoch": 0.8623408126137053, + "grad_norm": 0.24317017197608948, + "learning_rate": 2.830471651056474e-06, + "loss": 0.0945, + "step": 48348 + }, + { + "epoch": 0.862358648735419, + "grad_norm": 0.2667730152606964, + "learning_rate": 2.829752292416135e-06, + "loss": 0.1065, + "step": 48349 + }, + { + "epoch": 0.8623764848571327, + "grad_norm": 0.334600567817688, + "learning_rate": 2.8290330197153354e-06, + "loss": 0.1277, + "step": 48350 + }, + { + "epoch": 0.8623943209788464, + "grad_norm": 0.30880317091941833, + "learning_rate": 2.8283138329568583e-06, + "loss": 0.1176, + "step": 48351 + }, + { + "epoch": 0.8624121571005601, + "grad_norm": 0.21727901697158813, + "learning_rate": 2.827594732143493e-06, + "loss": 0.0957, + "step": 48352 + }, + { + "epoch": 0.8624299932222738, + "grad_norm": 0.30124136805534363, + "learning_rate": 2.8268757172780325e-06, + "loss": 0.164, + "step": 48353 + }, + { + "epoch": 0.8624478293439874, + "grad_norm": 0.24251937866210938, + "learning_rate": 2.8261567883632544e-06, + "loss": 0.0749, + "step": 48354 + }, + { + "epoch": 0.8624656654657011, + "grad_norm": 0.2853952646255493, + "learning_rate": 2.825437945401957e-06, + "loss": 0.0988, + "step": 48355 + }, + { + "epoch": 0.8624835015874148, + "grad_norm": 0.3074513375759125, + "learning_rate": 2.8247191883969154e-06, + "loss": 0.0968, + "step": 48356 + }, + { + "epoch": 0.8625013377091285, + "grad_norm": 0.3496426045894623, + "learning_rate": 2.824000517350925e-06, + "loss": 0.1407, + "step": 48357 + }, + { + "epoch": 0.8625191738308422, + "grad_norm": 0.24741581082344055, + "learning_rate": 2.823281932266769e-06, + "loss": 0.0707, + "step": 48358 + }, + { + "epoch": 0.8625370099525559, + "grad_norm": 0.24328818917274475, + "learning_rate": 2.8225634331472322e-06, + "loss": 0.0763, + "step": 48359 + }, + { + "epoch": 0.8625548460742696, + "grad_norm": 0.19807767868041992, + "learning_rate": 2.821845019995101e-06, + "loss": 0.0752, + "step": 48360 + }, + { + "epoch": 0.8625726821959833, + "grad_norm": 0.2815285325050354, + "learning_rate": 2.82112669281315e-06, + "loss": 0.1064, + "step": 48361 + }, + { + "epoch": 0.862590518317697, + "grad_norm": 0.27651694416999817, + "learning_rate": 2.8204084516041814e-06, + "loss": 0.099, + "step": 48362 + }, + { + "epoch": 0.8626083544394106, + "grad_norm": 0.3131183087825775, + "learning_rate": 2.8196902963709664e-06, + "loss": 0.1145, + "step": 48363 + }, + { + "epoch": 0.8626261905611244, + "grad_norm": 0.2532101571559906, + "learning_rate": 2.8189722271162957e-06, + "loss": 0.1532, + "step": 48364 + }, + { + "epoch": 0.8626440266828381, + "grad_norm": 0.29842883348464966, + "learning_rate": 2.818254243842941e-06, + "loss": 0.0889, + "step": 48365 + }, + { + "epoch": 0.8626618628045518, + "grad_norm": 0.2780974805355072, + "learning_rate": 2.8175363465537034e-06, + "loss": 0.1647, + "step": 48366 + }, + { + "epoch": 0.8626796989262655, + "grad_norm": 0.2839653491973877, + "learning_rate": 2.816818535251353e-06, + "loss": 0.1322, + "step": 48367 + }, + { + "epoch": 0.8626975350479792, + "grad_norm": 0.26783478260040283, + "learning_rate": 2.8161008099386738e-06, + "loss": 0.0682, + "step": 48368 + }, + { + "epoch": 0.8627153711696929, + "grad_norm": 0.345745712518692, + "learning_rate": 2.815383170618452e-06, + "loss": 0.1277, + "step": 48369 + }, + { + "epoch": 0.8627332072914066, + "grad_norm": 0.190636545419693, + "learning_rate": 2.814665617293466e-06, + "loss": 0.103, + "step": 48370 + }, + { + "epoch": 0.8627510434131203, + "grad_norm": 0.2475663274526596, + "learning_rate": 2.8139481499665e-06, + "loss": 0.1413, + "step": 48371 + }, + { + "epoch": 0.8627688795348339, + "grad_norm": 0.25812238454818726, + "learning_rate": 2.8132307686403347e-06, + "loss": 0.1233, + "step": 48372 + }, + { + "epoch": 0.8627867156565476, + "grad_norm": 0.37430524826049805, + "learning_rate": 2.8125134733177515e-06, + "loss": 0.1394, + "step": 48373 + }, + { + "epoch": 0.8628045517782613, + "grad_norm": 0.2203422635793686, + "learning_rate": 2.8117962640015233e-06, + "loss": 0.0869, + "step": 48374 + }, + { + "epoch": 0.862822387899975, + "grad_norm": 0.46110695600509644, + "learning_rate": 2.811079140694442e-06, + "loss": 0.1356, + "step": 48375 + }, + { + "epoch": 0.8628402240216887, + "grad_norm": 0.3198714852333069, + "learning_rate": 2.8103621033992804e-06, + "loss": 0.0882, + "step": 48376 + }, + { + "epoch": 0.8628580601434024, + "grad_norm": 0.3065090775489807, + "learning_rate": 2.80964515211882e-06, + "loss": 0.1979, + "step": 48377 + }, + { + "epoch": 0.8628758962651161, + "grad_norm": 0.19372273981571198, + "learning_rate": 2.808928286855836e-06, + "loss": 0.0812, + "step": 48378 + }, + { + "epoch": 0.8628937323868298, + "grad_norm": 0.21410366892814636, + "learning_rate": 2.808211507613115e-06, + "loss": 0.0807, + "step": 48379 + }, + { + "epoch": 0.8629115685085434, + "grad_norm": 0.3122394382953644, + "learning_rate": 2.8074948143934297e-06, + "loss": 0.0912, + "step": 48380 + }, + { + "epoch": 0.8629294046302572, + "grad_norm": 0.43264979124069214, + "learning_rate": 2.8067782071995535e-06, + "loss": 0.1388, + "step": 48381 + }, + { + "epoch": 0.8629472407519709, + "grad_norm": 0.3185844421386719, + "learning_rate": 2.806061686034278e-06, + "loss": 0.1489, + "step": 48382 + }, + { + "epoch": 0.8629650768736846, + "grad_norm": 0.2764148712158203, + "learning_rate": 2.805345250900365e-06, + "loss": 0.1076, + "step": 48383 + }, + { + "epoch": 0.8629829129953983, + "grad_norm": 0.25965332984924316, + "learning_rate": 2.8046289018006074e-06, + "loss": 0.1351, + "step": 48384 + }, + { + "epoch": 0.863000749117112, + "grad_norm": 0.34573742747306824, + "learning_rate": 2.803912638737771e-06, + "loss": 0.091, + "step": 48385 + }, + { + "epoch": 0.8630185852388257, + "grad_norm": 0.24794262647628784, + "learning_rate": 2.803196461714638e-06, + "loss": 0.1257, + "step": 48386 + }, + { + "epoch": 0.8630364213605394, + "grad_norm": 0.26193276047706604, + "learning_rate": 2.8024803707339757e-06, + "loss": 0.0935, + "step": 48387 + }, + { + "epoch": 0.8630542574822531, + "grad_norm": 0.2464940994977951, + "learning_rate": 2.801764365798573e-06, + "loss": 0.0866, + "step": 48388 + }, + { + "epoch": 0.8630720936039668, + "grad_norm": 0.2569274306297302, + "learning_rate": 2.8010484469111975e-06, + "loss": 0.0922, + "step": 48389 + }, + { + "epoch": 0.8630899297256804, + "grad_norm": 0.21546293795108795, + "learning_rate": 2.800332614074627e-06, + "loss": 0.0602, + "step": 48390 + }, + { + "epoch": 0.8631077658473941, + "grad_norm": 0.2204519361257553, + "learning_rate": 2.799616867291627e-06, + "loss": 0.0717, + "step": 48391 + }, + { + "epoch": 0.8631256019691078, + "grad_norm": 0.3106570541858673, + "learning_rate": 2.7989012065649883e-06, + "loss": 0.1245, + "step": 48392 + }, + { + "epoch": 0.8631434380908215, + "grad_norm": 0.29296547174453735, + "learning_rate": 2.7981856318974736e-06, + "loss": 0.0851, + "step": 48393 + }, + { + "epoch": 0.8631612742125352, + "grad_norm": 0.22861339151859283, + "learning_rate": 2.797470143291861e-06, + "loss": 0.076, + "step": 48394 + }, + { + "epoch": 0.8631791103342489, + "grad_norm": 0.2863302528858185, + "learning_rate": 2.7967547407509178e-06, + "loss": 0.1474, + "step": 48395 + }, + { + "epoch": 0.8631969464559626, + "grad_norm": 0.33442822098731995, + "learning_rate": 2.7960394242774247e-06, + "loss": 0.0907, + "step": 48396 + }, + { + "epoch": 0.8632147825776764, + "grad_norm": 0.31222379207611084, + "learning_rate": 2.795324193874149e-06, + "loss": 0.1591, + "step": 48397 + }, + { + "epoch": 0.86323261869939, + "grad_norm": 0.3535809814929962, + "learning_rate": 2.7946090495438697e-06, + "loss": 0.0976, + "step": 48398 + }, + { + "epoch": 0.8632504548211037, + "grad_norm": 0.6215869188308716, + "learning_rate": 2.793893991289356e-06, + "loss": 0.133, + "step": 48399 + }, + { + "epoch": 0.8632682909428174, + "grad_norm": 0.2511143684387207, + "learning_rate": 2.7931790191133762e-06, + "loss": 0.1246, + "step": 48400 + }, + { + "epoch": 0.8632861270645311, + "grad_norm": 0.26210227608680725, + "learning_rate": 2.7924641330187048e-06, + "loss": 0.1251, + "step": 48401 + }, + { + "epoch": 0.8633039631862448, + "grad_norm": 0.2091110348701477, + "learning_rate": 2.7917493330081176e-06, + "loss": 0.0669, + "step": 48402 + }, + { + "epoch": 0.8633217993079585, + "grad_norm": 0.35500049591064453, + "learning_rate": 2.7910346190843763e-06, + "loss": 0.1703, + "step": 48403 + }, + { + "epoch": 0.8633396354296722, + "grad_norm": 0.21513564884662628, + "learning_rate": 2.7903199912502537e-06, + "loss": 0.1317, + "step": 48404 + }, + { + "epoch": 0.8633574715513859, + "grad_norm": 0.38147884607315063, + "learning_rate": 2.7896054495085227e-06, + "loss": 0.0944, + "step": 48405 + }, + { + "epoch": 0.8633753076730996, + "grad_norm": 0.256989449262619, + "learning_rate": 2.7888909938619563e-06, + "loss": 0.1338, + "step": 48406 + }, + { + "epoch": 0.8633931437948132, + "grad_norm": 0.29034656286239624, + "learning_rate": 2.7881766243133185e-06, + "loss": 0.1176, + "step": 48407 + }, + { + "epoch": 0.8634109799165269, + "grad_norm": 0.273830771446228, + "learning_rate": 2.787462340865374e-06, + "loss": 0.1079, + "step": 48408 + }, + { + "epoch": 0.8634288160382406, + "grad_norm": 0.2569306492805481, + "learning_rate": 2.7867481435209034e-06, + "loss": 0.1279, + "step": 48409 + }, + { + "epoch": 0.8634466521599543, + "grad_norm": 0.33249443769454956, + "learning_rate": 2.786034032282664e-06, + "loss": 0.1765, + "step": 48410 + }, + { + "epoch": 0.863464488281668, + "grad_norm": 0.31615397334098816, + "learning_rate": 2.785320007153433e-06, + "loss": 0.1507, + "step": 48411 + }, + { + "epoch": 0.8634823244033817, + "grad_norm": 0.2986748516559601, + "learning_rate": 2.7846060681359754e-06, + "loss": 0.0743, + "step": 48412 + }, + { + "epoch": 0.8635001605250954, + "grad_norm": 0.39069390296936035, + "learning_rate": 2.7838922152330526e-06, + "loss": 0.1779, + "step": 48413 + }, + { + "epoch": 0.8635179966468092, + "grad_norm": 0.25558531284332275, + "learning_rate": 2.783178448447443e-06, + "loss": 0.0794, + "step": 48414 + }, + { + "epoch": 0.8635358327685229, + "grad_norm": 0.28744328022003174, + "learning_rate": 2.782464767781906e-06, + "loss": 0.1352, + "step": 48415 + }, + { + "epoch": 0.8635536688902365, + "grad_norm": 0.30753999948501587, + "learning_rate": 2.781751173239208e-06, + "loss": 0.1039, + "step": 48416 + }, + { + "epoch": 0.8635715050119502, + "grad_norm": 0.36474648118019104, + "learning_rate": 2.781037664822114e-06, + "loss": 0.1591, + "step": 48417 + }, + { + "epoch": 0.8635893411336639, + "grad_norm": 0.21047815680503845, + "learning_rate": 2.780324242533397e-06, + "loss": 0.065, + "step": 48418 + }, + { + "epoch": 0.8636071772553776, + "grad_norm": 0.27666041254997253, + "learning_rate": 2.7796109063758154e-06, + "loss": 0.1202, + "step": 48419 + }, + { + "epoch": 0.8636250133770913, + "grad_norm": 0.24565847218036652, + "learning_rate": 2.7788976563521363e-06, + "loss": 0.1485, + "step": 48420 + }, + { + "epoch": 0.863642849498805, + "grad_norm": 0.24123437702655792, + "learning_rate": 2.778184492465122e-06, + "loss": 0.1296, + "step": 48421 + }, + { + "epoch": 0.8636606856205187, + "grad_norm": 0.3007345199584961, + "learning_rate": 2.7774714147175444e-06, + "loss": 0.1257, + "step": 48422 + }, + { + "epoch": 0.8636785217422324, + "grad_norm": 0.17736253142356873, + "learning_rate": 2.7767584231121636e-06, + "loss": 0.0482, + "step": 48423 + }, + { + "epoch": 0.863696357863946, + "grad_norm": 0.3148968815803528, + "learning_rate": 2.7760455176517347e-06, + "loss": 0.1458, + "step": 48424 + }, + { + "epoch": 0.8637141939856597, + "grad_norm": 0.27487996220588684, + "learning_rate": 2.7753326983390365e-06, + "loss": 0.0927, + "step": 48425 + }, + { + "epoch": 0.8637320301073734, + "grad_norm": 0.24090854823589325, + "learning_rate": 2.7746199651768194e-06, + "loss": 0.104, + "step": 48426 + }, + { + "epoch": 0.8637498662290871, + "grad_norm": 0.2680315673351288, + "learning_rate": 2.7739073181678594e-06, + "loss": 0.1032, + "step": 48427 + }, + { + "epoch": 0.8637677023508008, + "grad_norm": 0.24864837527275085, + "learning_rate": 2.7731947573149094e-06, + "loss": 0.0909, + "step": 48428 + }, + { + "epoch": 0.8637855384725145, + "grad_norm": 0.27116402983665466, + "learning_rate": 2.7724822826207308e-06, + "loss": 0.1319, + "step": 48429 + }, + { + "epoch": 0.8638033745942282, + "grad_norm": 0.2195434421300888, + "learning_rate": 2.771769894088086e-06, + "loss": 0.0947, + "step": 48430 + }, + { + "epoch": 0.863821210715942, + "grad_norm": 0.39954325556755066, + "learning_rate": 2.7710575917197417e-06, + "loss": 0.1038, + "step": 48431 + }, + { + "epoch": 0.8638390468376557, + "grad_norm": 0.39382103085517883, + "learning_rate": 2.7703453755184574e-06, + "loss": 0.1526, + "step": 48432 + }, + { + "epoch": 0.8638568829593694, + "grad_norm": 0.30479347705841064, + "learning_rate": 2.769633245486991e-06, + "loss": 0.1316, + "step": 48433 + }, + { + "epoch": 0.863874719081083, + "grad_norm": 0.2860756516456604, + "learning_rate": 2.7689212016281025e-06, + "loss": 0.1291, + "step": 48434 + }, + { + "epoch": 0.8638925552027967, + "grad_norm": 0.3144417405128479, + "learning_rate": 2.7682092439445554e-06, + "loss": 0.1609, + "step": 48435 + }, + { + "epoch": 0.8639103913245104, + "grad_norm": 0.31592923402786255, + "learning_rate": 2.7674973724391095e-06, + "loss": 0.097, + "step": 48436 + }, + { + "epoch": 0.8639282274462241, + "grad_norm": 0.30795443058013916, + "learning_rate": 2.766785587114515e-06, + "loss": 0.1386, + "step": 48437 + }, + { + "epoch": 0.8639460635679378, + "grad_norm": 0.31524521112442017, + "learning_rate": 2.766073887973547e-06, + "loss": 0.1338, + "step": 48438 + }, + { + "epoch": 0.8639638996896515, + "grad_norm": 0.35194703936576843, + "learning_rate": 2.7653622750189484e-06, + "loss": 0.0999, + "step": 48439 + }, + { + "epoch": 0.8639817358113652, + "grad_norm": 0.3119635283946991, + "learning_rate": 2.7646507482534916e-06, + "loss": 0.1378, + "step": 48440 + }, + { + "epoch": 0.8639995719330789, + "grad_norm": 0.268070787191391, + "learning_rate": 2.7639393076799276e-06, + "loss": 0.0825, + "step": 48441 + }, + { + "epoch": 0.8640174080547925, + "grad_norm": 0.24179629981517792, + "learning_rate": 2.7632279533010174e-06, + "loss": 0.1153, + "step": 48442 + }, + { + "epoch": 0.8640352441765062, + "grad_norm": 0.19377778470516205, + "learning_rate": 2.7625166851195067e-06, + "loss": 0.098, + "step": 48443 + }, + { + "epoch": 0.8640530802982199, + "grad_norm": 0.35624709725379944, + "learning_rate": 2.7618055031381707e-06, + "loss": 0.1018, + "step": 48444 + }, + { + "epoch": 0.8640709164199336, + "grad_norm": 0.35638681054115295, + "learning_rate": 2.7610944073597546e-06, + "loss": 0.1087, + "step": 48445 + }, + { + "epoch": 0.8640887525416473, + "grad_norm": 0.20993869006633759, + "learning_rate": 2.76038339778702e-06, + "loss": 0.1168, + "step": 48446 + }, + { + "epoch": 0.864106588663361, + "grad_norm": 0.26464006304740906, + "learning_rate": 2.7596724744227142e-06, + "loss": 0.1776, + "step": 48447 + }, + { + "epoch": 0.8641244247850748, + "grad_norm": 0.2416379451751709, + "learning_rate": 2.7589616372696053e-06, + "loss": 0.1229, + "step": 48448 + }, + { + "epoch": 0.8641422609067885, + "grad_norm": 0.22152984142303467, + "learning_rate": 2.7582508863304435e-06, + "loss": 0.0919, + "step": 48449 + }, + { + "epoch": 0.8641600970285022, + "grad_norm": 0.2515111267566681, + "learning_rate": 2.757540221607982e-06, + "loss": 0.0861, + "step": 48450 + }, + { + "epoch": 0.8641779331502158, + "grad_norm": 0.2078169733285904, + "learning_rate": 2.7568296431049716e-06, + "loss": 0.0881, + "step": 48451 + }, + { + "epoch": 0.8641957692719295, + "grad_norm": 0.3556353449821472, + "learning_rate": 2.756119150824171e-06, + "loss": 0.1091, + "step": 48452 + }, + { + "epoch": 0.8642136053936432, + "grad_norm": 0.40062204003334045, + "learning_rate": 2.7554087447683426e-06, + "loss": 0.1476, + "step": 48453 + }, + { + "epoch": 0.8642314415153569, + "grad_norm": 0.2830500602722168, + "learning_rate": 2.754698424940233e-06, + "loss": 0.1096, + "step": 48454 + }, + { + "epoch": 0.8642492776370706, + "grad_norm": 0.2188422530889511, + "learning_rate": 2.7539881913425938e-06, + "loss": 0.0481, + "step": 48455 + }, + { + "epoch": 0.8642671137587843, + "grad_norm": 0.28958621621131897, + "learning_rate": 2.7532780439781774e-06, + "loss": 0.1333, + "step": 48456 + }, + { + "epoch": 0.864284949880498, + "grad_norm": 0.195339173078537, + "learning_rate": 2.7525679828497437e-06, + "loss": 0.0939, + "step": 48457 + }, + { + "epoch": 0.8643027860022117, + "grad_norm": 0.24720843136310577, + "learning_rate": 2.751858007960037e-06, + "loss": 0.1188, + "step": 48458 + }, + { + "epoch": 0.8643206221239254, + "grad_norm": 0.2433614730834961, + "learning_rate": 2.7511481193118165e-06, + "loss": 0.1001, + "step": 48459 + }, + { + "epoch": 0.864338458245639, + "grad_norm": 0.25807812809944153, + "learning_rate": 2.7504383169078245e-06, + "loss": 0.116, + "step": 48460 + }, + { + "epoch": 0.8643562943673527, + "grad_norm": 0.18046694993972778, + "learning_rate": 2.749728600750823e-06, + "loss": 0.097, + "step": 48461 + }, + { + "epoch": 0.8643741304890664, + "grad_norm": 0.48721593618392944, + "learning_rate": 2.7490189708435587e-06, + "loss": 0.1569, + "step": 48462 + }, + { + "epoch": 0.8643919666107801, + "grad_norm": 0.22629275918006897, + "learning_rate": 2.7483094271887833e-06, + "loss": 0.1188, + "step": 48463 + }, + { + "epoch": 0.8644098027324938, + "grad_norm": 0.22580598294734955, + "learning_rate": 2.7475999697892386e-06, + "loss": 0.1254, + "step": 48464 + }, + { + "epoch": 0.8644276388542076, + "grad_norm": 0.310934841632843, + "learning_rate": 2.746890598647689e-06, + "loss": 0.1284, + "step": 48465 + }, + { + "epoch": 0.8644454749759213, + "grad_norm": 0.2310737818479538, + "learning_rate": 2.7461813137668714e-06, + "loss": 0.1152, + "step": 48466 + }, + { + "epoch": 0.864463311097635, + "grad_norm": 0.3070048987865448, + "learning_rate": 2.745472115149547e-06, + "loss": 0.1175, + "step": 48467 + }, + { + "epoch": 0.8644811472193487, + "grad_norm": 0.2577265799045563, + "learning_rate": 2.744763002798459e-06, + "loss": 0.109, + "step": 48468 + }, + { + "epoch": 0.8644989833410623, + "grad_norm": 0.25178617238998413, + "learning_rate": 2.7440539767163485e-06, + "loss": 0.0519, + "step": 48469 + }, + { + "epoch": 0.864516819462776, + "grad_norm": 0.2439354807138443, + "learning_rate": 2.743345036905981e-06, + "loss": 0.1209, + "step": 48470 + }, + { + "epoch": 0.8645346555844897, + "grad_norm": 0.3122340440750122, + "learning_rate": 2.7426361833700927e-06, + "loss": 0.1789, + "step": 48471 + }, + { + "epoch": 0.8645524917062034, + "grad_norm": 0.21167029440402985, + "learning_rate": 2.741927416111434e-06, + "loss": 0.0635, + "step": 48472 + }, + { + "epoch": 0.8645703278279171, + "grad_norm": 0.2811305820941925, + "learning_rate": 2.74121873513275e-06, + "loss": 0.1061, + "step": 48473 + }, + { + "epoch": 0.8645881639496308, + "grad_norm": 0.355853408575058, + "learning_rate": 2.740510140436792e-06, + "loss": 0.1083, + "step": 48474 + }, + { + "epoch": 0.8646060000713445, + "grad_norm": 0.22062143683433533, + "learning_rate": 2.7398016320263063e-06, + "loss": 0.099, + "step": 48475 + }, + { + "epoch": 0.8646238361930582, + "grad_norm": 0.29981282353401184, + "learning_rate": 2.7390932099040395e-06, + "loss": 0.1235, + "step": 48476 + }, + { + "epoch": 0.8646416723147718, + "grad_norm": 0.2699138820171356, + "learning_rate": 2.738384874072733e-06, + "loss": 0.0822, + "step": 48477 + }, + { + "epoch": 0.8646595084364855, + "grad_norm": 0.27444812655448914, + "learning_rate": 2.737676624535132e-06, + "loss": 0.0957, + "step": 48478 + }, + { + "epoch": 0.8646773445581992, + "grad_norm": 0.24969972670078278, + "learning_rate": 2.7369684612939874e-06, + "loss": 0.1039, + "step": 48479 + }, + { + "epoch": 0.8646951806799129, + "grad_norm": 0.24319885671138763, + "learning_rate": 2.7362603843520437e-06, + "loss": 0.0857, + "step": 48480 + }, + { + "epoch": 0.8647130168016266, + "grad_norm": 0.24497871100902557, + "learning_rate": 2.7355523937120457e-06, + "loss": 0.1476, + "step": 48481 + }, + { + "epoch": 0.8647308529233404, + "grad_norm": 0.2588385045528412, + "learning_rate": 2.734844489376734e-06, + "loss": 0.1043, + "step": 48482 + }, + { + "epoch": 0.8647486890450541, + "grad_norm": 0.26025617122650146, + "learning_rate": 2.734136671348858e-06, + "loss": 0.0751, + "step": 48483 + }, + { + "epoch": 0.8647665251667678, + "grad_norm": 0.2640632688999176, + "learning_rate": 2.733428939631158e-06, + "loss": 0.0789, + "step": 48484 + }, + { + "epoch": 0.8647843612884815, + "grad_norm": 0.3763984739780426, + "learning_rate": 2.7327212942263815e-06, + "loss": 0.0394, + "step": 48485 + }, + { + "epoch": 0.8648021974101952, + "grad_norm": 0.4510432481765747, + "learning_rate": 2.7320137351372594e-06, + "loss": 0.128, + "step": 48486 + }, + { + "epoch": 0.8648200335319088, + "grad_norm": 0.24882939457893372, + "learning_rate": 2.731306262366548e-06, + "loss": 0.1178, + "step": 48487 + }, + { + "epoch": 0.8648378696536225, + "grad_norm": 0.2520524561405182, + "learning_rate": 2.730598875916987e-06, + "loss": 0.1034, + "step": 48488 + }, + { + "epoch": 0.8648557057753362, + "grad_norm": 0.2537497878074646, + "learning_rate": 2.729891575791316e-06, + "loss": 0.1232, + "step": 48489 + }, + { + "epoch": 0.8648735418970499, + "grad_norm": 0.2315804660320282, + "learning_rate": 2.729184361992276e-06, + "loss": 0.1028, + "step": 48490 + }, + { + "epoch": 0.8648913780187636, + "grad_norm": 0.2543641924858093, + "learning_rate": 2.7284772345226058e-06, + "loss": 0.1083, + "step": 48491 + }, + { + "epoch": 0.8649092141404773, + "grad_norm": 0.2539544105529785, + "learning_rate": 2.727770193385057e-06, + "loss": 0.1616, + "step": 48492 + }, + { + "epoch": 0.864927050262191, + "grad_norm": 0.2560049295425415, + "learning_rate": 2.727063238582356e-06, + "loss": 0.0908, + "step": 48493 + }, + { + "epoch": 0.8649448863839047, + "grad_norm": 0.2839914858341217, + "learning_rate": 2.726356370117256e-06, + "loss": 0.1434, + "step": 48494 + }, + { + "epoch": 0.8649627225056183, + "grad_norm": 0.31279420852661133, + "learning_rate": 2.725649587992485e-06, + "loss": 0.1245, + "step": 48495 + }, + { + "epoch": 0.864980558627332, + "grad_norm": 0.37675201892852783, + "learning_rate": 2.7249428922107965e-06, + "loss": 0.1395, + "step": 48496 + }, + { + "epoch": 0.8649983947490457, + "grad_norm": 0.41199633479118347, + "learning_rate": 2.7242362827749223e-06, + "loss": 0.1555, + "step": 48497 + }, + { + "epoch": 0.8650162308707595, + "grad_norm": 0.24773286283016205, + "learning_rate": 2.7235297596876043e-06, + "loss": 0.1554, + "step": 48498 + }, + { + "epoch": 0.8650340669924732, + "grad_norm": 0.22875894606113434, + "learning_rate": 2.7228233229515705e-06, + "loss": 0.1052, + "step": 48499 + }, + { + "epoch": 0.8650519031141869, + "grad_norm": 0.18755696713924408, + "learning_rate": 2.7221169725695745e-06, + "loss": 0.11, + "step": 48500 + }, + { + "epoch": 0.8650697392359006, + "grad_norm": 0.2680853605270386, + "learning_rate": 2.7214107085443476e-06, + "loss": 0.1193, + "step": 48501 + }, + { + "epoch": 0.8650875753576143, + "grad_norm": 0.34382468461990356, + "learning_rate": 2.7207045308786287e-06, + "loss": 0.104, + "step": 48502 + }, + { + "epoch": 0.865105411479328, + "grad_norm": 0.2973938584327698, + "learning_rate": 2.7199984395751525e-06, + "loss": 0.1129, + "step": 48503 + }, + { + "epoch": 0.8651232476010416, + "grad_norm": 0.23787064850330353, + "learning_rate": 2.7192924346366527e-06, + "loss": 0.1061, + "step": 48504 + }, + { + "epoch": 0.8651410837227553, + "grad_norm": 0.3175320029258728, + "learning_rate": 2.718586516065877e-06, + "loss": 0.1146, + "step": 48505 + }, + { + "epoch": 0.865158919844469, + "grad_norm": 0.31334465742111206, + "learning_rate": 2.7178806838655535e-06, + "loss": 0.1215, + "step": 48506 + }, + { + "epoch": 0.8651767559661827, + "grad_norm": 0.2948490083217621, + "learning_rate": 2.7171749380384167e-06, + "loss": 0.1569, + "step": 48507 + }, + { + "epoch": 0.8651945920878964, + "grad_norm": 0.28500592708587646, + "learning_rate": 2.7164692785872055e-06, + "loss": 0.11, + "step": 48508 + }, + { + "epoch": 0.8652124282096101, + "grad_norm": 0.2451358586549759, + "learning_rate": 2.715763705514662e-06, + "loss": 0.1261, + "step": 48509 + }, + { + "epoch": 0.8652302643313238, + "grad_norm": 0.3460720181465149, + "learning_rate": 2.715058218823516e-06, + "loss": 0.1697, + "step": 48510 + }, + { + "epoch": 0.8652481004530375, + "grad_norm": 0.3354368805885315, + "learning_rate": 2.7143528185164995e-06, + "loss": 0.1596, + "step": 48511 + }, + { + "epoch": 0.8652659365747511, + "grad_norm": 0.22452349960803986, + "learning_rate": 2.713647504596345e-06, + "loss": 0.1259, + "step": 48512 + }, + { + "epoch": 0.8652837726964648, + "grad_norm": 0.31383538246154785, + "learning_rate": 2.7129422770657944e-06, + "loss": 0.1157, + "step": 48513 + }, + { + "epoch": 0.8653016088181785, + "grad_norm": 0.236113503575325, + "learning_rate": 2.7122371359275754e-06, + "loss": 0.1056, + "step": 48514 + }, + { + "epoch": 0.8653194449398923, + "grad_norm": 0.22444918751716614, + "learning_rate": 2.7115320811844258e-06, + "loss": 0.0606, + "step": 48515 + }, + { + "epoch": 0.865337281061606, + "grad_norm": 0.29247623682022095, + "learning_rate": 2.710827112839076e-06, + "loss": 0.1104, + "step": 48516 + }, + { + "epoch": 0.8653551171833197, + "grad_norm": 0.2545079290866852, + "learning_rate": 2.710122230894252e-06, + "loss": 0.1136, + "step": 48517 + }, + { + "epoch": 0.8653729533050334, + "grad_norm": 0.32490989565849304, + "learning_rate": 2.7094174353527012e-06, + "loss": 0.0841, + "step": 48518 + }, + { + "epoch": 0.8653907894267471, + "grad_norm": 0.3200085163116455, + "learning_rate": 2.7087127262171443e-06, + "loss": 0.1315, + "step": 48519 + }, + { + "epoch": 0.8654086255484608, + "grad_norm": 0.2946043610572815, + "learning_rate": 2.7080081034903114e-06, + "loss": 0.1581, + "step": 48520 + }, + { + "epoch": 0.8654264616701745, + "grad_norm": 0.19989019632339478, + "learning_rate": 2.707303567174943e-06, + "loss": 0.0785, + "step": 48521 + }, + { + "epoch": 0.8654442977918881, + "grad_norm": 0.3084639012813568, + "learning_rate": 2.706599117273764e-06, + "loss": 0.1461, + "step": 48522 + }, + { + "epoch": 0.8654621339136018, + "grad_norm": 0.24902957677841187, + "learning_rate": 2.705894753789509e-06, + "loss": 0.0618, + "step": 48523 + }, + { + "epoch": 0.8654799700353155, + "grad_norm": 0.27955931425094604, + "learning_rate": 2.7051904767249058e-06, + "loss": 0.1022, + "step": 48524 + }, + { + "epoch": 0.8654978061570292, + "grad_norm": 0.26947861909866333, + "learning_rate": 2.7044862860826803e-06, + "loss": 0.0832, + "step": 48525 + }, + { + "epoch": 0.8655156422787429, + "grad_norm": 0.4156228005886078, + "learning_rate": 2.703782181865572e-06, + "loss": 0.1303, + "step": 48526 + }, + { + "epoch": 0.8655334784004566, + "grad_norm": 0.34134963154792786, + "learning_rate": 2.7030781640763033e-06, + "loss": 0.0961, + "step": 48527 + }, + { + "epoch": 0.8655513145221703, + "grad_norm": 0.28540530800819397, + "learning_rate": 2.7023742327176053e-06, + "loss": 0.1747, + "step": 48528 + }, + { + "epoch": 0.865569150643884, + "grad_norm": 0.2902222275733948, + "learning_rate": 2.701670387792207e-06, + "loss": 0.109, + "step": 48529 + }, + { + "epoch": 0.8655869867655976, + "grad_norm": 0.317419171333313, + "learning_rate": 2.7009666293028312e-06, + "loss": 0.1398, + "step": 48530 + }, + { + "epoch": 0.8656048228873113, + "grad_norm": 0.2997209131717682, + "learning_rate": 2.7002629572522142e-06, + "loss": 0.0891, + "step": 48531 + }, + { + "epoch": 0.8656226590090251, + "grad_norm": 0.26505598425865173, + "learning_rate": 2.699559371643082e-06, + "loss": 0.1258, + "step": 48532 + }, + { + "epoch": 0.8656404951307388, + "grad_norm": 0.30268096923828125, + "learning_rate": 2.698855872478159e-06, + "loss": 0.1225, + "step": 48533 + }, + { + "epoch": 0.8656583312524525, + "grad_norm": 0.2430025041103363, + "learning_rate": 2.698152459760167e-06, + "loss": 0.0908, + "step": 48534 + }, + { + "epoch": 0.8656761673741662, + "grad_norm": 0.22666838765144348, + "learning_rate": 2.697449133491839e-06, + "loss": 0.1618, + "step": 48535 + }, + { + "epoch": 0.8656940034958799, + "grad_norm": 0.2456955760717392, + "learning_rate": 2.696745893675909e-06, + "loss": 0.1154, + "step": 48536 + }, + { + "epoch": 0.8657118396175936, + "grad_norm": 0.4044748544692993, + "learning_rate": 2.696042740315094e-06, + "loss": 0.1663, + "step": 48537 + }, + { + "epoch": 0.8657296757393073, + "grad_norm": 0.3034094572067261, + "learning_rate": 2.6953396734121146e-06, + "loss": 0.0714, + "step": 48538 + }, + { + "epoch": 0.865747511861021, + "grad_norm": 0.2581062912940979, + "learning_rate": 2.694636692969707e-06, + "loss": 0.1104, + "step": 48539 + }, + { + "epoch": 0.8657653479827346, + "grad_norm": 0.20388630032539368, + "learning_rate": 2.6939337989905916e-06, + "loss": 0.1082, + "step": 48540 + }, + { + "epoch": 0.8657831841044483, + "grad_norm": 0.24164210259914398, + "learning_rate": 2.693230991477494e-06, + "loss": 0.0889, + "step": 48541 + }, + { + "epoch": 0.865801020226162, + "grad_norm": 0.24522261321544647, + "learning_rate": 2.6925282704331395e-06, + "loss": 0.1196, + "step": 48542 + }, + { + "epoch": 0.8658188563478757, + "grad_norm": 0.20962579548358917, + "learning_rate": 2.691825635860243e-06, + "loss": 0.0904, + "step": 48543 + }, + { + "epoch": 0.8658366924695894, + "grad_norm": 0.3322601020336151, + "learning_rate": 2.6911230877615406e-06, + "loss": 0.1619, + "step": 48544 + }, + { + "epoch": 0.8658545285913031, + "grad_norm": 0.27501949667930603, + "learning_rate": 2.69042062613975e-06, + "loss": 0.1422, + "step": 48545 + }, + { + "epoch": 0.8658723647130168, + "grad_norm": 0.43415945768356323, + "learning_rate": 2.6897182509975945e-06, + "loss": 0.156, + "step": 48546 + }, + { + "epoch": 0.8658902008347305, + "grad_norm": 0.25982412695884705, + "learning_rate": 2.689015962337793e-06, + "loss": 0.0784, + "step": 48547 + }, + { + "epoch": 0.8659080369564441, + "grad_norm": 0.2423533797264099, + "learning_rate": 2.6883137601630747e-06, + "loss": 0.0904, + "step": 48548 + }, + { + "epoch": 0.8659258730781579, + "grad_norm": 0.3680278956890106, + "learning_rate": 2.687611644476157e-06, + "loss": 0.0938, + "step": 48549 + }, + { + "epoch": 0.8659437091998716, + "grad_norm": 0.3058181405067444, + "learning_rate": 2.6869096152797647e-06, + "loss": 0.1178, + "step": 48550 + }, + { + "epoch": 0.8659615453215853, + "grad_norm": 0.33312928676605225, + "learning_rate": 2.6862076725766127e-06, + "loss": 0.1328, + "step": 48551 + }, + { + "epoch": 0.865979381443299, + "grad_norm": 0.3377118706703186, + "learning_rate": 2.6855058163694324e-06, + "loss": 0.1286, + "step": 48552 + }, + { + "epoch": 0.8659972175650127, + "grad_norm": 0.3017858564853668, + "learning_rate": 2.6848040466609383e-06, + "loss": 0.1582, + "step": 48553 + }, + { + "epoch": 0.8660150536867264, + "grad_norm": 0.27093982696533203, + "learning_rate": 2.6841023634538526e-06, + "loss": 0.0815, + "step": 48554 + }, + { + "epoch": 0.8660328898084401, + "grad_norm": 0.26729387044906616, + "learning_rate": 2.683400766750893e-06, + "loss": 0.0957, + "step": 48555 + }, + { + "epoch": 0.8660507259301538, + "grad_norm": 0.3356668949127197, + "learning_rate": 2.6826992565547744e-06, + "loss": 0.1247, + "step": 48556 + }, + { + "epoch": 0.8660685620518674, + "grad_norm": 0.24405401945114136, + "learning_rate": 2.681997832868227e-06, + "loss": 0.1384, + "step": 48557 + }, + { + "epoch": 0.8660863981735811, + "grad_norm": 0.27856531739234924, + "learning_rate": 2.6812964956939637e-06, + "loss": 0.1073, + "step": 48558 + }, + { + "epoch": 0.8661042342952948, + "grad_norm": 0.27748432755470276, + "learning_rate": 2.680595245034703e-06, + "loss": 0.1182, + "step": 48559 + }, + { + "epoch": 0.8661220704170085, + "grad_norm": 0.24369068443775177, + "learning_rate": 2.679894080893161e-06, + "loss": 0.0638, + "step": 48560 + }, + { + "epoch": 0.8661399065387222, + "grad_norm": 0.2648668587207794, + "learning_rate": 2.6791930032720626e-06, + "loss": 0.0687, + "step": 48561 + }, + { + "epoch": 0.8661577426604359, + "grad_norm": 0.3763103187084198, + "learning_rate": 2.678492012174116e-06, + "loss": 0.1584, + "step": 48562 + }, + { + "epoch": 0.8661755787821496, + "grad_norm": 0.31233611702919006, + "learning_rate": 2.677791107602051e-06, + "loss": 0.1383, + "step": 48563 + }, + { + "epoch": 0.8661934149038633, + "grad_norm": 0.24152925610542297, + "learning_rate": 2.6770902895585704e-06, + "loss": 0.1152, + "step": 48564 + }, + { + "epoch": 0.866211251025577, + "grad_norm": 0.22455281019210815, + "learning_rate": 2.676389558046405e-06, + "loss": 0.1299, + "step": 48565 + }, + { + "epoch": 0.8662290871472907, + "grad_norm": 0.29154765605926514, + "learning_rate": 2.675688913068264e-06, + "loss": 0.0844, + "step": 48566 + }, + { + "epoch": 0.8662469232690044, + "grad_norm": 0.2528809607028961, + "learning_rate": 2.6749883546268617e-06, + "loss": 0.0634, + "step": 48567 + }, + { + "epoch": 0.8662647593907181, + "grad_norm": 0.2875394821166992, + "learning_rate": 2.6742878827249156e-06, + "loss": 0.0757, + "step": 48568 + }, + { + "epoch": 0.8662825955124318, + "grad_norm": 0.34754088521003723, + "learning_rate": 2.6735874973651347e-06, + "loss": 0.1158, + "step": 48569 + }, + { + "epoch": 0.8663004316341455, + "grad_norm": 0.3402722477912903, + "learning_rate": 2.6728871985502472e-06, + "loss": 0.2099, + "step": 48570 + }, + { + "epoch": 0.8663182677558592, + "grad_norm": 0.5429072976112366, + "learning_rate": 2.6721869862829594e-06, + "loss": 0.1119, + "step": 48571 + }, + { + "epoch": 0.8663361038775729, + "grad_norm": 0.367783784866333, + "learning_rate": 2.671486860565986e-06, + "loss": 0.1955, + "step": 48572 + }, + { + "epoch": 0.8663539399992866, + "grad_norm": 0.2709261178970337, + "learning_rate": 2.670786821402038e-06, + "loss": 0.0964, + "step": 48573 + }, + { + "epoch": 0.8663717761210002, + "grad_norm": 0.2615801990032196, + "learning_rate": 2.670086868793836e-06, + "loss": 0.0738, + "step": 48574 + }, + { + "epoch": 0.8663896122427139, + "grad_norm": 0.3666120767593384, + "learning_rate": 2.669387002744092e-06, + "loss": 0.1188, + "step": 48575 + }, + { + "epoch": 0.8664074483644276, + "grad_norm": 0.21841605007648468, + "learning_rate": 2.6686872232555086e-06, + "loss": 0.091, + "step": 48576 + }, + { + "epoch": 0.8664252844861413, + "grad_norm": 0.3070674538612366, + "learning_rate": 2.667987530330815e-06, + "loss": 0.1493, + "step": 48577 + }, + { + "epoch": 0.866443120607855, + "grad_norm": 0.261738121509552, + "learning_rate": 2.6672879239727083e-06, + "loss": 0.1323, + "step": 48578 + }, + { + "epoch": 0.8664609567295687, + "grad_norm": 0.31118008494377136, + "learning_rate": 2.666588404183912e-06, + "loss": 0.0707, + "step": 48579 + }, + { + "epoch": 0.8664787928512824, + "grad_norm": 0.23535367846488953, + "learning_rate": 2.6658889709671314e-06, + "loss": 0.1061, + "step": 48580 + }, + { + "epoch": 0.8664966289729961, + "grad_norm": 0.19728673994541168, + "learning_rate": 2.665189624325082e-06, + "loss": 0.0942, + "step": 48581 + }, + { + "epoch": 0.8665144650947098, + "grad_norm": 0.33237403631210327, + "learning_rate": 2.6644903642604633e-06, + "loss": 0.1288, + "step": 48582 + }, + { + "epoch": 0.8665323012164236, + "grad_norm": 0.3385293781757355, + "learning_rate": 2.6637911907760023e-06, + "loss": 0.164, + "step": 48583 + }, + { + "epoch": 0.8665501373381372, + "grad_norm": 0.28505703806877136, + "learning_rate": 2.6630921038744012e-06, + "loss": 0.1322, + "step": 48584 + }, + { + "epoch": 0.8665679734598509, + "grad_norm": 0.30644890666007996, + "learning_rate": 2.6623931035583694e-06, + "loss": 0.1292, + "step": 48585 + }, + { + "epoch": 0.8665858095815646, + "grad_norm": 0.3161781132221222, + "learning_rate": 2.66169418983061e-06, + "loss": 0.1165, + "step": 48586 + }, + { + "epoch": 0.8666036457032783, + "grad_norm": 0.299847811460495, + "learning_rate": 2.6609953626938465e-06, + "loss": 0.1066, + "step": 48587 + }, + { + "epoch": 0.866621481824992, + "grad_norm": 0.35688701272010803, + "learning_rate": 2.660296622150782e-06, + "loss": 0.0563, + "step": 48588 + }, + { + "epoch": 0.8666393179467057, + "grad_norm": 0.1942257136106491, + "learning_rate": 2.6595979682041165e-06, + "loss": 0.1075, + "step": 48589 + }, + { + "epoch": 0.8666571540684194, + "grad_norm": 0.26867055892944336, + "learning_rate": 2.658899400856571e-06, + "loss": 0.1151, + "step": 48590 + }, + { + "epoch": 0.866674990190133, + "grad_norm": 0.353449285030365, + "learning_rate": 2.6582009201108425e-06, + "loss": 0.1168, + "step": 48591 + }, + { + "epoch": 0.8666928263118467, + "grad_norm": 0.3380625545978546, + "learning_rate": 2.657502525969649e-06, + "loss": 0.1242, + "step": 48592 + }, + { + "epoch": 0.8667106624335604, + "grad_norm": 0.3841957151889801, + "learning_rate": 2.6568042184356934e-06, + "loss": 0.1064, + "step": 48593 + }, + { + "epoch": 0.8667284985552741, + "grad_norm": 0.22774510085582733, + "learning_rate": 2.6561059975116794e-06, + "loss": 0.1059, + "step": 48594 + }, + { + "epoch": 0.8667463346769878, + "grad_norm": 0.27195748686790466, + "learning_rate": 2.6554078632003126e-06, + "loss": 0.078, + "step": 48595 + }, + { + "epoch": 0.8667641707987015, + "grad_norm": 0.21842284500598907, + "learning_rate": 2.654709815504308e-06, + "loss": 0.086, + "step": 48596 + }, + { + "epoch": 0.8667820069204152, + "grad_norm": 0.2673019766807556, + "learning_rate": 2.6540118544263667e-06, + "loss": 0.0693, + "step": 48597 + }, + { + "epoch": 0.8667998430421289, + "grad_norm": 0.2516014277935028, + "learning_rate": 2.653313979969191e-06, + "loss": 0.0835, + "step": 48598 + }, + { + "epoch": 0.8668176791638427, + "grad_norm": 0.24400992691516876, + "learning_rate": 2.6526161921354847e-06, + "loss": 0.1013, + "step": 48599 + }, + { + "epoch": 0.8668355152855564, + "grad_norm": 0.3259427547454834, + "learning_rate": 2.6519184909279626e-06, + "loss": 0.1158, + "step": 48600 + }, + { + "epoch": 0.86685335140727, + "grad_norm": 0.4715590476989746, + "learning_rate": 2.6512208763493194e-06, + "loss": 0.0897, + "step": 48601 + }, + { + "epoch": 0.8668711875289837, + "grad_norm": 0.2877531945705414, + "learning_rate": 2.6505233484022672e-06, + "loss": 0.1246, + "step": 48602 + }, + { + "epoch": 0.8668890236506974, + "grad_norm": 0.30854853987693787, + "learning_rate": 2.649825907089501e-06, + "loss": 0.1108, + "step": 48603 + }, + { + "epoch": 0.8669068597724111, + "grad_norm": 0.2034696489572525, + "learning_rate": 2.649128552413732e-06, + "loss": 0.0514, + "step": 48604 + }, + { + "epoch": 0.8669246958941248, + "grad_norm": 0.30569523572921753, + "learning_rate": 2.648431284377656e-06, + "loss": 0.0796, + "step": 48605 + }, + { + "epoch": 0.8669425320158385, + "grad_norm": 0.1643456369638443, + "learning_rate": 2.647734102983984e-06, + "loss": 0.0634, + "step": 48606 + }, + { + "epoch": 0.8669603681375522, + "grad_norm": 0.5523666143417358, + "learning_rate": 2.647037008235412e-06, + "loss": 0.1585, + "step": 48607 + }, + { + "epoch": 0.8669782042592659, + "grad_norm": 0.2524758577346802, + "learning_rate": 2.646340000134648e-06, + "loss": 0.1074, + "step": 48608 + }, + { + "epoch": 0.8669960403809795, + "grad_norm": 0.27780383825302124, + "learning_rate": 2.6456430786843928e-06, + "loss": 0.0689, + "step": 48609 + }, + { + "epoch": 0.8670138765026932, + "grad_norm": 0.2856437563896179, + "learning_rate": 2.6449462438873445e-06, + "loss": 0.1066, + "step": 48610 + }, + { + "epoch": 0.8670317126244069, + "grad_norm": 0.31625935435295105, + "learning_rate": 2.6442494957462065e-06, + "loss": 0.1075, + "step": 48611 + }, + { + "epoch": 0.8670495487461206, + "grad_norm": 0.3473754823207855, + "learning_rate": 2.643552834263674e-06, + "loss": 0.1585, + "step": 48612 + }, + { + "epoch": 0.8670673848678343, + "grad_norm": 0.28323686122894287, + "learning_rate": 2.6428562594424582e-06, + "loss": 0.0954, + "step": 48613 + }, + { + "epoch": 0.867085220989548, + "grad_norm": 0.23895299434661865, + "learning_rate": 2.642159771285252e-06, + "loss": 0.1284, + "step": 48614 + }, + { + "epoch": 0.8671030571112617, + "grad_norm": 0.2907993793487549, + "learning_rate": 2.6414633697947583e-06, + "loss": 0.1617, + "step": 48615 + }, + { + "epoch": 0.8671208932329755, + "grad_norm": 0.3060983717441559, + "learning_rate": 2.6407670549736698e-06, + "loss": 0.1099, + "step": 48616 + }, + { + "epoch": 0.8671387293546892, + "grad_norm": 0.2411821186542511, + "learning_rate": 2.6400708268246922e-06, + "loss": 0.1476, + "step": 48617 + }, + { + "epoch": 0.8671565654764029, + "grad_norm": 0.2727629244327545, + "learning_rate": 2.639374685350521e-06, + "loss": 0.1182, + "step": 48618 + }, + { + "epoch": 0.8671744015981165, + "grad_norm": 0.28775155544281006, + "learning_rate": 2.6386786305538617e-06, + "loss": 0.0816, + "step": 48619 + }, + { + "epoch": 0.8671922377198302, + "grad_norm": 0.2429989129304886, + "learning_rate": 2.6379826624374023e-06, + "loss": 0.1054, + "step": 48620 + }, + { + "epoch": 0.8672100738415439, + "grad_norm": 0.27986016869544983, + "learning_rate": 2.6372867810038506e-06, + "loss": 0.112, + "step": 48621 + }, + { + "epoch": 0.8672279099632576, + "grad_norm": 0.28324660658836365, + "learning_rate": 2.6365909862558992e-06, + "loss": 0.0941, + "step": 48622 + }, + { + "epoch": 0.8672457460849713, + "grad_norm": 0.2582818567752838, + "learning_rate": 2.635895278196243e-06, + "loss": 0.076, + "step": 48623 + }, + { + "epoch": 0.867263582206685, + "grad_norm": 0.35883527994155884, + "learning_rate": 2.635199656827583e-06, + "loss": 0.1522, + "step": 48624 + }, + { + "epoch": 0.8672814183283987, + "grad_norm": 0.3026193082332611, + "learning_rate": 2.634504122152609e-06, + "loss": 0.0766, + "step": 48625 + }, + { + "epoch": 0.8672992544501124, + "grad_norm": 0.2771889567375183, + "learning_rate": 2.6338086741740263e-06, + "loss": 0.0903, + "step": 48626 + }, + { + "epoch": 0.867317090571826, + "grad_norm": 0.2743991017341614, + "learning_rate": 2.633113312894528e-06, + "loss": 0.1474, + "step": 48627 + }, + { + "epoch": 0.8673349266935397, + "grad_norm": 0.24703501164913177, + "learning_rate": 2.6324180383168056e-06, + "loss": 0.1242, + "step": 48628 + }, + { + "epoch": 0.8673527628152534, + "grad_norm": 0.2984876036643982, + "learning_rate": 2.631722850443552e-06, + "loss": 0.1244, + "step": 48629 + }, + { + "epoch": 0.8673705989369671, + "grad_norm": 0.30970442295074463, + "learning_rate": 2.6310277492774703e-06, + "loss": 0.1398, + "step": 48630 + }, + { + "epoch": 0.8673884350586808, + "grad_norm": 0.25778430700302124, + "learning_rate": 2.630332734821253e-06, + "loss": 0.1032, + "step": 48631 + }, + { + "epoch": 0.8674062711803945, + "grad_norm": 0.2707928717136383, + "learning_rate": 2.6296378070775868e-06, + "loss": 0.0858, + "step": 48632 + }, + { + "epoch": 0.8674241073021083, + "grad_norm": 0.2565414011478424, + "learning_rate": 2.628942966049169e-06, + "loss": 0.0576, + "step": 48633 + }, + { + "epoch": 0.867441943423822, + "grad_norm": 0.3035796284675598, + "learning_rate": 2.6282482117387014e-06, + "loss": 0.1018, + "step": 48634 + }, + { + "epoch": 0.8674597795455357, + "grad_norm": 0.26896676421165466, + "learning_rate": 2.6275535441488725e-06, + "loss": 0.1079, + "step": 48635 + }, + { + "epoch": 0.8674776156672493, + "grad_norm": 0.29652872681617737, + "learning_rate": 2.626858963282369e-06, + "loss": 0.1392, + "step": 48636 + }, + { + "epoch": 0.867495451788963, + "grad_norm": 0.23057928681373596, + "learning_rate": 2.6261644691418925e-06, + "loss": 0.1115, + "step": 48637 + }, + { + "epoch": 0.8675132879106767, + "grad_norm": 0.23853172361850739, + "learning_rate": 2.62547006173012e-06, + "loss": 0.1052, + "step": 48638 + }, + { + "epoch": 0.8675311240323904, + "grad_norm": 0.2926994264125824, + "learning_rate": 2.624775741049762e-06, + "loss": 0.1339, + "step": 48639 + }, + { + "epoch": 0.8675489601541041, + "grad_norm": 0.38823041319847107, + "learning_rate": 2.6240815071035014e-06, + "loss": 0.1565, + "step": 48640 + }, + { + "epoch": 0.8675667962758178, + "grad_norm": 0.30450350046157837, + "learning_rate": 2.6233873598940278e-06, + "loss": 0.1304, + "step": 48641 + }, + { + "epoch": 0.8675846323975315, + "grad_norm": 0.22317253053188324, + "learning_rate": 2.622693299424031e-06, + "loss": 0.0638, + "step": 48642 + }, + { + "epoch": 0.8676024685192452, + "grad_norm": 0.39300301671028137, + "learning_rate": 2.621999325696206e-06, + "loss": 0.1083, + "step": 48643 + }, + { + "epoch": 0.8676203046409589, + "grad_norm": 0.23108522593975067, + "learning_rate": 2.6213054387132422e-06, + "loss": 0.0876, + "step": 48644 + }, + { + "epoch": 0.8676381407626725, + "grad_norm": 0.2547154128551483, + "learning_rate": 2.620611638477821e-06, + "loss": 0.1305, + "step": 48645 + }, + { + "epoch": 0.8676559768843862, + "grad_norm": 0.3216720223426819, + "learning_rate": 2.619917924992646e-06, + "loss": 0.1142, + "step": 48646 + }, + { + "epoch": 0.8676738130060999, + "grad_norm": 0.3700425624847412, + "learning_rate": 2.619224298260392e-06, + "loss": 0.1144, + "step": 48647 + }, + { + "epoch": 0.8676916491278136, + "grad_norm": 0.3872331976890564, + "learning_rate": 2.6185307582837603e-06, + "loss": 0.1414, + "step": 48648 + }, + { + "epoch": 0.8677094852495273, + "grad_norm": 0.2810850143432617, + "learning_rate": 2.617837305065435e-06, + "loss": 0.1126, + "step": 48649 + }, + { + "epoch": 0.8677273213712411, + "grad_norm": 0.2455441951751709, + "learning_rate": 2.617143938608102e-06, + "loss": 0.1109, + "step": 48650 + }, + { + "epoch": 0.8677451574929548, + "grad_norm": 0.33465322852134705, + "learning_rate": 2.6164506589144434e-06, + "loss": 0.1077, + "step": 48651 + }, + { + "epoch": 0.8677629936146685, + "grad_norm": 0.21493053436279297, + "learning_rate": 2.615757465987159e-06, + "loss": 0.0911, + "step": 48652 + }, + { + "epoch": 0.8677808297363822, + "grad_norm": 0.2472672164440155, + "learning_rate": 2.6150643598289307e-06, + "loss": 0.1115, + "step": 48653 + }, + { + "epoch": 0.8677986658580958, + "grad_norm": 0.3944537937641144, + "learning_rate": 2.6143713404424452e-06, + "loss": 0.1455, + "step": 48654 + }, + { + "epoch": 0.8678165019798095, + "grad_norm": 0.28953656554222107, + "learning_rate": 2.6136784078303834e-06, + "loss": 0.1138, + "step": 48655 + }, + { + "epoch": 0.8678343381015232, + "grad_norm": 0.21833036839962006, + "learning_rate": 2.6129855619954376e-06, + "loss": 0.0964, + "step": 48656 + }, + { + "epoch": 0.8678521742232369, + "grad_norm": 0.29130542278289795, + "learning_rate": 2.612292802940294e-06, + "loss": 0.1072, + "step": 48657 + }, + { + "epoch": 0.8678700103449506, + "grad_norm": 0.26437899470329285, + "learning_rate": 2.611600130667638e-06, + "loss": 0.1228, + "step": 48658 + }, + { + "epoch": 0.8678878464666643, + "grad_norm": 0.292957067489624, + "learning_rate": 2.610907545180144e-06, + "loss": 0.1086, + "step": 48659 + }, + { + "epoch": 0.867905682588378, + "grad_norm": 0.6413722634315491, + "learning_rate": 2.610215046480513e-06, + "loss": 0.1327, + "step": 48660 + }, + { + "epoch": 0.8679235187100917, + "grad_norm": 0.313829630613327, + "learning_rate": 2.609522634571415e-06, + "loss": 0.126, + "step": 48661 + }, + { + "epoch": 0.8679413548318053, + "grad_norm": 0.2830031216144562, + "learning_rate": 2.6088303094555454e-06, + "loss": 0.1269, + "step": 48662 + }, + { + "epoch": 0.867959190953519, + "grad_norm": 0.2789459824562073, + "learning_rate": 2.608138071135585e-06, + "loss": 0.1007, + "step": 48663 + }, + { + "epoch": 0.8679770270752327, + "grad_norm": 0.24733521044254303, + "learning_rate": 2.60744591961421e-06, + "loss": 0.0625, + "step": 48664 + }, + { + "epoch": 0.8679948631969464, + "grad_norm": 0.3072355091571808, + "learning_rate": 2.6067538548941117e-06, + "loss": 0.0879, + "step": 48665 + }, + { + "epoch": 0.8680126993186601, + "grad_norm": 0.2519523799419403, + "learning_rate": 2.606061876977972e-06, + "loss": 0.121, + "step": 48666 + }, + { + "epoch": 0.8680305354403739, + "grad_norm": 0.3460981845855713, + "learning_rate": 2.605369985868472e-06, + "loss": 0.1553, + "step": 48667 + }, + { + "epoch": 0.8680483715620876, + "grad_norm": 0.2794781029224396, + "learning_rate": 2.6046781815682847e-06, + "loss": 0.1131, + "step": 48668 + }, + { + "epoch": 0.8680662076838013, + "grad_norm": 0.23102281987667084, + "learning_rate": 2.6039864640801077e-06, + "loss": 0.0947, + "step": 48669 + }, + { + "epoch": 0.868084043805515, + "grad_norm": 0.3197089433670044, + "learning_rate": 2.603294833406614e-06, + "loss": 0.1272, + "step": 48670 + }, + { + "epoch": 0.8681018799272286, + "grad_norm": 0.27525627613067627, + "learning_rate": 2.6026032895504815e-06, + "loss": 0.1501, + "step": 48671 + }, + { + "epoch": 0.8681197160489423, + "grad_norm": 0.2974686324596405, + "learning_rate": 2.601911832514395e-06, + "loss": 0.0969, + "step": 48672 + }, + { + "epoch": 0.868137552170656, + "grad_norm": 0.2813066840171814, + "learning_rate": 2.601220462301035e-06, + "loss": 0.1074, + "step": 48673 + }, + { + "epoch": 0.8681553882923697, + "grad_norm": 0.30259424448013306, + "learning_rate": 2.6005291789130775e-06, + "loss": 0.1514, + "step": 48674 + }, + { + "epoch": 0.8681732244140834, + "grad_norm": 0.2650390863418579, + "learning_rate": 2.5998379823532094e-06, + "loss": 0.0979, + "step": 48675 + }, + { + "epoch": 0.8681910605357971, + "grad_norm": 0.31853342056274414, + "learning_rate": 2.5991468726241087e-06, + "loss": 0.1149, + "step": 48676 + }, + { + "epoch": 0.8682088966575108, + "grad_norm": 0.2759436368942261, + "learning_rate": 2.598455849728443e-06, + "loss": 0.077, + "step": 48677 + }, + { + "epoch": 0.8682267327792245, + "grad_norm": 0.2520129084587097, + "learning_rate": 2.597764913668907e-06, + "loss": 0.1098, + "step": 48678 + }, + { + "epoch": 0.8682445689009382, + "grad_norm": 0.30461743474006653, + "learning_rate": 2.597074064448171e-06, + "loss": 0.0924, + "step": 48679 + }, + { + "epoch": 0.8682624050226518, + "grad_norm": 0.2827073335647583, + "learning_rate": 2.5963833020689138e-06, + "loss": 0.1288, + "step": 48680 + }, + { + "epoch": 0.8682802411443655, + "grad_norm": 0.2870013117790222, + "learning_rate": 2.595692626533808e-06, + "loss": 0.1142, + "step": 48681 + }, + { + "epoch": 0.8682980772660792, + "grad_norm": 0.2983800768852234, + "learning_rate": 2.5950020378455424e-06, + "loss": 0.1286, + "step": 48682 + }, + { + "epoch": 0.8683159133877929, + "grad_norm": 0.20682698488235474, + "learning_rate": 2.5943115360067854e-06, + "loss": 0.106, + "step": 48683 + }, + { + "epoch": 0.8683337495095067, + "grad_norm": 0.2710472047328949, + "learning_rate": 2.5936211210202148e-06, + "loss": 0.1159, + "step": 48684 + }, + { + "epoch": 0.8683515856312204, + "grad_norm": 0.2740075886249542, + "learning_rate": 2.5929307928885038e-06, + "loss": 0.0875, + "step": 48685 + }, + { + "epoch": 0.8683694217529341, + "grad_norm": 0.2933413088321686, + "learning_rate": 2.5922405516143362e-06, + "loss": 0.0795, + "step": 48686 + }, + { + "epoch": 0.8683872578746478, + "grad_norm": 0.24890920519828796, + "learning_rate": 2.5915503972003845e-06, + "loss": 0.0933, + "step": 48687 + }, + { + "epoch": 0.8684050939963615, + "grad_norm": 0.23078541457653046, + "learning_rate": 2.5908603296493194e-06, + "loss": 0.1557, + "step": 48688 + }, + { + "epoch": 0.8684229301180751, + "grad_norm": 0.3492889702320099, + "learning_rate": 2.5901703489638246e-06, + "loss": 0.0919, + "step": 48689 + }, + { + "epoch": 0.8684407662397888, + "grad_norm": 0.2752855122089386, + "learning_rate": 2.589480455146562e-06, + "loss": 0.1118, + "step": 48690 + }, + { + "epoch": 0.8684586023615025, + "grad_norm": 0.3009353280067444, + "learning_rate": 2.5887906482002204e-06, + "loss": 0.117, + "step": 48691 + }, + { + "epoch": 0.8684764384832162, + "grad_norm": 0.30130213499069214, + "learning_rate": 2.5881009281274656e-06, + "loss": 0.0712, + "step": 48692 + }, + { + "epoch": 0.8684942746049299, + "grad_norm": 0.26302117109298706, + "learning_rate": 2.587411294930972e-06, + "loss": 0.0986, + "step": 48693 + }, + { + "epoch": 0.8685121107266436, + "grad_norm": 0.31603842973709106, + "learning_rate": 2.5867217486134105e-06, + "loss": 0.1128, + "step": 48694 + }, + { + "epoch": 0.8685299468483573, + "grad_norm": 0.22961276769638062, + "learning_rate": 2.586032289177459e-06, + "loss": 0.0776, + "step": 48695 + }, + { + "epoch": 0.868547782970071, + "grad_norm": 0.24750012159347534, + "learning_rate": 2.5853429166257908e-06, + "loss": 0.1123, + "step": 48696 + }, + { + "epoch": 0.8685656190917846, + "grad_norm": 0.2424774020910263, + "learning_rate": 2.584653630961073e-06, + "loss": 0.0904, + "step": 48697 + }, + { + "epoch": 0.8685834552134983, + "grad_norm": 0.22761297225952148, + "learning_rate": 2.5839644321859757e-06, + "loss": 0.102, + "step": 48698 + }, + { + "epoch": 0.868601291335212, + "grad_norm": 0.31616827845573425, + "learning_rate": 2.58327532030318e-06, + "loss": 0.1356, + "step": 48699 + }, + { + "epoch": 0.8686191274569258, + "grad_norm": 0.4137493669986725, + "learning_rate": 2.582586295315351e-06, + "loss": 0.1182, + "step": 48700 + }, + { + "epoch": 0.8686369635786395, + "grad_norm": 0.30806007981300354, + "learning_rate": 2.5818973572251548e-06, + "loss": 0.0981, + "step": 48701 + }, + { + "epoch": 0.8686547997003532, + "grad_norm": 0.25863999128341675, + "learning_rate": 2.5812085060352735e-06, + "loss": 0.1276, + "step": 48702 + }, + { + "epoch": 0.8686726358220669, + "grad_norm": 0.2708914875984192, + "learning_rate": 2.5805197417483664e-06, + "loss": 0.1072, + "step": 48703 + }, + { + "epoch": 0.8686904719437806, + "grad_norm": 0.28094902634620667, + "learning_rate": 2.5798310643671137e-06, + "loss": 0.138, + "step": 48704 + }, + { + "epoch": 0.8687083080654943, + "grad_norm": 0.22386638820171356, + "learning_rate": 2.5791424738941778e-06, + "loss": 0.0961, + "step": 48705 + }, + { + "epoch": 0.868726144187208, + "grad_norm": 0.3509886860847473, + "learning_rate": 2.5784539703322284e-06, + "loss": 0.0926, + "step": 48706 + }, + { + "epoch": 0.8687439803089216, + "grad_norm": 0.35328739881515503, + "learning_rate": 2.5777655536839334e-06, + "loss": 0.1481, + "step": 48707 + }, + { + "epoch": 0.8687618164306353, + "grad_norm": 0.2598971426486969, + "learning_rate": 2.577077223951968e-06, + "loss": 0.1364, + "step": 48708 + }, + { + "epoch": 0.868779652552349, + "grad_norm": 0.3480691611766815, + "learning_rate": 2.576388981138994e-06, + "loss": 0.1422, + "step": 48709 + }, + { + "epoch": 0.8687974886740627, + "grad_norm": 0.30219706892967224, + "learning_rate": 2.575700825247682e-06, + "loss": 0.142, + "step": 48710 + }, + { + "epoch": 0.8688153247957764, + "grad_norm": 0.21827824413776398, + "learning_rate": 2.5750127562806953e-06, + "loss": 0.0651, + "step": 48711 + }, + { + "epoch": 0.8688331609174901, + "grad_norm": 0.17950038611888885, + "learning_rate": 2.5743247742407076e-06, + "loss": 0.0732, + "step": 48712 + }, + { + "epoch": 0.8688509970392038, + "grad_norm": 0.27343860268592834, + "learning_rate": 2.573636879130384e-06, + "loss": 0.1475, + "step": 48713 + }, + { + "epoch": 0.8688688331609175, + "grad_norm": 0.27780279517173767, + "learning_rate": 2.572949070952388e-06, + "loss": 0.0583, + "step": 48714 + }, + { + "epoch": 0.8688866692826311, + "grad_norm": 0.32142001390457153, + "learning_rate": 2.5722613497093844e-06, + "loss": 0.1337, + "step": 48715 + }, + { + "epoch": 0.8689045054043448, + "grad_norm": 0.1972419023513794, + "learning_rate": 2.5715737154040408e-06, + "loss": 0.1249, + "step": 48716 + }, + { + "epoch": 0.8689223415260586, + "grad_norm": 0.24779509007930756, + "learning_rate": 2.57088616803903e-06, + "loss": 0.1288, + "step": 48717 + }, + { + "epoch": 0.8689401776477723, + "grad_norm": 0.251101016998291, + "learning_rate": 2.570198707617011e-06, + "loss": 0.091, + "step": 48718 + }, + { + "epoch": 0.868958013769486, + "grad_norm": 0.21175187826156616, + "learning_rate": 2.569511334140648e-06, + "loss": 0.0849, + "step": 48719 + }, + { + "epoch": 0.8689758498911997, + "grad_norm": 0.3272428512573242, + "learning_rate": 2.568824047612603e-06, + "loss": 0.1007, + "step": 48720 + }, + { + "epoch": 0.8689936860129134, + "grad_norm": 0.23609288036823273, + "learning_rate": 2.568136848035546e-06, + "loss": 0.1011, + "step": 48721 + }, + { + "epoch": 0.8690115221346271, + "grad_norm": 0.3013632893562317, + "learning_rate": 2.567449735412139e-06, + "loss": 0.162, + "step": 48722 + }, + { + "epoch": 0.8690293582563408, + "grad_norm": 0.281926691532135, + "learning_rate": 2.5667627097450462e-06, + "loss": 0.1039, + "step": 48723 + }, + { + "epoch": 0.8690471943780544, + "grad_norm": 0.16842341423034668, + "learning_rate": 2.5660757710369245e-06, + "loss": 0.094, + "step": 48724 + }, + { + "epoch": 0.8690650304997681, + "grad_norm": 0.28846049308776855, + "learning_rate": 2.5653889192904427e-06, + "loss": 0.1213, + "step": 48725 + }, + { + "epoch": 0.8690828666214818, + "grad_norm": 0.2769353985786438, + "learning_rate": 2.5647021545082665e-06, + "loss": 0.1366, + "step": 48726 + }, + { + "epoch": 0.8691007027431955, + "grad_norm": 0.2867237329483032, + "learning_rate": 2.564015476693052e-06, + "loss": 0.0833, + "step": 48727 + }, + { + "epoch": 0.8691185388649092, + "grad_norm": 0.3349776566028595, + "learning_rate": 2.5633288858474575e-06, + "loss": 0.1565, + "step": 48728 + }, + { + "epoch": 0.8691363749866229, + "grad_norm": 0.3175070285797119, + "learning_rate": 2.5626423819741537e-06, + "loss": 0.141, + "step": 48729 + }, + { + "epoch": 0.8691542111083366, + "grad_norm": 0.2893778085708618, + "learning_rate": 2.5619559650757934e-06, + "loss": 0.0919, + "step": 48730 + }, + { + "epoch": 0.8691720472300503, + "grad_norm": 0.2615737318992615, + "learning_rate": 2.5612696351550473e-06, + "loss": 0.0889, + "step": 48731 + }, + { + "epoch": 0.869189883351764, + "grad_norm": 0.23890773952007294, + "learning_rate": 2.5605833922145716e-06, + "loss": 0.1038, + "step": 48732 + }, + { + "epoch": 0.8692077194734776, + "grad_norm": 0.31569021940231323, + "learning_rate": 2.559897236257017e-06, + "loss": 0.1821, + "step": 48733 + }, + { + "epoch": 0.8692255555951914, + "grad_norm": 0.3673979640007019, + "learning_rate": 2.5592111672850583e-06, + "loss": 0.1395, + "step": 48734 + }, + { + "epoch": 0.8692433917169051, + "grad_norm": 0.238789364695549, + "learning_rate": 2.558525185301347e-06, + "loss": 0.0862, + "step": 48735 + }, + { + "epoch": 0.8692612278386188, + "grad_norm": 0.3171417713165283, + "learning_rate": 2.557839290308542e-06, + "loss": 0.1062, + "step": 48736 + }, + { + "epoch": 0.8692790639603325, + "grad_norm": 0.30866560339927673, + "learning_rate": 2.557153482309299e-06, + "loss": 0.0986, + "step": 48737 + }, + { + "epoch": 0.8692969000820462, + "grad_norm": 0.30295780301094055, + "learning_rate": 2.5564677613062886e-06, + "loss": 0.1479, + "step": 48738 + }, + { + "epoch": 0.8693147362037599, + "grad_norm": 0.30899497866630554, + "learning_rate": 2.555782127302156e-06, + "loss": 0.1288, + "step": 48739 + }, + { + "epoch": 0.8693325723254736, + "grad_norm": 0.23120485246181488, + "learning_rate": 2.555096580299568e-06, + "loss": 0.1183, + "step": 48740 + }, + { + "epoch": 0.8693504084471873, + "grad_norm": 0.27584919333457947, + "learning_rate": 2.5544111203011754e-06, + "loss": 0.134, + "step": 48741 + }, + { + "epoch": 0.8693682445689009, + "grad_norm": 0.1963358372449875, + "learning_rate": 2.553725747309632e-06, + "loss": 0.1086, + "step": 48742 + }, + { + "epoch": 0.8693860806906146, + "grad_norm": 0.29744452238082886, + "learning_rate": 2.553040461327602e-06, + "loss": 0.1038, + "step": 48743 + }, + { + "epoch": 0.8694039168123283, + "grad_norm": 0.2964674234390259, + "learning_rate": 2.5523552623577474e-06, + "loss": 0.1134, + "step": 48744 + }, + { + "epoch": 0.869421752934042, + "grad_norm": 0.541043221950531, + "learning_rate": 2.551670150402713e-06, + "loss": 0.1316, + "step": 48745 + }, + { + "epoch": 0.8694395890557557, + "grad_norm": 0.30327659845352173, + "learning_rate": 2.5509851254651553e-06, + "loss": 0.1515, + "step": 48746 + }, + { + "epoch": 0.8694574251774694, + "grad_norm": 0.2531784772872925, + "learning_rate": 2.5503001875477384e-06, + "loss": 0.1132, + "step": 48747 + }, + { + "epoch": 0.8694752612991831, + "grad_norm": 0.29986971616744995, + "learning_rate": 2.5496153366531105e-06, + "loss": 0.12, + "step": 48748 + }, + { + "epoch": 0.8694930974208968, + "grad_norm": 0.3939153552055359, + "learning_rate": 2.548930572783928e-06, + "loss": 0.1373, + "step": 48749 + }, + { + "epoch": 0.8695109335426104, + "grad_norm": 0.34003064036369324, + "learning_rate": 2.5482458959428385e-06, + "loss": 0.1011, + "step": 48750 + }, + { + "epoch": 0.8695287696643242, + "grad_norm": 0.23042713105678558, + "learning_rate": 2.5475613061325094e-06, + "loss": 0.1468, + "step": 48751 + }, + { + "epoch": 0.8695466057860379, + "grad_norm": 0.2887931168079376, + "learning_rate": 2.5468768033555886e-06, + "loss": 0.1072, + "step": 48752 + }, + { + "epoch": 0.8695644419077516, + "grad_norm": 0.31251659989356995, + "learning_rate": 2.5461923876147264e-06, + "loss": 0.0777, + "step": 48753 + }, + { + "epoch": 0.8695822780294653, + "grad_norm": 0.29049623012542725, + "learning_rate": 2.545508058912577e-06, + "loss": 0.0735, + "step": 48754 + }, + { + "epoch": 0.869600114151179, + "grad_norm": 0.359990656375885, + "learning_rate": 2.5448238172517904e-06, + "loss": 0.1644, + "step": 48755 + }, + { + "epoch": 0.8696179502728927, + "grad_norm": 0.31109046936035156, + "learning_rate": 2.5441396626350282e-06, + "loss": 0.08, + "step": 48756 + }, + { + "epoch": 0.8696357863946064, + "grad_norm": 0.36081916093826294, + "learning_rate": 2.543455595064931e-06, + "loss": 0.1417, + "step": 48757 + }, + { + "epoch": 0.8696536225163201, + "grad_norm": 0.1988876909017563, + "learning_rate": 2.5427716145441596e-06, + "loss": 0.1203, + "step": 48758 + }, + { + "epoch": 0.8696714586380337, + "grad_norm": 0.3619775176048279, + "learning_rate": 2.5420877210753593e-06, + "loss": 0.138, + "step": 48759 + }, + { + "epoch": 0.8696892947597474, + "grad_norm": 0.3323259949684143, + "learning_rate": 2.5414039146611862e-06, + "loss": 0.0828, + "step": 48760 + }, + { + "epoch": 0.8697071308814611, + "grad_norm": 0.2197135090827942, + "learning_rate": 2.5407201953042913e-06, + "loss": 0.1348, + "step": 48761 + }, + { + "epoch": 0.8697249670031748, + "grad_norm": 0.2875751852989197, + "learning_rate": 2.5400365630073196e-06, + "loss": 0.1242, + "step": 48762 + }, + { + "epoch": 0.8697428031248885, + "grad_norm": 0.22646726667881012, + "learning_rate": 2.5393530177729217e-06, + "loss": 0.1498, + "step": 48763 + }, + { + "epoch": 0.8697606392466022, + "grad_norm": 0.24403299391269684, + "learning_rate": 2.5386695596037513e-06, + "loss": 0.0876, + "step": 48764 + }, + { + "epoch": 0.8697784753683159, + "grad_norm": 0.2947208881378174, + "learning_rate": 2.5379861885024557e-06, + "loss": 0.1097, + "step": 48765 + }, + { + "epoch": 0.8697963114900296, + "grad_norm": 0.20807866752147675, + "learning_rate": 2.537302904471686e-06, + "loss": 0.0785, + "step": 48766 + }, + { + "epoch": 0.8698141476117432, + "grad_norm": 0.27427777647972107, + "learning_rate": 2.5366197075140846e-06, + "loss": 0.0814, + "step": 48767 + }, + { + "epoch": 0.869831983733457, + "grad_norm": 0.25727513432502747, + "learning_rate": 2.535936597632302e-06, + "loss": 0.0655, + "step": 48768 + }, + { + "epoch": 0.8698498198551707, + "grad_norm": 0.3162531554698944, + "learning_rate": 2.535253574828994e-06, + "loss": 0.1014, + "step": 48769 + }, + { + "epoch": 0.8698676559768844, + "grad_norm": 0.30200427770614624, + "learning_rate": 2.5345706391067953e-06, + "loss": 0.1372, + "step": 48770 + }, + { + "epoch": 0.8698854920985981, + "grad_norm": 0.32227572798728943, + "learning_rate": 2.5338877904683643e-06, + "loss": 0.1346, + "step": 48771 + }, + { + "epoch": 0.8699033282203118, + "grad_norm": 0.1963113695383072, + "learning_rate": 2.533205028916341e-06, + "loss": 0.0868, + "step": 48772 + }, + { + "epoch": 0.8699211643420255, + "grad_norm": 0.3146909475326538, + "learning_rate": 2.532522354453379e-06, + "loss": 0.1085, + "step": 48773 + }, + { + "epoch": 0.8699390004637392, + "grad_norm": 0.373526930809021, + "learning_rate": 2.53183976708212e-06, + "loss": 0.1005, + "step": 48774 + }, + { + "epoch": 0.8699568365854529, + "grad_norm": 0.23058359324932098, + "learning_rate": 2.531157266805209e-06, + "loss": 0.1223, + "step": 48775 + }, + { + "epoch": 0.8699746727071666, + "grad_norm": 0.2550305128097534, + "learning_rate": 2.5304748536252894e-06, + "loss": 0.0701, + "step": 48776 + }, + { + "epoch": 0.8699925088288802, + "grad_norm": 0.26112306118011475, + "learning_rate": 2.5297925275450136e-06, + "loss": 0.1386, + "step": 48777 + }, + { + "epoch": 0.8700103449505939, + "grad_norm": 0.26633623242378235, + "learning_rate": 2.5291102885670243e-06, + "loss": 0.1525, + "step": 48778 + }, + { + "epoch": 0.8700281810723076, + "grad_norm": 0.21439997851848602, + "learning_rate": 2.5284281366939667e-06, + "loss": 0.0486, + "step": 48779 + }, + { + "epoch": 0.8700460171940213, + "grad_norm": 0.24864917993545532, + "learning_rate": 2.52774607192848e-06, + "loss": 0.0895, + "step": 48780 + }, + { + "epoch": 0.870063853315735, + "grad_norm": 0.2315618097782135, + "learning_rate": 2.527064094273207e-06, + "loss": 0.0755, + "step": 48781 + }, + { + "epoch": 0.8700816894374487, + "grad_norm": 0.26426947116851807, + "learning_rate": 2.526382203730801e-06, + "loss": 0.1035, + "step": 48782 + }, + { + "epoch": 0.8700995255591624, + "grad_norm": 0.3090909421443939, + "learning_rate": 2.5257004003038985e-06, + "loss": 0.1401, + "step": 48783 + }, + { + "epoch": 0.8701173616808761, + "grad_norm": 0.27203044295310974, + "learning_rate": 2.5250186839951397e-06, + "loss": 0.116, + "step": 48784 + }, + { + "epoch": 0.8701351978025899, + "grad_norm": 0.2559853494167328, + "learning_rate": 2.524337054807177e-06, + "loss": 0.1235, + "step": 48785 + }, + { + "epoch": 0.8701530339243035, + "grad_norm": 0.42401984333992004, + "learning_rate": 2.5236555127426396e-06, + "loss": 0.1111, + "step": 48786 + }, + { + "epoch": 0.8701708700460172, + "grad_norm": 0.23002992570400238, + "learning_rate": 2.522974057804181e-06, + "loss": 0.0721, + "step": 48787 + }, + { + "epoch": 0.8701887061677309, + "grad_norm": 0.2991747558116913, + "learning_rate": 2.5222926899944404e-06, + "loss": 0.0899, + "step": 48788 + }, + { + "epoch": 0.8702065422894446, + "grad_norm": 0.23942267894744873, + "learning_rate": 2.521611409316052e-06, + "loss": 0.0783, + "step": 48789 + }, + { + "epoch": 0.8702243784111583, + "grad_norm": 0.21381866931915283, + "learning_rate": 2.5209302157716664e-06, + "loss": 0.0755, + "step": 48790 + }, + { + "epoch": 0.870242214532872, + "grad_norm": 0.3925561010837555, + "learning_rate": 2.5202491093639173e-06, + "loss": 0.0893, + "step": 48791 + }, + { + "epoch": 0.8702600506545857, + "grad_norm": 0.33114877343177795, + "learning_rate": 2.5195680900954475e-06, + "loss": 0.1621, + "step": 48792 + }, + { + "epoch": 0.8702778867762994, + "grad_norm": 0.2122526615858078, + "learning_rate": 2.5188871579688994e-06, + "loss": 0.1047, + "step": 48793 + }, + { + "epoch": 0.870295722898013, + "grad_norm": 0.33374249935150146, + "learning_rate": 2.518206312986901e-06, + "loss": 0.1152, + "step": 48794 + }, + { + "epoch": 0.8703135590197267, + "grad_norm": 0.25987598299980164, + "learning_rate": 2.517525555152106e-06, + "loss": 0.1362, + "step": 48795 + }, + { + "epoch": 0.8703313951414404, + "grad_norm": 0.2597131133079529, + "learning_rate": 2.5168448844671454e-06, + "loss": 0.1716, + "step": 48796 + }, + { + "epoch": 0.8703492312631541, + "grad_norm": 0.25804784893989563, + "learning_rate": 2.516164300934656e-06, + "loss": 0.0906, + "step": 48797 + }, + { + "epoch": 0.8703670673848678, + "grad_norm": 0.3604278564453125, + "learning_rate": 2.5154838045572867e-06, + "loss": 0.1364, + "step": 48798 + }, + { + "epoch": 0.8703849035065815, + "grad_norm": 0.36034461855888367, + "learning_rate": 2.5148033953376614e-06, + "loss": 0.129, + "step": 48799 + }, + { + "epoch": 0.8704027396282952, + "grad_norm": 0.29679515957832336, + "learning_rate": 2.5141230732784264e-06, + "loss": 0.07, + "step": 48800 + }, + { + "epoch": 0.870420575750009, + "grad_norm": 0.37985461950302124, + "learning_rate": 2.513442838382221e-06, + "loss": 0.1411, + "step": 48801 + }, + { + "epoch": 0.8704384118717227, + "grad_norm": 0.3420778214931488, + "learning_rate": 2.512762690651671e-06, + "loss": 0.0865, + "step": 48802 + }, + { + "epoch": 0.8704562479934363, + "grad_norm": 0.30549710988998413, + "learning_rate": 2.512082630089424e-06, + "loss": 0.1345, + "step": 48803 + }, + { + "epoch": 0.87047408411515, + "grad_norm": 0.2198539823293686, + "learning_rate": 2.5114026566981115e-06, + "loss": 0.0845, + "step": 48804 + }, + { + "epoch": 0.8704919202368637, + "grad_norm": 0.24978187680244446, + "learning_rate": 2.5107227704803725e-06, + "loss": 0.0964, + "step": 48805 + }, + { + "epoch": 0.8705097563585774, + "grad_norm": 0.23699213564395905, + "learning_rate": 2.510042971438836e-06, + "loss": 0.1328, + "step": 48806 + }, + { + "epoch": 0.8705275924802911, + "grad_norm": 0.298720121383667, + "learning_rate": 2.509363259576139e-06, + "loss": 0.1476, + "step": 48807 + }, + { + "epoch": 0.8705454286020048, + "grad_norm": 0.2284708321094513, + "learning_rate": 2.5086836348949207e-06, + "loss": 0.1065, + "step": 48808 + }, + { + "epoch": 0.8705632647237185, + "grad_norm": 0.3133024275302887, + "learning_rate": 2.508004097397812e-06, + "loss": 0.1589, + "step": 48809 + }, + { + "epoch": 0.8705811008454322, + "grad_norm": 0.27239471673965454, + "learning_rate": 2.50732464708745e-06, + "loss": 0.1036, + "step": 48810 + }, + { + "epoch": 0.8705989369671459, + "grad_norm": 0.23195764422416687, + "learning_rate": 2.5066452839664607e-06, + "loss": 0.0908, + "step": 48811 + }, + { + "epoch": 0.8706167730888595, + "grad_norm": 0.27285176515579224, + "learning_rate": 2.5059660080374886e-06, + "loss": 0.0901, + "step": 48812 + }, + { + "epoch": 0.8706346092105732, + "grad_norm": 0.24582798779010773, + "learning_rate": 2.505286819303157e-06, + "loss": 0.1186, + "step": 48813 + }, + { + "epoch": 0.8706524453322869, + "grad_norm": 0.20475958287715912, + "learning_rate": 2.5046077177661083e-06, + "loss": 0.0596, + "step": 48814 + }, + { + "epoch": 0.8706702814540006, + "grad_norm": 0.2312474399805069, + "learning_rate": 2.503928703428962e-06, + "loss": 0.0861, + "step": 48815 + }, + { + "epoch": 0.8706881175757143, + "grad_norm": 0.28702399134635925, + "learning_rate": 2.503249776294364e-06, + "loss": 0.1168, + "step": 48816 + }, + { + "epoch": 0.870705953697428, + "grad_norm": 0.27082788944244385, + "learning_rate": 2.5025709363649425e-06, + "loss": 0.1066, + "step": 48817 + }, + { + "epoch": 0.8707237898191418, + "grad_norm": 0.3364080488681793, + "learning_rate": 2.501892183643323e-06, + "loss": 0.0984, + "step": 48818 + }, + { + "epoch": 0.8707416259408555, + "grad_norm": 0.3233310878276825, + "learning_rate": 2.5012135181321422e-06, + "loss": 0.1104, + "step": 48819 + }, + { + "epoch": 0.8707594620625692, + "grad_norm": 0.29159247875213623, + "learning_rate": 2.500534939834023e-06, + "loss": 0.1002, + "step": 48820 + }, + { + "epoch": 0.8707772981842828, + "grad_norm": 0.4067905843257904, + "learning_rate": 2.4998564487516058e-06, + "loss": 0.1001, + "step": 48821 + }, + { + "epoch": 0.8707951343059965, + "grad_norm": 0.38580071926116943, + "learning_rate": 2.499178044887518e-06, + "loss": 0.1584, + "step": 48822 + }, + { + "epoch": 0.8708129704277102, + "grad_norm": 0.2752784192562103, + "learning_rate": 2.4984997282443857e-06, + "loss": 0.1186, + "step": 48823 + }, + { + "epoch": 0.8708308065494239, + "grad_norm": 0.2208663374185562, + "learning_rate": 2.497821498824837e-06, + "loss": 0.0913, + "step": 48824 + }, + { + "epoch": 0.8708486426711376, + "grad_norm": 0.3801005184650421, + "learning_rate": 2.4971433566315096e-06, + "loss": 0.1155, + "step": 48825 + }, + { + "epoch": 0.8708664787928513, + "grad_norm": 0.2505093812942505, + "learning_rate": 2.4964653016670198e-06, + "loss": 0.1263, + "step": 48826 + }, + { + "epoch": 0.870884314914565, + "grad_norm": 0.2930073142051697, + "learning_rate": 2.4957873339340076e-06, + "loss": 0.1143, + "step": 48827 + }, + { + "epoch": 0.8709021510362787, + "grad_norm": 0.2723417282104492, + "learning_rate": 2.4951094534350932e-06, + "loss": 0.0798, + "step": 48828 + }, + { + "epoch": 0.8709199871579923, + "grad_norm": 0.2004810869693756, + "learning_rate": 2.4944316601729106e-06, + "loss": 0.0602, + "step": 48829 + }, + { + "epoch": 0.870937823279706, + "grad_norm": 0.233836829662323, + "learning_rate": 2.4937539541500856e-06, + "loss": 0.0626, + "step": 48830 + }, + { + "epoch": 0.8709556594014197, + "grad_norm": 0.2493063360452652, + "learning_rate": 2.4930763353692434e-06, + "loss": 0.066, + "step": 48831 + }, + { + "epoch": 0.8709734955231334, + "grad_norm": 0.24943096935749054, + "learning_rate": 2.49239880383301e-06, + "loss": 0.0573, + "step": 48832 + }, + { + "epoch": 0.8709913316448471, + "grad_norm": 0.2869252562522888, + "learning_rate": 2.491721359544011e-06, + "loss": 0.112, + "step": 48833 + }, + { + "epoch": 0.8710091677665608, + "grad_norm": 0.35033470392227173, + "learning_rate": 2.491044002504875e-06, + "loss": 0.0918, + "step": 48834 + }, + { + "epoch": 0.8710270038882746, + "grad_norm": 0.2965717911720276, + "learning_rate": 2.490366732718227e-06, + "loss": 0.1151, + "step": 48835 + }, + { + "epoch": 0.8710448400099883, + "grad_norm": 0.42473161220550537, + "learning_rate": 2.489689550186694e-06, + "loss": 0.1728, + "step": 48836 + }, + { + "epoch": 0.871062676131702, + "grad_norm": 0.3192669153213501, + "learning_rate": 2.489012454912895e-06, + "loss": 0.0876, + "step": 48837 + }, + { + "epoch": 0.8710805122534157, + "grad_norm": 0.21786446869373322, + "learning_rate": 2.4883354468994617e-06, + "loss": 0.1027, + "step": 48838 + }, + { + "epoch": 0.8710983483751293, + "grad_norm": 0.21870316565036774, + "learning_rate": 2.4876585261490173e-06, + "loss": 0.1483, + "step": 48839 + }, + { + "epoch": 0.871116184496843, + "grad_norm": 0.30879902839660645, + "learning_rate": 2.486981692664178e-06, + "loss": 0.1302, + "step": 48840 + }, + { + "epoch": 0.8711340206185567, + "grad_norm": 0.236866757273674, + "learning_rate": 2.486304946447579e-06, + "loss": 0.125, + "step": 48841 + }, + { + "epoch": 0.8711518567402704, + "grad_norm": 0.2195359319448471, + "learning_rate": 2.4856282875018316e-06, + "loss": 0.1079, + "step": 48842 + }, + { + "epoch": 0.8711696928619841, + "grad_norm": 0.2912219166755676, + "learning_rate": 2.48495171582957e-06, + "loss": 0.0924, + "step": 48843 + }, + { + "epoch": 0.8711875289836978, + "grad_norm": 0.24497102200984955, + "learning_rate": 2.4842752314334138e-06, + "loss": 0.112, + "step": 48844 + }, + { + "epoch": 0.8712053651054115, + "grad_norm": 0.2763284146785736, + "learning_rate": 2.483598834315981e-06, + "loss": 0.0926, + "step": 48845 + }, + { + "epoch": 0.8712232012271252, + "grad_norm": 0.24086697399616241, + "learning_rate": 2.482922524479894e-06, + "loss": 0.1008, + "step": 48846 + }, + { + "epoch": 0.8712410373488388, + "grad_norm": 0.4496169686317444, + "learning_rate": 2.482246301927779e-06, + "loss": 0.1778, + "step": 48847 + }, + { + "epoch": 0.8712588734705525, + "grad_norm": 0.18537591397762299, + "learning_rate": 2.4815701666622558e-06, + "loss": 0.0662, + "step": 48848 + }, + { + "epoch": 0.8712767095922662, + "grad_norm": 0.3441908359527588, + "learning_rate": 2.4808941186859445e-06, + "loss": 0.1191, + "step": 48849 + }, + { + "epoch": 0.8712945457139799, + "grad_norm": 0.2697514295578003, + "learning_rate": 2.48021815800146e-06, + "loss": 0.0942, + "step": 48850 + }, + { + "epoch": 0.8713123818356936, + "grad_norm": 0.2659436762332916, + "learning_rate": 2.479542284611433e-06, + "loss": 0.0829, + "step": 48851 + }, + { + "epoch": 0.8713302179574074, + "grad_norm": 0.2795272171497345, + "learning_rate": 2.4788664985184785e-06, + "loss": 0.0847, + "step": 48852 + }, + { + "epoch": 0.8713480540791211, + "grad_norm": 0.2340693175792694, + "learning_rate": 2.4781907997252135e-06, + "loss": 0.0918, + "step": 48853 + }, + { + "epoch": 0.8713658902008348, + "grad_norm": 0.3255627155303955, + "learning_rate": 2.4775151882342614e-06, + "loss": 0.162, + "step": 48854 + }, + { + "epoch": 0.8713837263225485, + "grad_norm": 0.27811694145202637, + "learning_rate": 2.476839664048236e-06, + "loss": 0.1014, + "step": 48855 + }, + { + "epoch": 0.8714015624442621, + "grad_norm": 0.28932762145996094, + "learning_rate": 2.4761642271697637e-06, + "loss": 0.1376, + "step": 48856 + }, + { + "epoch": 0.8714193985659758, + "grad_norm": 0.3712722063064575, + "learning_rate": 2.475488877601456e-06, + "loss": 0.148, + "step": 48857 + }, + { + "epoch": 0.8714372346876895, + "grad_norm": 0.3367063105106354, + "learning_rate": 2.4748136153459363e-06, + "loss": 0.1513, + "step": 48858 + }, + { + "epoch": 0.8714550708094032, + "grad_norm": 0.29865390062332153, + "learning_rate": 2.4741384404058125e-06, + "loss": 0.1395, + "step": 48859 + }, + { + "epoch": 0.8714729069311169, + "grad_norm": 0.30798327922821045, + "learning_rate": 2.473463352783714e-06, + "loss": 0.1546, + "step": 48860 + }, + { + "epoch": 0.8714907430528306, + "grad_norm": 0.18185406923294067, + "learning_rate": 2.4727883524822527e-06, + "loss": 0.0424, + "step": 48861 + }, + { + "epoch": 0.8715085791745443, + "grad_norm": 0.20749178528785706, + "learning_rate": 2.4721134395040453e-06, + "loss": 0.1407, + "step": 48862 + }, + { + "epoch": 0.871526415296258, + "grad_norm": 0.27528512477874756, + "learning_rate": 2.4714386138516984e-06, + "loss": 0.1137, + "step": 48863 + }, + { + "epoch": 0.8715442514179716, + "grad_norm": 0.37108317017555237, + "learning_rate": 2.470763875527843e-06, + "loss": 0.1868, + "step": 48864 + }, + { + "epoch": 0.8715620875396853, + "grad_norm": 0.35055819153785706, + "learning_rate": 2.4700892245350908e-06, + "loss": 0.1351, + "step": 48865 + }, + { + "epoch": 0.871579923661399, + "grad_norm": 0.24428337812423706, + "learning_rate": 2.4694146608760517e-06, + "loss": 0.1231, + "step": 48866 + }, + { + "epoch": 0.8715977597831127, + "grad_norm": 0.26510217785835266, + "learning_rate": 2.4687401845533394e-06, + "loss": 0.1212, + "step": 48867 + }, + { + "epoch": 0.8716155959048264, + "grad_norm": 0.2758074402809143, + "learning_rate": 2.468065795569574e-06, + "loss": 0.1674, + "step": 48868 + }, + { + "epoch": 0.8716334320265402, + "grad_norm": 0.34988608956336975, + "learning_rate": 2.467391493927368e-06, + "loss": 0.1569, + "step": 48869 + }, + { + "epoch": 0.8716512681482539, + "grad_norm": 0.2557702362537384, + "learning_rate": 2.4667172796293355e-06, + "loss": 0.0627, + "step": 48870 + }, + { + "epoch": 0.8716691042699676, + "grad_norm": 0.18752634525299072, + "learning_rate": 2.466043152678091e-06, + "loss": 0.0924, + "step": 48871 + }, + { + "epoch": 0.8716869403916813, + "grad_norm": 0.3367287814617157, + "learning_rate": 2.4653691130762434e-06, + "loss": 0.1704, + "step": 48872 + }, + { + "epoch": 0.871704776513395, + "grad_norm": 0.22540201246738434, + "learning_rate": 2.4646951608264108e-06, + "loss": 0.0865, + "step": 48873 + }, + { + "epoch": 0.8717226126351086, + "grad_norm": 0.2627692222595215, + "learning_rate": 2.464021295931204e-06, + "loss": 0.1153, + "step": 48874 + }, + { + "epoch": 0.8717404487568223, + "grad_norm": 0.3276059329509735, + "learning_rate": 2.463347518393236e-06, + "loss": 0.1203, + "step": 48875 + }, + { + "epoch": 0.871758284878536, + "grad_norm": 0.3505856394767761, + "learning_rate": 2.462673828215109e-06, + "loss": 0.1014, + "step": 48876 + }, + { + "epoch": 0.8717761210002497, + "grad_norm": 0.2076411098241806, + "learning_rate": 2.4620002253994495e-06, + "loss": 0.0968, + "step": 48877 + }, + { + "epoch": 0.8717939571219634, + "grad_norm": 0.29111114144325256, + "learning_rate": 2.4613267099488606e-06, + "loss": 0.1243, + "step": 48878 + }, + { + "epoch": 0.8718117932436771, + "grad_norm": 0.2613781690597534, + "learning_rate": 2.4606532818659546e-06, + "loss": 0.1579, + "step": 48879 + }, + { + "epoch": 0.8718296293653908, + "grad_norm": 0.24378587305545807, + "learning_rate": 2.4599799411533343e-06, + "loss": 0.103, + "step": 48880 + }, + { + "epoch": 0.8718474654871045, + "grad_norm": 0.22437183558940887, + "learning_rate": 2.4593066878136227e-06, + "loss": 0.1208, + "step": 48881 + }, + { + "epoch": 0.8718653016088181, + "grad_norm": 0.28232720494270325, + "learning_rate": 2.458633521849421e-06, + "loss": 0.1582, + "step": 48882 + }, + { + "epoch": 0.8718831377305318, + "grad_norm": 0.2162172496318817, + "learning_rate": 2.4579604432633436e-06, + "loss": 0.095, + "step": 48883 + }, + { + "epoch": 0.8719009738522455, + "grad_norm": 0.3094240128993988, + "learning_rate": 2.4572874520579993e-06, + "loss": 0.1088, + "step": 48884 + }, + { + "epoch": 0.8719188099739592, + "grad_norm": 0.4338151812553406, + "learning_rate": 2.4566145482359886e-06, + "loss": 0.1051, + "step": 48885 + }, + { + "epoch": 0.871936646095673, + "grad_norm": 0.2873224914073944, + "learning_rate": 2.4559417317999323e-06, + "loss": 0.118, + "step": 48886 + }, + { + "epoch": 0.8719544822173867, + "grad_norm": 0.23062574863433838, + "learning_rate": 2.4552690027524304e-06, + "loss": 0.1386, + "step": 48887 + }, + { + "epoch": 0.8719723183391004, + "grad_norm": 0.2627892792224884, + "learning_rate": 2.4545963610960925e-06, + "loss": 0.0981, + "step": 48888 + }, + { + "epoch": 0.8719901544608141, + "grad_norm": 0.24387618899345398, + "learning_rate": 2.453923806833522e-06, + "loss": 0.0828, + "step": 48889 + }, + { + "epoch": 0.8720079905825278, + "grad_norm": 0.4822535812854767, + "learning_rate": 2.453251339967336e-06, + "loss": 0.0914, + "step": 48890 + }, + { + "epoch": 0.8720258267042414, + "grad_norm": 0.28246212005615234, + "learning_rate": 2.4525789605001348e-06, + "loss": 0.088, + "step": 48891 + }, + { + "epoch": 0.8720436628259551, + "grad_norm": 0.27207309007644653, + "learning_rate": 2.4519066684345256e-06, + "loss": 0.1223, + "step": 48892 + }, + { + "epoch": 0.8720614989476688, + "grad_norm": 0.21045000851154327, + "learning_rate": 2.4512344637731057e-06, + "loss": 0.1302, + "step": 48893 + }, + { + "epoch": 0.8720793350693825, + "grad_norm": 0.2550332844257355, + "learning_rate": 2.450562346518498e-06, + "loss": 0.0927, + "step": 48894 + }, + { + "epoch": 0.8720971711910962, + "grad_norm": 0.27137085795402527, + "learning_rate": 2.4498903166732977e-06, + "loss": 0.0843, + "step": 48895 + }, + { + "epoch": 0.8721150073128099, + "grad_norm": 0.31101417541503906, + "learning_rate": 2.449218374240106e-06, + "loss": 0.1185, + "step": 48896 + }, + { + "epoch": 0.8721328434345236, + "grad_norm": 0.2749179005622864, + "learning_rate": 2.4485465192215366e-06, + "loss": 0.0771, + "step": 48897 + }, + { + "epoch": 0.8721506795562373, + "grad_norm": 0.29282280802726746, + "learning_rate": 2.4478747516201856e-06, + "loss": 0.08, + "step": 48898 + }, + { + "epoch": 0.872168515677951, + "grad_norm": 0.26182353496551514, + "learning_rate": 2.447203071438667e-06, + "loss": 0.0967, + "step": 48899 + }, + { + "epoch": 0.8721863517996646, + "grad_norm": 0.27749302983283997, + "learning_rate": 2.4465314786795786e-06, + "loss": 0.1269, + "step": 48900 + }, + { + "epoch": 0.8722041879213783, + "grad_norm": 0.26597028970718384, + "learning_rate": 2.445859973345524e-06, + "loss": 0.0693, + "step": 48901 + }, + { + "epoch": 0.872222024043092, + "grad_norm": 0.3577239513397217, + "learning_rate": 2.4451885554391014e-06, + "loss": 0.1586, + "step": 48902 + }, + { + "epoch": 0.8722398601648058, + "grad_norm": 0.25598835945129395, + "learning_rate": 2.444517224962922e-06, + "loss": 0.1135, + "step": 48903 + }, + { + "epoch": 0.8722576962865195, + "grad_norm": 0.2673466205596924, + "learning_rate": 2.4438459819195847e-06, + "loss": 0.0779, + "step": 48904 + }, + { + "epoch": 0.8722755324082332, + "grad_norm": 0.3307526707649231, + "learning_rate": 2.443174826311689e-06, + "loss": 0.1623, + "step": 48905 + }, + { + "epoch": 0.8722933685299469, + "grad_norm": 0.2783094048500061, + "learning_rate": 2.442503758141837e-06, + "loss": 0.0935, + "step": 48906 + }, + { + "epoch": 0.8723112046516606, + "grad_norm": 0.31175684928894043, + "learning_rate": 2.4418327774126336e-06, + "loss": 0.1338, + "step": 48907 + }, + { + "epoch": 0.8723290407733743, + "grad_norm": 0.19709528982639313, + "learning_rate": 2.4411618841266804e-06, + "loss": 0.0392, + "step": 48908 + }, + { + "epoch": 0.8723468768950879, + "grad_norm": 0.2973250448703766, + "learning_rate": 2.4404910782865692e-06, + "loss": 0.1405, + "step": 48909 + }, + { + "epoch": 0.8723647130168016, + "grad_norm": 0.27532443404197693, + "learning_rate": 2.439820359894912e-06, + "loss": 0.1161, + "step": 48910 + }, + { + "epoch": 0.8723825491385153, + "grad_norm": 0.281662255525589, + "learning_rate": 2.4391497289542957e-06, + "loss": 0.1078, + "step": 48911 + }, + { + "epoch": 0.872400385260229, + "grad_norm": 0.3302733600139618, + "learning_rate": 2.438479185467335e-06, + "loss": 0.1324, + "step": 48912 + }, + { + "epoch": 0.8724182213819427, + "grad_norm": 0.26644861698150635, + "learning_rate": 2.437808729436622e-06, + "loss": 0.1334, + "step": 48913 + }, + { + "epoch": 0.8724360575036564, + "grad_norm": 0.2155432552099228, + "learning_rate": 2.4371383608647518e-06, + "loss": 0.0912, + "step": 48914 + }, + { + "epoch": 0.8724538936253701, + "grad_norm": 0.357900470495224, + "learning_rate": 2.4364680797543225e-06, + "loss": 0.1436, + "step": 48915 + }, + { + "epoch": 0.8724717297470838, + "grad_norm": 0.24283993244171143, + "learning_rate": 2.4357978861079435e-06, + "loss": 0.0873, + "step": 48916 + }, + { + "epoch": 0.8724895658687974, + "grad_norm": 0.2726002335548401, + "learning_rate": 2.4351277799282035e-06, + "loss": 0.1176, + "step": 48917 + }, + { + "epoch": 0.8725074019905111, + "grad_norm": 0.27351316809654236, + "learning_rate": 2.434457761217701e-06, + "loss": 0.1011, + "step": 48918 + }, + { + "epoch": 0.8725252381122249, + "grad_norm": 0.4146108627319336, + "learning_rate": 2.4337878299790313e-06, + "loss": 0.1302, + "step": 48919 + }, + { + "epoch": 0.8725430742339386, + "grad_norm": 0.376118004322052, + "learning_rate": 2.4331179862147973e-06, + "loss": 0.117, + "step": 48920 + }, + { + "epoch": 0.8725609103556523, + "grad_norm": 0.280676931142807, + "learning_rate": 2.432448229927592e-06, + "loss": 0.0795, + "step": 48921 + }, + { + "epoch": 0.872578746477366, + "grad_norm": 0.25002437829971313, + "learning_rate": 2.431778561120013e-06, + "loss": 0.0938, + "step": 48922 + }, + { + "epoch": 0.8725965825990797, + "grad_norm": 0.2694808840751648, + "learning_rate": 2.4311089797946494e-06, + "loss": 0.1192, + "step": 48923 + }, + { + "epoch": 0.8726144187207934, + "grad_norm": 0.3451583683490753, + "learning_rate": 2.430439485954103e-06, + "loss": 0.101, + "step": 48924 + }, + { + "epoch": 0.8726322548425071, + "grad_norm": 0.2184550166130066, + "learning_rate": 2.4297700796009737e-06, + "loss": 0.0981, + "step": 48925 + }, + { + "epoch": 0.8726500909642207, + "grad_norm": 0.30546683073043823, + "learning_rate": 2.429100760737851e-06, + "loss": 0.1447, + "step": 48926 + }, + { + "epoch": 0.8726679270859344, + "grad_norm": 0.4720088243484497, + "learning_rate": 2.4284315293673284e-06, + "loss": 0.1132, + "step": 48927 + }, + { + "epoch": 0.8726857632076481, + "grad_norm": 0.24959751963615417, + "learning_rate": 2.4277623854919967e-06, + "loss": 0.0965, + "step": 48928 + }, + { + "epoch": 0.8727035993293618, + "grad_norm": 0.30644840002059937, + "learning_rate": 2.4270933291144603e-06, + "loss": 0.1163, + "step": 48929 + }, + { + "epoch": 0.8727214354510755, + "grad_norm": 0.26736292243003845, + "learning_rate": 2.426424360237306e-06, + "loss": 0.1178, + "step": 48930 + }, + { + "epoch": 0.8727392715727892, + "grad_norm": 0.1963820904493332, + "learning_rate": 2.425755478863129e-06, + "loss": 0.1354, + "step": 48931 + }, + { + "epoch": 0.8727571076945029, + "grad_norm": 0.23830799758434296, + "learning_rate": 2.4250866849945127e-06, + "loss": 0.0868, + "step": 48932 + }, + { + "epoch": 0.8727749438162166, + "grad_norm": 0.23102404177188873, + "learning_rate": 2.4244179786340637e-06, + "loss": 0.0935, + "step": 48933 + }, + { + "epoch": 0.8727927799379303, + "grad_norm": 0.2513000965118408, + "learning_rate": 2.4237493597843692e-06, + "loss": 0.1133, + "step": 48934 + }, + { + "epoch": 0.8728106160596439, + "grad_norm": 0.23170988261699677, + "learning_rate": 2.4230808284480182e-06, + "loss": 0.1203, + "step": 48935 + }, + { + "epoch": 0.8728284521813577, + "grad_norm": 0.32318252325057983, + "learning_rate": 2.422412384627598e-06, + "loss": 0.1322, + "step": 48936 + }, + { + "epoch": 0.8728462883030714, + "grad_norm": 0.2838955223560333, + "learning_rate": 2.421744028325712e-06, + "loss": 0.0757, + "step": 48937 + }, + { + "epoch": 0.8728641244247851, + "grad_norm": 0.2863087058067322, + "learning_rate": 2.4210757595449385e-06, + "loss": 0.0912, + "step": 48938 + }, + { + "epoch": 0.8728819605464988, + "grad_norm": 0.27006933093070984, + "learning_rate": 2.420407578287881e-06, + "loss": 0.1109, + "step": 48939 + }, + { + "epoch": 0.8728997966682125, + "grad_norm": 0.2834243178367615, + "learning_rate": 2.4197394845571206e-06, + "loss": 0.1692, + "step": 48940 + }, + { + "epoch": 0.8729176327899262, + "grad_norm": 0.358725905418396, + "learning_rate": 2.4190714783552447e-06, + "loss": 0.101, + "step": 48941 + }, + { + "epoch": 0.8729354689116399, + "grad_norm": 0.21277566254138947, + "learning_rate": 2.418403559684851e-06, + "loss": 0.0625, + "step": 48942 + }, + { + "epoch": 0.8729533050333536, + "grad_norm": 0.25441327691078186, + "learning_rate": 2.417735728548523e-06, + "loss": 0.1167, + "step": 48943 + }, + { + "epoch": 0.8729711411550672, + "grad_norm": 0.3345637023448944, + "learning_rate": 2.4170679849488535e-06, + "loss": 0.1525, + "step": 48944 + }, + { + "epoch": 0.8729889772767809, + "grad_norm": 0.2666950523853302, + "learning_rate": 2.4164003288884243e-06, + "loss": 0.0924, + "step": 48945 + }, + { + "epoch": 0.8730068133984946, + "grad_norm": 0.25407615303993225, + "learning_rate": 2.4157327603698292e-06, + "loss": 0.1002, + "step": 48946 + }, + { + "epoch": 0.8730246495202083, + "grad_norm": 0.3852671980857849, + "learning_rate": 2.415065279395656e-06, + "loss": 0.1719, + "step": 48947 + }, + { + "epoch": 0.873042485641922, + "grad_norm": 0.19352583587169647, + "learning_rate": 2.414397885968492e-06, + "loss": 0.0433, + "step": 48948 + }, + { + "epoch": 0.8730603217636357, + "grad_norm": 0.28823792934417725, + "learning_rate": 2.41373058009092e-06, + "loss": 0.0987, + "step": 48949 + }, + { + "epoch": 0.8730781578853494, + "grad_norm": 0.3840271532535553, + "learning_rate": 2.4130633617655274e-06, + "loss": 0.1327, + "step": 48950 + }, + { + "epoch": 0.8730959940070631, + "grad_norm": 0.22522905468940735, + "learning_rate": 2.4123962309949012e-06, + "loss": 0.1194, + "step": 48951 + }, + { + "epoch": 0.8731138301287767, + "grad_norm": 0.24343860149383545, + "learning_rate": 2.411729187781631e-06, + "loss": 0.0979, + "step": 48952 + }, + { + "epoch": 0.8731316662504905, + "grad_norm": 0.2513010501861572, + "learning_rate": 2.411062232128303e-06, + "loss": 0.1642, + "step": 48953 + }, + { + "epoch": 0.8731495023722042, + "grad_norm": 0.2829985022544861, + "learning_rate": 2.410395364037493e-06, + "loss": 0.1529, + "step": 48954 + }, + { + "epoch": 0.8731673384939179, + "grad_norm": 0.32416999340057373, + "learning_rate": 2.4097285835117998e-06, + "loss": 0.0727, + "step": 48955 + }, + { + "epoch": 0.8731851746156316, + "grad_norm": 0.31245869398117065, + "learning_rate": 2.4090618905537986e-06, + "loss": 0.1162, + "step": 48956 + }, + { + "epoch": 0.8732030107373453, + "grad_norm": 0.2820016145706177, + "learning_rate": 2.408395285166079e-06, + "loss": 0.0776, + "step": 48957 + }, + { + "epoch": 0.873220846859059, + "grad_norm": 0.22606006264686584, + "learning_rate": 2.407728767351217e-06, + "loss": 0.0756, + "step": 48958 + }, + { + "epoch": 0.8732386829807727, + "grad_norm": 0.2523399591445923, + "learning_rate": 2.407062337111804e-06, + "loss": 0.0923, + "step": 48959 + }, + { + "epoch": 0.8732565191024864, + "grad_norm": 0.1685180813074112, + "learning_rate": 2.406395994450422e-06, + "loss": 0.0585, + "step": 48960 + }, + { + "epoch": 0.8732743552242, + "grad_norm": 0.22345389425754547, + "learning_rate": 2.4057297393696525e-06, + "loss": 0.0706, + "step": 48961 + }, + { + "epoch": 0.8732921913459137, + "grad_norm": 0.2905835509300232, + "learning_rate": 2.4050635718720794e-06, + "loss": 0.0636, + "step": 48962 + }, + { + "epoch": 0.8733100274676274, + "grad_norm": 0.33429697155952454, + "learning_rate": 2.404397491960278e-06, + "loss": 0.1072, + "step": 48963 + }, + { + "epoch": 0.8733278635893411, + "grad_norm": 0.32072851061820984, + "learning_rate": 2.403731499636841e-06, + "loss": 0.1541, + "step": 48964 + }, + { + "epoch": 0.8733456997110548, + "grad_norm": 0.245486319065094, + "learning_rate": 2.4030655949043413e-06, + "loss": 0.0792, + "step": 48965 + }, + { + "epoch": 0.8733635358327685, + "grad_norm": 0.2666672170162201, + "learning_rate": 2.4023997777653654e-06, + "loss": 0.0818, + "step": 48966 + }, + { + "epoch": 0.8733813719544822, + "grad_norm": 0.23721128702163696, + "learning_rate": 2.4017340482224928e-06, + "loss": 0.0862, + "step": 48967 + }, + { + "epoch": 0.8733992080761959, + "grad_norm": 0.2651722729206085, + "learning_rate": 2.4010684062783064e-06, + "loss": 0.0805, + "step": 48968 + }, + { + "epoch": 0.8734170441979096, + "grad_norm": 0.2025548219680786, + "learning_rate": 2.400402851935385e-06, + "loss": 0.0994, + "step": 48969 + }, + { + "epoch": 0.8734348803196234, + "grad_norm": 0.2940991222858429, + "learning_rate": 2.399737385196307e-06, + "loss": 0.2086, + "step": 48970 + }, + { + "epoch": 0.873452716441337, + "grad_norm": 0.30726146697998047, + "learning_rate": 2.399072006063649e-06, + "loss": 0.0957, + "step": 48971 + }, + { + "epoch": 0.8734705525630507, + "grad_norm": 0.2609618306159973, + "learning_rate": 2.39840671454e-06, + "loss": 0.0895, + "step": 48972 + }, + { + "epoch": 0.8734883886847644, + "grad_norm": 0.2944090664386749, + "learning_rate": 2.397741510627929e-06, + "loss": 0.1365, + "step": 48973 + }, + { + "epoch": 0.8735062248064781, + "grad_norm": 0.2327173352241516, + "learning_rate": 2.3970763943300223e-06, + "loss": 0.0718, + "step": 48974 + }, + { + "epoch": 0.8735240609281918, + "grad_norm": 0.31209731101989746, + "learning_rate": 2.396411365648854e-06, + "loss": 0.1147, + "step": 48975 + }, + { + "epoch": 0.8735418970499055, + "grad_norm": 0.24604645371437073, + "learning_rate": 2.3957464245869947e-06, + "loss": 0.1014, + "step": 48976 + }, + { + "epoch": 0.8735597331716192, + "grad_norm": 0.24622175097465515, + "learning_rate": 2.395081571147037e-06, + "loss": 0.1158, + "step": 48977 + }, + { + "epoch": 0.8735775692933329, + "grad_norm": 0.2905532121658325, + "learning_rate": 2.3944168053315454e-06, + "loss": 0.0596, + "step": 48978 + }, + { + "epoch": 0.8735954054150465, + "grad_norm": 0.3453649878501892, + "learning_rate": 2.393752127143106e-06, + "loss": 0.085, + "step": 48979 + }, + { + "epoch": 0.8736132415367602, + "grad_norm": 0.25328147411346436, + "learning_rate": 2.3930875365842877e-06, + "loss": 0.1388, + "step": 48980 + }, + { + "epoch": 0.8736310776584739, + "grad_norm": 0.22761796414852142, + "learning_rate": 2.392423033657673e-06, + "loss": 0.1264, + "step": 48981 + }, + { + "epoch": 0.8736489137801876, + "grad_norm": 0.20159262418746948, + "learning_rate": 2.391758618365836e-06, + "loss": 0.0956, + "step": 48982 + }, + { + "epoch": 0.8736667499019013, + "grad_norm": 0.29003745317459106, + "learning_rate": 2.3910942907113515e-06, + "loss": 0.1828, + "step": 48983 + }, + { + "epoch": 0.873684586023615, + "grad_norm": 0.24854856729507446, + "learning_rate": 2.3904300506967904e-06, + "loss": 0.0682, + "step": 48984 + }, + { + "epoch": 0.8737024221453287, + "grad_norm": 0.30418720841407776, + "learning_rate": 2.3897658983247335e-06, + "loss": 0.2264, + "step": 48985 + }, + { + "epoch": 0.8737202582670424, + "grad_norm": 0.27203935384750366, + "learning_rate": 2.3891018335977566e-06, + "loss": 0.1226, + "step": 48986 + }, + { + "epoch": 0.8737380943887562, + "grad_norm": 0.2965022027492523, + "learning_rate": 2.3884378565184273e-06, + "loss": 0.0947, + "step": 48987 + }, + { + "epoch": 0.8737559305104698, + "grad_norm": 0.30365481972694397, + "learning_rate": 2.387773967089324e-06, + "loss": 0.1401, + "step": 48988 + }, + { + "epoch": 0.8737737666321835, + "grad_norm": 0.255367636680603, + "learning_rate": 2.3871101653130167e-06, + "loss": 0.1138, + "step": 48989 + }, + { + "epoch": 0.8737916027538972, + "grad_norm": 0.2791960835456848, + "learning_rate": 2.3864464511920814e-06, + "loss": 0.1193, + "step": 48990 + }, + { + "epoch": 0.8738094388756109, + "grad_norm": 0.25066882371902466, + "learning_rate": 2.385782824729094e-06, + "loss": 0.1053, + "step": 48991 + }, + { + "epoch": 0.8738272749973246, + "grad_norm": 0.2879200279712677, + "learning_rate": 2.3851192859266153e-06, + "loss": 0.0994, + "step": 48992 + }, + { + "epoch": 0.8738451111190383, + "grad_norm": 0.2680374085903168, + "learning_rate": 2.38445583478723e-06, + "loss": 0.1341, + "step": 48993 + }, + { + "epoch": 0.873862947240752, + "grad_norm": 0.26426082849502563, + "learning_rate": 2.3837924713135007e-06, + "loss": 0.1275, + "step": 48994 + }, + { + "epoch": 0.8738807833624657, + "grad_norm": 0.29085299372673035, + "learning_rate": 2.3831291955080104e-06, + "loss": 0.1682, + "step": 48995 + }, + { + "epoch": 0.8738986194841794, + "grad_norm": 0.2757631242275238, + "learning_rate": 2.3824660073733214e-06, + "loss": 0.0881, + "step": 48996 + }, + { + "epoch": 0.873916455605893, + "grad_norm": 0.3565802574157715, + "learning_rate": 2.3818029069120008e-06, + "loss": 0.1095, + "step": 48997 + }, + { + "epoch": 0.8739342917276067, + "grad_norm": 0.4203716516494751, + "learning_rate": 2.3811398941266272e-06, + "loss": 0.1211, + "step": 48998 + }, + { + "epoch": 0.8739521278493204, + "grad_norm": 0.29902541637420654, + "learning_rate": 2.3804769690197707e-06, + "loss": 0.1126, + "step": 48999 + }, + { + "epoch": 0.8739699639710341, + "grad_norm": 0.2832728624343872, + "learning_rate": 2.3798141315939964e-06, + "loss": 0.1263, + "step": 49000 + }, + { + "epoch": 0.8739699639710341, + "eval_loss": 0.10793811827898026, + "eval_runtime": 107.3556, + "eval_samples_per_second": 9.538, + "eval_steps_per_second": 1.593, + "step": 49000 + }, + { + "epoch": 0.8739878000927478, + "grad_norm": 0.34250015020370483, + "learning_rate": 2.3791513818518714e-06, + "loss": 0.1478, + "step": 49001 + }, + { + "epoch": 0.8740056362144615, + "grad_norm": 0.3086620569229126, + "learning_rate": 2.378488719795971e-06, + "loss": 0.0904, + "step": 49002 + }, + { + "epoch": 0.8740234723361752, + "grad_norm": 0.3199694752693176, + "learning_rate": 2.377826145428863e-06, + "loss": 0.1209, + "step": 49003 + }, + { + "epoch": 0.874041308457889, + "grad_norm": 0.32794883847236633, + "learning_rate": 2.3771636587531147e-06, + "loss": 0.1098, + "step": 49004 + }, + { + "epoch": 0.8740591445796027, + "grad_norm": 0.21602557599544525, + "learning_rate": 2.376501259771294e-06, + "loss": 0.1042, + "step": 49005 + }, + { + "epoch": 0.8740769807013163, + "grad_norm": 0.19758708775043488, + "learning_rate": 2.3758389484859644e-06, + "loss": 0.1144, + "step": 49006 + }, + { + "epoch": 0.87409481682303, + "grad_norm": 0.26631733775138855, + "learning_rate": 2.3751767248996972e-06, + "loss": 0.0913, + "step": 49007 + }, + { + "epoch": 0.8741126529447437, + "grad_norm": 0.30449196696281433, + "learning_rate": 2.3745145890150617e-06, + "loss": 0.1081, + "step": 49008 + }, + { + "epoch": 0.8741304890664574, + "grad_norm": 0.2515089809894562, + "learning_rate": 2.373852540834623e-06, + "loss": 0.0965, + "step": 49009 + }, + { + "epoch": 0.8741483251881711, + "grad_norm": 0.2104242742061615, + "learning_rate": 2.373190580360943e-06, + "loss": 0.0941, + "step": 49010 + }, + { + "epoch": 0.8741661613098848, + "grad_norm": 0.29072099924087524, + "learning_rate": 2.372528707596594e-06, + "loss": 0.0954, + "step": 49011 + }, + { + "epoch": 0.8741839974315985, + "grad_norm": 0.2656518220901489, + "learning_rate": 2.3718669225441414e-06, + "loss": 0.1452, + "step": 49012 + }, + { + "epoch": 0.8742018335533122, + "grad_norm": 0.30386754870414734, + "learning_rate": 2.3712052252061467e-06, + "loss": 0.117, + "step": 49013 + }, + { + "epoch": 0.8742196696750258, + "grad_norm": 0.46391886472702026, + "learning_rate": 2.3705436155851748e-06, + "loss": 0.1065, + "step": 49014 + }, + { + "epoch": 0.8742375057967395, + "grad_norm": 0.23648697137832642, + "learning_rate": 2.3698820936837925e-06, + "loss": 0.0706, + "step": 49015 + }, + { + "epoch": 0.8742553419184532, + "grad_norm": 0.25519511103630066, + "learning_rate": 2.3692206595045646e-06, + "loss": 0.0876, + "step": 49016 + }, + { + "epoch": 0.8742731780401669, + "grad_norm": 0.228533074259758, + "learning_rate": 2.3685593130500535e-06, + "loss": 0.123, + "step": 49017 + }, + { + "epoch": 0.8742910141618806, + "grad_norm": 0.19661208987236023, + "learning_rate": 2.367898054322823e-06, + "loss": 0.0872, + "step": 49018 + }, + { + "epoch": 0.8743088502835943, + "grad_norm": 0.2891220450401306, + "learning_rate": 2.3672368833254326e-06, + "loss": 0.1572, + "step": 49019 + }, + { + "epoch": 0.8743266864053081, + "grad_norm": 0.35472527146339417, + "learning_rate": 2.3665758000604555e-06, + "loss": 0.1262, + "step": 49020 + }, + { + "epoch": 0.8743445225270218, + "grad_norm": 0.3111489713191986, + "learning_rate": 2.365914804530442e-06, + "loss": 0.1517, + "step": 49021 + }, + { + "epoch": 0.8743623586487355, + "grad_norm": 0.28271231055259705, + "learning_rate": 2.3652538967379623e-06, + "loss": 0.0666, + "step": 49022 + }, + { + "epoch": 0.8743801947704491, + "grad_norm": 0.34630849957466125, + "learning_rate": 2.3645930766855754e-06, + "loss": 0.1397, + "step": 49023 + }, + { + "epoch": 0.8743980308921628, + "grad_norm": 0.29569879174232483, + "learning_rate": 2.3639323443758467e-06, + "loss": 0.0947, + "step": 49024 + }, + { + "epoch": 0.8744158670138765, + "grad_norm": 0.24688155949115753, + "learning_rate": 2.363271699811334e-06, + "loss": 0.1362, + "step": 49025 + }, + { + "epoch": 0.8744337031355902, + "grad_norm": 0.26550984382629395, + "learning_rate": 2.3626111429946e-06, + "loss": 0.0869, + "step": 49026 + }, + { + "epoch": 0.8744515392573039, + "grad_norm": 0.329212486743927, + "learning_rate": 2.3619506739281983e-06, + "loss": 0.1571, + "step": 49027 + }, + { + "epoch": 0.8744693753790176, + "grad_norm": 0.29569125175476074, + "learning_rate": 2.3612902926146987e-06, + "loss": 0.1208, + "step": 49028 + }, + { + "epoch": 0.8744872115007313, + "grad_norm": 0.2973051965236664, + "learning_rate": 2.360629999056657e-06, + "loss": 0.1681, + "step": 49029 + }, + { + "epoch": 0.874505047622445, + "grad_norm": 0.3216267228126526, + "learning_rate": 2.3599697932566335e-06, + "loss": 0.1139, + "step": 49030 + }, + { + "epoch": 0.8745228837441587, + "grad_norm": 0.2688002586364746, + "learning_rate": 2.359309675217189e-06, + "loss": 0.0648, + "step": 49031 + }, + { + "epoch": 0.8745407198658723, + "grad_norm": 0.25336775183677673, + "learning_rate": 2.3586496449408718e-06, + "loss": 0.1014, + "step": 49032 + }, + { + "epoch": 0.874558555987586, + "grad_norm": 0.23914194107055664, + "learning_rate": 2.357989702430255e-06, + "loss": 0.0985, + "step": 49033 + }, + { + "epoch": 0.8745763921092997, + "grad_norm": 0.2207818478345871, + "learning_rate": 2.3573298476878863e-06, + "loss": 0.0791, + "step": 49034 + }, + { + "epoch": 0.8745942282310134, + "grad_norm": 0.23462559282779694, + "learning_rate": 2.3566700807163304e-06, + "loss": 0.0742, + "step": 49035 + }, + { + "epoch": 0.8746120643527271, + "grad_norm": 0.3224104940891266, + "learning_rate": 2.3560104015181384e-06, + "loss": 0.113, + "step": 49036 + }, + { + "epoch": 0.8746299004744409, + "grad_norm": 0.2706161439418793, + "learning_rate": 2.355350810095877e-06, + "loss": 0.107, + "step": 49037 + }, + { + "epoch": 0.8746477365961546, + "grad_norm": 0.29104551672935486, + "learning_rate": 2.3546913064520946e-06, + "loss": 0.1048, + "step": 49038 + }, + { + "epoch": 0.8746655727178683, + "grad_norm": 0.30357736349105835, + "learning_rate": 2.3540318905893533e-06, + "loss": 0.1239, + "step": 49039 + }, + { + "epoch": 0.874683408839582, + "grad_norm": 0.20416851341724396, + "learning_rate": 2.3533725625101976e-06, + "loss": 0.0879, + "step": 49040 + }, + { + "epoch": 0.8747012449612956, + "grad_norm": 0.21284979581832886, + "learning_rate": 2.352713322217201e-06, + "loss": 0.0856, + "step": 49041 + }, + { + "epoch": 0.8747190810830093, + "grad_norm": 0.26216670870780945, + "learning_rate": 2.3520541697129057e-06, + "loss": 0.0979, + "step": 49042 + }, + { + "epoch": 0.874736917204723, + "grad_norm": 0.27146539092063904, + "learning_rate": 2.3513951049998733e-06, + "loss": 0.0885, + "step": 49043 + }, + { + "epoch": 0.8747547533264367, + "grad_norm": 0.35666701197624207, + "learning_rate": 2.350736128080655e-06, + "loss": 0.1283, + "step": 49044 + }, + { + "epoch": 0.8747725894481504, + "grad_norm": 0.31937843561172485, + "learning_rate": 2.350077238957801e-06, + "loss": 0.1195, + "step": 49045 + }, + { + "epoch": 0.8747904255698641, + "grad_norm": 0.3077686131000519, + "learning_rate": 2.349418437633874e-06, + "loss": 0.1224, + "step": 49046 + }, + { + "epoch": 0.8748082616915778, + "grad_norm": 0.32812246680259705, + "learning_rate": 2.3487597241114266e-06, + "loss": 0.0883, + "step": 49047 + }, + { + "epoch": 0.8748260978132915, + "grad_norm": 0.24321341514587402, + "learning_rate": 2.3481010983930046e-06, + "loss": 0.1547, + "step": 49048 + }, + { + "epoch": 0.8748439339350051, + "grad_norm": 0.3040071725845337, + "learning_rate": 2.3474425604811723e-06, + "loss": 0.088, + "step": 49049 + }, + { + "epoch": 0.8748617700567188, + "grad_norm": 0.21051782369613647, + "learning_rate": 2.346784110378472e-06, + "loss": 0.0804, + "step": 49050 + }, + { + "epoch": 0.8748796061784325, + "grad_norm": 0.3067069947719574, + "learning_rate": 2.346125748087463e-06, + "loss": 0.1624, + "step": 49051 + }, + { + "epoch": 0.8748974423001462, + "grad_norm": 0.3249727785587311, + "learning_rate": 2.3454674736106962e-06, + "loss": 0.0839, + "step": 49052 + }, + { + "epoch": 0.8749152784218599, + "grad_norm": 0.2631492614746094, + "learning_rate": 2.3448092869507195e-06, + "loss": 0.0942, + "step": 49053 + }, + { + "epoch": 0.8749331145435737, + "grad_norm": 0.20457443594932556, + "learning_rate": 2.3441511881100888e-06, + "loss": 0.1043, + "step": 49054 + }, + { + "epoch": 0.8749509506652874, + "grad_norm": 0.2152179330587387, + "learning_rate": 2.343493177091355e-06, + "loss": 0.1316, + "step": 49055 + }, + { + "epoch": 0.8749687867870011, + "grad_norm": 0.2957698106765747, + "learning_rate": 2.342835253897063e-06, + "loss": 0.1275, + "step": 49056 + }, + { + "epoch": 0.8749866229087148, + "grad_norm": 0.2898455858230591, + "learning_rate": 2.3421774185297728e-06, + "loss": 0.1532, + "step": 49057 + }, + { + "epoch": 0.8750044590304284, + "grad_norm": 0.25608012080192566, + "learning_rate": 2.3415196709920208e-06, + "loss": 0.1139, + "step": 49058 + }, + { + "epoch": 0.8750222951521421, + "grad_norm": 0.43150776624679565, + "learning_rate": 2.340862011286368e-06, + "loss": 0.1279, + "step": 49059 + }, + { + "epoch": 0.8750401312738558, + "grad_norm": 0.188703715801239, + "learning_rate": 2.340204439415364e-06, + "loss": 0.0693, + "step": 49060 + }, + { + "epoch": 0.8750579673955695, + "grad_norm": 0.22968249022960663, + "learning_rate": 2.339546955381547e-06, + "loss": 0.0775, + "step": 49061 + }, + { + "epoch": 0.8750758035172832, + "grad_norm": 0.36129093170166016, + "learning_rate": 2.3388895591874764e-06, + "loss": 0.0828, + "step": 49062 + }, + { + "epoch": 0.8750936396389969, + "grad_norm": 0.27361932396888733, + "learning_rate": 2.3382322508356922e-06, + "loss": 0.1575, + "step": 49063 + }, + { + "epoch": 0.8751114757607106, + "grad_norm": 0.2412867695093155, + "learning_rate": 2.3375750303287535e-06, + "loss": 0.0973, + "step": 49064 + }, + { + "epoch": 0.8751293118824243, + "grad_norm": 0.38039711117744446, + "learning_rate": 2.3369178976692022e-06, + "loss": 0.121, + "step": 49065 + }, + { + "epoch": 0.875147148004138, + "grad_norm": 0.3710176348686218, + "learning_rate": 2.3362608528595786e-06, + "loss": 0.1401, + "step": 49066 + }, + { + "epoch": 0.8751649841258516, + "grad_norm": 0.25758734345436096, + "learning_rate": 2.3356038959024386e-06, + "loss": 0.0936, + "step": 49067 + }, + { + "epoch": 0.8751828202475653, + "grad_norm": 0.23830674588680267, + "learning_rate": 2.334947026800327e-06, + "loss": 0.088, + "step": 49068 + }, + { + "epoch": 0.875200656369279, + "grad_norm": 0.3027196228504181, + "learning_rate": 2.3342902455557895e-06, + "loss": 0.1726, + "step": 49069 + }, + { + "epoch": 0.8752184924909927, + "grad_norm": 0.26158955693244934, + "learning_rate": 2.3336335521713714e-06, + "loss": 0.1129, + "step": 49070 + }, + { + "epoch": 0.8752363286127065, + "grad_norm": 0.2567870020866394, + "learning_rate": 2.332976946649615e-06, + "loss": 0.1151, + "step": 49071 + }, + { + "epoch": 0.8752541647344202, + "grad_norm": 0.2848890721797943, + "learning_rate": 2.3323204289930734e-06, + "loss": 0.1008, + "step": 49072 + }, + { + "epoch": 0.8752720008561339, + "grad_norm": 0.36752718687057495, + "learning_rate": 2.3316639992042836e-06, + "loss": 0.1255, + "step": 49073 + }, + { + "epoch": 0.8752898369778476, + "grad_norm": 0.23002707958221436, + "learning_rate": 2.331007657285797e-06, + "loss": 0.0957, + "step": 49074 + }, + { + "epoch": 0.8753076730995613, + "grad_norm": 0.32130834460258484, + "learning_rate": 2.3303514032401497e-06, + "loss": 0.1199, + "step": 49075 + }, + { + "epoch": 0.8753255092212749, + "grad_norm": 0.32750561833381653, + "learning_rate": 2.329695237069893e-06, + "loss": 0.1342, + "step": 49076 + }, + { + "epoch": 0.8753433453429886, + "grad_norm": 0.21657831966876984, + "learning_rate": 2.3290391587775627e-06, + "loss": 0.1037, + "step": 49077 + }, + { + "epoch": 0.8753611814647023, + "grad_norm": 0.19653582572937012, + "learning_rate": 2.3283831683657136e-06, + "loss": 0.0896, + "step": 49078 + }, + { + "epoch": 0.875379017586416, + "grad_norm": 0.4856494665145874, + "learning_rate": 2.3277272658368765e-06, + "loss": 0.1917, + "step": 49079 + }, + { + "epoch": 0.8753968537081297, + "grad_norm": 0.2232699692249298, + "learning_rate": 2.327071451193602e-06, + "loss": 0.0754, + "step": 49080 + }, + { + "epoch": 0.8754146898298434, + "grad_norm": 0.2399214655160904, + "learning_rate": 2.326415724438433e-06, + "loss": 0.0771, + "step": 49081 + }, + { + "epoch": 0.8754325259515571, + "grad_norm": 0.31210198998451233, + "learning_rate": 2.3257600855739052e-06, + "loss": 0.1291, + "step": 49082 + }, + { + "epoch": 0.8754503620732708, + "grad_norm": 0.3355759382247925, + "learning_rate": 2.3251045346025624e-06, + "loss": 0.1158, + "step": 49083 + }, + { + "epoch": 0.8754681981949844, + "grad_norm": 0.18690429627895355, + "learning_rate": 2.3244490715269434e-06, + "loss": 0.0689, + "step": 49084 + }, + { + "epoch": 0.8754860343166981, + "grad_norm": 0.3659724295139313, + "learning_rate": 2.3237936963495964e-06, + "loss": 0.0868, + "step": 49085 + }, + { + "epoch": 0.8755038704384118, + "grad_norm": 0.2935331165790558, + "learning_rate": 2.323138409073056e-06, + "loss": 0.141, + "step": 49086 + }, + { + "epoch": 0.8755217065601255, + "grad_norm": 0.27686309814453125, + "learning_rate": 2.3224832096998633e-06, + "loss": 0.1436, + "step": 49087 + }, + { + "epoch": 0.8755395426818393, + "grad_norm": 0.19992637634277344, + "learning_rate": 2.3218280982325533e-06, + "loss": 0.0871, + "step": 49088 + }, + { + "epoch": 0.875557378803553, + "grad_norm": 0.2729877233505249, + "learning_rate": 2.321173074673677e-06, + "loss": 0.1022, + "step": 49089 + }, + { + "epoch": 0.8755752149252667, + "grad_norm": 0.5481345653533936, + "learning_rate": 2.32051813902576e-06, + "loss": 0.0951, + "step": 49090 + }, + { + "epoch": 0.8755930510469804, + "grad_norm": 0.28571000695228577, + "learning_rate": 2.3198632912913547e-06, + "loss": 0.1532, + "step": 49091 + }, + { + "epoch": 0.8756108871686941, + "grad_norm": 0.23745673894882202, + "learning_rate": 2.3192085314729856e-06, + "loss": 0.1044, + "step": 49092 + }, + { + "epoch": 0.8756287232904078, + "grad_norm": 0.3053630292415619, + "learning_rate": 2.3185538595732056e-06, + "loss": 0.1161, + "step": 49093 + }, + { + "epoch": 0.8756465594121214, + "grad_norm": 0.32495880126953125, + "learning_rate": 2.317899275594543e-06, + "loss": 0.1077, + "step": 49094 + }, + { + "epoch": 0.8756643955338351, + "grad_norm": 0.20112845301628113, + "learning_rate": 2.3172447795395345e-06, + "loss": 0.1073, + "step": 49095 + }, + { + "epoch": 0.8756822316555488, + "grad_norm": 0.27047351002693176, + "learning_rate": 2.3165903714107236e-06, + "loss": 0.1456, + "step": 49096 + }, + { + "epoch": 0.8757000677772625, + "grad_norm": 0.3040211796760559, + "learning_rate": 2.3159360512106378e-06, + "loss": 0.0785, + "step": 49097 + }, + { + "epoch": 0.8757179038989762, + "grad_norm": 0.21334435045719147, + "learning_rate": 2.3152818189418223e-06, + "loss": 0.0979, + "step": 49098 + }, + { + "epoch": 0.8757357400206899, + "grad_norm": 0.29952272772789, + "learning_rate": 2.314627674606809e-06, + "loss": 0.0598, + "step": 49099 + }, + { + "epoch": 0.8757535761424036, + "grad_norm": 0.29360049962997437, + "learning_rate": 2.3139736182081346e-06, + "loss": 0.141, + "step": 49100 + }, + { + "epoch": 0.8757714122641173, + "grad_norm": 0.2605254352092743, + "learning_rate": 2.31331964974833e-06, + "loss": 0.0846, + "step": 49101 + }, + { + "epoch": 0.8757892483858309, + "grad_norm": 0.28678998351097107, + "learning_rate": 2.3126657692299382e-06, + "loss": 0.1371, + "step": 49102 + }, + { + "epoch": 0.8758070845075446, + "grad_norm": 0.2714792788028717, + "learning_rate": 2.31201197665549e-06, + "loss": 0.0675, + "step": 49103 + }, + { + "epoch": 0.8758249206292583, + "grad_norm": 0.33933860063552856, + "learning_rate": 2.311358272027514e-06, + "loss": 0.1232, + "step": 49104 + }, + { + "epoch": 0.8758427567509721, + "grad_norm": 0.2021796852350235, + "learning_rate": 2.31070465534855e-06, + "loss": 0.0953, + "step": 49105 + }, + { + "epoch": 0.8758605928726858, + "grad_norm": 0.2256878912448883, + "learning_rate": 2.310051126621138e-06, + "loss": 0.1043, + "step": 49106 + }, + { + "epoch": 0.8758784289943995, + "grad_norm": 0.25990548729896545, + "learning_rate": 2.3093976858478027e-06, + "loss": 0.0932, + "step": 49107 + }, + { + "epoch": 0.8758962651161132, + "grad_norm": 0.26381561160087585, + "learning_rate": 2.308744333031079e-06, + "loss": 0.1041, + "step": 49108 + }, + { + "epoch": 0.8759141012378269, + "grad_norm": 0.2565864622592926, + "learning_rate": 2.3080910681735013e-06, + "loss": 0.1022, + "step": 49109 + }, + { + "epoch": 0.8759319373595406, + "grad_norm": 0.27001383900642395, + "learning_rate": 2.3074378912775948e-06, + "loss": 0.1301, + "step": 49110 + }, + { + "epoch": 0.8759497734812542, + "grad_norm": 0.2518083453178406, + "learning_rate": 2.306784802345899e-06, + "loss": 0.1182, + "step": 49111 + }, + { + "epoch": 0.8759676096029679, + "grad_norm": 0.29305702447891235, + "learning_rate": 2.3061318013809453e-06, + "loss": 0.0941, + "step": 49112 + }, + { + "epoch": 0.8759854457246816, + "grad_norm": 0.29134494066238403, + "learning_rate": 2.305478888385265e-06, + "loss": 0.0533, + "step": 49113 + }, + { + "epoch": 0.8760032818463953, + "grad_norm": 0.3118290305137634, + "learning_rate": 2.3048260633613787e-06, + "loss": 0.1092, + "step": 49114 + }, + { + "epoch": 0.876021117968109, + "grad_norm": 0.25105753540992737, + "learning_rate": 2.3041733263118313e-06, + "loss": 0.0994, + "step": 49115 + }, + { + "epoch": 0.8760389540898227, + "grad_norm": 0.23561029136180878, + "learning_rate": 2.3035206772391487e-06, + "loss": 0.0962, + "step": 49116 + }, + { + "epoch": 0.8760567902115364, + "grad_norm": 0.2948676645755768, + "learning_rate": 2.302868116145851e-06, + "loss": 0.1096, + "step": 49117 + }, + { + "epoch": 0.8760746263332501, + "grad_norm": 0.3280108571052551, + "learning_rate": 2.3022156430344834e-06, + "loss": 0.1247, + "step": 49118 + }, + { + "epoch": 0.8760924624549637, + "grad_norm": 0.2578265070915222, + "learning_rate": 2.301563257907563e-06, + "loss": 0.1007, + "step": 49119 + }, + { + "epoch": 0.8761102985766774, + "grad_norm": 0.25525856018066406, + "learning_rate": 2.3009109607676243e-06, + "loss": 0.1064, + "step": 49120 + }, + { + "epoch": 0.8761281346983912, + "grad_norm": 0.3216416835784912, + "learning_rate": 2.300258751617199e-06, + "loss": 0.1401, + "step": 49121 + }, + { + "epoch": 0.8761459708201049, + "grad_norm": 0.2494664341211319, + "learning_rate": 2.2996066304588086e-06, + "loss": 0.1033, + "step": 49122 + }, + { + "epoch": 0.8761638069418186, + "grad_norm": 0.31407660245895386, + "learning_rate": 2.2989545972949804e-06, + "loss": 0.1143, + "step": 49123 + }, + { + "epoch": 0.8761816430635323, + "grad_norm": 0.2560386657714844, + "learning_rate": 2.298302652128248e-06, + "loss": 0.1326, + "step": 49124 + }, + { + "epoch": 0.876199479185246, + "grad_norm": 0.6525815725326538, + "learning_rate": 2.2976507949611343e-06, + "loss": 0.1205, + "step": 49125 + }, + { + "epoch": 0.8762173153069597, + "grad_norm": 0.3841899037361145, + "learning_rate": 2.2969990257961703e-06, + "loss": 0.1238, + "step": 49126 + }, + { + "epoch": 0.8762351514286734, + "grad_norm": 0.20940366387367249, + "learning_rate": 2.296347344635871e-06, + "loss": 0.112, + "step": 49127 + }, + { + "epoch": 0.876252987550387, + "grad_norm": 0.29055291414260864, + "learning_rate": 2.2956957514827794e-06, + "loss": 0.1231, + "step": 49128 + }, + { + "epoch": 0.8762708236721007, + "grad_norm": 0.29986679553985596, + "learning_rate": 2.295044246339412e-06, + "loss": 0.0965, + "step": 49129 + }, + { + "epoch": 0.8762886597938144, + "grad_norm": 0.3281868100166321, + "learning_rate": 2.2943928292082944e-06, + "loss": 0.1549, + "step": 49130 + }, + { + "epoch": 0.8763064959155281, + "grad_norm": 0.3696381449699402, + "learning_rate": 2.2937415000919475e-06, + "loss": 0.135, + "step": 49131 + }, + { + "epoch": 0.8763243320372418, + "grad_norm": 0.2898821234703064, + "learning_rate": 2.2930902589928998e-06, + "loss": 0.0965, + "step": 49132 + }, + { + "epoch": 0.8763421681589555, + "grad_norm": 0.3090604245662689, + "learning_rate": 2.2924391059136853e-06, + "loss": 0.1015, + "step": 49133 + }, + { + "epoch": 0.8763600042806692, + "grad_norm": 0.2524409592151642, + "learning_rate": 2.2917880408568183e-06, + "loss": 0.1463, + "step": 49134 + }, + { + "epoch": 0.8763778404023829, + "grad_norm": 0.22474107146263123, + "learning_rate": 2.291137063824822e-06, + "loss": 0.1468, + "step": 49135 + }, + { + "epoch": 0.8763956765240966, + "grad_norm": 0.28334102034568787, + "learning_rate": 2.2904861748202192e-06, + "loss": 0.1511, + "step": 49136 + }, + { + "epoch": 0.8764135126458102, + "grad_norm": 0.24980977177619934, + "learning_rate": 2.289835373845539e-06, + "loss": 0.1377, + "step": 49137 + }, + { + "epoch": 0.876431348767524, + "grad_norm": 0.3001442551612854, + "learning_rate": 2.2891846609033012e-06, + "loss": 0.1466, + "step": 49138 + }, + { + "epoch": 0.8764491848892377, + "grad_norm": 0.28811752796173096, + "learning_rate": 2.2885340359960287e-06, + "loss": 0.0944, + "step": 49139 + }, + { + "epoch": 0.8764670210109514, + "grad_norm": 0.25675734877586365, + "learning_rate": 2.2878834991262364e-06, + "loss": 0.0984, + "step": 49140 + }, + { + "epoch": 0.8764848571326651, + "grad_norm": 0.244055837392807, + "learning_rate": 2.2872330502964583e-06, + "loss": 0.1363, + "step": 49141 + }, + { + "epoch": 0.8765026932543788, + "grad_norm": 0.1616104692220688, + "learning_rate": 2.286582689509209e-06, + "loss": 0.0979, + "step": 49142 + }, + { + "epoch": 0.8765205293760925, + "grad_norm": 0.2777489125728607, + "learning_rate": 2.285932416767009e-06, + "loss": 0.082, + "step": 49143 + }, + { + "epoch": 0.8765383654978062, + "grad_norm": 0.32997822761535645, + "learning_rate": 2.2852822320723776e-06, + "loss": 0.1054, + "step": 49144 + }, + { + "epoch": 0.8765562016195199, + "grad_norm": 0.30911630392074585, + "learning_rate": 2.2846321354278417e-06, + "loss": 0.1258, + "step": 49145 + }, + { + "epoch": 0.8765740377412335, + "grad_norm": 0.26052671670913696, + "learning_rate": 2.2839821268359125e-06, + "loss": 0.1143, + "step": 49146 + }, + { + "epoch": 0.8765918738629472, + "grad_norm": 0.32524868845939636, + "learning_rate": 2.2833322062991186e-06, + "loss": 0.1055, + "step": 49147 + }, + { + "epoch": 0.8766097099846609, + "grad_norm": 0.23927100002765656, + "learning_rate": 2.282682373819975e-06, + "loss": 0.1017, + "step": 49148 + }, + { + "epoch": 0.8766275461063746, + "grad_norm": 0.3291065990924835, + "learning_rate": 2.282032629400996e-06, + "loss": 0.1465, + "step": 49149 + }, + { + "epoch": 0.8766453822280883, + "grad_norm": 0.3337452709674835, + "learning_rate": 2.28138297304471e-06, + "loss": 0.1317, + "step": 49150 + }, + { + "epoch": 0.876663218349802, + "grad_norm": 0.27962666749954224, + "learning_rate": 2.280733404753632e-06, + "loss": 0.1315, + "step": 49151 + }, + { + "epoch": 0.8766810544715157, + "grad_norm": 0.29704219102859497, + "learning_rate": 2.280083924530277e-06, + "loss": 0.1401, + "step": 49152 + }, + { + "epoch": 0.8766988905932294, + "grad_norm": 0.25297796726226807, + "learning_rate": 2.279434532377159e-06, + "loss": 0.0742, + "step": 49153 + }, + { + "epoch": 0.876716726714943, + "grad_norm": 0.18221089243888855, + "learning_rate": 2.2787852282968036e-06, + "loss": 0.0629, + "step": 49154 + }, + { + "epoch": 0.8767345628366568, + "grad_norm": 0.2791878581047058, + "learning_rate": 2.2781360122917262e-06, + "loss": 0.0758, + "step": 49155 + }, + { + "epoch": 0.8767523989583705, + "grad_norm": 0.24226456880569458, + "learning_rate": 2.277486884364441e-06, + "loss": 0.1054, + "step": 49156 + }, + { + "epoch": 0.8767702350800842, + "grad_norm": 0.3039802312850952, + "learning_rate": 2.2768378445174596e-06, + "loss": 0.0965, + "step": 49157 + }, + { + "epoch": 0.8767880712017979, + "grad_norm": 0.26751869916915894, + "learning_rate": 2.2761888927533086e-06, + "loss": 0.1124, + "step": 49158 + }, + { + "epoch": 0.8768059073235116, + "grad_norm": 0.29262450337409973, + "learning_rate": 2.2755400290744964e-06, + "loss": 0.1309, + "step": 49159 + }, + { + "epoch": 0.8768237434452253, + "grad_norm": 0.2677420973777771, + "learning_rate": 2.274891253483538e-06, + "loss": 0.1008, + "step": 49160 + }, + { + "epoch": 0.876841579566939, + "grad_norm": 0.2991850674152374, + "learning_rate": 2.2742425659829537e-06, + "loss": 0.1178, + "step": 49161 + }, + { + "epoch": 0.8768594156886527, + "grad_norm": 0.2580156922340393, + "learning_rate": 2.2735939665752494e-06, + "loss": 0.1514, + "step": 49162 + }, + { + "epoch": 0.8768772518103664, + "grad_norm": 0.21000432968139648, + "learning_rate": 2.272945455262948e-06, + "loss": 0.0647, + "step": 49163 + }, + { + "epoch": 0.87689508793208, + "grad_norm": 0.24749843776226044, + "learning_rate": 2.272297032048559e-06, + "loss": 0.0923, + "step": 49164 + }, + { + "epoch": 0.8769129240537937, + "grad_norm": 0.2307685762643814, + "learning_rate": 2.2716486969346e-06, + "loss": 0.1076, + "step": 49165 + }, + { + "epoch": 0.8769307601755074, + "grad_norm": 0.2505471408367157, + "learning_rate": 2.2710004499235744e-06, + "loss": 0.1082, + "step": 49166 + }, + { + "epoch": 0.8769485962972211, + "grad_norm": 0.24869133532047272, + "learning_rate": 2.2703522910180047e-06, + "loss": 0.0755, + "step": 49167 + }, + { + "epoch": 0.8769664324189348, + "grad_norm": 0.29524222016334534, + "learning_rate": 2.2697042202204006e-06, + "loss": 0.094, + "step": 49168 + }, + { + "epoch": 0.8769842685406485, + "grad_norm": 0.3042192757129669, + "learning_rate": 2.2690562375332737e-06, + "loss": 0.1433, + "step": 49169 + }, + { + "epoch": 0.8770021046623622, + "grad_norm": 0.2601214349269867, + "learning_rate": 2.268408342959133e-06, + "loss": 0.0948, + "step": 49170 + }, + { + "epoch": 0.8770199407840759, + "grad_norm": 0.2645348310470581, + "learning_rate": 2.2677605365004962e-06, + "loss": 0.0531, + "step": 49171 + }, + { + "epoch": 0.8770377769057897, + "grad_norm": 0.33378586173057556, + "learning_rate": 2.2671128181598724e-06, + "loss": 0.1244, + "step": 49172 + }, + { + "epoch": 0.8770556130275033, + "grad_norm": 0.2772667706012726, + "learning_rate": 2.2664651879397647e-06, + "loss": 0.0984, + "step": 49173 + }, + { + "epoch": 0.877073449149217, + "grad_norm": 0.29864782094955444, + "learning_rate": 2.265817645842694e-06, + "loss": 0.0958, + "step": 49174 + }, + { + "epoch": 0.8770912852709307, + "grad_norm": 0.3517008125782013, + "learning_rate": 2.265170191871163e-06, + "loss": 0.0628, + "step": 49175 + }, + { + "epoch": 0.8771091213926444, + "grad_norm": 0.25018373131752014, + "learning_rate": 2.2645228260276875e-06, + "loss": 0.0983, + "step": 49176 + }, + { + "epoch": 0.8771269575143581, + "grad_norm": 0.2971194386482239, + "learning_rate": 2.2638755483147755e-06, + "loss": 0.1277, + "step": 49177 + }, + { + "epoch": 0.8771447936360718, + "grad_norm": 0.318308562040329, + "learning_rate": 2.263228358734934e-06, + "loss": 0.1172, + "step": 49178 + }, + { + "epoch": 0.8771626297577855, + "grad_norm": 0.21772401034832, + "learning_rate": 2.262581257290669e-06, + "loss": 0.0828, + "step": 49179 + }, + { + "epoch": 0.8771804658794992, + "grad_norm": 0.2133631557226181, + "learning_rate": 2.261934243984498e-06, + "loss": 0.0827, + "step": 49180 + }, + { + "epoch": 0.8771983020012128, + "grad_norm": 0.3687916100025177, + "learning_rate": 2.2612873188189193e-06, + "loss": 0.0749, + "step": 49181 + }, + { + "epoch": 0.8772161381229265, + "grad_norm": 0.30929720401763916, + "learning_rate": 2.260640481796447e-06, + "loss": 0.0985, + "step": 49182 + }, + { + "epoch": 0.8772339742446402, + "grad_norm": 0.2361096292734146, + "learning_rate": 2.2599937329195827e-06, + "loss": 0.077, + "step": 49183 + }, + { + "epoch": 0.8772518103663539, + "grad_norm": 0.2850714921951294, + "learning_rate": 2.25934707219084e-06, + "loss": 0.1102, + "step": 49184 + }, + { + "epoch": 0.8772696464880676, + "grad_norm": 0.26227477192878723, + "learning_rate": 2.258700499612723e-06, + "loss": 0.0633, + "step": 49185 + }, + { + "epoch": 0.8772874826097813, + "grad_norm": 0.3133750557899475, + "learning_rate": 2.2580540151877383e-06, + "loss": 0.1128, + "step": 49186 + }, + { + "epoch": 0.877305318731495, + "grad_norm": 0.24264498054981232, + "learning_rate": 2.2574076189183833e-06, + "loss": 0.0844, + "step": 49187 + }, + { + "epoch": 0.8773231548532087, + "grad_norm": 0.24139946699142456, + "learning_rate": 2.2567613108071735e-06, + "loss": 0.0923, + "step": 49188 + }, + { + "epoch": 0.8773409909749225, + "grad_norm": 0.2859754264354706, + "learning_rate": 2.256115090856617e-06, + "loss": 0.0877, + "step": 49189 + }, + { + "epoch": 0.8773588270966362, + "grad_norm": 0.2596757113933563, + "learning_rate": 2.2554689590692123e-06, + "loss": 0.1293, + "step": 49190 + }, + { + "epoch": 0.8773766632183498, + "grad_norm": 0.28929105401039124, + "learning_rate": 2.2548229154474687e-06, + "loss": 0.1235, + "step": 49191 + }, + { + "epoch": 0.8773944993400635, + "grad_norm": 0.3321802020072937, + "learning_rate": 2.2541769599938813e-06, + "loss": 0.1362, + "step": 49192 + }, + { + "epoch": 0.8774123354617772, + "grad_norm": 0.2534790337085724, + "learning_rate": 2.253531092710964e-06, + "loss": 0.1197, + "step": 49193 + }, + { + "epoch": 0.8774301715834909, + "grad_norm": 0.23860037326812744, + "learning_rate": 2.2528853136012164e-06, + "loss": 0.083, + "step": 49194 + }, + { + "epoch": 0.8774480077052046, + "grad_norm": 0.3414667546749115, + "learning_rate": 2.2522396226671432e-06, + "loss": 0.1303, + "step": 49195 + }, + { + "epoch": 0.8774658438269183, + "grad_norm": 0.19862067699432373, + "learning_rate": 2.2515940199112408e-06, + "loss": 0.0688, + "step": 49196 + }, + { + "epoch": 0.877483679948632, + "grad_norm": 0.30365604162216187, + "learning_rate": 2.25094850533602e-06, + "loss": 0.1066, + "step": 49197 + }, + { + "epoch": 0.8775015160703457, + "grad_norm": 0.21867285668849945, + "learning_rate": 2.250303078943983e-06, + "loss": 0.0978, + "step": 49198 + }, + { + "epoch": 0.8775193521920593, + "grad_norm": 0.21739085018634796, + "learning_rate": 2.249657740737626e-06, + "loss": 0.079, + "step": 49199 + }, + { + "epoch": 0.877537188313773, + "grad_norm": 0.2717151641845703, + "learning_rate": 2.2490124907194513e-06, + "loss": 0.0733, + "step": 49200 + }, + { + "epoch": 0.8775550244354867, + "grad_norm": 0.28030017018318176, + "learning_rate": 2.248367328891965e-06, + "loss": 0.1617, + "step": 49201 + }, + { + "epoch": 0.8775728605572004, + "grad_norm": 0.2272324562072754, + "learning_rate": 2.2477222552576615e-06, + "loss": 0.1178, + "step": 49202 + }, + { + "epoch": 0.8775906966789141, + "grad_norm": 0.2215447574853897, + "learning_rate": 2.247077269819048e-06, + "loss": 0.115, + "step": 49203 + }, + { + "epoch": 0.8776085328006278, + "grad_norm": 0.26245278120040894, + "learning_rate": 2.2464323725786217e-06, + "loss": 0.0922, + "step": 49204 + }, + { + "epoch": 0.8776263689223415, + "grad_norm": 0.3499809205532074, + "learning_rate": 2.2457875635388786e-06, + "loss": 0.1072, + "step": 49205 + }, + { + "epoch": 0.8776442050440553, + "grad_norm": 0.44295820593833923, + "learning_rate": 2.2451428427023276e-06, + "loss": 0.1676, + "step": 49206 + }, + { + "epoch": 0.877662041165769, + "grad_norm": 0.34075355529785156, + "learning_rate": 2.2444982100714607e-06, + "loss": 0.0812, + "step": 49207 + }, + { + "epoch": 0.8776798772874826, + "grad_norm": 0.31146571040153503, + "learning_rate": 2.243853665648779e-06, + "loss": 0.1255, + "step": 49208 + }, + { + "epoch": 0.8776977134091963, + "grad_norm": 0.44378989934921265, + "learning_rate": 2.2432092094367753e-06, + "loss": 0.1322, + "step": 49209 + }, + { + "epoch": 0.87771554953091, + "grad_norm": 0.20723335444927216, + "learning_rate": 2.242564841437958e-06, + "loss": 0.0893, + "step": 49210 + }, + { + "epoch": 0.8777333856526237, + "grad_norm": 0.39754387736320496, + "learning_rate": 2.24192056165482e-06, + "loss": 0.0954, + "step": 49211 + }, + { + "epoch": 0.8777512217743374, + "grad_norm": 0.27974045276641846, + "learning_rate": 2.2412763700898563e-06, + "loss": 0.1233, + "step": 49212 + }, + { + "epoch": 0.8777690578960511, + "grad_norm": 0.2563789188861847, + "learning_rate": 2.2406322667455653e-06, + "loss": 0.1564, + "step": 49213 + }, + { + "epoch": 0.8777868940177648, + "grad_norm": 0.4163587987422943, + "learning_rate": 2.239988251624442e-06, + "loss": 0.1092, + "step": 49214 + }, + { + "epoch": 0.8778047301394785, + "grad_norm": 0.22461192309856415, + "learning_rate": 2.2393443247289842e-06, + "loss": 0.1045, + "step": 49215 + }, + { + "epoch": 0.8778225662611921, + "grad_norm": 0.2711215615272522, + "learning_rate": 2.2387004860616955e-06, + "loss": 0.1264, + "step": 49216 + }, + { + "epoch": 0.8778404023829058, + "grad_norm": 0.2895231544971466, + "learning_rate": 2.238056735625063e-06, + "loss": 0.1195, + "step": 49217 + }, + { + "epoch": 0.8778582385046195, + "grad_norm": 0.19543756544589996, + "learning_rate": 2.237413073421582e-06, + "loss": 0.0809, + "step": 49218 + }, + { + "epoch": 0.8778760746263332, + "grad_norm": 0.30299803614616394, + "learning_rate": 2.236769499453753e-06, + "loss": 0.1072, + "step": 49219 + }, + { + "epoch": 0.8778939107480469, + "grad_norm": 0.26403459906578064, + "learning_rate": 2.2361260137240657e-06, + "loss": 0.1134, + "step": 49220 + }, + { + "epoch": 0.8779117468697606, + "grad_norm": 0.24129967391490936, + "learning_rate": 2.2354826162350212e-06, + "loss": 0.1092, + "step": 49221 + }, + { + "epoch": 0.8779295829914744, + "grad_norm": 0.22784292697906494, + "learning_rate": 2.2348393069891005e-06, + "loss": 0.064, + "step": 49222 + }, + { + "epoch": 0.8779474191131881, + "grad_norm": 0.31371980905532837, + "learning_rate": 2.23419608598881e-06, + "loss": 0.1511, + "step": 49223 + }, + { + "epoch": 0.8779652552349018, + "grad_norm": 0.31481418013572693, + "learning_rate": 2.2335529532366394e-06, + "loss": 0.1166, + "step": 49224 + }, + { + "epoch": 0.8779830913566155, + "grad_norm": 0.3452967405319214, + "learning_rate": 2.2329099087350812e-06, + "loss": 0.1224, + "step": 49225 + }, + { + "epoch": 0.8780009274783291, + "grad_norm": 0.3746187686920166, + "learning_rate": 2.232266952486628e-06, + "loss": 0.0847, + "step": 49226 + }, + { + "epoch": 0.8780187636000428, + "grad_norm": 0.2701776325702667, + "learning_rate": 2.2316240844937665e-06, + "loss": 0.1073, + "step": 49227 + }, + { + "epoch": 0.8780365997217565, + "grad_norm": 0.3587571084499359, + "learning_rate": 2.2309813047589973e-06, + "loss": 0.11, + "step": 49228 + }, + { + "epoch": 0.8780544358434702, + "grad_norm": 0.2687029540538788, + "learning_rate": 2.230338613284805e-06, + "loss": 0.1121, + "step": 49229 + }, + { + "epoch": 0.8780722719651839, + "grad_norm": 0.32443615794181824, + "learning_rate": 2.2296960100736895e-06, + "loss": 0.1115, + "step": 49230 + }, + { + "epoch": 0.8780901080868976, + "grad_norm": 0.24746912717819214, + "learning_rate": 2.229053495128133e-06, + "loss": 0.1013, + "step": 49231 + }, + { + "epoch": 0.8781079442086113, + "grad_norm": 0.26213327050209045, + "learning_rate": 2.228411068450634e-06, + "loss": 0.1387, + "step": 49232 + }, + { + "epoch": 0.878125780330325, + "grad_norm": 0.29794061183929443, + "learning_rate": 2.2277687300436776e-06, + "loss": 0.1189, + "step": 49233 + }, + { + "epoch": 0.8781436164520386, + "grad_norm": 0.2491191178560257, + "learning_rate": 2.227126479909758e-06, + "loss": 0.1081, + "step": 49234 + }, + { + "epoch": 0.8781614525737523, + "grad_norm": 0.20252706110477448, + "learning_rate": 2.2264843180513555e-06, + "loss": 0.097, + "step": 49235 + }, + { + "epoch": 0.878179288695466, + "grad_norm": 0.2804051637649536, + "learning_rate": 2.225842244470969e-06, + "loss": 0.1277, + "step": 49236 + }, + { + "epoch": 0.8781971248171797, + "grad_norm": 0.29793640971183777, + "learning_rate": 2.2252002591710842e-06, + "loss": 0.1449, + "step": 49237 + }, + { + "epoch": 0.8782149609388934, + "grad_norm": 0.28213897347450256, + "learning_rate": 2.2245583621541886e-06, + "loss": 0.1098, + "step": 49238 + }, + { + "epoch": 0.8782327970606072, + "grad_norm": 0.26342231035232544, + "learning_rate": 2.2239165534227726e-06, + "loss": 0.1301, + "step": 49239 + }, + { + "epoch": 0.8782506331823209, + "grad_norm": 0.24856138229370117, + "learning_rate": 2.223274832979319e-06, + "loss": 0.1059, + "step": 49240 + }, + { + "epoch": 0.8782684693040346, + "grad_norm": 0.28711897134780884, + "learning_rate": 2.222633200826321e-06, + "loss": 0.0778, + "step": 49241 + }, + { + "epoch": 0.8782863054257483, + "grad_norm": 0.23394736647605896, + "learning_rate": 2.2219916569662603e-06, + "loss": 0.0891, + "step": 49242 + }, + { + "epoch": 0.878304141547462, + "grad_norm": 0.35065212845802307, + "learning_rate": 2.221350201401634e-06, + "loss": 0.1434, + "step": 49243 + }, + { + "epoch": 0.8783219776691756, + "grad_norm": 0.24713405966758728, + "learning_rate": 2.220708834134913e-06, + "loss": 0.1105, + "step": 49244 + }, + { + "epoch": 0.8783398137908893, + "grad_norm": 0.40670716762542725, + "learning_rate": 2.2200675551686005e-06, + "loss": 0.0873, + "step": 49245 + }, + { + "epoch": 0.878357649912603, + "grad_norm": 0.3071151673793793, + "learning_rate": 2.2194263645051728e-06, + "loss": 0.191, + "step": 49246 + }, + { + "epoch": 0.8783754860343167, + "grad_norm": 0.3479996919631958, + "learning_rate": 2.218785262147116e-06, + "loss": 0.1203, + "step": 49247 + }, + { + "epoch": 0.8783933221560304, + "grad_norm": 0.21634791791439056, + "learning_rate": 2.2181442480969124e-06, + "loss": 0.0968, + "step": 49248 + }, + { + "epoch": 0.8784111582777441, + "grad_norm": 0.3879588842391968, + "learning_rate": 2.2175033223570535e-06, + "loss": 0.0646, + "step": 49249 + }, + { + "epoch": 0.8784289943994578, + "grad_norm": 0.2578437030315399, + "learning_rate": 2.216862484930021e-06, + "loss": 0.1113, + "step": 49250 + }, + { + "epoch": 0.8784468305211715, + "grad_norm": 0.3589254915714264, + "learning_rate": 2.2162217358182997e-06, + "loss": 0.1103, + "step": 49251 + }, + { + "epoch": 0.8784646666428851, + "grad_norm": 0.40773728489875793, + "learning_rate": 2.2155810750243726e-06, + "loss": 0.0884, + "step": 49252 + }, + { + "epoch": 0.8784825027645988, + "grad_norm": 0.17489473521709442, + "learning_rate": 2.214940502550716e-06, + "loss": 0.1007, + "step": 49253 + }, + { + "epoch": 0.8785003388863125, + "grad_norm": 0.2856663763523102, + "learning_rate": 2.214300018399826e-06, + "loss": 0.1178, + "step": 49254 + }, + { + "epoch": 0.8785181750080262, + "grad_norm": 0.30280280113220215, + "learning_rate": 2.213659622574177e-06, + "loss": 0.0938, + "step": 49255 + }, + { + "epoch": 0.87853601112974, + "grad_norm": 0.244004026055336, + "learning_rate": 2.213019315076251e-06, + "loss": 0.115, + "step": 49256 + }, + { + "epoch": 0.8785538472514537, + "grad_norm": 0.2830543518066406, + "learning_rate": 2.212379095908537e-06, + "loss": 0.1213, + "step": 49257 + }, + { + "epoch": 0.8785716833731674, + "grad_norm": 0.22903460264205933, + "learning_rate": 2.2117389650735065e-06, + "loss": 0.1131, + "step": 49258 + }, + { + "epoch": 0.8785895194948811, + "grad_norm": 0.3859170973300934, + "learning_rate": 2.211098922573651e-06, + "loss": 0.1788, + "step": 49259 + }, + { + "epoch": 0.8786073556165948, + "grad_norm": 0.18580417335033417, + "learning_rate": 2.2104589684114497e-06, + "loss": 0.0822, + "step": 49260 + }, + { + "epoch": 0.8786251917383084, + "grad_norm": 0.24412298202514648, + "learning_rate": 2.2098191025893723e-06, + "loss": 0.0813, + "step": 49261 + }, + { + "epoch": 0.8786430278600221, + "grad_norm": 0.2092595398426056, + "learning_rate": 2.209179325109914e-06, + "loss": 0.0807, + "step": 49262 + }, + { + "epoch": 0.8786608639817358, + "grad_norm": 0.23754996061325073, + "learning_rate": 2.208539635975548e-06, + "loss": 0.1068, + "step": 49263 + }, + { + "epoch": 0.8786787001034495, + "grad_norm": 0.22136349976062775, + "learning_rate": 2.2079000351887565e-06, + "loss": 0.0951, + "step": 49264 + }, + { + "epoch": 0.8786965362251632, + "grad_norm": 0.2848581075668335, + "learning_rate": 2.207260522752014e-06, + "loss": 0.1106, + "step": 49265 + }, + { + "epoch": 0.8787143723468769, + "grad_norm": 0.2509171962738037, + "learning_rate": 2.2066210986677964e-06, + "loss": 0.0657, + "step": 49266 + }, + { + "epoch": 0.8787322084685906, + "grad_norm": 0.26343268156051636, + "learning_rate": 2.205981762938594e-06, + "loss": 0.0839, + "step": 49267 + }, + { + "epoch": 0.8787500445903043, + "grad_norm": 0.25738710165023804, + "learning_rate": 2.20534251556688e-06, + "loss": 0.1268, + "step": 49268 + }, + { + "epoch": 0.878767880712018, + "grad_norm": 0.25700485706329346, + "learning_rate": 2.204703356555127e-06, + "loss": 0.0922, + "step": 49269 + }, + { + "epoch": 0.8787857168337316, + "grad_norm": 0.23591314256191254, + "learning_rate": 2.2040642859058214e-06, + "loss": 0.0881, + "step": 49270 + }, + { + "epoch": 0.8788035529554453, + "grad_norm": 0.29068422317504883, + "learning_rate": 2.2034253036214316e-06, + "loss": 0.1331, + "step": 49271 + }, + { + "epoch": 0.878821389077159, + "grad_norm": 0.2947634160518646, + "learning_rate": 2.202786409704441e-06, + "loss": 0.075, + "step": 49272 + }, + { + "epoch": 0.8788392251988728, + "grad_norm": 0.30926308035850525, + "learning_rate": 2.2021476041573287e-06, + "loss": 0.1341, + "step": 49273 + }, + { + "epoch": 0.8788570613205865, + "grad_norm": 0.1835303157567978, + "learning_rate": 2.201508886982559e-06, + "loss": 0.0682, + "step": 49274 + }, + { + "epoch": 0.8788748974423002, + "grad_norm": 0.26731470227241516, + "learning_rate": 2.200870258182619e-06, + "loss": 0.1124, + "step": 49275 + }, + { + "epoch": 0.8788927335640139, + "grad_norm": 0.2902938425540924, + "learning_rate": 2.200231717759982e-06, + "loss": 0.1386, + "step": 49276 + }, + { + "epoch": 0.8789105696857276, + "grad_norm": 0.24245618283748627, + "learning_rate": 2.1995932657171202e-06, + "loss": 0.1325, + "step": 49277 + }, + { + "epoch": 0.8789284058074412, + "grad_norm": 0.204793319106102, + "learning_rate": 2.1989549020565104e-06, + "loss": 0.1252, + "step": 49278 + }, + { + "epoch": 0.8789462419291549, + "grad_norm": 0.2994351387023926, + "learning_rate": 2.198316626780625e-06, + "loss": 0.1284, + "step": 49279 + }, + { + "epoch": 0.8789640780508686, + "grad_norm": 0.337112694978714, + "learning_rate": 2.1976784398919396e-06, + "loss": 0.1566, + "step": 49280 + }, + { + "epoch": 0.8789819141725823, + "grad_norm": 0.26452934741973877, + "learning_rate": 2.197040341392931e-06, + "loss": 0.1339, + "step": 49281 + }, + { + "epoch": 0.878999750294296, + "grad_norm": 0.3279476761817932, + "learning_rate": 2.196402331286071e-06, + "loss": 0.0903, + "step": 49282 + }, + { + "epoch": 0.8790175864160097, + "grad_norm": 0.21207202970981598, + "learning_rate": 2.195764409573825e-06, + "loss": 0.0949, + "step": 49283 + }, + { + "epoch": 0.8790354225377234, + "grad_norm": 0.2903973460197449, + "learning_rate": 2.195126576258677e-06, + "loss": 0.1153, + "step": 49284 + }, + { + "epoch": 0.8790532586594371, + "grad_norm": 0.6664719581604004, + "learning_rate": 2.1944888313430917e-06, + "loss": 0.1925, + "step": 49285 + }, + { + "epoch": 0.8790710947811508, + "grad_norm": 0.17960165441036224, + "learning_rate": 2.1938511748295476e-06, + "loss": 0.0982, + "step": 49286 + }, + { + "epoch": 0.8790889309028644, + "grad_norm": 0.2270062267780304, + "learning_rate": 2.1932136067205096e-06, + "loss": 0.1009, + "step": 49287 + }, + { + "epoch": 0.8791067670245781, + "grad_norm": 0.34451597929000854, + "learning_rate": 2.1925761270184584e-06, + "loss": 0.108, + "step": 49288 + }, + { + "epoch": 0.8791246031462918, + "grad_norm": 0.24558772146701813, + "learning_rate": 2.191938735725857e-06, + "loss": 0.0984, + "step": 49289 + }, + { + "epoch": 0.8791424392680056, + "grad_norm": 0.28395959734916687, + "learning_rate": 2.1913014328451803e-06, + "loss": 0.1147, + "step": 49290 + }, + { + "epoch": 0.8791602753897193, + "grad_norm": 0.3853025734424591, + "learning_rate": 2.190664218378899e-06, + "loss": 0.1593, + "step": 49291 + }, + { + "epoch": 0.879178111511433, + "grad_norm": 0.3152863681316376, + "learning_rate": 2.190027092329475e-06, + "loss": 0.1532, + "step": 49292 + }, + { + "epoch": 0.8791959476331467, + "grad_norm": 0.2937759459018707, + "learning_rate": 2.189390054699386e-06, + "loss": 0.0992, + "step": 49293 + }, + { + "epoch": 0.8792137837548604, + "grad_norm": 0.22882647812366486, + "learning_rate": 2.1887531054911035e-06, + "loss": 0.123, + "step": 49294 + }, + { + "epoch": 0.879231619876574, + "grad_norm": 0.23416295647621155, + "learning_rate": 2.188116244707092e-06, + "loss": 0.0773, + "step": 49295 + }, + { + "epoch": 0.8792494559982877, + "grad_norm": 0.2864985466003418, + "learning_rate": 2.1874794723498156e-06, + "loss": 0.0959, + "step": 49296 + }, + { + "epoch": 0.8792672921200014, + "grad_norm": 0.2612197697162628, + "learning_rate": 2.186842788421753e-06, + "loss": 0.1317, + "step": 49297 + }, + { + "epoch": 0.8792851282417151, + "grad_norm": 0.2376280128955841, + "learning_rate": 2.186206192925361e-06, + "loss": 0.0839, + "step": 49298 + }, + { + "epoch": 0.8793029643634288, + "grad_norm": 0.2539444863796234, + "learning_rate": 2.18556968586312e-06, + "loss": 0.1146, + "step": 49299 + }, + { + "epoch": 0.8793208004851425, + "grad_norm": 0.22710853815078735, + "learning_rate": 2.184933267237485e-06, + "loss": 0.1086, + "step": 49300 + }, + { + "epoch": 0.8793386366068562, + "grad_norm": 0.30184367299079895, + "learning_rate": 2.1842969370509336e-06, + "loss": 0.1491, + "step": 49301 + }, + { + "epoch": 0.8793564727285699, + "grad_norm": 0.24350878596305847, + "learning_rate": 2.183660695305928e-06, + "loss": 0.1095, + "step": 49302 + }, + { + "epoch": 0.8793743088502836, + "grad_norm": 0.2632853388786316, + "learning_rate": 2.1830245420049356e-06, + "loss": 0.1342, + "step": 49303 + }, + { + "epoch": 0.8793921449719972, + "grad_norm": 0.19945082068443298, + "learning_rate": 2.1823884771504183e-06, + "loss": 0.0918, + "step": 49304 + }, + { + "epoch": 0.8794099810937109, + "grad_norm": 0.31310898065567017, + "learning_rate": 2.181752500744841e-06, + "loss": 0.1196, + "step": 49305 + }, + { + "epoch": 0.8794278172154246, + "grad_norm": 0.3122529983520508, + "learning_rate": 2.1811166127906763e-06, + "loss": 0.1056, + "step": 49306 + }, + { + "epoch": 0.8794456533371384, + "grad_norm": 0.3728542625904083, + "learning_rate": 2.1804808132903835e-06, + "loss": 0.1174, + "step": 49307 + }, + { + "epoch": 0.8794634894588521, + "grad_norm": 0.21311181783676147, + "learning_rate": 2.1798451022464305e-06, + "loss": 0.126, + "step": 49308 + }, + { + "epoch": 0.8794813255805658, + "grad_norm": 0.27999240159988403, + "learning_rate": 2.179209479661273e-06, + "loss": 0.1089, + "step": 49309 + }, + { + "epoch": 0.8794991617022795, + "grad_norm": 0.2537831962108612, + "learning_rate": 2.1785739455373903e-06, + "loss": 0.1235, + "step": 49310 + }, + { + "epoch": 0.8795169978239932, + "grad_norm": 0.22349326312541962, + "learning_rate": 2.177938499877233e-06, + "loss": 0.1066, + "step": 49311 + }, + { + "epoch": 0.8795348339457069, + "grad_norm": 0.3453611135482788, + "learning_rate": 2.177303142683265e-06, + "loss": 0.1112, + "step": 49312 + }, + { + "epoch": 0.8795526700674206, + "grad_norm": 0.2920825779438019, + "learning_rate": 2.176667873957955e-06, + "loss": 0.108, + "step": 49313 + }, + { + "epoch": 0.8795705061891342, + "grad_norm": 0.2659646272659302, + "learning_rate": 2.1760326937037665e-06, + "loss": 0.0969, + "step": 49314 + }, + { + "epoch": 0.8795883423108479, + "grad_norm": 0.29027992486953735, + "learning_rate": 2.175397601923157e-06, + "loss": 0.1476, + "step": 49315 + }, + { + "epoch": 0.8796061784325616, + "grad_norm": 0.3211362957954407, + "learning_rate": 2.1747625986185932e-06, + "loss": 0.117, + "step": 49316 + }, + { + "epoch": 0.8796240145542753, + "grad_norm": 0.5697280168533325, + "learning_rate": 2.174127683792529e-06, + "loss": 0.1293, + "step": 49317 + }, + { + "epoch": 0.879641850675989, + "grad_norm": 0.26624172925949097, + "learning_rate": 2.173492857447429e-06, + "loss": 0.1019, + "step": 49318 + }, + { + "epoch": 0.8796596867977027, + "grad_norm": 0.4039880335330963, + "learning_rate": 2.172858119585755e-06, + "loss": 0.1098, + "step": 49319 + }, + { + "epoch": 0.8796775229194164, + "grad_norm": 0.3455815017223358, + "learning_rate": 2.1722234702099718e-06, + "loss": 0.0609, + "step": 49320 + }, + { + "epoch": 0.87969535904113, + "grad_norm": 0.4179365932941437, + "learning_rate": 2.1715889093225307e-06, + "loss": 0.1329, + "step": 49321 + }, + { + "epoch": 0.8797131951628437, + "grad_norm": 0.2731788754463196, + "learning_rate": 2.170954436925893e-06, + "loss": 0.1196, + "step": 49322 + }, + { + "epoch": 0.8797310312845575, + "grad_norm": 0.2133973389863968, + "learning_rate": 2.170320053022526e-06, + "loss": 0.0921, + "step": 49323 + }, + { + "epoch": 0.8797488674062712, + "grad_norm": 0.25417691469192505, + "learning_rate": 2.1696857576148837e-06, + "loss": 0.0912, + "step": 49324 + }, + { + "epoch": 0.8797667035279849, + "grad_norm": 0.23578456044197083, + "learning_rate": 2.16905155070542e-06, + "loss": 0.1089, + "step": 49325 + }, + { + "epoch": 0.8797845396496986, + "grad_norm": 0.3673192262649536, + "learning_rate": 2.1684174322966017e-06, + "loss": 0.0986, + "step": 49326 + }, + { + "epoch": 0.8798023757714123, + "grad_norm": 0.2576500475406647, + "learning_rate": 2.16778340239088e-06, + "loss": 0.0778, + "step": 49327 + }, + { + "epoch": 0.879820211893126, + "grad_norm": 0.2585069239139557, + "learning_rate": 2.1671494609907167e-06, + "loss": 0.122, + "step": 49328 + }, + { + "epoch": 0.8798380480148397, + "grad_norm": 0.22544099390506744, + "learning_rate": 2.166515608098571e-06, + "loss": 0.1253, + "step": 49329 + }, + { + "epoch": 0.8798558841365534, + "grad_norm": 0.3013015687465668, + "learning_rate": 2.1658818437168992e-06, + "loss": 0.1107, + "step": 49330 + }, + { + "epoch": 0.879873720258267, + "grad_norm": 0.24815087020397186, + "learning_rate": 2.1652481678481497e-06, + "loss": 0.1403, + "step": 49331 + }, + { + "epoch": 0.8798915563799807, + "grad_norm": 0.297088086605072, + "learning_rate": 2.164614580494789e-06, + "loss": 0.1426, + "step": 49332 + }, + { + "epoch": 0.8799093925016944, + "grad_norm": 0.25410720705986023, + "learning_rate": 2.1639810816592693e-06, + "loss": 0.1499, + "step": 49333 + }, + { + "epoch": 0.8799272286234081, + "grad_norm": 0.4218274652957916, + "learning_rate": 2.163347671344046e-06, + "loss": 0.0907, + "step": 49334 + }, + { + "epoch": 0.8799450647451218, + "grad_norm": 0.32722097635269165, + "learning_rate": 2.162714349551573e-06, + "loss": 0.1314, + "step": 49335 + }, + { + "epoch": 0.8799629008668355, + "grad_norm": 0.30587467551231384, + "learning_rate": 2.1620811162843093e-06, + "loss": 0.1159, + "step": 49336 + }, + { + "epoch": 0.8799807369885492, + "grad_norm": 0.27871185541152954, + "learning_rate": 2.1614479715447055e-06, + "loss": 0.1104, + "step": 49337 + }, + { + "epoch": 0.8799985731102629, + "grad_norm": 0.3603885769844055, + "learning_rate": 2.1608149153352185e-06, + "loss": 0.1654, + "step": 49338 + }, + { + "epoch": 0.8800164092319765, + "grad_norm": 0.29584890604019165, + "learning_rate": 2.160181947658299e-06, + "loss": 0.1242, + "step": 49339 + }, + { + "epoch": 0.8800342453536903, + "grad_norm": 0.2352956384420395, + "learning_rate": 2.159549068516406e-06, + "loss": 0.1289, + "step": 49340 + }, + { + "epoch": 0.880052081475404, + "grad_norm": 0.2944217026233673, + "learning_rate": 2.158916277911985e-06, + "loss": 0.1061, + "step": 49341 + }, + { + "epoch": 0.8800699175971177, + "grad_norm": 0.25249195098876953, + "learning_rate": 2.158283575847497e-06, + "loss": 0.0729, + "step": 49342 + }, + { + "epoch": 0.8800877537188314, + "grad_norm": 0.27610111236572266, + "learning_rate": 2.1576509623253944e-06, + "loss": 0.1174, + "step": 49343 + }, + { + "epoch": 0.8801055898405451, + "grad_norm": 0.19652724266052246, + "learning_rate": 2.1570184373481184e-06, + "loss": 0.0654, + "step": 49344 + }, + { + "epoch": 0.8801234259622588, + "grad_norm": 0.29927709698677063, + "learning_rate": 2.156386000918134e-06, + "loss": 0.1493, + "step": 49345 + }, + { + "epoch": 0.8801412620839725, + "grad_norm": 0.214554563164711, + "learning_rate": 2.155753653037887e-06, + "loss": 0.1486, + "step": 49346 + }, + { + "epoch": 0.8801590982056862, + "grad_norm": 0.294188529253006, + "learning_rate": 2.15512139370983e-06, + "loss": 0.1248, + "step": 49347 + }, + { + "epoch": 0.8801769343273999, + "grad_norm": 0.2982638478279114, + "learning_rate": 2.1544892229364093e-06, + "loss": 0.1657, + "step": 49348 + }, + { + "epoch": 0.8801947704491135, + "grad_norm": 0.2377844899892807, + "learning_rate": 2.1538571407200807e-06, + "loss": 0.1073, + "step": 49349 + }, + { + "epoch": 0.8802126065708272, + "grad_norm": 0.2099645882844925, + "learning_rate": 2.1532251470632952e-06, + "loss": 0.0666, + "step": 49350 + }, + { + "epoch": 0.8802304426925409, + "grad_norm": 0.2671528458595276, + "learning_rate": 2.152593241968498e-06, + "loss": 0.1101, + "step": 49351 + }, + { + "epoch": 0.8802482788142546, + "grad_norm": 0.2318814992904663, + "learning_rate": 2.151961425438137e-06, + "loss": 0.0547, + "step": 49352 + }, + { + "epoch": 0.8802661149359683, + "grad_norm": 0.2701147198677063, + "learning_rate": 2.151329697474669e-06, + "loss": 0.1063, + "step": 49353 + }, + { + "epoch": 0.880283951057682, + "grad_norm": 0.2808949053287506, + "learning_rate": 2.150698058080536e-06, + "loss": 0.1298, + "step": 49354 + }, + { + "epoch": 0.8803017871793957, + "grad_norm": 0.23980826139450073, + "learning_rate": 2.1500665072581914e-06, + "loss": 0.0619, + "step": 49355 + }, + { + "epoch": 0.8803196233011094, + "grad_norm": 0.3192655146121979, + "learning_rate": 2.1494350450100815e-06, + "loss": 0.0892, + "step": 49356 + }, + { + "epoch": 0.8803374594228232, + "grad_norm": 0.20160838961601257, + "learning_rate": 2.1488036713386505e-06, + "loss": 0.1016, + "step": 49357 + }, + { + "epoch": 0.8803552955445368, + "grad_norm": 0.24458405375480652, + "learning_rate": 2.1481723862463528e-06, + "loss": 0.1748, + "step": 49358 + }, + { + "epoch": 0.8803731316662505, + "grad_norm": 0.3356369435787201, + "learning_rate": 2.147541189735633e-06, + "loss": 0.1232, + "step": 49359 + }, + { + "epoch": 0.8803909677879642, + "grad_norm": 0.18355625867843628, + "learning_rate": 2.146910081808934e-06, + "loss": 0.1513, + "step": 49360 + }, + { + "epoch": 0.8804088039096779, + "grad_norm": 0.31734001636505127, + "learning_rate": 2.1462790624687033e-06, + "loss": 0.1398, + "step": 49361 + }, + { + "epoch": 0.8804266400313916, + "grad_norm": 0.23904845118522644, + "learning_rate": 2.14564813171739e-06, + "loss": 0.0939, + "step": 49362 + }, + { + "epoch": 0.8804444761531053, + "grad_norm": 0.25867345929145813, + "learning_rate": 2.145017289557441e-06, + "loss": 0.0998, + "step": 49363 + }, + { + "epoch": 0.880462312274819, + "grad_norm": 0.3008505702018738, + "learning_rate": 2.1443865359912968e-06, + "loss": 0.1234, + "step": 49364 + }, + { + "epoch": 0.8804801483965327, + "grad_norm": 0.25963446497917175, + "learning_rate": 2.1437558710213997e-06, + "loss": 0.1308, + "step": 49365 + }, + { + "epoch": 0.8804979845182463, + "grad_norm": 0.21107198297977448, + "learning_rate": 2.1431252946502057e-06, + "loss": 0.0556, + "step": 49366 + }, + { + "epoch": 0.88051582063996, + "grad_norm": 0.2771542966365814, + "learning_rate": 2.1424948068801492e-06, + "loss": 0.1011, + "step": 49367 + }, + { + "epoch": 0.8805336567616737, + "grad_norm": 0.2519318163394928, + "learning_rate": 2.1418644077136756e-06, + "loss": 0.1044, + "step": 49368 + }, + { + "epoch": 0.8805514928833874, + "grad_norm": 0.2567456364631653, + "learning_rate": 2.1412340971532357e-06, + "loss": 0.134, + "step": 49369 + }, + { + "epoch": 0.8805693290051011, + "grad_norm": 0.2275715470314026, + "learning_rate": 2.140603875201261e-06, + "loss": 0.11, + "step": 49370 + }, + { + "epoch": 0.8805871651268148, + "grad_norm": 0.35440659523010254, + "learning_rate": 2.1399737418602075e-06, + "loss": 0.0986, + "step": 49371 + }, + { + "epoch": 0.8806050012485285, + "grad_norm": 0.39519476890563965, + "learning_rate": 2.1393436971325094e-06, + "loss": 0.1262, + "step": 49372 + }, + { + "epoch": 0.8806228373702422, + "grad_norm": 0.31076329946517944, + "learning_rate": 2.1387137410206123e-06, + "loss": 0.1457, + "step": 49373 + }, + { + "epoch": 0.880640673491956, + "grad_norm": 0.2845359444618225, + "learning_rate": 2.13808387352695e-06, + "loss": 0.1429, + "step": 49374 + }, + { + "epoch": 0.8806585096136696, + "grad_norm": 0.22091351449489594, + "learning_rate": 2.137454094653979e-06, + "loss": 0.1167, + "step": 49375 + }, + { + "epoch": 0.8806763457353833, + "grad_norm": 0.2092379629611969, + "learning_rate": 2.136824404404131e-06, + "loss": 0.1063, + "step": 49376 + }, + { + "epoch": 0.880694181857097, + "grad_norm": 0.3307375907897949, + "learning_rate": 2.136194802779845e-06, + "loss": 0.1201, + "step": 49377 + }, + { + "epoch": 0.8807120179788107, + "grad_norm": 0.36467018723487854, + "learning_rate": 2.1355652897835645e-06, + "loss": 0.1028, + "step": 49378 + }, + { + "epoch": 0.8807298541005244, + "grad_norm": 0.25610044598579407, + "learning_rate": 2.134935865417734e-06, + "loss": 0.1103, + "step": 49379 + }, + { + "epoch": 0.8807476902222381, + "grad_norm": 0.35355618596076965, + "learning_rate": 2.1343065296847875e-06, + "loss": 0.1943, + "step": 49380 + }, + { + "epoch": 0.8807655263439518, + "grad_norm": 0.2638136148452759, + "learning_rate": 2.1336772825871628e-06, + "loss": 0.1262, + "step": 49381 + }, + { + "epoch": 0.8807833624656655, + "grad_norm": 0.31387999653816223, + "learning_rate": 2.133048124127307e-06, + "loss": 0.105, + "step": 49382 + }, + { + "epoch": 0.8808011985873792, + "grad_norm": 0.25916004180908203, + "learning_rate": 2.132419054307652e-06, + "loss": 0.1289, + "step": 49383 + }, + { + "epoch": 0.8808190347090928, + "grad_norm": 0.2242920696735382, + "learning_rate": 2.1317900731306434e-06, + "loss": 0.0723, + "step": 49384 + }, + { + "epoch": 0.8808368708308065, + "grad_norm": 0.20153427124023438, + "learning_rate": 2.131161180598712e-06, + "loss": 0.1049, + "step": 49385 + }, + { + "epoch": 0.8808547069525202, + "grad_norm": 0.27675026655197144, + "learning_rate": 2.1305323767143002e-06, + "loss": 0.1339, + "step": 49386 + }, + { + "epoch": 0.8808725430742339, + "grad_norm": 0.3084661364555359, + "learning_rate": 2.12990366147984e-06, + "loss": 0.1133, + "step": 49387 + }, + { + "epoch": 0.8808903791959476, + "grad_norm": 0.5250598788261414, + "learning_rate": 2.1292750348977765e-06, + "loss": 0.0935, + "step": 49388 + }, + { + "epoch": 0.8809082153176613, + "grad_norm": 0.2611176073551178, + "learning_rate": 2.1286464969705404e-06, + "loss": 0.1012, + "step": 49389 + }, + { + "epoch": 0.880926051439375, + "grad_norm": 0.24788115918636322, + "learning_rate": 2.128018047700572e-06, + "loss": 0.1016, + "step": 49390 + }, + { + "epoch": 0.8809438875610888, + "grad_norm": 0.2953091561794281, + "learning_rate": 2.1273896870903e-06, + "loss": 0.1377, + "step": 49391 + }, + { + "epoch": 0.8809617236828025, + "grad_norm": 0.25488021969795227, + "learning_rate": 2.12676141514217e-06, + "loss": 0.0818, + "step": 49392 + }, + { + "epoch": 0.8809795598045161, + "grad_norm": 0.24367518723011017, + "learning_rate": 2.1261332318586125e-06, + "loss": 0.0903, + "step": 49393 + }, + { + "epoch": 0.8809973959262298, + "grad_norm": 0.5073939561843872, + "learning_rate": 2.1255051372420648e-06, + "loss": 0.1471, + "step": 49394 + }, + { + "epoch": 0.8810152320479435, + "grad_norm": 0.3212815225124359, + "learning_rate": 2.1248771312949554e-06, + "loss": 0.1424, + "step": 49395 + }, + { + "epoch": 0.8810330681696572, + "grad_norm": 0.31436392664909363, + "learning_rate": 2.124249214019722e-06, + "loss": 0.1464, + "step": 49396 + }, + { + "epoch": 0.8810509042913709, + "grad_norm": 0.22638578712940216, + "learning_rate": 2.123621385418803e-06, + "loss": 0.179, + "step": 49397 + }, + { + "epoch": 0.8810687404130846, + "grad_norm": 0.21386833488941193, + "learning_rate": 2.1229936454946282e-06, + "loss": 0.0734, + "step": 49398 + }, + { + "epoch": 0.8810865765347983, + "grad_norm": 0.3712170720100403, + "learning_rate": 2.1223659942496336e-06, + "loss": 0.1465, + "step": 49399 + }, + { + "epoch": 0.881104412656512, + "grad_norm": 0.2534027397632599, + "learning_rate": 2.121738431686246e-06, + "loss": 0.1107, + "step": 49400 + }, + { + "epoch": 0.8811222487782256, + "grad_norm": 0.2919429838657379, + "learning_rate": 2.1211109578069045e-06, + "loss": 0.1253, + "step": 49401 + }, + { + "epoch": 0.8811400848999393, + "grad_norm": 0.24583253264427185, + "learning_rate": 2.1204835726140404e-06, + "loss": 0.1118, + "step": 49402 + }, + { + "epoch": 0.881157921021653, + "grad_norm": 0.2129170149564743, + "learning_rate": 2.1198562761100855e-06, + "loss": 0.0787, + "step": 49403 + }, + { + "epoch": 0.8811757571433667, + "grad_norm": 0.21634620428085327, + "learning_rate": 2.1192290682974627e-06, + "loss": 0.1392, + "step": 49404 + }, + { + "epoch": 0.8811935932650804, + "grad_norm": 0.28531765937805176, + "learning_rate": 2.118601949178617e-06, + "loss": 0.1217, + "step": 49405 + }, + { + "epoch": 0.8812114293867941, + "grad_norm": 0.44769516587257385, + "learning_rate": 2.1179749187559745e-06, + "loss": 0.1299, + "step": 49406 + }, + { + "epoch": 0.8812292655085078, + "grad_norm": 0.30657604336738586, + "learning_rate": 2.117347977031961e-06, + "loss": 0.1055, + "step": 49407 + }, + { + "epoch": 0.8812471016302216, + "grad_norm": 0.34153756499290466, + "learning_rate": 2.1167211240090077e-06, + "loss": 0.147, + "step": 49408 + }, + { + "epoch": 0.8812649377519353, + "grad_norm": 0.3517022728919983, + "learning_rate": 2.116094359689552e-06, + "loss": 0.0976, + "step": 49409 + }, + { + "epoch": 0.881282773873649, + "grad_norm": 0.20783376693725586, + "learning_rate": 2.115467684076014e-06, + "loss": 0.0778, + "step": 49410 + }, + { + "epoch": 0.8813006099953626, + "grad_norm": 0.2701862156391144, + "learning_rate": 2.1148410971708304e-06, + "loss": 0.1265, + "step": 49411 + }, + { + "epoch": 0.8813184461170763, + "grad_norm": 0.30719611048698425, + "learning_rate": 2.1142145989764272e-06, + "loss": 0.073, + "step": 49412 + }, + { + "epoch": 0.88133628223879, + "grad_norm": 0.28177008032798767, + "learning_rate": 2.1135881894952303e-06, + "loss": 0.1314, + "step": 49413 + }, + { + "epoch": 0.8813541183605037, + "grad_norm": 0.2552705407142639, + "learning_rate": 2.1129618687296737e-06, + "loss": 0.0844, + "step": 49414 + }, + { + "epoch": 0.8813719544822174, + "grad_norm": 0.2846696674823761, + "learning_rate": 2.1123356366821806e-06, + "loss": 0.1406, + "step": 49415 + }, + { + "epoch": 0.8813897906039311, + "grad_norm": 0.28623223304748535, + "learning_rate": 2.1117094933551796e-06, + "loss": 0.1724, + "step": 49416 + }, + { + "epoch": 0.8814076267256448, + "grad_norm": 0.2970600426197052, + "learning_rate": 2.1110834387510936e-06, + "loss": 0.1188, + "step": 49417 + }, + { + "epoch": 0.8814254628473585, + "grad_norm": 0.31855344772338867, + "learning_rate": 2.11045747287236e-06, + "loss": 0.1181, + "step": 49418 + }, + { + "epoch": 0.8814432989690721, + "grad_norm": 0.31087368726730347, + "learning_rate": 2.109831595721398e-06, + "loss": 0.121, + "step": 49419 + }, + { + "epoch": 0.8814611350907858, + "grad_norm": 0.3066253662109375, + "learning_rate": 2.109205807300635e-06, + "loss": 0.0951, + "step": 49420 + }, + { + "epoch": 0.8814789712124995, + "grad_norm": 0.23050422966480255, + "learning_rate": 2.108580107612493e-06, + "loss": 0.1112, + "step": 49421 + }, + { + "epoch": 0.8814968073342132, + "grad_norm": 0.22933296859264374, + "learning_rate": 2.1079544966594034e-06, + "loss": 0.0912, + "step": 49422 + }, + { + "epoch": 0.8815146434559269, + "grad_norm": 0.34030359983444214, + "learning_rate": 2.1073289744437843e-06, + "loss": 0.0791, + "step": 49423 + }, + { + "epoch": 0.8815324795776407, + "grad_norm": 0.3031795620918274, + "learning_rate": 2.106703540968069e-06, + "loss": 0.0987, + "step": 49424 + }, + { + "epoch": 0.8815503156993544, + "grad_norm": 0.25037652254104614, + "learning_rate": 2.1060781962346816e-06, + "loss": 0.0897, + "step": 49425 + }, + { + "epoch": 0.8815681518210681, + "grad_norm": 0.26562973856925964, + "learning_rate": 2.1054529402460334e-06, + "loss": 0.1082, + "step": 49426 + }, + { + "epoch": 0.8815859879427818, + "grad_norm": 0.30815649032592773, + "learning_rate": 2.104827773004564e-06, + "loss": 0.1078, + "step": 49427 + }, + { + "epoch": 0.8816038240644954, + "grad_norm": 0.3072338402271271, + "learning_rate": 2.104202694512691e-06, + "loss": 0.097, + "step": 49428 + }, + { + "epoch": 0.8816216601862091, + "grad_norm": 0.2984631061553955, + "learning_rate": 2.1035777047728355e-06, + "loss": 0.1413, + "step": 49429 + }, + { + "epoch": 0.8816394963079228, + "grad_norm": 0.4053855240345001, + "learning_rate": 2.102952803787414e-06, + "loss": 0.1769, + "step": 49430 + }, + { + "epoch": 0.8816573324296365, + "grad_norm": 0.25490662455558777, + "learning_rate": 2.102327991558864e-06, + "loss": 0.0976, + "step": 49431 + }, + { + "epoch": 0.8816751685513502, + "grad_norm": 0.2718149423599243, + "learning_rate": 2.101703268089597e-06, + "loss": 0.1327, + "step": 49432 + }, + { + "epoch": 0.8816930046730639, + "grad_norm": 0.31242114305496216, + "learning_rate": 2.1010786333820364e-06, + "loss": 0.0999, + "step": 49433 + }, + { + "epoch": 0.8817108407947776, + "grad_norm": 0.2962825894355774, + "learning_rate": 2.1004540874385996e-06, + "loss": 0.1432, + "step": 49434 + }, + { + "epoch": 0.8817286769164913, + "grad_norm": 0.2486339509487152, + "learning_rate": 2.0998296302617183e-06, + "loss": 0.103, + "step": 49435 + }, + { + "epoch": 0.881746513038205, + "grad_norm": 0.20273301005363464, + "learning_rate": 2.0992052618538068e-06, + "loss": 0.0826, + "step": 49436 + }, + { + "epoch": 0.8817643491599186, + "grad_norm": 0.3166324496269226, + "learning_rate": 2.0985809822172796e-06, + "loss": 0.1258, + "step": 49437 + }, + { + "epoch": 0.8817821852816323, + "grad_norm": 0.3244864046573639, + "learning_rate": 2.097956791354569e-06, + "loss": 0.1144, + "step": 49438 + }, + { + "epoch": 0.881800021403346, + "grad_norm": 0.3394557535648346, + "learning_rate": 2.0973326892680805e-06, + "loss": 0.1067, + "step": 49439 + }, + { + "epoch": 0.8818178575250597, + "grad_norm": 0.31407296657562256, + "learning_rate": 2.096708675960246e-06, + "loss": 0.1084, + "step": 49440 + }, + { + "epoch": 0.8818356936467735, + "grad_norm": 0.18862786889076233, + "learning_rate": 2.09608475143348e-06, + "loss": 0.08, + "step": 49441 + }, + { + "epoch": 0.8818535297684872, + "grad_norm": 0.22936619818210602, + "learning_rate": 2.0954609156902004e-06, + "loss": 0.0946, + "step": 49442 + }, + { + "epoch": 0.8818713658902009, + "grad_norm": 0.25299686193466187, + "learning_rate": 2.0948371687328215e-06, + "loss": 0.0993, + "step": 49443 + }, + { + "epoch": 0.8818892020119146, + "grad_norm": 0.3145373463630676, + "learning_rate": 2.0942135105637693e-06, + "loss": 0.0836, + "step": 49444 + }, + { + "epoch": 0.8819070381336283, + "grad_norm": 0.5313447713851929, + "learning_rate": 2.0935899411854557e-06, + "loss": 0.1366, + "step": 49445 + }, + { + "epoch": 0.8819248742553419, + "grad_norm": 0.2950810194015503, + "learning_rate": 2.0929664606002986e-06, + "loss": 0.0944, + "step": 49446 + }, + { + "epoch": 0.8819427103770556, + "grad_norm": 0.2714419662952423, + "learning_rate": 2.092343068810712e-06, + "loss": 0.0795, + "step": 49447 + }, + { + "epoch": 0.8819605464987693, + "grad_norm": 0.3711424171924591, + "learning_rate": 2.0917197658191197e-06, + "loss": 0.1234, + "step": 49448 + }, + { + "epoch": 0.881978382620483, + "grad_norm": 0.20156267285346985, + "learning_rate": 2.0910965516279357e-06, + "loss": 0.0736, + "step": 49449 + }, + { + "epoch": 0.8819962187421967, + "grad_norm": 0.3054533898830414, + "learning_rate": 2.090473426239567e-06, + "loss": 0.1526, + "step": 49450 + }, + { + "epoch": 0.8820140548639104, + "grad_norm": 0.2628709673881531, + "learning_rate": 2.089850389656442e-06, + "loss": 0.103, + "step": 49451 + }, + { + "epoch": 0.8820318909856241, + "grad_norm": 0.22848740220069885, + "learning_rate": 2.0892274418809644e-06, + "loss": 0.081, + "step": 49452 + }, + { + "epoch": 0.8820497271073378, + "grad_norm": 0.3265763819217682, + "learning_rate": 2.0886045829155597e-06, + "loss": 0.1007, + "step": 49453 + }, + { + "epoch": 0.8820675632290514, + "grad_norm": 0.20340044796466827, + "learning_rate": 2.087981812762635e-06, + "loss": 0.0781, + "step": 49454 + }, + { + "epoch": 0.8820853993507651, + "grad_norm": 0.25925788283348083, + "learning_rate": 2.0873591314246072e-06, + "loss": 0.1295, + "step": 49455 + }, + { + "epoch": 0.8821032354724788, + "grad_norm": 0.2397794872522354, + "learning_rate": 2.086736538903886e-06, + "loss": 0.1294, + "step": 49456 + }, + { + "epoch": 0.8821210715941925, + "grad_norm": 0.244323268532753, + "learning_rate": 2.0861140352028907e-06, + "loss": 0.1475, + "step": 49457 + }, + { + "epoch": 0.8821389077159063, + "grad_norm": 0.27704474329948425, + "learning_rate": 2.0854916203240344e-06, + "loss": 0.1179, + "step": 49458 + }, + { + "epoch": 0.88215674383762, + "grad_norm": 0.26258665323257446, + "learning_rate": 2.0848692942697257e-06, + "loss": 0.1227, + "step": 49459 + }, + { + "epoch": 0.8821745799593337, + "grad_norm": 0.2461012750864029, + "learning_rate": 2.084247057042371e-06, + "loss": 0.1225, + "step": 49460 + }, + { + "epoch": 0.8821924160810474, + "grad_norm": 0.30213266611099243, + "learning_rate": 2.0836249086443964e-06, + "loss": 0.1202, + "step": 49461 + }, + { + "epoch": 0.8822102522027611, + "grad_norm": 0.2420114129781723, + "learning_rate": 2.083002849078208e-06, + "loss": 0.1481, + "step": 49462 + }, + { + "epoch": 0.8822280883244747, + "grad_norm": 0.22456768155097961, + "learning_rate": 2.0823808783462155e-06, + "loss": 0.0568, + "step": 49463 + }, + { + "epoch": 0.8822459244461884, + "grad_norm": 0.42278358340263367, + "learning_rate": 2.0817589964508223e-06, + "loss": 0.14, + "step": 49464 + }, + { + "epoch": 0.8822637605679021, + "grad_norm": 0.2925378978252411, + "learning_rate": 2.0811372033944537e-06, + "loss": 0.1064, + "step": 49465 + }, + { + "epoch": 0.8822815966896158, + "grad_norm": 0.244202122092247, + "learning_rate": 2.080515499179511e-06, + "loss": 0.1005, + "step": 49466 + }, + { + "epoch": 0.8822994328113295, + "grad_norm": 0.359566330909729, + "learning_rate": 2.0798938838084092e-06, + "loss": 0.0872, + "step": 49467 + }, + { + "epoch": 0.8823172689330432, + "grad_norm": 0.22111792862415314, + "learning_rate": 2.079272357283554e-06, + "loss": 0.1242, + "step": 49468 + }, + { + "epoch": 0.8823351050547569, + "grad_norm": 0.26042407751083374, + "learning_rate": 2.07865091960735e-06, + "loss": 0.1077, + "step": 49469 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.2323639690876007, + "learning_rate": 2.0780295707822194e-06, + "loss": 0.0826, + "step": 49470 + }, + { + "epoch": 0.8823707772981843, + "grad_norm": 0.29817500710487366, + "learning_rate": 2.0774083108105606e-06, + "loss": 0.1213, + "step": 49471 + }, + { + "epoch": 0.8823886134198979, + "grad_norm": 0.20066680014133453, + "learning_rate": 2.0767871396947858e-06, + "loss": 0.1043, + "step": 49472 + }, + { + "epoch": 0.8824064495416116, + "grad_norm": 0.3479580283164978, + "learning_rate": 2.0761660574372983e-06, + "loss": 0.1079, + "step": 49473 + }, + { + "epoch": 0.8824242856633253, + "grad_norm": 0.3746626079082489, + "learning_rate": 2.07554506404051e-06, + "loss": 0.0962, + "step": 49474 + }, + { + "epoch": 0.8824421217850391, + "grad_norm": 0.26637962460517883, + "learning_rate": 2.07492415950683e-06, + "loss": 0.1045, + "step": 49475 + }, + { + "epoch": 0.8824599579067528, + "grad_norm": 0.3373264670372009, + "learning_rate": 2.07430334383866e-06, + "loss": 0.1441, + "step": 49476 + }, + { + "epoch": 0.8824777940284665, + "grad_norm": 0.2721731960773468, + "learning_rate": 2.0736826170384054e-06, + "loss": 0.1577, + "step": 49477 + }, + { + "epoch": 0.8824956301501802, + "grad_norm": 0.28891292214393616, + "learning_rate": 2.0730619791084786e-06, + "loss": 0.1527, + "step": 49478 + }, + { + "epoch": 0.8825134662718939, + "grad_norm": 0.2672925889492035, + "learning_rate": 2.072441430051278e-06, + "loss": 0.0882, + "step": 49479 + }, + { + "epoch": 0.8825313023936076, + "grad_norm": 0.24898603558540344, + "learning_rate": 2.0718209698692205e-06, + "loss": 0.0689, + "step": 49480 + }, + { + "epoch": 0.8825491385153212, + "grad_norm": 0.34353986382484436, + "learning_rate": 2.0712005985647016e-06, + "loss": 0.1379, + "step": 49481 + }, + { + "epoch": 0.8825669746370349, + "grad_norm": 0.34004947543144226, + "learning_rate": 2.070580316140122e-06, + "loss": 0.1058, + "step": 49482 + }, + { + "epoch": 0.8825848107587486, + "grad_norm": 0.218428373336792, + "learning_rate": 2.0699601225979e-06, + "loss": 0.0962, + "step": 49483 + }, + { + "epoch": 0.8826026468804623, + "grad_norm": 0.28842514753341675, + "learning_rate": 2.0693400179404328e-06, + "loss": 0.15, + "step": 49484 + }, + { + "epoch": 0.882620483002176, + "grad_norm": 0.26158249378204346, + "learning_rate": 2.0687200021701216e-06, + "loss": 0.1032, + "step": 49485 + }, + { + "epoch": 0.8826383191238897, + "grad_norm": 0.17603634297847748, + "learning_rate": 2.06810007528937e-06, + "loss": 0.0923, + "step": 49486 + }, + { + "epoch": 0.8826561552456034, + "grad_norm": 0.22819095849990845, + "learning_rate": 2.0674802373005844e-06, + "loss": 0.0979, + "step": 49487 + }, + { + "epoch": 0.8826739913673171, + "grad_norm": 0.24512246251106262, + "learning_rate": 2.066860488206168e-06, + "loss": 0.087, + "step": 49488 + }, + { + "epoch": 0.8826918274890307, + "grad_norm": 0.2636117935180664, + "learning_rate": 2.0662408280085225e-06, + "loss": 0.1072, + "step": 49489 + }, + { + "epoch": 0.8827096636107444, + "grad_norm": 0.22856329381465912, + "learning_rate": 2.065621256710046e-06, + "loss": 0.1242, + "step": 49490 + }, + { + "epoch": 0.8827274997324581, + "grad_norm": 0.2186232954263687, + "learning_rate": 2.0650017743131383e-06, + "loss": 0.0914, + "step": 49491 + }, + { + "epoch": 0.8827453358541719, + "grad_norm": 0.32454976439476013, + "learning_rate": 2.064382380820212e-06, + "loss": 0.1028, + "step": 49492 + }, + { + "epoch": 0.8827631719758856, + "grad_norm": 0.23620378971099854, + "learning_rate": 2.063763076233655e-06, + "loss": 0.1094, + "step": 49493 + }, + { + "epoch": 0.8827810080975993, + "grad_norm": 0.38535383343696594, + "learning_rate": 2.0631438605558803e-06, + "loss": 0.1626, + "step": 49494 + }, + { + "epoch": 0.882798844219313, + "grad_norm": 0.3389303684234619, + "learning_rate": 2.0625247337892755e-06, + "loss": 0.1582, + "step": 49495 + }, + { + "epoch": 0.8828166803410267, + "grad_norm": 0.31598159670829773, + "learning_rate": 2.061905695936253e-06, + "loss": 0.1216, + "step": 49496 + }, + { + "epoch": 0.8828345164627404, + "grad_norm": 0.35806748270988464, + "learning_rate": 2.0612867469992054e-06, + "loss": 0.1796, + "step": 49497 + }, + { + "epoch": 0.882852352584454, + "grad_norm": 0.3328573703765869, + "learning_rate": 2.0606678869805327e-06, + "loss": 0.1496, + "step": 49498 + }, + { + "epoch": 0.8828701887061677, + "grad_norm": 0.1679372787475586, + "learning_rate": 2.0600491158826314e-06, + "loss": 0.0739, + "step": 49499 + }, + { + "epoch": 0.8828880248278814, + "grad_norm": 0.24111592769622803, + "learning_rate": 2.059430433707907e-06, + "loss": 0.0926, + "step": 49500 + }, + { + "epoch": 0.8829058609495951, + "grad_norm": 0.31532177329063416, + "learning_rate": 2.0588118404587546e-06, + "loss": 0.116, + "step": 49501 + }, + { + "epoch": 0.8829236970713088, + "grad_norm": 0.25310298800468445, + "learning_rate": 2.0581933361375704e-06, + "loss": 0.1443, + "step": 49502 + }, + { + "epoch": 0.8829415331930225, + "grad_norm": 0.27139073610305786, + "learning_rate": 2.057574920746752e-06, + "loss": 0.0792, + "step": 49503 + }, + { + "epoch": 0.8829593693147362, + "grad_norm": 0.3119518756866455, + "learning_rate": 2.056956594288695e-06, + "loss": 0.1194, + "step": 49504 + }, + { + "epoch": 0.8829772054364499, + "grad_norm": 0.9270125031471252, + "learning_rate": 2.0563383567657994e-06, + "loss": 0.1143, + "step": 49505 + }, + { + "epoch": 0.8829950415581636, + "grad_norm": 0.44426754117012024, + "learning_rate": 2.0557202081804588e-06, + "loss": 0.1215, + "step": 49506 + }, + { + "epoch": 0.8830128776798772, + "grad_norm": 0.3076658546924591, + "learning_rate": 2.0551021485350764e-06, + "loss": 0.0882, + "step": 49507 + }, + { + "epoch": 0.8830307138015909, + "grad_norm": 0.2759837508201599, + "learning_rate": 2.0544841778320363e-06, + "loss": 0.0909, + "step": 49508 + }, + { + "epoch": 0.8830485499233047, + "grad_norm": 0.23088671267032623, + "learning_rate": 2.0538662960737474e-06, + "loss": 0.0986, + "step": 49509 + }, + { + "epoch": 0.8830663860450184, + "grad_norm": 0.2549494206905365, + "learning_rate": 2.0532485032625946e-06, + "loss": 0.0998, + "step": 49510 + }, + { + "epoch": 0.8830842221667321, + "grad_norm": 0.23078858852386475, + "learning_rate": 2.0526307994009787e-06, + "loss": 0.0885, + "step": 49511 + }, + { + "epoch": 0.8831020582884458, + "grad_norm": 0.26283302903175354, + "learning_rate": 2.052013184491289e-06, + "loss": 0.0693, + "step": 49512 + }, + { + "epoch": 0.8831198944101595, + "grad_norm": 0.27256253361701965, + "learning_rate": 2.0513956585359233e-06, + "loss": 0.0975, + "step": 49513 + }, + { + "epoch": 0.8831377305318732, + "grad_norm": 0.2075197845697403, + "learning_rate": 2.050778221537275e-06, + "loss": 0.1439, + "step": 49514 + }, + { + "epoch": 0.8831555666535869, + "grad_norm": 0.25969287753105164, + "learning_rate": 2.050160873497739e-06, + "loss": 0.0918, + "step": 49515 + }, + { + "epoch": 0.8831734027753005, + "grad_norm": 0.24521677196025848, + "learning_rate": 2.0495436144197023e-06, + "loss": 0.0839, + "step": 49516 + }, + { + "epoch": 0.8831912388970142, + "grad_norm": 0.2796004116535187, + "learning_rate": 2.04892644430556e-06, + "loss": 0.0819, + "step": 49517 + }, + { + "epoch": 0.8832090750187279, + "grad_norm": 0.22466719150543213, + "learning_rate": 2.048309363157708e-06, + "loss": 0.0863, + "step": 49518 + }, + { + "epoch": 0.8832269111404416, + "grad_norm": 0.27003031969070435, + "learning_rate": 2.0476923709785377e-06, + "loss": 0.1237, + "step": 49519 + }, + { + "epoch": 0.8832447472621553, + "grad_norm": 0.21677754819393158, + "learning_rate": 2.0470754677704344e-06, + "loss": 0.1244, + "step": 49520 + }, + { + "epoch": 0.883262583383869, + "grad_norm": 0.2606586217880249, + "learning_rate": 2.046458653535799e-06, + "loss": 0.1053, + "step": 49521 + }, + { + "epoch": 0.8832804195055827, + "grad_norm": 0.1929212063550949, + "learning_rate": 2.045841928277012e-06, + "loss": 0.0606, + "step": 49522 + }, + { + "epoch": 0.8832982556272964, + "grad_norm": 0.2580725848674774, + "learning_rate": 2.045225291996478e-06, + "loss": 0.1077, + "step": 49523 + }, + { + "epoch": 0.88331609174901, + "grad_norm": 0.23047815263271332, + "learning_rate": 2.0446087446965743e-06, + "loss": 0.1069, + "step": 49524 + }, + { + "epoch": 0.8833339278707238, + "grad_norm": 0.24394388496875763, + "learning_rate": 2.0439922863796952e-06, + "loss": 0.1337, + "step": 49525 + }, + { + "epoch": 0.8833517639924375, + "grad_norm": 0.2879149317741394, + "learning_rate": 2.043375917048232e-06, + "loss": 0.1193, + "step": 49526 + }, + { + "epoch": 0.8833696001141512, + "grad_norm": 0.333153635263443, + "learning_rate": 2.042759636704575e-06, + "loss": 0.1045, + "step": 49527 + }, + { + "epoch": 0.8833874362358649, + "grad_norm": 0.2467833161354065, + "learning_rate": 2.0421434453511077e-06, + "loss": 0.1296, + "step": 49528 + }, + { + "epoch": 0.8834052723575786, + "grad_norm": 0.3096359968185425, + "learning_rate": 2.0415273429902236e-06, + "loss": 0.1642, + "step": 49529 + }, + { + "epoch": 0.8834231084792923, + "grad_norm": 0.26842281222343445, + "learning_rate": 2.0409113296243066e-06, + "loss": 0.142, + "step": 49530 + }, + { + "epoch": 0.883440944601006, + "grad_norm": 0.19256141781806946, + "learning_rate": 2.040295405255749e-06, + "loss": 0.0953, + "step": 49531 + }, + { + "epoch": 0.8834587807227197, + "grad_norm": 0.26695820689201355, + "learning_rate": 2.0396795698869376e-06, + "loss": 0.1047, + "step": 49532 + }, + { + "epoch": 0.8834766168444333, + "grad_norm": 0.22599327564239502, + "learning_rate": 2.0390638235202542e-06, + "loss": 0.0652, + "step": 49533 + }, + { + "epoch": 0.883494452966147, + "grad_norm": 0.2299235314130783, + "learning_rate": 2.038448166158094e-06, + "loss": 0.0965, + "step": 49534 + }, + { + "epoch": 0.8835122890878607, + "grad_norm": 0.22735629975795746, + "learning_rate": 2.037832597802836e-06, + "loss": 0.086, + "step": 49535 + }, + { + "epoch": 0.8835301252095744, + "grad_norm": 0.29025423526763916, + "learning_rate": 2.0372171184568716e-06, + "loss": 0.0961, + "step": 49536 + }, + { + "epoch": 0.8835479613312881, + "grad_norm": 0.21940742433071136, + "learning_rate": 2.0366017281225865e-06, + "loss": 0.0505, + "step": 49537 + }, + { + "epoch": 0.8835657974530018, + "grad_norm": 0.31663379073143005, + "learning_rate": 2.035986426802358e-06, + "loss": 0.1026, + "step": 49538 + }, + { + "epoch": 0.8835836335747155, + "grad_norm": 0.28553298115730286, + "learning_rate": 2.035371214498585e-06, + "loss": 0.1204, + "step": 49539 + }, + { + "epoch": 0.8836014696964292, + "grad_norm": 0.3004331588745117, + "learning_rate": 2.0347560912136438e-06, + "loss": 0.0882, + "step": 49540 + }, + { + "epoch": 0.8836193058181429, + "grad_norm": 0.34469443559646606, + "learning_rate": 2.0341410569499203e-06, + "loss": 0.1538, + "step": 49541 + }, + { + "epoch": 0.8836371419398567, + "grad_norm": 0.2539229989051819, + "learning_rate": 2.0335261117097963e-06, + "loss": 0.1072, + "step": 49542 + }, + { + "epoch": 0.8836549780615703, + "grad_norm": 0.19817131757736206, + "learning_rate": 2.0329112554956535e-06, + "loss": 0.056, + "step": 49543 + }, + { + "epoch": 0.883672814183284, + "grad_norm": 0.260511189699173, + "learning_rate": 2.0322964883098844e-06, + "loss": 0.1199, + "step": 49544 + }, + { + "epoch": 0.8836906503049977, + "grad_norm": 0.2513865828514099, + "learning_rate": 2.0316818101548675e-06, + "loss": 0.0709, + "step": 49545 + }, + { + "epoch": 0.8837084864267114, + "grad_norm": 0.29783475399017334, + "learning_rate": 2.031067221032984e-06, + "loss": 0.1564, + "step": 49546 + }, + { + "epoch": 0.8837263225484251, + "grad_norm": 0.3489333987236023, + "learning_rate": 2.030452720946613e-06, + "loss": 0.0718, + "step": 49547 + }, + { + "epoch": 0.8837441586701388, + "grad_norm": 0.18202170729637146, + "learning_rate": 2.0298383098981467e-06, + "loss": 0.0651, + "step": 49548 + }, + { + "epoch": 0.8837619947918525, + "grad_norm": 0.28724801540374756, + "learning_rate": 2.0292239878899554e-06, + "loss": 0.0536, + "step": 49549 + }, + { + "epoch": 0.8837798309135662, + "grad_norm": 0.2434171587228775, + "learning_rate": 2.0286097549244293e-06, + "loss": 0.0821, + "step": 49550 + }, + { + "epoch": 0.8837976670352798, + "grad_norm": 0.2876301109790802, + "learning_rate": 2.027995611003941e-06, + "loss": 0.1101, + "step": 49551 + }, + { + "epoch": 0.8838155031569935, + "grad_norm": 0.2827867865562439, + "learning_rate": 2.027381556130881e-06, + "loss": 0.1221, + "step": 49552 + }, + { + "epoch": 0.8838333392787072, + "grad_norm": 0.3787000775337219, + "learning_rate": 2.026767590307624e-06, + "loss": 0.0967, + "step": 49553 + }, + { + "epoch": 0.8838511754004209, + "grad_norm": 0.26926618814468384, + "learning_rate": 2.026153713536552e-06, + "loss": 0.1284, + "step": 49554 + }, + { + "epoch": 0.8838690115221346, + "grad_norm": 0.40540242195129395, + "learning_rate": 2.025539925820041e-06, + "loss": 0.1276, + "step": 49555 + }, + { + "epoch": 0.8838868476438483, + "grad_norm": 0.2444566935300827, + "learning_rate": 2.0249262271604702e-06, + "loss": 0.1183, + "step": 49556 + }, + { + "epoch": 0.883904683765562, + "grad_norm": 0.26007384061813354, + "learning_rate": 2.0243126175602256e-06, + "loss": 0.1141, + "step": 49557 + }, + { + "epoch": 0.8839225198872757, + "grad_norm": 0.3018735349178314, + "learning_rate": 2.023699097021678e-06, + "loss": 0.1161, + "step": 49558 + }, + { + "epoch": 0.8839403560089895, + "grad_norm": 0.25285229086875916, + "learning_rate": 2.0230856655472114e-06, + "loss": 0.0698, + "step": 49559 + }, + { + "epoch": 0.8839581921307031, + "grad_norm": 0.3195061981678009, + "learning_rate": 2.0224723231391935e-06, + "loss": 0.1111, + "step": 49560 + }, + { + "epoch": 0.8839760282524168, + "grad_norm": 0.29395386576652527, + "learning_rate": 2.021859069800017e-06, + "loss": 0.1067, + "step": 49561 + }, + { + "epoch": 0.8839938643741305, + "grad_norm": 0.37777024507522583, + "learning_rate": 2.021245905532043e-06, + "loss": 0.1095, + "step": 49562 + }, + { + "epoch": 0.8840117004958442, + "grad_norm": 0.3774457573890686, + "learning_rate": 2.0206328303376625e-06, + "loss": 0.0861, + "step": 49563 + }, + { + "epoch": 0.8840295366175579, + "grad_norm": 0.3174244165420532, + "learning_rate": 2.0200198442192423e-06, + "loss": 0.1127, + "step": 49564 + }, + { + "epoch": 0.8840473727392716, + "grad_norm": 0.31690070033073425, + "learning_rate": 2.0194069471791644e-06, + "loss": 0.0829, + "step": 49565 + }, + { + "epoch": 0.8840652088609853, + "grad_norm": 0.31819257140159607, + "learning_rate": 2.018794139219804e-06, + "loss": 0.1355, + "step": 49566 + }, + { + "epoch": 0.884083044982699, + "grad_norm": 0.27593687176704407, + "learning_rate": 2.0181814203435345e-06, + "loss": 0.0952, + "step": 49567 + }, + { + "epoch": 0.8841008811044127, + "grad_norm": 0.2703091502189636, + "learning_rate": 2.017568790552732e-06, + "loss": 0.1269, + "step": 49568 + }, + { + "epoch": 0.8841187172261263, + "grad_norm": 0.2493617683649063, + "learning_rate": 2.0169562498497662e-06, + "loss": 0.0915, + "step": 49569 + }, + { + "epoch": 0.88413655334784, + "grad_norm": 0.24354255199432373, + "learning_rate": 2.016343798237022e-06, + "loss": 0.1144, + "step": 49570 + }, + { + "epoch": 0.8841543894695537, + "grad_norm": 0.26051583886146545, + "learning_rate": 2.0157314357168637e-06, + "loss": 0.0988, + "step": 49571 + }, + { + "epoch": 0.8841722255912674, + "grad_norm": 0.25750574469566345, + "learning_rate": 2.015119162291673e-06, + "loss": 0.1034, + "step": 49572 + }, + { + "epoch": 0.8841900617129811, + "grad_norm": 0.21313445270061493, + "learning_rate": 2.014506977963812e-06, + "loss": 0.0918, + "step": 49573 + }, + { + "epoch": 0.8842078978346948, + "grad_norm": 0.24788883328437805, + "learning_rate": 2.0138948827356673e-06, + "loss": 0.1134, + "step": 49574 + }, + { + "epoch": 0.8842257339564085, + "grad_norm": 0.24662795662879944, + "learning_rate": 2.013282876609604e-06, + "loss": 0.1408, + "step": 49575 + }, + { + "epoch": 0.8842435700781223, + "grad_norm": 0.2726486325263977, + "learning_rate": 2.0126709595879895e-06, + "loss": 0.137, + "step": 49576 + }, + { + "epoch": 0.884261406199836, + "grad_norm": 0.2955690622329712, + "learning_rate": 2.012059131673205e-06, + "loss": 0.1025, + "step": 49577 + }, + { + "epoch": 0.8842792423215496, + "grad_norm": 0.3030845820903778, + "learning_rate": 2.0114473928676212e-06, + "loss": 0.1053, + "step": 49578 + }, + { + "epoch": 0.8842970784432633, + "grad_norm": 0.31055718660354614, + "learning_rate": 2.0108357431736088e-06, + "loss": 0.1263, + "step": 49579 + }, + { + "epoch": 0.884314914564977, + "grad_norm": 0.30920547246932983, + "learning_rate": 2.010224182593537e-06, + "loss": 0.1461, + "step": 49580 + }, + { + "epoch": 0.8843327506866907, + "grad_norm": 0.24259616434574127, + "learning_rate": 2.0096127111297767e-06, + "loss": 0.1173, + "step": 49581 + }, + { + "epoch": 0.8843505868084044, + "grad_norm": 0.24070823192596436, + "learning_rate": 2.0090013287846933e-06, + "loss": 0.0754, + "step": 49582 + }, + { + "epoch": 0.8843684229301181, + "grad_norm": 0.5232437252998352, + "learning_rate": 2.0083900355606673e-06, + "loss": 0.1225, + "step": 49583 + }, + { + "epoch": 0.8843862590518318, + "grad_norm": 0.24039065837860107, + "learning_rate": 2.007778831460061e-06, + "loss": 0.0843, + "step": 49584 + }, + { + "epoch": 0.8844040951735455, + "grad_norm": 0.2232964187860489, + "learning_rate": 2.0071677164852447e-06, + "loss": 0.0677, + "step": 49585 + }, + { + "epoch": 0.8844219312952591, + "grad_norm": 0.29047688841819763, + "learning_rate": 2.0065566906385834e-06, + "loss": 0.1175, + "step": 49586 + }, + { + "epoch": 0.8844397674169728, + "grad_norm": 0.35257411003112793, + "learning_rate": 2.0059457539224557e-06, + "loss": 0.0974, + "step": 49587 + }, + { + "epoch": 0.8844576035386865, + "grad_norm": 0.24554544687271118, + "learning_rate": 2.0053349063392234e-06, + "loss": 0.084, + "step": 49588 + }, + { + "epoch": 0.8844754396604002, + "grad_norm": 0.2537404000759125, + "learning_rate": 2.0047241478912515e-06, + "loss": 0.1458, + "step": 49589 + }, + { + "epoch": 0.8844932757821139, + "grad_norm": 0.21217772364616394, + "learning_rate": 2.0041134785809155e-06, + "loss": 0.0471, + "step": 49590 + }, + { + "epoch": 0.8845111119038276, + "grad_norm": 0.33039751648902893, + "learning_rate": 2.0035028984105726e-06, + "loss": 0.1272, + "step": 49591 + }, + { + "epoch": 0.8845289480255413, + "grad_norm": 0.23602797091007233, + "learning_rate": 2.0028924073826006e-06, + "loss": 0.0948, + "step": 49592 + }, + { + "epoch": 0.8845467841472551, + "grad_norm": 0.2822903096675873, + "learning_rate": 2.002282005499362e-06, + "loss": 0.1908, + "step": 49593 + }, + { + "epoch": 0.8845646202689688, + "grad_norm": 0.5448985695838928, + "learning_rate": 2.0016716927632182e-06, + "loss": 0.1655, + "step": 49594 + }, + { + "epoch": 0.8845824563906824, + "grad_norm": 0.5512979626655579, + "learning_rate": 2.001061469176538e-06, + "loss": 0.1292, + "step": 49595 + }, + { + "epoch": 0.8846002925123961, + "grad_norm": 0.25886738300323486, + "learning_rate": 2.0004513347416875e-06, + "loss": 0.135, + "step": 49596 + }, + { + "epoch": 0.8846181286341098, + "grad_norm": 0.2702408730983734, + "learning_rate": 1.9998412894610325e-06, + "loss": 0.1394, + "step": 49597 + }, + { + "epoch": 0.8846359647558235, + "grad_norm": 0.3085061311721802, + "learning_rate": 1.9992313333369373e-06, + "loss": 0.0831, + "step": 49598 + }, + { + "epoch": 0.8846538008775372, + "grad_norm": 0.25254586338996887, + "learning_rate": 1.998621466371761e-06, + "loss": 0.0769, + "step": 49599 + }, + { + "epoch": 0.8846716369992509, + "grad_norm": 0.326857328414917, + "learning_rate": 1.9980116885678774e-06, + "loss": 0.0902, + "step": 49600 + }, + { + "epoch": 0.8846894731209646, + "grad_norm": 0.23103033006191254, + "learning_rate": 1.997401999927645e-06, + "loss": 0.1017, + "step": 49601 + }, + { + "epoch": 0.8847073092426783, + "grad_norm": 0.31632867455482483, + "learning_rate": 1.996792400453426e-06, + "loss": 0.1181, + "step": 49602 + }, + { + "epoch": 0.884725145364392, + "grad_norm": 0.23101124167442322, + "learning_rate": 1.996182890147583e-06, + "loss": 0.147, + "step": 49603 + }, + { + "epoch": 0.8847429814861056, + "grad_norm": 0.25761860609054565, + "learning_rate": 1.9955734690124766e-06, + "loss": 0.1129, + "step": 49604 + }, + { + "epoch": 0.8847608176078193, + "grad_norm": 0.3172912895679474, + "learning_rate": 1.9949641370504812e-06, + "loss": 0.1162, + "step": 49605 + }, + { + "epoch": 0.884778653729533, + "grad_norm": 0.3307587504386902, + "learning_rate": 1.9943548942639473e-06, + "loss": 0.1402, + "step": 49606 + }, + { + "epoch": 0.8847964898512467, + "grad_norm": 0.2567841410636902, + "learning_rate": 1.9937457406552422e-06, + "loss": 0.1215, + "step": 49607 + }, + { + "epoch": 0.8848143259729604, + "grad_norm": 0.3045521378517151, + "learning_rate": 1.9931366762267173e-06, + "loss": 0.1255, + "step": 49608 + }, + { + "epoch": 0.8848321620946741, + "grad_norm": 0.41305452585220337, + "learning_rate": 1.992527700980748e-06, + "loss": 0.1301, + "step": 49609 + }, + { + "epoch": 0.8848499982163879, + "grad_norm": 0.3504682183265686, + "learning_rate": 1.9919188149196883e-06, + "loss": 0.1215, + "step": 49610 + }, + { + "epoch": 0.8848678343381016, + "grad_norm": 0.2765824794769287, + "learning_rate": 1.9913100180458944e-06, + "loss": 0.0972, + "step": 49611 + }, + { + "epoch": 0.8848856704598153, + "grad_norm": 0.2925173044204712, + "learning_rate": 1.9907013103617285e-06, + "loss": 0.1484, + "step": 49612 + }, + { + "epoch": 0.8849035065815289, + "grad_norm": 0.3033662736415863, + "learning_rate": 1.990092691869555e-06, + "loss": 0.0994, + "step": 49613 + }, + { + "epoch": 0.8849213427032426, + "grad_norm": 0.2038131058216095, + "learning_rate": 1.989484162571731e-06, + "loss": 0.0651, + "step": 49614 + }, + { + "epoch": 0.8849391788249563, + "grad_norm": 0.2634255290031433, + "learning_rate": 1.9888757224706123e-06, + "loss": 0.0717, + "step": 49615 + }, + { + "epoch": 0.88495701494667, + "grad_norm": 0.26705339550971985, + "learning_rate": 1.9882673715685556e-06, + "loss": 0.1119, + "step": 49616 + }, + { + "epoch": 0.8849748510683837, + "grad_norm": 0.2884557247161865, + "learning_rate": 1.987659109867926e-06, + "loss": 0.1366, + "step": 49617 + }, + { + "epoch": 0.8849926871900974, + "grad_norm": 0.2735777795314789, + "learning_rate": 1.987050937371074e-06, + "loss": 0.1289, + "step": 49618 + }, + { + "epoch": 0.8850105233118111, + "grad_norm": 0.3681984543800354, + "learning_rate": 1.9864428540803643e-06, + "loss": 0.1294, + "step": 49619 + }, + { + "epoch": 0.8850283594335248, + "grad_norm": 0.3934820890426636, + "learning_rate": 1.985834859998151e-06, + "loss": 0.0922, + "step": 49620 + }, + { + "epoch": 0.8850461955552384, + "grad_norm": 0.211423859000206, + "learning_rate": 1.985226955126787e-06, + "loss": 0.1487, + "step": 49621 + }, + { + "epoch": 0.8850640316769521, + "grad_norm": 0.25320059061050415, + "learning_rate": 1.984619139468638e-06, + "loss": 0.1318, + "step": 49622 + }, + { + "epoch": 0.8850818677986658, + "grad_norm": 0.29037126898765564, + "learning_rate": 1.984011413026052e-06, + "loss": 0.0775, + "step": 49623 + }, + { + "epoch": 0.8850997039203795, + "grad_norm": 0.2292759120464325, + "learning_rate": 1.9834037758013903e-06, + "loss": 0.0659, + "step": 49624 + }, + { + "epoch": 0.8851175400420932, + "grad_norm": 0.28474709391593933, + "learning_rate": 1.9827962277969986e-06, + "loss": 0.1239, + "step": 49625 + }, + { + "epoch": 0.885135376163807, + "grad_norm": 0.25914838910102844, + "learning_rate": 1.982188769015242e-06, + "loss": 0.079, + "step": 49626 + }, + { + "epoch": 0.8851532122855207, + "grad_norm": 0.24624231457710266, + "learning_rate": 1.9815813994584737e-06, + "loss": 0.0898, + "step": 49627 + }, + { + "epoch": 0.8851710484072344, + "grad_norm": 0.321963906288147, + "learning_rate": 1.980974119129045e-06, + "loss": 0.1025, + "step": 49628 + }, + { + "epoch": 0.8851888845289481, + "grad_norm": 0.19781802594661713, + "learning_rate": 1.980366928029309e-06, + "loss": 0.1001, + "step": 49629 + }, + { + "epoch": 0.8852067206506617, + "grad_norm": 0.2166374772787094, + "learning_rate": 1.979759826161623e-06, + "loss": 0.0802, + "step": 49630 + }, + { + "epoch": 0.8852245567723754, + "grad_norm": 0.37161746621131897, + "learning_rate": 1.979152813528337e-06, + "loss": 0.1814, + "step": 49631 + }, + { + "epoch": 0.8852423928940891, + "grad_norm": 0.25688329339027405, + "learning_rate": 1.978545890131808e-06, + "loss": 0.1009, + "step": 49632 + }, + { + "epoch": 0.8852602290158028, + "grad_norm": 0.3030528128147125, + "learning_rate": 1.977939055974387e-06, + "loss": 0.1254, + "step": 49633 + }, + { + "epoch": 0.8852780651375165, + "grad_norm": 0.22229984402656555, + "learning_rate": 1.9773323110584217e-06, + "loss": 0.1024, + "step": 49634 + }, + { + "epoch": 0.8852959012592302, + "grad_norm": 0.22781424224376678, + "learning_rate": 1.9767256553862746e-06, + "loss": 0.0853, + "step": 49635 + }, + { + "epoch": 0.8853137373809439, + "grad_norm": 0.3471640944480896, + "learning_rate": 1.976119088960288e-06, + "loss": 0.1532, + "step": 49636 + }, + { + "epoch": 0.8853315735026576, + "grad_norm": 0.28439220786094666, + "learning_rate": 1.9755126117828178e-06, + "loss": 0.0758, + "step": 49637 + }, + { + "epoch": 0.8853494096243713, + "grad_norm": 0.3464740812778473, + "learning_rate": 1.9749062238562073e-06, + "loss": 0.1156, + "step": 49638 + }, + { + "epoch": 0.8853672457460849, + "grad_norm": 0.320024698972702, + "learning_rate": 1.974299925182818e-06, + "loss": 0.0901, + "step": 49639 + }, + { + "epoch": 0.8853850818677986, + "grad_norm": 0.265421062707901, + "learning_rate": 1.973693715764996e-06, + "loss": 0.136, + "step": 49640 + }, + { + "epoch": 0.8854029179895123, + "grad_norm": 0.2583046555519104, + "learning_rate": 1.973087595605089e-06, + "loss": 0.0745, + "step": 49641 + }, + { + "epoch": 0.885420754111226, + "grad_norm": 0.3214467465877533, + "learning_rate": 1.972481564705442e-06, + "loss": 0.0806, + "step": 49642 + }, + { + "epoch": 0.8854385902329398, + "grad_norm": 0.19599206745624542, + "learning_rate": 1.9718756230684173e-06, + "loss": 0.0936, + "step": 49643 + }, + { + "epoch": 0.8854564263546535, + "grad_norm": 0.28757527470588684, + "learning_rate": 1.9712697706963546e-06, + "loss": 0.1146, + "step": 49644 + }, + { + "epoch": 0.8854742624763672, + "grad_norm": 0.36969393491744995, + "learning_rate": 1.9706640075915996e-06, + "loss": 0.1379, + "step": 49645 + }, + { + "epoch": 0.8854920985980809, + "grad_norm": 0.2446933537721634, + "learning_rate": 1.9700583337565084e-06, + "loss": 0.0949, + "step": 49646 + }, + { + "epoch": 0.8855099347197946, + "grad_norm": 0.28313106298446655, + "learning_rate": 1.9694527491934233e-06, + "loss": 0.1198, + "step": 49647 + }, + { + "epoch": 0.8855277708415082, + "grad_norm": 0.2282789945602417, + "learning_rate": 1.968847253904696e-06, + "loss": 0.087, + "step": 49648 + }, + { + "epoch": 0.8855456069632219, + "grad_norm": 0.1931321620941162, + "learning_rate": 1.968241847892674e-06, + "loss": 0.0743, + "step": 49649 + }, + { + "epoch": 0.8855634430849356, + "grad_norm": 0.34312137961387634, + "learning_rate": 1.9676365311597e-06, + "loss": 0.1206, + "step": 49650 + }, + { + "epoch": 0.8855812792066493, + "grad_norm": 0.2022121548652649, + "learning_rate": 1.9670313037081167e-06, + "loss": 0.0591, + "step": 49651 + }, + { + "epoch": 0.885599115328363, + "grad_norm": 0.26006537675857544, + "learning_rate": 1.9664261655402807e-06, + "loss": 0.0965, + "step": 49652 + }, + { + "epoch": 0.8856169514500767, + "grad_norm": 0.318836510181427, + "learning_rate": 1.965821116658534e-06, + "loss": 0.1165, + "step": 49653 + }, + { + "epoch": 0.8856347875717904, + "grad_norm": 0.3420856297016144, + "learning_rate": 1.9652161570652168e-06, + "loss": 0.1039, + "step": 49654 + }, + { + "epoch": 0.8856526236935041, + "grad_norm": 0.2644520699977875, + "learning_rate": 1.9646112867626772e-06, + "loss": 0.1108, + "step": 49655 + }, + { + "epoch": 0.8856704598152177, + "grad_norm": 0.2115037888288498, + "learning_rate": 1.964006505753263e-06, + "loss": 0.0865, + "step": 49656 + }, + { + "epoch": 0.8856882959369314, + "grad_norm": 0.2404254823923111, + "learning_rate": 1.9634018140393148e-06, + "loss": 0.1002, + "step": 49657 + }, + { + "epoch": 0.8857061320586451, + "grad_norm": 0.3011566996574402, + "learning_rate": 1.9627972116231797e-06, + "loss": 0.0731, + "step": 49658 + }, + { + "epoch": 0.8857239681803588, + "grad_norm": 0.35265228152275085, + "learning_rate": 1.9621926985071953e-06, + "loss": 0.0777, + "step": 49659 + }, + { + "epoch": 0.8857418043020726, + "grad_norm": 0.29091161489486694, + "learning_rate": 1.961588274693707e-06, + "loss": 0.2177, + "step": 49660 + }, + { + "epoch": 0.8857596404237863, + "grad_norm": 0.22855977714061737, + "learning_rate": 1.9609839401850654e-06, + "loss": 0.1001, + "step": 49661 + }, + { + "epoch": 0.8857774765455, + "grad_norm": 0.2923247516155243, + "learning_rate": 1.960379694983608e-06, + "loss": 0.1337, + "step": 49662 + }, + { + "epoch": 0.8857953126672137, + "grad_norm": 0.2918730080127716, + "learning_rate": 1.9597755390916737e-06, + "loss": 0.1189, + "step": 49663 + }, + { + "epoch": 0.8858131487889274, + "grad_norm": 0.27331048250198364, + "learning_rate": 1.9591714725116063e-06, + "loss": 0.1208, + "step": 49664 + }, + { + "epoch": 0.885830984910641, + "grad_norm": 0.2466292381286621, + "learning_rate": 1.95856749524575e-06, + "loss": 0.121, + "step": 49665 + }, + { + "epoch": 0.8858488210323547, + "grad_norm": 0.26272016763687134, + "learning_rate": 1.9579636072964454e-06, + "loss": 0.118, + "step": 49666 + }, + { + "epoch": 0.8858666571540684, + "grad_norm": 0.2634202241897583, + "learning_rate": 1.9573598086660343e-06, + "loss": 0.142, + "step": 49667 + }, + { + "epoch": 0.8858844932757821, + "grad_norm": 0.24630270898342133, + "learning_rate": 1.956756099356849e-06, + "loss": 0.0998, + "step": 49668 + }, + { + "epoch": 0.8859023293974958, + "grad_norm": 0.2468818575143814, + "learning_rate": 1.95615247937124e-06, + "loss": 0.107, + "step": 49669 + }, + { + "epoch": 0.8859201655192095, + "grad_norm": 0.2564699053764343, + "learning_rate": 1.955548948711544e-06, + "loss": 0.0979, + "step": 49670 + }, + { + "epoch": 0.8859380016409232, + "grad_norm": 0.32016000151634216, + "learning_rate": 1.9549455073800985e-06, + "loss": 0.185, + "step": 49671 + }, + { + "epoch": 0.8859558377626369, + "grad_norm": 0.3354667127132416, + "learning_rate": 1.9543421553792406e-06, + "loss": 0.124, + "step": 49672 + }, + { + "epoch": 0.8859736738843506, + "grad_norm": 0.27343663573265076, + "learning_rate": 1.9537388927113155e-06, + "loss": 0.1356, + "step": 49673 + }, + { + "epoch": 0.8859915100060642, + "grad_norm": 0.2568040192127228, + "learning_rate": 1.9531357193786543e-06, + "loss": 0.0816, + "step": 49674 + }, + { + "epoch": 0.8860093461277779, + "grad_norm": 0.2814258635044098, + "learning_rate": 1.952532635383603e-06, + "loss": 0.1203, + "step": 49675 + }, + { + "epoch": 0.8860271822494916, + "grad_norm": 0.33536139130592346, + "learning_rate": 1.9519296407284977e-06, + "loss": 0.1241, + "step": 49676 + }, + { + "epoch": 0.8860450183712054, + "grad_norm": 0.27339211106300354, + "learning_rate": 1.951326735415668e-06, + "loss": 0.1398, + "step": 49677 + }, + { + "epoch": 0.8860628544929191, + "grad_norm": 0.25292903184890747, + "learning_rate": 1.9507239194474586e-06, + "loss": 0.1206, + "step": 49678 + }, + { + "epoch": 0.8860806906146328, + "grad_norm": 0.23713791370391846, + "learning_rate": 1.950121192826207e-06, + "loss": 0.1253, + "step": 49679 + }, + { + "epoch": 0.8860985267363465, + "grad_norm": 0.2483091503381729, + "learning_rate": 1.9495185555542473e-06, + "loss": 0.1199, + "step": 49680 + }, + { + "epoch": 0.8861163628580602, + "grad_norm": 0.28200775384902954, + "learning_rate": 1.948916007633911e-06, + "loss": 0.1303, + "step": 49681 + }, + { + "epoch": 0.8861341989797739, + "grad_norm": 0.2500542104244232, + "learning_rate": 1.9483135490675403e-06, + "loss": 0.0846, + "step": 49682 + }, + { + "epoch": 0.8861520351014875, + "grad_norm": 0.28934359550476074, + "learning_rate": 1.947711179857467e-06, + "loss": 0.1507, + "step": 49683 + }, + { + "epoch": 0.8861698712232012, + "grad_norm": 0.2448359727859497, + "learning_rate": 1.9471089000060284e-06, + "loss": 0.0759, + "step": 49684 + }, + { + "epoch": 0.8861877073449149, + "grad_norm": 0.23450268805027008, + "learning_rate": 1.946506709515561e-06, + "loss": 0.1099, + "step": 49685 + }, + { + "epoch": 0.8862055434666286, + "grad_norm": 0.28258439898490906, + "learning_rate": 1.945904608388388e-06, + "loss": 0.1198, + "step": 49686 + }, + { + "epoch": 0.8862233795883423, + "grad_norm": 0.2995196580886841, + "learning_rate": 1.945302596626852e-06, + "loss": 0.0979, + "step": 49687 + }, + { + "epoch": 0.886241215710056, + "grad_norm": 0.3189823627471924, + "learning_rate": 1.9447006742332906e-06, + "loss": 0.1327, + "step": 49688 + }, + { + "epoch": 0.8862590518317697, + "grad_norm": 0.279433012008667, + "learning_rate": 1.944098841210032e-06, + "loss": 0.1084, + "step": 49689 + }, + { + "epoch": 0.8862768879534834, + "grad_norm": 0.24563564360141754, + "learning_rate": 1.9434970975594073e-06, + "loss": 0.1216, + "step": 49690 + }, + { + "epoch": 0.886294724075197, + "grad_norm": 0.22916372120380402, + "learning_rate": 1.942895443283754e-06, + "loss": 0.1335, + "step": 49691 + }, + { + "epoch": 0.8863125601969107, + "grad_norm": 0.27623093128204346, + "learning_rate": 1.9422938783854013e-06, + "loss": 0.1254, + "step": 49692 + }, + { + "epoch": 0.8863303963186244, + "grad_norm": 0.26458650827407837, + "learning_rate": 1.941692402866682e-06, + "loss": 0.1103, + "step": 49693 + }, + { + "epoch": 0.8863482324403382, + "grad_norm": 0.2947564125061035, + "learning_rate": 1.9410910167299234e-06, + "loss": 0.127, + "step": 49694 + }, + { + "epoch": 0.8863660685620519, + "grad_norm": 0.27672770619392395, + "learning_rate": 1.940489719977462e-06, + "loss": 0.0956, + "step": 49695 + }, + { + "epoch": 0.8863839046837656, + "grad_norm": 0.29894351959228516, + "learning_rate": 1.939888512611629e-06, + "loss": 0.129, + "step": 49696 + }, + { + "epoch": 0.8864017408054793, + "grad_norm": 0.33805254101753235, + "learning_rate": 1.9392873946347534e-06, + "loss": 0.1153, + "step": 49697 + }, + { + "epoch": 0.886419576927193, + "grad_norm": 0.1979510486125946, + "learning_rate": 1.938686366049164e-06, + "loss": 0.0568, + "step": 49698 + }, + { + "epoch": 0.8864374130489067, + "grad_norm": 0.32126984000205994, + "learning_rate": 1.9380854268571863e-06, + "loss": 0.0763, + "step": 49699 + }, + { + "epoch": 0.8864552491706204, + "grad_norm": 0.262399286031723, + "learning_rate": 1.9374845770611604e-06, + "loss": 0.0993, + "step": 49700 + }, + { + "epoch": 0.886473085292334, + "grad_norm": 0.34093043208122253, + "learning_rate": 1.936883816663404e-06, + "loss": 0.0907, + "step": 49701 + }, + { + "epoch": 0.8864909214140477, + "grad_norm": 0.2571010887622833, + "learning_rate": 1.936283145666257e-06, + "loss": 0.1206, + "step": 49702 + }, + { + "epoch": 0.8865087575357614, + "grad_norm": 0.24466572701931, + "learning_rate": 1.9356825640720387e-06, + "loss": 0.1031, + "step": 49703 + }, + { + "epoch": 0.8865265936574751, + "grad_norm": 0.28962230682373047, + "learning_rate": 1.9350820718830846e-06, + "loss": 0.174, + "step": 49704 + }, + { + "epoch": 0.8865444297791888, + "grad_norm": 0.25840824842453003, + "learning_rate": 1.9344816691017174e-06, + "loss": 0.1122, + "step": 49705 + }, + { + "epoch": 0.8865622659009025, + "grad_norm": 0.46787112951278687, + "learning_rate": 1.9338813557302687e-06, + "loss": 0.1541, + "step": 49706 + }, + { + "epoch": 0.8865801020226162, + "grad_norm": 0.2818623483181, + "learning_rate": 1.933281131771056e-06, + "loss": 0.0582, + "step": 49707 + }, + { + "epoch": 0.8865979381443299, + "grad_norm": 0.20159101486206055, + "learning_rate": 1.932680997226419e-06, + "loss": 0.0972, + "step": 49708 + }, + { + "epoch": 0.8866157742660435, + "grad_norm": 0.23572859168052673, + "learning_rate": 1.9320809520986747e-06, + "loss": 0.0931, + "step": 49709 + }, + { + "epoch": 0.8866336103877572, + "grad_norm": 0.2375248372554779, + "learning_rate": 1.9314809963901533e-06, + "loss": 0.1032, + "step": 49710 + }, + { + "epoch": 0.886651446509471, + "grad_norm": 0.35562703013420105, + "learning_rate": 1.930881130103182e-06, + "loss": 0.1757, + "step": 49711 + }, + { + "epoch": 0.8866692826311847, + "grad_norm": 0.24101747572422028, + "learning_rate": 1.9302813532400766e-06, + "loss": 0.0856, + "step": 49712 + }, + { + "epoch": 0.8866871187528984, + "grad_norm": 0.24090975522994995, + "learning_rate": 1.929681665803171e-06, + "loss": 0.0547, + "step": 49713 + }, + { + "epoch": 0.8867049548746121, + "grad_norm": 0.43578094244003296, + "learning_rate": 1.929082067794785e-06, + "loss": 0.1828, + "step": 49714 + }, + { + "epoch": 0.8867227909963258, + "grad_norm": 0.26033157110214233, + "learning_rate": 1.928482559217251e-06, + "loss": 0.0763, + "step": 49715 + }, + { + "epoch": 0.8867406271180395, + "grad_norm": 0.2312462329864502, + "learning_rate": 1.927883140072881e-06, + "loss": 0.1303, + "step": 49716 + }, + { + "epoch": 0.8867584632397532, + "grad_norm": 0.2216435670852661, + "learning_rate": 1.9272838103640112e-06, + "loss": 0.1238, + "step": 49717 + }, + { + "epoch": 0.8867762993614668, + "grad_norm": 0.3196179270744324, + "learning_rate": 1.926684570092957e-06, + "loss": 0.1292, + "step": 49718 + }, + { + "epoch": 0.8867941354831805, + "grad_norm": 0.28491735458374023, + "learning_rate": 1.9260854192620413e-06, + "loss": 0.1087, + "step": 49719 + }, + { + "epoch": 0.8868119716048942, + "grad_norm": 0.31764113903045654, + "learning_rate": 1.925486357873585e-06, + "loss": 0.1288, + "step": 49720 + }, + { + "epoch": 0.8868298077266079, + "grad_norm": 0.32463398575782776, + "learning_rate": 1.924887385929919e-06, + "loss": 0.1125, + "step": 49721 + }, + { + "epoch": 0.8868476438483216, + "grad_norm": 0.20111265778541565, + "learning_rate": 1.924288503433358e-06, + "loss": 0.0487, + "step": 49722 + }, + { + "epoch": 0.8868654799700353, + "grad_norm": 0.3146669268608093, + "learning_rate": 1.9236897103862255e-06, + "loss": 0.0787, + "step": 49723 + }, + { + "epoch": 0.886883316091749, + "grad_norm": 0.2350408136844635, + "learning_rate": 1.923091006790839e-06, + "loss": 0.1128, + "step": 49724 + }, + { + "epoch": 0.8869011522134627, + "grad_norm": 0.2963438630104065, + "learning_rate": 1.922492392649522e-06, + "loss": 0.1082, + "step": 49725 + }, + { + "epoch": 0.8869189883351764, + "grad_norm": 0.28236159682273865, + "learning_rate": 1.921893867964597e-06, + "loss": 0.1624, + "step": 49726 + }, + { + "epoch": 0.88693682445689, + "grad_norm": 0.2680119276046753, + "learning_rate": 1.9212954327383845e-06, + "loss": 0.0966, + "step": 49727 + }, + { + "epoch": 0.8869546605786038, + "grad_norm": 0.32395491003990173, + "learning_rate": 1.9206970869731946e-06, + "loss": 0.163, + "step": 49728 + }, + { + "epoch": 0.8869724967003175, + "grad_norm": 0.32477495074272156, + "learning_rate": 1.9200988306713603e-06, + "loss": 0.0899, + "step": 49729 + }, + { + "epoch": 0.8869903328220312, + "grad_norm": 0.31545814871788025, + "learning_rate": 1.9195006638351914e-06, + "loss": 0.129, + "step": 49730 + }, + { + "epoch": 0.8870081689437449, + "grad_norm": 0.2848271429538727, + "learning_rate": 1.9189025864670114e-06, + "loss": 0.1387, + "step": 49731 + }, + { + "epoch": 0.8870260050654586, + "grad_norm": 0.2928328514099121, + "learning_rate": 1.91830459856914e-06, + "loss": 0.137, + "step": 49732 + }, + { + "epoch": 0.8870438411871723, + "grad_norm": 0.3611222207546234, + "learning_rate": 1.9177067001438842e-06, + "loss": 0.1216, + "step": 49733 + }, + { + "epoch": 0.887061677308886, + "grad_norm": 0.22243250906467438, + "learning_rate": 1.9171088911935754e-06, + "loss": 0.1185, + "step": 49734 + }, + { + "epoch": 0.8870795134305997, + "grad_norm": 0.28262144327163696, + "learning_rate": 1.9165111717205254e-06, + "loss": 0.0904, + "step": 49735 + }, + { + "epoch": 0.8870973495523133, + "grad_norm": 0.3102903664112091, + "learning_rate": 1.915913541727052e-06, + "loss": 0.1132, + "step": 49736 + }, + { + "epoch": 0.887115185674027, + "grad_norm": 0.28757914900779724, + "learning_rate": 1.9153160012154695e-06, + "loss": 0.1222, + "step": 49737 + }, + { + "epoch": 0.8871330217957407, + "grad_norm": 0.26365897059440613, + "learning_rate": 1.914718550188091e-06, + "loss": 0.1, + "step": 49738 + }, + { + "epoch": 0.8871508579174544, + "grad_norm": 0.26044753193855286, + "learning_rate": 1.9141211886472414e-06, + "loss": 0.0804, + "step": 49739 + }, + { + "epoch": 0.8871686940391681, + "grad_norm": 0.3047908544540405, + "learning_rate": 1.9135239165952307e-06, + "loss": 0.1395, + "step": 49740 + }, + { + "epoch": 0.8871865301608818, + "grad_norm": 0.19373618066310883, + "learning_rate": 1.912926734034373e-06, + "loss": 0.0867, + "step": 49741 + }, + { + "epoch": 0.8872043662825955, + "grad_norm": 0.27001887559890747, + "learning_rate": 1.9123296409669896e-06, + "loss": 0.0945, + "step": 49742 + }, + { + "epoch": 0.8872222024043092, + "grad_norm": 0.2431207150220871, + "learning_rate": 1.911732637395383e-06, + "loss": 0.112, + "step": 49743 + }, + { + "epoch": 0.887240038526023, + "grad_norm": 0.24677345156669617, + "learning_rate": 1.911135723321883e-06, + "loss": 0.0821, + "step": 49744 + }, + { + "epoch": 0.8872578746477366, + "grad_norm": 0.19661147892475128, + "learning_rate": 1.910538898748793e-06, + "loss": 0.0723, + "step": 49745 + }, + { + "epoch": 0.8872757107694503, + "grad_norm": 0.32297438383102417, + "learning_rate": 1.909942163678427e-06, + "loss": 0.1247, + "step": 49746 + }, + { + "epoch": 0.887293546891164, + "grad_norm": 0.24456030130386353, + "learning_rate": 1.9093455181131037e-06, + "loss": 0.0957, + "step": 49747 + }, + { + "epoch": 0.8873113830128777, + "grad_norm": 0.23744532465934753, + "learning_rate": 1.9087489620551317e-06, + "loss": 0.0984, + "step": 49748 + }, + { + "epoch": 0.8873292191345914, + "grad_norm": 0.3174133598804474, + "learning_rate": 1.908152495506824e-06, + "loss": 0.0752, + "step": 49749 + }, + { + "epoch": 0.8873470552563051, + "grad_norm": 0.2902401387691498, + "learning_rate": 1.9075561184704938e-06, + "loss": 0.0904, + "step": 49750 + }, + { + "epoch": 0.8873648913780188, + "grad_norm": 0.24518780410289764, + "learning_rate": 1.906959830948446e-06, + "loss": 0.061, + "step": 49751 + }, + { + "epoch": 0.8873827274997325, + "grad_norm": 0.33072373270988464, + "learning_rate": 1.9063636329430036e-06, + "loss": 0.1131, + "step": 49752 + }, + { + "epoch": 0.8874005636214461, + "grad_norm": 0.2066785991191864, + "learning_rate": 1.9057675244564726e-06, + "loss": 0.0564, + "step": 49753 + }, + { + "epoch": 0.8874183997431598, + "grad_norm": 0.2651154398918152, + "learning_rate": 1.905171505491163e-06, + "loss": 0.122, + "step": 49754 + }, + { + "epoch": 0.8874362358648735, + "grad_norm": 0.29715126752853394, + "learning_rate": 1.9045755760493806e-06, + "loss": 0.1041, + "step": 49755 + }, + { + "epoch": 0.8874540719865872, + "grad_norm": 0.2227640002965927, + "learning_rate": 1.903979736133446e-06, + "loss": 0.0977, + "step": 49756 + }, + { + "epoch": 0.8874719081083009, + "grad_norm": 0.22558589279651642, + "learning_rate": 1.9033839857456576e-06, + "loss": 0.0361, + "step": 49757 + }, + { + "epoch": 0.8874897442300146, + "grad_norm": 0.23243005573749542, + "learning_rate": 1.9027883248883326e-06, + "loss": 0.1216, + "step": 49758 + }, + { + "epoch": 0.8875075803517283, + "grad_norm": 0.2683306336402893, + "learning_rate": 1.9021927535637752e-06, + "loss": 0.1068, + "step": 49759 + }, + { + "epoch": 0.887525416473442, + "grad_norm": 0.243260458111763, + "learning_rate": 1.9015972717742998e-06, + "loss": 0.0803, + "step": 49760 + }, + { + "epoch": 0.8875432525951558, + "grad_norm": 0.1841120570898056, + "learning_rate": 1.9010018795222128e-06, + "loss": 0.0897, + "step": 49761 + }, + { + "epoch": 0.8875610887168695, + "grad_norm": 0.2340327501296997, + "learning_rate": 1.9004065768098183e-06, + "loss": 0.1119, + "step": 49762 + }, + { + "epoch": 0.8875789248385831, + "grad_norm": 0.2835519313812256, + "learning_rate": 1.8998113636394282e-06, + "loss": 0.1577, + "step": 49763 + }, + { + "epoch": 0.8875967609602968, + "grad_norm": 0.23356936872005463, + "learning_rate": 1.8992162400133435e-06, + "loss": 0.1539, + "step": 49764 + }, + { + "epoch": 0.8876145970820105, + "grad_norm": 0.2503810524940491, + "learning_rate": 1.8986212059338787e-06, + "loss": 0.1276, + "step": 49765 + }, + { + "epoch": 0.8876324332037242, + "grad_norm": 0.2631121873855591, + "learning_rate": 1.8980262614033378e-06, + "loss": 0.1181, + "step": 49766 + }, + { + "epoch": 0.8876502693254379, + "grad_norm": 0.27552226185798645, + "learning_rate": 1.8974314064240273e-06, + "loss": 0.1611, + "step": 49767 + }, + { + "epoch": 0.8876681054471516, + "grad_norm": 0.2293064147233963, + "learning_rate": 1.8968366409982452e-06, + "loss": 0.1058, + "step": 49768 + }, + { + "epoch": 0.8876859415688653, + "grad_norm": 0.3883008658885956, + "learning_rate": 1.8962419651283092e-06, + "loss": 0.1064, + "step": 49769 + }, + { + "epoch": 0.887703777690579, + "grad_norm": 0.27328699827194214, + "learning_rate": 1.8956473788165146e-06, + "loss": 0.1098, + "step": 49770 + }, + { + "epoch": 0.8877216138122926, + "grad_norm": 0.23133574426174164, + "learning_rate": 1.8950528820651765e-06, + "loss": 0.1086, + "step": 49771 + }, + { + "epoch": 0.8877394499340063, + "grad_norm": 0.21188603341579437, + "learning_rate": 1.8944584748765897e-06, + "loss": 0.1182, + "step": 49772 + }, + { + "epoch": 0.88775728605572, + "grad_norm": 0.29569345712661743, + "learning_rate": 1.893864157253064e-06, + "loss": 0.0967, + "step": 49773 + }, + { + "epoch": 0.8877751221774337, + "grad_norm": 0.23778888583183289, + "learning_rate": 1.893269929196903e-06, + "loss": 0.0838, + "step": 49774 + }, + { + "epoch": 0.8877929582991474, + "grad_norm": 0.2736237645149231, + "learning_rate": 1.8926757907104075e-06, + "loss": 0.133, + "step": 49775 + }, + { + "epoch": 0.8878107944208611, + "grad_norm": 0.3804624676704407, + "learning_rate": 1.8920817417958815e-06, + "loss": 0.0919, + "step": 49776 + }, + { + "epoch": 0.8878286305425748, + "grad_norm": 0.36116495728492737, + "learning_rate": 1.8914877824556253e-06, + "loss": 0.1129, + "step": 49777 + }, + { + "epoch": 0.8878464666642886, + "grad_norm": 0.3025190830230713, + "learning_rate": 1.890893912691949e-06, + "loss": 0.1434, + "step": 49778 + }, + { + "epoch": 0.8878643027860023, + "grad_norm": 0.24519546329975128, + "learning_rate": 1.8903001325071474e-06, + "loss": 0.074, + "step": 49779 + }, + { + "epoch": 0.887882138907716, + "grad_norm": 0.2681308686733246, + "learning_rate": 1.8897064419035244e-06, + "loss": 0.129, + "step": 49780 + }, + { + "epoch": 0.8878999750294296, + "grad_norm": 0.29937997460365295, + "learning_rate": 1.8891128408833781e-06, + "loss": 0.119, + "step": 49781 + }, + { + "epoch": 0.8879178111511433, + "grad_norm": 0.26641565561294556, + "learning_rate": 1.888519329449015e-06, + "loss": 0.0603, + "step": 49782 + }, + { + "epoch": 0.887935647272857, + "grad_norm": 0.3053593039512634, + "learning_rate": 1.8879259076027334e-06, + "loss": 0.0512, + "step": 49783 + }, + { + "epoch": 0.8879534833945707, + "grad_norm": 0.28969714045524597, + "learning_rate": 1.8873325753468312e-06, + "loss": 0.1614, + "step": 49784 + }, + { + "epoch": 0.8879713195162844, + "grad_norm": 0.18694236874580383, + "learning_rate": 1.8867393326836096e-06, + "loss": 0.136, + "step": 49785 + }, + { + "epoch": 0.8879891556379981, + "grad_norm": 0.34125542640686035, + "learning_rate": 1.886146179615375e-06, + "loss": 0.1472, + "step": 49786 + }, + { + "epoch": 0.8880069917597118, + "grad_norm": 0.29271045327186584, + "learning_rate": 1.8855531161444201e-06, + "loss": 0.1105, + "step": 49787 + }, + { + "epoch": 0.8880248278814254, + "grad_norm": 0.31370100378990173, + "learning_rate": 1.8849601422730457e-06, + "loss": 0.1024, + "step": 49788 + }, + { + "epoch": 0.8880426640031391, + "grad_norm": 0.29369911551475525, + "learning_rate": 1.8843672580035498e-06, + "loss": 0.0819, + "step": 49789 + }, + { + "epoch": 0.8880605001248528, + "grad_norm": 0.22341814637184143, + "learning_rate": 1.8837744633382227e-06, + "loss": 0.0858, + "step": 49790 + }, + { + "epoch": 0.8880783362465665, + "grad_norm": 0.3277416229248047, + "learning_rate": 1.8831817582793787e-06, + "loss": 0.1244, + "step": 49791 + }, + { + "epoch": 0.8880961723682802, + "grad_norm": 0.20853938162326813, + "learning_rate": 1.8825891428293024e-06, + "loss": 0.0943, + "step": 49792 + }, + { + "epoch": 0.8881140084899939, + "grad_norm": 0.2934702932834625, + "learning_rate": 1.8819966169902975e-06, + "loss": 0.08, + "step": 49793 + }, + { + "epoch": 0.8881318446117076, + "grad_norm": 0.3217245936393738, + "learning_rate": 1.8814041807646537e-06, + "loss": 0.1128, + "step": 49794 + }, + { + "epoch": 0.8881496807334214, + "grad_norm": 0.2448878288269043, + "learning_rate": 1.8808118341546748e-06, + "loss": 0.1118, + "step": 49795 + }, + { + "epoch": 0.8881675168551351, + "grad_norm": 0.3173161745071411, + "learning_rate": 1.8802195771626564e-06, + "loss": 0.0936, + "step": 49796 + }, + { + "epoch": 0.8881853529768488, + "grad_norm": 0.26010164618492126, + "learning_rate": 1.8796274097908878e-06, + "loss": 0.1112, + "step": 49797 + }, + { + "epoch": 0.8882031890985624, + "grad_norm": 0.29174935817718506, + "learning_rate": 1.8790353320416732e-06, + "loss": 0.1345, + "step": 49798 + }, + { + "epoch": 0.8882210252202761, + "grad_norm": 0.23360399901866913, + "learning_rate": 1.8784433439172993e-06, + "loss": 0.1442, + "step": 49799 + }, + { + "epoch": 0.8882388613419898, + "grad_norm": 0.28030991554260254, + "learning_rate": 1.8778514454200674e-06, + "loss": 0.1404, + "step": 49800 + }, + { + "epoch": 0.8882566974637035, + "grad_norm": 0.3899186849594116, + "learning_rate": 1.8772596365522726e-06, + "loss": 0.0906, + "step": 49801 + }, + { + "epoch": 0.8882745335854172, + "grad_norm": 0.195840984582901, + "learning_rate": 1.8766679173162022e-06, + "loss": 0.1261, + "step": 49802 + }, + { + "epoch": 0.8882923697071309, + "grad_norm": 0.3473118245601654, + "learning_rate": 1.8760762877141512e-06, + "loss": 0.085, + "step": 49803 + }, + { + "epoch": 0.8883102058288446, + "grad_norm": 0.29280713200569153, + "learning_rate": 1.8754847477484183e-06, + "loss": 0.1048, + "step": 49804 + }, + { + "epoch": 0.8883280419505583, + "grad_norm": 0.33813855051994324, + "learning_rate": 1.8748932974212957e-06, + "loss": 0.1208, + "step": 49805 + }, + { + "epoch": 0.8883458780722719, + "grad_norm": 0.3542816638946533, + "learning_rate": 1.8743019367350707e-06, + "loss": 0.1151, + "step": 49806 + }, + { + "epoch": 0.8883637141939856, + "grad_norm": 0.2478209286928177, + "learning_rate": 1.8737106656920384e-06, + "loss": 0.0671, + "step": 49807 + }, + { + "epoch": 0.8883815503156993, + "grad_norm": 0.3592206835746765, + "learning_rate": 1.8731194842944916e-06, + "loss": 0.1244, + "step": 49808 + }, + { + "epoch": 0.888399386437413, + "grad_norm": 0.2791474759578705, + "learning_rate": 1.8725283925447228e-06, + "loss": 0.1256, + "step": 49809 + }, + { + "epoch": 0.8884172225591267, + "grad_norm": 0.22975216805934906, + "learning_rate": 1.8719373904450221e-06, + "loss": 0.0943, + "step": 49810 + }, + { + "epoch": 0.8884350586808404, + "grad_norm": 0.22240890562534332, + "learning_rate": 1.8713464779976764e-06, + "loss": 0.1485, + "step": 49811 + }, + { + "epoch": 0.8884528948025542, + "grad_norm": 0.18733976781368256, + "learning_rate": 1.870755655204981e-06, + "loss": 0.0463, + "step": 49812 + }, + { + "epoch": 0.8884707309242679, + "grad_norm": 0.2414817214012146, + "learning_rate": 1.8701649220692286e-06, + "loss": 0.0548, + "step": 49813 + }, + { + "epoch": 0.8884885670459816, + "grad_norm": 0.2640984356403351, + "learning_rate": 1.8695742785927062e-06, + "loss": 0.1102, + "step": 49814 + }, + { + "epoch": 0.8885064031676952, + "grad_norm": 0.2612643539905548, + "learning_rate": 1.8689837247776953e-06, + "loss": 0.1425, + "step": 49815 + }, + { + "epoch": 0.8885242392894089, + "grad_norm": 0.29173120856285095, + "learning_rate": 1.8683932606264998e-06, + "loss": 0.0877, + "step": 49816 + }, + { + "epoch": 0.8885420754111226, + "grad_norm": 0.23039419949054718, + "learning_rate": 1.867802886141401e-06, + "loss": 0.0504, + "step": 49817 + }, + { + "epoch": 0.8885599115328363, + "grad_norm": 0.2941480875015259, + "learning_rate": 1.8672126013246889e-06, + "loss": 0.0936, + "step": 49818 + }, + { + "epoch": 0.88857774765455, + "grad_norm": 0.2458086758852005, + "learning_rate": 1.8666224061786503e-06, + "loss": 0.0806, + "step": 49819 + }, + { + "epoch": 0.8885955837762637, + "grad_norm": 0.22181279957294464, + "learning_rate": 1.8660323007055669e-06, + "loss": 0.0974, + "step": 49820 + }, + { + "epoch": 0.8886134198979774, + "grad_norm": 0.32111454010009766, + "learning_rate": 1.8654422849077396e-06, + "loss": 0.113, + "step": 49821 + }, + { + "epoch": 0.8886312560196911, + "grad_norm": 0.31020745635032654, + "learning_rate": 1.8648523587874473e-06, + "loss": 0.0952, + "step": 49822 + }, + { + "epoch": 0.8886490921414048, + "grad_norm": 0.2366929054260254, + "learning_rate": 1.8642625223469768e-06, + "loss": 0.0791, + "step": 49823 + }, + { + "epoch": 0.8886669282631184, + "grad_norm": 0.26705101132392883, + "learning_rate": 1.8636727755886124e-06, + "loss": 0.1114, + "step": 49824 + }, + { + "epoch": 0.8886847643848321, + "grad_norm": 0.6369178295135498, + "learning_rate": 1.8630831185146468e-06, + "loss": 0.1415, + "step": 49825 + }, + { + "epoch": 0.8887026005065458, + "grad_norm": 0.273028165102005, + "learning_rate": 1.862493551127359e-06, + "loss": 0.0813, + "step": 49826 + }, + { + "epoch": 0.8887204366282595, + "grad_norm": 0.25852999091148376, + "learning_rate": 1.861904073429041e-06, + "loss": 0.1233, + "step": 49827 + }, + { + "epoch": 0.8887382727499732, + "grad_norm": 0.28221389651298523, + "learning_rate": 1.861314685421972e-06, + "loss": 0.0647, + "step": 49828 + }, + { + "epoch": 0.888756108871687, + "grad_norm": 0.3221103847026825, + "learning_rate": 1.8607253871084417e-06, + "loss": 0.0769, + "step": 49829 + }, + { + "epoch": 0.8887739449934007, + "grad_norm": 0.2946473956108093, + "learning_rate": 1.8601361784907316e-06, + "loss": 0.1326, + "step": 49830 + }, + { + "epoch": 0.8887917811151144, + "grad_norm": 0.20750118792057037, + "learning_rate": 1.8595470595711262e-06, + "loss": 0.1045, + "step": 49831 + }, + { + "epoch": 0.888809617236828, + "grad_norm": 0.3170025944709778, + "learning_rate": 1.8589580303519095e-06, + "loss": 0.1472, + "step": 49832 + }, + { + "epoch": 0.8888274533585417, + "grad_norm": 0.3039397597312927, + "learning_rate": 1.8583690908353602e-06, + "loss": 0.1286, + "step": 49833 + }, + { + "epoch": 0.8888452894802554, + "grad_norm": 0.33231019973754883, + "learning_rate": 1.8577802410237682e-06, + "loss": 0.1122, + "step": 49834 + }, + { + "epoch": 0.8888631256019691, + "grad_norm": 0.3095668852329254, + "learning_rate": 1.8571914809194153e-06, + "loss": 0.0919, + "step": 49835 + }, + { + "epoch": 0.8888809617236828, + "grad_norm": 0.33183181285858154, + "learning_rate": 1.8566028105245798e-06, + "loss": 0.1291, + "step": 49836 + }, + { + "epoch": 0.8888987978453965, + "grad_norm": 0.2880662679672241, + "learning_rate": 1.8560142298415405e-06, + "loss": 0.1152, + "step": 49837 + }, + { + "epoch": 0.8889166339671102, + "grad_norm": 0.31454232335090637, + "learning_rate": 1.8554257388725875e-06, + "loss": 0.0971, + "step": 49838 + }, + { + "epoch": 0.8889344700888239, + "grad_norm": 0.677558422088623, + "learning_rate": 1.8548373376199996e-06, + "loss": 0.1097, + "step": 49839 + }, + { + "epoch": 0.8889523062105376, + "grad_norm": 0.30070415139198303, + "learning_rate": 1.8542490260860523e-06, + "loss": 0.1119, + "step": 49840 + }, + { + "epoch": 0.8889701423322512, + "grad_norm": 0.2502022385597229, + "learning_rate": 1.8536608042730303e-06, + "loss": 0.0986, + "step": 49841 + }, + { + "epoch": 0.8889879784539649, + "grad_norm": 0.2670758068561554, + "learning_rate": 1.8530726721832148e-06, + "loss": 0.1181, + "step": 49842 + }, + { + "epoch": 0.8890058145756786, + "grad_norm": 0.25057753920555115, + "learning_rate": 1.8524846298188874e-06, + "loss": 0.1037, + "step": 49843 + }, + { + "epoch": 0.8890236506973923, + "grad_norm": 0.2412441223859787, + "learning_rate": 1.8518966771823244e-06, + "loss": 0.0971, + "step": 49844 + }, + { + "epoch": 0.8890414868191061, + "grad_norm": 0.2951279282569885, + "learning_rate": 1.8513088142758038e-06, + "loss": 0.1331, + "step": 49845 + }, + { + "epoch": 0.8890593229408198, + "grad_norm": 0.3844563364982605, + "learning_rate": 1.8507210411015996e-06, + "loss": 0.0934, + "step": 49846 + }, + { + "epoch": 0.8890771590625335, + "grad_norm": 0.3238573670387268, + "learning_rate": 1.850133357662004e-06, + "loss": 0.1305, + "step": 49847 + }, + { + "epoch": 0.8890949951842472, + "grad_norm": 0.23506927490234375, + "learning_rate": 1.8495457639592844e-06, + "loss": 0.0945, + "step": 49848 + }, + { + "epoch": 0.8891128313059609, + "grad_norm": 0.2517795264720917, + "learning_rate": 1.848958259995723e-06, + "loss": 0.0922, + "step": 49849 + }, + { + "epoch": 0.8891306674276745, + "grad_norm": 0.23196156322956085, + "learning_rate": 1.8483708457735922e-06, + "loss": 0.0794, + "step": 49850 + }, + { + "epoch": 0.8891485035493882, + "grad_norm": 0.2365451604127884, + "learning_rate": 1.8477835212951738e-06, + "loss": 0.0858, + "step": 49851 + }, + { + "epoch": 0.8891663396711019, + "grad_norm": 0.22328735888004303, + "learning_rate": 1.847196286562744e-06, + "loss": 0.0905, + "step": 49852 + }, + { + "epoch": 0.8891841757928156, + "grad_norm": 0.30784499645233154, + "learning_rate": 1.8466091415785759e-06, + "loss": 0.0945, + "step": 49853 + }, + { + "epoch": 0.8892020119145293, + "grad_norm": 0.24009576439857483, + "learning_rate": 1.8460220863449479e-06, + "loss": 0.1149, + "step": 49854 + }, + { + "epoch": 0.889219848036243, + "grad_norm": 0.2380632907152176, + "learning_rate": 1.8454351208641336e-06, + "loss": 0.116, + "step": 49855 + }, + { + "epoch": 0.8892376841579567, + "grad_norm": 0.25152263045310974, + "learning_rate": 1.8448482451384142e-06, + "loss": 0.0903, + "step": 49856 + }, + { + "epoch": 0.8892555202796704, + "grad_norm": 0.20498567819595337, + "learning_rate": 1.8442614591700602e-06, + "loss": 0.1189, + "step": 49857 + }, + { + "epoch": 0.889273356401384, + "grad_norm": 0.31050199270248413, + "learning_rate": 1.8436747629613476e-06, + "loss": 0.1073, + "step": 49858 + }, + { + "epoch": 0.8892911925230977, + "grad_norm": 0.32119742035865784, + "learning_rate": 1.8430881565145441e-06, + "loss": 0.1129, + "step": 49859 + }, + { + "epoch": 0.8893090286448114, + "grad_norm": 0.19084841012954712, + "learning_rate": 1.842501639831934e-06, + "loss": 0.0428, + "step": 49860 + }, + { + "epoch": 0.8893268647665251, + "grad_norm": 0.31924012303352356, + "learning_rate": 1.8419152129157875e-06, + "loss": 0.1309, + "step": 49861 + }, + { + "epoch": 0.8893447008882389, + "grad_norm": 0.35856541991233826, + "learning_rate": 1.8413288757683723e-06, + "loss": 0.1448, + "step": 49862 + }, + { + "epoch": 0.8893625370099526, + "grad_norm": 0.3894062042236328, + "learning_rate": 1.8407426283919643e-06, + "loss": 0.0714, + "step": 49863 + }, + { + "epoch": 0.8893803731316663, + "grad_norm": 0.25119802355766296, + "learning_rate": 1.8401564707888397e-06, + "loss": 0.1016, + "step": 49864 + }, + { + "epoch": 0.88939820925338, + "grad_norm": 0.2892312705516815, + "learning_rate": 1.8395704029612659e-06, + "loss": 0.1262, + "step": 49865 + }, + { + "epoch": 0.8894160453750937, + "grad_norm": 0.25144341588020325, + "learning_rate": 1.838984424911519e-06, + "loss": 0.1062, + "step": 49866 + }, + { + "epoch": 0.8894338814968074, + "grad_norm": 0.36705195903778076, + "learning_rate": 1.8383985366418638e-06, + "loss": 0.1769, + "step": 49867 + }, + { + "epoch": 0.889451717618521, + "grad_norm": 0.30127978324890137, + "learning_rate": 1.8378127381545762e-06, + "loss": 0.1458, + "step": 49868 + }, + { + "epoch": 0.8894695537402347, + "grad_norm": 0.30124983191490173, + "learning_rate": 1.8372270294519294e-06, + "loss": 0.1055, + "step": 49869 + }, + { + "epoch": 0.8894873898619484, + "grad_norm": 0.3250449001789093, + "learning_rate": 1.8366414105361884e-06, + "loss": 0.1468, + "step": 49870 + }, + { + "epoch": 0.8895052259836621, + "grad_norm": 0.32466554641723633, + "learning_rate": 1.836055881409629e-06, + "loss": 0.1372, + "step": 49871 + }, + { + "epoch": 0.8895230621053758, + "grad_norm": 0.21098098158836365, + "learning_rate": 1.8354704420745132e-06, + "loss": 0.0389, + "step": 49872 + }, + { + "epoch": 0.8895408982270895, + "grad_norm": 0.3244823217391968, + "learning_rate": 1.8348850925331174e-06, + "loss": 0.1178, + "step": 49873 + }, + { + "epoch": 0.8895587343488032, + "grad_norm": 0.18866463005542755, + "learning_rate": 1.8342998327877086e-06, + "loss": 0.0592, + "step": 49874 + }, + { + "epoch": 0.8895765704705169, + "grad_norm": 0.37507542967796326, + "learning_rate": 1.833714662840555e-06, + "loss": 0.0919, + "step": 49875 + }, + { + "epoch": 0.8895944065922305, + "grad_norm": 0.20332205295562744, + "learning_rate": 1.833129582693921e-06, + "loss": 0.1195, + "step": 49876 + }, + { + "epoch": 0.8896122427139442, + "grad_norm": 0.26294079422950745, + "learning_rate": 1.8325445923500827e-06, + "loss": 0.0911, + "step": 49877 + }, + { + "epoch": 0.8896300788356579, + "grad_norm": 0.25406116247177124, + "learning_rate": 1.8319596918113025e-06, + "loss": 0.0931, + "step": 49878 + }, + { + "epoch": 0.8896479149573717, + "grad_norm": 0.38951027393341064, + "learning_rate": 1.8313748810798475e-06, + "loss": 0.0945, + "step": 49879 + }, + { + "epoch": 0.8896657510790854, + "grad_norm": 0.2512340843677521, + "learning_rate": 1.830790160157983e-06, + "loss": 0.1159, + "step": 49880 + }, + { + "epoch": 0.8896835872007991, + "grad_norm": 0.2336951196193695, + "learning_rate": 1.830205529047982e-06, + "loss": 0.1023, + "step": 49881 + }, + { + "epoch": 0.8897014233225128, + "grad_norm": 0.25117599964141846, + "learning_rate": 1.8296209877521037e-06, + "loss": 0.1121, + "step": 49882 + }, + { + "epoch": 0.8897192594442265, + "grad_norm": 0.26584678888320923, + "learning_rate": 1.8290365362726213e-06, + "loss": 0.1199, + "step": 49883 + }, + { + "epoch": 0.8897370955659402, + "grad_norm": 0.29908284544944763, + "learning_rate": 1.8284521746117944e-06, + "loss": 0.0703, + "step": 49884 + }, + { + "epoch": 0.8897549316876538, + "grad_norm": 0.4404044449329376, + "learning_rate": 1.8278679027718875e-06, + "loss": 0.1961, + "step": 49885 + }, + { + "epoch": 0.8897727678093675, + "grad_norm": 0.269890159368515, + "learning_rate": 1.8272837207551713e-06, + "loss": 0.1226, + "step": 49886 + }, + { + "epoch": 0.8897906039310812, + "grad_norm": 0.22089998424053192, + "learning_rate": 1.8266996285639077e-06, + "loss": 0.0863, + "step": 49887 + }, + { + "epoch": 0.8898084400527949, + "grad_norm": 0.18101929128170013, + "learning_rate": 1.8261156262003588e-06, + "loss": 0.0716, + "step": 49888 + }, + { + "epoch": 0.8898262761745086, + "grad_norm": 0.2906607985496521, + "learning_rate": 1.8255317136667865e-06, + "loss": 0.1181, + "step": 49889 + }, + { + "epoch": 0.8898441122962223, + "grad_norm": 0.2994353175163269, + "learning_rate": 1.8249478909654644e-06, + "loss": 0.1198, + "step": 49890 + }, + { + "epoch": 0.889861948417936, + "grad_norm": 0.2071688175201416, + "learning_rate": 1.824364158098646e-06, + "loss": 0.0596, + "step": 49891 + }, + { + "epoch": 0.8898797845396497, + "grad_norm": 0.33059561252593994, + "learning_rate": 1.823780515068596e-06, + "loss": 0.0835, + "step": 49892 + }, + { + "epoch": 0.8898976206613634, + "grad_norm": 0.22787611186504364, + "learning_rate": 1.823196961877574e-06, + "loss": 0.1168, + "step": 49893 + }, + { + "epoch": 0.889915456783077, + "grad_norm": 0.3035147488117218, + "learning_rate": 1.8226134985278504e-06, + "loss": 0.1045, + "step": 49894 + }, + { + "epoch": 0.8899332929047907, + "grad_norm": 0.3333717882633209, + "learning_rate": 1.822030125021676e-06, + "loss": 0.1248, + "step": 49895 + }, + { + "epoch": 0.8899511290265045, + "grad_norm": 0.25385555624961853, + "learning_rate": 1.821446841361324e-06, + "loss": 0.1445, + "step": 49896 + }, + { + "epoch": 0.8899689651482182, + "grad_norm": 0.3475976884365082, + "learning_rate": 1.820863647549051e-06, + "loss": 0.102, + "step": 49897 + }, + { + "epoch": 0.8899868012699319, + "grad_norm": 0.26944491267204285, + "learning_rate": 1.8202805435871107e-06, + "loss": 0.0928, + "step": 49898 + }, + { + "epoch": 0.8900046373916456, + "grad_norm": 0.297126442193985, + "learning_rate": 1.8196975294777735e-06, + "loss": 0.1095, + "step": 49899 + }, + { + "epoch": 0.8900224735133593, + "grad_norm": 0.2458484023809433, + "learning_rate": 1.8191146052232931e-06, + "loss": 0.0867, + "step": 49900 + }, + { + "epoch": 0.890040309635073, + "grad_norm": 0.2071489542722702, + "learning_rate": 1.818531770825932e-06, + "loss": 0.0507, + "step": 49901 + }, + { + "epoch": 0.8900581457567867, + "grad_norm": 0.22817501425743103, + "learning_rate": 1.8179490262879463e-06, + "loss": 0.113, + "step": 49902 + }, + { + "epoch": 0.8900759818785003, + "grad_norm": 0.3017560839653015, + "learning_rate": 1.8173663716115979e-06, + "loss": 0.1001, + "step": 49903 + }, + { + "epoch": 0.890093818000214, + "grad_norm": 0.3293018341064453, + "learning_rate": 1.8167838067991466e-06, + "loss": 0.1466, + "step": 49904 + }, + { + "epoch": 0.8901116541219277, + "grad_norm": 0.34821516275405884, + "learning_rate": 1.8162013318528486e-06, + "loss": 0.0573, + "step": 49905 + }, + { + "epoch": 0.8901294902436414, + "grad_norm": 0.21277935802936554, + "learning_rate": 1.815618946774955e-06, + "loss": 0.0773, + "step": 49906 + }, + { + "epoch": 0.8901473263653551, + "grad_norm": 0.30802449584007263, + "learning_rate": 1.8150366515677364e-06, + "loss": 0.1957, + "step": 49907 + }, + { + "epoch": 0.8901651624870688, + "grad_norm": 0.24075330793857574, + "learning_rate": 1.8144544462334434e-06, + "loss": 0.073, + "step": 49908 + }, + { + "epoch": 0.8901829986087825, + "grad_norm": 0.3012112081050873, + "learning_rate": 1.81387233077433e-06, + "loss": 0.1567, + "step": 49909 + }, + { + "epoch": 0.8902008347304962, + "grad_norm": 0.24156977236270905, + "learning_rate": 1.813290305192658e-06, + "loss": 0.1052, + "step": 49910 + }, + { + "epoch": 0.8902186708522098, + "grad_norm": 0.2963894307613373, + "learning_rate": 1.812708369490676e-06, + "loss": 0.1078, + "step": 49911 + }, + { + "epoch": 0.8902365069739235, + "grad_norm": 0.3894694149494171, + "learning_rate": 1.8121265236706514e-06, + "loss": 0.092, + "step": 49912 + }, + { + "epoch": 0.8902543430956373, + "grad_norm": 0.19741907715797424, + "learning_rate": 1.8115447677348324e-06, + "loss": 0.0931, + "step": 49913 + }, + { + "epoch": 0.890272179217351, + "grad_norm": 0.3187587857246399, + "learning_rate": 1.8109631016854756e-06, + "loss": 0.1086, + "step": 49914 + }, + { + "epoch": 0.8902900153390647, + "grad_norm": 0.5088554620742798, + "learning_rate": 1.810381525524829e-06, + "loss": 0.0885, + "step": 49915 + }, + { + "epoch": 0.8903078514607784, + "grad_norm": 0.3323962092399597, + "learning_rate": 1.809800039255158e-06, + "loss": 0.1236, + "step": 49916 + }, + { + "epoch": 0.8903256875824921, + "grad_norm": 0.35256895422935486, + "learning_rate": 1.8092186428787129e-06, + "loss": 0.1568, + "step": 49917 + }, + { + "epoch": 0.8903435237042058, + "grad_norm": 0.19838082790374756, + "learning_rate": 1.808637336397745e-06, + "loss": 0.0789, + "step": 49918 + }, + { + "epoch": 0.8903613598259195, + "grad_norm": 0.258879691362381, + "learning_rate": 1.8080561198145052e-06, + "loss": 0.1361, + "step": 49919 + }, + { + "epoch": 0.8903791959476332, + "grad_norm": 0.2989346385002136, + "learning_rate": 1.8074749931312557e-06, + "loss": 0.0797, + "step": 49920 + }, + { + "epoch": 0.8903970320693468, + "grad_norm": 0.3387611508369446, + "learning_rate": 1.8068939563502419e-06, + "loss": 0.1395, + "step": 49921 + }, + { + "epoch": 0.8904148681910605, + "grad_norm": 0.27298688888549805, + "learning_rate": 1.806313009473712e-06, + "loss": 0.1292, + "step": 49922 + }, + { + "epoch": 0.8904327043127742, + "grad_norm": 0.21711039543151855, + "learning_rate": 1.8057321525039305e-06, + "loss": 0.112, + "step": 49923 + }, + { + "epoch": 0.8904505404344879, + "grad_norm": 0.31412264704704285, + "learning_rate": 1.8051513854431378e-06, + "loss": 0.0902, + "step": 49924 + }, + { + "epoch": 0.8904683765562016, + "grad_norm": 0.2695717215538025, + "learning_rate": 1.8045707082935932e-06, + "loss": 0.0905, + "step": 49925 + }, + { + "epoch": 0.8904862126779153, + "grad_norm": 0.2534194588661194, + "learning_rate": 1.8039901210575416e-06, + "loss": 0.1317, + "step": 49926 + }, + { + "epoch": 0.890504048799629, + "grad_norm": 0.29785025119781494, + "learning_rate": 1.80340962373724e-06, + "loss": 0.1171, + "step": 49927 + }, + { + "epoch": 0.8905218849213427, + "grad_norm": 0.4032098352909088, + "learning_rate": 1.802829216334928e-06, + "loss": 0.1417, + "step": 49928 + }, + { + "epoch": 0.8905397210430563, + "grad_norm": 0.4395829737186432, + "learning_rate": 1.802248898852868e-06, + "loss": 0.132, + "step": 49929 + }, + { + "epoch": 0.8905575571647701, + "grad_norm": 0.5400553345680237, + "learning_rate": 1.8016686712933023e-06, + "loss": 0.1466, + "step": 49930 + }, + { + "epoch": 0.8905753932864838, + "grad_norm": 0.2580517530441284, + "learning_rate": 1.8010885336584792e-06, + "loss": 0.1138, + "step": 49931 + }, + { + "epoch": 0.8905932294081975, + "grad_norm": 0.22351336479187012, + "learning_rate": 1.8005084859506472e-06, + "loss": 0.1268, + "step": 49932 + }, + { + "epoch": 0.8906110655299112, + "grad_norm": 0.324955552816391, + "learning_rate": 1.7999285281720623e-06, + "loss": 0.164, + "step": 49933 + }, + { + "epoch": 0.8906289016516249, + "grad_norm": 0.36037677526474, + "learning_rate": 1.7993486603249676e-06, + "loss": 0.1106, + "step": 49934 + }, + { + "epoch": 0.8906467377733386, + "grad_norm": 0.3040764331817627, + "learning_rate": 1.7987688824116083e-06, + "loss": 0.1804, + "step": 49935 + }, + { + "epoch": 0.8906645738950523, + "grad_norm": 0.3858616054058075, + "learning_rate": 1.7981891944342327e-06, + "loss": 0.1353, + "step": 49936 + }, + { + "epoch": 0.890682410016766, + "grad_norm": 0.22974008321762085, + "learning_rate": 1.7976095963950918e-06, + "loss": 0.1406, + "step": 49937 + }, + { + "epoch": 0.8907002461384796, + "grad_norm": 0.26076796650886536, + "learning_rate": 1.797030088296428e-06, + "loss": 0.0808, + "step": 49938 + }, + { + "epoch": 0.8907180822601933, + "grad_norm": 0.2533327341079712, + "learning_rate": 1.7964506701404926e-06, + "loss": 0.07, + "step": 49939 + }, + { + "epoch": 0.890735918381907, + "grad_norm": 0.2939600348472595, + "learning_rate": 1.7958713419295282e-06, + "loss": 0.1484, + "step": 49940 + }, + { + "epoch": 0.8907537545036207, + "grad_norm": 0.3236691355705261, + "learning_rate": 1.7952921036657772e-06, + "loss": 0.1249, + "step": 49941 + }, + { + "epoch": 0.8907715906253344, + "grad_norm": 0.2464517056941986, + "learning_rate": 1.7947129553514908e-06, + "loss": 0.0974, + "step": 49942 + }, + { + "epoch": 0.8907894267470481, + "grad_norm": 0.3569663166999817, + "learning_rate": 1.7941338969889144e-06, + "loss": 0.1317, + "step": 49943 + }, + { + "epoch": 0.8908072628687618, + "grad_norm": 0.26709234714508057, + "learning_rate": 1.7935549285802905e-06, + "loss": 0.0556, + "step": 49944 + }, + { + "epoch": 0.8908250989904755, + "grad_norm": 0.33218643069267273, + "learning_rate": 1.7929760501278564e-06, + "loss": 0.1134, + "step": 49945 + }, + { + "epoch": 0.8908429351121893, + "grad_norm": 0.3023509383201599, + "learning_rate": 1.7923972616338685e-06, + "loss": 0.1595, + "step": 49946 + }, + { + "epoch": 0.890860771233903, + "grad_norm": 0.32520627975463867, + "learning_rate": 1.791818563100564e-06, + "loss": 0.145, + "step": 49947 + }, + { + "epoch": 0.8908786073556166, + "grad_norm": 0.350848913192749, + "learning_rate": 1.7912399545301856e-06, + "loss": 0.0995, + "step": 49948 + }, + { + "epoch": 0.8908964434773303, + "grad_norm": 0.3119257092475891, + "learning_rate": 1.790661435924973e-06, + "loss": 0.1538, + "step": 49949 + }, + { + "epoch": 0.890914279599044, + "grad_norm": 0.3208456039428711, + "learning_rate": 1.7900830072871799e-06, + "loss": 0.1561, + "step": 49950 + }, + { + "epoch": 0.8909321157207577, + "grad_norm": 0.4336583614349365, + "learning_rate": 1.7895046686190354e-06, + "loss": 0.1785, + "step": 49951 + }, + { + "epoch": 0.8909499518424714, + "grad_norm": 0.22980165481567383, + "learning_rate": 1.78892641992279e-06, + "loss": 0.1098, + "step": 49952 + }, + { + "epoch": 0.8909677879641851, + "grad_norm": 0.2909132242202759, + "learning_rate": 1.788348261200684e-06, + "loss": 0.106, + "step": 49953 + }, + { + "epoch": 0.8909856240858988, + "grad_norm": 0.3052906394004822, + "learning_rate": 1.7877701924549545e-06, + "loss": 0.1197, + "step": 49954 + }, + { + "epoch": 0.8910034602076125, + "grad_norm": 0.2439405471086502, + "learning_rate": 1.7871922136878494e-06, + "loss": 0.0912, + "step": 49955 + }, + { + "epoch": 0.8910212963293261, + "grad_norm": 0.2927035987377167, + "learning_rate": 1.786614324901603e-06, + "loss": 0.1116, + "step": 49956 + }, + { + "epoch": 0.8910391324510398, + "grad_norm": 0.32149219512939453, + "learning_rate": 1.7860365260984558e-06, + "loss": 0.1151, + "step": 49957 + }, + { + "epoch": 0.8910569685727535, + "grad_norm": 0.2089749276638031, + "learning_rate": 1.785458817280647e-06, + "loss": 0.0899, + "step": 49958 + }, + { + "epoch": 0.8910748046944672, + "grad_norm": 0.25410905480384827, + "learning_rate": 1.784881198450422e-06, + "loss": 0.1568, + "step": 49959 + }, + { + "epoch": 0.8910926408161809, + "grad_norm": 0.20608460903167725, + "learning_rate": 1.7843036696100129e-06, + "loss": 0.0881, + "step": 49960 + }, + { + "epoch": 0.8911104769378946, + "grad_norm": 0.28066226840019226, + "learning_rate": 1.7837262307616648e-06, + "loss": 0.0931, + "step": 49961 + }, + { + "epoch": 0.8911283130596083, + "grad_norm": 0.196009561419487, + "learning_rate": 1.783148881907612e-06, + "loss": 0.0791, + "step": 49962 + }, + { + "epoch": 0.8911461491813221, + "grad_norm": 0.22484147548675537, + "learning_rate": 1.782571623050086e-06, + "loss": 0.0996, + "step": 49963 + }, + { + "epoch": 0.8911639853030358, + "grad_norm": 0.2638413608074188, + "learning_rate": 1.781994454191338e-06, + "loss": 0.1335, + "step": 49964 + }, + { + "epoch": 0.8911818214247494, + "grad_norm": 0.2932688891887665, + "learning_rate": 1.7814173753335938e-06, + "loss": 0.1008, + "step": 49965 + }, + { + "epoch": 0.8911996575464631, + "grad_norm": 0.3263104259967804, + "learning_rate": 1.780840386479099e-06, + "loss": 0.0803, + "step": 49966 + }, + { + "epoch": 0.8912174936681768, + "grad_norm": 0.22115647792816162, + "learning_rate": 1.780263487630085e-06, + "loss": 0.0736, + "step": 49967 + }, + { + "epoch": 0.8912353297898905, + "grad_norm": 0.2797151803970337, + "learning_rate": 1.7796866787887916e-06, + "loss": 0.1128, + "step": 49968 + }, + { + "epoch": 0.8912531659116042, + "grad_norm": 0.30198103189468384, + "learning_rate": 1.7791099599574535e-06, + "loss": 0.1672, + "step": 49969 + }, + { + "epoch": 0.8912710020333179, + "grad_norm": 0.26369860768318176, + "learning_rate": 1.7785333311383045e-06, + "loss": 0.1382, + "step": 49970 + }, + { + "epoch": 0.8912888381550316, + "grad_norm": 0.23830004036426544, + "learning_rate": 1.7779567923335765e-06, + "loss": 0.11, + "step": 49971 + }, + { + "epoch": 0.8913066742767453, + "grad_norm": 0.22044409811496735, + "learning_rate": 1.7773803435455122e-06, + "loss": 0.1023, + "step": 49972 + }, + { + "epoch": 0.891324510398459, + "grad_norm": 0.2943316400051117, + "learning_rate": 1.7768039847763429e-06, + "loss": 0.123, + "step": 49973 + }, + { + "epoch": 0.8913423465201726, + "grad_norm": 0.2572459876537323, + "learning_rate": 1.7762277160283032e-06, + "loss": 0.0977, + "step": 49974 + }, + { + "epoch": 0.8913601826418863, + "grad_norm": 0.4160288870334625, + "learning_rate": 1.775651537303627e-06, + "loss": 0.1195, + "step": 49975 + }, + { + "epoch": 0.8913780187636, + "grad_norm": 0.2470136284828186, + "learning_rate": 1.7750754486045406e-06, + "loss": 0.1084, + "step": 49976 + }, + { + "epoch": 0.8913958548853137, + "grad_norm": 0.317086786031723, + "learning_rate": 1.7744994499332896e-06, + "loss": 0.096, + "step": 49977 + }, + { + "epoch": 0.8914136910070274, + "grad_norm": 0.2757277190685272, + "learning_rate": 1.773923541292094e-06, + "loss": 0.1219, + "step": 49978 + }, + { + "epoch": 0.8914315271287411, + "grad_norm": 0.2629562020301819, + "learning_rate": 1.7733477226831995e-06, + "loss": 0.0568, + "step": 49979 + }, + { + "epoch": 0.8914493632504549, + "grad_norm": 0.38874703645706177, + "learning_rate": 1.7727719941088266e-06, + "loss": 0.0966, + "step": 49980 + }, + { + "epoch": 0.8914671993721686, + "grad_norm": 0.26062703132629395, + "learning_rate": 1.772196355571215e-06, + "loss": 0.1244, + "step": 49981 + }, + { + "epoch": 0.8914850354938822, + "grad_norm": 0.24988922476768494, + "learning_rate": 1.7716208070725938e-06, + "loss": 0.0875, + "step": 49982 + }, + { + "epoch": 0.8915028716155959, + "grad_norm": 0.2589786648750305, + "learning_rate": 1.7710453486151912e-06, + "loss": 0.0893, + "step": 49983 + }, + { + "epoch": 0.8915207077373096, + "grad_norm": 0.2575794458389282, + "learning_rate": 1.7704699802012364e-06, + "loss": 0.1147, + "step": 49984 + }, + { + "epoch": 0.8915385438590233, + "grad_norm": 0.35063958168029785, + "learning_rate": 1.7698947018329692e-06, + "loss": 0.1057, + "step": 49985 + }, + { + "epoch": 0.891556379980737, + "grad_norm": 0.35326242446899414, + "learning_rate": 1.7693195135126128e-06, + "loss": 0.1312, + "step": 49986 + }, + { + "epoch": 0.8915742161024507, + "grad_norm": 0.4064968228340149, + "learning_rate": 1.7687444152423988e-06, + "loss": 0.202, + "step": 49987 + }, + { + "epoch": 0.8915920522241644, + "grad_norm": 0.2466762512922287, + "learning_rate": 1.768169407024553e-06, + "loss": 0.141, + "step": 49988 + }, + { + "epoch": 0.8916098883458781, + "grad_norm": 0.34842124581336975, + "learning_rate": 1.7675944888613016e-06, + "loss": 0.1611, + "step": 49989 + }, + { + "epoch": 0.8916277244675918, + "grad_norm": 0.3802046775817871, + "learning_rate": 1.7670196607548844e-06, + "loss": 0.1123, + "step": 49990 + }, + { + "epoch": 0.8916455605893054, + "grad_norm": 0.23128633201122284, + "learning_rate": 1.7664449227075248e-06, + "loss": 0.1096, + "step": 49991 + }, + { + "epoch": 0.8916633967110191, + "grad_norm": 0.23446892201900482, + "learning_rate": 1.7658702747214428e-06, + "loss": 0.1072, + "step": 49992 + }, + { + "epoch": 0.8916812328327328, + "grad_norm": 0.2435496300458908, + "learning_rate": 1.765295716798876e-06, + "loss": 0.1058, + "step": 49993 + }, + { + "epoch": 0.8916990689544465, + "grad_norm": 0.2381276935338974, + "learning_rate": 1.7647212489420444e-06, + "loss": 0.1049, + "step": 49994 + }, + { + "epoch": 0.8917169050761602, + "grad_norm": 0.25989797711372375, + "learning_rate": 1.7641468711531855e-06, + "loss": 0.1035, + "step": 49995 + }, + { + "epoch": 0.8917347411978739, + "grad_norm": 0.3566727936267853, + "learning_rate": 1.763572583434514e-06, + "loss": 0.1091, + "step": 49996 + }, + { + "epoch": 0.8917525773195877, + "grad_norm": 0.23091700673103333, + "learning_rate": 1.7629983857882614e-06, + "loss": 0.0919, + "step": 49997 + }, + { + "epoch": 0.8917704134413014, + "grad_norm": 0.4175768196582794, + "learning_rate": 1.762424278216654e-06, + "loss": 0.1211, + "step": 49998 + }, + { + "epoch": 0.8917882495630151, + "grad_norm": 0.28768599033355713, + "learning_rate": 1.7618502607219173e-06, + "loss": 0.1095, + "step": 49999 + }, + { + "epoch": 0.8918060856847287, + "grad_norm": 0.2869391441345215, + "learning_rate": 1.7612763333062748e-06, + "loss": 0.0857, + "step": 50000 + }, + { + "epoch": 0.8918060856847287, + "eval_loss": 0.10779532790184021, + "eval_runtime": 107.9169, + "eval_samples_per_second": 9.489, + "eval_steps_per_second": 1.585, + "step": 50000 + } + ], + "logging_steps": 1, + "max_steps": 56066, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7691159822336e+21, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}